SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)

来源:互联网 发布:淘宝男装店名起名大全 编辑:程序博客网 时间:2024/05/18 01:54
[bind]
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen){struct socket *sock;int err, fput_needed;sock = sockfd_lookup_light(fd, &err, &fput_needed);
从文件fd中得到对应的socket:
[bind->sockfd_lookup_light]
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed){struct fd f = fdget(fd);struct socket *sock;*err = -EBADF;if (f.file) {sock = sock_from_file(f.file, err);if (likely(sock)) {*fput_needed = f.flags;return sock;}fdput(f);}return NULL;}
调用fdget先从进程的文件列表中通过fd找到对应的文件,然后从中得到socket,文件的flag通过fput_needed返回:
[bind->sockfd_lookup_light->sock_from_file]
/*
 * Return the socket attached to @file, or NULL if @file is not a socket file.
 *
 * A file counts as a socket file when its f_op is &socket_file_ops. For any
 * other file *err is set to -ENOTSOCK; *err is left untouched on success.
 */
struct socket *sock_from_file(struct file *file, int *err)
{
	if (file->f_op != &socket_file_ops) {
		*err = -ENOTSOCK;
		return NULL;
	}

	return file->private_data;	/* set in sock_map_fd */
}
在初始化文件时,file->private_data指向新建的socket
[bind]
if (sock) {err = move_addr_to_kernel(umyaddr, addrlen, &address);
将IP地址从用户空间copy到内核空间:
[bind->move_addr_to_kernel]
/**
 *	move_addr_to_kernel	-	copy a socket address into kernel space
 *	@uaddr: Address in user space
 *	@ulen: Length of the user-space address in bytes
 *	@kaddr: Destination buffer in kernel space
 *
 *	Returns -EINVAL when @ulen is negative or larger than
 *	struct sockaddr_storage, and -EFAULT when copy_from_user() fails.
 *	A zero @ulen copies nothing and returns 0. Otherwise the return
 *	value is whatever audit_sockaddr() reports for the copied address.
 */
int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
{
	int rc = 0;

	if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
		return -EINVAL;

	/* An empty address is accepted as-is; nothing is copied or audited. */
	if (ulen != 0) {
		if (copy_from_user(kaddr, uaddr, ulen))
			rc = -EFAULT;
		else
			rc = audit_sockaddr(ulen, kaddr);
	}

	return rc;
}
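先检查地址长度:小于0或超过sizeof(struct sockaddr_storage)时返回-EINVAL,长度为0时不做任何copy直接返回0;然后用copy_from_user将地址copy到内核空间,失败返回-EFAULT。最后调用audit_sockaddr,并把它的返回值作为move_addr_to_kernel的返回值。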
[bind]
if (err >= 0) {err = sock->ops->bind(sock, (struct sockaddr *)&address, addrlen);}} return err;}
对TCP,这里的ops为inet_stream_ops,调用的是inet_bind。
[bind->inet_bind]
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len){struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;struct net *net = sock_net(sk);int chk_addr_ret;if (addr_len < sizeof(struct sockaddr_in))goto out; chk_addr_ret = __inet_dev_addr_type(net, NULL, addr->sin_addr.s_addr);
先对地址的长度检查,然后得到地址的类型:
[bind->inet_bind->__inet_dev_addr_type]
/* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. */static inline unsigned int __inet_dev_addr_type(struct net *net,const struct net_device *dev,__be32 addr){if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))return RTN_BROADCAST;if (ipv4_is_multicast(addr))return RTN_MULTICAST;
如果地址(addr & htonl(0xff000000)) == htonl(0x00000000)或者addr == htonl(0xffffffff),类型为广播。如果(addr & htonl(0xf0000000)) == htonl(0xe0000000)类型为多播。
struct flowi4fl4 = { .daddr = addr };struct fib_resultres;unsigned int ret = RTN_BROADCAST;struct fib_table *local_table;local_table = fib_get_table(net, RT_TABLE_LOCAL);if (local_table) {ret = RTN_UNICAST;if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {if (!dev || dev == res.fi->fib_dev)ret = res.type;}}return ret;}
如果在本地路由表中查找到,类型为路由表中的值。如果不存在本地路由表,类型为广播。否则类型为单播。
[bind->inet_bind]
snum = ntohs(addr->sin_port);if (snum && snum < PROT_SOCK &&    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))goto out;        /* Check these errors (active socket, double bind). */if (sk->sk_state != TCP_CLOSE || inet->inet_num)goto out_release_sock;inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)inet->inet_saddr = 0;  /* Use device *//* Make sure we are allowed to bind here. */if (sk->sk_prot->get_port(sk, snum)) {inet->inet_saddr = inet->inet_rcv_saddr = 0;err = -EADDRINUSE;goto out_release_sock;}
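得到端口号并对其范围做一个检测:端口号非0且小于PROT_SOCK时,要求进程具有CAP_NET_BIND_SERVICE能力。绑定时socket如果不处于关闭状态或是本地端口不为0(已绑定),出错。然后设置地址:inet_rcv_saddr和inet_saddr都设为要绑定的地址;如果地址类型是多播或广播,再把inet_saddr置为0(由设备决定)。最后调用get_port,对TCP,调用inet_csk_get_port确定此端口是否可以被绑定;get_port返回非0时把两个地址清0并返回-EADDRINUSE: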
[bind->inet_bind->inet_csk_get_port]
/* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. */int inet_csk_get_port(struct sock *sk, unsigned short snum){struct net *net = sock_net(sk);if (!snum) {int low, high;again:inet_get_local_port_range(net, &low, &high);
当端口号为0时,先得到本地端口号的范围
[bind->inet_bind->inet_csk_get_port->inet_get_local_port_range]
/*
 * Report the local port range configured for @net.
 *
 * *low receives net->ipv4.sysctl_local_ports.range[0] and *high receives
 * range[1].
 */
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
	*low = net->ipv4.sysctl_local_ports.range[0];
	*high = net->ipv4.sysctl_local_ports.range[1];
}
net->ipv4.sysctl_local_ports在初始化sysctl模块时被设置,在函数ipv4_sysctl_init_net中,范围为:[ 32768, 61000 ]
[bind->inet_bind->inet_csk_get_port]
int smallest_size = -1;if (!snum) { int remaining, rover;again:                ...                remaining = (high - low) + 1;rover = prandom_u32() % remaining + low; smallest_size = -1; do { if (inet_is_reserved_local_port(rover)) goto next_nolock;
先在本地端口范围内随机选出一个起始端口号rover,然后判断它是不是被保留的端口(被保留的端口直接跳过,换下一个):
[bind->inet_bind->inet_csk_get_port->inet_is_reserved_local_port]
/* Bitmap indexed by port number; defined elsewhere. */
extern unsigned long *sysctl_local_reserved_ports;

/*
 * Return non-zero when the bit for @port is set in
 * sysctl_local_reserved_ports, zero otherwise.
 */
static inline int inet_is_reserved_local_port(int port)
{
	return test_bit(port, sysctl_local_reserved_ports);
}
sysctl_local_reserved_ports在inet_init中被初始化,是一个8192字节的内存块,即65536位的位图,每个端口号对应1位。它用来标记被保留的端口号(由sysctl的ip_local_reserved_ports配置),自动选择端口时会跳过这些端口;它并不是用来保存已经被绑定的端口号的。
[bind->inet_bind->inet_csk_get_port]
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;struct inet_bind_hashbucket *head;struct inet_bind_bucket *tb;int smallest_size = -1, smallest_rover;kuid_t uid = sock_i_uid(sk);if (!snum) {int remaining, rover, low, high;again:...smallest_size = -1;do {...head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)];inet_bind_bucket_for_each(tb, &head->chain)if (net_eq(ib_net(tb), net) && tb->port == rover) {/* 列表项的fastreuse被设置,sk的sk_reuse被设置且状态不是TCP_LISTEN  * 或者列表项的fastreuseport大于0并且sk的sk_reuseport不为0并且两者的用户ID相同  * 上面满足后,找到列表项的num_owners最小的一个 */if ( (    ( tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN )        || (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)) )   && ( tb->num_owners < smallest_size || smallest_size == -1) ) {smallest_size = tb->num_owners;smallest_rover = rover;                                 /* 列表的bsockets值大于本地端口的范围                                                 * 在tb中无冲突,找到位置                                                 */ if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {snum = smallest_rover; goto tb_found; }                                           }                                           if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {                                                 snum = rover;                                                 goto tb_found;                                           }                                           goto next;                                  }                              break;                      next:                      next_nolock:                              if (++rover > high)                                   rover = low;                } while (--remaining > 0)
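所有被绑定的端口都通过一个结构(inet_bind_bucket)放在哈希表bhash的链表中。对rover所在的链表进行循环,如果某一项与当前socket在同一个网络命名空间并且端口号相同,进一步判断;如果链表中没有这样的项,则通过break跳出do/while循环,说明这个端口还没有被使用。当tb->fastreuse大于0时,表示该端口上的socket的sk_reuse都不为0且状态都不是TCP_LISTEN;当tb->fastreuseport大于0时,表示该端口上的socket的sk_reuseport都不为0且UID相同。满足复用条件时,记录num_owners最小的那个端口(smallest_rover);如果已绑定的socket数(bsockets)超过了本地端口范围的大小,并且该端口没有冲突,就直接使用smallest_rover。TCP时,icsk_af_ops为ipv4_specific,调用的接口为inet_csk_bind_conflict,它会对tb->owners列表中的每一个socket逐一进行检查。remaining初始化为(high - low) + 1,也就是说最多把本地端口范围内的每个端口检查一遍。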
[bind->inet_bind->inet_csk_get_port->bind_conflict]
/*
 * Check whether binding @sk to the port represented by @tb conflicts with a
 * socket that already owns that port.
 *
 * Walks tb->owners and stops at the first conflicting socket. Returns
 * non-zero when such a socket was found, zero when the whole list was
 * scanned without a conflict. @relax, when true, disables the second
 * (stricter) check below.
 */
int inet_csk_bind_conflict(const struct sock *sk,
			   const struct inet_bind_bucket *tb, bool relax)
{
	struct sock *sk2;
	int reuse = sk->sk_reuse;
	int reuseport = sk->sk_reuseport;
	kuid_t uid = sock_i_uid((struct sock *)sk);

	/*
	 * Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners list belong to the same net - the
	 * one this bucket belongs to.
	 */
	sk_for_each_bound(sk2, &tb->owners) {
		bool addr_overlap;

		/* A socket never conflicts with itself. */
		if (sk == sk2)
			continue;

		/*
		 * Sockets bound to two different devices cannot conflict.
		 * An unbound socket (sk_bound_dev_if == 0) overlaps with
		 * every device.
		 */
		if (sk->sk_bound_dev_if && sk2->sk_bound_dev_if &&
		    sk->sk_bound_dev_if != sk2->sk_bound_dev_if)
			continue;

		/*
		 * Addresses overlap when they are equal or when either one
		 * is zero.
		 */
		addr_overlap = !sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
			       sk2->sk_rcv_saddr == sk->sk_rcv_saddr;

		/*
		 * Neither sharing mechanism permits the bind:
		 *  - sk_reuse sharing fails if either socket has sk_reuse
		 *    clear or the owner is in TCP_LISTEN;
		 *  - sk_reuseport sharing fails if either socket has
		 *    sk_reuseport clear, or the owner is not in
		 *    TCP_TIME_WAIT and the two UIDs differ.
		 */
		if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) &&
		    (!reuseport || !sk2->sk_reuseport ||
		     (sk2->sk_state != TCP_TIME_WAIT &&
		      !uid_eq(uid, sock_i_uid(sk2)))) &&
		    addr_overlap)
			break;

		/*
		 * Strict mode only: both sockets have sk_reuse set and the
		 * owner is not listening, yet the addresses still overlap.
		 */
		if (!relax && reuse && sk2->sk_reuse &&
		    sk2->sk_state != TCP_LISTEN && addr_overlap)
			break;
	}

	/* sk2 is non-NULL only if the walk was cut short by a conflict. */
	return sk2 != NULL;
}
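遍历tb->owners列表,当列表中的某个socket(sk2)与传进来的socket(sk)不是同一个,并且满足下面的条件时,认为有冲突:两者绑定在同一个设备上,或者其中至少有一个没有绑定设备(sk_bound_dev_if为0);两者的接收地址相同,或者其中至少有一个的接收地址为0;并且下面两种情况之一成立:(1)两者的sk_reuse至少有一个为0或sk2的状态是TCP_LISTEN,同时,两者的sk_reuseport至少有一个为0,或sk2的状态不是TCP_TIME_WAIT且两者UID不同;(2)relax为false,两者的sk_reuse都不为0且sk2的状态不是TCP_LISTEN。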
[bind->inet_bind->inet_csk_get_port]
/* Exhausted local port range during search?  It is not * possible for us to be holding one of the bind hash * locks if this test triggers, because if 'remaining' * drops to zero, we broke out of the do/while loop at * the top level, not from the 'break;' statement. */ret = 1;if (remaining <= 0) {if (smallest_size != -1) {snum = smallest_rover;goto have_snum;}goto fail;}/* OK, here is the one we will use.  HEAD is * non-NULL and we hold it's mutex. */snum = rover;
如果remaining大于0,说明是通过break跳出循环的,找到了可用的端口rover。如果remaining小于等于0(整个端口范围都搜索完了),并且smallest_size不为-1,就将端口号设为刚才记录的smallest_rover然后进一步检查,否则就出错了。下面就是当snum不为0的情况:
} else {have_snum:head = &hashinfo->bhash[inet_bhashfn(net, snum, hashinfo->bhash_size)];inet_bind_bucket_for_each(tb, &head->chain)if (net_eq(ib_net(tb), net) && tb->port == snum)goto tb_found;}tb = NULL;goto tb_not_found;
这时简单,只要在列表中找同一网络命名空间下端口号相同的项就可以了。
tb_found:if (!hlist_empty(&tb->owners)) {if (sk->sk_reuse == SK_FORCE_REUSE)goto success;/* tb的fastreuse大于0并且sk的sk_reuse不为0且sk的状态不为TCP_LISTEN * 或tb的fastreuseport大于0并且sk的sk_reuseport不为0且tb的fastuid和sk的UID相同 */if (   (     (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN)           || (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)) )     && smallest_size == -1) {goto success;} else {ret = 1;if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {/* sk的sk_reuse不为0且状态不为TCP_LISTEN;或tb的fastreuseport大于0并且sk的sk_reuseport不为0且tb的fastuid和sk的UID相同 */if (    (     (sk->sk_reuse && sk->sk_state != TCP_LISTEN)            || (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)))      && smallest_size != -1 && --attempts >= 0) {spin_unlock(&head->lock);goto again;}goto fail_unlock;}}}
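如果tb->owners为空,说明这个端口上目前没有socket,不需要检查冲突,直接进入下面的tb_not_found处理。如果tb->owners不为空:sk的sk_reuse值为SK_FORCE_REUSE时,直接成功;满足注释中的快速复用条件并且smallest_size为-1时,也直接成功;否则调用bind_conflict(relax为true)检查冲突,有冲突时,如果满足复用条件、smallest_size不为-1并且attempts还没有用完,就解锁后跳回again重新选择端口,否则失败。attempts为5,也就是说不成功时最多重试5次。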
tb_not_found:ret = 1;if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, net, head, snum)) == NULL)goto fail_unlock;if (hlist_empty(&tb->owners)) {if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)tb->fastreuse = 1;elsetb->fastreuse = 0;if (sk->sk_reuseport) {tb->fastreuseport = 1;tb->fastuid = uid;} elsetb->fastreuseport = 0;} else {if (tb->fastreuse &&    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))tb->fastreuse = 0;if (tb->fastreuseport &&    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))tb->fastreuseport = 0;}
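当tb为NULL时,新建一个tb当作节点,创建失败则出错返回。如果tb->owners为空:当sk的状态不是TCP_LISTEN且sk_reuse不为0时,tb的fastreuse设为1,否则设为0;当sk的sk_reuseport不为0时,tb的fastreuseport设为1且fastuid设为sk的UID,否则fastreuseport设为0。如果tb->owners不为空,则根据sk的值决定是否把fastreuse和fastreuseport清0:sk的sk_reuse为0或状态为TCP_LISTEN时fastreuse清0;sk的sk_reuseport为0或UID与fastuid不同时fastreuseport清0。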
success:if (!inet_csk(sk)->icsk_bind_hash)inet_bind_hash(sk, tb, snum);WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);ret = 0;fail_unlock:spin_unlock(&head->lock);fail:local_bh_enable();return ret;}
最后将tb和端口号加入到列表中。
[bind->inet_bind->inet_csk_get_port->inet_bind_hash]
/*
 * Record that @sk now owns local port @snum through bind bucket @tb.
 *
 * Increments the hashinfo's bsockets counter, stores @snum as the socket's
 * inet_num, adds the socket at the head of tb->owners, increments
 * tb->num_owners and points the socket's icsk_bind_hash at @tb.
 */
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	atomic_inc(&sk->sk_prot->h.hashinfo->bsockets);

	inet_sk(sk)->inet_num = snum;

	hlist_add_head(&sk->sk_bind_node, &tb->owners);
	tb->num_owners += 1;

	inet_csk(sk)->icsk_bind_hash = tb;
}
设置socket的本地端口号为snum,sk通过sk_bind_node,加入到tb->owners中
[bind->inet_bind]
if (inet->inet_rcv_saddr)sk->sk_userlocks |= SOCK_BINDADDR_LOCK;if (snum)sk->sk_userlocks |= SOCK_BINDPORT_LOCK;inet->inet_sport = htons(inet->inet_num);inet->inet_daddr = 0;inet->inet_dport = 0;sk_dst_reset(sk);err = 0;out_release_sock:release_sock(sk);out:return err;}
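如果绑定了具体地址,设置SOCK_BINDADDR_LOCK标志;如果指定了端口号,设置SOCK_BINDPORT_LOCK标志。设置源端口号inet_sport为本地端口号inet_num(转成网络字节序),目的地址和目的端口清0,并把sk中的路由重置为空。到这里说明绑定成功,err置为0返回。注意前面的get_port返回0表示端口可以使用并且已经完成绑定;返回非0才说明端口已被占用,此时inet_bind返回-EADDRINUSE。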
0 0
原创粉丝点击