The TCP Bind Socket Hash Table

The kernel's global variable tcp_hashinfo has a member bhash that holds all bound sockets system-wide. bhash is an array indexed by hash value, where the function inet_bhashfn computes the hash from the port number and the network namespace. Each array element heads a list (the chain list) of inet_bind_bucket structures that share the same hash value.

struct inet_hashinfo tcp_hashinfo;

struct inet_hashinfo {
    ...
	
    /* Ok, let's try this, I give up, we do need a local binding
     * TCP hash as well as the others for fast bind/connect.
     */
    struct kmem_cache       *bind_bucket_cachep;
    struct inet_bind_hashbucket *bhash;
    unsigned int            bhash_size;
    ...
};

struct inet_bind_hashbucket {
    spinlock_t      lock;
    struct hlist_head   chain;
};

Each inet_bind_bucket is identified by its port member. Its owners list links every socket bound to that port, including TCP child sockets, TIMEWAIT sockets, and so on.

struct inet_bind_bucket {
    possible_net_t      ib_net;
    int         l3mdev;
    unsigned short      port;
    ...
    struct hlist_node   node;
    struct hlist_head   owners;
};

During TCP initialization, the kmem_cache used for bind buckets is created and stored in bind_bucket_cachep; it is used later to allocate inet_bind_bucket structures. Then (tcp_hashinfo.ehash_mask + 1) inet_bind_hashbucket structures are allocated, with the count capped at 64K; (ehash_mask + 1) is the number of hash buckets allocated for the established hash table (which also holds sockets in a few other states).

void __init tcp_init(void)
{
    ...
    tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
                  sizeof(struct inet_bind_bucket), 0,
                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

    tcp_hashinfo.bhash =
        alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket),
                    tcp_hashinfo.ehash_mask + 1,
                    17, /* one slot per 128 KB of memory */
                    0,
                    &tcp_hashinfo.bhash_size,
                    NULL,
                    0,
                    64 * 1024);
    tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
    for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
        spin_lock_init(&tcp_hashinfo.bhash[i].lock);
        INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
    }

The value alloc_large_system_hash returns in tcp_hashinfo.bhash_size is a shift amount; shifting 1 left by it yields the number of inet_bind_hashbucket structures actually allocated. The loop then initializes each bucket's lock and chain list head.
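
Because of that shift, bhash_size is always a power of two, which lets the hash function reduce to a simple mask. For reference, this is inet_bhashfn as defined in include/net/inet_hashtables.h in 5.10:

static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
                               const u32 bhash_size)
{
    /* Mix the namespace into the hash so the same port in different
     * namespaces usually lands in different buckets. */
    return (lport + net_hash_mix(net)) & (bhash_size - 1);
}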

A bind bucket, struct inet_bind_bucket, is allocated by the function inet_bind_bucket_create below, which also links it onto the corresponding bind hash chain.

struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
                         struct net *net, struct inet_bind_hashbucket *head,
                         const unsigned short snum, int l3mdev)
{
    struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

    if (tb) {
        write_pnet(&tb->ib_net, net);
        tb->l3mdev    = l3mdev;
        tb->port      = snum;
        tb->fastreuse = 0;
        tb->fastreuseport = 0;
        INIT_HLIST_HEAD(&tb->owners);
        hlist_add_head(&tb->node, &head->chain);
    }
    return tb;
}

TCP Client

When user space calls connect (kernel function tcp_v4_connect), the function __inet_hash_connect below ensures the binding is unique. It first checks whether a local source port has already been chosen: if port is nonzero and the current socket is the only socket bound to that port, the port can be used and processing ends.

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
        struct sock *sk, u32 port_offset,...)
{
    int port = inet_sk(sk)->inet_num;

    if (port) {
        head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
        tb = inet_csk(sk)->icsk_bind_hash;
        spin_lock_bh(&head->lock);
        if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
            inet_ehash_nolisten(sk, NULL, NULL);
            spin_unlock_bh(&head->lock);
            return 0;
        }
        spin_unlock(&head->lock);
        /* No definite answer... Walk to established hash table */
        ret = check_established(death_row, sk, port, NULL);
        local_bh_enable();
        return ret;
    }

If no port was specified, the kernel picks a source port itself, walking the candidate ports and, for each one, the corresponding global hash chain to check availability.

    for (i = 0; i < remaining; i += 2, port += 2) {
        if (unlikely(port >= high))
            port -= remaining;
        if (inet_is_local_reserved_port(net, port))
            continue;
        head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
        spin_lock_bh(&head->lock);

        /* Does not bother with rcv_saddr checks, because
         * the established check is already unique enough.
         */
        inet_bind_bucket_for_each(tb, &head->chain) {
            if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && tb->port == port) {
                if (tb->fastreuse >= 0 || tb->fastreuseport >= 0)
                    goto next_port;
                WARN_ON(hlist_empty(&tb->owners));
                if (!check_established(death_row, sk, port, &tw))
                    goto ok;
                goto next_port;
            }
        }

If the walk over the chain ends without finding an inet_bind_bucket for this port, no socket currently uses it, so a new inet_bind_bucket is created for the port.

        tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev);
        if (!tb) {       
            spin_unlock_bh(&head->lock);
            return -ENOMEM;
        }   
        goto ok;
    ...

ok:
    hint += i + 2;

    /* Head lock still held and bh's disabled */
    inet_bind_hash(sk, tb, port);

Finally, the inet_bind_hash function below links the socket onto the owners list of the inet_bind_bucket and points the socket's icsk_bind_hash member at the bucket, tying the two together.

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
            const unsigned short snum)
{
    inet_sk(sk)->inet_num = snum;
    sk_add_bind_node(sk, &tb->owners);
    inet_csk(sk)->icsk_bind_hash = tb;
}
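
From user space, this whole path is triggered simply by connecting without an explicit bind. A minimal sketch (192.0.2.1:80 is a placeholder peer): after connect() succeeds, getsockname() reveals the source port that __inet_hash_connect selected.

#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in peer = { .sin_family = AF_INET,
                                .sin_port = htons(80) };
    inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);

    /* No bind(): the kernel chooses the source port inside
     * tcp_v4_connect() -> __inet_hash_connect(). */
    if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) == 0) {
        struct sockaddr_in local;
        socklen_t len = sizeof(local);
        getsockname(fd, (struct sockaddr *)&local, &len);
        printf("source port: %d\n", ntohs(local.sin_port));
    }
    close(fd);
    return 0;
}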

TCP Server

A user-space listen call maps to the kernel function inet_listen, which calls inet_csk_listen_start to do the work.

int inet_listen(struct socket *sock, int backlog)
{
    struct sock *sk = sock->sk;

    ...
    /* Really, if the socket is already in listen state
     * we can only allow the backlog to be adjusted.
     */
    if (old_state != TCP_LISTEN) {
        ...
        err = inet_csk_listen_start(sk, backlog);
        if (err) goto out;

The core function sk->sk_prot->get_port determines whether the listening port is available. The port-related paths in __inet_bind and inet_autobind also rely on get_port to settle the port number.

int inet_csk_listen_start(struct sock *sk, int backlog)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct inet_sock *inet = inet_sk(sk);

    ...
    /* There is race window here: we announce ourselves listening,
     * but this transition is still not validated by get_port().
     * It is OK, because this socket enters to hash table only
     * after validation is complete.
     */
    inet_sk_state_store(sk, TCP_LISTEN);
    if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
        inet->inet_sport = htons(inet->inet_num);

The get_port function registered for TCP is inet_csk_get_port, shown below. If the snum argument is zero, any available local port may be chosen; the kernel tries to pick an odd port here, leaving even ports to be allocated during the connect system call. If an available port is found and its inet_bind_bucket has already been allocated, success is returned directly; otherwise a new inet_bind_bucket is created.

int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
    bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
    struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
    int ret = 1, port = snum;
    struct inet_bind_hashbucket *head;
    struct net *net = sock_net(sk);
    struct inet_bind_bucket *tb = NULL;
    int l3mdev;

    l3mdev = inet_sk_bound_l3mdev(sk);

    if (!port) {
        head = inet_csk_find_open_port(sk, &tb, &port);
        if (!head)
            return ret;
        if (!tb)
            goto tb_not_found;
        goto success;
    }

Here the snum argument is nonzero: the chain for the port is walked to see whether an inet_bind_bucket already exists. If one is found, control jumps to the tb_found label to judge whether the address is usable; otherwise a new inet_bind_bucket is created, whose owners list is initially empty.

For the case where no port was specified and the kernel chose one itself, a new inet_bind_bucket is likewise created here, via the tb_not_found label.

    head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
    spin_lock_bh(&head->lock);
    inet_bind_bucket_for_each(tb, &head->chain)
        if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
            tb->port == port)
            goto tb_found;
tb_not_found:
    tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev);
    if (!tb)
        goto fail_unlock;

Only when snum is nonzero and a matching inet_bind_bucket already exists can its owners list be non-empty, in which case a conflict check is needed. For a freshly created inet_bind_bucket, the inet_bind_hash function simply associates it with the socket.

tb_found:
    if (!hlist_empty(&tb->owners)) {
        if (sk->sk_reuse == SK_FORCE_REUSE)
            goto success;

        if ((tb->fastreuse > 0 && reuse) || sk_reuseport_match(tb, sk))
            goto success;
        if (inet_csk_bind_conflict(sk, tb, true, true))
            goto fail_unlock;
    }
success:
    inet_csk_update_fastreuse(tb, sk);

    if (!inet_csk(sk)->icsk_bind_hash)
        inet_bind_hash(sk, tb, port);
    WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); 
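
From user space, the snum == 0 path can be exercised by binding to port 0 and reading back the kernel's choice. A minimal sketch:

#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr = { .sin_family = AF_INET,
                                .sin_addr.s_addr = htonl(INADDR_ANY),
                                .sin_port = 0 }; /* let the kernel pick */

    /* bind() with port 0 reaches inet_csk_get_port() with snum == 0,
     * which prefers an odd ephemeral port. */
    bind(fd, (struct sockaddr *)&addr, sizeof(addr));

    socklen_t len = sizeof(addr);
    getsockname(fd, (struct sockaddr *)&addr, &len);
    printf("listening port: %d\n", ntohs(addr.sin_port));

    listen(fd, 128);
    close(fd);
    return 0;
}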

For TCP child sockets, allocated once the three-way handshake completes, the function __inet_inherit_port links the new child socket onto the correct bind hash chain.

struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                  struct request_sock *req, struct dst_entry *dst,
                  struct request_sock *req_unhash, bool *own_req)
{

    newsk = tcp_create_openreq_child(sk, req, skb);
    if (!newsk)
        goto exit_nonewsk;

    ...
    if (__inet_inherit_port(sk, newsk) < 0)
        goto put_and_exit;
    *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
                       &found_dup_sk);
    if (likely(*own_req)) {
        tcp_move_syn(newtp, req);
        ireq->ireq_opt = NULL;
    }

With transparent proxying (tproxy) or redirection, a packet may be accepted by a proxy socket listening on a different port. In that case the bucket for the child's port must be looked up; if it does not exist, a new inet_bind_bucket is created.

Finally, inet_bind_hash links the child socket onto the owners list of the (possibly newly created) inet_bind_bucket.

int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
    struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
    unsigned short port = inet_sk(child)->inet_num;
    const int bhash = inet_bhashfn(sock_net(sk), port, table->bhash_size);
    struct inet_bind_hashbucket *head = &table->bhash[bhash];
    struct inet_bind_bucket *tb;
    int l3mdev;

    spin_lock(&head->lock);
    tb = inet_csk(sk)->icsk_bind_hash;
    if (unlikely(!tb)) {
        spin_unlock(&head->lock);
        return -ENOENT;
    }
    if (tb->port != port) {
        l3mdev = inet_sk_bound_l3mdev(sk);

        /* NOTE: using tproxy and redirecting skbs to a proxy
         * on a different listener port breaks the assumption
         * that the listener socket's icsk_bind_hash is the same
         * as that of the child socket. We have to look up or
         * create a new bind bucket for the child here. */
        inet_bind_bucket_for_each(tb, &head->chain) {
            if (net_eq(ib_net(tb), sock_net(sk)) &&
                tb->l3mdev == l3mdev && tb->port == port)
                break;
        }
        if (!tb) {
            tb = inet_bind_bucket_create(table->bind_bucket_cachep,
                             sock_net(sk), head, port, l3mdev);
            if (!tb) {
                spin_unlock(&head->lock);
                return -ENOMEM;
            }
        }
        inet_csk_update_fastreuse(tb, child);
    }
    inet_bind_hash(child, tb, port);
    spin_unlock(&head->lock);

    return 0;
}
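
The tproxy case noted in the comment involves a listener with IP_TRANSPARENT set, which lets it accept redirected connections whose original destination port differs from its listening port. A minimal sketch of such a listener (port 3128 is a placeholder; the matching iptables TPROXY rule and accept loop are omitted):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int one = 1;

    /* IP_TRANSPARENT (requires CAP_NET_ADMIN) allows accepting
     * redirected connections addressed to a foreign port -- exactly
     * the situation __inet_inherit_port() handles by giving the
     * child socket its own bind bucket. */
    setsockopt(fd, SOL_IP, IP_TRANSPARENT, &one, sizeof(one));

    struct sockaddr_in addr = { .sin_family = AF_INET,
                                .sin_addr.s_addr = htonl(INADDR_ANY),
                                .sin_port = htons(3128) };
    bind(fd, (struct sockaddr *)&addr, sizeof(addr));
    listen(fd, 128);
    close(fd);
    return 0;
}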

TIMEWAIT Sockets

After a new TIMEWAIT socket is allocated, the function inet_twsk_hashdance links it into the bind hash.

void tcp_time_wait(struct sock *sk, int state, int timeo)
{
    const struct inet_connection_sock *icsk = inet_csk(sk);
    const struct tcp_sock *tp = tcp_sk(sk);
    struct inet_timewait_sock *tw;
    struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

    tw = inet_twsk_alloc(sk, tcp_death_row, state);
    if (tw) {
        ...
        inet_twsk_schedule(tw, timeo);
        /* Linkage updates.
         * Note that access to tw after this point is illegal.
         */
        inet_twsk_hashdance(tw, sk, &tcp_hashinfo);

It links the newly created TIMEWAIT-state socket onto the owners list of the original socket's bind bucket.

void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
               struct inet_hashinfo *hashinfo)
{
    const struct inet_sock *inet = inet_sk(sk);
    const struct inet_connection_sock *icsk = inet_csk(sk);
    struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
    spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
    struct inet_bind_hashbucket *bhead;
	
    /* Step 1: Put TW into bind hash. Original socket stays there too.
       Note, that any socket with inet->num != 0 MUST be bound in
       binding cache, even if it is closed.
     */
    bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)];
    spin_lock(&bhead->lock);
    tw->tw_tb = icsk->icsk_bind_hash;
    WARN_ON(!icsk->icsk_bind_hash);
    inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
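
Because the TIMEWAIT socket keeps occupying the bind bucket, restarting a server right after it exits can fail in bind() with EADDRINUSE until the TIME_WAIT entry expires. The usual remedy, as a sketch (port 8080 is a placeholder):

#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int one = 1;

    /* Without SO_REUSEADDR, inet_csk_get_port() finds the old
     * TIMEWAIT socket on the bucket's owners list and reports a
     * conflict; with it, the rebind succeeds. */
    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

    struct sockaddr_in addr = { .sin_family = AF_INET,
                                .sin_addr.s_addr = htonl(INADDR_ANY),
                                .sin_port = htons(8080) };
    bind(fd, (struct sockaddr *)&addr, sizeof(addr));
    listen(fd, 128);
    close(fd);
    return 0;
}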

Socket Destruction

If icsk_bind_hash is still set at destruction time, inet_put_port does the cleanup.

void tcp_v4_destroy_sock(struct sock *sk)
{

    /* Clean up a referenced TCP bind bucket. */
    if (inet_csk(sk)->icsk_bind_hash)
        inet_put_port(sk);

When the socket is released, __inet_put_port, below, gives its port back: the socket is removed from the owners list of its inet_bind_bucket, its icsk_bind_hash pointer is cleared, and inet_num is reset to zero.

If this leaves the owners list of the inet_bind_bucket empty, the bucket is removed from the chain list of its inet_bind_hashbucket and freed.

static void __inet_put_port(struct sock *sk)
{
    struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
    const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
            hashinfo->bhash_size);
    struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
    struct inet_bind_bucket *tb;

    spin_lock(&head->lock);
    tb = inet_csk(sk)->icsk_bind_hash;
    __sk_del_bind_node(sk);
    inet_csk(sk)->icsk_bind_hash = NULL;
    inet_sk(sk)->inet_num = 0;
    inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
    spin_unlock(&head->lock);
}
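
The empty-list check described above lives in inet_bind_bucket_destroy itself; in 5.10 it is simply:

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
    if (hlist_empty(&tb->owners)) {
        __hlist_del(&tb->node);
        kmem_cache_free(cachep, tb);
    }
}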

Kernel version: 5.10
