UDP传输队列长度sk_wmem_alloc统计

TCPIP协议 专栏收录该内容
105 篇文章 6 订阅

UDP协议使用sk_wmem_alloc统计当前UDP相关套接口发送缓存的占用。

统计初始化

在应用层创建套接口时,内核将新分配的套接口结构的成员变量sk_wmem_alloc初始化为1。

/*
 * Excerpt of sk_alloc(): obtain a sock from sk_prot_alloc() and, when the
 * allocation succeeds, start its sk_wmem_alloc counter at 1 instead of 0.
 * NOTE(review): the rest of the initialization and the return statement
 * appear to be elided from this excerpt.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern)
{
    struct sock *sk;

    sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
    if (sk) {
        /* Base unit of 1; sk_free() drops it with refcount_dec_and_test(). */
        refcount_set(&sk->sk_wmem_alloc, 1);
    }
}

 

增加sk_wmem_alloc统计

增加sk_wmem_alloc统计的基础函数为skb_set_owner_w。在发送流程相关的函数中,其能够将数据包skb占用的空间truesize添加到sk_wmem_alloc统计中,同时skb的销毁回调函数destructor赋值为sock_wfree函数,其将在skb销毁时,减去skb缓存长度相应的sk_wmem_alloc统计值。

/*
 * Excerpt of skb_set_owner_w(): charge @skb to the write side of @sk.
 * Installs sock_wfree as the skb destructor and adds skb->truesize to
 * sk->sk_wmem_alloc.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
    /* sock_wfree() subtracts the same truesize again when the skb is destroyed. */
    skb->destructor = sock_wfree;
    /*
     * We used to take a refcount on sk, but following operation
     * is enough to guarantee sk_free() wont free this sock until
     * all in-flight packets are completed
     */
    refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}

内核UDP的两个发送函数为udp_sendmsg和udp_sendpage,此处以发送数据函数udp_sendmsg为例,无论是非CORK模式的ip_make_skb函数或是CORK模式的ip_append_data函数,最终都会调用到__ip_append_data函数,前者利用__ip_append_data函数将数据包添加到自定义的队列中发送,后者将数据添加到通用的套接口发送队列中(sk_write_queue)。__ip_append_data在分配将数据由用户层拷贝到内核的skb缓存时,使用skb_set_owner_w增加sk_wmem_alloc的统计值。

/*
 * Excerpt of udp_sendmsg() showing the two transmit paths: the non-corking
 * path that builds one skb with ip_make_skb(), and the append path that
 * queues data with ip_append_data().
 * NOTE(review): local declarations (corkreq, up, fl4, ipc, rt, ulen, ...),
 * the "out" label and the return statement are elided from this excerpt.
 */
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
    /* Lockless fast path for the non-corking case. */
    if (!corkreq) {
        skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, msg->msg_flags);
        /* Preload err from the pointer; it is overwritten when skb is a valid buffer. */
        err = PTR_ERR(skb);
        if (!IS_ERR_OR_NULL(skb))
            err = udp_send_skb(skb, fl4);
        goto out;
    }
do_append_data:
    up->len += ulen;
    /* MSG_MORE is OR-ed into the flags only when corking was requested. */
    err = ip_append_data(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
    if (err)
        udp_flush_pending_frames(sk);
    else if (!corkreq)
        err = udp_push_pending_frames(sk);
    else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
        /* Nothing ended up on sk_write_queue, so clear the pending marker. */
        up->pending = 0;
}

函数__ip_append_data如下所示,其调用了两个skb缓存分配函数:sock_alloc_send_skb和sock_wmalloc。前者(经由sock_alloc_send_pskb)调用alloc_skb_with_frags函数分配指定的线性缓存和页面片段,分配成功后调用skb_set_owner_w增加sk_wmem_alloc统计。另外,如果发送设备支持SG特性(NETIF_F_SG标志),__ip_append_data将数据拷贝到skb的共享页面片段区,并且按拷贝长度增加sk_wmem_alloc的统计值。

/*
 * Excerpt of __ip_append_data() limited to the statements that affect
 * sk_wmem_alloc. The "..." in the parameter list and most of the loop body
 * are elisions made by the article, not valid C.
 */
static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, ...)
{
    while (length > 0) {
        /* copy <= 0: presumably no room is left in the current skb, so allocate one. */
        if (copy <= 0) {
            if (transhdrlen) {
                skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, (flags & MSG_DONTWAIT), &err);
            } else {
                /* Allocate only while the counter is at most twice sk_sndbuf; force is passed as 1. */
                if (refcount_read(&sk->sk_wmem_alloc) <= 2 * sk->sk_sndbuf)
                    skb = sock_wmalloc(sk, alloclen + hh_len + 15, 1, sk->sk_allocation);
            }
        }
        if (!(rt->dst.dev->features&NETIF_F_SG)) {
        } else {
            /* Device has NETIF_F_SG: grow frag i - 1 by copy bytes and charge them to the socket. */
            skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
            skb->truesize += copy;
            refcount_add(copy, &sk->sk_wmem_alloc);
        }
    }
}
/*
 * Excerpt of sock_alloc_send_pskb() (allocation tail only): allocate the skb
 * with alloc_skb_with_frags() and, on success, charge it to @sk through
 * skb_set_owner_w(). The "..." stands for parameters elided by the article.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, ...)
{   
    skb = alloc_skb_with_frags(header_len, data_len, max_page_order, errcode, sk->sk_allocation);
    /* Only a successfully allocated skb is added to sk_wmem_alloc. */
    if (skb)
        skb_set_owner_w(skb, sk);
    return skb;
}

而后者在分配skb缓存之前执行了两次判断。首先在函数__ip_append_data中,只有sk_wmem_alloc统计值小于等于两倍的套接口最大发送缓存值sk_sndbuf时,才会执行sock_wmalloc分配函数,否则返回无内存错误-ENOBUFS。其次是在函数sock_wmalloc中,如果强制分配参数force为真(__ip_append_data将其固定为1),或者sk_wmem_alloc统计值小于套接口指定的最大发送缓存值sk_sndbuf,则进行skb分配。最后同样使用skb_set_owner_w函数增加sk_wmem_alloc的统计值。

/*
 * Excerpt of sock_wmalloc(): allocate a @size byte skb for the write side of
 * @sk. The allocation is attempted when @force is non-zero or when
 * sk_wmem_alloc is still below sk_sndbuf; a successful skb is charged to the
 * socket via skb_set_owner_w() and returned.
 * NOTE(review): the failure return is elided from this excerpt.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority)
{               
    if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
        struct sk_buff *skb = alloc_skb(size, priority);
        if (skb) {
            skb_set_owner_w(skb, sk);
            return skb;
        }   
    }
}

最后,两个IP层发送函数ip_finish_output2和ip_do_fragment在有需要的情况下会分配新的skb缓存。对于前者而言,当skb缓存的头部可用空间不足以容纳二层协议头部信息时,需要重新分配一个skb缓存,并将新缓存的长度值添加到sk_wmem_alloc统计中。

/*
 * Excerpt of ip_finish_output2(): when the skb headroom is smaller than
 * hh_len and the device has header_ops, a copy with LL_RESERVED_SPACE(dev)
 * headroom is made and charged to the original skb's socket.
 * NOTE(review): declarations, error handling for skb2 and the actual
 * transmit logic are elided from this excerpt.
 */
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
        skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
        /* Account the replacement skb only if the original has an owning socket. */
        if (skb->sk)
            skb_set_owner_w(skb2, skb->sk);
    }
}

对于分片函数ip_do_fragment,在执行慢速路径中,需要分配新的skb缓存拷贝需分片的skb缓存的部分长度,内核将新分配的skb缓存长度增加到sk_wmem_alloc统计中。

/*
 * Excerpt of the ip_do_fragment() slow path: each loop iteration allocates a
 * new skb2 for one fragment, copies metadata from the original skb and, when
 * the original has an owning socket, charges skb2 to that socket.
 * NOTE(review): everything before the slow_path label, the data copy and the
 * loop bookkeeping (left, len, ...) are elided from this excerpt.
 */
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *))
{
slow_path:
    while (left > 0) {
        /* GFP_ATOMIC allocation sized for payload, header and ll_rs bytes. */
        skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
        ip_copy_metadata(skb2, skb);

        /* skb_set_owner_w() adds skb2->truesize to the socket's sk_wmem_alloc. */
        if (skb->sk)
            skb_set_owner_w(skb2, skb->sk);
    }
}

减少sk_wmem_alloc


UDP两个基本的sk_wmem_alloc统计值减少函数分别为:套接口释放函数sk_free和skb缓存释放函数sock_wfree,另外,相对应的TCP的skb缓存释放函数为__sock_wfree。由于在套接口创建时,sk_wmem_alloc初始化为1,如果sk_free将其减一之后值为0,表明套接口关联的发送队列已空,调用__sk_free释放套接口。否则,此套接口将在发送队列中最后一个skb缓存释放时被清除(调用__sk_free),参见函数sock_wfree。

/*
 * Drop one unit from sk->sk_wmem_alloc and release the socket with
 * __sk_free() only if that brought the counter down to zero.
 */
void sk_free(struct sock *sk)
{
    /* Counter still non-zero: sock_wfree() calls __sk_free() once it reaches zero. */
    if (!refcount_dec_and_test(&sk->sk_wmem_alloc))
        return;

    __sk_free(sk);
}

/*
 * skb destructor installed by skb_set_owner_w(): give skb->truesize back to
 * the owning socket's sk_wmem_alloc and free the socket via __sk_free() if
 * the counter reaches zero.
 */
void sock_wfree(struct sk_buff *skb)
{
    struct sock *sk = skb->sk;
    unsigned int len = skb->truesize;

    if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
        /*
         * Subtract all but one unit first so the counter stays non-zero
         * across the sk_write_space() callback; WARN_ON fires if this
         * partial subtraction already reaches zero.
         */
        WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
        sk->sk_write_space(sk);
        /* Only the unit kept back above remains to be released. */
        len = 1;
    }
    /* Reaching zero here means nothing else holds the counter: free the socket. */
    if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
        __sk_free(sk);
}

统计值判断控制

如前所述的UDP发送路径上的skb分配函数sock_alloc_send_pskb,如果sk_wmem_alloc统计值低于套接口限定的发送缓存最大值sk_sndbuf,直接进行skb分配。否则,设置套接口的空间不足标志SOCKWQ_ASYNC_NOSPACE,如果用户设置了不需等待标志MSG_DONTWAIT立即返回错误码-EAGAIN,反之,等待缓存可用,参见函数sock_wait_for_wmem。

需要注意sk_wmem_alloc_get函数,其返回值为sk_wmem_alloc统计值减去1,由于在套接口分配时sk_wmem_alloc初始化为1,此处减去1意味着取出的是发送缓存skb及数据占用的空间长度。

/*
 * Excerpt of sock_alloc_send_pskb() (wait loop): loop until the write memory
 * reported by sk_wmem_alloc_get() is below sk_sndbuf, then allocate the skb.
 * NOTE(review): the "failure" and "interrupted" labels, local declarations
 * and the return path are elided; "..." stands for elided parameters.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, ...)
{ 
    /* presumably yields 0 when noblock is set; verify against sock_sndtimeo() */
    timeo = sock_sndtimeo(sk, noblock);
    for (;;) { 
        /* Room available: leave the loop and allocate. */
        if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
            break;
        
        /* No room: flag the shortage on both the sock and the socket. */
        sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
        err = -EAGAIN;
        /* A zero timeout means do not wait: bail out with -EAGAIN set above. */
        if (!timeo)
            goto failure;
        if (signal_pending(current))
            goto interrupted;
        /* Wait in sock_wait_for_wmem(); its return value becomes the new timeout. */
        timeo = sock_wait_for_wmem(sk, timeo);
    }
    skb = alloc_skb_with_frags(header_len, data_len, max_page_order, errcode, sk->sk_allocation);
}
/*
 * Return the socket's sk_wmem_alloc counter minus one, i.e. without the
 * initial unit that sk_alloc() sets on a freshly allocated sock.
 */
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
    /* Keep the subtraction in unsigned arithmetic, as refcount_read() returns unsigned int. */
    const unsigned int charged = refcount_read(&sk->sk_wmem_alloc);

    return charged - 1;
}

直到sk_wmem_alloc统计值降低到小于sk_sndbuf、等待超时或者当前进程有信号待处理,sock_wait_for_wmem函数才会退出。

/*
 * Excerpt of sock_wait_for_wmem(): sleep interruptibly until sk_wmem_alloc
 * drops below sk_sndbuf, the timeout is used up, or a signal is pending.
 * NOTE(review): the declaration of "wait", the wait-queue cleanup and the
 * return of the remaining timeout are elided from this excerpt.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
    sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
    for (;;) {
        /* Timeout exhausted. */
        if (!timeo)
            break;
        if (signal_pending(current))
            break;
        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
        /* Queue on the socket's wait queue before re-checking the counter. */
        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
        /* Compares the raw counter (including the initial unit of 1) with sk_sndbuf. */
        if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
            break;
        /* schedule_timeout() result replaces the remaining timeout. */
        timeo = schedule_timeout(timeo);
    }
}

另外,在内核网络协议中最重要的缓存空间判断函数__sk_mem_raise_allocated中,对于UDP协议(非SOCK_STREAM类型套接口)而言,如果sk_wmem_alloc统计值小于系统限定的协议最小值(/proc/sys/net/ipv4/udp_wmem_min),则允许内存分配。

/*
 * Excerpt of __sk_mem_raise_allocated() reduced to the send-side check that
 * reads sk_wmem_alloc: for a socket whose type is not SOCK_STREAM, return 1
 * when the counter is below the value obtained from sk_get_wmem0().
 * NOTE(review): the receive branch, the SOCK_STREAM branch, the declaration
 * of "prot" and all other return paths are elided from this excerpt.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
    if (kind == SK_MEM_RECV) {
    } else { /* SK_MEM_SEND */
        int wmem0 = sk_get_wmem0(sk, prot);

        if (sk->sk_type == SOCK_STREAM) {
        } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
                /* Write memory in use is below wmem0: return 1. */
                return 1;
        }
    }
}

最后,在函数sock_def_write_space中,如果发送缓存空间sk_sndbuf大于等于sk_wmem_alloc统计值的两倍,说明套接口已有足够的空间,唤醒等待在套接口队列上的进程;对于异步等待的进程,使用函数sock_writeable再次确认sk_wmem_alloc统计值低于sk_sndbuf的一半大小之后,再将其唤醒。此函数在skb缓存释放函数sock_wfree和用户层重设置最大发送缓存空间sk_sndbuf值的函数sock_setsockopt中调用。

/*
 * Write-space notification: once twice the sk_wmem_alloc counter no longer
 * exceeds sk_sndbuf, wake sleepers on the socket wait queue and, if
 * sock_writeable() agrees, notify asynchronous waiters as well.
 */
static void sock_def_write_space(struct sock *sk)
{
    struct socket_wq *wq;

    /* Still too much write memory in flight: nothing to announce yet. */
    if ((refcount_read(&sk->sk_wmem_alloc) << 1) > sk->sk_sndbuf)
        return;

    wq = rcu_dereference(sk->sk_wq);
    if (skwq_has_sleeper(wq)) {
        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND);
    }

    /* Should agree with poll, otherwise some programs break */
    if (sock_writeable(sk)) {
        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
    }
}
/*
 * Report whether the socket counts as writeable: true when the
 * sk_wmem_alloc counter is below half of sk_sndbuf.
 */
static inline bool sock_writeable(const struct sock *sk)
{
    return (sk->sk_sndbuf >> 1) > refcount_read(&sk->sk_wmem_alloc);
}
/*
 * Excerpt of sock_setsockopt() limited to the SO_SNDBUF case: clamp the
 * requested value, store twice that value (at least SOCK_MIN_SNDBUF) in
 * sk_sndbuf, then invoke the socket's sk_write_space callback.
 * NOTE(review): reading "val" from optval, the declaration of "sk", the
 * other options and the return statement are elided from this excerpt.
 */
int sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
    switch (optname) {
    case SO_SNDBUF:
        /* Cap the request at sysctl_wmem_max. */
        val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
        sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
        /* The stored limit is double the (clamped) request, never below SOCK_MIN_SNDBUF. */
        sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
        /* Wake up sending tasks if we upped the value. */
        sk->sk_write_space(sk);
        break;
    } 
}

sk_wmem_alloc获取

应用层可通过IOCTL命令SIOCOUTQ获取到当前套接口的sk_wmem_alloc统计值,内核处理函数为如下udp_ioctl。

/*
 * Excerpt of udp_ioctl() limited to the SIOCOUTQ command: copy the value of
 * sk_wmem_alloc_get(sk) to the user-space int that @arg points to and return
 * the result of put_user().
 * NOTE(review): handling of other commands and the fall-through return
 * appear to be elided from this excerpt.
 */
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
    switch (cmd) {
    case SIOCOUTQ:
    {
        /* sk_wmem_alloc_get() already excludes the initial unit of 1. */
        int amount = sk_wmem_alloc_get(sk);
        return put_user(amount, (int __user *)arg);
    }
    }
}

另外,通过getsockopt接口选项SO_MEMINFO可获得所有的套接口缓存统计信息,其中包括sk_wmem_alloc统计值。UDP的diag接口也可获取到sk_wmem_alloc统计值,见函数udp_diag_get_info。

/*
 * Excerpt of sock_getsockopt() limited to the SO_MEMINFO case: fill a local
 * array of SK_MEMINFO_VARS counters through sk_get_meminfo().
 * NOTE(review): the declaration of "sk", copying meminfo out to optval, the
 * other options and the return statement are elided from this excerpt.
 */
int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
{
    switch (optname) {
    case SO_MEMINFO:
    {
        u32 meminfo[SK_MEMINFO_VARS];

        /* One of the filled entries is SK_MEMINFO_WMEM_ALLOC. */
        sk_get_meminfo(sk, meminfo);
    }
    }
}
/*
 * Fill @mem (an array of SK_MEMINFO_VARS u32 entries) with a snapshot of the
 * memory counters of @sk. The array is zeroed first, then each known slot is
 * written from the corresponding sock field or accessor.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
    memset(mem, 0, SK_MEMINFO_VARS * sizeof(mem[0]));

    /* Configured buffer sizes. */
    mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
    mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;

    /* Currently charged receive and write memory. */
    mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
    mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);

    /* Remaining counters. */
    mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
    mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
    mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
    mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
    mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
/*
 * Store the socket's queue usage in the diag message @r: idiag_wqueue gets
 * sk_wmem_alloc_get(sk) and idiag_rqueue gets sk_rmem_alloc_get(sk).
 * @info is not used here.
 */
static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info)
{
    r->idiag_wqueue = sk_wmem_alloc_get(sk);
    r->idiag_rqueue = sk_rmem_alloc_get(sk);
}

 

内核版本 4.15

 

  • 2
    点赞
  • 0
    评论
  • 2
    收藏
  • 一键三连
    一键三连
  • 扫一扫,分享海报

相关推荐
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值