TCP之QUICKACK

路由系统 同时被 2 个专栏收录
31 篇文章 1 订阅
105 篇文章 6 订阅

当TCP套接口的ACK策略处于QUICKACK模式时,意味着TCP套接口将尝试立即回复对端ACK确认报文。

配置


用户层可通过setsockopt系统调用的选项TCP_QUICKACK开启QUICKACK模式,但是这不是永久生效的,内核根据之后的报文处理,可能退出此模式。在QUICKACK模式开启之后,如果套接口存在需要调度的ACK,调用tcp_cleanup_rbuf函数进一步检查ACK是否需要发送。

/*
 * Excerpt of do_tcp_setsockopt(): only the TCP_QUICKACK case is shown. The
 * enclosing switch on optname and the code that sets up val and icsk are
 * elided from this listing.
 *
 * A zero val sets icsk_ack.pingpong to 1; a non-zero val clears it and, when
 * an ACK is already scheduled on an ESTABLISHED/CLOSE_WAIT socket, asks
 * tcp_cleanup_rbuf() to consider sending it.
 */
static int do_tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen)
{
    case TCP_QUICKACK:
        if (!val) {
            icsk->icsk_ack.pingpong = 1;
        } else {
            icsk->icsk_ack.pingpong = 0;
            /* Act only in ESTABLISHED or CLOSE_WAIT, and only if an ACK is already scheduled. */
            if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) {
                /* Mark the ACK as pushed, then let tcp_cleanup_rbuf() decide whether to send it. */
                icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
                tcp_cleanup_rbuf(sk, 1);
                /* A val with its low bit clear re-enables pingpong after the tcp_cleanup_rbuf() call. */
                if (!(val & 1)) icsk->icsk_ack.pingpong = 1;
            }
        }
        break;
}

使用路由命令控制匹配路由的TCP数据流开启quickack模式:

$ sudo ip route add 10.10.0.0/24 via 192.168.1.1 quickack 1
$ 
$ ip -d route
unicast 10.10.0.0/24 via 192.168.1.1 dev eth0 proto boot scope global  quickack 1
unicast 192.168.1.0/24 dev eth0 proto kernel scope link src 192.168.1.103 

如果路由的metric中开启了RTAX_QUICKACK,或者QUICK ACK的配额还未消耗完并且当前连接不是交互类型,判定处在QUICKACK模式。

/* Send ACKs quickly, if "quick" count is not exhausted and the session is not interactive. */
static bool tcp_in_quickack_mode(struct sock *sk)
{
    const struct inet_connection_sock *icsk = inet_csk(sk);
    const struct dst_entry *dst = __sk_dst_get(sk);

    return (dst && dst_metric(dst, RTAX_QUICKACK)) || (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
}

内核将一个套接口的ACK模式设置为QUICKACK,意味着更新QUICK ACK的配额数值,关闭交互模式pingpong,初始化ACK超时时间为TCP_ATO_MIN(40毫秒)。

/*
 * Put the socket into quick-ACK mode: top up the quick-ACK budget via
 * tcp_incr_quickack(), clear the interactive (pingpong) flag and set the
 * ACK timeout estimate to TCP_ATO_MIN.
 */
static void tcp_enter_quickack_mode(struct sock *sk)
{
    struct inet_connection_sock *icsk;

    tcp_incr_quickack(sk);

    icsk = inet_csk(sk);
    icsk->icsk_ack.ato = TCP_ATO_MIN;
    icsk->icsk_ack.pingpong = 0;
}

当前套接口接收窗口除以两倍的对端MSS值的结果作为新的quick ack配额数值;如果其值等于0,则更正为2,即最少给予两个配额。只有在新值大于当前的quick ack配额时才会更新,并且新配额不能大于16个(TCP_MAX_QUICKACKS)。

/*
 * Raise the socket's quick-ACK budget (icsk_ack.quick).
 *
 * The candidate budget is rcv_wnd / (2 * rcv_mss), with a floor of 2 when
 * that division yields 0. The stored budget is only ever increased here,
 * and the value written is capped at TCP_MAX_QUICKACKS.
 */
static void tcp_incr_quickack(struct sock *sk)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    unsigned int budget;

    budget = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
    if (!budget)
        budget = 2;

    /* Never shrink an existing budget. */
    if (budget <= icsk->icsk_ack.quick)
        return;

    icsk->icsk_ack.quick = min(budget, TCP_MAX_QUICKACKS);
}
/*
 * Maximal number of ACKs sent quickly to accelerate slow-start.
 * Used as the upper bound when tcp_incr_quickack() writes icsk_ack.quick.
 */
#define TCP_MAX_QUICKACKS   16U

进入QUICKACK模式


对于接收到的重复数据,即报文的结束序号在待接收序号之前的数据,表明对端有可能发送序号混乱,所以需要立即回复一个ACK进行序号纠正,进入QUICKACK模式。另外,如果报文的开始序号超出接收窗口的最大序号,表明接收到了超出窗口范围的数据,同样进入QUICKACK模式。最后,可能接收到的报文为在窗口内的乱序报文,同样进入QUICKACK模式。以上可见,所有异常接收情况,都进入QUICKACK模式以便快速纠正,而在接收到保序的正常报文后,则不必进入QUICKACK模式。

/*
 * Excerpt of tcp_data_queue(): shows only the paths that enter quick-ACK
 * mode. The body of the in-sequence fast path is elided ("...").
 *
 * As shown here, quick-ACK mode is entered for:
 *   - a segment that ends at or before rcv_nxt (treated as a retransmit),
 *   - a segment that starts at or beyond rcv_nxt + receive window,
 *   - any other segment that is not exactly in sequence.
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);

    /* Queue data for delivery to the user.  Packets in sequence go to the receive queue. Out of sequence packets to the out_of_order_queue. */
    if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
	    ...
        return;
    }
    if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
        /* A retransmit, 2nd most common case.  Force an immediate ack. */
        NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
        tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

        /* Shared tail: also reached by goto from the out-of-window check below. */
out_of_window:
        tcp_enter_quickack_mode(sk);
        inet_csk_schedule_ack(sk);
drop:
        tcp_drop(sk, skb);
        return;
    }
    /* Out of window. F.e. zero window probe. */
    if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
        goto out_of_window;

    /* Remaining case: the segment is neither in sequence, nor entirely old, nor beyond the window. */
    tcp_enter_quickack_mode(sk);
}

对于TCP客户端而言,在连接建立过程中接收到服务端的SYN+ACK报文之后,如果本地套接口有数据正等待发送sk_write_pending,或者属于交互类型套接口pingpong(意味着马上将会有数据发送),再或者开启了延迟ACCEPT功能(将尝试将数据与ACK合并在一个报文中),进入QUICKACK模式。

/*
 * Excerpt of tcp_rcv_synsent_state_process(): only the handling of a segment
 * with the ACK bit set is shown. The declaration of icsk and the
 * discard_and_undo label are outside this listing.
 *
 * In the part shown: returns 0 when the final ACK of the handshake is
 * deferred (and the skb dropped), -1 after sending the ACK immediately.
 */
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
{
    if (th->ack) {
        /* An ACK without SYN is not accepted here. */
        if (!th->syn)
            goto discard_and_undo;

        /* Defer the ACK when a write is pending, defer-accept is set, or the socket is in pingpong mode. */
        if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) {
            /* Save one ACK. Data will be ready after several ticks, if write_pending is set.
             * It may be deleted, but with this feature tcpdumps look so _wonderfully_ clever, that I was not able
             * to stand against the temptation 8)     --ANK
             */
            inet_csk_schedule_ack(sk);
            tcp_enter_quickack_mode(sk);
            /* Arm the delayed-ACK timer with TCP_DELACK_MAX. */
            inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
            tcp_drop(sk, skb);
            return 0;
        } else {
            tcp_send_ack(sk);
        }
        return -1;
    }
}

在做数据报文接收检查时,如果其未能通过PAWS检查,并且不是RST复位报文(RST报文不受PAWS约束),发送一个重复的ACK报文给对端。或者如果报文的数据未在接收窗口之内,并且不是RST报文也不是SYN报文,回复重复的ACK报文。以上dupack的发送都受到Out-Of-Window设定的发送速率限制。

/*
 * Excerpt of tcp_validate_incoming(): only the PAWS check and the sequence
 * number check are shown. The discard and syn_challenge labels, and the
 * function's return statements, are outside this listing.
 *
 * Both checks answer an unacceptable non-RST segment with tcp_send_dupack(),
 * unless tcp_oow_rate_limited() says the reply must be skipped.
 */
static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr)
{
    struct tcp_sock *tp = tcp_sk(sk);
    bool rst_seq_match = false;

    /* RFC1323: H1. Apply PAWS check first. */
    if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) && tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) {
        if (!th->rst) {
            NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
            /* Rate-limited duplicate ACK; the segment is discarded either way. */
            if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDPAWS, &tp->last_oow_ack_time))
                tcp_send_dupack(sk, skb);
            goto discard;
        }
        /* Reset is accepted even if it did not pass PAWS. */
    }
    /* Step 1: check sequence number */
    if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
        /* RFC793, page 37: "In all states except SYN-SENT, all reset (RST) segments are validated by checking their SEQ-fields."
         * And page 69: "If an incoming segment is not acceptable, an acknowledgment should be sent in reply (unless the RST
         * bit is set, if so drop the segment and return)".
         */
        if (!th->rst) {
            /* A SYN is handled at syn_challenge instead of getting a dupack here. */
            if (th->syn)  goto syn_challenge;
            if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSEQ, &tp->last_oow_ack_time))
                tcp_send_dupack(sk, skb);
        } else if (tcp_reset_check(sk, skb)) {
            tcp_reset(sk);
        }
        goto discard;
    }
}

如果报文的结束序号与开始序号不相等,表明携带有数据,不是单纯的控制报文;并且开始序号在待接收序号之前,意味着此报文为重复报文,可能由于本端发送的ACK丢失,而导致了对端的重传。进入QUICKACK模式。函数最后发送ACK确认报文。

static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
        NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
        tcp_enter_quickack_mode(sk);

        if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
            u32 end_seq = TCP_SKB_CB(skb)->end_seq;

            if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
                end_seq = tp->rcv_nxt;
            tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
        }
    }
    tcp_send_ack(sk);
}

如果TCP套接口已经被设置了TCP_ECN_SEEN标志,但是当前报文的IP头中却显示不支持ECN功能INET_ECN_NOT_ECT,意味着很可能接收到了重传的报文,进入QUICKACK模式,以尽快纠正对端的错误。另外,如果接收到的报文设置了网络拥塞INET_ECN_CE标志,而本地作为接收方需要尽快地通知对端拥塞的情况,以避免对端发送更多的报文到网络中,进入QUICKACK模式。

/*
 * React to the ECN bits of a received segment's IP DS field.
 *
 *  - NOT_ECT: if ECT was seen earlier on this connection, enter quick-ACK
 *    mode (the segment is probably a retransmit). TCP_ECN_SEEN is untouched.
 *  - CE: notify the congestion-control module when it wants ECN events and,
 *    unless CWR is already being demanded, enter quick-ACK mode and set
 *    TCP_ECN_DEMAND_CWR.
 *  - anything else: notify the congestion-control module of "no CE".
 *
 * Every case except NOT_ECT ends by setting TCP_ECN_SEEN.
 */
static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
{
    struct sock *sk = (struct sock *)tp;
    const int ecn_bits = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;

    if (ecn_bits == INET_ECN_NOT_ECT) {
        /* Funny extension: if ECT is not set on a segment, and we already seen ECT on a previous segment, it is probably a retransmit. */
        if (tp->ecn_flags & TCP_ECN_SEEN)
            tcp_enter_quickack_mode(sk);
        return;
    }

    if (ecn_bits == INET_ECN_CE) {
        if (tcp_ca_needs_ecn(sk))
            tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);

        if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
            /* Better not delay acks, sender can have a very low cwnd */
            tcp_enter_quickack_mode(sk);
            tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
        }
    } else if (tcp_ca_needs_ecn(sk)) {
        tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
    }

    tp->ecn_flags |= TCP_ECN_SEEN;
}

ACK发送与配额消耗


进入QUICKACK模式并不意味着立即发送ACK确认报文,以下函数__tcp_ack_snd_check负责作出发送决定。其一当接收到了1个以上的完整数据包(报文长度为rcv_mss),并且接收窗口的右边界前进了足够大的距离;其二,处于QUICKACK模式;其三,套接口接收到乱序数据;以上三种情况满足其一,立即回复ACK确认报文。否则,启用延时ACK。

此函数在处理完接收数据之后被调用。需要注意,在__tcp_select_window函数中有可能执行退出QUICKACK模式的操作,参见以下的介绍。

/*
 * Decide between sending an ACK right now and scheduling a delayed ACK.
 *
 * An immediate ACK is sent when any one of these holds:
 *   1. more than rcv_mss bytes have arrived since rcv_wup and the window
 *      returned by __tcp_select_window() is not smaller than rcv_wnd;
 *   2. tcp_in_quickack_mode() is true;
 *   3. ofo_possible is set and the out-of-order queue is not empty.
 * Otherwise tcp_send_delayed_ack() is used.
 *
 * The sub-conditions are evaluated left to right with short-circuiting, so
 * __tcp_select_window() is called only when the first comparison is true.
 */
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
    struct tcp_sock *tp = tcp_sk(sk);

        /* More than one full frame received... */
    if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
         /* ... and right edge of window advances far enough. (tcp_recvmsg() will send ACK otherwise). Or... */
         __tcp_select_window(sk) >= tp->rcv_wnd) ||
        /* We ACK each frame or... */
        tcp_in_quickack_mode(sk) ||
        /* We have out of order data. */
        (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {

        tcp_send_ack(sk);         /* Then ack it now */
    } else {
        /* Else, send delayed ack. */
        tcp_send_delayed_ack(sk);
    }
}

ACK确认报文发送函数tcp_send_ack,如果遇到SKB内存分配失败的情况,将会启动延迟ACK定时器,稍后再发送此ACK报文。

/*
 * Send an ACK now (this also updates the advertised window).
 *
 * If no skb can be allocated, the ACK is not dropped: it is marked as
 * scheduled, the ACK timeout estimate is set to TCP_ATO_MIN and the
 * delayed-ACK timer is armed with TCP_DELACK_MAX.
 */
void tcp_send_ack(struct sock *sk)
{
    struct sk_buff *buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));

    if (likely(buff)) {
        /* Send it off, this clears delayed acks for us. */
        tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
        return;
    }

    /* Allocation failed: fall back to the delayed-ACK timer. */
    inet_csk_schedule_ack(sk);
    inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
    inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX);
}

如果一切正常,函数tcp_transmit_skb在报文发送之前,针对ACK报文,调用tcp_event_ack_sent函数,递减QUICK ACK的发送次数配额,并且停止延迟ACK定时器,因为马上就可将此ACK发送出去。

/*
 * Excerpt of tcp_transmit_skb(): only the ACK bookkeeping and the hand-off to
 * the address-family transmit hook are shown. Declarations of tcb, err, icsk
 * and inet, and the return statement, are outside this listing.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
{
    /* For a segment carrying the ACK flag, run the ACK-sent bookkeeping before transmission. */
    if (likely(tcb->tcp_flags & TCPHDR_ACK))
        tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
		
    err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
}

递减QUICK ACK配额的前提是配额不为零。如果本次发送的数据包数量达到或超过配额值,复位配额为零,并将ATO恢复为TCP_ATO_MIN;否则,配额值减去报文个数。

/*
 * Bookkeeping for an ACK that is about to be sent: charge pkts against the
 * quick-ACK budget and clear the delayed-ACK (ICSK_TIME_DACK) timer.
 */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
    tcp_dec_quickack_mode(sk, pkts);
    inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
/*
 * Charge pkts against the quick-ACK budget (icsk_ack.quick).
 *
 * Nothing happens when the budget is already zero. When pkts consumes the
 * whole remaining budget, the budget is zeroed and the ACK timeout estimate
 * is reset to TCP_ATO_MIN; otherwise the budget is reduced by pkts.
 */
static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts)
{
    struct inet_connection_sock *icsk = inet_csk(sk);

    if (!icsk->icsk_ack.quick)
        return;

    if (pkts < icsk->icsk_ack.quick) {
        icsk->icsk_ack.quick -= pkts;
        return;
    }

    /* Leaving quickack mode we deflate ATO. */
    icsk->icsk_ack.quick = 0;
    icsk->icsk_ack.ato = TCP_ATO_MIN;
}

另外,在用户层通过内核读取套接口数据后,例如tcp_recvmsg函数中,调用以下的tcp_cleanup_rbuf判断是否回复ACK报文。首先在有需要调度的ACK情况下,a)如果有被阻塞的ACK发送,经常在大量数据传输时发生,由于延迟ACK定时器到期时,用户层正在调用接收程序占用套接口而阻塞。b)或者,接收到了两个连续的报文(已接收但尚未确认的数据超过一个rcv_mss)。c)或者由于用户层读取了接收数据,接收缓存变为空,并且存在要发送的ACK:1)ACK的pending标记了ICSK_ACK_PUSHED2;2)或是标记了ICSK_ACK_PUSHED而且套接口不处于交互模式(pingpong为0)。

满足以上abc三个条件中的一个,都要发送ACK确认报文。

/*
 * First half of the tcp_cleanup_rbuf() excerpt: decide whether an already
 * scheduled ACK should go out now. Declarations of tp, icsk and time_to_ack
 * are elided from this listing, and the function body continues after the
 * following paragraph.
 *
 * copied is only compared against 0 in the code shown.
 */
static void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
    struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

    if (inet_csk_ack_scheduled(sk)) {
        if (icsk->icsk_ack.blocked ||     /* Delayed ACKs frequently hit locked sockets during bulk receive. */
            tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||   /* Once-per-two-segments ACK was not sent by tcp_input.c */
            /* If this read emptied read buffer, we send ACK, if connection is not bidirectional, user drained
             * receive buffer and there was a small segment in queue. */
            (copied > 0 &&
             /* PUSHED2 alone suffices; PUSHED counts only while pingpong is clear. */
             ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !icsk->icsk_ack.pingpong)) &&
              /* ... and nothing is charged to the receive buffer any more. */
              !atomic_read(&sk->sk_rmem_alloc)))
            time_to_ack = true;
    }

如果以上条件都不成立,但是,由于用户层对接收数据的读取,致使新的窗口至少达到当前窗口的两倍,需要发送ACK报文,因为已有足够的空间用于接收报文。

    /* Second half of the tcp_cleanup_rbuf() excerpt: window-update ACK, then the actual send. */
    /* We send an ACK if we can now advertise a non-zero window which has been raised "significantly".
     * Even if window raised up to infinity, do not send window open ACK in states, where we will not receive more. It is useless.
     */
    if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
        __u32 rcv_window_now = tcp_receive_window(tp);
        /* Skip the computation when doubling the current window would exceed window_clamp. */
        if (2*rcv_window_now <= tp->window_clamp) {      /* Optimize, __tcp_select_window() is not cheap. */
            __u32 new_window = __tcp_select_window(sk);

            /* Send ACK now, if this read freed lots of space in our buffer. Certainly, new_window is new window.
             * We can advertise it now, if it is not less than current one. "Lots" means "at least twice" here.
             */
            if (new_window && new_window >= 2 * rcv_window_now)
                time_to_ack = true;
        }
    }
    /* Either half of the function may have requested the ACK. */
    if (time_to_ack) tcp_send_ack(sk);
}

QUICKACK额度控制


在接收到对端数据报文后,函数tcp_event_data_recv执行相应处理。在第一次接收到对端数据时,延迟ACK功能还未启动,ACK超时时间ATO为0,因为接收到数据,应当回复ACK确认报文,所以增加QUICK ACK的配额,并且将ATO初始化为TCP_ATO_MIN,即初始化延迟ACK机制。

如果并非首次接收到数据报文,本次的接收时间与上次时间的间隔大于重传超时时间RTO,意味着对端重启窗口失败,增加QUICK ACK发送配额,以便快速发送ACK确认报文。如果报文间隔很短,不超过最小的ACK超时时间ATO的一半,可将ATO时间降低为原ATO时间的一半与最小ATO一半的和。但是如果报文间隔大于最小ATO的一半,并且小于当前ATO值,将ATO值更新为原ATO时间的一半与当前报文间隔的和,更新后的ATO值最大不超过重传超时RTO。

static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);

    inet_csk_schedule_ack(sk);
    tcp_measure_rcv_mss(sk, skb);
    tcp_rcv_rtt_measure(tp);
    now = tcp_jiffies32;

    if (!icsk->icsk_ack.ato) {
        tcp_incr_quickack(sk);             /* The _first_ data packet received, initialize delayed ACK engine. */
        icsk->icsk_ack.ato = TCP_ATO_MIN;
    } else {
        int m = now - icsk->icsk_ack.lrcvtime;
        if (m <= TCP_ATO_MIN / 2) {
            icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;   /* The fastest case is the first. */
        } else if (m < icsk->icsk_ack.ato) {
            icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
            if (icsk->icsk_ack.ato > icsk->icsk_rto)
                icsk->icsk_ack.ato = icsk->icsk_rto;
        } else if (m > icsk->icsk_rto) {
            tcp_incr_quickack(sk);   /* Too long gap. Apparently sender failed to restart window, so that we send ACKs quickly. */
            sk_mem_reclaim(sk);
        }
    }
    icsk->icsk_ack.lrcvtime = now;
    if (skb->len >= 128)
        tcp_grow_window(sk, skb);
}

在选择新窗口值时,如果空余的TCP内存空间小于总空间的一半,对端可能很快会填满本地的空余空间,内核清空QUICK ACK发送次数配额,退出QUICKACK模式,延迟回复ACK确认报文以尝试降低对端的发包速度。

/*
 * Excerpt of __tcp_select_window(): only the prologue and the point where the
 * quick-ACK budget is cleared are shown; the listing stops inside the
 * function, before the window value is computed and returned.
 */
u32 __tcp_select_window(struct sock *sk)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    int mss = icsk->icsk_ack.rcv_mss;
    int free_space = tcp_space(sk);
    int allowed_space = tcp_full_space(sk);
    int full_space = min_t(int, tp->window_clamp, allowed_space);

    /* Clamp mss to the usable space; with no usable space at all, advertise a zero window. */
    if (unlikely(mss > full_space)) {
        mss = full_space;
        if (mss <= 0) return 0;
    }
    /* Less than half of the space is free: drop the quick-ACK budget. */
    if (free_space < (full_space >> 1)) {
        icsk->icsk_ack.quick = 0;
}

当TCP套接口的接收缓存中的数据量达到或超过限定的接收缓存最大值(sk_rcvbuf)时,TCP缓存告急,需要钳制住接收窗口。函数tcp_clamp_window首先要做的与以上介绍的__tcp_select_window函数相同,清空QUICK ACK配额,退出QUICKACK模式。

/*
 * Excerpt of tcp_prune_queue(): only the call into tcp_clamp_window() is
 * shown; the rest of the function, including its return value, is elided.
 */
static int tcp_prune_queue(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    /* Receive memory in use has reached the receive buffer limit. */
    if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
        tcp_clamp_window(sk);
}
/* 5. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);

    icsk->icsk_ack.quick = 0;

    if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
        !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
        sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), net->ipv4.sysctl_tcp_rmem[2]);
    }
    if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
        tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}

在接收到对端数据之后,内核尝试增长接收窗口,例如以上的tcp_event_data_recv函数,在接收的数据长度大于128时,调用tcp_grow_window函数。如果满足窗口增长的条件,除去增加窗口之外,还增加QUICK ACK的额度值,以便尽快回复ACK确认报文,促使对端发送更多数据。

/*
 * Try to raise rcv_ssthresh after receiving skb.
 *
 * Growth is attempted only while rcv_ssthresh is below both window_clamp and
 * the free receive space, and the socket is not under memory pressure. When
 * a non-zero increment is chosen, rcv_ssthresh is raised (capped at
 * window_clamp) and the low bit of the quick-ACK budget is set, which makes
 * the budget non-zero.
 */
static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);
    /* Check #1 */
    if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && !tcp_under_memory_pressure(sk)) {
        int incr;    /* chosen increase of rcv_ssthresh; 0 means do not grow */

        /* Check #2. Increase window, if skb with such overhead will fit to rcvbuf in future. */
        if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
            incr = 2 * tp->advmss;
        else
            incr = __tcp_grow_window(sk, skb);
        if (incr) {
            /* Grow by at least twice the length of this segment. */
            incr = max_t(int, incr, 2 * skb->len);
            tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
            inet_csk(sk)->icsk_ack.quick |= 1;
        }
    }
}

 

内核版本 4.15

 

  • 0
    点赞
  • 0
    评论
  • 0
    收藏
  • 一键三连
    一键三连
  • 扫一扫,分享海报

相关推荐
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值