基础内容参见上一篇: SACK报文乱序级别reordering
Reno乱序调整
首先看一下,在Reno算法中,对重复ACK数量的修正。如果sacked_out表示的DupAck数量与丢失报文数量之和,大于发出去的总报文数量,可能是因为网络自身的报文复制所导致,包括对数据报文的复制或者对ACK报文的复制,这时候需要调整DupAck数量值。
/* Clamp the Reno DupAck count (sacked_out) so that DupAcks plus holes
 * never exceed the number of packets in flight; such an excess suggests
 * the network duplicated data or ACK packets.
 * Returns true when sacked_out had to be corrected.
 */
static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
{
	u32 holes;

	/* Assume at least one hole; never more than packets in flight. */
	holes = max(tp->lost_out, 1U);
	holes = min(holes, tp->packets_out);

	if ((tp->sacked_out + holes) > tp->packets_out) {
		tp->sacked_out = tp->packets_out - holes;
		return true;	/* DupAck count was inconsistent and corrected */
	}
	return false;
之后,在乱序检测函数tcp_check_reno_reordering中,只有当DupAck数量被修正过(即tcp_limit_reno_sacked返回真,表明出现了异常的重复计数),才将当前发出的报文数量与添加值addend之和作为乱序级别reordering;否则直接返回,不更新乱序级别。
/* Reno reordering detection: only when tcp_limit_reno_sacked() had to
 * correct sacked_out is the reordering estimate raised to
 * packets_out + addend, capped by sysctl_tcp_max_reordering.
 */
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_limit_reno_sacked(tp))
		return;	/* counters consistent: no reordering evidence */

	tp->reordering = min_t(u32, tp->packets_out + addend,
			       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
	tp->reord_seen++;	/* count observed reordering events */
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
函数tcp_add_reno_sack用于增加DupAck数量,并且调整乱序级别。
/* Account num_dupack duplicate ACKs as Reno "SACKed" packets, then
 * re-check the reordering estimate. (Excerpt: body truncated here.)
 */
static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
{
	if (num_dupack) {
		struct tcp_sock *tp = tcp_sk(sk);
		u32 prior_sacked = tp->sacked_out;
		s32 delivered;

		tp->sacked_out += num_dupack;
		tcp_check_reno_reordering(sk, 0);
函数tcp_remove_reno_sacks用于减小DupAck数量,并且调整乱序级别。其中,如果ACK确认的报文数量减一(acked-1)大于等于DupAck数量,将DupAck计数清零;否则将DupAck计数减去(acked-1)的值。
/* Remove acked packets from the Reno DupAck accounting: one acked
 * packet fills the hole, the remaining (acked - 1) consume DupAcks.
 * (Excerpt: function truncated here.)
 */
static void tcp_remove_reno_sacks(struct sock *sk, int acked)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (acked > 0) {
		/* One ACK acked hole. The rest eat duplicate ACKs. */
		tp->delivered += max_t(int, acked - tp->sacked_out, 1);
		if (acked - 1 >= tp->sacked_out)
			tp->sacked_out = 0;
		else
			tp->sacked_out -= acked - 1;
	}
	tcp_check_reno_reordering(sk, acked);
清空DupAck计数。
/* Drop all pending Reno duplicate-ACK accounting. */
static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
{
	tp->sacked_out = 0;
}
Reno乱序检测
在如下tcp_fastretrans_alert函数中,如果套接口当前处于TCP_CA_Recovery拥塞状态,接收到DupAck报文,对于Reno,使用函数tcp_add_reno_sack增加重复ACK数量,并且更新乱序级别。反之,如果接收到ACK报文推进了SND.UNA,在函数tcp_try_undo_partial中检查是否要更新乱序级别。
对于TCP_CA_Loss拥塞状态的套接口,在函数tcp_process_loss中处理乱序级别更新。而对于其它拥塞状态(以下的default分支),如果ACK报文推进了SND.UNA,将DupAck计数清零,之后由函数tcp_add_reno_sack更新乱序级别。
/* Excerpt of tcp_fastretrans_alert() (kernel 5.0): per-congestion-state
 * handling of a suspicious ACK. "..." marks elided source lines.
 */
static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
				  int num_dupack, int *ack_flag, int *rexmit)
{
	/* E. Process state. */
	switch (icsk->icsk_ca_state) {
	case TCP_CA_Recovery:
		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
			/* Pure DupAck (no SND.UNA progress): count it for Reno. */
			if (tcp_is_reno(tp))
				tcp_add_reno_sack(sk, num_dupack);
		} else {
			if (tcp_try_undo_partial(sk, prior_snd_una))
				return;
			/* Partial ACK arrived. Force fast retransmit. */
			do_lost = tcp_is_reno(tp) || tcp_force_fast_retransmit(sk);
		}
		...
		break;
	case TCP_CA_Loss:
		tcp_process_loss(sk, flag, num_dupack, rexmit);
		...
	default:
		if (tcp_is_reno(tp)) {
			/* SND.UNA advanced: stale DupAck count is reset first. */
			if (flag & FLAG_SND_UNA_ADVANCED)
				tcp_reset_reno_sack(tp);
			tcp_add_reno_sack(sk, num_dupack);
		}
在函数tcp_try_undo_partial中,如果ACK确认报文对应的数据报文被延迟了,而不是丢失,这里使用tcp_check_sack_reordering更新乱序级别。关于此函数详见:SACK报文乱序级别reordering
/* Undo during fast recovery after partial ACK.
 * (Excerpt: function truncated here.)
 */
static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->undo_marker && tcp_packet_delayed(tp)) {
		/* Plain luck! Hole is filled with delayed
		 * packet, rather than with a retransmit. Check reordering.
		 */
		tcp_check_sack_reordering(sk, prior_snd_una, 1);
在TCP_CA_Loss拥塞状态,如果当前的SND.NXT在发送拥塞时的SND.NXT(high_seq)之后,即拥塞之后发送过新报文(F-RTO发送),并且DupAck有值,表明F-RTO发送的新报文是重复的,原始报文的发送只是发生了乱序。
否则,如果F-RTO未发送新报文,或者DupAck数量为零,这里接收到正常的ACK确认报文(推进SND.UNA的值),清空DupAck计数。
/* Excerpt of tcp_process_loss(): Reno DupAck handling while in
 * TCP_CA_Loss. snd_nxt beyond high_seq means new data was sent after
 * the loss event (F-RTO step 2.b).
 */
static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, int *rexmit)
{
	if (tcp_is_reno(tp)) {
		/* A Reno DUPACK means new data in F-RTO step 2.b above are
		 * delivered. Lower inflight to clock out (re)transmissions.
		 */
		if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
			tcp_add_reno_sack(sk, num_dupack);
		else if (flag & FLAG_SND_UNA_ADVANCED)
			tcp_reset_reno_sack(tp);	/* normal ACK: clear DupAck count */
	}
	*rexmit = REXMIT_LOST;
重传队列与乱序
在重传队列清理函数tcp_clean_rtx_queue中,对于被确认报文,由变量pkts_acked表示确认数量,对于Reno算法,使用函数tcp_remove_reno_sacks将此数量由DupAck计数中剔除。
/* Excerpt of tcp_clean_rtx_queue(): remove fully-acked skbs from the
 * retransmit queue, then reconcile Reno DupAck accounting (or, for
 * SACK flows, run SACK-based reordering detection).
 */
static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
			       u32 prior_snd_una, struct tcp_sacktag_state *sack)
{
	u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
	u32 pkts_acked = 0;

	for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
		...
		tp->packets_out -= acked_pcount;
		pkts_acked += acked_pcount;
		...
	}
	if (flag & FLAG_ACKED) {
		if (tcp_is_reno(tp)) {
			/* Deduct the newly-acked packets from the DupAck count. */
			tcp_remove_reno_sacks(sk, pkts_acked);
			...
		} else {
			/* Non-retransmitted hole got filled? That's reordering */
			if (before(reord, prior_fack))
				tcp_check_sack_reordering(sk, reord, 0);
DUPACK清零
在RTO超时处理中,套接口将进入TCP_CA_Loss拥塞状态,对于Reno算法,此时要清空DupAck计数。
/* Excerpt of tcp_timeout_mark_lost(): on RTO, Reno clears its DupAck
 * count unless the receiver reneged on a previously SACKed head skb.
 */
static void tcp_timeout_mark_lost(struct sock *sk)
{
	head = tcp_rtx_queue_head(sk);
	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
	if (is_reneg) {
		...
	} else if (tcp_is_reno(tp)) {
		tcp_reset_reno_sack(tp);
	}
或者,在可疑ACK报文处理函数tcp_fastretrans_alert中,如果当前套接口没有处在TCP_CA_Open状态,并且,当前未确认序号SND.UNA大于等于high_seq(进入拥塞状态时的SND.NXT),表明已经由拥塞状态恢复,如果套接口处在TCP_CA_Recovery状态,清空DupAck计数。
/* Excerpt of tcp_fastretrans_alert(): state-exit checks once SND.UNA
 * reaches high_seq (congestion episode fully acknowledged).
 */
static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
				  int num_dupack, int *ack_flag, int *rexmit)
{
	/* D. Check state exit conditions. State can be terminated
	 * when high_seq is ACKed. */
	if (icsk->icsk_ca_state == TCP_CA_Open) {
		WARN_ON(tp->retrans_out != 0);
		tp->retrans_stamp = 0;
	} else if (!before(tp->snd_una, tp->high_seq)) {
		switch (icsk->icsk_ca_state) {
		case TCP_CA_CWR:
			...
			break;
		case TCP_CA_Recovery:
			/* Recovery finished: DupAck bookkeeping is stale. */
			if (tcp_is_reno(tp))
				tcp_reset_reno_sack(tp);
快速重传
如下ACK报文处理函数tcp_fastretrans_alert中,调用tcp_time_to_recover函数决定非TCP_CA_Recovery和非TCP_CA_Loss拥塞状态的套接口是否进入恢复阶段。
/* Excerpt of tcp_fastretrans_alert(): for states other than Recovery
 * and Loss, tcp_time_to_recover() decides whether to enter fast
 * recovery (TCP_CA_Recovery) or merely try to open the window.
 */
static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
				  int num_dupack, int *ack_flag, int *rexmit)
{
	switch (icsk->icsk_ca_state) {
	case TCP_CA_Recovery:
		...
		break;
	case TCP_CA_Loss:
		...
		/* fall through */
	default:
		...
		if (!tcp_time_to_recover(sk, flag)) {
			tcp_try_to_open(sk, flag);
			return;
		}
		...
		/* Otherwise enter Recovery state */
		tcp_enter_recovery(sk, (flag & FLAG_ECE));
		fast_rexmit = 1;
如果发生了丢包(lost_out计数不为零),或者对于非RACK算法,DupAck数量超过了乱序级别(默认为3),进入TCP_CA_Recovery状态,开始快速重传。
/* Decide whether to enter fast recovery: either loss is already
 * proven (lost_out != 0), or — for non-RACK flows — the DupAck
 * heuristic exceeds the current reordering estimate.
 */
static bool tcp_time_to_recover(struct sock *sk, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Trick#1: The loss is proven. */
	if (tp->lost_out)
		return true;

	/* Not-A-Trick#2 : Classic rule... */
	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
		return true;

	return false;
拥塞窗口与乱序
如下函数tcp_may_raise_cwnd检查是否要执行拥塞控制算法的窗口增长函数,增加拥塞窗口。如果当前的乱序级别高于系统指定的乱序级别,无论是接收到正常ACK还是重复ACK,都增长拥塞窗口。否则,如果当前乱序级别较低,只有接收到正常的ACK确认报文,才可增加拥塞窗口。
/* May cwnd grow on this ACK? High reordering: grow on any forward
 * progress. Low reordering: only on in-order data being ACKed.
 */
static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
	/* If reordering is high then always grow cwnd whenever data is
	 * delivered regardless of its ordering. Otherwise stay conservative
	 * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
	 * new SACK or ECE mark may first advance cwnd here and later reduce
	 * cwnd in tcp_fastretrans_alert() based on more states.
	 */
	if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
		return flag & FLAG_FORWARD_PROGRESS;

	return flag & FLAG_DATA_ACKED;
如下函数tcp_cong_control在ACK报文处理的最后调用,如果当前套接口未处于TCPF_CA_CWR或者TCPF_CA_Recovery拥塞状态,即拥塞窗口未在减少阶段,并且以上介绍函数tcp_may_raise_cwnd返回真,调用拥塞算法的拥塞避免处理函数,增加拥塞窗口。此时有助于发送更多的新报文,如果确实有丢包发生,将触发更多的DupAck,尽快触发对丢失报文的重传。
另外,当前内核中只有BBR算法实现了拥塞函数cong_control,通过此函数设置Pacing速率和窗口大小。内核默认算法Cubic未实现此函数,拥塞窗口大小由之后的代码处理。
/* Congestion-control dispatch, run at the end of ACK processing:
 * a ca_ops->cong_control hook (e.g. BBR) takes over entirely;
 * otherwise reduce cwnd while in CWR/Recovery, or raise it when
 * tcp_may_raise_cwnd() allows.
 */
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
			     int flag, const struct rate_sample *rs)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->cong_control) {
		/* Algorithm (e.g. BBR) fully controls rate and cwnd itself. */
		icsk->icsk_ca_ops->cong_control(sk, rs);
		return;
	}

	if (tcp_in_cwnd_reduction(sk)) {
		/* Reduce cwnd if state mandates */
		tcp_cwnd_reduction(sk, acked_sacked, flag);
	} else if (tcp_may_raise_cwnd(sk, flag)) {
		/* Advance cwnd if state allows */
		tcp_cong_avoid(sk, ack, acked_sacked);
	}
强制快速重传
如果未确认序号SND.UNA加上乱序级别对应的报文序号长度,小于当前SACK确认的最大序号,认为SND.UNA对应的报文已经丢失,而非乱序。
/* True when the highest SACKed sequence lies more than
 * reordering * mss_cache beyond SND.UNA — the head packet is then
 * considered lost, not merely reordered.
 */
static bool tcp_force_fast_retransmit(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	return after(tcp_highest_sack_seq(tp),
		     tp->snd_una + tp->reordering * tp->mss_cache);
如下在可疑ACK报文处理函数tcp_fastretrans_alert中,如果tcp_force_fast_retransmit函数返回真,将标记丢失报文,并且重传丢失报文。
/* Excerpt of tcp_fastretrans_alert(): do_lost drives scoreboard
 * marking (tcp_update_scoreboard) and loss retransmission for
 * non-RACK flows.
 */
static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
				  int num_dupack, int *ack_flag, int *rexmit)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int fast_rexmit = 0, flag = *ack_flag;
	/* Lost if we saw DupAcks, or SACK advanced far beyond SND.UNA. */
	bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
				      tcp_force_fast_retransmit(sk));

	switch (icsk->icsk_ca_state) {
	case TCP_CA_Recovery:
		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
			if (tcp_is_reno(tp))
				tcp_add_reno_sack(sk, num_dupack);
		} else {
			if (tcp_try_undo_partial(sk, prior_snd_una))
				return;
			/* Partial ACK arrived. Force fast retransmit. */
			do_lost = tcp_is_reno(tp) ||
				  tcp_force_fast_retransmit(sk);
		}
	...
	if (!tcp_is_rack(sk) && do_lost)
		tcp_update_scoreboard(sk, fast_rexmit);
	*rexmit = REXMIT_LOST;
RTO乱序调整
在小于等于TCP_CA_Disorder拥塞状态,发生RTO超时,并且当前的DupAck报文数量大于等于系统设定的TCP乱序级别(默认为3),表明reordering级别被高估了。将乱序级别降低至系统设定的tcp_reordering值之下。有助于在检测到乱序之后,尽快重传丢失报文,避免发生RTO超时。
/* Excerpt of tcp_enter_loss(): an RTO while still in (or below)
 * Disorder state, after many DupAcks, means the reordering degree was
 * over-estimated — clamp it back to sysctl_tcp_reordering before
 * entering TCP_CA_Loss.
 */
void tcp_enter_loss(struct sock *sk)
{
	...
	/* Timeout in disordered state after receiving substantial DUPACKs
	 * suggests that the degree of reordering is over-estimated.
	 */
	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
	    tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
		tp->reordering = min_t(unsigned int, tp->reordering,
				       net->ipv4.sysctl_tcp_reordering);

	tcp_set_ca_state(sk, TCP_CA_Loss);
发送缓存与乱序级别
如下tcp_sndbuf_expand发送缓冲区动态扩展函数,发送缓冲区最大可容纳的报文数量,不低于套接口当前的乱序级别再加上1。因为低于此数量的未确认报文,将不能触发快速重传。
/* Excerpt of tcp_sndbuf_expand(): size the send buffer for at least
 * reordering + 1 segments — with fewer packets in flight, fast
 * retransmit could never be triggered.
 */
static void tcp_sndbuf_expand(struct sock *sk)
{
	...
	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);

	/* Fast Recovery (RFC 5681 3.2) :
	 * Cubic needs 1.7 factor, rounded to 2 to include
	 * extra cushion (application might react slowly to EPOLLOUT)
	 */
	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
	sndmem *= nr_segs * per_mss;

	/* Grow only; never exceed sysctl_tcp_wmem[2]. */
	if (sk->sk_sndbuf < sndmem)
		sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
内核版本 5.0