TCP套接口丢失与重传报文线索由变量lost_skb_hint与retransmit_skb_hint所表示。
丢失报文线索
变量lost_skb_hint记录重传队列中上一次标记丢失报文的位置,其自身对应的skb不一定为丢失报文,也可能是SACK确认报文。如下函数tcp_mark_head_lost,其仅在sack恢复算法中使用,用于将重传队列中的相应报文标记为丢失状态,直到遍历中累计的SACK确认报文数量达到packets指定的数量为止。如下所示,如果packets等于3,将第3个SACK确认报文之前的所有未被SACK确认的报文,标记为丢失状态。 在lost_skb_hint为空的情况下,由重传队列的首个skb开始进行丢失标记;否则,由lost_skb_hint指定的skb开始标记。
3
SACK确认报文 | | | |
重传队列 | | | | | | | | | | | | | | |
变量lost_skb_hint记录上一次丢失标记遍历在重传队列中结束的位置,其之前的报文skb,或者已被标记为丢失,或者已经被SACK确认。另外,lost_cnt_hint记录的不是丢失报文的数量,而是丢失处理中用到的SACK确认报文数量。以上这两个变量的实际意义与字面意思有比较大的差别。
/* Mark head-of-queue segments as lost (RFC3517-style recovery), resuming
 * from tp->lost_skb_hint when one is set. Excerpt from net/ipv4/tcp_input.c
 * (kernel 5.0): declarations of tp/skb/cnt and the tail of the walk loop
 * are elided in this quotation.
 */
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
/* Use SACK to deduce losses of new sequences sent during recovery */
const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
WARN_ON(packets > tp->packets_out);
/* Resume from the position reached by the previous marking pass. */
skb = tp->lost_skb_hint;
if (skb) {
/* Head already handled? */
if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
return;
cnt = tp->lost_cnt_hint;
} else {
skb = tcp_rtx_queue_head(sk);
cnt = 0;
}
skb_rbtree_walk_from(skb) {
/* TODO: do this better */
/* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;
/* cnt counts SACKed segments for SACK flows (every segment for Reno),
 * NOT lost segments -- hence the article's note that the name of
 * lost_cnt_hint is misleading.
 */
if (tcp_is_reno(tp) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
cnt += tcp_skb_pcount(skb);
看一下lost_cnt_hint值的变化。在函数tcp_sacktag_one中,如果接收到的ACK确认报文中,SACK序号块确认的报文还没有设置TCPCB_SACKED_ACKED标志,即之前没有被SACK确认过,SACK序号块为新的确认数据。此时检查是否需要更新lost_cnt_hint:如果当前SACK确认的序号块起始序号在lost_skb_hint记录的起始序号之前,增加SACK确认报文数量值lost_cnt_hint。
/* Tag a newly-SACKed sequence range. Elided excerpt from
 * net/ipv4/tcp_input.c (kernel 5.0); only the hint-related part is shown.
 */
static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state,
u8 sacked, u32 start_seq, u32 end_seq,
int dup_sack, int pcount, u64 xmit_time)
{
/* Only ranges not previously SACKed are newly delivered data. */
if (!(sacked & TCPCB_SACKED_ACKED)) {
...
sacked |= TCPCB_SACKED_ACKED;
state->flag |= FLAG_DATA_SACKED;
tp->sacked_out += pcount;
tp->delivered += pcount; /* Out-of-order packets delivered */
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
if (tp->lost_skb_hint &&
before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
tp->lost_cnt_hint += pcount;
}
以上函数仅是检查了skb开始序号是否在lost_skb_hint开始序号之前,另外一种情况是两者相等,此情况将在函数tcp_shifted_skb中处理。检查skb是否等于lost_skb_hint,相等意味着两者的开始序号相同,增加SACK确认报文数量(lost_cnt_hint)。
/* Shift newly-SACKed data from skb into prev. Elided excerpt from
 * net/ipv4/tcp_input.c (kernel 5.0).
 */
static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
struct sk_buff *skb, struct tcp_sacktag_state *state,
unsigned int pcount, int shifted, int mss, bool dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
BUG_ON(!pcount);
/* Adjust counters and hints for the newly sacked sequence
* range but discard the return value since prev is already
* marked. We must tag the range first because the seq
* advancement below implicitly advances
* tcp_highest_sack_seq() when skb is highest_sack.
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
/* skb == hint means equal start seq, a case tcp_sacktag_one's
 * strict before() check above cannot catch.
 */
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
如果之后,当前SACK确认的skb被完全地合并到了前一个skb中(prev),将lost_skb_hint更新为之前的skb结构(prev)。同时,参考函数tcp_mark_head_lost中的遍历代码,lost_skb_hint记录了下次遍历的开始位置,这里提前减去其所对应的SACK确认报文数量。
/* Continuation of tcp_shifted_skb (same elided excerpt as above). */
if (skb->len > 0) {
BUG_ON(!tcp_skb_pcount(skb));
NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
return false;
}
/* Whole SKB was eaten :-) */
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = prev;
if (skb == tp->lost_skb_hint) {
/* Move the hint back to prev; pre-subtract prev's pcount since the
 * next tcp_mark_head_lost walk will re-count prev itself.
 */
tp->lost_skb_hint = prev;
tp->lost_cnt_hint -= tcp_skb_pcount(prev);
}
最后,在如下重传队列清理函数tcp_clean_rtx_queue中,如果队列中的某个报文已经被完全的确认(序号位于SND.UNA之前),并且此报文等于lost_skb_hint,将lost_skb_hint清空,即所有标记丢失的报文都已经被确认。注意,这里,如果此确认报文skb等于重传skb线索retransmit_skb_hint变量的值,也将其清空。
/* Remove fully-acked skbs from the retransmit queue. Elided excerpt from
 * net/ipv4/tcp_input.c (kernel 5.0).
 */
static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
u32 prior_snd_una, struct tcp_sacktag_state *sack)
{
for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
if (!fully_acked)
break;
next = skb_rb_next(skb);
/* Clear both hints before freeing the skb they point at. */
if (unlikely(skb == tp->retransmit_skb_hint))
tp->retransmit_skb_hint = NULL;
if (unlikely(skb == tp->lost_skb_hint))
tp->lost_skb_hint = NULL;
tcp_rtx_queue_unlink_and_free(skb, sk);
如果对重传队列中报文进行分片(tcp_fragment)或者合并(tcp_collapse_retrans)等操作,减小了lost_skb_hint位置之前的SACK确认报文数量,由以下函数tcp_adjust_pcount更新lost_cnt_hint的值。
/* Pcount in the middle of the write queue got changed, we need to do
 * various tweaks to fix counters.
 * Elided excerpt from net/ipv4/tcp_output.c (kernel 5.0).
 */
static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->packets_out -= decr;
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= decr;
...
/* A SACKed skb before the lost marker shrank by decr segments, so the
 * hint's SACKed-segment count must shrink by the same amount.
 */
if (tp->lost_skb_hint &&
before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
tp->lost_cnt_hint -= decr;
重传skb线索
无论是NewReno/SACK/RACK等丢包恢复算法,在标记丢包时,都会通过tcp_verify_retransmit_hint函数设置重传skb线索的值。
如下函数tcp_verify_retransmit_hint,重传skb线索变量retransmit_skb_hint,如果还没有值,并且,重传报文数量大于等于丢失报文数量; 或者,retransmit_skb_hint已经赋值,但是当前报文的起始序号在retransmit_skb_hint记录的序号之前,更新retransmit_skb_hint变量的值,设置为丢失报文中序号最小的值。
/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
/* Take skb as the new hint either when no hint is set and everything
 * marked lost so far has already been retransmitted, or when skb starts
 * before the current hint: the hint tracks the lowest-sequence lost skb.
 */
if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
(tp->retransmit_skb_hint &&
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
tp->retransmit_skb_hint = skb;
}
在重传队列的报文合并函数tcp_collapse_retrans中,如果被合并的报文等于重传线索retransmit_skb_hint变量指向的报文,将其更新为合并之后的报文。
/* Collapses two adjacent SKB's during retransmission. */
/* Elided excerpt from net/ipv4/tcp_output.c (kernel 5.0). */
static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
...
/* changed transmit queue under us so clear hints */
tcp_clear_retrans_hints_partial(tp);
/* next_skb is merged into skb, so repoint the retransmit hint. */
if (next_skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = skb;
tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
在SACK处理过程中,如果retransmit_skb_hint指向报文被合并到前一个skb中,更新retransmit_skb_hint为之前的skb(prev)的值。
/* Second look at tcp_shifted_skb (same kernel function as above), this
 * time focused on retransmit_skb_hint maintenance. Elided excerpt.
 */
static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
struct sk_buff *skb, struct tcp_sacktag_state *state,
unsigned int pcount, int shifted, int mss, bool dup_sack)
{
...
if (skb->len > 0) {
BUG_ON(!tcp_skb_pcount(skb));
NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
return false;
}
/* Whole SKB was eaten :-) */
/* skb is gone; the hint must follow its data into prev. */
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = prev;
报文重传
在重传函数tcp_xmit_retransmit_queue中,如果重传skb线索变量retransmit_skb_hint有值,使用其值代表的skb进行重传操作,否则,使用重传队列中首个skb。
/* Retransmit queued segments starting from the retransmit hint (or the
 * queue head when no hint is set). Elided excerpt from
 * net/ipv4/tcp_output.c (kernel 5.0).
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (!tp->packets_out)
return;
rtx_head = tcp_rtx_queue_head(sk);
/* Start from the hint if present, otherwise from the queue head. */
skb = tp->retransmit_skb_hint ?: rtx_head;
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
在遍历skb开始的重传队列过程中,除了发送重传报文之外,还更新重传skb线索变量的值。如下,如果遇到某个报文skb,没有被标记为丢失(TCPCB_LOST),也没有被SACK确认,还没有进行重传,并且是在遍历过程中第一次遇到此类skb,将其更新为重传skb线索变量retransmit_skb_hint的值,下一次重传此报文。
/* Continuation of tcp_xmit_retransmit_queue (same elided excerpt). */
skb_rbtree_walk_from(skb) {
...
/* we could do better than to assign each time */
if (!hole)
tp->retransmit_skb_hint = skb;
...
sacked = TCP_SKB_CB(skb)->sacked;
/* In case tcp_shift_skb_data() have aggregated large skbs,
* we need to make sure not sending too big TSO packets
*/
segs = min_t(int, segs, max_segs);
if (tp->retrans_out >= tp->lost_out) {
break;
} else if (!(sacked & TCPCB_LOST)) {
/* Remember the first not-lost, not-SACKed, not-yet-retransmitted
 * skb as the "hole": it becomes the next retransmit hint.
 */
if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
continue;
if (tcp_small_queue_check(sk, skb, 1))
return;
if (tcp_retransmit_skb(sk, skb, segs))
return;
清除丢失和重传skb线索
如下函数tcp_clear_all_retrans_hints用于清除丢失和重传skb线索。
/* Drop only the lost-marking hint; the retransmit hint survives. */
static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
{
tp->lost_skb_hint = NULL;
}
/* Drop both hints (quotation elides the closing brace). */
static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
{
tcp_clear_retrans_hints_partial(tp);
tp->retransmit_skb_hint = NULL;
一种情况是在RTO超时之后,
/* RTO expiry path: all hints become stale and are dropped.
 * Elided excerpt from net/ipv4/tcp_input.c (kernel 5.0).
 */
static void tcp_timeout_mark_lost(struct sock *sk)
{
...
tcp_clear_all_retrans_hints(tp);
另外,检测到不必要的拥塞窗口减少,即网络非拥塞时,撤销拥塞窗口状态,清除所有的重传线索。
/* Undo a spurious cwnd reduction; when unmarking losses, every LOST flag
 * and both hints are cleared. Elided excerpt from net/ipv4/tcp_input.c
 * (kernel 5.0).
 */
static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
{
struct tcp_sock *tp = tcp_sk(sk);
if (unmark_loss) {
struct sk_buff *skb;
skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
}
tp->lost_out = 0;
tcp_clear_all_retrans_hints(tp);
最后,在套接口接收到复位报文,或者断开连接,套接口出错等的情况下,清空重传队列时,连带清除重传线索。
/* Purge both the write queue and the retransmit queue (reset, disconnect,
 * or socket error); hints point into the freed queue and must go too.
 * Elided excerpt (closing brace not shown) from net/ipv4 (kernel 5.0).
 */
void tcp_write_queue_purge(struct sock *sk)
{
struct sk_buff *skb;
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
tcp_skb_tsorted_anchor_cleanup(skb);
sk_wmem_free_skb(sk, skb);
}
tcp_rtx_queue_purge(sk);
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
/* All queued skbs are gone, so both hints are now dangling. */
tcp_clear_all_retrans_hints(tcp_sk(sk));
内核版本 5.0