++++++++++转载请标明出处++++++++++++
RACK即recent ack,表示根据最新的(s)ack来做丢包判断,是google去年新提出的一个丢包检测算法,目前主要用于解决尾部丢包和二次重传的问题,但是它应该可以FACK,dupACK并驾齐驱,理解起来也比较直观。
以下分析是基于linux-4.5内核版本
如果数据包p1在p2之前发送,没有收到p1的确认,当收到p2的SACK后,推断p1丢了。
每一个skb记录发送时间xmit_time,传输控制块维护全局变量:rack.xmit_time,rack.reo_wnd。Rack.xmit_time是接收方已经收到的最新的那个skb的发送时间;rack.reo_wnd是乱序的时间窗口大小。
a. 每当收到一个ACK或者SACK的时候,更新rack.xmit_time.再去遍历发送队列上已经发送但还没有收到确认的skb,如果满足如下条件:
rack.xmit_time - skb.xmit_time > reo_wnd, 那么标记这个skb丢了。
b. 如果没有收到确认,那么就用定时器每个一段时间看看有哪些包丢了,如果满足如下条件:
currentTime > skb.xmit_time + min_rtt + rack.reo_wnd + 1ms, 那么把这个skb标记为已经丢了
注:目前linux内核中只实现了第一种判断方法,定时器还没有实现,这样的话就还没有解决对于尾部包丢失的问题。
在tcp_sock中加入数据结构rack和rtt_min:
struct tcp_rack {
struct skb_mstamp mstamp; /* (Re)sent time of the skb 接收方已经收到的最新的那个skb的发送时间*/
u8 advanced; /* mstamp advanced since last lost marking 用于标记mstamp是否有更新*/
u8 reord; /* reordering detected 是否检测到乱序*/
} rack;
Rtt_min是一个数组,里面放了3个rtt_meas,用来记录最小的3个rtt测试量值,rtt_min[3]分别。rtt一开始初始化为~0U
struct rtt_meas {
u32 rtt, ts; /* RTT in usec and sampling time in jiffies. ts是设置这个rtt的时刻,即这个rtt_min[i]最近被更新的时间 */
} rtt_min[3];
收到ACK后,先更新rtt_min,从而更新乱序时间窗口reo_wnd。
reo_wnd = max(rtt_min / 4, 1ms)Rtt_min的更新算法是Kathleen Nichol算法,用于记录每固定时间间隔内的rtt最小值(比如获得刚刚5分钟内的rtt最小值)。设时间窗口为wlen, rtt[0]是长度为wlen的时间窗口内的最小rtt, rtt[1]是长度为wlen/2的时间窗口内的最小rtt, rtt[2]是长度为wlen/4的时间窗口内的最小rtt,满足rtt[0] <= rtt[1] <= rtt[2]
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
{
const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
struct rtt_meas *m = tcp_sk(sk)->rtt_min;
struct rtt_meas rttm = {
.rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
.ts = now,
};
u32 elapsed;
/* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
/*如果最新值比m[0].rtt都还小,那么毫无疑问它既是一个wlen/4时间窗口内的最小rtt,也是wlen/2时间窗口,wlen个时间窗口内的最小值。*/
if (unlikely(rttm.rtt <= m[0].rtt))
m[0] = m[1] = m[2] = rttm;
/*同理,如果最新的值比m[1].rtt还小,那它肯定比m[2].rtt更小,所以可以更新最近wlen/4时间窗口内的最小值,最近wlen/2时间窗口内的最小值*/
else if (rttm.rtt <= m[1].rtt)
m[1] = m[2] = rttm;
else if (rttm.rtt <= m[2].rtt)
m[2] = rttm;
/*我们要看看当前的记录是不是已经过期了*/
elapsed = now - m[0].ts;
if (unlikely(elapsed > wlen)) {
/* Passed entire window without a new min so make 2nd choice
* the new min & 3rd choice the new 2nd. So forth and so on.
*/
/* 如果已经m[0]的记录已经是一个wlen以前的了,那么需要更新它,后面新的值依次往前挪 */
m[0] = m[1];
m[1] = m[2];
m[2] = rttm;
if (now - m[0].ts > wlen) {
m[0] = m[1];
m[1] = rttm;
if (now - m[0].ts > wlen)
m[0] = rttm;
}
} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
/* Passed a quarter of the window without a new min so
* take 2nd choice from the 2nd quarter of the window.
*/
m[2] = m[1] = rttm;
} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
/* Passed half the window without a new min so take the 3rd
* choice from the last half of the window.
*/
m[2] = rttm;
}
}
若要获得当前这段时间内的rtt最小值,取tcp_sock->rtt_min[0].rtt即可。
/* Record the most recently (re)sent time among the (s)acked packets */
void tcp_rack_advance(struct tcp_sock *tp,
const struct skb_mstamp *xmit_time, u8 sacked)
{
/*如果这个发送时间比现在的rack.mstamp还早,那就不需要更新rack.mstamp了*/
if (tp->rack.mstamp.v64 &&
!skb_mstamp_after(xmit_time, &tp->rack.mstamp))
return;
/*如果这个包被重传过,那么这个包最近一次发送至少应该在mint_rtt之前*/
if (sacked & TCPCB_RETRANS) {
struct skb_mstamp now;
/* If the sacked packet was retransmitted, it's ambiguous
* whether the retransmission or the original (or the prior
* retransmission) was sacked.
*
* If the original is lost, there is no ambiguity. Otherwise
* we assume the original can be delayed up to aRTT + min_rtt.
* the aRTT term is bounded by the fast recovery or timeout,
* so it's at least one RTT (i.e., retransmission is at least
* an RTT later).
*/
skb_mstamp_get(&now);
if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
return;
}
/*更新rack.mstamp为这个包的发送时间,并置rack.advanced= 1,表示rack已经更新了*/
tp->rack.mstamp = *xmit_time;
tp->rack.advanced = 1;
}
符合条件是指:对于所有已经发送但未收到确认的包,如果这个包的发送时间pkt.xmit_time与rack.xmit_time的关系满足:
rack.xmit_time - skb.xmit_time > reo_wnd那么将这个skb标记为已丢失。实现函数是tcp_rack_mark_lost.
/* Marks a packet lost, if some packet sent later has been (s)acked.
* The underlying idea is similar to the traditional dupthresh and FACK
* but they look at different metrics:
*
* dupthresh: 3 OOO packets delivered (packet count)
* FACK: sequence delta to highest sacked sequence (sequence space)
* RACK: sent time delta to the latest delivered packet (time domain)
*
* The advantage of RACK is it applies to both original and retransmitted
* packet and therefore is robust against tail losses. Another advantage
* is being more resilient to reordering by simply allowing some
* "settling delay", instead of tweaking the dupthresh.
*
* The current version is only used after recovery starts but can be
* easily extended to detect the first loss.
*/
int tcp_rack_mark_lost(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
u32 reo_wnd, prior_retrans = tp->retrans_out;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
return 0;
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp->rack.advanced = 0;
/* To be more reordering resilient, allow min_rtt/4 settling delay
* (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
* RTT because reordering is often a path property and less related
* to queuing or delayed ACKs.
*
* TODO: measure and adapt to the observed reordering delay, and
* use a timer to retransmit like the delayed early retransmit.
*/
/*允许乱序的时间窗口 = max( min_rtt/4, 1000uS)*/
reo_wnd = 1000;
if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
/*遍历重传队列write_queue,看看要把哪些skb标记为已丢失*/
tcp_for_write_queue(skb, sk) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
/*已经遍历到了没有发过的包,停止遍历*/
if (skb == tcp_send_head(sk))
break;
/* Skip ones already (s)acked */
/* 跳过这些已经被累积确认或者被sack确认的skb*/
if (!after(scb->end_seq, tp->snd_una) ||
scb->sacked & TCPCB_SACKED_ACKED)
continue;
/* 如果这个skb是在最近一次发包时间之前发送的*/
if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
/* 如果这个skb的发包时间与最近一次的发包时间的时间距离小于允许乱序的时间窗口
* 那么认为这个包只是乱序而已,没有丢,跳过*/
if (skb_mstamp_us_delta(&tp->rack.mstamp,
&skb->skb_mstamp) <= reo_wnd)
continue;
/* skb is lost if packet sent later is sacked */
/* 如果这个在skb后面一段时间发的包都收到了sack,那么把这个包标记为TCPCB_LOST*/
tcp_skb_mark_lost_uncond_verify(tp, skb);
/*就算这个skb以前重传过,也要标记为丢失准备下一次重传,因此要把重传标记清除*/
if (scb->sacked & TCPCB_SACKED_RETRANS) {
scb->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPLOSTRETRANSMIT);
}
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
* b/c the rest are all sent after rack_sent
*/
break;
}
}
return prior_retrans - tp->retrans_out;
}
个人点评:
rack的优点:相比之前的丢包检测算法,rack最大的不同之处在于基于时间,而以前的算法都是基于序列号,所以可以很轻易地解决二次重传的问题。不过对于流尾部丢包,也需要一个额外的定时器来辅助。
rack的缺点:算法中对于乱序窗口的设置reo_wnd = max(min_rtt/4, 1ms)的设置比较僵硬(也没有给出足够的解释为什么是四分之一,为什么是1ms)。reo_wnd和min_rtt之间的比值大小应该跟当前的乱序程度有关。RACK只是专注于如何检测丢包,而对于乱序检测并没有做。
以上观点可能考虑不全,若有错误之处,还请指正~