linux内核计算次方,linux内核bic和cubic实现

越风史

2023-12-01

BIC算法

当发生丢包的时候，当前窗口为W-max, 减小窗口后的大小为W， BIC算法就是根据这个原理在(W, W-max]区间内做二分搜索

当接近于W-max时曲线应该更平滑，当离W-max较远的时候，曲线可以更加陡峭

32static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)

{

struct tcp_sock *tp = tcp_sk(sk);

struct bictcp *ca = inet_csk_ca(sk);

if (!tcp_is_cwnd_limited(sk))

return;

if (tcp_in_slow_start(tp))

tcp_slow_start(tp, acked);

else {

bictcp_update(ca, tp->snd_cwnd);// 计算ca->cnt, 表示增加一个cwnd需要的ack数量

tcp_cong_avoid_ai(tp, ca->cnt, 1); // 根据ca->cnt计算snd_cwnd

}

void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)

{

/* If credits accumulated at a higher w, apply them gently now. */

if (tp->snd_cwnd_cnt >= w) {

tp->snd_cwnd_cnt = 0;

tp->snd_cwnd++;

}

tp->snd_cwnd_cnt += acked;

if (tp->snd_cwnd_cnt >= w) {

u32 delta = tp->snd_cwnd_cnt / w;

tp->snd_cwnd_cnt -= delta * w;

tp->snd_cwnd += delta;

}

tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);

}

bictcp_recalc_ssthresh计算慢启动阈值

在snd_cwnd > low_window(14)的时候，引入beta/BICTCP_BETA_SCALE (819/1024), 不至于下降的过快

并记录当前拥塞时的last_max_cwnd

21static u32 bictcp_recalc_ssthresh(struct sock *sk)

{

const struct tcp_sock *tp = tcp_sk(sk);

struct bictcp *ca = inet_csk_ca(sk);

ca->epoch_start = 0;/* end of epoch */

/* Wmax and fast convergence */

if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)

ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))

/ (2 * BICTCP_BETA_SCALE);

else

ca->last_max_cwnd = tp->snd_cwnd;

ca->loss_cwnd = tp->snd_cwnd;

if (tp->snd_cwnd <= low_window)

return max(tp->snd_cwnd >> 1U, 2U);

else

return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);

}

bictcp_acked

计算ca->delayed_ack，表示每收到一个ack，平均确认的packet数量，这里通过ACK_RATIO_SHIFT做了加权计算

12//ratio = (15*ratio + sample) / 16

static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)

{

const struct inet_connection_sock *icsk = inet_csk(sk);

if (icsk->icsk_ca_state == TCP_CA_Open) {

struct bictcp *ca = inet_csk_ca(sk);

ca->delayed_ack += sample->pkts_acked -

(ca->delayed_ack >> ACK_RATIO_SHIFT);

}

bictcp_update计算ca->cnt

max_increment：一个rtt时间内最大增加的cwnd数量，默认16

BICTCP_B: 增加到last_max_cwnd需要的rtt数，默认4

smooth_part：在非常逼近last_max_cwnd的时候，减慢cwnd增加的速度，默认20

当cwnd < last_max_cwnd的时候，需要根据差距选择合适的逼近方式

当last_max_cwnd - cwnd > BICTCP_Bmax_increment(416)时，ca->cnt=cwnd / max_increment线性增加，一个rtt最大增加max_increment

当last_max_cwnd - cwnd <= BICTCP_B(4)时，说明差距较小，需要缓慢逼近last_max_cwnd，5个rtt后cwnd+1

当last_max_cwnd - cwnd 在(BICTCP_B, BICTCP_B*max_increment]区间时，BICTCP_B个rtt时间后，达到last_max_cwnd

当cwnd >= last_max_cwnd时

当cwnd - ca->last_max_cwnd < BICTCP_B, 说明只超过last_max_cwnd一点，继续谨慎增加，ca->cnt = (cwnd * smooth_part) / BICTCP_B， 5个rtt+1

当cwnd - ca->last_max_cwnd 在[BICTCP_B,max_increment*(BICTCP_B-1))区间时，一个rtt后增加(cwnd - ca->last_max_cwnd)/(BICTCP_B-1) 个cwnd, 这里随着rtt内cwnd的增加，又会增大增长速率

当cwnd - ca->last_max_cwnd >= max_increment*(BICTCP_B-1)时，说明上一个last_max_cwnd已经没有参考意义，开始线性增加，一个rtt增加max_increment

56static inline void bictcp_update(struct bictcp *ca, u32 cwnd)

{

if (ca->last_cwnd == cwnd &&

(s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)

return;

ca->last_cwnd = cwnd;

ca->last_time = tcp_time_stamp;

if (ca->epoch_start == 0) /* record the beginning of an epoch */

ca->epoch_start = tcp_time_stamp;

/* start off normal */

if (cwnd <= low_window) {

ca->cnt = cwnd;

return;

}

/* binary increase */

if (cwnd < ca->last_max_cwnd) {

__u32dist = (ca->last_max_cwnd - cwnd)

/ BICTCP_B;

if (dist > max_increment)

/* linear increase */

ca->cnt = cwnd / max_increment;

else if (dist <= 1U)

/* binary search increase */

ca->cnt = (cwnd * smooth_part) / BICTCP_B;

else

/* binary search increase */

ca->cnt = cwnd / dist;

} else {

/* slow start AMD linear increase */

if (cwnd < ca->last_max_cwnd + BICTCP_B)

/* slow start */

ca->cnt = (cwnd * smooth_part) / BICTCP_B;

else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))

/* slow start */

ca->cnt = (cwnd * (BICTCP_B-1))

/ (cwnd - ca->last_max_cwnd);

else

/* linear increase */

ca->cnt = cwnd / max_increment;

}

/* if in slow start or link utilization is very low */

if (ca->last_max_cwnd == 0) {

if (ca->cnt > 20) /* increase cwnd 5% per RTT */

ca->cnt = 20;

}

ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;

if (ca->cnt == 0)/* cannot be zero */

ca->cnt = 1;

}

最后通过除以delayed_ack，来考虑延时ack的情况，来计算需要的ack包的数量

bictcp_state

在TCP_CA_Loss状态的回掉中，会调用bictcp_state来reset ca

其中包括了last_max_cwnd被设置为0

这是不就说每次丢包后，cwnd=1，bic算法都要重新开始？那bic算法还有什么意义呢？

当然不是。

TCP_CA_Loss是当被超时定时器超时的时候，调用tcp_enter_loss,在其中被设置

而内核的丢包检测有:超时丢包检测,快速丢包检测以及rack之类的丢包检测

举例来说，一下子发了20个包，前16个中任何一个丢掉，几乎都会收到3个重复ack(进入recovery)，就不会超时, 如果尾部丢包还有tlp，并且rack进一步减少了超时情况

还需要的说明的是，当丢包时会保存ssthresh，通过ssthresh来记录当前cwnd信息，下一次可以慢启动快速达到该阈值

bic算法总结bic算法在超过ssthresh后有max_increment限制，只能线性增加

对于两个rtt不同的tcp连接，因为max_increment的控制，导致两个连接达到相同速率的不公平性

cubic算法

很容易看到bic的cwnd／时间坐标轴是在超时丢包点是凸到凹类似三次方的曲线，可以拟合到三次方程中

W(t) = C*(t-K)^3 + W_max (Eq. 1)

where C is a CUBIC parameter, t is the elapsed time from the last

window reduction,and K is the time period that the above function

takes to increase W to W_max when there is no further loss event and

is calculated by using the following equation:

K = cubic_root(W_max*beta/C) (Eq. 2)

where beta is the multiplication decrease factor.

相同rtt流的公平

RTT相同的两个流，如果一个连接的W-max较大，则乘法减小后beta*W-max就减少的更多，并且W_max较大的流K较大，则会导致W_max较大的流增加的缓慢，从而导致两个流收敛到同一个cwnd

不同rtt流的公平

因为丢包后，都是根据流逝时间t来计算的

有相对较快的增长速度

也就是三次方系数C， C又决定了K的大小，C越大，K越小，增长到W-max的速度就越快

cubic参数 & bictcp_update1

38beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3/ (BICTCP_BETA_SCALE - beta);

cube_rtt_scale = (bic_scale * 10);/* 1024*c/rtt */

cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */

do_div(cube_factor, bic_scale * 10);

static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)

{

//...

ca->bic_K = cubic_root(cube_factor * (ca->last_max_cwnd - cwnd));

//...

t = (s32)(tcp_time_stamp - ca->epoch_start);

t += msecs_to_jiffies(ca->delay_min >> 3);

/* change the unit from HZ to bictcp_HZ */

t <<= BICTCP_HZ;

do_div(t, HZ);

if (t < ca->bic_K)/* t - K */

offs = ca->bic_K - t;

else

offs = t - ca->bic_K;

/* c/rtt * (t-K)^3 */

delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);

if (t < ca->bic_K) /* below origin*/

bic_target = ca->bic_origin_point - delta;

else /* above origin*/

bic_target = ca->bic_origin_point + delta;

/* cubic function - calc bictcp_cnt*/

if (bic_target > cwnd) {

ca->cnt = cwnd / (bic_target - cwnd);

} else {

ca->cnt = 100 * cwnd; /* very small increment*/

}

...

}

beta & BICTCP_BETA_SCALE

由bictcp_recalc_ssthresh函数的返回结果知道

paper中的beta这里是beta / BICTCP_BETA_SCALE， beta默认为717， BICTCP_BETA_SCALE为1024

因此丢包后cubic窗口减小0.7的倍数

beta_scale & tcp_friendliness

因为cubic实时计算调整cwnd值，因此比传统的AIMD算法更加友好，尤其是对于RTT较小的连接来说，都需要经过K时间才能到达W-max

在论文中提到，为了和AIMD算法的per RTT内有相同的增长率,窗口增加的大小至少为$W_{tcp}$

$$W_{tcp} = W_{max}\beta + 3\frac{1-\beta}{1+\beta}\frac{t}{RTT}$$

因此开启tcp_friendiness实际上意味着有更好的增长率，而非因为更友好而更加保守

bic_scale & cube_rtt_scale

bic_scale就是paper中三次方系数C的1024倍缩放值

cube_rtt_scale是在bic_scale/RTT, 这里rtt=100ms=0.1s

cube_factor & bic_K & BICTCP_HZ

通过bic_K和paper中的公式对比，可以知道cube_factor就是1/(C/RTT)

而实际上因为bic_scale放大了1024倍，并且计算K的时候要开三次方，放大10^(3*BICTCP_HZ)就是让K放大BICTCP_HZ

BICTCP_HZ这个值决定了K的时间精度

最后通过BIC类似的方式更新ca->cnt

cubic hystart

cubic hystart提供了一种在RTT很小或者快速收到连续ack的时候，退出slowstart的检测机制

因为新建连接并不丢包的时候, K=0, 曲线增长率很大，启用hystart可以快速到达W-max值

调整Cubic参数

fast_convergence = 0 开启后快速收敛，丢包后略小的W-max，可以关闭

tcp_friendliness = 1 默认参数，始终开启，开启后实际上保证了per RTT内的增长率

beta=717 可以设置的更大(beta/1024)控制丢包后的窗口大小，早期值为819

BICTCP_HZ=10 对于HZ>=1000可以适当调整这个值，来让K有高的精度, 影响不大

hystart = 1 保持默认开启，快速退出慢启动，使用三次方增长曲线

bic_scale = 41 系数C的1024倍，C越大K越小，可以适当增大这个数值

其他:

代码中cube_rtt_scale和cube_factor使用了RTT=100ms来计算，可以适当减小这个数值

更小的RTT意味着

cube_rtt_scale更大，更高的增长率系数C

cube_factor更小，更小的K值

参考资料

linux内核计算次方,linux内核bic和cubic实现

相关阅读

相关文章

相关问答

相关文档