如下定义,TCP的ACK状态有如下4个。
/*
 * Bit flags kept in icsk_ack.pending. Each value is a distinct bit, so
 * several states can be set at the same time with |=.
 */
enum inet_csk_ack_state_t {
/* An ACK is owed to the peer; set by inet_csk_schedule_ack(). */
ICSK_ACK_SCHED = 1,
/* The delayed-ACK timer has been armed; set in inet_csk_reset_xmit_timer(). */
ICSK_ACK_TIMER = 2,
/* Set by tcp_measure_rcv_mss(); tested in tcp_cleanup_rbuf(). */
ICSK_ACK_PUSHED = 4,
/* Set by tcp_measure_rcv_mss() when ICSK_ACK_PUSHED was already set. */
ICSK_ACK_PUSHED2 = 8
};
一、初始状态
在TCP套接口初始化时,注册延迟ACK等一系列超时处理函数,并且在inet_csk_init_xmit_timers函数中,清空ACK的pending状态变量。参考以上的状态枚举类型定义,0不对应任何一个状态,为无效值。
/*
 * Set up the TCP timers of a socket by handing the three TCP handlers
 * (retransmit, delayed ACK, keepalive - in that argument order) to
 * inet_csk_init_xmit_timers().
 */
void tcp_init_xmit_timers(struct sock *sk)
{
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer);
}
/*
 * Register the three timer callbacks of a connection socket and reset its
 * pending state.
 *
 * @sk:                 socket whose timers are initialised
 * @retransmit_handler: callback for icsk_retransmit_timer
 * @delack_handler:     callback for icsk_delack_timer
 * @keepalive_handler:  callback for sk_timer
 */
void inet_csk_init_xmit_timers(struct sock *sk,
void (*retransmit_handler)(struct timer_list *t),
void (*delack_handler)(struct timer_list *t),
void (*keepalive_handler)(struct timer_list *t))
{
struct inet_connection_sock *icsk = inet_csk(sk);
timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
timer_setup(&sk->sk_timer, keepalive_handler, 0);
/* Start with no pending timer event and no ACK state bits set. */
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
当套接口销毁(tcp_v4_destroy_sock)或者断开连接(tcp_disconnect)时,内核将ACK状态pending变量恢复为零。另外,在套接口发送带有ACK的报文时,在函数inet_csk_clear_xmit_timer中,也将ACK的状态pending变量清零。由此可见,ACK的状态变化以两次ACK发送之间为一个周期。
/*
 * Abridged excerpt: only the ACK bookkeeping is shown. `tcb` is not declared
 * in this excerpt - presumably the control block of skb; confirm against the
 * full function.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
{
/* The outgoing segment carries the ACK flag: record that an ACK was sent. */
if (likely(tcb->tcp_flags & TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
}
/*
 * Bookkeeping after a segment carrying an ACK has been sent.
 *
 * @sk:   socket that sent the ACK
 * @pkts: packet count of the sent skb, passed on to tcp_dec_quickack_mode()
 */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
tcp_dec_quickack_mode(sk, pkts);
/* Clearing the delayed-ACK timer also zeroes icsk_ack.pending. */
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
/*
 * Abridged excerpt: only the ICSK_TIME_DACK branch is shown and the closing
 * brace of the function is elided.
 *
 * @sk:   socket whose timer state is cleared
 * @what: which timer to clear; only ICSK_TIME_DACK is handled here
 */
static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what)
{
struct inet_connection_sock *icsk = inet_csk(sk);
if (what == ICSK_TIME_DACK) {
/* Drop every ACK state bit together with the blocked flag. */
icsk->icsk_ack.blocked = icsk->icsk_ack.pending = 0;
}
二、ACK的调度状态
函数inet_csk_schedule_ack设置ACK的调度状态ICSK_ACK_SCHED。当套接口接收到对端TCP报文,由于某种原因不能够立即回复对端ACK确认报文时,将置位此状态。进入此状态,意味着需要在之后完成ACK报文的发送,类似于一个记录。
/* Set ICSK_ACK_SCHED in icsk_ack.pending, leaving the other state bits untouched. */
static inline void inet_csk_schedule_ack(struct sock *sk)
{
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED;
}
在接收到对端的数据报文时,接收事件函数tcp_event_data_recv将设置ACK调度状态。由于接收到了对端数据,应回复ACK报文,因此记录此状态。
/*
 * Abridged excerpt of the data-receive event handler: set the ACK scheduled
 * state, then let tcp_measure_rcv_mss() inspect the received skb.
 */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
inet_csk_schedule_ack(sk);
tcp_measure_rcv_mss(sk, skb);
}
对于接收到的FIN报文,如果其不携带数据(SKB长度值为0),则不会调用tcp_event_data_recv,因此在函数tcp_fin中设置ACK的调度状态。FIN报文需要ACK报文确认。
/*
 * Abridged excerpt (in-sequence path only). `tp` is not declared here and the
 * closing brace of the function is elided.
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
/* The segment starts exactly at the next expected sequence number. */
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
/* Only a segment with payload triggers the data-receive event. */
if (skb->len)
tcp_event_data_recv(sk, skb);
/* A FIN is handled by tcp_fin() whether or not payload was present. */
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
}
/* Abridged excerpt of FIN processing: only the ACK scheduling step is shown. */
void tcp_fin(struct sock *sk)
{
inet_csk_schedule_ack(sk);
}
以上是对于保序的TCP报文;对于乱序的TCP数据报文,在接收函数tcp_data_queue_ofo中,设置ACK的调度状态。
/* Abridged excerpt of out-of-order queueing: only the ACK scheduling step is shown. */
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
inet_csk_schedule_ack(sk);
}
对于重传的报文,在函数tcp_data_queue中设置ACK的调度状态。
/*
 * Abridged excerpt (retransmitted-segment path only). `tp` is not declared in
 * this excerpt.
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
/* The whole segment ends at or before rcv_nxt. */
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
/* A retransmit, 2nd most common case. Force an immediate ack. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
/* Enter quick-ACK mode and mark an ACK as scheduled before dropping. */
tcp_enter_quickack_mode(sk);
inet_csk_schedule_ack(sk);
drop:
tcp_drop(sk, skb);
return;
}
}
在TCP三次握手期间,客户端接收到服务端回复的SYN+ACK报文(即第二次握手报文)时,如果套接口有数据正在等待发送,或者设置了延迟ACCEPT功能,又或者ACK策略处于pingpong交互模式,设置ACK的调度状态,延迟回复ACK报文。
/*
 * Abridged excerpt: shows only how the final handshake ACK is either delayed
 * or sent at once when the received segment has the ACK bit set.
 *
 * In the lines shown, returns 0 on the delayed path (after dropping skb) and
 * -1 after sending the ACK immediately.
 */
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
if (th->ack) {
/*
 * Delay the ACK when a writer is pending, defer-accept is configured,
 * or the socket is in pingpong mode.
 */
if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) {
/* Sets ICSK_ACK_SCHED. */
inet_csk_schedule_ack(sk);
tcp_enter_quickack_mode(sk);
/* Sets ICSK_ACK_TIMER and arms the delayed-ACK timer for TCP_DELACK_MAX. */
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
tcp_drop(sk, skb);
return 0;
} else {
tcp_send_ack(sk);
}
return -1;
}
}
三、ACK定时器状态
以下函数inet_csk_reset_xmit_timer启动延迟ACK定时器,设置ACK为定时器状态ICSK_ACK_TIMER。标志着延迟ACK定时器的启动。
/*
 * Abridged excerpt: only the ICSK_TIME_DACK branch is shown.
 *
 * @sk:       socket whose timer is (re)armed
 * @what:     which timer to arm; only ICSK_TIME_DACK is handled here
 * @when:     delay in jiffies from now
 * @max_when: not used in the lines shown
 */
static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, const unsigned long max_when)
{
struct inet_connection_sock *icsk = inet_csk(sk);
if (what == ICSK_TIME_DACK) {
/* Record that the delayed-ACK timer is running and when it will fire. */
icsk->icsk_ack.pending |= ICSK_ACK_TIMER;
icsk->icsk_ack.timeout = jiffies + when;
sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
}
}
如果TCP套接口在发送ACK报文时,遇到内存不足,分配SKB缓存失败的情况,将首先设置ACK的调度状态。之后如以上函数inet_csk_reset_xmit_timer所示,再设置ACK的定时器状态,并且启动定时器。
/*
 * Abridged excerpt: only the allocation-failure path is shown. `buff` is not
 * declared in this excerpt.
 */
void tcp_send_ack(struct sock *sk)
{
/* We are not putting this on the write queue, so tcp_transmit_skb() will set the ownership to this sock. */
buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
if (unlikely(!buff)) {
/*
 * No buffer for the ACK: mark it as scheduled, reset ato to TCP_ATO_MIN
 * and arm the delayed-ACK timer so the ACK is retried later.
 */
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX);
return;
}
}
如上节所示,TCP客户端在函数tcp_rcv_synsent_state_process中处理服务端的SYN+ACK报文时,如果不立即回复ACK报文,将设置ACK的调度状态和定时器状态。
在ACK发送决策函数__tcp_ack_snd_check中,如果判定要延迟回复ACK报文,在处理函数tcp_send_delayed_ack中,同时设置ACK的调度和定时器状态,并且启动延迟ACK定时器。注意在满足一定条件下,tcp_send_delayed_ack函数有可能并不启动定时器,而是立即回复ACK确认报文。
/*
 * Abridged excerpt: `icsk`, `ato` and `timeout` are declared and computed in
 * elided lines. Shows how an already-running delayed-ACK timer is handled and
 * how the timer is (re)armed.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
/* A delayed-ACK timer is already armed. */
if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
/* If delack timer was blocked or is about to expire, send ACK now. */
if (icsk->icsk_ack.blocked || time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
/* Never push the deadline later than the one already armed. */
if (!time_before(timeout, icsk->icsk_ack.timeout))
timeout = icsk->icsk_ack.timeout;
}
/* Mark the ACK as both scheduled and timer-driven, then arm the timer. */
icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
icsk->icsk_ack.timeout = timeout;
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
在延迟ACK超时处理函数tcp_delack_timer_handler中,将清除ACK的ICSK_ACK_TIMER状态。并且如果存在需要调度的ACK报文,立即进行ACK发送tcp_send_ack。
/*
 * Abridged excerpt of the delayed-ACK timer expiry handler; the closing brace
 * of the function is elided.
 */
void tcp_delack_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
/* The timer is no longer armed; the other state bits are kept. */
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
/* An ACK is still owed: adjust ato, then send it now. */
if (inet_csk_ack_scheduled(sk)) {
if (!icsk->icsk_ack.pingpong) {
icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); /* Delayed ACK missed: inflate ATO. */
} else {
/* Delayed ACK missed: leave pingpong mode and deflate ATO. */
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
tcp_mstamp_refresh(tcp_sk(sk));
tcp_send_ack(sk);
}
四、ACK的PUSHED状态
ACK的两个状态ICSK_ACK_PUSHED和ICSK_ACK_PUSHED2,主要用于应用层读取套接口数据之后,是否发送ACK的判断。在函数tcp_cleanup_rbuf中,如果copied大于零,即用户确实读取了数据,并且套接口接收缓存已空(sk_rmem_alloc为零),此时只要满足以下两个条件之一,就需要回复对端ACK报文:
1)ACK的状态为ICSK_ACK_PUSHED2;
2)ACK的状态为ICSK_ACK_PUSHED,并且套接口为非交互,pingpong等于零。如果是交互型套接口,即使处于ICSK_ACK_PUSHED状态,也不回复ACK报文。
/*
 * Abridged excerpt: decides whether an ACK should be sent after the reader
 * consumed data. `time_to_ack` is declared in elided lines, and what happens
 * once it is set is not shown.
 *
 * @sk:     socket being read from
 * @copied: compared against 0 below; per the in-code comment, relates to the
 *          data consumed by this read
 */
static void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
/* Only relevant while an ACK is still scheduled. */
if (inet_csk_ack_scheduled(sk)) {
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ack.blocked ||
/* Once-per-two-segments ACK was not sent by tcp_input.c */
tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
/*
* If this read emptied read buffer, we send ACK, if connection is not bidirectional, user drained
* receive buffer and there was a small segment in queue.
*/
/*
 * That is: data was copied, sk_rmem_alloc is zero, and either PUSHED2 is
 * set or PUSHED is set while pingpong is off.
 */
(copied > 0 &&
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !icsk->icsk_ack.pingpong)) &&
!atomic_read(&sk->sk_rmem_alloc)))
time_to_ack = true;
}
以下函数tcp_measure_rcv_mss设置ACK的两个PUSHED状态,此函数在TCP的接收函数tcp_event_data_recv中被调用,旨在重新计算对端的MSS值,用于延迟ACK的使用。如果检测到对端的MSS值减小,并且连续两次的检查值相同,将更新rcv_mss检测值。但是在检测到对端的MSS值在增加时,不需要进行两次检测。
当对端MSS值减小,设置ACK的状态为ICSK_ACK_PUSHED,并且如果在第二次检查的时候,ACK状态ICSK_ACK_PUSHED仍然置位,将设置ACK状态ICSK_ACK_PUSHED2,表明检测到了两次MSS减小,但是这两次的值不相同,所以并没有更新rcv_mss的值。在此期间,内核也没有发送ACK报文,因为发送ACK报文将清空ACK状态的pending变量。
如果在随后的ACK发送判定函数__tcp_ack_snd_check中没有立即发送ACK报文。在应用层读取数据后,将会执行tcp_cleanup_rbuf函数中的逻辑。
/*
 * Abridged excerpt: tracks the segment size seen from the peer and sets the
 * PUSHED / PUSHED2 ACK state bits. `len` is declared in elided lines, the
 * body of the `len >= rcv_mss` branch is elided, and one closing brace is
 * missing from the excerpt.
 */
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
/* Remember the previously recorded segment size, then reset the record. */
const unsigned int lss = icsk->icsk_ack.last_seg_size;
icsk->icsk_ack.last_seg_size = 0;
/* Use gso_size when it is non-zero, otherwise the skb length. */
len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
} else {
/* we make more careful check, that SACKs block is variable. "len" is invariant segment length, including TCP header. */
len += skb->data - skb_transport_header(skb);
if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
(len >= TCP_MIN_MSS + sizeof(struct tcphdr) && !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
len -= tcp_sk(sk)->tcp_header_len;
icsk->icsk_ack.last_seg_size = len;
/* Same smaller size seen twice in a row: adopt it as rcv_mss. */
if (len == lss) {
icsk->icsk_ack.rcv_mss = len;
return;
}
}
/* PUSHED already set from an earlier segment: escalate to PUSHED2. */
if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
}
感谢redwingz博主分享优等文章