概述

shutdown系统调用在tcp层会调用两个函数,对于ESTABLISHED状态需要调用tcp_shutdown关闭连接,对于LISTEN和SYN_SENT状态则需要以非阻塞模式调用tcp_disconnect断开连接;本文除了对这两个函数进行分析以外,还会分析在shutdown关闭了读或者写之后,读写系统调用sendmsg和recvmsg将如何处理对应操作;

 /* Shut down one or both directions of an AF_INET socket (shutdown(2)).
 * NOTE(review): excerpt — locals (sk, err) and locking are elided here. */
int inet_shutdown(struct socket *sock, int how)
{
/*...*/
switch (sk->sk_state) {
case TCP_CLOSE:
err = -ENOTCONN;
/* Hack to wake up other listeners, who can poll for
POLLHUP, even on eg. unconnected UDP sockets -- RR */
default:
/* Record the requested direction(s) in sk_shutdown, then let the
 * transport layer do its part (tcp_shutdown for TCP). */
sk->sk_shutdown |= how;
if (sk->sk_prot->shutdown)
sk->sk_prot->shutdown(sk, how);
break; /* Remaining two branches are temporary solution for missing
* close() in multithreaded environment. It is _not_ a good idea,
* but we have no choice until close() is repaired at VFS level.
*/
case TCP_LISTEN:
/* Listening socket: nothing to do unless the receive side is
 * being shut down. */
if (!(how & RCV_SHUTDOWN))
break;
/* Receive side is closing — handle like SYN_SENT below. */
/* Fall through */
case TCP_SYN_SENT:
/* Tear the (half-)connection down via the transport layer's
 * disconnect, in non-blocking mode; adjust the socket state. */
err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
break;
} /* Wake up anyone sleeping in poll. */
/* State changed — wake up any process waiting on this socket. */
sk->sk_state_change(sk);
release_sock(sk);
return err;
}
tcp_shutdown

tcp_shutdown函数完成设置关闭之后的状态,并且发送fin;注意只有接收端关闭时,不发送fin,只是在recvmsg系统调用中判断状态,不接收数据;

 /*
* Shutdown the sending side of a connection. Much like close except
* that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
*/ void tcp_shutdown(struct sock *sk, int how)
{
/* We need to grab some memory, and put together a FIN,
* and then put it into the queue to be sent.
* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
*/
/* 不含有SEND_SHUTDOWN,返回,接收方关闭,不发fin */
if (!(how & SEND_SHUTDOWN))
return; /* If we've already sent a FIN, or it's a closed state, skip this. */ /* 以下这几个状态发fin */
if (( << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_SYN_SENT |
TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
/* Clear out any half completed packets. FIN if needed. */
/* 设置新状态,发送fin */
if (tcp_close_state(sk))
tcp_send_fin(sk);
}
}

tcp_close_state函数根据new_state状态表进行跳转,比如TCP_ESTABLISHED关闭时会跳转到TCP_FIN_WAIT1 | TCP_ACTION_FIN;

 static const unsigned char new_state[] = {
/* current state: new state: action: */
[ /* (Invalid) */] = TCP_CLOSE,
[TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
[TCP_SYN_SENT] = TCP_CLOSE,
[TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
[TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
[TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
[TCP_TIME_WAIT] = TCP_CLOSE,
[TCP_CLOSE] = TCP_CLOSE,
[TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
[TCP_LAST_ACK] = TCP_LAST_ACK,
[TCP_LISTEN] = TCP_CLOSE,
[TCP_CLOSING] = TCP_CLOSING,
[TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
}; static int tcp_close_state(struct sock *sk)
{
int next = (int)new_state[sk->sk_state];
int ns = next & TCP_STATE_MASK; tcp_set_state(sk, ns); return next & TCP_ACTION_FIN;
}

tcp_send_fin完成fin的发送:如果发送队列尾部有尚未发出的数据段(或者处于内存压力之下),则复用最后一个数据段,在其上打fin标记;无法复用时则新分配一个数据段;随后关闭nagle算法,并将队列中的数据段全部发送出去;(注: 在内存压力下,尾部数据段可能已经发送过,此时只是"假装"fin已随其发出,实际依靠重传路径/定时器在之后补发)

 /* Send a FIN. The caller locks the socket for us.
* We should try to send a FIN packet really hard, but eventually give up.
*/
void tcp_send_fin(struct sock *sk)
{
struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk); /* Optimization, tack on the FIN if we have one skb in write queue and
* this skb was not yet sent, or we are under memory pressure.
* Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted.
*/
/* Tail skb exists && (unsent data pending || under memory pressure) */
if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
coalesce:
/* Piggy-back the FIN flag on the tail skb */
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
/* The FIN consumes one sequence number */
TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++; /* NOTE(review): under memory pressure the tail skb may already have
* been transmitted; the FIN is then only pretended sent (see below)
* and presumably goes out later via the retransmit path. */
if (!tcp_send_head(sk)) {
/* This means tskb was already sent.
* Pretend we included the FIN on previous transmit.
* We need to set tp->snd_nxt to the value it would have
* if FIN had been sent. This is because retransmit path
* does not change tp->snd_nxt.
*/
tp->snd_nxt++;
return;
}
}
/* Otherwise a fresh skb must be allocated to carry the FIN */
else {
/* Allocate an skb for the FIN */
skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
if (unlikely(!skb)) {
/* Allocation failed: fall back to reusing the tail skb
* (if one exists) via the coalesce path above. */
if (tskb)
goto coalesce;
return;
} /* Initialize the skb */
skb_reserve(skb, MAX_TCP_HEADER);
sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN); /* Queue it on the write queue */
tcp_queue_skb(sk, skb);
} /* Disable Nagle and push all queued segments out now */
__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
tcp_disconnect

在连接为LISTEN或者SYN_SENT状态时,会调用tcp_disconnect断开连接;函数首先针对各种状态做各自的特有处理,然后再统一清理资源;

 int tcp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int err = ;
int old_state = sk->sk_state; /* 不是close状态则设置为close,从hash中删除控制块 */
if (old_state != TCP_CLOSE)
tcp_set_state(sk, TCP_CLOSE); /* ABORT function of RFC793 */
/* LISTEN状态,停止监听 */
if (old_state == TCP_LISTEN) {
inet_csk_listen_stop(sk);
}
/* 修复模式 */
else if (unlikely(tp->repair)) {
sk->sk_err = ECONNABORTED;
}
/* 需要发送rst
|| 下一个发送序号并不是最后一个队列数据段序号
&& 是被动关闭的结束状态 */
else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq &&
( << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
/* The last check adjusts for discrepancy of Linux wrt. RFC
* states
*/
/* 发送rst */
tcp_send_active_reset(sk, gfp_any());
sk->sk_err = ECONNRESET;
}
/* SYN_SENT状态 */
else if (old_state == TCP_SYN_SENT)
sk->sk_err = ECONNRESET; /* 清除定时器 */
tcp_clear_xmit_timers(sk); /* 释放接收队列中的skb */
__skb_queue_purge(&sk->sk_receive_queue); /* 释放发送队列中的skb */
tcp_write_queue_purge(sk);
tcp_fastopen_active_disable_ofo_check(sk);
/*释放未按顺序达到的skb */
skb_rbtree_purge(&tp->out_of_order_queue); /* 其他各种清理工作 */ inet->inet_dport = ; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk); sk->sk_shutdown = ;
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = ;
tp->write_seq += tp->max_window + ;
if (tp->write_seq == )
tp->write_seq = ;
icsk->icsk_backoff = ;
tp->snd_cwnd = ;
icsk->icsk_probes_out = ;
tp->packets_out = ;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = ;
tp->window_clamp = ;
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
inet_csk_delack_init(sk);
/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
* issue in __tcp_select_window()
*/
icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
tcp_init_send_head(sk);
memset(&tp->rx_opt, , sizeof(tp->rx_opt));
__sk_dst_reset(sk);
dst_release(sk->sk_rx_dst);
sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp); /* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
inet->defer_connect = ; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); sk->sk_error_report(sk);
return err;
}
tcp_sendmsg&&tcp_recvmsg

在使用shutdown关闭了发送之后,再次调用tcp_sendmsg发送数据,那么该函数会返回错误;

 /* Excerpt: after shutdown() closed the send side (SEND_SHUTDOWN set in
 * sk_shutdown), any further tcp_sendmsg() call fails with -EPIPE via the
 * check below. NOTE(review): the rest of the function is elided here. */
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
}

在使用shutdown关闭了接收之后,再次调用tcp_recvmsg接收数据,那么函数不会读取数据,而是立即返回;

 /* Excerpt: receive-path handling relevant to shutdown(). Once the receive
 * side has been shut down (RCV_SHUTDOWN in sk_shutdown), the main loop
 * below stops instead of blocking for more data, so recvmsg() returns
 * whatever was copied so far (0 => EOF).
 * NOTE(review): setup and the copy-out labels are elided in this excerpt.
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
		int flags, int *addr_len)
{
	/*... */
	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
		if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */
		last = skb_peek_tail(&sk->sk_receive_queue);
		skb_queue_walk(&sk->sk_receive_queue, skb) {
			last = skb;
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				pr_err_once("%s: found a SYN, please report !\n", __func__);
				offset--;
			}
			if (offset < skb->len)
				goto found_ok_skb;
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK),
			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */
		if (copied >= target && !sk->sk_backlog.tail)
			break;

		if (copied) {
			/* Something was copied already: stop on error, close,
			 * receive shutdown, expired timeout or pending signal. */
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			/* Receive side shut down: return immediately (EOF). */
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when user tries to read
					 * from never connected socket.
					 */
					copied = -ENOTCONN;
					break;
				}
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}
	} while (len > 0);	/* loop while user buffer space remains */
}
05-28 15:11