本文的copyleft归gfree.wind@gmail.com所有,使用GPL发布,可以自由拷贝,转载。但转载请保持文档的完整性,注明原作者及原链接,严禁用于任何商业用途。
作者:gfree.wind@gmail.com
博客:linuxfocus.blog.chinaunix.net
    

在前面分析数据包接收流程的博文中,针对的都是UDP协议。随着这个流程的贯通,对我来讲一个基本的TCP/IP协议栈的主干已经建立起来了。后面的学习过程就是从这个主干不断地延伸分支,最后形成一棵完整的TCP/IP协议栈的流程图。


今天时间不多,就延伸一个L4 TCP如何选择正确的socket接收数据吧。在inet_init()中通过tcp_protocol注册了TCP 数据包的处理函数tcp_v4_rcv。下面就由它开始:
  1. int tcp_v4_rcv(struct sk_buff *skb)
  2. {
  3.     const struct iphdr *iph;
  4.     struct tcphdr *th;
  5.     struct sock *sk;
  6.     int ret;
  7.     struct net *net = dev_net(skb->dev);
     /*
     TCP是面向连接的协议,也就是端对端。如果包的类型不是HOST,当然不正确了,
     所以直接drop 
     */
  1.     if (skb->pkt_type != PACKET_HOST)
  2.         goto discard_it;

  3.     /* Count it even if it's bad */
  4.     /* 更新统计信息 */
  5.     TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

  6.     if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
  7.         goto discard_it;
     /* 得到TCP报文头 */
  1.     th = tcp_hdr(skb);
     /* 做sanity check */
  1.     if (th->doff < sizeof(struct tcphdr) / 4)
  2.         goto bad_packet;
  3.     if (!pskb_may_pull(skb, th->doff * 4))
  4.         goto discard_it;

  5.     /* An explanation is required here, I think.
  6.      * Packet length and doff are validated by header prediction,
  7.      * provided case of th->doff==0 is eliminated.
  8.      * So, we defer the checks. */
  9.     /* 检测checksum */
  10.     if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
  11.         goto bad_packet;

  12.     /* 得到TCP的seq,ack等  */
  13.     th = tcp_hdr(skb);
  14.     iph = ip_hdr(skb);
  15.     TCP_SKB_CB(skb)->seq = ntohl(th->seq);
  16.     TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
  17.                  skb->len - th->doff * 4);
  18.     TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
  19.     TCP_SKB_CB(skb)->when     = 0;
  20.     TCP_SKB_CB(skb)->flags     = iph->tos;
  21.     TCP_SKB_CB(skb)->sacked     = 0;
     /* 查找对应的socket */
  1.     sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
  2.     if (!sk)
  3.         goto no_tcp_socket;
     /* 后面的暂不关心 */
  1.     ...... ......
  1. }
进入__inet_lookup_skb->__inet_lookup
  1. static inline struct sock *__inet_lookup(struct net *net,
  2.                      struct inet_hashinfo *hashinfo,
  3.                      const __be32 saddr, const __be16 sport,
  4.                      const __be32 daddr, const __be16 dport,
  5.                      const int dif)
  6. {
  7.     u16 hnum = ntohs(dport);
  8.     /* 先尝试查找处于连接成功的socket */
  9.     struct sock *sk = __inet_lookup_established(net, hashinfo,
  10.                 saddr, sport, daddr, hnum, dif);
     /* 如果没有找到连接成功的socket,那么就去处于listen状态的socket查找 */
  1.     return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
  2. }
先看__inet_lookup_established
  1. struct sock * __inet_lookup_established(struct net *net,
  2.                  struct inet_hashinfo *hashinfo,
  3.                  const __be32 saddr, const __be16 sport,
  4.                  const __be32 daddr, const u16 hnum,
  5.                  const int dif)
  6. {
  7.     INET_ADDR_COOKIE(acookie, saddr, daddr)
  8.     const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
  9.     struct sock *sk;
  10.     const struct hlist_nulls_node *node;
  11.     /* Optimize here for direct hit, only listening connections can
  12.      * have wildcards anyways.
  13.      */
  14.     unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
  15.     unsigned int slot = hash & hashinfo->ehash_mask;
     /* 获得处于连接状态(established)的socket所在的hash桶 */
  1.     struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

  2.     rcu_read_lock();
  3. begin:
  4.     
  5.     sk_nulls_for_each_rcu(sk, node, &head->chain) {
  6.         if (INET_MATCH(sk, net, hash, acookie,
  7.                     saddr, daddr, ports, dif)) {
  8.             /* 地址端口等均匹配 */
     
             /* 大致地看了一下进入TIME_WAIT的函数,当socket进入TW时,并没有从ehash中移除,所以可能需要检查TW */
  1.             if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
  2.                 goto begintw; /* 该socket的引用计数为0,必须检测是否处于TIME_WAIT状态 */

     /*
             这里为什么要两次检验:通过google搜索,知道当时加上2次检验是因为RCU的缘故。想了半天,终于明白了。在第一次INET_MATCH时,该sk还没有被hold。只有执行了atomic_inc_not_zero,才相当于hold了这个sk。但是正常的操作,应该是先hold,才能保证内容没有变化。而在第一次INET_MATCH与成功hold之间,sk的内容可能已经发生了变化,所以在hold之后需要再判断一次。
             */
  1.             if (unlikely(!INET_MATCH(sk, net, hash, acookie,
  2.                 saddr, daddr, ports, dif))) {
  3.                 sock_put(sk);
  4.                 goto begin;
  5.             }
     /* 找到了socket */
  1.             goto out;
  2.         }
  3.     }
  4.     /*
  5.      * if the nulls value we got at the end of this lookup is
  6.      * not the expected one, we must restart lookup.
  7.      * We probably met an item that was moved to another chain.
  8.      */
  9.     if (get_nulls_value(node) != slot)
  10.         goto begin;

  11. begintw:
  12.     /* Must check for a TIME_WAIT'er before going to listener hash. */
  13.     /* 检查是否存在匹配的、处于TIME_WAIT状态的socket */
  14.     sk_nulls_for_each_rcu(sk, node, &head->twchain) {
  15.         if (INET_TW_MATCH(sk, net, hash, acookie,
  16.                     saddr, daddr, ports, dif)) {
  17.             if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
  18.                 /* 
  19.                 该socket的引用计数为0,那么意味着该socket已经无人使用,所以可视为该socket无效。
  20.                 */
  21.                 sk = NULL;
  22.                 goto out;
  23.             }
     /* 二次比较。原因同上 */
  1.             if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
  2.                  saddr, daddr, ports, dif))) {
  3.                 sock_put(sk);
  4.                 goto begintw;
  5.             }
  6.             goto out;
  7.         }
  8.     }
  9.     /*
  10.      * if the nulls value we got at the end of this lookup is
  11.      * not the expected one, we must restart lookup.
  12.      * We probably met an item that was moved to another chain.
  13.      */
  14.     if (get_nulls_value(node) != slot)
  15.         goto begintw;
  16.     sk = NULL;
  17. out:
  18.     rcu_read_unlock();
  19.     return sk;
  20. }
以上是对已建立连接的socket的查找,下面是对处于listen状态的socket的查找。
  1. struct sock *__inet_lookup_listener(struct net *net,
  2.                  struct inet_hashinfo *hashinfo,
  3.                  const __be32 daddr, const unsigned short hnum,
  4.                  const int dif)
  5. {
  6.     struct sock *sk, *result;
  7.     struct hlist_nulls_node *node;
  8.     unsigned int hash = inet_lhashfn(net, hnum);
  9.     struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
  10.     int score, hiscore;

  11.     rcu_read_lock();
  12. begin:
  13.     result = NULL;
  14.     hiscore = -1;
  15.     /* 这里的查找与UDP相似,都是计算匹配的得分,取最佳匹配的socket */
  16.     sk_nulls_for_each_rcu(sk, node, &ilb->head) {
  17.         score = compute_score(sk, net, hnum, daddr, dif);
  18.         if (score > hiscore) {
  19.             result = sk;
  20.             hiscore = score;
  21.         }
  22.     }
  23.     /*
  24.      * if the nulls value we got at the end of this lookup is
  25.      * not the expected one, we must restart lookup.
  26.      * We probably met an item that was moved to another chain.
  27.      */
  28.     if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
  29.         goto begin;
  30.     if (result) {
  31.         /* 如果该socket已不再被使用,则放弃这个socket */
  32.         if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
  33.             result = NULL;
  34.         else if (unlikely(compute_score(result, net, hnum, daddr,
  35.                  dif) < hiscore)) { /* 仍然是二次计算,原因仍然同上 */
  36.             sock_put(result);
  37.             goto begin;
  38.         }
  39.     }
  40.     rcu_read_unlock();
  41.     return result;
  42. }
到此,TCP数据包选择对应socket的过程已经完成。


今天学习的东西看上去很简单,但是居然遇到了问题——为什么TCP中,要有这种二次重复计算呢?睡了一晚上,终于想明白了。已经在上面代码的注释中补充了原因。


12-23 00:47