Php文档 Php问答行业资讯 Php论坛 Php手册 Php博客

游戏榜单

软件榜单

关闭导航

热搜榜

热门下载

热门标签

关闭搜索

php爱好者> php文档>LINUX下PING与TCP_IP协议栈学习笔记(2)

LINUX下PING与TCP_IP协议栈学习笔记(2)

时间：2009-04-02 来源：superfight

ip_append_data在/net/ipv4/ip_output.c中

/*
 * ip_append_data - queue caller data on sk->sk_write_queue as a chain of
 * skbs, each laid out with room for an IP header of fragheaderlen bytes.
 *
 * @sk:          socket whose write queue and cork state are used
 * @getfrag:     callback that copies 'len' bytes of source data, starting at
 *               'offset' within 'from', to 'to'; a negative return is
 *               reported to the caller as -EFAULT
 * @from:        opaque source handle passed through to getfrag
 * @length:      number of bytes to append
 * @transhdrlen: transport header length; non-zero only for the first fragment
 * @ipc:         supplies IP options and address for the cork on first call
 * @rt:          route; replaced by the corked route when the queue is not empty
 * @flags:       MSG_* flags (MSG_PROBE, MSG_MORE, MSG_DONTWAIT are examined)
 *
 * Returns 0 on success (and immediately for MSG_PROBE), or a negative errno.
 */
int ip_append_data(struct sock *sk,
         int getfrag(void *from, char *to, int offset, int len,
             int odd, struct sk_buff *skb),
         void *from, int length, int transhdrlen,
         struct ipcm_cookie *ipc, struct rtable *rt,
         unsigned int flags)
{
    struct inet_sock *inet = inet_sk(sk);
    struct sk_buff *skb;
    struct ip_options *opt = NULL;
    int hh_len;
    int exthdrlen;
    int mtu;
    int copy;
    int err;
    int offset = 0;
    unsigned int maxfraglen, fragheaderlen;
    int csummode = CHECKSUM_NONE;
    // MSG_PROBE: nothing is queued; return success immediately
    if (flags & MSG_PROBE)
        return 0;
    // Write queue empty: set up the cork state from the caller's arguments
    if (skb_queue_empty(&sk->sk_write_queue))
    {
        /*
         * setup for corking.
         */
         // Take the IP options supplied by the caller
        opt = ipc->opt;
        // Were any IP options supplied?
        if (opt)
        {
            if (inet->cork.opt == NULL)
            {
                // Allocate the cork's option buffer once: the struct plus 40 bytes
                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                if (unlikely(inet->cork.opt == NULL))
                    return -ENOBUFS;
            }
            memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
            inet->cork.flags |= IPCORK_OPT;
            inet->cork.addr = ipc->addr;
        }
        dst_hold(&rt->u.dst);
        // Choose the MTU: the device MTU when pmtudisc is IP_PMTUDISC_PROBE,
        // otherwise dst_mtu() of the route's path; it is cached in the cork
        inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
                                                  rt->u.dst.dev->mtu :
                                                  dst_mtu(rt->u.dst.path);
        // Remember the dst_entry in the cork (held by dst_hold above)
        inet->cork.dst = &rt->u.dst;
        inet->cork.length = 0;
        sk->sk_sndmsg_page = NULL;
        sk->sk_sndmsg_off = 0;
        // A non-zero dst header_len is added to both length and transhdrlen
        if ((exthdrlen = rt->u.dst.header_len) != 0)
        {
            length += exthdrlen;
            transhdrlen += exthdrlen;
        }
    }
    // Write queue not empty: reuse the route, options and MTU saved in the cork
    else
    {
        rt = (struct rtable *)inet->cork.dst;
        if (inet->cork.flags & IPCORK_OPT)
            opt = inet->cork.opt;
        transhdrlen = 0;
        exthdrlen = 0;
        mtu = inet->cork.fragsize;
    }
    // Headroom to reserve for the link-layer header
    hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
    // IP header length: fixed header plus the option length, if any
    fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
    // Maximum fragment length: header plus payload rounded down to a multiple of 8
    maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
    // Refuse when the corked total plus header would exceed 0xFFFF
    if (inet->cork.length + length > 0xFFFF - fragheaderlen)
    {
        ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
        return -EMSGSIZE;
    }
    /*
     * transhdrlen > 0 means that this is the first fragment and we wish
     * it won't be fragmented in the future.
     */
    if (transhdrlen &&
     length + fragheaderlen <= mtu &&
     rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
     !exthdrlen)
        csummode = CHECKSUM_PARTIAL;
    inet->cork.length += length;
    // Hand the data to ip_ufo_append_data() when all of these hold:
    //  - the data exceeds the MTU, or the write queue is not empty
    //  - the socket protocol is UDP
    //  - the device advertises NETIF_F_UFO
    if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
     (sk->sk_protocol == IPPROTO_UDP) &&
     (rt->u.dst.dev->features & NETIF_F_UFO))
    {
        // Let the UFO helper queue the data
        err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
                     fragheaderlen, transhdrlen, mtu,
                     flags);
        if (err)
            goto error;
        return 0;
    }
    /* So, what's going on in the loop below?
     *
     * We use calculated fragment length to generate chained skb,
     * each of segments is IP fragment ready for sending to network after
     * adding appropriate IP header.
     */
    // No skb on the write queue yet: jump straight to allocating the first one
    if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
        goto alloc_new_skb;
    // Loop until all of the data has been placed into skbs
    while (length > 0)
    {
        /* Check if the remaining data fits into current packet. */
        copy = mtu - skb->len;
        if (copy < length)
            copy = maxfraglen - skb->len;
        // No room left in the current skb: start a new one
        if (copy <= 0)
        {
            char *data;
            unsigned int datalen;
            unsigned int fraglen;
            unsigned int fraggap;
            unsigned int alloclen;
            struct sk_buff *skb_prev;
alloc_new_skb:
            skb_prev = skb;
            // Is there a previous skb?
            if (skb_prev)
                // Bytes the previous skb holds beyond maxfraglen; moved into the new skb below
                fraggap = skb_prev->len - maxfraglen;
            else
                // No previous skb, so nothing to carry over
                fraggap = 0;
            /*
             * If remaining data exceeds the mtu,
             * we know we need more fragment(s).
             */
             // Data for this skb: remaining length plus the carried-over gap
            datalen = length + fraggap;
            // Would the data plus the IP header exceed the MTU?
            if (datalen > mtu - fragheaderlen)
                // Yes: clamp to the largest payload a fragment can carry
                datalen = maxfraglen - fragheaderlen;
            // Fragment length: data plus IP header
            fraglen = datalen + fragheaderlen;
            if ((flags & MSG_MORE) &&
             !(rt->u.dst.dev->features & NETIF_F_SG))
                alloclen = mtu;
            else
                // Allocate exactly the data plus the IP header
                alloclen = datalen + fragheaderlen;
            /* The last fragment gets additional space at tail.
             * Note, with MSG_MORE we overallocate on fragments,
             * because we have no idea what fragment will be
             * the last.
             */
             // Last fragment: add the dst's trailer_len
            if (datalen == length + fraggap)
                alloclen += rt->u.dst.trailer_len;
            if (transhdrlen)
            {
                skb = sock_alloc_send_skb(sk,
                        alloclen + hh_len + 15,
                        (flags & MSG_DONTWAIT), &err);
            }
            else
            {
                skb = NULL;
                // Allocate only while sk_wmem_alloc is at most twice sk_sndbuf
                if (atomic_read(&sk->sk_wmem_alloc) <=
                 2 * sk->sk_sndbuf)
                     // Allocate the skb: data size plus link-layer headroom
                     // plus 15 extra bytes (bytes, not bits)
                    skb = sock_wmalloc(sk,
                             alloclen + hh_len + 15, 1,
                             sk->sk_allocation);
                if (unlikely(skb == NULL))
                    err = -ENOBUFS;
            }
            // Allocation failed: err was set on the failing path above
            if (skb == NULL)
                goto error;
            /*
             *    Fill in the control structures
             */
             // Checksum mode chosen above
            skb->ip_summed = csummode;
            // Start with a zero checksum
            skb->csum = 0;
            // Reserve headroom for the link-layer header
            skb_reserve(skb, hh_len);
            /*
             *    Find where to start putting bytes.
             */
             // Claim fraglen bytes in the skb; keep the returned start address
            data = skb_put(skb, fraglen);
            // Place the network header at offset exthdrlen
            skb_set_network_header(skb, exthdrlen);
            // The transport header follows the IP header
            skb->transport_header = (skb->network_header + fragheaderlen);
            // Advance the local data pointer past the IP header
            data += fragheaderlen;
            // Is there a gap to carry over from the previous skb?
            if (fraggap)
            {
                // Copy the excess bytes into the new skb, move their checksum
                // across, then trim the previous skb back to maxfraglen
                skb->csum = skb_copy_and_csum_bits(
                    skb_prev, maxfraglen,
                    data + transhdrlen, fraggap, 0);
                skb_prev->csum = csum_sub(skb_prev->csum,
                             skb->csum);
                data += fraggap;
                pskb_trim_unique(skb_prev, maxfraglen);
            }
            // Bytes still to be copied from the caller into this skb
            copy = datalen - transhdrlen - fraggap;
            // Copy the caller's data into the buffer via getfrag
            if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                err = -EFAULT;
                kfree_skb(skb);
                goto error;
            }
            // Advance the source offset
            offset += copy;
            // Remaining length still to be queued
            length -= datalen - fraggap;
            // Only the first skb carries the transport/extension headers
            transhdrlen = 0;
            exthdrlen = 0;
            csummode = CHECKSUM_NONE;
            /*
             * Put the packet on the pending queue.
             */
             // Append this skb to the socket's write queue
            __skb_queue_tail(&sk->sk_write_queue, skb);
            continue;
        }
        // The current skb has room: never copy more than what remains
        if (copy > length)
            copy = length;
        // Device without NETIF_F_SG: append to the skb's linear data area
        if (!(rt->u.dst.dev->features & NETIF_F_SG))
        {
            unsigned int off;
            off = skb->len;
            if (getfrag(from, skb_put(skb, copy),offset, copy, off, skb) < 0)
            {
                // Undo the skb_put on copy failure
                __skb_trim(skb, off);
                err = -EFAULT;
                goto error;
            }
        }
        // Device with NETIF_F_SG: append into page fragments
        else
        {
            int i = skb_shinfo(skb)->nr_frags;
            // NOTE(review): when nr_frags is 0 this is the address of frags[-1];
            // it is read below only if sk_sndmsg_page is non-NULL - confirm callers
            skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
            struct page *page = sk->sk_sndmsg_page;
            int off = sk->sk_sndmsg_off;
            unsigned int left;
            // The socket's current page still has free space
            if (page && (left = PAGE_SIZE - off) > 0)
            {
                if (copy >= left)
                    copy = left;
                // The page is not the skb's last fragment yet: add a new descriptor
                if (page != frag->page)
                {
                    if (i == MAX_SKB_FRAGS)
                    {
                        err = -EMSGSIZE;
                        goto error;
                    }
                    get_page(page);
                    skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                    frag = &skb_shinfo(skb)->frags[i];
                }
            }
            // No usable page: allocate a fresh one if a fragment slot is free
            else if (i < MAX_SKB_FRAGS)
            {
                if (copy > PAGE_SIZE)
                    copy = PAGE_SIZE;
                page = alloc_pages(sk->sk_allocation, 0);
                if (page == NULL)
                {
                    err = -ENOMEM;
                    goto error;
                }
                sk->sk_sndmsg_page = page;
                sk->sk_sndmsg_off = 0;
                skb_fill_page_desc(skb, i, page, 0, 0);
                frag = &skb_shinfo(skb)->frags[i];
            }
            // All fragment slots are in use
            else
            {
                err = -EMSGSIZE;
                goto error;
            }
            // Copy to the end of the fragment's current contents
            if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0)
            {
                err = -EFAULT;
                goto error;
            }
            // Account for the bytes just added
            sk->sk_sndmsg_off += copy;
            frag->size += copy;
            skb->len += copy;
            skb->data_len += copy;
            skb->truesize += copy;
            atomic_add(copy, &sk->sk_wmem_alloc);
        }
        offset += copy;
        length -= copy;
    }
    return 0;
error:
    // Remove the part that was not queued from the corked length
    inet->cork.length -= length;
    IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
    return err;
}

因为我们的发送队列为空,来到alloc_new_skb,一路向下走到sock_wmalloc
sock_wmalloc负责从高速缓存中分配一个skb
sock_wmalloc在/net/core/sock.c中

/*
 * sock_wmalloc - allocate an skb and charge it to a socket as write memory.
 *
 * @sk:       socket that will own the skb
 * @size:     size passed to alloc_skb()
 * @force:    when non-zero, skip the sk_sndbuf limit check
 * @priority: allocation flags passed to alloc_skb()
 *
 * Returns the new skb, already owned by @sk via skb_set_owner_w(), or NULL
 * when the limit check or the allocation fails.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
             gfp_t priority)
{
    struct sk_buff *skb;

    /* Unless forced, refuse once sk_wmem_alloc has reached sk_sndbuf. */
    if (!force && atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf)
        return NULL;

    skb = alloc_skb(size, priority);
    if (!skb)
        return NULL;

    skb_set_owner_w(skb, sk);
    return skb;
}

alloc_skb负责调用分配函数
alloc_skb在/include/linux/skbuff.h中

/*
 * alloc_skb - thin wrapper around __alloc_skb().
 *
 * Forwards @size and @priority unchanged and passes fclone = 0 and
 * node = -1. Returns whatever __alloc_skb() returns.
 */
static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority)
{
    struct sk_buff *skb = __alloc_skb(size, priority, 0, -1);

    return skb;
}

__alloc_skb负责实际的分配
__alloc_skb在/net/core/skbuff.c中

/*
 * __alloc_skb - allocate an sk_buff head and its data area.
 *
 * @size:     requested data size; rounded up with SKB_DATA_ALIGN()
 * @gfp_mask: allocation flags (__GFP_DMA is masked off for the head)
 * @fclone:   non-zero selects skbuff_fclone_cache and initialises the
 *            companion skb that sits right after the returned one
 * @node:     node argument passed to both allocators
 *
 * Returns the initialised skb, or NULL if either allocation fails.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
             int fclone, int node)
{
    struct kmem_cache *cache;
    struct skb_shared_info *shinfo;
    struct sk_buff *skb;
    u8 *data;
    cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
    /* Get the HEAD */
    // Allocate the sk_buff structure from the chosen slab cache
    skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
    // Head allocation failed: return NULL
    if (!skb)
        goto out;
    // Round the requested data size up to the skb data alignment
    size = SKB_DATA_ALIGN(size);
    // Allocate the data area with room for a trailing skb_shared_info
    data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
            gfp_mask, node);
    // Data allocation failed: free the head and return NULL
    if (!data)
        goto nodata;
    /*
     * Only clear those fields we need to clear, not those that we will
     * actually initialise below. Hence, don't put any more fields after
     * the tail pointer in struct sk_buff!
     */
    // Zero every sk_buff field that precedes the 'tail' member
    memset(skb, 0, offsetof(struct sk_buff, tail));
    // truesize covers the aligned data area plus the sk_buff itself
    skb->truesize = size + sizeof(struct sk_buff);
    // Set the user reference count to 1
    atomic_set(&skb->users, 1);
    // head points at the start of the data area
    skb->head = data;
    // data also starts at the beginning of the data area
    skb->data = data;
    // Reset the tail pointer (presumably to the data position - see skb_reset_tail_pointer)
    skb_reset_tail_pointer(skb);
    // end lies 'size' bytes past tail
    skb->end = skb->tail + size;
    /* make sure we initialize shinfo sequentially */
    // Get the skb_shared_info for this skb (space for it was allocated with the data area)
    shinfo = skb_shinfo(skb);
    atomic_set(&shinfo->dataref, 1);
    shinfo->nr_frags = 0;
    shinfo->gso_size = 0;
    shinfo->gso_segs = 0;
    shinfo->gso_type = 0;
    shinfo->ip6_frag_id = 0;
    shinfo->frag_list = NULL;
    if (fclone)
    {
        // The companion skb follows this one, and the shared counter follows the companion
        struct sk_buff *child = skb + 1;
        atomic_t *fclone_ref = (atomic_t *) (child + 1);
        skb->fclone = SKB_FCLONE_ORIG;
        atomic_set(fclone_ref, 1);
        child->fclone = SKB_FCLONE_UNAVAILABLE;
    }
out:
    return skb;
nodata:
    // Release the head back to its cache and report failure
    kmem_cache_free(cache, skb);
    skb = NULL;
    goto out;
}

由于我们传递的fclone为0,所以是不会进入到if (fclone)中的

好,返回到sock_wmalloc,分配skb成功之后进入skb_set_owner_w
skb_set_owner_w在/include/net/sock.h中

/*
 * skb_set_owner_w - make @sk the owner of @skb on the write side.
 *
 * Takes a reference on the socket, links the socket into the skb,
 * installs sock_wfree as the skb destructor, and adds the skb's
 * truesize to the socket's sk_wmem_alloc counter.
 */
static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
    sock_hold(sk);

    /* Destructor to run when the skb is released. */
    skb->destructor = sock_wfree;
    /* Back-pointer from the skb to its owning socket. */
    skb->sk = sk;

    /* Charge the skb's memory to the socket. */
    atomic_add(skb->truesize, &sk->sk_wmem_alloc);
}

执行完后一个skb便分配好了
结构图如下

[原文此处为示意图]
然后设置ip_summed(取值为csummode)并把csum清0。
skb_reserve(skb, hh_len)这个函数的作用是保留hh_len长度的空间,data和tail指针向后移动hh_len个字节,如下
[原文此处为示意图]

然后到data = skb_put(skb, fraglen)
放入数据和IP报头,并且返回起初的data指针位置,skb的len加上放入的大小
如下图

[原文此处为示意图]
skb_set_network_header(skb, exthdrlen);
接着设置IP层(网络层)头部指针的位置,这里exthdrlen为0,如下图
[原文此处为示意图]
skb->transport_header = (skb->network_header + fragheaderlen);
接着设置运输层头部指针的位置
如下图
[原文此处为示意图]

然后到data += fragheaderlen;
使得局部变量data(不是skb->data)指向运输层头部,如下图

[原文此处为示意图]
因为我们没有碎片,不进入if (fraggap)
来到if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0)
这里getfrag为调用ip_append_data时所传递的参数ip_generic_getfrag
ip_generic_getfrag就不分析了,它负责把数据从用户空间拷贝到局部变量data(不是skb->data)所指的位置
接着到__skb_queue_tail(&sk->sk_write_queue, skb);
把这个skb挂接到sk的发送队列中,然后continue回到while头。因为我们已经把length长的数据全部拷贝完成,这里length为0,跳出while循环,return 0退出回到raw_sendmsg。现在进入到ip_push_pending_frames
ip_push_pending_frames负责skb的重新排列
ip_push_pending_frames在/net/ipv4/ip_output.c中

/*
 * ip_push_pending_frames - merge the skbs queued on sk->sk_write_queue into
 * one skb (followers hang off the first skb's frag_list), fill in the IP
 * header and hand the result to ip_local_out().
 *
 * @sk: socket whose write queue and cork state are consumed
 *
 * Returns 0 on success or when the queue is empty; otherwise the error
 * from ip_local_out() (a positive value is mapped through net_xmit_errno()
 * only when inet->recverr is set, else dropped to 0). The cork is released
 * on every path.
 */
int ip_push_pending_frames(struct sock *sk)
{
    struct sk_buff *skb, *tmp_skb;
    struct sk_buff **tail_skb;
    struct inet_sock *inet = inet_sk(sk);
    struct ip_options *opt = NULL;
    struct rtable *rt = (struct rtable *)inet->cork.dst;
    struct iphdr *iph;
    __be16 df = 0;
    __u8 ttl;
    int err = 0;
    // Dequeue the first skb; if the queue is empty there is nothing to send
    if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
        goto out;
    // tail_skb holds the address of the link where the next follower is stored
    tail_skb = &(skb_shinfo(skb)->frag_list);
    /* move skb->data to ip header from ext header */
    // If skb->data lies before the network header, pull it up to the header
    if (skb->data < skb_network_header(skb))
        __skb_pull(skb, skb_network_offset(skb));
    // Drain the rest of the write queue, chaining each skb behind the first
    while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
    {
        // Skip the follower's network-header area (length taken from the first skb)
        __skb_pull(tmp_skb, skb_network_header_len(skb));
        *tail_skb = tmp_skb;
        tail_skb = &(tmp_skb->next);
        // The first skb accounts for the follower's length and memory
        skb->len += tmp_skb->len;
        skb->data_len += tmp_skb->len;
        skb->truesize += tmp_skb->truesize;
        // Detach the follower from the socket
        __sock_put(tmp_skb->sk);
        tmp_skb->destructor = NULL;
        tmp_skb->sk = NULL;
    }
    /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
     * to fragment the frame generated here. No matter, what transforms
     * how transforms change size of the packet, it will come out.
     */
    if (inet->pmtudisc < IP_PMTUDISC_DO)
        skb->local_df = 1;
    /* DF bit is set when we want to see DF on outgoing frames.
     * If local_df is set too, we still allow to fragment this frame
     * locally. */
    if (inet->pmtudisc >= IP_PMTUDISC_DO ||
     (skb->len <= dst_mtu(&rt->u.dst) &&
     ip_dont_fragment(sk, &rt->u.dst)))
        df = htons(IP_DF);
    // Use the corked IP options, if any were saved
    if (inet->cork.flags & IPCORK_OPT)
        opt = inet->cork.opt;
    // Multicast routes use the socket's multicast TTL
    if (rt->rt_type == RTN_MULTICAST)
        ttl = inet->mc_ttl;
    else
        // Otherwise select the TTL from the socket and route
        ttl = ip_select_ttl(inet, &rt->u.dst);
    // Treat the memory at skb->data as the IP header
    iph = (struct iphdr *)skb->data;
    // IP version 4
    iph->version = 4;
    // Header length of 5, counted in 32-bit words (20 bytes), not 5 bytes
    iph->ihl = 5;
    // With IP options present, extend the header length and write the options
    if (opt)
    {
        // optlen is in bytes; >>2 converts it to 32-bit words
        iph->ihl += opt->optlen>>2;
        ip_options_build(skb, opt, inet->cork.addr, rt, 0);
    }
    // Type of service
    iph->tos = inet->tos;
    // frag_off carries only the DF flag computed above
    iph->frag_off = df;
    // Identification field
    ip_select_ident(iph, &rt->u.dst, sk);
    // Time to live
    iph->ttl = ttl;
    // Protocol number taken from the socket
    iph->protocol = sk->sk_protocol;
    // Source IP address from the route
    iph->saddr = rt->rt_src;
    // Destination IP address from the route
    iph->daddr = rt->rt_dst;
    skb->priority = sk->sk_priority;
    skb->mark = sk->sk_mark;
    // Attach the route's dst_entry to the skb via dst_clone()
    skb->dst = dst_clone(&rt->u.dst);
    // For ICMP, update the outgoing ICMP counter
    if (iph->protocol == IPPROTO_ICMP)
        // The counter is selected by the ICMP type in the transport header
        icmp_out_count(((struct icmphdr *)skb_transport_header(skb))->type);
    /* Netfilter gets whole the not fragmented skb. */
    // Send the skb
    err = ip_local_out(skb);
    if (err)
    {
        // Positive results are reported only when the socket set recverr
        if (err > 0)
            err = inet->recverr ? net_xmit_errno(err) : 0;
        if (err)
            goto error;
    }
out:
    ip_cork_release(inet);
    return err;
error:
    IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
    goto out;
}

由于skb_shared_info->frag_list为NULL,所以这里tail_skb所指向的内容(*tail_skb)为空;tail_skb本身保存的是frag_list成员的地址,并不为空

这里data和network_header指针相等
所以不会进入到if (skb->data < skb_network_header(skb))中

因为我们的发送队列中只有一个skb,在之前已经出队了
所以这里不会进入到while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)中

接下来设置ip头部信息,设置好之后的结构如下

[原文此处为示意图]
然后通过dst_clone把skb与rtable中的dst_entry关联起来,接着进入ip_local_out
ip_local_out在/net/ipv4/ip_output.c中

/*
 * ip_local_out - run __ip_local_out() on @skb and, when that returns 1,
 * continue with dst_output().
 *
 * Returns the result of dst_output() in that case; otherwise the value
 * returned by __ip_local_out() is passed back unchanged.
 */
int ip_local_out(struct sk_buff *skb)
{
    int err = __ip_local_out(skb);

    /* Anything other than 1 goes straight back to the caller. */
    if (unlikely(err != 1))
        return err;

    return dst_output(skb);
}

继续进入__ip_local_out
__ip_local_out在/net/ipv4/ip_output.c中

/*
 * __ip_local_out - finish the IP header of @skb and pass it to the
 * NF_INET_LOCAL_OUT hook.
 *
 * Stores the total length (skb->len, network byte order) in the header,
 * computes the header checksum with ip_send_check(), then returns the
 * result of nf_hook() with dst_output as the continuation.
 */
int __ip_local_out(struct sk_buff *skb)
{
    struct iphdr *ip_header = ip_hdr(skb);

    /* Total length covers the whole skb. */
    ip_header->tot_len = htons(skb->len);

    /* Header checksum. */
    ip_send_check(ip_header);

    return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL,
                   skb->dst->dev, dst_output);
}

nf_hook和Netfilter有关,我们不关心Netfilter;当nf_hook返回1时,ip_local_out会接着调用dst_output,所以直接进入dst_output
dst_output在/include/net/dst.h中

static inline int dst_output(struct sk_buff *skb)
{
//由路由结构中的output函数进行发送
return skb->dst->output(skb);
}

看看之前的结构图,得知dst->output为ip_output
ip_output在/net/ipv4/ip_output.c中

/*
 * ip_output - prepare @skb for the NF_INET_POST_ROUTING hook.
 *
 * Bumps the OUTREQUESTS counter, binds the skb to the route's device,
 * marks its protocol as ETH_P_IP, then continues to ip_finish_output
 * through NF_HOOK_COND; the condition passed is that IPSKB_REROUTED is
 * not set in the skb's control block.
 */
int ip_output(struct sk_buff *skb)
{
    /* Device linked to the skb's route. */
    struct net_device *dev = skb->dst->dev;

    IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

    /* The payload protocol is IP; the skb leaves through the route's device. */
    skb->protocol = htons(ETH_P_IP);
    skb->dev = dev;

    return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
                        ip_finish_output,
                        !(IPCB(skb)->flags & IPSKB_REROUTED));
}

继续进入到ip_finish_output
ip_finish_output在/net/ipv4/ip_output.c中

/*
 * ip_finish_output - choose between fragmenting @skb and sending it as is.
 *
 * An skb longer than ip_skb_dst_mtu() that is not GSO goes through
 * ip_fragment(); everything else goes directly to ip_finish_output2().
 * With CONFIG_NETFILTER and CONFIG_XFRM, an skb whose dst carries an xfrm
 * is flagged IPSKB_REROUTED and sent back through dst_output() instead.
 */
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
    /* Policy lookup after SNAT yielded a new policy */
    if (skb->dst->xfrm) {
        IPCB(skb)->flags |= IPSKB_REROUTED;
        return dst_output(skb);
    }
#endif
    /* Fits within the MTU, or is GSO: no fragmentation needed. */
    if (skb->len <= ip_skb_dst_mtu(skb) || skb_is_gso(skb))
        return ip_finish_output2(skb);

    return ip_fragment(skb, ip_finish_output2);
}

这里我们的数据大小不会超过MTU,所以不需要分片,进入到ip_finish_output2
ip_finish_output2在/net/ipv4/ip_output.c中

/*
 * ip_finish_output2 - hand @skb to the neighbour layer.
 *
 * Counts multicast/broadcast packets, makes sure the skb has enough
 * headroom for the link-layer header (reallocating when the device has
 * header_ops), then sends through the cached hardware header if the dst
 * has one, else through the neighbour's output function.
 *
 * Returns the result of the chosen output path, -ENOMEM when the headroom
 * reallocation fails, or -EINVAL when the dst has neither a header cache
 * nor a neighbour; the skb is freed in both error cases.
 */
static inline int ip_finish_output2(struct sk_buff *skb)
{
    struct dst_entry *dst = skb->dst;
    struct rtable *rt = (struct rtable *)dst;
    struct net_device *dev = dst->dev;
    unsigned int hh_len = LL_RESERVED_SPACE(dev);

    switch (rt->rt_type) {
    case RTN_MULTICAST:
        IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        break;
    case RTN_BROADCAST:
        IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
        break;
    default:
        break;
    }

    /* Be paranoid, rather than too clever. */
    if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
        struct sk_buff *roomier = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));

        if (!roomier) {
            kfree_skb(skb);
            return -ENOMEM;
        }
        /* Keep the socket ownership on the replacement skb. */
        if (skb->sk)
            skb_set_owner_w(roomier, skb->sk);
        kfree_skb(skb);
        skb = roomier;
    }

    /* Prefer the cached hardware header, then the neighbour entry. */
    if (dst->hh)
        return neigh_hh_output(dst->hh, skb);
    if (dst->neighbour)
        return dst->neighbour->output(skb);

    if (net_ratelimit())
        printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
    kfree_skb(skb);
    return -EINVAL;
}

这里一开始我们的dst->hh为NULL,但是当第二次发送ICMP包的时候这里的hh就已经分配好了,也就是说只有第一次会进入dst->neighbour->output,以后都是neigh_hh_output
两个走向都会分析,现在先看hh为NULL时
neighbour->output为neigh_resolve_output
neigh_resolve_output在/net/core/neighbour.c中

/*
 * neigh_resolve_output - build the link-layer header for @skb using the
 * neighbour attached to its dst, then queue it for transmission.
 *
 * Returns the result of neigh->ops->queue_xmit() when the header was built,
 * 0 when neigh_event_send() returns non-zero (the skb is not touched further
 * here), or -EINVAL after freeing the skb when there is no dst/neighbour or
 * dev_hard_header() fails.
 */
int neigh_resolve_output(struct sk_buff *skb)
{
    struct dst_entry *dst = skb->dst;
    struct neighbour *neigh;
    int rc = 0;

    // Without a dst or a neighbour the packet cannot be sent
    if (!dst || !(neigh = dst->neighbour))
        goto discard;
    // Make skb->data start at the network header
    __skb_pull(skb, skb_network_offset(skb));
    // Continue only when neigh_event_send() returns 0
    if (!neigh_event_send(neigh, skb))
    {
        int err;
        struct net_device *dev = neigh->dev;
        // The device can cache headers and the dst has no cached header yet
        if (dev->header_ops->cache && !dst->hh)
        {
            write_lock_bh(&neigh->lock);
            // Re-check under the write lock before initialising the header cache
            if (!dst->hh)
                neigh_hh_init(neigh, dst, dst->ops->protocol);
            err = dev_hard_header(skb, dev, ntohs(skb->protocol),
                     neigh->ha, NULL, skb->len);
            write_unlock_bh(&neigh->lock);
        }
        else
        {
            // No cache set-up needed: build the header under the read lock
            read_lock_bh(&neigh->lock);
            err = dev_hard_header(skb, dev, ntohs(skb->protocol),
                     neigh->ha, NULL, skb->len);
            read_unlock_bh(&neigh->lock);
        }
        // Header built: pass the skb to the neighbour's queue_xmit
        if (err >= 0)
            rc = neigh->ops->queue_xmit(skb);
        else
            goto out_kfree_skb;
    }
out:
    return rc;
discard:
    NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n",
         dst, dst ? dst->neighbour : NULL);
    // 'discard' falls through into the common free-and-fail path
out_kfree_skb:
    rc = -EINVAL;
    kfree_skb(skb);
    goto out;
}

由于现在data和network_header指针指向同一个位置,所以skb_network_offset(skb)为0,
__skb_pull(skb, skb_network_offset(skb))并没有实际改变skb结构

之后是创建hh结构,和arp有关,也和路由表查询有关,我这里就不分析了

创建好的hh如下

[原文此处为示意图]

然后到err = dev_hard_header(skb, dev, ntohs(skb->protocol),neigh->ha, NULL, skb->len)
dev_hard_header在/include/linux/netdevice.h中

/*
 * dev_hard_header - build the link-layer header for @skb through the
 * device's header_ops->create hook.
 *
 * Returns 0 when the device has no header_ops or no create hook;
 * otherwise returns whatever create() returns. All arguments are
 * forwarded to create() unchanged.
 */
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
                 unsigned short type,
                 const void *daddr, const void *saddr,
                 unsigned len)
{
    /* Delegate only when the device provides a create operation. */
    if (dev->header_ops && dev->header_ops->create)
        return dev->header_ops->create(skb, dev, type, daddr, saddr, len);

    return 0;
}

我们的lo设备是有头部操作集的,eth_header_ops,结构如下

/*
 * Header operations table for Ethernet devices. dev_hard_header() calls
 * the .create hook to build the link-layer header, and
 * neigh_resolve_output() tests .cache before initialising the header cache.
 */
const struct header_ops eth_header_ops ____cacheline_aligned = {
    .create        = eth_header,               /* builds the Ethernet header */
    .parse        = eth_header_parse,
    .rebuild    = eth_rebuild_header,
    .cache        = eth_header_cache,
    .cache_update    = eth_header_cache_update,
};

可以看见是有create函数的
eth_header在/net/ethernet/eth.c中

/*
 * eth_header - push an Ethernet header onto @skb and fill it in.
 *
 * @skb:   buffer that receives ETH_HLEN bytes of header via skb_push()
 * @dev:   device supplying the default source address and the flags
 * @type:  protocol type; ETH_P_802_3 selects the length form of h_proto
 * @daddr: destination hardware address, or NULL
 * @saddr: source hardware address, or NULL to use dev->dev_addr
 * @len:   stored in h_proto when @type is ETH_P_802_3
 *
 * Returns ETH_HLEN when the destination was filled in (from @daddr, or
 * zeroed for IFF_LOOPBACK / IFF_NOARP devices), and -ETH_HLEN when the
 * destination is left unset.
 */
int eth_header(struct sk_buff *skb, struct net_device *dev,
     unsigned short type,
     const void *daddr, const void *saddr, unsigned len)
{
    struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);

    if (type == ETH_P_802_3)
        eth->h_proto = htons(len);
    else
        eth->h_proto = htons(type);

    /* Source address: the caller's, else the device's own address. */
    memcpy(eth->h_source, saddr ? saddr : dev->dev_addr, ETH_ALEN);

    if (daddr)
        memcpy(eth->h_dest, daddr, ETH_ALEN);
    else if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
        /* Anyway, the loopback-device should never use this function... */
        memset(eth->h_dest, 0, ETH_ALEN);
    else
        return -ETH_HLEN;

    return ETH_HLEN;
}

主要是注意struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
这里会进行数据的压入
压入后的结构如下

[原文此处为示意图]
然后对以太网报头进行初始化,初始化完成后回到neigh_resolve_output中
来到rc = neigh->ops->queue_xmit(skb)
neigh->ops->queue_xmit为dev_queue_xmit
在进入dev_queue_xmit之前我们回到ip_finish_output2中看看有hh时的流程
neigh_hh_output在/include/net/neighbour.h中

/*
 * neigh_hh_output - prepend the cached hardware header to @skb and send it.
 *
 * Copies the aligned cached header to just before skb->data inside a
 * read_seqbegin()/read_seqretry() loop on hh->hh_lock, so the copy is
 * repeated if a retry is signalled, then pushes hh_len bytes onto the
 * skb and returns the result of hh->hh_output().
 */
static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
{
    unsigned seq;
    int hh_len;
    int aligned_len;

    do {
        seq = read_seqbegin(&hh->hh_lock);
        hh_len = hh->hh_len;
        aligned_len = HH_DATA_ALIGN(hh_len);
        memcpy(skb->data - aligned_len, hh->hh_data, aligned_len);
    } while (read_seqretry(&hh->hh_lock, seq));

    /* Expose the header bytes that were just copied in front of data. */
    skb_push(skb, hh_len);

    return hh->hh_output(skb);
}

neigh_hh_output也会对skb进行数据压入的操作然后调用hh_output
在之前hh的结构中hh_output正是dev_queue_xmit
所以最后两边都会回到dev_queue_xmit