首先弹出IP报头然后复位transport_header的位置
结构图如下
(此处原有结构图,图片已缺失)
然后进入到raw_local_deliver中
raw_local_deliver在/net/ipv4/raw.c中
/*
 * Offer a received packet to the raw sockets registered for `protocol`.
 *
 * The bucket index is `protocol` masked with (RAW_HTABLE_SIZE - 1). If that
 * bucket of raw_v4_hashinfo has no socket at all, raw_v4_input() is not
 * called. Returns 1 when the bucket was non-empty and raw_v4_input()
 * reported a match, otherwise 0.
 */
int raw_local_deliver(struct sk_buff *skb, int protocol)
{
	int slot = protocol & (RAW_HTABLE_SIZE - 1);
	struct sock *first = sk_head(&raw_v4_hashinfo.ht[slot]);

	/* If there maybe a raw socket we must check - if not we
	 * don't care less
	 */
	if (!first)
		return 0;

	/* Collapse raw_v4_input()'s result to a strict 0/1 answer. */
	return raw_v4_input(skb, ip_hdr(skb), slot) ? 1 : 0;
}
|
还记得raw_v4_hashinfo么?~ 回顾一下最前面sock的结构图吧 = 3=)/
继续大步往上层走,来到raw_v4_input
raw_v4_input在/net/ipv4/raw.c中
/*
 * Hand a received IPv4 packet to every raw socket in hash bucket `hash`
 * that __raw_v4_lookup() matches against the packet's protocol, source and
 * destination addresses and the receiving device's ifindex.
 *
 * Each matching socket gets its own clone of skb through raw_rcv(); the
 * original skb is not consumed here. Returns 1 if at least one socket
 * matched, 0 otherwise.
 */
static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
{
struct sock *sk;
struct hlist_head *head;
int delivered = 0;
struct net *net;
// Hold the raw_v4_hashinfo read lock for the whole bucket walk.
read_lock(&raw_v4_hashinfo.lock);
// Bucket selected by the caller-supplied hash.
head = &raw_v4_hashinfo.ht[hash];
// Empty bucket: nothing to deliver.
if (hlist_empty(head))
goto out;
net = dev_net(skb->dev);
sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
iph->saddr, iph->daddr,
skb->dev->ifindex);
while (sk)
{
// Set as soon as a socket matches, even if the filter or the clone below rejects it.
delivered = 1;
// ICMP packets must also pass the socket's icmp_filter(); other protocols are always cloned.
if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb))
{
// Every matching socket receives its own clone of the packet.
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
/* Not releasing hash table! */
// A failed clone is silently skipped for this socket.
if (clone)
// Pass the socket and its clone on to raw_rcv().
raw_rcv(sk, clone);
}
// Resume the search after the current socket.
sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
iph->saddr, iph->daddr,
skb->dev->ifindex);
}
out:
read_unlock(&raw_v4_hashinfo.lock);
return delivered;
}
|
__raw_v4_lookup负责匹配sock
__raw_v4_lookup在/net/ipv4/raw.c中
/*
 * Starting at `sk`, walk the socket list and return the first socket that
 * matches, or NULL if none does (or if `sk` is NULL to begin with).
 *
 * A socket matches when all of the following hold:
 *   - it belongs to namespace `net`;
 *   - its inet->num equals `num` (callers in this file pass the IP
 *     protocol number here, not a port);
 *   - its daddr is either unset (0) or equal to `raddr`;
 *   - its rcv_saddr is either unset (0) or equal to `laddr`;
 *   - its sk_bound_dev_if is either unset (0) or equal to `dif`.
 *
 * NOTE(review): sk_bound_dev_if is presumably the interface index the
 * socket was bound to (e.g. via SO_BINDTODEVICE) - confirm.
 */
static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
		unsigned short num, __be32 raddr, __be32 laddr, int dif)
{
	struct hlist_node *node;

	sk_for_each_from(sk, node) {
		struct inet_sock *inet = inet_sk(sk);

		/* Wrong network namespace. */
		if (!net_eq(sock_net(sk), net))
			continue;
		/* Socket was opened for a different protocol number. */
		if (inet->num != num)
			continue;
		/* Socket has a peer address set and it is not the sender. */
		if (inet->daddr && inet->daddr != raddr)
			continue;
		/* Socket has a local address set and it is not the destination. */
		if (inet->rcv_saddr && inet->rcv_saddr != laddr)
			continue;
		/* Socket is tied to a different interface index. */
		if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
			continue;

		return sk; /* gotcha */
	}

	return NULL;
}
|
我不明白sk_bound_dev_if这个参数的用途............ 请大家赐教 T ^T
回到raw_v4_input中
现在要克隆skb, skb_clone负责这个任务
skb_clone在/net/core/skbuff.c中
/*
 * Produce a clone of skb: a second sk_buff header filled in by
 * __skb_clone(). Returns the clone, or NULL if a new header had to be
 * allocated and the allocation failed.
 *
 * Two sources for the clone header:
 *   - if skb is marked SKB_FCLONE_ORIG and the sk_buff located directly
 *     after it in memory is still SKB_FCLONE_UNAVAILABLE, that adjacent
 *     header is used;
 *   - otherwise a header is allocated from skbuff_head_cache using gfp_mask.
 */
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
struct sk_buff *n;
// Candidate clone header: the sk_buff laid out immediately after skb in memory (not the next packet in a queue).
n = skb + 1;
// Use the adjacent header only if skb is flagged as a fast-clone original
// and the adjacent header has not been handed out yet.
if (skb->fclone == SKB_FCLONE_ORIG &&
n->fclone == SKB_FCLONE_UNAVAILABLE)
{
// The reference counter sits directly after the adjacent header.
atomic_t *fclone_ref = (atomic_t *) (n + 1);
n->fclone = SKB_FCLONE_CLONE;
atomic_inc(fclone_ref);
}
else
{
// Otherwise allocate a standalone header from the sk_buff head cache.
n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
if (!n)
return NULL;
// A standalone header is marked SKB_FCLONE_UNAVAILABLE.
n->fclone = SKB_FCLONE_UNAVAILABLE;
}
// Fill in the clone header from skb.
return __skb_clone(n, skb);
}
|
我们在发送ICMP中所申请的skb是没有克隆标志的,所以这里会进入else中,从缓冲区中分配一个新的skb
__skb_clone执行具体的拷贝任务
__skb_clone在/net/core/skbuff.c中
/*
 * Initialise header n as a clone of skb and return n.
 *
 * The head/data/tail/end pointers are copied as-is, so n refers to the same
 * data buffer as skb; the shared-info dataref count is incremented to
 * account for the extra user. Both headers end up with cloned = 1. The
 * clone is detached from any queue and socket and starts with users == 1.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
// C(field): copy one field verbatim from skb to n.
#define C(x) n->x = skb->x
// The clone is not linked into any list.
n->next = n->prev = NULL;
// The clone is not owned by any socket.
n->sk = NULL;
// Copy the header metadata through __copy_skb_header().
__copy_skb_header(n, skb);
C(len);
C(data_len);
C(mac_len);
n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
n->cloned = 1;
n->nohdr = 0;
// The clone starts without a destructor.
n->destructor = NULL;
C(iif);
// Buffer pointers are copied, not the data they point to.
C(tail);
C(end);
C(head);
C(data);
C(truesize);
// Fresh header: exactly one user.
atomic_set(&n->users, 1);
// One more header now references the shared data area.
atomic_inc(&(skb_shinfo(skb)->dataref));
// The original is now flagged as cloned too.
skb->cloned = 1;
return n;
#undef C
}
|
__skb_clone主要拷贝数据方面的内容,各种数据指针和数据长度
回到raw_v4_input,克隆成功后便进入到raw_rcv
raw_rcv在/net/ipv4/raw.c中
/*
 * Deliver one (already cloned) packet to raw socket sk.
 *
 * If the inbound xfrm policy check fails, the socket's drop counter is
 * bumped, the skb is freed and NET_RX_DROP is returned. Otherwise the data
 * pointer is moved back to the network header, so the socket sees the
 * packet starting at the IP header, and the skb is passed to raw_rcv_skb().
 * In that case 0 is returned regardless of raw_rcv_skb()'s result.
 */
int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		nf_reset(skb);

		/* Re-expose everything between the network header and data. */
		skb_push(skb, skb->data - skb_network_header(skb));

		raw_rcv_skb(sk, skb);
		return 0;
	}

	/* Policy rejected the packet: account for it and drop it. */
	atomic_inc(&sk->sk_drops);
	kfree_skb(skb);
	return NET_RX_DROP;
}
|
主要是完成 skb_push(skb, skb->data - skb_network_header(skb))这个任务
执行完成后的结构图如下
(此处原有结构图,图片已缺失)
然后到raw_rcv_skb
raw_rcv_skb在/net/ipv4/raw.c中
/*
 * Queue skb on sk's receive queue via sock_queue_rcv_skb().
 *
 * Returns NET_RX_SUCCESS when the skb was queued. If queuing fails
 * (negative return), the socket's drop counter (sk_drops) is incremented,
 * the skb is freed here, and NET_RX_DROP is returned.
 */
static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
{
	/* Charge it to the socket. */
	int rc = sock_queue_rcv_skb(sk, skb);

	if (rc >= 0)
		return NET_RX_SUCCESS;

	/* Could not queue: count the drop and release the buffer. */
	atomic_inc(&sk->sk_drops);
	kfree_skb(skb);
	return NET_RX_DROP;
}
|
很简单,调用sock_queue_rcv_skb
sock_queue_rcv_skb在/net/core/sock.c中
/*
 * Append skb to sk's receive queue and notify the socket.
 *
 * Returns 0 on success. Returns -ENOMEM when the socket's receive memory
 * plus this skb's truesize would reach sk_rcvbuf, the nonzero result of
 * sk_filter() if it rejects the packet, or -ENOBUFS when
 * sk_rmem_schedule() refuses. On any error the skb is NOT freed here; the
 * caller keeps ownership.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int err = 0;
int skb_len;
/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
number of warnings when compiling with -W --ANK
*/
// Refuse the packet if it would fill the socket's receive buffer budget.
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
(unsigned)sk->sk_rcvbuf)
{
err = -ENOMEM;
goto out;
}
// NOTE(review): presumably runs the packet filter attached to the socket - confirm.
err = sk_filter(sk, skb);
if (err)
goto out;
// NOTE(review): presumably a memory-accounting admission check for truesize bytes - confirm.
if (!sk_rmem_schedule(sk, skb->truesize))
{
err = -ENOBUFS;
goto out;
}
skb->dev = NULL;
// Make sk the owner of skb on the receive side.
skb_set_owner_r(skb, sk);
/* Cache the SKB length before we tack it onto the receive
* queue. Once it is added it no longer belongs to us and
* may be freed by other threads of control pulling packets
* from the queue.
*/
// Remember the length while the skb still belongs to us.
skb_len = skb->len;
// Append the skb to the tail of the socket's receive queue.
skb_queue_tail(&sk->sk_receive_queue, skb);
// Skip the notification if the socket is flagged SOCK_DEAD.
if (!sock_flag(sk, SOCK_DEAD))
// Invoke the socket's data-ready callback with the cached length.
sk->sk_data_ready(sk, skb_len);
out:
return err;
}
|
sk_filter和sk_rmem_schedule的内容不是很明白 T ^T 继续请大家指教
把skb挂接到sock的sk_receive_queue队列之后就调用sk->sk_data_ready
sk->sk_data_ready为sock_def_readable
sock_def_readable在/net/core/sock.c中
/*
 * Data-ready callback: tell waiters that sk has data to read.
 * The `len` argument is not used in this body. All work is done while
 * holding sk_callback_lock for reading.
 */
static void sock_def_readable(struct sock *sk, int len)
{
read_lock(&sk->sk_callback_lock);
// Only touch the wait queue if it exists and has waiters on it.
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
// Wake the tasks sleeping on the socket's wait queue.
wake_up_interruptible_sync(sk->sk_sleep);
// Runs unconditionally (it is not part of the if above): async notification with POLL_IN.
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
read_unlock(&sk->sk_callback_lock);
}
|
唤醒sk_sleep!!! 终于在这里唤醒了读取啊~ 如果这时候sk_sleep在睡眠的话就会被唤醒,从而拿到需要的数据
raw这边走完了,别急,还有icmp那边呢
回到ip_local_deliver_finish中
继续往下走,来到ipprot = rcu_dereference(inet_protos[hash]),这里会根据哈希值拿到协议
我们当然是要ICMP协议的结构icmp_protocol了
icmp_protocol的结构如下
/*
 * Protocol descriptor for ICMP. Its handler is icmp_rcv; no_policy and
 * netns_ok are both set to 1.
 */
static struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.no_policy = 1,
.netns_ok = 1,
};
|
继续往下走,来到ret = ipprot->handler(skb),在这里运行协议的handler函数,也就是icmp_rcv
icmp_rcv在/net/ipv4/icmp.c中
/*
 * Receive handler for ICMP packets.
 *
 * Runs the xfrm policy check, verifies the checksum, pulls the ICMP header
 * off the front of the data and dispatches to the handler registered in
 * icmp_pointers[] for the message type. The skb is always freed before
 * returning: the success path falls through into the drop label. Always
 * returns 0. Paths that jump to error also bump ICMP_MIB_INERRORS.
 */
int icmp_rcv(struct sk_buff *skb)
{
struct icmphdr *icmph;
struct rtable *rt = skb->rtable;
// Inbound xfrm policy check; on failure the packet is only kept if the reverse check below passes.
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
int nh;
if (!(skb->sp && skb->sp->xvec[skb->sp->len - 1]->props.flags &
XFRM_STATE_ICMP))
goto drop;
if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
goto drop;
// Temporarily point the network header past the ICMP header for the reverse check, then restore it.
nh = skb_network_offset(skb);
skb_set_network_header(skb, sizeof(*icmph));
if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
goto drop;
skb_set_network_header(skb, nh);
}
// Count one more received ICMP message.
ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);
// Checksum verification depends on how the skb's checksum state is marked.
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
// A checksum that folds to zero is accepted as-is.
if (!csum_fold(skb->csum))
break;
/* fall through */
case CHECKSUM_NONE:
// Recompute the checksum in software; nonzero means failure.
skb->csum = 0;
if (__skb_checksum_complete(skb))
goto error;
}
// Pull the ICMP header off the front of the data; this fails if the
// packet is too short to hold one.
if (!pskb_pull(skb, sizeof(*icmph)))
goto error;
// Locate the ICMP header.
icmph = icmp_hdr(skb);
// Count the message under its ICMP type.
ICMPMSGIN_INC_STATS_BH(icmph->type);
/*
* 18 is the highest 'known' ICMP type. Anything else is a mystery
*
* RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
* discarded.
*/
// Reject types beyond NR_ICMP_TYPES before indexing icmp_pointers[].
if (icmph->type > NR_ICMP_TYPES)
goto error;
/*
* Parse the ICMP message
*/
// Extra restrictions for packets whose route is flagged broadcast or multicast.
if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
{
struct net *net;
net = dev_net(rt->u.dst.dev);
/*
* RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
* silently ignored (we let user decide with a sysctl).
* RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
* discarded if to broadcast/multicast.
*/
if ((icmph->type == ICMP_ECHO ||
icmph->type == ICMP_TIMESTAMP) &&
net->ipv4.sysctl_icmp_echo_ignore_broadcasts)
{
goto error;
}
// Only echo, timestamp, address and address-reply are accepted here.
if (icmph->type != ICMP_ECHO &&
icmph->type != ICMP_TIMESTAMP &&
icmph->type != ICMP_ADDRESS &&
icmph->type != ICMP_ADDRESSREPLY)
{
goto error;
}
}
// Dispatch to the handler registered for this ICMP type.
icmp_pointers[icmph->type].handler(skb);
// Reached on success as well: the skb is always released here.
drop:
kfree_skb(skb);
return 0;
error:
ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
goto drop;
}
|
xfrm4_policy_check是安全检测模块的,跳过
先看一下我们现在skb的结构,虽然在raw处理中改变了结构,不过呢个是克隆体,不会影响到我们这边的skb
(此处原有结构图,图片已缺失)
然后执行pskb_pull(skb, sizeof(*icmph)),弹出icmp数据包
执行完后数据结构如下
(此处原有结构图,图片已缺失)
为什么这里data不和tail重合呢? 是因为我们在发送的时候把数据大小设置成了64个字节,超过了ICMP包的大小,所以这里是不会到尾端的
到最后的icmp_pointers[icmph->type].handler(skb),我们这里ICMP包类型为8,请求回显,呢么就是到icmp_echo中
icmp_echo在/net/ipv4/icmp.c中
static void icmp_echo(struct sk_buff *skb)
{
struct net *net;
net = dev_net(skb->dst->dev);
//检测是否忽略回显
if (!net->ipv4.sysctl_icmp_echo_ignore_all)
{
struct icmp_bxm icmp_param;
//复制icmp包信息
icmp_param.data.icmph = *icmp_hdr(skb);
//设置icmp包为回显应答
icmp_param.data.icmph.type = ICMP_ECHOREPLY;
icmp_param.skb = skb;
icmp_param.offset = 0;
icmp_param.data_len = skb->len;
icmp_param.head_len = sizeof(struct icmphdr);
//发送icmp包
icmp_reply(&icmp_param, skb);
}
}
|
设置完后来到icmp_reply
icmp_reply在/net/ipv4/icmp.c中
/*
 * Send the ICMP reply described by icmp_param in response to skb.
 *
 * Returns silently, without sending, when ip_options_echo() fails, when
 * the ICMP transmit lock cannot be taken, when the output route lookup
 * fails, or when icmpv4_xrlim_allow() refuses the message. The transmit
 * lock is released on every path that took it.
 */
static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
struct ipcm_cookie ipc;
struct rtable *rt = skb->rtable;
struct net *net = dev_net(rt->u.dst.dev);
struct sock *sk = icmp_sk(net);
struct inet_sock *inet = inet_sk(sk);
__be32 daddr;
// Build the reply's IP options from the request; give up on failure.
if (ip_options_echo(&icmp_param->replyopts, skb))
return;
if (icmp_xmit_lock(sk))
return;
// Zero the checksum field; icmp_push_reply() fills in the real value.
icmp_param->data.icmph.checksum = 0;
// Reuse the request's TOS for the reply.
inet->tos = ip_hdr(skb)->tos;
// The reply's destination is taken from rt_src of the request's route.
daddr = ipc.addr = rt->rt_src;
// No IP options unless replyopts carries some.
ipc.opt = NULL;
// Attach the echoed options if there are any.
if (icmp_param->replyopts.optlen)
{
ipc.opt = &icmp_param->replyopts;
// With the srr option flag set, route towards replyopts.faddr instead.
if (ipc.opt->srr)
daddr = icmp_param->replyopts.faddr;
}
{
struct flowi fl = { .nl_u = { .ip4_u =
{ .daddr = daddr,
.saddr = rt->rt_spec_dst,
.tos = RT_TOS(ip_hdr(skb)->tos) } },
.proto = IPPROTO_ICMP };
security_skb_classify_flow(skb, &fl);
// Look up the output route; rt is overwritten with the result.
if (ip_route_output_key(net, &rt, &fl))
goto out_unlock;
}
if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
icmp_param->data.icmph.code))
// Allowed: build and transmit the reply.
icmp_push_reply(icmp_param, &ipc, rt);
// Drop the reference on the output route (skipped when the lookup failed).
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
}
|
继续来到icmp_push_reply
icmp_push_reply在/net/ipv4/icmp.c中
/*
 * Queue the reply data on the ICMP socket, compute the ICMP checksum over
 * everything queued, and push the pending frames out.
 *
 * If ip_append_data() fails, whatever was queued is flushed and nothing is
 * sent. If it succeeds but the write queue is empty, nothing is sent.
 */
static void icmp_push_reply(struct icmp_bxm *icmp_param,
struct ipcm_cookie *ipc, struct rtable *rt)
{
struct sock *sk;
struct sk_buff *skb;
// ICMP socket of the namespace that owns the route's device.
sk = icmp_sk(dev_net(rt->u.dst.dev));
// Append head_len + data_len bytes to the socket's pending data,
// passing icmp_glue_bits as the callback and icmp_param as its argument.
if (ip_append_data(sk, icmp_glue_bits, icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
ipc, rt, MSG_DONTWAIT) < 0)
// Append failed: discard the socket's pending frames.
ip_flush_pending_frames(sk);
// Otherwise continue only if something is actually on the write queue.
else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL)
{
struct icmphdr *icmph = icmp_hdr(skb);
__wsum csum = 0;
struct sk_buff *skb1;
// Walk every skb on the write queue.
skb_queue_walk(&sk->sk_write_queue, skb1)
{
// Accumulate each skb's partial checksum.
csum = csum_add(csum, skb1->csum);
}
// Copy the prepared ICMP header into the first skb while folding it into the checksum.
csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
(char *)icmph,
icmp_param->head_len, csum);
// Store the final folded checksum in the header.
icmph->checksum = csum_fold(csum);
// Mark the skb CHECKSUM_NONE.
skb->ip_summed = CHECKSUM_NONE;
// Send everything that is pending on the socket.
ip_push_pending_frames(sk);
}
}
|
ip_push_pending_frames,终于把ICMP包发送出去了,然后icmp_rcv又收到了一个ICMP包
不过这次的包类型为0,是回显应答
而类型0的处理函数是icmp_discard
icmp_discard里面是个空函数,什么都不干,到此ICMP包的发送就完成了
大家有没有注意到一个问题呢,就是ICMP处理函数收到ICMP包的话,RAW也会收到ICMP包
呢么在一次PING本机中会有2个ICMP包,一个是请求回显,一个是回显应答,呢么RAW层也会收到2个ICMP包
所以大家在写PING程序的时候一定不要忘记判断ICMP包的类型啊
如果不判断的话,则PING本机收到的第一个ICMP包一定是自己发出去的类型8的请求回显包,而不是类型0的回显应答包
好,现在让我们返回到__skb_recv_datagram中
现在收到了skb后就会返回到raw_recvmsg中拷贝数据
数据的拷贝由skb_copy_datagram_iovec来完成
skb_copy_datagram_iovec在/net/core/datagram.c中
/*
 * Copy `len` bytes of skb's data, starting `offset` bytes in, to the iovec
 * `to`.
 *
 * The data is taken in three stages: the linear head area, then the page
 * fragments, then the skbs chained on frag_list (handled recursively).
 * Returns 0 once all `len` bytes have been copied, or -EFAULT if a copy
 * fails or the skb runs out of data before `len` reaches zero.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
struct iovec *to, int len)
{
// `start` tracks the skb offset at which the region currently being examined begins.
int start = skb_headlen(skb);
// Bytes of the linear head area that lie at or after `offset`.
int i, copy = start - offset;
/* Copy header. */
if (copy > 0)
{
if (copy > len)
copy = len;
// Copy from the linear area.
if (memcpy_toiovec(to, skb->data + offset, copy))
goto fault;
// Finished if nothing remains to be copied.
if ((len -= copy) == 0)
return 0;
offset += copy;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
{
int end;
BUG_TRAP(start <= offset + len);
// This fragment covers skb offsets [start, end).
end = start + skb_shinfo(skb)->frags[i].size;
if ((copy = end - offset) > 0)
{
int err;
u8 *vaddr;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
struct page *page = frag->page;
if (copy > len)
copy = len;
// Map the page only for the duration of the copy.
vaddr = kmap(page);
err = memcpy_toiovec(to, vaddr + frag->page_offset +
offset - start, copy);
kunmap(page);
if (err)
goto fault;
if (!(len -= copy))
return 0;
offset += copy;
}
start = end;
}
// Finally walk the skbs chained on frag_list.
if (skb_shinfo(skb)->frag_list)
{
struct sk_buff *list = skb_shinfo(skb)->frag_list;
for (; list; list = list->next)
{
int end;
BUG_TRAP(start <= offset + len);
// This chained skb covers offsets [start, end).
end = start + list->len;
if ((copy = end - offset) > 0)
{
if (copy > len)
copy = len;
// Recurse with an offset relative to the chained skb.
if (skb_copy_datagram_iovec(list,
offset - start,
to, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
}
start = end;
}
}
// Every region was examined; succeed only if nothing is left to copy.
if (!len)
return 0;
fault:
return -EFAULT;
}
|
我们拷贝的数据一次完成,大小刚好,所以不会到下面的for循环和if中的
然后是memcpy_toiovec
memcpy_toiovec在/net/core/iovec.c中
/*
 * Copy `len` bytes from the kernel buffer `kdata` into the user buffers
 * described by the iovec array starting at `iov`.
 *
 * Entries with iov_len == 0 are skipped. Each entry that receives data is
 * updated in place: its iov_base is advanced and its iov_len reduced by
 * the number of bytes written. Returns 0 when all bytes were copied, or
 * -EFAULT as soon as copy_to_user() reports a failure.
 */
int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
{
	/* Move on to the next iovec entry after every pass. */
	for (; len > 0; iov++) {
		int chunk;

		/* Nothing fits into an exhausted entry. */
		if (!iov->iov_len)
			continue;

		/* Copy no more than the entry can hold or than is left. */
		chunk = min_t(unsigned int, iov->iov_len, len);
		if (copy_to_user(iov->iov_base, kdata, chunk))
			return -EFAULT;

		/* Advance the source and shrink the remaining byte count. */
		kdata += chunk;
		len -= chunk;

		/* Consume the used part of the user buffer. */
		iov->iov_len -= chunk;
		iov->iov_base += chunk;
	}

	return 0;
}
|
这里的拷贝也是一次完成,到这里~ 所有的4个部分就都完成了
用户层也终于拿到了数据,不过请注意哈,这个数据是包含IP层的,所以在PING程序中分析收到的ICMP包前,一定要先取得IP层数据的大小,跳过IP层才能拿到ICMP包数据的起始地址
笔记就到这里了,不知道大家对TCP/IP协议栈是否也有了一份自己的理解呢?~ = 3=)/