网络协议规定,ip头部必须使用检验和并被检验,tcp必须使用检验和,udp可以选择是否使用检验和。
1.ip头部检验和
ip头部检验和只检验ip头部,而不包含数据部分。这样做的目的是能减少计算量,加速ip协议的处理,在路由时很有用。
1.1 ip发送时的检验和计算
/*
 * Excerpt of ip_build_xmit(): only the statement that fills in the IP
 * header checksum on the transmit path is reproduced. The rest of the
 * body, including the declaration of iph, is not shown here.
 */
int ip_build_xmit(struct sock *sk,
		  int getfrag (const void *,
			       char *,
			       unsigned int,
			       unsigned int),
		  const void *frag,
		  unsigned length,
		  struct ipcm_cookie *ipc,
		  struct rtable *rt,
		  int flags)
{
	/* Checksum iph->ihl 32-bit words of header and store it in iph->check. */
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
/*
 * ip_fast_csum - one's-complement checksum over an IP header (i386 asm).
 * @iph: start of the header
 * @ihl: header length counted in 32-bit words
 *
 * Sums @ihl 32-bit words with end-around carry, folds the 32-bit sum to
 * 16 bits and returns its complement. A header whose checksum field is
 * already correct therefore yields 0.
 *
 * If @ihl <= 4 the code jumps straight to the exit label and the value
 * returned is NOT a checksum (it is just the low half of the first word);
 * callers must reject such headers themselves.
 *
 * The asm text is written as concatenated string literals (a string literal
 * may not span source lines in standard C), and the "memory" clobber tells
 * the compiler that the header bytes are read through @iph, so pending
 * stores to the header must be completed before the asm runs.
 */
static inline unsigned short ip_fast_csum(unsigned char * iph,
					  unsigned int ihl) {
	unsigned int sum;

	__asm__ __volatile__(
		"movl (%1), %0\n\t"		/* sum = word 0 */
		"subl $4, %2\n\t"
		"jbe 2f\n\t"			/* ihl <= 4: invalid, bail out */
		"addl 4(%1), %0\n\t"		/* + word 1 */
		"adcl 8(%1), %0\n\t"		/* + word 2 */
		"adcl 12(%1), %0\n"		/* + word 3 */
		"1:\tadcl 16(%1), %0\n\t"	/* + word 4 (and onwards as %1 advances) */
		"lea 4(%1), %1\n\t"		/* advance 4 bytes; lea leaves CF intact */
		"decl %2\n\t"			/* any words left? decl leaves CF intact */
		"jne 1b\n\t"
		"adcl $0, %0\n\t"		/* add the last carry */
		"movl %0, %2\n\t"
		"shrl $16, %0\n\t"
		"addw %w2, %w0\n\t"		/* upper half + lower half */
		"adcl $0, %0\n\t"		/* add the carry of the fold */
		"notl %0\n"			/* complement */
		"2:\n"
	/* Since the input registers which are loaded with iph and ihl
	   are modified, we must also specify them as outputs, or gcc
	   will assume they contain their original values. */
	: "=r" (sum), "=r" (iph), "=r" (ihl)
	: "1" (iph), "2" (ihl)
	: "memory");
	return(sum);
}
1.2 ip接收时的检验和计算
/*
 * Excerpt of ip_rcv(): only the header sanity checks are reproduced.
 * The declaration of iph, the inhdr_error label and the rest of the body
 * are not shown here.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
{
	if (skb->len < sizeof(struct iphdr)	/* shorter than a basic IP header */
	    || skb->len < (iph->ihl<<2))	/* shorter than the header length the packet claims (ihl words * 4 bytes) */
		goto inhdr_error;
	if (iph->ihl < 5
	    || iph->version != 4
	    || ip_fast_csum((u8 *)iph, iph->ihl) != 0)	/* header checksum: a valid header sums to 0 */
		goto inhdr_error;
}
计算结果不为0,出错
1.3 ip转发时的检验和计算
/*
 * Excerpt of ip_forward(): only the TTL decrement is reproduced.
 * The declaration of iph and the rest of the body are not shown here.
 */
int ip_forward(struct sk_buff *skb)
{
	/* Decrease ttl after skb cow done */
	ip_decrease_ttl(iph);
}
ip转发时需要递减ttl,因此检验和要重新计算
/*
 * ip_decrease_ttl - decrement the TTL and patch the header checksum in place.
 *
 * Instead of re-summing the header, the stored checksum is increased by
 * __constant_htons(0x0100) and, when the 32-bit intermediate reaches
 * 0xFFFF or more, by one extra. The result is truncated back into
 * iph->check. Returns the TTL value after the decrement.
 *
 * The function in 2.2 was invalid, producing wrong result for
 * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625)
 */
static inline int ip_decrease_ttl(struct iphdr *iph)
{
	u32 sum = iph->check;

	sum += __constant_htons(0x0100);
	if (sum >= 0xFFFF)
		sum += 1;	/* wrap-around step; 0xFFFF itself becomes 0 after truncation */
	iph->check = sum;

	return --iph->ttl;
}
假定除去检验和之外的其他字段的检验和为x,则检验和为~x,x+~x=-1,经过ip_fast_csum计算后正好为0.
现在ttl--,相当于x=x-__constant_htons(0x0100),为了保证ip_fast_csum正确,只需(x-__constant_htons(0x0100))+(~x+__constant_htons(0x0100))=-1,所以新的检验和为~x+__constant_htons(0x0100),即check += __constant_htons(0x0100);
当check>=0xfeff时,执行check += __constant_htons(0x0100)后check>=0xffff,且最多只会产生一位进位,需将该进位重新加回check:
iph->check = check + (check>=0xFFFF);
2.udp检验和
udp检验和是可选的:当报文中的检验和字段为0时,表示发送方没有计算检验和,接收方不检验;否则检验。检验时会附加一个伪首部一起计算。如果发送方计算出的检验和恰好为0,则用0xffff代替(0被保留用来表示"未使用检验和")。此时接收方验证的x+~x=-1就成了0xffff+0xffff=0x1FFFE,回卷进位后为0x1+0xfffe=0xffff=-1,仍然正确
2.1 udp发送检验和计算
int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
ufh.uh.len = htons(ulen);
ufh.uh.check = 0;
ufh.iov = msg->msg_iov;
ufh.wcheck = 0;
err = ip_build_xmit(sk,
(sk->no_check == UDP_CSUM_NOXMIT ?
udp_getfrag_nosum ://复制用户数据到内核但不计算检验和
udp_getfrag),//复制用户数据到内核并计算检验和
&ufh, ulen, &ipc, rt, msg->msg_flags);
}
如果sk->no_check设为
/* Note: this must match 'valbool' in sock_setsockopt */
#define UDP_CSUM_NOXMIT 1
则不检验,否则检验.sk->no_check可以用
/*
 * Excerpt of sock_setsockopt(): only the SO_NO_CHECK case is reproduced.
 * The enclosing switch, the declaration of sk and the computation of
 * valbool are not shown here.
 */
int sock_setsockopt(struct socket *sock, int level, int optname,
		    char *optval, int optlen)
{
	case SO_NO_CHECK:
		/* Store the option value in sk->no_check. */
		sk->no_check = valbool;
		break;
}
修改
static int udp_getfrag_nosum(const void *p, char * to, unsigned int offset, unsigned int fraglen)
{
//p指向udp伪首部
//to复制目的地址
//offset从udp首部开始的偏移
//fraglen本次复制的数据长度
//被分片的udp数据包只有第一个分片有udp头,其它的都没有
struct udpfakehdr *ufh = (struct udpfakehdr *)p;//指向udp伪首部
if (offset==0) {//需要复制首部
memcpy(to, ufh, sizeof(struct udphdr));//只复制udp首部
return memcpy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset,
fraglen-sizeof(struct udphdr));
}
return memcpy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr),
fraglen);
}
static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned int fraglen)
{
//参看udp_getfrag_nosum
struct udpfakehdr *ufh = (struct udpfakehdr *)p;
if (offset==0) {
if (csum_partial_copy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset,
fraglen-sizeof(struct udphdr), &ufh->wcheck))//复制并检验数据部分
return -EFAULT;
ufh->wcheck = csum_partial((char *)ufh, sizeof(struct udphdr),
ufh->wcheck);//检验udp首部
ufh->uh.check = csum_tcpudp_magic(ufh->saddr, ufh->daddr,
ntohs(ufh->uh.len),
IPPROTO_UDP, ufh->wcheck);//检验伪首部
if (ufh->uh.check == 0)
ufh->uh.check = -1;
memcpy(to, ufh, sizeof(struct udphdr));
return 0;
}
if (csum_partial_copy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr),
fraglen, &ufh->wcheck))
return -EFAULT;
return 0;
}
/*
 * csum_partial_copy_fromiovecend - copy from an iovec array and checksum.
 * @kdata:  destination buffer
 * @iov:    array of source segments
 * @offset: number of bytes to skip from the start of the iovec array
 * @len:    number of bytes to copy
 * @csump:  in: checksum accumulated so far; out: updated checksum
 *
 * Data is summed in 4-byte groups. When a segment that is not the last one
 * ends in the middle of a group, the leftover 1-3 bytes are copied without
 * being summed and counted in partial_cnt; the group is summed once the next
 * segment has supplied the missing bytes.
 *
 * Returns 0 on success, -EFAULT when copy_from_user() fails, or the error
 * that csum_and_copy_from_user() reports through err. *csump is written
 * only on the success paths.
 */
int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
				   int offset, unsigned int len, int *csump)
{
	int csum = *csump;
	int partial_cnt = 0, err = 0;

	/* Skip over the finished iovecs */
	while (offset >= iov->iov_len)	/* step past whole segments that lie before offset */
	{
		offset -= iov->iov_len;
		iov++;
	}
	while (len > 0)
	{
		u8 *base = iov->iov_base + offset;
		unsigned int copy = min(len, iov->iov_len - offset);	/* bytes to take from this segment */
		offset = 0;
		/* There is a remnant from previous iov. */
		if (partial_cnt)	/* 1, 2 or 3 bytes copied earlier but not summed yet */
		{
			int par_len = 4 - partial_cnt;	/* bytes (3, 2 or 1) still needed to complete the 4-byte group */
			/* iov component is too short ... */
			if (par_len > copy) {	/* this segment cannot complete the group */
				if (copy_from_user(kdata, base, copy))
					goto out_fault;
				kdata += copy;
				base += copy;
				/* partial_cnt + par_len == 4 and par_len > copy, so the total stays below 4 */
				partial_cnt += copy;	/* bytes copied but still unsummed */
				len -= copy;		/* these bytes are done as far as copying goes */
				iov++;			/* this segment is used up */
				if (len)		/* more data to copy: try the next segment */
					continue;
				/* No data left: sum the incomplete group as it is and finish. */
				*csump = csum_partial(kdata - partial_cnt,
						      partial_cnt, csum);
				goto out;
			}
			if (copy_from_user(kdata, base, par_len))	/* fetch the bytes that complete the group */
				goto out_fault;
			/* Sum the group made of the previous segment's tail and this segment's head. */
			csum = csum_partial(kdata - partial_cnt, 4, csum);
			kdata += par_len;
			base += par_len;
			copy -= par_len;
			len -= par_len;
			partial_cnt = 0;
		}
		if (len > copy)	/* this is not the last piece of data */
		{
			partial_cnt = copy % 4;	/* bytes beyond the last whole 4-byte group */
			if (partial_cnt)	/* not a multiple of 4 */
			{
				copy -= partial_cnt;
				/* Copy the trailing bytes now; they are summed with the next segment. */
				if (copy_from_user(kdata + copy, base + copy,
						   partial_cnt))
					goto out_fault;
			}
		}
		if (copy) {
			/* Copy the leading part and sum it in the same pass. */
			csum = csum_and_copy_from_user(base, kdata, copy,
						       csum, &err);
			if (err)
				goto out;
		}
		len -= copy + partial_cnt;	/* bytes copied in this round */
		kdata += copy + partial_cnt;	/* advance the destination pointer */
		iov++;
	}
	*csump = csum;
out:
	return err;
out_fault:
	err = -EFAULT;
	goto out;
}
函数声明:asmlinkage unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum);
定义位于 arch/i386/lib/checksum.S:
/*
2006.9.13
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* IP/TCP/UDP checksumming routines
*
* Authors: Jorge Cwik, <[email protected]>
* Arnt Gulbrandsen, <[email protected]>
* Tom May, <[email protected]>
* Pentium Pro/II routines:
* Alexander Kjeldaas <[email protected]>
* Finn Arne Gangstad <[email protected]>
* Lots of code moved from tcp.c and ip.c; see those files
* for more names.
*
* Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
* handling.
* Andi Kleen, add zeroing on error
* converted to pure assembler
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/config.h>
#include <asm/errno.h>
/*
* computes a partial checksum, e.g. for TCP/UDP fragments
计算部分检验和,例如针对TCP/UDP分片
*/
/*
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
*/
/* Emit into .text, apply .align 4 and export the csum_partial symbol. */
.text
.align 4
.globl csum_partial
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM /* generic variant; the PentiumII/PPro variant is in the #else branch */
/*
 * Experiments with Ethernet and SLIP connections show that buff
 * is aligned on either a 2-byte or 4-byte boundary. We get at
 * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
 * Fortunately, it is easy to convert 2-byte alignment to 4-byte
 * alignment for the unrolled loop.
 */
csum_partial:
/* No frame pointer is set up: after the two pushes the arguments sit at 12/16/20(%esp). */
pushl %esi
pushl %ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: unsigned char *buff
testl $2, %esi # Check alignment (bit 1 of the address).
jz 2f # Jump if alignment is ok.
subl $2, %ecx # Alignment uses up two bytes.
jae 1f # Jump if we had at least two bytes.
addl $2, %ecx # ecx was < 2. Deal with it in the tail code.
jmp 4f
1: movw (%esi), %bx # consume the leading 16-bit word
addl $2, %esi
addw %bx, %ax
adcl $0, %eax # add the carry back in
2:
movl %ecx, %edx # save len
shrl $5, %ecx # number of 32-byte blocks
jz 2f # fewer than 32 bytes left
testl %esi, %esi # test clears CF before the adcl chain starts
1: movl (%esi), %ebx
adcl %ebx, %eax # eax was loaded with sum above
movl 4(%esi), %ebx
adcl %ebx, %eax
movl 8(%esi), %ebx
adcl %ebx, %eax
movl 12(%esi), %ebx
adcl %ebx, %eax
movl 16(%esi), %ebx
adcl %ebx, %eax
movl 20(%esi), %ebx
adcl %ebx, %eax
movl 24(%esi), %ebx
adcl %ebx, %eax
movl 28(%esi), %ebx
adcl %ebx, %eax
/* 32 bytes consumed: advance the pointer (lea and dec leave CF untouched) */
lea 32(%esi), %esi
dec %ecx
jne 1b
adcl $0, %eax # add the last carry
2: movl %edx, %ecx # restore len to handle what is left below 32 bytes
andl $0x1c, %edx # bytes left in whole dwords (bits 2..4)
je 4f # no whole dword left, only 0-3 tail bytes
shrl $2, %edx # This clears CF; edx = number of dwords
3: adcl (%esi), %eax # one dword per iteration
lea 4(%esi), %esi
dec %edx
jne 3b
adcl $0, %eax
4: andl $3, %ecx # 0-3 tail bytes
jz 7f # nothing left, done
cmpl $2, %ecx
jb 5f # exactly 1 byte left
movw (%esi),%cx # 2 or 3 bytes left: take a word
leal 2(%esi),%esi
je 6f # exactly 2 bytes (flags are still those of cmpl)
shll $16,%ecx # 3 bytes: move the word to the upper half, the lower half becomes 0
5: movb (%esi),%cl # last byte goes into cl (in the 1-byte case the rest of ecx is 0)
6: addl %ecx,%eax
adcl $0, %eax
7:
popl %ebx
popl %esi
ret
#else
/*
 * Version for PentiumII/PPro
 *
 * unsigned int csum_partial(const unsigned char *buf, int len, unsigned int sum)
 *
 * len is split into three parts: the number of 128-byte blocks (ecx), the
 * bytes in whole dwords below 128 (ebx, handled first by jumping into the
 * middle of the unrolled adcl table) and the last 0-3 bytes.
 *
 * Fix relative to the original listing: on the 2-byte-aligned entry path a
 * len of 0 used to fall through into the 1-byte case and add a byte that is
 * not part of the data; len == 0 now leaves the sum unchanged.
 */
csum_partial:
pushl %esi
pushl %ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: const unsigned char *buf
testl $2, %esi # is bit 1 of the address set (2-byte but not 4-byte aligned)?
jnz 30f
10: # main path; 4-byte aligned provided buf was at least 2-byte aligned
movl %ecx, %edx # save len
movl %ecx, %ebx
andl $0x7c, %ebx # bytes in whole dwords below 128 (bits 2..6)
shrl $7, %ecx # number of 128-byte blocks
addl %ebx,%esi # point past the partial chunk; the table uses negative offsets
shrl $2, %ebx # number of dwords in the partial chunk
negl %ebx
lea 45f(%ebx,%ebx,2), %ebx # 45f - 3*dwords: entry point into the adcl table (3 bytes per entry)
testl %esi, %esi # test clears CF before the adcl chain starts
jmp *%ebx # handle the partial chunk first
# Handle 2-byte-aligned regions
20: addw (%esi), %ax
lea 2(%esi), %esi
adcl $0, %eax
jmp 10b
30: subl $2, %ecx
ja 20b # len > 2
je 32f # len == 2
addl $2, %ecx # len was 0 or 1: undo the subtraction
jz 80f # len == 0: nothing to sum, do not touch the buffer
movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
addl %ebx, %eax
adcl $0, %eax
jmp 80f
32:
addw (%esi), %ax # csumming 2 bytes, 2-aligned
adcl $0, %eax
jmp 80f
40:
addl -128(%esi), %eax
adcl -124(%esi), %eax # every entry must stay a 3-byte instruction; the computed jump above relies on it
adcl -120(%esi), %eax
adcl -116(%esi), %eax
adcl -112(%esi), %eax
adcl -108(%esi), %eax
adcl -104(%esi), %eax
adcl -100(%esi), %eax
adcl -96(%esi), %eax
adcl -92(%esi), %eax
adcl -88(%esi), %eax
adcl -84(%esi), %eax
adcl -80(%esi), %eax
adcl -76(%esi), %eax
adcl -72(%esi), %eax
adcl -68(%esi), %eax
adcl -64(%esi), %eax
adcl -60(%esi), %eax
adcl -56(%esi), %eax
adcl -52(%esi), %eax
adcl -48(%esi), %eax
adcl -44(%esi), %eax
adcl -40(%esi), %eax
adcl -36(%esi), %eax
adcl -32(%esi), %eax
adcl -28(%esi), %eax
adcl -24(%esi), %eax
adcl -20(%esi), %eax
adcl -16(%esi), %eax
adcl -12(%esi), %eax
adcl -8(%esi), %eax
adcl -4(%esi), %eax
45:
lea 128(%esi), %esi # advance to the next 128-byte block
adcl $0, %eax
dec %ecx
jge 40b # loop while whole 128-byte blocks remain
/* all whole dwords are done */
movl %edx, %ecx # restore len
50: andl $3, %ecx # 0-3 tail bytes
jz 80f
# Handle the last 1-3 bytes without jumping
notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
# 11111111 11111111 11111111 11111110 for 1 byte
# 11111111 11111111 11111111 11111101 for 2 bytes
# 11111111 11111111 11111111 11111100 for 3 bytes
movl $0xffffff,%ebx # by the shll and shrl instructions; mask is shifted right by 16 (1 byte), 8 (2 bytes) or 0 (3 bytes)
# 00000000 11111111 11111111 11111111 ebx
shll $3,%ecx
# 11111111 11111111 11111111 11110000 for 1 byte
# 11111111 11111111 11111111 11101000 for 2 bytes
# 11111111 11111111 11111111 11100000 for 3 bytes
shrl %cl,%ebx # shift count is cl mod 32
# 00000000 00000000 00000000 11111111 for 1 byte
# 00000000 00000000 11111111 11111111 for 2 bytes
# 00000000 11111111 11111111 11111111 for 3 bytes
andl -128(%esi),%ebx # esi is 4-aligned so should be ok
addl %ebx,%eax
adcl $0,%eax
80:
popl %ebx
popl %esi
ret
#endif
/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
int len, int sum, int *src_err_ptr, int *dst_err_ptr)
*/
/*
* Copy from ds while checksumming, otherwise like csum_partial
*
* The macros SRC and DST specify the type of access for the instruction.
* thus we can call a custom exception handler for all access types.
*
* FIXME: could someone double-check whether I haven't mixed up some SRC and
* DST definitions? It's damn hard to trigger all cases. I hope I got
* them all but there's no guarantee.
*/
/*
 * SRC(insn): emit insn under local label 9999 and add an __ex_table entry
 * that pairs it with fixup label 6001 (the handler for faults on the source).
 */
#define SRC(y...) \
9999: y; \
.section __ex_table, "a"; \
.long 9999b, 6001f ; \
.previous
/*
 * DST(insn): same as SRC, but the entry points at fixup label 6002
 * (the handler for faults on the destination).
 */
#define DST(y...) \
9999: y; \
.section __ex_table, "a"; \
.long 9999b, 6002f ; \
.previous
/* Apply .align 4 and export the csum_partial_copy_generic symbol. */
.align 4
.globl csum_partial_copy_generic
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
/*
 * Generic variant. After "subl $4,%esp" and three pushes the return address
 * is at ARGBASE(%esp) and the arguments follow at ARGBASE+4 onwards;
 * FP(%esp) is the scratch slot reserved by the subl.
 */
#define ARGBASE 16
#define FP 12
csum_partial_copy_generic:
subl $4,%esp
pushl %edi
pushl %esi
pushl %ebx
movl ARGBASE+16(%esp),%eax # sum
movl ARGBASE+12(%esp),%ecx # len
movl ARGBASE+4(%esp),%esi # src
movl ARGBASE+8(%esp),%edi # dst
testl $2, %edi # Check alignment.
jz 2f # Jump if alignment is ok.
subl $2, %ecx # Alignment uses up two bytes.
jae 1f # Jump if we had at least two bytes (len >= 2).
addl $2, %ecx # ecx was < 2. Deal with it.
jmp 4f
SRC(1: movw (%esi), %bx )
addl $2, %esi
DST( movw %bx, (%edi) )
addl $2, %edi
addw %bx, %ax
adcl $0, %eax
2:
movl %ecx, FP(%esp) # save the remaining length in the slot reserved by subl $4,%esp
shrl $5, %ecx # number of 32-byte blocks
jz 2f # fewer than 32 bytes left
testl %esi, %esi # test clears CF before the adcl chain starts
SRC(1: movl (%esi), %ebx )
SRC( movl 4(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, (%edi) )
adcl %edx, %eax
DST( movl %edx, 4(%edi) )
SRC( movl 8(%esi), %ebx )
SRC( movl 12(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 8(%edi) )
adcl %edx, %eax
DST( movl %edx, 12(%edi) )
SRC( movl 16(%esi), %ebx )
SRC( movl 20(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 16(%edi) )
adcl %edx, %eax
DST( movl %edx, 20(%edi) )
SRC( movl 24(%esi), %ebx )
SRC( movl 28(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 24(%edi) )
adcl %edx, %eax
DST( movl %edx, 28(%edi) )
lea 32(%esi), %esi
lea 32(%edi), %edi
dec %ecx
jne 1b # more 32-byte blocks to go
adcl $0, %eax
2: movl FP(%esp), %edx # restore the saved length
movl %edx, %ecx
andl $0x1c, %edx # bytes left in whole dwords (bits 2..4)
je 4f # no whole dword left below 32 bytes
shrl $2, %edx # This clears CF; edx = number of dwords
SRC(3: movl (%esi), %ebx )
adcl %ebx, %eax
DST( movl %ebx, (%edi) )
lea 4(%esi), %esi
lea 4(%edi), %edi
dec %edx
jne 3b
adcl $0, %eax
4: andl $3, %ecx
jz 7f # no tail bytes (1-3) left
cmpl $2, %ecx
jb 5f # exactly 1 byte left
SRC( movw (%esi), %cx )
leal 2(%esi), %esi
DST( movw %cx, (%edi) )
leal 2(%edi), %edi
je 6f
shll $16,%ecx # move the word into the upper half
SRC(5: movb (%esi), %cl )
DST( movb %cl, (%edi) )
6: addl %ecx, %eax # which half the word sits in makes no difference once the sum is folded to 16 bits
adcl $0, %eax
7:
5000:
# Exception handler:
.section .fixup, "ax"
6001:
movl ARGBASE+20(%esp), %ebx # src_err_ptr
movl $-EFAULT, (%ebx) # report the fault on the source
# zero the complete destination - computing the rest
# is too much work
movl ARGBASE+8(%esp), %edi # dst
movl ARGBASE+12(%esp), %ecx # len
xorl %eax,%eax
/*
 * NOTE(review): this store has no exception-table entry of its own.
 * Presumably dst is kernel memory whenever the source (user memory) can
 * fault - confirm against the callers.
 */
rep ; stosb
jmp 5000b
6002:
movl ARGBASE+24(%esp), %ebx # dst_err_ptr
movl $-EFAULT,(%ebx) # report the fault on the destination
/* NOTE(review): presumably the kernel-to-user direction, where the source is kernel memory - confirm. */
jmp 5000b
.previous
popl %ebx
popl %esi
popl %edi
popl %ecx # equivalent to addl $4,%esp
ret
#else
/* Version for PentiumII/PPro */
/*
 * ROUND1(x)/ROUND(x): load the dword at x(%esi), add it to the sum and
 * store it at x(%edi). ROUND1 starts a carry chain with addl, ROUND
 * continues it with adcl.
 */
#define ROUND1(x) \
SRC(movl x(%esi), %ebx ) ; \
addl %ebx, %eax ; \
DST(movl %ebx, x(%edi) ) ;
#define ROUND(x) \
SRC(movl x(%esi), %ebx ) ; \
adcl %ebx, %eax ; \
DST(movl %ebx, x(%edi) ) ;
/* Three registers are pushed, so the return address is at ARGBASE(%esp). */
#define ARGBASE 12
csum_partial_copy_generic:
pushl %ebx
pushl %edi
pushl %esi
movl ARGBASE+4(%esp),%esi #src
movl ARGBASE+8(%esp),%edi #dst
movl ARGBASE+12(%esp),%ecx #len
movl ARGBASE+16(%esp),%eax #sum
# movl %ecx, %edx
movl %ecx, %ebx #len
movl %esi, %edx #src
shrl $6, %ecx # number of 64-byte blocks
andl $0x3c, %ebx # bytes in whole dwords below 64 (bits 2..5)
negl %ebx # kept as a (negated) byte count, not shifted down to dwords
subl %ebx, %esi
subl %ebx, %edi
lea -1(%esi),%edx # esi - 1
andl $-32,%edx # mask 0xFFFFFFE0: round down to a 32-byte boundary
lea 3f(%ebx,%ebx), %ebx # 3f - 2*bytes: entry point into the ROUND table (8 code bytes per 4 data bytes)
testl %esi, %esi #clear CF
jmp *%ebx
1: addl $64,%esi
addl $64,%edi
SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) # NOTE(review): presumably touches the data to pull it into the cache; bl is overwritten by the ROUNDs
/* movb -32(%edx),%bl touches the 32 bytes before edx */
/* movb (%edx),%bl touches the 32 bytes starting at edx */
/*
Encoding of ROUND1(-64) followed by ROUND(-60), 8 bytes each:
8b 5e c0 mov 0xffffffc0(%esi),%ebx
01 d8 add %ebx,%eax
89 5f c0 mov %ebx,0xffffffc0(%edi)
8b 5e c4 mov 0xffffffc4(%esi),%ebx
11 d8 adc %ebx,%eax
89 5f c4 mov %ebx,0xffffffc4(%edi)
*/
ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
3: adcl $0,%eax
addl $64, %edx
dec %ecx
jge 1b
4: movl ARGBASE+12(%esp),%edx #len
andl $3, %edx
jz 7f # no tail bytes left
cmpl $2, %edx
jb 5f # exactly 1 byte left
SRC( movw (%esi), %dx )
leal 2(%esi), %esi
DST( movw %dx, (%edi) )
leal 2(%edi), %edi
je 6f
shll $16,%edx
5:
SRC( movb (%esi), %dl )
DST( movb %dl, (%edi) )
6: addl %edx, %eax
adcl $0, %eax
7:
.section .fixup, "ax"
6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
movl $-EFAULT, (%ebx)
# zero the complete destination (computing the rest is too much work)
movl ARGBASE+8(%esp),%edi # dst
movl ARGBASE+12(%esp),%ecx # len
xorl %eax,%eax
rep; stosb
jmp 7b
6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
movl $-EFAULT, (%ebx)
jmp 7b
.previous
popl %esi
popl %edi
popl %ebx
ret
#undef ROUND
#undef ROUND1
#endif
/*
 * csum_tcpudp_magic - finish a TCP/UDP checksum with the pseudo-header.
 *
 * Adds the pseudo-header fields (@saddr, @daddr, @len, @proto) to the
 * partial sum @sum via csum_tcpudp_nofold() and returns the result folded
 * by csum_fold().
 */
static inline unsigned short int csum_tcpudp_magic(unsigned long saddr,
						   unsigned long daddr,
						   unsigned short len,
						   unsigned short proto,
						   unsigned int sum)
{
	unsigned long unfolded;

	unfolded = csum_tcpudp_nofold(saddr, daddr, len, proto, sum);
	return csum_fold(unfolded);
}
/*
 * csum_tcpudp_nofold - add the TCP/UDP pseudo-header to a partial checksum.
 * @saddr: source address
 * @daddr: destination address
 * @len:   length field of the pseudo-header, host byte order
 * @proto: protocol number
 * @sum:   partial sum accumulated so far
 *
 * Adds @daddr, @saddr and a word built from @len and @proto to @sum with
 * end-around carry and returns the unfolded 32-bit result.
 *
 * The asm text is written as concatenated string literals (a string literal
 * may not span source lines in standard C). The address operands are
 * narrowed to 32 bits explicitly so that the 32-bit add instructions are
 * given 32-bit operands even where unsigned long is wider, and the shift is
 * done on an unsigned value so it cannot overflow a signed int.
 */
static inline unsigned long csum_tcpudp_nofold(unsigned long saddr,
					       unsigned long daddr,
					       unsigned short len,
					       unsigned short proto,
					       unsigned int sum)
{
	__asm__(
		"addl %1, %0\n\t"
		"adcl %2, %0\n\t"
		"adcl %3, %0\n\t"
		"adcl $0, %0"		/* add the last carry */
		: "=r" (sum)
		: "g" ((unsigned int)daddr), "g" ((unsigned int)saddr),
		  "g" (((unsigned int)ntohs(len) << 16) + proto * 256),
		  "0" (sum));
	return sum;
}
/*
 * csum_fold - fold a 32-bit partial sum to 16 bits and complement it.
 * @sum: 32-bit one's-complement partial sum
 *
 * The upper halves of both operands are added, so the carry of the 16-bit
 * addition shows up in CF. "adcl $0xffff" then either adds 0x10000 (carry
 * set: the upper half is incremented) or 0xffff (carry clear: the upper
 * half is unchanged because the lower half was 0). The folded value ends up
 * in the upper 16 bits; it is complemented and shifted down on return.
 *
 * The asm text is written as concatenated string literals and the operand
 * notes are C comments: asm-style '#' comments are only valid inside the
 * asm text, not in the C operand list.
 */
static inline unsigned int csum_fold(unsigned int sum)
{
	__asm__(
		"addl %1, %0\n\t"	/* upper halves added; may set CF */
		"adcl $0xffff, %0"	/* propagate CF into the upper half */
		: "=r" (sum)
		: "r" (sum << 16),		/* %1: low 16 bits of sum moved to the upper half */
		  "0" (sum & 0xffff0000)	/* %0: high 16 bits of sum, lower half cleared */
	);
	return (~sum) >> 16;	/* complement, then bring the upper half down */
}
2.2 udp接收检验和计算
/*
 * Excerpt of udp_rcv(): only the checksum set-up and the hand-over to the
 * socket are reproduced. The declarations of uh, ulen, saddr, daddr and sk,
 * the csum_error label and the rest of the body are not shown here.
 */
int udp_rcv(struct sk_buff *skb, unsigned short len)
{
	if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)	/* set up the checksum state first */
		goto csum_error;
	sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);	/* look up the socket */
	if (sk != NULL) {
		udp_queue_rcv_skb(sk, skb);
		sock_put(sk);
		return 0;
	}
}
/*
 * Excerpt of udp_queue_rcv_skb(): only the CONFIG_FILTER checksum step is
 * reproduced; the actual queueing code is not shown here.
 */
static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
{
	/*
	 * Charge it to the socket, dropping if the queue is full.
	 */
#if defined(CONFIG_FILTER)
	if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {	/* a filter is attached and the checksum is still unverified */
		if (__udp_checksum_complete(skb)) {	/* finish the checksum; non-zero means it is bad */
			UDP_INC_STATS_BH(UdpInErrors);
			IP_INC_STATS_BH(IpInDiscards);
			ip_statistics[smp_processor_id()*2].IpInDelivers--;
			kfree_skb(skb);
			return -1;
		}
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
#endif
}
/*
 * udp_checksum_init - prepare checksum verification for a received datagram.
 *
 * - Checksum field is 0: nothing to verify, mark the skb CHECKSUM_UNNECESSARY.
 * - ip_summed is CHECKSUM_HW: verify at once by passing skb->csum and the
 *   pseudo-header fields to udp_check(); a non-zero result is an error.
 * - Otherwise, unless already marked CHECKSUM_UNNECESSARY: seed skb->csum
 *   with the pseudo-header sum so that the rest can be added later.
 *
 * Returns 0 on success and -1 when the immediate verification fails.
 */
static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
			     unsigned short ulen, u32 saddr, u32 daddr)
{
	if (uh->check == 0) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		return 0;
	}

	if (skb->ip_summed == CHECKSUM_HW) {
		if (udp_check(uh, ulen, saddr, daddr, skb->csum))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		return 0;
	}

	if (skb->ip_summed != CHECKSUM_UNNECESSARY)
		skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);

	/* Probably, we should checksum udp header (it should be in cache
	 * in any case) and data in tiny packets (< rx copybreak).
	 */
	return 0;
}
/*
 * __udp_checksum_complete - finish verifying a UDP checksum.
 *
 * Sums skb->len bytes starting at skb->h.raw on top of skb->csum and folds
 * the result; the return value is the folded 16-bit checksum.
 */
static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
{
	unsigned int total = csum_partial(skb->h.raw, skb->len, skb->csum);

	return (unsigned short)csum_fold(total);
}
3 tcp的检验和计算
3.1 tcp发送检验和计算
/*
 * Excerpt of tcp_transmit_skb(): only the checksum step is reproduced.
 * The declarations of tp and th and the rest of the body are not shown here.
 */
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	/* Fill in the checksum through the address-family hook. */
	tp->af_specific->send_check(sk, th, skb->len, skb);	/* per the original note this is tcp_v4_send_check */
}
/*
 * tcp_v4_send_check - compute and store the checksum of an outgoing segment.
 *
 * The TCP header (th->doff 32-bit words) is summed on top of skb->csum, and
 * tcp_v4_check() then adds the pseudo-header and folds the result into
 * th->check.
 */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	unsigned int hdr_sum = csum_partial((char *)th, th->doff<<2, skb->csum);

	th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr, hdr_sum);
}
/*
 * tcp_v4_check - add the TCP pseudo-header to a partial sum and fold it.
 *
 * @th is not used here; @base is the partial sum computed so far.
 */
static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len,
				   unsigned long saddr, unsigned long daddr,
				   unsigned long base)
{
	u16 folded = csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);

	return folded;
}
3.2 tcp接收检验和计算
/*
 * Excerpt of tcp_v4_rcv(): only the checksum set-up, the socket lookup and
 * the dispatch of the segment are reproduced. The declarations of th, sk
 * and ret, the bad_packet label and the rest of the body are not shown here.
 */
int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
{
	/* Reject a too-small data offset, or a checksum that fails in tcp_v4_checksum_init(). */
	if (th->doff < sizeof(struct tcphdr)/4 ||
	    (skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;
	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
	if (!sk->lock.users) {
		if (!tcp_prequeue(sk, skb))	/* NOTE(review): per the original note, the prequeue exists to speed up transfers */
			/* NOTE(review): per the original note, the receive-message path and tcp_rcv_established cooperate on this */
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);	/* queue on the backlog; NOTE(review): per the original note it is processed at release_sock time */
}
/*
 * Excerpt of tcp_v4_do_rcv(): only the established-state fast path is
 * reproduced. The reset label and the rest of the body are not shown here.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}
}
/*
 * Excerpt of tcp_rcv_established(): only the checksum completion step is
 * reproduced. The csum_error label and the rest of the body are not shown
 * here.
 */
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
			struct tcphdr *th, unsigned len)
{
	/* A non-zero result means the checksum did not verify. */
	if (tcp_checksum_complete_user(sk, skb))
		goto csum_error;
}
/*
 * tcp_v4_checksum_init - prepare or perform checksum verification on receive.
 *
 * - ip_summed is CHECKSUM_HW: verify at once using skb->csum.
 * - Segment of at most 76 bytes: sum the whole segment and verify at once.
 * - Larger segment: only store the complemented pseudo-header checksum in
 *   skb->csum; the rest is added later.
 *
 * Returns 0 on success and -1 when an immediate verification fails. After a
 * successful immediate verification the skb is marked CHECKSUM_UNNECESSARY.
 */
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr, skb->csum)) {
			NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
			return -1;
		}
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		return 0;
	}

	if (skb->len > 76) {
		/* Large segment: pseudo-header only for now. */
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
		return 0;
	}

	/* Small segment: checksum everything right away. */
	if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
			 skb->nh.iph->daddr,
			 csum_partial((char *)skb->h.th, skb->len, 0)))
		return -1;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	return 0;
}
/*
 * tcp_checksum_complete_user - complete the checksum unless already verified.
 *
 * Returns 0 when the skb is marked CHECKSUM_UNNECESSARY; otherwise returns
 * 1 if __tcp_checksum_complete_user() reports a non-zero result and 0 if
 * it reports zero.
 */
static __inline__ int
tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_UNNECESSARY)
		return 0;

	return __tcp_checksum_complete_user(sk, skb) != 0;
}
/*
 * __tcp_checksum_complete_user - run __tcp_checksum_complete() on the skb.
 *
 * When sk->lock.users is non-zero the computation is bracketed by
 * local_bh_enable()/local_bh_disable(); otherwise it is called directly.
 * Returns the result of __tcp_checksum_complete().
 */
static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
{
	const int owned = (sk->lock.users != 0);
	int result;

	if (owned)
		local_bh_enable();
	result = __tcp_checksum_complete(skb);
	if (owned)
		local_bh_disable();

	return result;
}
/*
 * __tcp_checksum_complete - finish verifying a TCP checksum.
 *
 * Sums skb->len bytes starting at skb->h.raw on top of skb->csum and folds
 * the result; the return value is the folded 16-bit checksum.
 */
static __inline__ int __tcp_checksum_complete(struct sk_buff *skb)
{
	unsigned int total = csum_partial(skb->h.raw, skb->len, skb->csum);

	return (unsigned short)csum_fold(total);
}