Linux 1.0 kernel annotations: linux/fs/buffer.c
Date: 2009-05-03  Source: taozhijiangscu
/********************************************
*Created By: 陶治江
*Date: 2009-05-02 20:34:33
********************************************/
#include <stdarg.h>
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
#include <linux/locks.h>
#include <linux/errno.h>
#include <asm/system.h>
#include <asm/io.h>

#ifdef CONFIG_SCSI
#ifdef CONFIG_BLK_DEV_SR
extern int check_cdrom_media_change(int, int);
#endif
#ifdef CONFIG_BLK_DEV_SD
extern int check_scsidisk_media_change(int, int);
extern int revalidate_scsidisk(int, int);
#endif
#endif
#ifdef CONFIG_CDU31A
extern int check_cdu31a_media_change(int, int);
#endif
#ifdef CONFIG_MCD
extern int check_mcd_media_change(int, int);
#endif

static int grow_buffers(int pri, int size);

static struct buffer_head * hash_table[NR_HASH];
static struct buffer_head * free_list = NULL;
static struct buffer_head * unused_list = NULL;
static struct wait_queue * buffer_wait = NULL;

int nr_buffers = 0;
int buffermem = 0;
int nr_buffer_heads = 0;
static int min_free_pages = 20; /* nr free pages needed before buffer grows */
extern int *blksize_size[];
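Before the functions themselves, a quick orientation note: the list below names the struct buffer_head fields this file actually uses (all of these names appear in the code that follows). The types are from my recollection of <linux/fs.h> of this era and should be treated as approximate, not as the authoritative definition.

/* struct buffer_head fields used in this file (approximate types):
 *   char *b_data;                     the block's data area
 *   unsigned long b_size;             block size in bytes
 *   unsigned long b_blocknr;          block number on the device
 *   dev_t b_dev;                      device number (0 = not in use)
 *   unsigned char b_uptodate;         data is valid
 *   unsigned char b_dirt;             data has been modified
 *   unsigned char b_count;            reference count
 *   unsigned char b_lock;             locked for I/O
 *   unsigned char b_req;              an I/O request was issued for it
 *   struct wait_queue *b_wait;        processes sleeping on this buffer
 *   struct buffer_head *b_prev, *b_next;             hash-chain links
 *   struct buffer_head *b_prev_free, *b_next_free;   free/LRU-list links
 *   struct buffer_head *b_this_page;                 ring of buffers sharing one page
 */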

// Wait on the given buffer head (in effect, wait until the buffer is unlocked).
void __wait_on_buffer(struct buffer_head * bh)
{
struct wait_queue wait = { current, NULL };
// Bump the reference count, then add the current process to this buffer head's wait queue.
bh->b_count++;
add_wait_queue(&bh->b_wait, &wait);
repeat: // the same idea as waiting on an inode
current->state = TASK_UNINTERRUPTIBLE;
if (bh->b_lock) {
schedule();
goto repeat;
}
remove_wait_queue(&bh->b_wait, &wait);
bh->b_count--;
current->state = TASK_RUNNING;
}
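One helper that is used everywhere below but defined elsewhere is wait_on_buffer(). As far as I remember it is a small inline in <linux/locks.h> that merely guards the call into __wait_on_buffer(); treat the sketch below as a reconstruction, not the verbatim header:

static inline void wait_on_buffer(struct buffer_head * bh)
{
        if (bh->b_lock)                 /* only sleep if the buffer is actually locked */
                __wait_on_buffer(bh);
}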

/* Call sync_buffers with wait!=0 to ensure that the call does not
   return until all buffer writes have completed.  Sync() may return
   before the writes have finished; fsync() may not. */
static int sync_buffers(dev_t dev, int wait)
{
int i, retry, pass = 0, err = 0;
struct buffer_head * bh;
// Explanation of the pass counter (pass) and the wait flag:
/* One pass for no-wait, three for wait:
0) write out all dirty, unlocked buffers;  (the wait flag can be used to force waiting)
1) write out all dirty buffers, waiting if locked;
2) wait for completion by waiting for all buffers to unlock.
*/
repeat:
retry = 0;
bh = free_list;
// Does this loop itself already make two rounds (nr_buffers*2)?
// Indeed it does -- older write-ups describe the same two passes, reportedly for better efficiency.
for (i = nr_buffers*2 ; i-- > 0 ; bh = bh->b_next_free)
{
if (dev && bh->b_dev != dev)
continue;
if (bh->b_lock)
{
/* Buffer is locked; skip it unless wait is
requested AND pass > 0. */
// We only wait here when wait!=0 && pass!=0, i.e. when the caller forced waiting
// via the wait flag AND we are already past the first pass.
if (!wait || !pass) {
retry = 1;
continue;
}
wait_on_buffer (bh);
}
/* If an unlocked buffer is not uptodate, there has been
an IO error. Skip it. */
// At this point the buffer should already be unlocked; if dirt==0 and uptodate==0
// it is treated as an error.
// Note that a freshly allocated buffer block has neither dirt nor uptodate set,
// so this state may simply mean the buffer is invalid.
if (wait && bh->b_req && !bh->b_lock &&
!bh->b_dirt && !bh->b_uptodate)
{
err = 1;
continue;
}
/* Don't write clean buffers. Don't write ANY buffers
on the third pass. */
if (!bh->b_dirt || pass>=2) // third pass: do not write, only wait for unlock
continue;
bh->b_count++;
ll_rw_block(WRITE, 1, &bh);
bh->b_count--;
retry = 1; // set whenever the low-level write has been issued
}
/* If we are waiting for the sync to succeed, and if any dirty
blocks were written, then repeat; on the second pass, only
wait for buffers being written (do not pass to write any
more buffers on the second pass). */
if (wait && retry && ++pass<=2)
goto repeat;
return err;
}

// Note how differently the two functions below make their final sync_buffers call:
// one is the non-waiting kind, the other is the waiting kind.
void sync_dev(dev_t dev)
{
sync_buffers(dev, 0);
sync_supers(dev);
sync_inodes(dev);
sync_buffers(dev, 0);
}

int fsync_dev(dev_t dev)
{
sync_buffers(dev, 0);
sync_supers(dev);
sync_inodes(dev);
return sync_buffers(dev, 1);
}

asmlinkage int sys_sync(void)
{
sync_dev(0);
return 0;
}

int file_fsync (struct inode *inode, struct file *filp)
{
return fsync_dev(inode->i_dev);
}

// fsync is much stricter than sync: before it returns, all of the file's data must
// have been synchronized with the storage device to guarantee integrity, whereas
// sync gives no such guarantee.
asmlinkage int sys_fsync(unsigned int fd)
{
struct file * file;
struct inode * inode;

if (fd>=NR_OPEN || !(file=current->filp[fd]) || !(inode=file->f_inode))
return -EBADF;
if (!file->f_op || !file->f_op->fsync)
return -EINVAL;
if (file->f_op->fsync(inode,file))
return -EIO;
return 0;
}
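A minimal user-space sketch of the practical difference between the two system calls implemented above (the file name is made up for the example):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/example.dat", O_WRONLY | O_CREAT, 0644); /* hypothetical path */
        if (fd < 0)
                return 1;
        write(fd, "data", 4);
        fsync(fd);      /* ends up in sys_fsync(): waits until this file's data is on the device */
        sync();         /* ends up in sys_sync(): schedules writeback of everything, may return sooner */
        close(fd);
        return 0;
}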

void invalidate_buffers(dev_t dev)
{
int i;
struct buffer_head * bh;

bh = free_list;
for (i = nr_buffers*2 ; --i > 0 ; bh = bh->b_next_free)
{
if (bh->b_dev != dev)
continue;
wait_on_buffer(bh);
if (bh->b_dev == dev)
bh->b_uptodate = bh->b_dirt = bh->b_req = 0;
// mark it invalid -- exactly the state that the err check in sync_buffers looks for
}
}

void check_disk_change(dev_t dev)
{
int i;
struct buffer_head * bh;

switch(MAJOR(dev)){ // the major device number tells us what kind of device this is
case FLOPPY_MAJOR:
if (!(bh = getblk(dev,0,1024)))
return;
i = floppy_change(bh);
brelse(bh);
break;

#if defined(CONFIG_BLK_DEV_SD) && defined(CONFIG_SCSI)
case SCSI_DISK_MAJOR:
i = check_scsidisk_media_change(dev, 0);
break;
#endif
#if defined(CONFIG_BLK_DEV_SR) && defined(CONFIG_SCSI)
case SCSI_CDROM_MAJOR:
i = check_cdrom_media_change(dev, 0);
break;
#endif
#if defined(CONFIG_CDU31A)
case CDU31A_CDROM_MAJOR:
i = check_cdu31a_media_change(dev, 0);
break;
#endif
#if defined(CONFIG_MCD)
case MITSUMI_CDROM_MAJOR:
i = check_mcd_media_change(dev, 0);
break;
#endif
default:
return;
};

if (!i)
return; // i==0: the media has not been changed

printk("VFS: Disk change detected on device %d/%d\n",
MAJOR(dev), MINOR(dev));
for (i=0 ; i<NR_SUPER ; i++)
if (super_blocks[i].s_dev == dev) // release the device's superblock
put_super(super_blocks[i].s_dev);
// invalidate both the inodes and the buffers of this device
invalidate_inodes(dev);
invalidate_buffers(dev);

#if defined(CONFIG_BLK_DEV_SD) && defined(CONFIG_SCSI)
/* This is trickier for a removable hardisk, because we have to invalidate
all of the partitions that lie on the disk. */
if (MAJOR(dev) == SCSI_DISK_MAJOR)
revalidate_scsidisk(dev, 0);
#endif
}

// The hash scheme: for a block device, the device number dev and the block number
// block are enough to locate, via the hash table, the buffer head that caches that block.
#define _hashfn(dev,block) (((unsigned)(dev^block))%NR_HASH)
#define hash(dev,block) hash_table[_hashfn(dev,block)]
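To make the hashing concrete, here is a tiny stand-alone illustration. The table size used below is invented for the demo; the real NR_HASH comes from the kernel headers:

#include <stdio.h>

#define DEMO_NR_HASH 613                                   /* hypothetical table size */
#define demo_hashfn(dev,block) (((unsigned)((dev)^(block)))%DEMO_NR_HASH)

int main(void)
{
        unsigned dev = 0x0301;          /* major 3, minor 1 -- first IDE disk, first partition */
        unsigned block = 1234;
        printf("(dev=0x%x, block=%u) -> bucket %u\n", dev, block, demo_hashfn(dev, block));
        return 0;
}

Buffers whose (dev ^ block) values land in the same bucket are chained through b_next/b_prev, which is exactly the chain find_buffer() walks further down.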

// Remove a buffer head from its hash chain.
static inline void remove_from_hash_queue(struct buffer_head * bh)
{
// the structure is an array of doubly linked lists
if (bh->b_next)
bh->b_next->b_prev = bh->b_prev;
if (bh->b_prev)
bh->b_prev->b_next = bh->b_next;
if (hash(bh->b_dev,bh->b_blocknr) == bh)
hash(bh->b_dev,bh->b_blocknr) = bh->b_next;
// bh is now completely detached from the hash chain
bh->b_next = bh->b_prev = NULL;
}

// Remove a buffer head from the free list.
static inline void remove_from_free_list(struct buffer_head * bh)
{
// The free buffer heads form a circular doubly linked list.
// A simple sanity check: if the list is not circular the structure has been
// corrupted, and the kernel panics.
if (!(bh->b_prev_free) || !(bh->b_next_free))
panic("VFS: Free block list corrupted");
// in a circular doubly linked list both neighbours always exist
bh->b_prev_free->b_next_free = bh->b_next_free;
bh->b_next_free->b_prev_free = bh->b_prev_free;
/* free_list points at the least recently used block; if that is the block being
 * removed, advance the free_list pointer. */
if (free_list == bh)
free_list = bh->b_next_free;
/* the block is now fully detached from the free list */
bh->b_next_free = bh->b_prev_free = NULL;
}

// So a buffer head is kept on both the hash queue and the free list at the same time.
static inline void remove_from_queues(struct buffer_head * bh)
{
remove_from_hash_queue(bh);
remove_from_free_list(bh);
}

// Move bh to the front of the free list; if it is already at the front, just return.
static inline void put_first_free(struct buffer_head * bh)
{
if (!bh || (bh == free_list))
return;
remove_from_free_list(bh); // must be unlinked before it can be re-inserted
bh->b_next_free = free_list;
bh->b_prev_free = free_list->b_prev_free;
free_list->b_prev_free->b_next_free = bh;
free_list->b_prev_free = bh;
free_list = bh; /* free_list now points at this block */
}

static inline void put_last_free(struct buffer_head * bh)
{
if (!bh)
return;
if (bh == free_list) {
free_list = bh->b_next_free; // simply advancing the pointer is enough here -- a subtle trick
return;
}
remove_from_free_list(bh);
bh->b_next_free = free_list;
bh->b_prev_free = free_list->b_prev_free;
free_list->b_prev_free->b_next_free = bh;
free_list->b_prev_free = bh;
}

static inline void insert_into_queues(struct buffer_head * bh)
{
/* add it to the tail of the free list */
bh->b_next_free = free_list;
bh->b_prev_free = free_list->b_prev_free;
free_list->b_prev_free->b_next_free = bh;
free_list->b_prev_free = bh;
/* Insert into the hash table (which requires a device number).  If it cannot be
 * inserted, b_prev and b_next are set to NULL so that a later
 * remove_from_hash_queue() still works correctly. */
bh->b_prev = NULL;
bh->b_next = NULL;
if (!bh->b_dev)
return;
/* insert bh at the head of its hash chain */
bh->b_next = hash(bh->b_dev,bh->b_blocknr);
hash(bh->b_dev,bh->b_blocknr) = bh;
// keep the doubly linked list consistent -- it must never be broken
if (bh->b_next)
bh->b_next->b_prev = bh;
}

// Look up the buffer head for the given device, block and size.  The lookup itself
// uses only the first two parameters; the third (size) serves as a consistency check.
static struct buffer_head * find_buffer(dev_t dev, int block, int size)
{
struct buffer_head * tmp;

for (tmp = hash(dev,block) ; tmp != NULL ; tmp = tmp->b_next)
if (tmp->b_dev==dev && tmp->b_blocknr==block)
if (tmp->b_size == size)
return tmp;
else {
printk("VFS: Wrong blocksize on device %d/%d\n",
MAJOR(dev), MINOR(dev));
return NULL;
}
return NULL;
}

// Much like find_buffer(), but the buffer returned here is ready to use: its
// reference count has been incremented and the function has waited for it to unlock.
struct buffer_head * get_hash_table(dev_t dev, int block, int size)
{
struct buffer_head * bh;

for (;;)
{
if (!(bh=find_buffer(dev,block,size)))
return NULL;
bh->b_count++;
wait_on_buffer(bh);
if (bh->b_dev == dev && bh->b_blocknr == block && bh->b_size == size)
return bh; // note: if we return here, the reference count stays incremented
bh->b_count--;
}
}

void set_blocksize(dev_t dev, int size)
{
int i;
struct buffer_head * bh, *bhnext;

// blksize_size is declared as int *blksize_size[MAX_BLKDEV],
// i.e. an array whose elements are pointers.
if (!blksize_size[MAJOR(dev)])
return;

// This switch looks odd, but the case labels are always matched before default is
// taken; writing it this way just saves a break statement.
// No wonder people say Linux programmers are lazy :-)
switch(size) {
default: panic("Invalid blocksize passed to set_blocksize");
case 512: case 1024: case 2048: case 4096:;
}
// The pointer array is used like a two-dimensional array: the major and minor device
// numbers together select the block size, so different minors of the same major
// device may use different sizes.
// If no size has been set yet and the requested size is the default (BLOCK_SIZE,
// i.e. 1024), just record it and return.
if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
blksize_size[MAJOR(dev)][MINOR(dev)] = size;
return;
} // if the requested value equals the current one, also just return
if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
return;
/* Reaching this point means a non-default size is being set.  Such sizes are still
 * restricted: they must be one of the multiples of 512 accepted above. */
sync_buffers(dev, 2);
blksize_size[MAJOR(dev)][MINOR(dev)] = size;
bh = free_list;
for (i = nr_buffers*2 ; --i > 0 ; bh = bhnext)
{
bhnext = bh->b_next_free;
if (bh->b_dev != dev)
continue;
if (bh->b_size == size)
continue;
// For the given device dev (note: major and minor number together identify it
// uniquely), every buffer head whose size differs from the new value is handled
// here: it is invalidated and removed from the hash chains.
// The commented-out put_first_free() below feels like it should be needed -- why is
// it disabled?  My guess: these buffers now have the wrong size, so even though they
// are free they must not be handed out again.
wait_on_buffer(bh);
if (bh->b_dev == dev && bh->b_size != size)
bh->b_uptodate = bh->b_dirt = 0; // mark it invalid
remove_from_hash_queue(bh);
/* put_first_free(bh); */
}
}

// In the weighting below, the "dirty" flag counts for more than the "locked" flag.
#define BADNESS(bh) (((bh)->b_dirt<<1)+(bh)->b_lock)
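For reference, the four possible values of this weighting (derived directly from the macro above):

/* BADNESS(bh) = b_dirt*2 + b_lock:
 *   clean, unlocked -> 0   best candidate for reuse
 *   clean, locked   -> 1
 *   dirty, unlocked -> 2
 *   dirty, locked   -> 3   worst candidate
 */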

// Get a free buffer block and initialize it for (dev, block).
struct buffer_head * getblk(dev_t dev,int block, int size)
{
struct buffer_head * bh, * tmp;
int buffers;
static int grow_size = 0;

repeat:
bh = get_hash_table(dev, block, size);
if (bh) {
if (bh->b_uptodate && !bh->b_dirt)
put_last_free(bh); // just accessed, so it is now the least free block
return bh;
}
grow_size -= size;
// On the first call grow_size goes negative, so kernel pages can be allocated
// right away to grow the buffer cache.
if (nr_free_pages > min_free_pages && grow_size <= 0) {
if (grow_buffers(GFP_BUFFER, size))
grow_size = PAGE_SIZE;
}
buffers = nr_buffers;
bh = NULL;

// search all buffer blocks for a free one
for (tmp = free_list; buffers-- > 0 ; tmp = tmp->b_next_free)
{
if (tmp->b_count || tmp->b_size != size)
continue;
if (mem_map[MAP_NR((unsigned long) tmp->b_data)] != 1)
continue;
if (!bh || BADNESS(tmp)<BADNESS(bh)) { // the dirty flag outweighs the locked flag
bh = tmp; // remember the best candidate so far, even if it may be locked
if (!BADNESS(tmp)) // neither dirty nor locked: this is the one!
break;
}
}

if (!bh) { // no free buffer could be found
if (nr_free_pages > 5)
if (grow_buffers(GFP_BUFFER, size))
goto repeat;
if (!grow_buffers(GFP_ATOMIC, size)) // last resort: dip into the reserved memory
sleep_on(&buffer_wait);
goto repeat;
}

wait_on_buffer(bh); // it may be locked
if (bh->b_count || bh->b_size != size) // somebody grabbed it while we slept!
goto repeat;
if (bh->b_dirt) {
sync_buffers(0,0);
goto repeat;
}
/* NOTE!! While we slept waiting for this block, somebody else might */
/* already have added "this" block to the cache. check it */
// (It can happen because we may have slept just above, in wait_on_buffer() or sync_buffers().)
if (find_buffer(dev,block,size))
goto repeat;
// initialize the newly chosen buffer block
bh->b_count=1;
bh->b_dirt=0; // initialization: together with b_uptodate=0 this marks the data as invalid
bh->b_uptodate=0;
bh->b_req=0;
remove_from_queues(bh);
bh->b_dev=dev;
bh->b_blocknr=block;
insert_into_queues(bh);
return bh;
}

void brelse(struct buffer_head * buf)
{
if (!buf)
return;
wait_on_buffer(buf);
if (buf->b_count) {
if (--buf->b_count) // still referenced, cannot be released yet
return;
/* Otherwise wake up any waiters and return.  Note that b_count is now 0, and the
 * waiters are most likely processes that need a free buffer block. */
wake_up(&buffer_wait);
return;
}
printk("VFS: brelse: Trying to free free buffer\n");
}

// Read one block from the device (through the cache).
struct buffer_head * bread(dev_t dev, int block, int size)
{
struct buffer_head * bh;

if (!(bh = getblk(dev, block, size))) {
printk("VFS: bread: READ error on device %d/%d\n",
MAJOR(dev), MINOR(dev));
return NULL;
}
if (bh->b_uptodate) // for a read, up-to-date data means no device I/O is needed
return bh;
// Otherwise issue the low-level read, wait for it, and check whether it succeeded.
// On failure the buffer is released and NULL is returned.
ll_rw_block(READ, 1, &bh);
wait_on_buffer(bh);
if (bh->b_uptodate)
return bh;
brelse(bh);
return NULL;
}
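A sketch of how filesystem code in this kernel typically drives the interface above; the block number and block size are placeholders, not taken from any real filesystem:

/* Read one block, use it, release it. */
static void example_reader(dev_t dev)
{
        struct buffer_head * bh;

        bh = bread(dev, 1, 1024);       /* read block 1, assuming 1 KiB blocks */
        if (!bh)
                return;                 /* I/O error */
        /* ... use bh->b_data here; set bh->b_dirt = 1 after modifying it ... */
        brelse(bh);                     /* drop the reference taken by bread() */
}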

// Read-ahead: the block numbers are given as a variable argument list terminated by a
// negative number.  The first block must be read successfully, otherwise an error is
// returned; whether the later blocks get read is left to getblk() and the read-ahead
// request.
struct buffer_head * breada(dev_t dev,int first, ...)
{
va_list args;
unsigned int blocksize;
struct buffer_head * bh, *tmp;

va_start(args,first);

// BLOCK_SIZE is only the default; the per-device setting still has to be checked
blocksize = BLOCK_SIZE;
if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)])
blocksize = blksize_size[MAJOR(dev)][MINOR(dev)];

if (!(bh = getblk(dev, first, blocksize))) { // the first block must be obtained successfully
printk("VFS: breada: READ error on device %d/%d\n",
MAJOR(dev), MINOR(dev));
return NULL;
}
if (!bh->b_uptodate)
ll_rw_block(READ, 1, &bh);
while ((first=va_arg(args,int))>=0)
{
// These are read-ahead blocks, so no real reference is held on them.  getblk()
// (via get_hash_table()) incremented the count, so it is dropped again here.
tmp = getblk(dev, first, blocksize);
if (tmp) {
if (!tmp->b_uptodate)
ll_rw_block(READA, 1, &tmp);
tmp->b_count--;
}
}
va_end(args);
wait_on_buffer(bh);
// only the first block's read is checked, because only the first block is mandatory
if (bh->b_uptodate)
return bh;
brelse(bh);
return (NULL);
}

static void put_unused_buffer_head(struct buffer_head * bh)
{
struct wait_queue * wait;

wait = ((volatile struct buffer_head *) bh)->b_wait;
// zero the whole buffer head structure
memset((void *) bh,0,sizeof(*bh));
// So a buffer head may be entirely zeroed, except that its b_wait queue must not be
// lost; it was saved above and is restored here.  The volatile cast presumably keeps
// the compiler from optimizing these accesses away :-)
((volatile struct buffer_head *) bh)->b_wait = wait;
// unused_list holds all buffer head structures that are not in use;
// it points at the head of that list
bh->b_next_free = unused_list;
unused_list = bh;
}

static void get_more_buffer_heads(void)
{
int i;
struct buffer_head * bh;

// If unused_list is not empty there are spare buffer head structures available,
// so just return.
if (unused_list)
return;

// Otherwise unused_list == NULL: there are no spare buffer heads left, so allocate
// another page of memory and split it up into buffer head structures.
if(! (bh = (struct buffer_head*) get_free_page(GFP_BUFFER)))
return;

// nr_buffer_heads is the global count and is increased accordingly
for (nr_buffer_heads+=i=PAGE_SIZE/sizeof*bh ; i>0; i--) {
bh->b_next_free = unused_list; // chain them into a singly linked list
unused_list = bh++;
}
}

// Return a pointer to an unused buffer head structure.
static struct buffer_head * get_unused_buffer_head(void)
{
struct buffer_head * bh;

get_more_buffer_heads(); // effectively a check that tries to ensure unused_list != NULL
// get_more_buffer_heads() may fail, though, so unused_list must still be tested
if (!unused_list)
return NULL;
bh = unused_list;
unused_list = bh->b_next_free;
/* the necessary initialization */
bh->b_next_free = NULL;
bh->b_data = NULL;
bh->b_size = 0;
bh->b_req = 0;
return bh;
}

// Create buffer blocks of the given size in the page at the given address.  Several
// blocks are created: the whole page is split into equal-sized buffers.  Since size
// presumably matches the device block size, the page divides up without remainder.
static struct buffer_head * create_buffers(unsigned long page, unsigned long size)
{
struct buffer_head *bh, *head;
unsigned long offset;

head = NULL;
offset = PAGE_SIZE;
// Note that offset is an unsigned long: once it drops below zero it wraps around to
// a huge value, and that is what terminates this loop.  A daring trick!
while ((offset -= size) < PAGE_SIZE) {
bh = get_unused_buffer_head();
if (!bh)
goto no_grow;
// b_this_page links together the buffer blocks that share one page (as later code
// confirms), which makes it easy to operate on a whole page of buffers at once
bh->b_this_page = head; //end with null!
head = bh;
bh->b_data = (char *) (page+offset); // the page is carved up from back to front
bh->b_size = size;
}
return head;
no_grow:
/* On failure, walk back over the buffer blocks already created in this page and
 * return their head structures to the unused list.  The page itself was passed in by
 * the caller, so it is not freed here.  Since every head came from
 * get_unused_buffer_head(), this cleanup is straightforward. :-) */
bh = head;
while (bh) {
head = bh;
bh = bh->b_this_page;
put_unused_buffer_head(head);
}
return NULL;
}
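The loop condition in create_buffers() above leans on unsigned wrap-around to terminate. A stand-alone user-space demonstration (page and block sizes are hard-coded for the demo):

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL           /* assumed page size for the demo */

int main(void)
{
        unsigned long offset = DEMO_PAGE_SIZE, size = 1024;
        while ((offset -= size) < DEMO_PAGE_SIZE)          /* same idiom as create_buffers() */
                printf("buffer at offset %lu\n", offset);  /* prints 3072, 2048, 1024, 0 */
        /* the next subtraction wrapped around to a huge value, ending the loop */
        return 0;
}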

// read_buffers: go over the buffer_head array passed in and issue a low-level read
// for every block that is not yet up to date.  This function is fairly robust.
static void read_buffers(struct buffer_head * bh[], int nrbuf)
{
int i;
int bhnum = 0;
// Old C, so the array size is hard-coded; as the callers show, 8 entries are always
// enough to cover one full page.
struct buffer_head * bhr[8];

for (i = 0 ; i < nrbuf ; i++)
{
if (bh[i] && !bh[i]->b_uptodate)
// uptodate==0 means the block needs to be read: bhr collects the buffer heads to
// read and bhnum counts them
bhr[bhnum++] = bh[i];
}
if (bhnum)
// If any blocks need reading, hand them to the underlying device.  It does not
// matter that bhr is a local array: it only has to describe the request while
// ll_rw_block() is being called; the data itself still ends up in the buffer cache.
ll_rw_block(READ, bhnum, bhr);
for (i = 0 ; i < nrbuf ; i++) {
if (bh[i]) { /* even buffers that were not read may still need to be waited on (unlocked) */
wait_on_buffer(bh[i]);
}
}
}

// The buffer that first points to must be page aligned; b is an array of block
// numbers.  The job here is to check whether the buffers for all of the dev blocks
// listed in b live in the same page as the buffer first points to.
static unsigned long check_aligned(struct buffer_head * first, unsigned long address,
dev_t dev, int *b, int size)
{
struct buffer_head * bh[8];
unsigned long page;
unsigned long offset;
int block;
int nrbuf;

page = (unsigned long) first->b_data;

if (page & ~PAGE_MASK) { // the buffer data must be page aligned
brelse(first);
return 0;
}
mem_map[MAP_NR(page)]++;
bh[0] = first;
nrbuf = 1;
for (offset = size ; offset < PAGE_SIZE ; offset += size)
{
block = *++b;
if (!block) // a device block number of 0 means there is no block here
goto no_go;
first = get_hash_table(dev, block, size); // look up the buffer head for this block
if (!first)
goto no_go;
bh[nrbuf++] = first;
// offset keeps increasing, and first is re-pointed at each successive buffer
if (page+offset != (unsigned long) first->b_data) // it must sit at exactly this offset of the same page
goto no_go;
}
// indeed, read_buffers() is only there to make sure the blocks are actually read
read_buffers(bh,nrbuf); /* make sure they are actually read correctly */
while (nrbuf-- > 0)
brelse(bh[nrbuf]);
free_page(address); // the freshly allocated page at address is no longer needed -- the existing buffer page is returned instead
++current->min_flt;
return page;

/* failure path */
no_go:
while (nrbuf-- > 0)
brelse(bh[nrbuf]);
free_page(page);
return 0;
}

// Try to load the buffers for the given device blocks into one single page.
static unsigned long try_to_load_aligned(unsigned long address,
dev_t dev, int b[], int size)
{
struct buffer_head * bh, * tmp, * arr[8];
unsigned long offset;
int * p;
int block;

bh = create_buffers(address, size);
if (!bh)
return 0;
/* do any of the buffers already exist? punt if so.. */
p = b;
for (offset = 0 ; offset < PAGE_SIZE ; offset += size)
{
block = *(p++);
if (!block)
goto not_aligned;
if (find_buffer(dev, block, size)) // a buffer for this block already exists
goto not_aligned;
}
tmp = bh;
p = b;
block = 0;
while (1) {
arr[block++] = bh;
bh->b_count = 1;
bh->b_dirt = 0; // the usual initial values for a fresh buffer head
bh->b_uptodate = 0;
bh->b_dev = dev;
bh->b_blocknr = *(p++); // block number
nr_buffers++;
insert_into_queues(bh);
if (bh->b_this_page)
bh = bh->b_this_page;
else // bh->b_this_page==NULL: this was the last buffer in the page
break;
}
buffermem += PAGE_SIZE;
bh->b_this_page = tmp; // Note: create_buffers() leaves the per-page list NULL-terminated,
// but from here on it is turned into a circular list of the buffers in this page!
mem_map[MAP_NR(address)]++;
read_buffers(arr,block);
while (block-- > 0)
brelse(arr[block]);
++current->maj_flt;
return address;
not_aligned:
while ((tmp = bh) != NULL) // at this point the per-page list is NOT circular yet
{
bh = bh->b_this_page;
put_unused_buffer_head(tmp);
}
return 0;
}

// As the original comment below spells out: if the buffers are already loaded, check
// whether they all sit in one page; if none are loaded yet, try to load them into
// one page.  Typically used so that code pages and their buffers can share a page.
/* Try-to-share-buffers tries to minimize memory use by trying to keep
* both code pages and the buffer area in the same page. This is done by
* (a) checking if the buffers are already aligned correctly in memory and
* (b) if none of the buffer heads are in memory at all, trying to load
* them into memory the way we want them.
*
* This doesn't guarantee that the memory is shared, but should under most
* circumstances work very well indeed (ie >90% sharing of code pages on
* demand-loadable executables).*/
static inline unsigned long try_to_share_buffers(unsigned long address,
dev_t dev, int *b, int size)
{
struct buffer_head * bh;
int block; block = b[0];
if (!block)
return 0;
bh = get_hash_table(dev, block, size);
if (bh)
return check_aligned(bh, address, dev, b, size);
// This also shows what address is for: if the buffers
// already exist, the page at address can simply be freed.
return try_to_load_aligned(address, dev, b, size);
}

// COPYBLK copies size bytes from `from` to `to`.
#define COPYBLK(size,from,to) \
__asm__ __volatile__("rep ; movsl": \
:"c" (((unsigned long) size) >> 2),"S" (from),"D" (to) \
:"cx","di","si") //读取缓冲块到address地方,释放原来的缓冲块

// Read buffer blocks into the page at address and release the buffers afterwards.
unsigned long bread_page(unsigned long address, dev_t dev, int b[], int size, int prot)
{
// The smallest block size is 512, so at most 8 blocks are needed to fill one page.
struct buffer_head * bh[8];
unsigned long where;
int i, j;

if (!(prot & PAGE_RW))
{
where = try_to_share_buffers(address,dev,b,size);
if (where)
return where;
}
++current->maj_flt;
for (i=0, j=0; j<PAGE_SIZE ; i++, j+= size)
{
bh[i] = NULL;
if (b[i])
bh[i] = getblk(dev, b[i], size); // there is a block mapped at this position
}
read_buffers(bh,i);
where = address;
for (i=0, j=0; j<PAGE_SIZE ; i++, j += size,address += size)
{
if (bh[i])
{
if (bh[i]->b_uptodate) // copy the data if it is up to date; the buffer is released either way.
// Evidently this function mainly serves code segments and
// file-system buffers, where ordinary writes are unlikely --
// in short, it is all about speed!
COPYBLK(size, (unsigned long) bh[i]->b_data,address);
brelse(bh[i]);
}
}
return where;
}

// Create one page worth of buffer blocks of the given size; pri is the priority used
// for the page allocation.
static int grow_buffers(int pri, int size)
{
unsigned long page;
struct buffer_head *bh, *tmp;

if ((size & 511) || (size > PAGE_SIZE)) {
printk("VFS: grow_buffers: size = %d\n",size);
return 0;
}
if(!(page = __get_free_page(pri)))
return 0;
bh = create_buffers(page, size);
if (!bh) {
free_page(page);
return 0;
}
tmp = bh;
while (1) {
if (free_list) {
tmp->b_next_free = free_list;
tmp->b_prev_free = free_list->b_prev_free;
free_list->b_prev_free->b_next_free = tmp;
free_list->b_prev_free = tmp;
} else {
tmp->b_prev_free = tmp;
tmp->b_next_free = tmp;
}
free_list = tmp;
++nr_buffers;
if (tmp->b_this_page)
tmp = tmp->b_this_page; // walk the list of buffers within this page
else
break;
}
tmp->b_this_page = bh;
buffermem += PAGE_SIZE;
return 1;
}

// Try to free all buffer blocks in one page; the page in question is the one that
// bh's data happens to live in.
static int try_to_free(struct buffer_head * bh, struct buffer_head ** bhp)
{
unsigned long page;
struct buffer_head * tmp, * p;

*bhp = bh;
page = (unsigned long) bh->b_data;
page &= PAGE_MASK; // the page this buffer block's data lives in
tmp = bh;
do {
if (!tmp)
return 0;
if (tmp->b_count || tmp->b_dirt || tmp->b_lock || tmp->b_wait) // still in use, cannot free
return 0;
tmp = tmp->b_this_page;
} while (tmp != bh);
// Getting here means neither of the returns above fired, so every buffer block in
// this page is idle: the page passes the check.
tmp = bh;
do {
p = tmp;
tmp = tmp->b_this_page;
nr_buffers--;
// keep the caller's scan pointer *bhp valid: if it points at a head being freed, step it back
if (p == *bhp)
*bhp = p->b_prev_free;
remove_from_queues(p);
put_unused_buffer_head(p);
} while (tmp != bh); // back at the starting head
buffermem -= PAGE_SIZE;
free_page(page); // release the page itself
return !mem_map[MAP_NR(page)]; // nonzero only if the page has really become free
}

// priority expresses how urgent the shrinking is: the smaller the value, the more
// aggressive the scan (more buffers are examined per call).
int shrink_buffers(unsigned int priority)
{
struct buffer_head *bh;
int i;

if (priority < 2)
sync_buffers(0,0);
bh = free_list;
i = nr_buffers >> priority;
for ( ; i-- > 0 ; bh = bh->b_next_free) // walk the free list
{
if (bh->b_count || // in use, so certainly cannot be freed
(priority >= 5 &&
mem_map[MAP_NR((unsigned long) bh->b_data)] > 1))
{
put_last_free(bh); // treat it as least free
continue;
}
if (!bh->b_this_page)
continue;
if (bh->b_lock)
if (priority)
continue;
else // memory is badly needed, so wait for the lock
wait_on_buffer(bh);
if (bh->b_dirt)
{
bh->b_count++;
ll_rw_block(WRITEA, 1, &bh);
bh->b_count--;
continue;
}
if (try_to_free(bh, &bh))
return 1;
}
return 0;
}
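A quick arithmetic illustration of how priority scales the scan in shrink_buffers() above (the nr_buffers value is made up):

/* With nr_buffers = 1000:
 *   priority 0 -> scan 1000 >> 0 = 1000 buffers (and sync first, since priority < 2)
 *   priority 3 -> scan 1000 >> 3 =  125 buffers
 *   priority 6 -> scan 1000 >> 6 =   15 buffers
 * Smaller priority values therefore mean a more aggressive shrink.
 */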

// Print counts and status statistics for all buffer blocks in the cache.
void show_buffers(void)
{
struct buffer_head * bh;
int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;

printk("Buffer memory: %6dkB\n",buffermem>>10);
printk("Buffer heads: %6d\n",nr_buffer_heads);
printk("Buffer blocks: %6d\n",nr_buffers);
bh = free_list;
do {
found++;
if (bh->b_lock)
locked++;
if (bh->b_dirt)
dirty++;
if (bh->b_count)
used++, lastused = found;
bh = bh->b_next_free;
} while (bh != free_list);
printk("Buffer mem: %d buffers, %d used (last=%d), %d locked, %d dirty\n",
found, used, lastused, locked, dirty);
}

// Buffer cache initialization.
void buffer_init(void)
{
int i;

if (high_memory >= 4*1024*1024)
min_free_pages = 200;
else
min_free_pages = 20;
// Initialize the hash table.  The hash function reduces its input modulo NR_HASH,
// so every bucket index computed later falls within this table.
for (i = 0 ; i < NR_HASH ; i++)
hash_table[i] = NULL;
free_list = 0;
grow_buffers(GFP_KERNEL, BLOCK_SIZE);
if (!free_list)
panic("VFS: Unable to initialize buffer free list!");
return;
}
Document: http://blogimg.chinaunix.net/blog/upfile2/090503214430.pdf