文章详情

  • 游戏榜单
  • 软件榜单
关闭导航
热搜榜
热门下载
热门标签
php爱好者> php文档>022 mm/swapfile.c

022 mm/swapfile.c

时间:2009-03-27  来源:hylpro

2006-8-9 
mm/swapfile.c

linux可以指定交换分区也可以指定交换文件(专门的分区效率高). 交换设备分为两种:块
设备和交换文件.
这里交换设备是交换用的设备文件和普通文件的通称,希望不要引起混淆.每个交换设备在
内核中都有一个swap_info_struct与之对应:

/*
 * Per-device swap descriptor: one entry describes one enabled swap
 * partition or swap file.
 */
struct swap_info_struct {
unsigned int flags; /* state bits, e.g. SWP_USED, SWP_WRITEOK */
kdev_t swap_device;
spinlock_t sdev_lock; /* presumably the lock taken by swap_device_lock() -- TODO confirm */
struct dentry * swap_file;
struct vfsmount *swap_vfsmnt;
unsigned short * swap_map; /* per-page reference counts for this device (bounded by SWAP_MAP_MAX) */
/* swap_map holds `max` entries */

/* Cluster allocator state; one cluster is SWAPFILE_CLUSTER pages */
unsigned int lowest_bit; /* with highest_bit, bounds the index range that may still hold free pages */
unsigned int highest_bit;
unsigned int cluster_next; /* next page index to try inside the current cluster */
unsigned int cluster_nr; /* pages still to be handed out from the current cluster */

int prio; /* swap priority; larger values sit nearer the head of the swap list */
int pages; /* number of usable pages (nr_good_pages in sys_swapon) */
unsigned long max; /* one past the highest page index; derived from the swap header in sys_swapon */
int next; /* next entry on swap list */
};

这个结构保存在一个数组中:
struct swap_info_struct swap_info[MAX_SWAPFILES];

所有可用的swap设备按照优先级从高到低组成一个链表,其表头是:
struct swap_list_t swap_list = {-1, -1};

这是一个存储在数组中的链表,next域是整数,-1代表空指针.
/*
 * Head of the priority-ordered list of swap devices.  Both fields are
 * indices into swap_info[]; -1 means "none".
 */
struct swap_list_t {
int head; /* head of priority-ordered swapfile list */
int next; /* swapfile to be used next */
};
head就是表头,优先级最高. swap_list.next指向下次分配swap page页面时应优先考虑的
swap设备.

系统中可能存在几个swap设备,优先级可能不同,也可能相同.分配swap page的时候在最高
优先级的swap设备中轮换分配.只有当所有的高优先级的交换设备上的空间都耗尽了,才启用
低优先级的交换设备.

交换设备有一个固定的头,其大小是一个PAGE_SIZE. 2.4支持两种格式的swap设备:
/*
 * Layout of the first page (PAGE_SIZE bytes) of a swap area.  The two
 * header formats overlay the same page; the trailing magic string tells
 * them apart.
 */
union swap_header {
struct
{
char reserved[PAGE_SIZE - 10];
char magic[10]; /* version 1: "SWAP-SPACE", version 2: "SWAPSPACE2" */
} magic;
struct
{ /* layout used by version 2 */
char bootbits[1024]; /* Space for disklabel etc. */
unsigned int version; /* sub-version, only 1 now */
unsigned int last_page; /* index of the last page on the swap device */
unsigned int nr_badpages;
unsigned int padding[125];
unsigned int badpages[1];
} info;
};
1)version 1
结构很简单,swap header的末尾是10字节的version magic 字符,剩下的部分是一个位图,置
1的位对应的此交换设备上的page可用,置0则不可用.

2)version 2
version 2中2.4只支持subversion(swap_header.info.version)为1的交换分区.相对于ver1
增加了几个有用的数据,比如最大页面号,等.其结构如下:
预留1024字节用于disk label(引导扇区,赫赫,要从swap引导么!),然后跟着,subversion,最大
页面号,预留页面个数,padding,预留(bad page)数组.

在一个交换设备内部,采用簇来管理分配,见swap_info_struct的相关数据.一个簇含有SWAPFILE_
CLUSTER(256)个页面. 每次分配的时候,尽量在当前簇中分配,如果失败,寻找下一个完整的空簇,如
还是失败,就放弃簇分配方式.

簇分配算法的核心函数是:(注意一下簇分配所用到的swap info中的变量)
/*
* Allocate one swap page from the given swap device and set its
* reference count to `count`.  Pages are handed out in clusters to
* reduce fragmentation.
*
* Returns the offset of the allocated page, or 0 when no free page
* was found.
*/
static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count)
{
unsigned long offset;
/*
* Cluster ---- a run of SWAPFILE_CLUSTER consecutive free pages
*/

/*
* Pages inside [lowest_bit, highest_bit] may be free,
*
* pages outside that range are never free.
*/
/*
* We try to cluster swap pages by allocating them
* sequentially in swap. Once we've allocated
* SWAPFILE_CLUSTER pages this way, however, we resort to
* first-free allocation, starting a new cluster. This
* prevents us from scattering swap pages all over the entire
* swap partition, so that we reduce overall disk seek times
* between swap pages. -- sct */
if (si->cluster_nr) { // the current cluster still has pages worth trying
while (si->cluster_next <= si->highest_bit) {
offset = si->cluster_next++;
if (si->swap_map[offset])
continue;
si->cluster_nr--;
goto got_page;
}
}
si->cluster_nr = SWAPFILE_CLUSTER;

/* try to find an empty (even not aligned) cluster. */
offset = si->lowest_bit; // start looking for a cluster at the lowest candidate index
check_next_cluster:
if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
{
int nr; // index being probed inside the candidate cluster
for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
if (si->swap_map[nr])
{
offset = nr+1;
goto check_next_cluster;
}
/* We found a completly empty cluster, so start
* using it.
*/
goto got_page;
}
/* No run of SWAPFILE_CLUSTER free pages exists, so fall back to
* a fine-grained scan that checks the candidate range one page
* at a time.
*/
for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
if (si->swap_map[offset])
continue;
got_page:
if (offset == si->lowest_bit)
si->lowest_bit++; // shrink the candidate range from below
if (offset == si->highest_bit)
si->highest_bit--;
si->swap_map[offset] = count;
nr_swap_pages--;
si->cluster_next = offset+1; // swap_map index of the next page to try
// within this cluster
return offset;
}
return 0;
}

swap设备轮换分配算法:
/*
* Allocate a swap page with initial reference count `count`.
*
* Allocation prefers the highest-priority devices and rotates among
* devices of equal priority to spread the I/O load.  A lower-priority
* device is only used once every device ahead of it has failed to
* provide a page.
*
* Returns the new swap entry, or an entry whose val is 0 when no page
* could be allocated or `count` is out of range.
*/
swp_entry_t __get_swap_page(unsigned short count)
{
struct swap_info_struct * p;
unsigned long offset;
swp_entry_t entry;
int type, wrapped = 0; /* wrapped: already restarted once from the list head */

entry.val = 0; /* Out of memory */
if (count >= SWAP_MAP_MAX)
goto bad_count;
swap_list_lock();
type = swap_list.next;
if (type < 0)
goto out;
if (nr_swap_pages == 0)
goto out;

while (1) {
p = &swap_info[type];
if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
swap_device_lock(p);
offset = scan_swap_map(p, count);
swap_device_unlock(p);
if (offset) {
/* Success: the next allocation should start at the following
* device of the same priority, if there is one. */
entry = SWP_ENTRY(type,offset);

type = swap_info[type].next; /* successor on the list */
if (type < 0 || /* end of list */
p->prio != swap_info[type].prio ){ /* priority drops */
swap_list.next = swap_list.head; /* restart from the head */
} else {
swap_list.next = type;
}
goto out;
}
}

/* This device is not writable or has no free page */
type = p->next; /* move on to the next device */

if (!wrapped) {
if (type < 0 || p->prio != swap_info[type].prio) {
type = swap_list.head; /* retry from the highest-priority device first */
wrapped = 1;
}
} else /* already wrapped: keep walking down into lower priorities */
if (type < 0) /* every device has been tried */
goto out; /* out of swap space */
}
out:
swap_list_unlock();
return entry;

bad_count:
printk(KERN_ERR "get_swap_page: bad count %hd from %p\n",
count, __builtin_return_address(0));
/* swap_list_lock() has not been taken on this path, so return
* directly instead of going through the unlock at `out`. */
return entry;
}

交换页面的释放算法就极为的简单了:__swap_free,代码略.
get_swaparea_info为proc文件系统提供数据,略.is_swap_partition,si_swapinfo,
swap_duplicate,swap_count,get_swaphandle_info等接口提供一些简单功能,不详细
介绍了.

int valid_swaphandles 在介绍swapin_readahead的时候有提及.其功能是从cluster
边界开始,寻找一个连续可读取范围(范围内的页面都在使用中,且不含坏页/预留页),并将引用计数增1.
/*
* Kernel_lock protects against swap device deletion. Grab an extra
* reference on the swaphandle so that it does not become unused.
*/
/*
* Called from swapin_readahead.
*
* Rounds the entry's offset down to a multiple of (1 << page_cluster),
* stores that start in *offset, then walks forward taking one extra
* reference on each in-use entry.  The walk stops at the end of the
* swap area, at the first unused or bad entry, or after
* (1 << page_cluster) entries.
*
* Returns the number of entries whose reference count was raised.
*/
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
int ret = 0, i = 1 << page_cluster; /* read ahead at most one cluster */
unsigned long toff;
struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;

*offset = SWP_OFFSET(entry);
/* align the read-ahead start down to a cluster boundary */
toff = *offset = (*offset >> page_cluster) << page_cluster;

swap_device_lock(swapdev);
/* starting at the cluster boundary, collect a contiguous readable run */
do {

/* Don't read-ahead past the end of the swap area */
if (toff >= swapdev->max)
break;
/* Don't read in unused or bad pages */
if (!swapdev->swap_map[toff])
break;
if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
break;
swapdev->swap_map[toff]++;
toff++;
ret++;
} while (--i);
swap_device_unlock(swapdev);
return ret;
}

剩下的部分是系统调用,sys_swapon和sys_swapoff. swapon就是打开交换设备,建立swap
info结构,读取swap header以设置swap info.
/*
* Written 01/25/92 by Simmule Turner, heavily changed by Linus.
*
* The swapon system call
*/
/*
* Enable a swap partition or a swap file.
*
* Abridged excerpt: lines consisting of dots mark code that the
* article left out.
*/
asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
{
struct swap_info_struct * p;
struct nameidata nd;
struct inode * swap_inode;
unsigned int type;
int i, j, prev;
int error;
static int least_priority = 0;
union swap_header *swap_header = 0;
int swap_header_version;
int nr_good_pages = 0;
unsigned long maxpages;
int swapfilesize;
struct block_device *bdev = NULL;

if (!capable(CAP_SYS_ADMIN))
return -EPERM;
lock_kernel();

/* find a free swap_info slot and initialise it */
.... // omitted

if (swap_flags & SWAP_FLAG_PREFER) { /* caller supplied a priority */
p->prio =
(swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
} else {
p->prio = --least_priority;
}

/* look up the file or device */
error = user_path_walk(specialfile, &nd);
if (error)
goto bad_swap_2;

p->swap_file = nd.dentry;
p->swap_vfsmnt = nd.mnt;
swap_inode = nd.dentry->d_inode;
error = -EINVAL;

if (S_ISBLK(swap_inode->i_mode)) { /* block device, i.e. a swap partition */
kdev_t dev = swap_inode->i_rdev;
struct block_device_operations *bdops;

/* force the device block size to PAGE_SIZE */
p->swap_device = dev;
set_blocksize(dev, PAGE_SIZE);

/* open the swap device */
bdev = swap_inode->i_bdev;
bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode));
if (bdops) bdev->bd_op = bdops;

error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP);
if (error)
goto bad_swap_2;
set_blocksize(dev, PAGE_SIZE);

error = -ENODEV;
if (!dev || (blk_size[MAJOR(dev)] &&
!blk_size[MAJOR(dev)][MINOR(dev)]))
goto bad_swap;
error = -EBUSY;
/* refuse a device that is already in use as swap */
for (i = 0 ; i < nr_swapfiles ; i++) {
if (i == type)
continue;
if (dev == swap_info[i].swap_device)
goto bad_swap;
}
swapfilesize = 0;
if (blk_size[MAJOR(dev)])
swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
>> (PAGE_SHIFT - 10); /* blk_size is kept in units of 1024 bytes */
} else if (S_ISREG(swap_inode->i_mode)) { /* regular file used as swap */
error = -EBUSY;
for (i = 0 ; i < nr_swapfiles ; i++) {
if (i == type || !swap_info[i].swap_file)
continue;
if (swap_inode == swap_info[i].swap_file->d_inode)
goto bad_swap;
}
swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
} else
goto bad_swap;

/* read the swap header from the device/file */
swap_header = (void *) __get_free_page(GFP_USER);
if (!swap_header) {
printk("Unable to start swapping: out of memory :-)\n");
error = -ENOMEM;
goto bad_swap;
}

lock_page(virt_to_page(swap_header));
rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header, 1);

/* two on-disk header versions exist */
if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
swap_header_version = 1;
else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
swap_header_version = 2;
else {
printk("Unable to find swap-space signature\n");
error = -EINVAL;
goto bad_swap;
}

switch (swap_header_version) {
case 1: /* swap version 1 */
memset(((char *) swap_header)+PAGE_SIZE-10,0,10); /* clear magic */
/* scan the bitmap for the range and number of usable swap pages */
j = 0;
p->lowest_bit = 0;
p->highest_bit = 0;
for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
if (test_bit(i,(char *) swap_header)) {
if (!p->lowest_bit)
p->lowest_bit = i;
p->highest_bit = i;
p->max = i+1;
j++;
}
}
/* build swap_map (reference counts); pages with a clear bit are marked bad */
nr_good_pages = j;
p->swap_map = vmalloc(p->max * sizeof(short));
if (!p->swap_map) {
error = -ENOMEM;
goto bad_swap;
}
for (i = 1 ; i < p->max ; i++) {
if (test_bit(i,(char *) swap_header))
p->swap_map[i] = 0;
else
p->swap_map[i] = SWAP_MAP_BAD;
}
break;

case 2: /* swap version 2 */
/* Check the swap header's sub-version and the size of
the swap file and bad block lists */
if (swap_header->info.version != 1) {
printk(KERN_WARNING
"Unable to handle swap header version %d\n",
swap_header->info.version);
error = -EINVAL;
goto bad_swap;
}
/* version 2 records the range and page count in the header */
p->lowest_bit = 1;
p->highest_bit = swap_header->info.last_page - 1;
p->max = swap_header->info.last_page;

maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL));
if (p->max >= maxpages)
p->max = maxpages-1;

error = -EINVAL;
if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
goto bad_swap;

/* OK, set up the swap map and apply the bad block list */
if (!(p->swap_map = vmalloc (p->max * sizeof(short)))) {
error = -ENOMEM;
goto bad_swap;
}
/* only the listed bad pages need to be marked */
error = 0;
memset(p->swap_map, 0, p->max * sizeof(short));
for (i=0; i<swap_header->info.nr_badpages; i++) {
int page = swap_header->info.badpages[i];
if (page <= 0 || page >= swap_header->info.last_page)
error = -EINVAL;
else
p->swap_map[page] = SWAP_MAP_BAD;
}
nr_good_pages = swap_header->info.last_page -
swap_header->info.nr_badpages -
1 /* header page */;
if (error)
goto bad_swap;
}

...... // sanity checks omitted
p->swap_map[0] = SWAP_MAP_BAD; /* page 0 holds the swap header and is never allocated */
p->flags = SWP_WRITEOK;
p->pages = nr_good_pages;
swap_list_lock();
nr_swap_pages += nr_good_pages;
printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
nr_good_pages<<(PAGE_SHIFT-10), p->prio);

/* insert swap space into swap_list: */
prev = -1;
for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
if (p->prio >= swap_info[i].prio) {
break;
}
prev = i;
}
p->next = i;
if (prev < 0) {
swap_list.head = swap_list.next = p - swap_info;
} else {
swap_info[prev].next = p - swap_info;
}
swap_list_unlock();
error = 0;
goto out;
..........// failure-path cleanup omitted
return error;
}

相对swap on,swap off就复杂些. 关闭swap设备的时候需要将已经交换到swap设备上的页
面重新读入内存. 然后才能释放交换设备.
思路是这样的: 只有user使用的页面才能够被swap out,内核自己使用的页面是不可以swap
out的,所以遍历用户的页目录,页表,逐项检查pte,如果是not in mem,并且是一个swap entry
则释放他.不过要遍历所有进程的页表.所以速度是不快的.

不过,内核使用的页面真的不能交换吗? maybe吧,不过特殊的是tmpfs(shmem.c),从内核引
用了swap entry. 介绍shmem.c的时候已经提及过那个函数了:shmem_unuse.参考 shmem.c的分
析.

/*
* Disable the given swap device, reading every page that still lives
* on it back into memory first.
*
* Abridged excerpt: lines consisting of dots mark code that the
* article left out.
*/
asmlinkage long sys_swapoff(const char * specialfile)
{
struct swap_info_struct * p = NULL;
struct nameidata nd;
int i, type, prev;
int err;

........//capable check

/* look up the file that is to be disabled */
err = user_path_walk(specialfile, &nd);
if (err)
goto out;

lock_kernel();
prev = -1;
swap_list_lock();
/* walk swap_list to find the matching device (omitted) */
...............

/* unlink the device from the allocation list before shutting it down */
if (prev < 0) {
swap_list.head = p->next;
} else {
swap_info[prev].next = p->next;
}
if (type == swap_list.next) {
/* just pick something that's safe... */
swap_list.next = swap_list.head;
}
nr_swap_pages -= p->pages;
swap_list_unlock();
p->flags = SWP_USED;

err = try_to_unuse(type); // release every swap entry by reading its page back into memory

if (err) { /* try_to_unuse failed (it returns -ENOMEM): put the device back in service */
/* re-insert swap space back into swap_list */
swap_list_lock();
for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
if (p->prio >= swap_info[i].prio)
break;
p->next = i;
if (prev < 0)
swap_list.head = swap_list.next = p - swap_info;
else
swap_info[prev].next = p - swap_info;
nr_swap_pages += p->pages;
swap_list_unlock();
p->flags = SWP_WRITEOK;
goto out_dput;
}

/* release the swap device (omitted) */
............
}

核心函数:

/*
* We completely avoid races by reading each swap page in advance,
* and then search for the process using it. All the necessary
* page table adjustments can then be made atomically.
*/
/*
* Release every in-use swap entry on device `type`.
*
* Returns 0 once no in-use entry remains, or -ENOMEM when
* read_swap_cache() could not supply a page.
*/
static int try_to_unuse(unsigned int type)
{
struct swap_info_struct * si = &swap_info[type];
struct task_struct *p;
struct page *page;
swp_entry_t entry;
int i;

while (1) {
/*
* Find a swap page in use and read it in.
*/
swap_device_lock(si);
for (i = 1; i < si->max ; i++) {
if (si->swap_map[i] > 0 && si->swap_map[i] != SWAP_MAP_BAD) {
/*
* Prevent swaphandle from being completely
* unused by swap_free while we are trying
* to read in the page - this prevents warning
* messages from rw_swap_page_base.
*/
if (si->swap_map[i] != SWAP_MAP_MAX)
si->swap_map[i]++; /* our own reference while we read this page in */
swap_device_unlock(si);
goto found_entry;
}
}
swap_device_unlock(si);
break;

found_entry:
entry = SWP_ENTRY(type, i);

/* Get a page for the entry, using the existing swap
cache page if there is one. Otherwise, get a clean
page and read the swap into it. */
page = read_swap_cache(entry);
if (!page) { /* per the original annotation: entry no longer in use, or allocation failed */
swap_free(entry); /* drop the reference taken in the scan above */
return -ENOMEM; /* we held a reference, so this is treated as an allocation failure */
}

/* Take the page out of the swap cache if it is in there */
if (PageSwapCache(page))
delete_from_swap_cache(page); /* the entry is about to be freed */

read_lock(&tasklist_lock);
for_each_task(p)
unuse_process(p->mm, entry, page); /* page is in memory: make every process drop the swap entry */
read_unlock(&tasklist_lock);
shmem_unuse(entry, page); /* let shmem drop its reference to the swap entry too */
/* Now get rid of the extra reference to the temporary
page we've been using. */
page_cache_release(page); /* pairs with the reference returned by read_swap_cache */
/*
* Check for and clear any overflowed swap map counts.
*/
swap_free(entry); /* drop the reference taken in the scan above */
swap_list_lock();
swap_device_lock(si);
if (si->swap_map[i] > 0) {
if (si->swap_map[i] != SWAP_MAP_MAX)
printk("VM: Undead swap entry %08lx\n",
entry.val);
nr_swap_pages++;
si->swap_map[i] = 0;
}
swap_device_unlock(si);
swap_list_unlock();
}
return 0;
}

shmem_unuse我们就不提了.看unuse_process,这个函数遍历进程的vma,再遍历每个vma涉及到的
pmd,page table, pte.这种遍历我们见过很多了.只看看unuse_pte,
/*
* The swap entry has been read in advance into `page`.  If the pte at
* `dir` still holds that swap entry, point it at the page instead and
* drop one reference on the entry.  Nothing is returned.
*
* Always set the resulting pte to be nowrite (the same as COW pages
* after one process has exited). We don't know just how many PTEs will
* share this swap entry, so be cautious and let do_wp_page work out
* what to do if a write is requested later.
*/
static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
pte_t *dir, swp_entry_t entry, struct page* page)
{
pte_t pte = *dir;

if (pte_none(pte))
return;
if (pte_present(pte)) {
/* If this entry is swap-cached, then page must already
hold the right address for any copies in physical
memory */
if (pte_page(pte) != page)
return;
/* We will be removing the swap cache in a moment, so... */
ptep_mkdirty(dir);
return;
}

/* A non-present, non-empty pte holds a swap entry; only act on ours */
if (pte_to_swp_entry(pte).val != entry.val) /* some other swap entry: leave it alone */
return;
set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
get_page(page); /* this mapping now holds a reference on the page */
++vma->vm_mm->rss;
}

over. 2006.8.9

相关阅读 更多 +
排行榜 更多 +
房间毁灭模拟器最新版

房间毁灭模拟器最新版

休闲益智 下载
街头追逐者最新版

街头追逐者最新版

休闲益智 下载
弓箭手2内置作弊菜单

弓箭手2内置作弊菜单

休闲益智 下载