2006-8-9 mm/swapfile.c
linux可以指定交换分区也可以指定交换文件(专门的分区效率高). 交换设备分为两种:块设备和交换文件. 这里"交换设备"是交换用的设备文件和普通文件的通称,希望不要引起混淆. 每个交换设备在内核中都有一个swap_info_struct与之对应:
/*
 * Bookkeeping for one swap device (a swap partition or a swap file).
 */
struct swap_info_struct {
	unsigned int flags;
	kdev_t swap_device;
	spinlock_t sdev_lock;
	struct dentry * swap_file;
	struct vfsmount *swap_vfsmnt;
	unsigned short * swap_map;	/* per-page reference counts for this swap device (see SWAP_MAP_MAX) */
					/* the array has this->max entries */
	/* State for the cluster allocator; one cluster is SWAPFILE_CLUSTER pages. */
	unsigned int lowest_bit;	/* together with highest_bit: index range in which pages may still be free */
	unsigned int highest_bit;
	unsigned int cluster_next;	/* next page to try in the current swap cluster */
	unsigned int cluster_nr;	/* pages left to hand out from the current cluster */
	int prio;			/* swap priority; entries with a LARGER value sit nearer the head of the swap list */
	int pages;			/* nr_good_pages */
	unsigned long max;		/* one past the highest page index; taken from swap_header's last_page for version 2, see sys_swapon */
	int next;			/* next entry on swap list */
};
这个结构保存在一个数组中: struct swap_info_struct swap_info[MAX_SWAPFILES];
所有可用的swap设备按照优先级从高到低组成一个链表,其表头是: struct swap_list_t swap_list = {-1, -1};
这是一个存储在数组中的链表,next域是整数,-1代表空指针. struct swap_list_t { int head; /* head of priority-ordered swapfile list */ int next; /* swapfile to be used next */ }; head就是表头,优先级最高. swap_list.next指向下次分配swap page页面时应优先考虑的 swap设备.
系统中可能存在几个swap设备,优先级可能不同,也可能相同.分配swap page的时候在最高优先级的swap设备中轮换分配.只有当所有的高优先级的交换设备上的空间都耗尽了,才启用低优先级的交换设备.
交换设备有一个固定的头,其大小是一个PAGE_SIZE. 2.4支持两种格式的swap设备: union swap_header { struct { char reserved[PAGE_SIZE - 10]; char magic[10]; /*version 1:"SWAP-SPACE" version 2:"SWAPSPACE2"*/ } magic; struct { /*version 2 使用的结构*/ char bootbits[1024]; /* Space for disklabel etc. */ unsigned int version; /*subversion, only 1 now*/ unsigned int last_page; /* 交换设备上最后一个页面的nr*/ unsigned int nr_badpages; unsigned int padding[125]; unsigned int badpages[1]; } info; }; 1)version 1 结构很简单,swap header的末尾是10字节的version magic 字符,剩下的部分是一个位图,置1的位对应的此交换设备上的page可用,置0则不可用.
2)version 2 version 2中2.4只支持subversion(swap_header.info.version)为1的交换分区.相对于ver1增加了几个有用的数据,比如最大页面号等.其结构如下: 预留1024字节用于disk label(引导扇区,呵呵,要从swap引导么!),然后依次是subversion,最大页面号,坏页(预留页面)个数,padding,坏页(bad page)数组.
在一个交换设备内部,采用簇来管理分配,见swap_info_struct的相关数据.一个簇含有SWAPFILE_CLUSTER(256)个页面. 每次分配的时候,尽量在当前簇中分配,如果失败,寻找下一个完整的空簇,如果还是失败,就放弃簇分配方式,改为逐页查找空闲页面.
簇分配算法的核心函数是:(注意一下簇分配所用到的swap info中的变量)
/*
 * Allocate one swap page from the given swap device and set its
 * reference count to count.  Clustering is used to reduce fragmentation.
 *
 * Returns the offset of the allocated page within the device, or 0 if
 * no free page was found.  On success the allocation window
 * (lowest_bit/highest_bit) and cluster_next are updated and the global
 * nr_swap_pages is decremented.
 */
static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count)
{
	unsigned long offset;
	/*
	 * Cluster: a run of SWAPFILE_CLUSTER consecutive free pages.
	 */

	/*
	 * Pages inside [lowest_bit, highest_bit] may be free,
	 * pages outside that range are never free.
	 */
	/*
	 * We try to cluster swap pages by allocating them
	 * sequentially in swap.  Once we've allocated
	 * SWAPFILE_CLUSTER pages this way, however, we resort to
	 * first-free allocation, starting a new cluster.  This
	 * prevents us from scattering swap pages all over the entire
	 * swap partition, so that we reduce overall disk seek times
	 * between swap pages.  -- sct */
	if (si->cluster_nr) {	/* the current cluster still has pages left: try it first */
		while (si->cluster_next <= si->highest_bit) {
			offset = si->cluster_next++;
			if (si->swap_map[offset])
				continue;
			si->cluster_nr--;
			goto got_page;
		}
	}
	si->cluster_nr = SWAPFILE_CLUSTER;

	/* try to find an empty (even not aligned) cluster. */
	offset = si->lowest_bit;	/* start looking for a cluster at the lowest possibly-free page */
 check_next_cluster:
	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
	{
		int nr;	/* probe index, starting at the first page of the candidate cluster */
		for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
			if (si->swap_map[nr])
			{
				/* page in use: restart the search just past it */
				offset = nr+1;
				goto check_next_cluster;
			}
		/* We found a completly empty cluster, so start
		 * using it.
		 */
		goto got_page;
	}
	/*
	 * No luck: there is no run of free pages, so narrow the search
	 * granularity and look page by page for any free page
	 * ("fine-grained").
	 */
	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
		if (si->swap_map[offset])
			continue;
	got_page:	/* also entered by goto from the two cluster paths above */
		if (offset == si->lowest_bit)
			si->lowest_bit++;	/* shrink the possibly-free range */
		if (offset == si->highest_bit)
			si->highest_bit--;
		si->swap_map[offset] = count;
		nr_swap_pages--;
		si->cluster_next = offset+1;	/* swap_map index of the next page to try in this cluster */
		return offset;
	}
	return 0;
}
swap设备轮换分配算法:
/*
 * Allocate a swap entry whose initial reference count is count.
 *
 * Allocation prefers the highest-priority swap devices and rotates
 * among devices of equal priority to spread the I/O load.  A
 * lower-priority device is used only after every higher-priority
 * device has been tried without success.
 *
 * Returns the new entry.  entry.val == 0 means nothing was allocated:
 * count was >= SWAP_MAP_MAX, the swap list is empty, nr_swap_pages is
 * 0, or every device was tried without success.
 */
swp_entry_t __get_swap_page(unsigned short count)
{
	struct swap_info_struct * p;
	unsigned long offset;
	swp_entry_t entry;
	int type, wrapped = 0;	/* wrapped: set once the search has restarted from the list head */

	entry.val = 0;	/* Out of memory */
	if (count >= SWAP_MAP_MAX)
		goto bad_count;
	swap_list_lock();
	type = swap_list.next;
	if (type < 0)
		goto out;
	if (nr_swap_pages == 0)
		goto out;

	while (1) {
		p = &swap_info[type];
		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
			swap_device_lock(p);
			offset = scan_swap_map(p, count);
			swap_device_unlock(p);
			if (offset) {
				/*
				 * Success.  The next allocation starts at the
				 * following device of the same priority, or
				 * back at the head of the list when this was
				 * the last device of its priority level.
				 */
				entry = SWP_ENTRY(type,offset);

				type = swap_info[type].next;
				if (type < 0 ||		/* end of list */
				    p->prio != swap_info[type].prio) {	/* priority drops */
					swap_list.next = swap_list.head;	/* start over from the head */
				} else {
					swap_list.next = type;
				}
				goto out;
			}
		}

		/* This device gave us nothing (not writable or no free page). */
		type = p->next;	/* look at the next device */

		if (!wrapped) {
			if (type < 0 || p->prio != swap_info[type].prio) {
				/* give the higher-priority devices one more pass first */
				type = swap_list.head;
				wrapped = 1;
			}
		} else {
			/* already wrapped: walk down into lower priorities */
			if (type < 0)	/* every device has been tried */
				goto out;	/* out of swap space */
		}
	}
out:
	swap_list_unlock();
	return entry;

bad_count:
	printk(KERN_ERR "get_swap_page: bad count %hd from %p\n",
	       count, __builtin_return_address(0));
	/*
	 * The count check runs before swap_list_lock() is taken, so return
	 * directly.  Jumping to "out" here would call swap_list_unlock()
	 * without the lock being held.
	 */
	return entry;
}
交换页面的释放算法就极为的简单了:__swap_free,代码略. get_swaparea_info为proc文件系统提供数据,略.is_swap_partition,si_swapinfo, swap_duplicate,swap_count,get_swaphandle_info等接口提供一些简单功能,不详细 介绍了.
int valid_swaphandles 在介绍sapin_readahead,的时候有提及.其功能是从cluster 边界开始,寻找一个连续可读取范围(无坏页,预留页,在使用中),并将引用计数增1.
/*
 * Kernel_lock protects against swap device deletion.  Grab an extra
 * reference on the swaphandle so that it does not become unused.
 */
/*
 * Called from swapin_readahead.
 *
 * Rounds the offset of entry down to a multiple of (1 << page_cluster),
 * stores that start offset in *offset, and walks forward over at most
 * (1 << page_cluster) slots.  The walk stops at the first slot that is
 * past the end of the device, free (count 0) or SWAP_MAP_BAD.  Every
 * slot accepted has its swap_map count incremented.
 *
 * Returns the number of consecutive slots accepted.
 *
 * NOTE(review): the rounded-down start can be slot 0, which sys_swapon
 * marks SWAP_MAP_BAD; in that case the loop breaks at once and 0 is
 * returned - confirm this is intended.
 * NOTE(review): unlike try_to_unuse, the increment below is not guarded
 * against SWAP_MAP_MAX - confirm that the count cannot overflow here.
 */
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
	int ret = 0, i = 1 << page_cluster;	/* read ahead one cluster at a time */
	unsigned long toff;
	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;

	*offset = SWP_OFFSET(entry);
	/* align the read-ahead start to a cluster boundary */
	toff = *offset = (*offset >> page_cluster) << page_cluster;

	swap_device_lock(swapdev);
	/* starting at the cluster boundary, find a contiguous readable range */
	do {

		/* Don't read-ahead past the end of the swap area */
		if (toff >= swapdev->max)
			break;
		/* Don't read in bad or busy pages */
		if (!swapdev->swap_map[toff])
			break;
		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
			break;
		swapdev->swap_map[toff]++;
		toff++;
		ret++;
	} while (--i);
	swap_device_unlock(swapdev);
	return ret;
}
剩下的部分是系统调用,sys_swapon和sys_swapoff. swapon就是打开交换设备,建立swap info结构,读取swap header以设置swap info. /* * Written 01/25/92 by Simmule Turner, heavily changed by Linus. * * The swapon system call */ /* * 加载swap 分区,或者启用swap 文件 */ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) { struct swap_info_struct * p; struct nameidata nd; struct inode * swap_inode; unsigned int type; int i, j, prev; int error; static int least_priority = 0; union swap_header *swap_header = 0; int swap_header_version; int nr_good_pages = 0; unsigned long maxpages; int swapfilesize; struct block_device *bdev = NULL;
if (!capable(CAP_SYS_ADMIN)) return -EPERM; lock_kernel();
/*寻找空闲swap info,初始化swap info*/ .... //略
if (swap_flags & SWAP_FLAG_PREFER) {/*指定了优先级*/ p->prio = (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; } else { p->prio = --least_priority; }
/*找到文件或者设备*/ error = user_path_walk(specialfile, &nd); if (error) goto bad_swap_2;
p->swap_file = nd.dentry; p->swap_vfsmnt = nd.mnt; swap_inode = nd.dentry->d_inode; error = -EINVAL;
if (S_ISBLK(swap_inode->i_mode)) {/*真正的交换设备*/ kdev_t dev = swap_inode->i_rdev; struct block_device_operations *bdops;
/*强制交换设备的block为PAGE_SIZE*/ p->swap_device = dev; set_blocksize(dev, PAGE_SIZE);
/*打开交换设备*/ bdev = swap_inode->i_bdev; bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode)); if (bdops) bdev->bd_op = bdops;
error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP); if (error) goto bad_swap_2; set_blocksize(dev, PAGE_SIZE);
error = -ENODEV; if (!dev || (blk_size[MAJOR(dev)] && !blk_size[MAJOR(dev)][MINOR(dev)])) goto bad_swap; error = -EBUSY; /*是否已经是交换设备*/ for (i = 0 ; i < nr_swapfiles ; i++) { if (i == type) continue; if (dev == swap_info[i].swap_device) goto bad_swap; } swapfilesize = 0; if (blk_size[MAJOR(dev)]) swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] >> (PAGE_SHIFT - 10);/*blk_size 记录的大小是以1024为单位的*/ } else if (S_ISREG(swap_inode->i_mode)) {/*交换文件*/ error = -EBUSY; for (i = 0 ; i < nr_swapfiles ; i++) { if (i == type || !swap_info[i].swap_file) continue; if (swap_inode == swap_info[i].swap_file->d_inode) goto bad_swap; } swapfilesize = swap_inode->i_size >> PAGE_SHIFT; } else goto bad_swap;
/*读入交换设备/文件的交换信息*/ swap_header = (void *) __get_free_page(GFP_USER); if (!swap_header) { printk("Unable to start swapping: out of memory :-)\n"); error = -ENOMEM; goto bad_swap; }
lock_page(virt_to_page(swap_header)); rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header, 1);
/*有两个版本的swap*/ if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) swap_header_version = 1; else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) swap_header_version = 2; else { printk("Unable to find swap-space signature\n"); error = -EINVAL; goto bad_swap; }
switch (swap_header_version) { case 1:/*swap version 1*/ memset(((char *) swap_header)+PAGE_SIZE-10,0,10);/*clear magic*/ /*查找可使用swap 页面的范围和数量*/ j = 0; p->lowest_bit = 0; p->highest_bit = 0; for (i = 1 ; i < 8*PAGE_SIZE ; i++) { if (test_bit(i,(char *) swap_header)) { if (!p->lowest_bit) p->lowest_bit = i; p->highest_bit = i; p->max = i+1; j++; } } /*预留交换页面,设置mem map(引用技术)*/ nr_good_pages = j; p->swap_map = vmalloc(p->max * sizeof(short)); if (!p->swap_map) { error = -ENOMEM; goto bad_swap; } for (i = 1 ; i < p->max ; i++) { if (test_bit(i,(char *) swap_header)) p->swap_map[i] = 0; else p->swap_map[i] = SWAP_MAP_BAD; } break;
case 2: /*swap version 2*/ /* Check the swap header's sub-version and the size of the swap file and bad block lists */ if (swap_header->info.version != 1) { printk(KERN_WARNING "Unable to handle swap header version %d\n", swap_header->info.version); error = -EINVAL; goto bad_swap; } /*version 2的范围和数量有记录*/ p->lowest_bit = 1; p->highest_bit = swap_header->info.last_page - 1; p->max = swap_header->info.last_page;
maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)); if (p->max >= maxpages) p->max = maxpages-1;
error = -EINVAL; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) goto bad_swap;
/* OK, set up the swap map and apply the bad block list */ if (!(p->swap_map = vmalloc (p->max * sizeof(short)))) { error = -ENOMEM; goto bad_swap; } /*查找预留页面即可*/ error = 0; memset(p->swap_map, 0, p->max * sizeof(short)); for (i=0; i<swap_header->info.nr_badpages; i++) { int page = swap_header->info.badpages[i]; if (page <= 0 || page >= swap_header->info.last_page) error = -EINVAL; else p->swap_map[page] = SWAP_MAP_BAD; } nr_good_pages = swap_header->info.last_page - swap_header->info.nr_badpages - 1 /* header page */; if (error) goto bad_swap; }
...... //检查,忽略 p->swap_map[0] = SWAP_MAP_BAD; /*第一个swap page是交换设备信息*/ p->flags = SWP_WRITEOK; p->pages = nr_good_pages; swap_list_lock(); nr_swap_pages += nr_good_pages; printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n", nr_good_pages<<(PAGE_SHIFT-10), p->prio);
/* insert swap space into swap_list: */ prev = -1; for (i = swap_list.head; i >= 0; i = swap_info[i].next) { if (p->prio >= swap_info[i].prio) { break; } prev = i; } p->next = i; if (prev < 0) { swap_list.head = swap_list.next = p - swap_info; } else { swap_info[prev].next = p - swap_info; } swap_list_unlock(); error = 0; goto out; ..........//失败时的clear工作,略 return error; }
相对swap on,swap off就复杂些. 关闭swap设备的时候需要将已经交换到swap设备上的页面重新读入内存, 然后才能释放交换设备. 思路是这样的: 只有user使用的页面才能够被swap out,内核自己使用的页面是不可以swap out的,所以遍历用户的页目录,页表,逐项检查pte,如果是not in mem,并且是一个指向该设备的swap entry,就把pte改为指向已读入内存的页面,并释放这个swap entry. 不过这样要遍历所有进程的页表,所以速度是不快的.
不过,内核使用的页面真的不能交换吗? maybe吧,不过特殊的是tmpfs(shmem.c),从内核引用了swap entry. 介绍shmem.c的时候已经提及过那个函数了:shmem_unuse.参考shmem.c的分析.
/* * 关闭指定交换设备(读入所有交换设备上的page) */ asmlinkage long sys_swapoff(const char * specialfile) { struct swap_info_struct * p = NULL; struct nameidata nd; int i, type, prev; int err;
........//capable check
/*找到要关闭的文件*/ err = user_path_walk(specialfile, &nd); if (err) goto out;
lock_kernel(); prev = -1; swap_list_lock(); /*swap_list遍历在看看要关闭那个设备,略*/ ...............
/*关闭设备前调整一下分配策略*/ if (prev < 0) { swap_list.head = p->next; } else { swap_info[prev].next = p->next; } if (type == swap_list.next) { /* just pick something that's safe... */ swap_list.next = swap_list.head; } nr_swap_pages -= p->pages; swap_list_unlock(); p->flags = SWP_USED;
err = try_to_unuse(type); //释放所有swap entry,将在交换设备上的page读入内存
if (err) {/*由于内存不足关闭失败,重新启用此swap设备*/ /* re-insert swap space back into swap_list */ swap_list_lock(); for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) if (p->prio >= swap_info[i].prio) break; p->next = i; if (prev < 0) swap_list.head = swap_list.next = p - swap_info; else swap_info[prev].next = p - swap_info; nr_swap_pages += p->pages; swap_list_unlock(); p->flags = SWP_WRITEOK; goto out_dput; }
/*释放交换设备*/ ............ }
核心函数:
/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 */
/*
 * Release every in-use entry of swap device number type.
 *
 * Returns 0 once no in-use slot remains in swap_map, or -ENOMEM when
 * read_swap_cache() yields no page.  (Author's note: unless memory runs
 * out, the swap entries can always be released.)
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct * si = &swap_info[type];
	struct task_struct *p;
	struct page *page;
	swp_entry_t entry;
	int i;

	while (1) {
		/*
		 * Find a swap page in use and read it in.
		 */
		swap_device_lock(si);
		for (i = 1; i < si->max ; i++) {
			if (si->swap_map[i] > 0 && si->swap_map[i] != SWAP_MAP_BAD) {
				/*
				 * Prevent swaphandle from being completely
				 * unused by swap_free while we are trying
				 * to read in the page - this prevents warning
				 * messages from rw_swap_page_base.
				 */
				if (si->swap_map[i] != SWAP_MAP_MAX)
					si->swap_map[i]++;	/* we are about to read from this swap page */
				swap_device_unlock(si);
				goto found_entry;
			}
		}
		swap_device_unlock(si);
		break;	/* no in-use slot left: done */

	found_entry:
		entry = SWP_ENTRY(type, i);

		/* Get a page for the entry, using the existing swap
		   cache page if there is one.  Otherwise, get a clean
		   page and read the swap into it. */
		page = read_swap_cache(entry);
		if (!page) {	/* either nobody uses the entry any more or allocation failed, see read_swap_cache */
			swap_free(entry);	/* "unused" is impossible: we took a reference above */
			return -ENOMEM;		/* so it must have been an allocation failure */
		}

		/* Author's note: needed, a process may already have read the data into another page. */
		if (PageSwapCache(page))
			delete_from_swap_cache(page);	/* in that case the page must leave the swap cache entirely */

		read_lock(&tasklist_lock);
		for_each_task(p)
			unuse_process(p->mm, entry, page);	/* the page is in memory: let every process drop the swap entry */
		read_unlock(&tasklist_lock);
		shmem_unuse(entry, page);	/* let shmem drop its reference to the swap entry as well */
		/* Now get rid of the extra reference to the temporary
		   page we've been using. */
		page_cache_release(page);	/* pairs with the reference obtained via read_swap_cache */
		/*
		 * Check for and clear any overflowed swap map counts.
		 */
		swap_free(entry);	/* drop the reference taken in the scan above */
		swap_list_lock();
		swap_device_lock(si);
		if (si->swap_map[i] > 0) {
			if (si->swap_map[i] != SWAP_MAP_MAX)
				printk("VM: Undead swap entry %08lx\n",
								entry.val);
			nr_swap_pages++;
			si->swap_map[i] = 0;
		}
		swap_device_unlock(si);
		swap_list_unlock();
	}
	return 0;
}
shmem_unuse我们就不提了.看unuse_process,这是一个遍进程vma,遍历每个vma涉及到的 pmd,page table, pte.这种遍历我们见过很多了.只看看unuse_pte,
/*
 * The swap entry has been read in advance, and we return 1 to indicate
 * that the page has been used or is no longer needed.
 *
 * Always set the resulting pte to be nowrite (the same as COW pages
 * after one process has exited).  We don't know just how many PTEs will
 * share this swap entry, so be cautious and let do_wp_page work out
 * what to do if a write is requested later.
 *
 * NOTE(review): the function is declared void, so the "return 1"
 * wording above no longer matches the code.
 *
 * Per-pte step of swapoff:
 *  - empty pte: nothing to do;
 *  - present pte: marked dirty only if it already maps page;
 *  - non-present pte holding exactly entry: rewritten to map page, the
 *    swap entry reference is dropped, a page reference is taken and the
 *    mm's rss is incremented.
 * The address parameter is not used in this body.
 */
static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
	pte_t *dir, swp_entry_t entry, struct page* page)
{
	pte_t pte = *dir;

	if (pte_none(pte))
		return;
	if (pte_present(pte)) {
		/* If this entry is swap-cached, then page must already
		   hold the right address for any copies in physical
		   memory */
		if (pte_page(pte) != page)
			return;
		/* We will be removing the swap cache in a moment, so... */
		ptep_mkdirty(dir);
		return;
	}

	/* The pte of a swapped-out page stores the swap entry itself; this comparison relies on that. */
	if (pte_to_swp_entry(pte).val != entry.val)	/* the pte holds a different swap entry: not ours */
		return;
	set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	swap_free(entry);
	get_page(page);	/* this process now uses the page */
	++vma->vm_mm->rss;
}
over. 2006.8.9
|