025 mm/vmscan.c

时间：2009-03-27 来源：hylpro
2006-8-11 
 mm/vmscan.c 

I)概述 

在分析page alloc的时候,分析过linux对内存页面的空闲量有些什么样的要求:

1)可分配页面的保有量要求:inactive_clean+free pages(in buddy pages)
 系统的期望值是freepages.high + inactive_target / 3,inactive_target就是
min((memory_pressure >> INACTIVE_SHIFT), num_physpages / 4).可见期望的保有量有动态的因素在内.因为memory_pressure
是一段时间内的平均值,根据内存需求的不同所期望的保有量也不一样.

而现在的保有量是nr_free_pages() + nr_inactive_clean_pages();
 函数free_shortage,计算期望的可分配页面和现实之差距.如果保有量合格,就看zone中的inbuddy free pages是否比期望值少.
只要有一个保有量不合格,就必须立即加以调整.

2)潜在可分配页面的保有量要求(inactive_shortage):
 潜在可分配页面就是:buddyfree+inactiveclean+inactive_dirty
 期望保有量:freepages.high+inactive_target
 现存量:
 nr_free_pages()+nr_inactive_clean_pages()+nr_inactive_dirty_pages.

保证这两种页面有一个合理的水平,这样在分配内存的时候,(期望)一般不会出现内存紧缺.free_shortage,inactive_shortage这两
个函数不复杂,略.

这个文件就是处理lru,负责页面的回收. 包括映射断裂,逐级的调整页面所在的lru 队列,
直至最后回到buddy 系统. 另外对于dcache,icache, slab系统的空闲页面,在内存紧张的
时候也要进行回收.从而保证空闲内存保有量得到满足.

lru cache的组成,前面已经有叙述了:
 struct list_head active_list;
 struct list_head inactive_dirty_list;
 每个zone 中的inactive clean 队列.

为了完成这些任务,有两个内核线程,kswapd和kreclaimd,见下图. 图中忽略了调用的条件
简单的列出了每个线程的作用:                   
                                                               
                                            

为了更加直观,请看下面的图:(page alloc中也有) 

 

II)swap out

一次扫描一定量的进程页面,从不太忙碌的进程中回收可用页面是保证空闲页面保有量,
实现VM(back store)的一个重要操作.
 swap_out选择一个进程(当然不包括内核),通过try_to_swap_out断开此进程对页面的映
射.根据页面的属性(是否已经在swap,是否是page cache(page->mapping)),采取不同操作.
 进程有一个rss统计,还有一个swap_cnt=(mm->rss >> SWAP_SHIFT),swap cnt代表的是
这个进程在swap out的过程中应该贡献的页面数量.(虽然不一定能找到合适页面)所以swap
cnt大的进程优先被选中.

/*
 * swap_out - pick the mm with the largest swap_cnt and try to swap some of
 * its pages out. The largest swap_cnt marks the mm that is most due for a
 * scan; the actual unmapping is delegated to swap_out_mm() (not shown here).
 *
 * @priority: scan priority; the initial loop counter is
 *            (nr_threads << SWAP_SHIFT) >> priority, with a floor of 1.
 * @gfp_mask: passed through unchanged to swap_out_mm().
 *
 * Returns 0 if no mm could be selected, otherwise the value returned by
 * swap_out_mm() for the selected mm.
 *
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
/* swap_cnt is refreshed as (rss >> SWAP_SHIFT); also scales the loop counter. */
#define SWAP_SHIFT 5
/* Lower bound applied to a freshly refreshed swap_cnt. */
#define SWAP_MIN 8
static int swap_out(unsigned int priority, int gfp_mask)
{
 int counter;
 int __ret = 0;

/*
 * Two selection modes are used. With assign == 0 the mm is picked by its
 * current swap_cnt. With assign == 1 the swap_cnt of every mm with
 * rss > 0 is recomputed from rss while the selection is being made.
 *
 * Per the original annotation, an mm that cannot provide pages gets its
 * swap_cnt cleared (in swap_out_mm, not shown here) so that it is not
 * selected again.
 */
 counter = (nr_threads << SWAP_SHIFT) >> priority;
 if (counter < 1)
 counter = 1;

/*
 * NOTE(review): both arms of the if/else at the bottom of the loop body
 * end in break, so apart from the "goto select" retry the body runs at
 * most once per call as written.
 */
for (; counter >= 0; counter--) { // iteration count is derived from the priority
 struct list_head *p; // (the count expresses how hard we are willing to try)
 unsigned long max_cnt = 0;
 struct mm_struct *best = NULL;
 int assign = 0; /* set to 1 when a pass finds no best mm, which
 triggers the second selection mode */
 int found_task = 0;
 select:
 spin_lock(&mmlist_lock);
 p = init_mm.mmlist.next;
 for (; p != &init_mm.mmlist; p = p->next) {
 struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
 if (mm->rss <= 0)
 continue;
 found_task++;
 /* Refresh swap_cnt? */
 if (assign == 1) { // second mode: recompute swap_cnt while selecting
 mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
 if (mm->swap_cnt < SWAP_MIN)
 mm->swap_cnt = SWAP_MIN;
 }
 if (mm->swap_cnt > max_cnt) {
 max_cnt = mm->swap_cnt;
 best = mm;
 }
 }

/* Make sure it doesn't disappear */
 if (best) 
 atomic_inc(&best->mm_users);
 spin_unlock(&mmlist_lock);

/*
 * We have dropped the tasklist_lock, but we
 * know that "mm" still exists: we are running
 * with the big kernel lock, and exit_mm()
 * cannot race with us.
 */
 if (!best) { /* no best: no mm with rss > 0 has a non-zero swap_cnt */
 if (!assign && found_task > 0) {
 assign = 1;
 goto select;
 }
 break;
 } else {
 __ret = swap_out_mm(best, gfp_mask);
 mmput(best);
 break;
 }
 }
 return __ret;
}

然后看看try_to_swap_out,其他swap out相关的函数不再分析.
/*
 * try_to_swap_out - examine one pte of an mm and, if the page it maps has
 * aged out, drop the mapping.
 *
 * (rss: the resident set of pages of the mm.)
 *
 * Returns 0 when the caller should keep scanning, 1 otherwise. In the code
 * below, 1 is returned only when mm->swap_cnt is already zero.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, 
 unsigned long address, pte_t * page_table, 
 int gfp_mask)
{

pte_t pte;
 swp_entry_t entry;
 struct page * page;
 int onlist;

pte = *page_table;
 if (!pte_present(pte))
 goto out_failed;
 page = pte_page(pte);
 if ((!VALID_PAGE(page)) || PageReserved(page))
 goto out_failed;

if (!mm->swap_cnt)
 return 1;

mm->swap_cnt--; /* swap_cnt is a scan budget: one unit is used per page examined */

// is the page on the active list?
 onlist = PageActive(page);

/* Accessed recently: raise the page's age and leave it mapped */
 if (ptep_test_and_clear_young(page_table)) {
 age_page_up(page);
 goto out_failed;
 }

/* Not accessed recently: lower the age */
 if (!onlist) /* pages not on the active list are aged down here, on the swap-out path;
 per the original annotation, refill_inactive_scan ages down pages on the active list */
 age_page_down_ageonly(page);

/* Age not yet exhausted: leave the page alone */
 if (page->age > 0)
 goto out_failed;

if (TryLockPage(page))
 goto out_failed;

/* Age exhausted: the mapping can now be torn down */
 pte = ptep_get_and_clear(page_table);
 flush_tlb_page(vma, address);

if (PageSwapCache(page)) {
 /*
 * The page is already in the swap cache.
 * Return 0 so that the caller continues scanning.
 *
 * The labels inside this block are also jumped to from the
 * code after it.
 */
 entry.val = page->index;
 if (pte_dirty(pte))
 set_page_dirty(page);
set_swap_pte:
 swap_duplicate(entry); /* presumably takes a reference on the swap entry for this pte; helper not shown */
 set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
 UnlockPage(page);
 mm->rss--;

deactivate_page(page); /* active -> inactive, if the page is on a list */

page_cache_release(page); /* drop the reference held on behalf of this mapping */
out_failed:
 return 0;
 }

/*
 * Now check whether the page is clean.
 * If it is, the mapping can simply be dropped.
 *
 * If it is not, the dirty state has to be recorded
 * in page->flags.
 */
 flush_cache_page(vma, address);
 if (!pte_dirty(pte))
 goto drop_pte;

/*
 * Dirty page: see whether it belongs to a mapping. The PageSwapCache
 * test above only covers swapper_space, which is one special case
 * of a mapping.
 */
 if (page->mapping) {
 set_page_dirty(page);
 goto drop_pte;
 }

/*
 * Dirty, and no swap space assigned yet:
 * allocate a swap entry for the page.
 */
 entry = get_swap_page();
 if (!entry.val)
 goto out_unlock_restore; /* No swap space left */

/* Add the page to the swap cache. Per the original annotation, because
 * page->age is zero it will not be put on the global active list.
 * The page is also marked dirty right after.
 */
 add_to_swap_cache(page, entry);

set_page_dirty(page);

goto set_swap_pte;

out_unlock_restore:
 /* No swap entry available: put the original pte back and keep scanning */
 set_pte(page_table, pte);
 UnlockPage(page);
 return 0;
}

swap out扫描的过程中根据页面最近是否被访问过,调整页面的age值,对于age=0的页面
可以断开其映射. 可以看到page cache(mapping)和swap cache的不同处理.
 swap空间的页面,pte记录的是swap entry. 普通mapping则直接断开映射. pte置0.

III) page_launder

将dirty 的页面清洗成clean 并移入clean 队列. 对inactive list 扫描两次,第一次把已
经干净的页面移入inactive_clean 链表,第二次异步回写dirty 页面.当kswapd 无法供应所需
的页面时,则进行同步的回写. 原作者已经写了很多注释.仔细看看吧.有问题则讨论.
/*
 * Budget of asynchronous buffer flushes for the second pass of
 * page_launder(): maxlaunder is set to this value before rescanning.
 */
#define MAX_LAUNDER (4 * (1 << page_cluster))
/*
 * page_launder - clean pages on the inactive_dirty list.
 * @gfp_mask: allocation mask; the IO pass is only entered when __GFP_IO
 *            is set in it.
 * @sync: when non-zero, buffer IO becomes synchronous once the maxlaunder
 *        budget has reached exactly zero; it is forced to 0 if the first
 *        pass already cleaned some pages.
 *
 * The list is scanned from inactive_dirty_list.prev, at most
 * nr_inactive_dirty_pages entries per pass. In the first pass
 * (launder_loop == 0) no IO is started: dirty pages are only re-queued,
 * pages that are already clean go to the inactive_clean list, and pages
 * that look in use go back to the active list. If free_shortage() still
 * reports a shortage afterwards and IO is allowed, the list is scanned a
 * second time (launder_loop == 1) and dirty pages are written out.
 *
 * Returns cleaned_pages: the number of pages moved to the inactive_clean
 * list plus buffer-cache-only pages whose buffers were freed.
 */
int page_launder(int gfp_mask, int sync)
{
 int launder_loop, maxscan, cleaned_pages, maxlaunder;
 int can_get_io_locks;
 struct list_head * page_lru;
 struct page * page;

/*
 * We can only grab the IO locks (eg. for flushing dirty
 * buffers to disk) if __GFP_IO is set.
 */
 can_get_io_locks = gfp_mask & __GFP_IO;

launder_loop = 0;
 maxlaunder = 0;
 cleaned_pages = 0;

dirty_page_rescan:
 spin_lock(&pagemap_lru_lock);
 maxscan = nr_inactive_dirty_pages;
 while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
 maxscan-- > 0) {
 page = list_entry(page_lru, struct page, lru);

/* Wrong page on list?! (list corruption, should not happen) */
 if (!PageInactiveDirty(page)) {
 printk("VM: page_launder, wrong page on list.\n");
 list_del(page_lru);
 nr_inactive_dirty_pages--;
 page->zone->inactive_dirty_pages--;
 continue;
 }

/* Page is or was in use? Move it to the active list. */
 if (PageTestandClearReferenced(page) || page->age > 0 ||
 (!page->buffers && page_count(page) > 1) ||
 page_ramdisk(page)) {
 del_page_from_inactive_dirty_list(page);
 add_page_to_active_list(page);
 continue;
 }

/*
 * The page is locked. IO in progress?
 * Move it to the back of the list, i.e. away from the .prev end
 * that this loop takes its next candidate from.
 */
 if (TryLockPage(page)) {
 list_del(page_lru);
 list_add(page_lru, &inactive_dirty_list);
 continue;
 }

/*
 * Dirty swap-cache page? Write it out if
 * last copy..
 */
 if (PageDirty(page)) {
 /*
  * NOTE(review): page->mapping is dereferenced here without a NULL
  * check; this assumes every dirty page on this list has a mapping.
  */
 int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
 int result;

/* The mapping has no writepage method: treat the page as active */
if (!writepage)
 goto page_active;

/* First time through? Move it to the back of the list */
 if (!launder_loop) {
 list_del(page_lru);
 list_add(page_lru, &inactive_dirty_list);
 UnlockPage(page);
 continue;
 }

/* OK, do a physical asynchronous write to swap. */
 ClearPageDirty(page);
 /* Hold a reference across the unlocked section */
 page_cache_get(page);
 spin_unlock(&pagemap_lru_lock);

result = writepage(page);
 page_cache_release(page);

/* And re-start the thing.. */
 spin_lock(&pagemap_lru_lock);
 if (result != 1)
 continue;
 /* writepage refused to do anything */
 set_page_dirty(page);
 goto page_active;
 }

/*
 * If the page has buffers, try to free the buffer mappings
 * associated with this page. If we succeed we either free
 * the page (in case it was a buffercache only page) or we
 * move the page to the inactive_clean list.
 *
 * On the first round, we should free all previously cleaned
 * buffer pages
 */
 if (page->buffers) {
 int wait, clearedbuf;
 int freed_page = 0;
 /*
 * Since we might be doing disk IO, we have to
 * drop the spinlock and take an extra reference
 * on the page so it doesn't go away from under us.
 */
 del_page_from_inactive_dirty_list(page);
 page_cache_get(page);
 spin_unlock(&pagemap_lru_lock);

/* Will we do (asynchronous) IO? */
 if (launder_loop && maxlaunder == 0 && sync)
 wait = 2; /* Synchronous IO */
 else if (launder_loop && maxlaunder-- > 0)
 wait = 1; /* Async IO */
 else
 wait = 0; /* No IO */

/* Try to free the page buffers. */
 clearedbuf = try_to_free_buffers(page, wait);

/*
 * Re-take the spinlock. Note that we cannot
 * unlock the page yet since we're still
 * accessing the page_struct here...
 */
 spin_lock(&pagemap_lru_lock);

/* The buffers were not freed. */
 if (!clearedbuf) {
 add_page_to_inactive_dirty_list(page);

/* The page was only in the buffer cache. */
 } else if (!page->mapping) {
 atomic_dec(&buffermem_pages);
 freed_page = 1;
 cleaned_pages++;

/* The page has more users besides the cache and us. */
 } else if (page_count(page) > 2) {
 add_page_to_active_list(page);

/* OK, we "created" a freeable page. */
 } else /* page->mapping && page_count(page) == 2 */ {
 add_page_to_inactive_clean_list(page);
 cleaned_pages++;
 }

/*
 * Unlock the page and drop the extra reference.
 * We can only do it here because we are accessing
 * the page struct above.
 */
 UnlockPage(page);
 page_cache_release(page);

/* 
 * If we're freeing buffer cache pages, stop when
 * we've got enough free memory.
 */
 if (freed_page && !free_shortage())
 break;
 continue;
 } else if (page->mapping && !PageDirty(page)) {
 /*
 * If a page had an extra reference in
 * deactivate_page(), we will find it here.
 * Now the page is really freeable, so we
 * move it to the inactive_clean list.
 */
 del_page_from_inactive_dirty_list(page);
 add_page_to_inactive_clean_list(page);
 UnlockPage(page);
 cleaned_pages++;
 } else {
page_active:
 /*
 * OK, we don't know what to do with the page.
 * It's no use keeping it here, so we move it to
 * the active list.
 *
 * Also reached by goto from the dirty-page branch above
 * (no writepage method, or writepage returned 1).
 */
 del_page_from_inactive_dirty_list(page);
 add_page_to_active_list(page);
 UnlockPage(page);
 }
 }
 spin_unlock(&pagemap_lru_lock);

/*
 * If we don't have enough free pages, we loop back once
 * to queue the dirty pages for writeout. When we were called
 * by a user process (that /needs/ a free page) and we didn't
 * free anything yet, we wait synchronously on the writeout of
 * MAX_SYNC_LAUNDER pages.
 * NOTE(review): the code below uses MAX_LAUNDER; no MAX_SYNC_LAUNDER
 * appears in this excerpt.
 *
 * We also wake up bdflush, since bdflush should, under most
 * loads, flush out the dirty pages before we have to wait on
 * IO.
 */
 if (can_get_io_locks && !launder_loop && free_shortage()) {
 launder_loop = 1;
 /* If we cleaned pages, never do synchronous IO. */
 if (cleaned_pages)
 sync = 0;
 /* We only do a few "out of order" flushes. */
 maxlaunder = MAX_LAUNDER;
 /* Kflushd takes care of the rest. */
 wakeup_bdflush(0);
 goto dirty_page_rescan;
 }

/* Return the number of pages moved to the inactive_clean list. */
 return cleaned_pages;
}

IV) refill_inactive_scan

/*
 * refill_inactive_scan - scan the active list for pages that should be deactivated
 * @priority: the priority at which to scan
 * @oneshot: exit after deactivating one page
 *
 * This function will scan a portion of the active list to find
 * unused pages, those pages will then be moved to the inactive list.
 *
 * Returns 1 if at least one page was deactivated during the scan,
 * 0 otherwise.
 */
 /*
 * Pages on active_list fall into two groups:
 *
 * 1. pages that have a mapping
 * 2. pages added to the list later (put into the swapper mapping) with
 * age 0, whose process mapping may nevertheless be restored at any time
 */
int refill_inactive_scan(unsigned int priority, int oneshot)
{
 struct list_head * page_lru;
 struct page * page;
 int maxscan, page_active = 0;
 int ret = 0;

/* Take the lock while messing with the list... */
 spin_lock(&pagemap_lru_lock);

maxscan = nr_active_pages >> priority; 
 // only this many entries are looked at, starting from active_list.prev,
 // so pages far from that end are less likely to be deactivated

while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
 page = list_entry(page_lru, struct page, lru);

/* Wrong page on list?! (list corruption, should not happen) */
 if (!PageActive(page)) {
 printk("VM: refill_inactive, wrong page on list.\n");
 list_del(page_lru);
 nr_active_pages--;
 continue;
 }

/* Do aging on the pages. */
 if (PageTestandClearReferenced(page)) {
 // referenced bit was set; per the original annotation this is how
 // the kernel itself marks a page as used (e.g. via touch_buffer)
 age_page_up_nolock(page);
 page_active = 1; 
 // with this flag set the page is re-queued by the list_add() below
 } else {
 age_page_down_ageonly(page);
 /*
 * Since we don't hold a reference on the page
 * ourselves, we have to do our test a bit more
 * strict then deactivate_page(). This is needed
 * since otherwise the system could hang shuffling
 * unfreeable pages from the active list to the
 * inactive_dirty list and back again...
 *
 * SUBTLE: we can have buffer pages with count 1.
 */
 if (page->age == 0 && page_count(page) <=
 (page->buffers ? 2 : 1)) {
 deactivate_page_nolock(page); 
 page_active = 0;
 } else {
 page_active = 1;
 }
 }
 /*
 * If the page is still on the active list, move it
 * to the other end of the list. Otherwise it was
 * deactivated by age_page_down and we exit successfully.
 */
 if (page_active || PageActive(page)) { 
 // re-queue the page away from the .prev end that is being scanned
 list_del(page_lru);
 list_add(page_lru, &active_list); 
 } else {
 ret = 1;
 if (oneshot)
 break;
 }
 }
 spin_unlock(&pagemap_lru_lock);

return ret;
}

再次讨论PageTestandClearReferenced.看看2.6关于此位的说明:
 * For choosing which pages to swap out, inode pages carry a PG_referenced bit,
 * which is set any time the system accesses that page through the (mapping,
 * index) hash table. This referenced bit, together with the referenced bit
 * in the page tables, is used to manipulate page->age and move the page across
 * the active, inactive_dirty and inactive_clean lists.

意思是,对于缓存文件内容的页面,除了来自用户进程的访问,内核本身访问此页面也应该
age up.
 对于2.4内核,这种页面主要来自buffer,见grow_buffers,getblk. buffer中的页面也使用
lru队列进行swap.但是有可能没有映射到用户页面(blk dev 文件的读取),无法age up.只好
通过touch_buffer进行by hand的age up.

page->count不死

关于 page_launder 还有一个话题,就是这个函数标号page_active的地方.论坛中有很多关
于这里的讨论.仅发表一下看法,供大家参考.
 以前的讨论中提到page_active:处的条件中隐含有
 (page->count==1&&page->mapping==0&&page->buffer==0)
 其实这个标号的条件是:
 1) no buffer, no mapping (one process) (dirty, can't write out or not),
 2) no buffer, mapping, dirty but can't write out.

对于第一种页面,在2.4.0的内核中,似乎就没有可能进入lru cache.(2.4.20有). 因为查看
页面所有进入lru的途径,必然是,或者有mapping,或者有buffer. 并且进入lru后,没有找到恢
复进程映射时将mapping,buffer去掉而留在lru中的情况.
 关于lru, page_launder,linus有一个讨论可以参考一下: 
page_launder, linus opinion 
 linus认为,进入inactive dirty的page,不应该没有mapping的.否则dirty了又能如何.当然
2.4.20以后这种页面可以存在的,但是swap out还是会给他分配mapping(swap space),只是临时
存在.
 第二种倒是有,比如ramfs,以前的tmpfs也是.

本论坛还有一个对这个函数的讨论,值得一看:
linuxforum关于这个问题的讨论 
 以上的分析也希望对这个问题有所帮助. 

其他函数不再讨论.swap out的过程通过各种手段进行调整,经过不断的演化,成了现在这
个样子.我感觉应该把握的是这种思路,然后看代码的时候就不觉得很乱了.

linux mm分析暂告一个段落.

2006.8.11