2006-8-11 mm/vmscan.c
I)概述
在分析page alloc的时候,分析过linux对内存页面的空闲量有些什么样的要求:
1)可分配页面的保有量要求:inactive_clean + free pages(in buddy pages). 系统的期望值是 freepages.high + inactive_target / 3,其中 inactive_target 就是 min((memory_pressure >> INACTIVE_SHIFT), num_physpages / 4). 可见期望的保有量有动态的因素在内: 因为memory_pressure 是一段时间内的平均值,根据内存需求的不同,所期望的保有量也不一样.
而现在的保有量是nr_free_pages() + nr_inactive_clean_pages(); 函数free_shortage,计算期望的可分配页面和现实之差距.如果保有量合格,就看zone中的inbuddy free pages是否比期望值少. 只要有一个保有量不合格,就必须立即加以调整.
2)潜在可分配页面的保有量要求(inactive_shortage): 潜在可分配页面就是:buddyfree+inactiveclean+inactive_dirty 期望保有量:freepages.high+inactive_target 现存量: nr_free_pages()+nr_inactive_clean_pages()+nr_inactive_dirty_pages.
保证这两种页面有一个合理的水平,这样在分配内存的时候,(期望)一般不会出现内存紧缺.free_shortage,inactive_shortage这两 个函数不复杂,略.
这个文件就是处理lru,负责页面的回收. 包括映射断裂,逐级的调整页面所在的lru 队列, 直至最后回到buddy 系统. 另外对于dcache,icache, slab系统的空闲页面,在内存紧张的 时候也要进行回收.从而保证空闲内存保有量得到满足.
lru cache的组成,前面已经有叙述了: struct list_head active_list; struct list_head inactive_dirty_list; 每个zone 中的inactive clean 队列.
为了完成这些任务,有两个内核线程,kswapd和kreclaimd,见下图. 图中忽略了调用的条件 简单的列出了每个线程的作用:
为了更加直观,请看下面的图:(page alloc中也有)
II)swap out
一次扫描一定量的进程页面,从不太忙碌的进程中回收可用页面,是保证空闲页面保有量、实现VM(back store)的一个重要操作. swap_out选择一个进程(当然不包括内核),通过try_to_swap_out断开此进程对页面的映射.根据页面的属性(是否已经在swap,是否是page cache(page->mapping)),采取不同操作. 进程有一个rss统计,还有一个swap_cnt=(mm->rss >> SWAP_SHIFT),swap_cnt代表的是这个进程在swap out的过程中应该贡献的页面数量(虽然不一定能找到合适页面),所以swap_cnt大的进程优先被选中.
/* * 选择swap_cnt 值最大的任务试着交换出一些空间.swap_cnt 最大表明他最应该受到检查. * 看看能不能换出一些页面. (try_to_swap_out) * N.B. This function returns only 0 or 1. Return values != 1 from * the lower level routines result in continued processing. */ #define SWAP_SHIFT 5 #define SWAP_MIN 8 static int swap_out(unsigned int priority, int gfp_mask) { int counter; int __ret = 0;
/* * 进行两种方式的扫描, assign = 0,按照swap_cnt选择进程. * assign = 1,重新计算rss 和swap_cnt 的值,然后同时根据 * swp_cnt选择合适进程. * * 如果选中的进程不能提供空闲页面,则清楚swap_cnt(swap_out_mm),这样 * 任务就不会再次被选中. */ counter = (nr_threads << SWAP_SHIFT) >> priority; if (counter < 1) counter = 1;
for (; counter >= 0; counter--) { //这个循环次数受优秀级控制 struct list_head *p; //代表了努力程度 unsigned long max_cnt = 0; struct mm_struct *best = NULL; int assign = 0; /*assign 在某一轮尝试如果不能找到一个best, 则进行第二种方式的扫描*/ int found_task = 0; select: spin_lock(&mmlist_lock); p = init_mm.mmlist.next; for (; p != &init_mm.mmlist; p = p->next) { struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist); if (mm->rss <= 0) continue; found_task++; /* Refresh swap_cnt? */ if (assign == 1) { //选择进程同时重新计算swap cnt mm->swap_cnt = (mm->rss >> SWAP_SHIFT); if (mm->swap_cnt < SWAP_MIN) mm->swap_cnt = SWAP_MIN; } if (mm->swap_cnt > max_cnt) { max_cnt = mm->swap_cnt; best = mm; } }
/* Make sure it doesn't disappear */ if (best) atomic_inc(&best->mm_users); spin_unlock(&mmlist_lock);
/* * We have dropped the tasklist_lock, but we * know that "mm" still exists: we are running * with the big kernel lock, and exit_mm() * cannot race with us. */ if (!best) { /* 找不到best 说明swap_cnt 都变成了0*/ if (!assign && found_task > 0) { assign = 1; goto select; } break; } else { __ret = swap_out_mm(best, gfp_mask); mmput(best); break; } } return __ret; }
然后看看try_to_swap_out,其他swap out相关的函数不再分析.
/*
 * try_to_swap_out - try to unmap the page behind one pte of @mm.
 * @mm:         address space being scanned; its swap_cnt and rss are updated
 * @vma:        mapping containing @address (used for TLB/cache flushing)
 * @address:    virtual address mapped by @page_table
 * @page_table: the pte to examine and possibly replace
 * @gfp_mask:   not referenced in this function body
 *
 * (rss: resident set, i.e. the pages currently resident for the mm.)
 *
 * Returns 0 if the caller should continue scanning, 1 otherwise
 * (here: only when mm->swap_cnt is already exhausted).
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	swp_entry_t entry;
	struct page * page;
	int onlist;

	pte = *page_table;
	if (!pte_present(pte))
		goto out_failed;
	page = pte_page(pte);
	if ((!VALID_PAGE(page)) || PageReserved(page))
		goto out_failed;

	/* This mm has used up its scan quota: tell the caller to stop. */
	if (!mm->swap_cnt)
		return 1;

	/* swap_cnt starts from a value derived from rss and drops by one per page examined. */
	mm->swap_cnt--;

	/* Is the page on the active list? */
	onlist = PageActive(page);

	/* Recently accessed: raise its age and leave it mapped. */
	if (ptep_test_and_clear_young(page_table)) {
		age_page_up(page);
		goto out_failed;
	}

	/* Not recently accessed: lower its age. */
	if (!onlist)
		/*
		 * Pages that are not on the active list are aged down here,
		 * on the swap_out path; refill_inactive_scan is responsible
		 * for aging down pages on the active list.
		 */
		age_page_down_ageonly(page);

	/* Still has some life left: leave it alone. */
	if (page->age > 0)
		goto out_failed;

	if (TryLockPage(page))
		goto out_failed;

	/* Age is exhausted, so the mapping can be broken now. */
	pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);

	if (PageSwapCache(page)) {
		/*
		 * The page is already in the swap cache.
		 * Return 0 to tell the upper layer to keep scanning.
		 */
		entry.val = page->index;
		if (pte_dirty(pte))
			set_page_dirty(page);
set_swap_pte:
		swap_duplicate(entry);	/* this process now references the swap entry */
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		UnlockPage(page);
		mm->rss--;

		deactivate_page(page);	/* active->inactive, if on list */

		page_cache_release(page);	/* this process can no longer use the page */
out_failed:
		return 0;
	}

	/*
	 * Check whether this is a clean page.
	 * If it is, the mapping can simply be dropped.
	 *
	 * If it is not, the dirty flag in page->flags
	 * has to be set (done on the paths below).
	 */
	flush_cache_page(vma, address);
	if (!pte_dirty(pte))
		goto drop_pte;

	/*
	 * Dirty page: see whether it belongs to a mapping. The
	 * PageSwapCache test above only covers swapper_space,
	 * which is just one special case of a mapping.
	 */
	if (page->mapping) {
		set_page_dirty(page);
		goto drop_pte;
	}

	/*
	 * Dirty and no swap space assigned yet:
	 * allocate a swap entry for it.
	 */
	entry = get_swap_page();
	if (!entry.val)
		goto out_unlock_restore; /* No swap space left */

	/*
	 * Add the page to the swap cache. Per the original annotation,
	 * because page->age is zero it is not put on the global active
	 * list. The dirty flag is set as well.
	 */
	add_to_swap_cache(page, entry);

	set_page_dirty(page);

	goto set_swap_pte;

out_unlock_restore:
	/* Could not get swap space: put the original pte back and unlock. */
	set_pte(page_table, pte);
	UnlockPage(page);
	return 0;
}
swap out扫描的过程中根据页面最近是否被访问过,调整页面的age值,对于age=0的页面可以断开其映射. 可以看到page cache(mapping)和swap cache的不同处理: swap空间的页面,pte记录的是swap entry; 普通mapping则直接断开映射,pte置0.
III) page_launder
将dity 的页面清洗成clean 并移入clean 队列. 对inactive list 扫描两次,第一次把已 经干净的页面移入inactive_clean 链表,第二次异步回写dirty 页面.当kswapd 无法供应所需 的页面时,则进行同步的回写. 原作者已经写了很多注释.仔细看看吧.有问题则讨论.
/* Number of buffer pages for which the second pass may start asynchronous IO. */
#define MAX_LAUNDER (4 * (1 << page_cluster))
/*
 * page_launder - scan the inactive_dirty list and try to make pages freeable.
 * @gfp_mask: allocation flags; only the __GFP_IO bit is looked at here, to
 *            decide whether a second (IO-issuing) pass is allowed
 * @sync:     if non-zero, the second pass may ask try_to_free_buffers() for
 *            synchronous IO (wait == 2) once maxlaunder has reached zero;
 *            cleared when the first pass already cleaned some pages
 *
 * The list is walked from its tail, at most nr_inactive_dirty_pages entries
 * per pass. The first pass (launder_loop == 0) does no IO; if there is still
 * a free shortage afterwards and __GFP_IO is set, a second pass is made with
 * launder_loop == 1.
 *
 * Returns cleaned_pages: incremented for every page moved to the
 * inactive_clean list and also for every buffer-cache-only page whose
 * buffers were freed.
 */
int page_launder(int gfp_mask, int sync)
{
	int launder_loop, maxscan, cleaned_pages, maxlaunder;
	int can_get_io_locks;
	struct list_head * page_lru;
	struct page * page;

	/*
	 * We can only grab the IO locks (eg. for flushing dirty
	 * buffers to disk) if __GFP_IO is set.
	 */
	can_get_io_locks = gfp_mask & __GFP_IO;

	launder_loop = 0;
	maxlaunder = 0;
	cleaned_pages = 0;

dirty_page_rescan:
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_inactive_dirty_pages;
	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list && maxscan-- > 0) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveDirty(page)) {
			printk("VM: page_launder, wrong page on list.\n");
			list_del(page_lru);
			nr_inactive_dirty_pages--;
			page->zone->inactive_dirty_pages--;
			continue;
		}

		/* Page is or was in use? Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 || (!page->buffers && page_count(page) > 1) || page_ramdisk(page)) {
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (TryLockPage(page)) {
			list_del(page_lru);
			list_add(page_lru, &inactive_dirty_list);
			continue;
		}

		/*
		 * Dirty swap-cache page? Write it out if
		 * last copy..
		 *
		 * NOTE(review): page->mapping is dereferenced without a NULL
		 * check; this assumes every dirty page reaching this point has
		 * a mapping - confirm.
		 */
		if (PageDirty(page)) {
			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
			int result;

			/* No way to write this page out: give up on it. */
			if (!writepage)
				goto page_active;

			/* First time through? Move it to the back of the list */
			if (!launder_loop) {
				list_del(page_lru);
				list_add(page_lru, &inactive_dirty_list);
				UnlockPage(page);
				continue;
			}

			/*
			 * OK, do a physical asynchronous write to swap.
			 * An extra reference is held across the call because
			 * the LRU lock is dropped.
			 */
			ClearPageDirty(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			result = writepage(page);
			page_cache_release(page);

			/* And re-start the thing.. */
			spin_lock(&pagemap_lru_lock);
			/*
			 * NOTE(review): no UnlockPage() on this path -
			 * presumably writepage() unlocks the page itself
			 * when it accepts the write; confirm.
			 */
			if (result != 1)
				continue;
			/* writepage refused to do anything */
			set_page_dirty(page);
			goto page_active;
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we either free
		 * the page (in case it was a buffercache only page) or we
		 * move the page to the inactive_clean list.
		 *
		 * On the first round, we should free all previously cleaned
		 * buffer pages
		 */
		if (page->buffers) {
			int wait, clearedbuf;
			int freed_page = 0;
			/*
			 * Since we might be doing disk IO, we have to
			 * drop the spinlock and take an extra reference
			 * on the page so it doesn't go away from under us.
			 */
			del_page_from_inactive_dirty_list(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			/* Will we do (asynchronous) IO? */
			if (launder_loop && maxlaunder == 0 && sync)
				wait = 2;	/* Synchronous IO */
			else if (launder_loop && maxlaunder-- > 0)
				wait = 1;	/* Async IO */
			else
				wait = 0;	/* No IO */

			/* Try to free the page buffers. */
			clearedbuf = try_to_free_buffers(page, wait);

			/*
			 * Re-take the spinlock. Note that we cannot
			 * unlock the page yet since we're still
			 * accessing the page_struct here...
			 */
			spin_lock(&pagemap_lru_lock);

			/* The buffers were not freed. */
			if (!clearedbuf) {
				add_page_to_inactive_dirty_list(page);

			/* The page was only in the buffer cache. */
			} else if (!page->mapping) {
				atomic_dec(&buffermem_pages);
				freed_page = 1;
				cleaned_pages++;

			/* The page has more users besides the cache and us. */
			} else if (page_count(page) > 2) {
				add_page_to_active_list(page);

			/* OK, we "created" a freeable page. */
			} else /* page->mapping && page_count(page) == 2 */ {
				add_page_to_inactive_clean_list(page);
				cleaned_pages++;
			}

			/*
			 * Unlock the page and drop the extra reference.
			 * We can only do it here because we are accessing
			 * the page struct above.
			 */
			UnlockPage(page);
			page_cache_release(page);

			/*
			 * If we're freeing buffer cache pages, stop when
			 * we've got enough free memory.
			 */
			if (freed_page && !free_shortage())
				break;
			continue;
		} else if (page->mapping && !PageDirty(page)) {
			/*
			 * If a page had an extra reference in
			 * deactivate_page(), we will find it here.
			 * Now the page is really freeable, so we
			 * move it to the inactive_clean list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_inactive_clean_list(page);
			UnlockPage(page);
			cleaned_pages++;
		} else {
page_active:
			/*
			 * OK, we don't know what to do with the page.
			 * It's no use keeping it here, so we move it to
			 * the active list.
			 *
			 * Also reached by goto from the PageDirty() branch
			 * above, when there is no writepage method or when
			 * writepage() returned 1.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			UnlockPage(page);
		}
	}
	spin_unlock(&pagemap_lru_lock);

	/*
	 * If we don't have enough free pages, we loop back once
	 * to queue the dirty pages for writeout. When we were called
	 * by a user process (that /needs/ a free page) and we didn't
	 * free anything yet, we wait synchronously on the writeout of
	 * MAX_SYNC_LAUNDER pages.
	 *
	 * NOTE(review): no MAX_SYNC_LAUNDER is visible here; the only
	 * limit applied below is MAX_LAUNDER.
	 *
	 * We also wake up bdflush, since bdflush should, under most
	 * loads, flush out the dirty pages before we have to wait on
	 * IO.
	 */
	if (can_get_io_locks && !launder_loop && free_shortage()) {
		launder_loop = 1;
		/* If we cleaned pages, never do synchronous IO. */
		if (cleaned_pages)
			sync = 0;
		/* We only do a few "out of order" flushes. */
		maxlaunder = MAX_LAUNDER;
		/* Kflushd takes care of the rest. */
		wakeup_bdflush(0);
		goto dirty_page_rescan;
	}

	/* Return the number of pages moved to the inactive_clean list. */
	return cleaned_pages;
}
IV) refill_inactive_scan
/* * refill_inactive_scan - 扫描active 列表找到应该deactivate 的页面 * @priority: the priority at which to scan * @oneshot: exit after deactivating one page * * This function will scan a portion of the active list to find * unused pages, those pages will then be moved to the inactive list. */ /* * 在acitve_list 中的页面有两种情况: * * 1. 拥有mapping 的page * 2. 后来加入这个队列的(加入swapper mapping).age 为0, * 但是有可能随时恢复映射 */ int refill_inactive_scan(unsigned int priority, int oneshot) { struct list_head * page_lru; struct page * page; int maxscan, page_active = 0; int ret = 0;
/* Take the lock while messing with the list... */ spin_lock(&pagemap_lru_lock);
maxscan = nr_active_pages >> priority; //队尾的页面被换出的概率小
while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) { page = list_entry(page_lru, struct page, lru);
/* Wrong page on list?! (list corruption, should not happen) */ if (!PageActive(page)) { printk("VM: refill_inactive, wrong page on list.\n"); list_del(page_lru); nr_active_pages--; continue; }
/* Do aging on the pages. */ if (PageTestandClearReferenced(page)) { // 内核持有此页面(如touch_buffer) age_page_up_nolock(page); page_active = 1; //置此标记会把page 移到队尾 } else { age_page_down_ageonly(page); /* * Since we don't hold a reference on the page * ourselves, we have to do our test a bit more * strict then deactivate_page(). This is needed * since otherwise the system could hang shuffling * unfreeable pages from the active list to the * inactive_dirty list and back again... * * SUBTLE: we can have buffer pages with count 1. */ if (page->age == 0 && page_count(page) <= (page->buffers ? 2 : 1)) { deactivate_page_nolock(page); page_active = 0; } else { page_active = 1; } } /* * If the page is still on the active list, move it * to the other end of the list. Otherwise it was * deactivated by age_page_down and we exit successfully. */ if (page_active || PageActive(page)) { // 把页面移到队尾 list_del(page_lru); list_add(page_lru, &active_list); } else { ret = 1; if (oneshot) break; } } spin_unlock(&pagemap_lru_lock);
return ret; }
再次讨论PageTestandClearReferenced.看看2.6关于此位的说明: * For choosing which pages to swap out, inode pages carry a PG_referenced bit, * which is set any time the system accesses that page through the (mapping, * index) hash table. This referenced bit, together with the referenced bit * in the page tables, is used to manipulate page->age and move the page across * the active, inactive_dirty and inactive_clean lists.
意思是,对于缓存文件内容的页面,除了来自用户进程的访问,内核本身访问此页面也应该 age up. 对于2.4内核,这种页面主要来自buffer,见grow_buffers,getblk. buffer中的页面也使用 lru队列进行swap.但是有可能没有映射到用户页面(blk dev 文件的读取),无法age up.只好 通过touch_buffer进行by hand的age up.
page->count不死
关于 page_launder 还有一个话题,就是这个函数标号page_active的地方.论坛中有很多关于这里的讨论.仅发表一下看法,供大家参考. 以前的讨论中提到page_active:处的条件中隐含有 (page->count==1 && page->mapping==0 && page->buffers==0). 其实这个标号的条件是: 1) no buffer, no mapping (one process) (dirty, can't write out or not); 2) no buffer, mapping, dirty but can't write out.
对于第一种页面,在2.4.0的内核中,似乎就没有可能进入lru cache(2.4.20有). 因为查看页面所有进入lru的途径,必然是,或者有mapping,或者有buffer. 并且进入lru后,没有找到恢复进程映射时将mapping,buffer去掉而留在lru中的情况. 关于lru, page_launder,linus有一个讨论可以参考一下: page_launder, linux option. linus认为,进入inactive dirty的page,不应该没有mapping的.否则dirty了又能如何.当然2.4.20以后这种页面可以存在的,但是swap out还是会给他分配mapping(swap space),只是临时存在. 第二种倒是有,比如ramfs,以前的tmpfs也是.
本论坛还有一个对这个函数的讨论,值得一看: linuxfourm关于这个问题的讨论 以上的分析也希望对这个问题有所帮助.
其他函数不再讨论.swap out的过程通过各种手段进行调整,经过不断的演化,成了现在这 个样子.我感觉应该把握的是这种思路,然后看代码的时候就不觉得很乱了.
linux mm分析暂告一个段落.
2006.8.11
|