__alloc_pages源代码分析

时间：2009-04-25 来源：litary1986

/*
 * This is the 'heart' of the zoned buddy allocator.
 *
 * __alloc_pages - allocate a block of 2^order pages from a zonelist.
 * @gfp_mask: GFP flags describing what the caller allows (sleeping,
 *            filesystem recursion, retry policy, warnings, ...).
 * @order:    allocation order passed through to get_page_from_freelist().
 * @zonelist: NULL-terminated list of candidate zones (zonelist->zones).
 *
 * Returns the page obtained from get_page_from_freelist(), or NULL when
 * every attempt failed (the "nopage" path is only reached with page == NULL).
 *
 * The function escalates through several attempts:
 *   1. pages_low watermark;
 *   2. wake kswapd, then pages_min watermark (optionally HARDER/HIGH);
 *   3. no watermarks at all for PF_MEMALLOC / TIF_MEMDIE callers;
 *   4. synchronous reclaim via try_to_free_pages() and a retry;
 *   5. a pages_high attempt followed by out_of_memory() and a full restart;
 *   6. congestion_wait() and rebalance, if the retry policy allows it.
 */
struct page * fastcall
__alloc_pages(gfp_t gfp_mask, unsigned int order,
		struct zonelist *zonelist)
{
	const gfp_t wait = gfp_mask & __GFP_WAIT;
	struct zone **z;
	struct page *page;
	struct reclaim_state reclaim_state;
	struct task_struct *p = current;
	int do_retry;
	int alloc_flags;
	int did_some_progress;

	/*
	 * Article note: a chain of macros that, per the article author,
	 * ends in cond_resched():
	 *   might_sleep_if -> might_sleep -> might_resched -> cond_resched
	 * If __GFP_WAIT is set in gfp_mask the kernel is allowed to block the
	 * current process while waiting for free pages.  The author asks
	 * whether yielding here, before allocation even starts, is meant to
	 * give other processes a chance to free more pages (open question in
	 * the original article -- not established by this code).
	 */
	might_sleep_if(wait);

	/*
	 * Article note: described by the author as a cheap pre-check of
	 * whether the allocation is going to fail, so no time is wasted in the
	 * real allocator.  NOTE(review): the helper is not visible here; it
	 * looks like a fault-injection hook rather than a predictor -- confirm.
	 */
	if (should_fail_alloc_page(gfp_mask, order))
		return NULL;

restart:
	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */

	if (unlikely(*z == NULL)) {
		/* Should this ever happen?? */
		return NULL;
	}

	/*
	 * get_page_from_freelist() allocates against the watermark selected
	 * by its alloc flags.  Per the article, every struct zone carries
	 * three watermarks -- pages_min, pages_low and pages_high -- which are
	 * thresholds for the number of free pages the zone should keep.
	 *
	 * Alloc flags quoted by the article:
	 *   ALLOC_NO_WATERMARKS 0x01   don't check watermarks at all
	 *   ALLOC_WMARK_MIN     0x02   use pages_min watermark
	 *   ALLOC_WMARK_LOW     0x04   use pages_low watermark
	 *   ALLOC_WMARK_HIGH    0x08   use pages_high watermark
	 *   ALLOC_HARDER        0x10   try to alloc harder
	 *   ALLOC_HIGH          0x20   __GFP_HIGH set
	 *   ALLOC_CPUSET        0x40   check for correct cpuset
	 *
	 * First attempt: allocate against the pages_low watermark; on success
	 * jump straight to got_pg.
	 */
	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
	if (page)
		goto got_pg;

	/*
	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
	 * using a larger set of nodes after it has established that the
	 * allowed per node queues are empty and that nodes are
	 * over allocated.
	 *
	 * In other words: the pages_low attempt failed and the caller asked
	 * for GFP_THISNODE, so no retry is allowed -- fail via nopage.
	 */
	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
		goto nopage;

	/*
	 * Otherwise wake kswapd for every zone in the zonelist so that
	 * background reclaim can start (the article expects it to push some
	 * idle pages out to backing store).
	 */
	for (z = zonelist->zones; *z; z++)
		wakeup_kswapd(*z, order);

	/*
	 * OK, we're below the kswapd watermark and have kicked background
	 * reclaim. Now things get more complex, so set up alloc_flags according
	 * to how we want to proceed.
	 *
	 * The caller may dip into page reserves a bit more if the caller
	 * cannot run direct reclaim, or if the caller has realtime scheduling
	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
	 */
	alloc_flags = ALLOC_WMARK_MIN;
	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
		alloc_flags |= ALLOC_HARDER;
	if (gfp_mask & __GFP_HIGH)
		alloc_flags |= ALLOC_HIGH;
	if (wait)
		alloc_flags |= ALLOC_CPUSET;

	/*
	 * Go through the zonelist again. Let __GFP_HIGH and allocations
	 * coming from realtime tasks go deeper into reserves.
	 *
	 * This is the last chance, in general, before the goto nopage.
	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 *
	 * Second attempt: relax the free-page requirement and allocate
	 * against pages_min.
	 */
	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
	if (page)
		goto got_pg;

	/* This allocation should allow future memory freeing. */

rebalance:
	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
			&& !in_interrupt()) {
		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
nofail_alloc:
			/*
			 * The caller did not forbid use of the emergency
			 * reserves (__GFP_NOMEMALLOC is clear):
			 * go through the zonelist yet again, ignoring mins.
			 */
			page = get_page_from_freelist(gfp_mask, order,
				zonelist, ALLOC_NO_WATERMARKS);
			if (page)
				goto got_pg;
			/* __GFP_NOFAIL: keep retrying until it succeeds. */
			if (gfp_mask & __GFP_NOFAIL) {
				congestion_wait(WRITE, HZ/50);
				goto nofail_alloc;
			}
		}
		goto nopage;
	}

	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

	/* Give up the CPU if needed before trying to free pages ourselves. */
	cond_resched();

	/*
	 * We now go into synchronous reclaim.  PF_MEMALLOC is set only for
	 * the duration of try_to_free_pages() and cleared right after, and
	 * p->reclaim_state points at the on-stack reclaim_state for the same
	 * window.
	 */
	cpuset_memory_pressure_bump();
	p->flags |= PF_MEMALLOC;
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);

	p->reclaim_state = NULL;
	p->flags &= ~PF_MEMALLOC;

	cond_resched();

	if (likely(did_some_progress)) {
		/* Reclaim made progress: retry with the same alloc_flags. */
		page = get_page_from_freelist(gfp_mask, order,
						zonelist, alloc_flags);
		if (page)
			goto got_pg;
	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
		/*
		 * Go through the zonelist yet one more time, keep
		 * very high watermark here, this is only to catch
		 * a parallel oom killing, we must fail if we're still
		 * under heavy pressure.
		 *
		 * Article note: nothing was reclaimed and retrying is
		 * allowed.  Because this attempt uses pages_high it is
		 * expected to fail unless another CPU has just killed a
		 * process and freed pages; only then is out_of_memory()
		 * called to kill a selected process.  Using pages_high
		 * avoids two CPUs each picking a victim and causing
		 * unnecessary loss.
		 */
		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
				zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
		if (page)
			goto got_pg;

		/* Pick a process to kill so some pages are freed, then restart. */
		out_of_memory(zonelist, gfp_mask, order);
		goto restart;
	}

	/*
	 * Don't let big-order allocations loop unless the caller explicitly
	 * requests that.  Wait for some write requests to complete then retry.
	 *
	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
	 * <= 3, but that may not be true in other implementations.
	 */
	do_retry = 0;
	if (!(gfp_mask & __GFP_NORETRY)) {
		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
			do_retry = 1;
		if (gfp_mask & __GFP_NOFAIL)
			do_retry = 1;
	}
	if (do_retry) {
		congestion_wait(WRITE, HZ/50);
		goto rebalance;
	}

nopage:
	/* Rate-limited failure report unless the caller set __GFP_NOWARN. */
	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
		printk(KERN_WARNING "%s: page allocation failure."
			" order:%d, mode:0x%x\n",
			p->comm, order, gfp_mask);
		dump_stack();
		show_mem();
	}
got_pg:
	return page;
}