2007-3-6 mm/slab.c
The most-discussed piece of memory management. We will cover it in several parts.

Part 1  Overview

The slab allocator takes pages from the zone/buddy system and carves them into small
chunks for kernel use, much as malloc/free serves user programs.
Every kind of kernel object (inode, dentry, sock, ...) gets a dedicated cache; that is
what the slab mechanism provides. If the memory has special requirements, a separate
cache must be created, e.g. for DMA or high memory.
Slab's main job is small-chunk allocation; beyond that it is optimized for SMP and for
the CPU cache. The kernel limits coarse-grained fragmentation through the buddy system
and fine-grained fragmentation through slab -- each slab is one or a few physically
contiguous pages allocated in one go.
Slab's CPU-cache optimization (coloring) is illustrated by the following diagram:
1) Assume there are only two cache lines, 0 and 1.
2) Each page is two cache lines in size.
3) The first slab has color 0 and places its slab_t at the very start of its page.
4) The second slab has color 1 and places its slab_t one cache-line size into its page.

      +------------------+          +------------------+
      |   cache line 0   |          |   cache line 1   |
      +------------------+          +------------------+
               ^                                       ^
               |                                       |
 +-----------------+-----------------+-----------------+-----------------+
 | slab_t (color=0)|                 | (color padding) | slab_t (color=1)|
 +-----------------+-----------------+-----------------+-----------------+
 |     line 0      |     line 1      |     line 0      |     line 1      |
 |               page0               |               page1               |
 +-----------------------------------+-----------------------------------+
If we walk all the slabs of the cache -- access slab1, then slab2, then slab1 again --
neither cache line 0 nor cache line 1 has to be invalidated during the whole sequence.
Had the two slabs shared the same color, every access would invalidate a cache line and
cost performance. The details of the hardware cache itself are not covered here; the
diagram is only a rough reference. (fix me)
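To make the coloring rule concrete, here is a minimal standalone sketch (not kernel
code; the colour and colour_off values are assumed) of how consecutive slabs cycle
through starting offsets:

	/* Sketch: each new slab takes the next color, so consecutive slabs
	 * begin at different cache-line offsets within their pages.
	 * colour and colour_off mirror the kmem_cache_s fields. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int colour = 3;	/* assumed: left_over / colour_off */
		unsigned int colour_off = 32;	/* assumed L1 cache line size */
		unsigned int colour_next = 0;
		int i;

		for (i = 0; i < 6; i++) {
			unsigned int offset = colour_next * colour_off;
			if (++colour_next >= colour)
				colour_next = 0;
			printf("slab %d starts %u bytes into its pages\n", i, offset);
		}
		return 0;	/* prints offsets 0, 32, 64, 0, 32, 64 */
	}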
Part 2  Data structures and diagram

Again a diagram first, then the relevant data structures in detail. The diagram shows a
non-off-slab cache (the slab_t lives inside the slab itself); the key fields and their
uses are marked.
1) kmem_cache_s
   slabs: the list of all slabs (each a run of contiguous pages) belonging to this cache.
   firstnotfull: the slabs are kept in the order full, partly used, empty; this pointer
   marks the first partly used slab.
   objsize: size in bytes of the objects this cache manages.
   flags: e.g. the off-slab bit.
   colour: the maximum color; the slab's leftover space is divided into 'colour' steps
   of colour_off each.
   colour_off: the color granularity, normally tied to the cache line size (a multiple
   of it).
2) slab_t: the structure that manages one slab.
   list: links into the slabs list of kmem_cache_s.
   colouroff: offset from the slab's first page to the first object (s_mem), i.e.
   color + slab_t + bufctl array, cache aligned.
   inuse: number of objects already allocated.
   free: the bufctl array forms a linked list; starting from index free it chains the
   slab's free objects.

 kmem_cache_s                           one slab (contiguous pages)
 +------------------+                  +----------------------+ <- page start
 | slabs     o------+--+               | color padding        |  = colour_next*colour_off
 | *firstnotfull    |  |               +----------------------+
 | objsize          |  |     slab_t:   |                      |
 | flags            |  +-------------->| list  (next slab_t)  |
 | num              |                  | colouroff            |
 | colour (max)     |                  | s_mem     o----------+--+
 |  = left_over/    |                  | inuse                |  |
 |    colour_off    |                  | free  o--> bufctl[]  |  |
 | colour_off       |                  +----------------------+  |
 |  ~ L1 cache size |                  | bufctl[0..num-1]     |  |
 +------------------+                  |  (chain of free      |  |
                                       |   object indices,    |  |
                                       |   ends at BUFCTL_END)|  |
                                       +----------------------+  | (cache aligned)
                                       | obj 0                |<-+
                                       | obj 1                |
                                       | ...                  |
                                       +----------------------+
Worth noting: once pages pass from the buddy system to a slab, each page descriptor's
prev is made to point to the owning slab and its next to the owning cache (see
kmem_cache_grow). So for any object, rounding objp down to its page gives the cache via
page->next and the slab via page->prev.
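For reference -- quoting from memory, so treat the exact form as approximate --
mm/slab.c wraps this trick in macros over the page descriptor's list pointers:

	/* The page's list.next/list.prev are reused to record ownership
	 * while the page belongs to a slab. */
	#define SET_PAGE_CACHE(pg,x)  ((pg)->list.next = (struct list_head *)(x))
	#define GET_PAGE_CACHE(pg)    ((kmem_cache_t *)(pg)->list.next)
	#define SET_PAGE_SLAB(pg,x)   ((pg)->list.prev = (struct list_head *)(x))
	#define GET_PAGE_SLAB(pg)     ((slab_t *)(pg)->list.prev)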
Now the definitions:
struct kmem_cache_s {
/* 1) each alloc & free */
	/* full, partial first, then free */
	struct list_head	slabs;	/* head of the list of slabs this cache manages */
	struct list_head	*firstnotfull;
	unsigned int		objsize;
	unsigned int		flags;	/* constant flags */
	unsigned int		num;	/* # of objs per slab */
	spinlock_t		spinlock;
#ifdef CONFIG_SMP
	unsigned int		batchcount;	/* on SMP each cpu keeps a fast allocation
						   array; this many objects are pulled from
						   the slabs at a time */
#endif
/* 2) slab additions /removals */ /* order of pgs per slab (2^n) */ unsigned int gfporder;
/* force GFP flags, e.g. GFP_DMA */ unsigned int gfpflags;
	size_t			colour;		/* cache colouring range */
	unsigned int		colour_off;	/* colour granularity; colour*colour_off
						   bounds a slab's color offset */
	unsigned int		colour_next;	/* cache colouring */
	kmem_cache_t		*slabp_cache;	/* when this cache keeps its slab
						   management (slab descriptor plus the
						   kmem_bufctl_t array) outside the slab,
						   this points to the general cache those
						   are allocated from */
	unsigned int		growing;
	unsigned int		dflags;		/* dynamic flags */
	/* constructor / destructor funcs omitted */
/* 3) cache creation/removal */
	char			name[CACHE_NAMELEN];
	struct list_head	next;	/* chains all caches together; a clock-style
					   algorithm walks this chain and periodically
					   reaps part of some cache's slabs */
#ifdef CONFIG_SMP
/* 4) per-cpu data */
	cpucache_t		*cpudata[NR_CPUS];
#endif
#if STATS
	// omitted
#endif
};
/*
 * slab_t
 *
 * Manages the objects in a slab. Placed either at the beginning of the
 * slab, or allocated from a general cache.
 * Slabs are chained into one ordered list: fully used, partially used, empty.
 */
typedef struct slab_s {
	struct list_head	list;		/* chained into a list headed by
						   kmem_cache_s::slabs */
	unsigned long		colouroff;	/* s_mem = SlabBase(buddy page)+colouroff */
	void			*s_mem;		/* location of the first object */
	unsigned int		inuse;		/* number of objects in use in the slab */
	kmem_bufctl_t		free;		/* head of the free-object list */
} slab_t;
Another structure central to slab's operation is cache_cache:
/* internal cache of cache description objs */
static kmem_cache_t cache_cache = {
	slabs:		LIST_HEAD_INIT(cache_cache.slabs),
	firstnotfull:	&cache_cache.slabs,
	objsize:	sizeof(kmem_cache_t),
	flags:		SLAB_NO_REAP,
	spinlock:	SPIN_LOCK_UNLOCKED,
	colour_off:	L1_CACHE_BYTES,
	name:		"kmem_cache",
};
This cache is built by hand; the objects it manages are of type kmem_cache_t, and it is
used to allocate kmem_cache_t -- hence, the cache of caches.
When the kernel initializes slab, it first sets up cache_cache (kmem_cache_init) and
then the general-purpose caches (kmem_cache_sizes_init). One issue is worth raising here:
kmem_cache_sizes_init->kmem_cache_create(name, sizes->cs_size,0,..):
kmem_cache_t * kmem_cache_create (...) { const char *func_nm = KERN_ERR "kmem_create: "; size_t left_over, align, slab_size; kmem_cache_t *cachep = NULL;
	/* Sanity checks... debug */
	....
	/* Allocate the cache descriptor object. No problem here:
	 * cache_cache is already initialized. */
	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
	....
	/* Determine how the objects are managed ('on' or 'off' slab). */
	if (size >= (PAGE_SIZE>>3))	// 512 bytes or more goes off-slab (with the
					// usual 4K pages)
		flags |= CFLGS_OFF_SLAB;
	..........
	if (flags & CFLGS_OFF_SLAB)
		cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
					// slab_t allocated from a general cache
	........
}
When the general caches are being initialized, an off-slab cache must allocate its
slab_t from a general cache -- but is the chosen general cache itself initialized yet?
It works out like this: the 32-256 byte general caches are initialized first and use
in-slab slab_t, so they pose no problem. By the time the >=512 byte sizes are set up,
all that is needed is that slab_size = L1_CACHE_ALIGN(sizeof(slab_t) +
num*sizeof(kmem_bufctl_t)) fits into a cache of at most 512 bytes.
static unsigned long offslab_limit;	/* upper bound on the number of objects in a
					   slab managed off-slab */
This limit is also computed here; every time a non-off-slab general cache is
initialized, the value is updated:
kmem_cache_sizes_init:
	if (!(OFF_SLAB(sizes->cs_cachep))) {
		offslab_limit = sizes->cs_size-sizeof(slab_t);
		offslab_limit /= 2;
	}
The largest in-slab general cache is 256 bytes, giving offslab_limit = (256-24)/2 = 112.
With this bound in place the ordering problem is solved.
(Note: creating the general caches in order never actually triggers the problem -- when
the 512-byte general cache is created, a slab holds only 8 objects, and the later caches
hold even fewer. Also, the offslab_limit /= 2 above is itself wrong: it should divide by
4, i.e. sizeof(kmem_bufctl_t). But since the limit is never reached, the divisor hardly
matters; 2.6 recognized this and dropped the variable altogether, as in the normal case
no problem can arise.)
static cache_sizes_t cache_sizes[] = {
#if PAGE_SIZE == 4096
	{    32,	NULL, NULL},
#endif
	{    64,	NULL, NULL},
	{   128,	NULL, NULL},
	{   256,	NULL, NULL},
	{   512,	NULL, NULL},	// off-slab starts here (an older version of
					// this document got this wrong)
	{  1024,	NULL, NULL},
	{  2048,	NULL, NULL},
	{  4096,	NULL, NULL},
	{  8192,	NULL, NULL},
	{ 16384,	NULL, NULL},
	{ 32768,	NULL, NULL},
	{ 65536,	NULL, NULL},
	{131072,	NULL, NULL},
	{     0,	NULL, NULL}
};
Part 3  Core algorithms
Algorithms related to allocation

Here we discuss a few functions closely tied to slab; they are key to understanding its
core operations. The first is kmem_cache_estimate, which shows the relative layout of
slab_t, the bufctl array, and the objects. Given gfporder, it computes how many objects
the slab can hold and returns the leftover bytes available for coloring.
/* For a given slab, compute the number of objects it can hold (num) and the
 * bytes left over (what remains after the objects and the management data).
 * gfporder: slab size is 2^gfporder * PAGE_SIZE
 * size: object size;  flags: may contain CFLGS_OFF_SLAB
 */
static void kmem_cache_estimate (unsigned long gfporder, size_t size,
		 int flags, size_t *left_over, unsigned int *num)
{
	int i;
	size_t wastage = PAGE_SIZE<<gfporder;	/* total space */
	size_t extra = 0;			/* space taken by the bufctl entries */
	size_t base = 0;			/* size of slab_t */
	/* With an off-slab slab_t, only the objects themselves count. */
	if (!(flags & CFLGS_OFF_SLAB)) {
		base = sizeof(slab_t);
		extra = sizeof(kmem_bufctl_t);
	}
	i = 0;
	/* Keep raising the object count as long as everything still fits. */
	while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage)
		i++;
	/*
	 * Note on the formula: base + i*extra is where the first object would
	 * start; L1_CACHE_ALIGN applied to it yields the aligned address.
	 */
	if (i > 0)
		i--;	/* the loop ran one step past the point of fitting */
	if (i > SLAB_LIMIT)	/* cap the number of objects */
		i = SLAB_LIMIT;
	*num = i;
	wastage -= i*size;			/* total minus the objects */
	wastage -= L1_CACHE_ALIGN(base+i*extra);/* minus the management data */
	*left_over = wastage;			/* the rest is used for coloring */
}
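To see the arithmetic run, here is a standalone sketch of the same loop (not kernel
code; the 24-byte slab_t, 4-byte kmem_bufctl_t, and 32-byte cache line are assumed
32-bit values):

	#include <stdio.h>

	#define PAGE_SIZE 4096
	#define L1_CACHE_ALIGN(x) (((x) + 31) & ~31UL)	/* assumed 32-byte lines */

	/* Mimics kmem_cache_estimate for an on-slab cache of order 0. */
	int main(void)
	{
		unsigned long size = 52;	/* example object size */
		unsigned long base = 24;	/* assumed sizeof(slab_t) */
		unsigned long extra = 4;	/* assumed sizeof(kmem_bufctl_t) */
		unsigned long wastage = PAGE_SIZE;
		unsigned long i = 0;

		while (i*size + L1_CACHE_ALIGN(base + i*extra) <= wastage)
			i++;
		if (i > 0)
			i--;
		/* prints: num = 72, left_over = 32 */
		printf("num = %lu, left_over = %lu\n", i,
		       wastage - i*size - L1_CACHE_ALIGN(base + i*extra));
		return 0;
	}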
Next comes the routine that allocates the slab_t for a slab; it splits into two cases,
off-slab versus in-slab slab_t.
/*
 * Allocate memory for a slab's slab_t.
 * In the non-off-slab case the top of the slab itself is used.
 */
static inline slab_t * kmem_cache_slabmgmt (kmem_cache_t *cachep,
			void *objp, int colour_off, int local_flags)
{
	slab_t *slabp;
	if (OFF_SLAB(cachep)) {	/* off-slab: get the slab_t from the designated
				   general cache */
		/* Slab management obj is off-slab. */
		slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
		if (!slabp)
			return NULL;
	} else {	/* in-slab slab_t */
		/* FIXME: change to
			slabp = objp
		 * if you enable OPTIMIZE */
		slabp = objp+colour_off;	/* see the diagram: the color padding
						   sits at the top of the slab */
		colour_off += L1_CACHE_ALIGN(cachep->num *
				sizeof(kmem_bufctl_t) + sizeof(slab_t));
						/* see the diagram: the relation
						   between colour_off and s_mem */
	}
	slabp->inuse = 0;
	slabp->colouroff = colour_off;
	slabp->s_mem = objp+colour_off;
return slabp; }
Next is static inline void kmem_cache_init_objs (kmem_cache_t * cachep, slab_t * slabp,
unsigned long ctor_flags); the detailed code is omitted (listing it would add little
^_^). Its job: initialize every object of the given slab in a cache, and build the free
list over the bufctl array.
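Stripped of its debug checks, the core of that function is roughly the following (a
from-memory reconstruction, so take the exact form with a grain of salt):

	for (i = 0; i < cachep->num; i++) {
		void *objp = slabp->s_mem + cachep->objsize*i;
		if (cachep->ctor)			/* run the constructor, if any */
			cachep->ctor(objp, cachep, ctor_flags);
		slab_bufctl(slabp)[i] = i+1;		/* free list: each entry names
							   the next free index */
	}
	slab_bufctl(slabp)[i-1] = BUFCTL_END;		/* terminate the chain */
	slabp->free = 0;				/* first free object is #0 */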
Now the core of allocating an object from a slab:
/*
 * Allocate one object from a slab.
 */
static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep,
							 slab_t *slabp)
{
	void *objp;
STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep);
	/* get obj pointer */
	slabp->inuse++;
	objp = slabp->s_mem + slabp->free*cachep->objsize;	/* this is how the
							   object pointer is computed! */
	slabp->free=slab_bufctl(slabp)[slabp->free];	/* free = bufctl[free], i.e.
							   advance to the next free index */
	if (slabp->free == BUFCTL_END)	/* this slab is used up; firstnotfull = next slab */
		/* slab now full: move to next slab for next alloc */
		cachep->firstnotfull = slabp->list.next;
#if DEBUG
	...
#endif
	return objp;
}
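The slab_bufctl used above is plain pointer arithmetic: the index array lives
immediately after the slab_t. Quoting the macro from memory:

	#define slab_bufctl(slabp) \
		((kmem_bufctl_t *)(((slab_t*)slabp)+1))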
The last key allocation function is kmem_cache_grow, which hands a new slab to a cache.
(The code below is already heavily annotated -- hopefully nothing more is needed. Note
that throughout these analyses the synchronization/mutual-exclusion details are not the
focus; that work is left for later.)
/*
 * Allocate one slab for the given cache.
 */
static int kmem_cache_grow (kmem_cache_t * cachep, int flags)
{
	slab_t	*slabp;
	struct page	*page;
	void		*objp;
	size_t		 offset;
	unsigned int	 i, local_flags;
	unsigned long	 ctor_flags;
	unsigned long	 save_flags;
	/* Be lazy and only check for valid flags here,
	 * keeping it out of the critical path in kmem_cache_alloc().
	 */
	if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
		BUG();	/* any other flag is invalid */
	if (flags & SLAB_NO_GROW)
		return 0;
	/*
	 * The test for missing atomic flag is performed here, rather than
	 * the more obvious place, simply to reduce the critical path length
	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
	 * will eventually be caught here (where it matters).
	 */
	if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC)
		BUG();	/* in interrupt context only SLAB_ATOMIC (non-sleeping)
			   allocations are allowed */
	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
	local_flags = (flags & SLAB_LEVEL_MASK);
	if (local_flags == SLAB_ATOMIC)	/* a slab allocation that must not sleep;
					   tell the constructor */
		/*
		 * Not allowed to sleep. Need to tell a constructor about
		 * this - it might need to know...
		 */
		ctor_flags |= SLAB_CTOR_ATOMIC;
/* About to mess with non-constant members - lock. */ spin_lock_irqsave(&cachep->spinlock, save_flags);
	/* Get colour for the slab, and cal the next value. */
	offset = cachep->colour_next;	/* the color this slab should get */
	cachep->colour_next++;
	if (cachep->colour_next >= cachep->colour)
		cachep->colour_next = 0;
	offset *= cachep->colour_off;	/* reserve this many color bytes at the
					   head of the slab */
	cachep->dflags |= DFLGS_GROWN;
cachep->growing++; spin_unlock_irqrestore(&cachep->spinlock, save_flags);
	/* The chain of memory allocations for a new slab follows.
	 * Neither the cache-chain semaphore, or cache-lock, are
	 * held, but the incrementing c_growing prevents this
	 * cache from being reaped or shrunk.
	 * Note: The cache could be selected in for reaping in
	 * kmem_cache_reap(), but when the final test is made the
	 * growing value will be seen.
	 */
	/* Allocate the slab proper: a run of contiguous buddy pages. */
	if (!(objp = kmem_getpages(cachep, flags)))
		goto failed;
	/* Allocate the slab's management structure: the in/off-slab slab_t. */
	if (!(slabp = kmem_cache_slabmgmt(cachep, objp, offset, local_flags)))
		goto opps1;
	/* !!!!!! I hope this is OK. */
	i = 1 << cachep->gfporder;
	page = virt_to_page(objp);
	do {
		/* For each page handed to the slab, the page descriptor's prev
		 * points to the owning slab and next to the owning cache. */
		SET_PAGE_CACHE(page, cachep);
		SET_PAGE_SLAB(page, slabp);
		PageSetSlab(page);
		page++;
	} while (--i);	/* every page of the slab gets this treatment */
	kmem_cache_init_objs(cachep, slabp, ctor_flags);	/* initialize the objects */
spin_lock_irqsave(&cachep->spinlock, save_flags); cachep->growing--;
	/* Make slab active. */
	list_add_tail(&slabp->list,&cachep->slabs);	/* append to the tail of the
							   cache's slab list */
	if (cachep->firstnotfull == &cachep->slabs)	/* adjust the firstnotfull pointer */
		cachep->firstnotfull = &slabp->list;
	STATS_INC_GROWN(cachep);
	cachep->failures = 0;
spin_unlock_irqrestore(&cachep->spinlock, save_flags); return 1; opps1: kmem_freepages(cachep, objp); failed: spin_lock_irqsave(&cachep->spinlock, save_flags); cachep->growing--; spin_unlock_irqrestore(&cachep->spinlock, save_flags); return 0; }
Algorithms related to freeing

kmem_cache_free_one: from objp, find the slab it belongs to (the owning cache, cachep,
is already known), hook the object back into the free-object list, and adjust the slab's
position in the cache's slab list according to its new state.
static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp) { slab_t* slabp;
	CHECK_PAGE(virt_to_page(objp));
	/* reduces memory footprint
	 *
	if (OPTIMIZE(cachep))
		slabp = (void*)((unsigned long)objp&(~(PAGE_SIZE-1)));
	 else
	 */
	slabp = GET_PAGE_SLAB(virt_to_page(objp));	/* page->prev records the slab
						   the obj lives in; see GET_PAGE_SLAB */
#if DEBUG
	....
#endif
	{
		/* hook this obj back into the free-object list */
		unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize;
		slab_bufctl(slabp)[objnr] = slabp->free;
		slabp->free = objnr;
	}
	STATS_DEC_ACTIVE(cachep);	/* statistics */
	/* fixup slab chain */
	if (slabp->inuse-- == cachep->num)	/* full -> partly used */
		goto moveslab_partial;
	if (!slabp->inuse)	/* partly used -> empty */
		goto moveslab_free;
	return;
	/* moving the slab's position -- simple; a sketch follows below */
moveslab_partial:
	/* was full.
	 * Even if the page is now empty, we can set c_firstnotfull to
	 * slabp: there are no partial slabs in this case
	 */
	{
	....
	}
moveslab_free:
	/*
	 * was partial, now empty.
	 * c_firstnotfull might point to slabp
	 * FIXME: optimize
	 */
	{
	....
	}
}
kmem_slab_destroy: uses cachep->dtor to release every object in the slab, then frees the
slab's pages and the slab_t itself. Code omitted.
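The two elided fixups above are plain list surgery. Reconstructed from memory (so not
guaranteed verbatim), they look roughly like this:

	moveslab_partial:
		{	/* was full: put the slab right at firstnotfull */
			struct list_head *t = cachep->firstnotfull;

			cachep->firstnotfull = &slabp->list;
			if (slabp->list.next == t)
				return;		/* already in position */
			list_del(&slabp->list);
			list_add_tail(&slabp->list, t);
			return;
		}
	moveslab_free:
		{	/* was partial, now empty: move to the tail of the list */
			struct list_head *t = cachep->firstnotfull->prev;

			list_del(&slabp->list);
			list_add_tail(&slabp->list, &cachep->slabs);
			if (cachep->firstnotfull == &slabp->list)
				cachep->firstnotfull = t->next;
			return;
		}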
Part 4  Main interfaces provided

The external interfaces fall into several groups: initialization, cache creation and
destruction, object allocation and freeing, and /proc support.
I) slab initialization
The earliest call is void __init kmem_cache_init(void), which sets up cache_cache (the
global management structure for caches).
void __init kmem_cache_init(void)
{	/* much is already statically initialized, so little remains */
	size_t left_over;
init_MUTEX(&cache_chain_sem); INIT_LIST_HEAD(&cache_chain);
kmem_cache_estimate(0, cache_cache.objsize, 0, &left_over, &cache_cache.num); if (!cache_cache.num) BUG();
	cache_cache.colour = left_over/cache_cache.colour_off;
	cache_cache.colour_next = 0;
}
Then the general caches (cache_sizes) are initialized by void __init
kmem_cache_sizes_init(void); its code is not listed again -- the special ordering issue
of general-cache initialization was covered in Part 2, and no further analysis is needed
here.
The last initialization step is __initcall(kmem_cpucache_init), invoked from the init
process. Its main work is done by
static void enable_all_cpucaches (void)
{
	struct list_head* p;
down(&cache_chain_sem);
	p = &cache_cache.next;
	do {	// walk every cache and set up its SMP (per-cpu) data
		kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
		enable_cpucache(cachep);	/* set the per-cpu batch/alloc limits,
						   then use IPI interrupts to have each
						   cpu initialize its private data */
p = cachep->next.next; } while (p != &cache_cache.next);
	up(&cache_chain_sem);
}
II) Creating and destroying caches
kmem_cache_t *kmem_cache_create (const char *name, size_t size, size_t offset,
	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
	void (*dtor)(void*, kmem_cache_t *, unsigned long))
int kmem_cache_destroy (kmem_cache_t * cachep)
int kmem_cache_shrink(kmem_cache_t *cachep)
void kmem_cache_reap (int gfp_mask)
/**
 * kmem_cache_create - create a cache
 * @name: identifier string for this cache, shown in /proc/slabinfo
 * @size: size of the objects in this cache.
 * @offset: offset to use within the slab.
 * @flags: SLAB flags
 * @ctor: constructor for pages of this cache.
 * @dtor: destructor for pages of this cache.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a int, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache
 * and the @dtor is run before the pages are handed back.
 * The flags are
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
 * memory pressure.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline. This can be beneficial if you're counting cycles as closely
 * as davem.
 */
kmem_cache_t *
kmem_cache_create (const char *name, size_t size, size_t offset,
	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
	void (*dtor)(void*, kmem_cache_t *, unsigned long))
{
	const char *func_nm = KERN_ERR "kmem_create: ";
	size_t left_over, align, slab_size;
	kmem_cache_t *cachep = NULL;
	/*
	 * Sanity checks... these are all serious usage bugs.
	 */
	...	// omitted
#if DEBUG
	..........	// omitted
#endif
	/*
	 * Always checks flags, a caller might be expecting debug
	 * support which isn't available.
	 */
	if (flags & ~CREATE_MASK)	// no flags outside this mask may be specified
		BUG();
	/* allocate the kmem_cache_s */
	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
	if (!cachep)
		goto opps;
	memset(cachep, 0, sizeof(kmem_cache_t));
	/* Check that size is in terms of words. This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	if (size & (BYTES_PER_WORD-1)) {	// force word alignment of the object size
		size += (BYTES_PER_WORD-1);
		size &= ~(BYTES_PER_WORD-1);
		printk("%sForcing size word alignment - %s\n", func_nm, name);
	}
#if DEBUG
	....	// omitted
#endif
	align = BYTES_PER_WORD;
	if (flags & SLAB_HWCACHE_ALIGN)
		align = L1_CACHE_BYTES;
	/* Determine how the objects are managed ('on' or 'off' slab). */
	if (size >= (PAGE_SIZE>>3))	// large objects get an off-slab slab_t
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;
	if (flags & SLAB_HWCACHE_ALIGN) {	/* adjust the obj size so objects
						   are cache-line aligned */
		/* Need to adjust size so that objs are cache aligned. */
		/* Small obj size, can get at least two per cache line. */
		/* FIXME: only power of 2 supported, was better */
		while (size < align/2)
			align /= 2;
		size = (size+align-1)&(~(align-1));
	}
	/* Compute the slab size (in pages) and the number of objects per slab.
	 * This could be made much more intelligent. For now, try to avoid
	 * using high page-orders for slabs. When the gfp() funcs are more
	 * friendly towards high-order requests, this should be changed.
	 */
	do {
		unsigned int break_flag = 0;
cal_wastage:
		kmem_cache_estimate(cachep->gfporder, size, flags,
						&left_over, &cachep->num);
		if (break_flag)
			break;
		if (cachep->gfporder >= MAX_GFP_ORDER)	// a slab never exceeds this order
			break;
		if (!cachep->num)	// not even one object fits?
			goto next;	// the only option is a bigger slab
		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) {
			// this check was detailed in Part 2
			/* Oops, this num of objs will cause problems. */
			cachep->gfporder--;
			break_flag++;
			goto cal_wastage;
		}
		/*
		 * Large num of objs is good, but v. large slabs are currently
		 * bad for the gfp()s.
		 */
		if (cachep->gfporder >= slab_break_gfp_order)	// once the slab holds
			break;	// objects, stop: normally at most two pages go to a slab
		if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))	// waste under 1/8
			break;	/* Acceptable internal fragmentation. */	// is acceptable
next:
		cachep->gfporder++;
	} while (1);
	if (!cachep->num) {
	...	// omitted
		goto opps;
	}
	/* size of slab_t plus the bufctl array */
	slab_size = L1_CACHE_ALIGN(cachep->num*sizeof(kmem_bufctl_t)+sizeof(slab_t));
	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {	// the leftover space
								// can hold the slab_t
		flags &= ~CFLGS_OFF_SLAB;	// switch to an in-slab slab_t
		left_over -= slab_size;
	}
/* Offset must be a multiple of the alignment. */ offset += (align-1); offset &= ~(align-1); if (!offset) offset = L1_CACHE_BYTES; cachep->colour_off = offset; cachep->colour = left_over/offset;
	/* init remaining fields */
	........	// simple, omitted
#ifdef CONFIG_SMP
	if (g_cpucache_up)
		enable_cpucache(cachep);
#endif
	/* Need the semaphore to access the chain. */
	....	// check whether a cache with the same name exists; omitted
	/* There is no reason to lock our new cache before we
	 * link it in - no one knows about it yet...
	 */
	list_add(&cachep->next, &cache_chain);
	up(&cache_chain_sem);
opps:
	return cachep;
}
These two functions:
int kmem_cache_destroy (kmem_cache_t * cachep)
int kmem_cache_shrink(kmem_cache_t *cachep)
are, frankly, simple. destroy releases everything -- the cache ceases to exist; shrink
releases only the completely empty slabs. (Pay attention to the cachep->growing value
here.)
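For the curious, the heart of the shrink path looks roughly like this (a from-memory
sketch, not guaranteed verbatim): empty slabs live at the tail of the list, so it pops
from the tail until it hits a slab in use, and backs off whenever the cache is growing.

	while (!cachep->growing) {
		slab_t *slabp;
		struct list_head *p = cachep->slabs.prev;	/* empty slabs sit at
								   the list tail */
		if (p == &cachep->slabs)	/* list exhausted */
			break;
		slabp = list_entry(p, slab_t, list);
		if (slabp->inuse)		/* first non-empty slab: stop */
			break;

		list_del(&slabp->list);
		if (cachep->firstnotfull == &slabp->list)
			cachep->firstnotfull = &cachep->slabs;

		spin_unlock_irq(&cachep->spinlock);
		kmem_slab_destroy(cachep, slabp);	/* dtor objs, free pages
							   and the slab_t */
		spin_lock_irq(&cachep->spinlock);
	}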
III) Allocating and freeing objects
void * kmem_cache_alloc (kmem_cache_t *cachep, int flags)
void kmem_cache_free (kmem_cache_t *cachep, void *objp)
void * kmalloc (size_t size, int flags)
void kfree (const void *objp)
1) Allocation. The core functions have already been discussed; what remains here is the
SMP logic. The main SMP optimization is dropping the spin lock by allocating from
per-cpu fast arrays.
void * kmem_cache_alloc (kmem_cache_t *cachep, int flags)------------------------> static inline void * __kmem_cache_alloc (kmem_cache_t *cachep, int flags) { unsigned long save_flags; void* objp;
	kmem_cache_alloc_head(cachep, flags); /* Debug only */
try_again:
	local_irq_save(save_flags);
#ifdef CONFIG_SMP
	{	// on SMP, serve the request from the per-cpu fast array;
		// the point is to avoid taking the cache lock
		cpucache_t *cc = cc_data(cachep);
		if (cc) {
			if (cc->avail) {
				STATS_INC_ALLOCHIT(cachep);
				objp = cc_entry(cc)[--cc->avail];
			} else {
				STATS_INC_ALLOCMISS(cachep);
				objp = kmem_cache_alloc_batch(cachep,flags);
							// empty? fetch a batch
				if (!objp)
					goto alloc_new_slab_nolock;
			}
		} else {
			spin_lock(&cachep->spinlock);
			objp = kmem_cache_alloc_one(cachep);	// otherwise fall back
							// to allocating from the slabs
			spin_unlock(&cachep->spinlock);
		}
	}
#else
	objp = kmem_cache_alloc_one(cachep);
#endif
	local_irq_restore(save_flags);
	return objp;
alloc_new_slab:
#ifdef CONFIG_SMP
	spin_unlock(&cachep->spinlock);
alloc_new_slab_nolock:
#endif
	local_irq_restore(save_flags);
	if (kmem_cache_grow(cachep, flags))
		/* Someone may have stolen our objs. Doesn't matter, we'll
		 * just come back here again.
		 */
		goto try_again;
	return NULL;
}
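The cc_data/cc_entry used above come from the per-cpu cache structure; quoting the
definitions from memory (treat the exact form as approximate), the object pointers are
stored directly after the small header:

	typedef struct cpucache_s {
		unsigned int avail;	/* objects currently held in this array */
		unsigned int limit;	/* maximum objects kept per cpu */
	} cpucache_t;

	/* the array of object pointers follows the header */
	#define cc_entry(cpucache) \
		((void **)(((cpucache_t*)(cpucache))+1))
	#define cc_data(cachep) \
		((cachep)->cpudata[smp_processor_id()])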
2) Freeing: kmem_cache_free mirrors allocation and needs no further discussion.
3) void * kmalloc (size_t size, int flags)
   void kfree (const void *objp)
share the same core as 1) and 2); they merely pick a suitable general cache first.
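"Pick a suitable general cache" is just a walk over the cache_sizes table; from memory,
kmalloc is essentially this (the BUG() tail for oversized requests is elided):

	void * kmalloc (size_t size, int flags)
	{
		cache_sizes_t *csizep = cache_sizes;

		for (; csizep->cs_size; csizep++) {
			if (size > csizep->cs_size)
				continue;	/* first size that fits wins */
			return __kmem_cache_alloc(flags & GFP_DMA ?
				 csizep->cs_dmacachep : csizep->cs_cachep, flags);
		}
		return NULL;	/* size exceeds the largest general cache */
	}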
IV) /proc support
Only the parameter meanings are listed here; the implementation is straightforward.
proc read:
/**
 * slabinfo_read_proc - generates /proc/slabinfo
 * @page: scratch area, one page long (data is written into this buffer)
 * @start: pointer to the pointer to the output buffer
 * @off: offset within /proc/slabinfo the caller is interested in
 * @count: requested len in bytes (how much the user wants to read)
 * @eof: eof marker
 * @data: unused
 *
 * The contents of the buffer are
 * cache-name
 * num-active-objs
 * total-objs
 * object size
 * num-active-slabs
 * total-slabs
 * num-pages-per-slab
 * + further values on SMP and with statistics enabled
 */
int slabinfo_read_proc (char *page, char **start, off_t off,
				 int count, int *eof, void *data)
------------------------------>
/*
 * page: buffer to write to
 * start: pointer to the start of the valid data
 * off: the user reads from offset 'off' in this 'file'
 * count: how much the user wants to read
 */
static int proc_getdata (char*page, char**start, off_t off, int count)
proc write:
/**
 * slabinfo_write_proc - SMP tuning for the slab allocator
 * @file: unused
 * @buffer: user buffer
 * @count: data len
 * @data: unused
 */
int slabinfo_write_proc (struct file *file, const char *buffer,
				unsigned long count, void *data)
The write operation gives the user a chance to tune the SMP per-cpu parameters.
Part 5  SMP support in brief

For something that sounds as intimidating as SMP, the slab implementation of it is
simple, clear, and presumably effective: per-cpu fast allocation queues avoid some
locking and improve response time.
In essence, the SMP alloc/free paths operate on the per-cpu data, allocating and
releasing objects in batches; /proc provides the statistics and tuning; plus some
initialization. That is all.
Between listing code and giving an overview, the trade-off is hard to strike; most
important, I think, is to read the source yourself.
2006.8.4 22:02