专题:Linux内存管理专题

关键词:slab/slub/slob、slab描述符、kmalloc、本地/共享对象缓冲池、slabs_partial/slabs_full/slabs_free、avail/limit/batchcount。

从Linux内存管理框架图可以知道:slab/slub/slob都是基于伙伴系统的。

伙伴系统是以page为单位进行操作的。但是很多场景并不需要如此大的内存分配,slab就是用在这种场景的。

本章节主要内容:首先从slab相关数据结构讲起,对slab有一个静态的认识;然后按照创建描述符->分配缓存->释放缓存->销毁描述符的顺序介绍整个slab生命周期;最后介绍基于slab分配器的kmalloc的运行原理。

slab分配器最终还是由伙伴系统来分配出实际的物理页面,只不过slab分配器在这些连续的物理页面上实现了自己的算法,以此来对小内存块进行管理。

slab分配器相关重要函数有:

struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,---------创建slab描述符kmem_cache,此时并没有真正分配内存
unsigned long, void (*)(void *));
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);------------------分配slab缓存对象
void kmem_cache_free(struct kmem_cache *, void *);-------------------------释放slab缓存对象
void kmem_cache_destroy(struct kmem_cache *);-----------------------------销毁slab描述符

1. slab相关数据结构

slab对象的描述符struct kmem_cache:

struct kmem_cache {
struct array_cache __percpu *cpu_cache; /* 1) Cache tunables. Protected by slab_mutex */
unsigned int batchcount;-----------------------------------表示当前CPU本地缓冲池array_cache为空时,从共享缓冲池或者slabs_partial/slabs_free列表中获取对象的数目。
unsigned int limit;----------------------------------------表示当本地对象缓冲池空闲对象数目大于limit时就会主动释放batchcount个对象,便于内核回收和销毁slab。
unsigned int shared; unsigned int size;-----------------------------------------align过后的对象长度
struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */ unsigned int flags; /* constant flags */------------分配掩码
unsigned int num; /* # of objs per slab */----------slab中有多少个对象 /* 3) cache_grow/shrink */
/* order of pgs per slab (2^n) */
unsigned int gfporder;------------------------------------此slab占用2^gfporder个页面 /* force GFP flags, e.g. GFP_DMA */
gfp_t allocflags; size_t colour; /* cache colouring range */----一个slab有几个不同的cache line
unsigned int colour_off; /* colour offset */----------一个cache colour的长度,和L1 Cache Line长度相同
struct kmem_cache *freelist_cache;
unsigned int freelist_size; /* constructor func */
void (*ctor)(void *obj); /* 4) cache creation/removal */
const char *name;----------------------------------------slab描述符的名称
struct list_head list;
int refcount;--------------------------------------------被引用的次数,供slab描述符销毁参考
int object_size;-----------------------------------------对象的实际大小
int align;-----------------------------------------------对齐的大小 /* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
unsigned long num_active;
unsigned long num_allocations;
unsigned long high_mark;
unsigned long grown;
unsigned long reaped;
unsigned long errors;
unsigned long max_freeable;
unsigned long node_allocs;
unsigned long node_frees;
unsigned long node_overflow;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
atomic_t freemiss; /*
* If debugging is enabled, then the allocator can add additional
* fields and/or padding to every object. size contains the total
* object size including these internal fields, the following two
* variables contain the offset to the user object and its size.
*/
int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG_KMEM
struct memcg_cache_params memcg_params;
#endif struct kmem_cache_node *node[MAX_NUMNODES];-------slab对应的节点的struct kmem_cache_node数据结构
}

本地CPU缓冲池struct array_cache:

struct array_cache {
unsigned int avail;-------------对象缓冲池中可用的对象数目
unsigned int limit;
unsigned int batchcount;
unsigned int touched;----------从缓冲池移除一个对象时,touched置1;收缩缓存时,touched置0。
void *entry[];-----------------保存对象的实体
};

内存节点的slab列表:

/*
* The slab lists for all objects.
*/
struct kmem_cache_node {
spinlock_t list_lock; #ifdef CONFIG_SLAB
struct list_head slabs_partial; /* partial list first, better asm code */----slab链表中部分对象空闲
struct list_head slabs_full;----------------------------------------------------slab链表中没有对象空闲
struct list_head slabs_free;----------------------------------------------------slab链表中所有对象空闲
unsigned long free_objects;-----------------------------------------------------三个链表中空闲对象数目
unsigned int free_limit;--------------------------------------------------------slab中可容许的空闲对象数目最大阈值。
unsigned int colour_next; /* Per-node cache coloring */
struct array_cache *shared; /* shared per node */----------------------------多核CPU公用的共享对象缓冲池
struct alien_cache **alien; /* on other nodes */
unsigned long next_reap; /* updated without locking */
int free_touched; /* updated without locking */
#endif #ifdef CONFIG_SLUB
unsigned long nr_partial;
struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
atomic_long_t nr_slabs;
atomic_long_t total_objects;
struct list_head full;
#endif
#endif };

SLAB Flags

/*
* Flags to pass to kmem_cache_create().
* The ones marked DEBUG are only valid if CONFIG_SLAB_DEBUG is set.
*/
#define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */
#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */
#define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */
#define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */
#define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */
#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */
#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */

2. 创建slab描述符

kmem_cache_create的最主要功能就是填充struct kmem_cache,主要参数有:

name:slab描述符的名称

size:缓存对象的大小

align:对齐的大小

flags:分配掩码

ctor:对象的构造函数

kmem_cache_create函数调用核心流程是:

kmem_cache_create-----------------------------进行合法性检查,以及是否有现成slab描述符可用
do_kmem_cache_create----------------------将主要参数配置到slab描述符,然后将得到的描述符加入slab_caches全局链表中。
__kmem_cache_create-------------------是创建slab描述符的核心:进行对齐操作,计算需要的页面和对象数目,对slab着色等等操作。
calculate_slab_order--------------计算slab对象需要的大小,以及一个slab描述符需要多少page
setup_cpu_cache-------------------继续配置slab描述符
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
...
s = __kmem_cache_alias(name, size, align, flags, ctor);----------------检查是否有现成的slab描述符可用,有即跳转到out_unlock。
if (s)
goto out_unlock; cache_name = kstrdup_const(name, GFP_KERNEL);
if (!cache_name) {
err = -ENOMEM;
goto out_unlock;
} s = do_kmem_cache_create(cache_name, size, size,----------------------调用do_kmem_cache_create创建slab描述符
calculate_alignment(flags, align, size),
flags, ctor, NULL, NULL);
...
return s;
}
static struct kmem_cache *
do_kmem_cache_create
(const char *name, size_t object_size, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *),
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
struct kmem_cache *s;
int err; err = -ENOMEM;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);-----------------------分配一个struct kmem_cache结构体
if (!s)
goto out; s->name = name;
s->object_size = object_size;
s->size = size;
s->align = align;
s->ctor = ctor;-----------------------------------------------------将参数填入struct kmem_cache结构体
...
err = __kmem_cache_create(s, flags);-------------------------------
...
s->refcount = 1;
list_add(&s->list, &slab_caches);----------------------------------将创建的slab描述符加入到全局变量slab_caches中
...
}

__kmem_cache_create是创建slab描述符的核心:

int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
size_t left_over, freelist_size;
size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
int err;
size_t size = cachep->size;
...
/*
* Check that size is in terms of words. This is needed to avoid
* unaligned accesses for some archs when redzoning is used, and makes
* sure any on-slab bufctl's are also correctly aligned.
*/
if (size & (BYTES_PER_WORD - 1)) {
size += (BYTES_PER_WORD - 1);
size &= ~(BYTES_PER_WORD - 1);----------------------4字节对齐
} if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
/* If redzoning, ensure that the second redzone is suitably
* aligned, by adjusting the object size accordingly. */
size += REDZONE_ALIGN - ;
size &= ~(REDZONE_ALIGN - );
} /* 3) caller mandated alignment */
if (ralign < cachep->align) {
ralign = cachep->align;
}
/* disable debug if necessary */
if (ralign > __alignof__(unsigned long long))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
*/
cachep->align = ralign;------------------------------对齐大小设置到struct kmem_cache if (slab_is_available())-----------------------------slab_state>=UP时,可以使用GFP_KERNEL分配,否则只能使用GFP_NOWAIT
gfp = GFP_KERNEL;
else
gfp = GFP_NOWAIT;
...
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do
* it too early on. Always use on-slab management when
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
if ((size >= (PAGE_SIZE >> )) && !slab_early_init &&
!(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/
flags |= CFLGS_OFF_SLAB; size = ALIGN(size, cachep->align);------------------按照cachep->align对size进行对齐
/*
* We should restrict the number of objects in a slab to implement
* byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
*/
if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); left_over = calculate_slab_order(cachep, size, cachep->align, flags); if (!cachep->num)
return -E2BIG; freelist_size = calculate_freelist_size(cachep->num, cachep->align); /*
* If the slab has been placed off-slab, and we have enough space then
* move it on-slab. This is at the expense of any extra colouring.
*/
if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
flags &= ~CFLGS_OFF_SLAB;
left_over -= freelist_size;
} if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
freelist_size = calculate_freelist_size(cachep->num, ); #ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
* poisoning, then it's going to smash the contents of
* the redzone and userword anyhow, so switch them off.
*/
if (size % PAGE_SIZE == && flags & SLAB_POISON)
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
} cachep->colour_off = cache_line_size();----------------------------------------L1 Cache line大小,由CONFIG_ARM_L1_CACHE_SHIFT配置,此处为64B。
/* Offset must be a multiple of the alignment. */
if (cachep->colour_off < cachep->align)
cachep->colour_off = cachep->align;
cachep->colour = left_over / cachep->colour_off;
cachep->freelist_size = freelist_size;
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->allocflags |= GFP_DMA;
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) {
cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
/*
* This is a possibility for one of the kmalloc_{dma,}_caches.
* But since we go off slab only for object size greater than
* PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
* in ascending order,this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
} err = setup_cpu_cache(cachep, gfp);-------------------------------------根据slab_state状态进行不同处理,计算limit/batchcount,分配本地对象缓冲池,共享对象缓冲池
if (err) {
__kmem_cache_shutdown(cachep);
return err;
} return ;
}

slab_state用于表示slab分配器的状态:

/*
* State of the slab allocator.
*
* This is used to describe the states of the allocator during bootup.
* Allocators use this to gradually bootstrap themselves. Most allocators
* have the problem that the structures used for managing slab caches are
* allocated from slab caches themselves.
*/
enum slab_state {
DOWN, /* No slab functionality yet */
PARTIAL, /* SLUB: kmem_cache_node available */
PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */
UP, /* Slab caches usable but not all extras yet */
FULL /* Everything is working */------------------------完全初始化
};

calculate_slab_order计算slab需要的页面order(保存到cachep->gfporder),同时也计算此slab中可以容纳多少个同样大小的对象(保存到cachep->num);返回值是可用于着色的剩余空间left_over。

static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
unsigned long offslab_limit;
size_t left_over = ;
int gfporder; for (gfporder = ; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {------从gfporder=0开始,直到KMALLOC_MAX_ORDER=10,即从4KB到4MB大小。
unsigned int num;
size_t remainder; cache_estimate(gfporder, size, align, flags, &remainder, &num);
if (!num)---------------------------------------------------------不等于0则表示gfporder已经满足条件,最低分配到一个size大小的对象。等于0则继续下一次for循环。
continue; /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
if (num > SLAB_OBJ_MAX_NUM)--------------------------------------slab中对象最大数目,SLAB_OBJ_MAX_NUM为255,所以所有的slab对象不超过255
break; if (flags & CFLGS_OFF_SLAB) {
size_t freelist_size_per_obj = sizeof(freelist_idx_t);
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
freelist_size_per_obj += sizeof(char);
offslab_limit = size;
offslab_limit /= freelist_size_per_obj; if (num > offslab_limit)
break;
} /* Found something acceptable - save it away */
cachep->num = num;
cachep->gfporder = gfporder;
left_over = remainder;-------------------------------------------确定对象个数和需要的页面数
...
        if (left_over * 8 <= (PAGE_SIZE << gfporder))-------------------满足着色条件,退出for循环。
break;
}
return left_over;
}

cache_estimate根据当前大小2^gfporder个页面来计算可以容纳多少个对象,以及剩下多少空间用于着色。

static void cache_estimate(unsigned long gfporder, size_t buffer_size,
size_t align, int flags, size_t *left_over,
unsigned int *num)
{
int nr_objs;
size_t mgmt_size;
size_t slab_size = PAGE_SIZE << gfporder;
...
if (flags & CFLGS_OFF_SLAB) {
mgmt_size = ;
nr_objs = slab_size / buffer_size; } else {
nr_objs = calculate_nr_objs(slab_size, buffer_size,--------------可以容纳对象数
sizeof(freelist_idx_t), align);
mgmt_size = calculate_freelist_size(nr_objs, align);
}
*num = nr_objs;
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;------------除去对象大小和管理slab额外开销外,剩余部分
}

3. 分配slab对象

kmem_cache_alloc是slab分配缓存对象的核心函数,在slab分配缓存过程中是全程关闭本地中断的。

kmem_cache_alloc-->slab_alloc-->__do_cache_alloc是关中断的。

static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
...
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);-------------------------全程关本地中断
local_irq_restore(save_flags);
...
}

由于没有定义NUMA,所以__do_cache_alloc就仅通过____cache_alloc来分配缓存。

static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
bool force_refill = false; check_irq_off(); ac = cpu_cache_get(cachep);----------------------------------------获取本地对象缓冲池
if (likely(ac->avail)) {-------------------------------------------本地对象缓冲池是否有空闲对象
ac->touched = ;
objp = ac_get_obj(cachep, ac, flags, false);-------------------从本地对象缓冲池中分配一个对象 /*
* Allow for the possibility all avail objects are not allowed
* by the current flags
*/
if (objp) {
STATS_INC_ALLOCHIT(cachep);
goto out;-------------------------------------------------如果成功获得objp,那么直接返回指针。
}
force_refill = true;
} STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags, force_refill);------------是slab分配缓存的核心
/*
* the 'ac' may be updated by cache_alloc_refill(),
* and kmemleak_erase() requires its correct value.
*/
ac = cpu_cache_get(cachep); out:
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
* treat the array pointers as a reference to the object.
*/
if (objp)
kmemleak_erase(&ac->entry[ac->avail]);
return objp;
}

cache_alloc_refill是slab分配缓存的核心:

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
bool force_refill)
{
int batchcount;
struct kmem_cache_node *n;
struct array_cache *ac;
int node; check_irq_off();
node = numa_mem_id();
if (unlikely(force_refill))
goto force_grow;
retry:
ac = cpu_cache_get(cachep);-----------------------------------------获取本地对象缓冲池ac
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
/*
* If there was little recent activity on this cache, then
* perform only a partial refill. Otherwise we could generate
* refill bouncing.
*/
batchcount = BATCHREFILL_LIMIT;
}
n = get_node(cachep, node);-----------------------------------------找到对应的slab节点 BUG_ON(ac->avail > || !n);
spin_lock(&n->list_lock); /* See if we can refill from the shared array */
if (n->shared && transfer_objects(ac, n->shared, batchcount)) {----判断共享对象缓冲池(n->shared)是否有空闲对象。transfer_objects尝试迁移batchcount个空闲对象到ac中。
n->shared->touched = ;
goto alloc_done;
} while (batchcount > ) {---------------------------------尝试从slabs_partial/slabs_free中分配对象
struct list_head *entry;
struct page *page;
/* Get slab alloc is to come from. */
entry = n->slabs_partial.next;
if (entry == &n->slabs_partial) {
n->free_touched = ;
entry = n->slabs_free.next;
if (entry == &n->slabs_free)
goto must_grow;-----------------------------如果slabs_partial/slabs_free都为空,则跳到must_grow分配对象。
} page = list_entry(entry, struct page, lru);
check_spinlock_acquired(cachep); /*
* The slab was either on partial or free list so
* there must be at least one object available for
* allocation.
*/
BUG_ON(page->active >= cachep->num); while (page->active < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep); ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
node));---------------------ac_put_obj将slab_get_obj获取到的对象迁移到ac中。
} /* move slabp to correct slabp list: */
list_del(&page->lru);
if (page->active == cachep->num)------------------------将获取到的slab挂到合适的链表。
list_add(&page->lru, &n->slabs_full);
else
list_add(&page->lru, &n->slabs_partial);
} must_grow:
n->free_objects -= ac->avail;
alloc_done:
spin_unlock(&n->list_lock); if (unlikely(!ac->avail)) {--------------------------------ac->avail为0表示从共享对象缓冲池、slabs_free/slabs_partial都失败了。
int x;
force_grow:
x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);---在cachep中创建一个slab,并挂到slabs_free链表中。 /* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
node = numa_mem_id(); /* no objects in sight? abort */
if (!x && (ac->avail == || force_refill))
return NULL; if (!ac->avail) /* objects refilled by interrupt? */
goto retry;
}
ac->touched = ; return ac_get_obj(cachep, ac, flags, force_refill);
}

4. 释放slab对象

slab释放对象通过kmem_cache_free进行,在释放过程中也是全程关中断的。

一个slab描述符中可能有多个对象,因此释放对象需要两个参数才能确定释放内容。

void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
unsigned long flags;
cachep = cache_from_obj(cachep, objp);-----------------------------通过对象找到slab描述符
if (!cachep)
return; local_irq_save(flags);
debug_check_no_locks_freed(objp, cachep->object_size);
if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(objp, cachep->object_size);
__cache_free(cachep, objp, _RET_IP_);-------------------------------关本地中断
local_irq_restore(flags); trace_kmem_cache_free(_RET_IP_, objp);
}

cache_from_obj通过要释放对象虚拟地址,找到所在页面,继而找到对应的struct kmem_cache结构体。

然后将转换得到的slab描述符和入参描述符对比,即可判断两者是否有效。

static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
...
page = virt_to_head_page(x);-----------由virt_to_page找到对应的page,再找到first_page。
cachep = page->slab_cache;-------------first_page中有指向slab描述符的slab_cache
if (slab_equal_or_root(cachep, s))-----判断两者是否吻合
return cachep;
...
return s;
}

__cache_free是释放slab对象的核心:

首先通过slab描述符找到本地对象缓冲池;

然后比较ac->avail和ac->limit的大小,如果avail达到或超过limit,则需要cache_flusharray去回收空闲对象;

最后ac_put_obj将对象释放到本地对象缓冲池ac中,释放过程结束。

static inline void __cache_free(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
struct array_cache *ac = cpu_cache_get(cachep);----------------找到本地对象缓冲池 check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller); kmemcheck_slab_free(cachep, objp, cachep->object_size); /*
* Skip calling cache_free_alien() when the platform is not numa.
* This will avoid cache misses that happen while accessing slabp (which
* is per page memory reference) to get nodeid. Instead use a global
* variable to skip the call, which is mostly likely to be present in
* the cache.
*/
if (nr_online_nodes > && cache_free_alien(cachep, objp))
return; if (ac->avail < ac->limit) {
STATS_INC_FREEHIT(cachep);
} else {
STATS_INC_FREEMISS(cachep);
cache_flusharray(cachep, ac);---------------------------------尝试回收空闲对象
} ac_put_obj(cachep, ac, objp);-------------------------------------将对象释放到本地对象缓冲池ac中
}

5. kmalloc分配函数

kmalloc函数基于slab机制,分配的内存大小也是对齐到2^order个字节。

分配的时候是从kmalloc-xxx的slab描述符中分配一个对象。

这些kmalloc-xxx的slab描述符是由create_kmalloc_caches在系统初始化的时候创建的。

PS:下面代码根据slub进行分析。

5.1 kmalloc slab描述符创建

create_kmalloc_caches的调用路径是start_kernel-->mm_init-->kmem_cache_init-->create_kmalloc_caches。

在初始化之前,弄明白这三个参数KMALLOC_SHIFT_LOW, KMALLOC_SHIFT_HIGH, KMALLOC_SHIFT_MAX很重要。

#define CONFIG_ARM_L1_CACHE_SHIFT 6----------------------------------------6,对应64B
=================================================
#define L1_CACHE_SHIFT CONFIG_ARM_L1_CACHE_SHIFT
#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)------------------------即为64B /*
* Memory returned by kmalloc() may be used for DMA, so we must make
* sure that all such allocations are cache aligned. Otherwise,
* unrelated code may cause parts of the buffer to be read into the
* cache before the transfer is done, causing old data to be seen by
* the CPU.
*/
#define ARCH_DMA_MINALIGN L1_CACHE_BYTES-------------------------------和L1 Cache对齐,即64B对齐 =================================================
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN---------------------------即为64B
#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN--------------------------------即为64B
#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)------------------------位移量为6,对应64B
=================================================/*
* SLUB directly allocates requests fitting in to an order-1 page
* (PAGE_SIZE*2). Larger requests are passed to the page allocator.
*/
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)---------------------------位移量为13,对应8KB大小
#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT)--------------------位移量为23,对应8MB大小

所以:

KMALLOC_MIN_SIZE=64 KMALLOC_SHIFT_LOW=6 KMALLOC_SHIFT_HIGH=13 KMALLOC_SHIFT_MAX=23

对于尺寸小于等于192B的kmalloc请求,从哪个slab描述符中分配缓存,是通过size_index进行特殊映射的。

/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
* of two cache sizes there. The size of larger slabs can be determined using
* fls.
*/
static s8 size_index[24] = {
3, /* 8 */
4, /* 16 */
5, /* 24 */
5, /* 32 */
6, /* 40 */
6, /* 48 */
6, /* 56 */
6, /* 64 */
1, /* 72 */
1, /* 80 */
1, /* 88 */
1, /* 96 */
7, /* 104 */
7, /* 112 */
7, /* 120 */
7, /* 128 */
2, /* 136 */
2, /* 144 */
2, /* 152 */
2, /* 160 */
2, /* 168 */
2, /* 176 */
2, /* 184 */
2 /* 192 */
};

size_index的数值对应kmalloc_caches的下标,kmalloc_caches的内容由create_kmalloc_caches创建。

/*
* Create the kmalloc array. Some of the regular kmalloc arrays
* may already have been created because they were needed to
* enable allocations for slab creation.
*/
void __init create_kmalloc_caches(unsigned long flags)
{
int i; /*
* Patch up the size_index table if we have strange large alignment
* requirements for the kmalloc array. This is only the case for
* MIPS it seems. The standard arches will not generate any code here.
*
* Largest permitted alignment is 256 bytes due to the way we
* handle the index determination for the smaller caches.
*
* Make sure that nothing crazy happens if someone starts tinkering
* around with ARCH_KMALLOC_MINALIGN
*/
BUILD_BUG_ON(KMALLOC_MIN_SIZE > ||
(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - ))); for (i = ; i < KMALLOC_MIN_SIZE; i += ) {
int elem = size_index_elem(i); if (elem >= ARRAY_SIZE(size_index))
break;
size_index[elem] = KMALLOC_SHIFT_LOW;----------------------------
} if (KMALLOC_MIN_SIZE >= ) {
/*
* The 96 byte size cache is not used if the alignment
* is 64 byte.
*/
for (i = + ; i <= ; i += )
size_index[size_index_elem(i)] = ; } if (KMALLOC_MIN_SIZE >= ) {
/*
* The 192 byte sized cache is not used if the alignment
* is 128 byte. Redirect kmalloc to use the 256 byte cache
* instead.
*/
for (i = + ; i <= ; i += )
size_index[size_index_elem(i)] = ;
}
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {------------------------i从KMALLOC_SHIFT_LOW=6开始到KMALLOC_SHIFT_HIGH=13,这里创建kmalloc-64/kmalloc-128/kmalloc-256/kmalloc-512/kmalloc-1024/kmalloc-2048/kmalloc-4096/kmalloc-8192
if (!kmalloc_caches[i]) {
kmalloc_caches[i] = create_kmalloc_cache(NULL,
<< i, flags);
} /*
* Caches that are not of the two-to-the-power-of size.
* These have to be created immediately after the
* earlier power of two caches
*/
if (KMALLOC_MIN_SIZE <= && !kmalloc_caches[] && i == )----------------KMALLOC_MIN_SIZE为64,跳过
kmalloc_caches[] = create_kmalloc_cache(NULL, , flags); if (KMALLOC_MIN_SIZE <= && !kmalloc_caches[] && i == )----------------创建kmalloc-192
kmalloc_caches[] = create_kmalloc_cache(NULL, , flags);
} /* Kmalloc array is now usable */
slab_state = UP; for (i = ; i <= KMALLOC_SHIFT_HIGH; i++) {
struct kmem_cache *s = kmalloc_caches[i];
char *n; if (s) {
n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));--------------修改slab描述符名称 BUG_ON(!n);
s->name = n;
}
}
...
}

其中size_index经过重映射之后变成了如下。

所以8B/16B/24B/32B/40B/48B/56B/64B都使用kmalloc-64;

72B/80B/88B/96B/104B/112B/120B/128B都使用kmalloc-128;

136B/144B/152B/160B/168B/176B/184B/192B都使用kmalloc-192。

size_index[0]=6 /*8*/
size_index[1]=6 /*16*/
size_index[2]=6 /*24*/
size_index[3]=6 /*32*/
size_index[4]=6 /*40*/
size_index[5]=6 /*48*/
size_index[6]=6 /*56*/
size_index[7]=6 /*64*/
size_index[8]=7 /*72*/
size_index[9]=7 /*80*/
size_index[10]=7 /*88*/
size_index[11]=7 /*96*/
size_index[12]=7 /*104*/
size_index[13]=7 /*112*/
size_index[14]=7 /*120*/
size_index[15]=7 /*128*/
size_index[16]=2 /*136*/
size_index[17]=2 /*144*/
size_index[18]=2 /*152*/
size_index[19]=2 /*160*/
size_index[20]=2 /*168*/
size_index[21]=2 /*176*/
size_index[22]=2 /*184*/
size_index[23]=2 /*192*/

看看/proc/slabinfo中的最终结果如何?

kmalloc-                           : tunables             : slabdata
kmalloc- : tunables : slabdata
kmalloc- : tunables : slabdata
kmalloc- : tunables : slabdata
kmalloc- : tunables : slabdata
kmalloc- : tunables : slabdata
kmalloc- : tunables : slabdata
kmalloc- : tunables : slabdata
kmalloc- : tunables : slabdata

5.2 kmalloc

kmalloc是按字节分配内存的接口,针对不同大小采取了不同的操作。

KMALLOC_MAX_CACHE_SIZE是一个分界线,大于8KB的内存分配需要kmalloc_large进行处理。

另外对于小于等于192B,通过size_index映射到不同kmalloc-xxx slab描述符。

大于192B小于KMALLOC_MAX_CACHE_SIZE,通过fls找到对应的kmalloc_caches索引号。

static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
if (__builtin_constant_p(size)) {
if (size > KMALLOC_MAX_CACHE_SIZE)---------------------------------大于8KB内存使用kmalloc_large来分配
return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
if (!(flags & GFP_DMA)) {
int index = kmalloc_index(size);-------------------------------找到slab描述符 if (!index)
return ZERO_SIZE_PTR; return kmem_cache_alloc_trace(kmalloc_caches[index],-----------调用slab_alloc分配缓存
flags, size);
}
#endif
}
return __kmalloc(size, flags);-----------------------------------------另一种情况分支
} 不同分配器分支,这里取slub:
void *__kmalloc(size_t size, gfp_t flags)
{
struct kmem_cache *s;
void *ret; if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))--------------------------再一次检查8KB这个大小,kmalloc_large分配8KB+缓存
return kmalloc_large(size, flags); s = kmalloc_slab(size, flags);----------------------------------------从预分配slab描述符中找到struct kmem_cache。 if (unlikely(ZERO_OR_NULL_PTR(s)))
return s; ret = slab_alloc(s, flags, _RET_IP_);---------------------------------调用slab_alloc分配缓存 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); kasan_kmalloc(s, ret, size); return ret;
} /*
* Find the kmem_cache structure that serves a given size of
* allocation
*/
struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
int index; if (unlikely(size > KMALLOC_MAX_SIZE)) {
WARN_ON_ONCE(!(flags & __GFP_NOWARN));
return NULL;
} if (size <= ) {
if (!size)
return ZERO_SIZE_PTR; index = size_index[size_index_elem(size)];------------------小于等于192B大小通过size_index得出slab描述符索引
} else
index = fls(size - );--------------------------------------fls根据大小计算most-significant位索引,范围从192B~8KB。 #ifdef CONFIG_ZONE_DMA
if (unlikely((flags & GFP_DMA)))
return kmalloc_dma_caches[index]; #endif
return kmalloc_caches[index];
}

为了提高分配缓存的速度,缩短函数调用路径,关键函数进行了__always_inline修饰。

kmem_cache_alloc
slab_alloc-->
slab_alloc_node-->
static __always_inline void *slab_alloc(struct kmem_cache *s,
gfp_t gfpflags, unsigned long addr)
{
return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
} static __always_inline void *slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr)

6. 创建slab描述符实验

7. slab分配器相关调试接口

7.1 解读/proc/slabinfo

/proc/slabinfo是slab分配器的统计信息,打开CONFIG_DEBUG_SLAB可以获取更多信息。

slabinfo - version: 2.1 (statistics)
# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail> : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow> : cpustat <allochit> <allocmiss> <freehit> <freemiss>...
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmalloc- : tunables : slabdata : globalstat : cpustat
kmem_cache : tunables : slabdata : globalstat : cpustat

8. kmem相关Tracepoint

kmem跟踪事件主要跟踪内核slab和page的分配和释放行为,主要可以分为5大部分。

这些events的详细解释参考:Documentation/trace/events-kmem.txt。

8.1 Slab allocation of small objects of unknown type (kmalloc)

哪些函数调用?Trace什么样子?有什么用途?

kfree---------------------------kfree
kmalloc-------------------------kmalloc/__kmalloc等类kmalloc函数
kmalloc_node--------------------kmalloc_node/__kmalloc_node等类kmalloc_node函数

kmalloc_node和kmalloc的区别是多了个node参数,对NUMA系统来说需要node进行区分。在非NUMA系统,意义不大。

相关Log如下,从中可以看出调用者call_site,分配内存地址ptr,请求分配大小bytes_req,实际分配大小bytes_alloc,分配掩码gfp_flags。

bytes_alloc>=bytes_req,并且进行了2^order对齐;但是call_site是个地址,可读性较差。

# tracer: nop
#
# entries-in-buffer/entries-written: / #P:
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
sh- [] .... 843.814154: kmalloc: call_site=c012fbec ptr=ee042600 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL|GFP_ZERO
sh- [] .... 843.815146: kmalloc: call_site=c0175e4c ptr=eeab2580 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
sh- [] .... 843.815185: kmalloc: call_site=c0174818 ptr=ee042a00 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
sh- [] .... 843.816017: kfree: call_site=c0176744 ptr= (null)
sh- [] .... 843.816029: kfree: call_site=c017674c ptr=ee042a00
sh- [] .... 843.816129: kfree: call_site=c0175eb4 ptr=eeab2580
sh- [] .... 843.816143: kfree: call_site=c012ebdc ptr=ee042600
sh- [] .... 843.816149: kfree: call_site=c01300e0 ptr= (null)
sh- [] .... 843.816776: kmalloc: call_site=c0184928 ptr=ee9994c0 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
sh- [] .... 843.816868: kfree: call_site=c014ff80 ptr=ee9994c0
...

对call_site进行一下简单的改造,使其可以直接打印字符串:

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
old mode
new mode
index 4ad10ba..5c404bb
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -, +, @@ DECLARE_EVENT_CLASS(kmem_alloc,
__entry->gfp_flags = gfp_flags;
), - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
+ TP_printk("call_site=%pf ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
__entry->call_site,
__entry->ptr,
__entry->bytes_req,
@@ -, +, @@ DECLARE_EVENT_CLASS(kmem_alloc_node,
__entry->node = node;
),
- TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
+ TP_printk("call_site=%pf ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
__entry->call_site,
__entry->ptr,
__entry->bytes_req,
@@ -, +, @@ DECLARE_EVENT_CLASS(kmem_free,
__entry->ptr = ptr;
),
- TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+ TP_printk("call_site=%pf ptr=%p", __entry->call_site, __entry->ptr)
);

DEFINE_EVENT(kmem_free, kfree,

修改后的结果如下,ptr是kmalloc和kfree的联系枢纽,两者必须成对,不然就可能存在内存泄露。

同时可以看到同一个ptr的kmalloc和kfree的call_site,对此内存的申请释放路径就有个大概的了解。

# tracer: nop
#
# entries-in-buffer/entries-written: / #P:
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
sh- [] .... 97.451247: kmalloc: call_site=tracepoint_probe_register ptr=ee3ef400 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
sh- [] .... 102.511304: kmalloc: call_site=do_execveat_common ptr=ee0dd400 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL|GFP_ZERO
sh- [] .... 102.513041: kmalloc: call_site=load_elf_binary ptr=eead9880 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
sh- [] .... 102.513149: kmalloc: call_site=load_elf_phdrs ptr=ee0dd000 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
sh- [] .... 102.513831: kfree: call_site=load_elf_binary ptr= (null)
sh- [] .... 102.513878: kfree: call_site=load_elf_binary ptr=ee0dd000
sh- [] .... 102.513981: kfree: call_site=load_elf_binary ptr=eead9880
sh- [] .... 102.513996: kfree: call_site=free_bprm ptr=ee0dd400
sh- [] .... 102.514002: kfree: call_site=do_execveat_common ptr= (null)
sh- [] .... 102.514629: kmalloc: call_site=proc_self_follow_link ptr=ed4aaf80 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
sh- [] .... 102.514721: kfree: call_site=kfree_put_link ptr=ed4aaf80
...

所以基于kmalloc/kmalloc_node/kfree这几个events,可以判断一个进程分配了多少内存;在运行过程中是否存在内存泄露,即kmalloc没有对应的kfree。

8.2 Slab allocation of small objects of known type

kmem_cache_alloc/kmem_cache_alloc_node/kmem_cache_free基本上和类kmalloc函数一一对应,两者的使用和表达的含义基本一致。只是对应的分配函数不一样。

kmem_cache_alloc------------------------kmem_cache_alloc
kmem_cache_alloc_node-------------------kmem_cache_alloc_node
kmem_cache_free-------------------------kmem_cache_free

kmem_cache_alloc类事件的用途和kmalloc类基本差不多,可以通过call_site找到调用者;可以通过kmem_cache_alloc和kmem_cache_free是否成对出现而判断内存泄露问题。

实例如下:

# tracer: nop
#
# entries-in-buffer/entries-written: / #P:
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
sh- [] .... 598.446568: kmem_cache_alloc: call_site=getname_flags ptr=ed4af000 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
sh- [] .... 598.446611: kmem_cache_alloc: call_site=get_empty_filp ptr=eeb17ac0 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL|GFP_ZERO
sh- [] .... 598.446673: kmem_cache_alloc: call_site=__d_alloc ptr=ee445660 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
sh- [] .... 598.446751: kmem_cache_free: call_site=putname ptr=ed4af000
sh- [] .... 598.446808: kmem_cache_alloc: call_site=SyS_getcwd ptr=ed4af000 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
sh- [] .... 598.446839: kmem_cache_free: call_site=SyS_getcwd ptr=ed4af000
sh- [] .... 601.702831: kmem_cache_alloc: call_site=getname_flags ptr=ed4af000 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
sh- [] .... 601.702884: kmem_cache_alloc: call_site=get_empty_filp ptr=eeb17ac0 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL|GFP_ZERO
sh- [] .... 601.703028: kmem_cache_free: call_site=putname ptr=ed4af000
sh- [] .... 601.703560: kmem_cache_alloc: call_site=copy_process.part. ptr=ee9af080 bytes_req= bytes_alloc= gfp_flags=GFP_KERNEL
...

8.3 Page allocation

mm_page_alloc
mm_page_alloc_zone_locked
mm_page_free
mm_page_free_batched

8.4 Per-CPU Allocator Activity

mm_page_alloc_zone_locked
mm_page_pcpu_drain

8.5 External Fragmentation

mm_page_alloc_extfrag
04-20 11:29