引言

本文重点讲解samples/bpf/sockex1_kern.c和samples/bpf/sockex1_user.c这两个文件,并剖析了它们调用的其他外部函数。

  • sockex1_kern.c包含eBPF数据结构和eBPF程序的定义,sockex1_kern.c会被编译为sockex1_kern.o,这是一个ELF格式的文件。
  • sockex1_user.c中会实现加载器和用户空间逻辑,其中加载器会解析sockex1_kern.o文件,创建map并将其中的eBPF代码挂载到对应的hook点上。

分析samples/bpf/sockex1_kern.c

在不同section上定义相应结构/程序

  • sockex1_kern.c是关于eBPF map和eBPF程序的定义,SEC对应的宏定义是:

    #define SEC(NAME) __attribute__((section(NAME), used))
    

    被SEC(NAME)修饰的定义会被放到elf文件名为"NAME"的section中,有了"used"字段,即便这个定义未被使用过,也不会被编译器移除。

  • 执行readelf -e sockex1_kern.o命令,可以看到我们这个程序自定义的3个section字段,分别是"socket1"、"maps"和"license"。

    [root@localhost bpf]# readelf -e sockex1_kern.o -W
    ELF Header:
      Magic:   7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00 
      Class:                             ELF64
      Data:                              2's complement, little endian
      Version:                           1 (current)
      OS/ABI:                            UNIX - System V
      ABI Version:                       0
      Type:                              REL (Relocatable file)
      Machine:                           <unknown>: 0xf7
      Version:                           0x1
      Entry point address:               0x0
      Start of program headers:          0 (bytes into file)
      Start of section headers:          528 (bytes into file)
      Flags:                             0x0
      Size of this header:               64 (bytes)
      Size of program headers:           0 (bytes)
      Number of program headers:         0
      Size of section headers:           64 (bytes)
      Number of section headers:         10
      Section header string table index: 1
    
    Section Headers:
      [Nr] Name              Type            Address          Off    Size   ES Flg Lk Inf Al
      [ 0]                   NULL            0000000000000000 000000 000000 00      0   0  0
      [ 1] .strtab           STRTAB          0000000000000000 0001b8 000057 00      0   0  1
      [ 2] .text             PROGBITS        0000000000000000 000040 000000 00  AX  0   0  4
      [ 3] socket1           PROGBITS        0000000000000000 000040 000078 00  AX  0   0  8
      [ 4] .relsocket1       REL             0000000000000000 000198 000010 10      9   3  8
      [ 5] maps              PROGBITS        0000000000000000 0000b8 00001c 00  WA  0   0  4
      [ 6] license           PROGBITS        0000000000000000 0000d4 000004 00  WA  0   0  1
      [ 7] .eh_frame         PROGBITS        0000000000000000 0000d8 000030 00   A  0   0  8
      [ 8] .rel.eh_frame     REL             0000000000000000 0001a8 000010 10      9   7  8
      [ 9] .symtab           SYMTAB          0000000000000000 000108 000090 18      1   3  8
    Key to Flags:
      W (write), A (alloc), X (execute), M (merge), S (strings), I (info),
      L (link order), O (extra OS processing required), G (group), T (TLS),
      C (compressed), x (unknown), o (OS specific), E (exclude),
      p (processor specific)
    
    There are no program headers in this file.
    
  • 其中:

    • .strtab中会存放各个section的名字以及符号表中entry的名字,可以通过readelf -p .strtab sockex1_kern.o -W命令查看
    • .text是代码,这个例子中为空
    • .rel开头的包含一些重定位信息,是给链接器用的
    • .symtab是符号表,存放自定义的一些符号,比如map名
    • .eh_frame是调试信息段

sockex1_kern.c源码

#include <uapi/linux/bpf.h>
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/ip.h>
#include "bpf_helpers.h"

/*
 * Array map placed in the "maps" ELF section: 256 entries, u32 keys and
 * long values. bpf_prog1 uses the IP protocol byte of a packet as the key
 * and adds the packet length to the matching value.
 */
struct bpf_map_def SEC("maps") my_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(long),
	.max_entries = 256,
};

SEC("socket1")
int bpf_prog1(struct __sk_buff *skb)
{
	/*
	 * Read the protocol byte of the IP header that follows the Ethernet
	 * header; it is used as the key into my_map.
	 */
	int proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
	long *bytes;

	/* Only outgoing packets are accounted; everything else is ignored. */
	if (skb->pkt_type == PACKET_OUTGOING) {
		bytes = bpf_map_lookup_elem(&my_map, &proto);
		if (bytes)
			/* Atomically add this packet's length to the counter. */
			__sync_fetch_and_add(bytes, skb->len);
	}

	return 0;
}
/* License string stored in the "license" ELF section; the loader copies it and hands it to bpf_load_program(). */
char _license[] SEC("license") = "GPL";

分析samples/bpf/sockex1_user.c

读取ELF文件信息

  • 可以看到,sockex1_user.c一上来就是去解析sockex1_kern.o文件,使用了load_bpf_file(filename)函数,这个函数会去调用do_load_bpf_file(),这个函数会解析elf文件并进行创建、加载工作。

    /*
     * Skeleton of do_load_bpf_file(): the local declarations and the common
     * exit path. The elided middle part is where the ELF file at `path` is
     * processed. Control that reaches the `done` label closes fd and
     * returns ret.
     */
    static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
    {
    	int fd, i, ret, maps_shndx = -1, strtabidx = -1;
    	Elf *elf;
    	GElf_Ehdr ehdr;
    	GElf_Shdr shdr, shdr_prog;
    	Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
    	char *shname, *shname_prog;
    	int nr_maps = 0;
    	//... ...
    done:
    	close(fd);
    	return ret;
    }
    
  • 这其中涉及了几个结构体定义:

    • struct Elf:对于ELF文件的整体描述
    • struct GElf_Ehdr:ELF header,记录了ELF的元数据信息,在每个ELF文件最前面定义,包含magic number(标识文件类型)、section的数量等
    • struct GElf_Shdr:ELF section的header,记录section的元数据信息,包含section名称(为整型,对应.strtab中的索引)、类型等
    • struct Elf_Data:具体section中的数据
    {
        //... ...
    /*
     * Excerpt of do_load_bpf_file(): initialisation.
     * Clears the loader globals, opens the object file, creates the libelf
     * handle, reads the ELF header and empties the kprobe_events file.
     * NOTE(review): the `return 1` paths after open() leave fd open because
     * they do not go through the `done` label - confirm whether intended.
     */
    /* reset global variables */
    	kern_version = 0;
    	memset(license, 0, sizeof(license));
    	memset(processed_sec, 0, sizeof(processed_sec));
    
    	/* libelf requires the version handshake before any other call */
    	if (elf_version(EV_CURRENT) == EV_NONE)
    		return 1;
    
    	fd = open(path, O_RDONLY, 0);
    	if (fd < 0)
    		return 1;
    
    	elf = elf_begin(fd, ELF_C_READ, NULL);
    
    	if (!elf)
    		return 1;
    
    	/* copy the ELF header into ehdr; e_shnum and e_shstrndx are used later */
    	if (gelf_getehdr(elf, &ehdr) != &ehdr)
    		return 1;
    
    	/* clear all kprobes */
    	i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
    	//... ...
    }
    
  • 接下来是一些初始化的工作,读取ELF文件,并从中提取了ELF header信息,然后清除掉了"/sys/kernel/debug/tracing/kprobe_events"下的kprobes探测点

    {
        //... ...
    	/*
    	 * Excerpt of do_load_bpf_file(): first pass over the sections.
    	 * Records the license string, the kernel version, the location of
    	 * the "maps" section and the symbol table for the later passes.
    	 */
    	/* scan over all elf sections to get license and map info */
    	for (i = 1; i < ehdr.e_shnum; i++) { // ehdr.e_shnum is the number of sections
    
            
    		/* sections that get_sec() cannot read are skipped */
    		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
    			continue;
    
    		if (1) /* helpful for llvm debugging */
    			printf("section %d:%s data %p size %zd link %d flags %d\n",
    			       i, shname, data->d_buf, data->d_size,
    			       shdr.sh_link, (int) shdr.sh_flags);
    
    		if (strcmp(shname, "license") == 0) {
    			processed_sec[i] = true;
    			/*
    			 * NOTE(review): copies d_size bytes without comparing
    			 * against sizeof(license) - confirm the buffer is
    			 * large enough for any input file.
    			 */
    			memcpy(license, data->d_buf, data->d_size);
    		} else if (strcmp(shname, "version") == 0) {
    			processed_sec[i] = true;
    			if (data->d_size != sizeof(int)) {
    				printf("invalid size of version section %zd\n",
    				       data->d_size);
    				return 1;
    			}
    			memcpy(&kern_version, data->d_buf, sizeof(int));
    		} else if (strcmp(shname, "maps") == 0) {
    			int j;
    
    			/* remember where the maps live and mark every slot as "no fd yet" */
    			maps_shndx = i;
    			data_maps = data;
    			for (j = 0; j < MAX_MAPS; j++)
    				map_data[j].fd = -1;
    		} else if (shdr.sh_type == SHT_SYMTAB) {
    			/* sh_link of the symbol table is the index of its string table */
    			strtabidx = shdr.sh_link;
    			symbols = data;
    		}
    	}
        //... ...
    }
    
  • 接下来的工作就是扫描这个ELF文件,获取每个section的具体信息。get_sec(elf, i, &ehdr, &shname, &shdr, &data)获取具体section的信息,包括section名字、header和数据,具体细节见注释:

    /*
     * Fetch name, header and data of ELF section number i.
     *
     * On success returns 0 and fills *shname, *shdr and *data.
     * Non-zero return values identify the step that failed:
     *   1 - the section could not be obtained
     *   2 - the section header could not be read
     *   3 - the section has no name or is empty
     *   4 - the section has no data, or more than one data descriptor
     */
    static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
    		   GElf_Shdr *shdr, Elf_Data **data)
    {
    	Elf_Scn *section = elf_getscn(elf, i);
    	Elf_Data *first;

    	if (section == NULL)
    		return 1;

    	if (gelf_getshdr(section, shdr) != shdr)
    		return 2;

    	/*
    	 * shdr->sh_name is an offset into the section-name string table,
    	 * whose section index is ehdr->e_shstrndx; elf_strptr() turns the
    	 * (section index, offset) pair into a pointer to the name.
    	 */
    	*shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
    	if (*shname == NULL || shdr->sh_size == 0)
    		return 3;

    	/* Accept the section only if it carries exactly one data descriptor. */
    	first = elf_getdata(section, 0);
    	*data = first;
    	if (first == NULL || elf_getdata(section, first) != NULL)
    		return 4;

    	return 0;
    }
    
  • 获取到以上每一个section的数据,对license、version、maps以及.symtab这几个section做了处理。其中,我们可以看到,在处理maps这个section时,初始化了该section的index值以及section的data,并将map数组的fd都初始化为-1。

加载map定义信息

接下来,开始具体处理maps的内容,这里的核心函数是load_elf_maps_section(map_data, maps_shndx, elf, symbols, strtabidx),首先,明确一下传入的函数参数的含义:

  • map_data:用来保存解析结果的struct bpf_map_data数组(全局变量),函数会把从maps section中读到的每个map的名字、section内偏移和定义填入其中(maps section本身的data部分是在函数内部通过maps_shndx重新获取的)
  • maps_shndx:maps section在整个ELF sections中的索引值
  • elf:elf文件的指针
  • symbols:.symtab section中的data部分(符号表数据)
  • strtabidx:.strtab section在整个ELF sections中的索引值(细心的话可以发现,.symtab section header的link字段是指向.strtab section的索引值的,在这个例子中也是通过这个link字段获取的)
{
    //... ...
	/*
	 * Excerpt of do_load_bpf_file(): map handling.
	 * Runs only when the scan found a "maps" section. The definitions are
	 * first read into map_data, then load_maps() creates the maps.
	 */
	if (data_maps) {
		nr_maps = load_elf_maps_section(map_data, maps_shndx,
						elf, symbols, strtabidx);
		/* a negative result is a negated errno value */
		if (nr_maps < 0) {
			printf("Error: Failed loading ELF maps (errno:%d):%s\n",
			       nr_maps, strerror(-nr_maps));
			goto done;
		}
		if (load_maps(map_data, nr_maps, fixup_map))
			goto done;
		map_data_count = nr_maps;

		/* keep the later passes from visiting the maps section again */
		processed_sec[maps_shndx] = true;
	}
    //... ...
}

load_elf_maps_section中,我们遍历了这个symbols(符号表),用readelf命令可以查看这个符号表,符号表的Ndx列表示这个符号所在的section的索引值,Name是该符号的名字:

[root@localhost bpf]# readelf -s sockex1_kern.o -W
Symbol table '.symtab' contains 6 entries:
   Num:    Value          Size Type    Bind   Vis      Ndx Name
     0: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT  UND 
     1: 0000000000000068     0 NOTYPE  LOCAL  DEFAULT    3 LBB0_3
     2: 0000000000000000     0 SECTION LOCAL  DEFAULT    3 
     3: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT    6 _license
     4: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT    3 bpf_prog1
     5: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT    5 my_map

结合我们得到的符号表,不难理解这段代码的含义:即遍历符号表,将所有创建的map找到(这个例子中只有一个my_map)。每一次将找到的这个map的symbol entry加入到sym数组中,并对nr_maps计数值加1。

接着,我们调用qsort对这个sym数组按照sym entry的st_value值进行升序排序,这里应该就是为了保证在你创建多个map的时候,那个fd的大小顺序就是和你创建map的顺序是一致的(从小到大)。在这个例子里面只有一个map,如果有多个map,你就会发现,你代码里面定义的顺序和这个符号表出现的先后顺序可能不一致,但是和st_value的大小顺序是一致的。举个例子,我们查看sysdig的map定义,并查看其编译产物的elf:

[root@localhost bpf]# readelf -s probe.o -W | grep map
 11718: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT   31 bpf_sys_brk_munmap_mmap_x
 11739: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT   33 bpf_sys_mmap_e
 11937: 00000000000000a8     0 NOTYPE  GLOBAL DEFAULT  227 frame_scratch_map
 11939: 00000000000000fc     0 NOTYPE  GLOBAL DEFAULT  227 local_state_map
 11940: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT  227 perf_map
 11941: 0000000000000150     0 NOTYPE  GLOBAL DEFAULT  227 pgft_major_map
 11943: 0000000000000118     0 NOTYPE  GLOBAL DEFAULT  227 rtt_static_map
 11944: 00000000000000e0     0 NOTYPE  GLOBAL DEFAULT  227 settings_map
 11945: 000000000000016c     0 NOTYPE  GLOBAL DEFAULT  227 stash_map
 11946: 0000000000000134     0 NOTYPE  GLOBAL DEFAULT  227 stash_tuple_map
 11949: 000000000000001c     0 NOTYPE  GLOBAL DEFAULT  227 tail_map
 11950: 00000000000000c4     0 NOTYPE  GLOBAL DEFAULT  227 tmp_scratch_map

可以看出来,这个st_value(第二列的值)的大小顺序是乱的,因此必须要排序,才能保证和我们代码定义的顺序一致。

接下来,我们要从maps section中读取map数据,这里首先有一个兼容问题。这里我们假设每一个map定义所占用的空间是相同的,直接用data_maps->d_size / nr_maps作为每一个map定义所占用的空间。这里有两种情况:

  • elf中的定义所占空间比struct bpf_load_map_def所占空间小,这时候就读取elf中那部分长度就可以
  • elf中的定义所占空间比struct bpf_load_map_def所占空间大,这时候读取struct bpf_load_map_def所需字节,然后判断剩下没读的elf文件中有没有有效值,没有即忽略,否则返回提示信息,说明有比struct bpf_load_map_def更多的有效信息。
/*
 * Collect every map definition stored in the ELF "maps" section.
 *
 * @maps:       output array; entry i receives the name, the offset inside
 *              the maps section and a copy of the definition of map i
 * @maps_shndx: index of the "maps" section among the ELF sections
 * @elf:        handle of the ELF object being loaded
 * @symbols:    data of the symbol table section
 * @strtabidx:  index of the string table section holding the symbol names
 *
 * Returns the number of maps found (at most MAX_MAPS), or a negative errno
 * value: -EINVAL for missing/inconsistent input, -ENOMEM when memory runs
 * out, -E2BIG when the file declares more than MAX_MAPS maps and -EFBIG
 * when a definition uses fields this loader does not know about.
 */
static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
				 Elf *elf, Elf_Data *symbols, int strtabidx)
{
	int map_sz_elf, map_sz_copy;
	bool validate_zero = false;
	Elf_Data *data_maps = NULL;
	size_t nr_syms, s;
	int i, nr_maps = 0;
	GElf_Sym *sym;
	Elf_Scn *scn;

	if (maps_shndx < 0)
		return -EINVAL;
	if (!symbols)
		return -EINVAL;

	/* Get data for maps section via elf index */
	scn = elf_getscn(elf, maps_shndx);
	if (scn)
		data_maps = elf_getdata(scn, NULL);
	if (!scn || !data_maps) {
		printf("Failed to get Elf_Data from maps section %d\n",
		       maps_shndx);
		return -EINVAL;
	}

	/* For each map get the corresponding symbol table entry */
	sym = calloc(MAX_MAPS, sizeof(GElf_Sym));
	if (!sym)
		return -ENOMEM;

	/*
	 * Walk the symbol table and keep the entries that live in the maps
	 * section. Each entry is read into a local first so that the array
	 * is only written while there is room left.
	 */
	nr_syms = symbols->d_size / sizeof(GElf_Sym);
	for (s = 0; s < nr_syms; s++) {
		GElf_Sym cur;

		if (!gelf_getsym(symbols, (int)s, &cur))
			continue;
		if (cur.st_shndx != maps_shndx)
			continue;
		if (nr_maps >= MAX_MAPS) {
			printf("Too many maps in ELF maps section (max %d)\n",
			       MAX_MAPS);
			free(sym);
			return -E2BIG;
		}
		sym[nr_maps++] = cur;
	}

	/* Without any map symbol the per-map size below cannot be derived. */
	if (nr_maps == 0) {
		printf("No map symbols found for maps section %d\n",
		       maps_shndx);
		free(sym);
		return -EINVAL;
	}

	/*
	 * Align to map_fd[] order, via sort on offset in sym.st_value, so
	 * that map i is the i-th definition inside the maps section.
	 */
	qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);

	/* Keeping compatible with ELF maps section changes
	 * ------------------------------------------------
	 * The program size of struct bpf_load_map_def is known by loader
	 * code, but struct stored in ELF file can be different.
	 *
	 * Unfortunately sym[i].st_size is zero.  To calculate the
	 * struct size stored in the ELF file, assume all struct have
	 * the same size, and simply divide with number of map
	 * symbols.
	 */
	map_sz_elf = data_maps->d_size / nr_maps;
	map_sz_copy = sizeof(struct bpf_load_map_def);
	if (map_sz_elf < map_sz_copy) {
		/*
		 * Backward compat, loading older ELF file with
		 * smaller struct, keeping remaining bytes zero.
		 */
		map_sz_copy = map_sz_elf;
	} else if (map_sz_elf > map_sz_copy) {
		/*
		 * Forward compat, loading newer ELF file with larger
		 * struct with unknown features. Assume zero means
		 * feature not used.  Thus, validate rest of struct
		 * data is zero.
		 */
		validate_zero = true;
	}

	/* Memcpy relevant part of ELF maps data to loader maps */
	for (i = 0; i < nr_maps; i++) {
		struct bpf_load_map_def *def;
		unsigned char *addr, *end;
		const char *map_name;
		size_t offset;

		/* The map name is the symbol name, looked up via st_name. */
		map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
		if (!map_name) {
			printf("Failed to get name of map %d from string table %d\n",
			       i, strtabidx);
			free(sym);
			return -EINVAL;
		}

		/* Symbol value is offset into ELF maps section data area */
		offset = sym[i].st_value;
		if (offset > data_maps->d_size ||
		    data_maps->d_size - offset < (size_t)map_sz_elf) {
			printf("map '%s' lies outside maps section %d\n",
			       map_name, maps_shndx);
			free(sym);
			return -EINVAL;
		}

		maps[i].name = strdup(map_name);
		if (!maps[i].name) {
			/* Save errno: printf() is allowed to overwrite it. */
			int err = errno;

			printf("strdup(%s): %s(%d)\n", map_name,
			       strerror(err), err);
			free(sym);
			return err ? -err : -ENOMEM;
		}

		def = (struct bpf_load_map_def *)
			((unsigned char *)data_maps->d_buf + offset);
		maps[i].elf_offset = offset;
		memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def));
		memcpy(&maps[i].def, def, map_sz_copy);

		/*
		 * Verify no newer features were requested: when the struct in
		 * the file is larger than ours, every byte that was not
		 * copied must be zero, otherwise give up with -EFBIG.
		 */
		if (validate_zero) {
			addr = (unsigned char *) def + map_sz_copy;
			end  = (unsigned char *) def + map_sz_elf;
			for (; addr < end; addr++) {
				if (*addr != 0) {
					free(sym);
					return -EFBIG;
				}
			}
		}
	}

	free(sym);
	return nr_maps;
}
  • 总之,load_elf_maps_section函数从maps section中获取了所有map的定义信息,并存储到了maps数组中,然后返回map的数量。

创建map

接下来我们关注load_maps(map_data, nr_maps, fixup_map)函数,这个函数负责利用刚才获得的maps数组信息,真正创建map:这个函数逻辑很简单,fixup_map字段允许用户自定义map的fd,这里不过多阐述。我们往下可以看到,BPF Map分为两大类:

  • BPF_MAP_TYPE_ARRAY_OF_MAPS或BPF_MAP_TYPE_HASH_OF_MAPS:这是一种map-in-map类型,我们调用bpf_create_map_in_map_node进行创建
  • 其他普通类型,我们调用bpf_create_map_node进行创建

这两个创建函数最终都会调用bpf()系统调用去创建map,这里没有什么特别的地方,就不贴源码了。创建的map,会返回fd,这个fd会存入map_fd[]这个全局数组变量中,以供用户空间对Map进行操作。如果是BPF_MAP_TYPE_PROG_ARRAY,还会单独赋值给prog_array_fd这个变量。这里有一个细节,用户空间可以通过fd获取这个map,那么内核空间呢?答案是内核空间可以直接用该map定义时的名字去访问,因为这个map的名字在创建时也是该map的一部分,也就是说该map既可以通过name标识也可以通过fd来标识(不过这个map名字会在后面的阶段被替换为真正的map指针,见下文)。

/*
 * Create the BPF maps described by maps[0..nr_maps-1].
 *
 * @maps:      map descriptions filled in from the ELF file
 * @nr_maps:   number of valid entries in @maps
 * @fixup_map: optional callback invoked on every entry before creation; if
 *             it leaves a file descriptor in maps[i].fd, that descriptor is
 *             used and the map is not created here
 *
 * Each resulting descriptor is stored in map_fd[i] and maps[i].fd; the
 * descriptor of a BPF_MAP_TYPE_PROG_ARRAY map is also kept in
 * prog_array_fd. Returns 0 on success and 1 on failure.
 */
static int load_maps(struct bpf_map_data *maps, int nr_maps,
		     fixup_map_cb fixup_map)
{
	int i, numa_node;

	for (i = 0; i < nr_maps; i++) {
		if (fixup_map) {
			fixup_map(&maps[i], i);
			/* Allow userspace to assign map FD prior to creation */
			if (maps[i].fd != -1) {
				map_fd[i] = maps[i].fd;
				continue;
			}
		}

		numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ?
			maps[i].def.numa_node : -1;

		if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
		    maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
			unsigned int inner_idx = maps[i].def.inner_map_idx;
			int inner_map_fd;

			/*
			 * The index comes straight from the object file, so
			 * make sure it names one of the maps of this file
			 * before using it to index map_fd[].
			 */
			if (inner_idx >= (unsigned int)nr_maps) {
				printf("invalid inner map index %u for map '%s'\n",
				       inner_idx, maps[i].name);
				return 1;
			}
			inner_map_fd = map_fd[inner_idx];

			map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type,
							maps[i].name,
							maps[i].def.key_size,
							inner_map_fd,
							maps[i].def.max_entries,
							maps[i].def.map_flags,
							numa_node);
		} else {
			map_fd[i] = bpf_create_map_node(maps[i].def.type,
							maps[i].name,
							maps[i].def.key_size,
							maps[i].def.value_size,
							maps[i].def.max_entries,
							maps[i].def.map_flags,
							numa_node);
		}
		if (map_fd[i] < 0) {
			printf("failed to create a map: %d %s\n",
			       errno, strerror(errno));
			return 1;
		}
		maps[i].fd = map_fd[i];

		if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY)
			prog_array_fd = map_fd[i];
	}
	return 0;
}

行文至此,我们把创建map这一段讲解完了。

解析可重定位section并为map重写bpf指令

接下来do_load_bpf_file将开始解析可重定位section部分:在此之前,我们先看一下socket1这个section的内容:一条bpf字节码指令占用8个字节,这个section的大小是0x78(120)字节,也就是15个8字节的指令槽位(其中加载64位立即数的指令占用两个槽位,所以实际是14条bpf指令)。

[root@localhost bpf]# readelf -x socket1 sockex1_kern.o -W
Hex dump of section 'socket1':
 NOTE: This section has relocations against it, but these have NOT been applied to this dump.
  0x00000000 bf160000 00000000 30000000 17000000 ........0.......
  0x00000010 630afcff 00000000 61610400 00000000 c.......aa......
  0x00000020 55010800 04000000 bfa20000 00000000 U...............
  0x00000030 07020000 fcffffff 18010000 00000000 ................
  0x00000040 00000000 00000000 85000000 01000000 ................
  0x00000050 15000200 00000000 61610000 00000000 ........aa......
  0x00000060 db100000 00000000 b7000000 00000000 ................
  0x00000070 95000000 00000000                   ........

利用llvm-objdump工具先看一下"socket1"这个section中的BPF指令:

[root@localhost bpf]# llvm-objdump -disassemble-all sockex1_kern.o
sockex1_kern.o:	file format ELF64-BPF
#... ...省略一些
Disassembly of section socket1:
bpf_prog1:
       0:	bf 16 00 00 00 00 00 00 	r6 = r1
       1:	30 00 00 00 17 00 00 00 	r0 = *(u8 *)skb[23]
       2:	63 0a fc ff 00 00 00 00 	*(u32 *)(r10 - 4) = r0
       3:	61 61 04 00 00 00 00 00 	r1 = *(u32 *)(r6 + 4)
       4:	55 01 08 00 04 00 00 00 	if r1 != 4 goto +8 <LBB0_3>
       5:	bf a2 00 00 00 00 00 00 	r2 = r10
       6:	07 02 00 00 fc ff ff ff 	r2 += -4
       7:	18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 	r1 = 0 ll
       9:	85 00 00 00 01 00 00 00 	call 1
      10:	15 00 02 00 00 00 00 00 	if r0 == 0 goto +2 <LBB0_3>
      11:	61 61 00 00 00 00 00 00 	r1 = *(u32 *)(r6 + 0)
      12:	db 10 00 00 00 00 00 00 	lock *(u64 *)(r0 + 0) += r1
LBB0_3:
      13:	b7 00 00 00 00 00 00 00 	r0 = 0
      14:	95 00 00 00 00 00 00 00 	exit

回到我们处理重定位section的代码部分,可以看到,这里解析.rel的section实际上就是为了解析bpf字节码指令(过滤掉了其他无关.rel类型的section,比如.eh_frame)

/*
 * Excerpt of do_load_bpf_file(): relocation pass.
 * For every SHT_REL section that targets an executable PROGBITS section,
 * the map references in the targeted instructions are patched through
 * parse_relo_and_apply().
 */
static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
{
	//... ...
	/* process all relo sections, and rewrite bpf insns for maps */
	for (i = 1; i < ehdr.e_shnum; i++) {
		if (processed_sec[i])
			continue;

		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
			continue;

		if (shdr.sh_type == SHT_REL) { // relocation section
			struct bpf_insn *insns;

			/* locate prog sec that need map fixup (relocations) */
			// sh_info of a relocation section is the index of the section it applies to;
			// fetch that section, which holds the eBPF instructions
			if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
				    &shdr_prog, &data_prog))
				continue;

			if (shdr_prog.sh_type != SHT_PROGBITS ||
			    !(shdr_prog.sh_flags & SHF_EXECINSTR)) // skip targets that are not executable code
				continue;

			insns = (struct bpf_insn *) data_prog->d_buf; // start of the instruction array
			processed_sec[i] = true; /* relo section */

			/* the result is not acted upon: either way the loop moves on */
			if (parse_relo_and_apply(data, symbols, &shdr, insns,
						 map_data, nr_maps))
				continue;
		}
	}
	//... ...

done:
	close(fd);
	return ret;
}

在获取bpf字节码程序后,调用了parse_relo_and_apply函数,该函数定义如下:这个函数的主要功能就是从可重定位section提供的信息中(包括在bpf字节码指令的哪个偏移位置重定位以及重定位的符号)去完成map_fd的替换。由于map是在用户态创建的,bpf程序被编译时并不知道fd的存在,因此会在操作map的地方留下一个可重定位的信息,在map被创建后,然后利用这个可重定位信息,先定位到对应指令(利用rel.r_offset),再获取对应的符号表表项(利用rel.r_info的高32位)。然后我们判断这一条指令的操作码是否是BPF_LD | BPF_IMM | BPF_DW,这个操作码表示要将立即数加载到目标寄存器中,此时立即数为0,因为还没有为它填充fd。之后再去判断这个符号表表项是否和已经创建的map可以匹配关联起来,若可以,则将该map的fd赋值给这条指令的立即数字段。这里还有一个细节,在匹配map之前,src_reg被置为1(BPF_PSEUDO_MAP_FD),我们知道当src_reg为0时,表示立即数就在指令内,那么为什么还要多此一举把它置为1呢,这样看起来好像也不对?原因是,这个fd也不是最终BPF指令执行时用来标志map的方式,之后,BPF验证器会使用replace_map_fd_with_map_ptr()函数将fd更改为最终的map指针,然后convert_pseudo_ld_imm64()函数会将BPF_PSEUDO_MAP_FD重置为0,这样表示完成了从map_fd到最终map指针的替换。网上有一个图比较清晰的展示了这个过程:

Linux内核源码sample/bpf全网最细解析(一)-LMLPHP

/*
 * Apply the map relocations of one relocation section.
 *
 * @data:    data of the relocation section
 * @symbols: data of the symbol table section
 * @shdr:    header of the relocation section
 * @insn:    instruction array of the program section being relocated
 * @maps:    maps recorded so far (with their section offsets and fds)
 * @nr_maps: number of valid entries in @maps
 *
 * For every relocation entry the targeted instruction must be a
 * BPF_LD | BPF_IMM | BPF_DW load. Its src_reg is set to BPF_PSEUDO_MAP_FD
 * and its immediate to the fd of the map whose section offset equals the
 * value of the referenced symbol.
 *
 * Returns 0 on success, 1 as soon as an entry cannot be applied.
 */
static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
				GElf_Shdr *shdr, struct bpf_insn *insn,
				struct bpf_map_data *maps, int nr_maps)
{
	int i, nrels;

	/* A zero entry size would make the division below trap. */
	if (shdr->sh_entsize == 0) {
		printf("invalid relo section: zero entry size\n");
		return 1;
	}
	nrels = shdr->sh_size / shdr->sh_entsize;

	for (i = 0; i < nrels; i++) {
		GElf_Sym sym;
		GElf_Rel rel;
		unsigned int insn_idx;
		bool match = false;
		int map_idx;

		if (!gelf_getrel(data, i, &rel)) {
			printf("failed to read relo entry %d\n", i);
			return 1;
		}
		/* r_offset is a byte offset; turn it into an index into insn[] */
		insn_idx = rel.r_offset / sizeof(struct bpf_insn);

		/* GELF_R_SYM extracts the symbol table index from r_info */
		if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) {
			printf("failed to read symbol for relo entry %d\n", i);
			return 1;
		}

		/* Only the 64-bit immediate load can carry a map reference. */
		if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
			printf("invalid relo for insn[%d].code 0x%x\n",
			       insn_idx, insn[insn_idx].code);
			return 1;
		}
		/* Mark the immediate as a map fd rather than a plain constant. */
		insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;

		/* Match FD relocation against recorded map_data[] offset */
		for (map_idx = 0; map_idx < nr_maps; map_idx++) {
			if (maps[map_idx].elf_offset == sym.st_value) {
				match = true;
				break;
			}
		}
		if (match) {
			insn[insn_idx].imm = maps[map_idx].fd;
		} else {
			printf("invalid relo for insn[%d] no map_data match\n",
			       insn_idx);
			return 1;
		}
	}

	return 0;
}

加载eBPF程序

接下来,就是具体调用load_and_attach()。load_and_attach()会去调用bpf_load_program()(libbpf提供的用户态封装函数,并不是内核函数),将eBPF程序加载进内核并挂载到hook点上(包括验证等),最终就是调用bpf()系统调用,这个系统调用包括了验证bpf指令字节码的实现,大致如下所示:

 sys_bpf()
    --> bpf_prog_load()
        --> bpf_check()
            --> replace_map_fd_with_map_ptr()
           --> do_check()
                --> check_ld_imm()
                ==> check_func_arg()
            --> convert_pseudo_ld_imm64()

加载eBPF程序部分的源码实现:

{
	//... ...
	/*
	 * Excerpt of do_load_bpf_file(): program loading pass.
	 * Every section not handled by an earlier pass whose name starts with
	 * one of the prefixes below is passed to load_and_attach(); the first
	 * failure aborts the whole load through the `done` label.
	 */
	/* load programs */
	for (i = 1; i < ehdr.e_shnum; i++) {

		if (processed_sec[i])
			continue;

		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
			continue;

		/* prefix match on the section name selects program sections */
		if (memcmp(shname, "kprobe/", 7) == 0 ||
		    memcmp(shname, "kretprobe/", 10) == 0 ||
		    memcmp(shname, "tracepoint/", 11) == 0 ||
		    memcmp(shname, "raw_tracepoint/", 15) == 0 ||
		    memcmp(shname, "xdp", 3) == 0 ||
		    memcmp(shname, "perf_event", 10) == 0 ||
		    memcmp(shname, "socket", 6) == 0 ||
		    memcmp(shname, "cgroup/", 7) == 0 ||
		    memcmp(shname, "sockops", 7) == 0 ||
		    memcmp(shname, "sk_skb", 6) == 0 ||
		    memcmp(shname, "sk_msg", 6) == 0) {
			ret = load_and_attach(shname, data->d_buf,
					      data->d_size);
			if (ret != 0)
				goto done;
		}
	}
	//... ...
}

load_and_attach()除了实现eBPF程序的加载验证,还有一些其他工作,这里简要分析一下:这里sys_perf_event_open在PMU(Performance Monitoring Unit)上初始化一个硬件性能计数器(PMC: Performance Monitoring Counter)。ioctl开启PMC计数。PMC随着指定硬件事件的发生而自动累加。在PMC 溢出时,PMU 触发一个PMI(Performance Monitoring Interrupt)中断。内核在PMI 中断的处理函数中保存PMC 的计数值,触发中断时的指令地址,当前时间戳以及当前进程的PID,TID,comm 等信息。我们把这些信息统称为一个采样(sample)。内核会将收集到的sample放入用于跟用户空间通信的Ring Buffer。用户空间里的perf分析程序采用mmap机制从ring buffer 中读入采样,并对其解析。

/*
 * Load one eBPF program and, where the program type calls for it, attach
 * it to its event.
 *
 * @event: name of the ELF section holding the program; its prefix selects
 *         the program type and the rest names the event or the slot
 * @prog:  the program's instructions
 * @size:  size of @prog in bytes
 *
 * The program is loaded with bpf_load_program() and its descriptor is
 * appended to prog_fd[]. After that:
 *  - xdp, perf_event, cgroup/skb and cgroup/sock programs are only loaded;
 *  - socket, sockops, sk_skb and sk_msg programs are stored through
 *    populate_prog_array() when the name carries a "/<digit>" suffix;
 *  - raw tracepoints are opened with bpf_raw_tracepoint_open();
 *  - kprobes, kretprobes and tracepoints are attached through a perf event
 *    whose id is read from the tracing debugfs directory.
 *
 * Returns 0 on success and -1 on failure.
 */
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
	bool is_socket = strncmp(event, "socket", 6) == 0;
	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
	bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
	bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0;
	bool is_xdp = strncmp(event, "xdp", 3) == 0;
	bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
	bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
	bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
	bool is_sockops = strncmp(event, "sockops", 7) == 0;
	bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
	bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0;
	size_t insns_cnt = size / sizeof(struct bpf_insn);
	enum bpf_prog_type prog_type;
	char buf[256];
	int fd, efd, err, id, len;
	struct perf_event_attr attr = {};

	attr.type = PERF_TYPE_TRACEPOINT;
	attr.sample_type = PERF_SAMPLE_RAW;
	attr.sample_period = 1;
	attr.wakeup_events = 1;

	if (is_socket) {
		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
	} else if (is_kprobe || is_kretprobe) {
		prog_type = BPF_PROG_TYPE_KPROBE;
	} else if (is_tracepoint) {
		prog_type = BPF_PROG_TYPE_TRACEPOINT;
	} else if (is_raw_tracepoint) {
		prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT;
	} else if (is_xdp) {
		prog_type = BPF_PROG_TYPE_XDP;
	} else if (is_perf_event) {
		prog_type = BPF_PROG_TYPE_PERF_EVENT;
	} else if (is_cgroup_skb) {
		prog_type = BPF_PROG_TYPE_CGROUP_SKB;
	} else if (is_cgroup_sk) {
		prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
	} else if (is_sockops) {
		prog_type = BPF_PROG_TYPE_SOCK_OPS;
	} else if (is_sk_skb) {
		prog_type = BPF_PROG_TYPE_SK_SKB;
	} else if (is_sk_msg) {
		prog_type = BPF_PROG_TYPE_SK_MSG;
	} else {
		printf("Unknown event '%s'\n", event);
		return -1;
	}

	if (prog_cnt == MAX_PROGS)
		return -1;

	fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version,
			      bpf_log_buf, BPF_LOG_BUF_SIZE);
	if (fd < 0) {
		printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf);
		return -1;
	}

	prog_fd[prog_cnt++] = fd;

	/* These program types are attached by the caller, not here. */
	if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
		return 0;

	if (is_socket || is_sockops || is_sk_skb || is_sk_msg) {
		/*
		 * NOTE(review): "sk_skb" and "sk_msg" are 6 characters long but
		 * 7 are skipped here - presumably to step over a one-digit
		 * suffix; confirm against the section names in use.
		 */
		if (is_socket)
			event += 6;
		else
			event += 7;
		if (*event != '/')
			return 0;
		event++;
		/* cast: passing a negative char to isdigit() is undefined */
		if (!isdigit((unsigned char)*event)) {
			printf("invalid prog number\n");
			return -1;
		}
		return populate_prog_array(event, fd);
	}

	if (is_raw_tracepoint) {
		efd = bpf_raw_tracepoint_open(event + 15, fd);
		if (efd < 0) {
			printf("tracepoint %s %s\n", event + 15, strerror(errno));
			return -1;
		}
		event_fd[prog_cnt - 1] = efd;
		return 0;
	}

	if (is_kprobe || is_kretprobe) {
		bool need_normal_check = true;
		const char *event_prefix = "";

		if (is_kprobe)
			event += 7;
		else
			event += 10;

		if (*event == 0) {
			printf("event name cannot be empty\n");
			return -1;
		}

		if (isdigit((unsigned char)*event))
			return populate_prog_array(event, fd);

		/*
		 * NOTE(review): the event name is pasted into a shell command
		 * without escaping. It comes from the ELF section name, so
		 * only load object files you trust.
		 */
#ifdef __x86_64__
		if (strncmp(event, "sys_", 4) == 0) {
			len = snprintf(buf, sizeof(buf),
				       "echo '%c:__x64_%s __x64_%s' >> /sys/kernel/debug/tracing/kprobe_events",
				       is_kprobe ? 'p' : 'r', event, event);
			/* run the command only if it fits; a cut-off command is useless */
			if (len >= 0 && (size_t)len < sizeof(buf)) {
				err = system(buf);
				if (err >= 0) {
					need_normal_check = false;
					event_prefix = "__x64_";
				}
			}
		}
#endif
		if (need_normal_check) {
			len = snprintf(buf, sizeof(buf),
				       "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
				       is_kprobe ? 'p' : 'r', event, event);
			if (len < 0 || (size_t)len >= sizeof(buf)) {
				printf("event name '%s' is too long\n", event);
				return -1;
			}
			err = system(buf);
			if (err < 0) {
				printf("failed to create kprobe '%s' error '%s'\n",
				       event, strerror(errno));
				return -1;
			}
		}

		/* Path of the file holding the id of the new kprobe event. */
		len = snprintf(buf, sizeof(buf), "%sevents/kprobes/%s%s/id",
			       DEBUGFS, event_prefix, event);
		if (len < 0 || (size_t)len >= sizeof(buf)) {
			printf("event name '%s' is too long\n", event);
			return -1;
		}
	} else if (is_tracepoint) {
		event += 11;

		if (*event == 0) {
			printf("event name cannot be empty\n");
			return -1;
		}
		/* Path of the file holding the id of the tracepoint. */
		len = snprintf(buf, sizeof(buf), "%sevents/%s/id",
			       DEBUGFS, event);
		if (len < 0 || (size_t)len >= sizeof(buf)) {
			printf("event name '%s' is too long\n", event);
			return -1;
		}
	}

	efd = open(buf, O_RDONLY, 0);
	if (efd < 0) {
		printf("failed to open event %s\n", event);
		return -1;
	}

	err = read(efd, buf, sizeof(buf));
	if (err < 0 || (size_t)err >= sizeof(buf)) {
		printf("read from '%s' failed '%s'\n", event, strerror(errno));
		/* do not leak the descriptor of the id file */
		close(efd);
		return -1;
	}

	close(efd);

	buf[err] = 0;
	id = atoi(buf);
	attr.config = id;

	efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
	if (efd < 0) {
		printf("event %d fd %d err %s\n", id, efd, strerror(errno));
		return -1;
	}
	event_fd[prog_cnt - 1] = efd;
	err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
	if (err < 0) {
		printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
		       strerror(errno));
		return -1;
	}
	/* hand the loaded program to the perf event */
	err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
	if (err < 0) {
		printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n",
		       strerror(errno));
		return -1;
	}

	return 0;
}
  • 这里参考两篇文章
    • https://blog.csdn.net/hubingsong/article/details/126776029

    • https://zhuanlan.zhihu.com/p/141694060

sockex1_user.c源码

// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <assert.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
#include "sock_example.h"
#include <unistd.h>
#include <arpa/inet.h>

/*
 * Load "<argv[0]>_kern.o", attach its first program to a raw socket on the
 * loopback interface, start a ping in the background and print, once per
 * second for five seconds, the byte counters that the program keeps for
 * TCP, UDP and ICMP in the first map.
 *
 * Returns 0 on success and 1 when loading, attaching or a map lookup fails.
 */
int main(int ac, char **argv)
{
	char filename[256];
	FILE *f;
	int i, sock;
	int ret = 1;

	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);

	if (load_bpf_file(filename)) {
		printf("%s", bpf_log_buf);
		return 1;
	}

	sock = open_raw_sock("lo");

	/*
	 * Checked explicitly instead of inside assert(): with NDEBUG defined
	 * an assert() disappears together with the call it wraps.
	 */
	if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd,
		       sizeof(prog_fd[0])) != 0) {
		perror("setsockopt(SO_ATTACH_BPF)");
		return 1;
	}

	/* Generate some loopback traffic; failure to start it is tolerated. */
	f = popen("ping -c5 localhost", "r");

	for (i = 0; i < 5; i++) {
		long long tcp_cnt, udp_cnt, icmp_cnt;
		int key;

		key = IPPROTO_TCP;
		if (bpf_map_lookup_elem(map_fd[0], &key, &tcp_cnt) != 0)
			goto lookup_failed;

		key = IPPROTO_UDP;
		if (bpf_map_lookup_elem(map_fd[0], &key, &udp_cnt) != 0)
			goto lookup_failed;

		key = IPPROTO_ICMP;
		if (bpf_map_lookup_elem(map_fd[0], &key, &icmp_cnt) != 0)
			goto lookup_failed;

		printf("TCP %lld UDP %lld ICMP %lld bytes\n",
		       tcp_cnt, udp_cnt, icmp_cnt);
		sleep(1);
	}

	ret = 0;
	goto out;

lookup_failed:
	perror("bpf_map_lookup_elem");
out:
	/* Close the pipe and reap the ping child. */
	if (f)
		pclose(f);
	return ret;
}

09-28 06:48