From 317460317a02a1af512697e6e964298dedd8a163 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:07:59 -0800 Subject: bpf: Introduce bpf_arena. Introduce bpf_arena, which is a sparse shared memory region between the bpf program and user space. Use cases: 1. User space mmap-s bpf_arena and uses it as a traditional mmap-ed anonymous region, like memcached or any key/value storage. The bpf program implements an in-kernel accelerator. XDP prog can search for a key in bpf_arena and return a value without going to user space. 2. The bpf program builds arbitrary data structures in bpf_arena (hash tables, rb-trees, sparse arrays), while user space consumes it. 3. bpf_arena is a "heap" of memory from the bpf program's point of view. The user space may mmap it, but bpf program will not convert pointers to user base at run-time to improve bpf program speed. Initially, the kernel vm_area and user vma are not populated. User space can fault in pages within the range. While servicing a page fault, bpf_arena logic will insert a new page into the kernel and user vmas. The bpf program can allocate pages from that region via bpf_arena_alloc_pages(). This kernel function will insert pages into the kernel vm_area. The subsequent fault-in from user space will populate that page into the user vma. The BPF_F_SEGV_ON_FAULT flag at arena creation time can be used to prevent fault-in from user space. In such a case, if a page is not allocated by the bpf program and not present in the kernel vm_area, the user process will segfault. This is useful for use cases 2 and 3 above. bpf_arena_alloc_pages() is similar to user space mmap(). It allocates pages either at a specific address within the arena or allocates a range with the maple tree. bpf_arena_free_pages() is analogous to munmap(), which frees pages and removes the range from the kernel vm_area and from user process vmas. bpf_arena can be used as a bpf program "heap" of up to 4GB. The speed of bpf program is more important than ease of sharing with user space. This is use case 3. In such a case, the BPF_F_NO_USER_CONV flag is recommended. It will tell the verifier to treat the rX = bpf_arena_cast_user(rY) instruction as a 32-bit move wX = wY, which will improve bpf prog performance. Otherwise, bpf_arena_cast_user is translated by JIT to conditionally add the upper 32 bits of user vm_start (if the pointer is not NULL) to arena pointers before they are stored into memory. This way, user space sees them as valid 64-bit pointers. Diff https://github.com/llvm/llvm-project/pull/84410 enables LLVM BPF backend generate the bpf_addr_space_cast() instruction to cast pointers between address_space(1) which is reserved for bpf_arena pointers and default address space zero. All arena pointers in a bpf program written in C language are tagged as __attribute__((address_space(1))). Hence, clang provides helpful diagnostics when pointers cross address space. Libbpf and the kernel support only address_space == 1. All other address space identifiers are reserved. rX = bpf_addr_space_cast(rY, /* dst_as */ 1, /* src_as */ 0) tells the verifier that rX->type = PTR_TO_ARENA. Any further operations on PTR_TO_ARENA register have to be in the 32-bit domain. The verifier will mark load/store through PTR_TO_ARENA with PROBE_MEM32. JIT will generate them as kern_vm_start + 32bit_addr memory accesses. The behavior is similar to copy_from_kernel_nofault() except that no address checks are necessary. The address is guaranteed to be in the 4GB range. If the page is not present, the destination register is zeroed on read, and the operation is ignored on write. rX = bpf_addr_space_cast(rY, 0, 1) tells the verifier that rX->type = unknown scalar. If arena->map_flags has BPF_F_NO_USER_CONV set, then the verifier converts such cast instructions to mov32. Otherwise, JIT will emit native code equivalent to: rX = (u32)rY; if (rY) rX |= clear_lo32_bits(arena->user_vm_start); /* replace hi32 bits in rX */ After such conversion, the pointer becomes a valid user pointer within bpf_arena range. The user process can access data structures created in bpf_arena without any additional computations. For example, a linked list built by a bpf program can be walked natively by user space. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Reviewed-by: Barret Rhoden Link: https://lore.kernel.org/bpf/20240308010812.89848-2-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 7 +- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 10 + kernel/bpf/Makefile | 3 + kernel/bpf/arena.c | 558 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 11 + kernel/bpf/syscall.c | 36 +++ kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 10 + 9 files changed, 635 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/arena.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 95e07673cdc1a0..ea6ab6e0eef9ae 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -37,6 +37,7 @@ struct perf_event; struct bpf_prog; struct bpf_prog_aux; struct bpf_map; +struct bpf_arena; struct sock; struct seq_file; struct btf; @@ -528,8 +529,8 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock); - - +u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena); +u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; @@ -2215,6 +2216,8 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); +int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, + unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG_KMEM void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 94baced5a1ad64..9f2a6b83b49e14 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -132,6 +132,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85ec7fc799d7cc..e30d943db8a480 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1009,6 +1009,7 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, + BPF_MAP_TYPE_ARENA, __MAX_BPF_MAP_TYPE }; @@ -1396,6 +1397,12 @@ enum { /* BPF token FD is passed in a corresponding command's token_fd field */ BPF_F_TOKEN_FD = (1U << 16), + +/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */ + BPF_F_SEGV_ON_FAULT = (1U << 17), + +/* Do not translate kernel bpf_arena pointers to user pointers */ + BPF_F_NO_USER_CONV = (1U << 18), }; /* Flags for BPF_PROG_QUERY. */ @@ -1467,6 +1474,9 @@ union bpf_attr { * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the * number of hash functions (if 0, the bloom filter will default * to using 5 hash functions). + * + * BPF_MAP_TYPE_ARENA - contains the address where user space + * is going to mmap() the arena. It has to be page aligned. */ __u64 map_extra; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 4ce95acfcaa728..368c5d86b5b7c8 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -15,6 +15,9 @@ obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o +ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) +obj-$(CONFIG_BPF_SYSCALL) += arena.o +endif obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c new file mode 100644 index 00000000000000..86571e760dd613 --- /dev/null +++ b/kernel/bpf/arena.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include +#include + +/* + * bpf_arena is a sparsely populated shared memory region between bpf program and + * user space process. + * + * For example on x86-64 the values could be: + * user_vm_start 7f7d26200000 // picked by mmap() + * kern_vm_start ffffc90001e69000 // picked by get_vm_area() + * For user space all pointers within the arena are normal 8-byte addresses. + * In this example 7f7d26200000 is the address of the first page (pgoff=0). + * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr + * (u32)7f7d26200000 -> 26200000 + * hence + * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb + * kernel memory region. + * + * BPF JITs generate the following code to access arena: + * mov eax, eax // eax has lower 32-bit of user pointer + * mov word ptr [rax + r12 + off], bx + * where r12 == kern_vm_start and off is s16. + * Hence allocate 4Gb + GUARD_SZ/2 on each side. + * + * Initially kernel vm_area and user vma are not populated. + * User space can fault-in any address which will insert the page + * into kernel and user vma. + * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc + * which will insert it into kernel vm_area. + * The later fault-in from user space will populate that page into user vma. + */ + +/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ +#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) +#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ) + +struct bpf_arena { + struct bpf_map map; + u64 user_vm_start; + u64 user_vm_end; + struct vm_struct *kern_vm; + struct maple_tree mt; + struct list_head vma_list; + struct mutex lock; +}; + +u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) +{ + return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0; +} + +u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) +{ + return arena ? arena->user_vm_start : 0; +} + +static long arena_map_peek_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags) +{ + return -EOPNOTSUPP; +} + +static long arena_map_pop_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static long arena_map_delete_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EOPNOTSUPP; +} + +static long compute_pgoff(struct bpf_arena *arena, long uaddr) +{ + return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT; +} + +static struct bpf_map *arena_map_alloc(union bpf_attr *attr) +{ + struct vm_struct *kern_vm; + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_arena *arena; + u64 vm_range; + int err = -ENOMEM; + + if (attr->key_size || attr->value_size || attr->max_entries == 0 || + /* BPF_F_MMAPABLE must be set */ + !(attr->map_flags & BPF_F_MMAPABLE) || + /* No unsupported flags present */ + (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV))) + return ERR_PTR(-EINVAL); + + if (attr->map_extra & ~PAGE_MASK) + /* If non-zero the map_extra is an expected user VMA start address */ + return ERR_PTR(-EINVAL); + + vm_range = (u64)attr->max_entries * PAGE_SIZE; + if (vm_range > (1ull << 32)) + return ERR_PTR(-E2BIG); + + if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32)) + /* user vma must not cross 32-bit boundary */ + return ERR_PTR(-ERANGE); + + kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP); + if (!kern_vm) + return ERR_PTR(-ENOMEM); + + arena = bpf_map_area_alloc(sizeof(*arena), numa_node); + if (!arena) + goto err; + + arena->kern_vm = kern_vm; + arena->user_vm_start = attr->map_extra; + if (arena->user_vm_start) + arena->user_vm_end = arena->user_vm_start + vm_range; + + INIT_LIST_HEAD(&arena->vma_list); + bpf_map_init_from_attr(&arena->map, attr); + mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE); + mutex_init(&arena->lock); + + return &arena->map; +err: + free_vm_area(kern_vm); + return ERR_PTR(err); +} + +static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) +{ + struct page *page; + pte_t pte; + + pte = ptep_get(ptep); + if (!pte_present(pte)) /* sanity check */ + return 0; + page = pte_page(pte); + /* + * We do not update pte here: + * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug + * 2. TLB flushing is batched or deferred. Even if we clear pte, + * the TLB entries can stick around and continue to permit access to + * the freed page. So it all relies on 1. + */ + __free_page(page); + return 0; +} + +static void arena_map_free(struct bpf_map *map) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + /* + * Check that user vma-s are not around when bpf map is freed. + * mmap() holds vm_file which holds bpf_map refcnt. + * munmap() must have happened on vma followed by arena_vm_close() + * which would clear arena->vma_list. + */ + if (WARN_ON_ONCE(!list_empty(&arena->vma_list))) + return; + + /* + * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area(). + * It unmaps everything from vmalloc area and clears pgtables. + * Call apply_to_existing_page_range() first to find populated ptes and + * free those pages. + */ + apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), + KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); + free_vm_area(arena->kern_vm); + mtree_destroy(&arena->mt); + bpf_map_area_free(arena); +} + +static void *arena_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EINVAL); +} + +static long arena_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return -EOPNOTSUPP; +} + +static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, const struct btf_type *value_type) +{ + return 0; +} + +static u64 arena_map_mem_usage(const struct bpf_map *map) +{ + return 0; +} + +struct vma_list { + struct vm_area_struct *vma; + struct list_head head; +}; + +static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) +{ + struct vma_list *vml; + + vml = kmalloc(sizeof(*vml), GFP_KERNEL); + if (!vml) + return -ENOMEM; + vma->vm_private_data = vml; + vml->vma = vma; + list_add(&vml->head, &arena->vma_list); + return 0; +} + +static void arena_vm_close(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + struct vma_list *vml; + + guard(mutex)(&arena->lock); + vml = vma->vm_private_data; + list_del(&vml->head); + vma->vm_private_data = NULL; + kfree(vml); +} + +#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */ + +static vm_fault_t arena_vm_fault(struct vm_fault *vmf) +{ + struct bpf_map *map = vmf->vma->vm_file->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + struct page *page; + long kbase, kaddr; + int ret; + + kbase = bpf_arena_get_kern_vm_start(arena); + kaddr = kbase + (u32)(vmf->address & PAGE_MASK); + + guard(mutex)(&arena->lock); + page = vmalloc_to_page((void *)kaddr); + if (page) + /* already have a page vmap-ed */ + goto out; + + if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) + /* User space requested to segfault when page is not allocated by bpf prog */ + return VM_FAULT_SIGSEGV; + + ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL); + if (ret) + return VM_FAULT_SIGSEGV; + + /* Account into memcg of the process that created bpf_arena */ + ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); + if (ret) { + mtree_erase(&arena->mt, vmf->pgoff); + return VM_FAULT_SIGSEGV; + } + + ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page); + if (ret) { + mtree_erase(&arena->mt, vmf->pgoff); + __free_page(page); + return VM_FAULT_SIGSEGV; + } +out: + page_ref_add(page, 1); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct arena_vm_ops = { + .close = arena_vm_close, + .fault = arena_vm_fault, +}; + +static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct bpf_map *map = filp->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + long ret; + + if (pgoff) + return -EINVAL; + if (len > (1ull << 32)) + return -E2BIG; + + /* if user_vm_start was specified at arena creation time */ + if (arena->user_vm_start) { + if (len > arena->user_vm_end - arena->user_vm_start) + return -E2BIG; + if (len != arena->user_vm_end - arena->user_vm_start) + return -EINVAL; + if (addr != arena->user_vm_start) + return -EINVAL; + } + + ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags); + if (IS_ERR_VALUE(ret)) + return ret; + if ((ret >> 32) == ((ret + len - 1) >> 32)) + return ret; + if (WARN_ON_ONCE(arena->user_vm_start)) + /* checks at map creation time should prevent this */ + return -EFAULT; + return round_up(ret, 1ull << 32); +} + +static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + guard(mutex)(&arena->lock); + if (arena->user_vm_start && arena->user_vm_start != vma->vm_start) + /* + * If map_extra was not specified at arena creation time then + * 1st user process can do mmap(NULL, ...) to pick user_vm_start + * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..); + * or + * specify addr in map_extra and + * use the same addr later with mmap(addr, MAP_FIXED..); + */ + return -EBUSY; + + if (arena->user_vm_end && arena->user_vm_end != vma->vm_end) + /* all user processes must have the same size of mmap-ed region */ + return -EBUSY; + + /* Earlier checks should prevent this */ + if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff)) + return -EFAULT; + + if (remember_vma(arena, vma)) + return -ENOMEM; + + arena->user_vm_start = vma->vm_start; + arena->user_vm_end = vma->vm_end; + /* + * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and + * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid + * potential change of user_vm_start. + */ + vm_flags_set(vma, VM_DONTEXPAND); + vma->vm_ops = &arena_vm_ops; + return 0; +} + +static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if ((u64)off > arena->user_vm_end - arena->user_vm_start) + return -ERANGE; + *imm = (unsigned long)arena->user_vm_start; + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena) +const struct bpf_map_ops arena_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = arena_map_alloc, + .map_free = arena_map_free, + .map_direct_value_addr = arena_map_direct_value_addr, + .map_mmap = arena_map_mmap, + .map_get_unmapped_area = arena_get_unmapped_area, + .map_get_next_key = arena_map_get_next_key, + .map_push_elem = arena_map_push_elem, + .map_peek_elem = arena_map_peek_elem, + .map_pop_elem = arena_map_pop_elem, + .map_lookup_elem = arena_map_lookup_elem, + .map_update_elem = arena_map_update_elem, + .map_delete_elem = arena_map_delete_elem, + .map_check_btf = arena_map_check_btf, + .map_mem_usage = arena_map_mem_usage, + .map_btf_id = &bpf_arena_map_btf_ids[0], +}; + +static u64 clear_lo32(u64 val) +{ + return val & ~(u64)~0U; +} + +/* + * Allocate pages and vmap them into kernel vmalloc area. + * Later the pages will be mmaped into user space vma. + */ +static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id) +{ + /* user_vm_end/start are fixed before bpf prog runs */ + long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); + struct page **pages; + long pgoff = 0; + u32 uaddr32; + int ret, i; + + if (page_cnt > page_cnt_max) + return 0; + + if (uaddr) { + if (uaddr & ~PAGE_MASK) + return 0; + pgoff = compute_pgoff(arena, uaddr); + if (pgoff + page_cnt > page_cnt_max) + /* requested address will be outside of user VMA */ + return 0; + } + + /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */ + pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return 0; + + guard(mutex)(&arena->lock); + + if (uaddr) + ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1, + MT_ENTRY, GFP_KERNEL); + else + ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY, + page_cnt, 0, page_cnt_max - 1, GFP_KERNEL); + if (ret) + goto out_free_pages; + + ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, + node_id, page_cnt, pages); + if (ret) + goto out; + + uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE); + /* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */ + ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32, + kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages); + if (ret) { + for (i = 0; i < page_cnt; i++) + __free_page(pages[i]); + goto out; + } + kvfree(pages); + return clear_lo32(arena->user_vm_start) + uaddr32; +out: + mtree_erase(&arena->mt, pgoff); +out_free_pages: + kvfree(pages); + return 0; +} + +/* + * If page is present in vmalloc area, unmap it from vmalloc area, + * unmap it from all user space vma-s, + * and free it. + */ +static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +{ + struct vma_list *vml; + + list_for_each_entry(vml, &arena->vma_list, head) + zap_page_range_single(vml->vma, uaddr, + PAGE_SIZE * page_cnt, NULL); +} + +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +{ + u64 full_uaddr, uaddr_end; + long kaddr, pgoff, i; + struct page *page; + + /* only aligned lower 32-bit are relevant */ + uaddr = (u32)uaddr; + uaddr &= PAGE_MASK; + full_uaddr = clear_lo32(arena->user_vm_start) + uaddr; + uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT)); + if (full_uaddr >= uaddr_end) + return; + + page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; + + guard(mutex)(&arena->lock); + + pgoff = compute_pgoff(arena, uaddr); + /* clear range */ + mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL); + + if (page_cnt > 1) + /* bulk zap if multiple pages being freed */ + zap_pages(arena, full_uaddr, page_cnt); + + kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; + for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) { + page = vmalloc_to_page((void *)kaddr); + if (!page) + continue; + if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ + zap_pages(arena, full_uaddr, 1); + vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE); + __free_page(page); + } +} + +__bpf_kfunc_start_defs(); + +__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id); +} + +__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) + return; + arena_free_pages(arena, (long)ptr__ign, page_cnt); +} +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(arena_kfuncs) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_KFUNCS_END(arena_kfuncs) + +static const struct btf_kfunc_id_set common_kfunc_set = { + .owner = THIS_MODULE, + .set = &arena_kfuncs, +}; + +static int __init kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); +} +late_initcall(kfunc_init); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 134b7979f53764..a8ecf69c775481 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2976,6 +2976,17 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, { } +/* for configs without MMU or 32-bit */ +__weak const struct bpf_map_ops arena_map_ops; +__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) +{ + return 0; +} +__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) +{ + return 0; +} + #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f63f4da4db5ea1..67923e41a07edf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -164,6 +164,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, if (bpf_map_is_offloaded(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_ARENA || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || @@ -479,6 +480,39 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif +int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, + unsigned long nr_pages, struct page **pages) +{ + unsigned long i, j; + struct page *pg; + int ret = 0; +#ifdef CONFIG_MEMCG_KMEM + struct mem_cgroup *memcg, *old_memcg; + + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); +#endif + for (i = 0; i < nr_pages; i++) { + pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); + + if (pg) { + pages[i] = pg; + continue; + } + for (j = 0; j < i; j++) + __free_page(pages[j]); + ret = -ENOMEM; + break; + } + +#ifdef CONFIG_MEMCG_KMEM + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); +#endif + return ret; +} + + static int btf_field_cmp(const void *a, const void *b) { const struct btf_field *f1 = a, *f2 = b; @@ -1176,6 +1210,7 @@ static int map_create(union bpf_attr *attr) } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && + attr->map_type != BPF_MAP_TYPE_ARENA && attr->map_extra != 0) return -EINVAL; @@ -1265,6 +1300,7 @@ static int map_create(union bpf_attr *attr) case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_ARENA: if (!bpf_token_capable(token, CAP_BPF)) goto put_token; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bf084c69350753..fbcf2e5e635af5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18108,6 +18108,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_CGRP_STORAGE: case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: + case BPF_MAP_TYPE_ARENA: break; default: verbose(env, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 85ec7fc799d7cc..e30d943db8a480 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1009,6 +1009,7 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, + BPF_MAP_TYPE_ARENA, __MAX_BPF_MAP_TYPE }; @@ -1396,6 +1397,12 @@ enum { /* BPF token FD is passed in a corresponding command's token_fd field */ BPF_F_TOKEN_FD = (1U << 16), + +/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */ + BPF_F_SEGV_ON_FAULT = (1U << 17), + +/* Do not translate kernel bpf_arena pointers to user pointers */ + BPF_F_NO_USER_CONV = (1U << 18), }; /* Flags for BPF_PROG_QUERY. */ @@ -1467,6 +1474,9 @@ union bpf_attr { * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the * number of hash functions (if 0, the bloom filter will default * to using 5 hash functions). + * + * BPF_MAP_TYPE_ARENA - contains the address where user space + * is going to mmap() the arena. It has to be page aligned. */ __u64 map_extra; -- cgit 1.2.3-korg From 667a86ad9b71d934c444eec193cf3508016f35c5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:00 -0800 Subject: bpf: Disasm support for addr_space_cast instruction. LLVM generates rX = addr_space_cast(rY, dst_addr_space, src_addr_space) instruction when pointers in non-zero address space are used by the bpf program. Recognize this insn in uapi and in bpf disassembler. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240308010812.89848-3-alexei.starovoitov@gmail.com --- include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/disasm.c | 10 ++++++++++ tools/include/uapi/linux/bpf.h | 4 ++++ 3 files changed, 18 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e30d943db8a480..3c42b9f1bada3d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1339,6 +1339,10 @@ enum { */ #define BPF_PSEUDO_KFUNC_CALL 2 +enum bpf_addr_space_cast { + BPF_ADDR_SPACE_CAST = 1, +}; + /* flags for BPF_MAP_UPDATE_ELEM command */ enum { BPF_ANY = 0, /* create new element or update existing */ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 82b2dbdd048f8e..bd2e2dd04740c1 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -166,6 +166,12 @@ static bool is_movsx(const struct bpf_insn *insn) (insn->off == 8 || insn->off == 16 || insn->off == 32); } +static bool is_addr_space_cast(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && + insn->off == BPF_ADDR_SPACE_CAST; +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -184,6 +190,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); + } else if (is_addr_space_cast(insn)) { + verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", + insn->code, insn->dst_reg, + insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e30d943db8a480..3c42b9f1bada3d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1339,6 +1339,10 @@ enum { */ #define BPF_PSEUDO_KFUNC_CALL 2 +enum bpf_addr_space_cast { + BPF_ADDR_SPACE_CAST = 1, +}; + /* flags for BPF_MAP_UPDATE_ELEM command */ enum { BPF_ANY = 0, /* create new element or update existing */ -- cgit 1.2.3-korg From 2fe99eb0ccf2bb73df65ebcbbf2f2ff70e63547b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:01 -0800 Subject: bpf: Add x86-64 JIT support for PROBE_MEM32 pseudo instructions. Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW] instructions. They are similar to PROBE_MEM instructions with the following differences: - PROBE_MEM has to check that the address is in the kernel range with src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE check - PROBE_MEM doesn't support store - PROBE_MEM32 relies on the verifier to clear upper 32-bit in the register - PROBE_MEM32 adds 64-bit kern_vm_start address (which is stored in %r12 in the prologue) Due to bpf_arena constructions such %r12 + %reg + off16 access is guaranteed to be within arena virtual range, so no address check at run-time. - PROBE_MEM32 allows STX and ST. If they fault the store is a nop. When LDX faults the destination register is zeroed. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240308010812.89848-4-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 191 +++++++++++++++++++++++++++++++++++++++++++- include/linux/bpf.h | 1 + include/linux/filter.h | 3 + 3 files changed, 194 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e1390d1e331b58..38705a1abe62bd 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -113,6 +113,7 @@ static int bpf_size_to_x86_bytes(int bpf_size) /* Pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_JIT_REG + 1) #define X86_REG_R9 (MAX_BPF_JIT_REG + 2) +#define X86_REG_R12 (MAX_BPF_JIT_REG + 3) /* * The following table maps BPF registers to x86-64 registers. @@ -139,6 +140,7 @@ static const int reg2hex[] = { [BPF_REG_AX] = 2, /* R10 temp register */ [AUX_REG] = 3, /* R11 temp register */ [X86_REG_R9] = 1, /* R9 register, 6th function argument */ + [X86_REG_R12] = 4, /* R12 callee saved */ }; static const int reg2pt_regs[] = { @@ -167,6 +169,7 @@ static bool is_ereg(u32 reg) BIT(BPF_REG_8) | BIT(BPF_REG_9) | BIT(X86_REG_R9) | + BIT(X86_REG_R12) | BIT(BPF_REG_AX)); } @@ -205,6 +208,17 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2) return byte; } +static u8 add_3mod(u8 byte, u32 r1, u32 r2, u32 index) +{ + if (is_ereg(r1)) + byte |= 1; + if (is_ereg(index)) + byte |= 2; + if (is_ereg(r2)) + byte |= 4; + return byte; +} + /* Encode 'dst_reg' register into x86-64 opcode 'byte' */ static u8 add_1reg(u8 byte, u32 dst_reg) { @@ -645,6 +659,8 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog, pop_r12(&prog); } else { pop_callee_regs(&prog, callee_regs_used); + if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena)) + pop_r12(&prog); } EMIT1(0x58); /* pop rax */ @@ -704,6 +720,8 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog, pop_r12(&prog); } else { pop_callee_regs(&prog, callee_regs_used); + if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena)) + pop_r12(&prog); } EMIT1(0x58); /* pop rax */ @@ -887,6 +905,18 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) *pprog = prog; } +static void emit_insn_suffix_SIB(u8 **pprog, u32 ptr_reg, u32 val_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + if (is_imm8(off)) { + EMIT3(add_2reg(0x44, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off); + } else { + EMIT2_off32(add_2reg(0x84, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off); + } + *pprog = prog; +} + /* * Emit a REX byte if it will be necessary to address these registers */ @@ -968,6 +998,37 @@ static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } +static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* movzx rax, byte ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB6); + break; + case BPF_H: + /* movzx rax, word ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB7); + break; + case BPF_W: + /* mov eax, dword ptr [rax + r12 + off] */ + EMIT2(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x8B); + break; + case BPF_DW: + /* mov rax, qword ptr [rax + r12 + off] */ + EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x8B); + break; + } + emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off); + *pprog = prog; +} + +static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off); +} + /* STX: *(u8*)(dst_reg + off) = src_reg */ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -1002,6 +1063,71 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } +/* STX: *(u8*)(dst_reg + index_reg + off) = src_reg */ +static void emit_stx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* mov byte ptr [rax + r12 + off], al */ + EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x88); + break; + case BPF_H: + /* mov word ptr [rax + r12 + off], ax */ + EMIT3(0x66, add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89); + break; + case BPF_W: + /* mov dword ptr [rax + r12 + 1], eax */ + EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89); + break; + case BPF_DW: + /* mov qword ptr [rax + r12 + 1], rax */ + EMIT2(add_3mod(0x48, dst_reg, src_reg, index_reg), 0x89); + break; + } + emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off); + *pprog = prog; +} + +static void emit_stx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + emit_stx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off); +} + +/* ST: *(u8*)(dst_reg + index_reg + off) = imm32 */ +static void emit_st_index(u8 **pprog, u32 size, u32 dst_reg, u32 index_reg, int off, int imm) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* mov byte ptr [rax + r12 + off], imm8 */ + EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC6); + break; + case BPF_H: + /* mov word ptr [rax + r12 + off], imm16 */ + EMIT3(0x66, add_3mod(0x40, dst_reg, 0, index_reg), 0xC7); + break; + case BPF_W: + /* mov dword ptr [rax + r12 + 1], imm32 */ + EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC7); + break; + case BPF_DW: + /* mov qword ptr [rax + r12 + 1], imm32 */ + EMIT2(add_3mod(0x48, dst_reg, 0, index_reg), 0xC7); + break; + } + emit_insn_suffix_SIB(&prog, dst_reg, 0, index_reg, off); + EMIT(imm, bpf_size_to_x86_bytes(size)); + *pprog = prog; +} + +static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm) +{ + emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm); +} + static int emit_atomic(u8 **pprog, u8 atomic_op, u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) { @@ -1043,12 +1169,15 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } +#define DONT_CLEAR 1 + bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { u32 reg = x->fixup >> 8; /* jump over faulting load and clear dest register */ - *(unsigned long *)((void *)regs + reg) = 0; + if (reg != DONT_CLEAR) + *(unsigned long *)((void *)regs + reg) = 0; regs->ip += x->fixup & 0xff; return true; } @@ -1147,11 +1276,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; + u64 arena_vm_start; int i, excnt = 0; int ilen, proglen = 0; u8 *prog = temp; int err; + arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena); + detect_reg_usage(insn, insn_cnt, callee_regs_used, &tail_call_seen); @@ -1172,8 +1304,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image push_r12(&prog); push_callee_regs(&prog, all_callee_regs_used); } else { + if (arena_vm_start) + push_r12(&prog); push_callee_regs(&prog, callee_regs_used); } + if (arena_vm_start) + emit_mov_imm64(&prog, X86_REG_R12, + arena_vm_start >> 32, (u32) arena_vm_start); ilen = prog - temp; if (rw_image) @@ -1564,6 +1701,56 @@ st: if (is_imm8(insn->off)) emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); break; + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: + start_of_ldx = prog; + emit_st_r12(&prog, BPF_SIZE(insn->code), dst_reg, insn->off, insn->imm); + goto populate_extable; + + /* LDX: dst_reg = *(u8*)(src_reg + r12 + off) */ + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + start_of_ldx = prog; + if (BPF_CLASS(insn->code) == BPF_LDX) + emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + else + emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); +populate_extable: + { + struct exception_table_entry *ex; + u8 *_insn = image + proglen + (start_of_ldx - temp); + s64 delta; + + if (!bpf_prog->aux->extable) + break; + + if (excnt >= bpf_prog->aux->num_exentries) { + pr_err("mem32 extable bug\n"); + return -EFAULT; + } + ex = &bpf_prog->aux->extable[excnt++]; + + delta = _insn - (u8 *)&ex->insn; + /* switch ex to rw buffer for writes */ + ex = (void *)rw_image + ((void *)ex - (void *)image); + + ex->insn = delta; + + ex->data = EX_TYPE_BPF; + + ex->fixup = (prog - start_of_ldx) | + ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8); + } + break; + /* LDX: dst_reg = *(u8*)(src_reg + off) */ case BPF_LDX | BPF_MEM | BPF_B: case BPF_LDX | BPF_PROBE_MEM | BPF_B: @@ -2036,6 +2223,8 @@ emit_jmp: pop_r12(&prog); } else { pop_callee_regs(&prog, callee_regs_used); + if (arena_vm_start) + pop_r12(&prog); } EMIT1(0xC9); /* leave */ emit_return(&prog, image + addrs[i - 1] + (prog - temp)); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ea6ab6e0eef9ae..8904d16061257f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1458,6 +1458,7 @@ struct bpf_prog_aux { bool xdp_has_frags; bool exception_cb; bool exception_boundary; + struct bpf_arena *arena; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 36cc29a2934cb1..b119f04ecb0b71 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -72,6 +72,9 @@ struct ctl_table_header; /* unused opcode to mark special ldsx instruction. Same as BPF_IND */ #define BPF_PROBE_MEMSX 0x40 +/* unused opcode to mark special load instruction. Same as BPF_MSH */ +#define BPF_PROBE_MEM32 0xa0 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 -- cgit 1.2.3-korg From 142fd4d2dcf58b1720a6af644f31de1a5551f219 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:02 -0800 Subject: bpf: Add x86-64 JIT support for bpf_addr_space_cast instruction. LLVM generates bpf_addr_space_cast instruction while translating pointers between native (zero) address space and __attribute__((address_space(N))). The addr_space=1 is reserved as bpf_arena address space. rY = addr_space_cast(rX, 0, 1) is processed by the verifier and converted to normal 32-bit move: wX = wY rY = addr_space_cast(rX, 1, 0) has to be converted by JIT: aux_reg = upper_32_bits of arena->user_vm_start aux_reg <<= 32 wX = wY // clear upper 32 bits of dst register if (wX) // if not zero add upper bits of user_vm_start wX |= aux_reg JIT can do it more efficiently: mov dst_reg32, src_reg32 // 32-bit move shl dst_reg, 32 or dst_reg, user_vm_start rol dst_reg, 32 xor r11, r11 test dst_reg32, dst_reg32 // check if lower 32-bit are zero cmove r11, dst_reg // if so, set dst_reg to zero // Intel swapped src/dst register encoding in CMOVcc Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240308010812.89848-5-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 42 +++++++++++++++++++++++++++++++++++++++++- include/linux/filter.h | 1 + kernel/bpf/core.c | 5 +++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 38705a1abe62bd..27058d7395f660 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1276,13 +1276,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - u64 arena_vm_start; + u64 arena_vm_start, user_vm_start; int i, excnt = 0; int ilen, proglen = 0; u8 *prog = temp; int err; arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena); + user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena); detect_reg_usage(insn, insn_cnt, callee_regs_used, &tail_call_seen); @@ -1350,6 +1351,40 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image break; case BPF_ALU64 | BPF_MOV | BPF_X: + if (insn->off == BPF_ADDR_SPACE_CAST && + insn->imm == 1U << 16) { + if (dst_reg != src_reg) + /* 32-bit mov */ + emit_mov_reg(&prog, false, dst_reg, src_reg); + /* shl dst_reg, 32 */ + maybe_emit_1mod(&prog, dst_reg, true); + EMIT3(0xC1, add_1reg(0xE0, dst_reg), 32); + + /* or dst_reg, user_vm_start */ + maybe_emit_1mod(&prog, dst_reg, true); + if (is_axreg(dst_reg)) + EMIT1_off32(0x0D, user_vm_start >> 32); + else + EMIT2_off32(0x81, add_1reg(0xC8, dst_reg), user_vm_start >> 32); + + /* rol dst_reg, 32 */ + maybe_emit_1mod(&prog, dst_reg, true); + EMIT3(0xC1, add_1reg(0xC0, dst_reg), 32); + + /* xor r11, r11 */ + EMIT3(0x4D, 0x31, 0xDB); + + /* test dst_reg32, dst_reg32; check if lower 32-bit are zero */ + maybe_emit_mod(&prog, dst_reg, dst_reg, false); + EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg)); + + /* cmove r11, dst_reg; if so, set dst_reg to zero */ + /* WARNING: Intel swapped src/dst register encoding in CMOVcc !!! */ + maybe_emit_mod(&prog, AUX_REG, dst_reg, true); + EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg)); + break; + } + fallthrough; case BPF_ALU | BPF_MOV | BPF_X: if (insn->off == 0) emit_mov_reg(&prog, @@ -3432,6 +3467,11 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, } } +bool bpf_jit_supports_arena(void) +{ + return true; +} + bool bpf_jit_supports_ptr_xchg(void) { return true; diff --git a/include/linux/filter.h b/include/linux/filter.h index b119f04ecb0b71..c99bc3df2d28e3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -962,6 +962,7 @@ bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); +bool bpf_jit_supports_arena(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a8ecf69c775481..bdbdc75cdcd57b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2932,6 +2932,11 @@ bool __weak bpf_jit_supports_far_kfunc_call(void) return false; } +bool __weak bpf_jit_supports_arena(void) +{ + return false; +} + /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same -- cgit 1.2.3-korg From 6082b6c328b5486da2b356eae94b8b83c98b5565 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:03 -0800 Subject: bpf: Recognize addr_space_cast instruction in the verifier. rY = addr_space_cast(rX, 0, 1) tells the verifier that rY->type = PTR_TO_ARENA. Any further operations on PTR_TO_ARENA register have to be in 32-bit domain. The verifier will mark load/store through PTR_TO_ARENA with PROBE_MEM32. JIT will generate them as kern_vm_start + 32bit_addr memory accesses. rY = addr_space_cast(rX, 1, 0) tells the verifier that rY->type = unknown scalar. If arena->map_flags has BPF_F_NO_USER_CONV set then convert cast_user to mov32 as well. Otherwise JIT will convert it to: rY = (u32)rX; if (rY) rY |= arena->user_vm_start & ~(u64)~0U; Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240308010812.89848-6-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 1 + include/linux/bpf_verifier.h | 1 + kernel/bpf/log.c | 3 ++ kernel/bpf/syscall.c | 6 +++ kernel/bpf/verifier.c | 107 +++++++++++++++++++++++++++++++++++++++---- 5 files changed, 109 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8904d16061257f..d0c836ba009dd9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -883,6 +883,7 @@ enum bpf_reg_type { * an explicit null check is required for this struct. */ PTR_TO_MEM, /* reg points to valid memory region */ + PTR_TO_ARENA, PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4b0f6600e49919..7cb1b75eee3819 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -548,6 +548,7 @@ struct bpf_insn_aux_data { u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ + bool needs_zext; /* alu op needs to clear upper bits */ bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ bool is_iter_next; /* bpf_iter__next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 63c34e7b071554..2a243cf37c60b1 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -458,6 +458,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_MEM] = "mem", + [PTR_TO_ARENA] = "arena", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", @@ -693,6 +694,8 @@ static void print_reg_state(struct bpf_verifier_env *env, } verbose(env, "%s", reg_type_str(env, t)); + if (t == PTR_TO_ARENA) + return; if (t == PTR_TO_STACK) { if (state->frameno != reg->frameno) verbose(env, "[%d]", reg->frameno); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 67923e41a07edf..07f2a4db4511c8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4453,6 +4453,12 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, continue; } + if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX || + BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) { + insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM; + continue; + } + if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fbcf2e5e635af5..1358e20d315adf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4386,6 +4386,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_MEM: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: + case PTR_TO_ARENA: return true; default: return false; @@ -5828,6 +5829,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_XDP_SOCK: pointer_desc = "xdp_sock "; break; + case PTR_TO_ARENA: + return 0; default: break; } @@ -6937,6 +6940,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_ARENA) { + if (t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str(env, reg->type)); @@ -8408,6 +8414,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_MEM | MEM_RINGBUF: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_ARENA: case SCALAR_VALUE: return 0; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows @@ -13852,6 +13859,21 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; src_reg = NULL; + + if (dst_reg->type == PTR_TO_ARENA) { + struct bpf_insn_aux_data *aux = cur_aux(env); + + if (BPF_CLASS(insn->code) == BPF_ALU64) + /* + * 32-bit operations zero upper bits automatically. + * 64-bit operations need to be converted to 32. + */ + aux->needs_zext = true; + + /* Any arithmetic operations are allowed on arena pointers */ + return 0; + } + if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; else @@ -13969,19 +13991,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0) { - verbose(env, "BPF_MOV uses reserved fields\n"); - return -EINVAL; - } - if (BPF_CLASS(insn->code) == BPF_ALU) { - if (insn->off != 0 && insn->off != 8 && insn->off != 16) { + if ((insn->off != 0 && insn->off != 8 && insn->off != 16) || + insn->imm) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } + } else if (insn->off == BPF_ADDR_SPACE_CAST) { + if (insn->imm != 1 && insn->imm != 1u << 16) { + verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n"); + return -EINVAL; + } } else { - if (insn->off != 0 && insn->off != 8 && insn->off != 16 && - insn->off != 32) { + if ((insn->off != 0 && insn->off != 8 && insn->off != 16 && + insn->off != 32) || insn->imm) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -14008,7 +14031,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) struct bpf_reg_state *dst_reg = regs + insn->dst_reg; if (BPF_CLASS(insn->code) == BPF_ALU64) { - if (insn->off == 0) { + if (insn->imm) { + /* off == BPF_ADDR_SPACE_CAST */ + mark_reg_unknown(env, regs, insn->dst_reg); + if (insn->imm == 1) /* cast from as(1) to as(0) */ + dst_reg->type = PTR_TO_ARENA; + } else if (insn->off == 0) { /* case: R1 = R2 * copy register state to dest reg */ @@ -15182,6 +15210,10 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { + if (map->map_type == BPF_MAP_TYPE_ARENA) { + __mark_reg_unknown(env, dst_reg); + return 0; + } dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; WARN_ON_ONCE(map->max_entries != 1); @@ -16568,6 +16600,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * the same stack frame, since fp-8 in foo != fp-8 in bar */ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; + case PTR_TO_ARENA: + return true; default: return regs_exact(rold, rcur, idmap); } @@ -17443,6 +17477,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_ARENA: return false; default: return true; @@ -18296,6 +18331,31 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) fdput(f); return -EBUSY; } + if (map->map_type == BPF_MAP_TYPE_ARENA) { + if (env->prog->aux->arena) { + verbose(env, "Only one arena per program\n"); + fdput(f); + return -EBUSY; + } + if (!env->allow_ptr_leaks || !env->bpf_capable) { + verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n"); + fdput(f); + return -EPERM; + } + if (!env->prog->jit_requested) { + verbose(env, "JIT is required to use arena\n"); + return -EOPNOTSUPP; + } + if (!bpf_jit_supports_arena()) { + verbose(env, "JIT doesn't support arena\n"); + return -EOPNOTSUPP; + } + env->prog->aux->arena = (void *)map; + if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) { + verbose(env, "arena's user address must be set via map_extra or mmap()\n"); + return -EINVAL; + } + } fdput(f); next_insn: @@ -18917,6 +18977,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) env->prog->aux->num_exentries++; } continue; + case PTR_TO_ARENA: + if (BPF_MODE(insn->code) == BPF_MEMSX) { + verbose(env, "sign extending loads from arena are not supported yet\n"); + return -EOPNOTSUPP; + } + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); + env->prog->aux->num_exentries++; + continue; default: continue; } @@ -19102,13 +19170,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; + func[i]->aux->arena = prog->aux->arena; num_exentries = 0; insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEM32 || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) num_exentries++; + if ((BPF_CLASS(insn->code) == BPF_STX || + BPF_CLASS(insn->code) == BPF_ST) && + BPF_MODE(insn->code) == BPF_PROBE_MEM32) + num_exentries++; } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; @@ -19507,6 +19581,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env) } for (i = 0; i < insn_cnt;) { + if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) { + if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) || + (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) { + /* convert to 32-bit mov that clears upper 32-bit */ + insn->code = BPF_ALU | BPF_MOV | BPF_X; + /* clear off, so it's a normal 'wX = wY' from JIT pov */ + insn->off = 0; + } /* cast from as(0) to as(1) should be handled by JIT */ + goto next_insn; + } + + if (env->insn_aux_data[i + delta].needs_zext) + /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */ + insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code); + /* Make divide-by-zero exceptions impossible. */ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || -- cgit 1.2.3-korg From 2edc3de6fb650924a87fffebebc3b7572cbf6e38 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:04 -0800 Subject: bpf: Recognize btf_decl_tag("arg: Arena") as PTR_TO_ARENA. In global bpf functions recognize btf_decl_tag("arg:arena") as PTR_TO_ARENA. Note, when the verifier sees: __weak void foo(struct bar *p) it recognizes 'p' as PTR_TO_MEM and 'struct bar' has to be a struct with scalars. Hence the only way to use arena pointers in global functions is to tag them with "arg:arena". Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240308010812.89848-7-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 1 + kernel/bpf/btf.c | 19 +++++++++++++++---- kernel/bpf/verifier.c | 15 +++++++++++++++ 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0c836ba009dd9..08ad265cb19597 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -712,6 +712,7 @@ enum bpf_arg_type { * on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ + ARG_PTR_TO_ARENA, ARG_CONST_SIZE, /* number of bytes accessed from memory */ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 170d017e8e4afc..90c4a32d89ff36 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7111,10 +7111,11 @@ cand_cache_unlock: } enum btf_arg_tag { - ARG_TAG_CTX = 0x1, - ARG_TAG_NONNULL = 0x2, - ARG_TAG_TRUSTED = 0x4, - ARG_TAG_NULLABLE = 0x8, + ARG_TAG_CTX = BIT_ULL(0), + ARG_TAG_NONNULL = BIT_ULL(1), + ARG_TAG_TRUSTED = BIT_ULL(2), + ARG_TAG_NULLABLE = BIT_ULL(3), + ARG_TAG_ARENA = BIT_ULL(4), }; /* Process BTF of a function to produce high-level expectation of function @@ -7226,6 +7227,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tags |= ARG_TAG_NONNULL; } else if (strcmp(tag, "nullable") == 0) { tags |= ARG_TAG_NULLABLE; + } else if (strcmp(tag, "arena") == 0) { + tags |= ARG_TAG_ARENA; } else { bpf_log(log, "arg#%d has unsupported set of tags\n", i); return -EOPNOTSUPP; @@ -7280,6 +7283,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) sub->args[i].btf_id = kern_type_id; continue; } + if (tags & ARG_TAG_ARENA) { + if (tags & ~ARG_TAG_ARENA) { + bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i); + return -EINVAL; + } + sub->args[i].arg_type = ARG_PTR_TO_ARENA; + continue; + } if (is_global) { /* generic user data pointer */ u32 mem_size; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1358e20d315adf..d64f7a9b60e81e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9379,6 +9379,18 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "arg#%d is expected to be non-NULL\n", i); return -EINVAL; } + } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { + /* + * Can pass any value and the kernel won't crash, but + * only PTR_TO_ARENA or SCALAR make sense. Everything + * else is a bug in the bpf program. Point it out to + * the user at the verification time instead of + * run-time debug nightmare. + */ + if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) { + bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); + return -EINVAL; + } } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); if (ret) @@ -20448,6 +20460,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */ reg->btf_id = arg->btf_id; reg->id = ++env->id_gen; + } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { + /* caller can pass either PTR_TO_ARENA or SCALAR */ + mark_reg_unknown(env, regs, i); } else { WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n", i - BPF_REG_1, arg->arg_type); -- cgit 1.2.3-korg From 4d2b56081c32cb33364745da434b88eeaa9d8d8d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:05 -0800 Subject: libbpf: Add __arg_arena to bpf_helpers.h Add __arg_arena to bpf_helpers.h Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240308010812.89848-8-alexei.starovoitov@gmail.com --- tools/lib/bpf/bpf_helpers.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 112b1504e072dc..cd17f6d0791fef 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -193,6 +193,7 @@ enum libbpf_tristate { #define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull"))) #define __arg_nullable __attribute((btf_decl_tag("arg:nullable"))) #define __arg_trusted __attribute((btf_decl_tag("arg:trusted"))) +#define __arg_arena __attribute((btf_decl_tag("arg:arena"))) #ifndef ___bpf_concat #define ___bpf_concat(a, b) a ## b -- cgit 1.2.3-korg From 79ff13e99169ddb0e2277e046dbfb112f77dfac5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:06 -0800 Subject: libbpf: Add support for bpf_arena. mmap() bpf_arena right after creation, since the kernel needs to remember the address returned from mmap. This is user_vm_start. LLVM will generate bpf_arena_cast_user() instructions where necessary and JIT will add upper 32-bit of user_vm_start to such pointers. Fix up bpf_map_mmap_sz() to compute mmap size as map->value_size * map->max_entries for arrays and PAGE_SIZE * map->max_entries for arena. Don't set BTF at arena creation time, since it doesn't support it. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240308010812.89848-9-alexei.starovoitov@gmail.com --- tools/lib/bpf/libbpf.c | 47 +++++++++++++++++++++++++++++++++++-------- tools/lib/bpf/libbpf_probes.c | 7 +++++++ 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 567ad367e7aa7a..34b72207187412 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -185,6 +185,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", [BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage", + [BPF_MAP_TYPE_ARENA] = "arena", }; static const char * const prog_type_name[] = { @@ -1684,7 +1685,7 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) return map; } -static size_t bpf_map_mmap_sz(unsigned int value_sz, unsigned int max_entries) +static size_t array_map_mmap_sz(unsigned int value_sz, unsigned int max_entries) { const long page_sz = sysconf(_SC_PAGE_SIZE); size_t map_sz; @@ -1694,6 +1695,20 @@ static size_t bpf_map_mmap_sz(unsigned int value_sz, unsigned int max_entries) return map_sz; } +static size_t bpf_map_mmap_sz(const struct bpf_map *map) +{ + const long page_sz = sysconf(_SC_PAGE_SIZE); + + switch (map->def.type) { + case BPF_MAP_TYPE_ARRAY: + return array_map_mmap_sz(map->def.value_size, map->def.max_entries); + case BPF_MAP_TYPE_ARENA: + return page_sz * map->def.max_entries; + default: + return 0; /* not supported */ + } +} + static int bpf_map_mmap_resize(struct bpf_map *map, size_t old_sz, size_t new_sz) { void *mmaped; @@ -1847,7 +1862,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", map->name, map->sec_idx, map->sec_offset, def->map_flags); - mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + mmap_sz = bpf_map_mmap_sz(map); map->mmaped = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (map->mmaped == MAP_FAILED) { @@ -5017,6 +5032,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: + case BPF_MAP_TYPE_ARENA: create_attr.btf_fd = 0; create_attr.btf_key_type_id = 0; create_attr.btf_value_type_id = 0; @@ -5261,7 +5277,19 @@ retry: if (err < 0) goto err_out; } - + if (map->def.type == BPF_MAP_TYPE_ARENA) { + map->mmaped = mmap((void *)map->map_extra, bpf_map_mmap_sz(map), + PROT_READ | PROT_WRITE, + map->map_extra ? MAP_SHARED | MAP_FIXED : MAP_SHARED, + map->fd, 0); + if (map->mmaped == MAP_FAILED) { + err = -errno; + map->mmaped = NULL; + pr_warn("map '%s': failed to mmap arena: %d\n", + map->name, err); + return err; + } + } if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) { err = init_map_in_map_slots(obj, map); if (err < 0) @@ -8761,7 +8789,7 @@ static void bpf_map__destroy(struct bpf_map *map) if (map->mmaped) { size_t mmap_sz; - mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + mmap_sz = bpf_map_mmap_sz(map); munmap(map->mmaped, mmap_sz); map->mmaped = NULL; } @@ -9995,11 +10023,14 @@ int bpf_map__set_value_size(struct bpf_map *map, __u32 size) return libbpf_err(-EBUSY); if (map->mmaped) { - int err; size_t mmap_old_sz, mmap_new_sz; + int err; + + if (map->def.type != BPF_MAP_TYPE_ARRAY) + return -EOPNOTSUPP; - mmap_old_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); - mmap_new_sz = bpf_map_mmap_sz(size, map->def.max_entries); + mmap_old_sz = bpf_map_mmap_sz(map); + mmap_new_sz = array_map_mmap_sz(size, map->def.max_entries); err = bpf_map_mmap_resize(map, mmap_old_sz, mmap_new_sz); if (err) { pr_warn("map '%s': failed to resize memory-mapped region: %d\n", @@ -13530,7 +13561,7 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) for (i = 0; i < s->map_cnt; i++) { struct bpf_map *map = *s->maps[i].map; - size_t mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + size_t mmap_sz = bpf_map_mmap_sz(map); int prot, map_fd = map->fd; void **mmaped = s->maps[i].mmaped; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index ee9b1dbea9eb87..3021881224392a 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -338,6 +338,13 @@ static int probe_map_create(enum bpf_map_type map_type) key_size = 0; max_entries = 1; break; + case BPF_MAP_TYPE_ARENA: + key_size = 0; + value_size = 0; + max_entries = 1; /* one page */ + opts.map_extra = 0; /* can mmap() at any address */ + opts.map_flags = BPF_F_MMAPABLE; + break; case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PROG_ARRAY: -- cgit 1.2.3-korg From eed512e8ac64339cfc69da1a6a4b60982cb502ca Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:07 -0800 Subject: bpftool: Recognize arena map type Teach bpftool to recognize arena map type. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240308010812.89848-10-alexei.starovoitov@gmail.com --- tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 +- tools/bpf/bpftool/map.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 3b7ba037af95da..9d6a314dfd7a59 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -55,7 +55,7 @@ MAP COMMANDS | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** } +| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index f98f7bbea2b158..b89bd792c1d510 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1463,7 +1463,7 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage | bloom_filter | user_ringbuf | cgrp_storage }\n" + " task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", -- cgit 1.2.3-korg From 2e7ba4f8fd1fa879b37db0b738c23ba2af8292ee Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 7 Mar 2024 17:08:08 -0800 Subject: libbpf: Recognize __arena global variables. LLVM automatically places __arena variables into ".arena.1" ELF section. In order to use such global variables bpf program must include definition of arena map in ".maps" section, like: struct { __uint(type, BPF_MAP_TYPE_ARENA); __uint(map_flags, BPF_F_MMAPABLE); __uint(max_entries, 1000); /* number of pages */ __ulong(map_extra, 2ull << 44); /* start of mmap() region */ } arena SEC(".maps"); libbpf recognizes both uses of arena and creates single `struct bpf_map *` instance in libbpf APIs. ".arena.1" ELF section data is used as initial data image, which is exposed through skeleton and bpf_map__initial_value() to the user, if they need to tune it before the load phase. During load phase, this initial image is copied over into mmap()'ed region corresponding to arena, and discarded. Few small checks here and there had to be added to make sure this approach works with bpf_map__initial_value(), mostly due to hard-coded assumption that map->mmaped is set up with mmap() syscall and should be munmap()'ed. For arena, .arena.1 can be (much) smaller than maximum arena size, so this smaller data size has to be tracked separately. Given it is enforced that there is only one arena for entire bpf_object instance, we just keep it in a separate field. This can be generalized if necessary later. All global variables from ".arena.1" section are accessible from user space via skel->arena->name_of_var. For bss/data/rodata the skeleton/libbpf perform the following sequence: 1. addr = mmap(MAP_ANONYMOUS) 2. user space optionally modifies global vars 3. map_fd = bpf_create_map() 4. bpf_update_map_elem(map_fd, addr) // to store values into the kernel 5. mmap(addr, MAP_FIXED, map_fd) after step 5 user spaces see the values it wrote at step 2 at the same addresses arena doesn't support update_map_elem. Hence skeleton/libbpf do: 1. addr = malloc(sizeof SEC ".arena.1") 2. user space optionally modifies global vars 3. map_fd = bpf_create_map(MAP_TYPE_ARENA) 4. real_addr = mmap(map->map_extra, MAP_SHARED | MAP_FIXED, map_fd) 5. memcpy(real_addr, addr) // this will fault-in and allocate pages At the end look and feel of global data vs __arena global data is the same from bpf prog pov. Another complication is: struct { __uint(type, BPF_MAP_TYPE_ARENA); } arena SEC(".maps"); int __arena foo; int bar; ptr1 = &foo; // relocation against ".arena.1" section ptr2 = &arena; // relocation against ".maps" section ptr3 = &bar; // relocation against ".bss" section Fo the kernel ptr1 and ptr2 has point to the same arena's map_fd while ptr3 points to a different global array's map_fd. For the verifier: ptr1->type == unknown_scalar ptr2->type == const_ptr_to_map ptr3->type == ptr_to_map_value After verification, from JIT pov all 3 ptr-s are normal ld_imm64 insns. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240308010812.89848-11-alexei.starovoitov@gmail.com --- tools/bpf/bpftool/gen.c | 13 ++++++ tools/lib/bpf/libbpf.c | 118 +++++++++++++++++++++++++++++++++++++++++++----- tools/lib/bpf/libbpf.h | 2 +- 3 files changed, 120 insertions(+), 13 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index a3d72be347b0cb..4fa4ade1ce7445 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -120,6 +120,12 @@ static bool get_datasec_ident(const char *sec_name, char *buf, size_t buf_sz) static const char *pfxs[] = { ".data", ".rodata", ".bss", ".kconfig" }; int i, n; + /* recognize hard coded LLVM section name */ + if (strcmp(sec_name, ".arena.1") == 0) { + /* this is the name to use in skeleton */ + snprintf(buf, buf_sz, "arena"); + return true; + } for (i = 0, n = ARRAY_SIZE(pfxs); i < n; i++) { const char *pfx = pfxs[i]; @@ -250,6 +256,13 @@ static const struct btf_type *find_type_for_map(struct btf *btf, const char *map static bool is_mmapable_map(const struct bpf_map *map, char *buf, size_t sz) { + size_t tmp_sz; + + if (bpf_map__type(map) == BPF_MAP_TYPE_ARENA && bpf_map__initial_value(map, &tmp_sz)) { + snprintf(buf, sz, "arena"); + return true; + } + if (!bpf_map__is_internal(map) || !(bpf_map__map_flags(map) & BPF_F_MMAPABLE)) return false; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 34b72207187412..efab29b8935bd9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -498,6 +498,7 @@ struct bpf_struct_ops { #define KSYMS_SEC ".ksyms" #define STRUCT_OPS_SEC ".struct_ops" #define STRUCT_OPS_LINK_SEC ".struct_ops.link" +#define ARENA_SEC ".arena.1" enum libbpf_map_type { LIBBPF_MAP_UNSPEC, @@ -629,6 +630,7 @@ struct elf_state { Elf *elf; Elf64_Ehdr *ehdr; Elf_Data *symbols; + Elf_Data *arena_data; size_t shstrndx; /* section index for section name strings */ size_t strtabidx; struct elf_sec_desc *secs; @@ -638,6 +640,7 @@ struct elf_state { int text_shndx; int symbols_shndx; bool has_st_ops; + int arena_data_shndx; }; struct usdt_manager; @@ -697,6 +700,10 @@ struct bpf_object { struct usdt_manager *usdt_man; + struct bpf_map *arena_map; + void *arena_data; + size_t arena_data_sz; + struct kern_feature_cache *feat_cache; char *token_path; int token_fd; @@ -1443,6 +1450,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj) elf_end(obj->efile.elf); obj->efile.elf = NULL; obj->efile.symbols = NULL; + obj->efile.arena_data = NULL; zfree(&obj->efile.secs); obj->efile.sec_cnt = 0; @@ -1851,7 +1859,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, def->value_size = data_sz; def->max_entries = 1; def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_KCONFIG - ? BPF_F_RDONLY_PROG : 0; + ? BPF_F_RDONLY_PROG : 0; /* failures are fine because of maps like .rodata.str1.1 */ (void) map_fill_btf_type_info(obj, map); @@ -2843,6 +2851,32 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, return 0; } +static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, + const char *sec_name, int sec_idx, + void *data, size_t data_sz) +{ + const long page_sz = sysconf(_SC_PAGE_SIZE); + size_t mmap_sz; + + mmap_sz = bpf_map_mmap_sz(obj->arena_map); + if (roundup(data_sz, page_sz) > mmap_sz) { + pr_warn("elf: sec '%s': declared ARENA map size (%zu) is too small to hold global __arena variables of size %zu\n", + sec_name, mmap_sz, data_sz); + return -E2BIG; + } + + obj->arena_data = malloc(data_sz); + if (!obj->arena_data) + return -ENOMEM; + memcpy(obj->arena_data, data, data_sz); + obj->arena_data_sz = data_sz; + + /* make bpf_map__init_value() work for ARENA maps */ + map->mmaped = obj->arena_data; + + return 0; +} + static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, const char *pin_root_path) { @@ -2892,6 +2926,33 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, return err; } + for (i = 0; i < obj->nr_maps; i++) { + struct bpf_map *map = &obj->maps[i]; + + if (map->def.type != BPF_MAP_TYPE_ARENA) + continue; + + if (obj->arena_map) { + pr_warn("map '%s': only single ARENA map is supported (map '%s' is also ARENA)\n", + map->name, obj->arena_map->name); + return -EINVAL; + } + obj->arena_map = map; + + if (obj->efile.arena_data) { + err = init_arena_map_data(obj, map, ARENA_SEC, obj->efile.arena_data_shndx, + obj->efile.arena_data->d_buf, + obj->efile.arena_data->d_size); + if (err) + return err; + } + } + if (obj->efile.arena_data && !obj->arena_map) { + pr_warn("elf: sec '%s': to use global __arena variables the ARENA map should be explicitly declared in SEC(\".maps\")\n", + ARENA_SEC); + return -ENOENT; + } + return 0; } @@ -3771,6 +3832,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj) sec_desc->shdr = sh; sec_desc->data = data; obj->efile.has_st_ops = true; + } else if (strcmp(name, ARENA_SEC) == 0) { + obj->efile.arena_data = data; + obj->efile.arena_data_shndx = idx; } else { pr_info("elf: skipping unrecognized data section(%d) %s\n", idx, name); @@ -4400,6 +4464,15 @@ static int bpf_program__record_reloc(struct bpf_program *prog, type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + /* arena data relocation */ + if (shdr_idx == obj->efile.arena_data_shndx) { + reloc_desc->type = RELO_DATA; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = obj->arena_map - obj->maps; + reloc_desc->sym_off = sym->st_value; + return 0; + } + /* generic map reference relocation */ if (type == LIBBPF_MAP_UNSPEC) { if (!bpf_object__shndx_is_maps(obj, shdr_idx)) { @@ -4940,6 +5013,7 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) bpf_gen__map_freeze(obj->gen_loader, map - obj->maps); return 0; } + err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0); if (err) { err = -errno; @@ -5289,6 +5363,10 @@ retry: map->name, err); return err; } + if (obj->arena_data) { + memcpy(map->mmaped, obj->arena_data, obj->arena_data_sz); + zfree(&obj->arena_data); + } } if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) { err = init_map_in_map_slots(obj, map); @@ -8786,13 +8864,9 @@ static void bpf_map__destroy(struct bpf_map *map) zfree(&map->init_slots); map->init_slots_sz = 0; - if (map->mmaped) { - size_t mmap_sz; - - mmap_sz = bpf_map_mmap_sz(map); - munmap(map->mmaped, mmap_sz); - map->mmaped = NULL; - } + if (map->mmaped && map->mmaped != map->obj->arena_data) + munmap(map->mmaped, bpf_map_mmap_sz(map)); + map->mmaped = NULL; if (map->st_ops) { zfree(&map->st_ops->data); @@ -8852,6 +8926,8 @@ void bpf_object__close(struct bpf_object *obj) if (obj->token_fd > 0) close(obj->token_fd); + zfree(&obj->arena_data); + free(obj); } @@ -10063,18 +10139,26 @@ __u32 bpf_map__btf_value_type_id(const struct bpf_map *map) int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size) { + size_t actual_sz; + if (map->obj->loaded || map->reused) return libbpf_err(-EBUSY); - if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG || - size != map->def.value_size) + if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG) + return libbpf_err(-EINVAL); + + if (map->def.type == BPF_MAP_TYPE_ARENA) + actual_sz = map->obj->arena_data_sz; + else + actual_sz = map->def.value_size; + if (size != actual_sz) return libbpf_err(-EINVAL); memcpy(map->mmaped, data, size); return 0; } -void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) +void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize) { if (bpf_map__is_struct_ops(map)) { if (psize) @@ -10084,7 +10168,12 @@ void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) if (!map->mmaped) return NULL; - *psize = map->def.value_size; + + if (map->def.type == BPF_MAP_TYPE_ARENA) + *psize = map->obj->arena_data_sz; + else + *psize = map->def.value_size; + return map->mmaped; } @@ -13573,6 +13662,11 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) continue; } + if (map->def.type == BPF_MAP_TYPE_ARENA) { + *mmaped = map->mmaped; + continue; + } + if (map->def.map_flags & BPF_F_RDONLY_PROG) prot = PROT_READ; else diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 5723cbbfcc41d5..7b510761f545d0 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1014,7 +1014,7 @@ LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra); LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size); -LIBBPF_API void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); +LIBBPF_API void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize); /** * @brief **bpf_map__is_internal()** tells the caller whether or not the -- cgit 1.2.3-korg From 204c628730c62de5a0b593008549a9b95aa96b01 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:09 -0800 Subject: bpf: Add helper macro bpf_addr_space_cast() Introduce helper macro bpf_addr_space_cast() that emits: rX = rX instruction with off = BPF_ADDR_SPACE_CAST and encodes dest and src address_space-s into imm32. It's useful with older LLVM that doesn't emit this insn automatically. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240308010812.89848-12-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/bpf_experimental.h | 43 ++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index bc9a0832ae725a..a5b9df38c16259 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -343,6 +343,49 @@ l_true: \ asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var)) #endif +/* emit instruction: + * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as + */ +#ifndef bpf_addr_space_cast +#define bpf_addr_space_cast(var, dst_as, src_as)\ + asm volatile(".byte 0xBF; \ + .ifc %[reg], r0; \ + .byte 0x00; \ + .endif; \ + .ifc %[reg], r1; \ + .byte 0x11; \ + .endif; \ + .ifc %[reg], r2; \ + .byte 0x22; \ + .endif; \ + .ifc %[reg], r3; \ + .byte 0x33; \ + .endif; \ + .ifc %[reg], r4; \ + .byte 0x44; \ + .endif; \ + .ifc %[reg], r5; \ + .byte 0x55; \ + .endif; \ + .ifc %[reg], r6; \ + .byte 0x66; \ + .endif; \ + .ifc %[reg], r7; \ + .byte 0x77; \ + .endif; \ + .ifc %[reg], r8; \ + .byte 0x88; \ + .endif; \ + .ifc %[reg], r9; \ + .byte 0x99; \ + .endif; \ + .short %[off]; \ + .long %[as]" \ + : [reg]"+r"(var) \ + : [off]"i"(BPF_ADDR_SPACE_CAST) \ + , [as]"i"((dst_as << 16) | src_as)); +#endif + /* Description * Assert that a conditional expression is true. * Returns -- cgit 1.2.3-korg From 80a4129fcf20da3c6941411155a9b3b45caa5b8d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:10 -0800 Subject: selftests/bpf: Add unit tests for bpf_arena_alloc/free_pages Add unit tests for bpf_arena_alloc/free_pages() functionality and bpf_arena_common.h with a set of common helpers and macros that is used in this test and the following patches. Also modify test_loader that didn't support running bpf_prog_type_syscall programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240308010812.89848-13-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 + tools/testing/selftests/bpf/DENYLIST.s390x | 1 + tools/testing/selftests/bpf/bpf_arena_common.h | 70 ++++++++++ tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + tools/testing/selftests/bpf/progs/verifier_arena.c | 146 +++++++++++++++++++++ tools/testing/selftests/bpf/test_loader.c | 9 +- 6 files changed, 227 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/bpf_arena_common.h create mode 100644 tools/testing/selftests/bpf/progs/verifier_arena.c diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index 0445ac38bc07de..f9101651747b55 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,3 +10,4 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95 fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) +verifier_arena # JIT does not support arena diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index cb810a98e78f48..aa8a620f3318d2 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -4,3 +4,4 @@ exceptions # JIT does not support calling kfunc bpf_throw (excepti get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) stacktrace_build_id # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2 (?) verifier_iterating_callbacks +verifier_arena # JIT does not support arena diff --git a/tools/testing/selftests/bpf/bpf_arena_common.h b/tools/testing/selftests/bpf/bpf_arena_common.h new file mode 100644 index 00000000000000..bcf195c64a45c1 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_common.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef WRITE_ONCE +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val)) +#endif + +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE (-1) +#endif + +#ifndef arena_container_of +#define arena_container_of(ptr, type, member) \ + ({ \ + void __arena *__mptr = (void __arena *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + +#ifdef __BPF__ /* when compiled as bpf program */ + +#ifndef PAGE_SIZE +#define PAGE_SIZE __PAGE_SIZE +/* + * for older kernels try sizeof(struct genradix_node) + * or flexible: + * static inline long __bpf_page_size(void) { + * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); + * } + * but generated code is not great. + */ +#endif + +#if defined(__BPF_FEATURE_ARENA_CAST) && !defined(BPF_ARENA_FORCE_ASM) +#define __arena __attribute__((address_space(1))) +#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ +#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */ +#else +#define __arena +#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) +#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) +#endif + +void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, + int node_id, __u64 flags) __ksym __weak; +void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; + +#else /* when compiled as user space code */ + +#define __arena +#define __arg_arena +#define cast_kern(ptr) /* nop for user space */ +#define cast_user(ptr) /* nop for user space */ +__weak char arena[1]; + +#ifndef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) +#endif + +static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, + int node_id, __u64 flags) +{ + return NULL; +} +static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) +{ +} + +#endif diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 9c6072a1974569..985273832f891c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -4,6 +4,7 @@ #include "cap_helpers.h" #include "verifier_and.skel.h" +#include "verifier_arena.skel.h" #include "verifier_array_access.skel.h" #include "verifier_basic_stack.skel.h" #include "verifier_bitfield_write.skel.h" @@ -118,6 +119,7 @@ static void run_tests_aux(const char *skel_name, #define RUN(skel) run_tests_aux(#skel, skel##__elf_bytes, NULL) void test_verifier_and(void) { RUN(verifier_and); } +void test_verifier_arena(void) { RUN(verifier_arena); } void test_verifier_basic_stack(void) { RUN(verifier_basic_stack); } void test_verifier_bitfield_write(void) { RUN(verifier_bitfield_write); } void test_verifier_bounds(void) { RUN(verifier_bounds); } diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c new file mode 100644 index 00000000000000..5540b05ff9ee13 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_experimental.h" +#include "bpf_arena_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 2); /* arena of two pages close to 32-bit boundary*/ + __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * 2 + 1)); /* start of mmap() region */ +} arena SEC(".maps"); + +SEC("syscall") +__success __retval(0) +int basic_alloc1(void *ctx) +{ +#if defined(__BPF_FEATURE_ARENA_CAST) + volatile int __arena *page1, *page2, *no_page, *page3; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page1) + return 1; + *page1 = 1; + page2 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page2) + return 2; + *page2 = 2; + no_page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (no_page) + return 3; + if (*page1 != 1) + return 4; + if (*page2 != 2) + return 5; + bpf_arena_free_pages(&arena, (void __arena *)page2, 1); + if (*page1 != 1) + return 6; + if (*page2 != 0) /* use-after-free should return 0 */ + return 7; + page3 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page3) + return 8; + *page3 = 3; + if (page2 != page3) + return 9; + if (*page1 != 1) + return 10; +#endif + return 0; +} + +SEC("syscall") +__success __retval(0) +int basic_alloc2(void *ctx) +{ +#if defined(__BPF_FEATURE_ARENA_CAST) + volatile char __arena *page1, *page2, *page3, *page4; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0); + if (!page1) + return 1; + page2 = page1 + __PAGE_SIZE; + page3 = page1 + __PAGE_SIZE * 2; + page4 = page1 - __PAGE_SIZE; + *page1 = 1; + *page2 = 2; + *page3 = 3; + *page4 = 4; + if (*page1 != 1) + return 1; + if (*page2 != 2) + return 2; + if (*page3 != 0) + return 3; + if (*page4 != 0) + return 4; + bpf_arena_free_pages(&arena, (void __arena *)page1, 2); + if (*page1 != 0) + return 5; + if (*page2 != 0) + return 6; + if (*page3 != 0) + return 7; + if (*page4 != 0) + return 8; +#endif + return 0; +} + +struct bpf_arena___l { + struct bpf_map map; +} __attribute__((preserve_access_index)); + +SEC("syscall") +__success __retval(0) __log_level(2) +int basic_alloc3(void *ctx) +{ + struct bpf_arena___l *ar = (struct bpf_arena___l *)&arena; + volatile char __arena *pages; + + pages = bpf_arena_alloc_pages(&ar->map, NULL, ar->map.max_entries, NUMA_NO_NODE, 0); + if (!pages) + return 1; + return 0; +} + +SEC("iter.s/bpf_map") +__success __log_level(2) +int iter_maps1(struct bpf_iter__bpf_map *ctx) +{ + struct bpf_map *map = ctx->map; + + if (!map) + return 0; + bpf_arena_alloc_pages(map, NULL, map->max_entries, 0, 0); + return 0; +} + +SEC("iter.s/bpf_map") +__failure __msg("expected pointer to STRUCT bpf_map") +int iter_maps2(struct bpf_iter__bpf_map *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + + bpf_arena_alloc_pages((void *)seq, NULL, 1, 0, 0); + return 0; +} + +SEC("iter.s/bpf_map") +__failure __msg("untrusted_ptr_bpf_map") +int iter_maps3(struct bpf_iter__bpf_map *ctx) +{ + struct bpf_map *map = ctx->map; + + if (!map) + return 0; + bpf_arena_alloc_pages(map->inner_map_meta, NULL, map->max_entries, 0, 0); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index ba57601c2a4d5b..524c38e9cde48f 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -501,7 +501,7 @@ static bool is_unpriv_capable_map(struct bpf_map *map) } } -static int do_prog_test_run(int fd_prog, int *retval) +static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts) { __u8 tmp_out[TEST_DATA_LEN << 2] = {}; __u8 tmp_in[TEST_DATA_LEN] = {}; @@ -514,6 +514,10 @@ static int do_prog_test_run(int fd_prog, int *retval) .repeat = 1, ); + if (empty_opts) { + memset(&topts, 0, sizeof(struct bpf_test_run_opts)); + topts.sz = sizeof(struct bpf_test_run_opts); + } err = bpf_prog_test_run_opts(fd_prog, &topts); saved_errno = errno; @@ -649,7 +653,8 @@ void run_subtest(struct test_loader *tester, } } - do_prog_test_run(bpf_program__fd(tprog), &retval); + do_prog_test_run(bpf_program__fd(tprog), &retval, + bpf_program__type(tprog) == BPF_PROG_TYPE_SYSCALL ? true : false); if (retval != subspec->retval && subspec->retval != POINTER_VALUE) { PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval); goto tobj_cleanup; -- cgit 1.2.3-korg From 9f2c156f90a422b4897a8c2831076a96a31413d1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:11 -0800 Subject: selftests/bpf: Add bpf_arena_list test. bpf_arena_alloc.h - implements page_frag allocator as a bpf program. bpf_arena_list.h - doubly linked link list as a bpf program. Compiled as a bpf program and as native C code. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240308010812.89848-14-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/bpf_arena_alloc.h | 67 ++++++++++++++++ tools/testing/selftests/bpf/bpf_arena_list.h | 92 ++++++++++++++++++++++ .../testing/selftests/bpf/prog_tests/arena_list.c | 68 ++++++++++++++++ tools/testing/selftests/bpf/progs/arena_list.c | 87 ++++++++++++++++++++ 4 files changed, 314 insertions(+) create mode 100644 tools/testing/selftests/bpf/bpf_arena_alloc.h create mode 100644 tools/testing/selftests/bpf/bpf_arena_list.h create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_list.c create mode 100644 tools/testing/selftests/bpf/progs/arena_list.c diff --git a/tools/testing/selftests/bpf/bpf_arena_alloc.h b/tools/testing/selftests/bpf/bpf_arena_alloc.h new file mode 100644 index 00000000000000..c27678299e0c96 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_alloc.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once +#include "bpf_arena_common.h" + +#ifndef __round_mask +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#endif +#ifndef round_up +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) +#endif + +#ifdef __BPF__ +#define NR_CPUS (sizeof(struct cpumask) * 8) + +static void __arena * __arena page_frag_cur_page[NR_CPUS]; +static int __arena page_frag_cur_offset[NR_CPUS]; + +/* Simple page_frag allocator */ +static inline void __arena* bpf_alloc(unsigned int size) +{ + __u64 __arena *obj_cnt; + __u32 cpu = bpf_get_smp_processor_id(); + void __arena *page = page_frag_cur_page[cpu]; + int __arena *cur_offset = &page_frag_cur_offset[cpu]; + int offset; + + size = round_up(size, 8); + if (size >= PAGE_SIZE - 8) + return NULL; + if (!page) { +refill: + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page) + return NULL; + cast_kern(page); + page_frag_cur_page[cpu] = page; + *cur_offset = PAGE_SIZE - 8; + obj_cnt = page + PAGE_SIZE - 8; + *obj_cnt = 0; + } else { + cast_kern(page); + obj_cnt = page + PAGE_SIZE - 8; + } + + offset = *cur_offset - size; + if (offset < 0) + goto refill; + + (*obj_cnt)++; + *cur_offset = offset; + return page + offset; +} + +static inline void bpf_free(void __arena *addr) +{ + __u64 __arena *obj_cnt; + + addr = (void __arena *)(((long)addr) & ~(PAGE_SIZE - 1)); + obj_cnt = addr + PAGE_SIZE - 8; + if (--(*obj_cnt) == 0) + bpf_arena_free_pages(&arena, addr, 1); +} +#else +static inline void __arena* bpf_alloc(unsigned int size) { return NULL; } +static inline void bpf_free(void __arena *addr) {} +#endif diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h new file mode 100644 index 00000000000000..b99b9f408efffc --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_list.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once +#include "bpf_arena_common.h" + +struct arena_list_node; + +typedef struct arena_list_node __arena arena_list_node_t; + +struct arena_list_node { + arena_list_node_t *next; + arena_list_node_t * __arena *pprev; +}; + +struct arena_list_head { + struct arena_list_node __arena *first; +}; +typedef struct arena_list_head __arena arena_list_head_t; + +#define list_entry(ptr, type, member) arena_container_of(ptr, type, member) + +#define list_entry_safe(ptr, type, member) \ + ({ typeof(*ptr) * ___ptr = (ptr); \ + ___ptr ? ({ cast_kern(___ptr); list_entry(___ptr, type, member); }) : NULL; \ + }) + +#ifndef __BPF__ +static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { return NULL; } +static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {} +static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; } +#define cond_break ({}) +#endif + +/* Safely walk link list elements. Deletion of elements is allowed. */ +#define list_for_each_entry(pos, head, member) \ + for (void * ___tmp = (pos = list_entry_safe((head)->first, \ + typeof(*(pos)), member), \ + (void *)0); \ + pos && ({ ___tmp = (void *)pos->member.next; 1; }); \ + cond_break, \ + pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member)) + +static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h) +{ + arena_list_node_t *first = h->first, * __arena *tmp; + + cast_user(first); + cast_kern(n); + WRITE_ONCE(n->next, first); + cast_kern(first); + if (first) { + tmp = &n->next; + cast_user(tmp); + WRITE_ONCE(first->pprev, tmp); + } + cast_user(n); + WRITE_ONCE(h->first, n); + + tmp = &h->first; + cast_user(tmp); + cast_kern(n); + WRITE_ONCE(n->pprev, tmp); +} + +static inline void __list_del(arena_list_node_t *n) +{ + arena_list_node_t *next = n->next, *tmp; + arena_list_node_t * __arena *pprev = n->pprev; + + cast_user(next); + cast_kern(pprev); + tmp = *pprev; + cast_kern(tmp); + WRITE_ONCE(tmp, next); + if (next) { + cast_user(pprev); + cast_kern(next); + WRITE_ONCE(next->pprev, pprev); + } +} + +#define POISON_POINTER_DELTA 0 + +#define LIST_POISON1 ((void __arena *) 0x100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void __arena *) 0x122 + POISON_POINTER_DELTA) + +static inline void list_del(arena_list_node_t *n) +{ + __list_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} diff --git a/tools/testing/selftests/bpf/prog_tests/arena_list.c b/tools/testing/selftests/bpf/prog_tests/arena_list.c new file mode 100644 index 00000000000000..e61886debab127 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_list.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include + +#define PAGE_SIZE 4096 + +#include "bpf_arena_list.h" +#include "arena_list.skel.h" + +struct elem { + struct arena_list_node node; + __u64 value; +}; + +static int list_sum(struct arena_list_head *head) +{ + struct elem __arena *n; + int sum = 0; + + list_for_each_entry(n, head, node) + sum += n->value; + return sum; +} + +static void test_arena_list_add_del(int cnt) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct arena_list *skel; + int expected_sum = (u64)cnt * (cnt - 1) / 2; + int ret, sum; + + skel = arena_list__open_and_load(); + if (!ASSERT_OK_PTR(skel, "arena_list__open_and_load")) + return; + + skel->bss->cnt = cnt; + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_add), &opts); + ASSERT_OK(ret, "ret_add"); + ASSERT_OK(opts.retval, "retval"); + if (skel->bss->skip) { + printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__); + test__skip(); + goto out; + } + sum = list_sum(skel->bss->list_head); + ASSERT_EQ(sum, expected_sum, "sum of elems"); + ASSERT_EQ(skel->arena->arena_sum, expected_sum, "__arena sum of elems"); + ASSERT_EQ(skel->arena->test_val, cnt + 1, "num of elems"); + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_del), &opts); + ASSERT_OK(ret, "ret_del"); + sum = list_sum(skel->bss->list_head); + ASSERT_EQ(sum, 0, "sum of list elems after del"); + ASSERT_EQ(skel->bss->list_sum, expected_sum, "sum of list elems computed by prog"); + ASSERT_EQ(skel->arena->arena_sum, expected_sum, "__arena sum of elems"); +out: + arena_list__destroy(skel); +} + +void test_arena_list(void) +{ + if (test__start_subtest("arena_list_1")) + test_arena_list_add_del(1); + if (test__start_subtest("arena_list_1000")) + test_arena_list_add_del(1000); +} diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c new file mode 100644 index 00000000000000..cd35b844843560 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_list.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include "bpf_experimental.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 100); /* number of pages */ +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, 0x1ull << 32); /* start of mmap() region */ +#else + __ulong(map_extra, 0x1ull << 44); /* start of mmap() region */ +#endif +} arena SEC(".maps"); + +#include "bpf_arena_alloc.h" +#include "bpf_arena_list.h" + +struct elem { + struct arena_list_node node; + __u64 value; +}; + +struct arena_list_head __arena *list_head; +int list_sum; +int cnt; +bool skip = false; + +#ifdef __BPF_FEATURE_ARENA_CAST +long __arena arena_sum; +int __arena test_val = 1; +struct arena_list_head __arena global_head; +#else +long arena_sum SEC(".arena.1"); +int test_val SEC(".arena.1"); +#endif + +int zero; + +SEC("syscall") +int arena_list_add(void *ctx) +{ +#ifdef __BPF_FEATURE_ARENA_CAST + __u64 i; + + list_head = &global_head; + + for (i = zero; i < cnt; cond_break, i++) { + struct elem __arena *n = bpf_alloc(sizeof(*n)); + + test_val++; + n->value = i; + arena_sum += i; + list_add_head(&n->node, list_head); + } +#else + skip = true; +#endif + return 0; +} + +SEC("syscall") +int arena_list_del(void *ctx) +{ +#ifdef __BPF_FEATURE_ARENA_CAST + struct elem __arena *n; + int sum = 0; + + arena_sum = 0; + list_for_each_entry(n, list_head, node) { + sum += n->value; + arena_sum += n->value; + list_del(&n->node); + bpf_free(n); + } + list_sum = sum; +#else + skip = true; +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit 1.2.3-korg From 8df839ae23b8c581bdac4b6970d029d65a415852 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:12 -0800 Subject: selftests/bpf: Add bpf_arena_htab test. bpf_arena_htab.h - hash table implemented as bpf program Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240308010812.89848-15-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 + tools/testing/selftests/bpf/DENYLIST.s390x | 1 + tools/testing/selftests/bpf/bpf_arena_htab.h | 100 +++++++++++++++++++++ .../testing/selftests/bpf/prog_tests/arena_htab.c | 88 ++++++++++++++++++ tools/testing/selftests/bpf/progs/arena_htab.c | 48 ++++++++++ tools/testing/selftests/bpf/progs/arena_htab_asm.c | 5 ++ 6 files changed, 243 insertions(+) create mode 100644 tools/testing/selftests/bpf/bpf_arena_htab.h create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_htab.c create mode 100644 tools/testing/selftests/bpf/progs/arena_htab.c create mode 100644 tools/testing/selftests/bpf/progs/arena_htab_asm.c diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index f9101651747b55..d8ade15e278989 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -11,3 +11,4 @@ fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) verifier_arena # JIT does not support arena +arena_htab # JIT does not support arena diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index aa8a620f3318d2..f4a2f66a683ddb 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -5,3 +5,4 @@ get_stack_raw_tp # user_stack corrupted user stack stacktrace_build_id # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2 (?) verifier_iterating_callbacks verifier_arena # JIT does not support arena +arena_htab # JIT does not support arena diff --git a/tools/testing/selftests/bpf/bpf_arena_htab.h b/tools/testing/selftests/bpf/bpf_arena_htab.h new file mode 100644 index 00000000000000..acc01a876668ec --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_htab.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once +#include +#include "bpf_arena_alloc.h" +#include "bpf_arena_list.h" + +struct htab_bucket { + struct arena_list_head head; +}; +typedef struct htab_bucket __arena htab_bucket_t; + +struct htab { + htab_bucket_t *buckets; + int n_buckets; +}; +typedef struct htab __arena htab_t; + +static inline htab_bucket_t *__select_bucket(htab_t *htab, __u32 hash) +{ + htab_bucket_t *b = htab->buckets; + + cast_kern(b); + return &b[hash & (htab->n_buckets - 1)]; +} + +static inline arena_list_head_t *select_bucket(htab_t *htab, __u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + +struct hashtab_elem { + int hash; + int key; + int value; + struct arena_list_node hash_node; +}; +typedef struct hashtab_elem __arena hashtab_elem_t; + +static hashtab_elem_t *lookup_elem_raw(arena_list_head_t *head, __u32 hash, int key) +{ + hashtab_elem_t *l; + + list_for_each_entry(l, head, hash_node) + if (l->hash == hash && l->key == key) + return l; + + return NULL; +} + +static int htab_hash(int key) +{ + return key; +} + +__weak int htab_lookup_elem(htab_t *htab __arg_arena, int key) +{ + hashtab_elem_t *l_old; + arena_list_head_t *head; + + cast_kern(htab); + head = select_bucket(htab, key); + l_old = lookup_elem_raw(head, htab_hash(key), key); + if (l_old) + return l_old->value; + return 0; +} + +__weak int htab_update_elem(htab_t *htab __arg_arena, int key, int value) +{ + hashtab_elem_t *l_new = NULL, *l_old; + arena_list_head_t *head; + + cast_kern(htab); + head = select_bucket(htab, key); + l_old = lookup_elem_raw(head, htab_hash(key), key); + + l_new = bpf_alloc(sizeof(*l_new)); + if (!l_new) + return -ENOMEM; + l_new->key = key; + l_new->hash = htab_hash(key); + l_new->value = value; + + list_add_head(&l_new->hash_node, head); + if (l_old) { + list_del(&l_old->hash_node); + bpf_free(l_old); + } + return 0; +} + +void htab_init(htab_t *htab) +{ + void __arena *buckets = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0); + + cast_user(buckets); + htab->buckets = buckets; + htab->n_buckets = 2 * PAGE_SIZE / sizeof(struct htab_bucket); +} diff --git a/tools/testing/selftests/bpf/prog_tests/arena_htab.c b/tools/testing/selftests/bpf/prog_tests/arena_htab.c new file mode 100644 index 00000000000000..0766702de84657 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_htab.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include + +#include "arena_htab_asm.skel.h" +#include "arena_htab.skel.h" + +#define PAGE_SIZE 4096 + +#include "bpf_arena_htab.h" + +static void test_arena_htab_common(struct htab *htab) +{ + int i; + + printf("htab %p buckets %p n_buckets %d\n", htab, htab->buckets, htab->n_buckets); + ASSERT_OK_PTR(htab->buckets, "htab->buckets shouldn't be NULL"); + for (i = 0; htab->buckets && i < 16; i += 4) { + /* + * Walk htab buckets and link lists since all pointers are correct, + * though they were written by bpf program. + */ + int val = htab_lookup_elem(htab, i); + + ASSERT_EQ(i, val, "key == value"); + } +} + +static void test_arena_htab_llvm(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct arena_htab *skel; + struct htab *htab; + size_t arena_sz; + void *area; + int ret; + + skel = arena_htab__open_and_load(); + if (!ASSERT_OK_PTR(skel, "arena_htab__open_and_load")) + return; + + area = bpf_map__initial_value(skel->maps.arena, &arena_sz); + /* fault-in a page with pgoff == 0 as sanity check */ + *(volatile int *)area = 0x55aa; + + /* bpf prog will allocate more pages */ + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_htab_llvm), &opts); + ASSERT_OK(ret, "ret"); + ASSERT_OK(opts.retval, "retval"); + if (skel->bss->skip) { + printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__); + test__skip(); + goto out; + } + htab = skel->bss->htab_for_user; + test_arena_htab_common(htab); +out: + arena_htab__destroy(skel); +} + +static void test_arena_htab_asm(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct arena_htab_asm *skel; + struct htab *htab; + int ret; + + skel = arena_htab_asm__open_and_load(); + if (!ASSERT_OK_PTR(skel, "arena_htab_asm__open_and_load")) + return; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_htab_asm), &opts); + ASSERT_OK(ret, "ret"); + ASSERT_OK(opts.retval, "retval"); + htab = skel->bss->htab_for_user; + test_arena_htab_common(htab); + arena_htab_asm__destroy(skel); +} + +void test_arena_htab(void) +{ + if (test__start_subtest("arena_htab_llvm")) + test_arena_htab_llvm(); + if (test__start_subtest("arena_htab_asm")) + test_arena_htab_asm(); +} diff --git a/tools/testing/selftests/bpf/progs/arena_htab.c b/tools/testing/selftests/bpf/progs/arena_htab.c new file mode 100644 index 00000000000000..b7bb712cacfdcc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_htab.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include "bpf_experimental.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 100); /* number of pages */ +} arena SEC(".maps"); + +#include "bpf_arena_htab.h" + +void __arena *htab_for_user; +bool skip = false; + +int zero = 0; + +SEC("syscall") +int arena_htab_llvm(void *ctx) +{ +#if defined(__BPF_FEATURE_ARENA_CAST) || defined(BPF_ARENA_FORCE_ASM) + struct htab __arena *htab; + __u64 i; + + htab = bpf_alloc(sizeof(*htab)); + cast_kern(htab); + htab_init(htab); + + /* first run. No old elems in the table */ + for (i = zero; i < 1000; i++) + htab_update_elem(htab, i, i); + + /* should replace all elems with new ones */ + for (i = zero; i < 1000; i++) + htab_update_elem(htab, i, i); + cast_user(htab); + htab_for_user = htab; +#else + skip = true; +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/arena_htab_asm.c b/tools/testing/selftests/bpf/progs/arena_htab_asm.c new file mode 100644 index 00000000000000..6cd70ea12f0d7e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_htab_asm.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#define BPF_ARENA_FORCE_ASM +#define arena_htab_llvm arena_htab_asm +#include "arena_htab.c" -- cgit 1.2.3-korg