author     Andrii Nakryiko <andrii@kernel.org>    2024-03-11 15:37:26 -0700
committer  Andrii Nakryiko <andrii@kernel.org>    2024-03-11 15:43:43 -0700
commit     08701e306e480c56b68c1fa35f2c5b27204083e2
tree       0adc349c9b30acf84420c5e52584264a0b048f4f
parent     365c2b32792e692bad6e3761ad19ac3f8f52c0fe
parent     8df839ae23b8c581bdac4b6970d029d65a415852
Merge branch 'bpf-introduce-bpf-arena'
Alexei Starovoitov says:
====================
bpf: Introduce BPF arena.
From: Alexei Starovoitov <ast@kernel.org>
v2->v3:
- contains bpf bits only, but cc-ing the past audience for continuity
- since the prerequisite patches landed, this series focuses on the main
  functionality of bpf_arena.
- adopted Andrii's approach to support arena in libbpf.
- simplified LLVM support. Instead of two instructions it's now only one.
- switched to cond_break (instead of open coded iters) in selftests
- implemented several follow-ups that will be sent after this set
. remember the first IP and bpf insn that faulted in arena;
  report them to user space via bpftool
. copy-paste and tweak glob_match() (aka mini-regex) for use in selftests/bpf
- see patch 1 for a detailed description of bpf_arena
v1->v2:
- Improved commit log with reasons for using vmap_pages_range() in arena.
Thanks to Johannes
- Added support for __arena global variables in bpf programs
- Fixed race conditions spotted by Barret
- Fixed wrap32 issue spotted by Barret
- Fixed bpf_map_mmap_sz() the way Andrii suggested
The work on bpf_arena was inspired by Barret's work:
https://github.com/google/ghost-userspace/blob/main/lib/queue.bpf.h
which implements queues, lists, and AVL trees entirely as bpf programs,
using a giant bpf array map and integer indices instead of pointers.
bpf_arena is a sparse array that allows bpf programs to use normal C
pointers to build such data structures. The last few patches implement a
page_frag allocator, a linked list, and a hash table as bpf programs.
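
To make that concrete, here is a minimal, illustrative sketch of the bpf-program
side. It is not taken from this series: the __arena macro (LLVM's
address_space(1) attribute backing the new addr_space_cast instruction), the map
name, and struct node are assumptions layered on top of the BPF_MAP_TYPE_ARENA
map and the bpf_arena_alloc_pages() kfunc introduced in the diff below:

/* sketch only: __arena, "arena", and struct node are illustrative */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

#define __arena __attribute__((address_space(1)))

struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
	__uint(max_entries, 10);	/* arena size in pages */
} arena SEC(".maps");

/* kfuncs registered by kernel/bpf/arena.c in this series; sketch-level decls */
void __arena *bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
				    int node_id, __u64 flags) __ksym;
void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym;

struct node {
	struct node __arena *next;
	__u64 payload;
};

SEC("syscall")
int build_node(void *ctx)
{
	struct node __arena *n;

	/* allocate one arena page; NULL addr lets the kernel pick the offset */
	n = bpf_arena_alloc_pages(&arena, NULL, 1, -1 /* NUMA_NO_NODE */, 0);
	if (!n)
		return 1;
	/* plain C pointer dereference: the JIT lowers it to a
	 * [reg + r12 + off] access backed by an exception table entry */
	n->payload = 42;
	n->next = NULL;
	return 0;
}

char _license[] SEC("license") = "GPL";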
v1:
bpf programs have multiple options to communicate with user space:
- Various ring buffers (perf, ftrace, bpf): The data is streamed
unidirectionally from bpf to user space.
- Hash map: The bpf program populates elements, and user space consumes
them via bpf syscall.
- mmap()-ed array map: Libbpf creates an array map that is directly
accessed by the bpf program and mmap-ed to user space. It's the fastest
way. Its disadvantage is that memory for the whole array is reserved at
the start.
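
For comparison with those mechanisms, a minimal sketch of the user-space side of
the new arena map follows. It is hedged: it assumes libbpf's bpf_map_create() and
uapi headers that already define BPF_MAP_TYPE_ARENA, and the helper name and
10-page size are illustrative. When map_extra is zero, the mmap() below is what
picks user_vm_start:

/* sketch only: map_arena() is an illustrative helper, not part of this series */
#include <sys/mman.h>
#include <unistd.h>
#include <bpf/bpf.h>

static void *map_arena(unsigned int nr_pages, int *map_fd)
{
	LIBBPF_OPTS(bpf_map_create_opts, opts,
		.map_flags = BPF_F_MMAPABLE,
		/* setting .map_extra instead would pin user_vm_start */
	);
	long page_sz = sysconf(_SC_PAGE_SIZE);
	void *base;
	int fd;

	/* key_size and value_size must be 0; max_entries is the size in pages */
	fd = bpf_map_create(BPF_MAP_TYPE_ARENA, "arena", 0, 0, nr_pages, &opts);
	if (fd < 0)
		return NULL;

	/* this mmap() picks user_vm_start; bpf-side arena pointers share its
	 * lower 32 bits, so both sides see the same pages */
	base = mmap(NULL, (size_t)nr_pages * page_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (base == MAP_FAILED) {
		close(fd);
		return NULL;
	}
	*map_fd = fd;
	return base;	/* faulting pages in here populates user vma and kernel vm_area */
}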
====================
Link: https://lore.kernel.org/r/20240308010812.89848-1-alexei.starovoitov@gmail.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
37 files changed, 2028 insertions, 40 deletions
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e1390d1e331b58..27058d7395f660 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -113,6 +113,7 @@ static int bpf_size_to_x86_bytes(int bpf_size) /* Pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_JIT_REG + 1) #define X86_REG_R9 (MAX_BPF_JIT_REG + 2) +#define X86_REG_R12 (MAX_BPF_JIT_REG + 3) /* * The following table maps BPF registers to x86-64 registers. @@ -139,6 +140,7 @@ static const int reg2hex[] = { [BPF_REG_AX] = 2, /* R10 temp register */ [AUX_REG] = 3, /* R11 temp register */ [X86_REG_R9] = 1, /* R9 register, 6th function argument */ + [X86_REG_R12] = 4, /* R12 callee saved */ }; static const int reg2pt_regs[] = { @@ -167,6 +169,7 @@ static bool is_ereg(u32 reg) BIT(BPF_REG_8) | BIT(BPF_REG_9) | BIT(X86_REG_R9) | + BIT(X86_REG_R12) | BIT(BPF_REG_AX)); } @@ -205,6 +208,17 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2) return byte; } +static u8 add_3mod(u8 byte, u32 r1, u32 r2, u32 index) +{ + if (is_ereg(r1)) + byte |= 1; + if (is_ereg(index)) + byte |= 2; + if (is_ereg(r2)) + byte |= 4; + return byte; +} + /* Encode 'dst_reg' register into x86-64 opcode 'byte' */ static u8 add_1reg(u8 byte, u32 dst_reg) { @@ -645,6 +659,8 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog, pop_r12(&prog); } else { pop_callee_regs(&prog, callee_regs_used); + if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena)) + pop_r12(&prog); } EMIT1(0x58); /* pop rax */ @@ -704,6 +720,8 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog, pop_r12(&prog); } else { pop_callee_regs(&prog, callee_regs_used); + if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena)) + pop_r12(&prog); } EMIT1(0x58); /* pop rax */ @@ -887,6 +905,18 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) *pprog = prog; } +static void emit_insn_suffix_SIB(u8 **pprog, u32 ptr_reg, u32 val_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + if (is_imm8(off)) { + EMIT3(add_2reg(0x44, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off); + } else { + EMIT2_off32(add_2reg(0x84, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off); + } + *pprog = prog; +} + /* * Emit a REX byte if it will be necessary to address these registers */ @@ -968,6 +998,37 @@ static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } +static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* movzx rax, byte ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB6); + break; + case BPF_H: + /* movzx rax, word ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB7); + break; + case BPF_W: + /* mov eax, dword ptr [rax + r12 + off] */ + EMIT2(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x8B); + break; + case BPF_DW: + /* mov rax, qword ptr [rax + r12 + off] */ + EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x8B); + break; + } + emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off); + *pprog = prog; +} + +static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off); +} + /* STX: *(u8*)(dst_reg + off) = src_reg */ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -1002,6 +1063,71 @@ static void 
emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } +/* STX: *(u8*)(dst_reg + index_reg + off) = src_reg */ +static void emit_stx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* mov byte ptr [rax + r12 + off], al */ + EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x88); + break; + case BPF_H: + /* mov word ptr [rax + r12 + off], ax */ + EMIT3(0x66, add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89); + break; + case BPF_W: + /* mov dword ptr [rax + r12 + 1], eax */ + EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89); + break; + case BPF_DW: + /* mov qword ptr [rax + r12 + 1], rax */ + EMIT2(add_3mod(0x48, dst_reg, src_reg, index_reg), 0x89); + break; + } + emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off); + *pprog = prog; +} + +static void emit_stx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + emit_stx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off); +} + +/* ST: *(u8*)(dst_reg + index_reg + off) = imm32 */ +static void emit_st_index(u8 **pprog, u32 size, u32 dst_reg, u32 index_reg, int off, int imm) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* mov byte ptr [rax + r12 + off], imm8 */ + EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC6); + break; + case BPF_H: + /* mov word ptr [rax + r12 + off], imm16 */ + EMIT3(0x66, add_3mod(0x40, dst_reg, 0, index_reg), 0xC7); + break; + case BPF_W: + /* mov dword ptr [rax + r12 + 1], imm32 */ + EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC7); + break; + case BPF_DW: + /* mov qword ptr [rax + r12 + 1], imm32 */ + EMIT2(add_3mod(0x48, dst_reg, 0, index_reg), 0xC7); + break; + } + emit_insn_suffix_SIB(&prog, dst_reg, 0, index_reg, off); + EMIT(imm, bpf_size_to_x86_bytes(size)); + *pprog = prog; +} + +static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm) +{ + emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm); +} + static int emit_atomic(u8 **pprog, u8 atomic_op, u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) { @@ -1043,12 +1169,15 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } +#define DONT_CLEAR 1 + bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { u32 reg = x->fixup >> 8; /* jump over faulting load and clear dest register */ - *(unsigned long *)((void *)regs + reg) = 0; + if (reg != DONT_CLEAR) + *(unsigned long *)((void *)regs + reg) = 0; regs->ip += x->fixup & 0xff; return true; } @@ -1147,11 +1276,15 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; + u64 arena_vm_start, user_vm_start; int i, excnt = 0; int ilen, proglen = 0; u8 *prog = temp; int err; + arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena); + user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena); + detect_reg_usage(insn, insn_cnt, callee_regs_used, &tail_call_seen); @@ -1172,8 +1305,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image push_r12(&prog); push_callee_regs(&prog, all_callee_regs_used); } else { + if (arena_vm_start) + push_r12(&prog); push_callee_regs(&prog, callee_regs_used); } + if (arena_vm_start) + emit_mov_imm64(&prog, X86_REG_R12, + arena_vm_start >> 32, (u32) arena_vm_start); ilen = prog - temp; if (rw_image) @@ -1213,6 +1351,40 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 
*rw_image break; case BPF_ALU64 | BPF_MOV | BPF_X: + if (insn->off == BPF_ADDR_SPACE_CAST && + insn->imm == 1U << 16) { + if (dst_reg != src_reg) + /* 32-bit mov */ + emit_mov_reg(&prog, false, dst_reg, src_reg); + /* shl dst_reg, 32 */ + maybe_emit_1mod(&prog, dst_reg, true); + EMIT3(0xC1, add_1reg(0xE0, dst_reg), 32); + + /* or dst_reg, user_vm_start */ + maybe_emit_1mod(&prog, dst_reg, true); + if (is_axreg(dst_reg)) + EMIT1_off32(0x0D, user_vm_start >> 32); + else + EMIT2_off32(0x81, add_1reg(0xC8, dst_reg), user_vm_start >> 32); + + /* rol dst_reg, 32 */ + maybe_emit_1mod(&prog, dst_reg, true); + EMIT3(0xC1, add_1reg(0xC0, dst_reg), 32); + + /* xor r11, r11 */ + EMIT3(0x4D, 0x31, 0xDB); + + /* test dst_reg32, dst_reg32; check if lower 32-bit are zero */ + maybe_emit_mod(&prog, dst_reg, dst_reg, false); + EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg)); + + /* cmove r11, dst_reg; if so, set dst_reg to zero */ + /* WARNING: Intel swapped src/dst register encoding in CMOVcc !!! */ + maybe_emit_mod(&prog, AUX_REG, dst_reg, true); + EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg)); + break; + } + fallthrough; case BPF_ALU | BPF_MOV | BPF_X: if (insn->off == 0) emit_mov_reg(&prog, @@ -1564,6 +1736,56 @@ st: if (is_imm8(insn->off)) emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); break; + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: + start_of_ldx = prog; + emit_st_r12(&prog, BPF_SIZE(insn->code), dst_reg, insn->off, insn->imm); + goto populate_extable; + + /* LDX: dst_reg = *(u8*)(src_reg + r12 + off) */ + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + start_of_ldx = prog; + if (BPF_CLASS(insn->code) == BPF_LDX) + emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + else + emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); +populate_extable: + { + struct exception_table_entry *ex; + u8 *_insn = image + proglen + (start_of_ldx - temp); + s64 delta; + + if (!bpf_prog->aux->extable) + break; + + if (excnt >= bpf_prog->aux->num_exentries) { + pr_err("mem32 extable bug\n"); + return -EFAULT; + } + ex = &bpf_prog->aux->extable[excnt++]; + + delta = _insn - (u8 *)&ex->insn; + /* switch ex to rw buffer for writes */ + ex = (void *)rw_image + ((void *)ex - (void *)image); + + ex->insn = delta; + + ex->data = EX_TYPE_BPF; + + ex->fixup = (prog - start_of_ldx) | + ((BPF_CLASS(insn->code) == BPF_LDX ? 
reg2pt_regs[dst_reg] : DONT_CLEAR) << 8); + } + break; + /* LDX: dst_reg = *(u8*)(src_reg + off) */ case BPF_LDX | BPF_MEM | BPF_B: case BPF_LDX | BPF_PROBE_MEM | BPF_B: @@ -2036,6 +2258,8 @@ emit_jmp: pop_r12(&prog); } else { pop_callee_regs(&prog, callee_regs_used); + if (arena_vm_start) + pop_r12(&prog); } EMIT1(0xC9); /* leave */ emit_return(&prog, image + addrs[i - 1] + (prog - temp)); @@ -3243,6 +3467,11 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, } } +bool bpf_jit_supports_arena(void) +{ + return true; +} + bool bpf_jit_supports_ptr_xchg(void) { return true; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 95e07673cdc1a0..08ad265cb19597 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -37,6 +37,7 @@ struct perf_event; struct bpf_prog; struct bpf_prog_aux; struct bpf_map; +struct bpf_arena; struct sock; struct seq_file; struct btf; @@ -528,8 +529,8 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock); - - +u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena); +u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; @@ -711,6 +712,7 @@ enum bpf_arg_type { * on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ + ARG_PTR_TO_ARENA, ARG_CONST_SIZE, /* number of bytes accessed from memory */ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ @@ -882,6 +884,7 @@ enum bpf_reg_type { * an explicit null check is required for this struct. */ PTR_TO_MEM, /* reg points to valid memory region */ + PTR_TO_ARENA, PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ @@ -1457,6 +1460,7 @@ struct bpf_prog_aux { bool xdp_has_frags; bool exception_cb; bool exception_boundary; + struct bpf_arena *arena; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ @@ -2215,6 +2219,8 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); +int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, + unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG_KMEM void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 94baced5a1ad64..9f2a6b83b49e14 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -132,6 +132,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4b0f6600e49919..7cb1b75eee3819 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -548,6 +548,7 @@ struct bpf_insn_aux_data { u32 seen; /* this insn was processed by the verifier at 
env->pass_cnt */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ + bool needs_zext; /* alu op needs to clear upper bits */ bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 36cc29a2934cb1..c99bc3df2d28e3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -72,6 +72,9 @@ struct ctl_table_header; /* unused opcode to mark special ldsx instruction. Same as BPF_IND */ #define BPF_PROBE_MEMSX 0x40 +/* unused opcode to mark special load instruction. Same as BPF_MSH */ +#define BPF_PROBE_MEM32 0xa0 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 @@ -959,6 +962,7 @@ bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); +bool bpf_jit_supports_arena(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85ec7fc799d7cc..3c42b9f1bada3d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1009,6 +1009,7 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, + BPF_MAP_TYPE_ARENA, __MAX_BPF_MAP_TYPE }; @@ -1338,6 +1339,10 @@ enum { */ #define BPF_PSEUDO_KFUNC_CALL 2 +enum bpf_addr_space_cast { + BPF_ADDR_SPACE_CAST = 1, +}; + /* flags for BPF_MAP_UPDATE_ELEM command */ enum { BPF_ANY = 0, /* create new element or update existing */ @@ -1396,6 +1401,12 @@ enum { /* BPF token FD is passed in a corresponding command's token_fd field */ BPF_F_TOKEN_FD = (1U << 16), + +/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */ + BPF_F_SEGV_ON_FAULT = (1U << 17), + +/* Do not translate kernel bpf_arena pointers to user pointers */ + BPF_F_NO_USER_CONV = (1U << 18), }; /* Flags for BPF_PROG_QUERY. */ @@ -1467,6 +1478,9 @@ union bpf_attr { * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the * number of hash functions (if 0, the bloom filter will default * to using 5 hash functions). + * + * BPF_MAP_TYPE_ARENA - contains the address where user space + * is going to mmap() the arena. It has to be page aligned. */ __u64 map_extra; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 4ce95acfcaa728..368c5d86b5b7c8 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -15,6 +15,9 @@ obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o +ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) +obj-$(CONFIG_BPF_SYSCALL) += arena.o +endif obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c new file mode 100644 index 00000000000000..86571e760dd613 --- /dev/null +++ b/kernel/bpf/arena.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/err.h> +#include <linux/btf_ids.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> + +/* + * bpf_arena is a sparsely populated shared memory region between bpf program and + * user space process. + * + * For example on x86-64 the values could be: + * user_vm_start 7f7d26200000 // picked by mmap() + * kern_vm_start ffffc90001e69000 // picked by get_vm_area() + * For user space all pointers within the arena are normal 8-byte addresses. + * In this example 7f7d26200000 is the address of the first page (pgoff=0). + * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr + * (u32)7f7d26200000 -> 26200000 + * hence + * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb + * kernel memory region. + * + * BPF JITs generate the following code to access arena: + * mov eax, eax // eax has lower 32-bit of user pointer + * mov word ptr [rax + r12 + off], bx + * where r12 == kern_vm_start and off is s16. + * Hence allocate 4Gb + GUARD_SZ/2 on each side. + * + * Initially kernel vm_area and user vma are not populated. + * User space can fault-in any address which will insert the page + * into kernel and user vma. + * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc + * which will insert it into kernel vm_area. + * The later fault-in from user space will populate that page into user vma. + */ + +/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ +#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) +#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ) + +struct bpf_arena { + struct bpf_map map; + u64 user_vm_start; + u64 user_vm_end; + struct vm_struct *kern_vm; + struct maple_tree mt; + struct list_head vma_list; + struct mutex lock; +}; + +u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) +{ + return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0; +} + +u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) +{ + return arena ? 
arena->user_vm_start : 0; +} + +static long arena_map_peek_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags) +{ + return -EOPNOTSUPP; +} + +static long arena_map_pop_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static long arena_map_delete_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EOPNOTSUPP; +} + +static long compute_pgoff(struct bpf_arena *arena, long uaddr) +{ + return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT; +} + +static struct bpf_map *arena_map_alloc(union bpf_attr *attr) +{ + struct vm_struct *kern_vm; + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_arena *arena; + u64 vm_range; + int err = -ENOMEM; + + if (attr->key_size || attr->value_size || attr->max_entries == 0 || + /* BPF_F_MMAPABLE must be set */ + !(attr->map_flags & BPF_F_MMAPABLE) || + /* No unsupported flags present */ + (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV))) + return ERR_PTR(-EINVAL); + + if (attr->map_extra & ~PAGE_MASK) + /* If non-zero the map_extra is an expected user VMA start address */ + return ERR_PTR(-EINVAL); + + vm_range = (u64)attr->max_entries * PAGE_SIZE; + if (vm_range > (1ull << 32)) + return ERR_PTR(-E2BIG); + + if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32)) + /* user vma must not cross 32-bit boundary */ + return ERR_PTR(-ERANGE); + + kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP); + if (!kern_vm) + return ERR_PTR(-ENOMEM); + + arena = bpf_map_area_alloc(sizeof(*arena), numa_node); + if (!arena) + goto err; + + arena->kern_vm = kern_vm; + arena->user_vm_start = attr->map_extra; + if (arena->user_vm_start) + arena->user_vm_end = arena->user_vm_start + vm_range; + + INIT_LIST_HEAD(&arena->vma_list); + bpf_map_init_from_attr(&arena->map, attr); + mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE); + mutex_init(&arena->lock); + + return &arena->map; +err: + free_vm_area(kern_vm); + return ERR_PTR(err); +} + +static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) +{ + struct page *page; + pte_t pte; + + pte = ptep_get(ptep); + if (!pte_present(pte)) /* sanity check */ + return 0; + page = pte_page(pte); + /* + * We do not update pte here: + * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug + * 2. TLB flushing is batched or deferred. Even if we clear pte, + * the TLB entries can stick around and continue to permit access to + * the freed page. So it all relies on 1. + */ + __free_page(page); + return 0; +} + +static void arena_map_free(struct bpf_map *map) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + /* + * Check that user vma-s are not around when bpf map is freed. + * mmap() holds vm_file which holds bpf_map refcnt. + * munmap() must have happened on vma followed by arena_vm_close() + * which would clear arena->vma_list. + */ + if (WARN_ON_ONCE(!list_empty(&arena->vma_list))) + return; + + /* + * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area(). + * It unmaps everything from vmalloc area and clears pgtables. + * Call apply_to_existing_page_range() first to find populated ptes and + * free those pages. 
+ */ + apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), + KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); + free_vm_area(arena->kern_vm); + mtree_destroy(&arena->mt); + bpf_map_area_free(arena); +} + +static void *arena_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EINVAL); +} + +static long arena_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return -EOPNOTSUPP; +} + +static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, const struct btf_type *value_type) +{ + return 0; +} + +static u64 arena_map_mem_usage(const struct bpf_map *map) +{ + return 0; +} + +struct vma_list { + struct vm_area_struct *vma; + struct list_head head; +}; + +static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) +{ + struct vma_list *vml; + + vml = kmalloc(sizeof(*vml), GFP_KERNEL); + if (!vml) + return -ENOMEM; + vma->vm_private_data = vml; + vml->vma = vma; + list_add(&vml->head, &arena->vma_list); + return 0; +} + +static void arena_vm_close(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + struct vma_list *vml; + + guard(mutex)(&arena->lock); + vml = vma->vm_private_data; + list_del(&vml->head); + vma->vm_private_data = NULL; + kfree(vml); +} + +#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */ + +static vm_fault_t arena_vm_fault(struct vm_fault *vmf) +{ + struct bpf_map *map = vmf->vma->vm_file->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + struct page *page; + long kbase, kaddr; + int ret; + + kbase = bpf_arena_get_kern_vm_start(arena); + kaddr = kbase + (u32)(vmf->address & PAGE_MASK); + + guard(mutex)(&arena->lock); + page = vmalloc_to_page((void *)kaddr); + if (page) + /* already have a page vmap-ed */ + goto out; + + if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) + /* User space requested to segfault when page is not allocated by bpf prog */ + return VM_FAULT_SIGSEGV; + + ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL); + if (ret) + return VM_FAULT_SIGSEGV; + + /* Account into memcg of the process that created bpf_arena */ + ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); + if (ret) { + mtree_erase(&arena->mt, vmf->pgoff); + return VM_FAULT_SIGSEGV; + } + + ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page); + if (ret) { + mtree_erase(&arena->mt, vmf->pgoff); + __free_page(page); + return VM_FAULT_SIGSEGV; + } +out: + page_ref_add(page, 1); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct arena_vm_ops = { + .close = arena_vm_close, + .fault = arena_vm_fault, +}; + +static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct bpf_map *map = filp->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + long ret; + + if (pgoff) + return -EINVAL; + if (len > (1ull << 32)) + return -E2BIG; + + /* if user_vm_start was specified at arena creation time */ + if (arena->user_vm_start) { + if (len > arena->user_vm_end - arena->user_vm_start) + return -E2BIG; + if (len != arena->user_vm_end - arena->user_vm_start) + return -EINVAL; + if (addr != arena->user_vm_start) + return -EINVAL; + } + + ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, 
flags); + if (IS_ERR_VALUE(ret)) + return ret; + if ((ret >> 32) == ((ret + len - 1) >> 32)) + return ret; + if (WARN_ON_ONCE(arena->user_vm_start)) + /* checks at map creation time should prevent this */ + return -EFAULT; + return round_up(ret, 1ull << 32); +} + +static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + guard(mutex)(&arena->lock); + if (arena->user_vm_start && arena->user_vm_start != vma->vm_start) + /* + * If map_extra was not specified at arena creation time then + * 1st user process can do mmap(NULL, ...) to pick user_vm_start + * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..); + * or + * specify addr in map_extra and + * use the same addr later with mmap(addr, MAP_FIXED..); + */ + return -EBUSY; + + if (arena->user_vm_end && arena->user_vm_end != vma->vm_end) + /* all user processes must have the same size of mmap-ed region */ + return -EBUSY; + + /* Earlier checks should prevent this */ + if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff)) + return -EFAULT; + + if (remember_vma(arena, vma)) + return -ENOMEM; + + arena->user_vm_start = vma->vm_start; + arena->user_vm_end = vma->vm_end; + /* + * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and + * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid + * potential change of user_vm_start. + */ + vm_flags_set(vma, VM_DONTEXPAND); + vma->vm_ops = &arena_vm_ops; + return 0; +} + +static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if ((u64)off > arena->user_vm_end - arena->user_vm_start) + return -ERANGE; + *imm = (unsigned long)arena->user_vm_start; + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena) +const struct bpf_map_ops arena_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = arena_map_alloc, + .map_free = arena_map_free, + .map_direct_value_addr = arena_map_direct_value_addr, + .map_mmap = arena_map_mmap, + .map_get_unmapped_area = arena_get_unmapped_area, + .map_get_next_key = arena_map_get_next_key, + .map_push_elem = arena_map_push_elem, + .map_peek_elem = arena_map_peek_elem, + .map_pop_elem = arena_map_pop_elem, + .map_lookup_elem = arena_map_lookup_elem, + .map_update_elem = arena_map_update_elem, + .map_delete_elem = arena_map_delete_elem, + .map_check_btf = arena_map_check_btf, + .map_mem_usage = arena_map_mem_usage, + .map_btf_id = &bpf_arena_map_btf_ids[0], +}; + +static u64 clear_lo32(u64 val) +{ + return val & ~(u64)~0U; +} + +/* + * Allocate pages and vmap them into kernel vmalloc area. + * Later the pages will be mmaped into user space vma. 
+ */ +static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id) +{ + /* user_vm_end/start are fixed before bpf prog runs */ + long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); + struct page **pages; + long pgoff = 0; + u32 uaddr32; + int ret, i; + + if (page_cnt > page_cnt_max) + return 0; + + if (uaddr) { + if (uaddr & ~PAGE_MASK) + return 0; + pgoff = compute_pgoff(arena, uaddr); + if (pgoff + page_cnt > page_cnt_max) + /* requested address will be outside of user VMA */ + return 0; + } + + /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */ + pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return 0; + + guard(mutex)(&arena->lock); + + if (uaddr) + ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1, + MT_ENTRY, GFP_KERNEL); + else + ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY, + page_cnt, 0, page_cnt_max - 1, GFP_KERNEL); + if (ret) + goto out_free_pages; + + ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, + node_id, page_cnt, pages); + if (ret) + goto out; + + uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE); + /* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */ + ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32, + kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages); + if (ret) { + for (i = 0; i < page_cnt; i++) + __free_page(pages[i]); + goto out; + } + kvfree(pages); + return clear_lo32(arena->user_vm_start) + uaddr32; +out: + mtree_erase(&arena->mt, pgoff); +out_free_pages: + kvfree(pages); + return 0; +} + +/* + * If page is present in vmalloc area, unmap it from vmalloc area, + * unmap it from all user space vma-s, + * and free it. 
+ */ +static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +{ + struct vma_list *vml; + + list_for_each_entry(vml, &arena->vma_list, head) + zap_page_range_single(vml->vma, uaddr, + PAGE_SIZE * page_cnt, NULL); +} + +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +{ + u64 full_uaddr, uaddr_end; + long kaddr, pgoff, i; + struct page *page; + + /* only aligned lower 32-bit are relevant */ + uaddr = (u32)uaddr; + uaddr &= PAGE_MASK; + full_uaddr = clear_lo32(arena->user_vm_start) + uaddr; + uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT)); + if (full_uaddr >= uaddr_end) + return; + + page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; + + guard(mutex)(&arena->lock); + + pgoff = compute_pgoff(arena, uaddr); + /* clear range */ + mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL); + + if (page_cnt > 1) + /* bulk zap if multiple pages being freed */ + zap_pages(arena, full_uaddr, page_cnt); + + kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; + for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) { + page = vmalloc_to_page((void *)kaddr); + if (!page) + continue; + if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ + zap_pages(arena, full_uaddr, 1); + vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE); + __free_page(page); + } +} + +__bpf_kfunc_start_defs(); + +__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id); +} + +__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) + return; + arena_free_pages(arena, (long)ptr__ign, page_cnt); +} +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(arena_kfuncs) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_KFUNCS_END(arena_kfuncs) + +static const struct btf_kfunc_id_set common_kfunc_set = { + .owner = THIS_MODULE, + .set = &arena_kfuncs, +}; + +static int __init kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); +} +late_initcall(kfunc_init); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 170d017e8e4afc..90c4a32d89ff36 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7111,10 +7111,11 @@ cand_cache_unlock: } enum btf_arg_tag { - ARG_TAG_CTX = 0x1, - ARG_TAG_NONNULL = 0x2, - ARG_TAG_TRUSTED = 0x4, - ARG_TAG_NULLABLE = 0x8, + ARG_TAG_CTX = BIT_ULL(0), + ARG_TAG_NONNULL = BIT_ULL(1), + ARG_TAG_TRUSTED = BIT_ULL(2), + ARG_TAG_NULLABLE = BIT_ULL(3), + ARG_TAG_ARENA = BIT_ULL(4), }; /* Process BTF of a function to produce high-level expectation of function @@ -7226,6 +7227,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tags |= ARG_TAG_NONNULL; } else if (strcmp(tag, "nullable") == 0) { tags |= ARG_TAG_NULLABLE; + } else if (strcmp(tag, "arena") == 0) { + tags |= ARG_TAG_ARENA; } else { bpf_log(log, "arg#%d has unsupported set of tags\n", i); return -EOPNOTSUPP; @@ -7280,6 +7283,14 @@ int 
btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) sub->args[i].btf_id = kern_type_id; continue; } + if (tags & ARG_TAG_ARENA) { + if (tags & ~ARG_TAG_ARENA) { + bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i); + return -EINVAL; + } + sub->args[i].arg_type = ARG_PTR_TO_ARENA; + continue; + } if (is_global) { /* generic user data pointer */ u32 mem_size; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 134b7979f53764..bdbdc75cdcd57b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2932,6 +2932,11 @@ bool __weak bpf_jit_supports_far_kfunc_call(void) return false; } +bool __weak bpf_jit_supports_arena(void) +{ + return false; +} + /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same @@ -2976,6 +2981,17 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, { } +/* for configs without MMU or 32-bit */ +__weak const struct bpf_map_ops arena_map_ops; +__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) +{ + return 0; +} +__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) +{ + return 0; +} + #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 82b2dbdd048f8e..bd2e2dd04740c1 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -166,6 +166,12 @@ static bool is_movsx(const struct bpf_insn *insn) (insn->off == 8 || insn->off == 16 || insn->off == 32); } +static bool is_addr_space_cast(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && + insn->off == BPF_ADDR_SPACE_CAST; +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -184,6 +190,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); + } else if (is_addr_space_cast(insn)) { + verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", + insn->code, insn->dst_reg, + insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 
'w' : 'r', diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 63c34e7b071554..2a243cf37c60b1 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -458,6 +458,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_MEM] = "mem", + [PTR_TO_ARENA] = "arena", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", @@ -693,6 +694,8 @@ static void print_reg_state(struct bpf_verifier_env *env, } verbose(env, "%s", reg_type_str(env, t)); + if (t == PTR_TO_ARENA) + return; if (t == PTR_TO_STACK) { if (state->frameno != reg->frameno) verbose(env, "[%d]", reg->frameno); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f63f4da4db5ea1..07f2a4db4511c8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -164,6 +164,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, if (bpf_map_is_offloaded(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_ARENA || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || @@ -479,6 +480,39 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif +int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, + unsigned long nr_pages, struct page **pages) +{ + unsigned long i, j; + struct page *pg; + int ret = 0; +#ifdef CONFIG_MEMCG_KMEM + struct mem_cgroup *memcg, *old_memcg; + + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); +#endif + for (i = 0; i < nr_pages; i++) { + pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); + + if (pg) { + pages[i] = pg; + continue; + } + for (j = 0; j < i; j++) + __free_page(pages[j]); + ret = -ENOMEM; + break; + } + +#ifdef CONFIG_MEMCG_KMEM + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); +#endif + return ret; +} + + static int btf_field_cmp(const void *a, const void *b) { const struct btf_field *f1 = a, *f2 = b; @@ -1176,6 +1210,7 @@ static int map_create(union bpf_attr *attr) } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && + attr->map_type != BPF_MAP_TYPE_ARENA && attr->map_extra != 0) return -EINVAL; @@ -1265,6 +1300,7 @@ static int map_create(union bpf_attr *attr) case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_ARENA: if (!bpf_token_capable(token, CAP_BPF)) goto put_token; break; @@ -4417,6 +4453,12 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, continue; } + if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX || + BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) { + insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM; + continue; + } + if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bf084c69350753..d64f7a9b60e81e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4386,6 +4386,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_MEM: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: + case PTR_TO_ARENA: return true; default: return false; @@ -5828,6 +5829,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_XDP_SOCK: pointer_desc = "xdp_sock "; break; + case PTR_TO_ARENA: + return 0; default: break; } @@ -6937,6 +6940,9 @@ static int check_mem_access(struct 
bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_ARENA) { + if (t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str(env, reg->type)); @@ -8408,6 +8414,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_MEM | MEM_RINGBUF: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_ARENA: case SCALAR_VALUE: return 0; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows @@ -9372,6 +9379,18 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "arg#%d is expected to be non-NULL\n", i); return -EINVAL; } + } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { + /* + * Can pass any value and the kernel won't crash, but + * only PTR_TO_ARENA or SCALAR make sense. Everything + * else is a bug in the bpf program. Point it out to + * the user at the verification time instead of + * run-time debug nightmare. + */ + if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) { + bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); + return -EINVAL; + } } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); if (ret) @@ -13852,6 +13871,21 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; src_reg = NULL; + + if (dst_reg->type == PTR_TO_ARENA) { + struct bpf_insn_aux_data *aux = cur_aux(env); + + if (BPF_CLASS(insn->code) == BPF_ALU64) + /* + * 32-bit operations zero upper bits automatically. + * 64-bit operations need to be converted to 32. 
+ */ + aux->needs_zext = true; + + /* Any arithmetic operations are allowed on arena pointers */ + return 0; + } + if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; else @@ -13969,19 +14003,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0) { - verbose(env, "BPF_MOV uses reserved fields\n"); - return -EINVAL; - } - if (BPF_CLASS(insn->code) == BPF_ALU) { - if (insn->off != 0 && insn->off != 8 && insn->off != 16) { + if ((insn->off != 0 && insn->off != 8 && insn->off != 16) || + insn->imm) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } + } else if (insn->off == BPF_ADDR_SPACE_CAST) { + if (insn->imm != 1 && insn->imm != 1u << 16) { + verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n"); + return -EINVAL; + } } else { - if (insn->off != 0 && insn->off != 8 && insn->off != 16 && - insn->off != 32) { + if ((insn->off != 0 && insn->off != 8 && insn->off != 16 && + insn->off != 32) || insn->imm) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -14008,7 +14043,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) struct bpf_reg_state *dst_reg = regs + insn->dst_reg; if (BPF_CLASS(insn->code) == BPF_ALU64) { - if (insn->off == 0) { + if (insn->imm) { + /* off == BPF_ADDR_SPACE_CAST */ + mark_reg_unknown(env, regs, insn->dst_reg); + if (insn->imm == 1) /* cast from as(1) to as(0) */ + dst_reg->type = PTR_TO_ARENA; + } else if (insn->off == 0) { /* case: R1 = R2 * copy register state to dest reg */ @@ -15182,6 +15222,10 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { + if (map->map_type == BPF_MAP_TYPE_ARENA) { + __mark_reg_unknown(env, dst_reg); + return 0; + } dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; WARN_ON_ONCE(map->max_entries != 1); @@ -16568,6 +16612,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * the same stack frame, since fp-8 in foo != fp-8 in bar */ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; + case PTR_TO_ARENA: + return true; default: return regs_exact(rold, rcur, idmap); } @@ -17443,6 +17489,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_ARENA: return false; default: return true; @@ -18108,6 +18155,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_CGRP_STORAGE: case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: + case BPF_MAP_TYPE_ARENA: break; default: verbose(env, @@ -18295,6 +18343,31 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) fdput(f); return -EBUSY; } + if (map->map_type == BPF_MAP_TYPE_ARENA) { + if (env->prog->aux->arena) { + verbose(env, "Only one arena per program\n"); + fdput(f); + return -EBUSY; + } + if (!env->allow_ptr_leaks || !env->bpf_capable) { + verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n"); + fdput(f); + return -EPERM; + } + if (!env->prog->jit_requested) { + verbose(env, "JIT is required to use arena\n"); + return -EOPNOTSUPP; + } + if (!bpf_jit_supports_arena()) { + verbose(env, "JIT doesn't support arena\n"); + return -EOPNOTSUPP; + } + env->prog->aux->arena = (void *)map; + if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) { + verbose(env, 
"arena's user address must be set via map_extra or mmap()\n"); + return -EINVAL; + } + } fdput(f); next_insn: @@ -18916,6 +18989,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) env->prog->aux->num_exentries++; } continue; + case PTR_TO_ARENA: + if (BPF_MODE(insn->code) == BPF_MEMSX) { + verbose(env, "sign extending loads from arena are not supported yet\n"); + return -EOPNOTSUPP; + } + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); + env->prog->aux->num_exentries++; + continue; default: continue; } @@ -19101,13 +19182,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; + func[i]->aux->arena = prog->aux->arena; num_exentries = 0; insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEM32 || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) num_exentries++; + if ((BPF_CLASS(insn->code) == BPF_STX || + BPF_CLASS(insn->code) == BPF_ST) && + BPF_MODE(insn->code) == BPF_PROBE_MEM32) + num_exentries++; } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; @@ -19506,6 +19593,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env) } for (i = 0; i < insn_cnt;) { + if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) { + if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) || + (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) { + /* convert to 32-bit mov that clears upper 32-bit */ + insn->code = BPF_ALU | BPF_MOV | BPF_X; + /* clear off, so it's a normal 'wX = wY' from JIT pov */ + insn->off = 0; + } /* cast from as(0) to as(1) should be handled by JIT */ + goto next_insn; + } + + if (env->insn_aux_data[i + delta].needs_zext) + /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */ + insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code); + /* Make divide-by-zero exceptions impossible. 
*/ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || @@ -20358,6 +20460,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */ reg->btf_id = arg->btf_id; reg->id = ++env->id_gen; + } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { + /* caller can pass either PTR_TO_ARENA or SCALAR */ + mark_reg_unknown(env, regs, i); } else { WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n", i - BPF_REG_1, arg->arg_type); diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 3b7ba037af95da..9d6a314dfd7a59 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -55,7 +55,7 @@ MAP COMMANDS | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** } +| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index a3d72be347b0cb..4fa4ade1ce7445 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -120,6 +120,12 @@ static bool get_datasec_ident(const char *sec_name, char *buf, size_t buf_sz) static const char *pfxs[] = { ".data", ".rodata", ".bss", ".kconfig" }; int i, n; + /* recognize hard coded LLVM section name */ + if (strcmp(sec_name, ".arena.1") == 0) { + /* this is the name to use in skeleton */ + snprintf(buf, buf_sz, "arena"); + return true; + } for (i = 0, n = ARRAY_SIZE(pfxs); i < n; i++) { const char *pfx = pfxs[i]; @@ -250,6 +256,13 @@ static const struct btf_type *find_type_for_map(struct btf *btf, const char *map static bool is_mmapable_map(const struct bpf_map *map, char *buf, size_t sz) { + size_t tmp_sz; + + if (bpf_map__type(map) == BPF_MAP_TYPE_ARENA && bpf_map__initial_value(map, &tmp_sz)) { + snprintf(buf, sz, "arena"); + return true; + } + if (!bpf_map__is_internal(map) || !(bpf_map__map_flags(map) & BPF_F_MMAPABLE)) return false; diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index f98f7bbea2b158..b89bd792c1d510 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1463,7 +1463,7 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage | bloom_filter | user_ringbuf | cgrp_storage }\n" + " task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 85ec7fc799d7cc..3c42b9f1bada3d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1009,6 +1009,7 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, + BPF_MAP_TYPE_ARENA, __MAX_BPF_MAP_TYPE }; @@ -1338,6 +1339,10 @@ enum { */ #define BPF_PSEUDO_KFUNC_CALL 2 +enum bpf_addr_space_cast { + BPF_ADDR_SPACE_CAST = 1, +}; + /* flags for BPF_MAP_UPDATE_ELEM command */ enum { BPF_ANY = 0, /* 
create new element or update existing */ @@ -1396,6 +1401,12 @@ enum { /* BPF token FD is passed in a corresponding command's token_fd field */ BPF_F_TOKEN_FD = (1U << 16), + +/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */ + BPF_F_SEGV_ON_FAULT = (1U << 17), + +/* Do not translate kernel bpf_arena pointers to user pointers */ + BPF_F_NO_USER_CONV = (1U << 18), }; /* Flags for BPF_PROG_QUERY. */ @@ -1467,6 +1478,9 @@ union bpf_attr { * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the * number of hash functions (if 0, the bloom filter will default * to using 5 hash functions). + * + * BPF_MAP_TYPE_ARENA - contains the address where user space + * is going to mmap() the arena. It has to be page aligned. */ __u64 map_extra; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 112b1504e072dc..cd17f6d0791fef 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -193,6 +193,7 @@ enum libbpf_tristate { #define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull"))) #define __arg_nullable __attribute((btf_decl_tag("arg:nullable"))) #define __arg_trusted __attribute((btf_decl_tag("arg:trusted"))) +#define __arg_arena __attribute((btf_decl_tag("arg:arena"))) #ifndef ___bpf_concat #define ___bpf_concat(a, b) a ## b diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 567ad367e7aa7a..efab29b8935bd9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -185,6 +185,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", [BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage", + [BPF_MAP_TYPE_ARENA] = "arena", }; static const char * const prog_type_name[] = { @@ -497,6 +498,7 @@ struct bpf_struct_ops { #define KSYMS_SEC ".ksyms" #define STRUCT_OPS_SEC ".struct_ops" #define STRUCT_OPS_LINK_SEC ".struct_ops.link" +#define ARENA_SEC ".arena.1" enum libbpf_map_type { LIBBPF_MAP_UNSPEC, @@ -628,6 +630,7 @@ struct elf_state { Elf *elf; Elf64_Ehdr *ehdr; Elf_Data *symbols; + Elf_Data *arena_data; size_t shstrndx; /* section index for section name strings */ size_t strtabidx; struct elf_sec_desc *secs; @@ -637,6 +640,7 @@ struct elf_state { int text_shndx; int symbols_shndx; bool has_st_ops; + int arena_data_shndx; }; struct usdt_manager; @@ -696,6 +700,10 @@ struct bpf_object { struct usdt_manager *usdt_man; + struct bpf_map *arena_map; + void *arena_data; + size_t arena_data_sz; + struct kern_feature_cache *feat_cache; char *token_path; int token_fd; @@ -1442,6 +1450,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj) elf_end(obj->efile.elf); obj->efile.elf = NULL; obj->efile.symbols = NULL; + obj->efile.arena_data = NULL; zfree(&obj->efile.secs); obj->efile.sec_cnt = 0; @@ -1684,7 +1693,7 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) return map; } -static size_t bpf_map_mmap_sz(unsigned int value_sz, unsigned int max_entries) +static size_t array_map_mmap_sz(unsigned int value_sz, unsigned int max_entries) { const long page_sz = sysconf(_SC_PAGE_SIZE); size_t map_sz; @@ -1694,6 +1703,20 @@ static size_t bpf_map_mmap_sz(unsigned int value_sz, unsigned int max_entries) return map_sz; } +static size_t bpf_map_mmap_sz(const struct bpf_map *map) +{ + const long page_sz = sysconf(_SC_PAGE_SIZE); + + switch (map->def.type) { + case BPF_MAP_TYPE_ARRAY: + return array_map_mmap_sz(map->def.value_size, map->def.max_entries); + case BPF_MAP_TYPE_ARENA: + return page_sz * 
map->def.max_entries; + default: + return 0; /* not supported */ + } +} + static int bpf_map_mmap_resize(struct bpf_map *map, size_t old_sz, size_t new_sz) { void *mmaped; @@ -1836,7 +1859,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, def->value_size = data_sz; def->max_entries = 1; def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_KCONFIG - ? BPF_F_RDONLY_PROG : 0; + ? BPF_F_RDONLY_PROG : 0; /* failures are fine because of maps like .rodata.str1.1 */ (void) map_fill_btf_type_info(obj, map); @@ -1847,7 +1870,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", map->name, map->sec_idx, map->sec_offset, def->map_flags); - mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + mmap_sz = bpf_map_mmap_sz(map); map->mmaped = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (map->mmaped == MAP_FAILED) { @@ -2828,6 +2851,32 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, return 0; } +static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, + const char *sec_name, int sec_idx, + void *data, size_t data_sz) +{ + const long page_sz = sysconf(_SC_PAGE_SIZE); + size_t mmap_sz; + + mmap_sz = bpf_map_mmap_sz(obj->arena_map); + if (roundup(data_sz, page_sz) > mmap_sz) { + pr_warn("elf: sec '%s': declared ARENA map size (%zu) is too small to hold global __arena variables of size %zu\n", + sec_name, mmap_sz, data_sz); + return -E2BIG; + } + + obj->arena_data = malloc(data_sz); + if (!obj->arena_data) + return -ENOMEM; + memcpy(obj->arena_data, data, data_sz); + obj->arena_data_sz = data_sz; + + /* make bpf_map__init_value() work for ARENA maps */ + map->mmaped = obj->arena_data; + + return 0; +} + static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, const char *pin_root_path) { @@ -2877,6 +2926,33 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, return err; } + for (i = 0; i < obj->nr_maps; i++) { + struct bpf_map *map = &obj->maps[i]; + + if (map->def.type != BPF_MAP_TYPE_ARENA) + continue; + + if (obj->arena_map) { + pr_warn("map '%s': only single ARENA map is supported (map '%s' is also ARENA)\n", + map->name, obj->arena_map->name); + return -EINVAL; + } + obj->arena_map = map; + + if (obj->efile.arena_data) { + err = init_arena_map_data(obj, map, ARENA_SEC, obj->efile.arena_data_shndx, + obj->efile.arena_data->d_buf, + obj->efile.arena_data->d_size); + if (err) + return err; + } + } + if (obj->efile.arena_data && !obj->arena_map) { + pr_warn("elf: sec '%s': to use global __arena variables the ARENA map should be explicitly declared in SEC(\".maps\")\n", + ARENA_SEC); + return -ENOENT; + } + return 0; } @@ -3756,6 +3832,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj) sec_desc->shdr = sh; sec_desc->data = data; obj->efile.has_st_ops = true; + } else if (strcmp(name, ARENA_SEC) == 0) { + obj->efile.arena_data = data; + obj->efile.arena_data_shndx = idx; } else { pr_info("elf: skipping unrecognized data section(%d) %s\n", idx, name); @@ -4385,6 +4464,15 @@ static int bpf_program__record_reloc(struct bpf_program *prog, type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + /* arena data relocation */ + if (shdr_idx == obj->efile.arena_data_shndx) { + reloc_desc->type = RELO_DATA; + reloc_desc->insn_idx = insn_idx; 
+ reloc_desc->map_idx = obj->arena_map - obj->maps; + reloc_desc->sym_off = sym->st_value; + return 0; + } + /* generic map reference relocation */ if (type == LIBBPF_MAP_UNSPEC) { if (!bpf_object__shndx_is_maps(obj, shdr_idx)) { @@ -4925,6 +5013,7 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) bpf_gen__map_freeze(obj->gen_loader, map - obj->maps); return 0; } + err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0); if (err) { err = -errno; @@ -5017,6 +5106,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: + case BPF_MAP_TYPE_ARENA: create_attr.btf_fd = 0; create_attr.btf_key_type_id = 0; create_attr.btf_value_type_id = 0; @@ -5261,7 +5351,23 @@ retry: if (err < 0) goto err_out; } - + if (map->def.type == BPF_MAP_TYPE_ARENA) { + map->mmaped = mmap((void *)map->map_extra, bpf_map_mmap_sz(map), + PROT_READ | PROT_WRITE, + map->map_extra ? MAP_SHARED | MAP_FIXED : MAP_SHARED, + map->fd, 0); + if (map->mmaped == MAP_FAILED) { + err = -errno; + map->mmaped = NULL; + pr_warn("map '%s': failed to mmap arena: %d\n", + map->name, err); + return err; + } + if (obj->arena_data) { + memcpy(map->mmaped, obj->arena_data, obj->arena_data_sz); + zfree(&obj->arena_data); + } + } if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) { err = init_map_in_map_slots(obj, map); if (err < 0) @@ -8758,13 +8864,9 @@ static void bpf_map__destroy(struct bpf_map *map) zfree(&map->init_slots); map->init_slots_sz = 0; - if (map->mmaped) { - size_t mmap_sz; - - mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); - munmap(map->mmaped, mmap_sz); - map->mmaped = NULL; - } + if (map->mmaped && map->mmaped != map->obj->arena_data) + munmap(map->mmaped, bpf_map_mmap_sz(map)); + map->mmaped = NULL; if (map->st_ops) { zfree(&map->st_ops->data); @@ -8824,6 +8926,8 @@ void bpf_object__close(struct bpf_object *obj) if (obj->token_fd > 0) close(obj->token_fd); + zfree(&obj->arena_data); + free(obj); } @@ -9995,11 +10099,14 @@ int bpf_map__set_value_size(struct bpf_map *map, __u32 size) return libbpf_err(-EBUSY); if (map->mmaped) { - int err; size_t mmap_old_sz, mmap_new_sz; + int err; + + if (map->def.type != BPF_MAP_TYPE_ARRAY) + return -EOPNOTSUPP; - mmap_old_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); - mmap_new_sz = bpf_map_mmap_sz(size, map->def.max_entries); + mmap_old_sz = bpf_map_mmap_sz(map); + mmap_new_sz = array_map_mmap_sz(size, map->def.max_entries); err = bpf_map_mmap_resize(map, mmap_old_sz, mmap_new_sz); if (err) { pr_warn("map '%s': failed to resize memory-mapped region: %d\n", @@ -10032,18 +10139,26 @@ __u32 bpf_map__btf_value_type_id(const struct bpf_map *map) int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size) { + size_t actual_sz; + if (map->obj->loaded || map->reused) return libbpf_err(-EBUSY); - if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG || - size != map->def.value_size) + if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG) + return libbpf_err(-EINVAL); + + if (map->def.type == BPF_MAP_TYPE_ARENA) + actual_sz = map->obj->arena_data_sz; + else + actual_sz = map->def.value_size; + if (size != actual_sz) return libbpf_err(-EINVAL); memcpy(map->mmaped, data, size); return 0; } -void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) +void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize) { if (bpf_map__is_struct_ops(map)) { if 
(psize) @@ -10053,7 +10168,12 @@ void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) if (!map->mmaped) return NULL; - *psize = map->def.value_size; + + if (map->def.type == BPF_MAP_TYPE_ARENA) + *psize = map->obj->arena_data_sz; + else + *psize = map->def.value_size; + return map->mmaped; } @@ -13530,7 +13650,7 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) for (i = 0; i < s->map_cnt; i++) { struct bpf_map *map = *s->maps[i].map; - size_t mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + size_t mmap_sz = bpf_map_mmap_sz(map); int prot, map_fd = map->fd; void **mmaped = s->maps[i].mmaped; @@ -13542,6 +13662,11 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) continue; } + if (map->def.type == BPF_MAP_TYPE_ARENA) { + *mmaped = map->mmaped; + continue; + } + if (map->def.map_flags & BPF_F_RDONLY_PROG) prot = PROT_READ; else diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 5723cbbfcc41d5..7b510761f545d0 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1014,7 +1014,7 @@ LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra); LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size); -LIBBPF_API void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); +LIBBPF_API void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize); /** * @brief **bpf_map__is_internal()** tells the caller whether or not the diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index ee9b1dbea9eb87..3021881224392a 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -338,6 +338,13 @@ static int probe_map_create(enum bpf_map_type map_type) key_size = 0; max_entries = 1; break; + case BPF_MAP_TYPE_ARENA: + key_size = 0; + value_size = 0; + max_entries = 1; /* one page */ + opts.map_extra = 0; /* can mmap() at any address */ + opts.map_flags = BPF_F_MMAPABLE; + break; case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PROG_ARRAY: diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index 0445ac38bc07de..d8ade15e278989 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,3 +10,5 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95 fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) +verifier_arena # JIT does not support arena +arena_htab # JIT does not support arena diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index cb810a98e78f48..f4a2f66a683ddb 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -4,3 +4,5 @@ exceptions # JIT does not support calling kfunc bpf_throw (excepti get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) stacktrace_build_id # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2 (?) 
verifier_iterating_callbacks +verifier_arena # JIT does not support arena +arena_htab # JIT does not support arena diff --git a/tools/testing/selftests/bpf/bpf_arena_alloc.h b/tools/testing/selftests/bpf/bpf_arena_alloc.h new file mode 100644 index 00000000000000..c27678299e0c96 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_alloc.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once +#include "bpf_arena_common.h" + +#ifndef __round_mask +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#endif +#ifndef round_up +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) +#endif + +#ifdef __BPF__ +#define NR_CPUS (sizeof(struct cpumask) * 8) + +static void __arena * __arena page_frag_cur_page[NR_CPUS]; +static int __arena page_frag_cur_offset[NR_CPUS]; + +/* Simple page_frag allocator */ +static inline void __arena* bpf_alloc(unsigned int size) +{ + __u64 __arena *obj_cnt; + __u32 cpu = bpf_get_smp_processor_id(); + void __arena *page = page_frag_cur_page[cpu]; + int __arena *cur_offset = &page_frag_cur_offset[cpu]; + int offset; + + size = round_up(size, 8); + if (size >= PAGE_SIZE - 8) + return NULL; + if (!page) { +refill: + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page) + return NULL; + cast_kern(page); + page_frag_cur_page[cpu] = page; + *cur_offset = PAGE_SIZE - 8; + obj_cnt = page + PAGE_SIZE - 8; + *obj_cnt = 0; + } else { + cast_kern(page); + obj_cnt = page + PAGE_SIZE - 8; + } + + offset = *cur_offset - size; + if (offset < 0) + goto refill; + + (*obj_cnt)++; + *cur_offset = offset; + return page + offset; +} + +static inline void bpf_free(void __arena *addr) +{ + __u64 __arena *obj_cnt; + + addr = (void __arena *)(((long)addr) & ~(PAGE_SIZE - 1)); + obj_cnt = addr + PAGE_SIZE - 8; + if (--(*obj_cnt) == 0) + bpf_arena_free_pages(&arena, addr, 1); +} +#else +static inline void __arena* bpf_alloc(unsigned int size) { return NULL; } +static inline void bpf_free(void __arena *addr) {} +#endif diff --git a/tools/testing/selftests/bpf/bpf_arena_common.h b/tools/testing/selftests/bpf/bpf_arena_common.h new file mode 100644 index 00000000000000..bcf195c64a45c1 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_common.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef WRITE_ONCE +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val)) +#endif + +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE (-1) +#endif + +#ifndef arena_container_of +#define arena_container_of(ptr, type, member) \ + ({ \ + void __arena *__mptr = (void __arena *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + +#ifdef __BPF__ /* when compiled as bpf program */ + +#ifndef PAGE_SIZE +#define PAGE_SIZE __PAGE_SIZE +/* + * for older kernels try sizeof(struct genradix_node) + * or flexible: + * static inline long __bpf_page_size(void) { + * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); + * } + * but generated code is not great. + */ +#endif + +#if defined(__BPF_FEATURE_ARENA_CAST) && !defined(BPF_ARENA_FORCE_ASM) +#define __arena __attribute__((address_space(1))) +#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ +#define cast_user(ptr) /* nop for bpf prog. 
emitted by LLVM */ +#else +#define __arena +#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) +#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) +#endif + +void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, + int node_id, __u64 flags) __ksym __weak; +void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; + +#else /* when compiled as user space code */ + +#define __arena +#define __arg_arena +#define cast_kern(ptr) /* nop for user space */ +#define cast_user(ptr) /* nop for user space */ +__weak char arena[1]; + +#ifndef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) +#endif + +static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, + int node_id, __u64 flags) +{ + return NULL; +} +static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) +{ +} + +#endif diff --git a/tools/testing/selftests/bpf/bpf_arena_htab.h b/tools/testing/selftests/bpf/bpf_arena_htab.h new file mode 100644 index 00000000000000..acc01a876668ec --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_htab.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once +#include <errno.h> +#include "bpf_arena_alloc.h" +#include "bpf_arena_list.h" + +struct htab_bucket { + struct arena_list_head head; +}; +typedef struct htab_bucket __arena htab_bucket_t; + +struct htab { + htab_bucket_t *buckets; + int n_buckets; +}; +typedef struct htab __arena htab_t; + +static inline htab_bucket_t *__select_bucket(htab_t *htab, __u32 hash) +{ + htab_bucket_t *b = htab->buckets; + + cast_kern(b); + return &b[hash & (htab->n_buckets - 1)]; +} + +static inline arena_list_head_t *select_bucket(htab_t *htab, __u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + +struct hashtab_elem { + int hash; + int key; + int value; + struct arena_list_node hash_node; +}; +typedef struct hashtab_elem __arena hashtab_elem_t; + +static hashtab_elem_t *lookup_elem_raw(arena_list_head_t *head, __u32 hash, int key) +{ + hashtab_elem_t *l; + + list_for_each_entry(l, head, hash_node) + if (l->hash == hash && l->key == key) + return l; + + return NULL; +} + +static int htab_hash(int key) +{ + return key; +} + +__weak int htab_lookup_elem(htab_t *htab __arg_arena, int key) +{ + hashtab_elem_t *l_old; + arena_list_head_t *head; + + cast_kern(htab); + head = select_bucket(htab, key); + l_old = lookup_elem_raw(head, htab_hash(key), key); + if (l_old) + return l_old->value; + return 0; +} + +__weak int htab_update_elem(htab_t *htab __arg_arena, int key, int value) +{ + hashtab_elem_t *l_new = NULL, *l_old; + arena_list_head_t *head; + + cast_kern(htab); + head = select_bucket(htab, key); + l_old = lookup_elem_raw(head, htab_hash(key), key); + + l_new = bpf_alloc(sizeof(*l_new)); + if (!l_new) + return -ENOMEM; + l_new->key = key; + l_new->hash = htab_hash(key); + l_new->value = value; + + list_add_head(&l_new->hash_node, head); + if (l_old) { + list_del(&l_old->hash_node); + bpf_free(l_old); + } + return 0; +} + +void htab_init(htab_t *htab) +{ + void __arena *buckets = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0); + + cast_user(buckets); + htab->buckets = buckets; + htab->n_buckets = 2 * PAGE_SIZE / sizeof(struct htab_bucket); +} diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h new file mode 100644 index 
00000000000000..b99b9f408efffc --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_list.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once +#include "bpf_arena_common.h" + +struct arena_list_node; + +typedef struct arena_list_node __arena arena_list_node_t; + +struct arena_list_node { + arena_list_node_t *next; + arena_list_node_t * __arena *pprev; +}; + +struct arena_list_head { + struct arena_list_node __arena *first; +}; +typedef struct arena_list_head __arena arena_list_head_t; + +#define list_entry(ptr, type, member) arena_container_of(ptr, type, member) + +#define list_entry_safe(ptr, type, member) \ + ({ typeof(*ptr) * ___ptr = (ptr); \ + ___ptr ? ({ cast_kern(___ptr); list_entry(___ptr, type, member); }) : NULL; \ + }) + +#ifndef __BPF__ +static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { return NULL; } +static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {} +static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; } +#define cond_break ({}) +#endif + +/* Safely walk link list elements. Deletion of elements is allowed. */ +#define list_for_each_entry(pos, head, member) \ + for (void * ___tmp = (pos = list_entry_safe((head)->first, \ + typeof(*(pos)), member), \ + (void *)0); \ + pos && ({ ___tmp = (void *)pos->member.next; 1; }); \ + cond_break, \ + pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member)) + +static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h) +{ + arena_list_node_t *first = h->first, * __arena *tmp; + + cast_user(first); + cast_kern(n); + WRITE_ONCE(n->next, first); + cast_kern(first); + if (first) { + tmp = &n->next; + cast_user(tmp); + WRITE_ONCE(first->pprev, tmp); + } + cast_user(n); + WRITE_ONCE(h->first, n); + + tmp = &h->first; + cast_user(tmp); + cast_kern(n); + WRITE_ONCE(n->pprev, tmp); +} + +static inline void __list_del(arena_list_node_t *n) +{ + arena_list_node_t *next = n->next, *tmp; + arena_list_node_t * __arena *pprev = n->pprev; + + cast_user(next); + cast_kern(pprev); + tmp = *pprev; + cast_kern(tmp); + WRITE_ONCE(tmp, next); + if (next) { + cast_user(pprev); + cast_kern(next); + WRITE_ONCE(next->pprev, pprev); + } +} + +#define POISON_POINTER_DELTA 0 + +#define LIST_POISON1 ((void __arena *) 0x100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void __arena *) 0x122 + POISON_POINTER_DELTA) + +static inline void list_del(arena_list_node_t *n) +{ + __list_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index bc9a0832ae725a..a5b9df38c16259 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -343,6 +343,49 @@ l_true: \ asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var)) #endif +/* emit instruction: + * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as + */ +#ifndef bpf_addr_space_cast +#define bpf_addr_space_cast(var, dst_as, src_as)\ + asm volatile(".byte 0xBF; \ + .ifc %[reg], r0; \ + .byte 0x00; \ + .endif; \ + .ifc %[reg], r1; \ + .byte 0x11; \ + .endif; \ + .ifc %[reg], r2; \ + .byte 0x22; \ + .endif; \ + .ifc %[reg], r3; \ + .byte 0x33; \ + .endif; \ + .ifc %[reg], r4; \ + .byte 0x44; \ + .endif; \ + .ifc %[reg], r5; \ + .byte 0x55; \ + .endif; \ + .ifc %[reg], r6; \ + .byte 0x66; \ + .endif; \ + .ifc %[reg], r7; \ + .byte 0x77; \ 
+ .endif; \ + .ifc %[reg], r8; \ + .byte 0x88; \ + .endif; \ + .ifc %[reg], r9; \ + .byte 0x99; \ + .endif; \ + .short %[off]; \ + .long %[as]" \ + : [reg]"+r"(var) \ + : [off]"i"(BPF_ADDR_SPACE_CAST) \ + , [as]"i"((dst_as << 16) | src_as)); +#endif + /* Description * Assert that a conditional expression is true. * Returns diff --git a/tools/testing/selftests/bpf/prog_tests/arena_htab.c b/tools/testing/selftests/bpf/prog_tests/arena_htab.c new file mode 100644 index 00000000000000..0766702de84657 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_htab.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <test_progs.h> +#include <sys/mman.h> +#include <network_helpers.h> + +#include "arena_htab_asm.skel.h" +#include "arena_htab.skel.h" + +#define PAGE_SIZE 4096 + +#include "bpf_arena_htab.h" + +static void test_arena_htab_common(struct htab *htab) +{ + int i; + + printf("htab %p buckets %p n_buckets %d\n", htab, htab->buckets, htab->n_buckets); + ASSERT_OK_PTR(htab->buckets, "htab->buckets shouldn't be NULL"); + for (i = 0; htab->buckets && i < 16; i += 4) { + /* + * Walk htab buckets and link lists since all pointers are correct, + * though they were written by bpf program. + */ + int val = htab_lookup_elem(htab, i); + + ASSERT_EQ(i, val, "key == value"); + } +} + +static void test_arena_htab_llvm(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct arena_htab *skel; + struct htab *htab; + size_t arena_sz; + void *area; + int ret; + + skel = arena_htab__open_and_load(); + if (!ASSERT_OK_PTR(skel, "arena_htab__open_and_load")) + return; + + area = bpf_map__initial_value(skel->maps.arena, &arena_sz); + /* fault-in a page with pgoff == 0 as sanity check */ + *(volatile int *)area = 0x55aa; + + /* bpf prog will allocate more pages */ + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_htab_llvm), &opts); + ASSERT_OK(ret, "ret"); + ASSERT_OK(opts.retval, "retval"); + if (skel->bss->skip) { + printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__); + test__skip(); + goto out; + } + htab = skel->bss->htab_for_user; + test_arena_htab_common(htab); +out: + arena_htab__destroy(skel); +} + +static void test_arena_htab_asm(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct arena_htab_asm *skel; + struct htab *htab; + int ret; + + skel = arena_htab_asm__open_and_load(); + if (!ASSERT_OK_PTR(skel, "arena_htab_asm__open_and_load")) + return; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_htab_asm), &opts); + ASSERT_OK(ret, "ret"); + ASSERT_OK(opts.retval, "retval"); + htab = skel->bss->htab_for_user; + test_arena_htab_common(htab); + arena_htab_asm__destroy(skel); +} + +void test_arena_htab(void) +{ + if (test__start_subtest("arena_htab_llvm")) + test_arena_htab_llvm(); + if (test__start_subtest("arena_htab_asm")) + test_arena_htab_asm(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/arena_list.c b/tools/testing/selftests/bpf/prog_tests/arena_list.c new file mode 100644 index 00000000000000..e61886debab127 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_list.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ +#include <test_progs.h> +#include <sys/mman.h> +#include <network_helpers.h> + +#define PAGE_SIZE 4096 + +#include "bpf_arena_list.h" +#include "arena_list.skel.h" + +struct elem { + struct arena_list_node node; + __u64 value; +}; + +static int list_sum(struct arena_list_head *head) +{ + struct elem __arena *n; + int sum = 0; + + list_for_each_entry(n, head, node) + sum += n->value; + return sum; +} + +static void test_arena_list_add_del(int cnt) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct arena_list *skel; + int expected_sum = (u64)cnt * (cnt - 1) / 2; + int ret, sum; + + skel = arena_list__open_and_load(); + if (!ASSERT_OK_PTR(skel, "arena_list__open_and_load")) + return; + + skel->bss->cnt = cnt; + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_add), &opts); + ASSERT_OK(ret, "ret_add"); + ASSERT_OK(opts.retval, "retval"); + if (skel->bss->skip) { + printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__); + test__skip(); + goto out; + } + sum = list_sum(skel->bss->list_head); + ASSERT_EQ(sum, expected_sum, "sum of elems"); + ASSERT_EQ(skel->arena->arena_sum, expected_sum, "__arena sum of elems"); + ASSERT_EQ(skel->arena->test_val, cnt + 1, "num of elems"); + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_del), &opts); + ASSERT_OK(ret, "ret_del"); + sum = list_sum(skel->bss->list_head); + ASSERT_EQ(sum, 0, "sum of list elems after del"); + ASSERT_EQ(skel->bss->list_sum, expected_sum, "sum of list elems computed by prog"); + ASSERT_EQ(skel->arena->arena_sum, expected_sum, "__arena sum of elems"); +out: + arena_list__destroy(skel); +} + +void test_arena_list(void) +{ + if (test__start_subtest("arena_list_1")) + test_arena_list_add_del(1); + if (test__start_subtest("arena_list_1000")) + test_arena_list_add_del(1000); +} diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 9c6072a1974569..985273832f891c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -4,6 +4,7 @@ #include "cap_helpers.h" #include "verifier_and.skel.h" +#include "verifier_arena.skel.h" #include "verifier_array_access.skel.h" #include "verifier_basic_stack.skel.h" #include "verifier_bitfield_write.skel.h" @@ -118,6 +119,7 @@ static void run_tests_aux(const char *skel_name, #define RUN(skel) run_tests_aux(#skel, skel##__elf_bytes, NULL) void test_verifier_and(void) { RUN(verifier_and); } +void test_verifier_arena(void) { RUN(verifier_arena); } void test_verifier_basic_stack(void) { RUN(verifier_basic_stack); } void test_verifier_bitfield_write(void) { RUN(verifier_bitfield_write); } void test_verifier_bounds(void) { RUN(verifier_bounds); } diff --git a/tools/testing/selftests/bpf/progs/arena_htab.c b/tools/testing/selftests/bpf/progs/arena_htab.c new file mode 100644 index 00000000000000..b7bb712cacfdcc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_htab.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include "bpf_experimental.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 100); /* number of pages */ +} arena SEC(".maps"); + +#include "bpf_arena_htab.h" + +void __arena *htab_for_user; +bool skip = false; + +int zero = 0; + +SEC("syscall") +int arena_htab_llvm(void *ctx) +{ +#if defined(__BPF_FEATURE_ARENA_CAST) || defined(BPF_ARENA_FORCE_ASM) + struct htab __arena *htab; + __u64 i; + + htab = bpf_alloc(sizeof(*htab)); + cast_kern(htab); + htab_init(htab); + + /* first run. No old elems in the table */ + for (i = zero; i < 1000; i++) + htab_update_elem(htab, i, i); + + /* should replace all elems with new ones */ + for (i = zero; i < 1000; i++) + htab_update_elem(htab, i, i); + cast_user(htab); + htab_for_user = htab; +#else + skip = true; +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/arena_htab_asm.c b/tools/testing/selftests/bpf/progs/arena_htab_asm.c new file mode 100644 index 00000000000000..6cd70ea12f0d7e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_htab_asm.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#define BPF_ARENA_FORCE_ASM +#define arena_htab_llvm arena_htab_asm +#include "arena_htab.c" diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c new file mode 100644 index 00000000000000..cd35b844843560 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_list.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include "bpf_experimental.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 100); /* number of pages */ +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, 0x1ull << 32); /* start of mmap() region */ +#else + __ulong(map_extra, 0x1ull << 44); /* start of mmap() region */ +#endif +} arena SEC(".maps"); + +#include "bpf_arena_alloc.h" +#include "bpf_arena_list.h" + +struct elem { + struct arena_list_node node; + __u64 value; +}; + +struct arena_list_head __arena *list_head; +int list_sum; +int cnt; +bool skip = false; + +#ifdef __BPF_FEATURE_ARENA_CAST +long __arena arena_sum; +int __arena test_val = 1; +struct arena_list_head __arena global_head; +#else +long arena_sum SEC(".arena.1"); +int test_val SEC(".arena.1"); +#endif + +int zero; + +SEC("syscall") +int arena_list_add(void *ctx) +{ +#ifdef __BPF_FEATURE_ARENA_CAST + __u64 i; + + list_head = &global_head; + + for (i = zero; i < cnt; cond_break, i++) { + struct elem __arena *n = bpf_alloc(sizeof(*n)); + + test_val++; + n->value = i; + arena_sum += i; + list_add_head(&n->node, list_head); + } +#else + skip = true; +#endif + return 0; +} + +SEC("syscall") +int arena_list_del(void *ctx) +{ +#ifdef __BPF_FEATURE_ARENA_CAST + struct elem __arena *n; + int sum = 0; + + arena_sum = 0; + list_for_each_entry(n, list_head, node) { + sum += n->value; + arena_sum += n->value; + list_del(&n->node); + bpf_free(n); + } + list_sum = sum; +#else + skip = true; +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c new file mode 100644 index 00000000000000..5540b05ff9ee13 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_experimental.h" +#include "bpf_arena_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 2); /* arena of two pages close to 32-bit boundary*/ + __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * 2 + 1)); /* start of mmap() region */ +} arena SEC(".maps"); + +SEC("syscall") +__success __retval(0) +int basic_alloc1(void *ctx) +{ +#if defined(__BPF_FEATURE_ARENA_CAST) + volatile int __arena *page1, *page2, *no_page, *page3; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page1) + return 1; + *page1 = 1; + page2 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page2) + return 2; + *page2 = 2; + no_page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (no_page) + return 3; + if (*page1 != 1) + return 4; + if (*page2 != 2) + return 5; + bpf_arena_free_pages(&arena, (void __arena *)page2, 1); + if (*page1 != 1) + return 6; + if (*page2 != 0) /* use-after-free should return 0 */ + return 7; + page3 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page3) + return 8; + *page3 = 3; + if (page2 != page3) + return 9; + if (*page1 != 1) + return 10; +#endif + return 0; +} + +SEC("syscall") +__success __retval(0) +int basic_alloc2(void *ctx) +{ +#if defined(__BPF_FEATURE_ARENA_CAST) + volatile char __arena *page1, *page2, *page3, *page4; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0); + if (!page1) + return 1; + page2 = page1 + __PAGE_SIZE; + page3 = page1 + __PAGE_SIZE * 2; + page4 = page1 - __PAGE_SIZE; + *page1 = 1; + *page2 = 2; + *page3 = 3; + *page4 = 4; + if (*page1 != 1) + return 1; + if (*page2 != 2) + return 2; + if (*page3 != 0) + return 3; + if (*page4 != 0) + return 4; + bpf_arena_free_pages(&arena, (void __arena *)page1, 2); + if (*page1 != 0) + return 5; + if (*page2 != 0) + return 6; + if (*page3 != 0) + return 7; + if (*page4 != 0) + return 8; +#endif + return 0; +} + +struct bpf_arena___l { + struct bpf_map map; +} __attribute__((preserve_access_index)); + +SEC("syscall") +__success __retval(0) __log_level(2) +int basic_alloc3(void *ctx) +{ + struct bpf_arena___l *ar = (struct bpf_arena___l *)&arena; + volatile char __arena *pages; + + pages = bpf_arena_alloc_pages(&ar->map, NULL, ar->map.max_entries, NUMA_NO_NODE, 0); + if (!pages) + return 1; + return 0; +} + +SEC("iter.s/bpf_map") +__success __log_level(2) +int iter_maps1(struct bpf_iter__bpf_map *ctx) +{ + struct bpf_map *map = ctx->map; + + if (!map) + return 0; + bpf_arena_alloc_pages(map, NULL, map->max_entries, 0, 0); + return 0; +} + +SEC("iter.s/bpf_map") +__failure __msg("expected pointer to STRUCT bpf_map") +int iter_maps2(struct bpf_iter__bpf_map *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + + bpf_arena_alloc_pages((void *)seq, NULL, 1, 0, 0); + return 0; +} + +SEC("iter.s/bpf_map") +__failure __msg("untrusted_ptr_bpf_map") +int iter_maps3(struct bpf_iter__bpf_map *ctx) +{ + struct bpf_map *map = ctx->map; + + if (!map) + return 0; + bpf_arena_alloc_pages(map->inner_map_meta, NULL, map->max_entries, 0, 0); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index ba57601c2a4d5b..524c38e9cde48f 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -501,7 +501,7 @@ static bool 
is_unpriv_capable_map(struct bpf_map *map) } } -static int do_prog_test_run(int fd_prog, int *retval) +static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts) { __u8 tmp_out[TEST_DATA_LEN << 2] = {}; __u8 tmp_in[TEST_DATA_LEN] = {}; @@ -514,6 +514,10 @@ static int do_prog_test_run(int fd_prog, int *retval) .repeat = 1, ); + if (empty_opts) { + memset(&topts, 0, sizeof(struct bpf_test_run_opts)); + topts.sz = sizeof(struct bpf_test_run_opts); + } err = bpf_prog_test_run_opts(fd_prog, &topts); saved_errno = errno; @@ -649,7 +653,8 @@ void run_subtest(struct test_loader *tester, } } - do_prog_test_run(bpf_program__fd(tprog), &retval); + do_prog_test_run(bpf_program__fd(tprog), &retval, + bpf_program__type(tprog) == BPF_PROG_TYPE_SYSCALL ? true : false); if (retval != subspec->retval && subspec->retval != POINTER_VALUE) { PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval); goto tobj_cleanup;
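
For illustration only, not part of the patches above: a minimal BPF-side sketch of the arena usage pattern that the selftests in this series exercise. It assumes the bpf_arena_common.h helpers added here and an LLVM build with arena address-space-cast support; the file, program and variable names (arena_example.bpf.c, touch_arena, counter) are made up for the example.

    /* arena_example.bpf.c -- hypothetical file name */
    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>
    #include "bpf_arena_common.h"

    struct {
        __uint(type, BPF_MAP_TYPE_ARENA);
        __uint(map_flags, BPF_F_MMAPABLE);
        __uint(max_entries, 10);        /* number of arena pages */
    } arena SEC(".maps");

    int __arena counter;        /* __arena global, visible to user space through the arena mmap */

    SEC("syscall")
    int touch_arena(void *ctx)
    {
        volatile __u64 __arena *slot;

        /* ask the kernel for one arena page ... */
        slot = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
        if (!slot)
            return 1;
        /* ... and access it with plain C loads and stores */
        *slot = 42;
        counter++;
        return 0;
    }

    char _license[] SEC("license") = "GPL";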
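
A corresponding user-space sketch, mirroring prog_tests/arena_list.c and arena_htab.c above: libbpf mmap()s the arena when the map is created, so after load both the skeleton's skel->arena region and bpf_map__initial_value() point at the same mapping the program writes to. The skeleton and program names continue the hypothetical example above; the libbpf calls (bpf_map__initial_value, bpf_prog_test_run_opts, bpf_program__fd) are the ones used by the selftests in this series.

    /* arena_example.c -- hypothetical user-space loader */
    #include <stdio.h>
    #include <bpf/bpf.h>
    #include <bpf/libbpf.h>
    #include "arena_example.skel.h"    /* from: bpftool gen skeleton arena_example.bpf.o */

    int main(void)
    {
        LIBBPF_OPTS(bpf_test_run_opts, opts);
        struct arena_example *skel;
        size_t arena_sz;
        void *area;

        skel = arena_example__open_and_load();
        if (!skel)
            return 1;

        /* user-space view of the arena pages; same memory the bpf prog sees */
        area = bpf_map__initial_value(skel->maps.arena, &arena_sz);

        bpf_prog_test_run_opts(bpf_program__fd(skel->progs.touch_arena), &opts);

        printf("arena at %p (%zu bytes of __arena globals), counter = %d\n",
               area, arena_sz, skel->arena->counter);

        arena_example__destroy(skel);
        return 0;
    }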