From 3e49a866c9dcbd8173e4f3e491293619a9e81fa4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 4 Mar 2024 19:05:15 -0800 Subject: mm: Enforce VM_IOREMAP flag and range in ioremap_page_range. There are various users of get_vm_area() + ioremap_page_range() APIs. Enforce that get_vm_area() was requested as VM_IOREMAP type and range passed to ioremap_page_range() matches created vm_area to avoid accidentally ioremap-ing into wrong address range. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/bpf/20240305030516.41519-2-alexei.starovoitov@gmail.com --- mm/vmalloc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d12a17fc0c171..f42f98a127d56 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -307,8 +307,21 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end, int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { + struct vm_struct *area; int err; + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_IOREMAP)) { + WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr); + return -EINVAL; + } + if (addr != (unsigned long)area->addr || + (void *)end != area->addr + get_vm_area_size(area)) { + WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n", + addr, end, (long)area->addr, + (long)area->addr + get_vm_area_size(area)); + return -ERANGE; + } err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), ioremap_max_page_shift); flush_cache_vmap(addr, end); -- cgit 1.2.3-korg From e6f798225a31485e47a6e4f6aa07ee9fdf80c2cb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 4 Mar 2024 19:05:16 -0800 Subject: mm: Introduce VM_SPARSE kind and vm_area_[un]map_pages(). vmap/vmalloc APIs are used to map a set of pages into contiguous kernel virtual space. get_vm_area() with appropriate flag is used to request an area of kernel address range. It's used for vmalloc, vmap, ioremap, xen use cases. - vmalloc use case dominates the usage. Such vm areas have VM_ALLOC flag. - the areas created by vmap() function should be tagged with VM_MAP. - ioremap areas are tagged with VM_IOREMAP. BPF would like to extend the vmap API to implement a lazily-populated sparse, yet contiguous kernel virtual space. Introduce VM_SPARSE flag and vm_area_map_pages(area, start_addr, count, pages) API to map a set of pages within a given area. It has the same sanity checks as vmap() does. It also checks that get_vm_area() was created with VM_SPARSE flag which identifies such areas in /proc/vmallocinfo and returns zero pages on read through /proc/kcore. The next commits will introduce bpf_arena which is a sparsely populated shared memory region between bpf program and user space process. It will map privately-managed pages into a sparse vm area with the following steps: // request virtual memory region during bpf prog verification area = get_vm_area(area_size, VM_SPARSE); // on demand vm_area_map_pages(area, kaddr, kend, pages); vm_area_unmap_pages(area, kaddr, kend); // after bpf program is detached and unloaded free_vm_area(area); Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Reviewed-by: Christoph Hellwig Reviewed-by: Pasha Tatashin Link: https://lore.kernel.org/bpf/20240305030516.41519-3-alexei.starovoitov@gmail.com --- include/linux/vmalloc.h | 5 +++++ mm/vmalloc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c720be70c8ddd..0f72c85a377be 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -35,6 +35,7 @@ struct iov_iter; /* in uio.h */ #else #define VM_DEFER_KMEMLEAK 0 #endif +#define VM_SPARSE 0x00001000 /* sparse vm_area. not all pages are present. */ /* bits [20..32] reserved for arch specific ioremap internals */ @@ -232,6 +233,10 @@ static inline bool is_vm_area_hugepages(const void *addr) } #ifdef CONFIG_MMU +int vm_area_map_pages(struct vm_struct *area, unsigned long start, + unsigned long end, struct page **pages); +void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, + unsigned long end); void vunmap_range(unsigned long addr, unsigned long end); static inline void set_vm_flush_reset_perms(void *addr) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f42f98a127d56..e5b8c70950bc7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -648,6 +648,58 @@ static int vmap_pages_range(unsigned long addr, unsigned long end, return err; } +static int check_sparse_vm_area(struct vm_struct *area, unsigned long start, + unsigned long end) +{ + might_sleep(); + if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS)) + return -EINVAL; + if (WARN_ON_ONCE(area->flags & VM_NO_GUARD)) + return -EINVAL; + if (WARN_ON_ONCE(!(area->flags & VM_SPARSE))) + return -EINVAL; + if ((end - start) >> PAGE_SHIFT > totalram_pages()) + return -E2BIG; + if (start < (unsigned long)area->addr || + (void *)end > area->addr + get_vm_area_size(area)) + return -ERANGE; + return 0; +} + +/** + * vm_area_map_pages - map pages inside given sparse vm_area + * @area: vm_area + * @start: start address inside vm_area + * @end: end address inside vm_area + * @pages: pages to map (always PAGE_SIZE pages) + */ +int vm_area_map_pages(struct vm_struct *area, unsigned long start, + unsigned long end, struct page **pages) +{ + int err; + + err = check_sparse_vm_area(area, start, end); + if (err) + return err; + + return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT); +} + +/** + * vm_area_unmap_pages - unmap pages inside given sparse vm_area + * @area: vm_area + * @start: start address inside vm_area + * @end: end address inside vm_area + */ +void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, + unsigned long end) +{ + if (check_sparse_vm_area(area, start, end)) + return; + + vunmap_range(start, end); +} + int is_vmalloc_or_module_addr(const void *x) { /* @@ -3822,9 +3874,9 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if (flags & VMAP_RAM) copied = vmap_ram_vread_iter(iter, addr, n, flags); - else if (!(vm && (vm->flags & VM_IOREMAP))) + else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE)))) copied = aligned_vread_iter(iter, addr, n); - else /* IOREMAP area is treated as memory hole */ + else /* IOREMAP | SPARSE area is treated as memory hole */ copied = zero_iter(iter, n); addr += copied; @@ -4415,6 +4467,9 @@ static int s_show(struct seq_file *m, void *p) if (v->flags & VM_IOREMAP) seq_puts(m, " ioremap"); + if (v->flags & VM_SPARSE) + seq_puts(m, " sparse"); + if (v->flags & VM_ALLOC) seq_puts(m, " vmalloc"); -- cgit 1.2.3-korg From d7bca9199a27b8690ae1c71dc11f825154af7234 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 8 Mar 2024 09:12:54 -0800 Subject: mm: Introduce vmap_page_range() to map pages in PCI address space ioremap_page_range() should be used for ranges within vmalloc range only. The vmalloc ranges are allocated by get_vm_area(). PCI has "resource" allocator that manages PCI_IOBASE, IO_SPACE_LIMIT address range, hence introduce vmap_page_range() to be used exclusively to map pages in PCI address space. Fixes: 3e49a866c9dc ("mm: Enforce VM_IOREMAP flag and range in ioremap_page_range.") Reported-by: Miguel Ojeda Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Christoph Hellwig Tested-by: Miguel Ojeda Link: https://lore.kernel.org/bpf/CANiq72ka4rir+RTN2FQoT=Vvprp_Ao-CvoYEkSNqtSY+RZj+AA@mail.gmail.com --- arch/arm/mm/ioremap.c | 8 ++++---- arch/loongarch/kernel/setup.c | 2 +- arch/mips/loongson64/init.c | 2 +- arch/powerpc/kernel/isa-bridge.c | 4 ++-- drivers/pci/pci.c | 4 ++-- include/linux/io.h | 7 +++++++ mm/vmalloc.c | 23 +++++++++++++++-------- 7 files changed, 32 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 2129070065c32..794cfea9f9d4c 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -110,8 +110,8 @@ void __init add_static_vm_early(struct static_vm *svm) int ioremap_page(unsigned long virt, unsigned long phys, const struct mem_type *mtype) { - return ioremap_page_range(virt, virt + PAGE_SIZE, phys, - __pgprot(mtype->prot_pte)); + return vmap_page_range(virt, virt + PAGE_SIZE, phys, + __pgprot(mtype->prot_pte)); } EXPORT_SYMBOL(ioremap_page); @@ -466,8 +466,8 @@ int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr) if (res->end > IO_SPACE_LIMIT) return -EINVAL; - return ioremap_page_range(vaddr, vaddr + resource_size(res), phys_addr, - __pgprot(get_mem_type(pci_ioremap_mem_type)->prot_pte)); + return vmap_page_range(vaddr, vaddr + resource_size(res), phys_addr, + __pgprot(get_mem_type(pci_ioremap_mem_type)->prot_pte)); } EXPORT_SYMBOL(pci_remap_iospace); diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 634ef17fd38bf..fd915ad69c09b 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -490,7 +490,7 @@ static int __init add_legacy_isa_io(struct fwnode_handle *fwnode, } vaddr = (unsigned long)(PCI_IOBASE + range->io_start); - ioremap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL)); + vmap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL)); return 0; } diff --git a/arch/mips/loongson64/init.c b/arch/mips/loongson64/init.c index 553142c1f14fe..a35dd73117958 100644 --- a/arch/mips/loongson64/init.c +++ b/arch/mips/loongson64/init.c @@ -180,7 +180,7 @@ static int __init add_legacy_isa_io(struct fwnode_handle *fwnode, resource_size_ vaddr = PCI_IOBASE + range->io_start; - ioremap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL)); + vmap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL)); return 0; } diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 48e0eaf1ad615..5c064485197a9 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -46,8 +46,8 @@ static void remap_isa_base(phys_addr_t pa, unsigned long size) WARN_ON_ONCE(size & ~PAGE_MASK); if (slab_is_available()) { - if (ioremap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa, - pgprot_noncached(PAGE_KERNEL))) + if (vmap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa, + pgprot_noncached(PAGE_KERNEL))) vunmap_range(ISA_IO_BASE, ISA_IO_BASE + size); } else { early_ioremap_range(ISA_IO_BASE, pa, size, diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c3585229c12a2..ccee56615f784 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4353,8 +4353,8 @@ int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr) if (res->end > IO_SPACE_LIMIT) return -EINVAL; - return ioremap_page_range(vaddr, vaddr + resource_size(res), phys_addr, - pgprot_device(PAGE_KERNEL)); + return vmap_page_range(vaddr, vaddr + resource_size(res), phys_addr, + pgprot_device(PAGE_KERNEL)); #else /* * This architecture does not have memory mapped I/O space, diff --git a/include/linux/io.h b/include/linux/io.h index 7304f2a69960a..235ba7d80a8f0 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -23,12 +23,19 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count); #ifdef CONFIG_MMU int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot); +int vmap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot); #else static inline int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { return 0; } +static inline int vmap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) +{ + return 0; +} #endif /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e5b8c70950bc7..1e36322d83d89 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -304,11 +304,24 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end, return err; } +int vmap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) +{ + int err; + + err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), + ioremap_max_page_shift); + flush_cache_vmap(addr, end); + if (!err) + err = kmsan_ioremap_page_range(addr, end, phys_addr, prot, + ioremap_max_page_shift); + return err; +} + int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { struct vm_struct *area; - int err; area = find_vm_area((void *)addr); if (!area || !(area->flags & VM_IOREMAP)) { @@ -322,13 +335,7 @@ int ioremap_page_range(unsigned long addr, unsigned long end, (long)area->addr + get_vm_area_size(area)); return -ERANGE; } - err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), - ioremap_max_page_shift); - flush_cache_vmap(addr, end); - if (!err) - err = kmsan_ioremap_page_range(addr, end, phys_addr, prot, - ioremap_max_page_shift); - return err; + return vmap_page_range(addr, end, phys_addr, prot); } static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, -- cgit 1.2.3-korg