// kernel/mm/virt.rs
1// SPDX-License-Identifier: GPL-2.0
2
3// Copyright (C) 2024 Google LLC.
4
5//! Virtual memory.
6//!
7//! This module deals with managing a single VMA in the address space of a userspace process. Each
8//! VMA corresponds to a region of memory that the userspace process can access, and the VMA lets
9//! you control what happens when userspace reads or writes to that region of memory.
10//!
11//! The module has several different Rust types that all correspond to the C type called
12//! `vm_area_struct`. The different structs represent what kind of access you have to the VMA, e.g.
13//! [`VmaRef`] is used when you hold the mmap or vma read lock. Using the appropriate struct
14//! ensures that you can't, for example, accidentally call a function that requires holding the
15//! write lock when you only hold the read lock.
16
17use crate::{
18 bindings,
19 error::{code::EINVAL, to_result, Result},
20 mm::MmWithUser,
21 page::Page,
22 types::Opaque,
23};
24
25use core::ops::Deref;
26
/// A wrapper for the kernel's `struct vm_area_struct` with read access.
///
/// It represents an area of virtual memory.
///
/// # Invariants
///
/// The caller must hold the mmap read lock or the vma read lock.
#[repr(transparent)]
pub struct VmaRef {
    // The wrapped C vma. `Opaque` because the C side may concurrently mutate fields in ways the
    // Rust type system does not track; all access goes through raw pointers (`as_ptr`).
    vma: Opaque<bindings::vm_area_struct>,
}
38
// Methods you can call when holding the mmap or vma read lock (or stronger). They must be usable
// no matter what the vma flags are.
impl VmaRef {
    /// Access a virtual memory area given a raw pointer.
    ///
    /// # Safety
    ///
    /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap or vma
    /// read lock (or stronger) is held for at least the duration of 'a.
    #[inline]
    pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self {
        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
        // The cast is sound because `VmaRef` is `#[repr(transparent)]` over the C struct.
        unsafe { &*vma.cast() }
    }

    /// Returns a raw pointer to this area.
    #[inline]
    pub fn as_ptr(&self) -> *mut bindings::vm_area_struct {
        self.vma.get()
    }

    /// Access the underlying `mm_struct`.
    #[inline]
    pub fn mm(&self) -> &MmWithUser {
        // SAFETY: By the type invariants, this `vm_area_struct` is valid and we hold the mmap/vma
        // read lock or stronger. This implies that the underlying mm has a non-zero value of
        // `mm_users`.
        unsafe { MmWithUser::from_raw((*self.as_ptr()).vm_mm) }
    }

    /// Returns the flags associated with the virtual memory area.
    ///
    /// The possible flags are a combination of the constants in [`flags`].
    #[inline]
    pub fn flags(&self) -> vm_flags_t {
        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
        // access is not a data race.
        unsafe { (*self.as_ptr()).__bindgen_anon_2.vm_flags }
    }

    /// Returns the (inclusive) start address of the virtual memory area.
    #[inline]
    pub fn start(&self) -> usize {
        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
        // access is not a data race.
        unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_start }
    }

    /// Returns the (exclusive) end address of the virtual memory area.
    #[inline]
    pub fn end(&self) -> usize {
        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
        // access is not a data race.
        unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_end }
    }

    /// Zap pages in the given page range.
    ///
    /// This clears page table mappings for the range at the leaf level, leaving all other page
    /// tables intact, and freeing any memory referenced by the VMA in this range. That is,
    /// anonymous memory is completely freed, file-backed memory has its reference count on page
    /// cache folios dropped, any dirty data will still be written back to disk as usual.
    ///
    /// It may seem odd that we clear at the leaf level, this is however a product of the page
    /// table structure used to map physical memory into a virtual address space - each virtual
    /// address actually consists of a bitmap of array indices into page tables, which form a
    /// hierarchical page table level structure.
    ///
    /// As a result, each page table level maps a multiple of page table levels below, and thus
    /// span ever increasing ranges of pages. At the leaf or PTE level, we map the actual physical
    /// memory.
    ///
    /// It is here where a zap operates, as it is the only place we can be certain of clearing
    /// without impacting any other virtual mappings. It is an implementation detail as to whether
    /// the kernel goes further in freeing unused page tables, but for the purposes of this
    /// operation we must only assume that the leaf level is cleared.
    #[inline]
    pub fn zap_page_range_single(&self, address: usize, size: usize) {
        // Reject ranges that wrap around the address space or fall outside [start, end).
        let (end, did_overflow) = address.overflowing_add(size);
        if did_overflow || address < self.start() || self.end() < end {
            // TODO: call WARN_ONCE once Rust version of it is added
            return;
        }

        // SAFETY: By the type invariants, the caller has read access to this VMA, which is
        // sufficient for this method call. This method has no requirements on the vma flags. The
        // address range is checked to be within the vma.
        unsafe {
            bindings::zap_page_range_single(self.as_ptr(), address, size, core::ptr::null_mut())
        };
    }

    /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise
    /// returns `None`.
    ///
    /// This can be used to access methods that require [`VM_MIXEDMAP`] to be set.
    ///
    /// [`VM_MIXEDMAP`]: flags::MIXEDMAP
    #[inline]
    pub fn as_mixedmap_vma(&self) -> Option<&VmaMixedMap> {
        if self.flags() & flags::MIXEDMAP != 0 {
            // SAFETY: We just checked that `VM_MIXEDMAP` is set. All other requirements are
            // satisfied by the type invariants of `VmaRef`.
            Some(unsafe { VmaMixedMap::from_raw(self.as_ptr()) })
        } else {
            None
        }
    }
}
148
/// A wrapper for the kernel's `struct vm_area_struct` with read access and [`VM_MIXEDMAP`] set.
///
/// It represents an area of virtual memory.
///
/// This struct is identical to [`VmaRef`] except that it must only be used when the
/// [`VM_MIXEDMAP`] flag is set on the vma.
///
/// # Invariants
///
/// The caller must hold the mmap read lock or the vma read lock. The `VM_MIXEDMAP` flag must be
/// set.
///
/// [`VM_MIXEDMAP`]: flags::MIXEDMAP
#[repr(transparent)]
pub struct VmaMixedMap {
    // Transparent wrapper so the `Deref` impl below can hand out `&VmaRef` for free.
    vma: VmaRef,
}
166
// Make all `VmaRef` methods available on `VmaMixedMap`. This is a strict widening: every
// `VmaRef` method is safe to call under the (stronger) `VmaMixedMap` invariants.
impl Deref for VmaMixedMap {
    type Target = VmaRef;

    #[inline]
    fn deref(&self) -> &VmaRef {
        &self.vma
    }
}
176
impl VmaMixedMap {
    /// Access a virtual memory area given a raw pointer.
    ///
    /// # Safety
    ///
    /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap read lock
    /// (or stronger) is held for at least the duration of 'a. The `VM_MIXEDMAP` flag must be set.
    #[inline]
    pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self {
        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
        // The cast is sound because `VmaMixedMap` is `#[repr(transparent)]` over `VmaRef`, which
        // is itself transparent over the C struct.
        unsafe { &*vma.cast() }
    }

    /// Maps a single page at the given address within the virtual memory area.
    ///
    /// This operation does not take ownership of the page.
    ///
    /// Returns an error if the underlying `vm_insert_page` call fails.
    #[inline]
    pub fn vm_insert_page(&self, address: usize, page: &Page) -> Result {
        // SAFETY: By the type invariant of `Self` caller has read access and has verified that
        // `VM_MIXEDMAP` is set. By invariant on `Page` the page has order 0.
        to_result(unsafe { bindings::vm_insert_page(self.as_ptr(), address, page.as_ptr()) })
    }
}
200
/// A configuration object for setting up a VMA in an `f_ops->mmap()` hook.
///
/// The `f_ops->mmap()` hook is called when a new VMA is being created, and the hook is able to
/// configure the VMA in various ways to fit the driver that owns it. Using `VmaNew` indicates that
/// you are allowed to perform operations on the VMA that can only be performed before the VMA is
/// fully initialized.
///
/// # Invariants
///
/// For the duration of 'a, the referenced vma must be undergoing initialization in an
/// `f_ops->mmap()` hook.
#[repr(transparent)]
pub struct VmaNew {
    // Transparent wrapper so the `Deref` impl below can hand out `&VmaRef` for free.
    vma: VmaRef,
}
216
// Make all `VmaRef` methods available on `VmaNew`. During initial setup the caller effectively
// has exclusive access, which is at least as strong as the read access `VmaRef` requires.
impl Deref for VmaNew {
    type Target = VmaRef;

    #[inline]
    fn deref(&self) -> &VmaRef {
        &self.vma
    }
}
226
impl VmaNew {
    /// Access a virtual memory area given a raw pointer.
    ///
    /// # Safety
    ///
    /// Callers must ensure that `vma` is undergoing initial vma setup for the duration of 'a.
    #[inline]
    pub unsafe fn from_raw<'a>(vma: *mut bindings::vm_area_struct) -> &'a Self {
        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
        // The cast is sound because `VmaNew` is `#[repr(transparent)]` over `VmaRef`.
        unsafe { &*vma.cast() }
    }

    /// Internal method for updating the vma flags.
    ///
    /// Flags in `set` are OR'ed in; flags in `unset` are then cleared.
    ///
    /// # Safety
    ///
    /// This must not be used to set the flags to an invalid value.
    #[inline]
    unsafe fn update_flags(&self, set: vm_flags_t, unset: vm_flags_t) {
        let mut flags = self.flags();
        flags |= set;
        flags &= !unset;

        // SAFETY: This is not a data race: the vma is undergoing initial setup, so it's not yet
        // shared. Additionally, `VmaNew` is `!Sync`, so it cannot be used to write in parallel.
        // The caller promises that this does not set the flags to an invalid value.
        unsafe { (*self.as_ptr()).__bindgen_anon_2.__vm_flags = flags };
    }

    /// Set the `VM_MIXEDMAP` flag on this vma.
    ///
    /// This enables the vma to contain both `struct page` and pure PFN pages. Returns a reference
    /// that can be used to call `vm_insert_page` on the vma.
    #[inline]
    pub fn set_mixedmap(&self) -> &VmaMixedMap {
        // SAFETY: We don't yet provide a way to set VM_PFNMAP, so this cannot put the flags in an
        // invalid state.
        unsafe { self.update_flags(flags::MIXEDMAP, 0) };

        // SAFETY: We just set `VM_MIXEDMAP` on the vma.
        unsafe { VmaMixedMap::from_raw(self.vma.as_ptr()) }
    }

    /// Set the `VM_IO` flag on this vma.
    ///
    /// This is used for memory mapped IO and similar. The flag tells other parts of the kernel to
    /// avoid looking at the pages. For memory mapped IO this is useful as accesses to the pages
    /// could have side effects.
    #[inline]
    pub fn set_io(&self) {
        // SAFETY: Setting the VM_IO flag is always okay.
        unsafe { self.update_flags(flags::IO, 0) };
    }

    /// Set the `VM_DONTEXPAND` flag on this vma.
    ///
    /// This prevents the vma from being expanded with `mremap()`.
    #[inline]
    pub fn set_dontexpand(&self) {
        // SAFETY: Setting the VM_DONTEXPAND flag is always okay.
        unsafe { self.update_flags(flags::DONTEXPAND, 0) };
    }

    /// Set the `VM_DONTCOPY` flag on this vma.
    ///
    /// This prevents the vma from being copied on fork. This option is only permanent if `VM_IO`
    /// is set.
    #[inline]
    pub fn set_dontcopy(&self) {
        // SAFETY: Setting the VM_DONTCOPY flag is always okay.
        unsafe { self.update_flags(flags::DONTCOPY, 0) };
    }

    /// Set the `VM_DONTDUMP` flag on this vma.
    ///
    /// This prevents the vma from being included in core dumps. This option is only permanent if
    /// `VM_IO` is set.
    #[inline]
    pub fn set_dontdump(&self) {
        // SAFETY: Setting the VM_DONTDUMP flag is always okay.
        unsafe { self.update_flags(flags::DONTDUMP, 0) };
    }

    /// Returns whether `VM_READ` is set.
    ///
    /// This flag indicates whether userspace is mapping this vma as readable.
    #[inline]
    pub fn readable(&self) -> bool {
        (self.flags() & flags::READ) != 0
    }

    /// Try to clear the `VM_MAYREAD` flag, failing if `VM_READ` is set.
    ///
    /// This flag indicates whether userspace is allowed to make this vma readable with
    /// `mprotect()`.
    ///
    /// Note that this operation is irreversible. Once `VM_MAYREAD` has been cleared, it can never
    /// be set again.
    #[inline]
    pub fn try_clear_mayread(&self) -> Result {
        // A vma that is currently readable must keep VM_MAYREAD.
        if self.readable() {
            return Err(EINVAL);
        }
        // SAFETY: Clearing `VM_MAYREAD` is okay when `VM_READ` is not set.
        unsafe { self.update_flags(0, flags::MAYREAD) };
        Ok(())
    }

    /// Returns whether `VM_WRITE` is set.
    ///
    /// This flag indicates whether userspace is mapping this vma as writable.
    #[inline]
    pub fn writable(&self) -> bool {
        (self.flags() & flags::WRITE) != 0
    }

    /// Try to clear the `VM_MAYWRITE` flag, failing if `VM_WRITE` is set.
    ///
    /// This flag indicates whether userspace is allowed to make this vma writable with
    /// `mprotect()`.
    ///
    /// Note that this operation is irreversible. Once `VM_MAYWRITE` has been cleared, it can never
    /// be set again.
    #[inline]
    pub fn try_clear_maywrite(&self) -> Result {
        // A vma that is currently writable must keep VM_MAYWRITE.
        if self.writable() {
            return Err(EINVAL);
        }
        // SAFETY: Clearing `VM_MAYWRITE` is okay when `VM_WRITE` is not set.
        unsafe { self.update_flags(0, flags::MAYWRITE) };
        Ok(())
    }

    /// Returns whether `VM_EXEC` is set.
    ///
    /// This flag indicates whether userspace is mapping this vma as executable.
    #[inline]
    pub fn executable(&self) -> bool {
        (self.flags() & flags::EXEC) != 0
    }

    /// Try to clear the `VM_MAYEXEC` flag, failing if `VM_EXEC` is set.
    ///
    /// This flag indicates whether userspace is allowed to make this vma executable with
    /// `mprotect()`.
    ///
    /// Note that this operation is irreversible. Once `VM_MAYEXEC` has been cleared, it can never
    /// be set again.
    #[inline]
    pub fn try_clear_mayexec(&self) -> Result {
        // A vma that is currently executable must keep VM_MAYEXEC.
        if self.executable() {
            return Err(EINVAL);
        }
        // SAFETY: Clearing `VM_MAYEXEC` is okay when `VM_EXEC` is not set.
        unsafe { self.update_flags(0, flags::MAYEXEC) };
        Ok(())
    }
}
385
/// The integer type used for vma flags.
///
/// This is a bitmask; the individual bits are the constants in [`flags`].
#[doc(inline)]
pub use bindings::vm_flags_t;
389
/// All possible flags for [`VmaRef`].
///
/// These constants mirror the C `VM_*` flag macros, re-exported via `bindings` and cast to
/// [`vm_flags_t`].
pub mod flags {
    use super::vm_flags_t;
    use crate::bindings;

    /// No flags are set.
    pub const NONE: vm_flags_t = bindings::VM_NONE as vm_flags_t;

    /// Mapping allows reads.
    pub const READ: vm_flags_t = bindings::VM_READ as vm_flags_t;

    /// Mapping allows writes.
    pub const WRITE: vm_flags_t = bindings::VM_WRITE as vm_flags_t;

    /// Mapping allows execution.
    pub const EXEC: vm_flags_t = bindings::VM_EXEC as vm_flags_t;

    /// Mapping is shared.
    pub const SHARED: vm_flags_t = bindings::VM_SHARED as vm_flags_t;

    /// Mapping may be updated to allow reads.
    pub const MAYREAD: vm_flags_t = bindings::VM_MAYREAD as vm_flags_t;

    /// Mapping may be updated to allow writes.
    pub const MAYWRITE: vm_flags_t = bindings::VM_MAYWRITE as vm_flags_t;

    /// Mapping may be updated to allow execution.
    pub const MAYEXEC: vm_flags_t = bindings::VM_MAYEXEC as vm_flags_t;

    /// Mapping may be updated to be shared.
    pub const MAYSHARE: vm_flags_t = bindings::VM_MAYSHARE as vm_flags_t;

    /// Page-ranges managed without `struct page`, just pure PFN.
    pub const PFNMAP: vm_flags_t = bindings::VM_PFNMAP as vm_flags_t;

    /// Memory mapped I/O or similar.
    pub const IO: vm_flags_t = bindings::VM_IO as vm_flags_t;

    /// Do not copy this vma on fork.
    pub const DONTCOPY: vm_flags_t = bindings::VM_DONTCOPY as vm_flags_t;

    /// Cannot expand with mremap().
    pub const DONTEXPAND: vm_flags_t = bindings::VM_DONTEXPAND as vm_flags_t;

    /// Lock the pages covered when they are faulted in.
    pub const LOCKONFAULT: vm_flags_t = bindings::VM_LOCKONFAULT as vm_flags_t;

    /// Is a VM accounted object.
    pub const ACCOUNT: vm_flags_t = bindings::VM_ACCOUNT as vm_flags_t;

    /// Should the VM suppress accounting.
    pub const NORESERVE: vm_flags_t = bindings::VM_NORESERVE as vm_flags_t;

    /// Huge TLB Page VM.
    pub const HUGETLB: vm_flags_t = bindings::VM_HUGETLB as vm_flags_t;

    /// Synchronous page faults. (DAX-specific)
    pub const SYNC: vm_flags_t = bindings::VM_SYNC as vm_flags_t;

    /// Architecture-specific flag.
    pub const ARCH_1: vm_flags_t = bindings::VM_ARCH_1 as vm_flags_t;

    /// Wipe VMA contents in child on fork.
    pub const WIPEONFORK: vm_flags_t = bindings::VM_WIPEONFORK as vm_flags_t;

    /// Do not include in the core dump.
    pub const DONTDUMP: vm_flags_t = bindings::VM_DONTDUMP as vm_flags_t;

    /// Not soft dirty clean area.
    pub const SOFTDIRTY: vm_flags_t = bindings::VM_SOFTDIRTY as vm_flags_t;

    /// Can contain `struct page` and pure PFN pages.
    pub const MIXEDMAP: vm_flags_t = bindings::VM_MIXEDMAP as vm_flags_t;

    /// MADV_HUGEPAGE marked this vma.
    pub const HUGEPAGE: vm_flags_t = bindings::VM_HUGEPAGE as vm_flags_t;

    /// MADV_NOHUGEPAGE marked this vma.
    pub const NOHUGEPAGE: vm_flags_t = bindings::VM_NOHUGEPAGE as vm_flags_t;

    /// KSM may merge identical pages.
    pub const MERGEABLE: vm_flags_t = bindings::VM_MERGEABLE as vm_flags_t;
}