nub_host_common/vmem.rs
1/*
2Copyright 2025 The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15 */
16
// Select the source file that backs the `arch` module. The choice
// depends on the compilation target and on the `i686-guest` feature:
// - `x86` targets use `arch/i686/vmem.rs`;
// - `x86_64` targets use `arch/amd64/vmem.rs`, unless the
//   `i686-guest` feature is enabled, in which case they use
//   `arch/i686/vmem.rs` instead;
// - `aarch64` targets use `arch/aarch64/vmem.rs`.
#[cfg_attr(target_arch = "x86", path = "arch/i686/vmem.rs")]
#[cfg_attr(
    all(target_arch = "x86_64", not(feature = "i686-guest")),
    path = "arch/amd64/vmem.rs"
)]
#[cfg_attr(
    all(target_arch = "x86_64", feature = "i686-guest"),
    path = "arch/i686/vmem.rs"
)]
#[cfg_attr(target_arch = "aarch64", path = "arch/aarch64/vmem.rs")]
mod arch;
28
// Fail the build early, with an explicit message, if the `i686-guest`
// feature is enabled on a target that is neither `x86` nor `x86_64`.
#[cfg(all(
    feature = "i686-guest",
    not(any(target_arch = "x86", target_arch = "x86_64"))
))]
compile_error!(
    "the `i686-guest` feature is only supported on `target_arch = \"x86\"` (guest) or \
     `target_arch = \"x86_64\"` (host) targets"
);
37
/// This is always the page size that the /guest/ is being compiled
/// for, which may or may not be the same as the host page size.
pub use arch::PAGE_SIZE;
pub use arch::{PAGE_PRESENT, PAGE_TABLE_SIZE, PTE_ADDR_MASK, PageTableEntry, PhysAddr, VirtAddr};
/// Number of [`PageTableEntry`] slots in a single page table, computed
/// as [`PAGE_TABLE_SIZE`] divided by the size in bytes of one entry.
pub const PAGE_TABLE_ENTRIES_PER_TABLE: usize =
    PAGE_TABLE_SIZE / core::mem::size_of::<PageTableEntry>();
44
45// Shared page table iterator infrastructure used by each arch module.
46
47/// Utility function to extract an (inclusive on both ends) bit range
48/// from a quadword.
49#[inline(always)]
50#[allow(dead_code)] // unused on aarch64 (only the amd64/i686 page-walkers use it)
51pub(in crate::vmem) fn bits<const HIGH_BIT: u8, const LOW_BIT: u8>(x: u64) -> u64 {
52 (x & ((1 << (HIGH_BIT + 1)) - 1)) >> LOW_BIT
53}
54
55/// Helper function to write a page table entry, updating the whole
56/// chain of tables back to the root if necessary.
57///
58/// # Safety
59/// Same requirements as [`TableOps::write_entry`].
60#[allow(dead_code)] // unused on aarch64 (only the amd64/i686 page-walkers use it)
61pub(in crate::vmem) unsafe fn write_entry_updating<
62 Op: TableOps,
63 P: UpdateParent<
64 Op,
65 TableMoveInfo = <Op::TableMovability as TableMovabilityBase<Op>>::TableMoveInfo,
66 >,
67>(
68 op: &Op,
69 parent: P,
70 addr: Op::TableAddr,
71 entry: u64,
72) {
73 #[allow(clippy::useless_conversion)]
74 if let Some(again) = unsafe { op.write_entry(addr, entry as PageTableEntry) } {
75 parent.update_parent(op, again);
76 }
77}
78
/// A helper trait that allows us to move a page table (e.g. from the
/// snapshot to the scratch region), keeping track of the context that
/// needs to be updated when that is moved (and potentially
/// recursively updating, if necessary).
///
/// This is done via a trait so that the selected impl knows the exact
/// nesting depth of tables, in order to assist
/// inlining/specialisation in generating efficient code.
///
/// The trait definition only bounds its parameter by
/// [`TableReadOps`], since [`UpdateParentNone`] does not need to be
/// able to actually write to the tables.
pub trait UpdateParent<Op: TableReadOps + ?Sized>: Copy {
    /// The type of the information about a moved table which is
    /// needed in order to update its parent.
    type TableMoveInfo;
    /// The [`UpdateParent`] type that should be used when going down
    /// another level in the table, in order to add the current level
    /// to the chain of ancestors to be updated.
    type ChildType: UpdateParent<Op, TableMoveInfo = Self::TableMoveInfo>;
    /// React to a table having moved: `new_ptr` carries the
    /// information about the move, and `op` provides access to the
    /// tables so that whatever refers to the moved table can be
    /// updated.
    fn update_parent(self, op: &Op, new_ptr: Self::TableMoveInfo);
    /// Build the [`UpdateParent::ChildType`] value to use when
    /// descending one level through the entry located at `entry_ptr`.
    fn for_child_at_entry(self, entry_ptr: Op::TableAddr) -> Self::ChildType;
}
102
103/// A struct implementing [`UpdateParent`] that is impossible to use
104/// (since its [`UpdateParent::update_parent`] method takes [`Void`]),
105/// used when it is statically known that a table operation cannot
106/// result in a need to update ancestors.
107#[derive(Copy, Clone)]
108pub struct UpdateParentNone {}
109impl<Op: TableReadOps> UpdateParent<Op> for UpdateParentNone {
110 type TableMoveInfo = Void;
111 type ChildType = Self;
112 fn update_parent(self, _op: &Op, impossible: Void) {
113 match impossible {}
114 }
115 fn for_child_at_entry(self, _entry_ptr: Op::TableAddr) -> Self {
116 self
117 }
118}
119
/// A helper structure indicating a mapping operation that needs to be
/// performed.
#[allow(dead_code)] // unused on aarch64 (only the amd64/i686 page-walkers use it)
pub(in crate::vmem) struct MapRequest<Op: TableReadOps, P: UpdateParent<Op>> {
    /// Base address of the table whose entries are to be visited.
    pub table_base: Op::TableAddr,
    /// First virtual address of the range covered by the request.
    pub vmin: u64,
    /// Length of the range in bytes, starting at `vmin`.
    pub len: u64,
    /// Ancestor-update context; copied into every [`MapResponse`]
    /// produced for this request.
    pub update_parent: P,
}
129
/// A helper structure indicating that a particular PTE needs to be
/// modified.
#[allow(dead_code)] // unused on aarch64 (only the amd64/i686 page-walkers use it)
pub(in crate::vmem) struct MapResponse<Op: TableReadOps, P: UpdateParent<Op>> {
    /// Address of the page table entry that needs to be modified.
    pub entry_ptr: Op::TableAddr,
    /// First virtual address of the sub-range handled through this
    /// entry.
    pub vmin: u64,
    /// Length in bytes, starting at `vmin`, of the sub-range handled
    /// through this entry.
    pub len: u64,
    /// Ancestor-update context carried over from the originating
    /// [`MapRequest`].
    pub update_parent: P,
}
139
/// Iterator that walks through page table entries at a specific level.
///
/// Given a virtual address range and a table base, this iterator yields
/// `MapResponse` items for each page table entry that needs to be modified.
/// The const generics `HIGH_BIT` and `LOW_BIT` specify which bits of the
/// virtual address are used to index into this level's table.
///
/// For example on amd64:
/// - PML4: HIGH_BIT=47, LOW_BIT=39 (9 bits = 512 entries, each covering 512GB)
/// - PDPT: HIGH_BIT=38, LOW_BIT=30 (9 bits = 512 entries, each covering 1GB)
/// - PD:   HIGH_BIT=29, LOW_BIT=21 (9 bits = 512 entries, each covering 2MB)
/// - PT:   HIGH_BIT=20, LOW_BIT=12 (9 bits = 512 entries, each covering 4KB)
///
/// On i686:
/// - PD:   HIGH_BIT=31, LOW_BIT=22 (10 bits = 1024 entries, each covering 4MB)
/// - PT:   HIGH_BIT=21, LOW_BIT=12 (10 bits = 1024 entries, each covering 4KB)
#[allow(dead_code)] // unused on aarch64 (only the amd64/i686 page-walkers use it)
pub(in crate::vmem) struct ModifyPteIterator<
    const HIGH_BIT: u8,
    const LOW_BIT: u8,
    Op: TableReadOps,
    P: UpdateParent<Op>,
> {
    /// The table and virtual address range being walked.
    request: MapRequest<Op, P>,
    /// Number of entries yielded so far. Entry `n` (for `n > 0`)
    /// starts `n` entry-sizes past the entry-aligned start of the
    /// requested range.
    n: u64,
}
166impl<const HIGH_BIT: u8, const LOW_BIT: u8, Op: TableReadOps, P: UpdateParent<Op>> Iterator
167 for ModifyPteIterator<HIGH_BIT, LOW_BIT, Op, P>
168{
169 type Item = MapResponse<Op, P>;
170 fn next(&mut self) -> Option<Self::Item> {
171 // Each page table entry at this level covers a region of size
172 // (1 << LOW_BIT) bytes. For example, at the PT level
173 // (LOW_BIT=12), each entry covers 4KB (0x1000 bytes). At the
174 // PD level (LOW_BIT=21), each entry covers 2MB (0x200000
175 // bytes).
176 //
177 // This mask isolates the bits below this level's index bits,
178 // used for alignment.
179 let lower_bits_mask = (1u64 << LOW_BIT) - 1;
180
181 // Calculate the virtual address for this iteration.
182 // On the first iteration (n=0), start at the requested vmin.
183 // On subsequent iterations, advance to the next aligned boundary.
184 // This handles the case where vmin isn't aligned to this level's
185 // entry size.
186 let next_vmin = if self.n == 0 {
187 self.request.vmin
188 } else {
189 // Align to the next boundary by adding one entry's worth
190 // and masking off lower bits. Masking off before adding
191 // is safe, since n << LOW_BIT must always have zeros in
192 // these positions.
193 let aligned_min = self.request.vmin & !lower_bits_mask;
194 // Use checked_add because going past the end of the
195 // address space counts as "the next one would be out of
196 // range"
197 aligned_min.checked_add(self.n << LOW_BIT)?
198 };
199
200 // Check if we've processed the entire requested range
201 if next_vmin >= self.request.vmin + self.request.len {
202 return None;
203 }
204
205 // Calculate the pointer to this level's page table entry.
206 // bits::<HIGH_BIT, LOW_BIT> extracts the relevant index bits
207 // from the virtual address. Multiply by the PTE size to get
208 // the byte offset.
209 let pte_index = bits::<HIGH_BIT, LOW_BIT>(next_vmin);
210 let entry_ptr = Op::entry_addr(
211 self.request.table_base,
212 pte_index * core::mem::size_of::<PageTableEntry>() as u64,
213 );
214
215 // Calculate how many bytes remain to be mapped from this point.
216 let len_from_here = self.request.len - (next_vmin - self.request.vmin);
217 // Calculate the maximum bytes this single entry can cover.
218 // If next_vmin is aligned, this is the full entry size (1 << LOW_BIT).
219 // If not aligned (only possible on first iteration), it's the
220 // remaining space until the next boundary.
221 let max_len = (1u64 << LOW_BIT) - (next_vmin & lower_bits_mask);
222 // The actual length for this entry is the smaller of what's
223 // needed vs what fits.
224 let next_len = core::cmp::min(len_from_here, max_len);
225
226 // Advance iteration counter for next call
227 self.n += 1;
228
229 Some(MapResponse {
230 entry_ptr,
231 vmin: next_vmin,
232 len: next_len,
233 update_parent: self.request.update_parent,
234 })
235 }
236}
237
238#[allow(dead_code)] // unused on aarch64 (only the amd64/i686 page-walkers use it)
239pub(in crate::vmem) fn modify_ptes<
240 const HIGH_BIT: u8,
241 const LOW_BIT: u8,
242 Op: TableReadOps,
243 P: UpdateParent<Op>,
244>(
245 r: MapRequest<Op, P>,
246) -> ModifyPteIterator<HIGH_BIT, LOW_BIT, Op, P> {
247 ModifyPteIterator { request: r, n: 0 }
248}
249
/// The read-only operations used to actually access the page table
/// structures, used to allow the same code to be used in the host and
/// the guest for page table setup. This is distinct from
/// [`TableOps`], since there are some implementations for which
/// writing does not make sense, and only reading is required.
pub trait TableReadOps {
    /// The type of table addresses
    type TableAddr: Copy;

    /// Offset the table address by the given offset in bytes.
    ///
    /// # Parameters
    /// - `addr`: The base address of the table.
    /// - `entry_offset`: The offset in **bytes** within the page table. This is
    ///   not an entry index; callers must multiply the entry index by the size
    ///   of a page table entry (`size_of::<PageTableEntry>()`) to obtain the
    ///   correct byte offset.
    ///
    /// # Returns
    /// The address of the entry at the given byte offset from the base address.
    fn entry_addr(addr: Self::TableAddr, entry_offset: u64) -> Self::TableAddr;

    /// Read a page table entry from the given address, used to read
    /// existing page table entries
    ///
    /// # Safety
    /// This reads from the given memory address, and so all the usual
    /// Rust things about raw pointers apply. This will also be used
    /// to update guest page tables, so especially in the guest, it is
    /// important to ensure that the page tables updates do not break
    /// invariants. The implementor of the trait should ensure that
    /// nothing else will be reading/writing the address at the same
    /// time as mapping code using the trait.
    unsafe fn read_entry(&self, addr: Self::TableAddr) -> PageTableEntry;

    /// Convert an abstract table address to a concrete physical address
    /// ([`PhysAddr`]) which can be e.g. written into a page table entry
    fn to_phys(addr: Self::TableAddr) -> PhysAddr;

    /// Convert a concrete physical address ([`PhysAddr`]) which may have
    /// been e.g. read from a page table entry back into an abstract
    /// table address
    fn from_phys(addr: PhysAddr) -> Self::TableAddr;

    /// Return the address of the root page table
    fn root_table(&self) -> Self::TableAddr;
}
295
/// Our own version of ! until it is stable. Used to avoid needing to
/// implement [`TableOps::update_root`] for ops that never need
/// to move a table.
///
/// The enum has no variants, so no value of this type can exist and
/// an empty `match` on it is exhaustive.
pub enum Void {}
300
/// A marker struct, used by an implementation of [`TableOps`] to
/// indicate that it may need to move existing page tables.
///
/// With this marker, the associated table-move information is the
/// implementation's `TableAddr`.
pub struct MayMoveTable {}
/// A marker struct, used by an implementation of [`TableOps`] to
/// indicate that it will be able to update existing page tables
/// in-place, without moving them.
///
/// With this marker, the associated table-move information is
/// [`Void`], i.e. a move can never be reported.
pub struct MayNotMoveTable {}
308
// Private module holding the base trait behind the movability
// markers.
mod sealed {
    use super::{MayMoveTable, MayNotMoveTable, TableReadOps, Void};

    /// A (purposefully-not-exposed) internal implementation detail of the
    /// logic around whether a [`TableOps`] implementation may or may not
    /// move page tables.
    pub trait TableMovabilityBase<Op: TableReadOps + ?Sized> {
        /// The information reported when a table is moved.
        type TableMoveInfo;
    }
    // A table that may move reports the address of its new location.
    impl<Op: TableReadOps> TableMovabilityBase<Op> for MayMoveTable {
        type TableMoveInfo = Op::TableAddr;
    }
    // A table that may not move has nothing to report; `Void` has no
    // values.
    impl<Op: TableReadOps> TableMovabilityBase<Op> for MayNotMoveTable {
        type TableMoveInfo = Void;
    }
}
325use sealed::*;
326
/// A sealed trait used to collect some information about the marker
/// structures [`MayMoveTable`] and [`MayNotMoveTable`].
///
/// It has no items of its own; it only combines the private
/// `TableMovabilityBase` with the architecture-specific
/// `arch::TableMovability`, instantiated with the marker's own
/// `TableMoveInfo`.
pub trait TableMovability<Op: TableReadOps + ?Sized>:
    TableMovabilityBase<Op>
    + arch::TableMovability<Op, <Self as TableMovabilityBase<Op>>::TableMoveInfo>
{
}
// Blanket implementation: any type satisfying both supertraits
// implements `TableMovability` automatically.
impl<
    Op: TableReadOps,
    T: TableMovabilityBase<Op>
        + arch::TableMovability<Op, <Self as TableMovabilityBase<Op>>::TableMoveInfo>,
> TableMovability<Op> for T
{
}
340
/// The operations used to actually access the page table structures
/// that involve writing to them, used to allow the same code to be
/// used in the host and the guest for page table setup.
pub trait TableOps: TableReadOps {
    /// This marker should be either [`MayMoveTable`] or
    /// [`MayNotMoveTable`], as the case may be.
    ///
    /// If this is [`MayMoveTable`], the return type of
    /// [`Self::write_entry`] and the parameter type of
    /// [`Self::update_root`] will be `<Self as
    /// TableReadOps>::TableAddr`. If it is [`MayNotMoveTable`], those
    /// types will be [`Void`].
    type TableMovability: TableMovability<Self>;

    /// Allocate a zeroed table
    ///
    /// # Safety
    /// The current implementations of this function are not
    /// inherently unsafe, but the guest implementation will likely
    /// become so in the future when a real physical page allocator is
    /// implemented.
    ///
    /// Currently, callers should take care not to call this on
    /// multiple threads at the same time.
    ///
    /// # Panics
    /// This function may panic if:
    /// - The Layout creation fails
    /// - Memory allocation fails
    unsafe fn alloc_table(&self) -> Self::TableAddr;

    /// Write a page table entry to the given address, used to write
    /// updated page table entries. In some cases, the page table in
    /// which the entry is located may need to be relocated in order
    /// for this to succeed; if this is the case, the base address of
    /// the new table is returned. `None` means that no relocation
    /// needs to be propagated.
    ///
    /// # Safety
    /// This writes to the given memory address, and so all the usual
    /// Rust things about raw pointers apply. This will also be used
    /// to update guest page tables, so especially in the guest, it is
    /// important to ensure that the page tables updates do not break
    /// invariants. The implementor of the trait should ensure that
    /// nothing else will be reading/writing the address at the same
    /// time as mapping code using the trait.
    unsafe fn write_entry(
        &self,
        addr: Self::TableAddr,
        entry: PageTableEntry,
    ) -> Option<<Self::TableMovability as TableMovabilityBase<Self>>::TableMoveInfo>;

    /// Change the root page table to one at a different address
    ///
    /// # Safety
    /// This function will directly result in a change to virtual
    /// memory translation, and so is inherently unsafe w.r.t. the
    /// Rust memory model. All the caveats listed on [`map`] apply as
    /// well.
    unsafe fn update_root(
        &self,
        new_root: <Self::TableMovability as TableMovabilityBase<Self>>::TableMoveInfo,
    );
}
404
/// Access flags carried by a [`MappingKind::Basic`] mapping.
#[derive(Debug, PartialEq, Clone, Copy)]
pub struct BasicMapping {
    /// Whether the mapping is readable.
    pub readable: bool,
    /// Whether the mapping is writable.
    pub writable: bool,
    /// Whether the mapping is executable.
    pub executable: bool,
}
411
/// Access flags carried by a [`MappingKind::Cow`] mapping. Unlike
/// [`BasicMapping`], there is no `writable` flag.
// NOTE(review): "Cow" presumably stands for copy-on-write; confirm
// against the arch implementations.
#[derive(Debug, PartialEq, Clone, Copy)]
pub struct CowMapping {
    /// Whether the mapping is readable.
    pub readable: bool,
    /// Whether the mapping is executable.
    pub executable: bool,
}
417
/// The kind of a [`Mapping`], together with the access flags that
/// apply to that kind.
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum MappingKind {
    /// No access flags are attached to this kind.
    Unmapped,
    /// A mapping described by [`BasicMapping`] flags.
    Basic(BasicMapping),
    /// A mapping described by [`CowMapping`] flags.
    Cow(CowMapping),
    /* TODO: What useful things other than basic mappings actually
     * require touching the tables? */
}
426
/// Description of a single region: a physical base, a virtual base, a
/// length, and the [`MappingKind`] that applies to it.
#[derive(Debug)]
pub struct Mapping {
    /// Physical address at which the region starts.
    pub phys_base: u64,
    /// Virtual address at which the region starts.
    pub virt_base: u64,
    /// Length of the region (presumably in bytes; confirm against the
    /// arch implementations).
    pub len: u64,
    /// Kind of the mapping and its access flags.
    pub kind: MappingKind,
    /// On architectures that support multiple privilege levels inside
    /// the guest, whether the mapping is accessible to the
    /// lower-privileged level (with the same permissions/behaviour as
    /// the upper-privileged level, for now).
    pub user_accessible: bool,
}
439
440/// Assumption: all are page-aligned
441///
442/// # Safety
443/// This function modifies pages backing a virtual memory range which
444/// is inherently unsafe w.r.t. the Rust memory model.
445///
446/// When using this function, please note:
447/// - No locking is performed before touching page table data structures,
448/// as such do not use concurrently with any other page table operations
449/// - TLB invalidation is not performed, if previously-mapped ranges
450/// are being remapped, TLB invalidation may need to be performed
451/// afterwards.
452pub use arch::map;
/// This function is presently used for reading the tracing data; it
/// is also useful for debugging.
455///
456/// # Safety
457/// This function traverses page table data structures, and should not
458/// be called concurrently with any other operations that modify the
459/// page table.
460pub use arch::virt_to_phys;
461
462//==================================================================================================
463// Multi-space (aliased page-table) walking
464//==================================================================================================
465
/// Identifier for a virtual address space, used by the multi-space
/// walker to describe which space "owns" a shared intermediate table.
/// Implementations typically use the physical address of the root
/// page table (which is unique per space).
///
/// This is a plain alias for `u64`, so the compiler does not
/// distinguish it from any other `u64` value.
pub type SpaceId = u64;
471
/// A reference from one address space to an intermediate page table
/// that lives in a different space. Produced by [`walk_va_spaces`] when
/// the walker encounters an intermediate table (at some `depth` below
/// the root) whose physical address was already seen via an earlier
/// root — i.e. the two spaces alias that sub-tree.
///
/// Semantics: the level-`depth` block in **our** space that contains
/// VAs starting at `our_va` is aliased to the level-`depth` block in
/// `space` that contains VAs starting at `their_va`. Everything below
/// that sub-tree — PDEs, PTEs, leaf mappings — is shared wholesale.
///
/// `depth` is counted from the root:
/// - `depth = 1` on i686: the shared thing is a leaf PT (the thing a
///   PDE points to).
/// - `depth = 1, 2, 3` on amd64: PDPT, PD, or PT respectively.
#[derive(Debug, Clone, Copy)]
pub struct SpaceReferenceMapping {
    /// Depth from the root at which the alias starts (1-based).
    pub depth: usize,
    /// The "owning" space — the first root that visited this
    /// intermediate PA during [`walk_va_spaces`]. See [`SpaceId`] for
    /// how spaces are typically identified.
    pub space: SpaceId,
    /// Start VA of the aliased sub-tree in OUR space.
    pub our_va: u64,
    /// Start VA of the aliased sub-tree in the owning space. Usually
    /// equal to `our_va` (kernel mappings at the same VA across
    /// processes) but the design permits different VAs.
    pub their_va: u64,
}
501
/// Either a normal leaf mapping in the current space, or a reference
/// to an intermediate table in another space. The compaction loop in
/// the host snapshotting code treats these two cases differently:
///
/// - `ThisSpace(m)` is rebuilt like any other leaf mapping: the
///   backing page is compacted into the new snapshot blob, the PTE is
///   written, and intermediate tables are allocated on demand.
/// - `AnotherSpace(r)` is rebuilt by *linking*: the entry in our
///   rebuilt root at depth `r.depth - 1` for `r.our_va` is made to
///   point at whatever table the owning space ended up with at
///   `r.their_va`. See [`space_aware_map`].
#[derive(Debug)]
pub enum SpaceAwareMapping {
    /// A normal leaf mapping in the current space.
    ThisSpace(Mapping),
    /// A reference to an intermediate table in another space.
    AnotherSpace(SpaceReferenceMapping),
}
518
519/// Counterpart of [`walk_va_spaces`]'s `AnotherSpace` entries on the
520/// write side: installs a link in `op`'s root PT tree at `ref_map.our_va`
521/// that points at whatever intermediate table the owning space ended
522/// up with at `ref_map.their_va` (in `built_roots[ref_map.space]`).
523///
524/// Callers must ensure that `built_roots` contains populated page
525/// tables for any other space referenced by the mapping.
526///
527/// # Safety
528/// Same invariants as [`map`]: the caller owns the concurrency story
529/// around the page tables being written, and must invalidate TLBs
530/// afterwards if they were live.
531pub use arch::space_aware_map;
532/// Walk multiple page-table roots together, emitting either a normal
533/// leaf mapping (`ThisSpace`) or a reference to an alias that was
534/// already seen via an earlier root (`AnotherSpace`).
535///
536/// The caller passes `roots` in their preferred order of primacy. The
537/// first root to visit a particular intermediate PA becomes the
538/// "owner" of that sub-table — subsequent roots that alias it receive
539/// `AnotherSpace` entries referencing the owner.
540///
541/// The returned `Vec` is ordered the same way `roots` was passed — so
542/// by construction the result is topologically sorted: every
543/// `AnotherSpace` reference points to a space that appears earlier in
544/// the list. This lets a rebuilder process roots in iteration order
545/// without a separate sort pass, and guarantees that the
546/// [`space_aware_map`] invariant is met.
547///
548/// # Safety
549/// Same invariants as [`virt_to_phys`]. Callers must ensure the page
550/// tables are not being mutated concurrently.
551pub use arch::walk_va_spaces;