Skip to main content

nub_host_kvm/mem/
shared_mem.rs

1/*
2Copyright 2025  The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17use std::any::type_name;
18use std::ffi::c_void;
19use std::io::Error;
20use std::mem::{align_of, size_of};
21#[cfg(target_os = "linux")]
22use std::ptr::null_mut;
23use std::sync::{Arc, RwLock};
24
25use nub_host_common::mem::PAGE_SIZE_USIZE;
26use tracing::{Span, instrument};
27#[cfg(target_os = "windows")]
28use windows::Win32::Foundation::{CloseHandle, HANDLE, INVALID_HANDLE_VALUE};
29#[cfg(target_os = "windows")]
30use windows::Win32::System::Memory::PAGE_READWRITE;
31#[cfg(target_os = "windows")]
32use windows::Win32::System::Memory::{
33    CreateFileMappingA, FILE_MAP_ALL_ACCESS, MEMORY_MAPPED_VIEW_ADDRESS, MapViewOfFile,
34    PAGE_NOACCESS, PAGE_PROTECTION_FLAGS, UnmapViewOfFile, VirtualProtect,
35};
36#[cfg(target_os = "windows")]
37use windows::core::PCSTR;
38
39use super::memory_region::{
40    HostGuestMemoryRegion, MemoryRegion, MemoryRegionFlags, MemoryRegionKind, MemoryRegionType,
41};
42#[cfg(target_os = "windows")]
43use crate::HyperlightError::WindowsAPIError;
44use crate::{HyperlightError, Result, log_then_return, new_error};
45
/// Verify that the `size`-byte window starting at `offset` fits inside a
/// memory area that is `mem_size` bytes long.
///
/// When `offset + size` overflows, or ends past `mem_size`, the macro makes
/// the *enclosing* function return an `Err` built with `new_error!`.
/// Otherwise it does nothing and execution continues after the invocation.
///
/// Note that the arguments are expanded more than once (once for the check
/// and once more in the error message), so they should be cheap and free of
/// side effects.
macro_rules! bounds_check {
    ($offset:expr, $size:expr, $mem_size:expr) => {
        match $offset.checked_add($size) {
            // The window ends at or before the end of the memory: accept.
            Some(end) if end <= $mem_size => {}
            // Either the addition overflowed or the window runs off the end.
            _ => {
                return Err(new_error!(
                    "Cannot read value from offset {} with size {} in memory of size {}",
                    $offset,
                    $size,
                    $mem_size
                ));
            }
        }
    };
}
59
/// Generates a method named `$fname` that reads a little-endian `$ty` out of
/// the slice returned by `self.as_slice()`.
macro_rules! generate_reader {
    ($fname:ident, $ty:ty) => {
        /// Read a value of type `$ty` from the memory at the given offset.
        ///
        /// Returns an error if the value would not fit entirely inside the
        /// memory starting at `offset`.
        #[allow(dead_code)]
        #[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")]
        pub(crate) fn $fname(&self, offset: usize) -> Result<$ty> {
            // Number of bytes occupied by one `$ty`.
            let width = std::mem::size_of::<$ty>();
            let data = self.as_slice();
            bounds_check!(offset, width, data.len());
            // The range is in bounds thanks to the check above; the
            // conversion to a fixed-size array feeds `from_le_bytes`.
            Ok(<$ty>::from_le_bytes(
                data[offset..offset + width].try_into()?,
            ))
        }
    };
}
75
/// Generates a method named `$fname` that stores a `$ty` in little-endian
/// byte order into the slice returned by `self.as_mut_slice()`.
macro_rules! generate_writer {
    ($fname:ident, $ty:ty) => {
        /// Write a value of type `$ty` to the memory at the given offset.
        ///
        /// Returns an error (and writes nothing) if the value would not fit
        /// entirely inside the memory starting at `offset`.
        #[allow(dead_code)]
        pub(crate) fn $fname(&mut self, offset: usize, value: $ty) -> Result<()> {
            // Little-endian encoding; its length is exactly `size_of::<$ty>()`.
            let encoded = value.to_le_bytes();
            let data = self.as_mut_slice();
            bounds_check!(offset, encoded.len(), data.len());
            data[offset..offset + encoded.len()].copy_from_slice(&encoded);
            Ok(())
        }
    };
}
89
/// A representation of a host mapping of a shared memory region,
/// which will be released when this structure is Drop'd. This is not
/// individually Clone (since it holds ownership of the mapping), or
/// Send or Sync, since it doesn't ensure any particular synchronization.
#[derive(Debug)]
pub struct HostMapping {
    /// Start address of the whole mapping, i.e. the first byte of the
    /// leading guard page (the usable memory begins one page later).
    ptr: *mut u8,
    /// Total length of the mapping in bytes, guard pages included.
    size: usize,
    /// Handle of the file-mapping object that backs the view; it is
    /// closed when the mapping is dropped.
    #[cfg(target_os = "windows")]
    handle: HANDLE,
}
101
102impl Drop for HostMapping {
103    #[cfg(target_os = "linux")]
104    fn drop(&mut self) {
105        use libc::munmap;
106
107        unsafe {
108            munmap(self.ptr as *mut c_void, self.size);
109        }
110    }
111    #[cfg(target_os = "windows")]
112    fn drop(&mut self) {
113        let mem_mapped_address = MEMORY_MAPPED_VIEW_ADDRESS {
114            Value: self.ptr as *mut c_void,
115        };
116        if let Err(e) = unsafe { UnmapViewOfFile(mem_mapped_address) } {
117            tracing::error!(
118                "Failed to drop HostMapping (UnmapViewOfFile failed): {:?}",
119                e
120            );
121        }
122
123        let file_handle: HANDLE = self.handle;
124        if let Err(e) = unsafe { CloseHandle(file_handle) } {
125            tracing::error!("Failed to  drop HostMapping (CloseHandle failed): {:?}", e);
126        }
127    }
128}
129
/// These three structures represent various phases of the lifecycle of
/// a memory buffer that is shared with the guest. An
/// ExclusiveSharedMemory is used for certain operations that
/// unrestrictedly write to the shared memory, including setting it up
/// and taking snapshots.
#[derive(Debug)]
pub struct ExclusiveSharedMemory {
    /// Shared ownership of the underlying host mapping. `build()` hands
    /// clones of this `Arc` to the host and guest views.
    region: Arc<HostMapping>,
}
// SAFETY (review note): presumably sound because `HostMapping` is
// non-Send only on account of its raw pointer field, and this type owns
// no thread-affine state — confirm when touching this impl.
unsafe impl Send for ExclusiveSharedMemory {}
140
/// A GuestSharedMemory is used to represent
/// the reference to all-of-memory that is taken by the virtual cpu.
/// Because of the memory model limitations that affect
/// HostSharedMemory, it is likely fairly important (to ensure that
/// our UB remains limited to interaction with an external compilation
/// unit that likely can't be discovered by the compiler) that _rust_
/// users do not perform racy accesses to the guest communication
/// buffers that are also accessed by HostSharedMemory.
#[derive(Debug)]
pub struct GuestSharedMemory {
    /// Shared ownership of the underlying host mapping.
    region: Arc<HostMapping>,
    /// The lock that indicates this shared memory is being used by non-Rust code
    ///
    /// This lock _must_ be held whenever the guest is executing,
    /// because it prevents the host from converting its
    /// HostSharedMemory to an ExclusiveSharedMemory. Since the guest
    /// may arbitrarily mutate the shared memory, only synchronized
    /// accesses from Rust should be allowed!
    ///
    /// We cannot enforce this in the type system, because the memory
    /// is mapped in to the VM at VM creation time.
    ///
    /// When this value comes from `ExclusiveSharedMemory::build`, the
    /// same lock instance is also held by the paired `HostSharedMemory`.
    pub lock: Arc<RwLock<()>>,
}
// SAFETY (review note): presumably sound because the only non-Send
// ingredient is the raw pointer inside `HostMapping` — confirm when
// touching this impl.
unsafe impl Send for GuestSharedMemory {}
165
166/// A HostSharedMemory allows synchronized accesses to guest
167/// communication buffers, allowing it to be used concurrently with a
168/// GuestSharedMemory.
169///
170/// # Concurrency model
171///
172/// Given future requirements for asynchronous I/O with a minimum
173/// amount of copying (e.g. WASIp3 streams), we would like it to be
174/// possible to safely access these buffers concurrently with the
175/// guest, ensuring that (1) data is read appropriately if the guest
176/// is well-behaved; and (2) the host's behaviour is defined
177/// regardless of whether or not the guest is well-behaved.
178///
179/// The ideal (future) flow for a guest->host message is something like
180///   - Guest writes (unordered) bytes describing a work item into a buffer
181///   - Guest reveals buffer via a release-store of a pointer into an
182///     MMIO ring-buffer
183///   - Host acquire-loads the buffer pointer from the "MMIO" ring
184///     buffer
185///   - Host (unordered) reads the bytes from the buffer
186///   - Host performs validation of those bytes and uses them
187///
188/// Unfortunately, there appears to be no way to do this with defined
189/// behaviour in present Rust (see
190/// e.g. <https://github.com/rust-lang/unsafe-code-guidelines/issues/152>).
191/// Rust does not yet have its own defined memory model, but in the
192/// interim, it is widely treated as inheriting the current C/C++
193/// memory models.  The most immediate problem is that regardless of
194/// anything else, under those memory models \[1, p. 17-18; 2, p. 88\],
195///
196///   > The execution of a program contains a _data race_ if it
197///   > contains two [C++23: "potentially concurrent"] conflicting
198///   > actions [C23: "in different threads"], at least one of which
199///   > is not atomic, and neither happens before the other [C++23: ",
200///   > except for the special case for signal handlers described
201///   > below"].  Any such data race results in undefined behavior.
202///
203/// Consequently, if a misbehaving guest fails to correctly
204/// synchronize its stores with the host, the host's innocent loads
205/// will trigger undefined behaviour for the entire program, including
206/// the host.  Note that this also applies if the guest makes an
207/// unsynchronized read of a location that the host is writing!
208///
209/// Despite Rust's de jure inheritance of the C memory model at the
210/// present time, the compiler in many cases de facto adheres to LLVM
211/// semantics, so it is worthwhile to consider what LLVM does in this
/// case as well.  According to the LangRef \[3\] memory model,
213/// loads which are involved in a race that includes at least one
214/// non-atomic access (whether the load or a store) return `undef`,
215/// making them roughly equivalent to reading uninitialized
216/// memory. While this is much better, it is still bad.
217///
218/// Considering a different direction, recent C++ papers have seemed
219/// to lean towards using `volatile` for similar use cases. For
220/// example, in P1152R0 \[4\], JF Bastien notes that
221///
222///   > We’ve shown that volatile is purposely defined to denote
223///   > external modifications. This happens for:
224///   >   - Shared memory with untrusted code, where volatile is the
225///   >     right way to avoid time-of-check time-of-use (ToCToU)
226///   >     races which lead to security bugs such as \[PWN2OWN\] and
227///   >     \[XENXSA155\].
228///
229/// Unfortunately, although this paper was adopted for C++20 (and,
230/// sadly, mostly un-adopted for C++23, although that does not concern
231/// us), the paper did not actually redefine volatile accesses or data
232/// races to prevent volatile accesses from racing with other accesses
233/// and causing undefined behaviour.  P1382R1 \[5\] would have amended
234/// the wording of the data race definition to specifically exclude
235/// volatile, but, unfortunately, despite receiving a
236/// generally-positive reception at its first WG21 meeting more than
237/// five years ago, it has not progressed.
238///
239/// Separately from the data race issue, there is also a concern that
240/// according to the various memory models in use, there may be ways
241/// in which the guest can semantically obtain uninitialized memory
242/// and write it into the shared buffer, which may also result in
243/// undefined behaviour on reads.  The degree to which this is a
244/// concern is unclear, however, since it is unclear to what degree
245/// the Rust abstract machine's conception of uninitialized memory
/// applies to the sandbox.  Returning briefly to the LLVM level,
/// rather than the Rust level: this, combined with the fact that
/// racing loads in LLVM return `undef`, as discussed above, means that
/// we would ideally `llvm.freeze` the result of any load out of the
/// sandbox.
250///
251/// It would furthermore be ideal if we could run the flatbuffers
252/// parsing code directly on the guest memory, in order to avoid
253/// unnecessary copies.  That is unfortunately probably not viable at
254/// the present time: because the generated flatbuffers parsing code
255/// doesn't use atomic or volatile accesses, it is likely to introduce
256/// double-read vulnerabilities.
257///
258/// In short, none of the Rust-level operations available to us do the
259/// right thing, at the Rust spec level or the LLVM spec level. Our
260/// major remaining options are therefore:
261///   - Choose one of the options that is available to us, and accept
262///     that we are doing something unsound according to the spec, but
263///     hope that no reasonable compiler could possibly notice.
264///   - Use inline assembly per architecture, for which we would only
265///     need to worry about the _architecture_'s memory model (which
266///     is far less demanding).
267///
268/// The leading candidate for the first option would seem to be to
269/// simply use volatile accesses; there seems to be wide agreement
270/// that this _should_ be a valid use case for them (even if it isn't
271/// now), and projects like Linux and rust-vmm already use C11
272/// `volatile` for this purpose.  It is also worth noting that because
273/// we still do need to synchronize with the guest when it _is_ being
274/// well-behaved, we would ideally use volatile acquire loads and
275/// volatile release stores for interacting with the stack pointer in
276/// the guest in this case.  Unfortunately, while those operations are
277/// defined in LLVM, they are not presently exposed to Rust. While
278/// atomic fences that are not associated with memory accesses
279/// ([`std::sync::atomic::fence`]) might at first glance seem to help with
280/// this problem, they unfortunately do not \[6\]:
281///
282///    > A fence ‘A’ which has (at least) Release ordering semantics,
283///    > synchronizes with a fence ‘B’ with (at least) Acquire
284///    > semantics, if and only if there exist operations X and Y,
285///    > both operating on some atomic object ‘M’ such that A is
286///    > sequenced before X, Y is sequenced before B and Y observes
287///    > the change to M. This provides a happens-before dependence
288///    > between A and B.
289///
290/// Note that the X and Y must be to an _atomic_ object.
291///
292/// We consequently assume that there has been a strong architectural
293/// fence on a vmenter/vmexit between data being read and written.
294/// This is unsafe (not guaranteed in the type system)!
295///
296/// \[1\] N3047 C23 Working Draft. <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3047.pdf>
297/// \[2\] N4950 C++23 Working Draft. <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/n4950.pdf>
298/// \[3\] LLVM Language Reference Manual, Memory Model for Concurrent Operations. <https://llvm.org/docs/LangRef.html#memmodel>
299/// \[4\] P1152R0: Deprecating `volatile`. JF Bastien. <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p1152r0.html>
300/// \[5\] P1382R1: `volatile_load<T>` and `volatile_store<T>`. JF Bastien, Paul McKenney, Jeffrey Yasskin, and the indefatigable TBD. <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1382r1.pdf>
301/// \[6\] Documentation for std::sync::atomic::fence. <https://doc.rust-lang.org/std/sync/atomic/fn.fence.html>
302///
303/// # Note \[Keeping mappings in sync between userspace and the guest\]
304///
305/// When using this structure with mshv on Linux, it is necessary to
306/// be a little bit careful: since the hypervisor is not directly
307/// integrated with the host kernel virtual memory subsystem, it is
308/// easy for the memory region in userspace to get out of sync with
309/// the memory region mapped into the guest.  Generally speaking, when
310/// the [`SharedMemory`] is mapped into a partition, the MSHV kernel
311/// module will call `pin_user_pages(FOLL_PIN|FOLL_WRITE)` on it,
312/// which will eagerly do any CoW, etc needing to obtain backing pages
313/// pinned in memory, and then map precisely those backing pages into
314/// the virtual machine. After that, the backing pages mapped into the
315/// VM will not change until the region is unmapped or remapped.  This
316/// means that code in this module needs to be very careful to avoid
317/// changing the backing pages of the region in the host userspace,
318/// since that would result in hyperlight-host's view of the memory
319/// becoming completely divorced from the view of the VM.
#[derive(Clone, Debug)]
pub struct HostSharedMemory {
    /// Shared ownership of the underlying host mapping.
    region: Arc<HostMapping>,
    /// Lock used to synchronize host accesses. When this value comes from
    /// `ExclusiveSharedMemory::build`, the same lock instance is also held
    /// by the paired `GuestSharedMemory`.
    lock: Arc<RwLock<()>>,
}
// SAFETY (review note): presumably sound because the only non-Send
// ingredient is the raw pointer inside `HostMapping` — confirm when
// touching this impl.
unsafe impl Send for HostSharedMemory {}
326
327impl ExclusiveSharedMemory {
328    /// Create a new region of shared memory with the given minimum
329    /// size in bytes. The region will be surrounded by guard pages.
330    ///
331    /// Return `Err` if shared memory could not be allocated.
332    #[cfg(target_os = "linux")]
333    #[instrument(skip_all, parent = Span::current(), level= "Trace")]
334    pub fn new(min_size_bytes: usize) -> Result<Self> {
335        use libc::{
336            MAP_ANONYMOUS, MAP_FAILED, MAP_PRIVATE, PROT_READ, PROT_WRITE, c_int, mmap, off_t,
337            size_t,
338        };
339        #[cfg(not(miri))]
340        use libc::{MAP_NORESERVE, PROT_NONE, mprotect};
341
342        if min_size_bytes == 0 {
343            return Err(new_error!("Cannot create shared memory with size 0"));
344        }
345
346        let total_size = min_size_bytes
347            .checked_add(2 * PAGE_SIZE_USIZE) // guard page around the memory
348            .ok_or_else(|| new_error!("Memory required for sandbox exceeded usize::MAX"))?;
349
350        if total_size % PAGE_SIZE_USIZE != 0 {
351            return Err(new_error!(
352                "shared memory must be a multiple of {}",
353                PAGE_SIZE_USIZE
354            ));
355        }
356
357        // usize and isize are guaranteed to be the same size, and
358        // isize::MAX should be positive, so this cast should be safe.
359        if total_size > isize::MAX as usize {
360            return Err(HyperlightError::MemoryRequestTooBig(
361                total_size,
362                isize::MAX as usize,
363            ));
364        }
365
366        // allocate the memory
367        #[cfg(not(miri))]
368        let flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE;
369        #[cfg(miri)]
370        let flags = MAP_ANONYMOUS | MAP_PRIVATE;
371
372        let addr = unsafe {
373            mmap(
374                null_mut(),
375                total_size as size_t,
376                PROT_READ | PROT_WRITE,
377                flags,
378                -1 as c_int,
379                0 as off_t,
380            )
381        };
382        if addr == MAP_FAILED {
383            log_then_return!(HyperlightError::MmapFailed(
384                Error::last_os_error().raw_os_error()
385            ));
386        }
387
388        // protect the guard pages
389        #[cfg(not(miri))]
390        {
391            let res = unsafe { mprotect(addr, PAGE_SIZE_USIZE, PROT_NONE) };
392            if res != 0 {
393                return Err(HyperlightError::MprotectFailed(
394                    Error::last_os_error().raw_os_error(),
395                ));
396            }
397            let res = unsafe {
398                mprotect(
399                    (addr as *const u8).add(total_size - PAGE_SIZE_USIZE) as *mut c_void,
400                    PAGE_SIZE_USIZE,
401                    PROT_NONE,
402                )
403            };
404            if res != 0 {
405                return Err(HyperlightError::MprotectFailed(
406                    Error::last_os_error().raw_os_error(),
407                ));
408            }
409        }
410
411        Ok(Self {
412            // HostMapping is only non-Send/Sync because raw pointers
413            // are not ("as a lint", as the Rust docs say). We don't
414            // want to mark HostMapping Send/Sync immediately, because
415            // that could socially imply that it's "safe" to use
416            // unsafe accesses from multiple threads at once. Instead, we
417            // directly impl Send and Sync on this type. Since this
418            // type does have Send and Sync manually impl'd, the Arc
419            // is not pointless as the lint suggests.
420            #[allow(clippy::arc_with_non_send_sync)]
421            region: Arc::new(HostMapping {
422                ptr: addr as *mut u8,
423                size: total_size,
424            }),
425        })
426    }
427
428    /// Create a new region of shared memory with the given minimum
429    /// size in bytes. The region will be surrounded by guard pages.
430    ///
431    /// Return `Err` if shared memory could not be allocated.
432    #[cfg(target_os = "windows")]
433    #[instrument(skip_all, parent = Span::current(), level= "Trace")]
434    pub fn new(min_size_bytes: usize) -> Result<Self> {
435        if min_size_bytes == 0 {
436            return Err(new_error!("Cannot create shared memory with size 0"));
437        }
438
439        let total_size = min_size_bytes
440            .checked_add(2 * PAGE_SIZE_USIZE)
441            .ok_or_else(|| new_error!("Memory required for sandbox exceeded {}", usize::MAX))?;
442
443        if total_size % PAGE_SIZE_USIZE != 0 {
444            return Err(new_error!(
445                "shared memory must be a multiple of {}",
446                PAGE_SIZE_USIZE
447            ));
448        }
449
450        // usize and isize are guaranteed to be the same size, and
451        // isize::MAX should be positive, so this cast should be safe.
452        if total_size > isize::MAX as usize {
453            return Err(HyperlightError::MemoryRequestTooBig(
454                total_size,
455                isize::MAX as usize,
456            ));
457        }
458
459        let mut dwmaximumsizehigh = 0;
460        let mut dwmaximumsizelow = 0;
461
462        if std::mem::size_of::<usize>() == 8 {
463            dwmaximumsizehigh = (total_size >> 32) as u32;
464            dwmaximumsizelow = (total_size & 0xFFFFFFFF) as u32;
465        }
466
467        // Allocate the memory use CreateFileMapping instead of VirtualAlloc
468        // This allows us to map the memory into the surrogate process using MapViewOfFile2
469
470        let flags = PAGE_READWRITE;
471
472        let handle = unsafe {
473            CreateFileMappingA(
474                INVALID_HANDLE_VALUE,
475                None,
476                flags,
477                dwmaximumsizehigh,
478                dwmaximumsizelow,
479                PCSTR::null(),
480            )?
481        };
482
483        if handle.is_invalid() {
484            log_then_return!(HyperlightError::MemoryAllocationFailed(
485                Error::last_os_error().raw_os_error()
486            ));
487        }
488
489        let file_map = FILE_MAP_ALL_ACCESS;
490        let addr = unsafe { MapViewOfFile(handle, file_map, 0, 0, 0) };
491
492        if addr.Value.is_null() {
493            log_then_return!(HyperlightError::MemoryAllocationFailed(
494                Error::last_os_error().raw_os_error()
495            ));
496        }
497
498        // Set the first and last pages to be guard pages
499
500        let mut unused_out_old_prot_flags = PAGE_PROTECTION_FLAGS(0);
501
502        // If the following calls to VirtualProtect are changed make sure to update the calls to VirtualProtectEx in surrogate_process_manager.rs
503
504        let first_guard_page_start = addr.Value;
505        if let Err(e) = unsafe {
506            VirtualProtect(
507                first_guard_page_start,
508                PAGE_SIZE_USIZE,
509                PAGE_NOACCESS,
510                &mut unused_out_old_prot_flags,
511            )
512        } {
513            log_then_return!(WindowsAPIError(e.clone()));
514        }
515
516        let last_guard_page_start = unsafe { addr.Value.add(total_size - PAGE_SIZE_USIZE) };
517        if let Err(e) = unsafe {
518            VirtualProtect(
519                last_guard_page_start,
520                PAGE_SIZE_USIZE,
521                PAGE_NOACCESS,
522                &mut unused_out_old_prot_flags,
523            )
524        } {
525            log_then_return!(WindowsAPIError(e.clone()));
526        }
527
528        Ok(Self {
529            // HostMapping is only non-Send/Sync because raw pointers
530            // are not ("as a lint", as the Rust docs say). We don't
531            // want to mark HostMapping Send/Sync immediately, because
532            // that could socially imply that it's "safe" to use
533            // unsafe accesses from multiple threads at once. Instead, we
534            // directly impl Send and Sync on this type. Since this
535            // type does have Send and Sync manually impl'd, the Arc
536            // is not pointless as the lint suggests.
537            #[allow(clippy::arc_with_non_send_sync)]
538            region: Arc::new(HostMapping {
539                ptr: addr.Value as *mut u8,
540                size: total_size,
541                handle,
542            }),
543        })
544    }
545
546    /// Internal helper method to get the backing memory as a mutable slice.
547    ///
548    /// # Safety
549    /// As per std::slice::from_raw_parts_mut:
550    /// - self.base_addr() must be valid for both reads and writes for
551    ///   self.mem_size() * mem::size_of::<u8>() many bytes, and it
552    ///   must be properly aligned.
553    ///
554    ///   The rules on validity are still somewhat unspecified, but we
555    ///   assume that the result of our calls to mmap/CreateFileMappings may
556    ///   be considered a single "allocated object". The use of
557    ///   non-atomic accesses is alright from a Safe Rust standpoint,
558    ///   because SharedMemoryBuilder is  not Sync.
559    /// - self.base_addr() must point to self.mem_size() consecutive
560    ///   properly initialized values of type u8
561    ///
562    ///   Again, the exact provenance restrictions on what is
563    ///   considered to be initialized values are unclear, but we make
564    ///   sure to use mmap(MAP_ANONYMOUS) and
565    ///   CreateFileMapping(SEC_COMMIT), so the pages in question are
566    ///   zero-initialized, which we hope counts for u8.
567    /// - The memory referenced by the returned slice must not be
568    ///   accessed through any other pointer (not derived from the
569    ///   return value) for the duration of the lifetime 'a. Both read
570    ///   and write accesses are forbidden.
571    ///
572    ///   Accesses from Safe Rust necessarily follow this rule,
573    ///   because the returned slice's lifetime is the same as that of
574    ///   a mutable borrow of self.
575    /// - The total size self.mem_size() * mem::size_of::<u8>() of the
576    ///   slice must be no larger than isize::MAX, and adding that
577    ///   size to data must not "wrap around" the address space. See
578    ///   the safety documentation of pointer::offset.
579    ///
580    ///   This is ensured by a check in ::new()
581    pub(super) fn as_mut_slice(&mut self) -> &mut [u8] {
582        unsafe { std::slice::from_raw_parts_mut(self.base_ptr(), self.mem_size()) }
583    }
584
585    /// Internal helper method to get the backing memory as a slice.
586    ///
587    /// # Safety
588    /// See the discussion on as_mut_slice, with the third point
589    /// replaced by:
590    /// - The memory referenced by the returned slice must not be
591    ///   mutated for the duration of lifetime 'a, except inside an
592    ///   UnsafeCell.
593    ///
594    ///   Host accesses from Safe Rust necessarily follow this rule,
595    ///   because the returned slice's lifetime is the same as that of
596    ///   a borrow of self, preventing mutations via other methods.
597    #[instrument(skip_all, parent = Span::current(), level= "Trace")]
598    pub fn as_slice<'a>(&'a self) -> &'a [u8] {
599        unsafe { std::slice::from_raw_parts(self.base_ptr(), self.mem_size()) }
600    }
601
602    /// Copies all bytes from `src` to `self` starting at offset
603    #[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")]
604    pub fn copy_from_slice(&mut self, src: &[u8], offset: usize) -> Result<()> {
605        let data = self.as_mut_slice();
606        bounds_check!(offset, src.len(), data.len());
607        data[offset..offset + src.len()].copy_from_slice(src);
608        Ok(())
609    }
610
    // Bounds-checked little-endian readers, one per primitive integer
    // type: `read_<ty>(&self, offset) -> Result<ty>`.
    generate_reader!(read_u8, u8);
    generate_reader!(read_i8, i8);
    generate_reader!(read_u16, u16);
    generate_reader!(read_i16, i16);
    generate_reader!(read_u32, u32);
    generate_reader!(read_i32, i32);
    generate_reader!(read_u64, u64);
    generate_reader!(read_i64, i64);
    generate_reader!(read_usize, usize);
    generate_reader!(read_isize, isize);

    // Bounds-checked little-endian writers, one per primitive integer
    // type: `write_<ty>(&mut self, offset, value) -> Result<()>`.
    generate_writer!(write_u8, u8);
    generate_writer!(write_i8, i8);
    generate_writer!(write_u16, u16);
    generate_writer!(write_i16, i16);
    generate_writer!(write_u32, u32);
    generate_writer!(write_i32, i32);
    generate_writer!(write_u64, u64);
    generate_writer!(write_i64, i64);
    generate_writer!(write_usize, usize);
    generate_writer!(write_isize, isize);
632
633    /// Convert the ExclusiveSharedMemory, which may be freely
634    /// modified, into a GuestSharedMemory, which may be somewhat
635    /// freely modified (mostly by the guest), and a HostSharedMemory,
636    /// which may only make certain kinds of accesses that do not race
637    /// in the presence of malicious code inside the guest mutating
638    /// the GuestSharedMemory.
639    pub fn build(self) -> (HostSharedMemory, GuestSharedMemory) {
640        let lock = Arc::new(RwLock::new(()));
641        let hshm = HostSharedMemory {
642            region: self.region.clone(),
643            lock: lock.clone(),
644        };
645        (
646            hshm,
647            GuestSharedMemory {
648                region: self.region.clone(),
649                lock,
650            },
651        )
652    }
653
    /// Gets the file handle of the shared memory region for this Sandbox
    ///
    /// The handle is a copy of the one stored in the underlying
    /// `HostMapping`, which closes it when it is dropped; callers should
    /// therefore not close the returned handle themselves.
    #[cfg(target_os = "windows")]
    pub fn get_mmap_file_handle(&self) -> HANDLE {
        self.region.handle
    }
659
660    /// Create a [`HostSharedMemory`] view of this region without
661    /// consuming `self`. Used in tests where the full `build()` /
662    /// `evolve()` pipeline is not available.
663    #[cfg(all(test, feature = "guest-counter"))]
664    pub(crate) fn as_host_shared_memory(&self) -> HostSharedMemory {
665        let lock = Arc::new(RwLock::new(()));
666        HostSharedMemory {
667            region: self.region.clone(),
668            lock,
669        }
670    }
671}
672
673fn mapping_at(
674    s: &impl SharedMemory,
675    gpa: u64,
676    size: usize,
677    region_type: MemoryRegionType,
678    flags: MemoryRegionFlags,
679) -> MemoryRegion {
680    let guest_base = gpa as usize;
681
682    MemoryRegion {
683        guest_region: guest_base..(guest_base + size),
684        host_region: s.host_region_base()
685            ..<HostGuestMemoryRegion as MemoryRegionKind>::add(s.host_region_base(), size),
686        region_type,
687        flags,
688    }
689}
690
691impl GuestSharedMemory {
692    /// Create a [`super::memory_region::MemoryRegion`] structure
693    /// suitable for mapping this region into a VM
694    pub(crate) fn mapping_at(
695        &self,
696        guest_base: u64,
697        region_type: MemoryRegionType,
698    ) -> MemoryRegion {
699        let flags = match region_type {
700            MemoryRegionType::Scratch => {
701                MemoryRegionFlags::READ | MemoryRegionFlags::WRITE | MemoryRegionFlags::EXECUTE
702            }
703            #[cfg(unshared_snapshot_mem)]
704            MemoryRegionType::Snapshot => {
705                MemoryRegionFlags::READ | MemoryRegionFlags::WRITE | MemoryRegionFlags::EXECUTE
706            }
707            #[allow(clippy::panic)]
708            // This will not ever actually panic: the only places this
709            // is called are HyperlightVm::update_snapshot_mapping and
710            // HyperlightVm::update_scratch_mapping. The latter
711            // statically uses the Scratch region type, and the former
712            // does not use this at all when the unshared_snapshot_mem
713            // feature is not set, since in that case the scratch
714            // mapping type is ReadonlySharedMemory, not
715            // GuestSharedMemory.
716            _ => panic!(
717                "GuestSharedMemory::mapping_at should only be used for Scratch or Snapshot regions"
718            ),
719        };
720        mapping_at(self, guest_base, self.mem_size(), region_type, flags)
721    }
722}
723
724/// A trait that abstracts over the particular kind of SharedMemory,
725/// used when invoking operations from Rust that absolutely must have
726/// exclusive control over the shared memory for correctness +
727/// performance, like snapshotting.
728pub trait SharedMemory {
729    /// Return a readonly reference to the host mapping backing this SharedMemory
730    fn region(&self) -> &HostMapping;
731
732    /// Return the base address of the host mapping of this
733    /// region. Following the general Rust philosophy, this does not
734    /// need to be marked as `unsafe` because doing anything with this
735    /// pointer itself requires `unsafe`.
736    fn base_addr(&self) -> usize {
737        self.region().ptr as usize + PAGE_SIZE_USIZE
738    }
739
740    /// Return the base address of the host mapping of this region as
741    /// a pointer. Following the general Rust philosophy, this does
742    /// not need to be marked as `unsafe` because doing anything with
743    /// this pointer itself requires `unsafe`.
744    fn base_ptr(&self) -> *mut u8 {
745        self.region().ptr.wrapping_add(PAGE_SIZE_USIZE)
746    }
747
748    /// Return the length of usable memory contained in `self`.
749    /// The returned size does not include the size of the surrounding
750    /// guard pages.
751    fn mem_size(&self) -> usize {
752        self.region().size - 2 * PAGE_SIZE_USIZE
753    }
754
755    /// Return the raw base address of the host mapping, including the
756    /// guard pages.
757    fn raw_ptr(&self) -> *mut u8 {
758        self.region().ptr
759    }
760
761    /// Return the raw size of the host mapping, including the guard
762    /// pages.
763    fn raw_mem_size(&self) -> usize {
764        self.region().size
765    }
766
767    /// Extract a base address that can be mapped into a VM for this
768    /// SharedMemory.
769    ///
770    /// On Linux this returns a raw `usize` pointer. On Windows it
771    /// returns a `HostRegionBase` (see `super::memory_region`)
772    /// that carries the file-mapping handle metadata needed by WHP.
773    fn host_region_base(&self) -> <HostGuestMemoryRegion as MemoryRegionKind>::HostBaseType {
774        #[cfg(not(windows))]
775        {
776            self.base_addr()
777        }
778        #[cfg(windows)]
779        {
780            super::memory_region::HostRegionBase {
781                from_handle: self.region().handle.into(),
782                handle_base: self.region().ptr as usize,
783                handle_size: self.region().size,
784                offset: PAGE_SIZE_USIZE,
785            }
786        }
787    }
788
789    /// Return the end address of the host region (base + usable size).
790    fn host_region_end(&self) -> <HostGuestMemoryRegion as MemoryRegionKind>::HostBaseType {
791        <HostGuestMemoryRegion as MemoryRegionKind>::add(self.host_region_base(), self.mem_size())
792    }
793
794    /// Run some code with exclusive access to the SharedMemory
795    /// underlying this.  If the SharedMemory is not an
796    /// ExclusiveSharedMemory, any concurrent accesses to the relevant
797    /// HostSharedMemory/GuestSharedMemory may make this fail, or be
798    /// made to fail by this, and should be avoided.
799    fn with_exclusivity<T, F: FnOnce(&mut ExclusiveSharedMemory) -> T>(
800        &mut self,
801        f: F,
802    ) -> Result<T>;
803
804    /// Run some code that is allowed to access the contents of the
805    /// SharedMemory as if it is a normal slice.  By default, this is
806    /// implemented via [`SharedMemory::with_exclusivity`], which is
807    /// the correct implementation for a memory that can be mutated,
808    /// but a [`ReadonlySharedMemory`], can support this.
809    fn with_contents<T, F: FnOnce(&[u8]) -> T>(&mut self, f: F) -> Result<T> {
810        self.with_exclusivity(|m| f(m.as_slice()))
811    }
812
813    /// Zero a shared memory region
814    fn zero(&mut self) -> Result<()> {
815        self.with_exclusivity(|e| {
816            #[allow(unused_mut)] // unused on some platforms, although not others
817            let mut do_copy = true;
818            // TODO: Compare & add heuristic thresholds: mmap, MADV_DONTNEED, MADV_REMOVE, MADV_FREE (?)
819            // TODO: Find a similar lazy zeroing approach that works on MSHV.
820            //       (See Note [Keeping mappings in sync between userspace and the guest])
821            #[cfg(all(target_os = "linux", feature = "kvm", not(any(feature = "mshv3"))))]
822            unsafe {
823                let ret = libc::madvise(
824                    e.region.ptr as *mut libc::c_void,
825                    e.region.size,
826                    libc::MADV_DONTNEED,
827                );
828                if ret == 0 {
829                    do_copy = false;
830                }
831            }
832            if do_copy {
833                e.as_mut_slice().fill(0);
834            }
835        })
836    }
837}
838
839impl SharedMemory for ExclusiveSharedMemory {
840    fn region(&self) -> &HostMapping {
841        &self.region
842    }
843    fn with_exclusivity<T, F: FnOnce(&mut ExclusiveSharedMemory) -> T>(
844        &mut self,
845        f: F,
846    ) -> Result<T> {
847        Ok(f(self))
848    }
849}
850
851impl SharedMemory for GuestSharedMemory {
852    fn region(&self) -> &HostMapping {
853        &self.region
854    }
855    fn with_exclusivity<T, F: FnOnce(&mut ExclusiveSharedMemory) -> T>(
856        &mut self,
857        f: F,
858    ) -> Result<T> {
859        let guard = self
860            .lock
861            .try_write()
862            .map_err(|e| new_error!("Error locking at {}:{}: {}", file!(), line!(), e))?;
863        let mut excl = ExclusiveSharedMemory {
864            region: self.region.clone(),
865        };
866        let ret = f(&mut excl);
867        drop(excl);
868        drop(guard);
869        Ok(ret)
870    }
871}
872
/// Marker for types whose every bit pattern is a valid value.
/// A type must carry this marker before a value of it may safely be read
/// out of the sandbox through the HostSharedMemory.
///
/// # Safety
/// Implement this only for types that have no invalid bit patterns:
/// any (non-undef/poison) value of the right size must be
/// transmutable to the type.
pub unsafe trait AllValid {}

/// Emits an `unsafe impl AllValid` for each listed type.
macro_rules! impl_all_valid {
    ($($ty:ty),* $(,)?) => {
        $(
            // SAFETY: asserted by whoever lists the type in the invocation
            // below; each one is a plain integer or a byte array.
            unsafe impl AllValid for $ty {}
        )*
    };
}

impl_all_valid!(u8, u16, u32, u64, i8, i16, i32, i64, [u8; 16]);
891
892impl HostSharedMemory {
893    /// Read a value of type T, whose representation is the same
894    /// between the sandbox and the host, and which has no invalid bit
895    /// patterns
896    pub fn read<T: AllValid>(&self, offset: usize) -> Result<T> {
897        bounds_check!(offset, std::mem::size_of::<T>(), self.mem_size());
898        unsafe {
899            let mut ret: core::mem::MaybeUninit<T> = core::mem::MaybeUninit::uninit();
900            {
901                let slice: &mut [u8] = core::slice::from_raw_parts_mut(
902                    ret.as_mut_ptr() as *mut u8,
903                    std::mem::size_of::<T>(),
904                );
905                self.copy_to_slice(slice, offset)?;
906            }
907            Ok(ret.assume_init())
908        }
909    }
910
911    /// Write a value of type T, whose representation is the same
912    /// between the sandbox and the host, and which has no invalid bit
913    /// patterns
914    pub fn write<T: AllValid>(&self, offset: usize, data: T) -> Result<()> {
915        bounds_check!(offset, std::mem::size_of::<T>(), self.mem_size());
916        unsafe {
917            let slice: &[u8] = core::slice::from_raw_parts(
918                core::ptr::addr_of!(data) as *const u8,
919                std::mem::size_of::<T>(),
920            );
921            self.copy_from_slice(slice, offset)?;
922        }
923        Ok(())
924    }
925
926    /// Copy the contents of the slice into the sandbox at the
927    /// specified offset
928    pub fn copy_to_slice(&self, slice: &mut [u8], offset: usize) -> Result<()> {
929        bounds_check!(offset, slice.len(), self.mem_size());
930        let base = self.base_ptr().wrapping_add(offset);
931        let guard = self
932            .lock
933            .try_read()
934            .map_err(|e| new_error!("Error locking at {}:{}: {}", file!(), line!(), e))?;
935
936        const CHUNK: usize = size_of::<u128>();
937        let len = slice.len();
938        let mut i = 0;
939
940        // Handle unaligned head bytes until we reach u128 alignment.
941        // Note: align_offset can return usize::MAX if alignment is impossible.
942        // In that case, head_len = len via .min(), so we fall back to byte-by-byte
943        // operations for the entire slice.
944        let align_offset = base.align_offset(align_of::<u128>());
945        let head_len = align_offset.min(len);
946        while i < head_len {
947            unsafe {
948                slice[i] = base.add(i).read_volatile();
949            }
950            i += 1;
951        }
952
953        // Read aligned u128 chunks
954        // SAFETY: After processing head_len bytes, base.add(i) is u128-aligned.
955        // We use write_unaligned for the destination since the slice may not be u128-aligned.
956        let dst = slice.as_mut_ptr();
957        while i + CHUNK <= len {
958            unsafe {
959                let value = (base.add(i) as *const u128).read_volatile();
960                std::ptr::write_unaligned(dst.add(i) as *mut u128, value);
961            }
962            i += CHUNK;
963        }
964
965        // Handle remaining tail bytes
966        while i < len {
967            unsafe {
968                slice[i] = base.add(i).read_volatile();
969            }
970            i += 1;
971        }
972
973        drop(guard);
974        Ok(())
975    }
976
977    /// Copy the contents of the sandbox at the specified offset into
978    /// the slice
979    pub fn copy_from_slice(&self, slice: &[u8], offset: usize) -> Result<()> {
980        bounds_check!(offset, slice.len(), self.mem_size());
981        let base = self.base_ptr().wrapping_add(offset);
982        let guard = self
983            .lock
984            .try_read()
985            .map_err(|e| new_error!("Error locking at {}:{}: {}", file!(), line!(), e))?;
986
987        const CHUNK: usize = size_of::<u128>();
988        let len = slice.len();
989        let mut i = 0;
990
991        // Handle unaligned head bytes until we reach u128 alignment.
992        // Note: align_offset can return usize::MAX if alignment is impossible.
993        // In that case, head_len = len via .min(), so we fall back to byte-by-byte
994        // operations for the entire slice.
995        let align_offset = base.align_offset(align_of::<u128>());
996        let head_len = align_offset.min(len);
997        while i < head_len {
998            unsafe {
999                base.add(i).write_volatile(slice[i]);
1000            }
1001            i += 1;
1002        }
1003
1004        // Write aligned u128 chunks
1005        // SAFETY: After processing head_len bytes, base.add(i) is u128-aligned.
1006        // We use read_unaligned for the source since the slice may not be u128-aligned.
1007        let src = slice.as_ptr();
1008        while i + CHUNK <= len {
1009            unsafe {
1010                let value = std::ptr::read_unaligned(src.add(i) as *const u128);
1011                (base.add(i) as *mut u128).write_volatile(value);
1012            }
1013            i += CHUNK;
1014        }
1015
1016        // Handle remaining tail bytes
1017        while i < len {
1018            unsafe {
1019                base.add(i).write_volatile(slice[i]);
1020            }
1021            i += 1;
1022        }
1023
1024        drop(guard);
1025        Ok(())
1026    }
1027
1028    /// Fill the memory in the range `[offset, offset + len)` with `value`
1029    #[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")]
1030    pub fn fill(&mut self, value: u8, offset: usize, len: usize) -> Result<()> {
1031        bounds_check!(offset, len, self.mem_size());
1032        let base = self.base_ptr().wrapping_add(offset);
1033        let guard = self
1034            .lock
1035            .try_read()
1036            .map_err(|e| new_error!("Error locking at {}:{}: {}", file!(), line!(), e))?;
1037
1038        const CHUNK: usize = size_of::<u128>();
1039        let value_u128 = u128::from_ne_bytes([value; CHUNK]);
1040        let mut i = 0;
1041
1042        // Handle unaligned head bytes until we reach u128 alignment.
1043        // Note: align_offset can return usize::MAX if alignment is impossible.
1044        // In that case, head_len = len via .min(), so we fall back to byte-by-byte
1045        // operations for the entire slice.
1046        let align_offset = base.align_offset(align_of::<u128>());
1047        let head_len = align_offset.min(len);
1048        while i < head_len {
1049            unsafe {
1050                base.add(i).write_volatile(value);
1051            }
1052            i += 1;
1053        }
1054
1055        // Write aligned u128 chunks
1056        // SAFETY: After processing head_len bytes, base.add(i) is u128-aligned
1057        while i + CHUNK <= len {
1058            unsafe {
1059                (base.add(i) as *mut u128).write_volatile(value_u128);
1060            }
1061            i += CHUNK;
1062        }
1063
1064        // Handle remaining tail bytes
1065        while i < len {
1066            unsafe {
1067                base.add(i).write_volatile(value);
1068            }
1069            i += 1;
1070        }
1071
1072        drop(guard);
1073        Ok(())
1074    }
1075
1076    /// Pushes the given data onto shared memory to the buffer at the given offset.
1077    /// NOTE! buffer_start_offset must point to the beginning of the buffer
1078    #[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")]
1079    pub fn push_buffer(
1080        &mut self,
1081        buffer_start_offset: usize,
1082        buffer_size: usize,
1083        data: &[u8],
1084    ) -> Result<()> {
1085        let stack_pointer_rel = self.read::<u64>(buffer_start_offset)? as usize;
1086        let buffer_size_u64: u64 = buffer_size.try_into()?;
1087
1088        if stack_pointer_rel > buffer_size || stack_pointer_rel < 8 {
1089            return Err(new_error!(
1090                "Unable to push data to buffer: Stack pointer is out of bounds. Stack pointer: {}, Buffer size: {}",
1091                stack_pointer_rel,
1092                buffer_size_u64
1093            ));
1094        }
1095
1096        let size_required = data.len() + 8;
1097        let size_available = buffer_size - stack_pointer_rel;
1098
1099        if size_required > size_available {
1100            return Err(new_error!(
1101                "Not enough space in buffer to push data. Required: {}, Available: {}",
1102                size_required,
1103                size_available
1104            ));
1105        }
1106
1107        // get absolute
1108        let stack_pointer_abs = stack_pointer_rel + buffer_start_offset;
1109
1110        // write the actual data to the top of stack
1111        self.copy_from_slice(data, stack_pointer_abs)?;
1112
1113        // write the offset to the newly written data, to the top of stack.
1114        // this is used when popping the stack, to know how far back to jump
1115        self.write::<u64>(stack_pointer_abs + data.len(), stack_pointer_rel as u64)?;
1116
1117        // update stack pointer to point to the next free address
1118        self.write::<u64>(
1119            buffer_start_offset,
1120            (stack_pointer_rel + data.len() + 8) as u64,
1121        )?;
1122        Ok(())
1123    }
1124
1125    /// Pop the top element of the ring as raw bytes. Unlike
1126    /// [`Self::try_pop_buffer_into`], this doesn't peek at the element's
1127    /// contents — the element size is recovered from the trailing
1128    /// back-pointer that [`Self::push_buffer`] wrote.
1129    pub fn try_pop_buffer_raw(
1130        &mut self,
1131        buffer_start_offset: usize,
1132        buffer_size: usize,
1133    ) -> Result<Vec<u8>> {
1134        let stack_pointer_rel = self.read::<u64>(buffer_start_offset)? as usize;
1135
1136        if stack_pointer_rel > buffer_size || stack_pointer_rel < 16 {
1137            return Err(new_error!(
1138                "try_pop_buffer_raw: stack pointer {} out of bounds (size {})",
1139                stack_pointer_rel,
1140                buffer_size
1141            ));
1142        }
1143
1144        let back_ptr_abs = stack_pointer_rel + buffer_start_offset - 8;
1145        let element_offset_rel = self.read::<u64>(back_ptr_abs)? as usize;
1146
1147        if element_offset_rel < 8 || element_offset_rel > stack_pointer_rel.saturating_sub(8) {
1148            return Err(new_error!(
1149                "try_pop_buffer_raw: back-pointer {} outside [8, {}]",
1150                element_offset_rel,
1151                stack_pointer_rel.saturating_sub(8)
1152            ));
1153        }
1154
1155        let element_size = stack_pointer_rel - element_offset_rel - 8;
1156        let element_abs = element_offset_rel + buffer_start_offset;
1157        let mut out = vec![0u8; element_size];
1158        self.copy_to_slice(&mut out, element_abs)?;
1159
1160        // Pop: rewind stack pointer.
1161        self.write::<u64>(buffer_start_offset, element_offset_rel as u64)?;
1162        // Zero out the popped slot + its back-pointer.
1163        self.fill(0, element_abs, stack_pointer_rel - element_offset_rel)?;
1164
1165        Ok(out)
1166    }
1167
1168    /// Pops the given given buffer into a `T` and returns it.
1169    /// NOTE! the data must be a size-prefixed flatbuffer, and
1170    /// buffer_start_offset must point to the beginning of the buffer
1171    pub fn try_pop_buffer_into<T>(
1172        &mut self,
1173        buffer_start_offset: usize,
1174        buffer_size: usize,
1175    ) -> Result<T>
1176    where
1177        T: for<'b> TryFrom<&'b [u8]>,
1178    {
1179        // get the stackpointer
1180        let stack_pointer_rel = self.read::<u64>(buffer_start_offset)? as usize;
1181
1182        if stack_pointer_rel > buffer_size || stack_pointer_rel < 16 {
1183            return Err(new_error!(
1184                "Unable to pop data from buffer: Stack pointer is out of bounds. Stack pointer: {}, Buffer size: {}",
1185                stack_pointer_rel,
1186                buffer_size
1187            ));
1188        }
1189
1190        // make it absolute
1191        let last_element_offset_abs = stack_pointer_rel + buffer_start_offset;
1192
1193        // go back 8 bytes to get offset to element on top of stack
1194        let last_element_offset_rel: usize =
1195            self.read::<u64>(last_element_offset_abs - 8)? as usize;
1196
1197        // Validate element offset (guest-writable): must be in [8, stack_pointer_rel - 16]
1198        // to leave room for the 8-byte back-pointer plus at least 8 bytes of element data
1199        // (the minimum for a size-prefixed flatbuffer: 4-byte prefix + 4-byte root offset).
1200        if last_element_offset_rel > stack_pointer_rel.saturating_sub(16)
1201            || last_element_offset_rel < 8
1202        {
1203            return Err(new_error!(
1204                "Corrupt buffer back-pointer: element offset {} is outside valid range [8, {}].",
1205                last_element_offset_rel,
1206                stack_pointer_rel.saturating_sub(16),
1207            ));
1208        }
1209
1210        // make it absolute
1211        let last_element_offset_abs = last_element_offset_rel + buffer_start_offset;
1212
1213        // Max bytes the element can span (excluding the 8-byte back-pointer).
1214        let max_element_size = stack_pointer_rel - last_element_offset_rel - 8;
1215
1216        // Get the size of the flatbuffer buffer from memory
1217        let fb_buffer_size = {
1218            let raw_prefix = self.read::<u32>(last_element_offset_abs)?;
1219            // flatbuffer byte arrays are prefixed by 4 bytes indicating
1220            // the remaining size; add 4 for the prefix itself.
1221            let total = raw_prefix.checked_add(4).ok_or_else(|| {
1222                new_error!(
1223                    "Corrupt buffer size prefix: value {} overflows when adding 4-byte header.",
1224                    raw_prefix
1225                )
1226            })?;
1227            usize::try_from(total)
1228        }?;
1229
1230        if fb_buffer_size > max_element_size {
1231            return Err(new_error!(
1232                "Corrupt buffer size prefix: flatbuffer claims {} bytes but the element slot is only {} bytes.",
1233                fb_buffer_size,
1234                max_element_size
1235            ));
1236        }
1237
1238        let mut result_buffer = vec![0; fb_buffer_size];
1239
1240        self.copy_to_slice(&mut result_buffer, last_element_offset_abs)?;
1241        let to_return = T::try_from(result_buffer.as_slice()).map_err(|_e| {
1242            new_error!(
1243                "pop_buffer_into: failed to convert buffer to {}",
1244                type_name::<T>()
1245            )
1246        })?;
1247
1248        // update the stack pointer to point to the element we just popped off since that is now free
1249        self.write::<u64>(buffer_start_offset, last_element_offset_rel as u64)?;
1250
1251        // zero out the memory we just popped off
1252        let num_bytes_to_zero = stack_pointer_rel - last_element_offset_rel;
1253        self.fill(0, last_element_offset_abs, num_bytes_to_zero)?;
1254
1255        Ok(to_return)
1256    }
1257}
1258
1259impl SharedMemory for HostSharedMemory {
1260    fn region(&self) -> &HostMapping {
1261        &self.region
1262    }
1263    fn with_exclusivity<T, F: FnOnce(&mut ExclusiveSharedMemory) -> T>(
1264        &mut self,
1265        f: F,
1266    ) -> Result<T> {
1267        let guard = self
1268            .lock
1269            .try_write()
1270            .map_err(|e| new_error!("Error locking at {}:{}: {}", file!(), line!(), e))?;
1271        let mut excl = ExclusiveSharedMemory {
1272            region: self.region.clone(),
1273        };
1274        let ret = f(&mut excl);
1275        drop(excl);
1276        drop(guard);
1277        Ok(ret)
1278    }
1279}
1280
/// A ReadonlySharedMemory is a different kind of shared memory,
/// separate from the exclusive/host/guest lifecycle, used to
/// represent read-only mappings of snapshot pages into the guest
/// efficiently.
///
/// Cloning is derived; the clone shares the same `Arc<HostMapping>`
/// rather than copying the underlying memory.
#[derive(Clone, Debug)]
pub struct ReadonlySharedMemory {
    /// The host mapping holding the contents, shared between clones.
    region: Arc<HostMapping>,
    /// If `Some`, only this many bytes are mapped into guest PA space
    /// by `mapping_at`. If `None`, the full `mem_size()` is mapped.
    #[cfg_attr(unshared_snapshot_mem, allow(dead_code))]
    guest_mapped_size: Option<usize>,
}
// SAFETY: HostMapping is only non-Send/Sync (which in turn stops
// ReadonlySharedMemory from being automatically Send/Sync) because raw
// pointers are not ("as a lint", as the Rust docs say). We don't want
// to mark HostMapping itself Send/Sync, because that could socially
// imply that it's "safe" to use unsafe accesses from multiple threads
// at once in more cases, including ones that don't actually ensure
// immutability/synchronisation. Since ReadonlySharedMemory can only be
// accessed by reading, and reading concurrently from multiple threads
// is not racy, ReadonlySharedMemory can be Send and Sync.
//
// NOTE(review): `ReadonlySharedMemory::mapping_at` registers this
// memory with the WRITE flag set. Confirm that nothing can mutate the
// pages while host threads read them, since the argument above depends
// on the contents being read-only.
unsafe impl Send for ReadonlySharedMemory {}
unsafe impl Sync for ReadonlySharedMemory {}
1305
1306impl ReadonlySharedMemory {
1307    pub(crate) fn from_bytes(contents: &[u8]) -> Result<Self> {
1308        let mut anon = ExclusiveSharedMemory::new(contents.len())?;
1309        anon.copy_from_slice(contents, 0)?;
1310        Ok(ReadonlySharedMemory {
1311            region: anon.region,
1312            guest_mapped_size: None,
1313        })
1314    }
1315
1316    /// The number of bytes that should be mapped into guest PA space.
1317    /// Returns `guest_mapped_size` if set, otherwise `mem_size()`.
1318    #[cfg(not(unshared_snapshot_mem))]
1319    pub(crate) fn guest_mapped_size(&self) -> usize {
1320        self.guest_mapped_size.unwrap_or_else(|| self.mem_size())
1321    }
1322
1323    pub(crate) fn as_slice(&self) -> &[u8] {
1324        unsafe { std::slice::from_raw_parts(self.base_ptr(), self.mem_size()) }
1325    }
1326
1327    #[cfg(unshared_snapshot_mem)]
1328    pub(crate) fn copy_to_writable(&self) -> Result<ExclusiveSharedMemory> {
1329        let mut writable = ExclusiveSharedMemory::new(self.mem_size())?;
1330        writable.copy_from_slice(self.as_slice(), 0)?;
1331        Ok(writable)
1332    }
1333
1334    #[cfg(not(unshared_snapshot_mem))]
1335    pub(crate) fn build(self) -> (Self, Self) {
1336        (self.clone(), self)
1337    }
1338
1339    #[cfg(not(unshared_snapshot_mem))]
1340    pub(crate) fn mapping_at(
1341        &self,
1342        guest_base: u64,
1343        region_type: MemoryRegionType,
1344    ) -> MemoryRegion {
1345        #[allow(clippy::panic)]
1346        // This will not ever actually panic: the only place this is
1347        // called is HyperlightVm::update_snapshot_mapping, which
1348        // always calls it with the Snapshot region type.
1349        if region_type != MemoryRegionType::Snapshot {
1350            panic!("ReadonlySharedMemory::mapping_at should only be used for Snapshot regions");
1351        }
1352        // Register snapshot mem RWX at the KVM level. Upstream marked
1353        // this RX-only and relied on guest-PT CoW for write semantics,
1354        // which trapped first writes and resolved them to scratch frames
1355        // — driving a slow leak via prim_alloc. The underlying mmap is
1356        // already PROT_READ|PROT_WRITE; `ReadonlySharedMemory` is a
1357        // host-side Rust-API artifact, not a KVM-level constraint.
1358        mapping_at(
1359            self,
1360            guest_base,
1361            self.guest_mapped_size(),
1362            region_type,
1363            MemoryRegionFlags::READ | MemoryRegionFlags::WRITE | MemoryRegionFlags::EXECUTE,
1364        )
1365    }
1366}
1367
1368impl SharedMemory for ReadonlySharedMemory {
1369    fn region(&self) -> &HostMapping {
1370        &self.region
1371    }
1372    // There's no way to get exclusive (and therefore writable) access
1373    // to a ReadonlySharedMemory.
1374    fn with_exclusivity<T, F: FnOnce(&mut ExclusiveSharedMemory) -> T>(
1375        &mut self,
1376        _: F,
1377    ) -> Result<T> {
1378        Err(new_error!(
1379            "Cannot take exclusive access to a ReadonlySharedMemory"
1380        ))
1381    }
1382    // However, just access to the contents as a slice is doable
1383    fn with_contents<T, F: FnOnce(&[u8]) -> T>(&mut self, f: F) -> Result<T> {
1384        Ok(f(self.as_slice()))
1385    }
1386}
1387
1388impl<S: SharedMemory> PartialEq<S> for ReadonlySharedMemory {
1389    fn eq(&self, other: &S) -> bool {
1390        self.raw_ptr() == other.raw_ptr()
1391    }
1392}