Skip to main content

nub_host_kvm/sandbox/
snapshot.rs

1/*
2Copyright 2025 The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17use std::sync::atomic::{AtomicU64, Ordering};
18
19use nub_host_common::layout::{scratch_base_gpa, scratch_base_gva};
20use nub_host_common::vmem;
21use nub_host_common::vmem::{BasicMapping, Mapping, MappingKind};
22use tracing::{Span, instrument};
23
24use crate::HyperlightError::MemoryRegionSizeMismatch;
25use crate::Result;
26use crate::mem::exe::{ExeInfo, LoadInfo};
27use crate::mem::memory_region::{GuestMemoryRegion, MemoryRegion, MemoryRegionFlags};
28use crate::mem::mgr::GuestPageTableBuffer;
29use crate::mem::shared_mem::ReadonlySharedMemory;
30use crate::sandbox::SandboxConfiguration;
31use crate::sandbox::uninitialized::{GuestBinary, GuestEnvironment};
32
/// Process-wide counter that `Snapshot::from_env` increments (relaxed
/// ordering) once per construction that gets as far as hashing. Per
/// the comment at the increment site, this exists so that
/// `MultiUseSandbox::id` values stay unique across constructions.
pub(super) static SANDBOX_CONFIGURATION_COUNTER: AtomicU64 = AtomicU64::new(0);

/// Size in bytes of a single page-table entry. `read_entry` uses it as
/// the length of the slice it reads out of the snapshot memory.
const PTE_SIZE: usize = size_of::<vmem::PageTableEntry>();
36
/// Presently, a snapshot can be of a preinitialised sandbox, which
/// still needs an initialise function called in order to determine
/// how to call into it, or of an already-properly-initialised sandbox
/// which can be immediately called into. This keeps track of the
/// difference.
///
/// The type is a small `Copy` value; it derives `Debug` so that it can
/// be logged and used in assertions.
///
/// TODO: this should not necessarily be around in the long term:
/// ideally we would just preinitialise earlier in the snapshot
/// creation process and never need this.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum NextAction {
    /// A sandbox in the preinitialise state still needs to be
    /// initialised by calling the initialise function. The payload is
    /// the address to call; `Snapshot::from_env` stores the guest
    /// entrypoint GVA here.
    Initialise(u64),
    /// A sandbox in the ready state can immediately be called into,
    /// using the dispatch function pointer carried as the payload.
    Call(u64),
    /// Only when compiling for tests: a sandbox that cannot actually
    /// be used
    #[cfg(test)]
    None,
}
59
/// Initial guest memory layout + page tables produced from a binary.
///
/// Post-Stage-F.CoW: this is purely a build-time artifact — the
/// snapshot/restore rollback API that motivated rebuilding from a
/// running sandbox is gone, so `Snapshot::from_env` is the only
/// constructor and most accessors have been trimmed.
///
/// Two snapshots compare equal (`PartialEq`) when their `hash` fields
/// are equal; no other field takes part in the comparison.
pub struct Snapshot {
    /// Layout object for the sandbox.
    layout: crate::mem::layout::SandboxMemoryLayout,
    /// Memory image of the sandbox. As built by `from_env` this is the
    /// loaded binary, PEB and optional init data, with the serialized
    /// page tables appended at the end.
    memory: ReadonlySharedMemory,
    /// Extra debug information about the binary in this snapshot.
    load_info: LoadInfo,
    /// The hash of the other portions of the snapshot, as computed by
    /// the free function `hash` (load info is deliberately excluded).
    hash: [u8; 32],
    /// The address of the top of the guest stack. `from_env` stores
    /// `MAX_GVA - SCRATCH_TOP_EXN_STACK_OFFSET + 1` here.
    stack_top_gva: u64,
    /// The next action that should be performed on this snapshot.
    /// `from_env` always stores `NextAction::Initialise` with the
    /// entrypoint GVA.
    entrypoint: NextAction,
}
80impl nub_host_common::vmem::TableReadOps for Snapshot {
81    type TableAddr = u64;
82    fn entry_addr(addr: u64, offset: u64) -> u64 {
83        addr + offset
84    }
85    unsafe fn read_entry(&self, addr: u64) -> vmem::PageTableEntry {
86        let addr = addr as usize;
87        let Some(pte_bytes) = self.memory.as_slice().get(addr..addr + PTE_SIZE) else {
88            // Attacker-controlled data pointed out-of-bounds. We'll
89            // default to returning 0 in this case, which, for most
90            // architectures (including x86-64 and arm64, the ones we
91            // care about presently) will be a not-present entry.
92            return 0;
93        };
94        // The `get()` above ensures exactly PTE_SIZE bytes.
95        #[allow(clippy::unwrap_used)]
96        vmem::PageTableEntry::from_le_bytes(pte_bytes.try_into().unwrap())
97    }
98    #[allow(clippy::unnecessary_cast)]
99    fn to_phys(addr: u64) -> vmem::PhysAddr {
100        addr as vmem::PhysAddr
101    }
102    #[allow(clippy::unnecessary_cast)]
103    fn from_phys(addr: vmem::PhysAddr) -> u64 {
104        addr as u64
105    }
106    fn root_table(&self) -> u64 {
107        self.root_pt_gpa()
108    }
109}
110
111/// Compute a deterministic hash of a snapshot.
112///
113/// This does not include the load info from the snapshot, because
114/// that is only used for debugging builds.
115fn hash(memory: &[u8], regions: &[MemoryRegion]) -> Result<[u8; 32]> {
116    let mut hasher = blake3::Hasher::new();
117    hasher.update(memory);
118    for rgn in regions {
119        hasher.update(&usize::to_le_bytes(rgn.guest_region.start));
120        let guest_len = rgn.guest_region.end - rgn.guest_region.start;
121        #[allow(clippy::useless_conversion)]
122        let host_start_addr: usize = rgn.host_region.start.into();
123        #[allow(clippy::useless_conversion)]
124        let host_end_addr: usize = rgn.host_region.end.into();
125        hasher.update(&usize::to_le_bytes(host_start_addr));
126        let host_len = host_end_addr - host_start_addr;
127        if guest_len != host_len {
128            return Err(MemoryRegionSizeMismatch(
129                host_len,
130                guest_len,
131                format!("{:?}", rgn),
132            ));
133        }
134        // Ignore [`MemoryRegion::region_type`], since it is extra
135        // information for debugging rather than a core part of the
136        // identity of the snapshot/workload.
137        hasher.update(&usize::to_le_bytes(guest_len));
138        hasher.update(&u32::to_le_bytes(rgn.flags.bits()));
139    }
140    // Ignore [`load_info`], since it is extra information for
141    // debugging rather than a core part of the identity of the
142    // snapshot/workload.
143    Ok(hasher.finalize().into())
144}
145
146fn map_specials(pt_buf: &GuestPageTableBuffer, scratch_size: usize) {
147    // Map the scratch region
148    let mapping = Mapping {
149        phys_base: scratch_base_gpa(scratch_size),
150        virt_base: scratch_base_gva(scratch_size),
151        len: scratch_size as u64,
152        kind: MappingKind::Basic(BasicMapping {
153            readable: true,
154            writable: true,
155            // assume that the guest will map these pages elsewhere if
156            // it actually needs to execute from them
157            executable: false,
158        }),
159        user_accessible: false,
160    };
161    unsafe { vmem::map(pt_buf, mapping) };
162}
163
164impl Snapshot {
165    /// Create a new snapshot from the guest binary identified by `env`. With the configuration
166    /// specified in `cfg`.
167    pub(crate) fn from_env<'a, 'b>(
168        env: impl Into<GuestEnvironment<'a, 'b>>,
169        cfg: SandboxConfiguration,
170    ) -> Result<Self> {
171        let env = env.into();
172        let mut bin = env.guest_binary;
173        bin.canonicalize()?;
174        let blob = env.init_data;
175
176        let exe_info = match bin {
177            GuestBinary::FilePath(bin_path_str) => ExeInfo::from_file(&bin_path_str)?,
178            GuestBinary::Buffer(buffer) => ExeInfo::from_buf(buffer)?,
179        };
180
181        // F4.2: dropped the host/guest version-mismatch check. As a
182        // fork, our host crate's version (nub-host-kvm) intentionally
183        // diverges from the upstream `hyperlight-guest-bin` ELF note
184        // version. The note's bit-for-bit ABI contract is what
185        // matters; we honor it. The `exe_info.guest_bin_version()`
186        // accessor stays available for diagnostics.
187        let _ = exe_info.guest_bin_version();
188
189        let guest_blob_size = blob.as_ref().map(|b| b.data.len()).unwrap_or(0);
190        let guest_blob_mem_flags = blob.as_ref().map(|b| b.permissions);
191
192        #[cfg_attr(feature = "i686-guest", allow(unused_mut))]
193        let mut layout = crate::mem::layout::SandboxMemoryLayout::new(
194            cfg,
195            exe_info.loaded_size(),
196            guest_blob_size,
197            guest_blob_mem_flags,
198        )?;
199
200        let load_addr = layout.get_guest_code_address() as u64;
201        let base_va = exe_info.base_va();
202        let entrypoint_va: u64 = exe_info.entrypoint().into();
203
204        let mut memory = vec![0; layout.get_memory_size()?];
205
206        let load_info = exe_info.load(
207            load_addr.try_into()?,
208            &mut memory[layout.get_guest_code_offset()..],
209        )?;
210
211        layout.write_peb(&mut memory)?;
212
213        blob.map(|x| layout.write_init_data(&mut memory, x.data))
214            .transpose()?;
215
216        // Set up page table entries for the snapshot
217        let pt_buf = GuestPageTableBuffer::new(layout.get_pt_base_gpa() as usize);
218
219        // 1. Map the pages of snapshot data as plain RW basic mappings.
220        // Pre-Stage-F these were CoW so the (now-deleted) snapshot/restore
221        // machinery could roll back writes; we don't use it.
222        //
223        // Kernel half lives at high VA (`KERNEL_HIGH_BASE`); GPAs stay
224        // identical, only the GVA shifts. Computes
225        // `virt_base = KERNEL_HIGH_BASE + (phys_base - BASE_ADDRESS)`.
226        for rgn in layout.get_memory_regions_::<GuestMemoryRegion>(())?.iter() {
227            let readable = rgn.flags.contains(MemoryRegionFlags::READ);
228            let executable = rgn.flags.contains(MemoryRegionFlags::EXECUTE);
229            let writable = rgn.flags.contains(MemoryRegionFlags::WRITE);
230            let kind = MappingKind::Basic(BasicMapping {
231                readable,
232                writable,
233                executable,
234            });
235            let phys_base = rgn.guest_region.start as u64;
236            let virt_base = crate::mem::layout::SandboxMemoryLayout::KERNEL_HIGH_BASE
237                + (phys_base - crate::mem::layout::SandboxMemoryLayout::BASE_ADDRESS as u64);
238            let mapping = Mapping {
239                phys_base,
240                virt_base,
241                len: rgn.guest_region.len() as u64,
242                kind,
243                user_accessible: false,
244            };
245            unsafe { vmem::map(&pt_buf, mapping) };
246        }
247
248        // 2. Map the special mappings
249        map_specials(&pt_buf, layout.get_scratch_size());
250
251        let pt_bytes = pt_buf.into_bytes();
252        layout.set_pt_size(pt_bytes.len())?;
253        memory.extend(&pt_bytes);
254
255        let exn_stack_top_gva = nub_host_common::layout::MAX_GVA as u64
256            - nub_host_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET
257            + 1;
258
259        // Bump the configuration counter so `MultiUseSandbox::id`
260        // values stay unique across constructions.
261        SANDBOX_CONFIGURATION_COUNTER.fetch_add(1, Ordering::Relaxed);
262        let extra_regions: Vec<MemoryRegion> = Vec::new();
263        let hash = hash(&memory, &extra_regions)?;
264
265        // Entrypoint GVA: kernel base + offset from ELF base to
266        // entrypoint + offset from BASE_ADDRESS to the code's GPA.
267        // Today `load_addr == BASE_ADDRESS` and `base_va == KERNEL_HIGH_BASE`,
268        // so this is just `entrypoint_va`, but the general form keeps
269        // working if either shifts.
270        let entrypoint_gva = crate::mem::layout::SandboxMemoryLayout::KERNEL_HIGH_BASE
271            + (load_addr - crate::mem::layout::SandboxMemoryLayout::BASE_ADDRESS as u64)
272            + (entrypoint_va - base_va);
273
274        Ok(Self {
275            memory: ReadonlySharedMemory::from_bytes(&memory)?,
276            layout,
277            load_info,
278            hash,
279            stack_top_gva: exn_stack_top_gva,
280            entrypoint: NextAction::Initialise(entrypoint_gva),
281        })
282    }
283
284    /// Return the main memory contents of the snapshot
285    #[instrument(skip_all, parent = Span::current(), level= "Trace")]
286    pub(crate) fn memory(&self) -> &ReadonlySharedMemory {
287        &self.memory
288    }
289
290    /// Return a copy of the load info for the exe in the snapshot
291    pub(crate) fn load_info(&self) -> LoadInfo {
292        self.load_info.clone()
293    }
294
295    pub(crate) fn layout(&self) -> &crate::mem::layout::SandboxMemoryLayout {
296        &self.layout
297    }
298
299    pub(crate) fn root_pt_gpa(&self) -> u64 {
300        self.layout.get_pt_base_gpa()
301    }
302
303    pub(crate) fn stack_top_gva(&self) -> u64 {
304        self.stack_top_gva
305    }
306
307    pub(crate) fn entrypoint(&self) -> NextAction {
308        self.entrypoint
309    }
310}
311
312impl PartialEq for Snapshot {
313    fn eq(&self, other: &Snapshot) -> bool {
314        self.hash == other.hash
315    }
316}