// javm_cap/cap/data.rs

1//! `DataCap` — backing (immutable dense page slab) + copy-on-write overlay.
2//!
3//! A `DataCap` is `{ backing: Arc<PageSlab>, overlay }`:
4//!
5//! - [`PageSlab`] is the **immutable backing**: a *dense* runtime-sized vector
6//!   of pages (index `i` is absolute page `i`), `Arc`-shared so `MGMT_COPY` is
7//!   one refcount bump. It is **not** a sparse `RadixMap` of 2 MiB groups and
8//!   **not** raw bytes — it is a custom SSZ page-vector whose `hash_tree_root`
9//!   merkleizes the page roots at the **exact runtime depth** `ceil_log2(page_count)`
10//!   (`page_count = size / PAGE_SIZE`). `ssz::List<T, N>` / `Vector` fix `N` at
11//!   compile time; a cap's `page_count` is a runtime value, so `PageSlab` is a
12//!   bespoke type reusing [`ssz::merkle::merkleize`] with `limit = page_count`.
13//! - `overlay` is the **copy-on-write working layer**: the pages this cap has
14//!   modified since the backing settled, keyed by absolute page index. A page
15//!   present in the overlay *shadows* the backing; a clean cap has an empty
16//!   overlay and is identical to its backing. During execution the engines
17//!   write dirty pages straight into the overlay (no separate dirty-page list);
18//!   at settle [`DataCap::flush`] folds the overlay into a fresh `PageSlab`.
19//!
20//! Earlier drafts split this across a separate `DataCap` (immutable, sparse
21//! groups) and `DataViewCap` (backing-by-hash + overlay). They are now **one
22//! type**: `DataViewCap == DataCap`.
23//!
24//! ## The cap root (flat, size-scaled) — defined only on a *flushed* cap
25//!
26//! `hash_tree_root` is the SSZ 2-field container `{ size, pages }`:
27//!
28//! ```text
29//! cap_root   = merkleize([ htr(size), pages_root ], 2)            // depth-1 container
30//! pages_root = merkleize([ page(0)..page(page_count) ], page_count) // = PageSlab::hash_tree_root
31//! page_count = size / PAGE_SIZE
32//! ```
33//!
34//! The cap root is **only defined when the `overlay` is empty** — the backing is
35//! the hashable, content-addressed form; the overlay is transient working state.
36//! Hashing a cap with a non-empty overlay is a usage error (it panics, like
37//! hashing a cap graph that still holds an unresolved `Ref`): callers
38//! [`flush`](DataCap::flush) first. The engines' read path
39//! ([`page_slot`](DataCap::page_slot) / [`page_at`](DataCap::page_at)) and the
40//! zero-copy slot return read *effective* bytes without hashing.
41//!
42//! The pages-root tracks the cap's *actual* size (depth `ceil_log2(page_count)`),
43//! not a fixed compile-time capacity: ≤ 256 pages (≤ 1 MiB) is shallower than
44//! depth 9; a 4 GiB cap is depth 20. `size` is committed in the cap's own root
45//! (the first container field) because SSZ merkleization is not self-describing.
46//!
47//! ## Page-alignment invariant
48//!
49//! Each present page is a [`PageSlot::Loaded`] holding a refcounted
50//! [`PageBytes`] whose `bytes` is a **`PAGE_SIZE`-aligned** 4 KiB slab
51//! ([`alloc_page_aligned_zeroed`]) — load-bearing because the x86 recompiler
52//! resolves each page's physical address from its slab pointer and maps it
53//! directly into a ring-3 page table. This holds for backing *and* overlay pages.
54
55use core::alloc::Layout;
56
57use alloc::alloc::alloc_zeroed;
58use alloc::collections::BTreeMap;
59use alloc::sync::Arc;
60use alloc::vec::Vec;
61
62use ssz::HashTreeRoot;
63use ssz::digest::Digest;
64use ssz::digest::typenum::U32;
65use ssz::merkle::merkleize;
66
67use super::CapHash;
68use super::page::{PageBytes, PageSlot};
69
/// Size in bytes of one cap-level page. This is the architecture's 4 KiB page
/// and has to equal `nub_arch_x86::paging::PAGE_SIZE`, otherwise a page slab
/// could not be mapped directly into a page table.
pub const PAGE_SIZE: usize = 4096;

/// Number of pages in one 2 MiB group (`2^9 = 512`). Storage is no longer
/// chunked by group, but the group remains the natural large-page / 2 MiB
/// cluster unit (architecture-portable large page; the read-only gas
/// materialization unit).
pub const GROUP_PAGES: usize = 1 << 9;

/// Byte span of one group: `512 * 4096 = 1 << 21` (2 MiB).
pub const GROUP_SIZE: usize = PAGE_SIZE * GROUP_PAGES;
81
/// The dense immutable backing of a [`DataCap`]: a custom runtime-sized SSZ
/// vector of pages.
///
/// `pages[i]` is absolute page `i`; trailing [`PageSlot::Empty`] pages may be
/// omitted (so `pages.len() <= page_count`), and an out-of-range index reads as
/// `Empty` (zero) — the merkle pads them back via the zero-hash table. The
/// `hash_tree_root` is `merkleize(page_roots, page_count)` at runtime depth.
///
/// The derived `Default` is the zero-size slab with no stored pages.
///
/// NOTE(review): the field order and derive list presumably determine the
/// rkyv archived layout — confirm before reordering or adding fields.
#[derive(Clone, Debug, Default, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
pub struct PageSlab {
    /// Logical byte length; always a [`PAGE_SIZE`] multiple. `page_count =
    /// size / PAGE_SIZE`. This is the extent that gets hashed, independent of
    /// how many slots `pages` actually stores.
    pub size: u64,
    /// Dense page storage indexed by absolute page (trailing `Empty` trimmed).
    /// A slot that is not stored is read back as [`PageSlot::Empty`].
    pub pages: Vec<PageSlot>,
}
97
98impl PageSlab {
99    /// An empty slab (size 0, no pages).
100    pub fn empty() -> Self {
101        PageSlab {
102            size: 0,
103            pages: Vec::new(),
104        }
105    }
106
107    /// Number of logical pages (`size / PAGE_SIZE`).
108    #[inline]
109    pub fn page_count(&self) -> usize {
110        (self.size / PAGE_SIZE as u64) as usize
111    }
112
113    /// Borrow page `i` (absolute). Out-of-range / trimmed-tail reads as
114    /// [`PageSlot::Empty`].
115    #[inline]
116    pub fn page(&self, i: usize) -> &PageSlot {
117        self.pages.get(i).unwrap_or(&PageSlot::Empty)
118    }
119
120    /// Fold up to [`PAGE_SIZE`] `content` bytes into absolute page index `p`,
121    /// **canonically**: all-zero content stores the [`PageSlot::Empty`] sentinel
122    /// (no allocation), any other content a fresh `PAGE_SIZE`-aligned
123    /// [`PageSlot::Loaded`] slab. Grows the dense vector as needed, then trims
124    /// trailing `Empty` so the layout is unique.
125    fn put_page_idx(&mut self, p: usize, content: &[u8]) {
126        if self.pages.len() <= p {
127            self.pages.resize(p + 1, PageSlot::Empty);
128        }
129        self.pages[p] = if content.iter().all(|&b| b == 0) {
130            PageSlot::Empty
131        } else {
132            PageSlot::Loaded(Arc::new(PageBytes::from_content(content)))
133        };
134        while matches!(self.pages.last(), Some(PageSlot::Empty)) {
135            self.pages.pop();
136        }
137    }
138
139    /// Build a slab from contiguous `content`, logical size at least
140    /// `target_size` (rounded up to a page boundary, minimum one page). All-zero
141    /// pages become [`PageSlot::Empty`] (sparse).
142    fn from_bytes_sized(content: &[u8], target_size: u64) -> Self {
143        let size = target_size
144            .max(content.len() as u64)
145            .next_multiple_of(PAGE_SIZE as u64)
146            .max(PAGE_SIZE as u64);
147        let total_pages = (size / PAGE_SIZE as u64) as usize;
148        let mut pages: Vec<PageSlot> = Vec::new();
149        let mut last_nonempty: Option<usize> = None;
150        for p in 0..total_pages {
151            let off = p * PAGE_SIZE;
152            let lo = off.min(content.len());
153            let hi = (off + PAGE_SIZE).min(content.len());
154            let slice = &content[lo..hi];
155            if slice.iter().all(|&b| b == 0) {
156                pages.push(PageSlot::Empty);
157            } else {
158                pages.push(PageSlot::Loaded(Arc::new(PageBytes::from_content(slice))));
159                last_nonempty = Some(p);
160            }
161        }
162        match last_nonempty {
163            Some(last) => pages.truncate(last + 1),
164            None => pages.clear(),
165        }
166        PageSlab { size, pages }
167    }
168
169    /// Build a slab of logical `size` (a [`PAGE_SIZE`] multiple) from sparse
170    /// named pages: `pages` yields `(page_index, content)` for the non-zero
171    /// pages (`content` ≤ `PAGE_SIZE`); every unnamed page is the canonical
172    /// zero page. Reuses `put_page_idx` so the result is the same canonical
173    /// form (all-zero → `Empty`, trailing `Empty` trimmed) a contiguous
174    /// `from_bytes_sized` build would produce — i.e. byte- and hash-identical
175    /// for equivalent logical content. The decode target for
176    /// [`crate::image::DataDesc`].
177    pub fn from_sparse_pages<'a>(
178        size: u64,
179        pages: impl IntoIterator<Item = (u32, &'a [u8])>,
180    ) -> Self {
181        // Mirror `from_bytes_sized`'s size flooring so the two constructors
182        // are equivalent for *every* input — including `size == 0`, which
183        // floors to one `Empty` page rather than a zero-extent slab. A
184        // non-page-multiple `size` rounds up identically. (The deblob
185        // validates `size` is a page multiple, so this only normalizes
186        // degenerate direct-caller input.)
187        let size = size
188            .next_multiple_of(PAGE_SIZE as u64)
189            .max(PAGE_SIZE as u64);
190        let mut slab = PageSlab {
191            size,
192            pages: Vec::new(),
193        };
194        let page_count = slab.page_count();
195        for (page_index, content) in pages {
196            debug_assert!(
197                (page_index as usize) < page_count,
198                "from_sparse_pages: page_index {page_index} >= page_count {page_count}",
199            );
200            slab.put_page_idx(page_index as usize, content);
201        }
202        slab
203    }
204}
205
206impl HashTreeRoot for PageSlab {
207    /// The `pages` field root: the flat size-scaled page merkle at exact depth
208    /// `ceil_log2(page_count)`. Empty/absent pages contribute `[0;32]`, folded
209    /// by the merkle zero-hash table.
210    fn hash_tree_root<D: Digest<OutputSize = U32>>(&self) -> [u8; 32] {
211        let page_count = self.page_count();
212        let leaves: Vec<[u8; 32]> = (0..page_count)
213            .map(|i| self.page(i).hash_tree_root::<D>())
214            .collect();
215        merkleize::<D>(&leaves, page_count.max(1))
216    }
217}
218
/// Data cap: an `Arc`-shared immutable [`PageSlab`] backing plus a copy-on-write
/// overlay of modified pages. The cap identity (when flushed) is the flat
/// size-scaled `{ size, pages }` merkle (see the module docs).
///
/// Cloning a `DataCap` clones the `Arc` (sharing the backing) and copies the
/// overlay map. Hashing requires an empty overlay; [`DataCap::flush`] produces
/// such a cap.
#[derive(Clone, Debug, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
pub struct DataCap {
    /// Immutable backing, shared across a `MGMT_COPY` lineage by `Arc`.
    /// Its `size` is the cap's logical length.
    pub backing: Arc<PageSlab>,
    /// Copy-on-write modified pages (absolute page index → page). Empty for a
    /// clean / settled cap. A present entry shadows the backing; zero-writes are
    /// stored explicitly (a present [`PageSlot::Loaded`]) so they shadow a
    /// possibly-nonzero backing page. An entry holding [`PageSlot::Empty`] is
    /// treated as not dirty and defers to the backing.
    pub overlay: BTreeMap<u32, PageSlot>,
}
232
233impl HashTreeRoot for DataCap {
234    fn hash_tree_root<D: Digest<OutputSize = U32>>(&self) -> [u8; 32] {
235        // The cap root is defined only on a flushed (overlay-empty) cap; the
236        // backing is the content-addressed form. Hashing an overlay-bearing cap
237        // is a usage error (mirrors hashing a graph with an unresolved Ref):
238        // callers `flush()` first.
239        assert!(
240            self.overlay.is_empty(),
241            "DataCap::hash_tree_root: cap has a non-empty CoW overlay; flush() before hashing"
242        );
243        debug_assert!(
244            self.backing.size.is_multiple_of(PAGE_SIZE as u64),
245            "DataCap::hash_tree_root: size must be a PAGE_SIZE multiple"
246        );
247        // 2-field SSZ container `{ size, pages }`.
248        let size_root = self.backing.size.hash_tree_root::<D>();
249        let pages_root = self.backing.hash_tree_root::<D>();
250        merkleize::<D>(&[size_root, pages_root], 2)
251    }
252}
253
/// Resolution of a single page within a [`DataCap`], shared by both engines so
/// they materialize byte-identically.
///
/// Produced by [`DataCap::page_at`] from the effective page slot: a `Loaded`
/// slot becomes [`Bytes`](Self::Bytes), an `Empty` slot [`Zero`](Self::Zero),
/// and a `Missing` slot [`Missing`](Self::Missing). The borrowed variant ties
/// its lifetime to the cap it was resolved from.
#[derive(Clone, Copy, Debug)]
pub enum PageResolution<'a> {
    /// Canonical zero page (absent / `Empty`): reads as zero; a write
    /// copies-on-write a fresh page.
    Zero,
    /// A materialized, `PAGE_SIZE`-aligned page slab.
    Bytes(&'a [u8]),
    /// An elided page known only by its content hash: a faulting access is a
    /// PVM page fault. V1 never mints this.
    Missing(CapHash),
}
267
268impl DataCap {
269    /// An empty `DataCap`: logical size 0, no pages, no overlay.
270    pub fn empty() -> Self {
271        DataCap {
272            backing: Arc::new(PageSlab::empty()),
273            overlay: BTreeMap::new(),
274        }
275    }
276
277    /// Total logical content size in bytes (always a [`PAGE_SIZE`] multiple).
278    pub fn content_len(&self) -> u64 {
279        self.backing.size
280    }
281
282    /// Number of logical pages.
283    #[inline]
284    pub fn page_count(&self) -> usize {
285        self.backing.page_count()
286    }
287
288    /// Is page `i` overlaid (CoW'd)? A page is dirty iff it is present in the
289    /// overlay (`Empty` slots / absent entries are clean — defer to the backing).
290    #[inline]
291    pub fn is_dirty(&self, i: usize) -> bool {
292        matches!(
293            self.overlay.get(&(i as u32)),
294            Some(PageSlot::Loaded(_) | PageSlot::Missing(_))
295        )
296    }
297
298    /// Borrow the **effective** page slot at absolute page index `i`: the
299    /// overlay page if dirty, else the backing page. Both engines resolve a
300    /// page's physical address from the returned [`PageSlot::Loaded`] slab.
301    #[inline]
302    pub fn page_slot(&self, i: usize) -> &PageSlot {
303        match self.overlay.get(&(i as u32)) {
304            Some(slot @ (PageSlot::Loaded(_) | PageSlot::Missing(_))) => slot,
305            _ => self.backing.page(i),
306        }
307    }
308
309    /// Resolve the **effective** page containing byte offset `off` (need not be
310    /// page-aligned). Both engines call this so their materialized page contents
311    /// are identical.
312    pub fn page_at(&self, off: u64) -> PageResolution<'_> {
313        let i = (off / PAGE_SIZE as u64) as usize;
314        match self.page_slot(i) {
315            PageSlot::Loaded(pr) => PageResolution::Bytes(&pr.bytes),
316            PageSlot::Missing(h) => PageResolution::Missing(*h),
317            PageSlot::Empty => PageResolution::Zero,
318        }
319    }
320
321    /// Copy `out.len()` logical bytes starting at byte offset `start` into
322    /// `out` (effective content), fully defining every byte: materialized pages
323    /// are copied; `Zero` pages read as zero.
324    ///
325    /// # Panics
326    ///
327    /// Panics on a [`PageResolution::Missing`] page — elided content known only
328    /// by hash is **not** zero, and the recompiler hard-faults on it; silently
329    /// zero-filling would fork consensus. V1 never mints `Missing`.
330    pub fn copy_into(&self, start: u64, out: &mut [u8]) {
331        let mut done = 0usize;
332        while done < out.len() {
333            let off = start + done as u64;
334            let page_off = (off % PAGE_SIZE as u64) as usize;
335            let take = (PAGE_SIZE - page_off).min(out.len() - done);
336            match self.page_at(off) {
337                PageResolution::Bytes(bytes) => {
338                    let avail = bytes.len().saturating_sub(page_off);
339                    let n = avail.min(take);
340                    out[done..done + n].copy_from_slice(&bytes[page_off..page_off + n]);
341                    out[done + n..done + take].fill(0);
342                }
343                PageResolution::Zero => {
344                    out[done..done + take].fill(0);
345                }
346                PageResolution::Missing(h) => {
347                    panic!(
348                        "DataCap::copy_into: Missing page at offset {off} (hash \
349                         {:02x?}..) — host reads of elided pages are unsupported \
350                         (would fork vs the engine's page fault)",
351                        &h[..4],
352                    );
353                }
354            }
355            done += take;
356        }
357    }
358
359    /// Overwrite the **backing** page containing absolute offset `off` with up
360    /// to [`PAGE_SIZE`] content bytes (zero-padded tail), canonically. This is
361    /// the **construction / settle** primitive (it mutates the backing via
362    /// `Arc::make_mut`, so it is O(slab) on a shared backing — only use it while
363    /// building a fresh cap). Copy-on-write *during execution* goes through
364    /// [`write_page`](Self::write_page) (the overlay) instead.
365    ///
366    /// Panics (debug) if `off >= self.content_len()`.
367    pub fn put_page(&mut self, off: u64, content: &[u8]) {
368        debug_assert!(
369            off < self.content_len(),
370            "DataCap::put_page: offset past logical size"
371        );
372        let p = (off / PAGE_SIZE as u64) as usize;
373        Arc::make_mut(&mut self.backing).put_page_idx(p, content);
374    }
375
376    /// Copy-on-write the **overlay** page containing absolute offset `off` with
377    /// up to [`PAGE_SIZE`] `content` bytes (zero-padded tail). The page is stored
378    /// **explicitly** as a present `Loaded` slab — even all-zero content — so it
379    /// shadows the backing. This is the execution write boundary.
380    ///
381    /// Panics (debug) if `off >= self.content_len()`.
382    pub fn write_page(&mut self, off: u64, content: &[u8]) {
383        debug_assert!(
384            off < self.content_len(),
385            "DataCap::write_page: offset past logical size"
386        );
387        let p = (off / PAGE_SIZE as u64) as u32;
388        self.overlay.insert(
389            p,
390            PageSlot::Loaded(Arc::new(PageBytes::from_content(content))),
391        );
392    }
393
394    /// Insert an already-built overlay page slot at absolute page index `p`
395    /// (move, no copy). The slab is page-aligned by construction; used by the
396    /// engines' CoW path to hand a freshly-written page to the cap directly.
397    pub fn insert_overlay_page(&mut self, p: u32, slot: PageSlot) {
398        self.overlay.insert(p, slot);
399    }
400
401    /// Place every effective page of `src` into this cap's **backing** starting
402    /// at absolute byte offset `dst_off` (page-aligned): backing page
403    /// `dst_off/PAGE_SIZE + i` becomes a clone of `src.page_slot(i)` — an `Arc`
404    /// refcount bump, **not** a byte copy. Pages beyond this cap's extent are
405    /// dropped (clamped, mirroring the interpreter's `off < extent` fold guard).
406    ///
407    /// This is the **page-sharing** instance-memory composer: a fresh Instance
408    /// `mem` built by placing an Image's mapped `Cap::Data` sources shares those
409    /// sources' physical pages, so N sub-VMs spawned from one Image all map the
410    /// same read-only frames and each CoWs (into its overlay) only the pages it
411    /// writes — the shared backing is never mutated. Effective bytes are
412    /// identical to the copying `put_page` fold; only the allocation is shared.
413    pub fn place_shared(&mut self, dst_off: u64, src: &DataCap) {
414        debug_assert!(
415            dst_off.is_multiple_of(PAGE_SIZE as u64),
416            "place_shared: dst_off must be page-aligned"
417        );
418        let base = (dst_off / PAGE_SIZE as u64) as usize;
419        let total_pages = self.backing.page_count();
420        let slab = Arc::make_mut(&mut self.backing);
421        for i in 0..src.page_count() {
422            let dst = base + i;
423            if dst >= total_pages {
424                break; // clamp to this cap's logical extent
425            }
426            if slab.pages.len() <= dst {
427                slab.pages.resize(dst + 1, PageSlot::Empty);
428            }
429            slab.pages[dst] = src.page_slot(i).clone();
430        }
431        // Re-canonicalize: trim trailing `Empty` so the layout stays unique.
432        while matches!(slab.pages.last(), Some(PageSlot::Empty)) {
433            slab.pages.pop();
434        }
435    }
436
437    /// Fold the overlay into a fresh, clean (overlay-empty) `DataCap` whose
438    /// `hash_tree_root` is defined. Clones the backing and folds every overlaid
439    /// page in via the canonical backing CoW. This is the settle / content-
440    /// address primitive (it replaces the old `DataViewCap::settle`).
441    pub fn flush(&self) -> DataCap {
442        if self.overlay.is_empty() {
443            return self.clone();
444        }
445        let mut backing = (*self.backing).clone();
446        for (&p, slot) in &self.overlay {
447            match slot {
448                PageSlot::Loaded(pr) => backing.put_page_idx(p as usize, &pr.bytes),
449                PageSlot::Empty => {}
450                PageSlot::Missing(_) => {
451                    unreachable!("DataCap::flush: Missing overlay page (cold-load unsupported)")
452                }
453            }
454        }
455        DataCap {
456            backing: Arc::new(backing),
457            overlay: BTreeMap::new(),
458        }
459    }
460
461    /// Build a `DataCap` from contiguous `content`, sized to the next page
462    /// boundary (at least one page). All-zero pages become [`PageSlot::Empty`].
463    pub fn from_bytes(content: &[u8]) -> Self {
464        Self::from_bytes_sized(content, content.len() as u64)
465    }
466
467    /// Build a `DataCap` from `content` with a logical size of at least
468    /// `target_size` (rounded up to a page boundary, minimum one page).
469    /// `content` fills the low bytes; the remainder is zero (sparse). The cap is
470    /// clean (empty overlay).
471    pub fn from_bytes_sized(content: &[u8], target_size: u64) -> Self {
472        DataCap {
473            backing: Arc::new(PageSlab::from_bytes_sized(content, target_size)),
474            overlay: BTreeMap::new(),
475        }
476    }
477
478    /// Build a clean (overlay-free) `DataCap` from sparse named pages over a
479    /// logical `size`. See [`PageSlab::from_sparse_pages`]. This is the
480    /// decode target for [`crate::image::DataDesc::to_data_cap`]; the result
481    /// is byte- and hash-identical to a contiguous [`Self::from_bytes_sized`]
482    /// of equivalent logical content.
483    pub fn from_sparse_pages<'a>(
484        size: u64,
485        pages: impl IntoIterator<Item = (u32, &'a [u8])>,
486    ) -> Self {
487        DataCap {
488            backing: Arc::new(PageSlab::from_sparse_pages(size, pages)),
489            overlay: BTreeMap::new(),
490        }
491    }
492}
493
494/// Allocate a zero-filled `Vec<u8>` of `len` bytes (rounded up to the next page
495/// boundary) with `PAGE_SIZE`-aligned backing storage. Page alignment is what
496/// lets the kernel map the buffer directly into a ring-3 PT.
497///
498/// Panics if the allocator returns null (OOM) or the `Layout` overflows.
499pub fn alloc_page_aligned_zeroed(len: usize) -> Vec<u8> {
500    let padded = len.next_multiple_of(PAGE_SIZE).max(PAGE_SIZE);
501    let layout =
502        Layout::from_size_align(padded, PAGE_SIZE).expect("DataCap page-aligned layout overflow");
503    // SAFETY: `padded > 0` so the `Layout` is non-zero; the std global
504    // allocator is what `Vec` uses, so the buffer is `Vec::from_raw_parts`-safe.
505    let ptr = unsafe { alloc_zeroed(layout) };
506    if ptr.is_null() {
507        alloc::alloc::handle_alloc_error(layout);
508    }
509    // SAFETY: non-null pointer to `padded` zeroed bytes, PAGE_SIZE-aligned;
510    // capacity == len == padded, all bytes initialised (to zero).
511    unsafe { Vec::from_raw_parts(ptr, padded, padded) }
512}
513
514/// Content hash of a single page: the SSZ `hash_tree_root` of the page as a
515/// `ByteVector[PAGE_SIZE]` (zero-padded), under the cap digest (SHA-256). This
516/// is the value a materialized page contributes to the cap merkle and the
517/// precomputed [`PageBytes::hash`] kept by the substitution invariant.
518pub fn page_content_hash(bytes: &[u8]) -> CapHash {
519    let mut arr = [0u8; PAGE_SIZE];
520    let n = bytes.len().min(PAGE_SIZE);
521    arr[..n].copy_from_slice(&bytes[..n]);
522    ssz::hash_tree_root(&arr)
523}