javm_cap/cap/data.rs
1//! `DataCap` — backing (immutable dense page slab) + copy-on-write overlay.
2//!
3//! A `DataCap` is `{ backing: Arc<PageSlab>, overlay }`:
4//!
5//! - [`PageSlab`] is the **immutable backing**: a *dense* runtime-sized vector
6//! of pages (index `i` is absolute page `i`), `Arc`-shared so `MGMT_COPY` is
7//! one refcount bump. It is **not** a sparse `RadixMap` of 2 MiB groups and
8//! **not** raw bytes — it is a custom SSZ page-vector whose `hash_tree_root`
9//! merkleizes the page roots at the **exact runtime depth** `ceil_log2(page_count)`
10//! (`page_count = size / PAGE_SIZE`). `ssz::List<T, N>` / `Vector` fix `N` at
11//! compile time; a cap's `page_count` is a runtime value, so `PageSlab` is a
12//! bespoke type reusing [`ssz::merkle::merkleize`] with `limit = page_count`.
13//! - `overlay` is the **copy-on-write working layer**: the pages this cap has
14//! modified since the backing settled, keyed by absolute page index. A page
15//! present in the overlay *shadows* the backing; a clean cap has an empty
16//! overlay and is identical to its backing. During execution the engines
17//! write dirty pages straight into the overlay (no separate dirty-page list);
18//! at settle [`DataCap::flush`] folds the overlay into a fresh `PageSlab`.
19//!
20//! Earlier drafts split this across a separate `DataCap` (immutable, sparse
21//! groups) and `DataViewCap` (backing-by-hash + overlay). They are now **one
22//! type**: `DataViewCap == DataCap`.
23//!
24//! ## The cap root (flat, size-scaled) — defined only on a *flushed* cap
25//!
26//! `hash_tree_root` is the SSZ 2-field container `{ size, pages }`:
27//!
28//! ```text
29//! cap_root = merkleize([ htr(size), pages_root ], 2) // depth-1 container
//! pages_root = merkleize([ page(0)..=page(page_count - 1) ], page_count) // = PageSlab::hash_tree_root
31//! page_count = size / PAGE_SIZE
32//! ```
33//!
34//! The cap root is **only defined when the `overlay` is empty** — the backing is
35//! the hashable, content-addressed form; the overlay is transient working state.
36//! Hashing a cap with a non-empty overlay is a usage error (it panics, like
37//! hashing a cap graph that still holds an unresolved `Ref`): callers
38//! [`flush`](DataCap::flush) first. The engines' read path
39//! ([`page_slot`](DataCap::page_slot) / [`page_at`](DataCap::page_at)) and the
40//! zero-copy slot return read *effective* bytes without hashing.
41//!
42//! The pages-root tracks the cap's *actual* size (depth `ceil_log2(page_count)`),
43//! not a fixed compile-time capacity: ≤ 256 pages (≤ 1 MiB) is shallower than
44//! depth 9; a 4 GiB cap is depth 20. `size` is committed in the cap's own root
45//! (the first container field) because SSZ merkleization is not self-describing.
46//!
47//! ## Page-alignment invariant
48//!
49//! Each present page is a [`PageSlot::Loaded`] holding a refcounted
50//! [`PageBytes`] whose `bytes` is a **`PAGE_SIZE`-aligned** 4 KiB slab
51//! ([`alloc_page_aligned_zeroed`]) — load-bearing because the x86 recompiler
52//! resolves each page's physical address from its slab pointer and maps it
53//! directly into a ring-3 page table. This holds for backing *and* overlay pages.
54
55use core::alloc::Layout;
56
57use alloc::alloc::alloc_zeroed;
58use alloc::collections::BTreeMap;
59use alloc::sync::Arc;
60use alloc::vec::Vec;
61
62use ssz::HashTreeRoot;
63use ssz::digest::Digest;
64use ssz::digest::typenum::U32;
65use ssz::merkle::merkleize;
66
67use super::CapHash;
68use super::page::{PageBytes, PageSlot};
69
/// Size of one cap-level page in bytes: 4 KiB (`1 << 12`). This mirrors the
/// architecture page and must stay equal to `nub_arch_x86::paging::PAGE_SIZE`,
/// otherwise mapping a page slab directly into a page table cannot work.
pub const PAGE_SIZE: usize = 1 << 12;

/// Number of pages in one 2 MiB group (`512 = 2^9`). Retained as the natural
/// large-page / 2 MiB-cluster unit (architecture-portable large page; the
/// read-only gas materialization unit) although storage is no longer chunked
/// by group.
pub const GROUP_PAGES: usize = 1 << 9;

/// Byte span of one group: `512 * 4096 = 1 << 21` (2 MiB).
pub const GROUP_SIZE: usize = PAGE_SIZE * GROUP_PAGES;
81
82/// The dense immutable backing of a [`DataCap`]: a custom runtime-sized SSZ
83/// vector of pages.
84///
85/// `pages[i]` is absolute page `i`; trailing [`PageSlot::Empty`] pages may be
86/// omitted (so `pages.len() <= page_count`), and an out-of-range index reads as
87/// `Empty` (zero) — the merkle pads them back via the zero-hash table. The
88/// `hash_tree_root` is `merkleize(page_roots, page_count)` at runtime depth.
89#[derive(Clone, Debug, Default, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
90pub struct PageSlab {
91 /// Logical byte length; always a [`PAGE_SIZE`] multiple. `page_count =
92 /// size / PAGE_SIZE`.
93 pub size: u64,
94 /// Dense page storage indexed by absolute page (trailing `Empty` trimmed).
95 pub pages: Vec<PageSlot>,
96}
97
98impl PageSlab {
99 /// An empty slab (size 0, no pages).
100 pub fn empty() -> Self {
101 PageSlab {
102 size: 0,
103 pages: Vec::new(),
104 }
105 }
106
107 /// Number of logical pages (`size / PAGE_SIZE`).
108 #[inline]
109 pub fn page_count(&self) -> usize {
110 (self.size / PAGE_SIZE as u64) as usize
111 }
112
113 /// Borrow page `i` (absolute). Out-of-range / trimmed-tail reads as
114 /// [`PageSlot::Empty`].
115 #[inline]
116 pub fn page(&self, i: usize) -> &PageSlot {
117 self.pages.get(i).unwrap_or(&PageSlot::Empty)
118 }
119
120 /// Fold up to [`PAGE_SIZE`] `content` bytes into absolute page index `p`,
121 /// **canonically**: all-zero content stores the [`PageSlot::Empty`] sentinel
122 /// (no allocation), any other content a fresh `PAGE_SIZE`-aligned
123 /// [`PageSlot::Loaded`] slab. Grows the dense vector as needed, then trims
124 /// trailing `Empty` so the layout is unique.
125 fn put_page_idx(&mut self, p: usize, content: &[u8]) {
126 if self.pages.len() <= p {
127 self.pages.resize(p + 1, PageSlot::Empty);
128 }
129 self.pages[p] = if content.iter().all(|&b| b == 0) {
130 PageSlot::Empty
131 } else {
132 PageSlot::Loaded(Arc::new(PageBytes::from_content(content)))
133 };
134 while matches!(self.pages.last(), Some(PageSlot::Empty)) {
135 self.pages.pop();
136 }
137 }
138
139 /// Build a slab from contiguous `content`, logical size at least
140 /// `target_size` (rounded up to a page boundary, minimum one page). All-zero
141 /// pages become [`PageSlot::Empty`] (sparse).
142 fn from_bytes_sized(content: &[u8], target_size: u64) -> Self {
143 let size = target_size
144 .max(content.len() as u64)
145 .next_multiple_of(PAGE_SIZE as u64)
146 .max(PAGE_SIZE as u64);
147 let total_pages = (size / PAGE_SIZE as u64) as usize;
148 let mut pages: Vec<PageSlot> = Vec::new();
149 let mut last_nonempty: Option<usize> = None;
150 for p in 0..total_pages {
151 let off = p * PAGE_SIZE;
152 let lo = off.min(content.len());
153 let hi = (off + PAGE_SIZE).min(content.len());
154 let slice = &content[lo..hi];
155 if slice.iter().all(|&b| b == 0) {
156 pages.push(PageSlot::Empty);
157 } else {
158 pages.push(PageSlot::Loaded(Arc::new(PageBytes::from_content(slice))));
159 last_nonempty = Some(p);
160 }
161 }
162 match last_nonempty {
163 Some(last) => pages.truncate(last + 1),
164 None => pages.clear(),
165 }
166 PageSlab { size, pages }
167 }
168
169 /// Build a slab of logical `size` (a [`PAGE_SIZE`] multiple) from sparse
170 /// named pages: `pages` yields `(page_index, content)` for the non-zero
171 /// pages (`content` ≤ `PAGE_SIZE`); every unnamed page is the canonical
172 /// zero page. Reuses `put_page_idx` so the result is the same canonical
173 /// form (all-zero → `Empty`, trailing `Empty` trimmed) a contiguous
174 /// `from_bytes_sized` build would produce — i.e. byte- and hash-identical
175 /// for equivalent logical content. The decode target for
176 /// [`crate::image::DataDesc`].
177 pub fn from_sparse_pages<'a>(
178 size: u64,
179 pages: impl IntoIterator<Item = (u32, &'a [u8])>,
180 ) -> Self {
181 // Mirror `from_bytes_sized`'s size flooring so the two constructors
182 // are equivalent for *every* input — including `size == 0`, which
183 // floors to one `Empty` page rather than a zero-extent slab. A
184 // non-page-multiple `size` rounds up identically. (The deblob
185 // validates `size` is a page multiple, so this only normalizes
186 // degenerate direct-caller input.)
187 let size = size
188 .next_multiple_of(PAGE_SIZE as u64)
189 .max(PAGE_SIZE as u64);
190 let mut slab = PageSlab {
191 size,
192 pages: Vec::new(),
193 };
194 let page_count = slab.page_count();
195 for (page_index, content) in pages {
196 debug_assert!(
197 (page_index as usize) < page_count,
198 "from_sparse_pages: page_index {page_index} >= page_count {page_count}",
199 );
200 slab.put_page_idx(page_index as usize, content);
201 }
202 slab
203 }
204}
205
206impl HashTreeRoot for PageSlab {
207 /// The `pages` field root: the flat size-scaled page merkle at exact depth
208 /// `ceil_log2(page_count)`. Empty/absent pages contribute `[0;32]`, folded
209 /// by the merkle zero-hash table.
210 fn hash_tree_root<D: Digest<OutputSize = U32>>(&self) -> [u8; 32] {
211 let page_count = self.page_count();
212 let leaves: Vec<[u8; 32]> = (0..page_count)
213 .map(|i| self.page(i).hash_tree_root::<D>())
214 .collect();
215 merkleize::<D>(&leaves, page_count.max(1))
216 }
217}
218
219/// Data cap: an `Arc`-shared immutable [`PageSlab`] backing plus a copy-on-write
220/// overlay of modified pages. The cap identity (when flushed) is the flat
221/// size-scaled `{ size, pages }` merkle (see the module docs).
222#[derive(Clone, Debug, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
223pub struct DataCap {
224 /// Immutable backing, shared across a `MGMT_COPY` lineage by `Arc`.
225 pub backing: Arc<PageSlab>,
226 /// Copy-on-write modified pages (absolute page index → page). Empty for a
227 /// clean / settled cap. A present entry shadows the backing; zero-writes are
228 /// stored explicitly (a present [`PageSlot::Loaded`]) so they shadow a
229 /// possibly-nonzero backing page.
230 pub overlay: BTreeMap<u32, PageSlot>,
231}
232
233impl HashTreeRoot for DataCap {
234 fn hash_tree_root<D: Digest<OutputSize = U32>>(&self) -> [u8; 32] {
235 // The cap root is defined only on a flushed (overlay-empty) cap; the
236 // backing is the content-addressed form. Hashing an overlay-bearing cap
237 // is a usage error (mirrors hashing a graph with an unresolved Ref):
238 // callers `flush()` first.
239 assert!(
240 self.overlay.is_empty(),
241 "DataCap::hash_tree_root: cap has a non-empty CoW overlay; flush() before hashing"
242 );
243 debug_assert!(
244 self.backing.size.is_multiple_of(PAGE_SIZE as u64),
245 "DataCap::hash_tree_root: size must be a PAGE_SIZE multiple"
246 );
247 // 2-field SSZ container `{ size, pages }`.
248 let size_root = self.backing.size.hash_tree_root::<D>();
249 let pages_root = self.backing.hash_tree_root::<D>();
250 merkleize::<D>(&[size_root, pages_root], 2)
251 }
252}
253
254/// Resolution of a single page within a [`DataCap`], shared by both engines so
255/// they materialize byte-identically.
256#[derive(Clone, Copy, Debug)]
257pub enum PageResolution<'a> {
258 /// Canonical zero page (absent / `Empty`): reads as zero; a write
259 /// copies-on-write a fresh page.
260 Zero,
261 /// A materialized, `PAGE_SIZE`-aligned page slab.
262 Bytes(&'a [u8]),
263 /// An elided page known only by its content hash: a faulting access is a
264 /// PVM page fault. V1 never mints this.
265 Missing(CapHash),
266}
267
268impl DataCap {
269 /// An empty `DataCap`: logical size 0, no pages, no overlay.
270 pub fn empty() -> Self {
271 DataCap {
272 backing: Arc::new(PageSlab::empty()),
273 overlay: BTreeMap::new(),
274 }
275 }
276
277 /// Total logical content size in bytes (always a [`PAGE_SIZE`] multiple).
278 pub fn content_len(&self) -> u64 {
279 self.backing.size
280 }
281
282 /// Number of logical pages.
283 #[inline]
284 pub fn page_count(&self) -> usize {
285 self.backing.page_count()
286 }
287
288 /// Is page `i` overlaid (CoW'd)? A page is dirty iff it is present in the
289 /// overlay (`Empty` slots / absent entries are clean — defer to the backing).
290 #[inline]
291 pub fn is_dirty(&self, i: usize) -> bool {
292 matches!(
293 self.overlay.get(&(i as u32)),
294 Some(PageSlot::Loaded(_) | PageSlot::Missing(_))
295 )
296 }
297
298 /// Borrow the **effective** page slot at absolute page index `i`: the
299 /// overlay page if dirty, else the backing page. Both engines resolve a
300 /// page's physical address from the returned [`PageSlot::Loaded`] slab.
301 #[inline]
302 pub fn page_slot(&self, i: usize) -> &PageSlot {
303 match self.overlay.get(&(i as u32)) {
304 Some(slot @ (PageSlot::Loaded(_) | PageSlot::Missing(_))) => slot,
305 _ => self.backing.page(i),
306 }
307 }
308
309 /// Resolve the **effective** page containing byte offset `off` (need not be
310 /// page-aligned). Both engines call this so their materialized page contents
311 /// are identical.
312 pub fn page_at(&self, off: u64) -> PageResolution<'_> {
313 let i = (off / PAGE_SIZE as u64) as usize;
314 match self.page_slot(i) {
315 PageSlot::Loaded(pr) => PageResolution::Bytes(&pr.bytes),
316 PageSlot::Missing(h) => PageResolution::Missing(*h),
317 PageSlot::Empty => PageResolution::Zero,
318 }
319 }
320
321 /// Copy `out.len()` logical bytes starting at byte offset `start` into
322 /// `out` (effective content), fully defining every byte: materialized pages
323 /// are copied; `Zero` pages read as zero.
324 ///
325 /// # Panics
326 ///
327 /// Panics on a [`PageResolution::Missing`] page — elided content known only
328 /// by hash is **not** zero, and the recompiler hard-faults on it; silently
329 /// zero-filling would fork consensus. V1 never mints `Missing`.
330 pub fn copy_into(&self, start: u64, out: &mut [u8]) {
331 let mut done = 0usize;
332 while done < out.len() {
333 let off = start + done as u64;
334 let page_off = (off % PAGE_SIZE as u64) as usize;
335 let take = (PAGE_SIZE - page_off).min(out.len() - done);
336 match self.page_at(off) {
337 PageResolution::Bytes(bytes) => {
338 let avail = bytes.len().saturating_sub(page_off);
339 let n = avail.min(take);
340 out[done..done + n].copy_from_slice(&bytes[page_off..page_off + n]);
341 out[done + n..done + take].fill(0);
342 }
343 PageResolution::Zero => {
344 out[done..done + take].fill(0);
345 }
346 PageResolution::Missing(h) => {
347 panic!(
348 "DataCap::copy_into: Missing page at offset {off} (hash \
349 {:02x?}..) — host reads of elided pages are unsupported \
350 (would fork vs the engine's page fault)",
351 &h[..4],
352 );
353 }
354 }
355 done += take;
356 }
357 }
358
359 /// Overwrite the **backing** page containing absolute offset `off` with up
360 /// to [`PAGE_SIZE`] content bytes (zero-padded tail), canonically. This is
361 /// the **construction / settle** primitive (it mutates the backing via
362 /// `Arc::make_mut`, so it is O(slab) on a shared backing — only use it while
363 /// building a fresh cap). Copy-on-write *during execution* goes through
364 /// [`write_page`](Self::write_page) (the overlay) instead.
365 ///
366 /// Panics (debug) if `off >= self.content_len()`.
367 pub fn put_page(&mut self, off: u64, content: &[u8]) {
368 debug_assert!(
369 off < self.content_len(),
370 "DataCap::put_page: offset past logical size"
371 );
372 let p = (off / PAGE_SIZE as u64) as usize;
373 Arc::make_mut(&mut self.backing).put_page_idx(p, content);
374 }
375
376 /// Copy-on-write the **overlay** page containing absolute offset `off` with
377 /// up to [`PAGE_SIZE`] `content` bytes (zero-padded tail). The page is stored
378 /// **explicitly** as a present `Loaded` slab — even all-zero content — so it
379 /// shadows the backing. This is the execution write boundary.
380 ///
381 /// Panics (debug) if `off >= self.content_len()`.
382 pub fn write_page(&mut self, off: u64, content: &[u8]) {
383 debug_assert!(
384 off < self.content_len(),
385 "DataCap::write_page: offset past logical size"
386 );
387 let p = (off / PAGE_SIZE as u64) as u32;
388 self.overlay.insert(
389 p,
390 PageSlot::Loaded(Arc::new(PageBytes::from_content(content))),
391 );
392 }
393
394 /// Insert an already-built overlay page slot at absolute page index `p`
395 /// (move, no copy). The slab is page-aligned by construction; used by the
396 /// engines' CoW path to hand a freshly-written page to the cap directly.
397 pub fn insert_overlay_page(&mut self, p: u32, slot: PageSlot) {
398 self.overlay.insert(p, slot);
399 }
400
401 /// Place every effective page of `src` into this cap's **backing** starting
402 /// at absolute byte offset `dst_off` (page-aligned): backing page
403 /// `dst_off/PAGE_SIZE + i` becomes a clone of `src.page_slot(i)` — an `Arc`
404 /// refcount bump, **not** a byte copy. Pages beyond this cap's extent are
405 /// dropped (clamped, mirroring the interpreter's `off < extent` fold guard).
406 ///
407 /// This is the **page-sharing** instance-memory composer: a fresh Instance
408 /// `mem` built by placing an Image's mapped `Cap::Data` sources shares those
409 /// sources' physical pages, so N sub-VMs spawned from one Image all map the
410 /// same read-only frames and each CoWs (into its overlay) only the pages it
411 /// writes — the shared backing is never mutated. Effective bytes are
412 /// identical to the copying `put_page` fold; only the allocation is shared.
413 pub fn place_shared(&mut self, dst_off: u64, src: &DataCap) {
414 debug_assert!(
415 dst_off.is_multiple_of(PAGE_SIZE as u64),
416 "place_shared: dst_off must be page-aligned"
417 );
418 let base = (dst_off / PAGE_SIZE as u64) as usize;
419 let total_pages = self.backing.page_count();
420 let slab = Arc::make_mut(&mut self.backing);
421 for i in 0..src.page_count() {
422 let dst = base + i;
423 if dst >= total_pages {
424 break; // clamp to this cap's logical extent
425 }
426 if slab.pages.len() <= dst {
427 slab.pages.resize(dst + 1, PageSlot::Empty);
428 }
429 slab.pages[dst] = src.page_slot(i).clone();
430 }
431 // Re-canonicalize: trim trailing `Empty` so the layout stays unique.
432 while matches!(slab.pages.last(), Some(PageSlot::Empty)) {
433 slab.pages.pop();
434 }
435 }
436
437 /// Fold the overlay into a fresh, clean (overlay-empty) `DataCap` whose
438 /// `hash_tree_root` is defined. Clones the backing and folds every overlaid
439 /// page in via the canonical backing CoW. This is the settle / content-
440 /// address primitive (it replaces the old `DataViewCap::settle`).
441 pub fn flush(&self) -> DataCap {
442 if self.overlay.is_empty() {
443 return self.clone();
444 }
445 let mut backing = (*self.backing).clone();
446 for (&p, slot) in &self.overlay {
447 match slot {
448 PageSlot::Loaded(pr) => backing.put_page_idx(p as usize, &pr.bytes),
449 PageSlot::Empty => {}
450 PageSlot::Missing(_) => {
451 unreachable!("DataCap::flush: Missing overlay page (cold-load unsupported)")
452 }
453 }
454 }
455 DataCap {
456 backing: Arc::new(backing),
457 overlay: BTreeMap::new(),
458 }
459 }
460
461 /// Build a `DataCap` from contiguous `content`, sized to the next page
462 /// boundary (at least one page). All-zero pages become [`PageSlot::Empty`].
463 pub fn from_bytes(content: &[u8]) -> Self {
464 Self::from_bytes_sized(content, content.len() as u64)
465 }
466
467 /// Build a `DataCap` from `content` with a logical size of at least
468 /// `target_size` (rounded up to a page boundary, minimum one page).
469 /// `content` fills the low bytes; the remainder is zero (sparse). The cap is
470 /// clean (empty overlay).
471 pub fn from_bytes_sized(content: &[u8], target_size: u64) -> Self {
472 DataCap {
473 backing: Arc::new(PageSlab::from_bytes_sized(content, target_size)),
474 overlay: BTreeMap::new(),
475 }
476 }
477
478 /// Build a clean (overlay-free) `DataCap` from sparse named pages over a
479 /// logical `size`. See [`PageSlab::from_sparse_pages`]. This is the
480 /// decode target for [`crate::image::DataDesc::to_data_cap`]; the result
481 /// is byte- and hash-identical to a contiguous [`Self::from_bytes_sized`]
482 /// of equivalent logical content.
483 pub fn from_sparse_pages<'a>(
484 size: u64,
485 pages: impl IntoIterator<Item = (u32, &'a [u8])>,
486 ) -> Self {
487 DataCap {
488 backing: Arc::new(PageSlab::from_sparse_pages(size, pages)),
489 overlay: BTreeMap::new(),
490 }
491 }
492}
493
494/// Allocate a zero-filled `Vec<u8>` of `len` bytes (rounded up to the next page
495/// boundary) with `PAGE_SIZE`-aligned backing storage. Page alignment is what
496/// lets the kernel map the buffer directly into a ring-3 PT.
497///
498/// Panics if the allocator returns null (OOM) or the `Layout` overflows.
499pub fn alloc_page_aligned_zeroed(len: usize) -> Vec<u8> {
500 let padded = len.next_multiple_of(PAGE_SIZE).max(PAGE_SIZE);
501 let layout =
502 Layout::from_size_align(padded, PAGE_SIZE).expect("DataCap page-aligned layout overflow");
503 // SAFETY: `padded > 0` so the `Layout` is non-zero; the std global
504 // allocator is what `Vec` uses, so the buffer is `Vec::from_raw_parts`-safe.
505 let ptr = unsafe { alloc_zeroed(layout) };
506 if ptr.is_null() {
507 alloc::alloc::handle_alloc_error(layout);
508 }
509 // SAFETY: non-null pointer to `padded` zeroed bytes, PAGE_SIZE-aligned;
510 // capacity == len == padded, all bytes initialised (to zero).
511 unsafe { Vec::from_raw_parts(ptr, padded, padded) }
512}
513
514/// Content hash of a single page: the SSZ `hash_tree_root` of the page as a
515/// `ByteVector[PAGE_SIZE]` (zero-padded), under the cap digest (SHA-256). This
516/// is the value a materialized page contributes to the cap merkle and the
517/// precomputed [`PageBytes::hash`] kept by the substitution invariant.
518pub fn page_content_hash(bytes: &[u8]) -> CapHash {
519 let mut arr = [0u8; PAGE_SIZE];
520 let n = bytes.len().min(PAGE_SIZE);
521 arr[..n].copy_from_slice(&bytes[..n]);
522 ssz::hash_tree_root(&arr)
523}