Skip to main content

javm_exec/
args.rs

1//! Instruction argument decoding (JAM Gray Paper Appendix A.5).
2//!
3//! Handles register extraction, immediate decoding, and sign
4//! extension. Cherry-picked verbatim from v2 `javm/src/args.rs`.
5//! Pure decoding; no cap awareness.
6
/// Sign-extend the low `n` bytes of `value` to 64 bits (eq A.16: Xₙ).
///
/// X_n(x) = x + floor(x / 2^(8n-1)) * (2^64 - 2^(8n))
///
/// Bits of `value` above the low `n` bytes are discarded before extending.
///
/// * `n == 0` — there are no bytes to extend; the result is always 0.
/// * `1 <= n <= 7` — bit `8n - 1` is copied into every higher bit.
/// * `n >= 8` — the value already fills 64 bits and is returned unchanged.
#[inline(always)]
pub fn sign_extend(value: u64, n: usize) -> u64 {
    match n {
        0 => 0,
        1..=7 => {
            // Push the field's sign bit up to bit 63, then let the arithmetic
            // right shift on `i64` replicate it back down over the upper bytes.
            // `n` is at most 7 here, so the shift is in 8..=56 and cannot overflow.
            let shift = 64 - 8 * n as u32;
            (((value << shift) as i64) >> shift) as u64
        }
        _ => value, // 8 bytes (or more): no extension needed
    }
}
28
/// Reinterpret a 64-bit register value as a two's-complement signed integer
/// (eq A.10: Z₈). The bit pattern is left untouched.
pub fn to_signed(value: u64) -> i64 {
    i64::from_ne_bytes(value.to_ne_bytes())
}
33
/// Reinterpret a signed 64-bit value as its unsigned register representation
/// (eq A.11: Z₈⁻¹). The bit pattern is left untouched.
pub fn to_unsigned(value: i64) -> u64 {
    u64::from_ne_bytes(value.to_ne_bytes())
}
38
/// Sign-extend the low 32 bits of `value` to 64 bits (X₄).
///
/// The upper 32 bits of the input are ignored.
pub fn sign_extend_32(value: u64) -> u64 {
    // Truncate to the low word, view it as signed, then widen.
    let low_word = value as u32;
    i64::from(low_word as i32) as u64
}
43
/// Decode a little-endian unsigned integer from a byte slice (E_l⁻¹).
///
/// `bytes[0]` is the least significant byte. An empty slice decodes to 0.
///
/// A `u64` holds eight bytes, so only the first eight bytes of `bytes`
/// contribute to the result; any further bytes are ignored. Without that
/// limit the per-byte shift would reach 64 bits, which panics in builds with
/// overflow checks and wraps around into the low bits otherwise.
pub fn decode_le(bytes: &[u8]) -> u64 {
    bytes
        .iter()
        .take(8)
        .enumerate()
        .fold(0u64, |acc, (i, &b)| acc | u64::from(b) << (i * 8))
}
52
/// Decoded instruction arguments.
///
/// One variant per argument layout. Register fields (`ra`, `rb`, `rd`) are
/// register indices; `decode_args` clamps them to at most 12. Immediates and
/// offsets are stored as 64-bit values; offsets produced by `decode_args`
/// already have the instruction counter added in.
///
/// The type is `Copy` and compares by value, so two decode results can be
/// checked with `==` / `assert_eq!`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Args {
    /// No arguments (trap, fallthrough).
    None,
    /// One immediate value (ecalli).
    Imm { imm: u64 },
    /// One register + extended width immediate (load_imm_64).
    RegExtImm { ra: usize, imm: u64 },
    /// Two immediates (store_imm_*).
    TwoImm { imm_x: u64, imm_y: u64 },
    /// One offset (jump).
    Offset { offset: u64 },
    /// One register + one immediate.
    RegImm { ra: usize, imm: u64 },
    /// One register + two immediates.
    RegTwoImm { ra: usize, imm_x: u64, imm_y: u64 },
    /// One register + one immediate + one offset.
    RegImmOffset { ra: usize, imm: u64, offset: u64 },
    /// Two registers.
    TwoReg { rd: usize, ra: usize },
    /// Two registers + one immediate.
    TwoRegImm { ra: usize, rb: usize, imm: u64 },
    /// Two registers + one offset.
    TwoRegOffset { ra: usize, rb: usize, offset: u64 },
    /// Two registers + two immediates.
    TwoRegTwoImm {
        ra: usize,
        rb: usize,
        imm_x: u64,
        imm_y: u64,
    },
    /// Three registers.
    ThreeReg { ra: usize, rb: usize, rd: usize },
}
88
/// Byte `i` of the code blob, treating the blob as if it were followed by an
/// endless run of zero bytes (ζ, eq A.4).
#[inline(always)]
fn zeta(code: &[u8], i: usize) -> u8 {
    code.get(i).copied().unwrap_or(0)
}
94
/// Read `n` bytes from code at offset as little-endian u64 (no allocation).
///
/// The code blob is treated as zero-extended: any requested byte at or past
/// `code.len()` reads as 0, so an `offset` beyond the end yields 0.
///
/// A `u64` holds eight bytes, so `n` is clamped to 8; a larger `n` returns
/// the same value as `n == 8`.
///
/// The bounds test is done with checked slicing rather than by computing
/// `offset + n`, so an `offset` close to `usize::MAX` cannot overflow.
#[inline(always)]
fn read_le_at(code: &[u8], offset: usize, n: usize) -> u64 {
    // Clamping keeps `buf[..n]` in range and every shift below 64 bits.
    let n = n.min(8);
    // Everything from `offset` onwards; empty when `offset` is past the end.
    let tail = code.get(offset..).unwrap_or(&[]);

    match tail.get(..n) {
        // Fast path: all bytes in bounds — read directly without per-byte checks
        Some(s) => match n {
            0 => 0,
            1 => s[0] as u64,
            2 => u16::from_le_bytes([s[0], s[1]]) as u64,
            3 => s[0] as u64 | (s[1] as u64) << 8 | (s[2] as u64) << 16,
            4 => u32::from_le_bytes([s[0], s[1], s[2], s[3]]) as u64,
            _ => {
                let mut buf = [0u8; 8];
                buf[..n].copy_from_slice(s);
                u64::from_le_bytes(buf)
            }
        },
        // Slow path: near end of code. `tail` is shorter than `n` here, so
        // folding in just the bytes that exist leaves the missing high bytes
        // as zero.
        None => tail
            .iter()
            .enumerate()
            .fold(0u64, |acc, (i, &b)| acc | (b as u64) << (i * 8)),
    }
}
122
/// Read `n` bytes from code at offset, sign-extend, and return as u64.
///
/// This is a public alias for the private `read_signed_at`; it forwards its
/// arguments unchanged and adds no logic of its own.
///
/// * `code` — instruction bytes to read from.
/// * `offset` — index of the first (least significant) byte.
/// * `n` — number of bytes to read, which is also the width the value is
///   sign-extended from.
///
/// Public for use by the recompiler's inline decode path.
#[inline(always)]
pub fn read_signed_imm(code: &[u8], offset: usize, n: usize) -> u64 {
    read_signed_at(code, offset, n)
}
129
/// Read `n` bytes from code at offset as little-endian u64 (no sign extension).
///
/// This is a public alias for the private `read_le_at`; it forwards its
/// arguments unchanged and adds no logic of its own.
///
/// * `code` — instruction bytes to read from.
/// * `offset` — index of the first (least significant) byte.
/// * `n` — number of bytes to read.
///
/// Public for use by the recompiler's inline decode path (e.g., OneRegExtImm).
#[inline(always)]
pub fn read_le_imm(code: &[u8], offset: usize, n: usize) -> u64 {
    read_le_at(code, offset, n)
}
136
137/// Read `n` bytes and sign-extend (no allocation).
138#[inline(always)]
139fn read_signed_at(code: &[u8], offset: usize, n: usize) -> u64 {
140    sign_extend(read_le_at(code, offset, n), n)
141}
142
143/// Decode arguments based on instruction category.
144///
145/// `pc` is the instruction counter (ı), `skip` is the skip length (ℓ),
146/// `code` is the instruction data with implicit zero extension.
147pub fn decode_args(
148    code: &[u8],
149    pc: usize,
150    skip: usize,
151    category: crate::instruction::InstructionCategory,
152) -> Args {
153    use crate::instruction::InstructionCategory::*;
154    let l = skip; // ℓ = skip(ı)
155
156    match category {
157        NoArgs => Args::None,
158
159        // A.5.2: lX = min(4, ℓ), νX = X_lX(E_lX⁻¹(ζ[ı+1..+lX]))
160        OneImm => {
161            let lx = l.min(4);
162            let imm = read_signed_at(code, pc + 1, lx);
163            Args::Imm { imm }
164        }
165
166        // A.5.3: rA = min(12, ζ[ı+1] mod 16), νX = E₈⁻¹(ζ[ı+2..+8])
167        OneRegExtImm => {
168            let ra = (zeta(code, pc + 1) % 16).min(12) as usize;
169            let imm = read_le_at(code, pc + 2, 8);
170            Args::RegExtImm { ra, imm }
171        }
172
173        // A.5.4: lX = min(4, ζ[ı+1] mod 8)
174        TwoImm => {
175            let lx = (zeta(code, pc + 1) as usize % 8).min(4);
176            let ly = if l > lx + 1 { (l - lx - 1).min(4) } else { 0 };
177            let imm_x = read_signed_at(code, pc + 2, lx);
178            let imm_y = read_signed_at(code, pc + 2 + lx, ly);
179            Args::TwoImm { imm_x, imm_y }
180        }
181
182        // A.5.5: lX = min(4, ℓ), νX = ı + Z_lX(...)
183        OneOffset => {
184            let lx = l.min(4);
185            let signed_offset = read_signed_at(code, pc + 1, lx) as i64;
186            let offset = (pc as i64).wrapping_add(signed_offset) as u64;
187            Args::Offset { offset }
188        }
189
190        // A.5.6: rA = min(12, ζ[ı+1] mod 16), lX = min(4, max(0, ℓ-1))
191        OneRegOneImm => {
192            let ra = (zeta(code, pc + 1) % 16).min(12) as usize;
193            let lx = if l > 1 { (l - 1).min(4) } else { 0 };
194            let imm = read_signed_at(code, pc + 2, lx);
195            Args::RegImm { ra, imm }
196        }
197
198        // A.5.7: rA = min(12, ζ[ı+1] mod 16), lX = min(4, floor(ζ[ı+1]/16) mod 8)
199        OneRegTwoImm => {
200            let reg_byte = zeta(code, pc + 1);
201            let ra = (reg_byte % 16).min(12) as usize;
202            let lx = ((reg_byte as usize / 16) % 8).min(4);
203            let ly = if l > lx + 1 { (l - lx - 1).min(4) } else { 0 };
204            let imm_x = read_signed_at(code, pc + 2, lx);
205            let imm_y = read_signed_at(code, pc + 2 + lx, ly);
206            Args::RegTwoImm { ra, imm_x, imm_y }
207        }
208
209        // A.5.8: Same register/immediate encoding as OneRegTwoImm, but second is offset
210        OneRegImmOffset => {
211            let reg_byte = zeta(code, pc + 1);
212            let ra = (reg_byte % 16).min(12) as usize;
213            let lx = ((reg_byte as usize / 16) % 8).min(4);
214            let ly = if l > lx + 1 { (l - lx - 1).min(4) } else { 0 };
215            let imm = read_signed_at(code, pc + 2, lx);
216            let signed_offset = read_signed_at(code, pc + 2 + lx, ly) as i64;
217            let offset = (pc as i64).wrapping_add(signed_offset) as u64;
218            Args::RegImmOffset { ra, imm, offset }
219        }
220
221        // A.5.9: rD = min(12, ζ[ı+1] mod 16), rA = min(12, floor(ζ[ı+1]/16))
222        TwoReg => {
223            let reg_byte = zeta(code, pc + 1);
224            let rd = (reg_byte % 16).min(12) as usize;
225            let ra = (reg_byte / 16).min(12) as usize;
226            Args::TwoReg { rd, ra }
227        }
228
229        // A.5.10: rA = min(12, ζ[ı+1] mod 16), rB = min(12, floor(ζ[ı+1]/16))
230        TwoRegOneImm => {
231            let reg_byte = zeta(code, pc + 1);
232            let ra = (reg_byte % 16).min(12) as usize;
233            let rb = (reg_byte / 16).min(12) as usize;
234            let lx = if l > 1 { (l - 1).min(4) } else { 0 };
235            let imm = read_signed_at(code, pc + 2, lx);
236            Args::TwoRegImm { ra, rb, imm }
237        }
238
239        // A.5.11: Same as TwoRegOneImm but immediate is an offset
240        TwoRegOneOffset => {
241            let reg_byte = zeta(code, pc + 1);
242            let ra = (reg_byte % 16).min(12) as usize;
243            let rb = (reg_byte / 16).min(12) as usize;
244            let lx = if l > 1 { (l - 1).min(4) } else { 0 };
245            let signed_offset = read_signed_at(code, pc + 2, lx) as i64;
246            let offset = (pc as i64).wrapping_add(signed_offset) as u64;
247            Args::TwoRegOffset { ra, rb, offset }
248        }
249
250        // A.5.12: rA, rB from reg_byte, lX from ζ[ı+2]
251        TwoRegTwoImm => {
252            let reg_byte = zeta(code, pc + 1);
253            let ra = (reg_byte % 16).min(12) as usize;
254            let rb = (reg_byte / 16).min(12) as usize;
255            let lx = (zeta(code, pc + 2) as usize % 8).min(4);
256            let ly = if l > lx + 2 { (l - lx - 2).min(4) } else { 0 };
257            let imm_x = read_signed_at(code, pc + 3, lx);
258            let imm_y = read_signed_at(code, pc + 3 + lx, ly);
259            Args::TwoRegTwoImm {
260                ra,
261                rb,
262                imm_x,
263                imm_y,
264            }
265        }
266
267        // A.5.13: rA, rB from first reg_byte, rD from second byte
268        ThreeReg => {
269            let reg_byte = zeta(code, pc + 1);
270            let ra = (reg_byte % 16).min(12) as usize;
271            let rb = (reg_byte / 16).min(12) as usize;
272            let rd = zeta(code, pc + 2).min(12) as usize;
273            Args::ThreeReg { ra, rb, rd }
274        }
275    }
276}
277
#[cfg(test)]
mod tests {
    use super::*;

    /// Values whose top bit (at the given width) is clear come back unchanged.
    #[test]
    fn test_sign_extend_positive() {
        let cases: [(u64, usize); 3] = [(0x7F, 1), (0x7FFF, 2), (0x7FFF_FFFF, 4)];
        for (input, width) in cases {
            assert_eq!(sign_extend(input, width), input);
        }
    }

    /// Values whose top bit (at the given width) is set get all higher bits set.
    #[test]
    fn test_sign_extend_negative() {
        let cases: [(u64, usize, u64); 3] = [
            (0x80, 1, 0xFFFF_FFFF_FFFF_FF80),
            (0x8000, 2, 0xFFFF_FFFF_FFFF_8000),
            (0x8000_0000, 4, 0xFFFF_FFFF_8000_0000),
        ];
        for (input, width, expected) in cases {
            assert_eq!(sign_extend(input, width), expected);
        }
    }

    /// The three-byte width has its own code path; check both signs.
    #[test]
    fn test_sign_extend_3byte() {
        let largest_positive = sign_extend(0x7F_FFFF, 3);
        let smallest_negative = sign_extend(0x80_0000, 3);
        assert_eq!(largest_positive, 0x7F_FFFF);
        assert_eq!(smallest_negative, 0xFFFF_FFFF_FF80_0000);
    }

    /// Little-endian decoding of a multi-byte, a single-byte and an empty slice.
    #[test]
    fn test_decode_le() {
        let four_bytes = [0x01, 0x02, 0x03, 0x04];
        assert_eq!(decode_le(&four_bytes), 0x04030201);
        assert_eq!(decode_le(&[0xFF]), 0xFF);
        assert_eq!(decode_le(&[]), 0);
    }
}
309
// Property-based tests for the decoding helpers, driven by the `proptest`
// crate. Each property is checked against randomly generated inputs drawn
// from the strategies named in its parameter list.
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        /// sign_extend is idempotent: extending twice gives the same result.
        /// Checked for widths 0 through 4 only.
        #[test]
        fn sign_extend_idempotent(value in any::<u64>(), n in 0usize..=4) {
            let once = sign_extend(value, n);
            let twice = sign_extend(once, n);
            prop_assert_eq!(once, twice);
        }

        /// to_signed and to_unsigned are inverses.
        #[test]
        fn signed_unsigned_roundtrip(value in any::<u64>()) {
            prop_assert_eq!(to_unsigned(to_signed(value)), value);
        }

        /// decode_le of a single byte equals that byte.
        #[test]
        fn decode_le_single_byte(b in any::<u8>()) {
            prop_assert_eq!(decode_le(&[b]), b as u64);
        }

        /// decode_le is deterministic.
        ///
        /// NOTE(review): this only checks that two calls on the same slice
        /// agree; it does not pin the decoded value. Slices here are 0 to 7
        /// bytes long.
        #[test]
        fn decode_le_deterministic(
            bytes in proptest::collection::vec(any::<u8>(), 0..8),
        ) {
            prop_assert_eq!(decode_le(&bytes), decode_le(&bytes));
        }

        /// sign_extend with n=0 always returns 0.
        #[test]
        fn sign_extend_zero_width_is_zero(value in any::<u64>()) {
            prop_assert_eq!(sign_extend(value, 0), 0);
        }

        /// sign_extend_32 matches sign_extend with n=4.
        #[test]
        fn sign_extend_32_matches_generic(value in any::<u64>()) {
            prop_assert_eq!(sign_extend_32(value), sign_extend(value, 4));
        }

        /// decode_args register indices are always <= 12 for all categories.
        ///
        /// Inputs: 3 to 15 random code bytes, a skip length of 0 to 7, and an
        /// index selecting one of the thirteen categories. Decoding always
        /// starts at pc 0.
        #[test]
        fn decode_args_registers_bounded(
            code in proptest::collection::vec(any::<u8>(), 3..16),
            skip in 0usize..8,
            category_idx in 0u8..13,
        ) {
            use crate::instruction::InstructionCategory::*;
            // Map the random index onto a category; the strategy above only
            // produces 0..=12, so the fallback arm is never taken.
            let category = match category_idx {
                0 => NoArgs,
                1 => OneImm,
                2 => OneRegExtImm,
                3 => TwoImm,
                4 => OneOffset,
                5 => OneRegOneImm,
                6 => OneRegTwoImm,
                7 => OneRegImmOffset,
                8 => TwoReg,
                9 => TwoRegOneImm,
                10 => TwoRegOneOffset,
                11 => TwoRegTwoImm,
                12 => ThreeReg,
                _ => unreachable!(),
            };
            let args = decode_args(&code, 0, skip, category);
            match args {
                // Variants without register fields: nothing to check.
                Args::None | Args::Imm { .. } | Args::TwoImm { .. } | Args::Offset { .. } => {}
                // Variants with a single register.
                Args::RegExtImm { ra, .. }
                | Args::RegImm { ra, .. }
                | Args::RegTwoImm { ra, .. }
                | Args::RegImmOffset { ra, .. } => {
                    prop_assert!(ra <= 12);
                }
                Args::TwoReg { rd, ra } => {
                    prop_assert!(rd <= 12);
                    prop_assert!(ra <= 12);
                }
                // Variants with two source registers.
                Args::TwoRegImm { ra, rb, .. }
                | Args::TwoRegOffset { ra, rb, .. }
                | Args::TwoRegTwoImm { ra, rb, .. } => {
                    prop_assert!(ra <= 12);
                    prop_assert!(rb <= 12);
                }
                Args::ThreeReg { ra, rb, rd } => {
                    prop_assert!(ra <= 12);
                    prop_assert!(rb <= 12);
                    prop_assert!(rd <= 12);
                }
            }
        }

        /// decode_args is deterministic: same inputs produce the same variant.
        ///
        /// Only the `TwoRegOneImm` category is exercised here.
        #[test]
        fn decode_args_deterministic(
            code in proptest::collection::vec(any::<u8>(), 3..16),
            skip in 0usize..8,
        ) {
            use crate::instruction::InstructionCategory::*;
            let args1 = decode_args(&code, 0, skip, TwoRegOneImm);
            let args2 = decode_args(&code, 0, skip, TwoRegOneImm);
            // Check same variant and same register values
            match (args1, args2) {
                (Args::TwoRegImm { ra: a1, rb: b1, imm: i1 },
                 Args::TwoRegImm { ra: a2, rb: b2, imm: i2 }) => {
                    prop_assert_eq!(a1, a2);
                    prop_assert_eq!(b1, b2);
                    prop_assert_eq!(i1, i2);
                }
                // Any other variant pairing means the decoder returned the
                // wrong layout for this category.
                _ => prop_assert!(false),
            }
        }
    }
}