javm_recompiler_x86/
codegen.rs

1//! PVM-to-x86-64 code generation.
2//!
3//! Compiles PVM bytecode into native x86-64 machine code. Each PVM basic block
4//! becomes a native basic block with gas metering at entry. PVM registers are
5//! mapped to x86-64 registers for the duration of execution.
6//!
7//! Register mapping (PVM `φ[i]` → x86-64):
8//!   `φ[0]`  → RBP   (callee-saved) — RA, rarely used as memory base
9//!   `φ[1]`  → RBX   (callee-saved) — SP, avoids RBP encoding penalty
10//!   `φ[2]`  → R12   (callee-saved)
11//!   `φ[3]`  → R13   (callee-saved)
12//!   `φ[4]`  → R14   (callee-saved)
13//!   `φ[5]`  → RSI   (caller-saved)
14//!   `φ[6]`  → RDI   (caller-saved)
15//!   `φ[7]`  → R8    (caller-saved)
16//!   `φ[8]`  → R9    (caller-saved)
17//!   `φ[9]`  → R10   (caller-saved)
18//!   `φ[10]` → R11   (caller-saved)
19//!   `φ[11]` → RAX   (caller-saved)
20//!   `φ[12]` → RCX   (caller-saved)
21//!
22//! Reserved: R15 = gas meter, RDX = scratch, RSP = native stack.
23
24use alloc::vec;
25use alloc::vec::Vec;
26
27use super::asm::{Assembler, Cc, Label, Reg};
28use javm_exec::gas_sim::GasSimulator;
29
/// Map a PVM register slot `i` (0..=12, i.e. `φ[i]`) to its x86-64 register.
///
/// All 13 PVM registers live in x86 registers. The prologue loads them from
/// `ctx.regs` in this order and the common exit path stores them back in the
/// same order, so the table index is also the register's index in the
/// context's register array.
pub(crate) const REG_MAP: [Reg; 13] = [
    Reg::RBP, // φ[0] — RA (rarely used as memory base, so RBP encoding penalty is acceptable)
    Reg::RBX, // φ[1] — SP (frequently used as memory base, RBX avoids RBP disp8 penalty)
    Reg::R12, // φ[2]
    Reg::R13, // φ[3]
    Reg::R14, // φ[4]
    Reg::RSI, // φ[5]
    Reg::RDI, // φ[6]
    Reg::R8,  // φ[7]
    Reg::R9,  // φ[8]
    Reg::R10, // φ[9]
    Reg::R11, // φ[10]
    Reg::RAX, // φ[11]
    Reg::RCX, // φ[12] — also the hardware shift-count register (CL)
];
47
/// Scratch register (not mapped to any PVM register).
///
/// RDX is deliberately absent from [`REG_MAP`], so generated code may clobber
/// it freely. In this file it is passed as the address operand of guest
/// loads/stores, used for the prologue's dispatch-table lookup, carries the
/// PVM PC into the shared out-of-gas handler, and serves as the temporary in
/// the shift sequence where the destination is RCX.
pub(crate) const SCRATCH: Reg = Reg::RDX;
50
51/// RV register (5-bit, 0..31) → PVM2 slot (0..14), or `0xFF` for "no
52/// slot" (x0, or a reserved register x16..x31). A 32-byte const LUT:
53/// one load replaces the range-match (~8.8% of compile, called
54/// ~6×/instruction across codegen + gas feed). The classification is the
55/// single source [`javm_exec::regs`]; this is its const-folded copy, so
56/// gas stays bit-identical with the interpreter's predecode-cached path.
57pub(crate) use javm_exec::regs::REG_SLOT_LUT as RV_SLOT_LUT;
58
59/// RV register number → PVM2 slot (0..12), or `0xFF` for "no slot".
60#[inline(always)]
61pub(crate) fn rv_slot_or_ff(x: u8) -> u8 {
62    RV_SLOT_LUT[(x & 31) as usize]
63}
/// R15 = gas meter. Loaded from `ctx.gas` at the prologue, decremented
/// once per basic block, flushed back to `ctx.gas` at every exit.
///
/// R15 is reserved for this purpose and is not part of [`REG_MAP`]. It is a
/// callee-saved register, so the prologue pushes the host's value and the
/// exit path pops it again.
pub(crate) const GAS: Reg = Reg::R15;
67
/// Virtual address at which the `JitContext` is mapped.
///
/// JitContext lives above the PVM u32 address space (no bounds check
/// on guest mem — the full low 4 GiB of native VA belongs to the
/// program). CTX is reached via RIP-relative `[rip+disp32]`, which
/// gives ±2 GiB range from the JIT code, so CTX must be **adjacent**
/// to the JIT region.
///
/// In the nub-x86 microkernel, CTX and the per-Image JIT arena both
/// live in PML4 slot 1 (base 512 GiB). Sharing one PML4 slot lets
/// the Image's PDPT subtree be cached as a template across all
/// Instances (per-call PT just shallow-clones the slot's entry). MEM
/// stays in `PML4[0]` at VA 0 so PVM addr == native VA still holds.
///
/// The value `1 << 39` is exactly 512 GiB, i.e. the base of PML4 slot 1.
pub const CTX_VA: u64 = 1u64 << 39;
80
81use super::JitContext;
82use memoffset::offset_of;
83
// Absolute virtual addresses of individual `JitContext` fields: `CTX_VA`
// plus the field's byte offset inside the struct. Generated code hands these
// to the assembler's `*_rip_rel` load/store helpers.

/// PVM register save area; register `i` of `REG_MAP` is at `CTX_REGS + 8 * i`.
pub const CTX_REGS: u64 = CTX_VA + offset_of!(JitContext, regs) as u64;
/// Gas counter: loaded into the gas register by the prologue, written back on exit.
pub const CTX_GAS: u64 = CTX_VA + offset_of!(JitContext, gas) as u64;
/// Exit reason slot; receives one of the `EXIT_*` codes (cleared to 0 by the prologue).
pub const CTX_EXIT_REASON: u64 = CTX_VA + offset_of!(JitContext, exit_reason) as u64;
/// Exit argument slot, written together with the reason by `emit_exit`.
pub const CTX_EXIT_ARG: u64 = CTX_VA + offset_of!(JitContext, exit_arg) as u64;
/// Entry PC: read by the prologue as the index into the dispatch table.
pub const CTX_ENTRY_PC: u64 = CTX_VA + offset_of!(JitContext, entry_pc) as u64;
/// PVM PC slot, written by the panic and out-of-gas paths before exiting.
pub const CTX_PC: u64 = CTX_VA + offset_of!(JitContext, pc) as u64;
/// Pointer to the dispatch table consulted by the prologue.
pub const CTX_DISPATCH_TABLE: u64 = CTX_VA + offset_of!(JitContext, dispatch_table) as u64;
/// Base value the prologue adds to the dispatch-table entry to form the jump target.
pub const CTX_CODE_BASE: u64 = CTX_VA + offset_of!(JitContext, code_base) as u64;
/// Host RSP captured after the prologue's pushes; restored by the exit path.
pub const CTX_HOST_RSP_BASE: u64 = CTX_VA + offset_of!(JitContext, host_rsp_base) as u64;
93
/// Exit reason codes (matching ExitReason enum).
///
/// These are the values generated code stores into `CTX_EXIT_REASON`. The
/// prologue clears that slot to 0, which is also `EXIT_HALT`.
// NOTE(review): value 5 is not defined here — presumably it belongs to an
// `ExitReason` variant this file never emits; confirm against the enum.
pub const EXIT_HALT: u32 = 0;
/// Stored by the shared panic stub and by invalid static branch targets.
pub const EXIT_PANIC: u32 = 1;
/// Stored by the shared out-of-gas handler.
pub const EXIT_OOG: u32 = 2;
pub const EXIT_PAGE_FAULT: u32 = 3;
pub const EXIT_HOST_CALL: u32 = 4;
pub const EXIT_ECALL: u32 = 6;
pub const EXIT_TRAP: u32 = 7;
102
/// Result of compilation.
///
/// Bundles the emitted machine code with the side tables the runtime needs
/// to enter it (dispatch entries), to recover from faults inside it (trap
/// table), and to locate its shared exit/panic stubs.
pub struct CompileResult {
    /// The generated x86-64 machine code.
    pub native_code: Vec<u8>,
    /// Sparse dispatch entries — `(pvm_pc, native_offset)` for every
    /// gas-block start. The runtime arena's dispatch region is
    /// page-zero-filled, so callers only need to write these
    /// non-zero entries instead of materialising a dense
    /// `code.len() + 1`-sized array.
    pub dispatch_entries: Vec<(u32, i32)>,
    /// Per-mem-op trap entries `(native_offset, pvm_pc, access_width)`,
    /// sorted by `native_offset`. The #PF handler binary-searches this by
    /// the faulting RIP's native offset to recover the PVM PC (for the
    /// PageFault exit / OOG resume) and the access width (for the
    /// category-#3 straddle page-set, so both engines materialize the
    /// same pages via `mat::access_pages`).
    pub trap_table: Vec<(u32, u32, u32)>,
    /// Presumably the native offset of the common exit sequence (the
    /// compiler's `exit_label`) — TODO confirm against the producer.
    pub exit_label_offset: u32,
    /// Native offset of the panic stub. The runtime dense-fills the
    /// dispatch table with this so a `jalr` to any non-block-start
    /// offset lands on the panic stub *via the table* — folding the
    /// block-start validation into the dispatch lookup (no separate
    /// `bb_starts` set). The RV path always populates this.
    pub panic_offset: u32,
}
127
/// Helper function pointers passed to compiled code.
///
/// Each field is a helper's address stored as a raw `u64`; the struct is
/// `#[repr(C)]`, so the field order below is its memory layout. Within this
/// file the addresses are only compared against a caller-supplied `fn_addr`
/// to infer the access width of a load or store (1, 2, 4 or 8 bytes).
#[repr(C)]
pub struct HelperFns {
    // Read helpers, by access width.
    pub mem_read_u8: u64,
    pub mem_read_u16: u64,
    pub mem_read_u32: u64,
    pub mem_read_u64: u64,
    // Write helpers, by access width.
    pub mem_write_u8: u64,
    pub mem_write_u16: u64,
    pub mem_write_u32: u64,
    pub mem_write_u64: u64,
}
140
/// Tracks what a PVM register was last set to, for peephole optimization.
///
/// The compiler keeps one `RegDef` per PVM register. `src`, `base` and `idx`
/// are PVM register indices, so a definition becomes stale as soon as one of
/// the registers it names is overwritten; the `invalidate_*` methods reset
/// such entries to [`RegDef::Unknown`].
#[derive(Clone, Copy, Debug)]
pub(crate) enum RegDef {
    /// Unknown or complex value.
    Unknown,
    /// Known compile-time constant (32-bit address or immediate).
    Const(u32),
    /// reg = src << shift (shift 1..=3, i.e. *2, *4, *8).
    /// Built from: add D,A,A → Shifted{src:A, shift:1}
    ///             add D,D,D where D=Shifted{src,s} → Shifted{src, shift:s+1}
    Shifted { src: usize, shift: u8 },
    /// reg = base + (idx << shift) (shift 0..=3, i.e. *1, *2, *4, *8).
    /// Built from: add D,BASE,S where S=Shifted{src,s} → ScaledAdd{base:BASE, idx:src, shift:s}
    ScaledAdd { base: usize, idx: usize, shift: u8 },
}
156
/// In-flight per-block gas-gate patch state. The block's `cmp r15,
/// cost+reserve` gate and `sub r15, cost` charge are emitted with
/// placeholder imm32s at the block start; their offsets are stashed here
/// and patched once the block's cost + #3 reserve are known (at the next
/// block start, or the final flush).
struct PendingGas {
    /// Presumably the label of this block's out-of-gas stub (the target of
    /// the gate's failing branch) — TODO confirm at the use site.
    stub_label: Label,
    /// PVM PC at which the gas block starts.
    block_pc: u32,
    /// Offset of the gate `cmp`'s placeholder, to be patched with cost+reserve.
    cmp_offset: usize,
    /// Offset of the charge `sub`'s placeholder, to be patched with the cost.
    sub_offset: usize,
}
168
/// PVM-to-x86-64 compiler.
///
/// Owns the assembler plus all per-compilation state: the pre-allocated
/// per-PC labels, the shared exit/panic/out-of-gas labels, peephole register
/// tracking, the trap table, and the gas simulation for the block currently
/// being emitted.
pub struct Compiler {
    /// Assembler receiving the generated machine code.
    pub asm: Assembler,
    /// Base label ID for PC labels. label_for_pc(pc) = Label(label_base + pc).
    /// Labels are bulk-allocated in the assembler with LABEL_UNBOUND=0 (zeroed pages).
    pub(crate) label_base: u32,
    /// Gas block start PCs discovered during compilation (for dispatch table).
    pub(crate) gas_block_pcs: Vec<u32>,
    /// Label for the exit sequence.
    pub(crate) exit_label: Label,
    /// Label for the shared out-of-gas exit (sets EXIT_OOG + jumps to exit).
    oog_label: Label,
    /// Label for panic exit.
    pub(crate) panic_label: Label,
    /// Label for OOG handler that reads PC from SCRATCH: stores PC, then falls through to oog_label.
    oog_pc_label: Label,
    /// Per-gas-block OOG stubs, `(label, pvm_pc, block_cost)` — emitted as
    /// cold code after the main body. Only the label and PC are consumed when
    /// the stubs are emitted.
    pub(crate) oog_stubs: Vec<(Label, u32, u32)>,
    /// Helper function addresses.
    pub(crate) helpers: HelperFns,
    /// Bitmask reference (1 = instruction start). Stored as raw pointer for self-referential use.
    pub(crate) bitmask_ptr: *const u8,
    /// Number of bytes readable through `bitmask_ptr` (0 while the pointer is null).
    pub(crate) bitmask_len: usize,
    /// Peephole: tracks how each PVM register was last defined.
    pub(crate) reg_defs: [RegDef; 13],
    /// Bitmask of registers that have non-Unknown reg_defs (for fast invalidation).
    pub(crate) reg_defs_active: u16,
    /// Carry flag fusion: after an `add64 D, A, B`, CF = overflow(A+B).
    /// Stores (D, A, B) so that a subsequent `setLtU C, D, A` or `setLtU C, D, B`
    /// can use CF directly instead of emitting a redundant `cmp`.
    /// Cleared by any instruction that clobbers flags (i.e., everything except the
    /// immediately following setLtU).
    pub(crate) last_add_cf: Option<(usize, usize, usize)>,
    /// Trap table for signal-based bounds checking + category-#3 fault
    /// resolution: `(native_offset, pvm_pc, access_width)`. One entry per
    /// guest load/store, recorded at the native offset of the faulting
    /// MOV.
    pub(crate) trap_entries: Vec<(u32, u32, u32)>,
    /// Memory tier load/store cycles for gas simulation.
    pub(crate) mem_cycles: u8,
    /// Pipeline simulator for per-block gas costing. The streaming
    /// compile path drives this directly from `compile_rv_instruction`
    /// arms (so the per-instruction loop performs ONE match over
    /// `Inst`); `bind_rv_gas_block_start_streaming` flushes it at
    /// block boundaries.
    pub(crate) gas_sim: GasSimulator,
    /// Accumulated worst-case category-#3 reserve for the block being
    /// streamed (sum of `gas_cost::rv_kind_reserve` over its loads /
    /// stores). Reset at each gas-block start; folded into the block's
    /// `cmp r15, cost+reserve` gate. Mirrors the interpreter's
    /// `predecode.block_reserves`, accumulated at the same per-instruction
    /// feed points so the two engines gate identically.
    pub(crate) gas_reserve_accum: u32,
    /// When set, [`feed_gas_rv`](Compiler::feed_gas_rv) does **not** feed the
    /// real `gas_sim`. Used by `compile_rv_spilled`: an x3/x4 instruction is
    /// re-dispatched through the fast path with rewritten (donor) registers,
    /// but its gas is fed once with the *original* x3/x4 slots (so the spill
    /// cost is charged and the interpreter is matched bit-for-bit). The
    /// throwaway path still returns the correct terminator flag.
    pub(crate) suppress_gas: bool,
    /// Guest VA the code region is mapped at. `jalr`/`auipc` produce
    /// and consume code addresses as `code_base + offset`; the dispatch
    /// table is offset-indexed (offset = VA - code_base).
    pub(crate) code_base: u32,
    /// Code region length in bytes — the upper bound for jalr target
    /// offsets (== dispatch-table length in entries).
    pub(crate) code_len: u32,
    /// True during the streaming compile walk (`compile`). When set,
    /// branch emit helpers defer forward-target validation (`target > pc`)
    /// to a post-pass instead of consulting `bitmask_ptr`; the post-pass
    /// clears it so those helpers consult the now-populated `rv_valid_pc`.
    pub(crate) rv_streaming: bool,
    /// Forward branches whose target validity could not be determined at
    /// emit time. Resolved post-pass: each entry is
    /// `(target_pc, branch_pc, fixup_idx)`. If `valid_pc[target]` is
    /// false after the streaming pass, the fixup is redirected to a
    /// per-branch panic stub.
    pub(crate) rv_pending_fwd_branches: Vec<(u32, u32, usize)>,
    /// Backing storage for `bitmask_ptr` during the streaming compile.
    /// Built incrementally in `bind_rv_gas_block_start_streaming`.
    pub(crate) rv_valid_pc: Vec<bool>,
}
251
252impl Compiler {
253    pub fn new(
254        helpers: HelperFns,
255        code_len: usize,
256        jit_va_base: u64,
257        mem_cycles: u8,
258        code_base: u32,
259    ) -> Self {
260        // Estimate native code size: ~3x PVM code provides safety margin for
261        // direct-write emission (no per-byte capacity checks in hot loop).
262        let estimated_native = code_len * 3 + 8192;
263        // Labels: one per PC (dense array) + fixed overhead for exit/oog/stubs.
264        let estimated_labels = code_len + 1024;
265        // mmap-backed assembler buffer was a host-only path; the recompiler is
266        // now embedded only by `nub-arch-x86`, which uses the Vec-backed form.
267        let mut asm = Assembler::with_capacity(estimated_native, estimated_labels);
268        // RIP-relative CTX accesses need the eventual load VA to compute
269        // disp32. Callers from a per-invocation runtime pass JIT_VA_M;
270        // tests pass 0 (encodings reference offset 0).
271        asm.set_jit_va_base(jit_va_base);
272        // Reserve label 0 so label IDs start from 1 (for consistency with fixed labels).
273        let _reserved = asm.new_label(); // Label(0)
274        let exit_label = asm.new_label();
275        let oog_label = asm.new_label();
276        let panic_label = asm.new_label();
277        let oog_pc_label = asm.new_label();
278        // Pre-create one label per PC for O(1) lookup in label_for_pc.
279        // With LABEL_UNBOUND=0, bulk allocation uses zeroed pages (calloc/COW).
280        // Only the ~640 labels that get bound trigger page faults — the other
281        // ~110K labels stay on zero pages and cost nothing.
282        let label_base = asm.labels_len() as u32;
283        asm.bulk_create_labels(code_len + 1);
284        Self {
285            label_base,
286            gas_block_pcs: Vec::with_capacity(1024),
287            asm,
288            exit_label,
289            oog_label,
290            panic_label,
291            oog_pc_label,
292            oog_stubs: Vec::with_capacity(1024),
293            reg_defs: [RegDef::Unknown; 13],
294            reg_defs_active: 0,
295            last_add_cf: None,
296            helpers,
297            bitmask_ptr: core::ptr::null(),
298            bitmask_len: 0,
299            trap_entries: Vec::with_capacity(2048),
300            mem_cycles,
301            gas_sim: GasSimulator::new(),
302            gas_reserve_accum: 0,
303            suppress_gas: false,
304            code_base,
305            code_len: code_len as u32,
306            rv_streaming: false,
307            rv_pending_fwd_branches: Vec::new(),
308            rv_valid_pc: Vec::new(),
309        }
310    }
311
312    /// RV streaming-compile gas feed. Each `compile_rv_instruction`
313    /// arm calls this once with its kind constant + raw RV register
314    /// indices; we slot-translate inline and call
315    /// `rv_feed_gas_kind` against `self.gas_sim`. Returns
316    /// `is_terminator` (RVF_TERM flag from the LUT entry).
317    #[inline(always)]
318    pub(crate) fn feed_gas_rv(&mut self, kind: u8, rs1: u8, rs2: u8, rd: u8) -> bool {
319        // During a spilled re-dispatch, gas is fed once by `compile_rv_spilled`
320        // with the original x3/x4 slots; don't perturb the real sim here. Feed
321        // a throwaway sim only to recover the terminator flag.
322        if self.suppress_gas {
323            let mut throwaway = GasSimulator::new();
324            return javm_exec::gas_cost::rv_feed_gas_kind(
325                kind,
326                rv_slot_or_ff(rs1),
327                rv_slot_or_ff(rs2),
328                rv_slot_or_ff(rd),
329                &mut throwaway,
330                self.mem_cycles,
331            );
332        }
333        // Accumulate the per-instruction #3 reserve at the same point we
334        // feed the real gas sim, so the block's `cost+reserve` gate
335        // matches the interpreter's `block_reserves` bit-for-bit.
336        self.gas_reserve_accum = self
337            .gas_reserve_accum
338            .saturating_add(javm_exec::gas_cost::rv_kind_reserve(kind));
339        javm_exec::gas_cost::rv_feed_gas_kind(
340            kind,
341            rv_slot_or_ff(rs1),
342            rv_slot_or_ff(rs2),
343            rv_slot_or_ff(rd),
344            &mut self.gas_sim,
345            self.mem_cycles,
346        )
347    }
348
349    /// Look up the pre-created label for a PVM PC. O(1) arithmetic.
350    #[inline]
351    pub(crate) fn label_for_pc(&self, pc: u32) -> Label {
352        Label(self.label_base + pc)
353    }
354
355    pub(crate) fn is_basic_block_start(&self, idx: u32) -> bool {
356        let i = idx as usize;
357        // SAFETY: bitmask_ptr points to the start of a valid &[u8] slice of length
358        // bitmask_len, and i < bitmask_len is checked before the dereference.
359        i < self.bitmask_len && unsafe { *self.bitmask_ptr.add(i) } == 1
360    }
361
362    /// Emit memory read with bounds check (cold fault path).
363    /// Hot path: cmp + jae + load (2 instructions, no extra stores).
364    /// No bounds check — SIGSEGV handler catches OOB.
365    pub(crate) fn emit_mem_read_sized(
366        &mut self,
367        dst: Reg,
368        fn_addr: u64,
369        width_bytes: u32,
370        pvm_pc: u32,
371    ) {
372        let w = if width_bytes > 0 {
373            width_bytes
374        } else if fn_addr == self.helpers.mem_read_u8 {
375            1
376        } else if fn_addr == self.helpers.mem_read_u16 {
377            2
378        } else if fn_addr == self.helpers.mem_read_u32 {
379            4
380        } else {
381            8
382        };
383
384        // Record trap entry before the load instruction (for SIGSEGV
385        // handler + #3 materialization). `w` is the access width.
386        self.trap_entries
387            .push((self.asm.offset() as u32, pvm_pc, w));
388
389        match w {
390            1 => self.asm.movzx_load8_at_index(dst, SCRATCH),
391            2 => self.asm.movzx_load16_at_index(dst, SCRATCH),
392            4 => self.asm.mov_load32_at_index(dst, SCRATCH),
393            8 => self.asm.mov_load64_at_index(dst, SCRATCH),
394            _ => unreachable!(),
395        }
396    }
397
398    /// Emit sign extension after a memory load, if the opcode is a signed variant.
399    /// Handles both direct loads (LoadI8/I16/I32) and indirect loads (LoadIndI8/I16/I32).
400    pub(crate) fn emit_mem_write(
401        &mut self,
402        _addr_in_scratch: bool,
403        val_reg: Reg,
404        fn_addr: u64,
405        pvm_pc: u32,
406    ) {
407        let w = if fn_addr == self.helpers.mem_write_u8 {
408            1u32
409        } else if fn_addr == self.helpers.mem_write_u16 {
410            2
411        } else if fn_addr == self.helpers.mem_write_u32 {
412            4
413        } else {
414            8
415        };
416
417        // Record trap entry before the store instruction (for SIGSEGV
418        // handler + #3 materialization). `w` is the access width.
419        self.trap_entries
420            .push((self.asm.offset() as u32, pvm_pc, w));
421
422        match w {
423            1 => self.asm.mov_store8_at_index(SCRATCH, val_reg),
424            2 => self.asm.mov_store16_at_index(SCRATCH, val_reg),
425            4 => self.asm.mov_store32_at_index(SCRATCH, val_reg),
426            8 => self.asm.mov_store64_at_index(SCRATCH, val_reg),
427            _ => unreachable!(),
428        }
429    }
430
431    /// Emit store-immediate-indirect: store an immediate value to memory.
432    /// Inline SIB store (no function call needed).
433    ///
434    pub(crate) fn invalidate_dependents(&mut self, reg: usize) {
435        // Only iterate registers that have active (non-Unknown) defs
436        let mut active = self.reg_defs_active & !(1u16 << reg);
437        while active != 0 {
438            let i = active.trailing_zeros() as usize;
439            active &= active - 1;
440            let depends = match self.reg_defs[i] {
441                RegDef::Shifted { src, .. } => src == reg,
442                RegDef::ScaledAdd { base, idx, .. } => base == reg || idx == reg,
443                _ => false,
444            };
445            if depends {
446                self.reg_defs[i] = RegDef::Unknown;
447                self.reg_defs_active &= !(1u16 << i);
448            }
449        }
450    }
451
452    /// Invalidate a register's tracked definition and any dependents.
453    #[inline]
454    pub(crate) fn invalidate_reg(&mut self, reg: usize) {
455        self.reg_defs[reg] = RegDef::Unknown;
456        self.reg_defs_active &= !(1u16 << reg);
457        self.invalidate_dependents(reg);
458    }
459
460    /// Invalidate all register definitions (on block boundaries, calls, etc.)
461    #[inline]
462    pub(crate) fn invalidate_all_regs(&mut self) {
463        self.reg_defs = [RegDef::Unknown; 13];
464        self.reg_defs_active = 0;
465    }
466
467    /// Emit an unconditional static branch to `target`. In streaming mode
468    /// a forward target is deferred to the post-pass; otherwise a
469    /// non-block-start target redirects to the panic stub.
470    pub(crate) fn emit_static_branch(
471        &mut self,
472        target: u32,
473        condition: bool,
474        _fallthrough: u32,
475        pc: u32,
476    ) {
477        if !condition {
478            return;
479        }
480        if self.rv_streaming && target > pc {
481            let label = self.label_for_pc(target);
482            let fixup_idx = self.asm.fixups_len();
483            self.asm.jmp_label(label);
484            self.rv_pending_fwd_branches.push((target, pc, fixup_idx));
485            return;
486        }
487        if !self.is_basic_block_start(target) {
488            self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
489            self.emit_exit(EXIT_PANIC, 0);
490            return;
491        }
492        let label = self.label_for_pc(target);
493        self.asm.jmp_label(label);
494    }
495
496    /// Emit a dynamic jump (through jump table).
497    pub(crate) fn emit_branch_reg(
498        &mut self,
499        a: Reg,
500        b: Reg,
501        cc: Cc,
502        target: u32,
503        _fallthrough: u32,
504        pc: u32,
505    ) {
506        if self.rv_streaming && target > pc {
507            self.asm.cmp_rr(a, b);
508            let label = self.label_for_pc(target);
509            let fixup_idx = self.asm.fixups_len();
510            self.asm.jcc_label(cc, label);
511            self.rv_pending_fwd_branches.push((target, pc, fixup_idx));
512            return;
513        }
514        if !self.is_basic_block_start(target) {
515            self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
516            self.asm.cmp_rr(a, b);
517            self.asm.jcc_label(cc, self.panic_label);
518            return;
519        }
520        self.asm.cmp_rr(a, b);
521        let label = self.label_for_pc(target);
522        self.asm.jcc_label(cc, label);
523    }
524
525    /// Emit a shift by register value using CL.
526    /// shift_op: 4=SHL, 5=SHR, 7=SAR, 0=ROL, 1=ROR
527    pub(crate) fn emit_shift_by_reg32(&mut self, dst: Reg, shift_reg: Reg, shift_op: u8) {
528        // Need shift amount in CL (RCX = φ[12])
529        // If shift_reg is already RCX, great. Otherwise save/restore.
530        if shift_reg == Reg::RCX {
531            self.asm.shift_cl32(shift_op, dst);
532        } else if dst == Reg::RCX {
533            self.emit_shift_dst_is_rcx(shift_reg, shift_op, false);
534        } else {
535            self.asm.push(Reg::RCX);
536            self.asm.mov_rr(Reg::RCX, shift_reg);
537            self.asm.shift_cl32(shift_op, dst);
538            self.asm.pop(Reg::RCX);
539        }
540    }
541
542    pub(crate) fn emit_shift_by_reg64(&mut self, dst: Reg, shift_reg: Reg, shift_op: u8) {
543        if shift_reg == Reg::RCX {
544            self.asm.shift_cl64(shift_op, dst);
545        } else if dst == Reg::RCX {
546            self.emit_shift_dst_is_rcx(shift_reg, shift_op, true);
547        } else {
548            self.asm.push(Reg::RCX);
549            self.asm.mov_rr(Reg::RCX, shift_reg);
550            self.asm.shift_cl64(shift_op, dst);
551            self.asm.pop(Reg::RCX);
552        }
553    }
554
    /// Shift when `dst` is RCX (φ[12] = x15), which is also the CL count
    /// register. The value to shift currently lives in RCX and the count in
    /// `shift_reg` (≠ RCX). The caller may have routed `shift_reg` through
    /// `SCRATCH` (it does when `dst == rs2`, e.g. `sra x15, x2, x15`), so we
    /// must read the count BEFORE touching `SCRATCH`: stash the value on the
    /// stack, load the count into RCX, then pop the value into `SCRATCH` and
    /// shift it. Correct whether `shift_reg` is `SCRATCH` or any other register.
    ///
    /// (The earlier `mov SCRATCH, dst` snapshot clobbered the count when the
    /// caller had stashed it in `SCRATCH` — the `dst == rs2 == RCX` shift bug
    /// the lossless fuzz signature surfaced.)
    ///
    /// `shift_op` uses the same encoding as the `emit_shift_by_reg*` callers;
    /// `is_64` selects the 64-bit shift form, otherwise the 32-bit form is
    /// emitted. The push/pop pair is balanced, so the native stack pointer is
    /// unchanged once the sequence has run. `SCRATCH` is clobbered.
    fn emit_shift_dst_is_rcx(&mut self, shift_reg: Reg, shift_op: u8, is_64: bool) {
        self.asm.push(Reg::RCX); // save dst's value (currently in RCX)
        self.asm.mov_rr(Reg::RCX, shift_reg); // RCX = count
        self.asm.pop(SCRATCH); // SCRATCH = value (count no longer aliased here)
        if is_64 {
            self.asm.shift_cl64(shift_op, SCRATCH);
        } else {
            self.asm.shift_cl32(shift_op, SCRATCH);
        }
        self.asm.mov_rr(Reg::RCX, SCRATCH); // dst (RCX) = result
    }
577
578    /// Emit an exit sequence that sets exit_reason and exit_arg.
579    pub(crate) fn emit_exit(&mut self, reason: u32, arg: u32) {
580        self.asm
581            .mov_store32_rip_rel_imm(CTX_EXIT_REASON, reason as i32);
582        self.asm.mov_store32_rip_rel_imm(CTX_EXIT_ARG, arg as i32);
583        self.asm.jmp_label(self.exit_label);
584    }
585
    /// Emit prologue: save callee-saved, load PVM registers from context,
    /// then dispatch to the correct basic block based on entry_pc.
    ///
    /// Sequence: push the six callee-saved host registers plus one padding
    /// slot, record the resulting RSP in the context, load the gas counter,
    /// clear the exit reason, resolve the entry PC through the dispatch
    /// table, load all 13 PVM registers, and finally jump to the resolved
    /// native address.
    pub(crate) fn emit_prologue(&mut self) {
        self.asm.ensure_capacity(512); // prologue needs ~200 bytes
        // Save callee-saved registers
        self.asm.push(Reg::RBX);
        self.asm.push(Reg::RBP);
        self.asm.push(Reg::R12);
        self.asm.push(Reg::R13);
        self.asm.push(Reg::R14);
        self.asm.push(Reg::R15);

        // Stack alignment: after 6 callee-saved pushes + return address
        // (7 * 8 = 56 bytes), RSP mod 16 = 8. Push one extra 8 bytes so
        // RSP mod 16 = 0 — the SysV ABI alignment any helper CALL we
        // emit below expects at the call site.
        self.asm.push(SCRATCH); // alignment padding

        // Save the post-callee-saved RSP into the context. The exit
        // path restores RSP from this slot before popping the 7 entries
        // above, so an OOG / page-fault / mid-sequence trap redirect
        // leaves the exit path with a clean stack instead of corrupting
        // the exit pops.
        self.asm.mov_store64_rip_rel(CTX_HOST_RSP_BASE, Reg::RSP);

        // R15 = gas register. Loaded from ctx.gas at prologue, decremented
        // per basic block, flushed back to ctx.gas at exit. Mem accesses
        // are baseless `[rdx]` (PVM addr == native VA); CTX is reached
        // RIP-relative (the `*_rip_rel` helpers used throughout this
        // function). Neither path reads R15.
        self.asm.mov_load64_rip_rel(GAS, CTX_GAS);

        // Clear exit reason
        self.asm.mov_store32_rip_rel_imm(CTX_EXIT_REASON, 0);

        // --- O(1) dispatch via table lookup (before loading PVM regs) ---
        // This uses RAX, which is φ[11], so it must run before the register
        // load below. The resolved target (table entry + code base) is parked
        // on the stack because every mapped register is about to be
        // overwritten with guest state.
        self.asm.mov_load32_rip_rel(SCRATCH, CTX_ENTRY_PC);
        self.asm.mov_load64_rip_rel(Reg::RAX, CTX_DISPATCH_TABLE);
        self.asm.movsxd_load_sib4(Reg::RAX, Reg::RAX, SCRATCH);
        self.asm.mov_load64_rip_rel(SCRATCH, CTX_CODE_BASE);
        self.asm.add_rr(Reg::RAX, SCRATCH);
        self.asm.push(Reg::RAX);

        // Load all 13 PVM registers from context
        for (i, &reg) in REG_MAP.iter().enumerate() {
            self.asm.mov_load64_rip_rel(reg, CTX_REGS + (i as u64) * 8);
        }

        // Jump to the dispatch target (pop into SCRATCH, then indirect jump).
        // The pop balances the push above, so RSP is back at the value saved
        // in CTX_HOST_RSP_BASE when guest code starts running.
        self.asm.pop(SCRATCH);
        self.asm.jmp_reg(SCRATCH);
    }
637
    /// Emit exit sequences and epilogue.
    ///
    /// Layout, in emission order: the shared out-of-gas handler (a PC-storing
    /// entry that falls through into the reason-setting entry), one compact
    /// stub per queued gas block, the panic stub, and the common exit that
    /// flushes gas and PVM registers to the context, restores the host stack
    /// and callee-saved registers, and returns.
    ///
    /// Drains `self.oog_stubs`; the vector is left empty afterwards.
    pub(crate) fn emit_exit_sequences(&mut self) {
        // Reserve capacity for exit sequences + all OOG stubs.
        // Each OOG stub is ~12 bytes; 16 are budgeted per stub as headroom.
        let needed = 512 + self.oog_stubs.len() * 16;
        self.asm.ensure_capacity(needed);
        // Shared OOG handler that reads PC from SCRATCH — emitted BEFORE OOG
        // stubs so backward jumps from stubs can use jmp rel8 (2 bytes).
        self.asm.bind_label(self.oog_pc_label);
        self.asm.mov_store32_rip_rel(CTX_PC, SCRATCH);
        // fall through to oog_label:
        self.asm.bind_label(self.oog_label);
        self.asm
            .mov_store32_rip_rel_imm(CTX_EXIT_REASON, EXIT_OOG as i32);
        self.asm.jmp_label(self.exit_label);

        // Per-gas-block OOG stubs: compact format — load PC into SCRATCH,
        // jump to shared handler. Saves ~6 bytes per stub vs inline PC store.
        // The third tuple element (block cost) is not needed here.
        let stubs = core::mem::take(&mut self.oog_stubs);
        for (label, pvm_pc, _cost) in &stubs {
            self.asm.bind_label(*label);
            self.asm.mov_ri32(SCRATCH, *pvm_pc);
            self.asm.jmp_label(self.oog_pc_label);
        }

        // Page faults are handled by the SIGSEGV handler (signal.rs).

        // Panic exit: only the reason is set here; code that jumps to
        // `panic_label` stores the PVM PC itself beforehand.
        self.asm.bind_label(self.panic_label);
        self.asm
            .mov_store32_rip_rel_imm(CTX_EXIT_REASON, EXIT_PANIC as i32);
        // fall through to exit_label

        // Common exit: flush gas (R15) → ctx.gas, then save PVM regs.
        self.asm.bind_label(self.exit_label);
        self.asm.mov_store64_rip_rel(CTX_GAS, GAS);
        for (i, &reg) in REG_MAP.iter().enumerate() {
            self.asm.mov_store64_rip_rel(CTX_REGS + (i as u64) * 8, reg);
        }

        // Restore RSP to the post-prologue baseline. For a clean exit
        // RSP is already there and the mov is a no-op; for OOG /
        // page-fault / mid-sequence trap redirects it truncates the
        // stack back to where the 7 callee-saved entries sit on top.
        self.asm.mov_load64_rip_rel(Reg::RSP, CTX_HOST_RSP_BASE);

        // Restore callee-saved (+ alignment padding), in exact reverse of the
        // prologue's push order.
        self.asm.pop(SCRATCH); // alignment padding
        self.asm.pop(Reg::R15);
        self.asm.pop(Reg::R14);
        self.asm.pop(Reg::R13);
        self.asm.pop(Reg::R12);
        self.asm.pop(Reg::RBP);
        self.asm.pop(Reg::RBX);
        self.asm.ret();
    }
694}
695
696/// Detect a trailing ALU-rr instruction in raw bytes, for streaming-fusion
697/// lookahead. Handles both 4-byte OP_OP forms (Add/Xor/Or/And with funct7=0)
698/// and the 2-byte RVC equivalents (`c.add`, `c.xor`, `c.or`, `c.and`).
699///
700/// Returns `(op, rd, rs1, rs2, consumed_bytes)`. For RVC's two-operand forms
701/// (`rd <- rd ⊕ rs2`), we surface `rs1 == rd` so callers don't need separate
702/// RVC-aware logic.
703///
704/// Roughly half of PVM2 guest code is RVC (49–71% across the gap-driving
705/// guests), so missing the compressed-Add form here would forfeit most of
706/// the win from these fusions.
707#[inline]
708fn peek_alu_rr_trailer(rest: &[u8]) -> Option<(AluOp, u8, u8, u8, usize)> {
709    if rest.len() < 2 {
710        return None;
711    }
712    // 4-byte path: only when `rest[0]`'s low 2 bits == 0b11.
713    if rest[0] & 0b11 == 0b11 {
714        if rest.len() < 4 {
715            return None;
716        }
717        let w = u32::from_le_bytes([rest[0], rest[1], rest[2], rest[3]]);
718        let op = match w & 0xFE00_707F {
719            0x0000_0033 => AluOp::Add,
720            0x0000_4033 => AluOp::Xor,
721            0x0000_6033 => AluOp::Or,
722            0x0000_7033 => AluOp::And,
723            _ => return None,
724        };
725        let rd = ((w >> 7) & 0x1F) as u8;
726        let rs1 = ((w >> 15) & 0x1F) as u8;
727        let rs2 = ((w >> 20) & 0x1F) as u8;
728        return Some((op, rd, rs1, rs2, 4));
729    }
730    // 2-byte RVC path.
731    let h = u16::from_le_bytes([rest[0], rest[1]]);
732    // c.add — q2 (bits[1:0]=10), funct4=1001, bits[12]=1, rd!=0, rs2!=0.
733    // mask 0xF003 == 0x9002.
734    if h & 0xF003 == 0x9002 {
735        let rd = ((h >> 7) & 0x1F) as u8;
736        let rs2 = ((h >> 2) & 0x1F) as u8;
737        if rd != 0 && rs2 != 0 {
738            return Some((AluOp::Add, rd, rd, rs2, 2));
739        }
740        return None;
741    }
742    // c.{and,or,xor,sub} — q1 misc_alu, funct6=100011, bits[1:0]=01.
743    // mask 0xFC03 == 0x8C01. funct2 (bits[6:5]) selects op.
744    if h & 0xFC03 == 0x8C01 {
745        let op = match (h >> 5) & 0x3 {
746            0b01 => AluOp::Xor,
747            0b10 => AluOp::Or,
748            0b11 => AluOp::And,
749            _ => return None, // 0b00 = c.sub, not in fusion set
750        };
751        let rd = ((h >> 7) & 0x7) as u8 + 8; // creg
752        let rs2 = ((h >> 2) & 0x7) as u8 + 8; // creg
753        return Some((op, rd, rd, rs2, 2));
754    }
755    None
756}
757
// ----------------------------------------------------------------------
// RV opcode majors: the value of instruction bits [6:2]. Bits [1:0] are
// always 0b11 for a 4-byte instruction, so they are not part of these
// values. Mirrors `javm_exec::instruction::OP_*`; redeclared here to keep
// the recompiler self-contained on the byte-dispatch hot path.
//
// Listed in ascending encoding order. Majors with no constant here
// (SYSTEM, CUSTOM_1, AMO, FP*, etc.) are routed through the catch-all
// default branch in `compile_rv4`.
// ----------------------------------------------------------------------
const OP_LOAD: u32 = 0b00_000;
const OP_CUSTOM_0: u32 = 0b00_010;
const OP_MISC_MEM: u32 = 0b00_011;
const OP_IMM: u32 = 0b00_100;
const OP_AUIPC: u32 = 0b00_101;
const OP_OP_IMM_32: u32 = 0b00_110;
const OP_STORE: u32 = 0b01_000;
const OP_OP: u32 = 0b01_100;
const OP_LUI: u32 = 0b01_101;
const OP_OP_32: u32 = 0b01_110;
const OP_BRANCH: u32 = 0b11_000;
const OP_JALR: u32 = 0b11_001;
const OP_JAL: u32 = 0b11_011;
778
779/// True if `w` is a custom-0 `ecall.jar` (f3=001) or `ecalli` (f3=010) —
780/// the two instructions that form their own gas block (charged
781/// dynamically at dispatch, no static preamble). Custom-0 is always
782/// 4-byte. Mirrors `predecode::is_ecall_block` so the recompiler splits
783/// blocks identically to the interpreter.
784#[inline]
785fn is_custom0_ecall(w: u32) -> bool {
786    let opcode5 = (w >> 2) & 0x1F;
787    let f3 = (w >> 12) & 0x7;
788    opcode5 == OP_CUSTOM_0 && (f3 == 0b001 || f3 == 0b010)
789}
790
791// Sign-extended immediates straight off a 4-byte RV word. Mirrors the
792// canonical encoders in `javm_exec::instruction`.
/// I-type immediate: instruction bits [31:20], sign-extended to `i32`.
#[inline]
fn imm_i(w: u32) -> i32 {
    // Reinterpreting as signed makes the right shift arithmetic, so bit 31
    // is replicated into the upper bits of the result.
    let signed = w as i32;
    signed >> 20
}
/// S-type immediate: `imm[11:5]` from bits [31:25] and `imm[4:0]` from
/// bits [11:7], sign-extended to `i32`.
#[inline]
fn imm_s(w: u32) -> i32 {
    // Arithmetic shift drops bits [31:25] into imm[11:5] and sign-extends
    // in one step.
    let upper = ((w & 0xFE00_0000) as i32) >> 20;
    let lower = ((w >> 7) & 0x1F) as i32;
    upper | lower
}
/// B-type (branch) immediate, a signed byte offset with bit 0 implicitly
/// zero. Source bits: `imm[12]` = bit 31, `imm[11]` = bit 7,
/// `imm[10:5]` = bits [30:25], `imm[4:1]` = bits [11:8].
#[inline]
fn imm_b(w: u32) -> i32 {
    // Bit 31 lands on imm[12] and is replicated above it by the
    // arithmetic shift; the low 12 bits are cleared for the OR below.
    let sign = ((w as i32) >> 19) & !0xFFF;
    let magnitude = ((w << 4) & 0x800) // bit 7       -> imm[11]
        | ((w >> 20) & 0x7E0)          // bits [30:25] -> imm[10:5]
        | ((w >> 7) & 0x1E); //           bits [11:8]  -> imm[4:1]
    sign | magnitude as i32
}
/// J-type (jal) immediate, a signed byte offset with bit 0 implicitly
/// zero. Source bits: `imm[20]` = bit 31, `imm[19:12]` = bits [19:12],
/// `imm[11]` = bit 20, `imm[10:1]` = bits [30:21].
#[inline]
fn imm_j(w: u32) -> i32 {
    // Bit 31 lands on imm[20] and is replicated above it by the
    // arithmetic shift; the low 20 bits are cleared for the OR below.
    let sign = ((w as i32) >> 11) & !0xF_FFFF;
    let magnitude = (w & 0x000F_F000) // bits [19:12] stay in place
        | ((w >> 9) & 0x800)          // bit 20       -> imm[11]
        | ((w >> 20) & 0x7FE); //        bits [30:21] -> imm[10:1]
    sign | magnitude as i32
}
/// U-type immediate: instruction bits [31:12] kept in place with the low
/// 12 bits cleared, reinterpreted as `i32`.
#[inline]
fn imm_u(w: u32) -> i32 {
    let upper20 = w & !0xFFF;
    upper20 as i32
}
826
// ----------------------------------------------------------------------
// Encoders for synthesising a 4-byte RV word from RVC fields. The natural
// implementation of RVC support is: extract the relevant RVC fields,
// re-encode as the equivalent 4-byte word, and feed it through
// `compile_rv4`. This lets all the funct3/funct7 dispatch + fusion logic
// live in one place.
//
// NOTE(review): this comment originally justified the approach by RVC
// being rare (~1% of code on the gap-driving guests), but the
// `peek_alu_rr_trailer` docs cite 49–71% RVC for the same guests. The two
// figures contradict each other and need reconciling.
//
// The `opcode5` parameter is the 5-bit opcode major (bits [6:2]); we OR
// in `0b11` for bits [1:0] automatically.
// ----------------------------------------------------------------------
/// Assemble an I-type word. `imm` is truncated to its low 12 bits and
/// placed in bits [31:20]; `opcode5` is the major opcode (bits [6:2]) and
/// the `0b11` length marker is added automatically.
#[inline]
fn enc_i(opcode5: u32, f3: u32, rd: u8, rs1: u8, imm: i32) -> u32 {
    let imm_field = ((imm as u32) & 0xFFF) << 20;
    let rs1_field = (rs1 as u32) << 15;
    let f3_field = f3 << 12;
    let rd_field = (rd as u32) << 7;
    let opcode_field = (opcode5 << 2) | 0b11;
    imm_field | rs1_field | f3_field | rd_field | opcode_field
}
/// Assemble an S-type (store) word. `imm` is truncated to 12 bits and
/// split: `imm[11:5]` goes to bits [31:25], `imm[4:0]` to bits [11:7].
#[inline]
fn enc_s(opcode5: u32, f3: u32, rs1: u8, rs2: u8, imm: i32) -> u32 {
    let imm12 = (imm as u32) & 0xFFF;
    let imm_hi = (imm12 & 0xFE0) << 20; // imm[11:5] -> bits [31:25]
    let imm_lo = (imm12 & 0x01F) << 7; //  imm[4:0]  -> bits [11:7]
    let regs = ((rs2 as u32) << 20) | ((rs1 as u32) << 15);
    imm_hi | regs | (f3 << 12) | imm_lo | (opcode5 << 2) | 0b11
}
/// Assemble a B-type (branch) word. `imm` is a byte offset truncated to 13
/// bits; bit 0 is not encoded. The remaining bits are scattered as
/// `imm[12|10:5]` in bits [31:25] and `imm[4:1|11]` in bits [11:7].
#[inline]
fn enc_b(opcode5: u32, f3: u32, rs1: u8, rs2: u8, imm: i32) -> u32 {
    let imm13 = (imm as u32) & 0x1FFF;
    let scattered_imm = ((imm13 & 0x1000) << 19) // imm[12]   -> bit 31
        | ((imm13 & 0x07E0) << 20)               // imm[10:5] -> bits [30:25]
        | ((imm13 & 0x001E) << 7)                // imm[4:1]  -> bits [11:8]
        | ((imm13 & 0x0800) >> 4); //               imm[11]   -> bit 7
    let regs = ((rs2 as u32) << 20) | ((rs1 as u32) << 15);
    scattered_imm | regs | (f3 << 12) | (opcode5 << 2) | 0b11
}
/// Assemble a J-type (jal) word. `imm` is a byte offset truncated to 21
/// bits; bit 0 is not encoded. The remaining bits are scattered as
/// `imm[20|10:1|11|19:12]` across bits [31:12].
#[inline]
fn enc_j(opcode5: u32, rd: u8, imm: i32) -> u32 {
    let imm21 = (imm as u32) & 0x1F_FFFF;
    let scattered_imm = ((imm21 & 0x10_0000) << 11) // imm[20]    -> bit 31
        | ((imm21 & 0x00_07FE) << 20)               // imm[10:1]  -> bits [30:21]
        | ((imm21 & 0x00_0800) << 9)                // imm[11]    -> bit 20
        | (imm21 & 0x0F_F000); //                      imm[19:12] stay in place
    scattered_imm | ((rd as u32) << 7) | (opcode5 << 2) | 0b11
}
/// Assemble a U-type word. `imm` supplies bits [31:12] in place; its low
/// 12 bits are discarded.
#[inline]
fn enc_u(opcode5: u32, rd: u8, imm: i32) -> u32 {
    let upper20 = (imm as u32) & !0xFFF;
    let rd_field = (rd as u32) << 7;
    let opcode_field = (opcode5 << 2) | 0b11;
    upper20 | rd_field | opcode_field
}
/// Assemble an R-type (register-register) word from its six fields.
/// `opcode5` is the major opcode (bits [6:2]); the `0b11` length marker is
/// added automatically.
#[inline]
fn enc_r(opcode5: u32, f3: u32, f7: u32, rd: u8, rs1: u8, rs2: u8) -> u32 {
    // Each entry is one field already shifted into its final position.
    let fields = [
        f7 << 25,
        (rs2 as u32) << 20,
        (rs1 as u32) << 15,
        f3 << 12,
        (rd as u32) << 7,
        opcode5 << 2,
        0b11,
    ];
    fields.iter().fold(0, |word, field| word | field)
}
/// Assemble an RV64 shift-immediate word (`slli` / `srli` / `srai`).
///
/// The layout is I-type, with the 12-bit immediate split into a 6-bit
/// shift type (`shtype6`, bits [31:26]) and a 6-bit shift amount
/// (`shamt6`, bits [25:20]).
///
/// `shamt6` is masked to its 6-bit field, matching how the sibling
/// encoders truncate their immediates, so an out-of-range amount cannot
/// spill into the `shtype6` field and silently change the instruction.
#[inline]
fn enc_shimm6(opcode5: u32, f3: u32, shtype6: u32, rd: u8, rs1: u8, shamt6: u8) -> u32 {
    let shamt = (shamt6 as u32) & 0x3F;
    (shtype6 << 26)
        | (shamt << 20)
        | ((rs1 as u32) << 15)
        | (f3 << 12)
        | ((rd as u32) << 7)
        | (opcode5 << 2)
        | 0b11
}
910
/// Translate a 3-bit RVC compressed-register field to the full register
/// number it denotes (field values 0..=7 map to x8..=x15).
#[inline]
fn creg(r: u16) -> u8 {
    // Compressed register fields are offsets from x8.
    const CREG_BASE: u16 = 8;
    (CREG_BASE + r) as u8
}
916
/// Decode the CI-format 6-bit signed immediate: `imm[5]` is bit 12 of the
/// halfword and `imm[4:0]` is bits [6:2]. Result range is -32..=31.
#[inline]
fn decode_ci_imm6(h: u16) -> i32 {
    let low5 = ((h >> 2) & 0x1F) as i32;
    // Bit 12 is the sign bit, worth -32 in two's complement.
    if h & 0x1000 != 0 {
        low5 - 32
    } else {
        low5
    }
}
923
/// Decode the CJ-format 12-bit signed jump offset (in bytes, bit 0 always
/// zero). Halfword bits [12:2] hold `imm[11|4|9:8|10|6|7|3:1|5]`.
#[inline]
fn decode_cj_imm(h: u16) -> i32 {
    // Move each source bit group straight to its destination position.
    let raw = ((h & 0x1000) >> 1) // bit 12      -> imm[11]
        | ((h & 0x0800) >> 7)     // bit 11      -> imm[4]
        | ((h & 0x0600) >> 1)     // bits [10:9] -> imm[9:8]
        | ((h & 0x0100) << 2)     // bit 8       -> imm[10]
        | ((h & 0x0080) >> 1)     // bit 7       -> imm[6]
        | ((h & 0x0040) << 1)     // bit 6       -> imm[7]
        | ((h & 0x0038) >> 2)     // bits [5:3]  -> imm[3:1]
        | ((h & 0x0004) << 3); //    bit 2       -> imm[5]
    // Sign-extend from bit 11.
    ((raw as i32) << 20) >> 20
}
945
/// Decode the CB-format 9-bit signed branch offset (in bytes, bit 0 always
/// zero). Halfword bits [12:10] hold `imm[8|4:3]` and bits [6:2] hold
/// `imm[7:6|2:1|5]`.
#[inline]
fn decode_cb_imm(h: u16) -> i32 {
    // Move each source bit group straight to its destination position.
    let raw = ((h & 0x1000) >> 4) // bit 12       -> imm[8]
        | ((h & 0x0C00) >> 7)     // bits [11:10] -> imm[4:3]
        | ((h & 0x0060) << 1)     // bits [6:5]   -> imm[7:6]
        | ((h & 0x0018) >> 2)     // bits [4:3]   -> imm[2:1]
        | ((h & 0x0004) << 3); //    bit 2        -> imm[5]
    // Sign-extend from bit 8.
    ((raw as i32) << 23) >> 23
}
957
958/// Expand a 2-byte RVC encoding to its 4-byte equivalent. Returns
959/// `None` for the forbidden RVC encodings (c.ebreak, c.illegal) and
960/// for malformed encodings (reserved sub-cases). c.jr/c.jalr expand
961/// to standard `jalr x0/x1, rs1, 0` and are not forbidden.
962///
963/// The caller (`compile_rvc`) feeds the result through `compile_rv4`,
964/// so any shape `compile_rv4` understands is acceptable here. RVC
965/// expansions never set the JAL `rd` to a non-zero value (c.jal is
966/// RV32-only and doesn't exist in our target), so the `next_pc`
967/// hardcoded in `compile_rv4`'s jal/branch sub-dispatchers is unused
968/// — RVC's actual `pc + 2` advance happens in the streaming loop.
969fn expand_rvc(h: u16) -> Option<u32> {
970    // c.illegal is encoding 0x0000.
971    if h == 0 {
972        return None;
973    }
974    let op = h & 0b11;
975    let f3 = (h >> 13) & 0b111;
976    match op {
977        0b00 => expand_rvc_q0(h, f3),
978        0b01 => expand_rvc_q1(h, f3),
979        0b10 => expand_rvc_q2(h, f3),
980        _ => None,
981    }
982}
983
984fn expand_rvc_q0(h: u16, f3: u16) -> Option<u32> {
985    let rs1c = creg((h >> 7) & 0b111);
986    let rdrs2c = creg((h >> 2) & 0b111);
987    match f3 {
988        0b000 => {
989            // c.addi4spn -> addi rd', x2, nzuimm
990            // nzuimm bits: h[12:11] -> [5:4], h[10:7] -> [9:6], h[6] -> [2], h[5] -> [3].
991            let n = (((h >> 11) & 0x3) << 4)
992                | (((h >> 7) & 0xF) << 6)
993                | (((h >> 6) & 0x1) << 2)
994                | (((h >> 5) & 0x1) << 3);
995            if n == 0 {
996                return None;
997            }
998            Some(enc_i(OP_IMM, 0b000, rdrs2c, 2, n as i32))
999        }
1000        0b010 => {
1001            // c.lw -> lw rd', uimm(rs1')
1002            let imm = (((h >> 10) & 0x7) << 3) | (((h >> 6) & 0x1) << 2) | (((h >> 5) & 0x1) << 6);
1003            Some(enc_i(OP_LOAD, 0b010, rdrs2c, rs1c, imm as i32))
1004        }
1005        0b011 => {
1006            // c.ld -> ld rd', uimm(rs1')
1007            let imm = (((h >> 10) & 0x7) << 3) | (((h >> 5) & 0x3) << 6);
1008            Some(enc_i(OP_LOAD, 0b011, rdrs2c, rs1c, imm as i32))
1009        }
1010        0b110 => {
1011            // c.sw
1012            let imm = (((h >> 10) & 0x7) << 3) | (((h >> 6) & 0x1) << 2) | (((h >> 5) & 0x1) << 6);
1013            Some(enc_s(OP_STORE, 0b010, rs1c, rdrs2c, imm as i32))
1014        }
1015        0b111 => {
1016            // c.sd
1017            let imm = (((h >> 10) & 0x7) << 3) | (((h >> 5) & 0x3) << 6);
1018            Some(enc_s(OP_STORE, 0b011, rs1c, rdrs2c, imm as i32))
1019        }
1020        _ => None,
1021    }
1022}
1023
1024fn expand_rvc_q1(h: u16, f3: u16) -> Option<u32> {
1025    match f3 {
1026        0b000 => {
1027            // c.nop / c.addi
1028            let rd = ((h >> 7) & 0x1F) as u8;
1029            let imm = decode_ci_imm6(h);
1030            if rd == 0 {
1031                Some(enc_i(OP_IMM, 0b000, 0, 0, 0)) // c.nop
1032            } else {
1033                Some(enc_i(OP_IMM, 0b000, rd, rd, imm))
1034            }
1035        }
1036        0b001 => {
1037            // c.addiw (RV64) — rd != 0
1038            let rd = ((h >> 7) & 0x1F) as u8;
1039            if rd == 0 {
1040                return None;
1041            }
1042            Some(enc_i(OP_OP_IMM_32, 0b000, rd, rd, decode_ci_imm6(h)))
1043        }
1044        0b010 => {
1045            // c.li -> addi rd, x0, imm
1046            let rd = ((h >> 7) & 0x1F) as u8;
1047            if rd == 0 {
1048                return None;
1049            }
1050            Some(enc_i(OP_IMM, 0b000, rd, 0, decode_ci_imm6(h)))
1051        }
1052        0b011 => {
1053            // c.addi16sp / c.lui
1054            let rd = ((h >> 7) & 0x1F) as u8;
1055            if rd == 2 {
1056                let imm = (((h >> 12) & 1) << 9)
1057                    | (((h >> 6) & 1) << 4)
1058                    | (((h >> 5) & 1) << 6)
1059                    | (((h >> 3) & 0x3) << 7)
1060                    | (((h >> 2) & 1) << 5);
1061                let sx = ((imm as i32) << 22) >> 22;
1062                if sx == 0 {
1063                    return None;
1064                }
1065                Some(enc_i(OP_IMM, 0b000, 2, 2, sx))
1066            } else if rd == 0 {
1067                None
1068            } else {
1069                let h_u = h as u32;
1070                let imm = (((h_u >> 12) & 1) << 17) | (((h_u >> 2) & 0x1F) << 12);
1071                let sx = ((imm as i32) << 14) >> 14;
1072                if sx == 0 {
1073                    return None;
1074                }
1075                Some(enc_u(OP_LUI, rd, sx))
1076            }
1077        }
1078        0b100 => expand_rvc_q1_misc_alu(h),
1079        0b101 => {
1080            // c.j -> jal x0, off
1081            Some(enc_j(OP_JAL, 0, decode_cj_imm(h)))
1082        }
1083        0b110 | 0b111 => {
1084            // c.beqz / c.bnez (rs1 = creg)
1085            let rs1 = creg((h >> 7) & 0b111);
1086            let imm = decode_cb_imm(h);
1087            let f3b = if f3 == 0b110 { 0b000 } else { 0b001 };
1088            Some(enc_b(OP_BRANCH, f3b, rs1, 0, imm))
1089        }
1090        _ => None,
1091    }
1092}
1093
1094fn expand_rvc_q1_misc_alu(h: u16) -> Option<u32> {
1095    let f6_10 = (h >> 10) & 0b11;
1096    let rdrs1c = creg((h >> 7) & 0b111);
1097    match f6_10 {
1098        0b00 | 0b01 => {
1099            // c.srli / c.srai (RV64 shamt: bit12||bits6:2)
1100            let shamt = ((((h >> 12) & 1) << 5) | ((h >> 2) & 0x1F)) as u8;
1101            let shtype = if f6_10 == 0b00 { 0b000000 } else { 0b010000 };
1102            Some(enc_shimm6(OP_IMM, 0b101, shtype, rdrs1c, rdrs1c, shamt))
1103        }
1104        0b10 => {
1105            // c.andi
1106            Some(enc_i(OP_IMM, 0b111, rdrs1c, rdrs1c, decode_ci_imm6(h)))
1107        }
1108        0b11 => {
1109            // c.sub/xor/or/and (bit12=0) or c.subw/c.addw (bit12=1)
1110            let rs2c = creg((h >> 2) & 0b111);
1111            let bit12 = (h >> 12) & 1;
1112            let f2 = (h >> 5) & 0b11;
1113            match (bit12, f2) {
1114                // OP family (bit12=0)
1115                (0, 0b00) => Some(enc_r(OP_OP, 0b000, 0b0100000, rdrs1c, rdrs1c, rs2c)), // sub
1116                (0, 0b01) => Some(enc_r(OP_OP, 0b100, 0b0000000, rdrs1c, rdrs1c, rs2c)), // xor
1117                (0, 0b10) => Some(enc_r(OP_OP, 0b110, 0b0000000, rdrs1c, rdrs1c, rs2c)), // or
1118                (0, 0b11) => Some(enc_r(OP_OP, 0b111, 0b0000000, rdrs1c, rdrs1c, rs2c)), // and
1119                // OP_32 family (bit12=1)
1120                (1, 0b00) => Some(enc_r(OP_OP_32, 0b000, 0b0100000, rdrs1c, rdrs1c, rs2c)), // subw
1121                (1, 0b01) => Some(enc_r(OP_OP_32, 0b000, 0b0000000, rdrs1c, rdrs1c, rs2c)), // addw
1122                _ => None,
1123            }
1124        }
1125        _ => None,
1126    }
1127}
1128
1129fn expand_rvc_q2(h: u16, f3: u16) -> Option<u32> {
1130    let rdrs1 = ((h >> 7) & 0x1F) as u8;
1131    let rs2 = ((h >> 2) & 0x1F) as u8;
1132    match f3 {
1133        0b000 => {
1134            // c.slli (RV64 shamt: bit12||bits6:2)
1135            if rdrs1 == 0 {
1136                return None;
1137            }
1138            let shamt = ((((h >> 12) & 1) << 5) | ((h >> 2) & 0x1F)) as u8;
1139            Some(enc_shimm6(OP_IMM, 0b001, 0b000000, rdrs1, rdrs1, shamt))
1140        }
1141        0b010 => {
1142            // c.lwsp -> lw rd, uimm(x2)
1143            if rdrs1 == 0 {
1144                return None;
1145            }
1146            let imm = (((h >> 12) & 1) << 5) | (((h >> 4) & 0x7) << 2) | (((h >> 2) & 0x3) << 6);
1147            Some(enc_i(OP_LOAD, 0b010, rdrs1, 2, imm as i32))
1148        }
1149        0b011 => {
1150            // c.ldsp -> ld rd, uimm(x2)
1151            if rdrs1 == 0 {
1152                return None;
1153            }
1154            let imm = (((h >> 12) & 1) << 5) | (((h >> 5) & 0x3) << 3) | (((h >> 2) & 0x7) << 6);
1155            Some(enc_i(OP_LOAD, 0b011, rdrs1, 2, imm as i32))
1156        }
1157        0b100 => {
1158            // (bit12, rdrs1, rs2):
1159            //   (0, r, 0)  r!=0 -> c.jr     -> jalr x0, r, 0  (return)
1160            //   (0, r, s)  both!=0 -> c.mv  -> add rd, x0, rs2
1161            //   (1, 0, 0)         -> c.ebreak (FORBIDDEN)
1162            //   (1, r, 0)  r!=0 -> c.jalr   -> jalr x1, r, 0  (indirect call)
1163            //   (1, r, s)  both!=0 -> c.add -> add rd, rd, rs2
1164            let bit12 = (h >> 12) & 1;
1165            if rs2 == 0 {
1166                // c.jr (bit12=0) / c.jalr (bit12=1): native jalr. rdrs1=0
1167                // is c.ebreak (bit12=1) or reserved (bit12=0) — forbidden.
1168                if rdrs1 == 0 {
1169                    return None;
1170                }
1171                let rd = if bit12 == 0 { 0 } else { 1 };
1172                Some(enc_i(OP_JALR, 0b000, rd, rdrs1, 0))
1173            } else {
1174                // c.mv (bit12=0, rs1=x0) or c.add (bit12=1, rs1=rdrs1)
1175                if rdrs1 == 0 {
1176                    return None;
1177                }
1178                let rs1_enc = if bit12 == 0 { 0 } else { rdrs1 };
1179                Some(enc_r(OP_OP, 0b000, 0b0000000, rdrs1, rs1_enc, rs2))
1180            }
1181        }
1182        0b110 => {
1183            // c.swsp -> sw rs2, uimm(x2)
1184            let imm = (((h >> 9) & 0xF) << 2) | (((h >> 7) & 0x3) << 6);
1185            Some(enc_s(OP_STORE, 0b010, 2, rs2, imm as i32))
1186        }
1187        0b111 => {
1188            // c.sdsp -> sd rs2, uimm(x2)
1189            let imm = (((h >> 10) & 0x7) << 3) | (((h >> 7) & 0x7) << 6);
1190            Some(enc_s(OP_STORE, 0b011, 2, rs2, imm as i32))
1191        }
1192        _ => None,
1193    }
1194}
1195
1196/// Map an RV register index to its **host-mapped** PVM slot (0..=12).
1197///
1198/// Returns `None` for x0 (hardwired zero), for the host-spilled `x3`/`x4`
1199/// (slots 13/14 — no [`REG_MAP`] entry), and for any reserved register
1200/// (x16..x31). Callers on the fast path never see x3/x4 (the
1201/// `word_uses_spilled_reg` fork in `compile_rv4` routes them to
1202/// `compile_rv_spilled` first); a reserved register is unreachable
1203/// defence-in-depth (the `word_uses_reserved_reg` fork panics earlier).
1204#[inline]
1205fn rv_slot(x: u8) -> Option<usize> {
1206    match RV_SLOT_LUT[(x & 31) as usize] {
1207        s if s <= 12 => Some(s as usize),
1208        _ => None, // 0xFF (x0 / x16..x31) or 13/14 (spilled x3/x4)
1209    }
1210}
1211
1212/// Guest VA of a host-spilled register's home in `JitContext.regs`.
1213/// Only valid for `x3`/`x4` (slots 13/14); the recompiler addresses it
1214/// RIP-relatively, exactly as the prologue/epilogue address the 13
1215/// host-mapped slots.
1216#[inline]
1217fn spill_va(x: u8) -> u64 {
1218    CTX_REGS + (RV_SLOT_LUT[(x & 31) as usize] as u64) * 8
1219}
1220
1221/// True for a reserved register (`x16..x31`, which do not exist in RV64E).
1222/// Single source: [`javm_exec::regs::reg_is_reserved`].
1223use javm_exec::regs::reg_is_reserved as rv_is_reserved;
1224/// True for a host-spilled register (`x3`/`x4`). Single source:
1225/// [`javm_exec::regs::reg_is_spilled`].
1226use javm_exec::regs::reg_is_spilled;
1227
1228impl Compiler {
1229    /// Compile an RV+C+custom-0 byte stream into x86-64 in a single
1230    /// streaming pass.
1231    ///
1232    /// Decode + valid-PC + gas-block detection + gas simulation +
1233    /// codegen all happen in one walk over `code`. No `Predecode`
1234    /// intermediary — that was 57% of the old cold-path compile time
1235    /// on the large guests (ed25519, ecrecover).
1236    ///
1237    /// The internal `rv_valid_pc` bitmap (a bit set iff the PC is a
1238    /// gas-block start) drives compile-time forward-branch validation
1239    /// via `is_basic_block_start`. It is *not* surfaced: the runtime
1240    /// validates `jalr` targets through the dense dispatch table
1241    /// instead. Built incrementally during the streaming pass — no
1242    /// separate length-only pre-pass.
1243    pub fn compile(mut self, code: &[u8]) -> CompileResult {
1244        // valid_pc is populated incrementally as the streaming pass
1245        // binds gas-block starts. The pointer is stable across mutation
1246        // (Vec doesn't reallocate from `vec![false; n]` with in-place
1247        // index assignment), so `is_basic_block_start` reads through
1248        // the raw pointer remain coherent.
1249        self.rv_valid_pc = vec![false; code.len()];
1250        self.bitmask_ptr = self.rv_valid_pc.as_ptr() as *const u8;
1251        self.bitmask_len = self.rv_valid_pc.len();
1252        self.rv_streaming = true;
1253
1254        self.emit_prologue();
1255
1256        let mut pending_gas: Option<PendingGas> = None;
1257        let mut next_is_gas_start = true;
1258        let mut pc: usize = 0;
1259
1260        while pc < code.len() {
1261            self.asm.ensure_capacity(512);
1262
1263            // Length encoding lives in bits [1:0] of byte 0: `xx11` is
1264            // 4-byte, anything else is 2-byte (RVC). Decode no further
1265            // than that — the dispatcher inspects raw bits directly.
1266            if pc + 2 > code.len() {
1267                self.rv_emit_panic_at(pc as u32);
1268                break;
1269            }
1270            let is_4byte = code[pc] & 0b11 == 0b11;
1271            let base_len = if is_4byte { 4 } else { 2 };
1272            if pc + base_len > code.len() {
1273                self.rv_emit_panic_at(pc as u32);
1274                break;
1275            }
1276
1277            let inst_pc = pc as u32;
1278
1279            // `ecall.jar` / `ecalli` are *forced* gas-block starts — each
1280            // is its own singleton block, charged dynamically by the
1281            // kernel at dispatch (no static preamble). Detect it from the
1282            // raw 4-byte word (custom-0 is never compressed) BEFORE the
1283            // bind so the recompiler splits identically to predecode, even
1284            // mid-straight-line (`addi; ecall`).
1285            let ecall_block = is_4byte
1286                && is_custom0_ecall(u32::from_le_bytes([
1287                    code[pc],
1288                    code[pc + 1],
1289                    code[pc + 2],
1290                    code[pc + 3],
1291                ]));
1292            if ecall_block {
1293                next_is_gas_start = true;
1294            }
1295
1296            if next_is_gas_start {
1297                self.bind_rv_gas_block_start_streaming(inst_pc, ecall_block, &mut pending_gas);
1298                next_is_gas_start = false;
1299            }
1300
1301            // Byte-based dispatch. Each path returns
1302            // `(is_terminator, preserve_cf, extra_bytes)`. `extra_bytes`
1303            // counts the *additional* bytes consumed beyond `base_len`
1304            // for lookahead fusion (e.g., Ld→Add fuses an extra 4-byte
1305            // Add). `preserve_cf` tells us whether to keep
1306            // `last_add_cf` alive for a following Sltu fusion.
1307            let rest = &code[pc + base_len..];
1308            let (term, preserve_cf, extra) = if is_4byte {
1309                let w = u32::from_le_bytes([code[pc], code[pc + 1], code[pc + 2], code[pc + 3]]);
1310                self.compile_rv4(w, inst_pc, 4, rest)
1311            } else {
1312                let h = u16::from_le_bytes([code[pc], code[pc + 1]]);
1313                self.compile_rvc(h, inst_pc, rest)
1314            };
1315
1316            if !preserve_cf {
1317                self.last_add_cf = None;
1318            }
1319
1320            if term {
1321                next_is_gas_start = true;
1322            }
1323
1324            pc += base_len + extra;
1325        }
1326
1327        // Finalize the last gas block — patch its gate + charge in.
1328        self.flush_pending_gas(&mut pending_gas);
1329
1330        // Resolve deferred forward branches now that valid_pc is fully
1331        // populated. For each forward branch recorded with target > pc
1332        // at emit time:
1333        //   - valid target: label_for_pc(target) was bound during the
1334        //     streaming pass; the existing fixup resolves naturally.
1335        //   - invalid target: append a per-branch panic stub and
1336        //     redirect the fixup to it. Keeps the source PC of the
1337        //     branch in the exit report.
1338        // We disable rv_streaming first so emit_branch_* / panic helpers
1339        // called below take their non-deferred path.
1340        self.rv_streaming = false;
1341        let pending = core::mem::take(&mut self.rv_pending_fwd_branches);
1342        for (target, branch_pc, fixup_idx) in pending {
1343            if !self.is_basic_block_start(target) {
1344                let stub = self.asm.new_label();
1345                self.asm.bind_label(stub);
1346                self.asm.mov_store32_rip_rel_imm(CTX_PC, branch_pc as i32);
1347                self.asm.jmp_label(self.panic_label);
1348                self.asm.redirect_fixup(fixup_idx, stub);
1349            }
1350        }
1351
1352        self.emit_exit_sequences();
1353
1354        // Sparse dispatch entries — caller writes only these into the
1355        // (page-zero-filled) arena dispatch region. No code.len() + 1
1356        // intermediate Vec.
1357        let mut dispatch_entries: Vec<(u32, i32)> = Vec::with_capacity(self.gas_block_pcs.len());
1358        for &pc in self.gas_block_pcs.iter() {
1359            let label = Label(self.label_base + pc);
1360            if let Some(off) = self.asm.label_offset(label) {
1361                dispatch_entries.push((pc, off as i32));
1362            }
1363        }
1364
1365        let exit_label_offset = self.asm.label_offset(self.exit_label).unwrap_or(0) as u32;
1366        // Security-critical: the runtime dense-fills the dispatch table
1367        // with this, so it must resolve to the real panic stub (a 0 here
1368        // would route bad `jalr` targets to native offset 0 instead of
1369        // faulting). The panic stub is always emitted by
1370        // `emit_exit_sequences` above.
1371        let panic_offset = self
1372            .asm
1373            .label_offset(self.panic_label)
1374            .expect("panic stub label must resolve") as u32;
1375        let trap_table = core::mem::take(&mut self.trap_entries);
1376
1377        CompileResult {
1378            native_code: self.asm.finalize(),
1379            dispatch_entries,
1380            trap_table,
1381            exit_label_offset,
1382            panic_offset,
1383        }
1384    }
1385
1386    /// Streaming gas-block-start hook: bind label, flush prior block's
1387    /// cost into its `sub` patch, emit a fresh `sub r15, 0; js stub`
1388    /// placeholder and stash the patch offset in `pending`. Drives
1389    /// `self.gas_sim` directly so the per-arm `feed_gas_rv` calls in
1390    /// `compile_rv4` see a coherent simulator.
1391    fn bind_rv_gas_block_start_streaming(
1392        &mut self,
1393        pc: u32,
1394        is_ecall: bool,
1395        pending: &mut Option<PendingGas>,
1396    ) {
1397        let label = Label(self.label_base + pc);
1398        self.asm.bind_label(label);
1399        self.gas_block_pcs.push(pc);
1400        // valid_pc is the gas-block-start bitmap consulted by both the
1401        // codegen-time `is_basic_block_start` check and the runtime's
1402        // djump validation. Set it here so backward branches emit time
1403        // see the bit (we walk PCs in order, so any T < cur_pc has
1404        // already passed through here if it's a gas-block start).
1405        // bitmask_ptr points to rv_valid_pc's heap buffer, so this
1406        // mutation is visible to subsequent is_basic_block_start reads.
1407        if (pc as usize) < self.rv_valid_pc.len() {
1408            self.rv_valid_pc[pc as usize] = true;
1409        }
1410
1411        // Peephole state must not leak across gas-block boundaries: the
1412        // dispatch table can enter this block from any predecessor.
1413        self.invalidate_all_regs();
1414        self.last_add_cf = None;
1415
1416        self.flush_pending_gas(pending);
1417        self.gas_sim.reset();
1418        self.gas_reserve_accum = 0;
1419
1420        // ecall/ecalli block: emit NO static gate (cost == 0). It is
1421        // charged dynamically by the kernel at dispatch, and an OOG there
1422        // re-attempts at the ecall's own pc. `pending` stays `None`, so
1423        // there is no imm to patch and no in-code OOG stub for this block.
1424        // (The ecall's own gas feed below is discarded at the next reset.)
1425        if is_ecall {
1426            return;
1427        }
1428
1429        // Check-before-charge gate: `cmp r15, cost+reserve; jl stub; sub
1430        // r15, cost`. The `cmp` does NOT mutate gas, so on out-of-gas
1431        // (gas < cost+reserve) the block is not entered and gas is
1432        // unchanged — never negative, and the un-entered block charges
1433        // nothing. Both imm32s are patched once the block's cost and
1434        // reserve are known (at the next block start / final flush).
1435        // `Cc::L` (signed) is sound because gas (R15) is provably ≥ 0.
1436        let stub_label = self.asm.new_label();
1437        self.asm.cmp_r64_imm32_patchable(GAS, 0);
1438        let cmp_offset = self.asm.offset() - 4;
1439        self.asm.jcc_label(Cc::L, stub_label);
1440        self.asm.sub_r64_imm32_patchable(GAS, 0);
1441        let sub_offset = self.asm.offset() - 4;
1442        *pending = Some(PendingGas {
1443            stub_label,
1444            block_pc: pc,
1445            cmp_offset,
1446            sub_offset,
1447        });
1448    }
1449
1450    /// Patch the pending block's gate (`cmp` ← cost+reserve) and charge
1451    /// (`sub` ← cost) immediates now that its cost and #3 reserve are
1452    /// known, and record its per-block OOG stub. Shared by the
1453    /// per-block-start flush and the final flush.
1454    fn flush_pending_gas(&mut self, pending: &mut Option<PendingGas>) {
1455        if let Some(p) = pending.take() {
1456            let cost = self.gas_sim.flush_and_get_cost();
1457            let gate = cost.saturating_add(self.gas_reserve_accum);
1458            self.asm.patch_i32(p.cmp_offset, gate as i32);
1459            self.asm.patch_i32(p.sub_offset, cost as i32);
1460            self.oog_stubs.push((p.stub_label, p.block_pc, cost));
1461        }
1462    }
1463
1464    /// 4-byte RV instruction dispatch (byte-based).
1465    ///
1466    /// Returns `(is_terminator, preserve_cf, extra_bytes)`. `extra_bytes`
1467    /// counts the additional bytes (beyond the 4-byte base) consumed by
1468    /// lookahead fusion. `preserve_cf` tells the streaming loop whether
1469    /// to keep `last_add_cf` alive for a following Sltu fusion.
1470    ///
1471    /// Hot path: walks the opcode-major tree directly on raw bits, no
1472    /// `Inst` enum constructed. Fusion sites (Ld→Add, Mul-pair) are
1473    /// inline at their dispatchers.
1474    /// `inst_len` is the encoded length of the instruction being
1475    /// compiled (4 for the 4-byte path, 2 for an RVC instruction
1476    /// expanded by [`compile_rvc`]). It is what jal/jalr use to compute
1477    /// the return-address `next_pc = pc + inst_len`; a hardcoded `pc + 4`
1478    /// would mis-set `ra` for `c.jalr` (a 2-byte indirect call).
1479    fn compile_rv4(&mut self, w: u32, pc: u32, inst_len: u32, rest: &[u8]) -> (bool, bool, usize) {
1480        use javm_exec::gas_cost::*;
1481        let opcode = (w >> 2) & 0x1F;
1482        let rd = ((w >> 7) & 0x1F) as u8;
1483        let rs1 = ((w >> 15) & 0x1F) as u8;
1484        let rs2 = ((w >> 20) & 0x1F) as u8;
1485        let f3 = ((w >> 12) & 0x07) as u8;
1486        let f7 = ((w >> 25) & 0x7F) as u8;
1487
1488        // x16..x31 in any register field is an illegal RV64E encoding:
1489        // panic, uniformly, matching the interpreter (which decodes such an
1490        // instruction as `Reserved`). Shared predicate so the two engines
1491        // can't drift; covers expanded RVC, which routes through here.
1492        if javm_exec::instruction::word_uses_reserved_reg(w) {
1493            self.rv_emit_panic_at(pc);
1494            self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
1495            return (true, false, 0);
1496        }
1497
1498        // x3/x4 in any register field: route to the cold spill path. These
1499        // are valid RV64E registers the runtime must execute, but the host
1500        // register file is full, so they live in `JitContext.regs[13|14]`
1501        // and are materialised per access. Conformant code never names them
1502        // (the toolchain rejects x3/x4), so this fork is never taken for the
1503        // 12 bench guests — the fast-path dispatch below stays byte-identical.
1504        if javm_exec::instruction::word_uses_spilled_reg(w) {
1505            return self.compile_rv_spilled(w, pc, inst_len);
1506        }
1507
1508        match opcode {
1509            OP_LOAD => self.compile_load(rd, rs1, f3, w, pc, rest),
1510            OP_STORE => self.compile_store(rs1, rs2, f3, w, pc),
1511            OP_IMM => self.compile_op_imm(rd, rs1, f3, w, pc),
1512            OP_OP_IMM_32 => self.compile_op_imm_32(rd, rs1, f3, w, pc),
1513            OP_OP => self.compile_op(rd, rs1, rs2, f3, f7, w, pc, rest),
1514            OP_OP_32 => self.compile_op_32(rd, rs1, rs2, f3, f7, w, pc),
1515            OP_LUI => self.compile_lui(rd, w, pc, rest),
1516            OP_AUIPC => self.compile_auipc(rd, w, pc),
1517            OP_JAL => self.compile_jal(rd, w, pc, inst_len),
1518            OP_JALR if f3 == 0 => self.compile_jalr(rd, rs1, w, pc, inst_len),
1519            OP_BRANCH => self.compile_branch(rs1, rs2, f3, w, pc),
1520            OP_CUSTOM_0 => self.compile_custom_0(rd, rs1, f3, w, pc),
1521            OP_MISC_MEM => {
1522                // Fence / FenceI — no-op emit.
1523                self.feed_gas_rv(RV_KIND_FENCE, 0, 0, 0);
1524                (false, false, 0)
1525            }
1526            // OP_SYSTEM, OP_CUSTOM_1, jalr-with-funct3≠0, etc. — all
1527            // forbidden in PVM2 and rejected by the linker's validator.
1528            // Defence in depth: emit a runtime panic if we ever see one.
1529            _ => {
1530                self.rv_emit_panic_at(pc);
1531                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
1532                (true, false, 0)
1533            }
1534        }
1535    }
1536
1537    /// 2-byte RVC dispatch. Expands the compressed encoding to its
1538    /// 4-byte equivalent via `expand_rvc` and reuses `compile_rv4` —
1539    /// all the funct3/funct7 dispatch + fusion logic stays in one
1540    /// place, and the only RVC-specific code is the bit-shuffling of
1541    /// the expansion. The forbidden RVC encodings (c.ebreak, c.illegal)
1542    /// return `None` from `expand_rvc` and emit a panic; c.jr/c.jalr are
1543    /// standard jalr and compile normally.
1544    ///
1545    /// One contract: RVC expansion never produces a JAL with rd != 0
1546    /// (c.jal is RV32-only and doesn't exist in our target), so
1547    /// `compile_rv4`'s hardcoded `next_pc = pc + 4` is never consumed
1548    /// for return-address writes. Branches don't use next_pc either
1549    /// (the `_fallthrough` parameter on emit_branch_* is unused). The
1550    /// streaming loop's `pc += base_len + extra` advances by 2 for
1551    /// RVC regardless of what `compile_rv4` did internally.
1552    fn compile_rvc(&mut self, h: u16, pc: u32, rest: &[u8]) -> (bool, bool, usize) {
1553        use javm_exec::gas_cost::*;
1554        match expand_rvc(h) {
1555            // RVC instructions are 2 bytes — pass inst_len = 2 so a
1556            // `c.jalr` writes the correct return address (`pc + 2`).
1557            Some(w) => self.compile_rv4(w, pc, 2, rest),
1558            None => {
1559                self.rv_emit_panic_at(pc);
1560                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
1561                (true, false, 0)
1562            }
1563        }
1564    }
1565
1566    // === x3/x4 spill path (cold) ==================================
1567    //
1568    // `x3`/`x4` are valid RV64E registers but have no host register (the
1569    // file is exhausted by the 13 commonly-used slots), so they live in
1570    // `JitContext.regs[13|14]` and are materialised per access. Conformant
1571    // code never names them — the toolchain rejects x3/x4 — so this whole
1572    // path is cold; it exists only so an untrusted RV64E blob executes
1573    // bit-for-bit with the interpreter.
1574    //
1575    // Gas is fed once here with the *original* x3/x4 slots (so the
1576    // memory-spill cost is charged, exactly as the interpreter charges it).
1577    // The actual emit is then done with `suppress_gas` set so the reused
1578    // fast-path helpers don't double-feed.
1579
1580    fn compile_rv_spilled(&mut self, w: u32, pc: u32, inst_len: u32) -> (bool, bool, usize) {
1581        use javm_exec::gas_cost::{rv_feed_gas_direct, rv_gas_meta};
1582        // Gas: original registers → memory-spill cost + the terminator flag,
1583        // matching the interpreter's per-instruction feed exactly.
1584        let inst = javm_exec::instruction::decode(&w.to_le_bytes())
1585            .expect("4-byte spilled word decodes")
1586            .0;
1587        let meta = rv_gas_meta(&inst);
1588        // Accumulate the #3 reserve here (the real feed for a spilled
1589        // instruction); the suppress_gas re-dispatch below must not.
1590        self.gas_reserve_accum = self
1591            .gas_reserve_accum
1592            .saturating_add(javm_exec::gas_cost::rv_kind_reserve(meta.kind));
1593        let term = rv_feed_gas_direct(&meta, &mut self.gas_sim, self.mem_cycles);
1594
1595        let opcode = (w >> 2) & 0x1F;
1596        let f3 = (w >> 12) & 0x07;
1597        let rd = ((w >> 7) & 0x1F) as u8;
1598        let rs1 = ((w >> 15) & 0x1F) as u8;
1599        let rs2 = ((w >> 20) & 0x1F) as u8;
1600
1601        self.suppress_gas = true;
1602        match opcode {
1603            // Terminators: sources go through the spill-aware `rv_read` /
1604            // `rv_read_into` (into the clobberable SCRATCH, which the jump
1605            // discards), and spilled `rd` / `rs2` are handled inline.
1606            OP_JAL => self.rv_jal_spilled(rd, imm_j(w), pc, pc + inst_len),
1607            OP_JALR if f3 == 0 => self.rv_jalr_spilled(rd, rs1, imm_i(w), pc, pc + inst_len),
1608            OP_BRANCH => self.rv_branch_spilled(rs1, rs2, f3 as u8, imm_b(w), pc),
1609            // Non-terminators: load spilled sources into collision-free donor
1610            // host registers, rewrite the word, re-dispatch through the fast
1611            // path, store the spilled dest back, restore donors.
1612            OP_LOAD | OP_STORE | OP_IMM | OP_OP_IMM_32 | OP_OP | OP_OP_32 | OP_LUI | OP_AUIPC => {
1613                self.emit_spilled_via_donors(w, pc, inst_len)
1614            }
1615            // No other opcode names x3/x4 in a register field
1616            // (`word_uses_spilled_reg` only fires for R/I/S/B/U/J formats).
1617            _ => self.rv_emit_panic_at(pc),
1618        }
1619        self.suppress_gas = false;
1620        (term, false, 0)
1621    }
1622
1623    /// Non-terminator spill: borrow donor host registers for x3/x4, rewrite
1624    /// the instruction to name the donors, re-dispatch through the fast path
1625    /// (gas suppressed, no fusion), then store a spilled dest back. Bounded:
1626    /// ≤2 donor push/pops + ≤2 loads + ≤1 store around one fast-path emit.
1627    fn emit_spilled_via_donors(&mut self, w: u32, pc: u32, inst_len: u32) {
1628        let opcode = (w >> 2) & 0x1F;
1629        let rd = ((w >> 7) & 0x1F) as u8;
1630        let rs1 = ((w >> 15) & 0x1F) as u8;
1631        let rs2 = ((w >> 20) & 0x1F) as u8;
1632
1633        // Which register fields this format actually uses, and their role.
1634        let (has_rd, has_rs1, has_rs2) = match opcode {
1635            OP_LOAD | OP_IMM | OP_OP_IMM_32 => (true, true, false),
1636            OP_OP | OP_OP_32 => (true, true, true),
1637            OP_STORE => (false, true, true),
1638            OP_LUI | OP_AUIPC => (true, false, false),
1639            _ => {
1640                self.rv_emit_panic_at(pc);
1641                return;
1642            }
1643        };
1644
1645        // Mark non-spilled, host-mapped operand regs so a donor never aliases
1646        // a real operand (x0 has no host reg; x3/x4 are the ones we replace).
1647        let mut blocked = [false; 16];
1648        let mut block = |x: u8| {
1649            if x != 0 && !reg_is_spilled(x) && !rv_is_reserved(x) {
1650                blocked[x as usize] = true;
1651            }
1652        };
1653        if has_rd {
1654            block(rd);
1655        }
1656        if has_rs1 {
1657            block(rs1);
1658        }
1659        if has_rs2 {
1660            block(rs2);
1661        }
1662
1663        // Does x3 / x4 appear in any used register field?
1664        let appears = |x: u8| (has_rd && rd == x) || (has_rs1 && rs1 == x) || (has_rs2 && rs2 == x);
1665        let need3 = appears(3);
1666        let need4 = appears(4);
1667
1668        // Pick donor RV regs from the host-mapped set, avoiding real operands.
1669        const CANDIDATES: [u8; 13] = [1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
1670        let pick = |blocked: &mut [bool; 16]| -> u8 {
1671            for &c in &CANDIDATES {
1672                if !blocked[c as usize] {
1673                    blocked[c as usize] = true;
1674                    return c;
1675                }
1676            }
1677            unreachable!("≤2 operands block ≤2 of 13 candidates")
1678        };
1679        let donor3 = if need3 { pick(&mut blocked) } else { 0 };
1680        let donor4 = if need4 { pick(&mut blocked) } else { 0 };
1681
1682        // Save the donor host registers (restored after the emit).
1683        let mut donors = [0u8; 2];
1684        let mut n = 0;
1685        if need3 {
1686            donors[n] = donor3;
1687            n += 1;
1688        }
1689        if need4 {
1690            donors[n] = donor4;
1691            n += 1;
1692        }
1693        for d in &donors[..n] {
1694            self.asm.push(REG_MAP[rv_slot(*d).unwrap()]);
1695        }
1696
1697        // Load spilled *sources* into their donor host regs.
1698        let src3 = (has_rs1 && rs1 == 3) || (has_rs2 && rs2 == 3);
1699        let src4 = (has_rs1 && rs1 == 4) || (has_rs2 && rs2 == 4);
1700        if need3 && src3 {
1701            self.asm
1702                .mov_load64_rip_rel(REG_MAP[rv_slot(donor3).unwrap()], spill_va(3));
1703        }
1704        if need4 && src4 {
1705            self.asm
1706                .mov_load64_rip_rel(REG_MAP[rv_slot(donor4).unwrap()], spill_va(4));
1707        }
1708
1709        // Donor reg_defs must not carry stale const/scaled-add tracking into
1710        // the fast-path emit (we restore the donor's value afterwards).
1711        for d in &donors[..n] {
1712            self.invalidate_reg(rv_slot(*d).unwrap());
1713        }
1714
1715        // Rewrite the word: x3 → donor3, x4 → donor4 in the used fields.
1716        let repl = |r: u8| -> u8 {
1717            if r == 3 {
1718                donor3
1719            } else if r == 4 {
1720                donor4
1721            } else {
1722                r
1723            }
1724        };
1725        let mut wr = w;
1726        if has_rd {
1727            wr = (wr & !(0x1F << 7)) | ((repl(rd) as u32) << 7);
1728        }
1729        if has_rs1 {
1730            wr = (wr & !(0x1F << 15)) | ((repl(rs1) as u32) << 15);
1731        }
1732        if has_rs2 {
1733            wr = (wr & !(0x1F << 20)) | ((repl(rs2) as u32) << 20);
1734        }
1735
1736        // Re-dispatch with no lookahead (empty `rest` → no fusion). Gas is
1737        // already suppressed by the caller.
1738        let _ = self.compile_rv4(wr, pc, inst_len, &[]);
1739
1740        // Store a spilled destination back to its home.
1741        if has_rd && reg_is_spilled(rd) {
1742            let donor = if rd == 3 { donor3 } else { donor4 };
1743            self.asm
1744                .mov_store64_rip_rel(spill_va(rd), REG_MAP[rv_slot(donor).unwrap()]);
1745        }
1746
1747        // Restore donors (value + reg_defs).
1748        for d in &donors[..n] {
1749            self.invalidate_reg(rv_slot(*d).unwrap());
1750        }
1751        for d in donors[..n].iter().rev() {
1752            self.asm.pop(REG_MAP[rv_slot(*d).unwrap()]);
1753        }
1754    }
1755
1756    /// `jal rd, imm` where `rd ∈ {x3, x4}` (the only register field of a
1757    /// J-type). Writes the return-address VA (a constant `< 4 GiB`, so the
1758    /// high word is 0) straight to the spilled home, then jumps.
1759    fn rv_jal_spilled(&mut self, rd: u8, imm: i32, pc: u32, next_pc: u32) {
1760        let ret = self.code_base.wrapping_add(next_pc);
1761        self.asm.mov_store32_rip_rel_imm(spill_va(rd), ret as i32);
1762        self.asm.mov_store32_rip_rel_imm(spill_va(rd) + 4, 0);
1763        let target = (pc as i64).wrapping_add(imm as i64) as u32;
1764        self.emit_static_branch(target, true, next_pc, pc);
1765    }
1766
1767    /// `jalr rd, rs1, imm` with `x3`/`x4` in `rd` and/or `rs1`. Mirrors
1768    /// [`rv_jalr`] but reads `rs1` via the spill-aware [`rv_read`] (into
1769    /// SCRATCH) and writes a spilled `rd` to its home as two 32-bit stores
1770    /// (low = return VA, high = 0), leaving SCRATCH holding the target.
1771    fn rv_jalr_spilled(&mut self, rd: u8, rs1: u8, imm: i32, pc: u32, next_pc: u32) {
1772        use super::asm::Cc;
1773        // SCRATCH = (rs1 + imm) & 0xFFFFFFFF.
1774        self.rv_read(rs1, SCRATCH, pc);
1775        if imm != 0 {
1776            self.asm.add_ri(SCRATCH, imm);
1777        }
1778        self.asm.shl_ri64(SCRATCH, 32);
1779        self.asm.shr_ri64(SCRATCH, 32);
1780        // Return address (a guest VA) into rd — after the SCRATCH read, so a
1781        // spilled rd == rs1 doesn't perturb the target already in SCRATCH.
1782        if rd != 0 {
1783            let ret = self.code_base.wrapping_add(next_pc);
1784            if reg_is_spilled(rd) {
1785                self.asm.mov_store32_rip_rel_imm(spill_va(rd), ret as i32);
1786                self.asm.mov_store32_rip_rel_imm(spill_va(rd) + 4, 0);
1787            } else {
1788                let slot = rv_slot(rd).unwrap();
1789                self.asm.mov_ri64(REG_MAP[slot], ret as u64);
1790                self.invalidate_reg(slot);
1791            }
1792        }
1793        if self.code_base != 0 {
1794            self.asm.sub_ri(SCRATCH, self.code_base as i32);
1795        }
1796        self.asm.mov_store32_rip_rel(CTX_PC, SCRATCH);
1797        self.asm.cmp_ri32(SCRATCH, self.code_len as i32);
1798        self.asm.jcc_label(Cc::AE, self.panic_label);
1799        self.asm.push(Reg::RAX);
1800        self.asm.mov_load64_rip_rel(Reg::RAX, CTX_DISPATCH_TABLE);
1801        self.asm.movsxd_load_sib4(Reg::RAX, Reg::RAX, SCRATCH);
1802        self.asm.add_r64_mem_rip_rel(Reg::RAX, CTX_CODE_BASE);
1803        self.asm.mov_rr(SCRATCH, Reg::RAX);
1804        self.asm.pop(Reg::RAX);
1805        self.asm.jmp_reg(SCRATCH);
1806    }
1807
1808    /// Conditional branch with `x3`/`x4` in `rs1` and/or `rs2`. `rs1` loads
1809    /// via the spill-aware [`rv_read_into`]; `rs2` is compared as a register,
1810    /// the immediate 0, or directly from its spilled memory home (so the
1811    /// both-spilled case needs no second scratch).
1812    fn rv_branch_spilled(&mut self, rs1: u8, rs2: u8, f3: u8, imm: i32, pc: u32) {
1813        let cc = match f3 {
1814            0b000 => Cc::E,
1815            0b001 => Cc::NE,
1816            0b100 => Cc::L,
1817            0b101 => Cc::GE,
1818            0b110 => Cc::B,
1819            0b111 => Cc::AE,
1820            _ => {
1821                self.rv_emit_panic_at(pc);
1822                return;
1823            }
1824        };
1825        let target = (pc as i64).wrapping_add(imm as i64) as u32;
1826        let a = self.rv_read_into(rs1, SCRATCH, pc);
1827        if reg_is_spilled(rs2) {
1828            self.asm.cmp_r64_mem_rip_rel(a, spill_va(rs2));
1829        } else if rs2 == 0 {
1830            self.asm.cmp_ri32(a, 0);
1831        } else {
1832            self.asm.cmp_rr(a, REG_MAP[rv_slot(rs2).unwrap()]);
1833        }
1834        self.emit_cond_branch_to(cc, target, pc);
1835    }
1836
1837    /// Emit a conditional jump to `target` assuming flags are already set
1838    /// (the spilled-branch path emits its own `cmp`). Mirrors the streaming
1839    /// deferral / non-block-start-panic logic of [`emit_branch_reg`]; byte
1840    /// layout differs (cold path), but the control-flow outcome is identical.
1841    fn emit_cond_branch_to(&mut self, cc: Cc, target: u32, pc: u32) {
1842        if self.rv_streaming && target > pc {
1843            let label = self.label_for_pc(target);
1844            let fixup_idx = self.asm.fixups_len();
1845            self.asm.jcc_label(cc, label);
1846            self.rv_pending_fwd_branches.push((target, pc, fixup_idx));
1847            return;
1848        }
1849        if !self.is_basic_block_start(target) {
1850            self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
1851            self.asm.jcc_label(cc, self.panic_label);
1852            return;
1853        }
1854        let label = self.label_for_pc(target);
1855        self.asm.jcc_label(cc, label);
1856    }
1857
1858    // === Per-opcode dispatchers (4-byte path) =====================
1859
1860    fn compile_load(
1861        &mut self,
1862        rd: u8,
1863        rs1: u8,
1864        f3: u8,
1865        w: u32,
1866        pc: u32,
1867        rest: &[u8],
1868    ) -> (bool, bool, usize) {
1869        use javm_exec::gas_cost::*;
1870        let imm = imm_i(w);
1871        let (width, signed) = match f3 {
1872            0b000 => (1u32, true),
1873            0b001 => (2, true),
1874            0b010 => (4, true),
1875            0b011 => (8, false),
1876            0b100 => (1, false),
1877            0b101 => (2, false),
1878            0b110 => (4, false),
1879            _ => {
1880                self.rv_emit_panic_at(pc);
1881                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
1882                return (true, false, 0);
1883            }
1884        };
1885        // Ld→{Add,Xor,Or,And} fusion: only triggers on the 64-bit `ld`
1886        // (f3 == 0b011). `peek_alu_rr_trailer` handles both 4-byte OP_OP and
1887        // 2-byte RVC trailing forms — half the code in these guests is RVC,
1888        // so missing c.add/c.{and,or,xor} would forfeit most of the win.
1889        if width == 8
1890            && rd != 0
1891            && !rv_is_reserved(rd)
1892            && !rv_is_reserved(rs1)
1893            && let Some((op, a_rd, a_rs1, a_rs2, consumed)) = peek_alu_rr_trailer(rest)
1894            && a_rd != 0
1895            && !rv_is_reserved(a_rd)
1896            && (a_rs1 == rd || a_rs2 == rd)
1897            && (a_rs1 == 0 || !rv_is_reserved(a_rs1))
1898            && (a_rs2 == 0 || !rv_is_reserved(a_rs2))
1899        {
1900            self.rv_load(rd, rs1, imm, 8, false, pc);
1901            self.feed_gas_rv(RV_KIND_LOAD, rs1, 0, rd);
1902            let next_pc = pc + 4;
1903            self.rv_alu_rr(a_rd, a_rs1, a_rs2, op, next_pc);
1904            // ScaledAdd tracking only meaningful for Add.
1905            if matches!(op, AluOp::Add) && a_rd != a_rs1 && a_rd != a_rs2 {
1906                self.track_add_scaledadd(a_rd, a_rs1, a_rs2);
1907            }
1908            self.feed_gas_rv(RV_KIND_ADD, a_rs1, a_rs2, a_rd);
1909            // preserve_cf only valid for Add (Sltu fusion consumer).
1910            let preserve_cf = matches!(op, AluOp::Add);
1911            return (false, preserve_cf, consumed);
1912        }
1913        self.rv_load(rd, rs1, imm, width, signed, pc);
1914        let term = self.feed_gas_rv(RV_KIND_LOAD, rs1, 0, rd);
1915        (term, false, 0)
1916    }
1917
1918    fn compile_store(&mut self, rs1: u8, rs2: u8, f3: u8, w: u32, pc: u32) -> (bool, bool, usize) {
1919        use javm_exec::gas_cost::*;
1920        let imm = imm_s(w);
1921        let width = match f3 {
1922            0b000 => 1u32,
1923            0b001 => 2,
1924            0b010 => 4,
1925            0b011 => 8,
1926            _ => {
1927                self.rv_emit_panic_at(pc);
1928                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
1929                return (true, false, 0);
1930            }
1931        };
1932        self.rv_store(rs1, rs2, imm, width, pc);
1933        let term = self.feed_gas_rv(RV_KIND_STORE, rs1, rs2, 0);
1934        (term, false, 0)
1935    }
1936
1937    fn compile_op_imm(&mut self, rd: u8, rs1: u8, f3: u8, w: u32, pc: u32) -> (bool, bool, usize) {
1938        use javm_exec::gas_cost::*;
1939        match f3 {
1940            0b000 => {
1941                // Addi
1942                let imm = imm_i(w);
1943                self.rv_alu_imm(rd, rs1, imm, AluImmOp::Add, pc);
1944                if rs1 == 0 {
1945                    self.track_const(rd, imm);
1946                }
1947                let term = self.feed_gas_rv(RV_KIND_ADDI, rs1, 0, rd);
1948                (term, false, 0)
1949            }
1950            0b010 => {
1951                let imm = imm_i(w);
1952                self.rv_slt_imm(rd, rs1, imm, true, pc);
1953                let term = self.feed_gas_rv(RV_KIND_ADDI, rs1, 0, rd);
1954                (term, false, 0)
1955            }
1956            0b011 => {
1957                let imm = imm_i(w);
1958                self.rv_slt_imm(rd, rs1, imm, false, pc);
1959                let term = self.feed_gas_rv(RV_KIND_ADDI, rs1, 0, rd);
1960                (term, false, 0)
1961            }
1962            0b100 => {
1963                let imm = imm_i(w);
1964                self.rv_alu_imm(rd, rs1, imm, AluImmOp::Xor, pc);
1965                let term = self.feed_gas_rv(RV_KIND_ADDI, rs1, 0, rd);
1966                (term, false, 0)
1967            }
1968            0b110 => {
1969                let imm = imm_i(w);
1970                self.rv_alu_imm(rd, rs1, imm, AluImmOp::Or, pc);
1971                let term = self.feed_gas_rv(RV_KIND_ADDI, rs1, 0, rd);
1972                (term, false, 0)
1973            }
1974            0b111 => {
1975                let imm = imm_i(w);
1976                self.rv_alu_imm(rd, rs1, imm, AluImmOp::And, pc);
1977                let term = self.feed_gas_rv(RV_KIND_ADDI, rs1, 0, rd);
1978                (term, false, 0)
1979            }
1980            0b001 => {
1981                // SLLI / Zbs Bclri / Bseti / Binvi / Zbb unary (clz, ctz,
1982                // cpop, sext.b, sext.h) — distinguished by funct6 (the
1983                // top 6 bits) + rs2 field for Zbb unaries.
1984                let shtype = (w >> 26) & 0x3F;
1985                let shamt = ((w >> 20) & 0x3F) as u8;
1986                let rs2_field = (w >> 20) & 0x1F;
1987                match shtype {
1988                    0b000000 => {
1989                        self.rv_shift_imm(rd, rs1, shamt, ShiftOp::Shl64, pc);
1990                        if (1..=3).contains(&shamt) && rs1 != rd {
1991                            self.track_shifted(rd, rs1, shamt);
1992                        }
1993                        let term = self.feed_gas_rv(RV_KIND_ADDI, rs1, 0, rd);
1994                        (term, false, 0)
1995                    }
1996                    0b010010 => {
1997                        self.rv_bit_imm(rd, rs1, shamt, BitOp::Clear, pc);
1998                        let term = self.feed_gas_rv(RV_KIND_ZBS_IMM, rs1, 0, rd);
1999                        (term, false, 0)
2000                    }
2001                    0b001010 => {
2002                        self.rv_bit_imm(rd, rs1, shamt, BitOp::Set, pc);
2003                        let term = self.feed_gas_rv(RV_KIND_ZBS_IMM, rs1, 0, rd);
2004                        (term, false, 0)
2005                    }
2006                    0b011010 => {
2007                        self.rv_bit_imm(rd, rs1, shamt, BitOp::Invert, pc);
2008                        let term = self.feed_gas_rv(RV_KIND_ZBS_IMM, rs1, 0, rd);
2009                        (term, false, 0)
2010                    }
2011                    0b011000 => {
2012                        let (op, kind) = match rs2_field {
2013                            0b00000 => (UnaryOp::Clz64, RV_KIND_ZBB_U1),
2014                            0b00001 => (UnaryOp::Ctz64, RV_KIND_ZBB_CTZ),
2015                            0b00010 => (UnaryOp::Popcnt64, RV_KIND_ZBB_U1),
2016                            0b00100 => (UnaryOp::SextB, RV_KIND_ZBB_U1),
2017                            0b00101 => (UnaryOp::SextH, RV_KIND_ZBB_U1),
2018                            _ => {
2019                                self.rv_emit_panic_at(pc);
2020                                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2021                                return (true, false, 0);
2022                            }
2023                        };
2024                        self.rv_unary(rd, rs1, op, pc);
2025                        let term = self.feed_gas_rv(kind, rs1, 0, rd);
2026                        (term, false, 0)
2027                    }
2028                    _ => {
2029                        self.rv_emit_panic_at(pc);
2030                        self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2031                        (true, false, 0)
2032                    }
2033                }
2034            }
2035            0b101 => {
2036                // SRLI / SRAI / Bexti / Rori / OrcB / Rev8.
2037                let shtype = (w >> 26) & 0x3F;
2038                let shamt = ((w >> 20) & 0x3F) as u8;
2039                let rs2_field = (w >> 20) & 0x1F;
2040                match shtype {
2041                    0b000000 => {
2042                        self.rv_shift_imm(rd, rs1, shamt, ShiftOp::Shr64, pc);
2043                        let term = self.feed_gas_rv(RV_KIND_ADDI, rs1, 0, rd);
2044                        (term, false, 0)
2045                    }
2046                    0b010000 => {
2047                        self.rv_shift_imm(rd, rs1, shamt, ShiftOp::Sar64, pc);
2048                        let term = self.feed_gas_rv(RV_KIND_ADDI, rs1, 0, rd);
2049                        (term, false, 0)
2050                    }
2051                    0b010010 => {
2052                        self.rv_bit_imm(rd, rs1, shamt, BitOp::Extract, pc);
2053                        let term = self.feed_gas_rv(RV_KIND_ZBS_IMM, rs1, 0, rd);
2054                        (term, false, 0)
2055                    }
2056                    0b011000 => {
2057                        self.rv_shift_imm(rd, rs1, shamt, ShiftOp::Ror64, pc);
2058                        let term = self.feed_gas_rv(RV_KIND_ZBB_RORI, rs1, 0, rd);
2059                        (term, false, 0)
2060                    }
2061                    0b001010 if rs2_field == 0b00111 => {
2062                        self.rv_unary(rd, rs1, UnaryOp::OrcB, pc);
2063                        let term = self.feed_gas_rv(RV_KIND_ZBB_U1, rs1, 0, rd);
2064                        (term, false, 0)
2065                    }
2066                    0b011010 if rs2_field == 0b11000 => {
2067                        self.rv_unary(rd, rs1, UnaryOp::Rev8, pc);
2068                        let term = self.feed_gas_rv(RV_KIND_ZBB_U1, rs1, 0, rd);
2069                        (term, false, 0)
2070                    }
2071                    _ => {
2072                        self.rv_emit_panic_at(pc);
2073                        self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2074                        (true, false, 0)
2075                    }
2076                }
2077            }
2078            _ => {
2079                self.rv_emit_panic_at(pc);
2080                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2081                (true, false, 0)
2082            }
2083        }
2084    }
2085
2086    fn compile_op_imm_32(
2087        &mut self,
2088        rd: u8,
2089        rs1: u8,
2090        f3: u8,
2091        w: u32,
2092        pc: u32,
2093    ) -> (bool, bool, usize) {
2094        use javm_exec::gas_cost::*;
2095        match f3 {
2096            0b000 => {
2097                let imm = imm_i(w);
2098                self.rv_alu_imm(rd, rs1, imm, AluImmOp::Addw, pc);
2099                let term = self.feed_gas_rv(RV_KIND_ADDIW, rs1, 0, rd);
2100                (term, false, 0)
2101            }
2102            0b001 => {
2103                let f7 = (w >> 25) & 0x7F;
2104                let shamt5 = ((w >> 20) & 0x1F) as u8;
2105                match f7 {
2106                    0b0000000 => {
2107                        self.rv_shift_imm(rd, rs1, shamt5, ShiftOp::Shl32, pc);
2108                        let term = self.feed_gas_rv(RV_KIND_ADDIW, rs1, 0, rd);
2109                        (term, false, 0)
2110                    }
2111                    0b0000100 => {
2112                        // Slli.uw — uses 6-bit shamt (RV64).
2113                        let shamt6 = ((w >> 20) & 0x3F) as u8;
2114                        self.rv_slliuw(rd, rs1, shamt6, pc);
2115                        let term = self.feed_gas_rv(RV_KIND_ZBA_IMM, rs1, 0, rd);
2116                        (term, false, 0)
2117                    }
2118                    0b0110000 => {
2119                        let rs2_field = (w >> 20) & 0x1F;
2120                        let op = match rs2_field {
2121                            0b00000 => UnaryOp::Clz32,
2122                            0b00001 => UnaryOp::Ctz32,
2123                            0b00010 => UnaryOp::Popcnt32,
2124                            _ => {
2125                                self.rv_emit_panic_at(pc);
2126                                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2127                                return (true, false, 0);
2128                            }
2129                        };
2130                        let kind = if matches!(op, UnaryOp::Ctz32) {
2131                            RV_KIND_ZBB_CTZ
2132                        } else {
2133                            RV_KIND_ZBB_U1
2134                        };
2135                        self.rv_unary(rd, rs1, op, pc);
2136                        let term = self.feed_gas_rv(kind, rs1, 0, rd);
2137                        (term, false, 0)
2138                    }
2139                    _ => {
2140                        self.rv_emit_panic_at(pc);
2141                        self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2142                        (true, false, 0)
2143                    }
2144                }
2145            }
2146            0b101 => {
2147                let f7 = (w >> 25) & 0x7F;
2148                let shamt5 = ((w >> 20) & 0x1F) as u8;
2149                match f7 {
2150                    0b0000000 => {
2151                        self.rv_shift_imm(rd, rs1, shamt5, ShiftOp::Shr32, pc);
2152                        let term = self.feed_gas_rv(RV_KIND_ADDIW, rs1, 0, rd);
2153                        (term, false, 0)
2154                    }
2155                    0b0100000 => {
2156                        self.rv_shift_imm(rd, rs1, shamt5, ShiftOp::Sar32, pc);
2157                        let term = self.feed_gas_rv(RV_KIND_ADDIW, rs1, 0, rd);
2158                        (term, false, 0)
2159                    }
2160                    0b0110000 => {
2161                        self.rv_shift_imm(rd, rs1, shamt5, ShiftOp::Ror32, pc);
2162                        let term = self.feed_gas_rv(RV_KIND_ZBB_RORIW, rs1, 0, rd);
2163                        (term, false, 0)
2164                    }
2165                    _ => {
2166                        self.rv_emit_panic_at(pc);
2167                        self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2168                        (true, false, 0)
2169                    }
2170                }
2171            }
2172            _ => {
2173                self.rv_emit_panic_at(pc);
2174                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2175                (true, false, 0)
2176            }
2177        }
2178    }
2179
2180    #[allow(clippy::too_many_arguments)]
2181    fn compile_op(
2182        &mut self,
2183        rd: u8,
2184        rs1: u8,
2185        rs2: u8,
2186        f3: u8,
2187        f7: u8,
2188        w: u32,
2189        pc: u32,
2190        rest: &[u8],
2191    ) -> (bool, bool, usize) {
2192        use javm_exec::gas_cost::*;
2193        // Mul-pair fusion: a 64-bit `mul` (f7=0000001, f3=000) followed
2194        // by `mulh`/`mulhu` on the SAME operand pair folds into a single
2195        // x86 imul/mul that produces RDX:RAX (lo:hi). See commit
2196        // `perf(pvm2): mul-pair fusion`.
2197        if f7 == 0b0000001
2198            && f3 == 0b000
2199            && let Some(extra) = self.try_fuse_mul_pair_bytes(rd, rs1, rs2, rest, pc)
2200        {
2201            return (false, false, extra);
2202        }
2203        match (f7, f3) {
2204            (0b0000000, 0b000) => {
2205                self.rv_alu_rr(rd, rs1, rs2, AluOp::Add, pc);
2206                if rd != rs1 && rd != rs2 {
2207                    self.track_add_scaledadd(rd, rs1, rs2);
2208                }
2209                let term = self.feed_gas_rv(RV_KIND_ADD, rs1, rs2, rd);
2210                (term, true, 0)
2211            }
2212            (0b0100000, 0b000) => {
2213                self.rv_alu_rr(rd, rs1, rs2, AluOp::Sub, pc);
2214                let term = self.feed_gas_rv(RV_KIND_ADD, rs1, rs2, rd);
2215                (term, false, 0)
2216            }
2217            (0b0000000, 0b001) => {
2218                self.rv_shift_rr(rd, rs1, rs2, ShiftOp::Shl64, pc);
2219                let term = self.feed_gas_rv(RV_KIND_SLL, rs1, rs2, rd);
2220                (term, false, 0)
2221            }
2222            (0b0000000, 0b010) => {
2223                self.rv_slt_rr(rd, rs1, rs2, true, pc);
2224                let term = self.feed_gas_rv(RV_KIND_SLT, rs1, rs2, rd);
2225                (term, false, 0)
2226            }
2227            (0b0000000, 0b011) => {
2228                // Sltu — preserve_cf so the next-instruction CF clear
2229                // doesn't trample a pending Add's flags before rv_slt_rr
2230                // had a chance to consume them. (Note: rv_slt_rr already
2231                // handles the case where last_add_cf is stale; we just
2232                // skip the post-emit clear here to mirror the legacy
2233                // behaviour.)
2234                self.rv_slt_rr(rd, rs1, rs2, false, pc);
2235                let term = self.feed_gas_rv(RV_KIND_SLT, rs1, rs2, rd);
2236                (term, true, 0)
2237            }
2238            (0b0000000, 0b100) => {
2239                self.rv_alu_rr(rd, rs1, rs2, AluOp::Xor, pc);
2240                let term = self.feed_gas_rv(RV_KIND_ADD, rs1, rs2, rd);
2241                (term, false, 0)
2242            }
2243            (0b0000000, 0b101) => {
2244                self.rv_shift_rr(rd, rs1, rs2, ShiftOp::Shr64, pc);
2245                let term = self.feed_gas_rv(RV_KIND_SLL, rs1, rs2, rd);
2246                (term, false, 0)
2247            }
2248            (0b0100000, 0b101) => {
2249                self.rv_shift_rr(rd, rs1, rs2, ShiftOp::Sar64, pc);
2250                let term = self.feed_gas_rv(RV_KIND_SLL, rs1, rs2, rd);
2251                (term, false, 0)
2252            }
2253            (0b0000000, 0b110) => {
2254                self.rv_alu_rr(rd, rs1, rs2, AluOp::Or, pc);
2255                let term = self.feed_gas_rv(RV_KIND_ADD, rs1, rs2, rd);
2256                (term, false, 0)
2257            }
2258            (0b0000000, 0b111) => {
2259                self.rv_alu_rr(rd, rs1, rs2, AluOp::And, pc);
2260                let term = self.feed_gas_rv(RV_KIND_ADD, rs1, rs2, rd);
2261                (term, false, 0)
2262            }
2263            // M extension
2264            (0b0000001, 0b000) => {
2265                self.rv_alu_rr(rd, rs1, rs2, AluOp::Mul, pc);
2266                let term = self.feed_gas_rv(RV_KIND_MUL, rs1, rs2, rd);
2267                (term, false, 0)
2268            }
2269            (0b0000001, 0b001) => {
2270                self.rv_mulh(rd, rs1, rs2, true, true, pc);
2271                let term = self.feed_gas_rv(RV_KIND_MULH, rs1, rs2, rd);
2272                (term, false, 0)
2273            }
2274            (0b0000001, 0b010) => {
2275                self.rv_mulh(rd, rs1, rs2, true, false, pc);
2276                let term = self.feed_gas_rv(RV_KIND_MULHSU, rs1, rs2, rd);
2277                (term, false, 0)
2278            }
2279            (0b0000001, 0b011) => {
2280                self.rv_mulh(rd, rs1, rs2, false, false, pc);
2281                let term = self.feed_gas_rv(RV_KIND_MULH, rs1, rs2, rd);
2282                (term, false, 0)
2283            }
2284            (0b0000001, 0b100) => {
2285                self.rv_div_rem(rd, rs1, rs2, true, false, false, pc);
2286                let term = self.feed_gas_rv(RV_KIND_DIV, rs1, rs2, rd);
2287                (term, false, 0)
2288            }
2289            (0b0000001, 0b101) => {
2290                self.rv_div_rem(rd, rs1, rs2, false, false, false, pc);
2291                let term = self.feed_gas_rv(RV_KIND_DIV, rs1, rs2, rd);
2292                (term, false, 0)
2293            }
2294            (0b0000001, 0b110) => {
2295                self.rv_div_rem(rd, rs1, rs2, true, true, false, pc);
2296                let term = self.feed_gas_rv(RV_KIND_DIV, rs1, rs2, rd);
2297                (term, false, 0)
2298            }
2299            (0b0000001, 0b111) => {
2300                self.rv_div_rem(rd, rs1, rs2, false, true, false, pc);
2301                let term = self.feed_gas_rv(RV_KIND_DIV, rs1, rs2, rd);
2302                (term, false, 0)
2303            }
2304            // Zbb inv / xnor / min / max
2305            (0b0100000, 0b111) => {
2306                self.rv_alu_rr(rd, rs1, rs2, AluOp::Andn, pc);
2307                let term = self.feed_gas_rv(RV_KIND_ZBB_INV, rs1, rs2, rd);
2308                (term, false, 0)
2309            }
2310            (0b0100000, 0b110) => {
2311                self.rv_alu_rr(rd, rs1, rs2, AluOp::Orn, pc);
2312                let term = self.feed_gas_rv(RV_KIND_ZBB_INV, rs1, rs2, rd);
2313                (term, false, 0)
2314            }
2315            (0b0100000, 0b100) => {
2316                self.rv_alu_rr(rd, rs1, rs2, AluOp::Xnor, pc);
2317                let term = self.feed_gas_rv(RV_KIND_ZBB_XNOR, rs1, rs2, rd);
2318                (term, false, 0)
2319            }
2320            (0b0000101, 0b100) => {
2321                self.rv_alu_rr(rd, rs1, rs2, AluOp::Min, pc);
2322                let term = self.feed_gas_rv(RV_KIND_ZBB_MINMAX, rs1, rs2, rd);
2323                (term, false, 0)
2324            }
2325            (0b0000101, 0b101) => {
2326                self.rv_alu_rr(rd, rs1, rs2, AluOp::Minu, pc);
2327                let term = self.feed_gas_rv(RV_KIND_ZBB_MINMAX, rs1, rs2, rd);
2328                (term, false, 0)
2329            }
2330            (0b0000101, 0b110) => {
2331                self.rv_alu_rr(rd, rs1, rs2, AluOp::Max, pc);
2332                let term = self.feed_gas_rv(RV_KIND_ZBB_MINMAX, rs1, rs2, rd);
2333                (term, false, 0)
2334            }
2335            (0b0000101, 0b111) => {
2336                self.rv_alu_rr(rd, rs1, rs2, AluOp::Maxu, pc);
2337                let term = self.feed_gas_rv(RV_KIND_ZBB_MINMAX, rs1, rs2, rd);
2338                (term, false, 0)
2339            }
2340            (0b0110000, 0b001) => {
2341                self.rv_shift_rr(rd, rs1, rs2, ShiftOp::Rol64, pc);
2342                let term = self.feed_gas_rv(RV_KIND_ZBB_ROT, rs1, rs2, rd);
2343                (term, false, 0)
2344            }
2345            (0b0110000, 0b101) => {
2346                self.rv_shift_rr(rd, rs1, rs2, ShiftOp::Ror64, pc);
2347                let term = self.feed_gas_rv(RV_KIND_ZBB_ROT, rs1, rs2, rd);
2348                (term, false, 0)
2349            }
2350            // Zba shift-add
2351            (0b0010000, 0b010) => {
2352                self.rv_shadd(rd, rs1, rs2, 1, false, pc);
2353                self.record_scaledadd(rd, rs1, rs2, 1);
2354                let term = self.feed_gas_rv(RV_KIND_ZBA, rs1, rs2, rd);
2355                (term, false, 0)
2356            }
2357            (0b0010000, 0b100) => {
2358                self.rv_shadd(rd, rs1, rs2, 2, false, pc);
2359                self.record_scaledadd(rd, rs1, rs2, 2);
2360                let term = self.feed_gas_rv(RV_KIND_ZBA, rs1, rs2, rd);
2361                (term, false, 0)
2362            }
2363            (0b0010000, 0b110) => {
2364                self.rv_shadd(rd, rs1, rs2, 3, false, pc);
2365                self.record_scaledadd(rd, rs1, rs2, 3);
2366                let term = self.feed_gas_rv(RV_KIND_ZBA, rs1, rs2, rd);
2367                (term, false, 0)
2368            }
2369            // Zbs
2370            (0b0100100, 0b001) => {
2371                self.rv_bit_rr(rd, rs1, rs2, BitOp::Clear, pc);
2372                let term = self.feed_gas_rv(RV_KIND_ZBS, rs1, rs2, rd);
2373                (term, false, 0)
2374            }
2375            (0b0010100, 0b001) => {
2376                self.rv_bit_rr(rd, rs1, rs2, BitOp::Set, pc);
2377                let term = self.feed_gas_rv(RV_KIND_ZBS, rs1, rs2, rd);
2378                (term, false, 0)
2379            }
2380            (0b0110100, 0b001) => {
2381                self.rv_bit_rr(rd, rs1, rs2, BitOp::Invert, pc);
2382                let term = self.feed_gas_rv(RV_KIND_ZBS, rs1, rs2, rd);
2383                (term, false, 0)
2384            }
2385            (0b0100100, 0b101) => {
2386                self.rv_bit_rr(rd, rs1, rs2, BitOp::Extract, pc);
2387                let term = self.feed_gas_rv(RV_KIND_ZBS, rs1, rs2, rd);
2388                (term, false, 0)
2389            }
2390            // Zicond
2391            (0b0000111, 0b101) => {
2392                self.rv_czero(rd, rs1, rs2, Cc::E, pc);
2393                let term = self.feed_gas_rv(RV_KIND_ZICOND, rs1, rs2, rd);
2394                (term, false, 0)
2395            }
2396            (0b0000111, 0b111) => {
2397                self.rv_czero(rd, rs1, rs2, Cc::NE, pc);
2398                let term = self.feed_gas_rv(RV_KIND_ZICOND, rs1, rs2, rd);
2399                (term, false, 0)
2400            }
2401            // Zbb zext.h via pack rd, rs1, x0
2402            (0b0000100, 0b100) if rs2 == 0 => {
2403                self.rv_unary(rd, rs1, UnaryOp::ZextH, pc);
2404                let term = self.feed_gas_rv(RV_KIND_ZBB_U1, rs1, 0, rd);
2405                (term, false, 0)
2406            }
2407            _ => {
2408                let _ = w;
2409                self.rv_emit_panic_at(pc);
2410                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2411                (true, false, 0)
2412            }
2413        }
2414    }
2415
2416    #[allow(clippy::too_many_arguments)]
2417    fn compile_op_32(
2418        &mut self,
2419        rd: u8,
2420        rs1: u8,
2421        rs2: u8,
2422        f3: u8,
2423        f7: u8,
2424        w: u32,
2425        pc: u32,
2426    ) -> (bool, bool, usize) {
2427        use javm_exec::gas_cost::*;
2428        match (f7, f3) {
2429            (0b0000000, 0b000) => {
2430                self.rv_alu_rr(rd, rs1, rs2, AluOp::Addw, pc);
2431                let term = self.feed_gas_rv(RV_KIND_ADDW, rs1, rs2, rd);
2432                (term, false, 0)
2433            }
2434            (0b0100000, 0b000) => {
2435                self.rv_alu_rr(rd, rs1, rs2, AluOp::Subw, pc);
2436                let term = self.feed_gas_rv(RV_KIND_ADDW, rs1, rs2, rd);
2437                (term, false, 0)
2438            }
2439            (0b0000000, 0b001) => {
2440                self.rv_shift_rr(rd, rs1, rs2, ShiftOp::Shl32, pc);
2441                let term = self.feed_gas_rv(RV_KIND_SLLW, rs1, rs2, rd);
2442                (term, false, 0)
2443            }
2444            (0b0000000, 0b101) => {
2445                self.rv_shift_rr(rd, rs1, rs2, ShiftOp::Shr32, pc);
2446                let term = self.feed_gas_rv(RV_KIND_SLLW, rs1, rs2, rd);
2447                (term, false, 0)
2448            }
2449            (0b0100000, 0b101) => {
2450                self.rv_shift_rr(rd, rs1, rs2, ShiftOp::Sar32, pc);
2451                let term = self.feed_gas_rv(RV_KIND_SLLW, rs1, rs2, rd);
2452                (term, false, 0)
2453            }
2454            (0b0000001, 0b000) => {
2455                self.rv_alu_rr(rd, rs1, rs2, AluOp::Mulw, pc);
2456                let term = self.feed_gas_rv(RV_KIND_MULW, rs1, rs2, rd);
2457                (term, false, 0)
2458            }
2459            (0b0000001, 0b100) => {
2460                self.rv_div_rem(rd, rs1, rs2, true, false, true, pc);
2461                let term = self.feed_gas_rv(RV_KIND_DIV, rs1, rs2, rd);
2462                (term, false, 0)
2463            }
2464            (0b0000001, 0b101) => {
2465                self.rv_div_rem(rd, rs1, rs2, false, false, true, pc);
2466                let term = self.feed_gas_rv(RV_KIND_DIV, rs1, rs2, rd);
2467                (term, false, 0)
2468            }
2469            (0b0000001, 0b110) => {
2470                self.rv_div_rem(rd, rs1, rs2, true, true, true, pc);
2471                let term = self.feed_gas_rv(RV_KIND_DIV, rs1, rs2, rd);
2472                (term, false, 0)
2473            }
2474            (0b0000001, 0b111) => {
2475                self.rv_div_rem(rd, rs1, rs2, false, true, true, pc);
2476                let term = self.feed_gas_rv(RV_KIND_DIV, rs1, rs2, rd);
2477                (term, false, 0)
2478            }
2479            (0b0110000, 0b001) => {
2480                self.rv_shift_rr(rd, rs1, rs2, ShiftOp::Rol32, pc);
2481                let term = self.feed_gas_rv(RV_KIND_ZBB_ROTW, rs1, rs2, rd);
2482                (term, false, 0)
2483            }
2484            (0b0110000, 0b101) => {
2485                self.rv_shift_rr(rd, rs1, rs2, ShiftOp::Ror32, pc);
2486                let term = self.feed_gas_rv(RV_KIND_ZBB_ROTW, rs1, rs2, rd);
2487                (term, false, 0)
2488            }
2489            (0b0000100, 0b000) => {
2490                self.rv_adduw(rd, rs1, rs2, pc);
2491                let term = self.feed_gas_rv(RV_KIND_ZBA, rs1, rs2, rd);
2492                (term, false, 0)
2493            }
2494            (0b0010000, 0b010) => {
2495                self.rv_shadd(rd, rs1, rs2, 1, true, pc);
2496                let term = self.feed_gas_rv(RV_KIND_ZBA, rs1, rs2, rd);
2497                (term, false, 0)
2498            }
2499            (0b0010000, 0b100) => {
2500                self.rv_shadd(rd, rs1, rs2, 2, true, pc);
2501                let term = self.feed_gas_rv(RV_KIND_ZBA, rs1, rs2, rd);
2502                (term, false, 0)
2503            }
2504            (0b0010000, 0b110) => {
2505                self.rv_shadd(rd, rs1, rs2, 3, true, pc);
2506                let term = self.feed_gas_rv(RV_KIND_ZBA, rs1, rs2, rd);
2507                (term, false, 0)
2508            }
2509            _ => {
2510                let _ = w;
2511                self.rv_emit_panic_at(pc);
2512                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2513                (true, false, 0)
2514            }
2515        }
2516    }
2517
2518    fn compile_lui(&mut self, rd: u8, w: u32, pc: u32, rest: &[u8]) -> (bool, bool, usize) {
2519        use javm_exec::gas_cost::*;
2520        let imm = imm_u(w);
2521
2522        // Lui→Add fusion: `lui rd, imm; add rd, rd, rs2` (4-byte) or the
2523        // RVC equivalent `c.add rd, rs2` collapses into one `lea rd, [rs2 +
2524        // imm]`. Only the same-rd Add case is fusable: if the Add writes a
2525        // different register, the LUI value is still live and we can't skip
2526        // its materialisation.
2527        if rd != 0
2528            && !rv_is_reserved(rd)
2529            && let Some((op, a_rd, a_rs1, a_rs2, consumed)) = peek_alu_rr_trailer(rest)
2530            && matches!(op, AluOp::Add)
2531            && a_rd == rd
2532        {
2533            let other = if a_rs1 == rd {
2534                Some(a_rs2)
2535            } else if a_rs2 == rd {
2536                Some(a_rs1)
2537            } else {
2538                None
2539            };
2540            if let Some(other) = other
2541                && (other == 0 || !rv_is_reserved(other))
2542            {
2543                if let Some(d) = self.rv_dst(a_rd, pc) {
2544                    if other == 0 {
2545                        // `add rd, rd, x0` = identity → rd stays as the LUI
2546                        // constant. Fall back to mov_ri64 + track_const so
2547                        // subsequent addr-folding still works.
2548                        self.asm.mov_ri64(d, imm as i64 as u64);
2549                        self.track_const(a_rd, imm);
2550                    } else {
2551                        let base = REG_MAP[rv_slot(other).unwrap()];
2552                        self.asm.lea(d, base, imm);
2553                        self.invalidate_reg(rv_slot(a_rd).unwrap());
2554                    }
2555                }
2556                self.feed_gas_rv(RV_KIND_LUI, 0, 0, rd);
2557                self.feed_gas_rv(RV_KIND_ADD, a_rs1, a_rs2, a_rd);
2558                // lea preserves CF in x86, but no RV-semantic Add was emitted
2559                // — clear so downstream Sltu can't fuse against stale CF.
2560                return (false, false, consumed);
2561            }
2562        }
2563
2564        self.rv_lui(rd, imm, pc);
2565        self.track_const(rd, imm);
2566        let term = self.feed_gas_rv(RV_KIND_LUI, 0, 0, rd);
2567        (term, false, 0)
2568    }
2569
2570    fn compile_jal(&mut self, rd: u8, w: u32, pc: u32, inst_len: u32) -> (bool, bool, usize) {
2571        use javm_exec::gas_cost::*;
2572        let imm = imm_j(w);
2573        let next_pc = pc + inst_len;
2574        self.rv_jal(rd, imm, pc, next_pc);
2575        let term = self.feed_gas_rv(RV_KIND_JAL, 0, 0, rd);
2576        (term, false, 0)
2577    }
2578
2579    fn compile_auipc(&mut self, rd: u8, w: u32, pc: u32) -> (bool, bool, usize) {
2580        use javm_exec::gas_cost::*;
2581        // auipc result is a compile-time constant: code_base + pc + imm.
2582        let imm = imm_u(w);
2583        self.rv_auipc(rd, imm, pc);
2584        // Gas: same kind/cost as LUI (a constant materialise).
2585        let term = self.feed_gas_rv(RV_KIND_LUI, 0, 0, rd);
2586        (term, false, 0)
2587    }
2588
2589    fn compile_jalr(
2590        &mut self,
2591        rd: u8,
2592        rs1: u8,
2593        w: u32,
2594        pc: u32,
2595        inst_len: u32,
2596    ) -> (bool, bool, usize) {
2597        use javm_exec::gas_cost::*;
2598        let imm = imm_i(w);
2599        let next_pc = pc + inst_len;
2600        self.rv_jalr(rd, rs1, imm, pc, next_pc);
2601        // src = rs1 (target); rd not tracked (terminator).
2602        let term = self.feed_gas_rv(RV_KIND_JALR, rs1, 0, 0);
2603        (term, false, 0)
2604    }
2605
2606    fn compile_branch(&mut self, rs1: u8, rs2: u8, f3: u8, w: u32, pc: u32) -> (bool, bool, usize) {
2607        use javm_exec::gas_cost::*;
2608        let imm = imm_b(w);
2609        let next_pc = pc + 4;
2610        let cc = match f3 {
2611            0b000 => Cc::E,
2612            0b001 => Cc::NE,
2613            0b100 => Cc::L,
2614            0b101 => Cc::GE,
2615            0b110 => Cc::B,
2616            0b111 => Cc::AE,
2617            _ => {
2618                self.rv_emit_panic_at(pc);
2619                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2620                return (true, false, 0);
2621            }
2622        };
2623        self.rv_branch(rs1, rs2, imm, cc, pc, next_pc);
2624        let term = self.feed_gas_rv(RV_KIND_BRANCH, rs1, rs2, 0);
2625        (term, false, 0)
2626    }
2627
2628    fn compile_custom_0(
2629        &mut self,
2630        _rd: u8,
2631        _rs1: u8,
2632        f3: u8,
2633        w: u32,
2634        pc: u32,
2635    ) -> (bool, bool, usize) {
2636        use javm_exec::gas_cost::*;
2637        // PVM2 custom-0 encoding:
2638        //   f3=000 → trap     (other fields ignored)
2639        //   f3=001 → ecall.jar
2640        //   f3=010 → ecalli imm
2641        //   f3=100 → fallthrough (terminator no-op)
2642        //   f3=011 (was br_table) → reserved; PVM2 uses plain jalr.
2643        let next_pc = pc + 4;
2644        match f3 {
2645            0b000 => {
2646                self.rv_trap(pc);
2647                let term = self.feed_gas_rv(RV_KIND_TRAP, 0, 0, 0);
2648                (term, false, 0)
2649            }
2650            0b001 => {
2651                self.rv_ecall_jar(next_pc);
2652                let term = self.feed_gas_rv(RV_KIND_ECALL_JAR, 0, 0, 0);
2653                (term, false, 0)
2654            }
2655            0b010 => {
2656                let imm = imm_i(w);
2657                self.rv_ecalli(imm, next_pc);
2658                let term = self.feed_gas_rv(RV_KIND_ECALLI, 0, 0, 0);
2659                (term, false, 0)
2660            }
2661            0b100 => {
2662                let term = self.feed_gas_rv(RV_KIND_FALLTHROUGH, 0, 0, 0);
2663                (term, false, 0)
2664            }
2665            _ => {
2666                self.rv_emit_panic_at(pc);
2667                self.feed_gas_rv(RV_KIND_RESERVED, 0, 0, 0);
2668                (true, false, 0)
2669            }
2670        }
2671    }
2672
2673    /// Byte-based Mul-pair fusion: a 64-bit `mul rd1, rs1, rs2` followed
2674    /// by `mulh`/`mulhu rd2, rs1, rs2` (same operand pair, different
2675    /// destination) folds into a single x86 mul/imul that produces
2676    /// RDX:RAX. Returns `Some(extra_bytes_consumed)` on success.
2677    fn try_fuse_mul_pair_bytes(
2678        &mut self,
2679        m_rd: u8,
2680        m_rs1: u8,
2681        m_rs2: u8,
2682        rest: &[u8],
2683        _pc: u32,
2684    ) -> Option<usize> {
2685        use javm_exec::gas_cost::*;
2686        if rest.len() < 4 {
2687            return None;
2688        }
2689        let w2 = u32::from_le_bytes([rest[0], rest[1], rest[2], rest[3]]);
2690        // Mulh: f7=0000001 f3=001. Mulhu: f7=0000001 f3=011.
2691        // Mask catches both: opcode 0x33 + f7=1 + (f3=001 or f3=011).
2692        let signed = match w2 & 0xFE00_707F {
2693            0x0200_1033 => true,  // Mulh
2694            0x0200_3033 => false, // Mulhu
2695            _ => return None,
2696        };
2697        let u_rd = ((w2 >> 7) & 0x1F) as u8;
2698        let u_rs1 = ((w2 >> 15) & 0x1F) as u8;
2699        let u_rs2 = ((w2 >> 20) & 0x1F) as u8;
2700        if u_rs1 != m_rs1 || u_rs2 != m_rs2 || u_rd == m_rd {
2701            return None;
2702        }
2703        if rv_is_reserved(m_rd) || rv_is_reserved(u_rd) {
2704            return None;
2705        }
2706        if rv_is_reserved(m_rs1) || rv_is_reserved(m_rs2) {
2707            return None;
2708        }
2709        let (rs1_slot, rs2_slot) = (rv_slot(m_rs1)?, rv_slot(m_rs2)?);
2710        let (lo_slot, hi_slot) = (rv_slot(m_rd)?, rv_slot(u_rd)?);
2711
2712        let a = REG_MAP[rs1_slot];
2713        let b = REG_MAP[rs2_slot];
2714        let rd_lo = REG_MAP[lo_slot];
2715        let rd_hi = REG_MAP[hi_slot];
2716        let phi11 = REG_MAP[11];
2717
2718        let need_save_phi11 = rd_lo != phi11 && rd_hi != phi11;
2719        if need_save_phi11 {
2720            self.asm.push(phi11);
2721        }
2722        let mul_src = if b == phi11 {
2723            if need_save_phi11 {
2724                self.asm.mov_load64(SCRATCH, Reg::RSP, 0);
2725            } else {
2726                self.asm.mov_rr(SCRATCH, b);
2727            }
2728            SCRATCH
2729        } else {
2730            b
2731        };
2732        if a != phi11 {
2733            self.asm.mov_rr(phi11, a);
2734        }
2735        if signed {
2736            self.asm.imul_rdx_rax(mul_src);
2737        } else {
2738            self.asm.mul_rdx_rax(mul_src);
2739        }
2740        if rd_lo != phi11 {
2741            self.asm.mov_rr(rd_lo, phi11);
2742        }
2743        if rd_hi != Reg::RDX {
2744            self.asm.mov_rr(rd_hi, Reg::RDX);
2745        }
2746        if need_save_phi11 {
2747            self.asm.pop(phi11);
2748        }
2749
2750        self.invalidate_reg(lo_slot);
2751        self.invalidate_reg(hi_slot);
2752        self.last_add_cf = None;
2753
2754        // Feed gas for both consumed instructions (Mul + Mulh/Mulhu).
2755        // Both Mulh and Mulhu use RV_KIND_MULH per the gas table.
2756        let _ = signed;
2757        self.feed_gas_rv(RV_KIND_MUL, m_rs1, m_rs2, m_rd);
2758        self.feed_gas_rv(RV_KIND_MULH, u_rs1, u_rs2, u_rd);
2759
2760        Some(4)
2761    }
2762
2763    // ----------------------------------------------------------------
2764    // RV-side helpers — resolve x0/x3/x4 aliases and call through asm.
2765    // ----------------------------------------------------------------
2766
2767    /// Read RV source register into `dst_reg`. x0 → load 0; `x3`/`x4`
2768    /// (host-spilled) → load from `JitContext.regs[13|14]`; x16..x31 → panic.
2769    fn rv_read(&mut self, rs: u8, dst_reg: Reg, pc: u32) {
2770        if rs == 0 {
2771            self.asm.mov_ri64(dst_reg, 0);
2772        } else if reg_is_spilled(rs) {
2773            self.asm.mov_load64_rip_rel(dst_reg, spill_va(rs));
2774        } else if rv_is_reserved(rs) {
2775            self.rv_emit_panic_at(pc);
2776        } else {
2777            self.asm.mov_rr(dst_reg, REG_MAP[rv_slot(rs).unwrap()]);
2778        }
2779    }
2780
2781    /// Return the x86 register holding rs's value. For x0, materialise 0
2782    /// into `scratch` and return `scratch`. For a host-spilled `x3`/`x4`,
2783    /// load from `JitContext.regs[13|14]` into `scratch` and return it.
2784    fn rv_read_into(&mut self, rs: u8, scratch: Reg, pc: u32) -> Reg {
2785        if rs == 0 {
2786            self.asm.mov_ri64(scratch, 0);
2787            scratch
2788        } else if reg_is_spilled(rs) {
2789            self.asm.mov_load64_rip_rel(scratch, spill_va(rs));
2790            scratch
2791        } else if rv_is_reserved(rs) {
2792            self.rv_emit_panic_at(pc);
2793            scratch
2794        } else {
2795            REG_MAP[rv_slot(rs).unwrap()]
2796        }
2797    }
2798
2799    /// Resolve an RV destination register. None when rd == x0 (discard).
2800    /// x3/x4 emit a panic and return None.
2801    fn rv_dst(&mut self, rd: u8, pc: u32) -> Option<Reg> {
2802        if rd == 0 {
2803            None
2804        } else if rv_is_reserved(rd) {
2805            self.rv_emit_panic_at(pc);
2806            None
2807        } else {
2808            Some(REG_MAP[rv_slot(rd).unwrap()])
2809        }
2810    }
2811
2812    // ---- LUI ---------------------------------------------------------
2813
2814    fn rv_lui(&mut self, rd: u8, imm: i32, pc: u32) {
2815        if let Some(d) = self.rv_dst(rd, pc) {
2816            // imm has bits in [31:12]; sign-extend to 64.
2817            self.asm.mov_ri64(d, imm as i64 as u64);
2818            self.invalidate_reg(rv_slot(rd).unwrap());
2819        }
2820    }
2821
2822    /// `auipc rd, imm` — `rd = (code_base + pc) + imm`. Both addends
2823    /// are compile-time constants, so this materialises a single
2824    /// constant (mirrors the interpreter's `Auipc` arm exactly). The
2825    /// value is a guest VA, sign-extended 32→64 like `lui`.
2826    fn rv_auipc(&mut self, rd: u8, imm: i32, pc: u32) {
2827        if let Some(d) = self.rv_dst(rd, pc) {
2828            let va = self.code_base.wrapping_add(pc).wrapping_add(imm as u32);
2829            self.asm.mov_ri64(d, va as i32 as i64 as u64);
2830            self.invalidate_reg(rv_slot(rd).unwrap());
2831        }
2832    }
2833
2834    // ---- Loads / stores ---------------------------------------------
2835
2836    fn rv_load(&mut self, rd: u8, rs1: u8, imm: i32, width: u32, signed: bool, pc: u32) {
2837        if rv_is_reserved(rd) || rv_is_reserved(rs1) {
2838            self.rv_emit_panic_at(pc);
2839            return;
2840        }
2841        self.rv_addr_to_scratch(rs1, imm, pc);
2842        let fn_addr = match width {
2843            1 => self.helpers.mem_read_u8,
2844            2 => self.helpers.mem_read_u16,
2845            4 => self.helpers.mem_read_u32,
2846            _ => self.helpers.mem_read_u64,
2847        };
2848        let dst = match self.rv_dst(rd, pc) {
2849            Some(r) => r,
2850            None => SCRATCH, // x0: load discarded but trap-on-OOB still fires
2851        };
2852        self.emit_mem_read_sized(dst, fn_addr, width, pc);
2853        if signed && width < 8 && rd != 0 {
2854            match width {
2855                1 => self.asm.movsx_8_64(dst, dst),
2856                2 => self.asm.movsx_16_64(dst, dst),
2857                4 => self.asm.movsxd(dst, dst),
2858                _ => {}
2859            }
2860        }
2861        if rd != 0 {
2862            self.invalidate_reg(rv_slot(rd).unwrap());
2863        }
2864    }
2865
2866    fn rv_store(&mut self, rs1: u8, rs2: u8, imm: i32, width: u32, pc: u32) {
2867        if rv_is_reserved(rs1) || rv_is_reserved(rs2) {
2868            self.rv_emit_panic_at(pc);
2869            return;
2870        }
2871        let fn_addr = match width {
2872            1 => self.helpers.mem_write_u8,
2873            2 => self.helpers.mem_write_u16,
2874            4 => self.helpers.mem_write_u32,
2875            _ => self.helpers.mem_write_u64,
2876        };
2877        if rs2 == 0 {
2878            // Materialise 0 into a temp register so SCRATCH can hold the
2879            // addr. Compute the address FIRST — rs1 might map to RAX
2880            // (x14), in which case clobbering RAX before reading rs1
2881            // would feed the address calc the wrong value.
2882            self.rv_addr_to_scratch(rs1, imm, pc);
2883            self.asm.push(Reg::RAX);
2884            self.asm.mov_ri64(Reg::RAX, 0);
2885            self.emit_mem_write(true, Reg::RAX, fn_addr, pc);
2886            self.asm.pop(Reg::RAX);
2887        } else {
2888            let val = REG_MAP[rv_slot(rs2).unwrap()];
2889            self.rv_addr_to_scratch(rs1, imm, pc);
2890            self.emit_mem_write(true, val, fn_addr, pc);
2891        }
2892    }
2893
2894    /// Build `addr = (rs1 + imm) & 0xFFFFFFFF` into SCRATCH.
2895    fn rv_addr_to_scratch(&mut self, rs1: u8, imm: i32, pc: u32) {
2896        use super::codegen::RegDef;
2897        if rs1 == 0 {
2898            self.asm.mov_ri32(SCRATCH, imm as u32);
2899            return;
2900        }
2901        if rv_is_reserved(rs1) {
2902            self.rv_emit_panic_at(pc);
2903            return;
2904        }
2905        // Ported from PVM's emit_addr_to_scratch peephole: fold a known
2906        // constant address (set by `addi rd, x0, imm` / `lui`) directly
2907        // into the immediate, skipping the lea/movzx entirely.
2908        let slot = rv_slot(rs1).unwrap();
2909        if let RegDef::Const(addr) = self.reg_defs[slot] {
2910            let effective = addr.wrapping_add(imm as u32);
2911            self.asm.mov_ri32(SCRATCH, effective);
2912            return;
2913        }
2914        // Use SIB addressing for scaled-index patterns when imm == 0
2915        // (sh{1,2,3}add or slli+add chains tracked via reg_defs).
2916        // Tracking guarantees rd didn't alias rs1/rs2 (record_scaledadd
2917        // refuses self-referential defs), so base/idx still hold their
2918        // pre-emit values at the consumer site.
2919        if imm == 0
2920            && let RegDef::ScaledAdd { base, idx, shift } = self.reg_defs[slot]
2921        {
2922            self.asm
2923                .lea_sib_scaled_32(SCRATCH, REG_MAP[base], REG_MAP[idx], shift);
2924            return;
2925        }
2926        let base = REG_MAP[slot];
2927        if imm != 0 {
2928            self.asm.lea_32(SCRATCH, base, imm);
2929        } else {
2930            self.asm.movzx_32_64(SCRATCH, base);
2931        }
2932    }
2933
2934    // ---- ALU --------------------------------------------------------
2935
2936    fn rv_alu_imm(&mut self, rd: u8, rs1: u8, imm: i32, op: AluImmOp, pc: u32) {
2937        let Some(d) = self.rv_dst(rd, pc) else { return };
2938        // Phase 5: `addi rd, x0, imm` is the canonical RV "li" form. The
2939        // generic path would emit `xor d, d; add d, imm` (2 ops); we can
2940        // do it as a single sign-extended move.
2941        if rs1 == 0 && matches!(op, AluImmOp::Add) {
2942            self.asm.mov_ri64(d, imm as i64 as u64);
2943            self.invalidate_reg(rv_slot(rd).unwrap());
2944            return;
2945        }
2946        self.rv_read(rs1, d, pc);
2947        match op {
2948            AluImmOp::Add => self.asm.add_ri(d, imm),
2949            AluImmOp::And => self.asm.and_ri(d, imm),
2950            AluImmOp::Or => self.asm.or_ri(d, imm),
2951            AluImmOp::Xor => self.asm.xor_ri(d, imm),
2952            AluImmOp::Addw => {
2953                self.asm.add_ri32(d, imm);
2954                self.asm.movsxd(d, d);
2955            }
2956        }
2957        if rd != 0 {
2958            self.invalidate_reg(rv_slot(rd).unwrap());
2959        }
2960    }
2961
2962    fn rv_alu_rr(&mut self, rd: u8, rs1: u8, rs2: u8, op: AluOp, pc: u32) {
2963        let Some(d) = self.rv_dst(rd, pc) else { return };
2964        if rv_is_reserved(rs1) || rv_is_reserved(rs2) {
2965            self.rv_emit_panic_at(pc);
2966            return;
2967        }
2968        // Phase 5: `add rd, rs, x0` / `add rd, x0, rs` — canonical RV `mv`.
2969        // Generic path emits `mov SCRATCH, 0; mov d, rs; add d, SCRATCH`
2970        // (or `xor d, d; add d, rs`); the single `mov d, rs` (with rs2=x0
2971        // src=rs1, or vice versa) is one op. mov_rr doesn't touch CF.
2972        //
2973        // This path bypasses the Phase 4 last_add_cf set at the bottom,
2974        // and the main-loop clearing keeps last_add_cf alive across the
2975        // Add instruction. If the mv's rd was the previous add's D/A/B,
2976        // the carry handoff is no longer meaningful — clear conservatively.
2977        if matches!(op, AluOp::Add) && (rs1 == 0 || rs2 == 0) {
2978            let src = if rs1 == 0 { rs2 } else { rs1 };
2979            self.rv_read(src, d, pc);
2980            self.invalidate_reg(rv_slot(rd).unwrap());
2981            self.last_add_cf = None;
2982            return;
2983        }
2984        // PVM-ported peephole: `sub rd, rs1, rs2` where rd_slot == rs2_slot
2985        // and rs1 != rs2. Generic path snapshots rs2 to SCRATCH (because d
2986        // aliases rs2), then mov d, rs1, then sub d, SCRATCH — 3 ops.
2987        // We can compute the same result as `neg d; add d, rs1` in 2 ops
2988        // since d already holds rs2's value.
2989        if matches!(op, AluOp::Sub) && rs1 != 0 && rs2 != 0 && rs1 != rs2 {
2990            let r1_x86 = REG_MAP[rv_slot(rs1).unwrap()];
2991            let r2_x86 = REG_MAP[rv_slot(rs2).unwrap()];
2992            if d == r2_x86 {
2993                self.asm.neg64(d);
2994                self.asm.add_rr(d, r1_x86);
2995                self.invalidate_reg(rv_slot(rd).unwrap());
2996                self.last_add_cf = None;
2997                return;
2998            }
2999        }
3000        // Aliasing analysis: rv_read(rs1, d) might write d, which can
3001        // clobber rs2's value if rd's slot equals rs2's slot. Save rs2
3002        // into SCRATCH first whenever d aliases rs2 (and rs2 != rs1).
3003        // x0 is handled specially since it has no mapped register.
3004        let r1_is_x0 = rs1 == 0;
3005        let r2_is_x0 = rs2 == 0;
3006        let r1 = if r1_is_x0 {
3007            None
3008        } else {
3009            Some(REG_MAP[rv_slot(rs1).unwrap()])
3010        };
3011        let r2 = if r2_is_x0 {
3012            None
3013        } else {
3014            Some(REG_MAP[rv_slot(rs2).unwrap()])
3015        };
3016
3017        let b_reg = if r2_is_x0 {
3018            // rs2 == 0: materialise 0 in SCRATCH. rv_read of rs1 below
3019            // won't touch SCRATCH (mov_rr / mov_ri64).
3020            self.asm.mov_ri64(SCRATCH, 0);
3021            SCRATCH
3022        } else if Some(d) == r2 && r1 != r2 {
3023            // d aliases r2 and rs1 != rs2 — rv_read(rs1, d) would
3024            // clobber rs2. Snapshot rs2 into SCRATCH first.
3025            self.asm.mov_rr(SCRATCH, r2.unwrap());
3026            SCRATCH
3027        } else {
3028            r2.unwrap()
3029        };
3030        // Now safe to load rs1 into d.
3031        self.rv_read(rs1, d, pc);
3032        self.apply_alu_op(op, d, b_reg);
3033        if rd != 0 {
3034            self.invalidate_reg(rv_slot(rd).unwrap());
3035        }
3036        // Phase 4: record carry-flag handoff. Only 64-bit `add` sets CF
3037        // in a way that matches a subsequent `sltu rd, rs1, rs2` checking
3038        // unsigned overflow of rs1+rs2. Addw operates on the 32-bit view
3039        // and sign-extends — CF reflects 32-bit overflow, not 64-bit,
3040        // so a 64-bit sltu against the sign-extended sum would be wrong.
3041        // Skip x0 source/dest cases: degenerate, not worth tracking.
3042        if matches!(op, AluOp::Add)
3043            && rd != 0
3044            && rs1 != 0
3045            && rs2 != 0
3046            && let (Some(d_s), Some(a_s), Some(b_s)) = (rv_slot(rd), rv_slot(rs1), rv_slot(rs2))
3047        {
3048            self.last_add_cf = Some((d_s, a_s, b_s));
3049        }
3050    }
3051
3052    fn apply_alu_op(&mut self, op: AluOp, d: Reg, s: Reg) {
3053        match op {
3054            AluOp::Add => self.asm.add_rr(d, s),
3055            AluOp::Sub => self.asm.sub_rr(d, s),
3056            AluOp::And => self.asm.and_rr(d, s),
3057            AluOp::Or => self.asm.or_rr(d, s),
3058            AluOp::Xor => self.asm.xor_rr(d, s),
3059            AluOp::Mul => self.asm.imul_rr(d, s),
3060            AluOp::Addw => {
3061                self.asm.add_rr32(d, s);
3062                self.asm.movsxd(d, d);
3063            }
3064            AluOp::Subw => {
3065                self.asm.sub_rr32(d, s);
3066                self.asm.movsxd(d, d);
3067            }
3068            AluOp::Mulw => {
3069                self.asm.imul_rr32(d, s);
3070                self.asm.movsxd(d, d);
3071            }
3072            AluOp::Min => {
3073                self.asm.cmp_rr(d, s);
3074                self.asm.cmovcc(Cc::G, d, s);
3075            }
3076            AluOp::Max => {
3077                self.asm.cmp_rr(d, s);
3078                self.asm.cmovcc(Cc::L, d, s);
3079            }
3080            AluOp::Minu => {
3081                self.asm.cmp_rr(d, s);
3082                self.asm.cmovcc(Cc::A, d, s);
3083            }
3084            AluOp::Maxu => {
3085                self.asm.cmp_rr(d, s);
3086                self.asm.cmovcc(Cc::B, d, s);
3087            }
3088            AluOp::Andn => {
3089                self.asm.mov_rr(SCRATCH, s);
3090                self.asm.not64(SCRATCH);
3091                self.asm.and_rr(d, SCRATCH);
3092            }
3093            AluOp::Orn => {
3094                self.asm.mov_rr(SCRATCH, s);
3095                self.asm.not64(SCRATCH);
3096                self.asm.or_rr(d, SCRATCH);
3097            }
3098            AluOp::Xnor => {
3099                self.asm.xor_rr(d, s);
3100                self.asm.not64(d);
3101            }
3102        }
3103    }
3104
3105    fn rv_slt_imm(&mut self, rd: u8, rs1: u8, imm: i32, signed: bool, pc: u32) {
3106        let Some(d) = self.rv_dst(rd, pc) else { return };
3107        if rv_is_reserved(rs1) {
3108            self.rv_emit_panic_at(pc);
3109            return;
3110        }
3111        // Snapshot rs1 into SCRATCH if d aliases its register — zeroing
3112        // d below would otherwise clobber rs1 before the cmp.
3113        let src = if rs1 == 0 {
3114            self.asm.mov_ri64(SCRATCH, 0);
3115            SCRATCH
3116        } else {
3117            let r1 = REG_MAP[rv_slot(rs1).unwrap()];
3118            if d == r1 {
3119                self.asm.mov_rr(SCRATCH, r1);
3120                SCRATCH
3121            } else {
3122                r1
3123            }
3124        };
3125        // Zero d FIRST (mov_ri64 with 0 uses XOR → clobbers flags).
3126        // Then cmp sets flags fresh for setcc.
3127        self.asm.mov_ri64(d, 0);
3128        self.asm.cmp_ri(src, imm);
3129        self.asm.setcc(if signed { Cc::L } else { Cc::B }, d);
3130        if rd != 0 {
3131            self.invalidate_reg(rv_slot(rd).unwrap());
3132        }
3133    }
3134
3135    fn rv_slt_rr(&mut self, rd: u8, rs1: u8, rs2: u8, signed: bool, pc: u32) {
3136        let Some(d) = self.rv_dst(rd, pc) else { return };
3137        if rv_is_reserved(rs1) || rv_is_reserved(rs2) {
3138            self.rv_emit_panic_at(pc);
3139            return;
3140        }
3141        // Phase 4: carry-flag fast path for `sltu d, rs1, rs2` immediately
3142        // following `add rs1, A, B` (with rs2 ∈ {A, B}). CF already holds
3143        // the unsigned-overflow bit, so we skip the cmp and emit just
3144        // `setb d` + zero-extension. Mirrors PVM's SetLtU fusion.
3145        //
3146        // If the conditions don't match, the general path below emits
3147        // `mov_ri64(d, 0); cmp; setcc` — the first of which clobbers CF
3148        // via xor. last_add_cf is single-shot: cleared on entry to keep
3149        // any *subsequent* sltu from reading the (now-stale) add flags.
3150        if !signed && let Some((add_d, add_a, add_b)) = self.last_add_cf {
3151            let rs1_s = rv_slot(rs1);
3152            let rs2_s = rv_slot(rs2);
3153            let rd_s = rv_slot(rd);
3154            if let (Some(rs1_s), Some(rs2_s), Some(rd_s)) = (rs1_s, rs2_s, rd_s)
3155                && rs1_s == add_d
3156                && rs2_s != add_d
3157                && (rs2_s == add_a || rs2_s == add_b)
3158                && rd_s != rs2_s
3159            {
3160                // CF is valid. Zero d first via mov_ri32 (`mov r32, 0`,
3161                // no flag effect), then setb writes the low byte. This
3162                // avoids the partial-register dependency that a bare
3163                // `setcc; movzx` sequence would create.
3164                self.asm.mov_ri32(d, 0);
3165                self.asm.setcc(Cc::B, d);
3166                self.invalidate_reg(rd_s);
3167                // setb/movzx don't touch CF — a *further* consecutive sltu
3168                // against the same add still has the live carry available,
3169                // so leave last_add_cf intact.
3170                return;
3171            }
3172        }
3173        // Fell through: the general path below clobbers CF. Clear the
3174        // tracked carry so a subsequent sltu doesn't fuse spuriously.
3175        self.last_add_cf = None;
3176        // Snapshot operands into SCRATCH and/or read original mapped
3177        // registers BEFORE touching d. Zero d up front; the cmp below
3178        // sets flags fresh for the setcc.
3179        let r1 = if rs1 == 0 {
3180            None
3181        } else {
3182            Some(REG_MAP[rv_slot(rs1).unwrap()])
3183        };
3184        let r2 = if rs2 == 0 {
3185            None
3186        } else {
3187            Some(REG_MAP[rv_slot(rs2).unwrap()])
3188        };
3189        // Choose registers for a and b without writing d yet.
3190        // Strategy: if d aliases r1 or r2, snapshot one of them to
3191        // SCRATCH. We only have one SCRATCH (RDX) so handle carefully.
3192        let (a_reg, b_reg) = match (r1, r2) {
3193            (Some(ra), Some(rb)) => {
3194                if d == ra && d == rb {
3195                    // Both r1 and r2 are d. cmp d, d → ZF=1 always; SLT=0.
3196                    (ra, rb)
3197                } else if d == ra {
3198                    // We'll write d = 0 then load a into d. But that
3199                    // overwrites b if d == ra... wait, ra is d. Snapshot
3200                    // ra into SCRATCH BEFORE zeroing d.
3201                    self.asm.mov_rr(SCRATCH, ra);
3202                    (SCRATCH, rb)
3203                } else if d == rb {
3204                    self.asm.mov_rr(SCRATCH, rb);
3205                    (ra, SCRATCH)
3206                } else {
3207                    (ra, rb)
3208                }
3209            }
3210            (None, Some(rb)) => {
3211                // a is x0. result = (0 < rb), i.e. (rb > 0) signed or
3212                // (rb != 0) unsigned. Cc::G == "ZF=0 && SF=0" after a
3213                // test against self (OF=0), so it captures rb > 0
3214                // signed; Cc::A == "ZF=0" after the same test, capturing
3215                // rb != 0 (= 0 < rb unsigned).
3216                if d == rb {
3217                    // Snapshot rb (d will be clobbered to receive the
3218                    // setcc byte). mov_rr does not clobber flags but we
3219                    // haven't set them yet; the test_rr below sets fresh
3220                    // flags after mov_ri64 (which uses XOR and clobbers
3221                    // flags). Order matters.
3222                    self.asm.mov_rr(SCRATCH, rb);
3223                    self.asm.mov_ri64(d, 0);
3224                    self.asm.test_rr(SCRATCH, SCRATCH);
3225                    self.asm.setcc(if signed { Cc::G } else { Cc::A }, d);
3226                    if rd != 0 {
3227                        self.invalidate_reg(rv_slot(rd).unwrap());
3228                    }
3229                    return;
3230                } else {
3231                    self.asm.mov_ri64(d, 0);
3232                    self.asm.test_rr(rb, rb);
3233                    self.asm.setcc(if signed { Cc::G } else { Cc::A }, d);
3234                    if rd != 0 {
3235                        self.invalidate_reg(rv_slot(rd).unwrap());
3236                    }
3237                    return;
3238                }
3239            }
3240            (Some(ra), None) => {
3241                // b is x0.
3242                if d == ra {
3243                    self.asm.mov_rr(SCRATCH, ra);
3244                    self.asm.mov_ri64(d, 0);
3245                    self.asm.cmp_ri(SCRATCH, 0);
3246                    self.asm.setcc(if signed { Cc::L } else { Cc::B }, d);
3247                    if rd != 0 {
3248                        self.invalidate_reg(rv_slot(rd).unwrap());
3249                    }
3250                    return;
3251                } else {
3252                    // cmp ra, 0 — no need for SCRATCH.
3253                    self.asm.mov_ri64(d, 0);
3254                    self.asm.cmp_ri(ra, 0);
3255                    self.asm.setcc(if signed { Cc::L } else { Cc::B }, d);
3256                    if rd != 0 {
3257                        self.invalidate_reg(rv_slot(rd).unwrap());
3258                    }
3259                    return;
3260                }
3261            }
3262            (None, None) => {
3263                // x0 < x0 — always false; d = 0.
3264                self.asm.mov_ri64(d, 0);
3265                if rd != 0 {
3266                    self.invalidate_reg(rv_slot(rd).unwrap());
3267                }
3268                return;
3269            }
3270        };
3271        // a_reg and b_reg now point at the actual values.
3272        self.asm.mov_ri64(d, 0);
3273        self.asm.cmp_rr(a_reg, b_reg);
3274        self.asm.setcc(if signed { Cc::L } else { Cc::B }, d);
3275        if rd != 0 {
3276            self.invalidate_reg(rv_slot(rd).unwrap());
3277        }
3278    }
3279
3280    // ---- Shifts -----------------------------------------------------
3281
3282    fn rv_shift_imm(&mut self, rd: u8, rs1: u8, shamt: u8, op: ShiftOp, pc: u32) {
3283        let Some(d) = self.rv_dst(rd, pc) else { return };
3284        self.rv_read(rs1, d, pc);
3285        match op {
3286            ShiftOp::Shl64 => self.asm.shl_ri64(d, shamt & 63),
3287            ShiftOp::Shr64 => self.asm.shr_ri64(d, shamt & 63),
3288            ShiftOp::Sar64 => self.asm.sar_ri64(d, shamt & 63),
3289            ShiftOp::Shl32 => {
3290                self.asm.shl_ri32(d, shamt & 31);
3291                self.asm.movsxd(d, d);
3292            }
3293            ShiftOp::Shr32 => {
3294                self.asm.movzx_32_64(d, d);
3295                self.asm.shr_ri32(d, shamt & 31);
3296                self.asm.movsxd(d, d);
3297            }
3298            ShiftOp::Sar32 => {
3299                self.asm.sar_ri32(d, shamt & 31);
3300                self.asm.movsxd(d, d);
3301            }
3302            ShiftOp::Ror64 => self.asm.ror_ri64(d, shamt & 63),
3303            ShiftOp::Ror32 => {
3304                self.asm.movzx_32_64(d, d);
3305                self.asm.ror_ri32(d, shamt & 31);
3306                self.asm.movsxd(d, d);
3307            }
3308            ShiftOp::Rol64 | ShiftOp::Rol32 => {
3309                // No imm-rol instruction in PVM2 — should not reach.
3310                self.rv_emit_panic_at(pc);
3311            }
3312        }
3313        if rd != 0 {
3314            self.invalidate_reg(rv_slot(rd).unwrap());
3315        }
3316    }
3317
3318    fn rv_shift_rr(&mut self, rd: u8, rs1: u8, rs2: u8, op: ShiftOp, pc: u32) {
3319        let Some(d) = self.rv_dst(rd, pc) else { return };
3320        if rv_is_reserved(rs1) || rv_is_reserved(rs2) {
3321            self.rv_emit_panic_at(pc);
3322            return;
3323        }
3324        // Snapshot rs2 to SCRATCH if d would clobber it.
3325        let r2 = if rs2 == 0 {
3326            None
3327        } else {
3328            Some(REG_MAP[rv_slot(rs2).unwrap()])
3329        };
3330        let r1 = if rs1 == 0 {
3331            None
3332        } else {
3333            Some(REG_MAP[rv_slot(rs1).unwrap()])
3334        };
3335        let shift_src = if rs2 == 0 {
3336            self.asm.mov_ri64(SCRATCH, 0);
3337            SCRATCH
3338        } else if Some(d) == r2 && r1 != r2 {
3339            self.asm.mov_rr(SCRATCH, r2.unwrap());
3340            SCRATCH
3341        } else {
3342            r2.unwrap()
3343        };
3344        self.rv_read(rs1, d, pc);
3345        let sub_op: u8 = match op {
3346            ShiftOp::Shl64 | ShiftOp::Shl32 => 4,
3347            ShiftOp::Shr64 | ShiftOp::Shr32 => 5,
3348            ShiftOp::Sar64 | ShiftOp::Sar32 => 7,
3349            ShiftOp::Rol64 | ShiftOp::Rol32 => 0,
3350            ShiftOp::Ror64 | ShiftOp::Ror32 => 1,
3351        };
3352        let is_32 = matches!(
3353            op,
3354            ShiftOp::Shl32 | ShiftOp::Shr32 | ShiftOp::Sar32 | ShiftOp::Rol32 | ShiftOp::Ror32
3355        );
3356        if is_32 {
3357            if matches!(op, ShiftOp::Shr32 | ShiftOp::Ror32) {
3358                self.asm.movzx_32_64(d, d);
3359            }
3360            self.emit_shift_by_reg32(d, shift_src, sub_op);
3361            self.asm.movsxd(d, d);
3362        } else {
3363            self.emit_shift_by_reg64(d, shift_src, sub_op);
3364        }
3365        if rd != 0 {
3366            self.invalidate_reg(rv_slot(rd).unwrap());
3367        }
3368    }
3369
3370    // ---- Multiply-high ----------------------------------------------
3371
3372    fn rv_mulh(&mut self, rd: u8, rs1: u8, rs2: u8, a_signed: bool, b_signed: bool, pc: u32) {
3373        let Some(d) = self.rv_dst(rd, pc) else { return };
3374        if rv_is_reserved(rs1) || rv_is_reserved(rs2) {
3375            self.rv_emit_panic_at(pc);
3376            return;
3377        }
3378        // Spill RAX (if d != RAX) and materialise both operands.
3379        let save_rax = d != Reg::RAX;
3380        let r2_mapped = if rs2 == 0 {
3381            None
3382        } else {
3383            Some(REG_MAP[rv_slot(rs2).unwrap()])
3384        };
3385        // Snapshot rs2 into SCRATCH up-front if rs2 maps to RAX (x14) —
3386        // we're about to clobber RAX. This covers both save_rax=true
3387        // (where RAX is also on stack, but reading from stack costs a
3388        // load) and save_rax=false (where RAX is the only live copy of
3389        // both rs2 and rd; we must capture rs2 before the load of rs1).
3390        let snapshot_rs2 = r2_mapped == Some(Reg::RAX);
3391        if snapshot_rs2 {
3392            self.asm.mov_rr(SCRATCH, Reg::RAX);
3393        }
3394        if save_rax {
3395            self.asm.push(Reg::RAX);
3396        }
3397        // Load rs1 into RAX.
3398        if rs1 == 0 {
3399            self.asm.mov_ri64(Reg::RAX, 0);
3400        } else {
3401            let r1 = REG_MAP[rv_slot(rs1).unwrap()];
3402            if r1 != Reg::RAX {
3403                self.asm.mov_rr(Reg::RAX, r1);
3404            }
3405            // If r1 == RAX but we saved RAX, the value is on stack — reload.
3406            if r1 == Reg::RAX && save_rax {
3407                self.asm.mov_load64(Reg::RAX, Reg::RSP, 0);
3408            }
3409        }
3410        // b is a mapped reg or 0; if 0, materialise into SCRATCH.
3411        let b_reg = if rs2 == 0 {
3412            self.asm.mov_ri64(SCRATCH, 0);
3413            SCRATCH
3414        } else if snapshot_rs2 {
3415            // rs2 already snapshotted into SCRATCH above.
3416            SCRATCH
3417        } else {
3418            r2_mapped.unwrap()
3419        };
3420        if a_signed && b_signed {
3421            self.asm.imul_rdx_rax(b_reg);
3422        } else if !a_signed && !b_signed {
3423            self.asm.mul_rdx_rax(b_reg);
3424        } else {
3425            // mulhsu: hi = unsigned_mul_hi(a, b) - (a < 0 ? b : 0).
3426            self.asm.push(b_reg);
3427            self.asm.push(Reg::RAX); // save a for sign check
3428            self.asm.mul_rdx_rax(b_reg);
3429            self.asm.pop(Reg::RAX); // a (signed)
3430            let skip = self.asm.new_label();
3431            self.asm.test_rr(Reg::RAX, Reg::RAX);
3432            self.asm.jcc_label(Cc::NS, skip);
3433            self.asm.pop(Reg::RAX); // pop saved b
3434            self.asm.sub_rr(SCRATCH, Reg::RAX);
3435            let done = self.asm.new_label();
3436            self.asm.jmp_label(done);
3437            self.asm.bind_label(skip);
3438            self.asm.add_ri(Reg::RSP, 8); // discard saved b
3439            self.asm.bind_label(done);
3440        }
3441        // High word in RDX (SCRATCH).
3442        if save_rax {
3443            self.asm.mov_rr(d, SCRATCH);
3444            self.asm.pop(Reg::RAX);
3445        } else {
3446            self.asm.mov_rr(Reg::RAX, SCRATCH);
3447        }
3448        if rd != 0 {
3449            self.invalidate_reg(rv_slot(rd).unwrap());
3450        }
3451    }
3452
3453    // ---- Division / remainder ---------------------------------------
3454
3455    #[allow(clippy::too_many_arguments)]
3456    fn rv_div_rem(
3457        &mut self,
3458        rd: u8,
3459        rs1: u8,
3460        rs2: u8,
3461        signed: bool,
3462        remainder: bool,
3463        is_32bit: bool,
3464        pc: u32,
3465    ) {
3466        let Some(d) = self.rv_dst(rd, pc) else { return };
3467        if rv_is_reserved(rs1) || rv_is_reserved(rs2) {
3468            self.rv_emit_panic_at(pc);
3469            return;
3470        }
3471        // ---- prologue (push spills once; both branches share a single
3472        // cleanup epilogue at `join`) ----
3473        let save_rax = d != Reg::RAX;
3474        if save_rax {
3475            self.asm.push(Reg::RAX);
3476        }
3477        // RCX is spilled when rs2 maps to nothing (x0) — we materialise
3478        // 0 into RCX — or when rs2 maps to RAX (we move the divisor to
3479        // RCX before loading the dividend into RAX).
3480        let r2 = if rs2 == 0 {
3481            None
3482        } else {
3483            Some(REG_MAP[rv_slot(rs2).unwrap()])
3484        };
3485        let spilled_rcx = rs2 == 0 || r2 == Some(Reg::RAX);
3486        if spilled_rcx {
3487            self.asm.push(Reg::RCX);
3488        }
3489        // Determine the divisor register (b_reg).
3490        let b_reg = if rs2 == 0 {
3491            self.asm.mov_ri64(Reg::RCX, 0);
3492            Reg::RCX
3493        } else if r2 == Some(Reg::RAX) {
3494            // rs2 mapped to RAX (x14). Get its value into RCX.
3495            if save_rax {
3496                // RAX was pushed first, RCX next. RSP+8 holds saved RAX.
3497                self.asm.mov_load64(Reg::RCX, Reg::RSP, 8);
3498            } else {
3499                // RAX wasn't pushed (d == RAX) — rs2's value is still
3500                // live in RAX. Snapshot to RCX before we load rs1 below.
3501                self.asm.mov_rr(Reg::RCX, Reg::RAX);
3502            }
3503            Reg::RCX
3504        } else {
3505            r2.unwrap()
3506        };
3507        // Load dividend (a) into RAX.
3508        if rs1 == 0 {
3509            self.asm.mov_ri64(Reg::RAX, 0);
3510        } else {
3511            let r1 = REG_MAP[rv_slot(rs1).unwrap()];
3512            if r1 == Reg::RAX {
3513                if save_rax {
3514                    let off = if spilled_rcx { 8 } else { 0 };
3515                    self.asm.mov_load64(Reg::RAX, Reg::RSP, off);
3516                }
3517                // else: already in RAX.
3518            } else {
3519                self.asm.mov_rr(Reg::RAX, r1);
3520            }
3521        }
3522        // ---- branch on divisor == 0 ----
3523        // `idivl`/`divl` consume only the divisor's low 32 bits, so the W-ops
3524        // must test *those* bits for zero — a divisor like `0x8000_0000_0000_0000`
3525        // has a nonzero 64-bit value but a zero low half, and dividing by it
3526        // raises #DE. (The 64-bit ops correctly test the full register.)
3527        if is_32bit {
3528            self.asm.test_rr32(b_reg, b_reg);
3529        } else {
3530            self.asm.test_rr(b_reg, b_reg);
3531        }
3532        let nonzero = self.asm.new_label();
3533        let join = self.asm.new_label();
3534        self.asm.jcc_label(Cc::NE, nonzero);
3535        // Divisor == 0: div → -1 (all-ones); remainder → dividend.
3536        if remainder {
3537            if d != Reg::RAX {
3538                self.asm.mov_rr(d, Reg::RAX);
3539            }
3540            if is_32bit {
3541                self.asm.movsxd(d, d);
3542            }
3543        } else {
3544            self.asm.mov_ri64(d, u64::MAX);
3545            // u64::MAX is sign-extended -1 in both 32/64-bit views.
3546        }
3547        self.asm.jmp_label(join);
3548
3549        // ---- nonzero branch: real DIV/IDIV ----
3550        self.asm.bind_label(nonzero);
3551        // Signed overflow guard. `idiv` on INT_MIN / -1 raises #DE on x86, but
3552        // RISC-V *defines* it (quotient = INT_MIN, remainder = 0). More
3553        // generally, for divisor == -1 the quotient is -dividend (which wraps
3554        // INT_MIN → INT_MIN) and the remainder is always 0 — so we special-case
3555        // divisor == -1, skip the `idiv`, and avoid the fault. This matches the
3556        // interpreter (`javm-exec/src/interp.rs`, Div/Rem). Unsigned division
3557        // cannot overflow, so the guard is signed-only.
3558        if signed {
3559            let not_neg_one = self.asm.new_label();
3560            if is_32bit {
3561                self.asm.cmp_ri32(b_reg, -1);
3562            } else {
3563                self.asm.cmp_ri(b_reg, -1);
3564            }
3565            self.asm.jcc_label(Cc::NE, not_neg_one);
3566            if remainder {
3567                self.asm.mov_ri64(d, 0); // a % -1 == 0 for all a
3568            } else {
3569                if d != Reg::RAX {
3570                    self.asm.mov_rr(d, Reg::RAX);
3571                }
3572                self.asm.neg64(d); // d = -dividend (wraps INT_MIN → INT_MIN)
3573                if is_32bit {
3574                    // Low 32 bits of a 64-bit negation == 32-bit negation of the
3575                    // low 32 bits, so re-narrow to the signed 32-bit view.
3576                    self.asm.movsxd(d, d);
3577                }
3578            }
3579            self.asm.jmp_label(join);
3580            self.asm.bind_label(not_neg_one);
3581        }
3582        if is_32bit {
3583            if signed {
3584                self.asm.movsxd(Reg::RAX, Reg::RAX);
3585                self.asm.cdq();
3586                self.asm.idiv32(b_reg);
3587            } else {
3588                self.asm.movzx_32_64(Reg::RAX, Reg::RAX);
3589                self.asm.mov_ri64(SCRATCH, 0);
3590                self.asm.div32(b_reg);
3591            }
3592        } else if signed {
3593            self.asm.cqo();
3594            self.asm.idiv64(b_reg);
3595        } else {
3596            self.asm.mov_ri64(SCRATCH, 0);
3597            self.asm.div64(b_reg);
3598        }
3599        let result_reg = if remainder { SCRATCH } else { Reg::RAX };
3600        if d != result_reg {
3601            self.asm.mov_rr(d, result_reg);
3602        }
3603        if is_32bit {
3604            self.asm.movsxd(d, d);
3605        }
3606
3607        // ---- single epilogue ----
3608        self.asm.bind_label(join);
3609        if spilled_rcx {
3610            self.asm.pop(Reg::RCX);
3611        }
3612        if save_rax {
3613            self.asm.pop(Reg::RAX);
3614        }
3615        if rd != 0 {
3616            self.invalidate_reg(rv_slot(rd).unwrap());
3617        }
3618    }
3619
3620    // ---- Unary ops (Zbb) --------------------------------------------
3621
3622    fn rv_unary(&mut self, rd: u8, rs1: u8, op: UnaryOp, pc: u32) {
3623        let Some(d) = self.rv_dst(rd, pc) else { return };
3624        let src = if rs1 == 0 {
3625            self.asm.mov_ri64(SCRATCH, 0);
3626            SCRATCH
3627        } else if rv_is_reserved(rs1) {
3628            self.rv_emit_panic_at(pc);
3629            return;
3630        } else {
3631            REG_MAP[rv_slot(rs1).unwrap()]
3632        };
3633        match op {
3634            UnaryOp::Clz64 => self.asm.lzcnt64(d, src),
3635            UnaryOp::Clz32 => self.asm.lzcnt32(d, src),
3636            UnaryOp::Ctz64 => self.asm.tzcnt64(d, src),
3637            UnaryOp::Ctz32 => self.asm.tzcnt32(d, src),
3638            UnaryOp::Popcnt64 => self.asm.popcnt64(d, src),
3639            UnaryOp::Popcnt32 => self.asm.popcnt32(d, src),
3640            UnaryOp::SextB => self.asm.movsx_8_64(d, src),
3641            UnaryOp::SextH => self.asm.movsx_16_64(d, src),
3642            UnaryOp::ZextH => self.asm.movzx_16_64(d, src),
3643            UnaryOp::Rev8 => {
3644                if d != src {
3645                    self.asm.mov_rr(d, src);
3646                }
3647                self.asm.bswap64(d);
3648            }
3649            UnaryOp::OrcB => {
3650                // orc.b: each byte → 0xFF if any bit was set, else 0x00. No
3651                // single x86 op, so emulate via SWAR:
3652                //   t = (((x & LO7) + LO7) | x) & HI   — bit7 of byte i set iff
3653                //       byte i != 0 (the carry trick; per-byte, no cross-byte
3654                //       carry since (b&0x7F)+0x7F ≤ 0xFE).
3655                //   result = t | (t - (t >> 7))        — spread that flag to the
3656                //       whole byte (0x80 → 0xFF; per-byte, no cross-byte borrow).
3657                // The 13-slot host register file leaves only SCRATCH free, so
3658                // the two values each needed twice (`x`, then `t`) are parked on
3659                // the stack and read back with `pop` (balanced push/pop pairs).
3660                const LO7: u64 = 0x7F7F_7F7F_7F7F_7F7F;
3661                const HI: u64 = 0x8080_8080_8080_8080;
3662                self.asm.mov_rr(SCRATCH, src); // SCRATCH = x (safe if d == src)
3663                self.asm.push(SCRATCH); // save x
3664                self.asm.mov_ri64(d, LO7);
3665                self.asm.and_rr(d, SCRATCH); // d = x & LO7
3666                self.asm.mov_ri64(SCRATCH, LO7);
3667                self.asm.add_rr(d, SCRATCH); // d = (x & LO7) + LO7
3668                self.asm.pop(SCRATCH); // SCRATCH = x (rsp restored)
3669                self.asm.or_rr(d, SCRATCH); // d = ((x&LO7)+LO7) | x
3670                self.asm.mov_ri64(SCRATCH, HI);
3671                self.asm.and_rr(d, SCRATCH); // d = t (bit7/byte = byte != 0)
3672                self.asm.push(d); // save t
3673                self.asm.mov_rr(SCRATCH, d);
3674                self.asm.shr_ri64(SCRATCH, 7); // SCRATCH = t >> 7
3675                self.asm.sub_rr(d, SCRATCH); // d = t - (t >> 7)
3676                self.asm.pop(SCRATCH); // SCRATCH = t (rsp restored)
3677                self.asm.or_rr(d, SCRATCH); // d = t | (t - (t>>7))
3678            }
3679        }
3680        if rd != 0 {
3681            self.invalidate_reg(rv_slot(rd).unwrap());
3682        }
3683    }
3684
3685    // ---- Zba shift-add ----------------------------------------------
3686
3687    fn rv_shadd(&mut self, rd: u8, rs1: u8, rs2: u8, shift: u8, uw: bool, pc: u32) {
3688        let Some(d) = self.rv_dst(rd, pc) else { return };
3689        if rv_is_reserved(rs1) || rv_is_reserved(rs2) {
3690            self.rv_emit_panic_at(pc);
3691            return;
3692        }
3693        // SCRATCH = (zext32 if uw else val)(rs1) << shift
3694        if rs1 == 0 {
3695            self.asm.mov_ri64(SCRATCH, 0);
3696        } else {
3697            let r1 = REG_MAP[rv_slot(rs1).unwrap()];
3698            if uw {
3699                self.asm.movzx_32_64(SCRATCH, r1);
3700            } else {
3701                self.asm.mov_rr(SCRATCH, r1);
3702            }
3703        }
3704        self.asm.shl_ri64(SCRATCH, shift);
3705        // d = rs2; d += SCRATCH
3706        if rs2 == 0 {
3707            self.asm.mov_ri64(d, 0);
3708        } else {
3709            let r2 = REG_MAP[rv_slot(rs2).unwrap()];
3710            if d != r2 {
3711                self.asm.mov_rr(d, r2);
3712            }
3713        }
3714        self.asm.add_rr(d, SCRATCH);
3715        if rd != 0 {
3716            self.invalidate_reg(rv_slot(rd).unwrap());
3717        }
3718    }
3719
3720    fn rv_adduw(&mut self, rd: u8, rs1: u8, rs2: u8, pc: u32) {
3721        let Some(d) = self.rv_dst(rd, pc) else { return };
3722        if rv_is_reserved(rs1) || rv_is_reserved(rs2) {
3723            self.rv_emit_panic_at(pc);
3724            return;
3725        }
3726        if rs1 == 0 {
3727            self.asm.mov_ri64(SCRATCH, 0);
3728        } else {
3729            let r1 = REG_MAP[rv_slot(rs1).unwrap()];
3730            self.asm.movzx_32_64(SCRATCH, r1);
3731        }
3732        if rs2 == 0 {
3733            self.asm.mov_ri64(d, 0);
3734        } else {
3735            let r2 = REG_MAP[rv_slot(rs2).unwrap()];
3736            if d != r2 {
3737                self.asm.mov_rr(d, r2);
3738            }
3739        }
3740        self.asm.add_rr(d, SCRATCH);
3741        if rd != 0 {
3742            self.invalidate_reg(rv_slot(rd).unwrap());
3743        }
3744    }
3745
3746    fn rv_slliuw(&mut self, rd: u8, rs1: u8, shamt: u8, pc: u32) {
3747        let Some(d) = self.rv_dst(rd, pc) else { return };
3748        if rs1 == 0 {
3749            self.asm.mov_ri64(d, 0);
3750        } else if rv_is_reserved(rs1) {
3751            self.rv_emit_panic_at(pc);
3752            return;
3753        } else {
3754            let r1 = REG_MAP[rv_slot(rs1).unwrap()];
3755            self.asm.movzx_32_64(d, r1);
3756            self.asm.shl_ri64(d, shamt & 63);
3757        }
3758        if rd != 0 {
3759            self.invalidate_reg(rv_slot(rd).unwrap());
3760        }
3761    }
3762
3763    // ---- Zbs single-bit ---------------------------------------------
3764
3765    fn rv_bit_rr(&mut self, rd: u8, rs1: u8, rs2: u8, op: BitOp, pc: u32) {
3766        let Some(d) = self.rv_dst(rd, pc) else { return };
3767        if rv_is_reserved(rs1) || rv_is_reserved(rs2) {
3768            self.rv_emit_panic_at(pc);
3769            return;
3770        }
3771        // SCRATCH = 1 << (rs2 & 0x3F).
3772        self.asm.mov_ri64(SCRATCH, 1);
3773        if rs2 != 0 {
3774            let r2 = REG_MAP[rv_slot(rs2).unwrap()];
3775            if r2 == Reg::RCX {
3776                self.asm.shl_cl64(SCRATCH);
3777            } else {
3778                self.asm.push(Reg::RCX);
3779                self.asm.mov_rr(Reg::RCX, r2);
3780                self.asm.shl_cl64(SCRATCH);
3781                self.asm.pop(Reg::RCX);
3782            }
3783        }
3784        // Apply.
3785        self.rv_read(rs1, d, pc);
3786        match op {
3787            BitOp::Clear => {
3788                self.asm.not64(SCRATCH);
3789                self.asm.and_rr(d, SCRATCH);
3790            }
3791            BitOp::Set => self.asm.or_rr(d, SCRATCH),
3792            BitOp::Invert => self.asm.xor_rr(d, SCRATCH),
3793            BitOp::Extract => {
3794                // test sets ZF; mov_ri32 (not mov_ri64-zero) writes 0
3795                // to d WITHOUT clobbering flags so setcc sees ZF.
3796                self.asm.test_rr(d, SCRATCH);
3797                self.asm.mov_ri32(d, 0);
3798                self.asm.setcc(Cc::NE, d);
3799            }
3800        }
3801        if rd != 0 {
3802            self.invalidate_reg(rv_slot(rd).unwrap());
3803        }
3804    }
3805
3806    fn rv_bit_imm(&mut self, rd: u8, rs1: u8, shamt: u8, op: BitOp, pc: u32) {
3807        let Some(d) = self.rv_dst(rd, pc) else { return };
3808        if rv_is_reserved(rs1) {
3809            self.rv_emit_panic_at(pc);
3810            return;
3811        }
3812        let s = shamt & 0x3F;
3813        if s < 31 {
3814            let mask_lo: i32 = 1i32 << s;
3815            self.rv_read(rs1, d, pc);
3816            match op {
3817                BitOp::Clear => self.asm.and_ri(d, !mask_lo),
3818                BitOp::Set => self.asm.or_ri(d, mask_lo),
3819                BitOp::Invert => self.asm.xor_ri(d, mask_lo),
3820                BitOp::Extract => {
3821                    self.asm.shr_ri64(d, s);
3822                    self.asm.and_ri(d, 1);
3823                }
3824            }
3825        } else {
3826            let mask: u64 = 1u64 << s;
3827            self.asm.mov_ri64(SCRATCH, mask);
3828            self.rv_read(rs1, d, pc);
3829            match op {
3830                BitOp::Clear => {
3831                    self.asm.not64(SCRATCH);
3832                    self.asm.and_rr(d, SCRATCH);
3833                }
3834                BitOp::Set => self.asm.or_rr(d, SCRATCH),
3835                BitOp::Invert => self.asm.xor_rr(d, SCRATCH),
3836                BitOp::Extract => {
3837                    self.asm.shr_ri64(d, s);
3838                    self.asm.and_ri(d, 1);
3839                }
3840            }
3841        }
3842        if rd != 0 {
3843            self.invalidate_reg(rv_slot(rd).unwrap());
3844        }
3845    }
3846
3847    // ---- Zicond -----------------------------------------------------
3848
3849    /// Semantics:
3850    ///   `cond = Cc::E`  → czero.eqz rd, rs1, rs2 = (rs2 == 0) ? 0 : rs1
3851    ///   `cond = Cc::NE` → czero.nez rd, rs1, rs2 = (rs2 != 0) ? 0 : rs1
3852    ///
3853    /// Emits a three-op CMOV sequence:
3854    ///   test r2, r2     ; ZF reflects rs2 == 0
3855    ///   mov_ri32 _, 0   ; 5-byte mov-imm (no flag effect)
3856    ///   cmov... ...     ; conditionally swap on ZF
3857    ///
3858    /// The two branches below differ only in which register is the cmov
3859    /// destination vs source, dictated by whether `d` aliases `rs1`
3860    /// (in which case `d` already holds the "keep" value and we cmov
3861    /// 0 in on the spec condition) or not (we initialise `d=0` and
3862    /// cmov `r1` in on the opposite condition). Both paths are 3 ops.
3863    fn rv_czero(&mut self, rd: u8, rs1: u8, rs2: u8, cond: Cc, pc: u32) {
3864        let Some(d) = self.rv_dst(rd, pc) else { return };
3865        if rv_is_reserved(rs1) || rv_is_reserved(rs2) {
3866            self.rv_emit_panic_at(pc);
3867            return;
3868        }
3869        let slot = rv_slot(rd).unwrap();
3870
3871        // Static-result short circuits.
3872        if rs2 == 0 {
3873            // rs2 hardwired zero: spec condition is statically known.
3874            //   eqz: rs2==0 always true → d = 0
3875            //   nez: rs2!=0 always false → d = rs1
3876            if matches!(cond, Cc::E) {
3877                self.asm.mov_ri64(d, 0);
3878            } else {
3879                self.rv_read(rs1, d, pc);
3880            }
3881            self.invalidate_reg(slot);
3882            return;
3883        }
3884        if rs1 == 0 {
3885            // rs1 hardwired zero: both branches of the conditional yield 0.
3886            self.asm.mov_ri64(d, 0);
3887            self.invalidate_reg(slot);
3888            return;
3889        }
3890        if rs1 == rs2 {
3891            //   eqz: (rs1==0) ? 0 : rs1 == rs1
3892            //   nez: (rs1!=0) ? 0 : rs1 == 0
3893            if matches!(cond, Cc::E) {
3894                self.rv_read(rs1, d, pc);
3895            } else {
3896                self.asm.mov_ri64(d, 0);
3897            }
3898            self.invalidate_reg(slot);
3899            return;
3900        }
3901
3902        let r1 = REG_MAP[rv_slot(rs1).unwrap()];
3903        let r2 = REG_MAP[rv_slot(rs2).unwrap()];
3904        let opposite = match cond {
3905            Cc::E => Cc::NE,
3906            Cc::NE => Cc::E,
3907            _ => unreachable!("rv_czero only accepts E/NE"),
3908        };
3909
3910        if d == r1 {
3911            // d already holds rs1's value. Test rs2, then cmov 0 in
3912            // when the spec condition holds. We can't cmov from `r1`
3913            // here — at execution time `r1 == d`, so the source value
3914            // is whatever d *currently* holds, not the original rs1.
3915            self.asm.test_rr(r2, r2);
3916            self.asm.mov_ri32(SCRATCH, 0);
3917            self.asm.cmovcc(cond, d, SCRATCH);
3918        } else {
3919            // d != r1. d may alias r2; that's fine because we test r2
3920            // BEFORE the mov writes 0 into d.
3921            self.asm.test_rr(r2, r2);
3922            self.asm.mov_ri32(d, 0);
3923            self.asm.cmovcc(opposite, d, r1);
3924        }
3925        self.invalidate_reg(slot);
3926    }
3927
3928    // ---- Jumps & branches -------------------------------------------
3929
3930    fn rv_jal(&mut self, rd: u8, imm: i32, pc: u32, next_pc: u32) {
3931        if rv_is_reserved(rd) {
3932            self.rv_emit_panic_at(pc);
3933            return;
3934        }
3935        if rd != 0 {
3936            // The link register holds a guest VA (code_base + offset),
3937            // matching jalr's return-address contract and the interp.
3938            let slot = rv_slot(rd).unwrap();
3939            self.asm
3940                .mov_ri64(REG_MAP[slot], self.code_base.wrapping_add(next_pc) as u64);
3941            self.invalidate_reg(slot);
3942        }
3943        let target = (pc as i64).wrapping_add(imm as i64) as u32;
3944        self.emit_static_branch(target, true, next_pc, pc);
3945    }
3946
3947    /// Emit `jalr rd, rs1, imm` — indirect jump (return / indirect
3948    /// call). Strictly simpler than the former br_table (no jump-table
3949    /// indirection):
3950    ///   1. `target_va = (rs1 + imm) & 0xFFFFFFFF`   (2³² wrap)
3951    ///   2. write `rd = code_base + next_pc` if `rd != 0` (return addr)
3952    ///   3. `offset = target_va - code_base`
3953    ///   4. bounds: `offset < code_len`  else PANIC
3954    ///   5. `native = code_base_native + dispatch_table[offset]; jmp`
3955    ///      — the dispatch table is *dense*: every non-block-start offset
3956    ///      holds the panic-stub offset, so a mid-block / mid-instruction
3957    ///      target jumps to the panic stub (gas is precharged at block
3958    ///      entry). Folds the former `bb_starts` check into the lookup.
3959    ///      SECURITY-CRITICAL: the runtime MUST dense-fill the table.
3960    fn rv_jalr(&mut self, rd: u8, rs1: u8, imm: i32, pc: u32, next_pc: u32) {
3961        use super::asm::Cc;
3962
3963        if rv_is_reserved(rs1) {
3964            self.rv_emit_panic_at(pc);
3965            return;
3966        }
3967
3968        // SCRATCH = rs1 (x0 → 0).
3969        self.rv_read(rs1, SCRATCH, pc);
3970        if imm != 0 {
3971            self.asm.add_ri(SCRATCH, imm);
3972        }
3973        // 2³² wrap: zero-extend the low 32 bits (shl 32 ; shr 32).
3974        self.asm.shl_ri64(SCRATCH, 32);
3975        self.asm.shr_ri64(SCRATCH, 32);
3976
3977        // Write the return address (a guest VA) — target already in
3978        // SCRATCH, so this can't clobber it (rd never maps to RDX).
3979        if rd != 0 {
3980            let slot = rv_slot(rd).unwrap();
3981            self.asm
3982                .mov_ri64(REG_MAP[slot], self.code_base.wrapping_add(next_pc) as u64);
3983            self.invalidate_reg(slot);
3984        }
3985
3986        // offset = target_va - code_base.
3987        if self.code_base != 0 {
3988            self.asm.sub_ri(SCRATCH, self.code_base as i32);
3989        }
3990
3991        // Record the offset as the paused PC for fault attribution.
3992        self.asm.mov_store32_rip_rel(CTX_PC, SCRATCH);
3993
3994        // Bounds: offset < code_len (unsigned) — underflow from a
3995        // target below code_base wraps huge and fails here.
3996        self.asm.cmp_ri32(SCRATCH, self.code_len as i32);
3997        self.asm.jcc_label(Cc::AE, self.panic_label);
3998
3999        // native = code_base_native + dispatch_table[offset]; jmp.
4000        // The dispatch table is dense: a non-block-start offset holds the
4001        // panic-stub offset, so a mid-block / mid-instruction target
4002        // lands on the panic stub here instead of valid native code.
4003        // This folds the former `bb_starts[offset] == 1` validation into
4004        // the lookup (one fewer load + branch per jalr).
4005        self.asm.push(Reg::RAX);
4006        self.asm.mov_load64_rip_rel(Reg::RAX, CTX_DISPATCH_TABLE);
4007        self.asm.movsxd_load_sib4(Reg::RAX, Reg::RAX, SCRATCH);
4008        self.asm.add_r64_mem_rip_rel(Reg::RAX, CTX_CODE_BASE);
4009        self.asm.mov_rr(SCRATCH, Reg::RAX);
4010        self.asm.pop(Reg::RAX);
4011        self.asm.jmp_reg(SCRATCH);
4012    }
4013
4014    fn rv_branch(&mut self, rs1: u8, rs2: u8, imm: i32, cc: Cc, pc: u32, next_pc: u32) {
4015        if rv_is_reserved(rs1) || rv_is_reserved(rs2) {
4016            self.rv_emit_panic_at(pc);
4017            return;
4018        }
4019        let target = (pc as i64).wrapping_add(imm as i64) as u32;
4020        let a = self.rv_read_into(rs1, SCRATCH, pc);
4021        let b = if a == SCRATCH {
4022            if rs2 == 0 {
4023                // both x0: cmp SCRATCH, SCRATCH (0 vs 0).
4024                SCRATCH
4025            } else {
4026                REG_MAP[rv_slot(rs2).unwrap()]
4027            }
4028        } else if rs2 == 0 {
4029            self.asm.mov_ri64(SCRATCH, 0);
4030            SCRATCH
4031        } else {
4032            REG_MAP[rv_slot(rs2).unwrap()]
4033        };
4034        self.emit_branch_reg(a, b, cc, target, next_pc, pc);
4035    }
4036
4037    // ---- custom-0 ---------------------------------------------------
4038
4039    fn rv_trap(&mut self, pc: u32) {
4040        self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
4041        self.asm
4042            .mov_store32_rip_rel_imm(CTX_EXIT_REASON, EXIT_TRAP as i32);
4043        self.asm.mov_store32_rip_rel_imm(CTX_EXIT_ARG, 0);
4044        self.asm.jmp_label(self.exit_label);
4045    }
4046
4047    fn rv_ecall_jar(&mut self, next_pc: u32) {
4048        self.asm.mov_store32_rip_rel_imm(CTX_PC, next_pc as i32);
4049        self.asm
4050            .mov_store32_rip_rel_imm(CTX_EXIT_REASON, EXIT_ECALL as i32);
4051        self.asm.mov_store32_rip_rel_imm(CTX_EXIT_ARG, 0);
4052        self.asm.jmp_label(self.exit_label);
4053    }
4054
4055    fn rv_ecalli(&mut self, imm: i32, next_pc: u32) {
4056        self.asm.mov_store32_rip_rel_imm(CTX_PC, next_pc as i32);
4057        self.asm
4058            .mov_store32_rip_rel_imm(CTX_EXIT_REASON, EXIT_HOST_CALL as i32);
4059        self.asm.mov_store32_rip_rel_imm(CTX_EXIT_ARG, imm);
4060        self.asm.jmp_label(self.exit_label);
4061    }
4062
4063    /// Generic "panic at this PC" exit.
4064    fn rv_emit_panic_at(&mut self, pc: u32) {
4065        self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
4066        self.asm
4067            .mov_store32_rip_rel_imm(CTX_EXIT_REASON, EXIT_PANIC as i32);
4068        self.asm.jmp_label(self.exit_label);
4069    }
4070
4071    // ----------------------------------------------------------------
4072    // Peephole tracking helpers — called inline from the tracked
4073    // dispatchers in `compile_rv4`. They replace the old separate
4074    // `update_reg_defs_rv` match pass (strict single-pass refactor).
4075    //
4076    // Each helper short-circuits when the destination register can't
4077    // produce a useful tracking entry (x0 / x3 / x4) or when the arm-
4078    // specific alias guard fires. The per-op emit helper has already
4079    // cleared `rd` via `invalidate_reg`, so the helper just installs
4080    // the new RegDef when applicable.
4081    // ----------------------------------------------------------------
4082
4083    /// `addi rd, x0, imm` / `lui rd, imm` — canonical constant load.
4084    /// Records `RegDef::Const(imm as u32)` so subsequent address
4085    /// formations can fold the constant directly.
4086    #[inline]
4087    fn track_const(&mut self, rd: u8, imm: i32) {
4088        use super::codegen::RegDef;
4089        if let Some(slot) = rv_slot(rd) {
4090            self.reg_defs[slot] = RegDef::Const(imm as u32);
4091            self.reg_defs_active |= 1u16 << slot;
4092            self.invalidate_dependents(slot);
4093        }
4094    }
4095
4096    /// `slli rd, rs1, shamt` with `shamt ∈ {1,2,3}` and `rs1 != rd`.
4097    /// Records `RegDef::Shifted` so a following Add can promote to
4098    /// ScaledAdd for SIB-style LEA. The arm-side guards (range and
4099    /// aliasing) live in the caller so this helper just installs.
4100    #[inline]
4101    fn track_shifted(&mut self, rd: u8, rs1: u8, shamt: u8) {
4102        use super::codegen::RegDef;
4103        if let (Some(d), Some(s)) = (rv_slot(rd), rv_slot(rs1)) {
4104            self.reg_defs[d] = RegDef::Shifted {
4105                src: s,
4106                shift: shamt,
4107            };
4108            self.reg_defs_active |= 1u16 << d;
4109            self.invalidate_dependents(d);
4110        }
4111    }
4112
4113    /// `add rd, rs1, rs2` with `rd != rs1 && rd != rs2`. Promotes to
4114    /// `RegDef::ScaledAdd` when one operand is already tracked as
4115    /// `Shifted`. Mirrors PVM's update_reg_defs for Add64.
4116    #[inline]
4117    fn track_add_scaledadd(&mut self, rd: u8, rs1: u8, rs2: u8) {
4118        use super::codegen::RegDef;
4119        let (Some(d), Some(a), Some(b)) = (rv_slot(rd), rv_slot(rs1), rv_slot(rs2)) else {
4120            return;
4121        };
4122        let def = if let RegDef::Shifted { src, shift } = self.reg_defs[b] {
4123            Some(RegDef::ScaledAdd {
4124                base: a,
4125                idx: src,
4126                shift,
4127            })
4128        } else if let RegDef::Shifted { src, shift } = self.reg_defs[a] {
4129            Some(RegDef::ScaledAdd {
4130                base: b,
4131                idx: src,
4132                shift,
4133            })
4134        } else {
4135            None
4136        };
4137        if let Some(def) = def {
4138            self.reg_defs[d] = def;
4139            self.reg_defs_active |= 1u16 << d;
4140            self.invalidate_dependents(d);
4141        }
4142        // else: per-op handler already invalidated rd.
4143    }
4144
4145    /// Helper for Sh{1,2,3}add → ScaledAdd tracking.
4146    ///
4147    /// `sh{N}add rd, rs1, rs2` writes `rd = rs2 + (rs1 << N)`. If rd
4148    /// aliases either operand, the post-emit value of rd no longer
4149    /// equals base+idx<<shift in terms of the *new* register state —
4150    /// any subsequent use of the tracked def would substitute the
4151    /// already-overwritten value. Skip tracking in those cases
4152    /// (mirrors PVM's update_reg_defs guard for Add64).
4153    #[inline]
4154    fn record_scaledadd(&mut self, rd: u8, rs1: u8, rs2: u8, shift: u8) {
4155        use super::codegen::RegDef;
4156        if rd == rs1 || rd == rs2 {
4157            return;
4158        }
4159        let (Some(d), Some(idx), Some(base)) = (rv_slot(rd), rv_slot(rs1), rv_slot(rs2)) else {
4160            return;
4161        };
4162        self.reg_defs[d] = RegDef::ScaledAdd { base, idx, shift };
4163        self.reg_defs_active |= 1u16 << d;
4164        self.invalidate_dependents(d);
4165    }
4166}
4167
/// Operation selector for register-immediate ALU instructions.
///
/// NOTE(review): the emitter that consumes this enum is outside this part
/// of the file; the variant meanings below are inferred from their names
/// and should be confirmed against it.
#[derive(Clone, Copy)]
enum AluImmOp {
    /// Presumably add-immediate.
    Add,
    /// Presumably bitwise AND with immediate.
    And,
    /// Presumably bitwise OR with immediate.
    Or,
    /// Presumably bitwise XOR with immediate.
    Xor,
    /// Presumably the 32-bit ("word") add-immediate.
    Addw,
}
4176
/// Operation selector for register-register ALU instructions.
///
/// NOTE(review): the emitter that consumes this enum is outside this part
/// of the file; the grouping comments below are inferred from the variant
/// names and should be confirmed against it.
#[derive(Clone, Copy)]
enum AluOp {
    // Presumably the full-width arithmetic / logic operations.
    Add,
    Sub,
    And,
    Or,
    Xor,
    Mul,
    // Presumably the 32-bit ("word") variants.
    Addw,
    Subw,
    Mulw,
    // Presumably signed / unsigned minimum and maximum.
    Min,
    Max,
    Minu,
    Maxu,
    // Presumably logic operations with an inverted operand or result.
    Andn,
    Orn,
    Xnor,
}
4196
/// Operation selector for shift and rotate instructions.
///
/// NOTE(review): the emitter that consumes this enum is outside this part
/// of the file; the grouping comments below are inferred from the variant
/// names and should be confirmed against it.
#[derive(Clone, Copy)]
enum ShiftOp {
    // Presumably 64-bit shift left / logical right / arithmetic right.
    Shl64,
    Shr64,
    Sar64,
    // Presumably the 32-bit counterparts.
    Shl32,
    Shr32,
    Sar32,
    // Presumably 64-bit and 32-bit rotates (left / right).
    Rol64,
    Ror64,
    Rol32,
    Ror32,
}
4210
/// Single-bit operation selector used by `rv_bit_rr` (bit index taken from
/// a register) and `rv_bit_imm` (bit index given as an immediate).
#[derive(Clone, Copy)]
enum BitOp {
    /// AND the value with the inverted one-hot mask (selected bit → 0).
    Clear,
    /// OR the value with the one-hot mask (selected bit → 1).
    Set,
    /// XOR the value with the one-hot mask (selected bit flipped).
    Invert,
    /// Produce 0 or 1 according to the selected bit of the value.
    Extract,
}
4218
/// One-operand operation selector used by `rv_unary`.
#[derive(Clone, Copy)]
enum UnaryOp {
    /// Emitted as `lzcnt64`.
    Clz64,
    /// Emitted as `lzcnt32`.
    Clz32,
    /// Emitted as `tzcnt64`.
    Ctz64,
    /// Emitted as `tzcnt32`.
    Ctz32,
    /// Emitted as `popcnt64`.
    Popcnt64,
    /// Emitted as `popcnt32`.
    Popcnt32,
    /// Emitted as `movsx_8_64` (sign-extend the low byte).
    SextB,
    /// Emitted as `movsx_16_64` (sign-extend the low half-word).
    SextH,
    /// Emitted as `movzx_16_64` (zero-extend the low half-word).
    ZextH,
    /// Emitted as a register copy (when needed) followed by `bswap64`.
    Rev8,
    /// `orc.b`: each non-zero byte becomes 0xFF, each zero byte stays 0x00;
    /// emitted as a multi-instruction SWAR sequence.
    OrcB,
}
javm_recompiler_x86/codegen.rs

javm_recompiler_x86/
codegen.rs