Skip to main content

javm_recompiler_x86/
codegen.rs

1//! PVM-to-x86-64 code generation.
2//!
3//! Compiles PVM bytecode into native x86-64 machine code. Each PVM basic block
4//! becomes a native basic block with gas metering at entry. PVM registers are
5//! mapped to x86-64 registers for the duration of execution.
6//!
7//! Register mapping (PVM `φ[i]` → x86-64):
8//!   `φ[0]`  → RBP   (callee-saved) — RA, rarely used as memory base
9//!   `φ[1]`  → RBX   (callee-saved) — SP, avoids RBP encoding penalty
10//!   `φ[2]`  → R12   (callee-saved)
11//!   `φ[3]`  → R13   (callee-saved)
12//!   `φ[4]`  → R14   (callee-saved)
13//!   `φ[5]`  → RSI   (caller-saved)
14//!   `φ[6]`  → RDI   (caller-saved)
15//!   `φ[7]`  → R8    (caller-saved)
16//!   `φ[8]`  → R9    (caller-saved)
17//!   `φ[9]`  → R10   (caller-saved)
18//!   `φ[10]` → R11   (caller-saved)
19//!   `φ[11]` → RAX   (caller-saved)
20//!   `φ[12]` → RCX   (caller-saved)
21//!
22//! Reserved: R15 = gas meter, RDX = scratch, RSP = native stack.
23
24use alloc::vec;
25use alloc::vec::Vec;
26
27use super::asm::{Assembler, Cc, Label, Reg};
28use javm_exec::args::{self, Args};
29use javm_exec::gas_sim::GasSimulator;
30use javm_exec::instruction::Opcode;
31
/// Return skip(pc): the number of bytes between the opcode at `pc` and the
/// start of the following instruction.
///
/// Scans the bitmask entries after `pc` and yields the offset of the first one
/// equal to 1. Positions past the end of `bitmask` count as instruction
/// starts, and the result never exceeds 24.
fn compute_skip(pc: usize, bitmask: &[u8]) -> usize {
    (0..25usize)
        .find(|&offset| bitmask.get(pc + 1 + offset).map_or(true, |&flag| flag == 1))
        .unwrap_or(24)
}
/// Map PVM register index (0..=12) to the x86-64 register that holds it.
/// All 13 PVM registers live in x86 registers: indices 0..=4 use the
/// callee-saved registers and indices 5..=12 use the caller-saved ones
/// (the same eight registers, in the same order, as `CALLER_SAVED`).
///
/// Index this table with a PVM register number, e.g. `REG_MAP[ra]`.
const REG_MAP: [Reg; 13] = [
    Reg::RBP, // φ[0] — RA (rarely used as memory base, so RBP encoding penalty is acceptable)
    Reg::RBX, // φ[1] — SP (frequently used as memory base, RBX avoids RBP disp8 penalty)
    Reg::R12, // φ[2]
    Reg::R13, // φ[3]
    Reg::R14, // φ[4]
    Reg::RSI, // φ[5]
    Reg::RDI, // φ[6]
    Reg::R8,  // φ[7]
    Reg::R9,  // φ[8]
    Reg::R10, // φ[9]
    Reg::R11, // φ[10]
    Reg::RAX, // φ[11]
    Reg::RCX, // φ[12]
];
60
/// Scratch register (not mapped to any PVM register), so generated code may
/// overwrite it without disturbing guest state. The fused scaled-index
/// peephole, for example, computes its effective address into it.
const SCRATCH: Reg = Reg::RDX;
/// R15 = gas meter. Loaded from `ctx.gas` at the prologue, decremented
/// once per basic block, flushed back to `ctx.gas` at every exit.
/// Like `SCRATCH`, it does not appear in `REG_MAP`.
const GAS: Reg = Reg::R15;
66
/// Caller-saved PVM registers that need saving around helper calls.
///
/// These are the x86-64 registers backing φ[5]..φ[12], listed in the same
/// order as `REG_MAP[5..]`. `save_caller_saved` pushes them in this order and
/// `restore_caller_saved` pops them in reverse.
// `dead_code` is allowed because the only users visible in this part of the
// file (`save_caller_saved` / `restore_caller_saved`) are themselves marked
// `#[allow(dead_code)]`.
#[allow(dead_code)]
const CALLER_SAVED: [Reg; 8] = [
    Reg::RSI,
    Reg::RDI,
    Reg::R8,
    Reg::R9,
    Reg::R10,
    Reg::R11,
    Reg::RAX,
    Reg::RCX,
];
79
/// Virtual address at which the `JitContext` is mapped: `1 << 39` = 512 GiB.
///
/// JitContext lives above the PVM u32 address space (no bounds check
/// on guest mem — the full low 4 GiB of native VA belongs to the
/// program). CTX is reached via RIP-relative `[rip+disp32]`, which
/// gives ±2 GiB range from the JIT code, so CTX must be **adjacent**
/// to the JIT region.
///
/// In the nub-x86 microkernel, CTX and the per-Image JIT arena both
/// live in PML4 slot 1 (base 512 GiB). Sharing one PML4 slot lets
/// the Image's PDPT subtree be cached as a template across all
/// Instances (per-call PT just shallow-clones the slot's entry). MEM
/// stays in `PML4[0]` at VA 0 so PVM addr == native VA still holds.
pub const CTX_VA: u64 = 1u64 << 39;
92
93use super::JitContext;
94use memoffset::offset_of;
95
// Absolute virtual addresses of individual `JitContext` fields.
//
// Each constant is `CTX_VA` plus the byte offset of the named field inside
// `JitContext`, computed at compile time with `offset_of!`. Generated code
// uses these as targets of RIP-relative accesses (e.g. `compile` stores the
// faulting PC to `CTX_PC`), so they stay correct if the struct layout changes.
pub const CTX_REGS: u64 = CTX_VA + offset_of!(JitContext, regs) as u64;
pub const CTX_GAS: u64 = CTX_VA + offset_of!(JitContext, gas) as u64;
pub const CTX_EXIT_REASON: u64 = CTX_VA + offset_of!(JitContext, exit_reason) as u64;
pub const CTX_EXIT_ARG: u64 = CTX_VA + offset_of!(JitContext, exit_arg) as u64;
pub const CTX_HEAP_BASE: u64 = CTX_VA + offset_of!(JitContext, heap_base) as u64;
pub const CTX_HEAP_TOP: u64 = CTX_VA + offset_of!(JitContext, heap_top) as u64;
pub const CTX_JT_PTR: u64 = CTX_VA + offset_of!(JitContext, jt_ptr) as u64;
pub const CTX_JT_LEN: u64 = CTX_VA + offset_of!(JitContext, jt_len) as u64;
pub const CTX_BB_STARTS: u64 = CTX_VA + offset_of!(JitContext, bb_starts) as u64;
pub const CTX_BB_LEN: u64 = CTX_VA + offset_of!(JitContext, bb_len) as u64;
pub const CTX_ENTRY_PC: u64 = CTX_VA + offset_of!(JitContext, entry_pc) as u64;
pub const CTX_PC: u64 = CTX_VA + offset_of!(JitContext, pc) as u64;
pub const CTX_DISPATCH_TABLE: u64 = CTX_VA + offset_of!(JitContext, dispatch_table) as u64;
pub const CTX_CODE_BASE: u64 = CTX_VA + offset_of!(JitContext, code_base) as u64;
pub const CTX_FAST_REENTRY: u64 = CTX_VA + offset_of!(JitContext, fast_reentry) as u64;
111
/// Exit reason codes (matching ExitReason enum).
///
/// These are the numeric values generated code reports when it leaves the
/// JIT (e.g. `compile` emits an `EXIT_PANIC` exit for an undecodable opcode).
// NOTE(review): the value 5 is not assigned here — presumably reserved by or
// unused from the `ExitReason` enum; confirm against its definition.
pub const EXIT_HALT: u32 = 0;
pub const EXIT_PANIC: u32 = 1;
pub const EXIT_OOG: u32 = 2;
pub const EXIT_PAGE_FAULT: u32 = 3;
pub const EXIT_HOST_CALL: u32 = 4;
pub const EXIT_ECALL: u32 = 6;
pub const EXIT_TRAP: u32 = 7;
120
/// Result of compilation, as returned by `Compiler::compile`.
pub struct CompileResult {
    /// The finalized machine-code bytes produced by the assembler.
    pub native_code: Vec<u8>,
    /// PVM PC → native code offset. Holds `code.len() + 1` entries, indexed
    /// by PVM PC; only PCs recorded as gas block starts (and whose label was
    /// bound) receive an offset, every other entry is left at 0.
    pub dispatch_table: Vec<i32>,
    /// Trap table for signal-based bounds checking: `(native_offset, pvm_pc)`.
    pub trap_table: Vec<(u32, u32)>,
    /// Native offset of the exit sequence label, or 0 if that label was
    /// never bound.
    pub exit_label_offset: u32,
}
128
/// Helper function pointers passed to compiled code.
///
/// Each field carries one helper's address as a plain `u64`. `#[repr(C)]`
/// keeps the fields in declaration order with C layout.
// NOTE(review): the helpers' calling convention and signatures are not
// visible in this part of the file — confirm before calling them from new
// code paths.
#[repr(C)]
pub struct HelperFns {
    // Guest memory reads, one per access width.
    pub mem_read_u8: u64,
    pub mem_read_u16: u64,
    pub mem_read_u32: u64,
    pub mem_read_u64: u64,
    // Guest memory writes, one per access width.
    pub mem_write_u8: u64,
    pub mem_write_u16: u64,
    pub mem_write_u32: u64,
    pub mem_write_u64: u64,
    // Presumably services the PVM `sbrk` instruction — confirm at the use site.
    pub sbrk_helper: u64,
}
142
/// Tracks what a PVM register was last set to, for peephole optimization.
///
/// One `RegDef` is kept per PVM register in `Compiler::reg_defs`. Register
/// operands inside the variants (`src`, `base`, `idx`) are PVM register
/// indices, not x86 registers.
#[derive(Clone, Copy, Debug)]
enum RegDef {
    /// Unknown or complex value. This is the initial state of every register
    /// when a `Compiler` is created.
    Unknown,
    /// Known compile-time constant (32-bit address or immediate).
    Const(u32),
    /// reg = src << shift (shift 1..=3, i.e. *2, *4, *8).
    /// Built from: add D,A,A → Shifted{src:A, shift:1}
    ///             add D,D,D where D=Shifted{src,s} → Shifted{src, shift:s+1}
    Shifted { src: usize, shift: u8 },
    /// reg = base + (idx << shift) (shift 0..=3, i.e. *1, *2, *4, *8).
    /// Built from: add D,BASE,S where S=Shifted{src,s} → ScaledAdd{base:BASE, idx:src, shift:s}
    ScaledAdd { base: usize, idx: usize, shift: u8 },
}
158
/// PVM-to-x86-64 compiler.
///
/// Built with `Compiler::new` and consumed by `Compiler::compile`, which
/// takes `self` by value and returns a `CompileResult`.
pub struct Compiler {
    /// Assembler that receives all emitted machine code and owns the labels.
    pub asm: Assembler,
    /// Base label ID for PC labels. label_for_pc(pc) = Label(label_base + pc).
    /// Labels are bulk-allocated in the assembler with LABEL_UNBOUND=0 (zeroed pages).
    label_base: u32,
    /// Gas block start PCs discovered during compilation (for dispatch table).
    gas_block_pcs: Vec<u32>,
    /// Label for the exit sequence.
    exit_label: Label,
    /// Label for the shared out-of-gas exit (sets EXIT_OOG + jumps to exit).
    oog_label: Label,
    /// Label for panic exit.
    panic_label: Label,
    /// Label for OOG handler that reads PC from SCRATCH: stores PC, then falls through to oog_label.
    oog_pc_label: Label,
    /// Per-gas-block OOG stubs: (label, pvm_pc, block_cost) — emitted as cold
    /// code after main body. `compile` pushes one entry each time it
    /// finalizes a gas block's cost.
    oog_stubs: Vec<(Label, u32, u32)>, // (label, pvm_pc, block_cost)
    /// Helper function addresses.
    helpers: HelperFns,
    /// Bitmask reference (1 = instruction start). Stored as raw pointer for self-referential use.
    /// The pointer is captured from the slice passed to `new` and carries no
    /// lifetime, so that slice must stay alive and unmoved for as long as this
    /// `Compiler` dereferences it (see `is_basic_block_start`).
    bitmask_ptr: *const u8,
    /// Length in bytes of the slice `bitmask_ptr` points at.
    bitmask_len: usize,
    /// Peephole: tracks how each PVM register was last defined.
    reg_defs: [RegDef; 13],
    /// Bitmask of registers that have non-Unknown reg_defs (for fast invalidation).
    reg_defs_active: u16,
    /// Carry flag fusion: after an `add64 D, A, B`, CF = overflow(A+B).
    /// Stores (D, A, B) so that a subsequent `setLtU C, D, A` or `setLtU C, D, B`
    /// can use CF directly instead of emitting a redundant `cmp`.
    /// Cleared by any instruction that clobbers flags (i.e., everything except the
    /// immediately following setLtU).
    last_add_cf: Option<(usize, usize, usize)>,
    /// Trap table for signal-based bounds checking: (native_offset, pvm_pc).
    /// Moved into `CompileResult::trap_table` at the end of `compile`.
    trap_entries: Vec<(u32, u32)>,
    /// Memory tier load/store cycles for gas simulation.
    mem_cycles: u8,
}
197
198impl Compiler {
199    pub fn new(
200        bitmask: &[u8],
201        _jump_table: &[u32],
202        helpers: HelperFns,
203        code_len: usize,
204        jit_va_base: u64,
205        mem_cycles: u8,
206    ) -> Self {
207        // Estimate native code size: ~3x PVM code provides safety margin for
208        // direct-write emission (no per-byte capacity checks in hot loop).
209        let estimated_native = code_len * 3 + 8192;
210        // Labels: one per PC (dense array) + fixed overhead for exit/oog/stubs.
211        let estimated_labels = code_len + 1024;
212        // mmap-backed assembler buffer was a host-only path; the recompiler is
213        // now embedded only by `nub-arch-x86`, which uses the Vec-backed form.
214        let mut asm = Assembler::with_capacity(estimated_native, estimated_labels);
215        // RIP-relative CTX accesses need the eventual load VA to compute
216        // disp32. Callers from a per-invocation runtime pass JIT_VA_M;
217        // tests pass 0 (encodings reference offset 0).
218        asm.set_jit_va_base(jit_va_base);
219        // Reserve label 0 so label IDs start from 1 (for consistency with fixed labels).
220        let _reserved = asm.new_label(); // Label(0)
221        let exit_label = asm.new_label();
222        let oog_label = asm.new_label();
223        let panic_label = asm.new_label();
224        let oog_pc_label = asm.new_label();
225        // Pre-create one label per PC for O(1) lookup in label_for_pc.
226        // With LABEL_UNBOUND=0, bulk allocation uses zeroed pages (calloc/COW).
227        // Only the ~640 labels that get bound trigger page faults — the other
228        // ~110K labels stay on zero pages and cost nothing.
229        let label_base = asm.labels_len() as u32;
230        asm.bulk_create_labels(code_len + 1);
231        Self {
232            label_base,
233            gas_block_pcs: Vec::with_capacity(1024),
234            asm,
235            exit_label,
236            oog_label,
237            panic_label,
238            oog_pc_label,
239            oog_stubs: Vec::with_capacity(1024),
240            reg_defs: [RegDef::Unknown; 13],
241            reg_defs_active: 0,
242            last_add_cf: None,
243            helpers,
244            bitmask_ptr: bitmask.as_ptr(),
245            bitmask_len: bitmask.len(),
246            trap_entries: Vec::with_capacity(2048),
247            mem_cycles,
248        }
249    }
250
251    /// Look up the pre-created label for a PVM PC. O(1) arithmetic.
252    #[inline]
253    fn label_for_pc(&self, pc: u32) -> Label {
254        Label(self.label_base + pc)
255    }
256
257    fn is_basic_block_start(&self, idx: u32) -> bool {
258        let i = idx as usize;
259        // SAFETY: bitmask_ptr points to the start of a valid &[u8] slice of length
260        // bitmask_len, and i < bitmask_len is checked before the dereference.
261        i < self.bitmask_len && unsafe { *self.bitmask_ptr.add(i) } == 1
262    }
263
    /// Compile directly from raw code+bitmask. Streaming single-pass:
    /// gas block discovery + decode + gas sim + codegen in one loop.
    ///
    /// * `code` — raw PVM bytecode.
    /// * `bitmask` — one byte per code byte, 1 = instruction start.
    ///   Presumably the same slice that was passed to `new` — confirm at the
    ///   call site.
    ///
    /// Consumes the compiler and returns the finalized native code together
    /// with the dispatch table, trap table and exit-label offset.
    ///
    /// Per instruction the loop: (1) decodes opcode and operands, (2) opens a
    /// new gas block if the previous instruction was a terminator, (3) feeds
    /// the gas simulator, (4) tries the peephole fusions, (5) otherwise emits
    /// code for the single instruction and updates the register-definition
    /// tracking.
    pub fn compile(mut self, code: &[u8], bitmask: &[u8]) -> CompileResult {
        let code_len = code.len();

        // Emit prologue
        self.emit_prologue();

        // True single-pass: no pre-scan. Gas block starts (ϖ) are discovered
        // inline — PC=0 is always a gas block start, and after every terminator
        // instruction the next PC becomes a gas block start.
        let mut gas_sim = GasSimulator::new();
        // State of the gas block currently being accumulated; the tuple is
        // destructured below as (stub_label, block_pc, patch_offset).
        let mut pending_gas: Option<(Label, u32, usize)> = None;
        // Tracks whether the next instruction starts a new gas block.
        // True initially for PC=0.
        let mut next_is_gas_start = true;

        // Find first instruction start
        // (bytes beyond the end of the bitmask are not treated as starts here).
        let mut pc: usize = 0;
        while pc < code.len() && (pc >= bitmask.len() || bitmask[pc] != 1) {
            pc += 1;
        }

        let code_ptr = code.as_ptr();

        while pc < code.len() {
            // Ask the assembler for headroom before emitting this instruction.
            self.asm.ensure_capacity(512);

            // SAFETY: pc < code_len is guaranteed by the loop condition.
            let raw_byte = unsafe { *code_ptr.add(pc) };
            // Latch and clear the flag so each path below only has to set it
            // again when the current instruction is a terminator.
            let is_gas_start = next_is_gas_start;
            next_is_gas_start = false;

            // Fast skip for Fallthrough/Unlikely: these produce zero native code
            // but ARE terminators, so the next instruction starts a new gas block.
            if raw_byte == 1 || raw_byte == 2 {
                // Fallthrough=1, Unlikely=2
                let skip = javm_exec::gas_cost::skip_distance(bitmask, pc);
                if is_gas_start {
                    self.emit_gas_block_start(pc, &mut pending_gas, &mut gas_sim);
                }
                gas_sim.feed(&javm_exec::gas_cost::FastCost {
                    cycles: 2,
                    decode_slots: 1,
                    exec_unit: 0,
                    src_mask: 0,
                    dst_mask: 0,
                    is_terminator: true,
                    is_move_reg: false,
                });
                next_is_gas_start = true; // fallthrough IS a terminator
                pc += 1 + skip;
                continue;
            }

            // Combined opcode validation + category lookup in a single array access.
            let (opcode, category) = match javm_exec::instruction::decode_opcode_fast(raw_byte) {
                Some(oc) => oc,
                None => {
                    // Undecodable opcode: record the PC and emit a panic exit.
                    // NOTE(review): this path advances by a single byte rather
                    // than to the next bitmask-marked instruction start, does
                    // not feed the gas simulator, and drops a pending
                    // `is_gas_start` without opening a gas block — confirm all
                    // three match the reference interpreter.
                    self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
                    self.emit_exit(EXIT_PANIC, 0);
                    pc += 1;
                    continue;
                }
            };
            let skip = javm_exec::gas_cost::skip_distance(bitmask, pc);
            let next_pc = (pc + 1 + skip) as u32;

            // Read register bytes once — used by both arg decoding and gas cost.
            // SAFETY: pc < code.len(). pc+1/pc+2 may be out of bounds for
            // instructions at the end, so we bounds-check those.
            let reg_byte1 = if pc + 1 < code.len() {
                unsafe { *code_ptr.add(pc + 1) }
            } else {
                0
            };
            let reg_byte2 = if pc + 2 < code.len() {
                unsafe { *code_ptr.add(pc + 2) }
            } else {
                0
            };
            // Low / high nibble of the first operand byte.
            let raw_ra = reg_byte1 & 0x0F;
            let raw_rb = reg_byte1 >> 4;

            // Operand decoding by instruction category. Register nibbles are
            // clamped to 12 (the highest PVM register index); immediate lengths
            // are derived from `skip` and clamped to 4 bytes.
            let decoded_args = match category {
                javm_exec::instruction::InstructionCategory::ThreeReg => Args::ThreeReg {
                    ra: raw_ra.min(12) as usize,
                    rb: raw_rb.min(12) as usize,
                    rd: reg_byte2.min(12) as usize,
                },
                javm_exec::instruction::InstructionCategory::TwoReg => Args::TwoReg {
                    rd: raw_ra.min(12) as usize,
                    ra: raw_rb.min(12) as usize,
                },
                javm_exec::instruction::InstructionCategory::TwoRegOneImm => {
                    let ra = raw_ra.min(12) as usize;
                    let rb = raw_rb.min(12) as usize;
                    let lx = if skip > 1 { (skip - 1).min(4) } else { 0 };
                    let imm = args::read_signed_imm(code, pc + 2, lx);
                    Args::TwoRegImm { ra, rb, imm }
                }
                javm_exec::instruction::InstructionCategory::NoArgs => Args::None,
                javm_exec::instruction::InstructionCategory::OneImm => {
                    let lx = skip.min(4);
                    Args::Imm {
                        imm: args::read_signed_imm(code, pc + 1, lx),
                    }
                }
                javm_exec::instruction::InstructionCategory::OneRegOneImm => {
                    let ra = raw_ra.min(12) as usize;
                    let lx = if skip > 1 { (skip - 1).min(4) } else { 0 };
                    Args::RegImm {
                        ra,
                        imm: args::read_signed_imm(code, pc + 2, lx),
                    }
                }
                javm_exec::instruction::InstructionCategory::OneRegExtImm => {
                    let ra = raw_ra.min(12) as usize;
                    Args::RegExtImm {
                        ra,
                        // Fixed 8-byte little-endian immediate.
                        imm: args::read_le_imm(code, pc + 2, 8),
                    }
                }
                javm_exec::instruction::InstructionCategory::TwoImm => {
                    // First immediate's length comes from the operand byte;
                    // the second takes whatever remains of the instruction.
                    let lx = (reg_byte1 as usize % 8).min(4);
                    let ly = if skip > lx + 1 {
                        (skip - lx - 1).min(4)
                    } else {
                        0
                    };
                    Args::TwoImm {
                        imm_x: args::read_signed_imm(code, pc + 2, lx),
                        imm_y: args::read_signed_imm(code, pc + 2 + lx, ly),
                    }
                }
                javm_exec::instruction::InstructionCategory::OneOffset => {
                    let lx = skip.min(4);
                    let signed_off = args::read_signed_imm(code, pc + 1, lx) as i64;
                    Args::Offset {
                        // Branch targets are PC-relative: target = pc + offset.
                        offset: (pc as i64).wrapping_add(signed_off) as u64,
                    }
                }
                javm_exec::instruction::InstructionCategory::OneRegTwoImm => {
                    let ra = raw_ra.min(12) as usize;
                    let lx = ((reg_byte1 as usize / 16) % 8).min(4);
                    let ly = if skip > lx + 1 {
                        (skip - lx - 1).min(4)
                    } else {
                        0
                    };
                    Args::RegTwoImm {
                        ra,
                        imm_x: args::read_signed_imm(code, pc + 2, lx),
                        imm_y: args::read_signed_imm(code, pc + 2 + lx, ly),
                    }
                }
                javm_exec::instruction::InstructionCategory::OneRegImmOffset => {
                    let ra = raw_ra.min(12) as usize;
                    let lx = ((reg_byte1 as usize / 16) % 8).min(4);
                    let ly = if skip > lx + 1 {
                        (skip - lx - 1).min(4)
                    } else {
                        0
                    };
                    let imm = args::read_signed_imm(code, pc + 2, lx);
                    let signed_off = args::read_signed_imm(code, pc + 2 + lx, ly) as i64;
                    Args::RegImmOffset {
                        ra,
                        imm,
                        offset: (pc as i64).wrapping_add(signed_off) as u64,
                    }
                }
                javm_exec::instruction::InstructionCategory::TwoRegOneOffset => {
                    let ra = raw_ra.min(12) as usize;
                    let rb = raw_rb.min(12) as usize;
                    let lx = if skip > 1 { (skip - 1).min(4) } else { 0 };
                    let signed_off = args::read_signed_imm(code, pc + 2, lx) as i64;
                    Args::TwoRegOffset {
                        ra,
                        rb,
                        offset: (pc as i64).wrapping_add(signed_off) as u64,
                    }
                }
                javm_exec::instruction::InstructionCategory::TwoRegTwoImm => {
                    let ra = raw_ra.min(12) as usize;
                    let rb = raw_rb.min(12) as usize;
                    // Here the length selector lives in the second operand
                    // byte, so the immediates start one byte later (pc + 3).
                    let lx = (reg_byte2 as usize % 8).min(4);
                    let ly = if skip > lx + 2 {
                        (skip - lx - 2).min(4)
                    } else {
                        0
                    };
                    Args::TwoRegTwoImm {
                        ra,
                        rb,
                        imm_x: args::read_signed_imm(code, pc + 3, lx),
                        imm_y: args::read_signed_imm(code, pc + 3 + lx, ly),
                    }
                }
            };

            // Gas block boundary: discovered inline via next_is_gas_start flag.
            if is_gas_start {
                self.emit_gas_block_start(pc, &mut pending_gas, &mut gas_sim);
            }

            let is_terminator = {
                // Fast path: feed gas simulator directly from register bytes,
                // skipping FastCost struct construction and bitmask iteration.
                let (term, needs_full) = javm_exec::gas_cost::feed_gas_direct(
                    opcode as u8,
                    raw_ra,
                    raw_rb,
                    reg_byte2 & 0x0F,
                    &mut gas_sim,
                    self.mem_cycles,
                );
                if needs_full {
                    // Slow path for branches/overlap/move: use full FastCost
                    let fc = javm_exec::gas_cost::fast_cost_lut_regs(
                        opcode as u8,
                        &decoded_args,
                        pc,
                        code,
                        bitmask,
                        raw_ra,
                        raw_rb,
                        reg_byte2 & 0x0F,
                        self.mem_cycles,
                    );
                    gas_sim.feed(&fc);
                    fc.is_terminator
                } else {
                    term
                }
            };

            // Peephole fusions
            // Each returns `Some(advance)` — the number of code bytes consumed
            // starting at `pc` — when it emitted a fused sequence. Note the
            // current instruction has already been fed to the gas simulator
            // above, so a fusion only accounts for the instructions after it.
            let fused = match opcode {
                Opcode::Add64 => {
                    self.try_fuse_scaled_index_raw(code, bitmask, pc, &decoded_args, &mut gas_sim)
                }
                Opcode::Mul64 => {
                    self.try_fuse_mul_pair_raw(code, bitmask, pc, &decoded_args, &mut gas_sim)
                }
                _ => None,
            };

            if let Some(advance) = fused {
                self.last_add_cf = None; // fused instruction clobbers flags
                pc += advance;
                continue;
            }

            // Clear carry flag tracking for all opcodes except Add64 (which sets it)
            // and SetLtU (which consumes it inside compile_instruction).
            if !matches!(opcode, Opcode::Add64 | Opcode::SetLtU) {
                self.last_add_cf = None;
            }

            self.compile_instruction(opcode, &decoded_args, pc as u32, next_pc);

            // Fast reg_defs update: for special-case opcodes that produce
            // trackable patterns (Add64→Shifted, LoadImm→Const, etc.), call
            // the full update_reg_defs. For all other opcodes, just invalidate
            // the destination register directly from the decoded args. This
            // avoids the opcode match + Args re-destructuring for ~95% of
            // instructions.
            match opcode {
                Opcode::Add64
                | Opcode::LoadImm
                | Opcode::LoadImm64
                | Opcode::ShloLImm64
                | Opcode::MoveReg => {
                    self.update_reg_defs(opcode, &decoded_args);
                }
                _ => {
                    // Fast path: invalidate dest register based on category.
                    // The destination is the first register field for most categories.
                    match category {
                        javm_exec::instruction::InstructionCategory::ThreeReg => {
                            if let Args::ThreeReg { rd, .. } = decoded_args {
                                self.invalidate_reg(rd);
                            }
                        }
                        javm_exec::instruction::InstructionCategory::TwoReg => {
                            if let Args::TwoReg { rd, .. } = decoded_args {
                                self.invalidate_reg(rd);
                            }
                        }
                        javm_exec::instruction::InstructionCategory::TwoRegOneImm
                        | javm_exec::instruction::InstructionCategory::OneRegOneImm
                        | javm_exec::instruction::InstructionCategory::OneRegExtImm
                        | javm_exec::instruction::InstructionCategory::OneRegTwoImm
                        | javm_exec::instruction::InstructionCategory::OneRegImmOffset => {
                            // Destination = first register (ra in raw byte low nibble)
                            // For instructions in these categories that do not
                            // write a register this is merely conservative.
                            self.invalidate_reg(raw_ra.min(12) as usize);
                        }
                        _ => {
                            // NoArgs, OneImm, OneOffset, TwoRegOneOffset, TwoRegTwoImm:
                            // These either don't write to a register or are terminators
                            // (which invalidate_all_regs at the next gas block boundary).
                            if is_terminator {
                                self.invalidate_all_regs();
                            }
                        }
                    }
                }
            }

            // After a terminator, the next instruction starts a new gas block.
            if is_terminator {
                next_is_gas_start = true;
            }

            pc += 1 + skip;
        }

        // Finalize last gas block
        // Patch the accumulated cost into the placeholder at `patch_offset`
        // and record the block's out-of-gas stub.
        if let Some((stub_label, block_pc, patch_offset)) = pending_gas.take() {
            let cost = gas_sim.flush_and_get_cost();
            self.asm.patch_i32(patch_offset, cost as i32);
            self.oog_stubs.push((stub_label, block_pc, cost));
        }

        // Emit epilogue and exit sequences
        self.emit_exit_sequences();

        // Build dispatch table: PVM PC → native code offset.
        // gas_block_pcs was populated inline during the single-pass loop.
        // Entries for PCs that are not gas block starts stay at 0.
        let table_len = code_len + 1;
        let mut dispatch_table = vec![0i32; table_len];
        for &pvm_pc in self.gas_block_pcs.iter() {
            let label = Label(self.label_base + pvm_pc);
            if let Some(offset) = self.asm.label_offset(label) {
                dispatch_table[pvm_pc as usize] = offset as i32;
            }
        }
        // PC=0 must always be valid (program start); if not already set, it'll be
        // set by the first basic block at PC 0.

        // An unbound exit label is reported as offset 0.
        let exit_label_offset = self.asm.label_offset(self.exit_label).unwrap_or(0) as u32;
        let trap_table = self.trap_entries;

        CompileResult {
            native_code: self.asm.finalize(),
            dispatch_table,
            trap_table,
            exit_label_offset,
        }
    }
615
616    /// Save caller-saved registers (PVM registers in caller-saved x86-64 regs).
617    #[allow(dead_code)]
618    fn save_caller_saved(&mut self) {
619        for &reg in &CALLER_SAVED {
620            self.asm.push(reg);
621        }
622    }
623
624    /// Restore caller-saved registers (reverse order).
625    #[allow(dead_code)]
626    fn restore_caller_saved(&mut self) {
627        for &reg in CALLER_SAVED.iter().rev() {
628            self.asm.pop(reg);
629        }
630    }
631
632    /// Peephole: fuse scaled-index from raw code (no pre-decoded array).
633    /// Pattern: add64 D,A,A / add64 D,D,D / add64 D2,BASE,D / load/store_ind R,D2,0
634    fn try_fuse_scaled_index_raw(
635        &mut self,
636        code: &[u8],
637        bitmask: &[u8],
638        pc: usize,
639        args: &Args,
640        gas_sim: &mut GasSimulator,
641    ) -> Option<usize> {
642        let Args::ThreeReg {
643            ra: a1_ra,
644            rb: a1_rb,
645            rd: a1_rd,
646        } = args
647        else {
648            return None;
649        };
650        if a1_ra != a1_rb {
651            return None;
652        }
653        let idx_reg = *a1_ra;
654        let d1 = *a1_rd;
655
656        // Peek instruction 2
657        let skip1 = compute_skip(pc, bitmask);
658        let pc2 = pc + 1 + skip1;
659        if pc2 >= code.len() || (pc2 < bitmask.len() && bitmask[pc2] != 1) {
660            return None;
661        }
662        let op2 = Opcode::from_byte(code[pc2])?;
663        if op2 != Opcode::Add64 {
664            return None;
665        }
666        let skip2 = compute_skip(pc2, bitmask);
667        let args2 = args::decode_args(code, pc2, skip2, op2.category());
668        let Args::ThreeReg {
669            ra: a2_ra,
670            rb: a2_rb,
671            rd: a2_rd,
672        } = args2
673        else {
674            return None;
675        };
676        if a2_ra != d1 || a2_rb != d1 || a2_rd != d1 {
677            return None;
678        }
679
680        // Peek instruction 3
681        let pc3 = pc2 + 1 + skip2;
682        if pc3 >= code.len() || (pc3 < bitmask.len() && bitmask[pc3] != 1) {
683            return None;
684        }
685        let op3 = Opcode::from_byte(code[pc3])?;
686        if op3 != Opcode::Add64 {
687            return None;
688        }
689        let skip3 = compute_skip(pc3, bitmask);
690        let args3 = args::decode_args(code, pc3, skip3, op3.category());
691        let Args::ThreeReg {
692            ra: a3_ra,
693            rb: a3_rb,
694            rd: a3_rd,
695        } = args3
696        else {
697            return None;
698        };
699        let base_reg;
700        if a3_rb == d1 && a3_ra != d1 {
701            base_reg = a3_ra;
702        } else if a3_ra == d1 && a3_rb != d1 {
703            base_reg = a3_rb;
704        } else {
705            return None;
706        }
707        let addr_reg = a3_rd;
708
709        // Peek instruction 4
710        let pc4 = pc3 + 1 + skip3;
711        if pc4 >= code.len() || (pc4 < bitmask.len() && bitmask[pc4] != 1) {
712            return None;
713        }
714        let op4 = Opcode::from_byte(code[pc4])?;
715        let skip4 = compute_skip(pc4, bitmask);
716        let args4 = args::decode_args(code, pc4, skip4, op4.category());
717
718        // Feed instructions 2-4 to gas sim (using decoded args, no redundant decode)
719        for &(opc, a, p) in &[(op2, &args2, pc2), (op3, &args3, pc3), (op4, &args4, pc4)] {
720            let fc = javm_exec::gas_cost::fast_cost_from_decoded(
721                opc as u8,
722                a,
723                p as u32,
724                code,
725                bitmask,
726                self.mem_cycles,
727            );
728            gas_sim.feed(&fc);
729        }
730
731        // Bind labels for all 4 instructions
732        // With post-terminator-only gas blocks, fused instructions (add, mul,
733        // load, store) are never terminators, so none of these PCs are gas block
734        // starts. No label binding needed.
735
736        match op4 {
737            Opcode::LoadIndU8
738            | Opcode::LoadIndI8
739            | Opcode::LoadIndU16
740            | Opcode::LoadIndI16
741            | Opcode::LoadIndU32
742            | Opcode::LoadIndI32
743            | Opcode::LoadIndU64 => {
744                let Args::TwoRegImm { ra, rb, imm } = args4 else {
745                    return None;
746                };
747                if rb != addr_reg || imm as i32 != 0 {
748                    return None;
749                }
750                self.asm
751                    .lea_sib_scaled_32(SCRATCH, REG_MAP[base_reg], REG_MAP[idx_reg], 2);
752                let fn_addr = self.read_fn_for(op4);
753                let ra_reg = REG_MAP[ra];
754                self.emit_mem_read(ra_reg, SCRATCH, fn_addr, pc4 as u32);
755                self.emit_sign_extend(op4, ra_reg);
756                self.invalidate_all_regs();
757                Some(pc4 + 1 + skip4 - pc)
758            }
759            Opcode::StoreIndU8
760            | Opcode::StoreIndU16
761            | Opcode::StoreIndU32
762            | Opcode::StoreIndU64 => {
763                let Args::TwoRegImm { ra, rb, imm } = args4 else {
764                    return None;
765                };
766                if rb != addr_reg || imm as i32 != 0 {
767                    return None;
768                }
769                self.asm
770                    .lea_sib_scaled_32(SCRATCH, REG_MAP[base_reg], REG_MAP[idx_reg], 2);
771                let fn_addr = self.write_fn_for(op4);
772                let ra_reg = REG_MAP[ra];
773                self.emit_mem_write(true, ra_reg, fn_addr, pc4 as u32);
774                self.invalidate_all_regs();
775                Some(pc4 + 1 + skip4 - pc)
776            }
777            _ => None,
778        }
779    }
780
781    /// Peephole: fuse `mul_64 D_lo, A, B` + `mul_upper_{uu,ss} D_hi, A, B`
782    /// (same A and B) into a single x86 `MUL`/`IMUL` that computes the
783    /// full 128-bit product in one shot (`RDX:RAX = A*B`).
784    ///
785    /// Hot in Goldilocks `mul`: the `(a as u128)*(b as u128)` decomposition
786    /// emits the PVM pair on identical operands; the standalone codegen
787    /// would do two separate full 64-bit multiplies (one for low, one
788    /// for high).
789    ///
790    /// Skipped for `mul_upper_su` (mixed signedness): handled by its
791    /// dedicated emit path, which needs the sign-correction.
792    fn try_fuse_mul_pair_raw(
793        &mut self,
794        code: &[u8],
795        bitmask: &[u8],
796        pc: usize,
797        args: &Args,
798        gas_sim: &mut GasSimulator,
799    ) -> Option<usize> {
800        let Args::ThreeReg {
801            ra: m_ra,
802            rb: m_rb,
803            rd: m_rd,
804        } = args
805        else {
806            return None;
807        };
808
809        let skip1 = compute_skip(pc, bitmask);
810        let pc2 = pc + 1 + skip1;
811        if pc2 >= code.len() || (pc2 < bitmask.len() && bitmask[pc2] != 1) {
812            return None;
813        }
814        let op2 = Opcode::from_byte(code[pc2])?;
815        let signed = match op2 {
816            Opcode::MulUpperSS => true,
817            Opcode::MulUpperUU => false,
818            _ => return None,
819        };
820        let skip2 = compute_skip(pc2, bitmask);
821        let args2 = args::decode_args(code, pc2, skip2, op2.category());
822        let Args::ThreeReg {
823            ra: u_ra,
824            rb: u_rb,
825            rd: u_rd,
826        } = args2
827        else {
828            return None;
829        };
830        if u_ra != *m_ra || u_rb != *m_rb {
831            return None;
832        }
833        // Disallow rd_lo == rd_hi (would only deliver one of the two products).
834        if *m_rd == u_rd {
835            return None;
836        }
837
838        // Feed instruction 2 to gas sim (using decoded args, no redundant decode).
839        // Fused mul-pair instructions are never terminators — no gas block binding.
840        let fc = javm_exec::gas_cost::fast_cost_from_decoded(
841            op2 as u8,
842            &args2,
843            pc2 as u32,
844            code,
845            bitmask,
846            self.mem_cycles,
847        );
848        gas_sim.feed(&fc);
849
850        let (a, b) = (REG_MAP[*m_ra], REG_MAP[*m_rb]);
851        let (rd_lo, rd_hi) = (REG_MAP[*m_rd], REG_MAP[u_rd]);
852        let phi11 = REG_MAP[11]; // RAX
853        debug_assert_eq!(phi11, Reg::RAX);
854
855        // Strategy:
856        //   1. Get A's value into RAX (preserve φ[11] if it's neither rd_lo
857        //      nor rd_hi).
858        //   2. Get B's value into a non-RAX, non-RDX register (mul_src).
859        //   3. MUL/IMUL mul_src → RDX:RAX.
860        //   4. Move RAX → rd_lo (skip if rd_lo is RAX, value already there).
861        //   5. Move RDX → rd_hi (skip if rd_hi is RDX = SCRATCH, but SCRATCH
862        //      isn't a PVM register, so rd_hi is always ≠ RDX).
863        //   6. Restore φ[11] from stack if we saved it.
864        //
865        // Order of moves matters when rd_lo or rd_hi aliases A or B:
866        //   - rd_hi aliases A: writing rd_hi clobbers A's home, but A's
867        //     value was already consumed by mul. OK.
868        //   - rd_lo aliases B: writing rd_lo overwrites B's home. mul has
869        //     already consumed B. OK.
870        //   - rd_lo == RAX: RAX already holds low; skip the mov.
871        //
872        // We preserve φ[11] when neither rd writes to it (the rest of the
873        // program may still expect RAX to hold φ[11]'s value).
874        let need_save_phi11 = rd_lo != phi11 && rd_hi != phi11;
875
876        if need_save_phi11 {
877            self.asm.push(phi11);
878        }
879
880        // If B is RAX, its value is now either in RAX (where mul wants A)
881        // or on the stack (above) if we saved. Load to SCRATCH before
882        // clobbering RAX with A.
883        let mul_src = if b == phi11 {
884            if need_save_phi11 {
885                // φ[11]'s original value is at [RSP], which is B's value.
886                self.asm.mov_load64(SCRATCH, Reg::RSP, 0);
887            } else {
888                // Didn't save; B's value is still in RAX. But we're about
889                // to overwrite RAX with A. Stash B to SCRATCH first.
890                self.asm.mov_rr(SCRATCH, b);
891            }
892            SCRATCH
893        } else {
894            b
895        };
896
897        // Load A into RAX.
898        if a != phi11 {
899            self.asm.mov_rr(phi11, a);
900        }
901
902        if signed {
903            self.asm.imul_rdx_rax(mul_src);
904        } else {
905            self.asm.mul_rdx_rax(mul_src);
906        }
907
908        // Write rd_lo (from RAX) first. If rd_lo == phi11, no-op. If rd_lo
909        // == SCRATCH that's impossible (SCRATCH isn't a PVM reg).
910        if rd_lo != phi11 {
911            self.asm.mov_rr(rd_lo, phi11);
912        }
913        // Write rd_hi (from RDX = SCRATCH).
914        self.asm.mov_rr(rd_hi, SCRATCH);
915
916        if need_save_phi11 {
917            self.asm.pop(phi11);
918        }
919
920        self.invalidate_all_regs();
921        Some(pc2 + 1 + skip2 - pc)
922    }
923
924    /// Emit memory read. Address in SCRATCH (RDX). Result in dst.
925    /// Uses inline flat buffer access with helper fallback for cross-page.
926    fn emit_mem_read(&mut self, dst: Reg, _addr_reg: Reg, fn_addr: u64, pvm_pc: u32) {
927        self.emit_mem_read_sized(dst, fn_addr, 0, pvm_pc);
928    }
929
930    /// Emit memory read with bounds check (cold fault path).
931    /// Hot path: cmp + jae + load (2 instructions, no extra stores).
932    /// No bounds check — SIGSEGV handler catches OOB.
933    fn emit_mem_read_sized(&mut self, dst: Reg, fn_addr: u64, width_bytes: u32, pvm_pc: u32) {
934        let w = if width_bytes > 0 {
935            width_bytes
936        } else if fn_addr == self.helpers.mem_read_u8 {
937            1
938        } else if fn_addr == self.helpers.mem_read_u16 {
939            2
940        } else if fn_addr == self.helpers.mem_read_u32 {
941            4
942        } else {
943            8
944        };
945
946        // Record trap entry before the load instruction (for SIGSEGV handler).
947        self.trap_entries.push((self.asm.offset() as u32, pvm_pc));
948
949        match w {
950            1 => self.asm.movzx_load8_at_index(dst, SCRATCH),
951            2 => self.asm.movzx_load16_at_index(dst, SCRATCH),
952            4 => self.asm.mov_load32_at_index(dst, SCRATCH),
953            8 => self.asm.mov_load64_at_index(dst, SCRATCH),
954            _ => unreachable!(),
955        }
956    }
957
958    /// Emit sign extension after a memory load, if the opcode is a signed variant.
959    /// Handles both direct loads (LoadI8/I16/I32) and indirect loads (LoadIndI8/I16/I32).
960    fn emit_sign_extend(&mut self, opcode: Opcode, reg: Reg) {
961        match opcode {
962            Opcode::LoadI8 | Opcode::LoadIndI8 => self.asm.movsx_8_64(reg, reg),
963            Opcode::LoadI16 | Opcode::LoadIndI16 => self.asm.movsx_16_64(reg, reg),
964            Opcode::LoadI32 | Opcode::LoadIndI32 => self.asm.movsxd(reg, reg),
965            _ => {}
966        }
967    }
968
969    /// Emit memory write with bounds check (cold fault path).
970    /// No bounds check — SIGSEGV handler catches OOB.
971    fn emit_mem_write(&mut self, _addr_in_scratch: bool, val_reg: Reg, fn_addr: u64, pvm_pc: u32) {
972        let w = if fn_addr == self.helpers.mem_write_u8 {
973            1u32
974        } else if fn_addr == self.helpers.mem_write_u16 {
975            2
976        } else if fn_addr == self.helpers.mem_write_u32 {
977            4
978        } else {
979            8
980        };
981
982        // Record trap entry before the store instruction (for SIGSEGV handler).
983        self.trap_entries.push((self.asm.offset() as u32, pvm_pc));
984
985        match w {
986            1 => self.asm.mov_store8_at_index(SCRATCH, val_reg),
987            2 => self.asm.mov_store16_at_index(SCRATCH, val_reg),
988            4 => self.asm.mov_store32_at_index(SCRATCH, val_reg),
989            8 => self.asm.mov_store64_at_index(SCRATCH, val_reg),
990            _ => unreachable!(),
991        }
992    }
993
994    /// Emit store-immediate-indirect: store an immediate value to memory.
995    /// Inline SIB store (no function call needed).
996    ///
997    fn emit_store_imm_ind(
998        &mut self,
999        opcode: Opcode,
1000        ra: usize,
1001        imm_x: i32,
1002        imm_y: u64,
1003        _pvm_pc: u32,
1004    ) {
1005        // Compute address into SCRATCH
1006        self.emit_addr_to_scratch(ra, imm_x);
1007
1008        let fits_i32 = {
1009            let imm_i64 = imm_y as i64;
1010            imm_i64 >= i32::MIN as i64 && imm_i64 <= i32::MAX as i64
1011        };
1012
1013        // Record trap entry before the store instruction (for SIGSEGV handler).
1014        self.trap_entries.push((self.asm.offset() as u32, _pvm_pc));
1015
1016        match opcode {
1017            Opcode::StoreImmIndU8 => {
1018                self.asm.mov_store8_at_index_imm(SCRATCH, imm_y as u8);
1019            }
1020            Opcode::StoreImmIndU16 => {
1021                self.asm.mov_store16_at_index_imm(SCRATCH, imm_y as u16);
1022            }
1023            Opcode::StoreImmIndU32 => {
1024                self.asm.mov_store32_at_index_imm(SCRATCH, imm_y as i32);
1025            }
1026            Opcode::StoreImmIndU64 if fits_i32 => {
1027                // mov qword [SCRATCH], sign-extended imm32
1028                self.asm.mov_store64_at_index_imm(SCRATCH, imm_y as i32);
1029            }
1030            Opcode::StoreImmIndU64 => {
1031                // Value doesn't fit in sign-extended i32: use a temp register.
1032                self.asm.push(Reg::RCX);
1033                self.asm.mov_ri64(Reg::RCX, imm_y);
1034                self.asm.mov_store64_at_index(SCRATCH, Reg::RCX);
1035                self.asm.pop(Reg::RCX);
1036            }
1037            _ => unreachable!(),
1038        }
1039    }
1040
1041    /// Compute a memory address into SCRATCH, using peephole optimizations when available.
1042    fn emit_addr_to_scratch(&mut self, rb: usize, imm: i32) {
1043        // Peephole: fold known constant address (no register load needed)
1044        if let RegDef::Const(addr) = self.reg_defs[rb] {
1045            let effective = addr.wrapping_add(imm as u32);
1046            self.asm.mov_ri32(SCRATCH, effective);
1047            return;
1048        }
1049        // Peephole: use SIB addressing for scaled-index patterns
1050        if imm == 0
1051            && let RegDef::ScaledAdd { base, idx, shift } = self.reg_defs[rb]
1052        {
1053            self.asm
1054                .lea_sib_scaled_32(SCRATCH, REG_MAP[base], REG_MAP[idx], shift);
1055            return;
1056        }
1057        let rb_reg = REG_MAP[rb];
1058        if imm != 0 {
1059            // lea r32, [base + disp]: combines truncation to 32-bit and offset
1060            // addition in one instruction (saves ~2 bytes vs movzx + add).
1061            self.asm.lea_32(SCRATCH, rb_reg, imm);
1062        } else {
1063            self.asm.movzx_32_64(SCRATCH, rb_reg);
1064        }
1065    }
1066
1067    /// Invalidate any reg_defs that depend on `reg`, but NOT reg itself.
1068    #[inline]
1069    fn invalidate_dependents(&mut self, reg: usize) {
1070        // Only iterate registers that have active (non-Unknown) defs
1071        let mut active = self.reg_defs_active & !(1u16 << reg);
1072        while active != 0 {
1073            let i = active.trailing_zeros() as usize;
1074            active &= active - 1;
1075            let depends = match self.reg_defs[i] {
1076                RegDef::Shifted { src, .. } => src == reg,
1077                RegDef::ScaledAdd { base, idx, .. } => base == reg || idx == reg,
1078                _ => false,
1079            };
1080            if depends {
1081                self.reg_defs[i] = RegDef::Unknown;
1082                self.reg_defs_active &= !(1u16 << i);
1083            }
1084        }
1085    }
1086
1087    /// Invalidate a register's tracked definition and any dependents.
1088    #[inline]
1089    fn invalidate_reg(&mut self, reg: usize) {
1090        self.reg_defs[reg] = RegDef::Unknown;
1091        self.reg_defs_active &= !(1u16 << reg);
1092        self.invalidate_dependents(reg);
1093    }
1094
1095    /// Invalidate all register definitions (on block boundaries, calls, etc.)
1096    #[inline]
1097    fn invalidate_all_regs(&mut self) {
1098        self.reg_defs = [RegDef::Unknown; 13];
1099        self.reg_defs_active = 0;
1100    }
1101
1102    /// Emit gas block boundary: bind label, flush previous block cost, emit new gas check.
1103    ///
1104    /// Called at every gas block start (PC=0 and post-terminator PCs) to:
1105    /// 1. Bind the PC label for branch resolution
1106    /// 2. Patch the previous block's gas cost (deferred until block end)
1107    /// 3. Emit a new `sub [ctx+gas], cost; js oog_stub` sequence
1108    fn emit_gas_block_start(
1109        &mut self,
1110        pc: usize,
1111        pending_gas: &mut Option<(Label, u32, usize)>,
1112        gas_sim: &mut GasSimulator,
1113    ) {
1114        let label = Label(self.label_base + pc as u32);
1115        self.asm.bind_label(label);
1116        self.gas_block_pcs.push(pc as u32);
1117        self.invalidate_all_regs();
1118        self.last_add_cf = None; // gas check clobbers flags
1119
1120        if let Some((stub_label, block_pc, patch_offset)) = pending_gas.take() {
1121            let cost = gas_sim.flush_and_get_cost();
1122            self.asm.patch_i32(patch_offset, cost as i32);
1123            self.oog_stubs.push((stub_label, block_pc, cost));
1124        }
1125        gas_sim.reset();
1126
1127        let stub_label = self.asm.new_label();
1128        self.asm.sub_r64_imm32_patchable(GAS, 0);
1129        let patch_offset = self.asm.offset() - 4;
1130        self.asm.jcc_label(Cc::S, stub_label);
1131        *pending_gas = Some((stub_label, pc as u32, patch_offset));
1132    }
1133
1134    /// Update reg_defs after compiling an instruction.
1135    /// Opcodes that produce trackable patterns update positively;
1136    /// all others invalidate the destination register.
1137    fn update_reg_defs(&mut self, opcode: Opcode, args: &Args) {
1138        match opcode {
1139            Opcode::Add64 => {
1140                if let Args::ThreeReg { ra, rb, rd } = args {
1141                    if *ra == *rb && *ra == *rd {
1142                        // add64 D, D, D — doubles again. Shifted{src,s} → Shifted{src,s+1}.
1143                        if let RegDef::Shifted { src, shift } = self.reg_defs[*rd] {
1144                            if shift < 3 {
1145                                self.reg_defs[*rd] = RegDef::Shifted {
1146                                    src,
1147                                    shift: shift + 1,
1148                                };
1149                                self.reg_defs_active |= 1u16 << *rd;
1150                            } else {
1151                                self.reg_defs[*rd] = RegDef::Unknown;
1152                                self.reg_defs_active &= !(1u16 << *rd);
1153                            }
1154                        } else {
1155                            self.reg_defs[*rd] = RegDef::Unknown;
1156                            self.reg_defs_active &= !(1u16 << *rd);
1157                        }
1158                    } else if *ra == *rb {
1159                        // add64 D, A, A — D = A * 2 = A << 1
1160                        // Only track as Shifted when rd != ra; in-place doubling
1161                        // (rd == ra) overwrites the original value, making the
1162                        // Shifted{src} self-referential — ScaledAdd would then
1163                        // double-shift at emit time.
1164                        if *rd != *ra {
1165                            self.reg_defs[*rd] = RegDef::Shifted { src: *ra, shift: 1 };
1166                            self.reg_defs_active |= 1u16 << *rd;
1167                        } else {
1168                            self.reg_defs[*rd] = RegDef::Unknown;
1169                            self.reg_defs_active &= !(1u16 << *rd);
1170                        }
1171                    } else {
1172                        // add64 D, A, B — check if one operand is Shifted
1173                        let def = if let RegDef::Shifted { src, shift } = self.reg_defs[*rb] {
1174                            Some((*ra, src, shift))
1175                        } else if let RegDef::Shifted { src, shift } = self.reg_defs[*ra] {
1176                            Some((*rb, src, shift))
1177                        } else {
1178                            None
1179                        };
1180                        if let Some((base, idx, shift)) = def {
1181                            self.reg_defs[*rd] = RegDef::ScaledAdd { base, idx, shift };
1182                            self.reg_defs_active |= 1u16 << *rd;
1183                        } else {
1184                            self.reg_defs[*rd] = RegDef::Unknown;
1185                            self.reg_defs_active &= !(1u16 << *rd);
1186                        }
1187                    }
1188                    self.invalidate_dependents(*rd);
1189                }
1190            }
1191            Opcode::LoadImm => {
1192                if let Args::RegImm { ra, imm } = args {
1193                    self.reg_defs[*ra] = RegDef::Const(*imm as u32);
1194                    self.reg_defs_active |= 1u16 << *ra;
1195                    self.invalidate_dependents(*ra);
1196                }
1197            }
1198            Opcode::LoadImm64 => {
1199                if let Args::RegExtImm { ra, imm } = args {
1200                    self.reg_defs[*ra] = RegDef::Const(*imm as u32);
1201                    self.reg_defs_active |= 1u16 << *ra;
1202                    self.invalidate_dependents(*ra);
1203                }
1204            }
1205            // Track shift-left-immediate as Shifted for LEA-based scaled indexing.
1206            // sll_imm_64 rd, rb, shift → Shifted{src:rb, shift} if shift ∈ 1..=3.
1207            // This enables the peephole: sll + add + load → LEA + load with SIB scaling.
1208            Opcode::ShloLImm64 => {
1209                if let Args::TwoRegImm { ra, rb, imm } = args {
1210                    let shift = (*imm as u32 % 64) as u8;
1211                    // Only track as Shifted when ra != rb; in-place shifts
1212                    // overwrite the original value, making the Shifted{src}
1213                    // self-referential — ScaledAdd would double-shift at emit.
1214                    if (1..=3).contains(&shift) && ra != rb {
1215                        self.reg_defs[*ra] = RegDef::Shifted { src: *rb, shift };
1216                        self.reg_defs_active |= 1u16 << *ra;
1217                    } else {
1218                        self.reg_defs[*ra] = RegDef::Unknown;
1219                        self.reg_defs_active &= !(1u16 << *ra);
1220                    }
1221                    self.invalidate_dependents(*ra);
1222                }
1223            }
1224            Opcode::MoveReg => {
1225                if let Args::TwoReg { rd, ra } = args
1226                    && *rd != *ra
1227                {
1228                    // Propagate the source's definition to the destination.
1229                    self.reg_defs[*rd] = self.reg_defs[*ra];
1230                    if matches!(self.reg_defs[*rd], RegDef::Unknown) {
1231                        self.reg_defs_active &= !(1u16 << *rd);
1232                    } else {
1233                        self.reg_defs_active |= 1u16 << *rd;
1234                    }
1235                    self.invalidate_dependents(*rd);
1236                }
1237            }
1238            _ => {
1239                match args {
1240                    Args::ThreeReg { rd, .. } => self.invalidate_reg(*rd),
1241                    Args::TwoReg { rd, .. } => self.invalidate_reg(*rd),
1242                    Args::TwoRegImm { ra, .. } => self.invalidate_reg(*ra),
1243                    Args::RegImm { ra, .. } => self.invalidate_reg(*ra),
1244                    Args::RegExtImm { ra, .. } => self.invalidate_reg(*ra),
1245                    _ => {}
1246                }
1247                if opcode.is_terminator() {
1248                    self.invalidate_all_regs();
1249                }
1250            }
1251        }
1252    }
1253
1254    /// Compile a single PVM instruction.
1255    /// Caller must ensure the assembler has sufficient capacity (at least 256 bytes).
1256    #[inline(always)]
1257    fn compile_instruction(&mut self, opcode: Opcode, args: &Args, pc: u32, next_pc: u32) {
1258        match opcode {
1259            // === A.5.1: No arguments ===
1260            Opcode::Trap => {
1261                self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
1262                self.emit_exit(EXIT_TRAP, 0);
1263            }
1264            Opcode::Fallthrough | Opcode::Unlikely => {
1265                // Just fall through to next instruction.
1266                // Note: gas is already charged at basic block start above.
1267            }
1268
1269            // === A.5.1b: Ecall (management ops, no immediate) ===
1270            Opcode::Ecall => {
1271                self.asm.mov_store32_rip_rel_imm(CTX_PC, next_pc as i32);
1272                self.emit_exit(EXIT_ECALL, 0);
1273            }
1274
1275            // === A.5.2: One immediate ===
1276            Opcode::Ecalli => {
1277                if let Args::Imm { imm } = args {
1278                    let cap_slot = *imm as u32;
1279                    // v3: every ecalli is the slow-path exit through the
1280                    // EcallHandler. The `original_bitmap` GAS fast-path
1281                    // present in v2 is stripped — there is no protocol cap
1282                    // in v3, and gas-debit will reattach in Stage 3 with
1283                    // the kernel-assisted Gas Instance.
1284                    self.asm.mov_store32_rip_rel_imm(CTX_PC, next_pc as i32);
1285                    self.emit_exit(EXIT_HOST_CALL, cap_slot);
1286                }
1287            }
1288
1289            // === A.5.3: One register + extended immediate ===
1290            Opcode::LoadImm64 => {
1291                if let Args::RegExtImm { ra, imm } = args {
1292                    self.asm.mov_ri64(REG_MAP[*ra], *imm);
1293                }
1294            }
1295
1296            // === A.5.4: Two immediates (store_imm) ===
1297            Opcode::StoreImmU8
1298            | Opcode::StoreImmU16
1299            | Opcode::StoreImmU32
1300            | Opcode::StoreImmU64 => {
1301                if let Args::TwoImm { imm_x, imm_y } = args {
1302                    // Reuse StoreImmInd logic: treat as register 0 with the address
1303                    // replaced by a direct constant load into SCRATCH.
1304                    let addr = *imm_x as u32;
1305                    self.asm.mov_ri32(SCRATCH, addr);
1306                    let imm_val = *imm_y;
1307
1308                    let fits_i32 = {
1309                        let imm_i64 = imm_val as i64;
1310                        imm_i64 >= i32::MIN as i64 && imm_i64 <= i32::MAX as i64
1311                    };
1312
1313                    // Record trap entry before the store instruction (for SIGSEGV handler).
1314                    self.trap_entries.push((self.asm.offset() as u32, pc));
1315                    match opcode {
1316                        Opcode::StoreImmU8 => {
1317                            self.asm.mov_store8_at_index_imm(SCRATCH, imm_val as u8);
1318                        }
1319                        Opcode::StoreImmU16 => {
1320                            self.asm.mov_store16_at_index_imm(SCRATCH, imm_val as u16);
1321                        }
1322                        Opcode::StoreImmU32 => {
1323                            self.asm.mov_store32_at_index_imm(SCRATCH, imm_val as i32);
1324                        }
1325                        Opcode::StoreImmU64 if fits_i32 => {
1326                            self.asm.mov_store64_at_index_imm(SCRATCH, imm_val as i32);
1327                        }
1328                        Opcode::StoreImmU64 => {
1329                            self.asm.push(Reg::RCX);
1330                            self.asm.mov_ri64(Reg::RCX, imm_val);
1331                            self.asm.mov_store64_at_index(SCRATCH, Reg::RCX);
1332                            self.asm.pop(Reg::RCX);
1333                        }
1334                        _ => unreachable!(),
1335                    }
1336                }
1337            }
1338
1339            // === A.5.5: One offset (jump) ===
1340            Opcode::Jump => {
1341                if let Args::Offset { offset } = args {
1342                    self.emit_static_branch(*offset as u32, true, next_pc, pc);
1343                }
1344            }
1345
1346            // === A.5.6: One register + one immediate ===
1347            Opcode::JumpInd => {
1348                if let Args::RegImm { ra, imm } = args {
1349                    self.emit_dynamic_jump(*ra, *imm, pc);
1350                }
1351            }
1352            Opcode::LoadImm => {
1353                if let Args::RegImm { ra, imm } = args {
1354                    self.asm.mov_ri64(REG_MAP[*ra], *imm);
1355                }
1356            }
1357            Opcode::LoadU8
1358            | Opcode::LoadI8
1359            | Opcode::LoadU16
1360            | Opcode::LoadI16
1361            | Opcode::LoadU32
1362            | Opcode::LoadI32
1363            | Opcode::LoadU64 => {
1364                if let Args::RegImm { ra, imm } = args {
1365                    let addr = *imm as u32;
1366                    let fn_addr = self.read_fn_for(opcode);
1367                    self.asm.mov_ri32(SCRATCH, addr);
1368                    let ra_reg = REG_MAP[*ra];
1369                    self.emit_mem_read(ra_reg, SCRATCH, fn_addr, pc);
1370                    self.emit_sign_extend(opcode, ra_reg);
1371                }
1372            }
1373            Opcode::StoreU8 | Opcode::StoreU16 | Opcode::StoreU32 | Opcode::StoreU64 => {
1374                if let Args::RegImm { ra, imm } = args {
1375                    let addr = *imm as u32;
1376                    let ra_reg = REG_MAP[*ra];
1377                    let fn_addr = self.write_fn_for(opcode);
1378                    self.asm.mov_ri32(SCRATCH, addr);
1379                    self.emit_mem_write(true, ra_reg, fn_addr, pc);
1380                }
1381            }
1382
1383            // === A.5.7: One register + two immediates (store_imm_ind) ===
1384            Opcode::StoreImmIndU8
1385            | Opcode::StoreImmIndU16
1386            | Opcode::StoreImmIndU32
1387            | Opcode::StoreImmIndU64 => {
1388                if let Args::RegTwoImm { ra, imm_x, imm_y } = args {
1389                    self.emit_store_imm_ind(opcode, *ra, *imm_x as i32, *imm_y, pc);
1390                }
1391            }
1392
1393            // === A.5.8: One register + immediate + offset ===
1394            Opcode::LoadImmJump => {
1395                if let Args::RegImmOffset { ra, imm, offset } = args {
1396                    self.asm.mov_ri64(REG_MAP[*ra], *imm);
1397                    self.emit_static_branch(*offset as u32, true, next_pc, pc);
1398                }
1399            }
1400            Opcode::BranchEqImm => {
1401                if let Args::RegImmOffset { ra, imm, offset } = args {
1402                    let ra_reg = REG_MAP[*ra];
1403                    self.emit_branch_imm(ra_reg, *imm, Cc::E, *offset as u32, next_pc, pc);
1404                }
1405            }
1406            Opcode::BranchNeImm => {
1407                if let Args::RegImmOffset { ra, imm, offset } = args {
1408                    let ra_reg = REG_MAP[*ra];
1409                    self.emit_branch_imm(ra_reg, *imm, Cc::NE, *offset as u32, next_pc, pc);
1410                }
1411            }
1412            Opcode::BranchLtUImm => {
1413                if let Args::RegImmOffset { ra, imm, offset } = args {
1414                    let ra_reg = REG_MAP[*ra];
1415                    self.emit_branch_imm(ra_reg, *imm, Cc::B, *offset as u32, next_pc, pc);
1416                }
1417            }
1418            Opcode::BranchLeUImm => {
1419                if let Args::RegImmOffset { ra, imm, offset } = args {
1420                    let ra_reg = REG_MAP[*ra];
1421                    self.emit_branch_imm(ra_reg, *imm, Cc::BE, *offset as u32, next_pc, pc);
1422                }
1423            }
1424            Opcode::BranchGeUImm => {
1425                if let Args::RegImmOffset { ra, imm, offset } = args {
1426                    let ra_reg = REG_MAP[*ra];
1427                    self.emit_branch_imm(ra_reg, *imm, Cc::AE, *offset as u32, next_pc, pc);
1428                }
1429            }
1430            Opcode::BranchGtUImm => {
1431                if let Args::RegImmOffset { ra, imm, offset } = args {
1432                    let ra_reg = REG_MAP[*ra];
1433                    self.emit_branch_imm(ra_reg, *imm, Cc::A, *offset as u32, next_pc, pc);
1434                }
1435            }
1436            Opcode::BranchLtSImm => {
1437                if let Args::RegImmOffset { ra, imm, offset } = args {
1438                    let ra_reg = REG_MAP[*ra];
1439                    self.emit_branch_imm(ra_reg, *imm, Cc::L, *offset as u32, next_pc, pc);
1440                }
1441            }
1442            Opcode::BranchLeSImm => {
1443                if let Args::RegImmOffset { ra, imm, offset } = args {
1444                    let ra_reg = REG_MAP[*ra];
1445                    self.emit_branch_imm(ra_reg, *imm, Cc::LE, *offset as u32, next_pc, pc);
1446                }
1447            }
1448            Opcode::BranchGeSImm => {
1449                if let Args::RegImmOffset { ra, imm, offset } = args {
1450                    let ra_reg = REG_MAP[*ra];
1451                    self.emit_branch_imm(ra_reg, *imm, Cc::GE, *offset as u32, next_pc, pc);
1452                }
1453            }
1454            Opcode::BranchGtSImm => {
1455                if let Args::RegImmOffset { ra, imm, offset } = args {
1456                    let ra_reg = REG_MAP[*ra];
1457                    self.emit_branch_imm(ra_reg, *imm, Cc::G, *offset as u32, next_pc, pc);
1458                }
1459            }
1460
1461            // === A.5.9: Two registers ===
1462            Opcode::MoveReg => {
1463                if let Args::TwoReg { rd, ra } = args {
1464                    let ra_reg = REG_MAP[*ra];
1465                    self.asm.mov_rr(REG_MAP[*rd], ra_reg);
1466                }
1467            }
1468            Opcode::Sbrk => {
1469                // JAR v0.8.0: sbrk removed from ISA, replaced by grow_heap hostcall
1470                self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
1471                self.emit_exit(EXIT_PANIC, 0);
1472            }
1473            Opcode::CountSetBits64 => {
1474                if let Args::TwoReg { rd, ra } = args {
1475                    let ra_reg = REG_MAP[*ra];
1476                    self.asm.popcnt64(REG_MAP[*rd], ra_reg);
1477                }
1478            }
1479            Opcode::CountSetBits32 => {
1480                if let Args::TwoReg { rd, ra } = args {
1481                    let ra_reg = REG_MAP[*ra];
1482                    // popcnt32 counts set bits of 32-bit value, zero-extends result
1483                    self.asm.popcnt32(REG_MAP[*rd], ra_reg);
1484                }
1485            }
1486            Opcode::LeadingZeroBits64 => {
1487                if let Args::TwoReg { rd, ra } = args {
1488                    let ra_reg = REG_MAP[*ra];
1489                    self.asm.lzcnt64(REG_MAP[*rd], ra_reg);
1490                }
1491            }
1492            Opcode::LeadingZeroBits32 => {
1493                if let Args::TwoReg { rd, ra } = args {
1494                    let ra_reg = REG_MAP[*ra];
1495                    // lzcnt32 counts leading zeros of 32-bit value, zero-extends result
1496                    self.asm.lzcnt32(REG_MAP[*rd], ra_reg);
1497                }
1498            }
1499            Opcode::TrailingZeroBits64 => {
1500                if let Args::TwoReg { rd, ra } = args {
1501                    let ra_reg = REG_MAP[*ra];
1502                    self.asm.tzcnt64(REG_MAP[*rd], ra_reg);
1503                }
1504            }
1505            Opcode::TrailingZeroBits32 => {
1506                if let Args::TwoReg { rd, ra } = args {
1507                    let ra_reg = REG_MAP[*ra];
1508                    // tzcnt32 returns 32 for zero input and zero-extends result to 64 bits
1509                    self.asm.tzcnt32(REG_MAP[*rd], ra_reg);
1510                }
1511            }
1512            Opcode::SignExtend8 => {
1513                if let Args::TwoReg { rd, ra } = args {
1514                    let ra_reg = REG_MAP[*ra];
1515                    self.asm.movsx_8_64(REG_MAP[*rd], ra_reg);
1516                }
1517            }
1518            Opcode::SignExtend16 => {
1519                if let Args::TwoReg { rd, ra } = args {
1520                    let ra_reg = REG_MAP[*ra];
1521                    self.asm.movsx_16_64(REG_MAP[*rd], ra_reg);
1522                }
1523            }
1524            Opcode::ZeroExtend16 => {
1525                if let Args::TwoReg { rd, ra } = args {
1526                    let ra_reg = REG_MAP[*ra];
1527                    self.asm.movzx_16_64(REG_MAP[*rd], ra_reg);
1528                }
1529            }
1530            Opcode::ReverseBytes => {
1531                if let Args::TwoReg { rd, ra } = args {
1532                    let ra_reg = REG_MAP[*ra];
1533                    if *rd != *ra {
1534                        self.asm.mov_rr(REG_MAP[*rd], ra_reg);
1535                    }
1536                    self.asm.bswap64(REG_MAP[*rd]);
1537                }
1538            }
1539
1540            // === A.5.10: Two registers + one immediate ===
1541            Opcode::StoreIndU8
1542            | Opcode::StoreIndU16
1543            | Opcode::StoreIndU32
1544            | Opcode::StoreIndU64 => {
1545                if let Args::TwoRegImm { ra, rb, imm } = args {
1546                    let ra_reg = REG_MAP[*ra];
1547                    self.emit_addr_to_scratch(*rb, *imm as i32);
1548                    let fn_addr = self.write_fn_for(opcode);
1549                    self.emit_mem_write(true, ra_reg, fn_addr, pc);
1550                }
1551            }
1552            Opcode::LoadIndU8
1553            | Opcode::LoadIndI8
1554            | Opcode::LoadIndU16
1555            | Opcode::LoadIndI16
1556            | Opcode::LoadIndU32
1557            | Opcode::LoadIndI32
1558            | Opcode::LoadIndU64 => {
1559                if let Args::TwoRegImm { ra, rb, imm } = args {
1560                    let ra_reg = REG_MAP[*ra];
1561                    self.emit_addr_to_scratch(*rb, *imm as i32);
1562                    let fn_addr = self.read_fn_for(opcode);
1563                    self.emit_mem_read(ra_reg, SCRATCH, fn_addr, pc);
1564                    self.emit_sign_extend(opcode, ra_reg);
1565                }
1566            }
1567            Opcode::AddImm32 => {
1568                if let Args::TwoRegImm { ra, rb, imm } = args {
1569                    let rb_reg = REG_MAP[*rb];
1570                    if *ra != *rb {
1571                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1572                    }
1573                    self.asm.add_ri32(REG_MAP[*ra], *imm as i32);
1574                    self.asm.movsxd(REG_MAP[*ra], REG_MAP[*ra]);
1575                }
1576            }
1577            Opcode::AddImm64 => {
1578                if let Args::TwoRegImm { ra, rb, imm } = args {
1579                    let rb_reg = REG_MAP[*rb];
1580                    if *ra != *rb {
1581                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1582                    }
1583                    if *imm as i32 == 1 {
1584                        self.asm.inc64(REG_MAP[*ra]);
1585                    } else if *imm as i32 == -1 {
1586                        self.asm.dec64(REG_MAP[*ra]);
1587                    } else {
1588                        self.asm.add_ri(REG_MAP[*ra], *imm as i32);
1589                    }
1590                }
1591            }
1592            Opcode::AndImm => {
1593                if let Args::TwoRegImm { ra, rb, imm } = args {
1594                    let rb_reg = REG_MAP[*rb];
1595                    if *ra != *rb {
1596                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1597                    }
1598                    self.asm.and_ri(REG_MAP[*ra], *imm as i32);
1599                }
1600            }
1601            Opcode::XorImm => {
1602                if let Args::TwoRegImm { ra, rb, imm } = args {
1603                    let rb_reg = REG_MAP[*rb];
1604                    if *ra != *rb {
1605                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1606                    }
1607                    self.asm.xor_ri(REG_MAP[*ra], *imm as i32);
1608                }
1609            }
1610            Opcode::OrImm => {
1611                if let Args::TwoRegImm { ra, rb, imm } = args {
1612                    let rb_reg = REG_MAP[*rb];
1613                    if *ra != *rb {
1614                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1615                    }
1616                    self.asm.or_ri(REG_MAP[*ra], *imm as i32);
1617                }
1618            }
1619            Opcode::MulImm32 => {
1620                if let Args::TwoRegImm { ra, rb, imm } = args {
1621                    let rb_reg = REG_MAP[*rb];
1622                    self.asm.imul_rri32(REG_MAP[*ra], rb_reg, *imm as i32);
1623                    self.asm.movsxd(REG_MAP[*ra], REG_MAP[*ra]);
1624                }
1625            }
1626            Opcode::MulImm64 => {
1627                if let Args::TwoRegImm { ra, rb, imm } = args {
1628                    let rb_reg = REG_MAP[*rb];
1629                    self.asm.imul_rri(REG_MAP[*ra], rb_reg, *imm as i32);
1630                }
1631            }
1632            Opcode::SetLtUImm => {
1633                if let Args::TwoRegImm { ra, rb, imm } = args {
1634                    self.emit_setcc_imm(*ra, *rb, *imm, Cc::B);
1635                }
1636            }
1637            Opcode::SetLtSImm => {
1638                if let Args::TwoRegImm { ra, rb, imm } = args {
1639                    self.emit_setcc_imm(*ra, *rb, *imm, Cc::L);
1640                }
1641            }
1642            Opcode::SetGtUImm => {
1643                if let Args::TwoRegImm { ra, rb, imm } = args {
1644                    self.emit_setcc_imm(*ra, *rb, *imm, Cc::A);
1645                }
1646            }
1647            Opcode::SetGtSImm => {
1648                if let Args::TwoRegImm { ra, rb, imm } = args {
1649                    self.emit_setcc_imm(*ra, *rb, *imm, Cc::G);
1650                }
1651            }
1652            Opcode::ShloLImm32 => {
1653                if let Args::TwoRegImm { ra, rb, imm } = args {
1654                    let rb_reg = REG_MAP[*rb];
1655                    if *ra != *rb {
1656                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1657                    }
1658                    self.asm.shl_ri32(REG_MAP[*ra], (*imm as u8) & 31);
1659                    self.asm.movsxd(REG_MAP[*ra], REG_MAP[*ra]);
1660                }
1661            }
1662            Opcode::ShloRImm32 => {
1663                if let Args::TwoRegImm { ra, rb, imm } = args {
1664                    let rb_reg = REG_MAP[*rb];
1665                    if *ra != *rb {
1666                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1667                    }
1668                    self.asm.movzx_32_64(REG_MAP[*ra], REG_MAP[*ra]);
1669                    self.asm.shr_ri32(REG_MAP[*ra], (*imm as u8) & 31);
1670                    self.asm.movsxd(REG_MAP[*ra], REG_MAP[*ra]);
1671                }
1672            }
1673            Opcode::SharRImm32 => {
1674                if let Args::TwoRegImm { ra, rb, imm } = args {
1675                    let rb_reg = REG_MAP[*rb];
1676                    if *ra != *rb {
1677                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1678                    }
1679                    self.asm.sar_ri32(REG_MAP[*ra], (*imm as u8) & 31);
1680                    self.asm.movsxd(REG_MAP[*ra], REG_MAP[*ra]);
1681                }
1682            }
1683            Opcode::ShloLImm64 => {
1684                if let Args::TwoRegImm { ra, rb, imm } = args {
1685                    let rb_reg = REG_MAP[*rb];
1686                    if *ra != *rb {
1687                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1688                    }
1689                    self.asm.shl_ri64(REG_MAP[*ra], (*imm as u8) & 63);
1690                }
1691            }
1692            Opcode::ShloRImm64 => {
1693                if let Args::TwoRegImm { ra, rb, imm } = args {
1694                    let rb_reg = REG_MAP[*rb];
1695                    if *ra != *rb {
1696                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1697                    }
1698                    self.asm.shr_ri64(REG_MAP[*ra], (*imm as u8) & 63);
1699                }
1700            }
1701            Opcode::SharRImm64 => {
1702                if let Args::TwoRegImm { ra, rb, imm } = args {
1703                    let rb_reg = REG_MAP[*rb];
1704                    if *ra != *rb {
1705                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1706                    }
1707                    self.asm.sar_ri64(REG_MAP[*ra], (*imm as u8) & 63);
1708                }
1709            }
1710            Opcode::NegAddImm32 => {
1711                if let Args::TwoRegImm { ra, rb, imm } = args {
1712                    let rb_reg = REG_MAP[*rb];
1713                    // φ[ra] = imm - φ[rb] (32-bit)
1714                    if *ra == *rb {
1715                        self.asm.mov_rr(SCRATCH, rb_reg);
1716                        self.asm.mov_ri64(REG_MAP[*ra], *imm);
1717                        self.asm.sub_rr32(REG_MAP[*ra], SCRATCH);
1718                    } else {
1719                        self.asm.mov_ri64(REG_MAP[*ra], *imm);
1720                        self.asm.sub_rr32(REG_MAP[*ra], rb_reg);
1721                    }
1722                    self.asm.movsxd(REG_MAP[*ra], REG_MAP[*ra]);
1723                }
1724            }
1725            Opcode::NegAddImm64 => {
1726                if let Args::TwoRegImm { ra, rb, imm } = args {
1727                    let rb_reg = REG_MAP[*rb];
1728                    if *ra == *rb {
1729                        self.asm.mov_rr(SCRATCH, rb_reg);
1730                        self.asm.mov_ri64(REG_MAP[*ra], *imm);
1731                        self.asm.sub_rr(REG_MAP[*ra], SCRATCH);
1732                    } else {
1733                        self.asm.mov_ri64(REG_MAP[*ra], *imm);
1734                        self.asm.sub_rr(REG_MAP[*ra], rb_reg);
1735                    }
1736                }
1737            }
1738            // Alt shifts: φ[ra] = imm OP φ[rb] (operands swapped: imm is shifted, φ[rb] is the count)
1739            Opcode::ShloLImmAlt32 => {
1740                if let Args::TwoRegImm { ra, rb, imm } = args {
1741                    // φ[ra] = imm << (φ[rb] & 31)
1742                    let rb_reg = REG_MAP[*rb];
1743                    let shift_src = if *ra == *rb {
1744                        self.asm.mov_rr(SCRATCH, rb_reg);
1745                        SCRATCH
1746                    } else {
1747                        rb_reg
1748                    };
1749                    self.asm.mov_ri64(REG_MAP[*ra], *imm);
1750                    self.emit_shift_by_reg32(REG_MAP[*ra], shift_src, 4); // SHL
1751                    self.asm.movsxd(REG_MAP[*ra], REG_MAP[*ra]);
1752                }
1753            }
1754            Opcode::ShloRImmAlt32 => {
1755                if let Args::TwoRegImm { ra, rb, imm } = args {
1756                    let rb_reg = REG_MAP[*rb];
1757                    let shift_src = if *ra == *rb {
1758                        self.asm.mov_rr(SCRATCH, rb_reg);
1759                        SCRATCH
1760                    } else {
1761                        rb_reg
1762                    };
1763                    self.asm.mov_ri64(REG_MAP[*ra], *imm);
1764                    self.asm.movzx_32_64(REG_MAP[*ra], REG_MAP[*ra]);
1765                    self.emit_shift_by_reg32(REG_MAP[*ra], shift_src, 5); // SHR
1766                    self.asm.movsxd(REG_MAP[*ra], REG_MAP[*ra]);
1767                }
1768            }
1769            Opcode::SharRImmAlt32 => {
1770                if let Args::TwoRegImm { ra, rb, imm } = args {
1771                    let rb_reg = REG_MAP[*rb];
1772                    let shift_src = if *ra == *rb {
1773                        self.asm.mov_rr(SCRATCH, rb_reg);
1774                        SCRATCH
1775                    } else {
1776                        rb_reg
1777                    };
1778                    self.asm.mov_ri64(REG_MAP[*ra], *imm);
1779                    self.emit_shift_by_reg32(REG_MAP[*ra], shift_src, 7); // SAR
1780                    self.asm.movsxd(REG_MAP[*ra], REG_MAP[*ra]);
1781                }
1782            }
1783            Opcode::ShloLImmAlt64 => {
1784                if let Args::TwoRegImm { ra, rb, imm } = args {
1785                    let rb_reg = REG_MAP[*rb];
1786                    let shift_src = if *ra == *rb {
1787                        self.asm.mov_rr(SCRATCH, rb_reg);
1788                        SCRATCH
1789                    } else {
1790                        rb_reg
1791                    };
1792                    self.asm.mov_ri64(REG_MAP[*ra], *imm);
1793                    self.emit_shift_by_reg64(REG_MAP[*ra], shift_src, 4);
1794                }
1795            }
1796            Opcode::ShloRImmAlt64 => {
1797                if let Args::TwoRegImm { ra, rb, imm } = args {
1798                    let rb_reg = REG_MAP[*rb];
1799                    let shift_src = if *ra == *rb {
1800                        self.asm.mov_rr(SCRATCH, rb_reg);
1801                        SCRATCH
1802                    } else {
1803                        rb_reg
1804                    };
1805                    self.asm.mov_ri64(REG_MAP[*ra], *imm);
1806                    self.emit_shift_by_reg64(REG_MAP[*ra], shift_src, 5);
1807                }
1808            }
1809            Opcode::SharRImmAlt64 => {
1810                if let Args::TwoRegImm { ra, rb, imm } = args {
1811                    let rb_reg = REG_MAP[*rb];
1812                    let shift_src = if *ra == *rb {
1813                        self.asm.mov_rr(SCRATCH, rb_reg);
1814                        SCRATCH
1815                    } else {
1816                        rb_reg
1817                    };
1818                    self.asm.mov_ri64(REG_MAP[*ra], *imm);
1819                    self.emit_shift_by_reg64(REG_MAP[*ra], shift_src, 7);
1820                }
1821            }
1822            Opcode::CmovIzImm => {
1823                if let Args::TwoRegImm { ra, rb, imm } = args {
1824                    // if φ[rb] == 0 then φ[ra] = imm — branchless via cmov.
1825                    // Hot in Goldilocks `add` overflow-correction (~10% of
1826                    // poseidon2 PVM trace), where the carry is ~50% random
1827                    // and the branch was a pipeline drag.
1828                    let rb_reg = REG_MAP[*rb];
1829                    let ra_reg = REG_MAP[*ra];
1830                    self.asm.mov_ri64(SCRATCH, *imm);
1831                    self.asm.test_rr(rb_reg, rb_reg);
1832                    self.asm.cmovcc(Cc::E, ra_reg, SCRATCH);
1833                }
1834            }
1835            Opcode::CmovNzImm => {
1836                if let Args::TwoRegImm { ra, rb, imm } = args {
1837                    let rb_reg = REG_MAP[*rb];
1838                    let ra_reg = REG_MAP[*ra];
1839                    self.asm.mov_ri64(SCRATCH, *imm);
1840                    self.asm.test_rr(rb_reg, rb_reg);
1841                    self.asm.cmovcc(Cc::NE, ra_reg, SCRATCH);
1842                }
1843            }
1844            Opcode::RotR64Imm => {
1845                if let Args::TwoRegImm { ra, rb, imm } = args {
1846                    let rb_reg = REG_MAP[*rb];
1847                    if *ra != *rb {
1848                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1849                    }
1850                    self.asm.ror_ri64(REG_MAP[*ra], (*imm as u8) & 63);
1851                }
1852            }
1853            Opcode::RotR64ImmAlt => {
1854                if let Args::TwoRegImm { ra, rb, imm } = args {
1855                    // φ[ra] = imm ROR φ[rb]
1856                    let rb_reg = REG_MAP[*rb];
1857                    let shift_src = if *ra == *rb {
1858                        self.asm.mov_rr(SCRATCH, rb_reg);
1859                        SCRATCH
1860                    } else {
1861                        rb_reg
1862                    };
1863                    self.asm.mov_ri64(REG_MAP[*ra], *imm);
1864                    self.emit_shift_by_reg64(REG_MAP[*ra], shift_src, 1); // ROR
1865                }
1866            }
1867            Opcode::RotR32Imm => {
1868                if let Args::TwoRegImm { ra, rb, imm } = args {
1869                    let rb_reg = REG_MAP[*rb];
1870                    if *ra != *rb {
1871                        self.asm.mov_rr(REG_MAP[*ra], rb_reg);
1872                    }
1873                    self.asm.movzx_32_64(REG_MAP[*ra], REG_MAP[*ra]);
1874                    self.asm.ror_ri32(REG_MAP[*ra], (*imm as u8) & 31);
1875                    self.asm.movsxd(REG_MAP[*ra], REG_MAP[*ra]);
1876                }
1877            }
1878            Opcode::RotR32ImmAlt => {
1879                if let Args::TwoRegImm { ra, rb, imm } = args {
1880                    let rb_reg = REG_MAP[*rb];
1881                    let shift_src = if *ra == *rb {
1882                        self.asm.mov_rr(SCRATCH, rb_reg);
1883                        SCRATCH
1884                    } else {
1885                        rb_reg
1886                    };
1887                    self.asm.mov_ri64(REG_MAP[*ra], *imm);
1888                    self.asm.movzx_32_64(REG_MAP[*ra], REG_MAP[*ra]);
1889                    self.emit_shift_by_reg32(REG_MAP[*ra], shift_src, 1); // ROR
1890                    self.asm.movsxd(REG_MAP[*ra], REG_MAP[*ra]);
1891                }
1892            }
1893
1894            // === A.5.11: Two registers + one offset ===
1895            Opcode::BranchEq => {
1896                if let Args::TwoRegOffset { ra, rb, offset } = args {
1897                    // Both ra and rb are only read here. Every PVM register has its own
1898                    // x86 register in REG_MAP, so the operands are passed through directly.
1899                    let (ra_reg, rb_reg) = (REG_MAP[*ra], REG_MAP[*rb]);
1900                    self.emit_branch_reg(ra_reg, rb_reg, Cc::E, *offset as u32, next_pc, pc);
1901                }
1902            }
1903            Opcode::BranchNe => {
1904                if let Args::TwoRegOffset { ra, rb, offset } = args {
1905                    let (ra_reg, rb_reg) = (REG_MAP[*ra], REG_MAP[*rb]);
1906                    self.emit_branch_reg(ra_reg, rb_reg, Cc::NE, *offset as u32, next_pc, pc);
1907                }
1908            }
1909            Opcode::BranchLtU => {
1910                if let Args::TwoRegOffset { ra, rb, offset } = args {
1911                    let (ra_reg, rb_reg) = (REG_MAP[*ra], REG_MAP[*rb]);
1912                    self.emit_branch_reg(ra_reg, rb_reg, Cc::B, *offset as u32, next_pc, pc);
1913                }
1914            }
1915            Opcode::BranchLtS => {
1916                if let Args::TwoRegOffset { ra, rb, offset } = args {
1917                    let (ra_reg, rb_reg) = (REG_MAP[*ra], REG_MAP[*rb]);
1918                    self.emit_branch_reg(ra_reg, rb_reg, Cc::L, *offset as u32, next_pc, pc);
1919                }
1920            }
1921            Opcode::BranchGeU => {
1922                if let Args::TwoRegOffset { ra, rb, offset } = args {
1923                    let (ra_reg, rb_reg) = (REG_MAP[*ra], REG_MAP[*rb]);
1924                    self.emit_branch_reg(ra_reg, rb_reg, Cc::AE, *offset as u32, next_pc, pc);
1925                }
1926            }
1927            Opcode::BranchGeS => {
1928                if let Args::TwoRegOffset { ra, rb, offset } = args {
1929                    let (ra_reg, rb_reg) = (REG_MAP[*ra], REG_MAP[*rb]);
1930                    self.emit_branch_reg(ra_reg, rb_reg, Cc::GE, *offset as u32, next_pc, pc);
1931                }
1932            }
1933
1934            // === A.5.12: Two registers + two immediates ===
1935            Opcode::LoadImmJumpInd => {
1936                if let Args::TwoRegTwoImm {
1937                    ra,
1938                    rb,
1939                    imm_x,
1940                    imm_y,
1941                } = args
1942                {
1943                    // GP: registers[ra] = imm_x, addr = registers[rb] + imm_y
1944                    // Per GP semantics, ra is written first, then jump uses the
1945                    // (possibly updated) rb value.
1946                    // If ra==rb, the jump target uses imm_x + imm_y.
1947                    self.asm.mov_ri64(REG_MAP[*ra], *imm_x);
1948                    self.emit_dynamic_jump(*rb, *imm_y, pc);
1949                }
1950            }
1951
1952            // === A.5.13: Three registers ===
1953            Opcode::Add32 => {
1954                self.emit_alu3_32(args, |a, d, s| {
1955                    a.add_rr32(d, s);
1956                });
1957            }
1958            Opcode::Sub32 => {
1959                self.emit_alu3_32_sub(args);
1960            }
1961            Opcode::Mul32 => {
1962                if let Args::ThreeReg { ra, rb, rd } = args {
1963                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
1964                    if *rd == *rb && *rd != *ra {
1965                        self.asm.mov_rr(SCRATCH, b);
1966                        self.asm.mov_rr(d, a);
1967                        self.asm.imul_rr32(d, SCRATCH);
1968                    } else {
1969                        if *rd != *ra {
1970                            self.asm.mov_rr(d, a);
1971                        }
1972                        self.asm.imul_rr32(d, b);
1973                    }
1974                    self.asm.movsxd(d, d);
1975                }
1976            }
1977            Opcode::Add64 => {
1978                self.emit_alu3_64_comm(args, true, |a, d, s| {
1979                    a.add_rr(d, s);
1980                });
1981                // Track CF: after add64 D, A, B, CF = overflow(A+B).
1982                // A subsequent setLtU C, D, A (or D, B) can use CF directly.
1983                if let Args::ThreeReg { ra, rb, rd } = args {
1984                    self.last_add_cf = Some((*rd, *ra, *rb));
1985                }
1986                // reg_defs tracking handled by update_reg_defs() in main loop
1987            }
1988            Opcode::Sub64 => {
1989                if let Args::ThreeReg { ra, rb, rd } = args {
1990                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
1991                    if *rd == *rb && *rd != *ra {
1992                        // d = a - d: neg d; add d, a (6 bytes vs 9 bytes)
1993                        self.asm.neg64(d);
1994                        self.asm.add_rr(d, a);
1995                    } else {
1996                        if *rd != *ra {
1997                            self.asm.mov_rr(d, a);
1998                        }
1999                        self.asm.sub_rr(d, b);
2000                    }
2001                }
2002            }
2003            Opcode::Mul64 => {
2004                self.emit_alu3_64_comm(args, true, |a, d, s| {
2005                    a.imul_rr(d, s);
2006                });
2007            }
2008            Opcode::And => {
2009                self.emit_alu3_64_comm(args, true, |a, d, s| {
2010                    a.and_rr(d, s);
2011                });
2012            }
2013            Opcode::Or => {
2014                self.emit_alu3_64_comm(args, true, |a, d, s| {
2015                    a.or_rr(d, s);
2016                });
2017            }
2018            Opcode::Xor => {
2019                self.emit_alu3_64_comm(args, true, |a, d, s| {
2020                    a.xor_rr(d, s);
2021                });
2022            }
2023
2024            // Division (32-bit and 64-bit)
2025            Opcode::DivU32 => {
2026                self.emit_div(args, false, false, true);
2027            }
2028            Opcode::DivS32 => {
2029                self.emit_div(args, true, false, true);
2030            }
2031            Opcode::RemU32 => {
2032                self.emit_div(args, false, true, true);
2033            }
2034            Opcode::RemS32 => {
2035                self.emit_div(args, true, true, true);
2036            }
2037            Opcode::DivU64 => {
2038                self.emit_div(args, false, false, false);
2039            }
2040            Opcode::DivS64 => {
2041                self.emit_div(args, true, false, false);
2042            }
2043            Opcode::RemU64 => {
2044                self.emit_div(args, false, true, false);
2045            }
2046            Opcode::RemS64 => {
2047                self.emit_div(args, true, true, false);
2048            }
2049
2050            // Shifts (three-register)
2051            // Note: when rd == rb (and rd != ra), rb is saved to SCRATCH before `mov rd, ra` clobbers it.
2052            Opcode::ShloL32 => {
2053                if let Args::ThreeReg { ra, rb, rd } = args {
2054                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2055                    let shift_src = if *rd == *rb && *rd != *ra {
2056                        self.asm.mov_rr(SCRATCH, b);
2057                        SCRATCH
2058                    } else {
2059                        b
2060                    };
2061                    if *rd != *ra {
2062                        self.asm.mov_rr(d, a);
2063                    }
2064                    self.emit_shift_by_reg32(d, shift_src, 4);
2065                    self.asm.movsxd(d, d);
2066                }
2067            }
2068            Opcode::ShloR32 => {
2069                if let Args::ThreeReg { ra, rb, rd } = args {
2070                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2071                    let shift_src = if *rd == *rb && *rd != *ra {
2072                        self.asm.mov_rr(SCRATCH, b);
2073                        SCRATCH
2074                    } else {
2075                        b
2076                    };
2077                    if *rd != *ra {
2078                        self.asm.mov_rr(d, a);
2079                    }
2080                    self.asm.movzx_32_64(d, d);
2081                    self.emit_shift_by_reg32(d, shift_src, 5);
2082                    self.asm.movsxd(d, d);
2083                }
2084            }
2085            Opcode::SharR32 => {
2086                if let Args::ThreeReg { ra, rb, rd } = args {
2087                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2088                    let shift_src = if *rd == *rb && *rd != *ra {
2089                        self.asm.mov_rr(SCRATCH, b);
2090                        SCRATCH
2091                    } else {
2092                        b
2093                    };
2094                    if *rd != *ra {
2095                        self.asm.mov_rr(d, a);
2096                    }
2097                    self.emit_shift_by_reg32(d, shift_src, 7);
2098                    self.asm.movsxd(d, d);
2099                }
2100            }
2101            Opcode::ShloL64 => {
2102                if let Args::ThreeReg { ra, rb, rd } = args {
2103                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2104                    let shift_src = if *rd == *rb && *rd != *ra {
2105                        self.asm.mov_rr(SCRATCH, b);
2106                        SCRATCH
2107                    } else {
2108                        b
2109                    };
2110                    if *rd != *ra {
2111                        self.asm.mov_rr(d, a);
2112                    }
2113                    self.emit_shift_by_reg64(d, shift_src, 4);
2114                }
2115            }
2116            Opcode::ShloR64 => {
2117                if let Args::ThreeReg { ra, rb, rd } = args {
2118                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2119                    let shift_src = if *rd == *rb && *rd != *ra {
2120                        self.asm.mov_rr(SCRATCH, b);
2121                        SCRATCH
2122                    } else {
2123                        b
2124                    };
2125                    if *rd != *ra {
2126                        self.asm.mov_rr(d, a);
2127                    }
2128                    self.emit_shift_by_reg64(d, shift_src, 5);
2129                }
2130            }
2131            Opcode::SharR64 => {
2132                if let Args::ThreeReg { ra, rb, rd } = args {
2133                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2134                    let shift_src = if *rd == *rb && *rd != *ra {
2135                        self.asm.mov_rr(SCRATCH, b);
2136                        SCRATCH
2137                    } else {
2138                        b
2139                    };
2140                    if *rd != *ra {
2141                        self.asm.mov_rr(d, a);
2142                    }
2143                    self.emit_shift_by_reg64(d, shift_src, 7);
2144                }
2145            }
2146
2147            // Multiply upper
2148            Opcode::MulUpperSS => {
2149                self.emit_mul_upper(args, true, true);
2150            }
2151            Opcode::MulUpperUU => {
2152                self.emit_mul_upper(args, false, false);
2153            }
2154            Opcode::MulUpperSU => {
2155                self.emit_mul_upper(args, true, false);
2156            }
2157
2158            // Set comparisons (three-register)
2159            Opcode::SetLtU => {
2160                if let Args::ThreeReg { ra, rb, rd } = args {
2161                    // Carry flag fusion: if the previous instruction was add64 D, A, B,
2162                    // and this is setLtU where rd = (ra < rb), CF already holds the carry.
2163                    // Pattern: ra == D (the sum), rb == A or B (one of the addends).
2164                    // Result goes to rd (the carry register).
2165                    let fused = if let Some((add_d, add_a, add_b)) = self.last_add_cf {
2166                        // Carry flag fusion: ra must be the sum register (add_d),
2167                        // and rb must be an UNMODIFIED original addend (not add_d,
2168                        // which now holds the sum). If rb == add_d, both sides of
2169                        // the comparison would be the sum, giving 0 always, but CF
2170                        // might be 1.
2171                        if *ra == add_d
2172                            && *rb != add_d
2173                            && (*rb == add_a || *rb == add_b)
2174                            && *rd != *rb
2175                        {
2176                            let d = REG_MAP[*rd];
2177                            // CF is valid from the add — use setb directly (no cmp needed).
2178                            // Cannot use xor to clear upper bits (it would clobber CF).
2179                            // Instead: setb + movzx (2 insns vs xor+cmp+setb = 3 insns).
2180                            self.asm.setcc(Cc::B, d);
2181                            self.asm.movzx_8_64(d, d);
2182                            true
2183                        } else {
2184                            false
2185                        }
2186                    } else {
2187                        false
2188                    };
2189                    if !fused {
2190                        self.emit_setcc_3reg(*ra, *rb, *rd, Cc::B);
2191                    }
2192                }
2193            }
2194            Opcode::SetLtS => {
2195                if let Args::ThreeReg { ra, rb, rd } = args {
2196                    self.emit_setcc_3reg(*ra, *rb, *rd, Cc::L);
2197                }
2198            }
2199
2200            // Conditional moves
2201            Opcode::CmovIz => {
2202                if let Args::ThreeReg { ra, rb, rd } = args {
2203                    // if φ[rb] == 0 then φ[rd] = φ[ra]
2204                    self.asm.test_rr(REG_MAP[*rb], REG_MAP[*rb]);
2205                    self.asm.cmovcc(Cc::E, REG_MAP[*rd], REG_MAP[*ra]);
2206                }
2207            }
2208            Opcode::CmovNz => {
2209                if let Args::ThreeReg { ra, rb, rd } = args {
2210                    self.asm.test_rr(REG_MAP[*rb], REG_MAP[*rb]);
2211                    self.asm.cmovcc(Cc::NE, REG_MAP[*rd], REG_MAP[*ra]);
2212                }
2213            }
2214
2215            // Rotates (three-register)
2216            Opcode::RotL64 => {
2217                if let Args::ThreeReg { ra, rb, rd } = args {
2218                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2219                    let shift_src = if *rd == *rb && *rd != *ra {
2220                        self.asm.mov_rr(SCRATCH, b);
2221                        SCRATCH
2222                    } else {
2223                        b
2224                    };
2225                    if *rd != *ra {
2226                        self.asm.mov_rr(d, a);
2227                    }
2228                    self.emit_shift_by_reg64(d, shift_src, 0); // ROL
2229                }
2230            }
2231            Opcode::RotL32 => {
2232                if let Args::ThreeReg { ra, rb, rd } = args {
2233                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2234                    let shift_src = if *rd == *rb && *rd != *ra {
2235                        self.asm.mov_rr(SCRATCH, b);
2236                        SCRATCH
2237                    } else {
2238                        b
2239                    };
2240                    if *rd != *ra {
2241                        self.asm.mov_rr(d, a);
2242                    }
2243                    self.asm.movzx_32_64(d, d);
2244                    self.emit_shift_by_reg32(d, shift_src, 0);
2245                    self.asm.movsxd(d, d);
2246                }
2247            }
2248            Opcode::RotR64 => {
2249                if let Args::ThreeReg { ra, rb, rd } = args {
2250                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2251                    let shift_src = if *rd == *rb && *rd != *ra {
2252                        self.asm.mov_rr(SCRATCH, b);
2253                        SCRATCH
2254                    } else {
2255                        b
2256                    };
2257                    if *rd != *ra {
2258                        self.asm.mov_rr(d, a);
2259                    }
2260                    self.emit_shift_by_reg64(d, shift_src, 1); // ROR
2261                }
2262            }
2263            Opcode::RotR32 => {
2264                if let Args::ThreeReg { ra, rb, rd } = args {
2265                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2266                    let shift_src = if *rd == *rb && *rd != *ra {
2267                        self.asm.mov_rr(SCRATCH, b);
2268                        SCRATCH
2269                    } else {
2270                        b
2271                    };
2272                    if *rd != *ra {
2273                        self.asm.mov_rr(d, a);
2274                    }
2275                    self.asm.movzx_32_64(d, d);
2276                    self.emit_shift_by_reg32(d, shift_src, 1);
2277                    self.asm.movsxd(d, d);
2278                }
2279            }
2280
2281            // Logical with invert
2282            Opcode::AndInv => {
2283                if let Args::ThreeReg { ra, rb, rd } = args {
2284                    // rd = ra & ~rb
2285                    self.asm.mov_rr(SCRATCH, REG_MAP[*rb]);
2286                    self.asm.not64(SCRATCH);
2287                    self.asm.mov_rr(REG_MAP[*rd], REG_MAP[*ra]);
2288                    self.asm.and_rr(REG_MAP[*rd], SCRATCH);
2289                }
2290            }
2291            Opcode::OrInv => {
2292                if let Args::ThreeReg { ra, rb, rd } = args {
2293                    // rd = ra | ~rb
2294                    self.asm.mov_rr(SCRATCH, REG_MAP[*rb]);
2295                    self.asm.not64(SCRATCH);
2296                    self.asm.mov_rr(REG_MAP[*rd], REG_MAP[*ra]);
2297                    self.asm.or_rr(REG_MAP[*rd], SCRATCH);
2298                }
2299            }
2300            Opcode::Xnor => {
2301                if let Args::ThreeReg { ra, rb, rd } = args {
2302                    // rd = ~(ra ^ rb)
2303                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2304                    if *rd == *rb && *rd != *ra {
2305                        self.asm.mov_rr(SCRATCH, b);
2306                        self.asm.mov_rr(d, a);
2307                        self.asm.xor_rr(d, SCRATCH);
2308                    } else {
2309                        if *rd != *ra {
2310                            self.asm.mov_rr(d, a);
2311                        }
2312                        self.asm.xor_rr(d, b);
2313                    }
2314                    self.asm.not64(REG_MAP[*rd]);
2315                }
2316            }
2317
2318            // Min/Max
2319            Opcode::Max => {
2320                if let Args::ThreeReg { ra, rb, rd } = args {
2321                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2322                    self.asm.cmp_rr(a, b);
2323                    if *rd == *rb && *rd != *ra {
2324                        self.asm.mov_rr(SCRATCH, b);
2325                        self.asm.mov_rr(d, a);
2326                        self.asm.cmovcc(Cc::L, d, SCRATCH);
2327                    } else {
2328                        if *rd != *ra {
2329                            self.asm.mov_rr(d, a);
2330                        }
2331                        self.asm.cmovcc(Cc::L, d, b);
2332                    }
2333                }
2334            }
2335            Opcode::MaxU => {
2336                if let Args::ThreeReg { ra, rb, rd } = args {
2337                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2338                    self.asm.cmp_rr(a, b);
2339                    if *rd == *rb && *rd != *ra {
2340                        self.asm.mov_rr(SCRATCH, b);
2341                        self.asm.mov_rr(d, a);
2342                        self.asm.cmovcc(Cc::B, d, SCRATCH);
2343                    } else {
2344                        if *rd != *ra {
2345                            self.asm.mov_rr(d, a);
2346                        }
2347                        self.asm.cmovcc(Cc::B, d, b);
2348                    }
2349                }
2350            }
2351            Opcode::Min => {
2352                if let Args::ThreeReg { ra, rb, rd } = args {
2353                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2354                    self.asm.cmp_rr(a, b);
2355                    if *rd == *rb && *rd != *ra {
2356                        self.asm.mov_rr(SCRATCH, b);
2357                        self.asm.mov_rr(d, a);
2358                        self.asm.cmovcc(Cc::G, d, SCRATCH);
2359                    } else {
2360                        if *rd != *ra {
2361                            self.asm.mov_rr(d, a);
2362                        }
2363                        self.asm.cmovcc(Cc::G, d, b);
2364                    }
2365                }
2366            }
2367            Opcode::MinU => {
2368                if let Args::ThreeReg { ra, rb, rd } = args {
2369                    let (d, a, b) = (REG_MAP[*rd], REG_MAP[*ra], REG_MAP[*rb]);
2370                    self.asm.cmp_rr(a, b);
2371                    if *rd == *rb && *rd != *ra {
2372                        self.asm.mov_rr(SCRATCH, b);
2373                        self.asm.mov_rr(d, a);
2374                        self.asm.cmovcc(Cc::A, d, SCRATCH);
2375                    } else {
2376                        if *rd != *ra {
2377                            self.asm.mov_rr(d, a);
2378                        }
2379                        self.asm.cmovcc(Cc::A, d, b);
2380                    }
2381                }
2382            }
2383        }
2384    }
2385
2386    // === Helper emission methods ===
2387
2388    /// Emit a static branch (validated at compile time).
2389    fn emit_static_branch(&mut self, target: u32, condition: bool, _fallthrough: u32, pc: u32) {
2390        if !condition {
2391            return;
2392        }
2393        if !self.is_basic_block_start(target) {
2394            self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
2395            self.emit_exit(EXIT_PANIC, 0);
2396            return;
2397        }
2398        let label = self.label_for_pc(target);
2399        self.asm.jmp_label(label);
2400    }
2401
    /// Emit a dynamic jump (through the jump table), resolved inline.
    ///
    /// * `ra` — index of the PVM register holding the base address.
    /// * `imm` — immediate added to that register (only its low 32 bits are used).
    /// * `pc` — PVM program counter of this instruction, stored to `CTX_PC`
    ///   up front so every panic path below reports it.
    ///
    /// Emitted sequence:
    /// 1. `addr = (φ[ra] + imm) mod 2^32`, computed in SCRATCH.
    /// 2. Panic if `addr == 0` or `addr` is odd.
    /// 3. `idx = addr / 2 - 1`; panic unless `idx < jt_len`.
    /// 4. `target_pc = jt_ptr[idx]` (array of u32).
    /// 5. Panic unless `target_pc < bb_len` and `bb_starts[target_pc] == 1`.
    /// 6. Store `target_pc` to `CTX_PC` and jump to
    ///    `code_base + dispatch_table[target_pc]`.
    ///
    /// RAX (φ[11]) is borrowed as a second temporary from step 4 onwards; it
    /// is pushed before use and popped on both the dispatch and panic paths.
    fn emit_dynamic_jump(&mut self, ra: usize, imm: u64, pc: u32) {
        // Store PC for any exit path in the dynamic jump sequence
        self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
        // addr = (φ[ra] + imm) % 2^32
        self.asm.mov_rr(SCRATCH, REG_MAP[ra]);
        if imm as i32 != 0 {
            self.asm.add_ri(SCRATCH, imm as i32);
        }
        self.asm.movzx_32_64(SCRATCH, SCRATCH); // truncate to 32-bit

        // No halt address check — programs terminate via REPLY (ecalli 0xFF).

        // The jump-table lookup, target validation and dispatch are all
        // emitted inline below rather than exiting to the host.

        // Check alignment: addr must be even and non-zero
        // addr == 0 → panic
        self.asm.test_rr(SCRATCH, SCRATCH);
        self.asm.jcc_label(Cc::E, self.panic_label);

        // idx = addr/2 - 1 (also checks alignment: bit 0 goes to CF via SHR)
        self.asm.shr_ri64(SCRATCH, 1); // CF = bit 0 (alignment)
        self.asm.jcc_label(Cc::B, self.panic_label); // odd addr → panic (B = carry set)
        self.asm.sub_ri(SCRATCH, 1);

        // Inline djump resolution: idx is in SCRATCH (RDX).
        // Bounds check: idx < jt_len
        self.asm.cmp_mem32_rip_rel_r(CTX_JT_LEN, SCRATCH);
        self.asm.jcc_label(Cc::BE, self.panic_label); // jt_len <= idx → panic

        // target_pc = jt_ptr[idx] (u32 array, need idx*4)
        // From here on RAX is on the native stack, so failure paths must go
        // through `djump_panic` (which pops it) rather than `panic_label`.
        self.asm.push(Reg::RAX); // save φ[11]
        self.asm.shl_ri64(SCRATCH, 2); // idx *= 4
        self.asm.mov_load64_rip_rel(Reg::RAX, CTX_JT_PTR);
        self.asm.add_rr(Reg::RAX, SCRATCH);
        self.asm.mov_load32(SCRATCH, Reg::RAX, 0); // SCRATCH = jt_ptr[idx]

        // Validate: target_pc < bb_len && bb_starts[target_pc] == 1
        let djump_panic = self.asm.new_label();
        self.asm.cmp_mem32_rip_rel_r(CTX_BB_LEN, SCRATCH);
        self.asm.jcc_label(Cc::BE, djump_panic); // bb_len <= target → panic
        self.asm.mov_load64_rip_rel(Reg::RAX, CTX_BB_STARTS);
        self.asm.movzx_load8_sib(Reg::RAX, Reg::RAX, SCRATCH);
        self.asm.cmp_ri32(Reg::RAX, 1);
        self.asm.jcc_label(Cc::NE, djump_panic);

        // Dispatch: native_addr = code_base + dispatch_table[target_pc]
        self.asm.mov_load64_rip_rel(Reg::RAX, CTX_DISPATCH_TABLE);
        self.asm.movsxd_load_sib4(Reg::RAX, Reg::RAX, SCRATCH);
        self.asm.add_r64_mem_rip_rel(Reg::RAX, CTX_CODE_BASE);
        // Store target PC for gas block tracking
        self.asm.mov_store32_rip_rel(CTX_PC, SCRATCH);
        // RAX = native addr, [rsp] = saved φ[11].
        // Use SCRATCH (which we no longer need) to swap.
        self.asm.mov_rr(SCRATCH, Reg::RAX); // SCRATCH = native addr
        self.asm.pop(Reg::RAX); // restore φ[11]
        self.asm.jmp_reg(SCRATCH); // jump to native addr

        // Cold path: rebalance the native stack, then take the shared panic exit.
        self.asm.bind_label(djump_panic);
        self.asm.pop(Reg::RAX); // restore φ[11] before panicking
        self.asm.jmp_label(self.panic_label);
    }
2469
2470    /// Emit setcc for three-register comparisons: rd = (ra CMP rb) ? 1 : 0.
2471    /// When rd != ra and rd != rb, uses xor+cmp+setcc (eliminates movzx).
2472    fn emit_setcc_3reg(&mut self, ra: usize, rb: usize, rd: usize, cc: Cc) {
2473        let (a, b, d) = (REG_MAP[ra], REG_MAP[rb], REG_MAP[rd]);
2474        if rd != ra && rd != rb {
2475            // xor clears upper bits; setcc writes only the low byte.
2476            self.asm.mov_ri64(d, 0); // xor r32,r32 (via mov_ri64 zero optimization)
2477            self.asm.cmp_rr(a, b);
2478            self.asm.setcc(cc, d);
2479        } else {
2480            self.asm.cmp_rr(a, b);
2481            self.asm.setcc(cc, d);
2482            self.asm.movzx_8_64(d, d);
2483        }
2484    }
2485
2486    /// Emit setcc for immediate comparisons: ra = (rb CMP imm) ? 1 : 0.
2487    /// When ra != rb, uses xor+cmp+setcc (eliminates movzx).
2488    fn emit_setcc_imm(&mut self, ra: usize, rb: usize, imm: u64, cc: Cc) {
2489        let (a, b) = (REG_MAP[ra], REG_MAP[rb]);
2490        if ra != rb {
2491            self.asm.mov_ri64(a, 0); // xor r32,r32
2492            self.emit_cmp_imm(b, imm);
2493            self.asm.setcc(cc, a);
2494        } else {
2495            self.emit_cmp_imm(b, imm);
2496            self.asm.setcc(cc, a);
2497            self.asm.movzx_8_64(a, a);
2498        }
2499    }
2500
2501    /// Compare register against immediate, using cmp_ri for i32-range values.
2502    fn emit_cmp_imm(&mut self, reg: Reg, imm: u64) {
2503        let imm_i64 = imm as i64;
2504        if imm_i64 >= i32::MIN as i64 && imm_i64 <= i32::MAX as i64 {
2505            self.asm.cmp_ri(reg, imm_i64 as i32);
2506        } else {
2507            self.asm.mov_ri64(SCRATCH, imm);
2508            self.asm.cmp_rr(reg, SCRATCH);
2509        }
2510    }
2511
2512    /// Emit a branch comparing register against immediate.
2513    fn emit_branch_imm(
2514        &mut self,
2515        reg: Reg,
2516        imm: u64,
2517        cc: Cc,
2518        target: u32,
2519        _fallthrough: u32,
2520        pc: u32,
2521    ) {
2522        if !self.is_basic_block_start(target) {
2523            // Target not valid → store PC and panic if condition true (cold path)
2524            self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
2525            self.asm.mov_ri64(SCRATCH, imm);
2526            self.asm.cmp_rr(reg, SCRATCH);
2527            self.asm.jcc_label(cc, self.panic_label);
2528            return;
2529        }
2530        self.emit_cmp_imm(reg, imm);
2531        let label = self.label_for_pc(target);
2532        self.asm.jcc_label(cc, label);
2533    }
2534
2535    /// Emit a branch comparing two registers.
2536    fn emit_branch_reg(&mut self, a: Reg, b: Reg, cc: Cc, target: u32, _fallthrough: u32, pc: u32) {
2537        if !self.is_basic_block_start(target) {
2538            self.asm.mov_store32_rip_rel_imm(CTX_PC, pc as i32);
2539            self.asm.cmp_rr(a, b);
2540            self.asm.jcc_label(cc, self.panic_label);
2541            return;
2542        }
2543        self.asm.cmp_rr(a, b);
2544        let label = self.label_for_pc(target);
2545        self.asm.jcc_label(cc, label);
2546    }
2547
2548    /// Emit a shift by register value using CL.
2549    /// shift_op: 4=SHL, 5=SHR, 7=SAR, 0=ROL, 1=ROR
2550    fn emit_shift_by_reg32(&mut self, dst: Reg, shift_reg: Reg, shift_op: u8) {
2551        // Need shift amount in CL (RCX = φ[12])
2552        // If shift_reg is already RCX, great. Otherwise save/restore.
2553        if shift_reg == Reg::RCX {
2554            self.asm.shift_cl32(shift_op, dst);
2555        } else if dst == Reg::RCX {
2556            // dst is CL — need to swap
2557            self.asm.push(shift_reg);
2558            self.asm.mov_rr(Reg::RCX, shift_reg);
2559            // But we also need dst's value which was in RCX
2560            // We pushed shift_reg, not dst. Let me handle this differently.
2561            // Move dst to SCRATCH, put shift in CL, shift SCRATCH, move back.
2562            self.asm.pop(shift_reg); // undo push
2563            self.asm.mov_rr(SCRATCH, dst);
2564            self.asm.push(Reg::RCX);
2565            self.asm.mov_rr(Reg::RCX, shift_reg);
2566            self.asm.shift_cl32(shift_op, SCRATCH);
2567            self.asm.pop(Reg::RCX);
2568            self.asm.mov_rr(dst, SCRATCH);
2569        } else {
2570            self.asm.push(Reg::RCX);
2571            self.asm.mov_rr(Reg::RCX, shift_reg);
2572            self.asm.shift_cl32(shift_op, dst);
2573            self.asm.pop(Reg::RCX);
2574        }
2575    }
2576
2577    fn emit_shift_by_reg64(&mut self, dst: Reg, shift_reg: Reg, shift_op: u8) {
2578        if shift_reg == Reg::RCX {
2579            self.asm.shift_cl64(shift_op, dst);
2580        } else if dst == Reg::RCX {
2581            self.asm.mov_rr(SCRATCH, dst);
2582            self.asm.push(Reg::RCX);
2583            self.asm.mov_rr(Reg::RCX, shift_reg);
2584            self.asm.shift_cl64(shift_op, SCRATCH);
2585            self.asm.pop(Reg::RCX);
2586            self.asm.mov_rr(dst, SCRATCH);
2587        } else {
2588            self.asm.push(Reg::RCX);
2589            self.asm.mov_rr(Reg::RCX, shift_reg);
2590            self.asm.shift_cl64(shift_op, dst);
2591            self.asm.pop(Reg::RCX);
2592        }
2593    }
2594
2595    /// Three-register 64-bit ALU: rd = ra OP rb
2596    #[allow(dead_code)]
2597    fn emit_alu3_64(&mut self, args: &Args, op: impl FnOnce(&mut Assembler, Reg, Reg)) {
2598        self.emit_alu3_64_comm(args, false, op);
2599    }
2600
2601    /// Three-register 64-bit ALU with optional commutativity optimization.
2602    /// When `commutative` is true and rd == rb, emit `op(d, a)` directly
2603    /// instead of saving/restoring via SCRATCH.
2604    fn emit_alu3_64_comm(
2605        &mut self,
2606        args: &Args,
2607        commutative: bool,
2608        op: impl FnOnce(&mut Assembler, Reg, Reg),
2609    ) {
2610        if let Args::ThreeReg { ra, rb, rd } = args {
2611            let d = REG_MAP[*rd];
2612            let a = REG_MAP[*ra];
2613            let b = REG_MAP[*rb];
2614            if *rd == *ra {
2615                op(&mut self.asm, d, b);
2616            } else if *rd == *rb && commutative {
2617                // Commutative: rd = rb OP ra = ra OP rb — just op(d, a)
2618                op(&mut self.asm, d, a);
2619            } else if *rd == *rb {
2620                self.asm.mov_rr(SCRATCH, b);
2621                self.asm.mov_rr(d, a);
2622                op(&mut self.asm, d, SCRATCH);
2623            } else {
2624                self.asm.mov_rr(d, a);
2625                op(&mut self.asm, d, b);
2626            }
2627        }
2628    }
2629
2630    /// Three-register 32-bit ALU with sign extension: rd = sx32(ra OP rb)
2631    fn emit_alu3_32(&mut self, args: &Args, op: impl FnOnce(&mut Assembler, Reg, Reg)) {
2632        if let Args::ThreeReg { ra, rb, rd } = args {
2633            let d = REG_MAP[*rd];
2634            let a = REG_MAP[*ra];
2635            let b = REG_MAP[*rb];
2636            if *rd == *ra {
2637                op(&mut self.asm, d, b);
2638            } else if *rd == *rb {
2639                self.asm.mov_rr(SCRATCH, b);
2640                self.asm.mov_rr(d, a);
2641                op(&mut self.asm, d, SCRATCH);
2642            } else {
2643                self.asm.mov_rr(d, a);
2644                op(&mut self.asm, d, b);
2645            }
2646            self.asm.movsxd(d, d);
2647        }
2648    }
2649
2650    fn emit_alu3_32_sub(&mut self, args: &Args) {
2651        if let Args::ThreeReg { ra, rb, rd } = args {
2652            let d = REG_MAP[*rd];
2653            let a = REG_MAP[*ra];
2654            let b = REG_MAP[*rb];
2655            if *rd == *ra {
2656                self.asm.sub_rr32(d, b);
2657            } else if *rd == *rb {
2658                // d = a - d: neg32 d; add32 d, a (6 bytes vs 9 bytes)
2659                self.asm.neg32(d);
2660                self.asm.add_rr32(d, a);
2661            } else {
2662                self.asm.mov_rr(d, a);
2663                self.asm.sub_rr32(d, b);
2664            }
2665            self.asm.movsxd(d, d);
2666        }
2667    }
2668
2669    /// Division/remainder.
2670    ///
2671    /// x86 DIV/IDIV: dividend in RDX:RAX, divisor in any GPR except RAX/RDX.
2672    /// Quotient → RAX, remainder → RDX. Only RAX and RDX are clobbered.
2673    ///
2674    /// Key insight: RDX = SCRATCH (not mapped to any PVM register), so it never
2675    /// needs saving/restoring. When b_reg != RAX (~92% of cases), we use b_reg
2676    /// directly as the divisor — DIV/IDIV does not clobber the operand register,
2677    /// so no save of RCX (φ[12]) is needed either. Only RAX (φ[11]) must be
2678    /// preserved (unless d_reg == RAX).
2679    fn emit_div(&mut self, args: &Args, signed: bool, remainder: bool, is_32bit: bool) {
2680        if let Args::ThreeReg { ra, rb, rd } = args {
2681            let a_reg = REG_MAP[*ra];
2682            let b_reg = REG_MAP[*rb];
2683            let d_reg = REG_MAP[*rd];
2684
2685            // Check divisor == 0
2686            self.asm.test_rr(b_reg, b_reg);
2687            let nonzero = self.asm.new_label();
2688            let done = self.asm.new_label();
2689            self.asm.jcc_label(Cc::NE, nonzero);
2690
2691            // Division by zero: quotient = 2^64-1, remainder = dividend
2692            if remainder {
2693                self.asm.mov_rr(d_reg, a_reg);
2694            } else {
2695                self.asm.mov_ri64(d_reg, u64::MAX);
2696                if is_32bit {
2697                    self.asm.movsxd(d_reg, d_reg);
2698                }
2699            }
2700            self.asm.jmp_label(done);
2701
2702            self.asm.bind_label(nonzero);
2703
2704            if b_reg != Reg::RAX {
2705                // Fast path: use b_reg directly as divisor (no extra register needed).
2706                // Only save RAX (φ[11]) if the result doesn't go there.
2707                self.emit_div_fast(a_reg, b_reg, d_reg, signed, remainder, is_32bit);
2708            } else {
2709                // b_reg == RAX: divisor is in RAX, but we need RAX for the dividend.
2710                // Move divisor to RCX; save both RAX (φ[11]) and RCX (φ[12]).
2711                self.emit_div_b_is_rax(a_reg, d_reg, signed, remainder, is_32bit);
2712            }
2713
2714            if is_32bit {
2715                self.asm.movsxd(d_reg, d_reg);
2716            }
2717
2718            self.asm.bind_label(done);
2719        }
2720    }
2721
2722    /// Division fast path: b_reg is not RAX, so we use it directly as the divisor.
2723    /// DIV/IDIV does not clobber the operand register, so only RAX needs saving.
2724    fn emit_div_fast(
2725        &mut self,
2726        a_reg: Reg,
2727        b_reg: Reg,
2728        d_reg: Reg,
2729        signed: bool,
2730        remainder: bool,
2731        is_32bit: bool,
2732    ) {
2733        let save_rax = d_reg != Reg::RAX;
2734
2735        if save_rax {
2736            self.asm.push(Reg::RAX);
2737        }
2738
2739        // Load dividend into RAX (push doesn't modify RAX, so a_reg==RAX is fine).
2740        if a_reg != Reg::RAX {
2741            self.asm.mov_rr(Reg::RAX, a_reg);
2742        }
2743
2744        // Set up RDX and divide.
2745        self.emit_div_setup_and_exec(signed, is_32bit, b_reg);
2746
2747        if save_rax {
2748            // d_reg != RAX: move result, then restore φ[11].
2749            let result_reg = if remainder { SCRATCH } else { Reg::RAX };
2750            self.asm.mov_rr(d_reg, result_reg);
2751            self.asm.pop(Reg::RAX);
2752        } else {
2753            // d_reg == RAX: quotient is already there; for remainder, move RDX → RAX.
2754            if remainder {
2755                self.asm.mov_rr(Reg::RAX, SCRATCH);
2756            }
2757        }
2758    }
2759
2760    /// Division slow path: b_reg == RAX (divisor is φ[11]).
2761    /// We must move the divisor to RCX before loading the dividend into RAX.
2762    fn emit_div_b_is_rax(
2763        &mut self,
2764        a_reg: Reg,
2765        d_reg: Reg,
2766        signed: bool,
2767        remainder: bool,
2768        is_32bit: bool,
2769    ) {
2770        // Always save RAX and RCX so we can restore both PVM registers.
2771        self.asm.push(Reg::RAX); // save φ[11]
2772        self.asm.push(Reg::RCX); // save φ[12]
2773        // Stack: [RSP+0]=old_RCX, [RSP+8]=old_RAX
2774
2775        // Move divisor (currently in RAX) to RCX.
2776        // (push doesn't modify RAX, so it still holds the divisor.)
2777        self.asm.mov_rr(Reg::RCX, Reg::RAX);
2778
2779        // Load dividend into RAX.
2780        if a_reg == Reg::RAX {
2781            // Dividend is also φ[11] — RAX still holds it (mov_rr above
2782            // copied RAX→RCX but didn't change RAX). Nothing to do.
2783        } else if a_reg == Reg::RCX {
2784            // We just overwrote RCX with the divisor; load original φ[12] from stack.
2785            self.asm.mov_load64(Reg::RAX, Reg::RSP, 0); // old_RCX
2786        } else {
2787            self.asm.mov_rr(Reg::RAX, a_reg);
2788        }
2789
2790        // Set up RDX and divide.
2791        self.emit_div_setup_and_exec(signed, is_32bit, Reg::RCX);
2792
2793        // Place result and restore saved registers.
2794        let result_reg = if remainder { SCRATCH } else { Reg::RAX };
2795
2796        if d_reg == Reg::RAX {
2797            if remainder {
2798                self.asm.mov_rr(Reg::RAX, SCRATCH);
2799            }
2800            self.asm.pop(Reg::RCX); // restore φ[12]
2801            self.asm.pop(SCRATCH); // discard saved RAX (d_reg overwrites φ[11])
2802        } else if d_reg == Reg::RCX {
2803            self.asm.mov_rr(Reg::RCX, result_reg);
2804            self.asm.pop(SCRATCH); // discard saved RCX (d_reg overwrites φ[12])
2805            self.asm.pop(Reg::RAX); // restore φ[11]
2806        } else {
2807            self.asm.mov_rr(d_reg, result_reg);
2808            self.asm.pop(Reg::RCX); // restore φ[12]
2809            self.asm.pop(Reg::RAX); // restore φ[11]
2810        }
2811    }
2812
2813    /// Emit RDX setup (sign-extend or zero) and the DIV/IDIV instruction.
2814    fn emit_div_setup_and_exec(&mut self, signed: bool, is_32bit: bool, divisor: Reg) {
2815        if is_32bit {
2816            if signed {
2817                self.asm.movsxd(Reg::RAX, Reg::RAX);
2818                self.asm.cdq();
2819                self.asm.idiv32(divisor);
2820            } else {
2821                self.asm.movzx_32_64(Reg::RAX, Reg::RAX);
2822                self.asm.mov_ri64(SCRATCH, 0);
2823                self.asm.div32(divisor);
2824            }
2825        } else if signed {
2826            self.asm.cqo();
2827            self.asm.idiv64(divisor);
2828        } else {
2829            self.asm.mov_ri64(SCRATCH, 0);
2830            self.asm.div64(divisor);
2831        }
2832    }
2833
2834    /// Multiply upper (128-bit product, take high 64 bits).
2835    ///
2836    /// MUL/IMUL uses RAX (φ[11]) and RDX (SCRATCH) implicitly.
2837    /// RDX = SCRATCH is not a PVM register, so only RAX needs saving.
2838    fn emit_mul_upper(&mut self, args: &Args, a_signed: bool, b_signed: bool) {
2839        if let Args::ThreeReg { ra, rb, rd } = args {
2840            let d_reg = REG_MAP[*rd];
2841            let rb_is_rax = REG_MAP[*rb] == Reg::RAX;
2842            // We need to preserve φ[11] (RAX) unless d_reg is RAX AND rb != RAX
2843            // (if rb == RAX, we always push so we can recover the original value).
2844            let save_rax = d_reg != Reg::RAX || rb_is_rax;
2845
2846            if save_rax {
2847                self.asm.push(Reg::RAX); // save φ[11]
2848            }
2849
2850            // Load ra into RAX (push doesn't modify RAX).
2851            if REG_MAP[*ra] != Reg::RAX {
2852                self.asm.mov_rr(Reg::RAX, REG_MAP[*ra]);
2853            }
2854
2855            // Determine mul_src: the register holding rb's value.
2856            let mul_src = if rb_is_rax {
2857                // rb is φ[11] = RAX; original value is on stack.
2858                self.asm.mov_load64(SCRATCH, Reg::RSP, 0);
2859                SCRATCH
2860            } else {
2861                REG_MAP[*rb]
2862            };
2863
2864            if a_signed && b_signed {
2865                self.asm.imul_rdx_rax(mul_src);
2866            } else if !a_signed && !b_signed {
2867                self.asm.mul_rdx_rax(mul_src);
2868            } else {
2869                // MulUpperSU: ra is signed, rb is unsigned
2870                // result_hi = unsigned_mul_hi(ra, rb) - (ra < 0 ? rb : 0)
2871                self.asm.push(mul_src); // save rb
2872                self.asm.push(Reg::RAX); // save ra (for sign check)
2873                if rb_is_rax {
2874                    // mul_src was SCRATCH (loaded from stack); reload after pushes.
2875                    // orig_RAX is now at [RSP + 16] (ra push + rb push above it).
2876                    self.asm.mov_load64(SCRATCH, Reg::RSP, 16);
2877                    self.asm.mul_rdx_rax(SCRATCH);
2878                } else {
2879                    self.asm.mul_rdx_rax(mul_src);
2880                }
2881                // RDX = high bits. Check if original ra was negative.
2882                self.asm.pop(Reg::RAX); // pop saved ra
2883                let skip = self.asm.new_label();
2884                self.asm.test_rr(Reg::RAX, Reg::RAX);
2885                self.asm.jcc_label(Cc::NS, skip);
2886                // ra was negative: subtract rb from high word (RDX)
2887                self.asm.pop(Reg::RAX); // pop saved rb
2888                self.asm.sub_rr(SCRATCH, Reg::RAX);
2889                let done = self.asm.new_label();
2890                self.asm.jmp_label(done);
2891                self.asm.bind_label(skip);
2892                self.asm.add_ri(Reg::RSP, 8); // discard saved rb
2893                self.asm.bind_label(done);
2894            }
2895
2896            // High 64 bits are in RDX (SCRATCH).
2897            if save_rax {
2898                if d_reg == Reg::RAX {
2899                    // rb_is_rax case: we saved RAX for rb recovery but d_reg is also RAX.
2900                    // Discard the saved value and put result in RAX.
2901                    self.asm.add_ri(Reg::RSP, 8);
2902                    self.asm.mov_rr(Reg::RAX, SCRATCH);
2903                } else {
2904                    self.asm.mov_rr(d_reg, SCRATCH);
2905                    self.asm.pop(Reg::RAX); // restore φ[11]
2906                }
2907            } else {
2908                // d_reg == RAX and !rb_is_rax → didn't save RAX.
2909                self.asm.mov_rr(Reg::RAX, SCRATCH);
2910            }
2911        }
2912    }
2913
2914    /// Emit an exit sequence that sets exit_reason and exit_arg.
2915    fn emit_exit(&mut self, reason: u32, arg: u32) {
2916        self.asm
2917            .mov_store32_rip_rel_imm(CTX_EXIT_REASON, reason as i32);
2918        self.asm.mov_store32_rip_rel_imm(CTX_EXIT_ARG, arg as i32);
2919        self.asm.jmp_label(self.exit_label);
2920    }
2921
2922    /// Emit prologue: save callee-saved, load PVM registers from context,
2923    /// then dispatch to the correct basic block based on entry_pc.
2924    fn emit_prologue(&mut self) {
2925        self.asm.ensure_capacity(512); // prologue needs ~200 bytes
2926        // Save callee-saved registers
2927        self.asm.push(Reg::RBX);
2928        self.asm.push(Reg::RBP);
2929        self.asm.push(Reg::R12);
2930        self.asm.push(Reg::R13);
2931        self.asm.push(Reg::R14);
2932        self.asm.push(Reg::R15);
2933
2934        // Stack alignment: after 6 callee-saved pushes + return address (7 * 8 = 56),
2935        // RSP mod 16 = 8. With save_caller_saved (8 pushes = 64 bytes), total
2936        // displacement = 56 + 64 = 120, RSP mod 16 = 8. Push extra 8 bytes for
2937        // alignment so that save_caller_saved leaves RSP mod 16 = 0 for CALL.
2938        self.asm.push(SCRATCH); // alignment padding
2939
2940        // R15 = gas register. Loaded from ctx.gas at prologue, decremented
2941        // per basic block, flushed back to ctx.gas at exit. Mem accesses
2942        // are baseless `[rdx]` (PVM addr == native VA); CTX is reached via
2943        // absolute SIB. Neither path reads R15.
2944        self.asm.mov_load64_rip_rel(GAS, CTX_GAS);
2945
2946        // Clear exit reason
2947        self.asm.mov_store32_rip_rel_imm(CTX_EXIT_REASON, 0);
2948
2949        // --- O(1) dispatch via table lookup (before loading PVM regs) ---
2950        self.asm.mov_load32_rip_rel(SCRATCH, CTX_ENTRY_PC);
2951        self.asm.mov_load64_rip_rel(Reg::RAX, CTX_DISPATCH_TABLE);
2952        self.asm.movsxd_load_sib4(Reg::RAX, Reg::RAX, SCRATCH);
2953        self.asm.mov_load64_rip_rel(SCRATCH, CTX_CODE_BASE);
2954        self.asm.add_rr(Reg::RAX, SCRATCH);
2955        self.asm.push(Reg::RAX);
2956
2957        // Load all 13 PVM registers from context
2958        for (i, &reg) in REG_MAP.iter().enumerate() {
2959            self.asm.mov_load64_rip_rel(reg, CTX_REGS + (i as u64) * 8);
2960        }
2961
2962        // Jump to the dispatch target (pop into SCRATCH, then indirect jump)
2963        self.asm.pop(SCRATCH);
2964        self.asm.jmp_reg(SCRATCH);
2965    }
2966
2967    /// Emit exit sequences and epilogue.
2968    fn emit_exit_sequences(&mut self) {
2969        // Reserve capacity for exit sequences + all OOG stubs.
2970        // Each OOG stub is ~12 bytes.
2971        let needed = 512 + self.oog_stubs.len() * 16;
2972        self.asm.ensure_capacity(needed);
2973        // Shared OOG handler that reads PC from SCRATCH — emitted BEFORE OOG
2974        // stubs so backward jumps from stubs can use jmp rel8 (2 bytes).
2975        self.asm.bind_label(self.oog_pc_label);
2976        self.asm.mov_store32_rip_rel(CTX_PC, SCRATCH);
2977        // fall through to oog_label:
2978        self.asm.bind_label(self.oog_label);
2979        self.asm
2980            .mov_store32_rip_rel_imm(CTX_EXIT_REASON, EXIT_OOG as i32);
2981        self.asm.jmp_label(self.exit_label);
2982
2983        // Per-gas-block OOG stubs: compact format — load PC into SCRATCH,
2984        // jump to shared handler. Saves ~6 bytes per stub vs inline PC store.
2985        let stubs = core::mem::take(&mut self.oog_stubs);
2986        for (label, pvm_pc, _cost) in &stubs {
2987            self.asm.bind_label(*label);
2988            self.asm.mov_ri32(SCRATCH, *pvm_pc);
2989            self.asm.jmp_label(self.oog_pc_label);
2990        }
2991
2992        // Page faults are handled by the SIGSEGV handler (signal.rs).
2993
2994        // Panic exit
2995        self.asm.bind_label(self.panic_label);
2996        self.asm
2997            .mov_store32_rip_rel_imm(CTX_EXIT_REASON, EXIT_PANIC as i32);
2998        // fall through to exit_label
2999
3000        // Common exit: flush gas (R15) → ctx.gas, then save PVM regs.
3001        self.asm.bind_label(self.exit_label);
3002        self.asm.mov_store64_rip_rel(CTX_GAS, GAS);
3003        for (i, &reg) in REG_MAP.iter().enumerate() {
3004            self.asm.mov_store64_rip_rel(CTX_REGS + (i as u64) * 8, reg);
3005        }
3006
3007        // Restore callee-saved (+ alignment padding)
3008        self.asm.pop(SCRATCH); // alignment padding
3009        self.asm.pop(Reg::R15);
3010        self.asm.pop(Reg::R14);
3011        self.asm.pop(Reg::R13);
3012        self.asm.pop(Reg::R12);
3013        self.asm.pop(Reg::RBP);
3014        self.asm.pop(Reg::RBX);
3015        self.asm.ret();
3016    }
3017
3018    /// Get the memory read helper for a load opcode.
3019    fn read_fn_for(&self, opcode: Opcode) -> u64 {
3020        match opcode {
3021            Opcode::LoadU8 | Opcode::LoadI8 | Opcode::LoadIndU8 | Opcode::LoadIndI8 => {
3022                self.helpers.mem_read_u8
3023            }
3024            Opcode::LoadU16 | Opcode::LoadI16 | Opcode::LoadIndU16 | Opcode::LoadIndI16 => {
3025                self.helpers.mem_read_u16
3026            }
3027            Opcode::LoadU32 | Opcode::LoadI32 | Opcode::LoadIndU32 | Opcode::LoadIndI32 => {
3028                self.helpers.mem_read_u32
3029            }
3030            Opcode::LoadU64 | Opcode::LoadIndU64 => self.helpers.mem_read_u64,
3031            _ => self.helpers.mem_read_u8,
3032        }
3033    }
3034
3035    /// Get the memory write helper for a store opcode.
3036    fn write_fn_for(&self, opcode: Opcode) -> u64 {
3037        match opcode {
3038            Opcode::StoreU8 | Opcode::StoreIndU8 => self.helpers.mem_write_u8,
3039            Opcode::StoreU16 | Opcode::StoreIndU16 => self.helpers.mem_write_u16,
3040            Opcode::StoreU32 | Opcode::StoreIndU32 => self.helpers.mem_write_u32,
3041            Opcode::StoreU64 | Opcode::StoreIndU64 => self.helpers.mem_write_u64,
3042            _ => self.helpers.mem_write_u8,
3043        }
3044    }
3045}