javm_recompiler_x86/
asm.rs

1//! x86-64 assembler for PVM recompiler.
2//!
3//! Emits native x86-64 machine code with label-based jump resolution.
4//! All jumps use 32-bit relative offsets (no short-jump optimization).
5//!
6//! # Safety model
7//!
8//! The assembler writes to a raw `*mut u8` buffer (`self.buf`) for performance.
9//! The key invariant: `self.buf` points to a valid allocation of at least
10//! `self.capacity` bytes (the `code_buf` Vec's backing store).
11//! All emission functions have `debug_assert!(self.write_pos + N <= self.capacity)`
12//! guards. Callers must ensure capacity via `ensure_capacity()` before emitting.
13//! Vec length is synced via `set_len(write_pos)` only at finalization boundaries.
14
15use alloc::vec;
16use alloc::vec::Vec;
17
/// Instruction buffer: accumulates the bytes of one x86 instruction in a
/// `u128` (first byte in the least-significant bits) so the assembler can
/// flush them with a single bulk write instead of per-byte memory stores.
///
/// The buffer holds at most 16 bytes. Pushing beyond that would silently
/// drop bits, so every push is guarded by a `debug_assert!`.
#[derive(Clone, Copy)]
struct InstBuf {
    /// Accumulated bytes; byte `i` occupies bits `8*i .. 8*i + 8`.
    out: u128,
    /// Number of bits currently used in `out` (always a multiple of 8).
    length: u32,
}

impl InstBuf {
    /// Total number of bits `out` can hold.
    const CAPACITY_BITS: u32 = 128;

    /// Create an empty buffer.
    #[inline(always)]
    fn new() -> Self {
        Self { out: 0, length: 0 }
    }

    /// Append the low `bits` bits of `value` after the bytes pushed so far.
    #[inline(always)]
    fn append(&mut self, value: u128, bits: u32) {
        // Without this guard a push that straddles bit 128 would lose its
        // high bits without any diagnostic.
        debug_assert!(
            self.length + bits <= Self::CAPACITY_BITS,
            "InstBuf overflow: {} bits used, pushing {} more",
            self.length,
            bits
        );
        self.out |= value << self.length;
        self.length += bits;
    }

    /// Append a single byte.
    #[inline(always)]
    fn push(&mut self, byte: u8) {
        self.append(byte as u128, 8);
    }

    /// Append a 32-bit value, least-significant byte first.
    #[inline(always)]
    fn push_u32(&mut self, v: u32) {
        self.append(v as u128, 32);
    }

    /// Append a 64-bit value, least-significant byte first.
    #[inline(always)]
    fn push_u64(&mut self, v: u64) {
        self.append(v as u128, 64);
    }

    /// Append a signed 32-bit value as its two's-complement bit pattern.
    #[inline(always)]
    fn push_i32(&mut self, v: i32) {
        self.push_u32(v as u32);
    }

    /// Number of bytes pushed so far.
    #[inline(always)]
    fn len(&self) -> usize {
        (self.length >> 3) as usize
    }
}
60
/// The sixteen x86-64 general-purpose registers; each discriminant is the
/// register's 4-bit hardware number.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum Reg {
    RAX = 0,
    RCX = 1,
    RDX = 2,
    RBX = 3,
    RSP = 4,
    RBP = 5,
    RSI = 6,
    RDI = 7,
    R8 = 8,
    R9 = 9,
    R10 = 10,
    R11 = 11,
    R12 = 12,
    R13 = 13,
    R14 = 14,
    R15 = 15,
}

impl Reg {
    /// Bottom three bits of the register number, as placed in a ModR/M or
    /// SIB field.
    fn lo(self) -> u8 {
        let number = self as u8;
        number % 8
    }
    /// Fourth bit of the register number (0 or 1), as placed in a REX
    /// extension bit.
    fn hi(self) -> u8 {
        let number = self as u8;
        number / 8
    }
    /// True for R8..=R15, which cannot be encoded without a REX prefix.
    fn needs_rex(self) -> bool {
        self.hi() != 0
    }
}
97
/// x86 condition codes shared by Jcc / SETcc / CMOVcc. The discriminant is
/// the 4-bit condition field of the opcode.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum Cc {
    /// Overflow.
    O = 0x0,
    /// No overflow.
    NO = 0x1,
    /// Below (unsigned <).
    B = 0x2,
    /// Above or equal (unsigned >=).
    AE = 0x3,
    /// Equal.
    E = 0x4,
    /// Not equal.
    NE = 0x5,
    /// Below or equal (unsigned <=).
    BE = 0x6,
    /// Above (unsigned >).
    A = 0x7,
    /// Sign.
    S = 0x8,
    /// No sign.
    NS = 0x9,
    /// Parity.
    P = 0xA,
    /// No parity.
    NP = 0xB,
    /// Less (signed <).
    L = 0xC,
    /// Greater or equal (signed >=).
    GE = 0xD,
    /// Less or equal (signed <=).
    LE = 0xE,
    /// Greater (signed >).
    G = 0xF,
}
119
/// Label identifier: an index into the assembler's label table.
///
/// Labels are handed out by `Assembler::new_label` (or reserved in bulk via
/// `Assembler::bulk_create_labels`) and receive a code offset when passed to
/// `Assembler::bind_label`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct Label(pub u32);
123
/// A pending forward reference: a rel32 placeholder in the code buffer that
/// still has to be patched with the distance to `label`. Recorded when a
/// label is referenced before it has been bound.
#[derive(Clone, Copy)]
struct Fixup {
    /// Offset in code buffer where the 4-byte rel32 placeholder is.
    offset: usize,
    /// The label this fixup targets.
    label: Label,
}
132
/// x86-64 assembler with label support.
///
/// Uses direct pointer writes to the pre-allocated buffer for emission,
/// avoiding per-byte Vec::push overhead (capacity check + len update).
///
/// Invariant relied on by every emitter: `buf` points at `code_buf`'s
/// backing store, which is at least `capacity` bytes long, and
/// `write_pos <= capacity`.
pub struct Assembler {
    /// Owns the allocation that `buf` points into. The Vec's own length is
    /// not kept in sync during emission; `write_pos` is the real length.
    code_buf: Vec<u8>,
    /// Raw pointer to the start of the code buffer.
    buf: *mut u8,
    /// Number of bytes emitted so far (offset of the next byte to write).
    write_pos: usize,
    /// Size in bytes of the allocation behind `buf`.
    capacity: usize,
    /// Label ID → bound offset+1 as u32 (0 = unbound). Uses u32 to halve
    /// memory vs usize (native code always fits in 4GB).
    /// Pre-sized via `vec![0u32; capacity]` which uses calloc (zero-page COW).
    labels: Vec<u32>,
    /// Number of labels allocated via new_label/bulk_create_labels.
    /// The Vec is pre-sized but labels_len tracks the logical length.
    labels_len: usize,
    /// Forward references waiting for their label to be bound.
    fixups: Vec<Fixup>,
    /// Eventual VA the code buffer will be loaded at. Used by RIP-relative
    /// emitters to compute `disp32 = target_va - (jit_va_base + RIP_after_inst)`.
    /// Zero means "relocatable / unknown" — RIP-relative emitters then
    /// produce offsets relative to code-buffer offset 0, useful for tests.
    jit_va_base: u64,
}
157
/// Unbound label sentinel. We use 0 so that bulk label allocation can use
/// zeroed memory (calloc / zero-page COW) instead of writing 0xFF to every byte.
/// Bound labels store `native_offset + 1` to avoid collision with the sentinel,
/// so readers must subtract 1 to recover the real offset.
const LABEL_UNBOUND: u32 = 0;
162
/// `Assembler::default()` is the same as [`Assembler::new`].
impl Default for Assembler {
    fn default() -> Self {
        Self::new()
    }
}
168
169impl Assembler {
170    pub fn new() -> Self {
171        let mut code = Vec::with_capacity(4096);
172        let buf = code.as_mut_ptr();
173        let capacity = code.capacity();
174        Self {
175            code_buf: code,
176            buf,
177            write_pos: 0,
178            capacity,
179            labels: Vec::new(),
180            labels_len: 0,
181            fixups: Vec::new(),
182            jit_va_base: 0,
183        }
184    }
185
186    /// Create with pre-allocated capacity for code and labels.
187    pub fn with_capacity(code_capacity: usize, label_capacity: usize) -> Self {
188        let mut code = Vec::with_capacity(code_capacity);
189        let buf = code.as_mut_ptr();
190        let capacity = code.capacity();
191        Self {
192            code_buf: code,
193            buf,
194            write_pos: 0,
195            capacity,
196            // vec![0; n] uses calloc — zero pages via COW, no page faults for untouched entries
197            labels: vec![0u32; label_capacity],
198            labels_len: 0,
199            fixups: Vec::with_capacity(2048),
200            jit_va_base: 0,
201        }
202    }
203
    /// Set the eventual load VA for the code buffer. Must be called before
    /// any RIP-relative emitter that targets a fixed VA (i.e. CTX accesses).
    ///
    /// The value is only stored here; displacements already emitted are not
    /// re-computed.
    pub fn set_jit_va_base(&mut self, va: u64) {
        self.jit_va_base = va;
    }
209
    /// Slow path of [`Self::ensure_capacity`]: reserve room for at least
    /// `additional` more bytes past `write_pos`, then refresh the cached
    /// `buf` pointer and `capacity` from the Vec.
    ///
    /// Most individual instructions need at most ~32 bytes, so this is
    /// rarely needed mid-compilation.
    #[cold]
    fn grow(&mut self, additional: usize) {
        let code = &mut self.code_buf;
        // SAFETY: write_pos <= code.capacity() is maintained by ensure_capacity().
        // set_len(write_pos) exposes the bytes we've written so reserve() copies them.
        unsafe {
            code.set_len(self.write_pos);
        }
        // reserve() is relative to the Vec's len, which is why len was
        // temporarily set to write_pos above.
        code.reserve(additional);
        // Re-read pointer and capacity after the reserve call.
        self.buf = code.as_mut_ptr();
        self.capacity = code.capacity();
        // SAFETY: Reset len to 0 — we track the actual length via write_pos,
        // not Vec::len. The data is preserved in the backing allocation.
        unsafe {
            code.set_len(0);
        }
    }
230
231    /// Check capacity and grow if needed. Inlined for the fast path (no grow).
232    #[inline(always)]
233    pub fn ensure_capacity(&mut self, n: usize) {
234        if self.write_pos + n > self.capacity {
235            self.grow(n);
236        }
237    }
238
239    /// Allocate a new label.
240    pub fn new_label(&mut self) -> Label {
241        let id = self.labels_len as u32;
242        self.labels_len += 1;
243        // Grow if needed (rare — labels Vec is pre-sized in with_capacity)
244        if self.labels_len > self.labels.len() {
245            self.labels.push(LABEL_UNBOUND);
246        }
247        Label(id)
248    }
249
    /// Current number of labels allocated (the logical length, which may be
    /// smaller than the pre-sized label table).
    pub fn labels_len(&self) -> usize {
        self.labels_len
    }
254
255    /// Bulk-allocate `count` unbound labels. The labels Vec is already pre-sized
256    /// via calloc (zero pages). This just advances the logical length counter.
257    pub fn bulk_create_labels(&mut self, count: usize) {
258        self.labels_len += count;
259        // Grow if pre-sized Vec wasn't large enough (shouldn't happen normally)
260        if self.labels_len > self.labels.len() {
261            self.labels.resize(self.labels_len, LABEL_UNBOUND);
262        }
263    }
264
265    /// Bind a label to the current write position.
266    pub fn bind_label(&mut self, label: Label) {
267        self.labels[label.0 as usize] = (self.write_pos + 1) as u32; // +1: 0 is LABEL_UNBOUND
268    }
269
    /// Current code offset (write position): the number of bytes emitted so
    /// far, which is also the offset the next emitted byte will get.
    pub fn offset(&self) -> usize {
        self.write_pos
    }
274
275    /// Patch an i32 value at a previously recorded offset.
276    pub fn patch_i32(&mut self, offset: usize, value: i32) {
277        debug_assert!(offset + 4 <= self.write_pos);
278        // SAFETY: offset + 4 <= write_pos <= capacity, so buf.add(offset) is in bounds.
279        unsafe {
280            core::ptr::copy_nonoverlapping(value.to_le_bytes().as_ptr(), self.buf.add(offset), 4);
281        }
282    }
283
284    // === Raw byte emission ===
285    // All emission writes directly to the buffer via raw pointer,
286    // bypassing Vec::push's capacity check and len update.
287    // SAFETY for all emit* functions: write_pos + N <= capacity is guarded by
288    // debug_assert. Callers ensure capacity via ensure_capacity() before sequences.
289
290    #[inline(always)]
291    fn emit(&mut self, b: u8) {
292        debug_assert!(self.write_pos < self.capacity);
293        // SAFETY: write_pos < capacity is asserted above; buf points to a valid
294        // allocation of at least `capacity` bytes (Vec backing store).
295        unsafe {
296            *self.buf.add(self.write_pos) = b;
297        }
298        self.write_pos += 1;
299    }
300
301    /// Emit 3 bytes at once.
302    #[inline(always)]
303    fn emit3(&mut self, a: u8, b: u8, c: u8) {
304        debug_assert!(self.write_pos + 3 <= self.capacity);
305        // SAFETY: write_pos + 3 <= capacity is asserted above; buf is a valid
306        // allocation. Individual byte writes are in-bounds.
307        unsafe {
308            let p = self.buf.add(self.write_pos);
309            *p = a;
310            *p.add(1) = b;
311            *p.add(2) = c;
312        }
313        self.write_pos += 3;
314    }
315
316    /// Flush an InstBuf to the code buffer in one bulk write.
317    #[inline(always)]
318    fn flush_instbuf(&mut self, ib: InstBuf) {
319        let len = ib.len();
320        debug_assert!(self.write_pos + len <= self.capacity);
321        // SAFETY: write_pos + len <= capacity is asserted above. We always
322        // write both 8-byte halves (16 bytes total) even if len < 16 — this
323        // is safe because ensure_capacity(512) guarantees ample slack beyond
324        // the actual instruction bytes.
325        unsafe {
326            let p = self.buf.add(self.write_pos);
327            core::ptr::write_unaligned(p as *mut u64, ib.out as u64);
328            core::ptr::write_unaligned(p.add(8) as *mut u64, (ib.out >> 64) as u64);
329        }
330        self.write_pos += len;
331    }
332
333    #[inline(always)]
334    fn emit_u32(&mut self, v: u32) {
335        debug_assert!(self.write_pos + 4 <= self.capacity);
336        // SAFETY: write_pos + 4 <= capacity asserted; unaligned write is valid
337        // for any byte-aligned pointer within the buffer.
338        unsafe {
339            core::ptr::write_unaligned(self.buf.add(self.write_pos) as *mut u32, v.to_le());
340        }
341        self.write_pos += 4;
342    }
343
344    #[inline(always)]
345    fn emit_i32(&mut self, v: i32) {
346        debug_assert!(self.write_pos + 4 <= self.capacity);
347        // SAFETY: write_pos + 4 <= capacity asserted; unaligned write is valid.
348        // The cast chain converts i32 → LE bytes → u32 for write_unaligned.
349        unsafe {
350            core::ptr::write_unaligned(
351                self.buf.add(self.write_pos) as *mut u32,
352                v.to_le_bytes().as_ptr().cast::<u32>().read(),
353            );
354        }
355        self.write_pos += 4;
356    }
357
358    /// Emit a label reference (4-byte rel32). For backward references (label
359    /// already bound), resolves immediately without creating a fixup entry.
360    /// For forward references, emits a placeholder and records a fixup.
361    fn emit_label_fixup(&mut self, label: Label) {
362        let bound = self.labels[label.0 as usize];
363        if bound != LABEL_UNBOUND {
364            // Backward reference — resolve immediately, no fixup needed.
365            // rel32 = target - (current_offset + 4). Stored value is offset+1.
366            let target = (bound - 1) as i64;
367            let rel = target - (self.write_pos as i64 + 4);
368            self.emit_i32(rel as i32);
369        } else {
370            // Forward reference — defer to finalization.
371            let offset = self.write_pos;
372            self.fixups.push(Fixup { offset, label });
373            self.emit_u32(0); // placeholder
374        }
375    }
376
377    // === REX prefix helpers ===
378
379    fn rex_opt_b(&mut self, rm: Reg) {
380        if rm.needs_rex() {
381            self.emit(0x40 | rm.hi());
382        }
383    }
384
385    /// ModR/M (+ optional SIB) + displacement for [base + disp] addressing.
386    /// Pushes into an InstBuf instead of emitting directly.
387    #[inline(always)]
388    fn modrm_disp_ib(ib: &mut InstBuf, reg: u8, base: Reg, disp: i32) {
389        let bl = base.lo();
390        let needs_sib = bl == 4;
391
392        if disp == 0 && bl != 5 {
393            if needs_sib {
394                ib.push((reg << 3) | 4);
395                ib.push(0x24);
396            } else {
397                ib.push((reg << 3) | bl);
398            }
399        } else if (-128..=127).contains(&disp) {
400            if needs_sib {
401                ib.push(0x40 | (reg << 3) | 4);
402                ib.push(0x24);
403            } else {
404                ib.push(0x40 | (reg << 3) | bl);
405            }
406            ib.push(disp as u8);
407        } else {
408            if needs_sib {
409                ib.push(0x80 | (reg << 3) | 4);
410                ib.push(0x24);
411            } else {
412                ib.push(0x80 | (reg << 3) | bl);
413            }
414            ib.push_i32(disp);
415        }
416    }
417
418    // === Instruction emission ===
419
420    // -- MOV --
421
422    /// mov r64, r64
423    #[inline(always)]
424    pub fn mov_rr(&mut self, dst: Reg, src: Reg) {
425        if dst == src {
426            return;
427        }
428        self.emit3(
429            0x48 | (src.hi() << 2) | dst.hi(),
430            0x89,
431            0xC0 | (src.lo() << 3) | dst.lo(),
432        );
433    }
434
435    /// mov r64, imm64
436    #[inline(always)]
437    pub fn mov_ri64(&mut self, dst: Reg, imm: u64) {
438        let mut ib = InstBuf::new();
439        if imm == 0 {
440            // xor r32, r32 (clears full r64)
441            let r = dst.hi();
442            if r != 0 {
443                ib.push(0x40 | (r << 2) | r);
444            }
445            ib.push(0x31);
446            ib.push(0xC0 | (dst.lo() << 3) | dst.lo());
447        } else if imm <= u32::MAX as u64 {
448            // mov r32, imm32 (zero-extends to 64)
449            if dst.needs_rex() {
450                ib.push(0x40 | dst.hi());
451            }
452            ib.push(0xB8 + dst.lo());
453            ib.push_u32(imm as u32);
454        } else if imm as i64 >= i32::MIN as i64 && imm as i64 <= i32::MAX as i64 {
455            // mov r64, sign-extended imm32
456            ib.push(0x48 | dst.hi());
457            ib.push(0xC7);
458            ib.push(0xC0 | dst.lo());
459            ib.push_i32(imm as i32);
460        } else {
461            // mov r64, imm64
462            ib.push(0x48 | dst.hi());
463            ib.push(0xB8 + dst.lo());
464            ib.push_u64(imm);
465        }
466        self.flush_instbuf(ib);
467    }
468
469    /// mov r32, imm32 (zero-extends to 64-bit)
470    #[inline(always)]
471    pub fn mov_ri32(&mut self, dst: Reg, imm: u32) {
472        let mut ib = InstBuf::new();
473        if dst.needs_rex() {
474            ib.push(0x40 | dst.hi());
475        }
476        ib.push(0xB8 + dst.lo());
477        ib.push_u32(imm);
478        self.flush_instbuf(ib);
479    }
480
481    /// mov r64, [base + disp]
482    #[inline(always)]
483    pub fn mov_load64(&mut self, dst: Reg, base: Reg, disp: i32) {
484        let mut ib = InstBuf::new();
485        ib.push(0x48 | (dst.hi() << 2) | base.hi());
486        ib.push(0x8B);
487        Self::modrm_disp_ib(&mut ib, dst.lo(), base, disp);
488        self.flush_instbuf(ib);
489    }
490
491    /// movsxd r64, dword [base + index*4] — sign-extending load with SIB scale=4
492    pub fn movsxd_load_sib4(&mut self, dst: Reg, base: Reg, index: Reg) {
493        let mut ib = InstBuf::new();
494        ib.push(0x48 | (dst.hi() << 2) | (index.hi() << 1) | base.hi());
495        ib.push(0x63);
496        ib.push((dst.lo() << 3) | 4);
497        ib.push(0x80 | (index.lo() << 3) | base.lo());
498        self.flush_instbuf(ib);
499    }
500
501    /// mov dword [base + disp], r32 — 32-bit store
502    #[inline(always)]
503    pub fn mov_store32(&mut self, base: Reg, disp: i32, src: Reg) {
504        let mut ib = InstBuf::new();
505        let r = src.hi();
506        let b = base.hi();
507        if r != 0 || b != 0 {
508            ib.push(0x40 | (r << 2) | b);
509        }
510        ib.push(0x89);
511        Self::modrm_disp_ib(&mut ib, src.lo(), base, disp);
512        self.flush_instbuf(ib);
513    }
514
515    /// mov [base + disp], r64
516    #[inline(always)]
517    pub fn mov_store64(&mut self, base: Reg, disp: i32, src: Reg) {
518        let mut ib = InstBuf::new();
519        ib.push(0x48 | (src.hi() << 2) | base.hi());
520        ib.push(0x89);
521        Self::modrm_disp_ib(&mut ib, src.lo(), base, disp);
522        self.flush_instbuf(ib);
523    }
524
525    // -- ALU reg,reg (64-bit) --
526
527    fn alu_rr64(&mut self, op: u8, dst: Reg, src: Reg) {
528        let mut ib = InstBuf::new();
529        ib.push(0x48 | (src.hi() << 2) | dst.hi());
530        ib.push(op);
531        ib.push(0xC0 | (src.lo() << 3) | dst.lo());
532        self.flush_instbuf(ib);
533    }
534
535    fn alu_rr32(&mut self, op: u8, dst: Reg, src: Reg) {
536        let r = src.hi();
537        let b = dst.hi();
538        if r != 0 || b != 0 {
539            let mut ib = InstBuf::new();
540            ib.push(0x40 | (r << 2) | b);
541            ib.push(op);
542            ib.push(0xC0 | (src.lo() << 3) | dst.lo());
543            self.flush_instbuf(ib);
544        } else {
545            let mut ib = InstBuf::new();
546            ib.push(op);
547            ib.push(0xC0 | (src.lo() << 3) | dst.lo());
548            self.flush_instbuf(ib);
549        }
550    }
551
    /// add dst, src (64-bit, opcode 0x01).
    #[inline(always)]
    pub fn add_rr(&mut self, dst: Reg, src: Reg) {
        self.alu_rr64(0x01, dst, src);
    }
    /// sub dst, src (64-bit, opcode 0x29).
    #[inline(always)]
    pub fn sub_rr(&mut self, dst: Reg, src: Reg) {
        self.alu_rr64(0x29, dst, src);
    }
    /// and dst, src (64-bit, opcode 0x21).
    #[inline(always)]
    pub fn and_rr(&mut self, dst: Reg, src: Reg) {
        self.alu_rr64(0x21, dst, src);
    }
    /// or dst, src (64-bit, opcode 0x09).
    #[inline(always)]
    pub fn or_rr(&mut self, dst: Reg, src: Reg) {
        self.alu_rr64(0x09, dst, src);
    }
    /// xor dst, src (64-bit, opcode 0x31).
    #[inline(always)]
    pub fn xor_rr(&mut self, dst: Reg, src: Reg) {
        self.alu_rr64(0x31, dst, src);
    }
    /// cmp a, b (64-bit, opcode 0x39): `a` is the r/m operand, `b` the reg
    /// operand.
    #[inline(always)]
    pub fn cmp_rr(&mut self, a: Reg, b: Reg) {
        self.alu_rr64(0x39, a, b);
    }
    /// test a, b (64-bit, opcode 0x85).
    #[inline(always)]
    pub fn test_rr(&mut self, a: Reg, b: Reg) {
        self.alu_rr64(0x85, a, b);
    }
580
    /// `test r32, r32` — sets ZF iff the low 32 bits are zero (ignores the
    /// high 32 bits). Used by the 32-bit `divw`/`remw` zero-divisor guard,
    /// which must test only the bits `idivl`/`divl` actually consume.
    #[inline(always)]
    pub fn test_rr32(&mut self, a: Reg, b: Reg) {
        self.alu_rr32(0x85, a, b);
    }

    /// add dst, src on the 32-bit registers (opcode 0x01, no REX.W).
    #[inline(always)]
    pub fn add_rr32(&mut self, dst: Reg, src: Reg) {
        self.alu_rr32(0x01, dst, src);
    }
    /// sub dst, src on the 32-bit registers (opcode 0x29, no REX.W).
    #[inline(always)]
    pub fn sub_rr32(&mut self, dst: Reg, src: Reg) {
        self.alu_rr32(0x29, dst, src);
    }
597
598    // -- ALU reg,imm (64-bit) --
599    // Uses imm8 (opcode 0x83) when immediate fits in -128..127, saving 3 bytes.
600
601    #[inline(always)]
602    fn alu_ri64(&mut self, ext: u8, dst: Reg, imm: i32) {
603        let mut ib = InstBuf::new();
604        ib.push(0x48 | dst.hi());
605        if (-128..=127).contains(&imm) {
606            ib.push(0x83);
607            ib.push(0xC0 | (ext << 3) | dst.lo());
608            ib.push(imm as u8);
609        } else {
610            ib.push(0x81);
611            ib.push(0xC0 | (ext << 3) | dst.lo());
612            ib.push_i32(imm);
613        }
614        self.flush_instbuf(ib);
615    }
616
617    #[inline(always)]
618    fn alu_ri32(&mut self, ext: u8, dst: Reg, imm: i32) {
619        let mut ib = InstBuf::new();
620        if dst.needs_rex() {
621            ib.push(0x40 | dst.hi());
622        }
623        if (-128..=127).contains(&imm) {
624            ib.push(0x83);
625            ib.push(0xC0 | (ext << 3) | dst.lo());
626            ib.push(imm as u8);
627        } else {
628            ib.push(0x81);
629            ib.push(0xC0 | (ext << 3) | dst.lo());
630            ib.push_i32(imm);
631        }
632        self.flush_instbuf(ib);
633    }
634
    /// add dst, imm (64-bit; opcode-group extension /0).
    #[inline(always)]
    pub fn add_ri(&mut self, dst: Reg, imm: i32) {
        self.alu_ri64(0, dst, imm);
    }
    /// sub dst, imm (64-bit; opcode-group extension /5).
    #[inline(always)]
    pub fn sub_ri(&mut self, dst: Reg, imm: i32) {
        self.alu_ri64(5, dst, imm);
    }
    /// and dst, imm (64-bit; opcode-group extension /4).
    #[inline(always)]
    pub fn and_ri(&mut self, dst: Reg, imm: i32) {
        self.alu_ri64(4, dst, imm);
    }
    /// or dst, imm (64-bit; opcode-group extension /1).
    #[inline(always)]
    pub fn or_ri(&mut self, dst: Reg, imm: i32) {
        self.alu_ri64(1, dst, imm);
    }
    /// xor dst, imm (64-bit; opcode-group extension /6).
    #[inline(always)]
    pub fn xor_ri(&mut self, dst: Reg, imm: i32) {
        self.alu_ri64(6, dst, imm);
    }
    /// cmp a, imm (64-bit; opcode-group extension /7).
    #[inline(always)]
    pub fn cmp_ri(&mut self, a: Reg, imm: i32) {
        self.alu_ri64(7, a, imm);
    }
659
    /// add dst, imm on the 32-bit register (opcode-group extension /0).
    #[inline(always)]
    pub fn add_ri32(&mut self, dst: Reg, imm: i32) {
        self.alu_ri32(0, dst, imm);
    }
    /// sub dst, imm on the 32-bit register (opcode-group extension /5).
    #[inline(always)]
    pub fn sub_ri32(&mut self, dst: Reg, imm: i32) {
        self.alu_ri32(5, dst, imm);
    }
    /// cmp a, imm on the 32-bit register (opcode-group extension /7).
    #[inline(always)]
    pub fn cmp_ri32(&mut self, a: Reg, imm: i32) {
        self.alu_ri32(7, a, imm);
    }
672
673    // -- Baseless memory access [idx] (mod=00 r/m=idx, no SIB) --
674    //
675    // Codegen uses this when the PVM addr is itself the host VA — no
676    // base register needed because the runtime substrate's per-
677    // invocation page table maps `addr` → `addr` for the guest range.
678    // Saves one byte per access vs the SIB form `[base + idx]`.
679    //
680    // ModRM bytes: `(mod=00, reg, r/m=idx.lo())` if idx.lo() ∉ {4, 5}.
681    // For `idx.lo() == 4` (RSP/R12) the ModRM r/m=100 encoding means
682    // "SIB follows"; emit SIB = `(scale=00, index=100, base=4)` to
683    // recover plain `[idx]`. For `idx.lo() == 5` (RBP/R13) the
684    // mod=00 r/m=101 form is reserved for [RIP+disp32], so use
685    // mod=01 + disp8=0.
686    fn modrm_baseless(&mut self, reg: u8, idx: Reg) {
687        let r = reg & 7;
688        match idx.lo() {
689            4 => {
690                // SIB form: [RSP/R12]
691                self.emit((r << 3) | 4); // mod=00 r/m=100
692                self.emit(0x24); // SIB: scale=00, index=100=none, base=4
693            }
694            5 => {
695                // mod=01 disp8=0: [RBP/R13]
696                self.emit(0x40 | (r << 3) | 5);
697                self.emit(0);
698            }
699            _ => {
700                self.emit((r << 3) | idx.lo());
701            }
702        }
703    }
704
705    /// movzx r64, byte \[idx\] — zero-extending u8 load
706    pub fn movzx_load8_at_index(&mut self, dst: Reg, idx: Reg) {
707        self.emit(0x48 | (dst.hi() << 2) | idx.hi());
708        self.emit(0x0F);
709        self.emit(0xB6);
710        self.modrm_baseless(dst.lo(), idx);
711    }
712
713    /// movzx r64, word \[idx\] — zero-extending u16 load
714    pub fn movzx_load16_at_index(&mut self, dst: Reg, idx: Reg) {
715        let rex = 0x40 | (dst.hi() << 2) | idx.hi();
716        if rex != 0x40 {
717            self.emit(rex);
718        }
719        self.emit(0x0F);
720        self.emit(0xB7);
721        self.modrm_baseless(dst.lo(), idx);
722    }
723
724    /// mov r32, dword \[idx\] — zero-extending u32 load (writes EAX, clears upper 32)
725    pub fn mov_load32_at_index(&mut self, dst: Reg, idx: Reg) {
726        let rex = 0x40 | (dst.hi() << 2) | idx.hi();
727        if rex != 0x40 {
728            self.emit(rex);
729        }
730        self.emit(0x8B);
731        self.modrm_baseless(dst.lo(), idx);
732    }
733
734    /// mov r64, qword \[idx\]
735    pub fn mov_load64_at_index(&mut self, dst: Reg, idx: Reg) {
736        self.emit(0x48 | (dst.hi() << 2) | idx.hi());
737        self.emit(0x8B);
738        self.modrm_baseless(dst.lo(), idx);
739    }
740
741    /// mov byte \[idx\], r8
742    pub fn mov_store8_at_index(&mut self, idx: Reg, src: Reg) {
743        // REX prefix mandatory if `src` is SIL/DIL/BPL/SPL (encoded
744        // 4..=7 with hi=0) — without REX those encodings decode as
745        // AH/CH/DH/BH. (`needs_rex` only catches R8-R15.)
746        let rex = 0x40 | (src.hi() << 2) | idx.hi();
747        if rex != 0x40 || src.lo() >= 4 {
748            self.emit(rex);
749        }
750        self.emit(0x88);
751        self.modrm_baseless(src.lo(), idx);
752    }
753
754    /// mov word \[idx\], r16
755    pub fn mov_store16_at_index(&mut self, idx: Reg, src: Reg) {
756        self.emit(0x66);
757        let rex = 0x40 | (src.hi() << 2) | idx.hi();
758        if rex != 0x40 {
759            self.emit(rex);
760        }
761        self.emit(0x89);
762        self.modrm_baseless(src.lo(), idx);
763    }
764
765    /// mov dword \[idx\], r32
766    pub fn mov_store32_at_index(&mut self, idx: Reg, src: Reg) {
767        let rex = 0x40 | (src.hi() << 2) | idx.hi();
768        if rex != 0x40 {
769            self.emit(rex);
770        }
771        self.emit(0x89);
772        self.modrm_baseless(src.lo(), idx);
773    }
774
775    /// mov qword \[idx\], r64
776    pub fn mov_store64_at_index(&mut self, idx: Reg, src: Reg) {
777        self.emit(0x48 | (src.hi() << 2) | idx.hi());
778        self.emit(0x89);
779        self.modrm_baseless(src.lo(), idx);
780    }
781
782    // -- RIP-relative addressing `[rip + disp32]` --
783    //
784    // 64-bit mode's mod=00 r/m=101 form is RIP-relative: the effective
785    // address is `RIP_after_instruction + sign_extend(disp32)`. Used to
786    // reach CTX, which lives at 512 GiB (PML4 slot 1, adjacent to the
787    // JIT arena), outside the guest u32 address space and therefore
788    // beyond absolute-SIB disp32 reach. The `jit_va_base` field of the
789    // assembler holds the eventual load VA;
790    // `target_va - (jit_va_base + write_pos + 4)` is the disp32 we
791    // emit.
792    //
793    // 1 byte shorter per access than the absolute-SIB form (no SIB
794    // byte). Range: target must be within ±2 GiB of post-instruction
795    // RIP, which holds because CTX sits adjacent to the JIT arena.
796    /// Emit a 4-byte RIP-relative disp32 targeting `target_va`. The CPU
797    /// resolves the effective address against RIP-at-next-instruction,
798    /// so we add 4 (for the disp32 itself) plus `trailing_bytes` (any
799    /// imm fields following disp32 in the same instruction — 4 for
800    /// `mov [rip+disp32], imm32`, 0 for everything else).
801    fn emit_rip_rel_disp32(&mut self, target_va: u64, trailing_bytes: u64) {
802        let post_inst_rip = self
803            .jit_va_base
804            .wrapping_add(self.write_pos as u64)
805            .wrapping_add(4)
806            .wrapping_add(trailing_bytes);
807        let disp = (target_va as i64).wrapping_sub(post_inst_rip as i64);
808        debug_assert!(
809            disp >= i32::MIN as i64 && disp <= i32::MAX as i64,
810            "RIP-relative target 0x{:x} out of range from base 0x{:x} + offset 0x{:x}",
811            target_va,
812            self.jit_va_base,
813            self.write_pos
814        );
815        self.emit_i32(disp as i32);
816    }
817
818    fn modrm_rip_rel(&mut self, reg: u8) {
819        // mod=00, reg, r/m=101 = RIP-relative (no SIB byte).
820        self.emit(((reg & 7) << 3) | 5);
821    }
822
823    /// mov r32, dword [rip+disp32] (zero-extends to 64-bit)
824    pub fn mov_load32_rip_rel(&mut self, dst: Reg, target_va: u64) {
825        if dst.hi() != 0 {
826            self.emit(0x40 | (dst.hi() << 2));
827        }
828        self.emit(0x8B);
829        self.modrm_rip_rel(dst.lo());
830        self.emit_rip_rel_disp32(target_va, 0);
831    }
832
833    /// mov r64, qword [rip+disp32]
834    pub fn mov_load64_rip_rel(&mut self, dst: Reg, target_va: u64) {
835        self.emit(0x48 | (dst.hi() << 2));
836        self.emit(0x8B);
837        self.modrm_rip_rel(dst.lo());
838        self.emit_rip_rel_disp32(target_va, 0);
839    }
840
841    /// mov dword [rip+disp32], r32
842    pub fn mov_store32_rip_rel(&mut self, target_va: u64, src: Reg) {
843        if src.hi() != 0 {
844            self.emit(0x40 | (src.hi() << 2));
845        }
846        self.emit(0x89);
847        self.modrm_rip_rel(src.lo());
848        self.emit_rip_rel_disp32(target_va, 0);
849    }
850
851    /// mov qword [rip+disp32], r64
852    pub fn mov_store64_rip_rel(&mut self, target_va: u64, src: Reg) {
853        self.emit(0x48 | (src.hi() << 2));
854        self.emit(0x89);
855        self.modrm_rip_rel(src.lo());
856        self.emit_rip_rel_disp32(target_va, 0);
857    }
858
859    /// mov dword [rip+disp32], imm32 — the trailing imm32 means the
860    /// "next-instruction RIP" is 4 bytes further than disp32 alone.
861    pub fn mov_store32_rip_rel_imm(&mut self, target_va: u64, imm: i32) {
862        self.emit(0xC7);
863        self.modrm_rip_rel(0);
864        self.emit_rip_rel_disp32(target_va, 4);
865        self.emit_i32(imm);
866    }
867
868    /// add r64, qword [rip+disp32]
869    pub fn add_r64_mem_rip_rel(&mut self, dst: Reg, target_va: u64) {
870        self.emit(0x48 | (dst.hi() << 2));
871        self.emit(0x03);
872        self.modrm_rip_rel(dst.lo());
873        self.emit_rip_rel_disp32(target_va, 0);
874    }
875
876    /// cmp r64, qword [rip+disp32] — compare a register against a memory
877    /// operand (used by the x3/x4 spill path's branch, where a spilled
878    /// source lives in `JitContext.regs[13|14]`).
879    pub fn cmp_r64_mem_rip_rel(&mut self, dst: Reg, target_va: u64) {
880        self.emit(0x48 | (dst.hi() << 2));
881        self.emit(0x3B);
882        self.modrm_rip_rel(dst.lo());
883        self.emit_rip_rel_disp32(target_va, 0);
884    }
885
886    // -- In-register gas decrement (patchable) --
887
888    /// sub r64, imm32 in the always-imm32 (7-byte) encoding.
889    /// `offset() - 4` after this call points at the imm32 field;
890    /// callers patch it after emission for per-block gas metering.
891    pub fn sub_r64_imm32_patchable(&mut self, dst: Reg, imm: i32) {
892        // NOTE: Cannot use InstBuf here — caller reads offset() for
893        // patching. The imm32 must be the trailing 4 bytes.
894        self.emit(0x48 | dst.hi());
895        self.emit(0x81);
896        self.emit(0xE8 | dst.lo()); // mod=11 (register), reg=5 (sub), r/m=dst.lo()
897        self.emit_i32(imm);
898    }
899
900    /// cmp r64, imm32 in the always-imm32 (7-byte) encoding — a
901    /// non-mutating gate (sets flags, leaves `dst` unchanged). Mirrors
902    /// [`Self::sub_r64_imm32_patchable`] but with reg-field = 7 (CMP).
903    /// `offset() - 4` after this call points at the imm32 field; callers
904    /// patch it for the per-block check-before-charge gas gate.
905    pub fn cmp_r64_imm32_patchable(&mut self, dst: Reg, imm: i32) {
906        self.emit(0x48 | dst.hi());
907        self.emit(0x81);
908        self.emit(0xF8 | dst.lo()); // mod=11 (register), reg=7 (cmp), r/m=dst.lo()
909        self.emit_i32(imm);
910    }
911
912    // -- IMUL --
913
914    /// imul r64, r64
915    pub fn imul_rr(&mut self, dst: Reg, src: Reg) {
916        let mut ib = InstBuf::new();
917        ib.push(0x48 | (dst.hi() << 2) | src.hi());
918        ib.push(0x0F);
919        ib.push(0xAF);
920        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
921        self.flush_instbuf(ib);
922    }
923
924    /// imul r32, r32
925    pub fn imul_rr32(&mut self, dst: Reg, src: Reg) {
926        let mut ib = InstBuf::new();
927        let r = dst.hi();
928        let b = src.hi();
929        if r != 0 || b != 0 {
930            ib.push(0x40 | (r << 2) | b);
931        }
932        ib.push(0x0F);
933        ib.push(0xAF);
934        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
935        self.flush_instbuf(ib);
936    }
937
938    // -- MUL/IMUL widening (RDX:RAX = RAX * src) --
939
940    /// mul r64 (unsigned RDX:RAX = RAX * src)
941    pub fn mul_rdx_rax(&mut self, src: Reg) {
942        self.emit3(0x48 | src.hi(), 0xF7, 0xE0 | src.lo());
943    }
944
945    /// imul r64 (signed RDX:RAX = RAX * src)
946    pub fn imul_rdx_rax(&mut self, src: Reg) {
947        self.emit3(0x48 | src.hi(), 0xF7, 0xE8 | src.lo());
948    }
949
950    // -- DIV/IDIV --
951
952    /// div r64 (unsigned RAX = RDX:RAX / src, RDX = remainder)
953    pub fn div64(&mut self, src: Reg) {
954        self.emit3(0x48 | src.hi(), 0xF7, 0xF0 | src.lo());
955    }
956
957    /// idiv r64 (signed)
958    pub fn idiv64(&mut self, src: Reg) {
959        self.emit3(0x48 | src.hi(), 0xF7, 0xF8 | src.lo());
960    }
961
962    /// div r32
963    pub fn div32(&mut self, src: Reg) {
964        if src.needs_rex() {
965            self.emit3(0x41, 0xF7, 0xF0 | src.lo());
966        } else {
967            let mut ib = InstBuf::new();
968            ib.push(0xF7);
969            ib.push(0xF0 | src.lo());
970            self.flush_instbuf(ib);
971        }
972    }
973
974    /// idiv r32
975    pub fn idiv32(&mut self, src: Reg) {
976        if src.needs_rex() {
977            self.emit3(0x41, 0xF7, 0xF8 | src.lo());
978        } else {
979            let mut ib = InstBuf::new();
980            ib.push(0xF7);
981            ib.push(0xF8 | src.lo());
982            self.flush_instbuf(ib);
983        }
984    }
985
986    /// cqo (sign-extend RAX into RDX:RAX, 64-bit)
987    pub fn cqo(&mut self) {
988        self.emit(0x48);
989        self.emit(0x99);
990    }
991
992    /// cdq (sign-extend EAX into EDX:EAX, 32-bit)
993    pub fn cdq(&mut self) {
994        self.emit(0x99);
995    }
996
997    // -- NEG/NOT --
998
999    /// neg r64
1000    pub fn neg64(&mut self, dst: Reg) {
1001        self.emit3(0x48 | dst.hi(), 0xF7, 0xD8 | dst.lo());
1002    }
1003
1004    /// not r64
1005    pub fn not64(&mut self, dst: Reg) {
1006        self.emit3(0x48 | dst.hi(), 0xF7, 0xD0 | dst.lo());
1007    }
1008
1009    // -- Shifts --
1010
1011    fn shift_ri64(&mut self, ext: u8, dst: Reg, imm: u8) {
1012        let mut ib = InstBuf::new();
1013        ib.push(0x48 | dst.hi());
1014        ib.push(0xC1);
1015        ib.push(0xC0 | (ext << 3) | dst.lo());
1016        ib.push(imm);
1017        self.flush_instbuf(ib);
1018    }
1019
1020    pub fn shift_cl64(&mut self, ext: u8, dst: Reg) {
1021        let mut ib = InstBuf::new();
1022        ib.push(0x48 | dst.hi());
1023        ib.push(0xD3);
1024        ib.push(0xC0 | (ext << 3) | dst.lo());
1025        self.flush_instbuf(ib);
1026    }
1027
1028    fn shift_ri32(&mut self, ext: u8, dst: Reg, imm: u8) {
1029        let mut ib = InstBuf::new();
1030        if dst.needs_rex() {
1031            ib.push(0x40 | dst.hi());
1032        }
1033        ib.push(0xC1);
1034        ib.push(0xC0 | (ext << 3) | dst.lo());
1035        ib.push(imm);
1036        self.flush_instbuf(ib);
1037    }
1038
1039    pub fn shift_cl32(&mut self, ext: u8, dst: Reg) {
1040        let mut ib = InstBuf::new();
1041        if dst.needs_rex() {
1042            ib.push(0x40 | dst.hi());
1043        }
1044        ib.push(0xD3);
1045        ib.push(0xC0 | (ext << 3) | dst.lo());
1046        self.flush_instbuf(ib);
1047    }
1048
1049    pub fn shl_ri64(&mut self, dst: Reg, imm: u8) {
1050        self.shift_ri64(4, dst, imm);
1051    }
1052    pub fn shr_ri64(&mut self, dst: Reg, imm: u8) {
1053        self.shift_ri64(5, dst, imm);
1054    }
1055    pub fn sar_ri64(&mut self, dst: Reg, imm: u8) {
1056        self.shift_ri64(7, dst, imm);
1057    }
1058    pub fn shl_cl64(&mut self, dst: Reg) {
1059        self.shift_cl64(4, dst);
1060    }
1061    pub fn ror_ri64(&mut self, dst: Reg, imm: u8) {
1062        self.shift_ri64(1, dst, imm);
1063    }
1064
1065    pub fn shl_ri32(&mut self, dst: Reg, imm: u8) {
1066        self.shift_ri32(4, dst, imm);
1067    }
1068    pub fn shr_ri32(&mut self, dst: Reg, imm: u8) {
1069        self.shift_ri32(5, dst, imm);
1070    }
1071    pub fn sar_ri32(&mut self, dst: Reg, imm: u8) {
1072        self.shift_ri32(7, dst, imm);
1073    }
1074    pub fn ror_ri32(&mut self, dst: Reg, imm: u8) {
1075        self.shift_ri32(1, dst, imm);
1076    }
1077
1078    // -- Extensions --
1079
1080    /// movsxd r64, r32 (sign-extend 32→64)
1081    #[inline(always)]
1082    pub fn movsxd(&mut self, dst: Reg, src: Reg) {
1083        let mut ib = InstBuf::new();
1084        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1085        ib.push(0x63);
1086        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1087        self.flush_instbuf(ib);
1088    }
1089
1090    /// movsx r64, r8 (sign-extend 8→64)
1091    pub fn movsx_8_64(&mut self, dst: Reg, src: Reg) {
1092        let mut ib = InstBuf::new();
1093        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1094        ib.push(0x0F);
1095        ib.push(0xBE);
1096        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1097        self.flush_instbuf(ib);
1098    }
1099
1100    /// movsx r64, r16 (sign-extend 16→64)
1101    pub fn movsx_16_64(&mut self, dst: Reg, src: Reg) {
1102        let mut ib = InstBuf::new();
1103        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1104        ib.push(0x0F);
1105        ib.push(0xBF);
1106        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1107        self.flush_instbuf(ib);
1108    }
1109
1110    /// movzx r32, r16 (zero-extends to 64 due to 32-bit operation)
1111    pub fn movzx_16_64(&mut self, dst: Reg, src: Reg) {
1112        let mut ib = InstBuf::new();
1113        let r = dst.hi();
1114        let b = src.hi();
1115        if r != 0 || b != 0 {
1116            ib.push(0x40 | (r << 2) | b);
1117        }
1118        ib.push(0x0F);
1119        ib.push(0xB7);
1120        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1121        self.flush_instbuf(ib);
1122    }
1123
1124    /// Zero-extend 32→64: mov r32, r32 (implicit zero-extend)
1125    #[inline(always)]
1126    pub fn movzx_32_64(&mut self, dst: Reg, src: Reg) {
1127        let mut ib = InstBuf::new();
1128        let r = src.hi();
1129        let b = dst.hi();
1130        if r != 0 || b != 0 {
1131            ib.push(0x40 | (r << 2) | b);
1132        }
1133        ib.push(0x89);
1134        ib.push(0xC0 | (src.lo() << 3) | dst.lo());
1135        self.flush_instbuf(ib);
1136    }
1137
1138    // -- Conditional set --
1139
1140    /// setcc r8 (sets low byte, need to movzx after)
1141    #[inline(always)]
1142    pub fn setcc(&mut self, cc: Cc, dst: Reg) {
1143        let mut ib = InstBuf::new();
1144        if dst.needs_rex() || matches!(dst, Reg::RSP | Reg::RBP | Reg::RSI | Reg::RDI) {
1145            ib.push(0x40 | dst.hi());
1146        }
1147        ib.push(0x0F);
1148        ib.push(0x90 + cc as u8);
1149        ib.push(0xC0 | dst.lo());
1150        self.flush_instbuf(ib);
1151    }
1152
1153    /// cmovcc r64, r64
1154    #[inline(always)]
1155    pub fn cmovcc(&mut self, cc: Cc, dst: Reg, src: Reg) {
1156        let mut ib = InstBuf::new();
1157        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1158        ib.push(0x0F);
1159        ib.push(0x40 + cc as u8);
1160        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1161        self.flush_instbuf(ib);
1162    }
1163
1164    // -- Bit manipulation (require BMI/POPCNT support) --
1165
1166    /// popcnt r64, r64
1167    pub fn popcnt64(&mut self, dst: Reg, src: Reg) {
1168        let mut ib = InstBuf::new();
1169        ib.push(0xF3);
1170        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1171        ib.push(0x0F);
1172        ib.push(0xB8);
1173        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1174        self.flush_instbuf(ib);
1175    }
1176
1177    /// lzcnt r64, r64
1178    pub fn lzcnt64(&mut self, dst: Reg, src: Reg) {
1179        let mut ib = InstBuf::new();
1180        ib.push(0xF3);
1181        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1182        ib.push(0x0F);
1183        ib.push(0xBD);
1184        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1185        self.flush_instbuf(ib);
1186    }
1187
1188    /// tzcnt r64, r64
1189    pub fn tzcnt64(&mut self, dst: Reg, src: Reg) {
1190        let mut ib = InstBuf::new();
1191        ib.push(0xF3);
1192        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1193        ib.push(0x0F);
1194        ib.push(0xBC);
1195        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1196        self.flush_instbuf(ib);
1197    }
1198
1199    /// popcnt r32, r32 (result zero-extended to 64 bits)
1200    pub fn popcnt32(&mut self, dst: Reg, src: Reg) {
1201        let mut ib = InstBuf::new();
1202        ib.push(0xF3);
1203        let rex = (dst.hi() << 2) | src.hi();
1204        if rex != 0 {
1205            ib.push(0x40 | rex);
1206        }
1207        ib.push(0x0F);
1208        ib.push(0xB8);
1209        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1210        self.flush_instbuf(ib);
1211    }
1212
1213    /// lzcnt r32, r32 (result zero-extended to 64 bits)
1214    pub fn lzcnt32(&mut self, dst: Reg, src: Reg) {
1215        let mut ib = InstBuf::new();
1216        ib.push(0xF3);
1217        let rex = (dst.hi() << 2) | src.hi();
1218        if rex != 0 {
1219            ib.push(0x40 | rex);
1220        }
1221        ib.push(0x0F);
1222        ib.push(0xBD);
1223        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1224        self.flush_instbuf(ib);
1225    }
1226
1227    /// tzcnt r32, r32 (result zero-extended to 64 bits)
1228    pub fn tzcnt32(&mut self, dst: Reg, src: Reg) {
1229        let mut ib = InstBuf::new();
1230        ib.push(0xF3);
1231        let rex = (dst.hi() << 2) | src.hi();
1232        if rex != 0 {
1233            ib.push(0x40 | rex);
1234        }
1235        ib.push(0x0F);
1236        ib.push(0xBC);
1237        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1238        self.flush_instbuf(ib);
1239    }
1240
1241    /// bswap r64
1242    pub fn bswap64(&mut self, dst: Reg) {
1243        self.emit3(0x48 | dst.hi(), 0x0F, 0xC8 + dst.lo());
1244    }
1245
1246    // -- Stack --
1247
1248    #[inline(always)]
1249    pub fn push(&mut self, reg: Reg) {
1250        self.rex_opt_b(reg);
1251        self.emit(0x50 + reg.lo());
1252    }
1253
1254    #[inline(always)]
1255    pub fn pop(&mut self, reg: Reg) {
1256        self.rex_opt_b(reg);
1257        self.emit(0x58 + reg.lo());
1258    }
1259
1260    // -- Branches and jumps --
1261
1262    /// jmp to label — uses rel8 for backward jumps within ±127 bytes.
1263    #[inline(always)]
1264    pub fn jmp_label(&mut self, label: Label) {
1265        let bound = self.labels[label.0 as usize];
1266        if bound != LABEL_UNBOUND {
1267            let target = (bound - 1) as isize; // stored as offset+1
1268            // Backward jump — label already bound, try rel8.
1269            let rel = target - (self.write_pos as isize + 2);
1270            if rel >= i8::MIN as isize && rel <= i8::MAX as isize {
1271                self.emit(0xEB);
1272                self.emit(rel as u8);
1273                return;
1274            }
1275        }
1276        // Forward jump or out of rel8 range — use rel32
1277        self.emit(0xE9);
1278        self.emit_label_fixup(label);
1279    }
1280
1281    /// jcc to label — uses rel8 for backward jumps within ±127 bytes.
1282    #[inline(always)]
1283    pub fn jcc_label(&mut self, cc: Cc, label: Label) {
1284        let bound = self.labels[label.0 as usize];
1285        if bound != LABEL_UNBOUND {
1286            let target = (bound - 1) as isize; // stored as offset+1
1287            // Backward jump — label already bound, try rel8.
1288            let rel = target - (self.write_pos as isize + 2);
1289            if rel >= i8::MIN as isize && rel <= i8::MAX as isize {
1290                self.emit(0x70 + cc as u8);
1291                self.emit(rel as u8);
1292                return;
1293            }
1294        }
1295        // Forward jump or out of rel8 range — use rel32
1296        self.emit(0x0F);
1297        self.emit(0x80 + cc as u8);
1298        self.emit_label_fixup(label);
1299    }
1300
1301    /// jmp r64 (indirect)
1302    pub fn jmp_reg(&mut self, reg: Reg) {
1303        self.rex_opt_b(reg);
1304        self.emit(0xFF);
1305        self.emit(0xE0 | reg.lo()); // /4
1306    }
1307
1308    /// ret
1309    pub fn ret(&mut self) {
1310        self.emit(0xC3);
1311    }
1312
1313    // -- LEA --
1314
1315    /// lea r64, [base + disp]
1316    pub fn lea(&mut self, dst: Reg, base: Reg, disp: i32) {
1317        let mut ib = InstBuf::new();
1318        ib.push(0x48 | (dst.hi() << 2) | base.hi());
1319        ib.push(0x8D);
1320        Self::modrm_disp_ib(&mut ib, dst.lo(), base, disp);
1321        self.flush_instbuf(ib);
1322    }
1323
1324    /// lea r32, [base + disp] — 32-bit result, zero-extends to 64-bit.
1325    #[inline(always)]
1326    pub fn lea_32(&mut self, dst: Reg, base: Reg, disp: i32) {
1327        let mut ib = InstBuf::new();
1328        let r = dst.hi();
1329        let b = base.hi();
1330        if r != 0 || b != 0 {
1331            ib.push(0x40 | (r << 2) | b);
1332        }
1333        ib.push(0x8D);
1334        Self::modrm_disp_ib(&mut ib, dst.lo(), base, disp);
1335        self.flush_instbuf(ib);
1336    }
1337
1338    /// lea r32, [base32 + index32 * (1 << scale_log2)]
1339    /// scale_log2: 0=*1, 1=*2, 2=*4, 3=*8
1340    #[inline(always)]
1341    pub fn lea_sib_scaled_32(&mut self, dst: Reg, base: Reg, index: Reg, scale_log2: u8) {
1342        debug_assert!(scale_log2 <= 3);
1343        let mut ib = InstBuf::new();
1344        let rex = 0x40 | (dst.hi() << 2) | (index.hi() << 1) | base.hi();
1345        if rex != 0x40 {
1346            ib.push(rex);
1347        }
1348        ib.push(0x8D);
1349        let scale_bits = scale_log2 << 6;
1350        if base.lo() == 5 {
1351            ib.push(0x44 | (dst.lo() << 3));
1352            ib.push(scale_bits | (index.lo() << 3) | base.lo());
1353            ib.push(0x00);
1354        } else {
1355            ib.push((dst.lo() << 3) | 0x04);
1356            ib.push(scale_bits | (index.lo() << 3) | base.lo());
1357        }
1358        self.flush_instbuf(ib);
1359    }
1360
1361    // -- Misc --
1362
1363    /// nop
1364    pub fn nop(&mut self) {
1365        self.emit(0x90);
1366    }
1367
1368    // === Finalization ===
1369
1370    /// Get the resolved native offset for a label (only valid after bind_label).
1371    pub fn label_offset(&self, label: Label) -> Option<usize> {
1372        let off = self.labels[label.0 as usize];
1373        if off == LABEL_UNBOUND {
1374            None
1375        } else {
1376            Some((off - 1) as usize)
1377        }
1378    }
1379
1380    /// Current number of pending fixups. Capture this before emitting a
1381    /// label-referencing instruction so the caller can later `redirect_fixup`
1382    /// the entry that the emission appends. Forward references push exactly
1383    /// one fixup; backward references push none (resolved immediately).
1384    #[inline]
1385    pub fn fixups_len(&self) -> usize {
1386        self.fixups.len()
1387    }
1388
1389    /// Rewrite the target label of an existing fixup. Used by RV streaming
1390    /// compile to redirect forward branches whose target is determined to
1391    /// be invalid only after the streaming pass completes.
1392    pub fn redirect_fixup(&mut self, idx: usize, new_label: Label) {
1393        self.fixups[idx].label = new_label;
1394    }
1395
1396    /// Sync Vec length with the write cursor. Call before accessing `self.code` directly.
1397    #[cfg(test)]
1398    pub fn sync_len(&mut self) {
1399        let code = &mut self.code_buf;
1400        // SAFETY: write_pos <= code.capacity() (maintained by emission guards).
1401        unsafe {
1402            code.set_len(self.write_pos);
1403        }
1404    }
1405
1406    /// Resolve all label fixups in-place.
1407    fn resolve_fixups(&mut self) {
1408        for fixup in &self.fixups {
1409            let stored = self.labels[fixup.label.0 as usize];
1410            // All labels must be bound by finalization time.
1411            assert!(stored != LABEL_UNBOUND, "unbound label {:?}", fixup.label);
1412            let target = stored - 1; // stored as offset+1
1413            let rel = (target as i64) - (fixup.offset as i64 + 4);
1414            let rel32 = rel as i32;
1415            // SAFETY: fixup.offset + 4 <= write_pos (fixup was recorded during emission).
1416            unsafe {
1417                core::ptr::copy_nonoverlapping(
1418                    rel32.to_le_bytes().as_ptr(),
1419                    self.buf.add(fixup.offset),
1420                    4,
1421                );
1422            }
1423        }
1424    }
1425
1426    /// Resolve fixups and return the code as a `Vec<u8>`.
1427    pub fn finalize(&mut self) -> Vec<u8> {
1428        self.resolve_fixups();
1429        let code = &mut self.code_buf;
1430        // SAFETY: write_pos <= code.capacity().
1431        unsafe {
1432            code.set_len(self.write_pos);
1433        }
1434        core::mem::take(code)
1435    }
1436
1437    /// Get a slice of the written code bytes (for tests). Syncs Vec len first.
1438    #[cfg(test)]
1439    pub fn code_bytes(&mut self) -> &[u8] {
1440        self.sync_len();
1441        let v = &self.code_buf;
1442        v.as_slice()
1443    }
1444}
1445
#[cfg(test)]
mod tests {
    use super::*;

    /// Loading zero is expected to use the 2-byte `xor eax, eax` idiom.
    #[test]
    fn test_mov_ri64_zero() {
        let mut asm = Assembler::new();
        asm.mov_ri64(Reg::RAX, 0);
        // xor eax, eax → 0x31 0xC0
        assert_eq!(asm.code_bytes(), &[0x31, 0xC0]);
    }

    /// A small positive immediate is expected to use the 5-byte
    /// `mov r32, imm32` form.
    #[test]
    fn test_mov_ri64_small() {
        let mut asm = Assembler::new();
        asm.mov_ri64(Reg::RAX, 42);
        // mov eax, 42 → 0xB8, 0x2A, 0x00, 0x00, 0x00
        assert_eq!(asm.code_bytes(), &[0xB8, 0x2A, 0x00, 0x00, 0x00]);
    }

    /// Forward jump: the rel32 field is patched at finalization.
    #[test]
    fn test_label_resolution() {
        let mut asm = Assembler::new();
        let lbl = asm.new_label();
        asm.jmp_label(lbl); // 5 bytes: E9 + 4-byte rel32
        asm.nop(); // 1 byte at offset 5
        asm.bind_label(lbl); // label at offset 6
        let code = asm.finalize();
        // The rel32 field starts at offset 1 (right after E9) and the
        // target is offset 6, so rel = 6 - (1 + 4) = 1.
        assert_eq!(code[0], 0xE9);
        let rel = i32::from_le_bytes([code[1], code[2], code[3], code[4]]);
        assert_eq!(rel, 1); // skip over the nop
    }

    /// Backward jump to an already-bound label within rel8 range uses the
    /// 2-byte `EB rel8` form, with rel8 measured from the end of the jump.
    #[test]
    fn test_jmp_label_backward_rel8() {
        let mut asm = Assembler::new();
        let lbl = asm.new_label();
        asm.bind_label(lbl); // label at offset 0
        asm.nop(); // 1 byte at offset 0
        asm.jmp_label(lbl); // starts at offset 1, ends at offset 3
        // rel8 = 0 - (1 + 2) = -3 → 0xFD
        assert_eq!(asm.code_bytes(), &[0x90, 0xEB, 0xFD]);
    }

    /// Baseless `mov eax, [rdx]`:
    /// 0x8B + ModRM (mod=00, reg=000=eax, r/m=010=rdx) = 2 bytes.
    /// Neither a REX prefix nor a SIB byte is emitted; codegen relies on
    /// this compact form.
    #[test]
    fn test_mov_load32_at_rdx() {
        let mut asm = Assembler::new();
        asm.mov_load32_at_index(Reg::RAX, Reg::RDX);
        assert_eq!(asm.code_bytes(), &[0x8B, 0x02]);
    }

    /// Same shape but with R8 as dst (REX.R=1) and R12 as idx
    /// (lo=4, needs SIB recovery to encode bare `[r12]`).
    #[test]
    fn test_mov_load64_at_r12_into_r8() {
        let mut asm = Assembler::new();
        asm.mov_load64_at_index(Reg::R8, Reg::R12);
        // REX.W=1 REX.R=1 REX.B=1 → 0x4D
        // opcode 0x8B
        // ModRM mod=00 reg=000 r/m=100 → 0x04
        // SIB scale=00 index=100=none base=100 → 0x24
        assert_eq!(asm.code_bytes(), &[0x4D, 0x8B, 0x04, 0x24]);
    }

    /// `mov eax, [rip+disp32]` reaches `target_va`. The instruction is
    /// 6 bytes (no REX needed) and starts at `jit_va_base`, so the
    /// post-instruction RIP is base + 6 and disp32 = target_va - (base + 6).
    #[test]
    fn test_mov_load32_rip_rel() {
        let mut asm = Assembler::new();
        asm.set_jit_va_base(0x1_0000_0000); // 4 GiB
        asm.mov_load32_rip_rel(Reg::RAX, 0x1_0000_0040); // 64 bytes above base
        // 8B 05 <disp32>
        // disp = 0x1_0000_0040 - (0x1_0000_0000 + 6) = 0x3A.
        assert_eq!(asm.code_bytes(), &[0x8B, 0x05, 0x3A, 0x00, 0x00, 0x00]);
    }

    /// `mov [rip+disp32], r15` with REX.W|REX.R = 0x4C.
    #[test]
    fn test_mov_store64_rip_rel_r15() {
        let mut asm = Assembler::new();
        asm.set_jit_va_base(0x1_0000_0000);
        asm.mov_store64_rip_rel(0x1_0000_0080, Reg::R15);
        // 4C 89 3D <disp32> — 7-byte instruction.
        // disp = 0x80 - (0 + 7) = 0x79.
        assert_eq!(
            asm.code_bytes(),
            &[0x4C, 0x89, 0x3D, 0x79, 0x00, 0x00, 0x00]
        );
    }

    /// `mov [rdx], eax` (32-bit store) — baseless form, 2 bytes.
    #[test]
    fn test_mov_store32_at_rdx() {
        let mut asm = Assembler::new();
        asm.mov_store32_at_index(Reg::RDX, Reg::RAX);
        assert_eq!(asm.code_bytes(), &[0x89, 0x02]);
    }

    /// `mov [rdx], r11d` — REX.R for r11 source.
    #[test]
    fn test_mov_store32_at_rdx_from_r11() {
        let mut asm = Assembler::new();
        asm.mov_store32_at_index(Reg::RDX, Reg::R11);
        assert_eq!(asm.code_bytes(), &[0x44, 0x89, 0x1A]);
    }

    /// `mov byte [rdx], sil` — src ∈ {SPL,BPL,SIL,DIL} requires a
    /// REX prefix to disambiguate from AH/CH/DH/BH. The bare `88 32`
    /// would decode as `mov [rdx], DH` which is silently wrong.
    #[test]
    fn test_mov_store8_at_rdx_from_sil() {
        let mut asm = Assembler::new();
        asm.mov_store8_at_index(Reg::RDX, Reg::RSI);
        assert_eq!(asm.code_bytes(), &[0x40, 0x88, 0x32]);
    }

    /// `mov byte [rdx], dil` — REX required, as for SIL.
    #[test]
    fn test_mov_store8_at_rdx_from_dil() {
        let mut asm = Assembler::new();
        asm.mov_store8_at_index(Reg::RDX, Reg::RDI);
        assert_eq!(asm.code_bytes(), &[0x40, 0x88, 0x3A]);
    }

    /// `mov byte [rdx], bpl` — REX required, as for SIL.
    #[test]
    fn test_mov_store8_at_rdx_from_bpl() {
        let mut asm = Assembler::new();
        asm.mov_store8_at_index(Reg::RDX, Reg::RBP);
        assert_eq!(asm.code_bytes(), &[0x40, 0x88, 0x2A]);
    }

    /// `mov byte [rdx], spl` — REX required, as for SIL.
    #[test]
    fn test_mov_store8_at_rdx_from_spl() {
        let mut asm = Assembler::new();
        asm.mov_store8_at_index(Reg::RDX, Reg::RSP);
        assert_eq!(asm.code_bytes(), &[0x40, 0x88, 0x22]);
    }

    /// `mov byte [rdx], al` — REX still NOT emitted for AL/CL/DL/BL.
    #[test]
    fn test_mov_store8_at_rdx_from_al_no_rex() {
        let mut asm = Assembler::new();
        asm.mov_store8_at_index(Reg::RDX, Reg::RAX);
        assert_eq!(asm.code_bytes(), &[0x88, 0x02]);
    }

    /// `mov dword [rip+disp32], imm32` — RIP-relative store-imm form.
    /// 10-byte instruction: C7 05 <disp32> <imm32>. The CPU resolves
    /// RIP-relative against the NEXT-instruction RIP, which is past
    /// BOTH disp32 and the trailing imm32 — so disp = target - 10.
    #[test]
    fn test_mov_store32_rip_rel_imm() {
        let mut asm = Assembler::new();
        asm.set_jit_va_base(0x1_0000_0000);
        asm.mov_store32_rip_rel_imm(0x1_0000_0100, 0x123);
        // disp = 0x100 - (0 + 10) = 0xF6.
        assert_eq!(
            asm.code_bytes(),
            &[0xC7, 0x05, 0xF6, 0x00, 0x00, 0x00, 0x23, 0x01, 0x00, 0x00]
        );
    }

    /// `add rax, [rip+disp32]` — used by the dispatch sequence to
    /// fold code_base into native_addr.
    #[test]
    fn test_add_r64_mem_rip_rel() {
        let mut asm = Assembler::new();
        asm.set_jit_va_base(0x1_0000_0000);
        asm.add_r64_mem_rip_rel(Reg::RAX, 0x1_0000_0020);
        // 48 03 05 <disp32> — 7-byte instruction.
        // disp = 0x20 - (0 + 7) = 0x19.
        assert_eq!(
            asm.code_bytes(),
            &[0x48, 0x03, 0x05, 0x19, 0x00, 0x00, 0x00]
        );
    }

    /// `sub r15, imm32` patchable form — must be exactly 7 bytes with
    /// the imm32 at bytes 3..7 (codegen patches via `offset() - 4`).
    #[test]
    fn test_sub_r64_imm32_patchable_r15() {
        let mut asm = Assembler::new();
        asm.sub_r64_imm32_patchable(Reg::R15, 0xCAFE_F00D_u32 as i32);
        let bytes = asm.code_bytes();
        assert_eq!(bytes.len(), 7, "patchable sub must be 7 bytes");
        // REX.W=1 REX.B=1 → 0x49
        // opcode 0x81
        // ModRM mod=11 reg=5 r/m=7 → 0xEF
        // imm32 at bytes 3..7 in LE
        assert_eq!(&bytes[0..3], &[0x49, 0x81, 0xEF]);
        assert_eq!(&bytes[3..7], &[0x0D, 0xF0, 0xFE, 0xCA]);
    }

    /// `cmp r15, imm32` patchable form — same 7-byte layout as the
    /// patchable `sub`, but with ModRM reg=7 (CMP).
    #[test]
    fn test_cmp_r64_imm32_patchable_r15() {
        let mut asm = Assembler::new();
        asm.cmp_r64_imm32_patchable(Reg::R15, 0x1234_5678);
        let bytes = asm.code_bytes();
        assert_eq!(bytes.len(), 7, "patchable cmp must be 7 bytes");
        // REX.W=1 REX.B=1 → 0x49
        // opcode 0x81
        // ModRM mod=11 reg=7 r/m=7 → 0xFF
        // imm32 at bytes 3..7 in LE
        assert_eq!(&bytes[0..3], &[0x49, 0x81, 0xFF]);
        assert_eq!(&bytes[3..7], &[0x78, 0x56, 0x34, 0x12]);
    }

    /// push/pop of an extended register carry the REX.B prefix (0x41).
    #[test]
    fn test_push_pop_r15() {
        let mut asm = Assembler::new();
        asm.push(Reg::R15);
        asm.pop(Reg::R15);
        // push r15: 41 57, pop r15: 41 5F
        assert_eq!(asm.code_bytes(), &[0x41, 0x57, 0x41, 0x5F]);
    }
}
javm_recompiler_x86/asm.rs

javm_recompiler_x86/
asm.rs