Skip to main content

javm_recompiler_x86/
asm.rs

1//! x86-64 assembler for PVM recompiler.
2//!
3//! Emits native x86-64 machine code with label-based jump resolution.
4//! All jumps use 32-bit relative offsets (no short-jump optimization).
5//!
6//! # Safety model
7//!
8//! The assembler writes to a raw `*mut u8` buffer (`self.buf`) for performance.
9//! The key invariant: `self.buf` points to a valid allocation of at least
10//! `self.capacity` bytes (either a Vec's backing store or an mmap region).
11//! All emission functions have `debug_assert!(self.write_pos + N <= self.capacity)`
12//! guards. Callers must ensure capacity via `ensure_capacity()` before emitting.
13//! Vec length is synced via `set_len(write_pos)` only at finalization boundaries.
14
15use alloc::vec;
16use alloc::vec::Vec;
17
/// Staging buffer for the bytes of one instruction.
///
/// Bytes are packed into a `u128` with the first pushed byte in the lowest
/// eight bits, so the finished instruction can be stored to the code buffer
/// in bulk rather than one byte at a time.
#[derive(Clone, Copy)]
pub struct InstBuf {
    /// Packed instruction bytes; byte `i` lives in bits `8*i .. 8*i + 8`.
    out: u128,
    /// Number of bits of `out` filled so far (advanced by 8, 32 or 64).
    length: u32,
}

impl Default for InstBuf {
    /// Same as [`InstBuf::new`]: an empty buffer.
    fn default() -> Self {
        InstBuf::new()
    }
}

impl InstBuf {
    /// Create an empty buffer.
    #[inline(always)]
    pub fn new() -> Self {
        InstBuf { length: 0, out: 0 }
    }

    /// Append a single byte.
    #[inline(always)]
    pub fn push(&mut self, byte: u8) {
        let placed = u128::from(byte) << self.length;
        self.out |= placed;
        self.length += 8;
    }

    /// Append a 32-bit value, least-significant byte first.
    #[inline(always)]
    pub fn push_u32(&mut self, v: u32) {
        let placed = u128::from(v) << self.length;
        self.out |= placed;
        self.length += 32;
    }

    /// Append a 64-bit value, least-significant byte first.
    #[inline(always)]
    pub fn push_u64(&mut self, v: u64) {
        let placed = u128::from(v) << self.length;
        self.out |= placed;
        self.length += 64;
    }

    /// Append a signed 32-bit value using its two's-complement bit pattern.
    #[inline(always)]
    pub fn push_i32(&mut self, v: i32) {
        let bits = v as u32;
        self.push_u32(bits);
    }

    /// Number of whole bytes pushed so far.
    #[inline(always)]
    pub fn len(&self) -> usize {
        (self.length / 8) as usize
    }

    /// `true` when nothing has been pushed yet.
    pub fn is_empty(&self) -> bool {
        matches!(self.length, 0)
    }
}
70
/// The sixteen x86-64 general-purpose registers.
///
/// Each discriminant is the 4-bit hardware register number: the low three
/// bits are placed in ModR/M, SIB or opcode fields, and the fourth bit is
/// carried by a REX prefix.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum Reg {
    RAX = 0,
    RCX = 1,
    RDX = 2,
    RBX = 3,
    RSP = 4,
    RBP = 5,
    RSI = 6,
    RDI = 7,
    R8 = 8,
    R9 = 9,
    R10 = 10,
    R11 = 11,
    R12 = 12,
    R13 = 13,
    R14 = 14,
    R15 = 15,
}

impl Reg {
    /// The three low bits of the register number (ModR/M / SIB field value).
    fn lo(self) -> u8 {
        let number = self as u8;
        number % 8
    }

    /// The fourth bit of the register number (0 or 1), used as a REX bit.
    fn hi(self) -> u8 {
        let number = self as u8;
        number / 8
    }

    /// `true` for R8..R15, which cannot be encoded without a REX prefix.
    fn needs_rex(self) -> bool {
        self.hi() != 0
    }
}
107
/// Condition codes shared by Jcc / SETcc / CMOVcc.
///
/// The discriminant is the 4-bit condition number.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum Cc {
    /// Overflow.
    O = 0,
    /// No overflow.
    NO = 1,
    /// Below (unsigned `<`).
    B = 2,
    /// Above or equal (unsigned `>=`).
    AE = 3,
    /// Equal.
    E = 4,
    /// Not equal.
    NE = 5,
    /// Below or equal (unsigned `<=`).
    BE = 6,
    /// Above (unsigned `>`).
    A = 7,
    /// Sign.
    S = 8,
    /// No sign.
    NS = 9,
    /// Parity.
    P = 10,
    /// No parity.
    NP = 11,
    /// Less (signed `<`).
    L = 12,
    /// Greater or equal (signed `>=`).
    GE = 13,
    /// Less or equal (signed `<=`).
    LE = 14,
    /// Greater (signed `>`).
    G = 15,
}
129
/// Opaque handle naming a jump target; wraps the label's numeric id.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct Label(pub u32);
133
134/// Fixup kind for label resolution.
135#[derive(Clone, Copy)]
136struct Fixup {
137    /// Offset in code buffer where the 4-byte rel32 placeholder is.
138    offset: usize,
139    /// The label this fixup targets.
140    label: Label,
141}
142
/// Backing storage for emitted code.
///
/// Only a heap-allocated `Vec` is supported today; the host-targeted
/// `mmap`-direct path was retired with the `RecompiledPvm` sweep.
enum CodeBuf {
    /// Heap-allocated byte vector owning the code bytes.
    Vec(Vec<u8>),
}
149
150/// x86-64 assembler with label support.
151///
152/// Uses direct pointer writes to the pre-allocated buffer for emission,
153/// avoiding per-byte Vec::push overhead (capacity check + len update).
154pub struct Assembler {
155    code_buf: CodeBuf,
156    /// Raw pointer to the start of the code buffer.
157    buf: *mut u8,
158    write_pos: usize,
159    capacity: usize,
160    /// Label ID → bound offset+1 as u32 (0 = unbound). Uses u32 to halve
161    /// memory vs usize (native code always fits in 4GB).
162    /// Pre-sized via `vec![0u32; capacity]` which uses calloc (zero-page COW).
163    labels: Vec<u32>,
164    /// Number of labels allocated via new_label/bulk_create_labels.
165    /// The Vec is pre-sized but labels_len tracks the logical length.
166    labels_len: usize,
167    fixups: Vec<Fixup>,
168    /// Eventual VA the code buffer will be loaded at. Used by RIP-relative
169    /// emitters to compute `disp32 = target_va - (jit_va_base + RIP_after_inst)`.
170    /// Zero means "relocatable / unknown" — RIP-relative emitters then
171    /// produce offsets relative to code-buffer offset 0, useful for tests.
172    jit_va_base: u64,
173}
174
/// Value stored in the label table for a label that has not been bound yet.
///
/// Zero is chosen so a bulk-allocated table can come straight from zeroed
/// memory (calloc / zero-page COW) instead of being filled with 0xFF bytes.
/// Bound labels therefore store `native_offset + 1`, which can never equal
/// this sentinel.
const LABEL_UNBOUND: u32 = 0;
179
180impl Default for Assembler {
181    fn default() -> Self {
182        Self::new()
183    }
184}
185
186impl Assembler {
187    pub fn new() -> Self {
188        let mut code = Vec::with_capacity(4096);
189        let buf = code.as_mut_ptr();
190        let capacity = code.capacity();
191        Self {
192            code_buf: CodeBuf::Vec(code),
193            buf,
194            write_pos: 0,
195            capacity,
196            labels: Vec::new(),
197            labels_len: 0,
198            fixups: Vec::new(),
199            jit_va_base: 0,
200        }
201    }
202
203    /// Create with pre-allocated capacity for code and labels.
204    /// Uses Vec-backed buffer (for tests or when mmap is not needed).
205    pub fn with_capacity(code_capacity: usize, label_capacity: usize) -> Self {
206        let mut code = Vec::with_capacity(code_capacity);
207        let buf = code.as_mut_ptr();
208        let capacity = code.capacity();
209        Self {
210            code_buf: CodeBuf::Vec(code),
211            buf,
212            write_pos: 0,
213            capacity,
214            // vec![0; n] uses calloc — zero pages via COW, no page faults for untouched entries
215            labels: vec![0u32; label_capacity],
216            labels_len: 0,
217            fixups: Vec::with_capacity(2048),
218            jit_va_base: 0,
219        }
220    }
221
222    /// Set the eventual load VA for the code buffer. Must be called before
223    /// any RIP-relative emitter that targets a fixed VA (i.e. CTX accesses).
224    pub fn set_jit_va_base(&mut self, va: u64) {
225        self.jit_va_base = va;
226    }
227
228    /// Ensure at least `additional` bytes of capacity remain.
229    /// Called before emitting large sequences. Most individual instructions
230    /// need at most ~32 bytes, so this is rarely needed mid-compilation.
231    #[cold]
232    fn grow(&mut self, additional: usize) {
233        let CodeBuf::Vec(code) = &mut self.code_buf;
234        // SAFETY: write_pos <= code.capacity() is maintained by ensure_capacity().
235        // set_len(write_pos) exposes the bytes we've written so reserve() copies them.
236        unsafe {
237            code.set_len(self.write_pos);
238        }
239        code.reserve(additional);
240        self.buf = code.as_mut_ptr();
241        self.capacity = code.capacity();
242        // SAFETY: Reset len to 0 — we track the actual length via write_pos,
243        // not Vec::len. The data is preserved in the backing allocation.
244        unsafe {
245            code.set_len(0);
246        }
247    }
248
249    /// Check capacity and grow if needed. Inlined for the fast path (no grow).
250    #[inline(always)]
251    pub fn ensure_capacity(&mut self, n: usize) {
252        if self.write_pos + n > self.capacity {
253            self.grow(n);
254        }
255    }
256
257    /// Allocate a new label.
258    pub fn new_label(&mut self) -> Label {
259        let id = self.labels_len as u32;
260        self.labels_len += 1;
261        // Grow if needed (rare — labels Vec is pre-sized in with_capacity/with_mmap)
262        if self.labels_len > self.labels.len() {
263            self.labels.push(LABEL_UNBOUND);
264        }
265        Label(id)
266    }
267
268    /// Current number of labels allocated.
269    pub fn labels_len(&self) -> usize {
270        self.labels_len
271    }
272
273    /// Bulk-allocate `count` unbound labels. The labels Vec is already pre-sized
274    /// via calloc (zero pages). This just advances the logical length counter.
275    pub fn bulk_create_labels(&mut self, count: usize) {
276        self.labels_len += count;
277        // Grow if pre-sized Vec wasn't large enough (shouldn't happen normally)
278        if self.labels_len > self.labels.len() {
279            self.labels.resize(self.labels_len, LABEL_UNBOUND);
280        }
281    }
282
283    /// Bind a label to the current write position.
284    pub fn bind_label(&mut self, label: Label) {
285        self.labels[label.0 as usize] = (self.write_pos + 1) as u32; // +1: 0 is LABEL_UNBOUND
286    }
287
288    /// Current code offset (write position).
289    pub fn offset(&self) -> usize {
290        self.write_pos
291    }
292
293    /// Patch an i32 value at a previously recorded offset.
294    pub fn patch_i32(&mut self, offset: usize, value: i32) {
295        debug_assert!(offset + 4 <= self.write_pos);
296        // SAFETY: offset + 4 <= write_pos <= capacity, so buf.add(offset) is in bounds.
297        unsafe {
298            core::ptr::copy_nonoverlapping(value.to_le_bytes().as_ptr(), self.buf.add(offset), 4);
299        }
300    }
301
302    // === Raw byte emission ===
303    // All emission writes directly to the buffer via raw pointer,
304    // bypassing Vec::push's capacity check and len update.
305    // SAFETY for all emit* functions: write_pos + N <= capacity is guarded by
306    // debug_assert. Callers ensure capacity via ensure_capacity() before sequences.
307
308    #[inline(always)]
309    fn emit(&mut self, b: u8) {
310        debug_assert!(self.write_pos < self.capacity);
311        // SAFETY: write_pos < capacity is asserted above; buf points to a valid
312        // allocation of at least `capacity` bytes (Vec or mmap).
313        unsafe {
314            *self.buf.add(self.write_pos) = b;
315        }
316        self.write_pos += 1;
317    }
318
319    /// Emit 3 bytes at once.
320    #[inline(always)]
321    fn emit3(&mut self, a: u8, b: u8, c: u8) {
322        debug_assert!(self.write_pos + 3 <= self.capacity);
323        // SAFETY: write_pos + 3 <= capacity is asserted above; buf is a valid
324        // allocation. Individual byte writes are in-bounds.
325        unsafe {
326            let p = self.buf.add(self.write_pos);
327            *p = a;
328            *p.add(1) = b;
329            *p.add(2) = c;
330        }
331        self.write_pos += 3;
332    }
333
334    /// Flush an InstBuf to the code buffer in one bulk write.
335    #[inline(always)]
336    fn flush_instbuf(&mut self, ib: InstBuf) {
337        let len = ib.len();
338        debug_assert!(self.write_pos + len <= self.capacity);
339        // SAFETY: write_pos + len <= capacity is asserted above. We always
340        // write both 8-byte halves (16 bytes total) even if len < 16 — this
341        // is safe because ensure_capacity(512) guarantees ample slack beyond
342        // the actual instruction bytes.
343        unsafe {
344            let p = self.buf.add(self.write_pos);
345            core::ptr::write_unaligned(p as *mut u64, ib.out as u64);
346            core::ptr::write_unaligned(p.add(8) as *mut u64, (ib.out >> 64) as u64);
347        }
348        self.write_pos += len;
349    }
350
351    #[inline(always)]
352    fn emit_u32(&mut self, v: u32) {
353        debug_assert!(self.write_pos + 4 <= self.capacity);
354        // SAFETY: write_pos + 4 <= capacity asserted; unaligned write is valid
355        // for any byte-aligned pointer within the buffer.
356        unsafe {
357            core::ptr::write_unaligned(self.buf.add(self.write_pos) as *mut u32, v.to_le());
358        }
359        self.write_pos += 4;
360    }
361
362    #[inline(always)]
363    #[allow(dead_code)]
364    fn emit_u64(&mut self, v: u64) {
365        debug_assert!(self.write_pos + 8 <= self.capacity);
366        // SAFETY: write_pos + 8 <= capacity asserted; unaligned write is valid.
367        unsafe {
368            core::ptr::write_unaligned(self.buf.add(self.write_pos) as *mut u64, v.to_le());
369        }
370        self.write_pos += 8;
371    }
372
373    #[inline(always)]
374    fn emit_i32(&mut self, v: i32) {
375        debug_assert!(self.write_pos + 4 <= self.capacity);
376        // SAFETY: write_pos + 4 <= capacity asserted; unaligned write is valid.
377        // The cast chain converts i32 → LE bytes → u32 for write_unaligned.
378        unsafe {
379            core::ptr::write_unaligned(
380                self.buf.add(self.write_pos) as *mut u32,
381                v.to_le_bytes().as_ptr().cast::<u32>().read(),
382            );
383        }
384        self.write_pos += 4;
385    }
386
387    /// Emit a label reference (4-byte rel32). For backward references (label
388    /// already bound), resolves immediately without creating a fixup entry.
389    /// For forward references, emits a placeholder and records a fixup.
390    fn emit_label_fixup(&mut self, label: Label) {
391        let bound = self.labels[label.0 as usize];
392        if bound != LABEL_UNBOUND {
393            // Backward reference — resolve immediately, no fixup needed.
394            // rel32 = target - (current_offset + 4). Stored value is offset+1.
395            let target = (bound - 1) as i64;
396            let rel = target - (self.write_pos as i64 + 4);
397            self.emit_i32(rel as i32);
398        } else {
399            // Forward reference — defer to finalization.
400            let offset = self.write_pos;
401            self.fixups.push(Fixup { offset, label });
402            self.emit_u32(0); // placeholder
403        }
404    }
405
406    // === REX prefix helpers ===
407
408    /// REX prefix for 64-bit reg-reg operations.
409    #[allow(dead_code)]
410    fn rex_w(&mut self, reg: Reg, rm: Reg) {
411        self.emit(0x48 | (reg.hi() << 2) | rm.hi());
412    }
413
414    /// REX.W prefix for single-register operations.
415    fn rex_w_b(&mut self, rm: Reg) {
416        self.emit(0x48 | rm.hi());
417    }
418
419    /// Optional REX prefix for 32-bit ops (only if extended registers).
420    #[allow(dead_code)]
421    fn rex_opt(&mut self, reg: Reg, rm: Reg) {
422        let r = reg.hi();
423        let b = rm.hi();
424        if r != 0 || b != 0 {
425            self.emit(0x40 | (r << 2) | b);
426        }
427    }
428
429    fn rex_opt_b(&mut self, rm: Reg) {
430        if rm.needs_rex() {
431            self.emit(0x40 | rm.hi());
432        }
433    }
434
435    /// ModR/M byte: mod=3 (register direct), reg, rm.
436    #[allow(dead_code)]
437    fn modrm_rr(&mut self, reg: Reg, rm: Reg) {
438        self.emit(0xC0 | (reg.lo() << 3) | rm.lo());
439    }
440
441    /// ModR/M (+ optional SIB) + displacement for [base + disp] addressing.
442    /// Pushes into an InstBuf instead of emitting directly.
443    #[inline(always)]
444    fn modrm_disp_ib(ib: &mut InstBuf, reg: u8, base: Reg, disp: i32) {
445        let bl = base.lo();
446        let needs_sib = bl == 4;
447
448        if disp == 0 && bl != 5 {
449            if needs_sib {
450                ib.push((reg << 3) | 4);
451                ib.push(0x24);
452            } else {
453                ib.push((reg << 3) | bl);
454            }
455        } else if (-128..=127).contains(&disp) {
456            if needs_sib {
457                ib.push(0x40 | (reg << 3) | 4);
458                ib.push(0x24);
459            } else {
460                ib.push(0x40 | (reg << 3) | bl);
461            }
462            ib.push(disp as u8);
463        } else {
464            if needs_sib {
465                ib.push(0x80 | (reg << 3) | 4);
466                ib.push(0x24);
467            } else {
468                ib.push(0x80 | (reg << 3) | bl);
469            }
470            ib.push_i32(disp);
471        }
472    }
473
474    /// Legacy wrapper — delegates to InstBuf-based version.
475    #[allow(dead_code)]
476    fn modrm_disp(&mut self, reg: u8, base: Reg, disp: i32) {
477        let mut ib = InstBuf::new();
478        Self::modrm_disp_ib(&mut ib, reg, base, disp);
479        self.flush_instbuf(ib);
480    }
481
482    /// ModR/M + SIB for [base + index] addressing, into InstBuf.
483    #[inline(always)]
484    fn modrm_sib_base_index_ib(ib: &mut InstBuf, reg: u8, base: Reg, index: Reg) {
485        if base.lo() == 5 {
486            ib.push(0x44 | (reg << 3));
487            ib.push((index.lo() << 3) | base.lo());
488            ib.push(0);
489        } else {
490            ib.push((reg << 3) | 4);
491            ib.push((index.lo() << 3) | base.lo());
492        }
493    }
494
495    /// Emit ModR/M + displacement for [base + disp] with always-disp32 encoding.
496    /// Used when the immediate after the displacement must be at a fixed offset
497    /// (e.g., for patch-based gas metering where the imm32 is written later).
498    fn modrm_disp32(&mut self, reg: u8, base: Reg, disp: i32) {
499        let mut ib = InstBuf::new();
500        if base.lo() == 4 {
501            ib.push(0x80 | (reg << 3) | 4);
502            ib.push(0x24);
503        } else {
504            ib.push(0x80 | (reg << 3) | base.lo());
505        }
506        ib.push_i32(disp);
507        self.flush_instbuf(ib);
508    }
509
510    // === Instruction emission ===
511
512    // -- MOV --
513
514    /// mov r64, r64
515    #[inline(always)]
516    pub fn mov_rr(&mut self, dst: Reg, src: Reg) {
517        if dst == src {
518            return;
519        }
520        self.emit3(
521            0x48 | (src.hi() << 2) | dst.hi(),
522            0x89,
523            0xC0 | (src.lo() << 3) | dst.lo(),
524        );
525    }
526
527    /// mov r64, imm64
528    #[inline(always)]
529    pub fn mov_ri64(&mut self, dst: Reg, imm: u64) {
530        let mut ib = InstBuf::new();
531        if imm == 0 {
532            // xor r32, r32 (clears full r64)
533            let r = dst.hi();
534            if r != 0 {
535                ib.push(0x40 | (r << 2) | r);
536            }
537            ib.push(0x31);
538            ib.push(0xC0 | (dst.lo() << 3) | dst.lo());
539        } else if imm <= u32::MAX as u64 {
540            // mov r32, imm32 (zero-extends to 64)
541            if dst.needs_rex() {
542                ib.push(0x40 | dst.hi());
543            }
544            ib.push(0xB8 + dst.lo());
545            ib.push_u32(imm as u32);
546        } else if imm as i64 >= i32::MIN as i64 && imm as i64 <= i32::MAX as i64 {
547            // mov r64, sign-extended imm32
548            ib.push(0x48 | dst.hi());
549            ib.push(0xC7);
550            ib.push(0xC0 | dst.lo());
551            ib.push_i32(imm as i32);
552        } else {
553            // mov r64, imm64
554            ib.push(0x48 | dst.hi());
555            ib.push(0xB8 + dst.lo());
556            ib.push_u64(imm);
557        }
558        self.flush_instbuf(ib);
559    }
560
561    /// mov r32, imm32 (zero-extends to 64-bit)
562    #[inline(always)]
563    pub fn mov_ri32(&mut self, dst: Reg, imm: u32) {
564        let mut ib = InstBuf::new();
565        if dst.needs_rex() {
566            ib.push(0x40 | dst.hi());
567        }
568        ib.push(0xB8 + dst.lo());
569        ib.push_u32(imm);
570        self.flush_instbuf(ib);
571    }
572
573    /// mov r32, [base + disp] — zero-extending 32-bit load
574    #[inline(always)]
575    pub fn mov_load32(&mut self, dst: Reg, base: Reg, disp: i32) {
576        let mut ib = InstBuf::new();
577        let r = dst.hi();
578        let b = base.hi();
579        if r != 0 || b != 0 {
580            ib.push(0x40 | (r << 2) | b);
581        }
582        ib.push(0x8B);
583        Self::modrm_disp_ib(&mut ib, dst.lo(), base, disp);
584        self.flush_instbuf(ib);
585    }
586
587    /// mov r64, [base + disp]
588    #[inline(always)]
589    pub fn mov_load64(&mut self, dst: Reg, base: Reg, disp: i32) {
590        let mut ib = InstBuf::new();
591        ib.push(0x48 | (dst.hi() << 2) | base.hi());
592        ib.push(0x8B);
593        Self::modrm_disp_ib(&mut ib, dst.lo(), base, disp);
594        self.flush_instbuf(ib);
595    }
596
597    /// movsxd r64, dword [base + index*4] — sign-extending load with SIB scale=4
598    pub fn movsxd_load_sib4(&mut self, dst: Reg, base: Reg, index: Reg) {
599        let mut ib = InstBuf::new();
600        ib.push(0x48 | (dst.hi() << 2) | (index.hi() << 1) | base.hi());
601        ib.push(0x63);
602        ib.push((dst.lo() << 3) | 4);
603        ib.push(0x80 | (index.lo() << 3) | base.lo());
604        self.flush_instbuf(ib);
605    }
606
607    /// mov dword [base + disp], r32 — 32-bit store
608    #[inline(always)]
609    pub fn mov_store32(&mut self, base: Reg, disp: i32, src: Reg) {
610        let mut ib = InstBuf::new();
611        let r = src.hi();
612        let b = base.hi();
613        if r != 0 || b != 0 {
614            ib.push(0x40 | (r << 2) | b);
615        }
616        ib.push(0x89);
617        Self::modrm_disp_ib(&mut ib, src.lo(), base, disp);
618        self.flush_instbuf(ib);
619    }
620
621    /// mov [base + disp], r64
622    #[inline(always)]
623    pub fn mov_store64(&mut self, base: Reg, disp: i32, src: Reg) {
624        let mut ib = InstBuf::new();
625        ib.push(0x48 | (src.hi() << 2) | base.hi());
626        ib.push(0x89);
627        Self::modrm_disp_ib(&mut ib, src.lo(), base, disp);
628        self.flush_instbuf(ib);
629    }
630
631    /// mov dword [base + disp], imm32
632    #[inline(always)]
633    pub fn mov_store32_imm(&mut self, base: Reg, disp: i32, imm: i32) {
634        let mut ib = InstBuf::new();
635        if base.needs_rex() {
636            ib.push(0x40 | base.hi());
637        }
638        ib.push(0xC7);
639        Self::modrm_disp_ib(&mut ib, 0, base, disp);
640        ib.push_i32(imm);
641        self.flush_instbuf(ib);
642    }
643
644    /// mov qword [base + disp], sign-extended imm32
645    pub fn mov_store64_imm(&mut self, base: Reg, disp: i32, imm: i32) {
646        let mut ib = InstBuf::new();
647        ib.push(0x48 | base.hi());
648        ib.push(0xC7);
649        Self::modrm_disp_ib(&mut ib, 0, base, disp);
650        ib.push_i32(imm);
651        self.flush_instbuf(ib);
652    }
653
654    // -- SIB-based memory access [base + index] --
655
656    /// Emit ModR/M + SIB for [base + index] addressing (scale=1, no displacement).
657    /// Special case: base=RBP/R13 requires mod=01 with disp8=0.
658    /// Legacy wrapper — used by methods not yet converted to InstBuf.
659    #[allow(dead_code)]
660    fn modrm_sib_base_index(&mut self, reg: u8, base: Reg, index: Reg) {
661        let mut ib = InstBuf::new();
662        Self::modrm_sib_base_index_ib(&mut ib, reg, base, index);
663        self.flush_instbuf(ib);
664    }
665
666    /// movzx r64, byte [base + index] — zero-extending u8 load
667    pub fn movzx_load8_sib(&mut self, dst: Reg, base: Reg, index: Reg) {
668        let mut ib = InstBuf::new();
669        ib.push(0x48 | (dst.hi() << 2) | (index.hi() << 1) | base.hi());
670        ib.push(0x0F);
671        ib.push(0xB6);
672        Self::modrm_sib_base_index_ib(&mut ib, dst.lo(), base, index);
673        self.flush_instbuf(ib);
674    }
675
676    /// movzx r32, word [base + index] — zero-extending u16 load
677    pub fn movzx_load16_sib(&mut self, dst: Reg, base: Reg, index: Reg) {
678        let mut ib = InstBuf::new();
679        let rex = 0x40 | (dst.hi() << 2) | (index.hi() << 1) | base.hi();
680        if rex != 0x40 {
681            ib.push(rex);
682        }
683        ib.push(0x0F);
684        ib.push(0xB7);
685        Self::modrm_sib_base_index_ib(&mut ib, dst.lo(), base, index);
686        self.flush_instbuf(ib);
687    }
688
689    /// mov r32, dword [base + index] — zero-extending u32 load
690    pub fn mov_load32_sib(&mut self, dst: Reg, base: Reg, index: Reg) {
691        let mut ib = InstBuf::new();
692        let rex = 0x40 | (dst.hi() << 2) | (index.hi() << 1) | base.hi();
693        if rex != 0x40 {
694            ib.push(rex);
695        }
696        ib.push(0x8B);
697        Self::modrm_sib_base_index_ib(&mut ib, dst.lo(), base, index);
698        self.flush_instbuf(ib);
699    }
700
701    /// mov r64, qword [base + index]
702    pub fn mov_load64_sib(&mut self, dst: Reg, base: Reg, index: Reg) {
703        let mut ib = InstBuf::new();
704        ib.push(0x48 | (dst.hi() << 2) | (index.hi() << 1) | base.hi());
705        ib.push(0x8B);
706        Self::modrm_sib_base_index_ib(&mut ib, dst.lo(), base, index);
707        self.flush_instbuf(ib);
708    }
709
710    /// mov byte [base + index], r8
711    pub fn mov_store8_sib(&mut self, base: Reg, index: Reg, src: Reg) {
712        let mut ib = InstBuf::new();
713        ib.push(0x40 | (src.hi() << 2) | (index.hi() << 1) | base.hi());
714        ib.push(0x88);
715        Self::modrm_sib_base_index_ib(&mut ib, src.lo(), base, index);
716        self.flush_instbuf(ib);
717    }
718
719    /// mov word [base + index], r16
720    pub fn mov_store16_sib(&mut self, base: Reg, index: Reg, src: Reg) {
721        let mut ib = InstBuf::new();
722        ib.push(0x66);
723        let rex = 0x40 | (src.hi() << 2) | (index.hi() << 1) | base.hi();
724        if rex != 0x40 {
725            ib.push(rex);
726        }
727        ib.push(0x89);
728        Self::modrm_sib_base_index_ib(&mut ib, src.lo(), base, index);
729        self.flush_instbuf(ib);
730    }
731
732    /// mov dword [base + index], r32
733    pub fn mov_store32_sib(&mut self, base: Reg, index: Reg, src: Reg) {
734        let mut ib = InstBuf::new();
735        let rex = 0x40 | (src.hi() << 2) | (index.hi() << 1) | base.hi();
736        if rex != 0x40 {
737            ib.push(rex);
738        }
739        ib.push(0x89);
740        Self::modrm_sib_base_index_ib(&mut ib, src.lo(), base, index);
741        self.flush_instbuf(ib);
742    }
743
744    /// mov qword [base + index], r64
745    pub fn mov_store64_sib(&mut self, base: Reg, index: Reg, src: Reg) {
746        let mut ib = InstBuf::new();
747        ib.push(0x48 | (src.hi() << 2) | (index.hi() << 1) | base.hi());
748        ib.push(0x89);
749        Self::modrm_sib_base_index_ib(&mut ib, src.lo(), base, index);
750        self.flush_instbuf(ib);
751    }
752
753    /// mov dword [base + index], imm32
754    pub fn mov_store32_sib_imm(&mut self, base: Reg, index: Reg, imm: i32) {
755        let mut ib = InstBuf::new();
756        let rex = 0x40 | (index.hi() << 1) | base.hi();
757        if rex != 0x40 {
758            ib.push(rex);
759        }
760        ib.push(0xC7);
761        Self::modrm_sib_base_index_ib(&mut ib, 0, base, index);
762        ib.push_i32(imm);
763        self.flush_instbuf(ib);
764    }
765
766    /// mov qword [base + index], sign-extended imm32
767    pub fn mov_store64_sib_imm(&mut self, base: Reg, index: Reg, imm: i32) {
768        let mut ib = InstBuf::new();
769        ib.push(0x48 | (index.hi() << 1) | base.hi());
770        ib.push(0xC7);
771        Self::modrm_sib_base_index_ib(&mut ib, 0, base, index);
772        ib.push_i32(imm);
773        self.flush_instbuf(ib);
774    }
775
776    /// mov byte [base + index], imm8
777    pub fn mov_store8_sib_imm(&mut self, base: Reg, index: Reg, imm: u8) {
778        let mut ib = InstBuf::new();
779        ib.push(0x40 | (index.hi() << 1) | base.hi());
780        ib.push(0xC6);
781        Self::modrm_sib_base_index_ib(&mut ib, 0, base, index);
782        ib.push(imm);
783        self.flush_instbuf(ib);
784    }
785
786    /// mov word [base + index], imm16
787    pub fn mov_store16_sib_imm(&mut self, base: Reg, index: Reg, imm: u16) {
788        let mut ib = InstBuf::new();
789        ib.push(0x66);
790        let rex = 0x40 | (index.hi() << 1) | base.hi();
791        if rex != 0x40 {
792            ib.push(rex);
793        }
794        ib.push(0xC7);
795        Self::modrm_sib_base_index_ib(&mut ib, 0, base, index);
796        ib.push(imm as u8);
797        ib.push((imm >> 8) as u8);
798        self.flush_instbuf(ib);
799    }
800
801    /// add r64, qword [base + disp32]
802    pub fn add_r64_mem(&mut self, dst: Reg, base: Reg, disp: i32) {
803        let mut ib = InstBuf::new();
804        ib.push(0x48 | (dst.hi() << 2) | base.hi());
805        ib.push(0x03);
806        Self::modrm_disp_ib(&mut ib, dst.lo(), base, disp);
807        self.flush_instbuf(ib);
808    }
809
810    /// movzx r64, byte `[rax]` (simple deref, no SIB needed) — for perm table lookup
811    pub fn movzx_load8_deref(&mut self, dst: Reg, base: Reg) {
812        let mut ib = InstBuf::new();
813        ib.push(0x48 | (dst.hi() << 2) | base.hi());
814        ib.push(0x0F);
815        ib.push(0xB6);
816        if base.lo() == 5 {
817            ib.push((dst.lo() << 3) | base.lo() | 0x40);
818            ib.push(0);
819        } else if base.lo() == 4 {
820            ib.push((dst.lo() << 3) | 4);
821            ib.push(0x24);
822        } else {
823            ib.push((dst.lo() << 3) | base.lo());
824        }
825        self.flush_instbuf(ib);
826    }
827
828    /// cmp byte [base + index + disp32], imm8 — compare memory byte with SIB+displacement
829    pub fn cmp_byte_sib_disp32(&mut self, base: Reg, index: Reg, disp: i32, imm: u8) {
830        let mut ib = InstBuf::new();
831        let rex = 0x40 | (index.hi() << 1) | base.hi();
832        if rex != 0x40 {
833            ib.push(rex);
834        }
835        ib.push(0x80);
836        ib.push(0xBC); // mod=10, reg=/7(CMP), rm=100(SIB)
837        ib.push((index.lo() << 3) | base.lo());
838        ib.push_i32(disp);
839        ib.push(imm);
840        self.flush_instbuf(ib);
841    }
842
843    /// cmp byte `[reg]`, imm8 — compare memory byte with immediate
844    pub fn cmp_byte_deref_imm(&mut self, base: Reg, imm: u8) {
845        let mut ib = InstBuf::new();
846        if base.needs_rex() {
847            ib.push(0x41 | base.hi());
848        }
849        ib.push(0x80);
850        if base.lo() == 5 {
851            ib.push(0x78 | base.lo());
852            ib.push(0);
853        } else if base.lo() == 4 {
854            ib.push(0x38 | 4);
855            ib.push(0x24);
856        } else {
857            ib.push(0x38 | base.lo());
858        }
859        ib.push(imm);
860        self.flush_instbuf(ib);
861    }
862
863    // -- ALU reg,reg (64-bit) --
864
865    fn alu_rr64(&mut self, op: u8, dst: Reg, src: Reg) {
866        let mut ib = InstBuf::new();
867        ib.push(0x48 | (src.hi() << 2) | dst.hi());
868        ib.push(op);
869        ib.push(0xC0 | (src.lo() << 3) | dst.lo());
870        self.flush_instbuf(ib);
871    }
872
873    fn alu_rr32(&mut self, op: u8, dst: Reg, src: Reg) {
874        let r = src.hi();
875        let b = dst.hi();
876        if r != 0 || b != 0 {
877            let mut ib = InstBuf::new();
878            ib.push(0x40 | (r << 2) | b);
879            ib.push(op);
880            ib.push(0xC0 | (src.lo() << 3) | dst.lo());
881            self.flush_instbuf(ib);
882        } else {
883            let mut ib = InstBuf::new();
884            ib.push(op);
885            ib.push(0xC0 | (src.lo() << 3) | dst.lo());
886            self.flush_instbuf(ib);
887        }
888    }
889
890    #[inline(always)]
891    pub fn add_rr(&mut self, dst: Reg, src: Reg) {
892        self.alu_rr64(0x01, dst, src);
893    }
894    #[inline(always)]
895    pub fn sub_rr(&mut self, dst: Reg, src: Reg) {
896        self.alu_rr64(0x29, dst, src);
897    }
898    #[inline(always)]
899    pub fn and_rr(&mut self, dst: Reg, src: Reg) {
900        self.alu_rr64(0x21, dst, src);
901    }
902    #[inline(always)]
903    pub fn or_rr(&mut self, dst: Reg, src: Reg) {
904        self.alu_rr64(0x09, dst, src);
905    }
906    #[inline(always)]
907    pub fn xor_rr(&mut self, dst: Reg, src: Reg) {
908        self.alu_rr64(0x31, dst, src);
909    }
910    #[inline(always)]
911    pub fn cmp_rr(&mut self, a: Reg, b: Reg) {
912        self.alu_rr64(0x39, a, b);
913    }
914    #[inline(always)]
915    pub fn test_rr(&mut self, a: Reg, b: Reg) {
916        self.alu_rr64(0x85, a, b);
917    }
918
919    /// `test byte [base + disp32], imm8` — test memory byte against immediate bitmask.
920    #[inline(always)]
921    pub fn test_byte_mem_disp32(&mut self, base: Reg, disp: i32, imm: u8) {
922        let mut ib = InstBuf::new();
923        if base.needs_rex() {
924            ib.push(0x41 | base.hi());
925        }
926        ib.push(0xF6); // TEST r/m8, imm8
927        // ModRM: mod=10 (disp32), reg=000 (/0 = TEST), rm=base.lo()
928        ib.push(0x80 | base.lo());
929        ib.push_i32(disp);
930        ib.push(imm);
931        self.flush_instbuf(ib);
932    }
933
934    #[inline(always)]
935    pub fn add_rr32(&mut self, dst: Reg, src: Reg) {
936        self.alu_rr32(0x01, dst, src);
937    }
938    #[inline(always)]
939    pub fn sub_rr32(&mut self, dst: Reg, src: Reg) {
940        self.alu_rr32(0x29, dst, src);
941    }
942
943    // -- ALU reg,imm (64-bit) --
944    // Uses imm8 (opcode 0x83) when immediate fits in -128..127, saving 3 bytes.
945
946    #[inline(always)]
947    fn alu_ri64(&mut self, ext: u8, dst: Reg, imm: i32) {
948        let mut ib = InstBuf::new();
949        ib.push(0x48 | dst.hi());
950        if (-128..=127).contains(&imm) {
951            ib.push(0x83);
952            ib.push(0xC0 | (ext << 3) | dst.lo());
953            ib.push(imm as u8);
954        } else {
955            ib.push(0x81);
956            ib.push(0xC0 | (ext << 3) | dst.lo());
957            ib.push_i32(imm);
958        }
959        self.flush_instbuf(ib);
960    }
961
962    #[inline(always)]
963    fn alu_ri32(&mut self, ext: u8, dst: Reg, imm: i32) {
964        let mut ib = InstBuf::new();
965        if dst.needs_rex() {
966            ib.push(0x40 | dst.hi());
967        }
968        if (-128..=127).contains(&imm) {
969            ib.push(0x83);
970            ib.push(0xC0 | (ext << 3) | dst.lo());
971            ib.push(imm as u8);
972        } else {
973            ib.push(0x81);
974            ib.push(0xC0 | (ext << 3) | dst.lo());
975            ib.push_i32(imm);
976        }
977        self.flush_instbuf(ib);
978    }
979
980    #[inline(always)]
981    pub fn add_ri(&mut self, dst: Reg, imm: i32) {
982        self.alu_ri64(0, dst, imm);
983    }
984    #[inline(always)]
985    pub fn sub_ri(&mut self, dst: Reg, imm: i32) {
986        self.alu_ri64(5, dst, imm);
987    }
988    #[inline(always)]
989    pub fn and_ri(&mut self, dst: Reg, imm: i32) {
990        self.alu_ri64(4, dst, imm);
991    }
992    #[inline(always)]
993    pub fn or_ri(&mut self, dst: Reg, imm: i32) {
994        self.alu_ri64(1, dst, imm);
995    }
996    #[inline(always)]
997    pub fn xor_ri(&mut self, dst: Reg, imm: i32) {
998        self.alu_ri64(6, dst, imm);
999    }
1000    #[inline(always)]
1001    pub fn cmp_ri(&mut self, a: Reg, imm: i32) {
1002        self.alu_ri64(7, a, imm);
1003    }
1004
1005    #[inline(always)]
1006    pub fn add_ri32(&mut self, dst: Reg, imm: i32) {
1007        self.alu_ri32(0, dst, imm);
1008    }
1009    #[inline(always)]
1010    pub fn sub_ri32(&mut self, dst: Reg, imm: i32) {
1011        self.alu_ri32(5, dst, imm);
1012    }
1013    #[inline(always)]
1014    pub fn cmp_ri32(&mut self, a: Reg, imm: i32) {
1015        self.alu_ri32(7, a, imm);
1016    }
1017
1018    /// cmp dword [base + disp], imm32
1019    pub fn cmp_mem32_imm(&mut self, base: Reg, disp: i32, imm: i32) {
1020        let mut ib = InstBuf::new();
1021        if base.hi() != 0 {
1022            ib.push(0x41);
1023        }
1024        ib.push(0x81);
1025        Self::modrm_disp_ib(&mut ib, 7, base, disp);
1026        ib.push_i32(imm);
1027        self.flush_instbuf(ib);
1028    }
1029
1030    /// cmp dword [base + disp], reg32  (sets flags: mem vs reg)
1031    pub fn cmp_mem32_r(&mut self, base: Reg, disp: i32, src: Reg) {
1032        let mut ib = InstBuf::new();
1033        if base.hi() != 0 || src.hi() != 0 {
1034            ib.push(0x40 | src.hi() << 2 | base.hi());
1035        }
1036        ib.push(0x39);
1037        Self::modrm_disp_ib(&mut ib, src.lo(), base, disp);
1038        self.flush_instbuf(ib);
1039    }
1040
1041    /// sub qword [base + disp32], sign-extended imm32.
1042    /// Always uses disp32 encoding (the imm32 is patched after emission for gas metering).
1043    pub fn sub_mem64_imm32(&mut self, base: Reg, disp: i32, imm: i32) {
1044        // NOTE: Cannot use InstBuf here — caller reads offset() for gas patching.
1045        // The offset must be at the exact position of the imm32 field.
1046        self.rex_w_b(base);
1047        self.emit(0x81);
1048        self.modrm_disp32(5, base, disp);
1049        self.emit_i32(imm);
1050    }
1051
1052    /// add qword [base + disp32], imm32
1053    pub fn add_mem64_imm32(&mut self, base: Reg, disp: i32, imm: i32) {
1054        // Same as sub_mem64_imm32 — offset() must be accurate for patching.
1055        self.rex_w_b(base);
1056        self.emit(0x81);
1057        self.modrm_disp32(0, base, disp);
1058        self.emit_i32(imm);
1059    }
1060
1061    // -- Baseless memory access [idx] (mod=00 r/m=idx, no SIB) --
1062    //
1063    // Codegen uses this when the PVM addr is itself the host VA — no
1064    // base register needed because the runtime substrate's per-
1065    // invocation page table maps `addr` → `addr` for the guest range.
1066    // Saves one byte per access vs the SIB form `[base + idx]`.
1067    //
1068    // ModRM bytes: `(mod=00, reg, r/m=idx.lo())` if idx.lo() ∉ {4, 5}.
1069    // For `idx.lo() == 4` (RSP/R12) the ModRM r/m=100 encoding means
1070    // "SIB follows"; emit SIB = `(scale=00, index=100, base=4)` to
1071    // recover plain `[idx]`. For `idx.lo() == 5` (RBP/R13) the
1072    // mod=00 r/m=101 form is reserved for [RIP+disp32], so use
1073    // mod=01 + disp8=0.
1074    fn modrm_baseless(&mut self, reg: u8, idx: Reg) {
1075        let r = reg & 7;
1076        match idx.lo() {
1077            4 => {
1078                // SIB form: [RSP/R12]
1079                self.emit((r << 3) | 4); // mod=00 r/m=100
1080                self.emit(0x24); // SIB: scale=00, index=100=none, base=4
1081            }
1082            5 => {
1083                // mod=01 disp8=0: [RBP/R13]
1084                self.emit(0x40 | (r << 3) | 5);
1085                self.emit(0);
1086            }
1087            _ => {
1088                self.emit((r << 3) | idx.lo());
1089            }
1090        }
1091    }
1092
1093    /// movzx r64, byte \[idx\] — zero-extending u8 load
1094    pub fn movzx_load8_at_index(&mut self, dst: Reg, idx: Reg) {
1095        self.emit(0x48 | (dst.hi() << 2) | idx.hi());
1096        self.emit(0x0F);
1097        self.emit(0xB6);
1098        self.modrm_baseless(dst.lo(), idx);
1099    }
1100
1101    /// movzx r64, word \[idx\] — zero-extending u16 load
1102    pub fn movzx_load16_at_index(&mut self, dst: Reg, idx: Reg) {
1103        let rex = 0x40 | (dst.hi() << 2) | idx.hi();
1104        if rex != 0x40 {
1105            self.emit(rex);
1106        }
1107        self.emit(0x0F);
1108        self.emit(0xB7);
1109        self.modrm_baseless(dst.lo(), idx);
1110    }
1111
1112    /// mov r32, dword \[idx\] — zero-extending u32 load (writes EAX, clears upper 32)
1113    pub fn mov_load32_at_index(&mut self, dst: Reg, idx: Reg) {
1114        let rex = 0x40 | (dst.hi() << 2) | idx.hi();
1115        if rex != 0x40 {
1116            self.emit(rex);
1117        }
1118        self.emit(0x8B);
1119        self.modrm_baseless(dst.lo(), idx);
1120    }
1121
1122    /// mov r64, qword \[idx\]
1123    pub fn mov_load64_at_index(&mut self, dst: Reg, idx: Reg) {
1124        self.emit(0x48 | (dst.hi() << 2) | idx.hi());
1125        self.emit(0x8B);
1126        self.modrm_baseless(dst.lo(), idx);
1127    }
1128
1129    /// mov byte \[idx\], r8
1130    pub fn mov_store8_at_index(&mut self, idx: Reg, src: Reg) {
1131        // REX prefix mandatory if `src` is SIL/DIL/BPL/SPL (encoded
1132        // 4..=7 with hi=0) — without REX those encodings decode as
1133        // AH/CH/DH/BH. (`needs_rex` only catches R8-R15.)
1134        let rex = 0x40 | (src.hi() << 2) | idx.hi();
1135        if rex != 0x40 || src.lo() >= 4 {
1136            self.emit(rex);
1137        }
1138        self.emit(0x88);
1139        self.modrm_baseless(src.lo(), idx);
1140    }
1141
1142    /// mov word \[idx\], r16
1143    pub fn mov_store16_at_index(&mut self, idx: Reg, src: Reg) {
1144        self.emit(0x66);
1145        let rex = 0x40 | (src.hi() << 2) | idx.hi();
1146        if rex != 0x40 {
1147            self.emit(rex);
1148        }
1149        self.emit(0x89);
1150        self.modrm_baseless(src.lo(), idx);
1151    }
1152
1153    /// mov dword \[idx\], r32
1154    pub fn mov_store32_at_index(&mut self, idx: Reg, src: Reg) {
1155        let rex = 0x40 | (src.hi() << 2) | idx.hi();
1156        if rex != 0x40 {
1157            self.emit(rex);
1158        }
1159        self.emit(0x89);
1160        self.modrm_baseless(src.lo(), idx);
1161    }
1162
1163    /// mov qword \[idx\], r64
1164    pub fn mov_store64_at_index(&mut self, idx: Reg, src: Reg) {
1165        self.emit(0x48 | (src.hi() << 2) | idx.hi());
1166        self.emit(0x89);
1167        self.modrm_baseless(src.lo(), idx);
1168    }
1169
1170    /// mov byte \[idx\], imm8
1171    pub fn mov_store8_at_index_imm(&mut self, idx: Reg, imm: u8) {
1172        if idx.hi() != 0 {
1173            self.emit(0x40 | idx.hi());
1174        }
1175        self.emit(0xC6);
1176        self.modrm_baseless(0, idx);
1177        self.emit(imm);
1178    }
1179
1180    /// mov word \[idx\], imm16
1181    pub fn mov_store16_at_index_imm(&mut self, idx: Reg, imm: u16) {
1182        self.emit(0x66);
1183        if idx.hi() != 0 {
1184            self.emit(0x40 | idx.hi());
1185        }
1186        self.emit(0xC7);
1187        self.modrm_baseless(0, idx);
1188        self.emit(imm as u8);
1189        self.emit((imm >> 8) as u8);
1190    }
1191
1192    /// mov dword \[idx\], imm32
1193    pub fn mov_store32_at_index_imm(&mut self, idx: Reg, imm: i32) {
1194        if idx.hi() != 0 {
1195            self.emit(0x40 | idx.hi());
1196        }
1197        self.emit(0xC7);
1198        self.modrm_baseless(0, idx);
1199        self.emit_i32(imm);
1200    }
1201
1202    /// mov qword \[idx\], sign-extended imm32
1203    pub fn mov_store64_at_index_imm(&mut self, idx: Reg, imm: i32) {
1204        self.emit(0x48 | idx.hi());
1205        self.emit(0xC7);
1206        self.modrm_baseless(0, idx);
1207        self.emit_i32(imm);
1208    }
1209
1210    // -- RIP-relative addressing `[rip + disp32]` --
1211    //
1212    // 64-bit mode's mod=00 r/m=101 form is RIP-relative: the effective
1213    // address is `RIP_after_instruction + sign_extend(disp32)`. Used to
1214    // reach CTX, which lives above 4 GiB (outside the PVM u32 range)
1215    // and is therefore beyond absolute-SIB disp32 reach. The
1216    // `jit_va_base` field of the assembler holds the eventual load VA;
1217    // `target_va - (jit_va_base + write_pos + 4)` is the disp32 we
1218    // emit.
1219    //
1220    // 1 byte shorter per access than the absolute-SIB form (no SIB
1221    // byte). Range: target must be within ±2 GiB of post-instruction
1222    // RIP, which holds for any CTX page placed in or near the META
1223    // region above PVM mem.
1224    /// Emit a 4-byte RIP-relative disp32 targeting `target_va`. The CPU
1225    /// resolves the effective address against RIP-at-next-instruction,
1226    /// so we add 4 (for the disp32 itself) plus `trailing_bytes` (any
1227    /// imm fields following disp32 in the same instruction — 4 for
1228    /// `mov [rip+disp32], imm32`, 0 for everything else).
1229    fn emit_rip_rel_disp32(&mut self, target_va: u64, trailing_bytes: u64) {
1230        let post_inst_rip = self
1231            .jit_va_base
1232            .wrapping_add(self.write_pos as u64)
1233            .wrapping_add(4)
1234            .wrapping_add(trailing_bytes);
1235        let disp = (target_va as i64).wrapping_sub(post_inst_rip as i64);
1236        debug_assert!(
1237            disp >= i32::MIN as i64 && disp <= i32::MAX as i64,
1238            "RIP-relative target 0x{:x} out of range from base 0x{:x} + offset 0x{:x}",
1239            target_va,
1240            self.jit_va_base,
1241            self.write_pos
1242        );
1243        self.emit_i32(disp as i32);
1244    }
1245
1246    fn modrm_rip_rel(&mut self, reg: u8) {
1247        // mod=00, reg, r/m=101 = RIP-relative (no SIB byte).
1248        self.emit(((reg & 7) << 3) | 5);
1249    }
1250
1251    /// mov r32, dword [rip+disp32] (zero-extends to 64-bit)
1252    pub fn mov_load32_rip_rel(&mut self, dst: Reg, target_va: u64) {
1253        if dst.hi() != 0 {
1254            self.emit(0x40 | (dst.hi() << 2));
1255        }
1256        self.emit(0x8B);
1257        self.modrm_rip_rel(dst.lo());
1258        self.emit_rip_rel_disp32(target_va, 0);
1259    }
1260
1261    /// mov r64, qword [rip+disp32]
1262    pub fn mov_load64_rip_rel(&mut self, dst: Reg, target_va: u64) {
1263        self.emit(0x48 | (dst.hi() << 2));
1264        self.emit(0x8B);
1265        self.modrm_rip_rel(dst.lo());
1266        self.emit_rip_rel_disp32(target_va, 0);
1267    }
1268
1269    /// mov dword [rip+disp32], r32
1270    pub fn mov_store32_rip_rel(&mut self, target_va: u64, src: Reg) {
1271        if src.hi() != 0 {
1272            self.emit(0x40 | (src.hi() << 2));
1273        }
1274        self.emit(0x89);
1275        self.modrm_rip_rel(src.lo());
1276        self.emit_rip_rel_disp32(target_va, 0);
1277    }
1278
1279    /// mov qword [rip+disp32], r64
1280    pub fn mov_store64_rip_rel(&mut self, target_va: u64, src: Reg) {
1281        self.emit(0x48 | (src.hi() << 2));
1282        self.emit(0x89);
1283        self.modrm_rip_rel(src.lo());
1284        self.emit_rip_rel_disp32(target_va, 0);
1285    }
1286
1287    /// mov dword [rip+disp32], imm32 — the trailing imm32 means the
1288    /// "next-instruction RIP" is 4 bytes further than disp32 alone.
1289    pub fn mov_store32_rip_rel_imm(&mut self, target_va: u64, imm: i32) {
1290        self.emit(0xC7);
1291        self.modrm_rip_rel(0);
1292        self.emit_rip_rel_disp32(target_va, 4);
1293        self.emit_i32(imm);
1294    }
1295
1296    /// cmp dword [rip+disp32], reg32 (sets flags: mem vs reg)
1297    pub fn cmp_mem32_rip_rel_r(&mut self, target_va: u64, src: Reg) {
1298        if src.hi() != 0 {
1299            self.emit(0x40 | (src.hi() << 2));
1300        }
1301        self.emit(0x39);
1302        self.modrm_rip_rel(src.lo());
1303        self.emit_rip_rel_disp32(target_va, 0);
1304    }
1305
1306    /// add r64, qword [rip+disp32]
1307    pub fn add_r64_mem_rip_rel(&mut self, dst: Reg, target_va: u64) {
1308        self.emit(0x48 | (dst.hi() << 2));
1309        self.emit(0x03);
1310        self.modrm_rip_rel(dst.lo());
1311        self.emit_rip_rel_disp32(target_va, 0);
1312    }
1313
1314    // -- In-register gas decrement (patchable) --
1315
1316    /// sub r64, imm32 in the always-imm32 (7-byte) encoding.
1317    /// `offset() - 4` after this call points at the imm32 field;
1318    /// callers patch it after emission for per-block gas metering.
1319    pub fn sub_r64_imm32_patchable(&mut self, dst: Reg, imm: i32) {
1320        // NOTE: Cannot use InstBuf here — caller reads offset() for
1321        // patching. The imm32 must be the trailing 4 bytes.
1322        self.emit(0x48 | dst.hi());
1323        self.emit(0x81);
1324        self.emit(0xE8 | dst.lo()); // mod=11 (register), reg=5 (sub), r/m=dst.lo()
1325        self.emit_i32(imm);
1326    }
1327
1328    // -- IMUL --
1329
1330    /// imul r64, r64
1331    pub fn imul_rr(&mut self, dst: Reg, src: Reg) {
1332        let mut ib = InstBuf::new();
1333        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1334        ib.push(0x0F);
1335        ib.push(0xAF);
1336        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1337        self.flush_instbuf(ib);
1338    }
1339
1340    /// imul r32, r32
1341    pub fn imul_rr32(&mut self, dst: Reg, src: Reg) {
1342        let mut ib = InstBuf::new();
1343        let r = dst.hi();
1344        let b = src.hi();
1345        if r != 0 || b != 0 {
1346            ib.push(0x40 | (r << 2) | b);
1347        }
1348        ib.push(0x0F);
1349        ib.push(0xAF);
1350        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1351        self.flush_instbuf(ib);
1352    }
1353
1354    /// imul r64, r64, imm32
1355    pub fn imul_rri(&mut self, dst: Reg, src: Reg, imm: i32) {
1356        let mut ib = InstBuf::new();
1357        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1358        ib.push(0x69);
1359        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1360        ib.push_i32(imm);
1361        self.flush_instbuf(ib);
1362    }
1363
1364    /// imul r32, r32, imm32
1365    pub fn imul_rri32(&mut self, dst: Reg, src: Reg, imm: i32) {
1366        let mut ib = InstBuf::new();
1367        let r = dst.hi();
1368        let b = src.hi();
1369        if r != 0 || b != 0 {
1370            ib.push(0x40 | (r << 2) | b);
1371        }
1372        ib.push(0x69);
1373        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1374        ib.push_i32(imm);
1375        self.flush_instbuf(ib);
1376    }
1377
1378    // -- MUL/IMUL widening (RDX:RAX = RAX * src) --
1379
1380    /// mul r64 (unsigned RDX:RAX = RAX * src)
1381    pub fn mul_rdx_rax(&mut self, src: Reg) {
1382        self.emit3(0x48 | src.hi(), 0xF7, 0xE0 | src.lo());
1383    }
1384
1385    /// imul r64 (signed RDX:RAX = RAX * src)
1386    pub fn imul_rdx_rax(&mut self, src: Reg) {
1387        self.emit3(0x48 | src.hi(), 0xF7, 0xE8 | src.lo());
1388    }
1389
1390    // -- DIV/IDIV --
1391
1392    /// div r64 (unsigned RAX = RDX:RAX / src, RDX = remainder)
1393    pub fn div64(&mut self, src: Reg) {
1394        self.emit3(0x48 | src.hi(), 0xF7, 0xF0 | src.lo());
1395    }
1396
1397    /// idiv r64 (signed)
1398    pub fn idiv64(&mut self, src: Reg) {
1399        self.emit3(0x48 | src.hi(), 0xF7, 0xF8 | src.lo());
1400    }
1401
1402    /// div r32
1403    pub fn div32(&mut self, src: Reg) {
1404        if src.needs_rex() {
1405            self.emit3(0x41, 0xF7, 0xF0 | src.lo());
1406        } else {
1407            let mut ib = InstBuf::new();
1408            ib.push(0xF7);
1409            ib.push(0xF0 | src.lo());
1410            self.flush_instbuf(ib);
1411        }
1412    }
1413
1414    /// idiv r32
1415    pub fn idiv32(&mut self, src: Reg) {
1416        if src.needs_rex() {
1417            self.emit3(0x41, 0xF7, 0xF8 | src.lo());
1418        } else {
1419            let mut ib = InstBuf::new();
1420            ib.push(0xF7);
1421            ib.push(0xF8 | src.lo());
1422            self.flush_instbuf(ib);
1423        }
1424    }
1425
1426    /// cqo (sign-extend RAX into RDX:RAX, 64-bit)
1427    pub fn cqo(&mut self) {
1428        self.emit(0x48);
1429        self.emit(0x99);
1430    }
1431
1432    /// cdq (sign-extend EAX into EDX:EAX, 32-bit)
1433    pub fn cdq(&mut self) {
1434        self.emit(0x99);
1435    }
1436
1437    // -- INC/DEC --
1438
1439    /// inc r64
1440    pub fn inc64(&mut self, dst: Reg) {
1441        self.emit3(0x48 | dst.hi(), 0xFF, 0xC0 | dst.lo());
1442    }
1443
1444    /// dec r64
1445    pub fn dec64(&mut self, dst: Reg) {
1446        self.emit3(0x48 | dst.hi(), 0xFF, 0xC8 | dst.lo());
1447    }
1448
1449    // -- NEG/NOT --
1450
1451    /// neg r64
1452    pub fn neg64(&mut self, dst: Reg) {
1453        self.emit3(0x48 | dst.hi(), 0xF7, 0xD8 | dst.lo());
1454    }
1455
1456    pub fn neg32(&mut self, dst: Reg) {
1457        if dst.needs_rex() {
1458            self.emit3(0x41, 0xF7, 0xD8 | dst.lo());
1459        } else {
1460            let mut ib = InstBuf::new();
1461            ib.push(0xF7);
1462            ib.push(0xD8 | dst.lo());
1463            self.flush_instbuf(ib);
1464        }
1465    }
1466
1467    /// not r64
1468    pub fn not64(&mut self, dst: Reg) {
1469        self.emit3(0x48 | dst.hi(), 0xF7, 0xD0 | dst.lo());
1470    }
1471
1472    // -- Shifts --
1473
1474    fn shift_ri64(&mut self, ext: u8, dst: Reg, imm: u8) {
1475        let mut ib = InstBuf::new();
1476        ib.push(0x48 | dst.hi());
1477        ib.push(0xC1);
1478        ib.push(0xC0 | (ext << 3) | dst.lo());
1479        ib.push(imm);
1480        self.flush_instbuf(ib);
1481    }
1482
1483    pub fn shift_cl64(&mut self, ext: u8, dst: Reg) {
1484        let mut ib = InstBuf::new();
1485        ib.push(0x48 | dst.hi());
1486        ib.push(0xD3);
1487        ib.push(0xC0 | (ext << 3) | dst.lo());
1488        self.flush_instbuf(ib);
1489    }
1490
1491    fn shift_ri32(&mut self, ext: u8, dst: Reg, imm: u8) {
1492        let mut ib = InstBuf::new();
1493        if dst.needs_rex() {
1494            ib.push(0x40 | dst.hi());
1495        }
1496        ib.push(0xC1);
1497        ib.push(0xC0 | (ext << 3) | dst.lo());
1498        ib.push(imm);
1499        self.flush_instbuf(ib);
1500    }
1501
1502    pub fn shift_cl32(&mut self, ext: u8, dst: Reg) {
1503        let mut ib = InstBuf::new();
1504        if dst.needs_rex() {
1505            ib.push(0x40 | dst.hi());
1506        }
1507        ib.push(0xD3);
1508        ib.push(0xC0 | (ext << 3) | dst.lo());
1509        self.flush_instbuf(ib);
1510    }
1511
1512    pub fn shl_ri64(&mut self, dst: Reg, imm: u8) {
1513        self.shift_ri64(4, dst, imm);
1514    }
1515    pub fn shr_ri64(&mut self, dst: Reg, imm: u8) {
1516        self.shift_ri64(5, dst, imm);
1517    }
1518    pub fn sar_ri64(&mut self, dst: Reg, imm: u8) {
1519        self.shift_ri64(7, dst, imm);
1520    }
1521    pub fn shl_cl64(&mut self, dst: Reg) {
1522        self.shift_cl64(4, dst);
1523    }
1524    pub fn shr_cl64(&mut self, dst: Reg) {
1525        self.shift_cl64(5, dst);
1526    }
1527    pub fn sar_cl64(&mut self, dst: Reg) {
1528        self.shift_cl64(7, dst);
1529    }
1530    pub fn rol_cl64(&mut self, dst: Reg) {
1531        self.shift_cl64(0, dst);
1532    }
1533    pub fn ror_cl64(&mut self, dst: Reg) {
1534        self.shift_cl64(1, dst);
1535    }
1536    pub fn rol_ri64(&mut self, dst: Reg, imm: u8) {
1537        self.shift_ri64(0, dst, imm);
1538    }
1539    pub fn ror_ri64(&mut self, dst: Reg, imm: u8) {
1540        self.shift_ri64(1, dst, imm);
1541    }
1542
1543    pub fn shl_ri32(&mut self, dst: Reg, imm: u8) {
1544        self.shift_ri32(4, dst, imm);
1545    }
1546    pub fn shr_ri32(&mut self, dst: Reg, imm: u8) {
1547        self.shift_ri32(5, dst, imm);
1548    }
1549    pub fn sar_ri32(&mut self, dst: Reg, imm: u8) {
1550        self.shift_ri32(7, dst, imm);
1551    }
1552    pub fn shl_cl32(&mut self, dst: Reg) {
1553        self.shift_cl32(4, dst);
1554    }
1555    pub fn shr_cl32(&mut self, dst: Reg) {
1556        self.shift_cl32(5, dst);
1557    }
1558    pub fn sar_cl32(&mut self, dst: Reg) {
1559        self.shift_cl32(7, dst);
1560    }
1561    pub fn rol_cl32(&mut self, dst: Reg) {
1562        self.shift_cl32(0, dst);
1563    }
1564    pub fn ror_cl32(&mut self, dst: Reg) {
1565        self.shift_cl32(1, dst);
1566    }
1567    pub fn rol_ri32(&mut self, dst: Reg, imm: u8) {
1568        self.shift_ri32(0, dst, imm);
1569    }
1570    pub fn ror_ri32(&mut self, dst: Reg, imm: u8) {
1571        self.shift_ri32(1, dst, imm);
1572    }
1573
1574    // -- Extensions --
1575
1576    /// movsxd r64, r32 (sign-extend 32→64)
1577    #[inline(always)]
1578    pub fn movsxd(&mut self, dst: Reg, src: Reg) {
1579        let mut ib = InstBuf::new();
1580        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1581        ib.push(0x63);
1582        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1583        self.flush_instbuf(ib);
1584    }
1585
1586    /// movsx r64, r8 (sign-extend 8→64)
1587    pub fn movsx_8_64(&mut self, dst: Reg, src: Reg) {
1588        let mut ib = InstBuf::new();
1589        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1590        ib.push(0x0F);
1591        ib.push(0xBE);
1592        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1593        self.flush_instbuf(ib);
1594    }
1595
1596    /// movsx r64, r16 (sign-extend 16→64)
1597    pub fn movsx_16_64(&mut self, dst: Reg, src: Reg) {
1598        let mut ib = InstBuf::new();
1599        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1600        ib.push(0x0F);
1601        ib.push(0xBF);
1602        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1603        self.flush_instbuf(ib);
1604    }
1605
1606    /// movzx r64, r8 (zero-extend 8→64)
1607    pub fn movzx_8_64(&mut self, dst: Reg, src: Reg) {
1608        let mut ib = InstBuf::new();
1609        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1610        ib.push(0x0F);
1611        ib.push(0xB6);
1612        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1613        self.flush_instbuf(ib);
1614    }
1615
1616    /// movzx r32, r16 (zero-extends to 64 due to 32-bit operation)
1617    pub fn movzx_16_64(&mut self, dst: Reg, src: Reg) {
1618        let mut ib = InstBuf::new();
1619        let r = dst.hi();
1620        let b = src.hi();
1621        if r != 0 || b != 0 {
1622            ib.push(0x40 | (r << 2) | b);
1623        }
1624        ib.push(0x0F);
1625        ib.push(0xB7);
1626        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1627        self.flush_instbuf(ib);
1628    }
1629
1630    /// Zero-extend 32→64: mov r32, r32 (implicit zero-extend)
1631    #[inline(always)]
1632    pub fn movzx_32_64(&mut self, dst: Reg, src: Reg) {
1633        let mut ib = InstBuf::new();
1634        let r = src.hi();
1635        let b = dst.hi();
1636        if r != 0 || b != 0 {
1637            ib.push(0x40 | (r << 2) | b);
1638        }
1639        ib.push(0x89);
1640        ib.push(0xC0 | (src.lo() << 3) | dst.lo());
1641        self.flush_instbuf(ib);
1642    }
1643
1644    // -- Conditional set --
1645
1646    /// setcc r8 (sets low byte, need to movzx after)
1647    #[inline(always)]
1648    pub fn setcc(&mut self, cc: Cc, dst: Reg) {
1649        let mut ib = InstBuf::new();
1650        if dst.needs_rex() || matches!(dst, Reg::RSP | Reg::RBP | Reg::RSI | Reg::RDI) {
1651            ib.push(0x40 | dst.hi());
1652        }
1653        ib.push(0x0F);
1654        ib.push(0x90 + cc as u8);
1655        ib.push(0xC0 | dst.lo());
1656        self.flush_instbuf(ib);
1657    }
1658
1659    /// cmovcc r64, r64
1660    #[inline(always)]
1661    pub fn cmovcc(&mut self, cc: Cc, dst: Reg, src: Reg) {
1662        let mut ib = InstBuf::new();
1663        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1664        ib.push(0x0F);
1665        ib.push(0x40 + cc as u8);
1666        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1667        self.flush_instbuf(ib);
1668    }
1669
    // -- Bit manipulation (requires POPCNT, LZCNT and BMI1 CPU support) --
1671
1672    /// popcnt r64, r64
1673    pub fn popcnt64(&mut self, dst: Reg, src: Reg) {
1674        let mut ib = InstBuf::new();
1675        ib.push(0xF3);
1676        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1677        ib.push(0x0F);
1678        ib.push(0xB8);
1679        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1680        self.flush_instbuf(ib);
1681    }
1682
1683    /// lzcnt r64, r64
1684    pub fn lzcnt64(&mut self, dst: Reg, src: Reg) {
1685        let mut ib = InstBuf::new();
1686        ib.push(0xF3);
1687        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1688        ib.push(0x0F);
1689        ib.push(0xBD);
1690        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1691        self.flush_instbuf(ib);
1692    }
1693
1694    /// tzcnt r64, r64
1695    pub fn tzcnt64(&mut self, dst: Reg, src: Reg) {
1696        let mut ib = InstBuf::new();
1697        ib.push(0xF3);
1698        ib.push(0x48 | (dst.hi() << 2) | src.hi());
1699        ib.push(0x0F);
1700        ib.push(0xBC);
1701        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1702        self.flush_instbuf(ib);
1703    }
1704
1705    /// popcnt r32, r32 (result zero-extended to 64 bits)
1706    pub fn popcnt32(&mut self, dst: Reg, src: Reg) {
1707        let mut ib = InstBuf::new();
1708        ib.push(0xF3);
1709        let rex = (dst.hi() << 2) | src.hi();
1710        if rex != 0 {
1711            ib.push(0x40 | rex);
1712        }
1713        ib.push(0x0F);
1714        ib.push(0xB8);
1715        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1716        self.flush_instbuf(ib);
1717    }
1718
1719    /// lzcnt r32, r32 (result zero-extended to 64 bits)
1720    pub fn lzcnt32(&mut self, dst: Reg, src: Reg) {
1721        let mut ib = InstBuf::new();
1722        ib.push(0xF3);
1723        let rex = (dst.hi() << 2) | src.hi();
1724        if rex != 0 {
1725            ib.push(0x40 | rex);
1726        }
1727        ib.push(0x0F);
1728        ib.push(0xBD);
1729        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1730        self.flush_instbuf(ib);
1731    }
1732
1733    /// tzcnt r32, r32 (result zero-extended to 64 bits)
1734    pub fn tzcnt32(&mut self, dst: Reg, src: Reg) {
1735        let mut ib = InstBuf::new();
1736        ib.push(0xF3);
1737        let rex = (dst.hi() << 2) | src.hi();
1738        if rex != 0 {
1739            ib.push(0x40 | rex);
1740        }
1741        ib.push(0x0F);
1742        ib.push(0xBC);
1743        ib.push(0xC0 | (dst.lo() << 3) | src.lo());
1744        self.flush_instbuf(ib);
1745    }
1746
1747    /// bswap r64
1748    pub fn bswap64(&mut self, dst: Reg) {
1749        self.emit3(0x48 | dst.hi(), 0x0F, 0xC8 + dst.lo());
1750    }
1751
1752    // -- Stack --
1753
1754    #[inline(always)]
1755    pub fn push(&mut self, reg: Reg) {
1756        self.rex_opt_b(reg);
1757        self.emit(0x50 + reg.lo());
1758    }
1759
1760    #[inline(always)]
1761    pub fn pop(&mut self, reg: Reg) {
1762        self.rex_opt_b(reg);
1763        self.emit(0x58 + reg.lo());
1764    }
1765
1766    /// push sign-extended imm32 (5 bytes: 0x68 + imm32).
1767    pub fn push_imm32(&mut self, imm: i32) {
1768        self.emit(0x68);
1769        self.emit_i32(imm);
1770    }
1771
1772    // -- Branches and jumps --
1773
1774    /// jmp to label — uses rel8 for backward jumps within ±127 bytes.
1775    #[inline(always)]
1776    pub fn jmp_label(&mut self, label: Label) {
1777        let bound = self.labels[label.0 as usize];
1778        if bound != LABEL_UNBOUND {
1779            let target = (bound - 1) as isize; // stored as offset+1
1780            // Backward jump — label already bound, try rel8.
1781            let rel = target - (self.write_pos as isize + 2);
1782            if rel >= i8::MIN as isize && rel <= i8::MAX as isize {
1783                self.emit(0xEB);
1784                self.emit(rel as u8);
1785                return;
1786            }
1787        }
1788        // Forward jump or out of rel8 range — use rel32
1789        self.emit(0xE9);
1790        self.emit_label_fixup(label);
1791    }
1792
1793    /// jcc to label — uses rel8 for backward jumps within ±127 bytes.
1794    #[inline(always)]
1795    pub fn jcc_label(&mut self, cc: Cc, label: Label) {
1796        let bound = self.labels[label.0 as usize];
1797        if bound != LABEL_UNBOUND {
1798            let target = (bound - 1) as isize; // stored as offset+1
1799            // Backward jump — label already bound, try rel8.
1800            let rel = target - (self.write_pos as isize + 2);
1801            if rel >= i8::MIN as isize && rel <= i8::MAX as isize {
1802                self.emit(0x70 + cc as u8);
1803                self.emit(rel as u8);
1804                return;
1805            }
1806        }
1807        // Forward jump or out of rel8 range — use rel32
1808        self.emit(0x0F);
1809        self.emit(0x80 + cc as u8);
1810        self.emit_label_fixup(label);
1811    }
1812
1813    /// jmp r64 (indirect)
1814    pub fn jmp_reg(&mut self, reg: Reg) {
1815        self.rex_opt_b(reg);
1816        self.emit(0xFF);
1817        self.emit(0xE0 | reg.lo()); // /4
1818    }
1819
1820    /// call r64 (indirect)
1821    pub fn call_reg(&mut self, reg: Reg) {
1822        self.rex_opt_b(reg);
1823        self.emit(0xFF);
1824        self.emit(0xD0 | reg.lo()); // /2
1825    }
1826
1827    /// call label
1828    pub fn call_label(&mut self, label: Label) {
1829        self.emit(0xE8);
1830        self.emit_label_fixup(label);
1831    }
1832
1833    /// ret
1834    pub fn ret(&mut self) {
1835        self.emit(0xC3);
1836    }
1837
1838    // -- LEA --
1839
1840    /// lea r64, [base + disp]
1841    pub fn lea(&mut self, dst: Reg, base: Reg, disp: i32) {
1842        let mut ib = InstBuf::new();
1843        ib.push(0x48 | (dst.hi() << 2) | base.hi());
1844        ib.push(0x8D);
1845        Self::modrm_disp_ib(&mut ib, dst.lo(), base, disp);
1846        self.flush_instbuf(ib);
1847    }
1848
1849    /// lea r32, [base + disp] — 32-bit result, zero-extends to 64-bit.
1850    #[inline(always)]
1851    pub fn lea_32(&mut self, dst: Reg, base: Reg, disp: i32) {
1852        let mut ib = InstBuf::new();
1853        let r = dst.hi();
1854        let b = base.hi();
1855        if r != 0 || b != 0 {
1856            ib.push(0x40 | (r << 2) | b);
1857        }
1858        ib.push(0x8D);
1859        Self::modrm_disp_ib(&mut ib, dst.lo(), base, disp);
1860        self.flush_instbuf(ib);
1861    }
1862
1863    /// lea r32, [base32 + index32 * (1 << scale_log2)]
1864    /// scale_log2: 0=*1, 1=*2, 2=*4, 3=*8
1865    #[inline(always)]
1866    pub fn lea_sib_scaled_32(&mut self, dst: Reg, base: Reg, index: Reg, scale_log2: u8) {
1867        debug_assert!(scale_log2 <= 3);
1868        let mut ib = InstBuf::new();
1869        let rex = 0x40 | (dst.hi() << 2) | (index.hi() << 1) | base.hi();
1870        if rex != 0x40 {
1871            ib.push(rex);
1872        }
1873        ib.push(0x8D);
1874        let scale_bits = scale_log2 << 6;
1875        if base.lo() == 5 {
1876            ib.push(0x44 | (dst.lo() << 3));
1877            ib.push(scale_bits | (index.lo() << 3) | base.lo());
1878            ib.push(0x00);
1879        } else {
1880            ib.push((dst.lo() << 3) | 0x04);
1881            ib.push(scale_bits | (index.lo() << 3) | base.lo());
1882        }
1883        self.flush_instbuf(ib);
1884    }
1885
1886    // -- Misc --
1887
1888    /// ud2 (undefined instruction, for traps)
1889    pub fn ud2(&mut self) {
1890        self.emit(0x0F);
1891        self.emit(0x0B);
1892    }
1893
1894    /// nop
1895    pub fn nop(&mut self) {
1896        self.emit(0x90);
1897    }
1898
1899    /// int3 (debug breakpoint)
1900    pub fn int3(&mut self) {
1901        self.emit(0xCC);
1902    }
1903
1904    // === Finalization ===
1905
1906    /// Get the resolved native offset for a label (only valid after bind_label).
1907    pub fn label_offset(&self, label: Label) -> Option<usize> {
1908        let off = self.labels[label.0 as usize];
1909        if off == LABEL_UNBOUND {
1910            None
1911        } else {
1912            Some((off - 1) as usize)
1913        }
1914    }
1915
1916    /// Sync Vec length with the write cursor. Call before accessing `self.code` directly.
1917    pub fn sync_len(&mut self) {
1918        let CodeBuf::Vec(code) = &mut self.code_buf;
1919        // SAFETY: write_pos <= code.capacity() (maintained by emission guards).
1920        unsafe {
1921            code.set_len(self.write_pos);
1922        }
1923    }
1924
1925    /// Resolve all label fixups in-place (works for both Vec and mmap buffers).
1926    fn resolve_fixups(&mut self) {
1927        for fixup in &self.fixups {
1928            let stored = self.labels[fixup.label.0 as usize];
1929            // All labels must be bound by finalization time.
1930            assert!(stored != LABEL_UNBOUND, "unbound label {:?}", fixup.label);
1931            let target = stored - 1; // stored as offset+1
1932            let rel = (target as i64) - (fixup.offset as i64 + 4);
1933            let rel32 = rel as i32;
1934            // SAFETY: fixup.offset + 4 <= write_pos (fixup was recorded during emission).
1935            unsafe {
1936                core::ptr::copy_nonoverlapping(
1937                    rel32.to_le_bytes().as_ptr(),
1938                    self.buf.add(fixup.offset),
1939                    4,
1940                );
1941            }
1942        }
1943    }
1944
1945    /// Resolve fixups and return the code as a `Vec<u8>`.
1946    pub fn finalize(&mut self) -> Vec<u8> {
1947        self.resolve_fixups();
1948        let CodeBuf::Vec(code) = &mut self.code_buf;
1949        // SAFETY: write_pos <= code.capacity().
1950        unsafe {
1951            code.set_len(self.write_pos);
1952        }
1953        core::mem::take(code)
1954    }
1955
1956    /// Get a slice of the written code bytes (for tests). Syncs Vec len first.
1957    #[cfg(test)]
1958    pub fn code_bytes(&mut self) -> &[u8] {
1959        self.sync_len();
1960        let CodeBuf::Vec(v) = &self.code_buf;
1961        v.as_slice()
1962    }
1963}
1964
#[cfg(test)]
mod tests {
    use super::*;

    /// `mov_ri64` with a zero immediate is emitted as the 2-byte `xor eax, eax`.
    #[test]
    fn test_mov_ri64_zero() {
        let mut asm = Assembler::new();
        asm.mov_ri64(Reg::RAX, 0);
        // xor eax, eax → 0x31 0xC0
        assert_eq!(asm.code_bytes(), &[0x31, 0xC0]);
    }

    /// `mov_ri64` with a small immediate is emitted as the 5-byte `mov eax, imm32`.
    #[test]
    fn test_mov_ri64_small() {
        let mut asm = Assembler::new();
        asm.mov_ri64(Reg::RAX, 42);
        // mov eax, 42 → 0xB8, 0x2A, 0x00, 0x00, 0x00
        assert_eq!(asm.code_bytes(), &[0xB8, 0x2A, 0x00, 0x00, 0x00]);
    }

    /// A forward `jmp` to a not-yet-bound label uses the rel32 form and is
    /// patched by `finalize`.
    #[test]
    fn test_label_resolution() {
        let mut asm = Assembler::new();
        let lbl = asm.new_label();
        asm.jmp_label(lbl); // 5 bytes: E9 + 4-byte rel32
        asm.nop(); // 1 byte at offset 5
        asm.bind_label(lbl); // label at offset 6
        let code = asm.finalize();
        // The fixup field starts at offset 1 (right after E9) and the target
        // is offset 6, so rel32 = 6 - (1 + 4) = 1.
        assert_eq!(code[0], 0xE9);
        let rel = i32::from_le_bytes([code[1], code[2], code[3], code[4]]);
        assert_eq!(rel, 1); // skip over the nop
    }

    /// Baseless `mov r32, [rdx]`:
    /// 0x8B + ModRM (mod=00, reg=000=eax, r/m=010=rdx) = 2 bytes, with no
    /// REX prefix and no SIB byte.
    /// Codegen relies on this being the short form (no SIB byte).
    #[test]
    fn test_mov_load32_at_rdx() {
        let mut asm = Assembler::new();
        asm.mov_load32_at_index(Reg::RAX, Reg::RDX);
        assert_eq!(asm.code_bytes(), &[0x8B, 0x02]);
    }

    /// Same shape but with R8 as dst (REX.R=1) and R12 as idx
    /// (lo=4, needs SIB recovery to encode bare `[r12]`).
    #[test]
    fn test_mov_load64_at_r12_into_r8() {
        let mut asm = Assembler::new();
        asm.mov_load64_at_index(Reg::R8, Reg::R12);
        // REX.W=1 REX.R=1 REX.B=1 → 0x4D
        // opcode 0x8B
        // ModRM mod=00 reg=000 r/m=100 → 0x04
        // SIB scale=00 index=100=none base=100 → 0x24
        assert_eq!(asm.code_bytes(), &[0x4D, 0x8B, 0x04, 0x24]);
    }

    /// `mov eax, [rip+disp32]` reaches `target_va`. The instruction is
    /// 6 bytes (no REX needed) and is emitted at offset 0, so the
    /// post-instruction RIP is `jit_va_base + 6` and
    /// disp32 = target_va - (jit_va_base + 6).
    #[test]
    fn test_mov_load32_rip_rel() {
        let mut asm = Assembler::new();
        asm.set_jit_va_base(0x1_0000_0000); // 4 GiB
        asm.mov_load32_rip_rel(Reg::RAX, 0x1_0000_0040); // 64 bytes above base
        // 8B 05 <disp32> — 6-byte instruction.
        // disp = 0x1_0000_0040 - (0x1_0000_0000 + 6) = 0x3A.
        assert_eq!(asm.code_bytes(), &[0x8B, 0x05, 0x3A, 0x00, 0x00, 0x00]);
    }

    /// `mov [rip+disp32], r15` with REX.W|REX.R = 0x4C.
    #[test]
    fn test_mov_store64_rip_rel_r15() {
        let mut asm = Assembler::new();
        asm.set_jit_va_base(0x1_0000_0000);
        asm.mov_store64_rip_rel(0x1_0000_0080, Reg::R15);
        // 4C 89 3D <disp32> — 7-byte instruction.
        // disp = 0x80 - (0 + 7) = 0x79.
        assert_eq!(
            asm.code_bytes(),
            &[0x4C, 0x89, 0x3D, 0x79, 0x00, 0x00, 0x00]
        );
    }

    /// `mov [rdx], eax` (32-bit store) — baseless form, 2 bytes.
    #[test]
    fn test_mov_store32_at_rdx() {
        let mut asm = Assembler::new();
        asm.mov_store32_at_index(Reg::RDX, Reg::RAX);
        assert_eq!(asm.code_bytes(), &[0x89, 0x02]);
    }

    /// `mov [rdx], r11d` — REX.R for r11 source.
    #[test]
    fn test_mov_store32_at_rdx_from_r11() {
        let mut asm = Assembler::new();
        asm.mov_store32_at_index(Reg::RDX, Reg::R11);
        assert_eq!(asm.code_bytes(), &[0x44, 0x89, 0x1A]);
    }

    /// `mov byte [rdx], sil` — src ∈ {SPL,BPL,SIL,DIL} requires a
    /// REX prefix to disambiguate from AH/CH/DH/BH. The bare `88 32`
    /// would decode as `mov [rdx], DH` which is silently wrong.
    #[test]
    fn test_mov_store8_at_rdx_from_sil() {
        let mut asm = Assembler::new();
        asm.mov_store8_at_index(Reg::RDX, Reg::RSI);
        assert_eq!(asm.code_bytes(), &[0x40, 0x88, 0x32]);
    }

    /// `mov byte [rdx], dil` — bare REX (0x40) required, as for SIL.
    #[test]
    fn test_mov_store8_at_rdx_from_dil() {
        let mut asm = Assembler::new();
        asm.mov_store8_at_index(Reg::RDX, Reg::RDI);
        assert_eq!(asm.code_bytes(), &[0x40, 0x88, 0x3A]);
    }

    /// `mov byte [rdx], bpl` — bare REX (0x40) required, as for SIL.
    #[test]
    fn test_mov_store8_at_rdx_from_bpl() {
        let mut asm = Assembler::new();
        asm.mov_store8_at_index(Reg::RDX, Reg::RBP);
        assert_eq!(asm.code_bytes(), &[0x40, 0x88, 0x2A]);
    }

    /// `mov byte [rdx], spl` — bare REX (0x40) required, as for SIL.
    #[test]
    fn test_mov_store8_at_rdx_from_spl() {
        let mut asm = Assembler::new();
        asm.mov_store8_at_index(Reg::RDX, Reg::RSP);
        assert_eq!(asm.code_bytes(), &[0x40, 0x88, 0x22]);
    }

    /// `mov byte [rdx], al` — REX still NOT emitted for AL/CL/DL/BL.
    #[test]
    fn test_mov_store8_at_rdx_from_al_no_rex() {
        let mut asm = Assembler::new();
        asm.mov_store8_at_index(Reg::RDX, Reg::RAX);
        assert_eq!(asm.code_bytes(), &[0x88, 0x02]);
    }

    /// `mov dword [rip+disp32], imm32` — RIP-relative store-imm form.
    /// 10-byte instruction: C7 05 <disp32> <imm32>. The CPU resolves
    /// RIP-relative against the NEXT-instruction RIP, which is past
    /// BOTH disp32 and the trailing imm32 — so disp = target - 10.
    #[test]
    fn test_mov_store32_rip_rel_imm() {
        let mut asm = Assembler::new();
        asm.set_jit_va_base(0x1_0000_0000);
        asm.mov_store32_rip_rel_imm(0x1_0000_0100, 0x123);
        // disp = 0x100 - (0 + 10) = 0xF6.
        assert_eq!(
            asm.code_bytes(),
            &[0xC7, 0x05, 0xF6, 0x00, 0x00, 0x00, 0x23, 0x01, 0x00, 0x00]
        );
    }

    /// `add rax, [rip+disp32]` — used by the dispatch sequence to
    /// fold code_base into native_addr.
    #[test]
    fn test_add_r64_mem_rip_rel() {
        let mut asm = Assembler::new();
        asm.set_jit_va_base(0x1_0000_0000);
        asm.add_r64_mem_rip_rel(Reg::RAX, 0x1_0000_0020);
        // 48 03 05 <disp32> — 7-byte instruction.
        // disp = 0x20 - (0 + 7) = 0x19.
        assert_eq!(
            asm.code_bytes(),
            &[0x48, 0x03, 0x05, 0x19, 0x00, 0x00, 0x00]
        );
    }

    /// `sub r15, imm32` patchable form — must be exactly 7 bytes with
    /// the imm32 at bytes 3..7 (codegen patches via `offset() - 4`).
    #[test]
    fn test_sub_r64_imm32_patchable_r15() {
        let mut asm = Assembler::new();
        asm.sub_r64_imm32_patchable(Reg::R15, 0xCAFE_F00D_u32 as i32);
        let bytes = asm.code_bytes();
        assert_eq!(bytes.len(), 7, "patchable sub must be 7 bytes");
        // REX.W=1 REX.B=1 → 0x49
        // opcode 0x81
        // ModRM mod=11 reg=5 r/m=7 → 0xEF
        // imm32 at bytes 3..7 in LE
        assert_eq!(&bytes[0..3], &[0x49, 0x81, 0xEF]);
        assert_eq!(&bytes[3..7], &[0x0D, 0xF0, 0xFE, 0xCA]);
    }

    /// `push r15` / `pop r15` — both need the REX.B prefix (0x41).
    #[test]
    fn test_push_pop_r15() {
        let mut asm = Assembler::new();
        asm.push(Reg::R15);
        asm.pop(Reg::R15);
        // push r15: 41 57, pop r15: 41 5F
        assert_eq!(asm.code_bytes(), &[0x41, 0x57, 0x41, 0x5F]);
    }
}