Skip to main content

javm_exec/
gas_cost.rs

1//! Per-basic-block gas cost model (JAR v0.8.0).
2//!
3//! Simulates a CPU pipeline to compute gas cost for a basic block.
4//! Cost = max(simulation_cycles - 3, 1).
5//!
6//! Pipeline model:
7//! - Reorder buffer: max 32 entries
8//! - 4 decode slots per cycle, 5 dispatch slots per cycle
9//! - Execution units: ALU:4, LOAD:4, STORE:4, MUL:1, DIV:1
10
11use alloc::format;
12use alloc::string::String;
13use alloc::vec::Vec;
14
/// `eprintln!`-style macro used by the trace path in `gas_sim_traced`.
///
/// With the `std` feature enabled the arguments are forwarded unchanged to
/// `std::eprintln!`.
///
/// NOTE(review): the previous comment said tracing is only ever enabled in
/// debug builds; nothing in this file enforces that — confirm at call sites.
#[cfg(feature = "std")]
macro_rules! trace_eprintln {
    ($($t:tt)*) => { std::eprintln!($($t)*) };
}
/// Variant used without the `std` feature: nothing is printed. The arguments
/// are still handed to `format_args!`, so the format string is checked and
/// the arguments count as used.
#[cfg(not(feature = "std"))]
macro_rules! trace_eprintln {
    ($($t:tt)*) => { let _ = format_args!($($t)*); };
}
26
27// --- Data structures ---
28
/// Execution-unit counts, one field per unit kind.
///
/// The same type serves two roles: the units still available in the current
/// cycle (starting from [`ExecUnits::RESET`]) and the units a single
/// instruction requires (the `ALU`/`LOAD`/`STORE`/`MUL`/`DIV`/`NONE` tables).
#[derive(Clone, Copy, Default, Debug)]
struct ExecUnits {
    alu: u8,
    load: u8,
    store: u8,
    mul: u8,
    div: u8,
}

impl ExecUnits {
    /// Requirement of an instruction that occupies no execution unit.
    const NONE: ExecUnits = ExecUnits {
        alu: 0,
        load: 0,
        store: 0,
        mul: 0,
        div: 0,
    };
    /// One ALU.
    const ALU: ExecUnits = ExecUnits {
        alu: 1,
        ..Self::NONE
    };
    /// One ALU plus one load unit.
    const LOAD: ExecUnits = ExecUnits {
        load: 1,
        ..Self::ALU
    };
    /// One ALU plus one store unit.
    const STORE: ExecUnits = ExecUnits {
        store: 1,
        ..Self::ALU
    };
    /// One ALU plus the multiplier.
    const MUL: ExecUnits = ExecUnits {
        mul: 1,
        ..Self::ALU
    };
    /// One ALU plus the divider.
    const DIV: ExecUnits = ExecUnits {
        div: 1,
        ..Self::ALU
    };
    /// Units available at the start of every cycle.
    const RESET: ExecUnits = ExecUnits {
        alu: 4,
        load: 4,
        store: 4,
        mul: 1,
        div: 1,
    };

    /// True when every unit count in `self` is at least the count in `req`.
    fn can_satisfy(self, req: ExecUnits) -> bool {
        let available = [self.alu, self.load, self.store, self.mul, self.div];
        let wanted = [req.alu, req.load, req.store, req.mul, req.div];
        available
            .iter()
            .zip(wanted.iter())
            .all(|(have, need)| have >= need)
    }

    /// Field-wise `self - req`.
    ///
    /// Uses plain subtraction, so the caller is expected to have checked
    /// `can_satisfy(req)` first.
    fn sub(self, req: ExecUnits) -> ExecUnits {
        let ExecUnits {
            alu,
            load,
            store,
            mul,
            div,
        } = self;
        ExecUnits {
            alu: alu - req.alu,
            load: load - req.load,
            store: store - req.store,
            mul: mul - req.mul,
            div: div - req.div,
        }
    }

    /// Maps the requirement to a single class byte, checking the unit kinds
    /// in the order div(5), mul(4), store(3), load(2), alu(1); 0 if none.
    fn _to_eu_byte(self) -> u8 {
        let ranked = [
            (self.div, 5u8),
            (self.mul, 4),
            (self.store, 3),
            (self.load, 2),
            (self.alu, 1),
        ];
        ranked
            .iter()
            .find(|(count, _)| *count > 0)
            .map_or(0, |&(_, class)| class)
    }
}
120
/// Lifecycle of a reorder-buffer entry during the simulation.
#[derive(Clone, Copy, PartialEq)]
enum RobState {
    /// Decoded and waiting to be dispatched.
    Wait,
    /// Dispatched; `cycles_left` is counted down on every cycle advance.
    Exe,
    /// Execution has completed.
    Fin,
}
127
/// One in-flight instruction in the simulated reorder buffer.
#[derive(Clone, Copy)]
struct RobEntry {
    /// Current lifecycle stage.
    state: RobState,
    /// Remaining execution cycles; initialised from `InstrCost::cycles`.
    cycles_left: u32,
    /// ROB indices of the entries this one waits for (0xFF = unused slot).
    /// Only the first `dep_count` slots are meaningful.
    deps: [u8; 4], // ROB indices this depends on (0xFF = unused)
    /// Number of valid entries in `deps` (the decoder records at most 4).
    dep_count: u8,
    /// Registers written by this instruction; later instructions whose
    /// sources overlap them become dependents.
    dest_regs: RegSet,
    /// Execution units that must be free in a cycle for dispatch.
    exec_units: ExecUnits,
}
137
/// Mutable state of one pipeline-simulation run.
struct SimState {
    /// Next instruction to decode. `gas_sim_traced` stores a byte offset into
    /// `code` here; `gas_sim_decoded` stores an index into its instruction slice.
    ip: Option<usize>, // instruction pointer (None = done decoding)
    /// Number of cycle advances performed so far; this is the simulation result.
    cycles: u32,
    decode_slots: u8,      // remaining per cycle (reset to 4)
    dispatch_slots: u8,    // remaining per cycle (reset to 5)
    exec_units: ExecUnits, // remaining per cycle
    /// Reorder buffer. Entries are only ever appended by the simulation loops,
    /// so an entry's index stays valid for use in `RobEntry::deps`.
    rob: Vec<RobEntry>,
}
146
147// --- Instruction cost analysis ---
148
/// Inline list of up to three register numbers (no heap allocation).
///
/// Only the first `len` slots of `regs` are considered part of the set.
#[derive(Clone, Copy, Default, Debug)]
struct RegSet {
    regs: [u8; 3],
    len: u8,
}

impl RegSet {
    /// The set with no registers.
    const EMPTY: Self = Self {
        len: 0,
        regs: [0, 0, 0],
    };

    /// Set holding exactly `r`.
    fn one(r: u8) -> Self {
        let mut set = Self::EMPTY;
        set.regs[0] = r;
        set.len = 1;
        set
    }

    /// Set holding `a` and `b`, in that order (duplicates are kept as-is).
    fn two(a: u8, b: u8) -> Self {
        let mut set = Self::one(a);
        set.regs[1] = b;
        set.len = 2;
        set
    }

    /// True when `r` is stored in one of the first `len` slots.
    #[inline]
    fn contains(&self, r: u8) -> bool {
        self.regs
            .iter()
            .take(usize::from(self.len))
            .any(|&stored| stored == r)
    }

    /// Iterates over the stored registers in insertion order.
    #[inline]
    fn iter(&self) -> impl Iterator<Item = &u8> {
        let used = self.len as usize;
        self.regs[..used].iter()
    }
}
184
/// Cost-model description of a single instruction, produced by the
/// `instruction_cost*` lookups and consumed by the simulation loops.
struct InstrCost {
    /// Execution latency; copied into `RobEntry::cycles_left`.
    cycles: u32,
    /// Decode slots subtracted (saturating) from the per-cycle budget.
    decode_slots: u8,
    /// Execution units needed at dispatch.
    exec_units: ExecUnits,
    /// Registers written.
    dest_regs: RegSet,
    /// Registers read; matched against in-flight `dest_regs` to build deps.
    src_regs: RegSet,
    /// When set, decoding stops after this instruction.
    is_terminator: bool,
    /// When set, the instruction consumes decode slots but gets no ROB entry.
    is_move_reg: bool,
}
194
195fn dst_overlaps_src(dst: u8, srcs: &RegSet) -> bool {
196    srcs.contains(dst)
197}
198
/// Cycle cost of a branch to `target`.
///
/// Returns 1 when `target` is inside both `code` and `bitmask`, the bitmask
/// byte there equals 1, and the opcode at the target is 0 (trap) or
/// 2 (unlikely). Every other case costs 20.
fn branch_cost(code: &[u8], bitmask: &[u8], target: usize) -> u32 {
    const CHEAP: u32 = 1;
    const EXPENSIVE: u32 = 20;

    match (code.get(target), bitmask.get(target)) {
        (Some(&opcode), Some(&1)) if matches!(opcode, 0 | 2) => CHEAP,
        _ => EXPENSIVE,
    }
}
208
/// Extract a 4-bit register nibble from an instruction encoding.
///
/// Reads the byte at `pc + byte_offset`, shifts it right by `shift`, and
/// masks the result to 4 bits.
///
/// Returns 0 when that byte lies outside `code`, including the case where
/// `pc + byte_offset` does not fit in `usize`. The address is computed with
/// a checked add so that a huge `pc` can neither panic nor wrap around and
/// read an unrelated byte from the start of `code`.
///
/// `shift` must be less than 8; the callers in this file pass 0 or 4.
fn extract_reg(code: &[u8], pc: usize, byte_offset: usize, shift: u8) -> u8 {
    pc.checked_add(byte_offset)
        .and_then(|idx| code.get(idx))
        .map_or(0, |&byte| (byte >> shift) & 0x0F)
}
220
221/// Extract register A (first register, lower nibble of byte after opcode).
222fn reg_a(code: &[u8], pc: usize) -> u8 {
223    extract_reg(code, pc, 1, 0)
224}
225/// Extract register B (second register, upper nibble of byte after opcode).
226fn reg_b(code: &[u8], pc: usize) -> u8 {
227    extract_reg(code, pc, 1, 4)
228}
229/// Extract register D (third register, lower nibble of second byte after opcode).
230fn reg_d(code: &[u8], pc: usize) -> u8 {
231    extract_reg(code, pc, 2, 0)
232}
233
/// Compute skip distance (bytes between `pc` and the next instruction start,
/// not counting the opcode byte itself).
///
/// Scans `bitmask[pc + 1 ..]` and returns the offset `j` of the first byte
/// equal to 1. Positions past the end of `bitmask` count as set, so the scan
/// always stops at the end of the mask. The result is capped at 24.
///
/// Index arithmetic is checked: a position that does not fit in `usize` is
/// treated like any other out-of-range position. This stops a huge `pc` from
/// panicking or wrapping around to the start of the mask.
pub fn skip_distance(bitmask: &[u8], pc: usize) -> usize {
    const MAX_SKIP: usize = 24;

    (0..=MAX_SKIP)
        .find(|&j| {
            pc.checked_add(1 + j)
                .and_then(|idx| bitmask.get(idx))
                .map_or(true, |&bit| bit == 1)
        })
        .unwrap_or(MAX_SKIP)
}
245
246/// Extract branch target from reg+imm+offset instruction.
247fn extract_branch_target(code: &[u8], bitmask: &[u8], pc: usize) -> usize {
248    let skip = skip_distance(bitmask, pc);
249    // Target offset is encoded in the last bytes of the instruction
250    // For OneRegImmOffset: layout is [opcode, ra|imm_lo, imm_hi..., offset_bytes]
251    // The offset is a signed value relative to the instruction start
252    let instr_len = 1 + skip;
253    if instr_len >= 3 && pc + instr_len <= code.len() {
254        // Decode offset from the last portion of the instruction
255        // For A.5.8 format: opcode + reg_nibble + immediate + offset
256        // The offset part depends on skip length
257        let raw = crate::args::decode_args(
258            code,
259            pc,
260            skip,
261            crate::instruction::InstructionCategory::OneRegImmOffset,
262        );
263        if let crate::args::Args::RegImmOffset { offset, .. } = raw {
264            return offset as usize;
265        }
266    }
267    pc // fallback
268}
269
270/// Extract branch target from two-reg+offset instruction.
271fn extract_two_reg_branch_target(code: &[u8], bitmask: &[u8], pc: usize) -> usize {
272    let skip = skip_distance(bitmask, pc);
273    let raw = crate::args::decode_args(
274        code,
275        pc,
276        skip,
277        crate::instruction::InstructionCategory::TwoRegOneOffset,
278    );
279    if let crate::args::Args::TwoRegOffset { offset, .. } = raw {
280        return offset as usize;
281    }
282    pc
283}
284
/// Instruction cost lookup based on opcode.
///
/// Reads the opcode at `code[pc]` (an out-of-range `pc` is treated as
/// opcode 0) and returns its [`InstrCost`]: latency, decode slots, execution
/// units, destination/source registers and the terminator / move flags.
///
/// `ra`, `rb` and `rd` are always taken from the same fixed nibble positions
/// (see `reg_a`, `reg_b`, `reg_d`), whatever the opcode's operand format is.
///
/// NOTE(review): the reg+imm load/store arms (52..=58, 59..=62) list `rb` as
/// a source, i.e. the raw upper nibble of the operand byte. `gas_sim_decoded`
/// passes 0xFF as `rb` for `RegImm` arguments, so the two paths can see
/// different dependencies for these opcodes — confirm which is intended.
fn instruction_cost(code: &[u8], bitmask: &[u8], pc: usize) -> InstrCost {
    let opcode = if pc < code.len() { code[pc] } else { 0 };
    let ra = reg_a(code, pc);
    let rb = reg_b(code, pc);
    let rd = reg_d(code, pc);

    // Builder for ordinary (non-terminating) instructions.
    let mk = |cy: u32, dc: u8, eu: ExecUnits, dst: RegSet, src: RegSet| -> InstrCost {
        InstrCost {
            cycles: cy,
            decode_slots: dc,
            exec_units: eu,
            dest_regs: dst,
            src_regs: src,
            is_terminator: false,
            is_move_reg: false,
        }
    };
    // Builder for block terminators (decoding stops after them).
    let mkt = |cy: u32, dc: u8, eu: ExecUnits, dst: RegSet, src: RegSet| -> InstrCost {
        InstrCost {
            cycles: cy,
            decode_slots: dc,
            exec_units: eu,
            dest_regs: dst,
            src_regs: src,
            is_terminator: true,
            is_move_reg: false,
        }
    };
    let e = RegSet::EMPTY;
    let r1 = RegSet::one;
    let r2 = RegSet::two;

    match opcode {
        // No-arg
        0 => mkt(2, 1, ExecUnits::NONE, e, e),   // trap
        1 => mkt(2, 1, ExecUnits::NONE, e, e),   // fallthrough
        2 => mkt(40, 1, ExecUnits::NONE, e, e),  // unlikely
        10 => mkt(100, 4, ExecUnits::ALU, e, e), // ecalli

        // Control flow
        40 => mkt(15, 1, ExecUnits::ALU, e, e), // jump
        80 => {
            // load_imm_jump: take the destination register from the decoded
            // arguments, falling back to the raw nibble if decoding yields
            // a different argument shape.
            let skip = skip_distance(bitmask, pc);
            let raw = crate::args::decode_args(
                code,
                pc,
                skip,
                crate::instruction::InstructionCategory::OneRegImmOffset,
            );
            let r = if let crate::args::Args::RegImmOffset { ra: r, .. } = raw {
                r as u8
            } else {
                ra
            };
            mkt(15, 1, ExecUnits::ALU, r1(r), e)
        }
        50 => mkt(22, 1, ExecUnits::ALU, e, e), // jump_ind
        180 => mkt(22, 1, ExecUnits::ALU, r1(ra), r1(rb)), // load_imm_jump_ind

        // Loads (reg+imm and two-reg+imm variants)
        52..=58 => mk(25, 1, ExecUnits::LOAD, r1(ra), r1(rb)),
        124..=130 => mk(25, 1, ExecUnits::LOAD, r1(ra), r1(rb)),

        // Stores (reg+imm variants)
        59..=62 => mk(25, 1, ExecUnits::STORE, e, r2(ra, rb)),
        // Stores (two-reg+imm)
        120..=123 => mk(25, 1, ExecUnits::STORE, e, r2(ra, rb)),
        // Store immediates (two-imm)
        30..=33 => mk(25, 1, ExecUnits::STORE, e, e),
        // Store imm indirect (reg+two-imm)
        70..=73 => mk(25, 1, ExecUnits::STORE, e, r1(ra)),

        // Load immediates
        51 => mk(1, 1, ExecUnits::NONE, r1(ra), e), // load_imm
        20 => mk(1, 2, ExecUnits::NONE, r1(ra), e), // load_imm_64

        // move_reg: decoded in frontend, no ROB entry
        100 => InstrCost {
            cycles: 0,
            decode_slots: 1,
            exec_units: ExecUnits::NONE,
            dest_regs: r1(ra),
            src_regs: r1(rb),
            is_terminator: false,
            is_move_reg: true,
        },

        // sbrk (101): removed in jar080, but cost it anyway for simulation
        101 => mk(2, 1, ExecUnits::NONE, e, e),

        // Branches (reg + imm + offset): cost depends on the target opcode.
        81..=90 => {
            let target = extract_branch_target(code, bitmask, pc);
            let bc = branch_cost(code, bitmask, target);
            mkt(bc, 1, ExecUnits::ALU, e, r1(ra))
        }

        // Branches (two-reg + offset)
        170..=175 => {
            let target = extract_two_reg_branch_target(code, bitmask, pc);
            let bc = branch_cost(code, bitmask, target);
            mkt(bc, 1, ExecUnits::ALU, e, r2(ra, rb))
        }

        // ALU 64-bit 3-reg: add_64(200), sub_64(201), and(210), xor(211), or(212)
        // One decode slot fewer when the destination is also a source.
        200 | 201 | 210 | 211 | 212 => {
            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
                1
            } else {
                2
            };
            mk(1, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
        }
        // ALU 32-bit 3-reg: add_32(190), sub_32(191)
        190 | 191 => {
            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
                2
            } else {
                3
            };
            mk(2, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
        }

        // ALU 2-op imm 64-bit
        132 | 133 | 134 | 149 | 151 | 152 | 153 | 158 | 110 => {
            let dc = if dst_overlaps_src(ra, &r1(rb)) { 1 } else { 2 };
            mk(1, dc, ExecUnits::ALU, r1(ra), r1(rb))
        }
        // ALU 2-op imm 32-bit
        131 | 138 | 139 | 140 | 160 => {
            let dc = if dst_overlaps_src(ra, &r1(rb)) { 2 } else { 3 };
            mk(2, dc, ExecUnits::ALU, r1(ra), r1(rb))
        }

        // Trivial 2-op 1-cycle: popcount, clz, sign_extend, zero_extend
        102 | 103 | 104 | 105 | 108 | 109 => mk(1, 1, ExecUnits::ALU, r1(ra), r1(rb)),
        // Trivial 2-op 2-cycle: ctz
        106 | 107 => mk(2, 1, ExecUnits::ALU, r1(ra), r1(rb)),
        // reverse_bytes
        111 => mk(1, 1, ExecUnits::ALU, r1(ra), r1(rb)),

        // Shifts 64-bit 3-reg
        // Unlike the ALU arms, only `rb == ra` (not `rd == ra`) earns the
        // cheaper decode cost here.
        207 | 208 | 209 | 220 | 222 => {
            let dc = if rb == ra { 2 } else { 3 };
            mk(1, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
        }
        // Shifts 32-bit 3-reg
        197 | 198 | 199 | 221 | 223 => {
            let dc = if rb == ra { 3 } else { 4 };
            mk(2, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
        }
        // Shift alt 64-bit
        155 | 156 | 157 | 159 => mk(1, 3, ExecUnits::ALU, r1(ra), r1(rb)),
        // Shift alt 32-bit
        144 | 145 | 146 | 161 => mk(2, 4, ExecUnits::ALU, r1(ra), r1(rb)),

        // Comparisons (3-reg)
        216 | 217 => mk(3, 3, ExecUnits::ALU, r1(ra), r2(rb, rd)),
        // Comparisons (imm)
        136 | 137 | 142 | 143 => mk(3, 3, ExecUnits::ALU, r1(ra), r1(rb)),

        // Conditional moves (3-reg)
        218 | 219 => mk(2, 2, ExecUnits::ALU, r1(ra), r2(rb, rd)),
        // Conditional moves (imm)
        147 | 148 => mk(2, 3, ExecUnits::ALU, r1(ra), r1(rb)),

        // Min/Max
        227..=230 => {
            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
                2
            } else {
                3
            };
            mk(3, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
        }
        // and_inv, or_inv
        224 | 225 => mk(2, 3, ExecUnits::ALU, r1(ra), r2(rb, rd)),
        // xnor
        226 => {
            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
                2
            } else {
                3
            };
            mk(2, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
        }

        // neg_add_imm_64
        154 => mk(2, 3, ExecUnits::ALU, r1(ra), r1(rb)),
        // neg_add_imm_32
        141 => mk(3, 4, ExecUnits::ALU, r1(ra), r1(rb)),

        // Multiply 64-bit (3-reg)
        202 => {
            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
                1
            } else {
                2
            };
            mk(3, dc, ExecUnits::MUL, r1(ra), r2(rb, rd))
        }
        // mul_imm_64
        150 => {
            let dc = if dst_overlaps_src(ra, &r1(rb)) { 1 } else { 2 };
            mk(3, dc, ExecUnits::MUL, r1(ra), r1(rb))
        }
        // Multiply 32-bit (3-reg)
        192 => {
            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
                2
            } else {
                3
            };
            mk(4, dc, ExecUnits::MUL, r1(ra), r2(rb, rd))
        }
        // mul_imm_32
        135 => {
            let dc = if dst_overlaps_src(ra, &r1(rb)) { 2 } else { 3 };
            mk(4, dc, ExecUnits::MUL, r1(ra), r1(rb))
        }

        // Multiply upper (SS, UU)
        213 | 214 => mk(4, 4, ExecUnits::MUL, r1(ra), r2(rb, rd)),
        // Multiply upper (SU)
        215 => mk(6, 4, ExecUnits::MUL, r1(ra), r2(rb, rd)),

        // Divide (all variants)
        193 | 194 | 195 | 196 | 203 | 204 | 205 | 206 => {
            mk(60, 4, ExecUnits::DIV, r1(ra), r2(rb, rd))
        }

        // Rotate 64-bit (3-reg)
        // Already covered by shifts above (220, 222 = RotL64, RotR64)

        // Rotate 32-bit (3-reg)
        // Already covered by shifts above (221, 223 = RotL32, RotR32)

        // Rotate imm
        // Already covered by shift alt above

        // Default: unknown opcode — costed as a 1-cycle, 1-slot instruction
        // with no unit and no registers; it does not terminate the block.
        _ => mk(1, 1, ExecUnits::NONE, e, e),
    }
}
531
532// --- Simulation ---
533
534fn all_deps_finished(rob: &[RobEntry], entry: &RobEntry) -> bool {
535    for i in 0..entry.dep_count as usize {
536        let idx = entry.deps[i] as usize;
537        if idx < rob.len() && rob[idx].state != RobState::Fin {
538            return false;
539        }
540    }
541    true
542}
543
544fn find_ready_entry(rob: &[RobEntry], exec_units: ExecUnits) -> Option<usize> {
545    for (i, entry) in rob.iter().enumerate() {
546        if entry.state == RobState::Wait
547            && all_deps_finished(rob, entry)
548            && exec_units.can_satisfy(entry.exec_units)
549        {
550            return Some(i);
551        }
552    }
553    None
554}
555
556fn rob_all_finished(rob: &[RobEntry]) -> bool {
557    rob.iter().all(|e| e.state == RobState::Fin)
558}
559
/// Run the pipeline simulation for a basic block starting at `start_pc`.
/// If `trace` is true, print every action for debugging.
///
/// Each loop iteration performs exactly one action, chosen by priority:
/// decode one instruction, dispatch one ready ROB entry, finish, or advance
/// the clock by one cycle (which also refills the per-cycle budgets).
/// Returns the number of cycle advances performed.
///
/// The loop is capped at 100_000 iterations; if the cap is hit the cycle
/// count reached so far is returned without any error indication.
///
/// NOTE(review): ROB entries are never removed, and decoding requires
/// `rob.len() < 32`. Once 32 entries have been pushed while `ip` is still
/// `Some`, decoding can never resume and the "done" condition can never
/// hold, so the loop runs to the iteration cap. Confirm that blocks needing
/// more than 32 ROB entries are meant to be costed this way.
///
/// NOTE(review): with `trace` enabled, `code[pc]` is indexed without a
/// bounds check and panics if `start_pc` is outside `code`.
fn gas_sim_traced(code: &[u8], bitmask: &[u8], start_pc: usize, trace: bool) -> u32 {
    let mut s = SimState {
        ip: Some(start_pc),
        cycles: 0,
        decode_slots: 4,
        dispatch_slots: 5,
        exec_units: ExecUnits::RESET,
        rob: Vec::with_capacity(32),
    };

    for iter in 0..100_000 {
        // Priority 1: Decode
        if s.ip.is_some() && s.decode_slots > 0 && s.rob.len() < 32 {
            let pc = s.ip.unwrap();
            let cost = instruction_cost(code, bitmask, pc);
            // Record unfinished ROB entries that write one of our sources.
            // At most four are kept, oldest first; any further producers
            // are silently ignored.
            let mut deps = [0xFF_u8; 4];
            let mut dep_count = 0u8;
            for (i, e) in s.rob.iter().enumerate() {
                if e.state != RobState::Fin
                    && e.dest_regs.iter().any(|dr| cost.src_regs.contains(*dr))
                    && dep_count < 4
                {
                    deps[dep_count as usize] = i as u8;
                    dep_count += 1;
                }
            }
            // Saturating: an instruction may be decoded even when it needs
            // more slots than remain in this cycle.
            s.decode_slots = s.decode_slots.saturating_sub(cost.decode_slots);
            // Decoding stops at a terminator or at the end of `code`.
            let next_ip = if cost.is_terminator {
                None
            } else {
                let skip = skip_distance(bitmask, pc);
                let npc = pc + 1 + skip;
                if npc < code.len() { Some(npc) } else { None }
            };

            if trace {
                let op = crate::instruction::Opcode::from_byte(code[pc])
                    .map(|o| format!("{:?}", o))
                    .unwrap_or("?".into());
                trace_eprintln!(
                    "  [{}] DECODE pc={} {} cy={} dec={} rob_idx={} deps={:?} move={} term={} slots_left={}",
                    iter,
                    pc,
                    op,
                    cost.cycles,
                    cost.decode_slots,
                    s.rob.len(),
                    &deps[..dep_count as usize],
                    cost.is_move_reg,
                    cost.is_terminator,
                    s.decode_slots
                );
            }
            // move_reg uses decode slots only; everything else gets a ROB entry.
            if cost.is_move_reg {
                s.ip = next_ip;
            } else {
                s.rob.push(RobEntry {
                    state: RobState::Wait,
                    cycles_left: cost.cycles,
                    deps,
                    dep_count,
                    dest_regs: cost.dest_regs,
                    exec_units: cost.exec_units,
                });
                s.ip = next_ip;
            }
            continue;
        }

        // Priority 2: Dispatch
        if s.dispatch_slots > 0
            && let Some(idx) = find_ready_entry(&s.rob, s.exec_units)
        {
            let eu = s.rob[idx].exec_units;

            if trace {
                trace_eprintln!(
                    "  [{}] DISPATCH rob[{}] cy={} dispatch_left={}",
                    iter,
                    idx,
                    s.rob[idx].cycles_left,
                    s.dispatch_slots - 1
                );
            }
            s.rob[idx].state = RobState::Exe;
            s.dispatch_slots -= 1;
            // Cannot underflow: find_ready_entry checked can_satisfy.
            s.exec_units = s.exec_units.sub(eu);
            continue;
        }

        // Priority 3: Done
        if s.ip.is_none() && rob_all_finished(&s.rob) {
            if trace {
                trace_eprintln!("  [{}] DONE cycles={}", iter, s.cycles);
            }
            break;
        }

        // Priority 4: Advance cycle

        if trace {
            let states: Vec<String> = s
                .rob
                .iter()
                .enumerate()
                .map(|(i, e)| {
                    let st = match e.state {
                        RobState::Wait => "W",
                        RobState::Exe => "E",
                        RobState::Fin => "F",
                    };
                    format!(
                        "{}:{}{}",
                        i,
                        st,
                        if e.state == RobState::Exe {
                            format!("({})", e.cycles_left)
                        } else {
                            String::new()
                        }
                    )
                })
                .collect();
            trace_eprintln!(
                "  [{}] ADVANCE cycle {} → {} rob=[{}]",
                iter,
                s.cycles,
                s.cycles + 1,
                states.join(", ")
            );
        }
        // Tick every executing entry; one with 0 or 1 cycles left finishes now.
        for entry in s.rob.iter_mut() {
            if entry.state == RobState::Exe {
                if entry.cycles_left <= 1 {
                    entry.state = RobState::Fin;
                    entry.cycles_left = 0;
                } else {
                    entry.cycles_left -= 1;
                }
            }
        }
        // New cycle: bump the counter and refill the per-cycle budgets.
        s.cycles += 1;
        s.decode_slots = 4;
        s.dispatch_slots = 5;
        s.exec_units = ExecUnits::RESET;
    }

    s.cycles
}
711
712fn gas_sim(code: &[u8], bitmask: &[u8], start_pc: usize) -> u32 {
713    gas_sim_traced(code, bitmask, start_pc, false)
714}
715
716/// Compute gas cost for a basic block starting at `start_pc`.
717/// Returns max(simulation_cycles - 3, 1).
718pub fn gas_cost_for_block(code: &[u8], bitmask: &[u8], start_pc: usize) -> u64 {
719    let cycles = gas_sim(code, bitmask, start_pc);
720    if cycles > 3 { (cycles - 3) as u64 } else { 1 }
721}
722
723// === gas_cost_for_block_decoded / gas_sim_decoded / instruction_cost_fast ===
724// Cherry-picked from v2 `javm/src/gas_cost.rs`. These helpers consume
725// the recompiler's pre-decoded instruction stream (PreDecodedInst).
726// Gated to platforms where the recompiler is available.
727
728#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
729pub fn gas_cost_for_block_decoded(
730    instrs: &[crate::predecoded::PreDecodedInst],
731    code: &[u8],
732    bitmask: &[u8],
733) -> u64 {
734    let cycles = gas_sim_decoded(instrs, code, bitmask);
735    if cycles > 3 { (cycles - 3) as u64 } else { 1 }
736}
737
/// Pipeline simulation from pre-decoded instructions (no raw byte re-parsing).
///
/// Same decode / dispatch / done / advance loop as `gas_sim_traced`, except
/// that `ip` is an index into `instrs` and register fields come from the
/// decoded `Args`. `code` and `bitmask` are only forwarded to
/// `instruction_cost_fast`. Returns the number of cycle advances; the loop
/// is capped at 100_000 iterations.
///
/// NOTE(review): ROB entries are never removed and decoding requires
/// `rob.len() < 32`, so a block needing more than 32 ROB entries can never
/// finish decoding and runs to the iteration cap — confirm this is intended.
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
fn gas_sim_decoded(
    instrs: &[crate::predecoded::PreDecodedInst],
    code: &[u8],
    bitmask: &[u8],
) -> u32 {
    use crate::args::Args;

    let mut s = SimState {
        ip: Some(0), // index into instrs
        cycles: 0,
        decode_slots: 4,
        dispatch_slots: 5,
        exec_units: ExecUnits::RESET,
        rob: Vec::with_capacity(32),
    };

    for _ in 0..100_000 {
        // Decode one instruction if slots and ROB space remain.
        if let Some(idx) = s.ip
            && idx < instrs.len()
            && s.decode_slots > 0
            && s.rob.len() < 32
        {
            let instr = &instrs[idx];
            let opcode_byte = instr.opcode as u8;

            // Map decoded arguments to the (ra, rb, rd) triple expected by
            // instruction_cost_fast; 0xFF marks "no such register".
            //
            // NOTE(review): for `TwoReg` the triple is (ra, 0xFF, rd), but
            // the two-register arms visible in instruction_cost_fast (100,
            // 102..=111) use `ra` as destination and `rb` as source, so
            // their source is 0xFF and `rd` goes unused. `instruction_cost`
            // reads both nibbles of the operand byte for these opcodes —
            // confirm the two paths agree.
            let (ra, rb, rd) = match instr.args {
                Args::ThreeReg { ra, rb, rd } => (ra as u8, rb as u8, rd as u8),
                Args::TwoReg { rd: d, ra: a } => (a as u8, 0xFF, d as u8),
                Args::TwoRegImm { ra, rb, .. }
                | Args::TwoRegOffset { ra, rb, .. }
                | Args::TwoRegTwoImm { ra, rb, .. } => (ra as u8, rb as u8, 0xFF),
                Args::RegImm { ra, .. }
                | Args::RegExtImm { ra, .. }
                | Args::RegTwoImm { ra, .. }
                | Args::RegImmOffset { ra, .. } => (ra as u8, 0xFF, 0xFF),
                _ => (0xFF, 0xFF, 0xFF),
            };

            let cost = instruction_cost_fast(opcode_byte, ra, rb, rd, instr, code, bitmask);

            // Record up to four unfinished producers of our source registers.
            let mut deps = [0xFF_u8; 4];
            let mut dep_count = 0u8;
            for (i, e) in s.rob.iter().enumerate() {
                if e.state != RobState::Fin
                    && e.dest_regs.iter().any(|dr| cost.src_regs.contains(*dr))
                    && dep_count < 4
                {
                    deps[dep_count as usize] = i as u8;
                    dep_count += 1;
                }
            }

            s.decode_slots = s.decode_slots.saturating_sub(cost.decode_slots);
            // A terminator ends decoding; otherwise move to the next entry
            // (running off the end of `instrs` is handled by the checks
            // above and in the done condition).
            let next_ip = if cost.is_terminator {
                None
            } else {
                Some(idx + 1)
            };

            // move_reg uses decode slots only; everything else gets a ROB entry.
            if cost.is_move_reg {
                s.ip = next_ip;
            } else {
                s.rob.push(RobEntry {
                    state: RobState::Wait,
                    cycles_left: cost.cycles,
                    deps,
                    dep_count,
                    dest_regs: cost.dest_regs,
                    exec_units: cost.exec_units,
                });
                s.ip = next_ip;
            }
            continue;
        }

        // Dispatch the oldest ready entry, if a dispatch slot is left.
        if s.dispatch_slots > 0
            && let Some(idx) = find_ready_entry(&s.rob, s.exec_units)
        {
            let eu = s.rob[idx].exec_units;
            s.rob[idx].state = RobState::Exe;
            s.dispatch_slots -= 1;
            s.exec_units = s.exec_units.sub(eu);
            continue;
        }

        // Done: nothing left to decode and every ROB entry has finished.
        if s.ip.is_none_or(|i| i >= instrs.len()) && rob_all_finished(&s.rob) {
            break;
        }

        // Advance one cycle: tick executing entries, then refill budgets.
        for entry in s.rob.iter_mut() {
            if entry.state == RobState::Exe {
                if entry.cycles_left <= 1 {
                    entry.state = RobState::Fin;
                    entry.cycles_left = 0;
                } else {
                    entry.cycles_left -= 1;
                }
            }
        }
        s.cycles += 1;
        s.decode_slots = 4;
        s.dispatch_slots = 5;
        s.exec_units = ExecUnits::RESET;
    }

    s.cycles
}
847
848/// Fast instruction cost lookup using pre-decoded register fields.
849/// Avoids re-parsing code bytes for register extraction.
850#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
851fn instruction_cost_fast(
852    opcode: u8,
853    ra: u8,
854    rb: u8,
855    rd: u8,
856    instr: &crate::predecoded::PreDecodedInst,
857    code: &[u8],
858    bitmask: &[u8],
859) -> InstrCost {
860    let mk = |cy: u32, dc: u8, eu: ExecUnits, dst: RegSet, src: RegSet| -> InstrCost {
861        InstrCost {
862            cycles: cy,
863            decode_slots: dc,
864            exec_units: eu,
865            dest_regs: dst,
866            src_regs: src,
867            is_terminator: false,
868            is_move_reg: false,
869        }
870    };
871    let mkt = |cy: u32, dc: u8, eu: ExecUnits, dst: RegSet, src: RegSet| -> InstrCost {
872        InstrCost {
873            cycles: cy,
874            decode_slots: dc,
875            exec_units: eu,
876            dest_regs: dst,
877            src_regs: src,
878            is_terminator: true,
879            is_move_reg: false,
880        }
881    };
882    let e = RegSet::EMPTY;
883    let r1 = RegSet::one;
884    let r2 = RegSet::two;
885
886    match opcode {
887        0 => mkt(2, 1, ExecUnits::NONE, e, e),
888        1 => mkt(2, 1, ExecUnits::NONE, e, e),
889        2 => mkt(40, 1, ExecUnits::NONE, e, e),
890        10 => mkt(100, 4, ExecUnits::ALU, e, e),
891        40 => mkt(15, 1, ExecUnits::ALU, e, e),
892        80 => mkt(15, 1, ExecUnits::ALU, r1(ra), e),
893        50 => mkt(22, 1, ExecUnits::ALU, e, e),
894        180 => mkt(22, 1, ExecUnits::ALU, r1(ra), r1(rb)),
895        52..=58 => mk(25, 1, ExecUnits::LOAD, r1(ra), r1(rb)),
896        124..=130 => mk(25, 1, ExecUnits::LOAD, r1(ra), r1(rb)),
897        59..=62 => mk(25, 1, ExecUnits::STORE, e, r2(ra, rb)),
898        120..=123 => mk(25, 1, ExecUnits::STORE, e, r2(ra, rb)),
899        30..=33 => mk(25, 1, ExecUnits::STORE, e, e),
900        70..=73 => mk(25, 1, ExecUnits::STORE, e, r1(ra)),
901        51 => mk(1, 1, ExecUnits::NONE, r1(ra), e),
902        20 => mk(1, 2, ExecUnits::NONE, r1(ra), e),
903        100 => InstrCost {
904            cycles: 0,
905            decode_slots: 1,
906            exec_units: ExecUnits::NONE,
907            dest_regs: r1(ra),
908            src_regs: r1(rb),
909            is_terminator: false,
910            is_move_reg: true,
911        },
912        101 => mk(2, 1, ExecUnits::NONE, e, e),
913        81..=90 => {
914            let target = match instr.args {
915                crate::args::Args::RegImmOffset { offset, .. } => offset as usize,
916                _ => instr.pc as usize,
917            };
918            let bc = branch_cost(code, bitmask, target);
919            mkt(bc, 1, ExecUnits::ALU, e, r1(ra))
920        }
921        170..=175 => {
922            let target = match instr.args {
923                crate::args::Args::TwoRegOffset { offset, .. } => offset as usize,
924                _ => instr.pc as usize,
925            };
926            let bc = branch_cost(code, bitmask, target);
927            mkt(bc, 1, ExecUnits::ALU, e, r2(ra, rb))
928        }
929        200 | 201 | 210 | 211 | 212 => {
930            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
931                1
932            } else {
933                2
934            };
935            mk(1, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
936        }
937        190 | 191 => {
938            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
939                2
940            } else {
941                3
942            };
943            mk(2, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
944        }
945        132 | 133 | 134 | 149 | 151 | 152 | 153 | 158 | 110 => {
946            let dc = if dst_overlaps_src(ra, &r1(rb)) { 1 } else { 2 };
947            mk(1, dc, ExecUnits::ALU, r1(ra), r1(rb))
948        }
949        131 | 138 | 139 | 140 | 160 => {
950            let dc = if dst_overlaps_src(ra, &r1(rb)) { 2 } else { 3 };
951            mk(2, dc, ExecUnits::ALU, r1(ra), r1(rb))
952        }
953        102 | 103 | 104 | 105 | 108 | 109 => mk(1, 1, ExecUnits::ALU, r1(ra), r1(rb)),
954        106 | 107 => mk(2, 1, ExecUnits::ALU, r1(ra), r1(rb)),
955        111 => mk(1, 1, ExecUnits::ALU, r1(ra), r1(rb)),
956        207 | 208 | 209 | 220 | 222 => {
957            let dc = if rb == ra { 2 } else { 3 };
958            mk(1, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
959        }
960        197 | 198 | 199 | 221 | 223 => {
961            let dc = if rb == ra { 3 } else { 4 };
962            mk(2, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
963        }
964        155 | 156 | 157 | 159 => mk(1, 3, ExecUnits::ALU, r1(ra), r1(rb)),
965        144 | 145 | 146 | 161 => mk(2, 4, ExecUnits::ALU, r1(ra), r1(rb)),
966        216 | 217 => mk(3, 3, ExecUnits::ALU, r1(ra), r2(rb, rd)),
967        136 | 137 | 142 | 143 => mk(3, 3, ExecUnits::ALU, r1(ra), r1(rb)),
968        218 | 219 => mk(2, 2, ExecUnits::ALU, r1(ra), r2(rb, rd)),
969        147 | 148 => mk(2, 3, ExecUnits::ALU, r1(ra), r1(rb)),
970        227..=230 => {
971            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
972                2
973            } else {
974                3
975            };
976            mk(3, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
977        }
978        224 | 225 => mk(2, 3, ExecUnits::ALU, r1(ra), r2(rb, rd)),
979        226 => {
980            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
981                2
982            } else {
983                3
984            };
985            mk(2, dc, ExecUnits::ALU, r1(ra), r2(rb, rd))
986        }
987        154 => mk(2, 3, ExecUnits::ALU, r1(ra), r1(rb)),
988        141 => mk(3, 4, ExecUnits::ALU, r1(ra), r1(rb)),
989        202 => {
990            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
991                1
992            } else {
993                2
994            };
995            mk(3, dc, ExecUnits::MUL, r1(ra), r2(rb, rd))
996        }
997        150 => {
998            let dc = if dst_overlaps_src(ra, &r1(rb)) { 1 } else { 2 };
999            mk(3, dc, ExecUnits::MUL, r1(ra), r1(rb))
1000        }
1001        192 => {
1002            let dc = if dst_overlaps_src(ra, &r2(rb, rd)) {
1003                2
1004            } else {
1005                3
1006            };
1007            mk(4, dc, ExecUnits::MUL, r1(ra), r2(rb, rd))
1008        }
1009        135 => {
1010            let dc = if dst_overlaps_src(ra, &r1(rb)) { 2 } else { 3 };
1011            mk(4, dc, ExecUnits::MUL, r1(ra), r1(rb))
1012        }
1013        213 | 214 => mk(4, 4, ExecUnits::MUL, r1(ra), r2(rb, rd)),
1014        215 => mk(6, 4, ExecUnits::MUL, r1(ra), r2(rb, rd)),
1015        193 | 194 | 195 | 196 | 203 | 204 | 205 | 206 => {
1016            mk(60, 4, ExecUnits::DIV, r1(ra), r2(rb, rd))
1017        }
1018        _ => mk(1, 1, ExecUnits::NONE, e, e),
1019    }
1020}
1021
1022// (v2 had a 2-arg convenience `compute_block_gas_costs(code, bitmask)`
1023//  here that called `crate::interpreter::compute_gas_block_starts`.
1024//  That's a circular dependency relative to the v3 layering, and the
1025//  function was unused in v2 outside its own definition. The actual
1026//  used implementation is the 4-arg `compute_block_gas_costs` in the
1027//  decode module — which lives below the interpreter, and so doesn't
1028//  introduce the cycle.)
1029
1030// ============================================================================
1031// Fast bitmask-based pipeline simulator (safe Rust, zero heap allocation)
1032// ============================================================================
1033
/// Compact per-instruction cost record consumed by the fast simulator.
///
/// Registers are tracked as 16-bit masks (one bit per register, see
/// `reg_bit`) rather than as lists, so the record is plain `Copy` data.
#[derive(Debug, Default, Clone, Copy)]
pub struct FastCost {
    /// Cycles charged for the instruction (for loads/stores this is the
    /// memory latency handed to the cost functions).
    pub cycles: u8,
    /// Number of decode slots the instruction occupies.
    pub decode_slots: u8,
    /// Execution unit code:
    /// 0=none, 1=alu, 2=load(+alu), 3=store(+alu), 4=mul(+alu), 5=div(+alu)
    pub exec_unit: u8,
    /// Bitmask of the registers the instruction reads.
    pub src_mask: u16,
    /// Bitmask of the registers the instruction writes.
    pub dst_mask: u16,
    /// Set for instructions the cost tables flag as terminators
    /// (traps, jumps, branches, ecalls).
    pub is_terminator: bool,
    /// Set only for `move_reg`, which the cost tables treat specially.
    pub is_move_reg: bool,
}
1046
// Execution-unit codes stored in `FastCost::exec_unit`.

/// No execution unit required.
const EU_NONE: u8 = 0;
/// ALU.
const EU_ALU: u8 = 1;
/// Load unit (+ALU).
const EU_LOAD: u8 = 2;
/// Store unit (+ALU).
const EU_STORE: u8 = 3;
/// Multiplier (+ALU).
const EU_MUL: u8 = 4;
/// Divider (+ALU).
const EU_DIV: u8 = 5;
1053
/// Map a register number to its one-hot bit in a register mask.
///
/// PVM clamps registers to 0-12; raw nibble 13/14/15 all map to register 12,
/// so the result is always one of bits 0 through 12.
#[inline(always)]
fn reg_bit(r: u8) -> u16 {
    const LAST_REG: u8 = 12;
    let index = if r > LAST_REG { LAST_REG } else { r };
    1u16 << index
}
1059
1060/// Extract branch target from raw code bytes (for gas cost computation).
1061/// Works for both OneRegImmOffset and TwoRegOneOffset categories.
1062fn extract_branch_target_raw(code: &[u8], bitmask: &[u8], pc: usize) -> usize {
1063    let skip = {
1064        let mut s = 0;
1065        for j in 0..25 {
1066            let idx = pc + 1 + j;
1067            if idx >= bitmask.len() || bitmask[idx] == 1 {
1068                s = j;
1069                break;
1070            }
1071        }
1072        s
1073    };
1074    let opcode = code[pc];
1075    // For branches, use the existing decode_args to get the offset
1076    let cat = crate::instruction::Opcode::from_byte(opcode)
1077        .map(|o| o.category())
1078        .unwrap_or(crate::instruction::InstructionCategory::NoArgs);
1079    let args = crate::args::decode_args(code, pc, skip, cat);
1080    match args {
1081        crate::args::Args::RegImmOffset { offset, .. } => offset as usize,
1082        crate::args::Args::TwoRegOffset { offset, .. } => offset as usize,
1083        crate::args::Args::Offset { offset } => offset as usize,
1084        _ => pc,
1085    }
1086}
1087
1088/// Compute FastCost from raw register bytes (no Args enum needed).
1089/// For branches, extracts target from raw code bytes.
1090/// Default load/store latency (L2 cache hit baseline).
1091pub const DEFAULT_MEM_CYCLES: u8 = 25;
1092
1093#[allow(clippy::too_many_arguments)]
1094pub fn fast_cost_from_raw(
1095    opcode_byte: u8,
1096    ra: u8,
1097    rb: u8,
1098    rd: u8,
1099    pc: u32,
1100    code: &[u8],
1101    bitmask: &[u8],
1102    mem_cycles: u8,
1103) -> FastCost {
1104    let r1 = |r: u8| reg_bit(r);
1105    let r2 = |a: u8, b: u8| reg_bit(a) | reg_bit(b);
1106    let dst_src_overlap = |dst: u8, s: u16| (reg_bit(dst) & s) != 0;
1107
1108    let opcode = opcode_byte;
1109    match opcode {
1110        // No-arg terminators
1111        0 => FastCost {
1112            cycles: 2,
1113            decode_slots: 1,
1114            exec_unit: EU_NONE,
1115            src_mask: 0,
1116            dst_mask: 0,
1117            is_terminator: true,
1118            is_move_reg: false,
1119        },
1120        1 => FastCost {
1121            cycles: 2,
1122            decode_slots: 1,
1123            exec_unit: EU_NONE,
1124            src_mask: 0,
1125            dst_mask: 0,
1126            is_terminator: true,
1127            is_move_reg: false,
1128        },
1129        2 => FastCost {
1130            cycles: 40,
1131            decode_slots: 1,
1132            exec_unit: EU_NONE,
1133            src_mask: 0,
1134            dst_mask: 0,
1135            is_terminator: true,
1136            is_move_reg: false,
1137        },
1138        // Ecall (opcode 3): no immediate; op in φ[11], refs in φ[12].
1139        // Same cost shape as Ecalli — kernel does the work, the
1140        // PVM-side cost is just the exit dispatch.
1141        3 => FastCost {
1142            cycles: 100,
1143            decode_slots: 4,
1144            exec_unit: EU_ALU,
1145            src_mask: 0,
1146            dst_mask: 0,
1147            is_terminator: true,
1148            is_move_reg: false,
1149        },
1150        10 => FastCost {
1151            cycles: 100,
1152            decode_slots: 4,
1153            exec_unit: EU_ALU,
1154            src_mask: 0,
1155            dst_mask: 0,
1156            is_terminator: true,
1157            is_move_reg: false,
1158        },
1159
1160        // Control flow
1161        40 => FastCost {
1162            cycles: 15,
1163            decode_slots: 1,
1164            exec_unit: EU_ALU,
1165            src_mask: 0,
1166            dst_mask: 0,
1167            is_terminator: true,
1168            is_move_reg: false,
1169        },
1170        80 => FastCost {
1171            cycles: 15,
1172            decode_slots: 1,
1173            exec_unit: EU_ALU,
1174            src_mask: 0,
1175            dst_mask: r1(ra),
1176            is_terminator: true,
1177            is_move_reg: false,
1178        },
1179        50 => FastCost {
1180            cycles: 22,
1181            decode_slots: 1,
1182            exec_unit: EU_ALU,
1183            src_mask: 0,
1184            dst_mask: 0,
1185            is_terminator: true,
1186            is_move_reg: false,
1187        },
1188        180 => FastCost {
1189            cycles: 22,
1190            decode_slots: 1,
1191            exec_unit: EU_ALU,
1192            src_mask: r1(rb),
1193            dst_mask: r1(ra),
1194            is_terminator: true,
1195            is_move_reg: false,
1196        },
1197
1198        // Loads
1199        52..=58 => FastCost {
1200            cycles: mem_cycles,
1201            decode_slots: 1,
1202            exec_unit: EU_LOAD,
1203            src_mask: r1(rb),
1204            dst_mask: r1(ra),
1205            is_terminator: false,
1206            is_move_reg: false,
1207        },
1208        124..=130 => FastCost {
1209            cycles: mem_cycles,
1210            decode_slots: 1,
1211            exec_unit: EU_LOAD,
1212            src_mask: r1(rb),
1213            dst_mask: r1(ra),
1214            is_terminator: false,
1215            is_move_reg: false,
1216        },
1217
1218        // Stores
1219        59..=62 => FastCost {
1220            cycles: mem_cycles,
1221            decode_slots: 1,
1222            exec_unit: EU_STORE,
1223            src_mask: r2(ra, rb),
1224            dst_mask: 0,
1225            is_terminator: false,
1226            is_move_reg: false,
1227        },
1228        120..=123 => FastCost {
1229            cycles: mem_cycles,
1230            decode_slots: 1,
1231            exec_unit: EU_STORE,
1232            src_mask: r2(ra, rb),
1233            dst_mask: 0,
1234            is_terminator: false,
1235            is_move_reg: false,
1236        },
1237        30..=33 => FastCost {
1238            cycles: mem_cycles,
1239            decode_slots: 1,
1240            exec_unit: EU_STORE,
1241            src_mask: 0,
1242            dst_mask: 0,
1243            is_terminator: false,
1244            is_move_reg: false,
1245        },
1246        70..=73 => FastCost {
1247            cycles: mem_cycles,
1248            decode_slots: 1,
1249            exec_unit: EU_STORE,
1250            src_mask: r1(ra),
1251            dst_mask: 0,
1252            is_terminator: false,
1253            is_move_reg: false,
1254        },
1255
1256        // Load immediates
1257        51 => FastCost {
1258            cycles: 1,
1259            decode_slots: 1,
1260            exec_unit: EU_NONE,
1261            src_mask: 0,
1262            dst_mask: r1(ra),
1263            is_terminator: false,
1264            is_move_reg: false,
1265        },
1266        20 => FastCost {
1267            cycles: 1,
1268            decode_slots: 2,
1269            exec_unit: EU_NONE,
1270            src_mask: 0,
1271            dst_mask: r1(ra),
1272            is_terminator: false,
1273            is_move_reg: false,
1274        },
1275
1276        // move_reg — no ROB entry
1277        100 => FastCost {
1278            cycles: 0,
1279            decode_slots: 1,
1280            exec_unit: EU_NONE,
1281            src_mask: r1(rb),
1282            dst_mask: r1(ra),
1283            is_terminator: false,
1284            is_move_reg: true,
1285        },
1286
1287        101 => FastCost {
1288            cycles: 2,
1289            decode_slots: 1,
1290            exec_unit: EU_NONE,
1291            src_mask: 0,
1292            dst_mask: 0,
1293            is_terminator: false,
1294            is_move_reg: false,
1295        },
1296
1297        // Branches (reg+imm+offset)
1298        81..=90 => {
1299            let target = extract_branch_target_raw(code, bitmask, pc as usize);
1300            let bc = branch_cost(code, bitmask, target);
1301            FastCost {
1302                cycles: bc as u8,
1303                decode_slots: 1,
1304                exec_unit: EU_ALU,
1305                src_mask: r1(ra),
1306                dst_mask: 0,
1307                is_terminator: true,
1308                is_move_reg: false,
1309            }
1310        }
1311        // Branches (two-reg+offset)
1312        170..=175 => {
1313            let target = extract_branch_target_raw(code, bitmask, pc as usize);
1314            let bc = branch_cost(code, bitmask, target);
1315            FastCost {
1316                cycles: bc as u8,
1317                decode_slots: 1,
1318                exec_unit: EU_ALU,
1319                src_mask: r2(ra, rb),
1320                dst_mask: 0,
1321                is_terminator: true,
1322                is_move_reg: false,
1323            }
1324        }
1325
1326        // ALU 64-bit 3-reg
1327        200 | 201 | 210 | 211 | 212 => {
1328            let s = r2(rb, rd);
1329            let dc = if dst_src_overlap(ra, s) { 1 } else { 2 };
1330            FastCost {
1331                cycles: 1,
1332                decode_slots: dc,
1333                exec_unit: EU_ALU,
1334                src_mask: s,
1335                dst_mask: r1(ra),
1336                is_terminator: false,
1337                is_move_reg: false,
1338            }
1339        }
1340        // ALU 32-bit 3-reg
1341        190 | 191 => {
1342            let s = r2(rb, rd);
1343            let dc = if dst_src_overlap(ra, s) { 2 } else { 3 };
1344            FastCost {
1345                cycles: 2,
1346                decode_slots: dc,
1347                exec_unit: EU_ALU,
1348                src_mask: s,
1349                dst_mask: r1(ra),
1350                is_terminator: false,
1351                is_move_reg: false,
1352            }
1353        }
1354        // ALU 2-op imm 64-bit
1355        132 | 133 | 134 | 149 | 151 | 152 | 153 | 158 | 110 => {
1356            let dc = if dst_src_overlap(ra, r1(rb)) { 1 } else { 2 };
1357            FastCost {
1358                cycles: 1,
1359                decode_slots: dc,
1360                exec_unit: EU_ALU,
1361                src_mask: r1(rb),
1362                dst_mask: r1(ra),
1363                is_terminator: false,
1364                is_move_reg: false,
1365            }
1366        }
1367        // ALU 2-op imm 32-bit
1368        131 | 138 | 139 | 140 | 160 => {
1369            let dc = if dst_src_overlap(ra, r1(rb)) { 2 } else { 3 };
1370            FastCost {
1371                cycles: 2,
1372                decode_slots: dc,
1373                exec_unit: EU_ALU,
1374                src_mask: r1(rb),
1375                dst_mask: r1(ra),
1376                is_terminator: false,
1377                is_move_reg: false,
1378            }
1379        }
1380        // Trivial 2-op: popcount, clz, sign_extend, zero_extend, reverse_bytes
1381        102 | 103 | 104 | 105 | 108 | 109 | 111 => FastCost {
1382            cycles: 1,
1383            decode_slots: 1,
1384            exec_unit: EU_ALU,
1385            src_mask: r1(rb),
1386            dst_mask: r1(ra),
1387            is_terminator: false,
1388            is_move_reg: false,
1389        },
1390        // ctz
1391        106 | 107 => FastCost {
1392            cycles: 2,
1393            decode_slots: 1,
1394            exec_unit: EU_ALU,
1395            src_mask: r1(rb),
1396            dst_mask: r1(ra),
1397            is_terminator: false,
1398            is_move_reg: false,
1399        },
1400
1401        // Shifts 64-bit 3-reg
1402        207 | 208 | 209 | 220 | 222 => {
1403            let dc = if rb == ra { 2 } else { 3 };
1404            FastCost {
1405                cycles: 1,
1406                decode_slots: dc,
1407                exec_unit: EU_ALU,
1408                src_mask: r2(rb, rd),
1409                dst_mask: r1(ra),
1410                is_terminator: false,
1411                is_move_reg: false,
1412            }
1413        }
1414        // Shifts 32-bit 3-reg
1415        197 | 198 | 199 | 221 | 223 => {
1416            let dc = if rb == ra { 3 } else { 4 };
1417            FastCost {
1418                cycles: 2,
1419                decode_slots: dc,
1420                exec_unit: EU_ALU,
1421                src_mask: r2(rb, rd),
1422                dst_mask: r1(ra),
1423                is_terminator: false,
1424                is_move_reg: false,
1425            }
1426        }
1427        // Shift alt 64-bit
1428        155 | 156 | 157 | 159 => FastCost {
1429            cycles: 1,
1430            decode_slots: 3,
1431            exec_unit: EU_ALU,
1432            src_mask: r1(rb),
1433            dst_mask: r1(ra),
1434            is_terminator: false,
1435            is_move_reg: false,
1436        },
1437        // Shift alt 32-bit
1438        144 | 145 | 146 | 161 => FastCost {
1439            cycles: 2,
1440            decode_slots: 4,
1441            exec_unit: EU_ALU,
1442            src_mask: r1(rb),
1443            dst_mask: r1(ra),
1444            is_terminator: false,
1445            is_move_reg: false,
1446        },
1447
1448        // Comparisons 3-reg
1449        216 | 217 => FastCost {
1450            cycles: 3,
1451            decode_slots: 3,
1452            exec_unit: EU_ALU,
1453            src_mask: r2(rb, rd),
1454            dst_mask: r1(ra),
1455            is_terminator: false,
1456            is_move_reg: false,
1457        },
1458        // Comparisons imm
1459        136 | 137 | 142 | 143 => FastCost {
1460            cycles: 3,
1461            decode_slots: 3,
1462            exec_unit: EU_ALU,
1463            src_mask: r1(rb),
1464            dst_mask: r1(ra),
1465            is_terminator: false,
1466            is_move_reg: false,
1467        },
1468
1469        // Conditional moves 3-reg
1470        218 | 219 => FastCost {
1471            cycles: 2,
1472            decode_slots: 2,
1473            exec_unit: EU_ALU,
1474            src_mask: r2(rb, rd),
1475            dst_mask: r1(ra),
1476            is_terminator: false,
1477            is_move_reg: false,
1478        },
1479        // Conditional moves imm
1480        147 | 148 => FastCost {
1481            cycles: 2,
1482            decode_slots: 3,
1483            exec_unit: EU_ALU,
1484            src_mask: r1(rb),
1485            dst_mask: r1(ra),
1486            is_terminator: false,
1487            is_move_reg: false,
1488        },
1489
1490        // Min/Max
1491        227..=230 => {
1492            let s = r2(rb, rd);
1493            let dc = if dst_src_overlap(ra, s) { 2 } else { 3 };
1494            FastCost {
1495                cycles: 3,
1496                decode_slots: dc,
1497                exec_unit: EU_ALU,
1498                src_mask: s,
1499                dst_mask: r1(ra),
1500                is_terminator: false,
1501                is_move_reg: false,
1502            }
1503        }
1504        // and_inv, or_inv
1505        224 | 225 => FastCost {
1506            cycles: 2,
1507            decode_slots: 3,
1508            exec_unit: EU_ALU,
1509            src_mask: r2(rb, rd),
1510            dst_mask: r1(ra),
1511            is_terminator: false,
1512            is_move_reg: false,
1513        },
1514        // xnor
1515        226 => {
1516            let s = r2(rb, rd);
1517            let dc = if dst_src_overlap(ra, s) { 2 } else { 3 };
1518            FastCost {
1519                cycles: 2,
1520                decode_slots: dc,
1521                exec_unit: EU_ALU,
1522                src_mask: s,
1523                dst_mask: r1(ra),
1524                is_terminator: false,
1525                is_move_reg: false,
1526            }
1527        }
1528        // neg_add_imm
1529        154 => FastCost {
1530            cycles: 2,
1531            decode_slots: 3,
1532            exec_unit: EU_ALU,
1533            src_mask: r1(rb),
1534            dst_mask: r1(ra),
1535            is_terminator: false,
1536            is_move_reg: false,
1537        },
1538        141 => FastCost {
1539            cycles: 3,
1540            decode_slots: 4,
1541            exec_unit: EU_ALU,
1542            src_mask: r1(rb),
1543            dst_mask: r1(ra),
1544            is_terminator: false,
1545            is_move_reg: false,
1546        },
1547
1548        // Multiply 64-bit 3-reg
1549        202 => {
1550            let s = r2(rb, rd);
1551            let dc = if dst_src_overlap(ra, s) { 1 } else { 2 };
1552            FastCost {
1553                cycles: 3,
1554                decode_slots: dc,
1555                exec_unit: EU_MUL,
1556                src_mask: s,
1557                dst_mask: r1(ra),
1558                is_terminator: false,
1559                is_move_reg: false,
1560            }
1561        }
1562        // mul_imm_64
1563        150 => {
1564            let dc = if dst_src_overlap(ra, r1(rb)) { 1 } else { 2 };
1565            FastCost {
1566                cycles: 3,
1567                decode_slots: dc,
1568                exec_unit: EU_MUL,
1569                src_mask: r1(rb),
1570                dst_mask: r1(ra),
1571                is_terminator: false,
1572                is_move_reg: false,
1573            }
1574        }
1575        // Multiply 32-bit 3-reg
1576        192 => {
1577            let s = r2(rb, rd);
1578            let dc = if dst_src_overlap(ra, s) { 2 } else { 3 };
1579            FastCost {
1580                cycles: 4,
1581                decode_slots: dc,
1582                exec_unit: EU_MUL,
1583                src_mask: s,
1584                dst_mask: r1(ra),
1585                is_terminator: false,
1586                is_move_reg: false,
1587            }
1588        }
1589        // mul_imm_32
1590        135 => {
1591            let dc = if dst_src_overlap(ra, r1(rb)) { 2 } else { 3 };
1592            FastCost {
1593                cycles: 4,
1594                decode_slots: dc,
1595                exec_unit: EU_MUL,
1596                src_mask: r1(rb),
1597                dst_mask: r1(ra),
1598                is_terminator: false,
1599                is_move_reg: false,
1600            }
1601        }
1602        // Multiply upper
1603        213 | 214 => FastCost {
1604            cycles: 4,
1605            decode_slots: 4,
1606            exec_unit: EU_MUL,
1607            src_mask: r2(rb, rd),
1608            dst_mask: r1(ra),
1609            is_terminator: false,
1610            is_move_reg: false,
1611        },
1612        215 => FastCost {
1613            cycles: 6,
1614            decode_slots: 4,
1615            exec_unit: EU_MUL,
1616            src_mask: r2(rb, rd),
1617            dst_mask: r1(ra),
1618            is_terminator: false,
1619            is_move_reg: false,
1620        },
1621
1622        // Divide
1623        193 | 194 | 195 | 196 | 203 | 204 | 205 | 206 => FastCost {
1624            cycles: 60,
1625            decode_slots: 4,
1626            exec_unit: EU_DIV,
1627            src_mask: r2(rb, rd),
1628            dst_mask: r1(ra),
1629            is_terminator: false,
1630            is_move_reg: false,
1631        },
1632
1633        // Default
1634        _ => FastCost {
1635            cycles: 1,
1636            decode_slots: 1,
1637            exec_unit: EU_NONE,
1638            src_mask: 0,
1639            dst_mask: 0,
1640            is_terminator: false,
1641            is_move_reg: false,
1642        },
1643    }
1644}
1645
1646/// Compute FastCost using pre-decoded branch target from Args.
1647///
1648/// For non-branch instructions, identical to `fast_cost_from_raw`. For branches,
1649/// avoids the redundant `extract_branch_target_raw` call which re-computes skip
1650/// distances and re-decodes args just to extract the branch offset.
1651#[inline(always)]
1652pub fn fast_cost_from_decoded(
1653    opcode_byte: u8,
1654    args: &crate::args::Args,
1655    pc: u32,
1656    code: &[u8],
1657    bitmask: &[u8],
1658    mem_cycles: u8,
1659) -> FastCost {
1660    use crate::args::Args;
1661
1662    // Use raw byte positions for register fields (same as fast_cost_from_raw).
1663    // The raw nibble positions don't correspond to semantic arg names — the
1664    // mapping varies by instruction format — so we read directly from code[].
1665    let pcu = pc as usize;
1666    let ra = if pcu + 1 < code.len() {
1667        code[pcu + 1] & 0x0F
1668    } else {
1669        0xFF
1670    };
1671    let rb = if pcu + 1 < code.len() {
1672        (code[pcu + 1] >> 4) & 0x0F
1673    } else {
1674        0xFF
1675    };
1676    let rd = if pcu + 2 < code.len() {
1677        code[pcu + 2] & 0x0F
1678    } else {
1679        0xFF
1680    };
1681
1682    // Extract branch target from already-decoded offset (the main optimization:
1683    // avoids extract_branch_target_raw which does skip computation + decode_args)
1684    let branch_target = match args {
1685        Args::RegImmOffset { offset, .. } => *offset as usize,
1686        Args::TwoRegOffset { offset, .. } => *offset as usize,
1687        Args::Offset { offset } => *offset as usize,
1688        _ => pcu,
1689    };
1690
1691    let r1 = |r: u8| reg_bit(r);
1692    let r2 = |a: u8, b: u8| reg_bit(a) | reg_bit(b);
1693    let dst_src_overlap = |dst: u8, s: u16| (reg_bit(dst) & s) != 0;
1694
1695    let opcode = opcode_byte;
1696    match opcode {
1697        // No-arg terminators
1698        0 => FastCost {
1699            cycles: 2,
1700            decode_slots: 1,
1701            exec_unit: EU_NONE,
1702            src_mask: 0,
1703            dst_mask: 0,
1704            is_terminator: true,
1705            is_move_reg: false,
1706        },
1707        1 => FastCost {
1708            cycles: 2,
1709            decode_slots: 1,
1710            exec_unit: EU_NONE,
1711            src_mask: 0,
1712            dst_mask: 0,
1713            is_terminator: true,
1714            is_move_reg: false,
1715        },
1716        2 => FastCost {
1717            cycles: 40,
1718            decode_slots: 1,
1719            exec_unit: EU_NONE,
1720            src_mask: 0,
1721            dst_mask: 0,
1722            is_terminator: true,
1723            is_move_reg: false,
1724        },
1725        // Ecall (opcode 3): no immediate; op in φ[11], refs in φ[12].
1726        // Same cost shape as Ecalli — kernel does the work, the
1727        // PVM-side cost is just the exit dispatch.
1728        3 => FastCost {
1729            cycles: 100,
1730            decode_slots: 4,
1731            exec_unit: EU_ALU,
1732            src_mask: 0,
1733            dst_mask: 0,
1734            is_terminator: true,
1735            is_move_reg: false,
1736        },
1737        10 => FastCost {
1738            cycles: 100,
1739            decode_slots: 4,
1740            exec_unit: EU_ALU,
1741            src_mask: 0,
1742            dst_mask: 0,
1743            is_terminator: true,
1744            is_move_reg: false,
1745        },
1746
1747        // Control flow
1748        40 => FastCost {
1749            cycles: 15,
1750            decode_slots: 1,
1751            exec_unit: EU_ALU,
1752            src_mask: 0,
1753            dst_mask: 0,
1754            is_terminator: true,
1755            is_move_reg: false,
1756        },
1757        80 => FastCost {
1758            cycles: 15,
1759            decode_slots: 1,
1760            exec_unit: EU_ALU,
1761            src_mask: 0,
1762            dst_mask: r1(ra),
1763            is_terminator: true,
1764            is_move_reg: false,
1765        },
1766        50 => FastCost {
1767            cycles: 22,
1768            decode_slots: 1,
1769            exec_unit: EU_ALU,
1770            src_mask: 0,
1771            dst_mask: 0,
1772            is_terminator: true,
1773            is_move_reg: false,
1774        },
1775        180 => FastCost {
1776            cycles: 22,
1777            decode_slots: 1,
1778            exec_unit: EU_ALU,
1779            src_mask: r1(rb),
1780            dst_mask: r1(ra),
1781            is_terminator: true,
1782            is_move_reg: false,
1783        },
1784
1785        // Loads
1786        52..=58 => FastCost {
1787            cycles: mem_cycles,
1788            decode_slots: 1,
1789            exec_unit: EU_LOAD,
1790            src_mask: r1(rb),
1791            dst_mask: r1(ra),
1792            is_terminator: false,
1793            is_move_reg: false,
1794        },
1795        124..=130 => FastCost {
1796            cycles: mem_cycles,
1797            decode_slots: 1,
1798            exec_unit: EU_LOAD,
1799            src_mask: r1(rb),
1800            dst_mask: r1(ra),
1801            is_terminator: false,
1802            is_move_reg: false,
1803        },
1804
1805        // Stores
1806        59..=62 => FastCost {
1807            cycles: mem_cycles,
1808            decode_slots: 1,
1809            exec_unit: EU_STORE,
1810            src_mask: r2(ra, rb),
1811            dst_mask: 0,
1812            is_terminator: false,
1813            is_move_reg: false,
1814        },
1815        120..=123 => FastCost {
1816            cycles: mem_cycles,
1817            decode_slots: 1,
1818            exec_unit: EU_STORE,
1819            src_mask: r2(ra, rb),
1820            dst_mask: 0,
1821            is_terminator: false,
1822            is_move_reg: false,
1823        },
1824        30..=33 => FastCost {
1825            cycles: mem_cycles,
1826            decode_slots: 1,
1827            exec_unit: EU_STORE,
1828            src_mask: 0,
1829            dst_mask: 0,
1830            is_terminator: false,
1831            is_move_reg: false,
1832        },
1833        70..=73 => FastCost {
1834            cycles: mem_cycles,
1835            decode_slots: 1,
1836            exec_unit: EU_STORE,
1837            src_mask: r1(ra),
1838            dst_mask: 0,
1839            is_terminator: false,
1840            is_move_reg: false,
1841        },
1842
1843        // Load immediates
1844        51 => FastCost {
1845            cycles: 1,
1846            decode_slots: 1,
1847            exec_unit: EU_NONE,
1848            src_mask: 0,
1849            dst_mask: r1(ra),
1850            is_terminator: false,
1851            is_move_reg: false,
1852        },
1853        20 => FastCost {
1854            cycles: 1,
1855            decode_slots: 2,
1856            exec_unit: EU_NONE,
1857            src_mask: 0,
1858            dst_mask: r1(ra),
1859            is_terminator: false,
1860            is_move_reg: false,
1861        },
1862
1863        // move_reg — no ROB entry
1864        100 => FastCost {
1865            cycles: 0,
1866            decode_slots: 1,
1867            exec_unit: EU_NONE,
1868            src_mask: r1(rb),
1869            dst_mask: r1(ra),
1870            is_terminator: false,
1871            is_move_reg: true,
1872        },
1873
1874        101 => FastCost {
1875            cycles: 2,
1876            decode_slots: 1,
1877            exec_unit: EU_NONE,
1878            src_mask: 0,
1879            dst_mask: 0,
1880            is_terminator: false,
1881            is_move_reg: false,
1882        },
1883
1884        // Branches (reg+imm+offset) — uses pre-decoded branch target
1885        81..=90 => {
1886            let bc = branch_cost(code, bitmask, branch_target);
1887            FastCost {
1888                cycles: bc as u8,
1889                decode_slots: 1,
1890                exec_unit: EU_ALU,
1891                src_mask: r1(ra),
1892                dst_mask: 0,
1893                is_terminator: true,
1894                is_move_reg: false,
1895            }
1896        }
1897        // Branches (two-reg+offset) — uses pre-decoded branch target
1898        170..=175 => {
1899            let bc = branch_cost(code, bitmask, branch_target);
1900            FastCost {
1901                cycles: bc as u8,
1902                decode_slots: 1,
1903                exec_unit: EU_ALU,
1904                src_mask: r2(ra, rb),
1905                dst_mask: 0,
1906                is_terminator: true,
1907                is_move_reg: false,
1908            }
1909        }
1910
1911        // ALU 64-bit 3-reg
1912        200 | 201 | 210 | 211 | 212 => {
1913            let s = r2(rb, rd);
1914            let dc = if dst_src_overlap(ra, s) { 1 } else { 2 };
1915            FastCost {
1916                cycles: 1,
1917                decode_slots: dc,
1918                exec_unit: EU_ALU,
1919                src_mask: s,
1920                dst_mask: r1(ra),
1921                is_terminator: false,
1922                is_move_reg: false,
1923            }
1924        }
1925        // ALU 32-bit 3-reg
1926        190 | 191 => {
1927            let s = r2(rb, rd);
1928            let dc = if dst_src_overlap(ra, s) { 2 } else { 3 };
1929            FastCost {
1930                cycles: 2,
1931                decode_slots: dc,
1932                exec_unit: EU_ALU,
1933                src_mask: s,
1934                dst_mask: r1(ra),
1935                is_terminator: false,
1936                is_move_reg: false,
1937            }
1938        }
1939        // ALU 2-op imm 64-bit
1940        132 | 133 | 134 | 149 | 151 | 152 | 153 | 158 | 110 => {
1941            let dc = if dst_src_overlap(ra, r1(rb)) { 1 } else { 2 };
1942            FastCost {
1943                cycles: 1,
1944                decode_slots: dc,
1945                exec_unit: EU_ALU,
1946                src_mask: r1(rb),
1947                dst_mask: r1(ra),
1948                is_terminator: false,
1949                is_move_reg: false,
1950            }
1951        }
1952        // ALU 2-op imm 32-bit
1953        131 | 138 | 139 | 140 | 160 => {
1954            let dc = if dst_src_overlap(ra, r1(rb)) { 2 } else { 3 };
1955            FastCost {
1956                cycles: 2,
1957                decode_slots: dc,
1958                exec_unit: EU_ALU,
1959                src_mask: r1(rb),
1960                dst_mask: r1(ra),
1961                is_terminator: false,
1962                is_move_reg: false,
1963            }
1964        }
1965        // Trivial 2-op: popcount, clz, sign_extend, zero_extend, reverse_bytes
1966        102 | 103 | 104 | 105 | 108 | 109 | 111 => FastCost {
1967            cycles: 1,
1968            decode_slots: 1,
1969            exec_unit: EU_ALU,
1970            src_mask: r1(rb),
1971            dst_mask: r1(ra),
1972            is_terminator: false,
1973            is_move_reg: false,
1974        },
1975        // ctz
1976        106 | 107 => FastCost {
1977            cycles: 2,
1978            decode_slots: 1,
1979            exec_unit: EU_ALU,
1980            src_mask: r1(rb),
1981            dst_mask: r1(ra),
1982            is_terminator: false,
1983            is_move_reg: false,
1984        },
1985
1986        // Shifts 64-bit 3-reg
1987        207 | 208 | 209 | 220 | 222 => {
1988            let dc = if rb == ra { 2 } else { 3 };
1989            FastCost {
1990                cycles: 1,
1991                decode_slots: dc,
1992                exec_unit: EU_ALU,
1993                src_mask: r2(rb, rd),
1994                dst_mask: r1(ra),
1995                is_terminator: false,
1996                is_move_reg: false,
1997            }
1998        }
1999        // Shifts 32-bit 3-reg
2000        197 | 198 | 199 | 221 | 223 => {
2001            let dc = if rb == ra { 3 } else { 4 };
2002            FastCost {
2003                cycles: 2,
2004                decode_slots: dc,
2005                exec_unit: EU_ALU,
2006                src_mask: r2(rb, rd),
2007                dst_mask: r1(ra),
2008                is_terminator: false,
2009                is_move_reg: false,
2010            }
2011        }
2012        // Shift alt 64-bit
2013        155 | 156 | 157 | 159 => FastCost {
2014            cycles: 1,
2015            decode_slots: 3,
2016            exec_unit: EU_ALU,
2017            src_mask: r1(rb),
2018            dst_mask: r1(ra),
2019            is_terminator: false,
2020            is_move_reg: false,
2021        },
2022        // Shift alt 32-bit
2023        144 | 145 | 146 | 161 => FastCost {
2024            cycles: 2,
2025            decode_slots: 4,
2026            exec_unit: EU_ALU,
2027            src_mask: r1(rb),
2028            dst_mask: r1(ra),
2029            is_terminator: false,
2030            is_move_reg: false,
2031        },
2032
2033        // Comparisons 3-reg
2034        216 | 217 => FastCost {
2035            cycles: 3,
2036            decode_slots: 3,
2037            exec_unit: EU_ALU,
2038            src_mask: r2(rb, rd),
2039            dst_mask: r1(ra),
2040            is_terminator: false,
2041            is_move_reg: false,
2042        },
2043        // Comparisons imm
2044        136 | 137 | 142 | 143 => FastCost {
2045            cycles: 3,
2046            decode_slots: 3,
2047            exec_unit: EU_ALU,
2048            src_mask: r1(rb),
2049            dst_mask: r1(ra),
2050            is_terminator: false,
2051            is_move_reg: false,
2052        },
2053
2054        // Conditional moves 3-reg
2055        218 | 219 => FastCost {
2056            cycles: 2,
2057            decode_slots: 2,
2058            exec_unit: EU_ALU,
2059            src_mask: r2(rb, rd),
2060            dst_mask: r1(ra),
2061            is_terminator: false,
2062            is_move_reg: false,
2063        },
2064        // Conditional moves imm
2065        147 | 148 => FastCost {
2066            cycles: 2,
2067            decode_slots: 3,
2068            exec_unit: EU_ALU,
2069            src_mask: r1(rb),
2070            dst_mask: r1(ra),
2071            is_terminator: false,
2072            is_move_reg: false,
2073        },
2074
2075        // Min/Max
2076        227..=230 => {
2077            let s = r2(rb, rd);
2078            let dc = if dst_src_overlap(ra, s) { 2 } else { 3 };
2079            FastCost {
2080                cycles: 3,
2081                decode_slots: dc,
2082                exec_unit: EU_ALU,
2083                src_mask: s,
2084                dst_mask: r1(ra),
2085                is_terminator: false,
2086                is_move_reg: false,
2087            }
2088        }
2089        // and_inv, or_inv
2090        224 | 225 => FastCost {
2091            cycles: 2,
2092            decode_slots: 3,
2093            exec_unit: EU_ALU,
2094            src_mask: r2(rb, rd),
2095            dst_mask: r1(ra),
2096            is_terminator: false,
2097            is_move_reg: false,
2098        },
2099        // xnor
2100        226 => {
2101            let s = r2(rb, rd);
2102            let dc = if dst_src_overlap(ra, s) { 2 } else { 3 };
2103            FastCost {
2104                cycles: 2,
2105                decode_slots: dc,
2106                exec_unit: EU_ALU,
2107                src_mask: s,
2108                dst_mask: r1(ra),
2109                is_terminator: false,
2110                is_move_reg: false,
2111            }
2112        }
2113        // neg_add_imm
2114        154 => FastCost {
2115            cycles: 2,
2116            decode_slots: 3,
2117            exec_unit: EU_ALU,
2118            src_mask: r1(rb),
2119            dst_mask: r1(ra),
2120            is_terminator: false,
2121            is_move_reg: false,
2122        },
2123        141 => FastCost {
2124            cycles: 3,
2125            decode_slots: 4,
2126            exec_unit: EU_ALU,
2127            src_mask: r1(rb),
2128            dst_mask: r1(ra),
2129            is_terminator: false,
2130            is_move_reg: false,
2131        },
2132
2133        // Multiply 64-bit 3-reg
2134        202 => {
2135            let s = r2(rb, rd);
2136            let dc = if dst_src_overlap(ra, s) { 1 } else { 2 };
2137            FastCost {
2138                cycles: 3,
2139                decode_slots: dc,
2140                exec_unit: EU_MUL,
2141                src_mask: s,
2142                dst_mask: r1(ra),
2143                is_terminator: false,
2144                is_move_reg: false,
2145            }
2146        }
2147        // mul_imm_64
2148        150 => {
2149            let dc = if dst_src_overlap(ra, r1(rb)) { 1 } else { 2 };
2150            FastCost {
2151                cycles: 3,
2152                decode_slots: dc,
2153                exec_unit: EU_MUL,
2154                src_mask: r1(rb),
2155                dst_mask: r1(ra),
2156                is_terminator: false,
2157                is_move_reg: false,
2158            }
2159        }
2160        // Multiply 32-bit 3-reg
2161        192 => {
2162            let s = r2(rb, rd);
2163            let dc = if dst_src_overlap(ra, s) { 2 } else { 3 };
2164            FastCost {
2165                cycles: 4,
2166                decode_slots: dc,
2167                exec_unit: EU_MUL,
2168                src_mask: s,
2169                dst_mask: r1(ra),
2170                is_terminator: false,
2171                is_move_reg: false,
2172            }
2173        }
2174        // mul_imm_32
2175        135 => {
2176            let dc = if dst_src_overlap(ra, r1(rb)) { 2 } else { 3 };
2177            FastCost {
2178                cycles: 4,
2179                decode_slots: dc,
2180                exec_unit: EU_MUL,
2181                src_mask: r1(rb),
2182                dst_mask: r1(ra),
2183                is_terminator: false,
2184                is_move_reg: false,
2185            }
2186        }
2187        // Multiply upper
2188        213 | 214 => FastCost {
2189            cycles: 4,
2190            decode_slots: 4,
2191            exec_unit: EU_MUL,
2192            src_mask: r2(rb, rd),
2193            dst_mask: r1(ra),
2194            is_terminator: false,
2195            is_move_reg: false,
2196        },
2197        215 => FastCost {
2198            cycles: 6,
2199            decode_slots: 4,
2200            exec_unit: EU_MUL,
2201            src_mask: r2(rb, rd),
2202            dst_mask: r1(ra),
2203            is_terminator: false,
2204            is_move_reg: false,
2205        },
2206
2207        // Divide
2208        193 | 194 | 195 | 196 | 203 | 204 | 205 | 206 => FastCost {
2209            cycles: 60,
2210            decode_slots: 4,
2211            exec_unit: EU_DIV,
2212            src_mask: r2(rb, rd),
2213            dst_mask: r1(ra),
2214            is_terminator: false,
2215            is_move_reg: false,
2216        },
2217
2218        // Default
2219        _ => FastCost {
2220            cycles: 1,
2221            decode_slots: 1,
2222            exec_unit: EU_NONE,
2223            src_mask: 0,
2224            dst_mask: 0,
2225            is_terminator: false,
2226            is_move_reg: false,
2227        },
2228    }
2229}
2230
2231// === Gas cost lookup table ===
2232// Replaces the 256-arm match in fast_cost_from_decoded with a single array
2233// lookup + lightweight mask computation. Eliminates branch-heavy dispatch.
2234
/// Register pattern encoding for the lookup table.
/// Describes which raw register fields contribute to src_mask and dst_mask.
///
/// One entry exists per opcode byte in `GAS_COST_LUT`. Entries hold only
/// static facts; the lookup functions combine them with the instruction's
/// actual `ra`/`rb`/`rd` fields (and, for branches, the branch target) to
/// produce the final cost.
#[derive(Clone, Copy)]
struct GasCostEntry {
    /// Static cycle count. The lookup functions replace it with the caller's
    /// `mem_cycles` when `exec_unit` is `EU_LOAD`/`EU_STORE`, and with
    /// `branch_cost(..)` when a branch flag is set.
    cycles: u8,
    /// Base decode_slots (before overlap adjustment).
    /// `gc_ov` stores 0 here; overlap entries read `overlap_slots` instead.
    decode_slots: u8,
    /// One of the `EU_*` execution-unit identifiers.
    exec_unit: u8,
    /// Source mask pattern: 0=none, 1=ra, 2=rb, 3=ra|rb, 4=rb|rd, 5=ra(store-imm)
    /// The decoders in this file only distinguish 0..=4 (any other value is
    /// treated as "no source"), and no table entry currently uses 5.
    src_pat: u8,
    /// Destination mask pattern: 0=none, 1=ra, 2=rd
    /// Pattern 2 is honoured by `feed_gas_direct` only; `fast_cost_lut_inner`
    /// treats every value other than 1 as "no destination".
    dst_pat: u8,
    /// Bitwise OR of the `F_*` constants: `F_TERM`, `F_MOVE`, `F_BRANCH`,
    /// `F_OVERLAP`, `F_BRANCH2`, `F_SHIFT_OVERLAP`.
    flags: u8, // bit0=terminator, bit1=move_reg, bit2=needs_branch_cost, bit3=overlap_adjust
    /// For overlap_adjust: decode_slots_if_overlap (lower) and decode_slots_no_overlap (upper nibble)
    /// Only meaningful when `F_OVERLAP` is set; `gc` leaves it at 0.
    overlap_slots: u8,
}
2251
// Bit flags stored in `GasCostEntry::flags`. Each constant occupies one bit.

/// The instruction ends the basic block.
const F_TERM: u8 = 1 << 0;
/// The instruction is `move_reg`.
const F_MOVE: u8 = 1 << 1;
/// Reg+imm+offset branch: the cycle count comes from `branch_cost`.
const F_BRANCH: u8 = 1 << 2;
/// Decode slots depend on a register-overlap test (see `overlap_slots`).
const F_OVERLAP: u8 = 1 << 3;
/// Two-reg branch (src=ra|rb): the cycle count comes from `branch_cost`.
const F_BRANCH2: u8 = 1 << 4;
/// Shift: the overlap test is `rb == ra` rather than a dst/src mask intersection.
const F_SHIFT_OVERLAP: u8 = 1 << 5;
2258
2259const fn gc(
2260    cycles: u8,
2261    decode_slots: u8,
2262    exec_unit: u8,
2263    src_pat: u8,
2264    dst_pat: u8,
2265    flags: u8,
2266) -> GasCostEntry {
2267    GasCostEntry {
2268        cycles,
2269        decode_slots,
2270        exec_unit,
2271        src_pat,
2272        dst_pat,
2273        flags,
2274        overlap_slots: 0,
2275    }
2276}
2277const fn gc_ov(
2278    cycles: u8,
2279    overlap_if: u8,
2280    overlap_no: u8,
2281    exec_unit: u8,
2282    src_pat: u8,
2283    dst_pat: u8,
2284    flags: u8,
2285) -> GasCostEntry {
2286    GasCostEntry {
2287        cycles,
2288        decode_slots: 0,
2289        exec_unit,
2290        src_pat,
2291        dst_pat,
2292        flags: flags | F_OVERLAP,
2293        overlap_slots: overlap_if | (overlap_no << 4),
2294    }
2295}
2296
2297static GAS_COST_LUT: [GasCostEntry; 256] = {
2298    let d = gc(1, 1, EU_NONE, 0, 0, 0); // default
2299    let mut t = [d; 256];
2300    // No-arg terminators
2301    t[0] = gc(2, 1, EU_NONE, 0, 0, F_TERM);
2302    t[1] = gc(2, 1, EU_NONE, 0, 0, F_TERM);
2303    t[2] = gc(40, 1, EU_NONE, 0, 0, F_TERM);
2304    // Ecall (opcode 3): no immediate; op in φ[11], refs in φ[12].
2305    // Same fast-path cost shape as Ecalli — kernel handles the work, the
2306    // PVM-side cost is just the exit dispatch.
2307    t[3] = gc(100, 4, EU_ALU, 0, 0, F_TERM);
2308    t[10] = gc(100, 4, EU_ALU, 0, 0, F_TERM);
2309    // Control flow
2310    t[40] = gc(15, 1, EU_ALU, 0, 0, F_TERM);
2311    t[80] = gc(15, 1, EU_ALU, 0, 1, F_TERM); // dst=ra
2312    t[50] = gc(22, 1, EU_ALU, 0, 0, F_TERM);
2313    t[180] = gc(22, 1, EU_ALU, 2, 1, F_TERM); // src=rb, dst=ra
2314    // Loads (src=rb, dst=ra)
2315    let mut i = 52;
2316    while i <= 58 {
2317        t[i] = gc(25, 1, EU_LOAD, 2, 1, 0);
2318        i += 1;
2319    }
2320    i = 124;
2321    while i <= 130 {
2322        t[i] = gc(25, 1, EU_LOAD, 2, 1, 0);
2323        i += 1;
2324    }
2325    // Stores (src=ra|rb, dst=none)
2326    i = 59;
2327    while i <= 62 {
2328        t[i] = gc(25, 1, EU_STORE, 3, 0, 0);
2329        i += 1;
2330    }
2331    i = 120;
2332    while i <= 123 {
2333        t[i] = gc(25, 1, EU_STORE, 3, 0, 0);
2334        i += 1;
2335    }
2336    i = 30;
2337    while i <= 33 {
2338        t[i] = gc(25, 1, EU_STORE, 0, 0, 0);
2339        i += 1;
2340    }
2341    i = 70;
2342    while i <= 73 {
2343        t[i] = gc(25, 1, EU_STORE, 1, 0, 0);
2344        i += 1;
2345    } // src=ra
2346    // Load immediates
2347    t[51] = gc(1, 1, EU_NONE, 0, 1, 0);
2348    t[20] = gc(1, 2, EU_NONE, 0, 1, 0);
2349    // move_reg
2350    t[100] = gc(0, 1, EU_NONE, 2, 1, F_MOVE); // src=rb, dst=ra
2351    t[101] = gc(2, 1, EU_NONE, 0, 0, 0); // nop
2352    // Branches (reg+imm+offset) — needs branch_cost
2353    i = 81;
2354    while i <= 90 {
2355        t[i] = gc(0, 1, EU_ALU, 1, 0, F_TERM | F_BRANCH);
2356        i += 1;
2357    } // src=ra
2358    // Branches (two-reg+offset)
2359    i = 170;
2360    while i <= 175 {
2361        t[i] = gc(0, 1, EU_ALU, 3, 0, F_TERM | F_BRANCH2);
2362        i += 1;
2363    } // src=ra|rb
2364    // ALU 64-bit 3-reg (src=rb|rd, dst=ra, overlap adjust)
2365    t[200] = gc_ov(1, 1, 2, EU_ALU, 4, 1, 0);
2366    t[201] = gc_ov(1, 1, 2, EU_ALU, 4, 1, 0);
2367    t[210] = gc_ov(1, 1, 2, EU_ALU, 4, 1, 0);
2368    t[211] = gc_ov(1, 1, 2, EU_ALU, 4, 1, 0);
2369    t[212] = gc_ov(1, 1, 2, EU_ALU, 4, 1, 0);
2370    // ALU 32-bit 3-reg
2371    t[190] = gc_ov(2, 2, 3, EU_ALU, 4, 1, 0);
2372    t[191] = gc_ov(2, 2, 3, EU_ALU, 4, 1, 0);
2373    // ALU 2-op imm 64-bit (src=rb, dst=ra, overlap adjust)
2374    {
2375        let e = gc_ov(1, 1, 2, EU_ALU, 2, 1, 0);
2376        t[132] = e;
2377        t[133] = e;
2378        t[134] = e;
2379        t[149] = e;
2380        t[151] = e;
2381        t[152] = e;
2382        t[153] = e;
2383        t[158] = e;
2384        t[110] = e;
2385    }
2386    // ALU 2-op imm 32-bit
2387    {
2388        let e = gc_ov(2, 2, 3, EU_ALU, 2, 1, 0);
2389        t[131] = e;
2390        t[138] = e;
2391        t[139] = e;
2392        t[140] = e;
2393        t[160] = e;
2394    }
2395    // Trivial 2-op (src=rb, dst=ra)
2396    {
2397        let e = gc(1, 1, EU_ALU, 2, 1, 0);
2398        t[102] = e;
2399        t[103] = e;
2400        t[104] = e;
2401        t[105] = e;
2402        t[108] = e;
2403        t[109] = e;
2404        t[111] = e;
2405    }
2406    // ctz
2407    t[106] = gc(2, 1, EU_ALU, 2, 1, 0);
2408    t[107] = gc(2, 1, EU_ALU, 2, 1, 0);
2409    // Shifts 64-bit 3-reg (src=rb|rd, dst=ra, shift overlap: rb==ra)
2410    {
2411        let e = gc_ov(1, 2, 3, EU_ALU, 4, 1, F_SHIFT_OVERLAP);
2412        t[207] = e;
2413        t[208] = e;
2414        t[209] = e;
2415        t[220] = e;
2416        t[222] = e;
2417    }
2418    // Shifts 32-bit 3-reg
2419    {
2420        let e = gc_ov(2, 3, 4, EU_ALU, 4, 1, F_SHIFT_OVERLAP);
2421        t[197] = e;
2422        t[198] = e;
2423        t[199] = e;
2424        t[221] = e;
2425        t[223] = e;
2426    }
2427    // Shift alt 64-bit
2428    {
2429        let e = gc(1, 3, EU_ALU, 2, 1, 0);
2430        t[155] = e;
2431        t[156] = e;
2432        t[157] = e;
2433        t[159] = e;
2434    }
2435    // Shift alt 32-bit
2436    {
2437        let e = gc(2, 4, EU_ALU, 2, 1, 0);
2438        t[144] = e;
2439        t[145] = e;
2440        t[146] = e;
2441        t[161] = e;
2442    }
2443    // Comparisons 3-reg (src=rb|rd, dst=ra)
2444    t[216] = gc(3, 3, EU_ALU, 4, 1, 0);
2445    t[217] = gc(3, 3, EU_ALU, 4, 1, 0);
2446    // Comparisons imm (src=rb, dst=ra)
2447    {
2448        let e = gc(3, 3, EU_ALU, 2, 1, 0);
2449        t[136] = e;
2450        t[137] = e;
2451        t[142] = e;
2452        t[143] = e;
2453    }
2454    // Conditional moves 3-reg
2455    t[218] = gc(2, 2, EU_ALU, 4, 1, 0);
2456    t[219] = gc(2, 2, EU_ALU, 4, 1, 0);
2457    // Conditional moves imm
2458    t[147] = gc(2, 3, EU_ALU, 2, 1, 0);
2459    t[148] = gc(2, 3, EU_ALU, 2, 1, 0);
2460    // Min/Max (src=rb|rd, dst=ra, overlap adjust)
2461    {
2462        let e = gc_ov(3, 2, 3, EU_ALU, 4, 1, 0);
2463        t[227] = e;
2464        t[228] = e;
2465        t[229] = e;
2466        t[230] = e;
2467    }
2468    // and_inv, or_inv
2469    t[224] = gc(2, 3, EU_ALU, 4, 1, 0);
2470    t[225] = gc(2, 3, EU_ALU, 4, 1, 0);
2471    // xnor (overlap adjust)
2472    t[226] = gc_ov(2, 2, 3, EU_ALU, 4, 1, 0);
2473    // neg_add_imm
2474    t[154] = gc(2, 3, EU_ALU, 2, 1, 0);
2475    t[141] = gc(3, 4, EU_ALU, 2, 1, 0);
2476    // Multiply 64-bit 3-reg (overlap adjust)
2477    t[202] = gc_ov(3, 1, 2, EU_MUL, 4, 1, 0);
2478    // mul_imm_64
2479    t[150] = gc_ov(3, 1, 2, EU_MUL, 2, 1, 0);
2480    // Multiply 32-bit 3-reg
2481    t[192] = gc_ov(4, 2, 3, EU_MUL, 4, 1, 0);
2482    // mul_imm_32
2483    t[135] = gc_ov(4, 2, 3, EU_MUL, 2, 1, 0);
2484    // Multiply upper
2485    t[213] = gc(4, 4, EU_MUL, 4, 1, 0);
2486    t[214] = gc(4, 4, EU_MUL, 4, 1, 0);
2487    t[215] = gc(6, 4, EU_MUL, 4, 1, 0);
2488    // Divide (src=rb|rd, dst=ra)
2489    {
2490        let e = gc(60, 4, EU_DIV, 4, 1, 0);
2491        t[193] = e;
2492        t[194] = e;
2493        t[195] = e;
2494        t[196] = e;
2495        t[203] = e;
2496        t[204] = e;
2497        t[205] = e;
2498        t[206] = e;
2499    }
2500    t
2501};
2502
2503/// Feed the gas simulator directly from raw register bytes, skipping FastCost
2504/// construction. Returns (is_terminator, is_branch_or_special) — the caller
2505/// uses is_branch_or_special to fall back to the full path for rare cases.
2506#[inline(always)]
2507pub fn feed_gas_direct(
2508    opcode_byte: u8,
2509    ra: u8,
2510    rb: u8,
2511    rd: u8,
2512    gas_sim: &mut crate::gas_sim::GasSimulator,
2513    mem_cycles: u8,
2514) -> (bool, bool) {
2515    let entry = &GAS_COST_LUT[opcode_byte as usize];
2516    let flags = entry.flags;
2517
2518    // Fast path: non-branch, non-overlap, non-move (~90% of instructions).
2519    if flags & (F_BRANCH | F_BRANCH2 | F_OVERLAP | F_MOVE | F_SHIFT_OVERLAP) == 0 {
2520        // Map src_pat to register indices (0xFF = "no source")
2521        let (src1, src2) = match entry.src_pat {
2522            0 => (0xFF, 0xFF),
2523            1 => (ra.min(12), 0xFF),
2524            2 => (rb.min(12), 0xFF),
2525            3 => (ra.min(12), rb.min(12)),
2526            4 => (rb.min(12), rd.min(12)),
2527            _ => (0xFF, 0xFF),
2528        };
2529        let dst = if entry.dst_pat == 1 {
2530            ra.min(12)
2531        } else if entry.dst_pat == 2 {
2532            rd.min(12)
2533        } else {
2534            0xFF
2535        };
2536        // Override cycles for load/store with tier-dependent mem_cycles
2537        let cycles = if entry.exec_unit == EU_LOAD || entry.exec_unit == EU_STORE {
2538            mem_cycles
2539        } else {
2540            entry.cycles
2541        };
2542        gas_sim.feed_direct(cycles, entry.decode_slots, src1, src2, dst);
2543        return (flags & F_TERM != 0, false);
2544    }
2545
2546    // Slow path needed — caller must use the full FastCost path
2547    (flags & F_TERM != 0, true)
2548}
2549
2550/// Compute FastCost via lookup table — replaces the 256-arm match dispatch
2551/// with a single array access + lightweight mask computation.
2552#[inline(always)]
2553pub fn fast_cost_lut(
2554    opcode_byte: u8,
2555    args: &crate::args::Args,
2556    pc: u32,
2557    code: &[u8],
2558    bitmask: &[u8],
2559    mem_cycles: u8,
2560) -> FastCost {
2561    let pcu = pc as usize;
2562    let reg_byte1 = if pcu + 1 < code.len() {
2563        code[pcu + 1]
2564    } else {
2565        0xFF
2566    };
2567    let ra = reg_byte1 & 0x0F;
2568    let rb = (reg_byte1 >> 4) & 0x0F;
2569    let rd = if pcu + 2 < code.len() {
2570        code[pcu + 2] & 0x0F
2571    } else {
2572        0xFF
2573    };
2574
2575    fast_cost_lut_inner(
2576        opcode_byte,
2577        args,
2578        pcu,
2579        code,
2580        bitmask,
2581        ra,
2582        rb,
2583        rd,
2584        mem_cycles,
2585    )
2586}
2587
2588/// Like `fast_cost_lut` but takes pre-extracted register bytes to avoid
2589/// re-reading code[pc+1] and code[pc+2] (already decoded by the caller).
2590#[inline(always)]
2591#[allow(clippy::too_many_arguments)]
2592pub fn fast_cost_lut_regs(
2593    opcode_byte: u8,
2594    args: &crate::args::Args,
2595    pc: usize,
2596    code: &[u8],
2597    bitmask: &[u8],
2598    ra: u8,
2599    rb: u8,
2600    rd: u8,
2601    mem_cycles: u8,
2602) -> FastCost {
2603    fast_cost_lut_inner(opcode_byte, args, pc, code, bitmask, ra, rb, rd, mem_cycles)
2604}
2605
2606/// Inner implementation — separated to allow the compiler to inline the
2607/// caller-side register extraction and keep the complex logic out-of-line.
2608#[inline(always)]
2609#[allow(clippy::too_many_arguments)]
2610fn fast_cost_lut_inner(
2611    opcode_byte: u8,
2612    args: &crate::args::Args,
2613    pcu: usize,
2614    code: &[u8],
2615    bitmask: &[u8],
2616    ra: u8,
2617    rb: u8,
2618    rd: u8,
2619    mem_cycles: u8,
2620) -> FastCost {
2621    use crate::args::Args;
2622
2623    let entry = &GAS_COST_LUT[opcode_byte as usize];
2624    let flags = entry.flags;
2625
2626    // Fast path: most instructions are non-branch, non-overlap.
2627    // Skip the expensive branch cost and overlap calculations.
2628    if flags & (F_BRANCH | F_BRANCH2 | F_OVERLAP) == 0 {
2629        // Compute masks inline (branchless via LUT could be even faster,
2630        // but the match is well-predicted for the common patterns).
2631        let ra_bit = 1u16 << ra.min(12);
2632        let rb_bit = 1u16 << rb.min(12);
2633        let rd_bit = 1u16 << rd.min(12);
2634        let src_mask: u16 = match entry.src_pat {
2635            0 => 0,
2636            1 => ra_bit,
2637            2 => rb_bit,
2638            3 => ra_bit | rb_bit,
2639            4 => rb_bit | rd_bit,
2640            _ => 0,
2641        };
2642        let dst_mask: u16 = if entry.dst_pat == 1 { ra_bit } else { 0 };
2643        let cycles = if entry.exec_unit == EU_LOAD || entry.exec_unit == EU_STORE {
2644            mem_cycles
2645        } else {
2646            entry.cycles
2647        };
2648        return FastCost {
2649            cycles,
2650            decode_slots: entry.decode_slots,
2651            exec_unit: entry.exec_unit,
2652            src_mask,
2653            dst_mask,
2654            is_terminator: flags & F_TERM != 0,
2655            is_move_reg: flags & F_MOVE != 0,
2656        };
2657    }
2658
2659    // Slow path: branch or overlap instructions
2660    let ra_bit = 1u16 << ra.min(12);
2661    let rb_bit = 1u16 << rb.min(12);
2662    let rd_bit = 1u16 << rd.min(12);
2663
2664    let src_mask: u16 = match entry.src_pat {
2665        0 => 0,
2666        1 => ra_bit,
2667        2 => rb_bit,
2668        3 => ra_bit | rb_bit,
2669        4 => rb_bit | rd_bit,
2670        _ => 0,
2671    };
2672    let dst_mask: u16 = if entry.dst_pat == 1 { ra_bit } else { 0 };
2673
2674    let cycles = if flags & (F_BRANCH | F_BRANCH2) != 0 {
2675        let branch_target = match args {
2676            Args::RegImmOffset { offset, .. } => *offset as usize,
2677            Args::TwoRegOffset { offset, .. } => *offset as usize,
2678            Args::Offset { offset } => *offset as usize,
2679            _ => pcu,
2680        };
2681        branch_cost(code, bitmask, branch_target) as u8
2682    } else if entry.exec_unit == EU_LOAD || entry.exec_unit == EU_STORE {
2683        mem_cycles
2684    } else {
2685        entry.cycles
2686    };
2687
2688    let decode_slots = if flags & F_OVERLAP != 0 {
2689        let overlap = if flags & F_SHIFT_OVERLAP != 0 {
2690            rb == ra
2691        } else {
2692            (dst_mask & src_mask) != 0
2693        };
2694        if overlap {
2695            entry.overlap_slots & 0x0F
2696        } else {
2697            entry.overlap_slots >> 4
2698        }
2699    } else {
2700        entry.decode_slots
2701    };
2702
2703    FastCost {
2704        cycles,
2705        decode_slots,
2706        exec_unit: entry.exec_unit,
2707        src_mask,
2708        dst_mask,
2709        is_terminator: flags & F_TERM != 0,
2710        is_move_reg: flags & F_MOVE != 0,
2711    }
2712}
2713
/// Report whether the execution unit `eu` can accept an instruction this cycle.
///
/// `avail` holds the remaining per-cycle capacity in the order
/// alu, load, store, mul, div. `EU_NONE` is always available, `EU_ALU` needs
/// one ALU, every other known unit needs one ALU plus one of its own kind,
/// and an unknown unit id is never available.
#[inline(always)]
#[allow(dead_code)] // used by gas_sim_fast (stashed; returns with recompiler port)
fn eu_available(avail: &[u8; 5], eu: u8) -> bool {
    let free = |idx: usize| avail[idx] > 0;
    match eu {
        EU_NONE => true,
        EU_ALU => free(0),
        EU_LOAD => free(0) && free(1),
        EU_STORE => free(0) && free(2),
        EU_MUL => free(0) && free(3),
        EU_DIV => free(0) && free(4),
        _ => false,
    }
}
2728
/// Take the capacity that `eu` needs out of `avail` for the current cycle.
///
/// `avail` is ordered alu, load, store, mul, div. `EU_ALU` uses one ALU; the
/// load/store/mul/div units use one ALU and one unit of their own kind. Any
/// other id (including `EU_NONE`) leaves `avail` untouched. No availability
/// check is made here — callers are expected to have asked `eu_available`.
#[inline(always)]
#[allow(dead_code)] // used by gas_sim_fast (stashed; returns with recompiler port)
fn eu_consume(avail: &mut [u8; 5], eu: u8) {
    // Index of the dedicated unit consumed in addition to the ALU, if any.
    let dedicated: Option<usize> = match eu {
        EU_ALU => None,
        EU_LOAD => Some(1),
        EU_STORE => Some(2),
        EU_MUL => Some(3),
        EU_DIV => Some(4),
        _ => return,
    };
    avail[0] -= 1;
    if let Some(idx) = dedicated {
        avail[idx] -= 1;
    }
}
2756
2757// === advance_cycle / gas_sim_fast / gas_cost_for_block_fast ===
2758// Cherry-picked from v2. Bitmask-based pipeline simulator used by the
2759// recompiler; reference `crate::predecoded::PreDecodedInst`.
2760
/// Advance every executing entry by one cycle.
///
/// For each bit set in `exe_mask` the matching `cycles_left` counter is
/// decremented (never below zero). Entries whose counter reaches zero are
/// cleared from `exe_mask` and set in `fin_mask`. Entries not in `exe_mask`
/// are left untouched.
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
fn advance_cycle(cycles_left: &mut [u8; 32], exe_mask: &mut u32, fin_mask: &mut u32) {
    let mut pending = *exe_mask;
    let mut retired: u32 = 0;

    while pending != 0 {
        // Isolate and remove the lowest set bit.
        let bit = pending & pending.wrapping_neg();
        pending ^= bit;

        let remaining = &mut cycles_left[bit.trailing_zeros() as usize];
        *remaining = remaining.saturating_sub(1);
        if *remaining == 0 {
            retired |= bit;
        }
    }

    *exe_mask &= !retired;
    *fin_mask |= retired;
}
2776
/// Run the bitmask-based pipeline simulation over one block of pre-decoded
/// instructions and return the number of simulated cycles.
///
/// `_code` and `_bitmask` are only forwarded to `fast_cost_from_raw`. Every
/// load/store is costed with `DEFAULT_MEM_CYCLES`.
///
/// Each iteration of the outer loop is one cycle, made of three phases:
/// decode (fill buffer slots), dispatch (start ready entries), and
/// `advance_cycle` (retire entries whose latency has elapsed).
///
/// NOTE(review): `next_slot` is only ever incremented, so slots are not
/// recycled. Once 32 entries have been allocated nothing more can decode;
/// if instructions are still pending at that point the exit condition cannot
/// be met and the loop appears to run until the 100_000-iteration cap —
/// confirm that block length is bounded upstream.
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
fn gas_sim_fast(
    instrs: &[crate::predecoded::PreDecodedInst],
    _code: &[u8],
    _bitmask: &[u8],
) -> u32 {
    // Per-slot bookkeeping, indexed by buffer slot (bit position in the masks).
    let mut state = [0u8; 32]; // 0=empty, 1=wait, 2=exe, 3=fin
    let mut cycles_left = [0u8; 32];
    let mut exec_unit = [0u8; 32];
    // deps[i]: bitmask of slots that must be finished before slot i can start.
    let mut deps = [0u32; 32];
    // reg_writer[r]: slot of the latest instruction writing register r (0xFF = none).
    let mut reg_writer = [0xFFu8; 16];

    // Slot sets: finished, waiting for dispatch, currently executing.
    let mut fin_mask: u32 = 0;
    let mut wait_mask: u32 = 0;
    let mut exe_mask: u32 = 0;

    let mut next_slot: u8 = 0;
    let mut instr_idx: usize = 0;
    let mut cycles: u32 = 0;
    // Per-cycle budgets; all three are reset at the end of every iteration.
    let mut decode_slots: u8 = 4;
    let mut dispatch_slots: u8 = 5;
    let mut eu_avail: [u8; 5] = [4, 4, 4, 1, 1]; // alu, load, store, mul, div

    // Hard cap on simulated cycles so the loop always terminates.
    for _safety in 0..100_000u32 {
        // --- Decode phase ---
        // Runs while any decode budget is left; an instruction needing more
        // slots than remain still decodes (the budget saturates at zero).
        while instr_idx < instrs.len() && decode_slots > 0 && (next_slot as usize) < 32 {
            let ii = &instrs[instr_idx];
            let cost = fast_cost_from_raw(
                ii.opcode as u8,
                ii.ra,
                ii.rb,
                ii.rd,
                ii.pc,
                _code,
                _bitmask,
                DEFAULT_MEM_CYCLES,
            );

            // move_reg only spends decode slots: no buffer slot is allocated
            // and `reg_writer` is left unchanged.
            if cost.is_move_reg {
                decode_slots = decode_slots.saturating_sub(cost.decode_slots);
                instr_idx = if cost.is_terminator {
                    instrs.len()
                } else {
                    instr_idx + 1
                };
                continue;
            }

            // Depend on the last writer of each source register, unless that
            // writer has already finished.
            let mut dep_mask: u32 = 0;
            let mut src = cost.src_mask;
            while src != 0 {
                let reg = src.trailing_zeros() as usize;
                src &= src - 1;
                let writer = reg_writer[reg];
                if writer != 0xFF && (fin_mask & (1u32 << writer)) == 0 {
                    dep_mask |= 1u32 << writer;
                }
            }

            let slot = next_slot as usize;
            state[slot] = 1; // WAIT
            cycles_left[slot] = cost.cycles;
            exec_unit[slot] = cost.exec_unit;
            deps[slot] = dep_mask;
            wait_mask |= 1u32 << slot;

            // This entry becomes the latest writer of its destination registers.
            let mut dst = cost.dst_mask;
            while dst != 0 {
                let reg = dst.trailing_zeros() as usize;
                dst &= dst - 1;
                reg_writer[reg] = next_slot;
            }

            next_slot += 1;
            decode_slots = decode_slots.saturating_sub(cost.decode_slots);
            // A terminator ends decoding: skip whatever follows it.
            instr_idx = if cost.is_terminator {
                instrs.len()
            } else {
                instr_idx + 1
            };
        }

        // --- Dispatch phase ---
        // Repeatedly start the lowest-numbered waiting slot whose dependencies
        // have all finished and whose execution unit is free, until the
        // dispatch budget is spent or no waiting slot qualifies.
        while dispatch_slots > 0 {
            let mut candidates = wait_mask;
            let mut found = false;
            while candidates != 0 {
                let i = candidates.trailing_zeros() as usize;
                candidates &= candidates - 1;
                if (deps[i] & !fin_mask) == 0 && eu_available(&eu_avail, exec_unit[i]) {
                    eu_consume(&mut eu_avail, exec_unit[i]);
                    state[i] = 2; // EXE
                    wait_mask &= !(1u32 << i);
                    exe_mask |= 1u32 << i;
                    dispatch_slots -= 1;
                    found = true;
                    break;
                }
            }
            if !found {
                break;
            }
        }

        // Done once everything is decoded and nothing is waiting or executing.
        // This check precedes the increment, so the final iteration is not
        // counted as a cycle.
        if instr_idx >= instrs.len() && exe_mask == 0 && wait_mask == 0 {
            break;
        }

        advance_cycle(&mut cycles_left, &mut exe_mask, &mut fin_mask);

        // Start the next cycle with fresh per-cycle budgets.
        cycles += 1;
        decode_slots = 4;
        dispatch_slots = 5;
        eu_avail = [4, 4, 4, 1, 1];
    }

    // `state` is written but never read; this marks it as intentionally unused.
    let _ = state;
    cycles
}
2894
2895/// Fast gas cost computation using bitmask-based pipeline simulator.
2896#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
2897pub fn gas_cost_for_block_fast(
2898    instrs: &[crate::predecoded::PreDecodedInst],
2899    code: &[u8],
2900    bitmask: &[u8],
2901) -> u64 {
2902    let cycles = gas_sim_fast(instrs, code, bitmask);
2903    if cycles > 3 { (cycles - 3) as u64 } else { 1 }
2904}
2905
#[cfg(test)]
mod tests {
    use super::*;
    use crate::gas_sim::GasSimulator;

    /// Gas cost of a single-block program, obtained by feeding the cost
    /// record of each instruction into a fresh `GasSimulator`.
    ///
    /// Starting at offset 0, offsets whose bitmask entry is present and not
    /// equal to 1 are stepped over one byte at a time. Decoding stops after
    /// the first terminator or when the end of `code` is reached.
    fn block_cost(code: &[u8], bitmask: &[u8]) -> u32 {
        let mut sim = GasSimulator::new();
        let mut pc = 0usize;

        while let Some(&opcode_byte) = code.get(pc) {
            // Not an instruction start according to the bitmask: move on.
            if matches!(bitmask.get(pc), Some(&flag) if flag != 1) {
                pc += 1;
                continue;
            }

            // Register fields come from the two bytes after the opcode;
            // 0xFF stands in for a byte that lies past the end of `code`.
            let reg_byte = code.get(pc + 1).copied();
            let raw_ra = reg_byte.map_or(0xFF, |b| b & 0x0F);
            let raw_rb = reg_byte.map_or(0xFF, |b| (b >> 4) & 0x0F);
            let raw_rd = code.get(pc + 2).map_or(0xFF, |&b| b & 0x0F);

            let fc = fast_cost_from_raw(
                opcode_byte,
                raw_ra,
                raw_rb,
                raw_rd,
                pc as u32,
                code,
                bitmask,
                DEFAULT_MEM_CYCLES,
            );
            sim.feed(&fc);
            if fc.is_terminator {
                break;
            }

            let advance = 1 + skip_distance(bitmask, pc);
            pc += advance;
        }

        sim.flush_and_get_cost()
    }

    #[test]
    fn test_single_trap() {
        // A lone trap takes 2 cycles; max(2 - 3, 1) = 1.
        let cost = block_cost(&[0u8], &[1u8]);
        assert_eq!(cost, 1);
    }

    #[test]
    fn test_single_ecalli() {
        // A lone ecalli takes 100 cycles; max(100 - 3, 1) = 97.
        let cost = block_cost(&[10u8, 0], &[1, 0]);
        assert_eq!(cost, 97);
    }

    #[test]
    fn test_single_jump() {
        // A lone jump takes 15 cycles; max(15 - 3, 1) = 12.
        let cost = block_cost(&[40u8, 0], &[1, 0]);
        assert_eq!(cost, 12);
    }

    #[test]
    fn test_single_fallthrough() {
        // A lone fallthrough takes 2 cycles; max(2 - 3, 1) = 1.
        let cost = block_cost(&[1u8], &[1]);
        assert_eq!(cost, 1);
    }

    #[test]
    fn test_load_imm_then_trap() {
        // Two-instruction block: only the minimum-cost floor is checked.
        let cost = block_cost(&[51, 0, 42, 0], &[1, 0, 0, 1]);
        assert!(cost >= 1, "cost should be >= 1, got {}", cost);
    }
}
2986
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    /// Maps a selector in `0..6` to the corresponding `ExecUnits` request
    /// constant. Any other selector is a bug in the calling property.
    fn unit_request(choice: u8) -> ExecUnits {
        match choice {
            0 => ExecUnits::NONE,
            1 => ExecUnits::ALU,
            2 => ExecUnits::LOAD,
            3 => ExecUnits::STORE,
            4 => ExecUnits::MUL,
            5 => ExecUnits::DIV,
            _ => unreachable!(),
        }
    }

    proptest! {
        /// The minimum gas cost is 1: `gas_cost_for_block` never reports less,
        /// whatever bytes make up the block.
        #[test]
        fn gas_cost_always_at_least_one(
            code in proptest::collection::vec(any::<u8>(), 1..64),
        ) {
            // Only the first byte is marked as an instruction start.
            let mut starts = vec![0u8; code.len()];
            starts[0] = 1;
            let gas = gas_cost_for_block(&code, &starts, 0);
            prop_assert!(gas >= 1);
        }

        /// `skip_distance` is capped at 24 for any bitmask and offset.
        #[test]
        fn skip_distance_bounded(
            bitmask in proptest::collection::vec(0u8..=1, 1..64),
            pc in 0usize..63,
        ) {
            let skipped = skip_distance(&bitmask, pc);
            prop_assert!(skipped <= 24);
        }

        /// A freshly reset set of execution units can serve every
        /// unit-type request constant.
        #[test]
        fn reset_satisfies_all_unit_types(choice in 0u8..6) {
            let request = unit_request(choice);
            prop_assert!(ExecUnits::RESET.can_satisfy(request));
        }

        /// Subtracting any unit-type request from `RESET` leaves a value
        /// that still satisfies `NONE`; the subtraction must also complete
        /// without underflowing.
        #[test]
        fn sub_preserves_non_negative(choice in 0u8..6) {
            let request = unit_request(choice);
            let left_over = ExecUnits::RESET.sub(request);
            prop_assert!(left_over.can_satisfy(ExecUnits::NONE));
        }

        /// Two calls to `gas_cost_for_block` with identical inputs agree.
        #[test]
        fn gas_cost_deterministic(
            code in proptest::collection::vec(any::<u8>(), 1..32),
        ) {
            // Only the first byte is marked as an instruction start.
            let mut starts = vec![0u8; code.len()];
            starts[0] = 1;
            let first = gas_cost_for_block(&code, &starts, 0);
            let second = gas_cost_for_block(&code, &starts, 0);
            prop_assert_eq!(first, second);
        }

        /// `reg_bit` yields exactly one set bit for every 4-bit register index.
        #[test]
        fn reg_bit_is_power_of_two(r in 0u8..16) {
            let mask = reg_bit(r);
            prop_assert!(mask.is_power_of_two());
        }

        /// Register indices 13 through 15 map to the same bit as register 12.
        #[test]
        fn reg_bit_clamps_high_registers(r in 13u8..=15) {
            let clamped = reg_bit(12);
            prop_assert_eq!(reg_bit(r), clamped);
        }

        /// `RegSet::contains` agrees with what `RegSet::one` and
        /// `RegSet::two` were built from.
        #[test]
        fn regset_contains_matches_construction(a in 0u8..13, b in 0u8..13) {
            prop_assume!(a != b);

            let pair = RegSet::two(a, b);
            prop_assert!(pair.contains(a));
            prop_assert!(pair.contains(b));

            let lone = RegSet::one(a);
            prop_assert!(lone.contains(a));
            prop_assert!(a == b || !lone.contains(b));
        }
    }
}