Skip to main content

javm_transpiler/
elf.rs

//! ELF parsing helpers shared by the PVM2 linker.
//!
//! Reads section headers + relocations from a 64-bit rv64em ELF and
//! returns a `LinkedElf` with the data the linker needs to lay out
//! code/data and resolve relocations.
6
7use crate::TranspileError;
8use std::collections::HashMap;
9
/// RISC-V relocation types we care about.
///
/// Only the relocation kinds listed here are recognised; every other raw
/// relocation number is rejected by [`RelocType::from_raw`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum RelocType {
    /// R_RISCV_32 (1): Absolute 32-bit address
    Abs32,
    /// R_RISCV_64 (2): Absolute 64-bit address
    Abs64,
    /// R_RISCV_CALL_PLT (19): AUIPC+JALR pair for function calls
    CallPlt,
    /// R_RISCV_PCREL_HI20 (23): Upper 20 bits of PC-relative address (AUIPC)
    PcrelHi20,
    /// R_RISCV_PCREL_LO12_I (24): Lower 12 bits, I-type (load/addi)
    PcrelLo12I,
    /// R_RISCV_PCREL_LO12_S (25): Lower 12 bits, S-type (store)
    PcrelLo12S,
    /// R_RISCV_ADD32 (35): Add 32-bit (paired with SUB32 for relative jump tables)
    Add32,
    /// R_RISCV_SUB32 (39): Subtract 32-bit (paired with ADD32 for relative jump tables)
    Sub32,
}

impl RelocType {
    /// Converts the raw relocation type number (the low 32 bits of an
    /// ELF64 `r_info`) into a [`RelocType`].
    ///
    /// Returns `None` for any relocation number that has no variant here.
    fn from_raw(r: u32) -> Option<Self> {
        match r {
            1 => Some(Self::Abs32),
            2 => Some(Self::Abs64),
            19 => Some(Self::CallPlt),
            23 => Some(Self::PcrelHi20),
            24 => Some(Self::PcrelLo12I),
            25 => Some(Self::PcrelLo12S),
            35 => Some(Self::Add32),
            39 => Some(Self::Sub32),
            _ => None,
        }
    }
}
46
/// Parsed ELF with relocation info for linking.
///
/// Addresses stored in the maps and lists below are ELF virtual addresses
/// taken from the section headers, symbol values and relocation entries.
#[derive(Debug)]
pub(crate) struct LinkedElf {
    /// All code sections: (file_offset, vaddr, data)
    pub(crate) code_sections: Vec<(u64, u64, Vec<u8>)>,
    /// RO data blob.
    pub(crate) ro_data: Vec<u8>,
    /// RW data blob.
    pub(crate) rw_data: Vec<u8>,
    /// Stack size in bytes (= ro_base, so RO data is at the right PVM address)
    pub(crate) stack_size: u32,
    /// Heap pages
    pub(crate) heap_pages: u32,
    /// PCREL_HI20: AUIPC instruction vaddr → resolved data address.
    pub(crate) hi20_targets: HashMap<u64, u64>,
    /// PCREL_LO12: instruction vaddr → resolved target address (looked up from paired HI20).
    pub(crate) lo12_targets: HashMap<u64, u64>,
    /// PCREL_LO12: instruction vaddr → its anchor AUIPC (HI20) instruction
    /// vaddr. The LO12's immediate is relative to the *AUIPC's* PC (RISC-V
    /// ABI), so re-encoding a kept code-relative pair after fallthrough
    /// injection needs the anchor's post-injection offset, not the LO12's.
    pub(crate) lo12_to_hi20: HashMap<u64, u64>,
    /// CALL_PLT: AUIPC instruction vaddr → target function RISC-V vaddr.
    pub(crate) call_targets: HashMap<u64, u64>,
    /// Absolute code pointers in data sections: (data_vaddr, target_code_vaddr, entry_size).
    pub(crate) abs_code_ptrs: Vec<(u64, u64, u8)>,
    /// Absolute *data* pointers in data sections: (data_vaddr,
    /// target_data_vaddr, entry_size). The stored value is an ELF data
    /// vaddr (the `[0, extent)` layout); the linker shifts it to the
    /// runtime `[DATA_BASE, …)` mapping.
    pub(crate) abs_data_ptrs: Vec<(u64, u64, u8)>,
    /// SUB32 relocations: (data_vaddr, subtracted_addr).
    pub(crate) sub32_relocs: Vec<(u64, u64)>,
    /// Base RV vaddr that `rw_data[0]` maps to (for relocating absolute
    /// data pointers stored in `.data`).
    pub(crate) rw_base: u64,
    /// Code section address ranges for detecting code pointers, as
    /// half-open `(start, end)` pairs.
    pub(crate) code_ranges: Vec<(u64, u64)>,
}
85
86/// Locate every section header with the given name and return their
87/// bytes, ordered by ELF virtual address. Multiple headers can share a
88/// name when LLD doesn't coalesce input sections.
89pub(crate) fn find_all_section_bytes<'a>(
90    elf_data: &'a [u8],
91    section_name: &str,
92) -> Result<Vec<&'a [u8]>, TranspileError> {
93    if elf_data.len() < 64 || elf_data[0..4] != [0x7F, b'E', b'L', b'F'] {
94        return Err(TranspileError::ElfParse("not an ELF file".into()));
95    }
96    if elf_data[4] != 2 {
97        return Err(TranspileError::ElfParse("only 64-bit ELF supported".into()));
98    }
99    let e_shoff = u64::from_le_bytes(elf_data[40..48].try_into().unwrap()) as usize;
100    let e_shentsize = u16::from_le_bytes(elf_data[58..60].try_into().unwrap()) as usize;
101    let e_shnum = u16::from_le_bytes(elf_data[60..62].try_into().unwrap()) as usize;
102    let e_shstrndx = u16::from_le_bytes(elf_data[62..64].try_into().unwrap()) as usize;
103
104    let strtab = {
105        let sh = e_shoff + e_shstrndx * e_shentsize;
106        let off = u64::from_le_bytes(elf_data[sh + 24..sh + 32].try_into().unwrap()) as usize;
107        let sz = u64::from_le_bytes(elf_data[sh + 32..sh + 40].try_into().unwrap()) as usize;
108        &elf_data[off..off + sz]
109    };
110
111    let mut hits: Vec<(u64, &[u8])> = Vec::new();
112    for i in 0..e_shnum {
113        let sh = e_shoff + i * e_shentsize;
114        if sh + e_shentsize > elf_data.len() {
115            break;
116        }
117        let name_off = u32::from_le_bytes(elf_data[sh..sh + 4].try_into().unwrap()) as usize;
118        let addr = u64::from_le_bytes(elf_data[sh + 16..sh + 24].try_into().unwrap());
119        let file_off = u64::from_le_bytes(elf_data[sh + 24..sh + 32].try_into().unwrap()) as usize;
120        let size = u64::from_le_bytes(elf_data[sh + 32..sh + 40].try_into().unwrap()) as usize;
121        let name = if name_off < strtab.len() {
122            let end = strtab[name_off..].iter().position(|&b| b == 0).unwrap_or(0);
123            std::str::from_utf8(&strtab[name_off..name_off + end]).unwrap_or("")
124        } else {
125            ""
126        };
127        if name == section_name && file_off + size <= elf_data.len() {
128            hits.push((addr, &elf_data[file_off..file_off + size]));
129        }
130    }
131    hits.sort_by_key(|&(addr, _)| addr);
132    Ok(hits.into_iter().map(|(_, bytes)| bytes).collect())
133}
134
135/// Parse ELF with full relocation info.
136pub(crate) fn parse_linked_elf(data: &[u8]) -> Result<LinkedElf, TranspileError> {
137    if data.len() < 64 || data[0..4] != [0x7F, b'E', b'L', b'F'] {
138        return Err(TranspileError::ElfParse("not an ELF file".into()));
139    }
140
141    match data[4] {
142        2 => {}
143        1 => {
144            return Err(TranspileError::ElfParse(
145                "linker requires 64-bit ELF (rv64em)".into(),
146            ));
147        }
148        _ => return Err(TranspileError::ElfParse("unsupported ELF class".into())),
149    }
150
151    // ELF64 header fields
152    let e_shoff = u64::from_le_bytes(data[40..48].try_into().unwrap()) as usize;
153    let e_shentsize = u16::from_le_bytes(data[58..60].try_into().unwrap()) as usize;
154    let e_shnum = u16::from_le_bytes(data[60..62].try_into().unwrap()) as usize;
155    let e_shstrndx = u16::from_le_bytes(data[62..64].try_into().unwrap()) as usize;
156
157    // Section name string table
158    let strtab = {
159        let sh = e_shoff + e_shstrndx * e_shentsize;
160        let off = u64::from_le_bytes(data[sh + 24..sh + 32].try_into().unwrap()) as usize;
161        let sz = u64::from_le_bytes(data[sh + 32..sh + 40].try_into().unwrap()) as usize;
162        &data[off..off + sz]
163    };
164
165    let get_name = |name_off: usize| -> &str {
166        if name_off >= strtab.len() {
167            return "";
168        }
169        let end = strtab[name_off..].iter().position(|&b| b == 0).unwrap_or(0);
170        std::str::from_utf8(&strtab[name_off..name_off + end]).unwrap_or("")
171    };
172
173    // First pass: collect section metadata
174    struct SectionInfo {
175        name_off: usize,
176        sh_type: u32,
177        flags: u64,
178        addr: u64,
179        file_off: usize,
180        size: usize,
181        link: usize,
182        _info: usize,
183    }
184
185    let mut sections = Vec::with_capacity(e_shnum);
186    for i in 0..e_shnum {
187        let sh = e_shoff + i * e_shentsize;
188        if sh + e_shentsize > data.len() {
189            break;
190        }
191        sections.push(SectionInfo {
192            name_off: u32::from_le_bytes(data[sh..sh + 4].try_into().unwrap()) as usize,
193            sh_type: u32::from_le_bytes(data[sh + 4..sh + 8].try_into().unwrap()),
194            flags: u64::from_le_bytes(data[sh + 8..sh + 16].try_into().unwrap()),
195            addr: u64::from_le_bytes(data[sh + 16..sh + 24].try_into().unwrap()),
196            file_off: u64::from_le_bytes(data[sh + 24..sh + 32].try_into().unwrap()) as usize,
197            size: u64::from_le_bytes(data[sh + 32..sh + 40].try_into().unwrap()) as usize,
198            link: u32::from_le_bytes(data[sh + 40..sh + 44].try_into().unwrap()) as usize,
199            _info: u32::from_le_bytes(data[sh + 44..sh + 48].try_into().unwrap()) as usize,
200        });
201    }
202
203    // Collect code sections, ro sections, rw sections
204    let mut code_sections = Vec::new();
205    let mut ro_sections: Vec<(u64, usize, Vec<u8>)> = Vec::new();
206    let mut rw_sections: Vec<(u64, usize, Option<Vec<u8>>)> = Vec::new();
207    let mut rela_section_indices = Vec::new();
208    let mut symtab_idx = None;
209
210    for (i, s) in sections.iter().enumerate() {
211        let name = get_name(s.name_off);
212        let is_alloc = s.flags & 2 != 0;
213        let is_exec = s.flags & 4 != 0;
214        let is_write = s.flags & 1 != 0;
215
216        if s.sh_type == 2 {
217            // SYMTAB
218            symtab_idx = Some(i);
219        }
220        if s.sh_type == 4 {
221            // RELA
222            rela_section_indices.push(i);
223        }
224        if !is_alloc || s.sh_type == 0 {
225            continue;
226        }
227
228        if is_exec && s.file_off + s.size <= data.len() {
229            code_sections.push((
230                s.file_off as u64,
231                s.addr,
232                data[s.file_off..s.file_off + s.size].to_vec(),
233            ));
234        } else if !is_exec
235            && (name.starts_with(".rodata")
236                || name == ".srodata"
237                || name.starts_with(".data.rel.ro"))
238        {
239            if s.file_off + s.size <= data.len() {
240                ro_sections.push((
241                    s.addr,
242                    s.size,
243                    data[s.file_off..s.file_off + s.size].to_vec(),
244                ));
245            }
246        } else if is_write {
247            if s.sh_type == 8 {
248                // NOBITS (.bss)
249                rw_sections.push((s.addr, s.size, None));
250            } else if s.file_off + s.size <= data.len() {
251                rw_sections.push((
252                    s.addr,
253                    s.size,
254                    Some(data[s.file_off..s.file_off + s.size].to_vec()),
255                ));
256            }
257        }
258    }
259
260    // Parse symbol table
261    let mut symbols_by_idx: Vec<(String, u64)> = Vec::new();
262    if let Some(si) = symtab_idx {
263        let s = &sections[si];
264        let sym_strtab = {
265            let ss = &sections[s.link];
266            &data[ss.file_off..ss.file_off + ss.size]
267        };
268        let count = s.size / 24;
269        for j in 0..count {
270            let off = s.file_off + j * 24;
271            if off + 24 > data.len() {
272                break;
273            }
274            let st_name = u32::from_le_bytes(data[off..off + 4].try_into().unwrap()) as usize;
275            let st_value = u64::from_le_bytes(data[off + 8..off + 16].try_into().unwrap());
276
277            let name = {
278                if st_name < sym_strtab.len() {
279                    let end = sym_strtab[st_name..]
280                        .iter()
281                        .position(|&b| b == 0)
282                        .unwrap_or(0);
283                    std::str::from_utf8(&sym_strtab[st_name..st_name + end]).unwrap_or("")
284                } else {
285                    ""
286                }
287            };
288
289            symbols_by_idx.push((name.to_string(), st_value));
290        }
291    }
292
293    // Compute PVM memory layout
294    let ro_min = ro_sections.iter().map(|(a, _, _)| *a).min().unwrap_or(0);
295    let ro_max = ro_sections
296        .iter()
297        .map(|(a, sz, _)| *a + *sz as u64)
298        .max()
299        .unwrap_or(0);
300
301    let page_size: u64 = 4096;
302    let stack_size = if ro_min > 0 {
303        (ro_min / page_size) * page_size
304    } else {
305        4 * page_size
306    };
307
308    let ro_blob_size = if ro_max > stack_size {
309        (ro_max - stack_size) as usize
310    } else {
311        0
312    };
313    let mut ro_data = vec![0u8; ro_blob_size];
314    for (addr, sz, d) in &ro_sections {
315        let off = (*addr - stack_size) as usize;
316        if off + sz <= ro_data.len() {
317            ro_data[off..off + sz].copy_from_slice(d);
318        }
319    }
320
321    let ro_pages = ro_data.len().div_ceil(page_size as usize);
322    let rw_pvm_base = stack_size + (ro_pages as u64 * page_size);
323    let mut rw_data = Vec::new();
324    // Base RV vaddr that `rw_data[0]` corresponds to (for relocating
325    // absolute data pointers stored in `.data`).
326    let mut rw_base = rw_pvm_base;
327    if !rw_sections.is_empty() {
328        let rw_min = rw_sections.iter().map(|(a, _, _)| *a).min().unwrap();
329        let rw_max = rw_sections
330            .iter()
331            .map(|(a, sz, _)| *a + *sz as u64)
332            .max()
333            .unwrap();
334        rw_base = rw_pvm_base.min(rw_min);
335        let rw_blob_size = (rw_max - rw_base) as usize;
336        rw_data = vec![0u8; rw_blob_size];
337        for (addr, sz, d) in &rw_sections {
338            let off = (*addr - rw_base) as usize;
339            if let Some(d) = d
340                && off + sz <= rw_data.len()
341            {
342                rw_data[off..off + sz].copy_from_slice(d);
343            }
344        }
345    }
346
347    let mut hi20_targets: HashMap<u64, u64> = HashMap::new();
348    let mut lo12_targets: HashMap<u64, u64> = HashMap::new();
349    let mut lo12_to_hi20: HashMap<u64, u64> = HashMap::new();
350    let mut call_targets: HashMap<u64, u64> = HashMap::new();
351
352    let mut lo12_entries: Vec<(u64, u64)> = Vec::new();
353    let mut abs64_relocs: Vec<(u64, u64, u8)> = Vec::new();
354    let mut abs_data_relocs: Vec<(u64, u64, u8)> = Vec::new();
355    let mut sub32_relocs: Vec<(u64, u64)> = Vec::new();
356    let code_ranges: Vec<(u64, u64)> = code_sections
357        .iter()
358        .map(|(_, vaddr, data)| (*vaddr, *vaddr + data.len() as u64))
359        .collect();
360
361    for &ri in &rela_section_indices {
362        let rs = &sections[ri];
363        let count = rs.size / 24;
364        for j in 0..count {
365            let off = rs.file_off + j * 24;
366            if off + 24 > data.len() {
367                break;
368            }
369            let r_offset = u64::from_le_bytes(data[off..off + 8].try_into().unwrap());
370            let r_info = u64::from_le_bytes(data[off + 8..off + 16].try_into().unwrap());
371            let r_addend = i64::from_le_bytes(data[off + 16..off + 24].try_into().unwrap());
372            let r_type = (r_info & 0xFFFFFFFF) as u32;
373            let r_sym = (r_info >> 32) as usize;
374
375            let rtype = match RelocType::from_raw(r_type) {
376                Some(t) => t,
377                None => continue,
378            };
379
380            let sym_value = if r_sym < symbols_by_idx.len() {
381                symbols_by_idx[r_sym].1
382            } else {
383                0
384            };
385
386            let target_addr = (sym_value as i64 + r_addend) as u64;
387
388            match rtype {
389                RelocType::Abs32 => {
390                    let is_code_ptr = code_ranges
391                        .iter()
392                        .any(|(lo, hi)| target_addr >= *lo && target_addr < *hi);
393                    if is_code_ptr {
394                        abs64_relocs.push((r_offset, target_addr, 4));
395                    } else {
396                        abs_data_relocs.push((r_offset, target_addr, 4));
397                    }
398                }
399                RelocType::Abs64 => {
400                    let is_code_ptr = code_ranges
401                        .iter()
402                        .any(|(lo, hi)| target_addr >= *lo && target_addr < *hi);
403                    if is_code_ptr {
404                        abs64_relocs.push((r_offset, target_addr, 8));
405                    } else {
406                        abs_data_relocs.push((r_offset, target_addr, 8));
407                    }
408                }
409                RelocType::Add32 => {
410                    let is_code_ptr = code_ranges
411                        .iter()
412                        .any(|(lo, hi)| target_addr >= *lo && target_addr < *hi);
413                    if is_code_ptr {
414                        abs64_relocs.push((r_offset, target_addr, 4));
415                    }
416                }
417                RelocType::Sub32 => {
418                    sub32_relocs.push((r_offset, target_addr));
419                }
420                RelocType::CallPlt => {
421                    call_targets.insert(r_offset, target_addr);
422                }
423                RelocType::PcrelHi20 => {
424                    hi20_targets.insert(r_offset, target_addr);
425                }
426                RelocType::PcrelLo12I | RelocType::PcrelLo12S => {
427                    lo12_entries.push((r_offset, sym_value));
428                }
429            }
430        }
431    }
432
433    for (lo12_addr, hi20_addr) in lo12_entries {
434        if let Some(&data_addr) = hi20_targets.get(&hi20_addr) {
435            lo12_targets.insert(lo12_addr, data_addr);
436            lo12_to_hi20.insert(lo12_addr, hi20_addr);
437        }
438    }
439
440    let heap_pages = 16u32; // 64KB heap
441
442    Ok(LinkedElf {
443        code_sections,
444        ro_data,
445        rw_data,
446        stack_size: stack_size as u32,
447        heap_pages,
448        hi20_targets,
449        lo12_targets,
450        lo12_to_hi20,
451        call_targets,
452        abs_code_ptrs: abs64_relocs,
453        abs_data_ptrs: abs_data_relocs,
454        sub32_relocs,
455        rw_base,
456        code_ranges,
457    })
458}