import re
import subprocess
import sys
from pathlib import Path
from elftools.elf.elffile import ELFFile
from ._instr import Instruction
# Filled in by parse_elf(): each key is an instruction address (an int parsed
# from the objdump listing) and each value the Instruction built for it.
INSTRUCTIONS: dict[int, Instruction] = {}
"""Dictionary of all instructions based on contents of elf file"""
# Filled in by parse_elf() from the '.riscv.attributes' section. Besides the
# extension names it holds the special key 'base' ('RV64' or 'RV32'); the
# version value is None for an extension listed without a version.
EXTENSIONS: dict[str, str] = {}
"""Dictionary of all available extensions and their version based on properties of elf file"""
def _parse_arch_string_(arch_string : str) -> dict[str, str]:
    """
    Parse a RISC-V architecture string like 'rv64i2p1_m2p0_a2p1...'.

    The result maps the special key ``'base'`` to ``'RV64'`` or ``'RV32'``
    and every upper-cased extension name to its version string (for example
    ``'2p1'``), or to ``None`` when the extension is listed without a
    version. Extension names may contain digits (``zve32x1p0`` yields
    ``'ZVE32X': '1p0'``). Parts that do not look like an extension are
    skipped.

    :param arch_string: Architectural definition string extracted from elf
    :type arch_string: str
    :returns: Dictionary of extensions and versions; empty when the string
        does not start with ``rv32`` or ``rv64``
    :rtype: dict[str, str]
    """
    extensions = {}

    # Only the two known base widths are accepted (prefix is case-sensitive).
    if arch_string.startswith('rv64'):
        extensions['base'] = 'RV64'
    elif arch_string.startswith('rv32'):
        extensions['base'] = 'RV32'
    else:
        return extensions

    # Name: starts and ends with a letter, digits allowed in between
    # (zve32x, zvl128b). Version: optional trailing "<major>p<minor>".
    # Requiring the name to end in a letter is what lets the regex engine
    # split "zve32x1p0" into ("zve32x", "1p0") and "i2p1" into ("i", "2p1").
    ext_pattern = re.compile(
        r'^([a-z](?:[a-z0-9]*[a-z])?)(\d+p\d+)?$', re.IGNORECASE
    )

    # Extensions are separated by underscores after the 4-character base.
    for part in arch_string[4:].split('_'):
        if not part:
            continue
        match = ext_pattern.match(part)
        if not match:
            continue
        ext_name = match.group(1).upper()
        version = match.group(2)  # None when no version suffix is present

        if ext_name == 'G':
            # 'g' is shorthand for imafd + zicsr + zifencei; every implied
            # extension inherits the version given for 'g'.
            # NOTE(review): 'Zicsr'/'Zifencei' are stored mixed-case here,
            # whereas an explicit 'zicsr2p0' part is stored as 'ZICSR'. The
            # keys are kept as-is so existing lookups continue to work.
            for implied in ('I', 'M', 'A', 'F', 'D', 'Zicsr', 'Zifencei'):
                extensions[implied] = version
        else:
            extensions[ext_name] = version
    return extensions
def _parse_riscv_attribes_(data : bytes) -> dict[str, str]:
    """
    Extract and parse the architecture string from raw attribute bytes.

    The architecture string is taken to start at the first occurrence of
    ``b'rv'`` and to run up to the next NUL byte (or the end of the data
    when there is none). It is then handed to ``_parse_arch_string_``.

    :param data: Attributes data extracted from elf file
    :type data: bytes
    :returns: Dictionary of extensions and versions; empty when no
        architecture string is found or it is not valid ASCII
    :rtype: dict[str, str]
    """
    # Find the architecture string (starts with 'rv')
    arch_start = data.find(b'rv')
    if arch_start == -1:
        # An empty dict (not None) keeps the return type uniform, so callers
        # can pass the result to dict.update() or test it for emptiness.
        return {}

    # Find the null terminator after the arch string
    arch_end = data.find(b'\x00', arch_start)
    if arch_end == -1:
        arch_end = len(data)

    try:
        arch_string = data[arch_start:arch_end].decode('ascii')
    except UnicodeDecodeError:
        # The 'rv' match was not a textual architecture string after all.
        return {}
    return _parse_arch_string_(arch_string)
def parse_elf(elfpath : Path, objdump : str = 'riscv64-unknown-elf-objdump') -> None:
    """
    Parse given elf file.

    Reads the extension list from the ``.riscv.attributes`` section into
    ``EXTENSIONS``, disassembles the file with objdump, stores one
    ``Instruction`` per disassembled line in ``INSTRUCTIONS`` (keyed by
    address) and finally links every instruction to the one located at
    ``pc + size`` (or to ``None`` when there is no such entry).

    If objdump cannot be run, a message is printed to stderr and the process
    exits with status 1.

    :param elfpath: path to elf file
    :type elfpath: Path
    :param objdump: objdump executable used for disassembly
    :type objdump: str
    :raises ValueError: if the file is not a RISC-V binary, has no
        ``.riscv.attributes`` section, or its architecture string cannot be
        parsed into a known base (RV32 / RV64)
    """
    # NOTE(review): INSTRUCTIONS and EXTENSIONS are only added to, never
    # cleared, so results of repeated calls accumulate - confirm intended.
    with open(elfpath, "rb") as f:
        elf = ELFFile(f)

        # Check if it's RISC-V
        if elf.header['e_machine'] != 'EM_RISCV':
            raise ValueError (f"Not a RISC-V binary: {elf.header['e_machine']}")

        # Extract Extensions
        attrs_section = elf.get_section_by_name('.riscv.attributes')
        if not attrs_section:
            raise ValueError("No .riscv.attributes section found")
        extensions = _parse_riscv_attribes_(attrs_section.data())

    # The parser signals failure with None or an empty dict; report that
    # explicitly instead of failing later with TypeError / KeyError.
    if not extensions:
        raise ValueError("Could not parse architecture string from .riscv.attributes")

    # Discover mode; validate before touching the module-level dictionary.
    base = extensions.get("base")
    if base not in ("RV64", "RV32"):
        raise ValueError(f"Unknown base architecture {base}")
    EXTENSIONS.update(extensions)

    # Run objdump to extract instructions
    try:
        result = subprocess.run(
            [objdump, '-d', str(elfpath)],
            capture_output=True, text=True, check=True
        )
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        print(f"Could not run objdump: {e}", file=sys.stderr)
        sys.exit(1)

    # Parse: " 2000000342: 0d0075d7 vsetvli a1,zero,e32,m1,ta,ma"
    line_pattern = re.compile(r'\s*([0-9a-f]+):\s+([0-9a-f]+)\s+(.+)')
    for line in result.stdout.splitlines():
        match = line_pattern.match(line)
        if match:
            addr = int(match.group(1), 16)
            bytes_hex = int(match.group(2), 16)
            INSTRUCTIONS[addr] = Instruction(addr, bytes_hex)

    # Link instructions to prev / next
    for instr in INSTRUCTIONS.values():
        instr.link(INSTRUCTIONS.get(instr.pc + instr.size))
# Names exported by a star-import of this module.
__all__ = [
    "INSTRUCTIONS",
    "EXTENSIONS",
    "parse_elf",
]