Phasor 3.1.1
Stack VM based Programming Language
Loading...
Searching...
No Matches
Native.py
Go to the documentation of this file.
1"""
2phasor.Native
3==============
4Extract bytecode from an ELF / PE / MachO binary.
5"""
6
7from __future__ import annotations
8
9import struct
10from pathlib import Path
11from typing import Optional, Tuple
12
13_MAGIC_BYTES = b"\x50\x48\x53\x42"
14
15def _arch_info(binary) -> Tuple[int, str]:
16 """Detect the pointer width and byte order of a parsed ``lief`` binary.
17
18 Args:
19 binary: A ``lief.Binary`` instance (ELF, PE, or MachO).
20
21 Returns:
22 A ``(pointer_width, endian)`` tuple where *pointer_width* is ``4`` or ``8``
23 bytes and *endian* is ``"<"`` (little-endian) or ``">"`` (big-endian).
24 Falls back to ``(8, "<")`` if detection fails.
25 """
26 try:
27 import lief
28 fmt = binary.format
29 if fmt == lief.Binary.FORMATS.ELF:
30 cls = binary.header.identity_class
31 data = binary.header.identity_data
32 bits = 64 if cls == lief.ELF.ELF_CLASS.CLASS64 else 32
33 end = "<" if data == lief.ELF.ELF_DATA.ELFDATA2LSB else ">"
34 return (8 if bits == 64 else 4), end
35 if fmt == lief.Binary.FORMATS.PE:
36 magic = binary.optional_header.magic
37 bits = 64 if magic == lief.PE.PE_TYPE.PE32_PLUS else 32
38 return (8 if bits == 64 else 4), "<"
39 if fmt == lief.Binary.FORMATS.MACHO:
40 cpu = binary.header.cpu_type
41 bits = 64 if cpu in (
42 lief.MachO.CPU_TYPES.ARM64,
43 lief.MachO.CPU_TYPES.x86_64,
44 ) else 32
45 return (8 if bits == 64 else 4), "<"
46 except Exception:
47 pass
48 return 8, "<"
49
50
51def _find_phasor_section(binary) -> Optional[object]:
52 """Locate the ``.phsb`` or ``phsb`` section in a parsed ``lief`` binary.
53
54 Checks top-level sections first; for MachO binaries it also searches
55 sections nested inside segments.
56
57 Args:
58 binary: A ``lief.Binary`` instance (ELF, PE, or MachO).
59
60 Returns:
61 The matching ``lief`` section object, or ``None`` if not found.
62 """
63 candidates = {".phsb", "phsb"}
64 try:
65 for sec in binary.sections:
66 if sec.name.rstrip("\x00") in candidates:
67 return sec
68 except Exception:
69 pass
70 try:
71 for seg in binary.segments:
72 for sec in seg.sections:
73 if sec.name.rstrip("\x00") in candidates:
74 return sec
75 except Exception:
76 pass
77 return None
78
79
80def _find_all(data: bytes, pattern: bytes) -> list[int]:
81 """Return every byte offset at which *pattern* appears in *data*.
82
83 Args:
84 data: The byte buffer to search.
85 pattern: The byte sequence to look for.
86
87 Returns:
88 A list of integer offsets in ascending order; empty if *pattern* is not found.
89 """
90 offsets, start = [], 0
91 while True:
92 pos = data.find(pattern, start)
93 if pos == -1:
94 break
95 offsets.append(pos)
96 start = pos + 1
97 return offsets
98
99
def _find_payload_size(sec_data: bytes, sz_width: int, endian: str) -> Optional[int]:
    """Guess the bytecode payload length stored inside the ``.phsb`` section.

    Candidate sizes are tried from ``len(sec_data) - sz_width`` down to ``1``.
    For each candidate, every position where its packed encoding occurs in
    *sec_data* is treated as a possible size field. A candidate is accepted
    when either

    * every byte outside that size field is zero, or
    * all non-zero bytes outside the field fit inside some window of exactly
      that many bytes which lies within the section and does not overlap the
      size field.

    The first (and therefore largest) accepted candidate is returned.

    Args:
        sec_data: Raw bytes of the ``.phsb`` binary section.
        sz_width: Width of the size field in bytes (``4`` or ``8``, from
            :func:`_arch_info`). ``8`` selects an unsigned 64-bit encoding,
            any other value an unsigned 32-bit one.
        endian: Struct endian character (``"<"`` or ``">"``, from
            :func:`_arch_info`).

    Returns:
        The detected payload length in bytes, or ``None`` if no candidate
        satisfies the conditions above.
    """
    pack_fmt = endian + ("Q" if sz_width == 8 else "I")
    total = len(sec_data)
    nonzero_at = [idx for idx, byte in enumerate(sec_data) if byte != 0]

    # Largest candidate first; the range is empty when the section is too
    # short to hold a size field plus at least one payload byte.
    for size in range(total - sz_width, 0, -1):
        encoded = struct.pack(pack_fmt, size)

        for field_lo in _find_all(sec_data, encoded):
            field_hi = field_lo + sz_width

            # Non-zero bytes that are not part of the presumed size field.
            rest = [idx for idx in nonzero_at if not field_lo <= idx < field_hi]
            if not rest:
                return size

            first = rest[0]
            past_last = rest[-1] + 1
            if past_last - first > size:
                # The non-zero data is wider than the candidate payload.
                continue

            # Window starts that cover [first, past_last) and stay in bounds.
            earliest = max(0, past_last - size)
            latest = min(first, total - size)
            if any(
                start >= field_hi or start + size <= field_lo
                for start in range(earliest, latest + 1)
            ):
                return size

    return None
147
def extract_phsb_bytes(path: Path) -> bytes:
    """Pull the raw ``.phsb`` bytecode payload out of a native binary.

    The binary at *path* is parsed with ``lief``, its ``.phsb`` section is
    located, and the slice of that section beginning at the first occurrence
    of :data:`_MAGIC_BYTES` and spanning the detected payload size is
    returned.

    Args:
        path: Path to the compiled native binary.

    Returns:
        The raw ``.phsb`` bytes, suitable for passing to
        :meth:`~phasor.Bytecode.Bytecode.from_bytes`.

    Raises:
        ImportError: If the ``lief`` package is not installed.
        FileNotFoundError: If *path* is not an existing regular file.
        RuntimeError: If the binary cannot be parsed, no ``.phsb`` section is
            found, the PHSB magic bytes are absent, or the payload size cannot
            be determined.
    """
    # ``lief`` is imported lazily so the module itself loads without it.
    try:
        import lief
    except ImportError as exc:
        raise ImportError(
            "The 'lief' package is required for native binary extraction.\n"
            "Install it with: pip install lief"
        ) from exc

    binary_path = Path(path)
    if not binary_path.is_file():
        raise FileNotFoundError(f"Binary not found: {binary_path}")

    parsed = lief.parse(str(binary_path))
    if parsed is None:
        raise RuntimeError(f"Could not parse binary: {binary_path}")

    phsb_section = _find_phasor_section(parsed)
    if phsb_section is None:
        raise RuntimeError(f"No '.phsb' section found in {binary_path}")

    width, byte_order = _arch_info(parsed)
    raw = bytes(phsb_section.content)

    start = raw.find(_MAGIC_BYTES)
    if start == -1:
        raise RuntimeError("PHSB magic bytes not found in '.phsb' section")

    payload_len = _find_payload_size(raw, width, byte_order)
    if payload_len is None:
        raise RuntimeError(
            "Could not determine bytecode payload size from '.phsb' section"
        )

    # NOTE(review): slicing clamps at the end of the section, so a size that
    # overshoots yields a shorter payload rather than an error -- confirm
    # that this is acceptable to callers.
    return raw[start : start + payload_len]
Optional[object] _find_phasor_section(binary)
Definition Native.py:51
bytes extract_phsb_bytes(Path path)
Definition Native.py:148
list[int] _find_all(bytes data, bytes pattern)
Definition Native.py:80
Tuple[int, str] _arch_info(binary)
Definition Native.py:15
Optional[int] _find_payload_size(bytes sec_data, int sz_width, str endian)
Definition Native.py:100