Update asm processor and migrate/improve rodata for a few files (#209)

* Update asm processor * Migrate or improve rodata for a few files
2025-08-24 07:51:48 +00:00 · 2020-06-15 01:24:09 +02:00 · 2020-06-15 01:24:09 +02:00 · 229e0c8629
commit 229e0c8629
parent 1c98ac27eb
37 changed files with 964 additions and 870 deletions
--- a/tools/asm_processor/asm_processor.py
+++ b/tools/asm_processor/asm_processor.py
@ -10,6 +10,7 @@ from collections import namedtuple
 from io import StringIO

 MAX_FN_SIZE = 100
+SLOW_CHECKS = False

 EI_NIDENT     = 16
 EI_CLASS      = 4
@ -348,6 +349,20 @@ def is_temp_name(name):
    return name.startswith('_asmpp_')


+# https://stackoverflow.com/a/241506
+def re_comment_replacer(match):
+    s = match.group(0)
+    if s[0] in "/#":
+        return " "
+    else:
+        return s
+
+
+re_comment_or_string = re.compile(
+    r'#.*|/\*.*?\*/|"(?:\\.|[^\\"])*"'
+)
+
+
 class Failure(Exception):
    def __init__(self, message):
        self.message = message
@ -357,13 +372,14 @@ class Failure(Exception):


 class GlobalState:
-    def __init__(self, min_instr_count, skip_instr_count):
+    def __init__(self, min_instr_count, skip_instr_count, use_jtbl_for_rodata):
        # A value that hopefully never appears as a 32-bit rodata constant (or we
        # miscompile late rodata). Increases by 1 in each step.
        self.late_rodata_hex = 0xE0123456
        self.namectr = 0
        self.min_instr_count = min_instr_count
        self.skip_instr_count = skip_instr_count
+        self.use_jtbl_for_rodata = use_jtbl_for_rodata

    def next_late_rodata_hex(self):
        dummy_bytes = struct.pack('>I', self.late_rodata_hex)
@ -378,7 +394,7 @@ class GlobalState:
        return '_asmpp_{}{}'.format(cat, self.namectr)


-Function = namedtuple('Function', ['text_glabels', 'asm_conts', 'late_rodata_dummy_bytes', 'late_rodata_asm_conts', 'fn_desc', 'data'])
+Function = namedtuple('Function', ['text_glabels', 'asm_conts', 'late_rodata_dummy_bytes', 'jtbl_rodata_size', 'late_rodata_asm_conts', 'fn_desc', 'data'])


 class GlobalAsmBlock:
@ -476,8 +492,7 @@ class GlobalAsmBlock:
        self.glued_line = ''

        real_line = line
-        line = re.sub(r'/\*.*?\*/', '', line)
-        line = re.sub(r'#.*', '', line)
+        line = re.sub(re_comment_or_string, re_comment_replacer, line)
        line = line.strip()
        line = re.sub(r'^[a-zA-Z0-9_]+:\s*', '', line)
        changed_section = False
@ -536,6 +551,8 @@ class GlobalAsmBlock:
        elif line.startswith('.asci'):
            z = (line.startswith('.asciz') or line.startswith('.asciiz'))
            self.add_sized(self.count_quoted_size(line, z, real_line, output_enc), real_line)
+        elif line.startswith('.byte'):
+            self.add_sized(len(line.split(',')), real_line)
        elif line.startswith('.'):
            # .macro, ...
            self.fail("asm directive not supported", real_line)
@ -564,8 +581,11 @@ class GlobalAsmBlock:
    def finish(self, state):
        src = [''] * (self.num_lines + 1)
        late_rodata_dummy_bytes = []
+        jtbl_rodata_size = 0
        late_rodata_fn_output = []

+        num_instr = self.fn_section_sizes['.text'] // 4
+
        if self.fn_section_sizes['.late_rodata'] > 0:
            # Generate late rodata by emitting unique float constants.
            # This requires 3 instructions for each 4 bytes of rodata.
@ -573,10 +593,29 @@ class GlobalAsmBlock:
            # instructions for 8 bytes of rodata.
            size = self.fn_section_sizes['.late_rodata'] // 4
            skip_next = False
+            needs_double = (self.late_rodata_alignment != 0)
            for i in range(size):
                if skip_next:
                    skip_next = False
                    continue
+                # Jump tables give 9 instructions for >= 5 words of rodata, and should be
+                # emitted when:
+                # - -O2 or -O2 -g3 are used, which give the right codegen
+                # - we have emitted our first .float/.double (to ensure that we find the
+                #   created rodata in the binary)
+                # - we have emitted our first .double, if any (to ensure alignment of doubles
+                #   in shifted rodata sections)
+                # - we have at least 5 words of rodata left to emit (otherwise IDO does not
+                #   generate a jump table)
+                # - we have at least 10 more instructions to go in this function (otherwise our
+                #   function size computation will be wrong since the delay slot goes unused)
+                if (not needs_double and state.use_jtbl_for_rodata and i >= 1 and
+                        size - i >= 5 and num_instr - len(late_rodata_fn_output) >= 10):
+                    cases = " ".join("case {}:".format(case) for case in range(size - i))
+                    late_rodata_fn_output.append("switch (*(volatile int*)0) { " + cases + " ; }")
+                    late_rodata_fn_output.extend([""] * 8)
+                    jtbl_rodata_size = (size - i) * 4
+                    break
                dummy_bytes = state.next_late_rodata_hex()
                late_rodata_dummy_bytes.append(dummy_bytes)
                if self.late_rodata_alignment == 4 * ((i + 1) % 2 + 1) and i + 1 < size:
@ -585,6 +624,7 @@ class GlobalAsmBlock:
                    fval, = struct.unpack('>d', dummy_bytes + dummy_bytes2)
                    late_rodata_fn_output.append('*(volatile double*)0 = {};'.format(fval))
                    skip_next = True
+                    needs_double = True
                else:
                    fval, = struct.unpack('>f', dummy_bytes)
                    late_rodata_fn_output.append('*(volatile float*)0 = {}f;'.format(fval))
@ -651,6 +691,7 @@ class GlobalAsmBlock:
                text_glabels=self.text_glabels,
                asm_conts=self.asm_conts,
                late_rodata_dummy_bytes=late_rodata_dummy_bytes,
+                jtbl_rodata_size=jtbl_rodata_size,
                late_rodata_asm_conts=self.late_rodata_asm_conts,
                fn_desc=self.fn_desc,
                data={
@ -661,6 +702,9 @@ class GlobalAsmBlock:
                })
        return src, fn

+cutscene_data_regexpr = re.compile(r"CutsceneData (.|\n)*\[\] = {")
+float_regexpr = re.compile(r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?f")
+
 def repl_float_hex(m):
    return str(struct.unpack(">I", struct.pack(">f", float(m.group(0).strip().rstrip("f"))))[0])

@ -689,13 +733,18 @@ def parse_source(f, opt, framepointer, input_enc, output_enc, print_source=None)
            min_instr_count = 2
            skip_instr_count = 2

-    state = GlobalState(min_instr_count, skip_instr_count)
+    use_jtbl_for_rodata = False
+    if opt in ['O2', 'g3'] and not framepointer:
+        use_jtbl_for_rodata = True
+
+    state = GlobalState(min_instr_count, skip_instr_count, use_jtbl_for_rodata)

    global_asm = None
-    is_cutscene_data = False
    asm_functions = []
    output_lines = []

+    is_cutscene_data = False
+
    for line_no, raw_line in enumerate(f, 1):
        raw_line = raw_line.rstrip()
        line = raw_line.lstrip()
@ -730,6 +779,8 @@ def parse_source(f, opt, framepointer, input_enc, output_enc, print_source=None)
                asm_functions.append(fn)
                global_asm = None
            elif ((line.startswith('#include "')) and line.endswith('" EARLY')):
+                # C includes qualified with EARLY (i.e. #include "file.c" EARLY) will be
+                # processed recursively when encountered
                fpath = os.path.dirname(f.name)
                fname = line[line.index(' ') + 2 : -7]
                include_src = StringIO()
@ -739,12 +790,14 @@ def parse_source(f, opt, framepointer, input_enc, output_enc, print_source=None)
                include_src.write('#line ' + str(line_no) + '\n')
                include_src.close()
            else:
-                if re.compile(r"(CutsceneData (.|\n)*\[\] = {)").search(line) is not None:
+                # This is a hack to replace all floating-point numbers in an array of a particular type
+                # (in this case CutsceneData) with their corresponding IEEE-754 hexadecimal representation
+                if cutscene_data_regexpr.search(line) is not None:
                    is_cutscene_data = True
                elif line.endswith("};"):
                    is_cutscene_data = False
                if is_cutscene_data:
-                    raw_line = re.sub(re.compile(r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?f"), repl_float_hex, raw_line)
+                    raw_line = re.sub(float_regexpr, repl_float_hex, raw_line)
                output_lines[-1] = raw_line

    if print_source:
@ -779,7 +832,8 @@ def fixup_objfile(objfile_name, functions, asm_prelude, assembler, output_enc):
        '.bss': [],
    }
    asm = []
-    late_rodata_dummy_bytes = []
+    all_late_rodata_dummy_bytes = []
+    all_jtbl_rodata_size = []
    late_rodata_asm = []
    late_rodata_source_name_start = None
    late_rodata_source_name_end = None
@ -800,7 +854,8 @@ def fixup_objfile(objfile_name, functions, asm_prelude, assembler, output_enc):
                break
            loc = loc[1]
            prev_loc = prev_locs[sectype]
-            assert loc >= prev_loc, sectype
+            if loc < prev_loc:
+                raise Failure("Wrongly computed size for section {} (diff {}). This is an asm-processor bug!".format(sectype, prev_loc- loc))
            if loc != prev_loc:
                asm.append('.section ' + sectype)
                if sectype == '.text':
@ -812,7 +867,8 @@ def fixup_objfile(objfile_name, functions, asm_prelude, assembler, output_enc):
            prev_locs[sectype] = loc + size
        if not ifdefed:
            all_text_glabels.update(function.text_glabels)
-            late_rodata_dummy_bytes.append(function.late_rodata_dummy_bytes)
+            all_late_rodata_dummy_bytes.append(function.late_rodata_dummy_bytes)
+            all_jtbl_rodata_size.append(function.jtbl_rodata_size)
            late_rodata_asm.append(function.late_rodata_asm_conts)
            for sectype, (temp_name, size) in function.data.items():
                if temp_name is not None:
@ -863,6 +919,7 @@ def fixup_objfile(objfile_name, functions, asm_prelude, assembler, output_enc):

        # Move over section contents
        modified_text_positions = set()
+        jtbl_rodata_positions = set()
        last_rodata_pos = 0
        for sectype in SECTIONS:
            if not to_copy[sectype]:
@ -894,18 +951,19 @@ def fixup_objfile(objfile_name, functions, asm_prelude, assembler, output_enc):
        # Move over late rodata. This is heuristic, sadly, since I can't think
        # of another way of doing it.
        moved_late_rodata = {}
-        if any(late_rodata_dummy_bytes):
+        if any(all_late_rodata_dummy_bytes) or any(all_jtbl_rodata_size):
            source = asm_objfile.find_section('.rodata')
            target = objfile.find_section('.rodata')
            source_pos = asm_objfile.symtab.find_symbol_in_section(late_rodata_source_name_start, source)
            source_end = asm_objfile.symtab.find_symbol_in_section(late_rodata_source_name_end, source)
-            if source_end - source_pos != sum(map(len, late_rodata_dummy_bytes)) * 4:
+            if source_end - source_pos != sum(map(len, all_late_rodata_dummy_bytes)) * 4 + sum(all_jtbl_rodata_size):
                raise Failure("computed wrong size of .late_rodata")
            new_data = list(target.data)
-            for dummy_bytes_list in late_rodata_dummy_bytes:
+            for dummy_bytes_list, jtbl_rodata_size in zip(all_late_rodata_dummy_bytes, all_jtbl_rodata_size):
                for index, dummy_bytes in enumerate(dummy_bytes_list):
                    pos = target.data.index(dummy_bytes, last_rodata_pos)
-                    if target.data.find(dummy_bytes, pos + 4) != -1:
+                    # This check is nice, but makes time complexity worse for large files:
+                    if SLOW_CHECKS and target.data.find(dummy_bytes, pos + 4) != -1:
                        raise Failure("multiple occurrences of late_rodata hex magic. Change asm-processor to use something better than 0xE0123456!")
                    if index == 0 and len(dummy_bytes_list) > 1 and target.data[pos+4:pos+8] == b'\0\0\0\0':
                        # Ugly hack to handle double alignment for non-matching builds.
@ -920,6 +978,16 @@ def fixup_objfile(objfile_name, functions, asm_prelude, assembler, output_enc):
                    moved_late_rodata[source_pos] = pos
                    last_rodata_pos = pos + 4
                    source_pos += 4
+                if jtbl_rodata_size > 0:
+                    assert dummy_bytes_list, "should always have dummy bytes before jtbl data"
+                    pos = last_rodata_pos
+                    new_data[pos : pos + jtbl_rodata_size] = \
+                        source.data[source_pos : source_pos + jtbl_rodata_size]
+                    for i in range(0, jtbl_rodata_size, 4):
+                        moved_late_rodata[source_pos + i] = pos + i
+                        jtbl_rodata_positions.add(pos + i)
+                    last_rodata_pos += jtbl_rodata_size
+                    source_pos += jtbl_rodata_size
            target.data = bytes(new_data)

        # Merge strtab data.
@ -981,7 +1049,8 @@ def fixup_objfile(objfile_name, functions, asm_prelude, assembler, output_enc):
                for reltab in target.relocated_by:
                    nrels = []
                    for rel in reltab.relocations:
-                        if sectype == '.text' and rel.r_offset in modified_text_positions:
+                        if (sectype == '.text' and rel.r_offset in modified_text_positions or
+                            sectype == '.rodata' and rel.r_offset in jtbl_rodata_positions):
                            # don't include relocations for late_rodata dummy code
                            continue
                        # hopefully we don't have relocations for local or