Diff .data size, .bss size, and .rodata contents in retail_progress.py (#1706)

* Use iconv to convert strings to EUC-JP for reassembly
* Compare .data size, .bss size, and .rodata contents in retail_progress.py
* Show data diffs in summary for humans
* Use multiprocessing to very significantly speed up retail_progress.py summary
* Remove intermediate function
* Make sigint less jank
* Hide the evidence

Co-authored-by: Dragorn421 <Dragorn421@users.noreply.github.com>

* add --not-ok to only print not-OK files in summary

---------

Co-authored-by: Dragorn421 <Dragorn421@users.noreply.github.com>
2025-07-03 06:24:30 +00:00 · 2024-02-09 05:30:32 -08:00 · 2024-02-09 05:30:32 -08:00 · d2a1abf8df
commit d2a1abf8df
parent f492c04186
2 changed files with 219 additions and 52 deletions
--- a/2
+++ b/2
@ -500,7 +500,7 @@ $(EXPECTED_DIR)/.disasm: $(DISASM_DATA_FILES)
 	touch $@
 $(EXPECTED_DIR)/%.o: $(EXPECTED_DIR)/.disasm
-	$(AS) $(ASFLAGS) $(@:.o=.s) -o $@
+	iconv --from UTF-8 --to EUC-JP $(@:.o=.s) | $(AS) $(ASFLAGS) -o $@
 -include $(DEP_FILES)
--- a/retail_progress.py
+++ b/retail_progress.py
@ -5,16 +5,34 @@
 import argparse
 import collections
 from colorama import Fore, Style
 from dataclasses import dataclass
 import difflib
 from enum import Enum
 import itertools
 import math
 from pathlib import Path
 import re
 import subprocess
 import sys
-from typing import Iterator, List, Optional, Tuple
+import multiprocessing
 import multiprocessing.pool
 from typing import Dict, Iterator, List, Optional, Tuple
 def green(s: str) -> str:
    """Return ``s`` preceded by the green foreground code and followed by a style reset."""
    start, end = Fore.GREEN, Style.RESET_ALL
    return f"{start}{s}{end}"
 def red(s: str) -> str:
    """Return ``s`` preceded by the red foreground code and followed by a style reset."""
    start, end = Fore.RED, Style.RESET_ALL
    return f"{start}{s}{end}"
 # Make interrupting with ^C less jank
 # https://stackoverflow.com/questions/72967793/keyboardinterrupt-with-python-multiprocessing-pool
 def set_sigint_ignored():
    """Install ``SIG_IGN`` as the SIGINT handler of the calling process.

    Passed as the ``initializer`` of the multiprocessing pool created in
    print_summary, so it runs once in each pool worker.
    """
    from signal import SIG_IGN, SIGINT, signal as install_handler

    install_handler(SIGINT, SIG_IGN)
@dataclass
@ -68,30 +86,25 @@ def parse_inst(func_name: str, line: str) -> Inst:
    return Inst(func_name, mnemonic, regs, imm, None, None)
-def run_objdump(path: Path) -> List[Inst]:
+def run_objdump(path: Path, args: List[str]) -> str:
    if not path.exists():
        raise Exception(f"file {path} does not exist")
-    command = [
+    command = ["mips-linux-gnu-objdump"] + args + [str(path)]
        "mips-linux-gnu-objdump",
        "-drz",
        "-m",
        "mips:4300",
        "-j",
        ".text",
        str(path),
    ]
    try:
-        lines = subprocess.run(
+        return subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            encoding="utf-8",
-        ).stdout.splitlines()
+        ).stdout
    except subprocess.CalledProcessError as e:
-        return []
+        return ""
 def disassemble(path: Path) -> List[Inst]:
    lines = run_objdump(path, ["-drz", "-m", "mips:4300", "-j", ".text"]).splitlines()
    result = []
    func_name = None
@ -156,14 +169,47 @@ def has_diff(inst1: Inst, inst2: Inst) -> bool:
    return inst1 != inst2
 def get_section_sizes(path: Path) -> Dict[str, int]:
    """Return the section sizes reported by ``objdump -h`` for ``path``.

    Each size is rounded up to a multiple of 0x10 bytes. The result maps
    section name to padded size; it is empty when objdump produced fewer
    than six lines of output.
    """
    lines = run_objdump(path, ["-h"]).splitlines()
    result = {}
    # Section rows are read from line index 5 onwards, every second line
    # (presumably each section row is followed by a flags line -- confirm
    # against the objdump -h output format).
    for line in lines[5::2]:
        parts = line.split()
        if len(parts) < 3:
            # Not a section row (e.g. blank or truncated line): nothing to parse
            continue
        name = parts[1]
        size = int(parts[2], 16)
        # Pad to 0x10-byte alignment
        result[name] = (size + 0xF) & ~0xF
    return result
 def get_section_hex_dump(path: Path, section: str) -> List[str]:
    """Return the lines printed by ``objdump -s -j <section>`` for ``path``.

    The first four output lines are dropped (presumably the file and
    section header -- confirm against the objdump -s output format).
    """
    objdump_args = ["-s", "-j", section]
    output = run_objdump(path, objdump_args)
    header_line_count = 4
    return output.splitlines()[header_line_count:]
 def parse_hex_dump(lines: List[str]) -> bytes:
    """Convert hex-dump lines (as returned by get_section_hex_dump) to bytes.

    Each line is expected to look like
    `` <address> <up to four 8-digit hex groups>  <ascii>``. The address
    column may be any width; the hex area is the 35 characters that follow
    the address and its separating space. The decoded data is zero-padded
    to a multiple of 0x10 bytes.

    Raises ValueError if the hex area of a line is not valid hexadecimal.
    """
    result = bytearray()
    for line in lines:
        # Skip the address column instead of assuming it is 4 digits wide:
        # a wider address would shift the hex area out of a fixed slice.
        _, _, rest = line.lstrip().partition(" ")
        # 4 groups of 8 hex digits plus 3 separating spaces; the ASCII
        # column that follows must not be decoded.
        data = rest[:35].replace(" ", "")
        result.extend(bytes.fromhex(data))
    # pad to 0x10-byte alignment
    result.extend(bytes(-len(result) % 0x10))
    # Return an immutable bytes object, as the annotation promises
    return bytes(result)
 def find_functions_with_diffs(version: str, c_path: str):
    object_path = Path(c_path).with_suffix(".o")
    expected_dir = Path("expected/build") / version
    build_dir = Path("build") / version
-    insts1 = run_objdump(expected_dir / object_path)
+    insts1 = disassemble(expected_dir / object_path)
-    insts2 = run_objdump(build_dir / object_path)
+    insts2 = disassemble(build_dir / object_path)
    functions_with_diffs = collections.OrderedDict()
    for inst1, inst2 in pair_instructions(insts1, insts2):
@ -184,49 +230,156 @@ def find_functions_with_diffs(version: str, c_path: str):
        print(f"  {func_name}")
-def print_summary(version: str, csv: bool):
+def find_data_diffs(version: str, c_path: str):
    object_path = Path(c_path).with_suffix(".o")
    expected_dir = Path("expected/build") / version
    build_dir = Path("build") / version
-    if csv:
+    sizes1 = get_section_sizes(expected_dir / object_path)
-        print("path,expected,actual,added,removed,changed,progress")
+    sizes2 = get_section_sizes(build_dir / object_path)
-    for object_file in sorted(expected_dir.glob("src/**/*.o")):
+    rodata_dump1 = get_section_hex_dump(expected_dir / object_path, ".rodata")
-        object_path = object_file.relative_to(expected_dir)
+    rodata_dump2 = get_section_hex_dump(build_dir / object_path, ".rodata")
-        c_path = object_path.with_suffix(".c")
+    rodata1 = parse_hex_dump(rodata_dump1)
    rodata2 = parse_hex_dump(rodata_dump2)
-        insts1 = run_objdump(expected_dir / object_path)
+    rodata_matches = rodata1 == rodata2
-        insts2 = run_objdump(build_dir / object_path)
+    data_size_matches = sizes1.get(".data", 0) == sizes2.get(".data", 0)
    bss_size_matches = sizes1.get(".bss", 0) == sizes2.get(".bss", 0)
-        added = 0
+    if rodata_matches:
-        removed = 0
+        print(f"{c_path} .rodata OK")
-        changed = 0
+    else:
-        for inst1, inst2 in pair_instructions(insts1, insts2):
+        print(
-            if inst1 is None and inst2 is not None:
+            f"{c_path} .rodata differs: expected size 0x{sizes1.get('.rodata', 0):04x} vs build size 0x{sizes2.get('.rodata', 0):04x}"
-                added += 1
+        )
-            elif inst1 is not None and inst2 is None:
+        print(f"  expected:")
-                removed += 1
+        print("\n".join(rodata_dump1))
-            elif inst1 is not None and inst2 is not None and has_diff(inst1, inst2):
+        print(f"  build:")
-                changed += 1
+        print("\n".join(rodata_dump2))
-        if insts1:
+    if data_size_matches:
-            progress = max(1.0 - (added + removed + changed) / len(insts1), 0)
+        print(f"{c_path} .data size OK")
-        else:
+    else:
-            progress = 1.0
+        print(
            f"{c_path} .data size differs: expected size 0x{sizes1.get('.data', 0):04x} vs build size 0x{sizes2.get('.data', 0):04x}"
        )
-        if csv:
+    if bss_size_matches:
-            print(
+        print(f"{c_path} .bss size OK")
-                f"{c_path},{len(insts1)},{len(insts2)},{added},{removed},{changed},{progress:.3f}"
+    else:
        print(
            f"{c_path} .bss size differs: expected size 0x{sizes1.get('.bss', 0):04x} vs build size 0x{sizes2.get('.bss', 0):04x}"
        )
@dataclass
 class ObjectDataForComparison:
    """Data extracted from a pair of object files so the two can be compared.

    Built by get_object_data_for_comparison: fields ending in ``1`` come from
    its first object argument and fields ending in ``2`` from its second.
    """

    # Instructions returned by disassemble() for each object
    insts1: List[Inst]
    insts2: List[Inst]
    # Section name -> size mappings returned by get_section_sizes()
    sizes1: Dict[str, int]
    sizes2: Dict[str, int]
    # .rodata contents returned by parse_hex_dump()
    rodata1: bytes
    rodata2: bytes
 def get_object_data_for_comparison(object1: Path, object2: Path):
    """Collect everything needed to compare two object files.

    For each object this gathers the disassembled instructions, the section
    sizes and the parsed ``.rodata`` contents, and returns them bundled in an
    ObjectDataForComparison (``1`` fields for object1, ``2`` for object2).
    """
    objects = (object1, object2)
    insts1, insts2 = [disassemble(obj) for obj in objects]
    sizes1, sizes2 = [get_section_sizes(obj) for obj in objects]
    dump1, dump2 = [get_section_hex_dump(obj, ".rodata") for obj in objects]
    return ObjectDataForComparison(
        insts1=insts1,
        insts2=insts2,
        sizes1=sizes1,
        sizes2=sizes2,
        rodata1=parse_hex_dump(dump1),
        rodata2=parse_hex_dump(dump2),
    )
 def print_summary(version: str, csv: bool, only_not_ok: bool):
    expected_dir = Path("expected/build") / version
    build_dir = Path("build") / version
    expected_object_files = sorted(expected_dir.glob("src/**/*.o"))
    comparison_data_list: List[multiprocessing.pool.AsyncResult] = []
    with multiprocessing.Pool(initializer=set_sigint_ignored) as p:
        for expected_object in expected_object_files:
            build_object = build_dir / expected_object.relative_to(expected_dir)
            comparison_data_list.append(
                p.apply_async(
                    get_object_data_for_comparison,
                    (expected_object, build_object),
                )
            )
-        elif progress == 1.0:
+        if csv:
-            print(f"   OK {c_path}")
+            print("path,expected,actual,.text,.rodata,.data size,.bss size")
-        else:
+        for expected_object, data_async in zip(
-            print(f"  {math.floor(progress * 100):>2}% {c_path}")
+            expected_object_files, comparison_data_list
        ):
            c_path = expected_object.relative_to(expected_dir).with_suffix(".c")
            data = data_async.get()
            insts1 = data.insts1
            insts2 = data.insts2
            added = 0
            removed = 0
            changed = 0
            for inst1, inst2 in pair_instructions(insts1, insts2):
                if inst1 is None and inst2 is not None:
                    added += 1
                elif inst1 is not None and inst2 is None:
                    removed += 1
                elif inst1 is not None and inst2 is not None and has_diff(inst1, inst2):
                    changed += 1
            if insts1:
                text_progress = max(1.0 - (added + removed + changed) / len(insts1), 0)
            else:
                text_progress = 1.0
            sizes1 = data.sizes1
            sizes2 = data.sizes2
            rodata1 = data.rodata1
            rodata2 = data.rodata2
            rodata_matches = rodata1 == rodata2
            data_size_matches = sizes1.get(".data", 0) == sizes2.get(".data", 0)
            bss_size_matches = sizes1.get(".bss", 0) == sizes2.get(".bss", 0)
            if only_not_ok:
                if (
                    text_progress == 1
                    and rodata_matches
                    and data_size_matches
                    and bss_size_matches
                ):
                    continue
            if csv:
                print(
                    f"{c_path},{len(insts1)},{len(insts2)},{text_progress:.3f},{rodata_matches},{data_size_matches},{bss_size_matches}"
                )
            else:
                ok = green("OK")
                diff = red("diff")
                text_progress_str = (
                    ok
                    if text_progress == 1
                    else red(f"{math.floor(text_progress * 100):>2}%")
                )
                rodata_str = ok if rodata_matches else diff
                data_size_str = ok if data_size_matches else diff
                bss_size_str = ok if bss_size_matches else diff
                print(
                    f"text:{text_progress_str:<13} rodata:{rodata_str:<13} data size:{data_size_str:<13} bss size:{bss_size_str:<13} {c_path}"
                )
            sys.stdout.flush()
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = argparse.ArgumentParser(description="Calculate progress matching retail")
        description="Calculate progress matching .text sections"
    )
    parser.add_argument(
        "file",
        metavar="FILE",
@ -236,10 +389,24 @@ if __name__ == "__main__":
    parser.add_argument(
        "-v", "--version", help="version to compare", default="gc-eu-mq"
    )
    parser.add_argument(
        "--data",
        help="diff .data size, .bss size, and .rodata contents instead of text",
        action="store_true",
    )
    parser.add_argument(
        "--not-ok",
        help="only print non-OK files",
        action="store_true",
        dest="only_not_ok",
    )
    parser.add_argument("--csv", help="print summary CSV", action="store_true")
    args = parser.parse_args()
    if args.file is not None:
-        find_functions_with_diffs(args.version, args.file)
+        if args.data:
            find_data_diffs(args.version, args.file)
        else:
            find_functions_with_diffs(args.version, args.file)
    else:
-        print_summary(args.version, args.csv)
+        print_summary(args.version, args.csv, args.only_not_ok)