Diff .data size, .bss size, and .rodata contents in retail_progress.py (#1706)

* Use iconv to convert strings to EUC-JP for reassembly
* Compare .data size, .bss size, and .rodata contents in retail_progress.py
* Show data diffs in summary for humans
* Use multiprocessing to very significantly speed up retail_progress.py summary
* Remove intermediate function
* Make sigint less jank
* Hide the evidence

Co-authored-by: Dragorn421 <Dragorn421@users.noreply.github.com>

* add --not-ok to only print not-OK files in summary

---------

Co-authored-by: Dragorn421 <Dragorn421@users.noreply.github.com>
2024-09-20 20:14:55 +00:00 · 2024-02-09 05:30:32 -08:00 · 2024-02-09 05:30:32 -08:00 · d2a1abf8df
commit d2a1abf8df
parent f492c04186
2 changed files with 219 additions and 52 deletions
--- a/2
+++ b/2
@@ -500,7 +500,7 @@ $(EXPECTED_DIR)/.disasm: $(DISASM_DATA_FILES)
 	touch $@

 $(EXPECTED_DIR)/%.o: $(EXPECTED_DIR)/.disasm
-	$(AS) $(ASFLAGS) $(@:.o=.s) -o $@
+	iconv --from UTF-8 --to EUC-JP $(@:.o=.s) | $(AS) $(ASFLAGS) -o $@

 -include $(DEP_FILES)

--- a/retail_progress.py
+++ b/retail_progress.py
@@ -5,16 +5,34 @@

 import argparse
 import collections
+from colorama import Fore, Style
 from dataclasses import dataclass
 import difflib
-from enum import Enum
 import itertools
 import math
 from pathlib import Path
 import re
 import subprocess
 import sys
-from typing import Iterator, List, Optional, Tuple
+import multiprocessing
+import multiprocessing.pool
+from typing import Dict, Iterator, List, Optional, Tuple
+
+
+def green(s: str) -> str:
+    return f"{Fore.GREEN}{s}{Style.RESET_ALL}"
+
+
+def red(s: str) -> str:
+    return f"{Fore.RED}{s}{Style.RESET_ALL}"
+
+
+# Make interrupting with ^C less jank
+# https://stackoverflow.com/questions/72967793/keyboardinterrupt-with-python-multiprocessing-pool
+def set_sigint_ignored():
+    import signal
+
+    signal.signal(signal.SIGINT, signal.SIG_IGN)


@dataclass
@@ -68,30 +86,25 @@ def parse_inst(func_name: str, line: str) -> Inst:
    return Inst(func_name, mnemonic, regs, imm, None, None)


-def run_objdump(path: Path) -> List[Inst]:
+def run_objdump(path: Path, args: List[str]) -> str:
    if not path.exists():
        raise Exception(f"file {path} does not exist")

-    command = [
-        "mips-linux-gnu-objdump",
-        "-drz",
-        "-m",
-        "mips:4300",
-        "-j",
-        ".text",
-        str(path),
-    ]
+    command = ["mips-linux-gnu-objdump"] + args + [str(path)]
    try:
-        lines = subprocess.run(
+        return subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            encoding="utf-8",
-        ).stdout.splitlines()
+        ).stdout
    except subprocess.CalledProcessError as e:
-        return []
+        return ""

+
+def disassemble(path: Path) -> List[Inst]:
+    lines = run_objdump(path, ["-drz", "-m", "mips:4300", "-j", ".text"]).splitlines()
    result = []

    func_name = None
@@ -156,14 +169,47 @@ def has_diff(inst1: Inst, inst2: Inst) -> bool:
    return inst1 != inst2


+def get_section_sizes(path: Path) -> Dict[str, int]:
+    lines = run_objdump(path, ["-h"]).splitlines()
+    if len(lines) < 5:
+        return {}
+
+    result = {}
+    for i in range(5, len(lines), 2):
+        parts = lines[i].split()
+        name = parts[1]
+        size = int(parts[2], 16)
+        # Pad to 0x10-byte alignment
+        result[parts[1]] = (size + 0xF) & ~0xF
+    return result
+
+
+def get_section_hex_dump(path: Path, section: str) -> List[str]:
+    lines = run_objdump(path, ["-s", "-j", section]).splitlines()
+    return lines[4:]
+
+
+def parse_hex_dump(lines: List[str]) -> bytes:
+    result = bytearray()
+    for line in lines:
+        data = line[6:41].replace(" ", "")
+        result.extend(bytes.fromhex(data))
+
+    # pad to 0x10-byte alignment
+    while len(result) % 0x10:
+        result.append(0)
+
+    return result
+
+
 def find_functions_with_diffs(version: str, c_path: str):
    object_path = Path(c_path).with_suffix(".o")

    expected_dir = Path("expected/build") / version
    build_dir = Path("build") / version

-    insts1 = run_objdump(expected_dir / object_path)
-    insts2 = run_objdump(build_dir / object_path)
+    insts1 = disassemble(expected_dir / object_path)
+    insts2 = disassemble(build_dir / object_path)

    functions_with_diffs = collections.OrderedDict()
    for inst1, inst2 in pair_instructions(insts1, insts2):
@@ -184,49 +230,156 @@ def find_functions_with_diffs(version: str, c_path: str):
        print(f"  {func_name}")


-def print_summary(version: str, csv: bool):
+def find_data_diffs(version: str, c_path: str):
+    object_path = Path(c_path).with_suffix(".o")
+
    expected_dir = Path("expected/build") / version
    build_dir = Path("build") / version

-    if csv:
-        print("path,expected,actual,added,removed,changed,progress")
-    for object_file in sorted(expected_dir.glob("src/**/*.o")):
-        object_path = object_file.relative_to(expected_dir)
-        c_path = object_path.with_suffix(".c")
+    sizes1 = get_section_sizes(expected_dir / object_path)
+    sizes2 = get_section_sizes(build_dir / object_path)
+    rodata_dump1 = get_section_hex_dump(expected_dir / object_path, ".rodata")
+    rodata_dump2 = get_section_hex_dump(build_dir / object_path, ".rodata")
+    rodata1 = parse_hex_dump(rodata_dump1)
+    rodata2 = parse_hex_dump(rodata_dump2)

-        insts1 = run_objdump(expected_dir / object_path)
-        insts2 = run_objdump(build_dir / object_path)
+    rodata_matches = rodata1 == rodata2
+    data_size_matches = sizes1.get(".data", 0) == sizes2.get(".data", 0)
+    bss_size_matches = sizes1.get(".bss", 0) == sizes2.get(".bss", 0)

-        added = 0
-        removed = 0
-        changed = 0
-        for inst1, inst2 in pair_instructions(insts1, insts2):
-            if inst1 is None and inst2 is not None:
-                added += 1
-            elif inst1 is not None and inst2 is None:
-                removed += 1
-            elif inst1 is not None and inst2 is not None and has_diff(inst1, inst2):
-                changed += 1
+    if rodata_matches:
+        print(f"{c_path} .rodata OK")
+    else:
+        print(
+            f"{c_path} .rodata differs: expected size 0x{sizes1.get('.rodata', 0):04x} vs build size 0x{sizes2.get('.rodata', 0):04x}"
+        )
+        print(f"  expected:")
+        print("\n".join(rodata_dump1))
+        print(f"  build:")
+        print("\n".join(rodata_dump2))

-        if insts1:
-            progress = max(1.0 - (added + removed + changed) / len(insts1), 0)
-        else:
-            progress = 1.0
+    if data_size_matches:
+        print(f"{c_path} .data size OK")
+    else:
+        print(
+            f"{c_path} .data size differs: expected size 0x{sizes1.get('.data', 0):04x} vs build size 0x{sizes2.get('.data', 0):04x}"
+        )

-        if csv:
-            print(
-                f"{c_path},{len(insts1)},{len(insts2)},{added},{removed},{changed},{progress:.3f}"
+    if bss_size_matches:
+        print(f"{c_path} .bss size OK")
+    else:
+        print(
+            f"{c_path} .bss size differs: expected size 0x{sizes1.get('.bss', 0):04x} vs build size 0x{sizes2.get('.bss', 0):04x}"
+        )
+
+
+@dataclass
+class ObjectDataForComparison:
+    insts1: List[Inst]
+    insts2: List[Inst]
+    sizes1: Dict[str, int]
+    sizes2: Dict[str, int]
+    rodata1: bytes
+    rodata2: bytes
+
+
+def get_object_data_for_comparison(object1: Path, object2: Path):
+    insts1 = disassemble(object1)
+    insts2 = disassemble(object2)
+    sizes1 = get_section_sizes(object1)
+    sizes2 = get_section_sizes(object2)
+    rodata_dump1 = get_section_hex_dump(object1, ".rodata")
+    rodata_dump2 = get_section_hex_dump(object2, ".rodata")
+    rodata1 = parse_hex_dump(rodata_dump1)
+    rodata2 = parse_hex_dump(rodata_dump2)
+    return ObjectDataForComparison(insts1, insts2, sizes1, sizes2, rodata1, rodata2)
+
+
+def print_summary(version: str, csv: bool, only_not_ok: bool):
+    expected_dir = Path("expected/build") / version
+    build_dir = Path("build") / version
+
+    expected_object_files = sorted(expected_dir.glob("src/**/*.o"))
+
+    comparison_data_list: List[multiprocessing.pool.AsyncResult] = []
+
+    with multiprocessing.Pool(initializer=set_sigint_ignored) as p:
+        for expected_object in expected_object_files:
+            build_object = build_dir / expected_object.relative_to(expected_dir)
+            comparison_data_list.append(
+                p.apply_async(
+                    get_object_data_for_comparison,
+                    (expected_object, build_object),
+                )
            )
-        elif progress == 1.0:
-            print(f"   OK {c_path}")
-        else:
-            print(f"  {math.floor(progress * 100):>2}% {c_path}")
+        if csv:
+            print("path,expected,actual,.text,.rodata,.data size,.bss size")
+        for expected_object, data_async in zip(
+            expected_object_files, comparison_data_list
+        ):
+            c_path = expected_object.relative_to(expected_dir).with_suffix(".c")
+            data = data_async.get()
+
+            insts1 = data.insts1
+            insts2 = data.insts2
+
+            added = 0
+            removed = 0
+            changed = 0
+            for inst1, inst2 in pair_instructions(insts1, insts2):
+                if inst1 is None and inst2 is not None:
+                    added += 1
+                elif inst1 is not None and inst2 is None:
+                    removed += 1
+                elif inst1 is not None and inst2 is not None and has_diff(inst1, inst2):
+                    changed += 1
+
+            if insts1:
+                text_progress = max(1.0 - (added + removed + changed) / len(insts1), 0)
+            else:
+                text_progress = 1.0
+
+            sizes1 = data.sizes1
+            sizes2 = data.sizes2
+            rodata1 = data.rodata1
+            rodata2 = data.rodata2
+
+            rodata_matches = rodata1 == rodata2
+            data_size_matches = sizes1.get(".data", 0) == sizes2.get(".data", 0)
+            bss_size_matches = sizes1.get(".bss", 0) == sizes2.get(".bss", 0)
+
+            if only_not_ok:
+                if (
+                    text_progress == 1
+                    and rodata_matches
+                    and data_size_matches
+                    and bss_size_matches
+                ):
+                    continue
+
+            if csv:
+                print(
+                    f"{c_path},{len(insts1)},{len(insts2)},{text_progress:.3f},{rodata_matches},{data_size_matches},{bss_size_matches}"
+                )
+            else:
+                ok = green("OK")
+                diff = red("diff")
+                text_progress_str = (
+                    ok
+                    if text_progress == 1
+                    else red(f"{math.floor(text_progress * 100):>2}%")
+                )
+                rodata_str = ok if rodata_matches else diff
+                data_size_str = ok if data_size_matches else diff
+                bss_size_str = ok if bss_size_matches else diff
+                print(
+                    f"text:{text_progress_str:<13} rodata:{rodata_str:<13} data size:{data_size_str:<13} bss size:{bss_size_str:<13} {c_path}"
+                )
+            sys.stdout.flush()


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Calculate progress matching .text sections"
-    )
+    parser = argparse.ArgumentParser(description="Calculate progress matching retail")
    parser.add_argument(
        "file",
        metavar="FILE",
@@ -236,10 +389,24 @@ if __name__ == "__main__":
    parser.add_argument(
        "-v", "--version", help="version to compare", default="gc-eu-mq"
    )
+    parser.add_argument(
+        "--data",
+        help="diff .data size, .bss size, and .rodata contents instead of text",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--not-ok",
+        help="only print non-OK files",
+        action="store_true",
+        dest="only_not_ok",
+    )
    parser.add_argument("--csv", help="print summary CSV", action="store_true")
    args = parser.parse_args()

    if args.file is not None:
-        find_functions_with_diffs(args.version, args.file)
+        if args.data:
+            find_data_diffs(args.version, args.file)
+        else:
+            find_functions_with_diffs(args.version, args.file)
    else:
-        print_summary(args.version, args.csv)
+        print_summary(args.version, args.csv, args.only_not_ok)