From d2a1abf8dfba0fb00e0bfdd801d3d17371da8f31 Mon Sep 17 00:00:00 2001
From: cadmic <cadmic24@gmail.com>
Date: Fri, 9 Feb 2024 05:30:32 -0800
Subject: [PATCH] Diff .data size, .bss size, and .rodata contents in
 retail_progress.py (#1706)

* Use iconv to convert strings to EUC-JP for reassembly

* Compare .data size, .bss size, and .rodata contents in retail_progress.py

* Show data diffs in summary for humans

* Use multiprocessing to very significantly speed up retail_progress.py summary

* Remove intermediate function

* Make sigint less jank

* Hide the evidence

Co-authored-by: Dragorn421 <Dragorn421@users.noreply.github.com>

* add --not-ok to only print not-OK files in summary

---------

Co-authored-by: Dragorn421 <Dragorn421@users.noreply.github.com>
---
 Makefile           |   2 +-
 retail_progress.py | 269 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 219 insertions(+), 52 deletions(-)

diff --git a/Makefile b/Makefile
index 5245a09b69..5f9ae59d56 100644
--- a/Makefile
+++ b/Makefile
@@ -500,7 +500,7 @@ $(EXPECTED_DIR)/.disasm: $(DISASM_DATA_FILES)
 	touch $@
 
 $(EXPECTED_DIR)/%.o: $(EXPECTED_DIR)/.disasm
-	$(AS) $(ASFLAGS) $(@:.o=.s) -o $@
+	iconv -f UTF-8 -t EUC-JP $(@:.o=.s) | $(AS) $(ASFLAGS) -o $@
 
 -include $(DEP_FILES)
 
diff --git a/retail_progress.py b/retail_progress.py
index 53a137a2d6..21a98ccdbc 100755
--- a/retail_progress.py
+++ b/retail_progress.py
@@ -5,16 +5,34 @@
 
 import argparse
 import collections
+from colorama import Fore, Style
 from dataclasses import dataclass
 import difflib
-from enum import Enum
 import itertools
 import math
 from pathlib import Path
 import re
 import subprocess
 import sys
-from typing import Iterator, List, Optional, Tuple
+import multiprocessing
+import multiprocessing.pool
+from typing import Dict, Iterator, List, Optional, Tuple
+
+
+def green(s: str) -> str:
+    return f"{Fore.GREEN}{s}{Style.RESET_ALL}"
+
+
+def red(s: str) -> str:
+    return f"{Fore.RED}{s}{Style.RESET_ALL}"
+
+
+# Make interrupting with ^C less jank
+# https://stackoverflow.com/questions/72967793/keyboardinterrupt-with-python-multiprocessing-pool
+def set_sigint_ignored():
+    """Ignore SIGINT in this process; passed as the Pool worker initializer."""
+    import signal
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
 
 
 @dataclass
@@ -68,30 +86,25 @@ def parse_inst(func_name: str, line: str) -> Inst:
     return Inst(func_name, mnemonic, regs, imm, None, None)
 
 
-def run_objdump(path: Path) -> List[Inst]:
+def run_objdump(path: Path, args: List[str]) -> str:
     if not path.exists():
         raise Exception(f"file {path} does not exist")
 
-    command = [
-        "mips-linux-gnu-objdump",
-        "-drz",
-        "-m",
-        "mips:4300",
-        "-j",
-        ".text",
-        str(path),
-    ]
+    command = ["mips-linux-gnu-objdump"] + args + [str(path)]
     try:
-        lines = subprocess.run(
+        return subprocess.run(
             command,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             check=True,
             encoding="utf-8",
-        ).stdout.splitlines()
+        ).stdout
     except subprocess.CalledProcessError as e:
-        return []
+        return ""
 
+
+def disassemble(path: Path) -> List[Inst]:
+    lines = run_objdump(path, ["-drz", "-m", "mips:4300", "-j", ".text"]).splitlines()
     result = []
 
     func_name = None
@@ -156,14 +169,47 @@ def has_diff(inst1: Inst, inst2: Inst) -> bool:
     return inst1 != inst2
 
 
+def get_section_sizes(path: Path) -> Dict[str, int]:
+    """Map section name -> size (rounded up to 0x10) as listed by `objdump -h`."""
+    lines = run_objdump(path, ["-h"]).splitlines()
+    result = {}
+    # Section rows start at index 5; every other row after that is a flags line
+    for row in lines[5::2]:
+        parts = row.split()
+        if len(parts) < 3:
+            continue
+        name, size = parts[1], int(parts[2], 16)
+        # Pad to 0x10-byte alignment
+        result[name] = (size + 0xF) & ~0xF
+    return result
+
+
+def get_section_hex_dump(path: Path, section: str) -> List[str]:
+    # Skip the four leading lines (file banner and section heading)
+    return run_objdump(path, ["-s", "-j", section]).splitlines()[4:]
+
+
+def parse_hex_dump(lines: List[str]) -> bytes:
+    """Decode `objdump -s` rows into bytes, zero-padded to a 0x10 multiple."""
+    result = bytearray()
+    for line in lines:
+        # Split off the address column instead of slicing at a fixed offset:
+        # objdump widens it past 4 digits for offsets >= 0x10000.
+        hex_column = line.lstrip().partition(" ")[2][:35]
+        result.extend(bytes.fromhex(hex_column.replace(" ", "")))
+    # pad to 0x10-byte alignment
+    result.extend(bytes(-len(result) % 0x10))
+    return bytes(result)
+
+
 def find_functions_with_diffs(version: str, c_path: str):
     object_path = Path(c_path).with_suffix(".o")
 
     expected_dir = Path("expected/build") / version
     build_dir = Path("build") / version
 
-    insts1 = run_objdump(expected_dir / object_path)
-    insts2 = run_objdump(build_dir / object_path)
+    insts1 = disassemble(expected_dir / object_path)
+    insts2 = disassemble(build_dir / object_path)
 
     functions_with_diffs = collections.OrderedDict()
     for inst1, inst2 in pair_instructions(insts1, insts2):
@@ -184,49 +230,156 @@ def find_functions_with_diffs(version: str, c_path: str):
         print(f"  {func_name}")
 
 
-def print_summary(version: str, csv: bool):
+def find_data_diffs(version: str, c_path: str):
+    object_path = Path(c_path).with_suffix(".o")
+
     expected_dir = Path("expected/build") / version
     build_dir = Path("build") / version
 
-    if csv:
-        print("path,expected,actual,added,removed,changed,progress")
-    for object_file in sorted(expected_dir.glob("src/**/*.o")):
-        object_path = object_file.relative_to(expected_dir)
-        c_path = object_path.with_suffix(".c")
+    sizes1 = get_section_sizes(expected_dir / object_path)
+    sizes2 = get_section_sizes(build_dir / object_path)
+    rodata_dump1 = get_section_hex_dump(expected_dir / object_path, ".rodata")
+    rodata_dump2 = get_section_hex_dump(build_dir / object_path, ".rodata")
+    rodata1 = parse_hex_dump(rodata_dump1)
+    rodata2 = parse_hex_dump(rodata_dump2)
 
-        insts1 = run_objdump(expected_dir / object_path)
-        insts2 = run_objdump(build_dir / object_path)
+    rodata_matches = rodata1 == rodata2
+    data_size_matches = sizes1.get(".data", 0) == sizes2.get(".data", 0)
+    bss_size_matches = sizes1.get(".bss", 0) == sizes2.get(".bss", 0)
 
-        added = 0
-        removed = 0
-        changed = 0
-        for inst1, inst2 in pair_instructions(insts1, insts2):
-            if inst1 is None and inst2 is not None:
-                added += 1
-            elif inst1 is not None and inst2 is None:
-                removed += 1
-            elif inst1 is not None and inst2 is not None and has_diff(inst1, inst2):
-                changed += 1
+    if rodata_matches:
+        print(f"{c_path} .rodata OK")
+    else:
+        print(
+            f"{c_path} .rodata differs: expected size 0x{sizes1.get('.rodata', 0):04x} vs build size 0x{sizes2.get('.rodata', 0):04x}"
+        )
+        print(f"  expected:")
+        print("\n".join(rodata_dump1))
+        print(f"  build:")
+        print("\n".join(rodata_dump2))
 
-        if insts1:
-            progress = max(1.0 - (added + removed + changed) / len(insts1), 0)
-        else:
-            progress = 1.0
+    if data_size_matches:
+        print(f"{c_path} .data size OK")
+    else:
+        print(
+            f"{c_path} .data size differs: expected size 0x{sizes1.get('.data', 0):04x} vs build size 0x{sizes2.get('.data', 0):04x}"
+        )
 
-        if csv:
-            print(
-                f"{c_path},{len(insts1)},{len(insts2)},{added},{removed},{changed},{progress:.3f}"
+    if bss_size_matches:
+        print(f"{c_path} .bss size OK")
+    else:
+        print(
+            f"{c_path} .bss size differs: expected size 0x{sizes1.get('.bss', 0):04x} vs build size 0x{sizes2.get('.bss', 0):04x}"
+        )
+
+
+@dataclass
+class ObjectDataForComparison:  # per-file result handed back by pool workers
+    insts1: List[Inst]  # disassembled .text of the first object
+    insts2: List[Inst]  # disassembled .text of the second object
+    sizes1: Dict[str, int]  # first object: section name -> padded size
+    sizes2: Dict[str, int]  # second object: section name -> padded size
+    rodata1: bytes  # first object: .rodata contents, zero-padded
+    rodata2: bytes  # second object: .rodata contents, zero-padded
+
+
+def get_object_data_for_comparison(object1: Path, object2: Path):
+    insts1 = disassemble(object1)
+    insts2 = disassemble(object2)
+    sizes1 = get_section_sizes(object1)
+    sizes2 = get_section_sizes(object2)
+    rodata_dump1 = get_section_hex_dump(object1, ".rodata")
+    rodata_dump2 = get_section_hex_dump(object2, ".rodata")
+    rodata1 = parse_hex_dump(rodata_dump1)
+    rodata2 = parse_hex_dump(rodata_dump2)
+    return ObjectDataForComparison(insts1, insts2, sizes1, sizes2, rodata1, rodata2)
+
+
+def print_summary(version: str, csv: bool, only_not_ok: bool):
+    expected_dir = Path("expected/build") / version
+    build_dir = Path("build") / version
+
+    expected_object_files = sorted(expected_dir.glob("src/**/*.o"))
+
+    comparison_data_list: List[multiprocessing.pool.AsyncResult] = []
+
+    with multiprocessing.Pool(initializer=set_sigint_ignored) as p:
+        for expected_object in expected_object_files:
+            build_object = build_dir / expected_object.relative_to(expected_dir)
+            comparison_data_list.append(
+                p.apply_async(
+                    get_object_data_for_comparison,
+                    (expected_object, build_object),
+                )
             )
-        elif progress == 1.0:
-            print(f"   OK {c_path}")
-        else:
-            print(f"  {math.floor(progress * 100):>2}% {c_path}")
+        if csv:
+            print("path,expected,actual,.text,.rodata,.data size,.bss size")
+        for expected_object, data_async in zip(
+            expected_object_files, comparison_data_list
+        ):
+            c_path = expected_object.relative_to(expected_dir).with_suffix(".c")
+            data = data_async.get()
+
+            insts1 = data.insts1
+            insts2 = data.insts2
+
+            added = 0
+            removed = 0
+            changed = 0
+            for inst1, inst2 in pair_instructions(insts1, insts2):
+                if inst1 is None and inst2 is not None:
+                    added += 1
+                elif inst1 is not None and inst2 is None:
+                    removed += 1
+                elif inst1 is not None and inst2 is not None and has_diff(inst1, inst2):
+                    changed += 1
+
+            if insts1:
+                text_progress = max(1.0 - (added + removed + changed) / len(insts1), 0)
+            else:
+                text_progress = 1.0
+
+            sizes1 = data.sizes1
+            sizes2 = data.sizes2
+            rodata1 = data.rodata1
+            rodata2 = data.rodata2
+
+            rodata_matches = rodata1 == rodata2
+            data_size_matches = sizes1.get(".data", 0) == sizes2.get(".data", 0)
+            bss_size_matches = sizes1.get(".bss", 0) == sizes2.get(".bss", 0)
+
+            if only_not_ok:
+                if (
+                    text_progress == 1
+                    and rodata_matches
+                    and data_size_matches
+                    and bss_size_matches
+                ):
+                    continue
+
+            if csv:
+                print(
+                    f"{c_path},{len(insts1)},{len(insts2)},{text_progress:.3f},{rodata_matches},{data_size_matches},{bss_size_matches}"
+                )
+            else:
+                ok = green("OK")
+                diff = red("diff")
+                text_progress_str = (
+                    ok
+                    if text_progress == 1
+                    else red(f"{math.floor(text_progress * 100):>2}%")
+                )
+                rodata_str = ok if rodata_matches else diff
+                data_size_str = ok if data_size_matches else diff
+                bss_size_str = ok if bss_size_matches else diff
+                print(
+                    f"text:{text_progress_str:<13} rodata:{rodata_str:<13} data size:{data_size_str:<13} bss size:{bss_size_str:<13} {c_path}"
+                )
+            sys.stdout.flush()
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Calculate progress matching .text sections"
-    )
+    parser = argparse.ArgumentParser(description="Calculate progress matching retail")
     parser.add_argument(
         "file",
         metavar="FILE",
@@ -236,10 +389,24 @@ if __name__ == "__main__":
     parser.add_argument(
         "-v", "--version", help="version to compare", default="gc-eu-mq"
     )
+    parser.add_argument(
+        "--data",
+        help="diff .data size, .bss size, and .rodata contents instead of text",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--not-ok",
+        help="only print non-OK files",
+        action="store_true",
+        dest="only_not_ok",
+    )
     parser.add_argument("--csv", help="print summary CSV", action="store_true")
     args = parser.parse_args()
 
     if args.file is not None:
-        find_functions_with_diffs(args.version, args.file)
+        if args.data:
+            find_data_diffs(args.version, args.file)
+        else:
+            find_functions_with_diffs(args.version, args.file)
     else:
-        print_summary(args.version, args.csv)
+        print_summary(args.version, args.csv, args.only_not_ok)