From d2a1abf8dfba0fb00e0bfdd801d3d17371da8f31 Mon Sep 17 00:00:00 2001 From: cadmic Date: Fri, 9 Feb 2024 05:30:32 -0800 Subject: [PATCH] Diff .data size, .bss size, and .rodata contents in retail_progress.py (#1706) * Use iconv to convert strings to EUC-JP for reassembly * Compare .data size, .bss size, and .rodata contents in retail_progress.py * Show data diffs in summary for humans * Use multiprocessing to very significantly speed up retail_progress.py summary * Remove intermediate function * Make sigint less jank * Hide the evidence Co-authored-by: Dragorn421 * add --not-ok to only print not-OK files in summary --------- Co-authored-by: Dragorn421 --- Makefile | 2 +- retail_progress.py | 269 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 219 insertions(+), 52 deletions(-) diff --git a/Makefile b/Makefile index 5245a09b69..5f9ae59d56 100644 --- a/Makefile +++ b/Makefile @@ -500,7 +500,7 @@ $(EXPECTED_DIR)/.disasm: $(DISASM_DATA_FILES) touch $@ $(EXPECTED_DIR)/%.o: $(EXPECTED_DIR)/.disasm - $(AS) $(ASFLAGS) $(@:.o=.s) -o $@ + iconv --from UTF-8 --to EUC-JP $(@:.o=.s) | $(AS) $(ASFLAGS) -o $@ -include $(DEP_FILES) diff --git a/retail_progress.py b/retail_progress.py index 53a137a2d6..21a98ccdbc 100755 --- a/retail_progress.py +++ b/retail_progress.py @@ -5,16 +5,34 @@ import argparse import collections +from colorama import Fore, Style from dataclasses import dataclass import difflib -from enum import Enum import itertools import math from pathlib import Path import re import subprocess import sys -from typing import Iterator, List, Optional, Tuple +import multiprocessing +import multiprocessing.pool +from typing import Dict, Iterator, List, Optional, Tuple + + +def green(s: str) -> str: + return f"{Fore.GREEN}{s}{Style.RESET_ALL}" + + +def red(s: str) -> str: + return f"{Fore.RED}{s}{Style.RESET_ALL}" + + +# Make interrupting with ^C less jank +# https://stackoverflow.com/questions/72967793/keyboardinterrupt-with-python-multiprocessing-pool +def set_sigint_ignored(): + import signal + + signal.signal(signal.SIGINT, signal.SIG_IGN) @dataclass @@ -68,30 +86,25 @@ def parse_inst(func_name: str, line: str) -> Inst: return Inst(func_name, mnemonic, regs, imm, None, None) -def run_objdump(path: Path) -> List[Inst]: +def run_objdump(path: Path, args: List[str]) -> str: if not path.exists(): raise Exception(f"file {path} does not exist") - command = [ - "mips-linux-gnu-objdump", - "-drz", - "-m", - "mips:4300", - "-j", - ".text", - str(path), - ] + command = ["mips-linux-gnu-objdump"] + args + [str(path)] try: - lines = subprocess.run( + return subprocess.run( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, encoding="utf-8", - ).stdout.splitlines() + ).stdout except subprocess.CalledProcessError as e: - return [] + return "" + +def disassemble(path: Path) -> List[Inst]: + lines = run_objdump(path, ["-drz", "-m", "mips:4300", "-j", ".text"]).splitlines() result = [] func_name = None @@ -156,14 +169,47 @@ def has_diff(inst1: Inst, inst2: Inst) -> bool: return inst1 != inst2 +def get_section_sizes(path: Path) -> Dict[str, int]: + lines = run_objdump(path, ["-h"]).splitlines() + if len(lines) < 5: + return {} + + result = {} + for i in range(5, len(lines), 2): + parts = lines[i].split() + name = parts[1] + size = int(parts[2], 16) + # Pad to 0x10-byte alignment + result[parts[1]] = (size + 0xF) & ~0xF + return result + + +def get_section_hex_dump(path: Path, section: str) -> List[str]: + lines = run_objdump(path, ["-s", "-j", section]).splitlines() + return lines[4:] + + +def parse_hex_dump(lines: List[str]) -> bytes: + result = bytearray() + for line in lines: + data = line[6:41].replace(" ", "") + result.extend(bytes.fromhex(data)) + + # pad to 0x10-byte alignment + while len(result) % 0x10: + result.append(0) + + return result + + def find_functions_with_diffs(version: str, c_path: str): object_path = Path(c_path).with_suffix(".o") expected_dir = Path("expected/build") / version build_dir = Path("build") / version - insts1 = run_objdump(expected_dir / object_path) - insts2 = run_objdump(build_dir / object_path) + insts1 = disassemble(expected_dir / object_path) + insts2 = disassemble(build_dir / object_path) functions_with_diffs = collections.OrderedDict() for inst1, inst2 in pair_instructions(insts1, insts2): @@ -184,49 +230,156 @@ def find_functions_with_diffs(version: str, c_path: str): print(f" {func_name}") -def print_summary(version: str, csv: bool): +def find_data_diffs(version: str, c_path: str): + object_path = Path(c_path).with_suffix(".o") + expected_dir = Path("expected/build") / version build_dir = Path("build") / version - if csv: - print("path,expected,actual,added,removed,changed,progress") - for object_file in sorted(expected_dir.glob("src/**/*.o")): - object_path = object_file.relative_to(expected_dir) - c_path = object_path.with_suffix(".c") + sizes1 = get_section_sizes(expected_dir / object_path) + sizes2 = get_section_sizes(build_dir / object_path) + rodata_dump1 = get_section_hex_dump(expected_dir / object_path, ".rodata") + rodata_dump2 = get_section_hex_dump(build_dir / object_path, ".rodata") + rodata1 = parse_hex_dump(rodata_dump1) + rodata2 = parse_hex_dump(rodata_dump2) - insts1 = run_objdump(expected_dir / object_path) - insts2 = run_objdump(build_dir / object_path) + rodata_matches = rodata1 == rodata2 + data_size_matches = sizes1.get(".data", 0) == sizes2.get(".data", 0) + bss_size_matches = sizes1.get(".bss", 0) == sizes2.get(".bss", 0) - added = 0 - removed = 0 - changed = 0 - for inst1, inst2 in pair_instructions(insts1, insts2): - if inst1 is None and inst2 is not None: - added += 1 - elif inst1 is not None and inst2 is None: - removed += 1 - elif inst1 is not None and inst2 is not None and has_diff(inst1, inst2): - changed += 1 + if rodata_matches: + print(f"{c_path} .rodata OK") + else: + print( + f"{c_path} .rodata differs: expected size 0x{sizes1.get('.rodata', 0):04x} vs build size 0x{sizes2.get('.rodata', 0):04x}" + ) + print(f" expected:") + print("\n".join(rodata_dump1)) + print(f" build:") + print("\n".join(rodata_dump2)) - if insts1: - progress = max(1.0 - (added + removed + changed) / len(insts1), 0) - else: - progress = 1.0 + if data_size_matches: + print(f"{c_path} .data size OK") + else: + print( + f"{c_path} .data size differs: expected size 0x{sizes1.get('.data', 0):04x} vs build size 0x{sizes2.get('.data', 0):04x}" + ) - if csv: - print( - f"{c_path},{len(insts1)},{len(insts2)},{added},{removed},{changed},{progress:.3f}" + if bss_size_matches: + print(f"{c_path} .bss size OK") + else: + print( + f"{c_path} .bss size differs: expected size 0x{sizes1.get('.bss', 0):04x} vs build size 0x{sizes2.get('.bss', 0):04x}" + ) + + +@dataclass +class ObjectDataForComparison: + insts1: List[Inst] + insts2: List[Inst] + sizes1: Dict[str, int] + sizes2: Dict[str, int] + rodata1: bytes + rodata2: bytes + + +def get_object_data_for_comparison(object1: Path, object2: Path): + insts1 = disassemble(object1) + insts2 = disassemble(object2) + sizes1 = get_section_sizes(object1) + sizes2 = get_section_sizes(object2) + rodata_dump1 = get_section_hex_dump(object1, ".rodata") + rodata_dump2 = get_section_hex_dump(object2, ".rodata") + rodata1 = parse_hex_dump(rodata_dump1) + rodata2 = parse_hex_dump(rodata_dump2) + return ObjectDataForComparison(insts1, insts2, sizes1, sizes2, rodata1, rodata2) + + +def print_summary(version: str, csv: bool, only_not_ok: bool): + expected_dir = Path("expected/build") / version + build_dir = Path("build") / version + + expected_object_files = sorted(expected_dir.glob("src/**/*.o")) + + comparison_data_list: List[multiprocessing.pool.AsyncResult] = [] + + with multiprocessing.Pool(initializer=set_sigint_ignored) as p: + for expected_object in expected_object_files: + build_object = build_dir / expected_object.relative_to(expected_dir) + comparison_data_list.append( + p.apply_async( + get_object_data_for_comparison, + (expected_object, build_object), + ) ) - elif progress == 1.0: - print(f" OK {c_path}") - else: - print(f" {math.floor(progress * 100):>2}% {c_path}") + if csv: + print("path,expected,actual,.text,.rodata,.data size,.bss size") + for expected_object, data_async in zip( + expected_object_files, comparison_data_list + ): + c_path = expected_object.relative_to(expected_dir).with_suffix(".c") + data = data_async.get() + + insts1 = data.insts1 + insts2 = data.insts2 + + added = 0 + removed = 0 + changed = 0 + for inst1, inst2 in pair_instructions(insts1, insts2): + if inst1 is None and inst2 is not None: + added += 1 + elif inst1 is not None and inst2 is None: + removed += 1 + elif inst1 is not None and inst2 is not None and has_diff(inst1, inst2): + changed += 1 + + if insts1: + text_progress = max(1.0 - (added + removed + changed) / len(insts1), 0) + else: + text_progress = 1.0 + + sizes1 = data.sizes1 + sizes2 = data.sizes2 + rodata1 = data.rodata1 + rodata2 = data.rodata2 + + rodata_matches = rodata1 == rodata2 + data_size_matches = sizes1.get(".data", 0) == sizes2.get(".data", 0) + bss_size_matches = sizes1.get(".bss", 0) == sizes2.get(".bss", 0) + + if only_not_ok: + if ( + text_progress == 1 + and rodata_matches + and data_size_matches + and bss_size_matches + ): + continue + + if csv: + print( + f"{c_path},{len(insts1)},{len(insts2)},{text_progress:.3f},{rodata_matches},{data_size_matches},{bss_size_matches}" + ) + else: + ok = green("OK") + diff = red("diff") + text_progress_str = ( + ok + if text_progress == 1 + else red(f"{math.floor(text_progress * 100):>2}%") + ) + rodata_str = ok if rodata_matches else diff + data_size_str = ok if data_size_matches else diff + bss_size_str = ok if bss_size_matches else diff + print( + f"text:{text_progress_str:<13} rodata:{rodata_str:<13} data size:{data_size_str:<13} bss size:{bss_size_str:<13} {c_path}" + ) + sys.stdout.flush() if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Calculate progress matching .text sections" - ) + parser = argparse.ArgumentParser(description="Calculate progress matching retail") parser.add_argument( "file", metavar="FILE", @@ -236,10 +389,24 @@ if __name__ == "__main__": parser.add_argument( "-v", "--version", help="version to compare", default="gc-eu-mq" ) + parser.add_argument( + "--data", + help="diff .data size, .bss size, and .rodata contents instead of text", + action="store_true", + ) + parser.add_argument( + "--not-ok", + help="only print non-OK files", + action="store_true", + dest="only_not_ok", + ) parser.add_argument("--csv", help="print summary CSV", action="store_true") args = parser.parse_args() if args.file is not None: - find_functions_with_diffs(args.version, args.file) + if args.data: + find_data_diffs(args.version, args.file) + else: + find_functions_with_diffs(args.version, args.file) else: - print_summary(args.version, args.csv) + print_summary(args.version, args.csv, args.only_not_ok)