From 1890e751b99c2915360178c46bcfbc2cd1695a82 Mon Sep 17 00:00:00 2001 From: Tharo <17233964+Thar0@users.noreply.github.com> Date: Sun, 15 Aug 2021 23:52:10 +0100 Subject: [PATCH] Update asm-processor and diff.py (#903) * Update asm-processor * Update diff.py --- diff.py | 2031 ++++++++++++++++---------- diff_settings.py | 2 +- tools/asm_processor/asm_processor.py | 96 +- 3 files changed, 1319 insertions(+), 810 deletions(-) diff --git a/diff.py b/diff.py index 8f5daeada2..fbc06cb2f6 100755 --- a/diff.py +++ b/diff.py @@ -4,17 +4,18 @@ import argparse import sys from typing import ( Any, + Callable, Dict, + Iterator, List, Match, - NamedTuple, NoReturn, Optional, + Pattern, Set, Tuple, + Type, Union, - Callable, - Pattern, ) @@ -23,239 +24,269 @@ def fail(msg: str) -> NoReturn: sys.exit(1) -# Prefer to use diff_settings.py from the current working directory -sys.path.insert(0, ".") -try: - import diff_settings -except ModuleNotFoundError: - fail("Unable to find diff_settings.py in the same directory.") -sys.path.pop(0) +def static_assert_unreachable(x: NoReturn) -> NoReturn: + raise Exception("Unreachable! " + repr(x)) + # ==== COMMAND-LINE ==== -try: - import argcomplete # type: ignore -except ModuleNotFoundError: - argcomplete = None +if __name__ == "__main__": + # Prefer to use diff_settings.py from the current working directory + sys.path.insert(0, ".") + try: + import diff_settings + except ModuleNotFoundError: + fail("Unable to find diff_settings.py in the same directory.") + sys.path.pop(0) -parser = argparse.ArgumentParser(description="Diff MIPS or AArch64 assembly.") + try: + import argcomplete # type: ignore + except ModuleNotFoundError: + argcomplete = None -start_argument = parser.add_argument( - "start", - help="Function name or address to start diffing from.", -) + parser = argparse.ArgumentParser(description="Diff MIPS, PPC or AArch64 assembly.") -if argcomplete: + start_argument = parser.add_argument( + "start", + help="Function name or address to start diffing from.", + ) - def complete_symbol( - prefix: str, parsed_args: argparse.Namespace, **kwargs: object - ) -> List[str]: - if not prefix or prefix.startswith("-"): - # skip reading the map file, which would - # result in a lot of useless completions - return [] - config: Dict[str, Any] = {} - diff_settings.apply(config, parsed_args) # type: ignore - mapfile = config.get("mapfile") - if not mapfile: - return [] - completes = [] - with open(mapfile) as f: - data = f.read() - # assume symbols are prefixed by a space character - search = f" {prefix}" - pos = data.find(search) - while pos != -1: - # skip the space character in the search string - pos += 1 - # assume symbols are suffixed by either a space - # character or a (unix-style) line return - spacePos = data.find(" ", pos) - lineReturnPos = data.find("\n", pos) - if lineReturnPos == -1: - endPos = spacePos - elif spacePos == -1: - endPos = lineReturnPos - else: - endPos = min(spacePos, lineReturnPos) - if endPos == -1: - match = data[pos:] - pos = -1 - else: - match = data[pos:endPos] - pos = data.find(search, endPos) - completes.append(match) - return completes + if argcomplete: - setattr(start_argument, "completer", complete_symbol) + def complete_symbol( + prefix: str, parsed_args: argparse.Namespace, **kwargs: object + ) -> List[str]: + if not prefix or prefix.startswith("-"): + # skip reading the map file, which would + # result in a lot of useless completions + return [] + config: Dict[str, Any] = {} + diff_settings.apply(config, parsed_args) # type: ignore + mapfile = config.get("mapfile") + if not mapfile: + return [] + completes = [] + with open(mapfile) as f: + data = f.read() + # assume symbols are prefixed by a space character + search = f" {prefix}" + pos = data.find(search) + while pos != -1: + # skip the space character in the search string + pos += 1 + # assume symbols are suffixed by either a space + # character or a (unix-style) line return + spacePos = data.find(" ", pos) + lineReturnPos = data.find("\n", pos) + if lineReturnPos == -1: + endPos = spacePos + elif spacePos == -1: + endPos = lineReturnPos + else: + endPos = min(spacePos, lineReturnPos) + if endPos == -1: + match = data[pos:] + pos = -1 + else: + match = data[pos:endPos] + pos = data.find(search, endPos) + completes.append(match) + return completes -parser.add_argument( - "end", - nargs="?", - help="Address to end diff at.", -) -parser.add_argument( - "-o", - dest="diff_obj", - action="store_true", - help="Diff .o files rather than a whole binary. This makes it possible to " - "see symbol names. (Recommended)", -) -parser.add_argument( - "-e", - "--elf", - dest="diff_elf_symbol", - metavar="SYMBOL", - help="Diff a given function in two ELFs, one being stripped and the other " - "one non-stripped. Requires objdump from binutils 2.33+.", -) -parser.add_argument( - "--source", - action="store_true", - help="Show source code (if possible). Only works with -o and -e.", -) -parser.add_argument( - "--inlines", - action="store_true", - help="Show inline function calls (if possible). Only works with -o and -e.", -) -parser.add_argument( - "--base-asm", - dest="base_asm", - metavar="FILE", - help="Read assembly from given file instead of configured base img.", -) -parser.add_argument( - "--write-asm", - dest="write_asm", - metavar="FILE", - help="Write the current assembly output to file, e.g. for use with --base-asm.", -) -parser.add_argument( - "-m", - "--make", - dest="make", - action="store_true", - help="Automatically run 'make' on the .o file or binary before diffing.", -) -parser.add_argument( - "-l", - "--skip-lines", - dest="skip_lines", - type=int, - default=0, - metavar="LINES", - help="Skip the first N lines of output.", -) -parser.add_argument( - "-s", - "--stop-jr-ra", - dest="stop_jrra", - action="store_true", - help="Stop disassembling at the first 'jr ra'. Some functions have multiple return points, so use with care!", -) -parser.add_argument( - "-i", - "--ignore-large-imms", - dest="ignore_large_imms", - action="store_true", - help="Pretend all large enough immediates are the same.", -) -parser.add_argument( - "-I", - "--ignore-addr-diffs", - action="store_true", - help="Ignore address differences. Currently only affects AArch64.", -) -parser.add_argument( - "-B", - "--no-show-branches", - dest="show_branches", - action="store_false", - help="Don't visualize branches/branch targets.", -) -parser.add_argument( - "-S", - "--base-shift", - dest="base_shift", - type=str, - default="0", - help="Diff position X in our img against position X + shift in the base img. " - 'Arithmetic is allowed, so e.g. |-S "0x1234 - 0x4321"| is a reasonable ' - "flag to pass if it is known that position 0x1234 in the base img syncs " - "up with position 0x4321 in our img. Not supported together with -o.", -) -parser.add_argument( - "-w", - "--watch", - dest="watch", - action="store_true", - help="Automatically update when source/object files change. " - "Recommended in combination with -m.", -) -parser.add_argument( - "-3", - "--threeway=prev", - dest="threeway", - action="store_const", - const="prev", - help="Show a three-way diff between target asm, current asm, and asm " - "prior to -w rebuild. Requires -w.", -) -parser.add_argument( - "-b", - "--threeway=base", - dest="threeway", - action="store_const", - const="base", - help="Show a three-way diff between target asm, current asm, and asm " - "when diff.py was started. Requires -w.", -) -parser.add_argument( - "--width", - dest="column_width", - type=int, - default=50, - help="Sets the width of the left and right view column.", -) -parser.add_argument( - "--algorithm", - dest="algorithm", - default="levenshtein", - choices=["levenshtein", "difflib"], - help="Diff algorithm to use. Levenshtein gives the minimum diff, while difflib " - "aims for long sections of equal opcodes. Defaults to %(default)s.", -) -parser.add_argument( - "--max-size", - "--max-lines", - dest="max_lines", - type=int, - default=1024, - help="The maximum length of the diff, in lines.", -) + setattr(start_argument, "completer", complete_symbol) -# Project-specific flags, e.g. different versions/make arguments. -add_custom_arguments_fn = getattr(diff_settings, "add_custom_arguments", None) -if add_custom_arguments_fn: - add_custom_arguments_fn(parser) + parser.add_argument( + "end", + nargs="?", + help="Address to end diff at.", + ) + parser.add_argument( + "-o", + dest="diff_obj", + action="store_true", + help="Diff .o files rather than a whole binary. This makes it possible to " + "see symbol names. (Recommended)", + ) + parser.add_argument( + "-e", + "--elf", + dest="diff_elf_symbol", + metavar="SYMBOL", + help="Diff a given function in two ELFs, one being stripped and the other " + "one non-stripped. Requires objdump from binutils 2.33+.", + ) + parser.add_argument( + "--source", + "-c", + action="store_true", + help="Show source code (if possible). Only works with -o or -e.", + ) + parser.add_argument( + "--source-old-binutils", + "-C", + action="store_true", + help="Tweak --source handling to make it work with binutils < 2.33. Implies --source.", + ) + parser.add_argument( + "--inlines", + action="store_true", + help="Show inline function calls (if possible). Only works with -o or -e.", + ) + parser.add_argument( + "--base-asm", + dest="base_asm", + metavar="FILE", + help="Read assembly from given file instead of configured base img.", + ) + parser.add_argument( + "--write-asm", + dest="write_asm", + metavar="FILE", + help="Write the current assembly output to file, e.g. for use with --base-asm.", + ) + parser.add_argument( + "-m", + "--make", + dest="make", + action="store_true", + help="Automatically run 'make' on the .o file or binary before diffing.", + ) + parser.add_argument( + "-l", + "--skip-lines", + dest="skip_lines", + type=int, + default=0, + metavar="LINES", + help="Skip the first N lines of output.", + ) + parser.add_argument( + "-s", + "--stop-jr-ra", + dest="stop_jrra", + action="store_true", + help="Stop disassembling at the first 'jr ra'. Some functions have multiple return points, so use with care!", + ) + parser.add_argument( + "-i", + "--ignore-large-imms", + dest="ignore_large_imms", + action="store_true", + help="Pretend all large enough immediates are the same.", + ) + parser.add_argument( + "-I", + "--ignore-addr-diffs", + dest="ignore_addr_diffs", + action="store_true", + help="Ignore address differences. Currently only affects AArch64.", + ) + parser.add_argument( + "-B", + "--no-show-branches", + dest="show_branches", + action="store_false", + help="Don't visualize branches/branch targets.", + ) + parser.add_argument( + "-S", + "--base-shift", + dest="base_shift", + type=str, + default="0", + help="Diff position X in our img against position X + shift in the base img. " + 'Arithmetic is allowed, so e.g. |-S "0x1234 - 0x4321"| is a reasonable ' + "flag to pass if it is known that position 0x1234 in the base img syncs " + "up with position 0x4321 in our img. Not supported together with -o.", + ) + parser.add_argument( + "-w", + "--watch", + dest="watch", + action="store_true", + help="Automatically update when source/object files change. " + "Recommended in combination with -m.", + ) + parser.add_argument( + "-3", + "--threeway=prev", + dest="threeway", + action="store_const", + const="prev", + help="Show a three-way diff between target asm, current asm, and asm " + "prior to -w rebuild. Requires -w.", + ) + parser.add_argument( + "-b", + "--threeway=base", + dest="threeway", + action="store_const", + const="base", + help="Show a three-way diff between target asm, current asm, and asm " + "when diff.py was started. Requires -w.", + ) + parser.add_argument( + "--width", + dest="column_width", + type=int, + default=50, + help="Sets the width of the left and right view column.", + ) + parser.add_argument( + "--algorithm", + dest="algorithm", + default="levenshtein", + choices=["levenshtein", "difflib"], + help="Diff algorithm to use. Levenshtein gives the minimum diff, while difflib " + "aims for long sections of equal opcodes. Defaults to %(default)s.", + ) + parser.add_argument( + "--max-size", + "--max-lines", + dest="max_lines", + type=int, + default=1024, + help="The maximum length of the diff, in lines.", + ) + parser.add_argument( + "--no-pager", + dest="no_pager", + action="store_true", + help="Disable the pager; write output directly to stdout, then exit. " + "Incompatible with --watch.", + ) + parser.add_argument( + "--format", + choices=("color", "plain", "html"), + default="color", + help="Output format, default is color. --format=html implies --no-pager.", + ) -if argcomplete: - argcomplete.autocomplete(parser) + # Project-specific flags, e.g. different versions/make arguments. + add_custom_arguments_fn = getattr(diff_settings, "add_custom_arguments", None) + if add_custom_arguments_fn: + add_custom_arguments_fn(parser) + + if argcomplete: + argcomplete.autocomplete(parser) # ==== IMPORTS ==== # (We do imports late to optimize auto-complete performance.) -import re -import os +import abc import ast -import subprocess +from dataclasses import dataclass, field, replace import difflib -import string +import enum +import html import itertools -import threading +import os import queue +import re +import string +import subprocess +import threading import time @@ -265,7 +296,7 @@ MISSING_PREREQUISITES = ( ) try: - from colorama import Fore, Style, Back # type: ignore + from colorama import Fore, Style # type: ignore import ansiwrap # type: ignore import watchdog # type: ignore except ModuleNotFoundError as e: @@ -273,63 +304,105 @@ except ModuleNotFoundError as e: # ==== CONFIG ==== -args = parser.parse_args() -# Set imgs, map file and make flags in a project-specific manner. -config: Dict[str, Any] = {} -diff_settings.apply(config, args) # type: ignore +@dataclass +class ProjectSettings: + arch_str: str + objdump_executable: str + build_command: List[str] + map_format: str + mw_build_dir: str + baseimg: Optional[str] + myimg: Optional[str] + mapfile: Optional[str] + source_directories: Optional[List[str]] + source_extensions: List[str] -arch: str = config.get("arch", "mips") -baseimg: Optional[str] = config.get("baseimg") -myimg: Optional[str] = config.get("myimg") -mapfile: Optional[str] = config.get("mapfile") -makeflags: List[str] = config.get("makeflags", []) -source_directories: Optional[List[str]] = config.get("source_directories") -objdump_executable: Optional[str] = config.get("objdump_executable") -map_format: str = config.get("map_format", "gnu") -mw_build_dir: str = config.get("mw_build_dir", "build/") -MAX_FUNCTION_SIZE_LINES: int = args.max_lines -MAX_FUNCTION_SIZE_BYTES: int = MAX_FUNCTION_SIZE_LINES * 4 +@dataclass +class Config: + arch: "ArchSettings" -COLOR_ROTATION: List[str] = [ - Fore.MAGENTA, - Fore.CYAN, - Fore.GREEN, - Fore.RED, - Fore.LIGHTYELLOW_EX, - Fore.LIGHTMAGENTA_EX, - Fore.LIGHTCYAN_EX, - Fore.LIGHTGREEN_EX, - Fore.LIGHTBLACK_EX, -] + # Build/objdump options + diff_obj: bool + make: bool + source: bool + source_old_binutils: bool + inlines: bool + max_function_size_lines: int + max_function_size_bytes: int -BUFFER_CMD: List[str] = ["tail", "-c", str(10 ** 9)] -LESS_CMD: List[str] = ["less", "-SRic", "-#6"] + # Display options + formatter: "Formatter" + threeway: Optional[str] + base_shift: int + skip_lines: int + show_branches: bool + stop_jrra: bool + ignore_large_imms: bool + ignore_addr_diffs: bool + algorithm: str -DEBOUNCE_DELAY: float = 0.1 -FS_WATCH_EXTENSIONS: List[str] = [".c", ".h"] -# ==== LOGIC ==== +def create_project_settings(settings: Dict[str, Any]) -> ProjectSettings: + return ProjectSettings( + arch_str=settings.get("arch", "mips"), + baseimg=settings.get("baseimg"), + myimg=settings.get("myimg"), + mapfile=settings.get("mapfile"), + build_command=settings.get( + "make_command", ["make", *settings.get("makeflags", [])] + ), + source_directories=settings.get("source_directories"), + source_extensions=settings.get( + "source_extensions", [".c", ".h", ".cpp", ".hpp", ".s"] + ), + objdump_executable=get_objdump_executable(settings.get("objdump_executable")), + map_format=settings.get("map_format", "gnu"), + mw_build_dir=settings.get("mw_build_dir", "build/"), + ) -ObjdumpCommand = Tuple[List[str], str, Optional[str]] -if args.algorithm == "levenshtein": - try: - import Levenshtein # type: ignore - except ModuleNotFoundError as e: - fail(MISSING_PREREQUISITES.format(e.name)) +def create_config(args: argparse.Namespace, project: ProjectSettings) -> Config: + formatter: Formatter + if args.format == "plain": + formatter = PlainFormatter(column_width=args.column_width) + elif args.format == "color": + formatter = AnsiFormatter(column_width=args.column_width) + elif args.format == "html": + formatter = HtmlFormatter() + else: + raise ValueError(f"Unsupported --format: {args.format}") -if args.source: - try: - import cxxfilt # type: ignore - except ModuleNotFoundError as e: - fail(MISSING_PREREQUISITES.format(e.name)) + return Config( + arch=get_arch(project.arch_str), + # Build/objdump options + diff_obj=args.diff_obj, + make=args.make, + source=args.source or args.source_old_binutils, + source_old_binutils=args.source_old_binutils, + inlines=args.inlines, + max_function_size_lines=args.max_lines, + max_function_size_bytes=args.max_lines * 4, + # Display options + formatter=formatter, + threeway=args.threeway, + base_shift=eval_int( + args.base_shift, "Failed to parse --base-shift (-S) argument as an integer." + ), + skip_lines=args.skip_lines, + show_branches=args.show_branches, + stop_jrra=args.stop_jrra, + ignore_large_imms=args.ignore_large_imms, + ignore_addr_diffs=args.ignore_addr_diffs, + algorithm=args.algorithm, + ) -if args.threeway and not args.watch: - fail("Threeway diffing requires -w.") -if objdump_executable is None: +def get_objdump_executable(objdump_executable: Optional[str]) -> str: + if objdump_executable is not None: + return objdump_executable + for objdump_cand in ["mips-linux-gnu-objdump", "mips64-elf-objdump"]: try: subprocess.check_call( @@ -337,19 +410,311 @@ if objdump_executable is None: stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) - objdump_executable = objdump_cand - break + return objdump_cand except subprocess.CalledProcessError: pass except FileNotFoundError: pass -if not objdump_executable: - fail( + return fail( "Missing binutils; please ensure mips-linux-gnu-objdump or mips64-elf-objdump exist, or configure objdump_executable." ) +def get_arch(arch_str: str) -> "ArchSettings": + if arch_str == "mips": + return MIPS_SETTINGS + if arch_str == "aarch64": + return AARCH64_SETTINGS + if arch_str == "ppc": + return PPC_SETTINGS + return fail(f"Unknown architecture: {arch_str}") + + +BUFFER_CMD: List[str] = ["tail", "-c", str(10 ** 9)] + +# -S truncates long lines instead of wrapping them +# -R interprets color escape sequences +# -i ignores case when searching +# -c something about how the screen gets redrawn; I don't remember the purpose +# -#6 makes left/right arrow keys scroll by 6 characters +LESS_CMD: List[str] = ["less", "-SRic", "-#6"] + +DEBOUNCE_DELAY: float = 0.1 + +# ==== FORMATTING ==== + + +@enum.unique +class BasicFormat(enum.Enum): + NONE = enum.auto() + IMMEDIATE = enum.auto() + STACK = enum.auto() + REGISTER = enum.auto() + DELAY_SLOT = enum.auto() + DIFF_CHANGE = enum.auto() + DIFF_ADD = enum.auto() + DIFF_REMOVE = enum.auto() + SOURCE_FILENAME = enum.auto() + SOURCE_FUNCTION = enum.auto() + SOURCE_OTHER = enum.auto() + + +@dataclass(frozen=True) +class RotationFormat: + group: str + index: int + key: str + + +Format = Union[BasicFormat, RotationFormat] +FormatFunction = Callable[[str], Format] + + +class Text: + segments: List[Tuple[str, Format]] + + def __init__( + self, line: Optional[str] = None, f: Format = BasicFormat.NONE + ) -> None: + self.segments = [] + if line is not None: + self.segments.append((line, f)) + elif f is not BasicFormat.NONE: + raise ValueError("Text constructor provided `f`, but no line to format") + + def reformat(self, f: Format) -> "Text": + return Text(self.plain(), f) + + def plain(self) -> str: + return "".join(s for s, f in self.segments) + + def __repr__(self) -> str: + return f"" + + def __str__(self) -> str: + # Use Formatter.apply(...) instead + return NotImplemented + + def __eq__(self, other: object) -> bool: + return NotImplemented + + def __add__(self, other: Union["Text", str]) -> "Text": + if isinstance(other, str): + other = Text(other) + result = Text() + result.segments = self.segments + other.segments + return result + + def __radd__(self, other: Union["Text", str]) -> "Text": + if isinstance(other, str): + other = Text(other) + result = Text() + result.segments = other.segments + self.segments + return result + + def finditer(self, pat: Pattern[str]) -> Iterator[Match[str]]: + """Replacement for `pat.finditer(text)` that operates on the inner text, + and returns the exact same matches as `Text.sub(pat, ...)`.""" + for chunk, f in self.segments: + for match in pat.finditer(chunk): + yield match + + def sub(self, pat: Pattern[str], sub_fn: Callable[[Match[str]], "Text"]) -> "Text": + result = Text() + for chunk, f in self.segments: + i = 0 + for match in pat.finditer(chunk): + start, end = match.start(), match.end() + assert i <= start <= end <= len(chunk) + sub = sub_fn(match) + result.segments.append((chunk[i:start], f)) + result.segments.extend(sub.segments) + i = end + result.segments.append((chunk[i:], f)) + return result + + +class Formatter(abc.ABC): + @abc.abstractmethod + def apply_format(self, chunk: str, f: Format) -> str: + """Apply the formatting `f` to `chunk` and escape the contents.""" + ... + + @abc.abstractmethod + def table( + self, header: Optional[Tuple[str, ...]], lines: List[Tuple[str, ...]] + ) -> str: + """Format a multi-column table with an optional `header`""" + ... + + def apply(self, text: Text) -> str: + return "".join(self.apply_format(chunk, f) for chunk, f in text.segments) + + +@dataclass +class PlainFormatter(Formatter): + column_width: int + + def apply_format(self, chunk: str, f: Format) -> str: + return chunk + + def table( + self, header: Optional[Tuple[str, ...]], lines: List[Tuple[str, ...]] + ) -> str: + if header: + lines = [header] + lines + return "\n".join( + "".join(x.ljust(self.column_width) for x in line) for line in lines + ) + + +@dataclass +class AnsiFormatter(Formatter): + BASIC_ANSI_CODES = { + BasicFormat.NONE: "", + BasicFormat.IMMEDIATE: Fore.LIGHTBLUE_EX, + BasicFormat.STACK: Fore.YELLOW, + BasicFormat.REGISTER: Fore.YELLOW, + BasicFormat.DELAY_SLOT: Fore.LIGHTBLACK_EX, + BasicFormat.DIFF_CHANGE: Fore.LIGHTBLUE_EX, + BasicFormat.DIFF_ADD: Fore.GREEN, + BasicFormat.DIFF_REMOVE: Fore.RED, + BasicFormat.SOURCE_FILENAME: Style.DIM + Style.BRIGHT, + # Underline (not in colorama) + bright + dim + BasicFormat.SOURCE_FUNCTION: Style.DIM + Style.BRIGHT + "\u001b[4m", + BasicFormat.SOURCE_OTHER: Style.DIM, + } + + ROTATION_ANSI_COLORS = [ + Fore.MAGENTA, + Fore.CYAN, + Fore.GREEN, + Fore.RED, + Fore.LIGHTYELLOW_EX, + Fore.LIGHTMAGENTA_EX, + Fore.LIGHTCYAN_EX, + Fore.LIGHTGREEN_EX, + Fore.LIGHTBLACK_EX, + ] + + column_width: int + + def apply_format(self, chunk: str, f: Format) -> str: + if f == BasicFormat.NONE: + return chunk + if isinstance(f, BasicFormat): + ansi_code = self.BASIC_ANSI_CODES[f] + elif isinstance(f, RotationFormat): + ansi_code = self.ROTATION_ANSI_COLORS[ + f.index % len(self.ROTATION_ANSI_COLORS) + ] + else: + static_assert_unreachable(f) + return f"{ansi_code}{chunk}{Style.RESET_ALL}" + + def table( + self, header: Optional[Tuple[str, ...]], lines: List[Tuple[str, ...]] + ) -> str: + if header: + lines = [header] + lines + return "\n".join("".join(self.ansi_ljust(x) for x in line) for line in lines) + + def ansi_ljust(self, s: str) -> str: + """Like s.ljust(width), but accounting for ANSI colors.""" + needed: int = self.column_width - ansiwrap.ansilen(s) + if needed > 0: + return s + " " * needed + else: + return s + + +@dataclass +class HtmlFormatter(Formatter): + rotation_formats: int = 9 + + def apply_format(self, chunk: str, f: Format) -> str: + chunk = html.escape(chunk) + if f == BasicFormat.NONE: + return chunk + if isinstance(f, BasicFormat): + class_name = f.name.lower().replace("_", "-") + data_attr = "" + elif isinstance(f, RotationFormat): + class_name = f"rotation-{f.index % self.rotation_formats}" + rotation_key = html.escape(f"{f.group};{f.key}", quote=True) + data_attr = f'data-rotation="{rotation_key}"' + else: + static_assert_unreachable(f) + return f"{chunk}" + + def table( + self, header: Optional[Tuple[str, ...]], lines: List[Tuple[str, ...]] + ) -> str: + def table_row(line: Tuple[str, ...], cell_el: str) -> str: + output_row = " " + for cell in line: + output_row += f"<{cell_el}>{cell}" + output_row += "\n" + return output_row + + output = "\n" + if header: + output += " \n" + output += table_row(header, "th") + output += " \n" + output += " \n" + output += "".join(table_row(line, "td") for line in lines) + output += " \n" + output += "
\n" + return output + + +def format_fields( + pat: Pattern[str], + out1: Text, + out2: Text, + color1: FormatFunction, + color2: Optional[FormatFunction] = None, +) -> Tuple[Text, Text]: + diffs = [ + of.group() != nf.group() + for (of, nf) in zip(out1.finditer(pat), out2.finditer(pat)) + ] + + it = iter(diffs) + + def maybe_color(color: FormatFunction, s: str) -> Text: + return Text(s, color(s)) if next(it, False) else Text(s) + + out1 = out1.sub(pat, lambda m: maybe_color(color1, m.group())) + it = iter(diffs) + out2 = out2.sub(pat, lambda m: maybe_color(color2 or color1, m.group())) + + return out1, out2 + + +def symbol_formatter(group: str, base_index: int) -> FormatFunction: + symbol_formats: Dict[str, Format] = {} + + def symbol_format(s: str) -> Format: + # TODO: it would be nice to use a unique Format for each symbol, so we could + # add extra UI elements in the HTML version + f = symbol_formats.get(s) + if f is None: + index = len(symbol_formats) + base_index + f = RotationFormat(key=s, index=index, group=group) + symbol_formats[s] = f + return f + + return symbol_format + + +# ==== LOGIC ==== + +ObjdumpCommand = Tuple[List[str], str, Optional[str]] + + def maybe_eval_int(expr: str) -> Optional[int]: try: ret = ast.literal_eval(expr) @@ -371,25 +736,27 @@ def eval_line_num(expr: str) -> int: return int(expr.strip().replace(":", ""), 16) -def run_make(target: str) -> None: - subprocess.check_call(["make"] + makeflags + [target]) +def run_make(target: str, project: ProjectSettings) -> None: + subprocess.check_call(project.build_command + [target]) -def run_make_capture_output(target: str) -> "subprocess.CompletedProcess[bytes]": +def run_make_capture_output( + target: str, project: ProjectSettings +) -> "subprocess.CompletedProcess[bytes]": return subprocess.run( - ["make"] + makeflags + [target], + project.build_command + [target], stderr=subprocess.PIPE, stdout=subprocess.PIPE, ) -def restrict_to_function(dump: str, fn_name: str) -> str: +def restrict_to_function(dump: str, fn_name: str, config: Config) -> str: out: List[str] = [] search = f"<{fn_name}>:" found = False for line in dump.split("\n"): if found: - if len(out) >= MAX_FUNCTION_SIZE_LINES: + if len(out) >= config.max_function_size_lines: break out.append(line) elif search in line: @@ -397,49 +764,59 @@ def restrict_to_function(dump: str, fn_name: str) -> str: return "\n".join(out) -def maybe_get_objdump_source_flags() -> List[str]: - if not args.source: +def maybe_get_objdump_source_flags(config: Config) -> List[str]: + if not config.source: return [] flags = [ "--source", - "--source-comment=│ ", "-l", ] - if args.inlines: + if not config.source_old_binutils: + flags.append("--source-comment=│ ") + + if config.inlines: flags.append("--inlines") return flags -def run_objdump(cmd: ObjdumpCommand) -> str: +def run_objdump(cmd: ObjdumpCommand, config: Config, project: ProjectSettings) -> str: flags, target, restrict = cmd - assert objdump_executable, "checked previously" - out = subprocess.check_output( - [objdump_executable] + arch_flags + flags + [target], universal_newlines=True - ) + try: + out = subprocess.run( + [project.objdump_executable] + config.arch.arch_flags + flags + [target], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ).stdout + except subprocess.CalledProcessError as e: + print(e.stdout) + print(e.stderr) + if "unrecognized option '--source-comment" in e.stderr: + fail("** Try using --source-old-binutils instead of --source **") + raise e + if restrict is not None: - return restrict_to_function(out, restrict) + return restrict_to_function(out, restrict, config) return out -base_shift: int = eval_int( - args.base_shift, "Failed to parse --base-shift (-S) argument as an integer." -) - - -def search_map_file(fn_name: str) -> Tuple[Optional[str], Optional[int]]: - if not mapfile: +def search_map_file( + fn_name: str, project: ProjectSettings +) -> Tuple[Optional[str], Optional[int]]: + if not project.mapfile: fail(f"No map file configured; cannot find function {fn_name}.") try: - with open(mapfile) as f: + with open(project.mapfile) as f: contents = f.read() except Exception: - fail(f"Failed to open map file {mapfile} for reading.") + fail(f"Failed to open map file {project.mapfile} for reading.") - if map_format == 'gnu': + if project.map_format == "gnu": lines = contents.split("\n") try: @@ -470,41 +847,66 @@ def search_map_file(fn_name: str) -> Tuple[Optional[str], Optional[int]]: fail(f"Found multiple occurrences of function {fn_name} in map file.") if len(cands) == 1: return cands[0] - elif map_format == 'mw': - # ram elf rom object name - find = re.findall(re.compile(r' \S+ \S+ (\S+) (\S+) . ' + fn_name + r'(?: \(entry of \.(?:init|text)\))? \t(\S+)'), contents) + elif project.map_format == "mw": + find = re.findall( + re.compile( + # ram elf rom + r" \S+ \S+ (\S+) (\S+) . " + + fn_name + # object name + + r"(?: \(entry of \.(?:init|text)\))? \t(\S+)" + ), + contents, + ) if len(find) > 1: fail(f"Found multiple occurrences of function {fn_name} in map file.") if len(find) == 1: - rom = int(find[0][1],16) + rom = int(find[0][1], 16) objname = find[0][2] - # The metrowerks linker map format does not contain the full object path, so we must complete it manually. - objfiles = [os.path.join(dirpath, f) for dirpath, _, filenames in os.walk(mw_build_dir) for f in filenames if f == objname] + # The metrowerks linker map format does not contain the full object path, + # so we must complete it manually. + objfiles = [ + os.path.join(dirpath, f) + for dirpath, _, filenames in os.walk(project.mw_build_dir) + for f in filenames + if f == objname + ] if len(objfiles) > 1: all_objects = "\n".join(objfiles) - fail(f"Found multiple objects of the same name {objname} in {mw_build_dir}, cannot determine which to diff against: \n{all_objects}") + fail( + f"Found multiple objects of the same name {objname} in {project.mw_build_dir}, " + f"cannot determine which to diff against: \n{all_objects}" + ) if len(objfiles) == 1: objfile = objfiles[0] - # TODO Currently the ram-rom conversion only works for diffing ELF executables, but it would likely be more convenient to diff DOLs. - # At this time it is recommended to always use -o when running the diff script as this mode does not make use of the ram-rom conversion + # TODO Currently the ram-rom conversion only works for diffing ELF + # executables, but it would likely be more convenient to diff DOLs. + # At this time it is recommended to always use -o when running the diff + # script as this mode does not make use of the ram-rom conversion. return objfile, rom else: - fail(f"Linker map format {map_format} unrecognised.") + fail(f"Linker map format {project.map_format} unrecognised.") return None, None -def dump_elf() -> Tuple[str, ObjdumpCommand, ObjdumpCommand]: - if not baseimg or not myimg: +def dump_elf( + start: str, + end: Optional[str], + diff_elf_symbol: str, + config: Config, + project: ProjectSettings, +) -> Tuple[str, ObjdumpCommand, ObjdumpCommand]: + if not project.baseimg or not project.myimg: fail("Missing myimg/baseimg in config.") - if base_shift: + if config.base_shift: fail("--base-shift not compatible with -e") - start_addr = eval_int(args.start, "Start address must be an integer expression.") + start_addr = eval_int(start, "Start address must be an integer expression.") - if args.end is not None: - end_addr = eval_int(args.end, "End address must be an integer expression.") + if end is not None: + end_addr = eval_int(end, "End address must be an integer expression.") else: - end_addr = start_addr + MAX_FUNCTION_SIZE_BYTES + end_addr = start_addr + config.max_function_size_bytes flags1 = [ f"--start-address={start_addr}", @@ -512,31 +914,37 @@ def dump_elf() -> Tuple[str, ObjdumpCommand, ObjdumpCommand]: ] flags2 = [ - f"--disassemble={args.diff_elf_symbol}", + f"--disassemble={diff_elf_symbol}", ] objdump_flags = ["-drz", "-j", ".text"] return ( - myimg, - (objdump_flags + flags1, baseimg, None), - (objdump_flags + flags2 + maybe_get_objdump_source_flags(), myimg, None), + project.myimg, + (objdump_flags + flags1, project.baseimg, None), + ( + objdump_flags + flags2 + maybe_get_objdump_source_flags(config), + project.myimg, + None, + ), ) -def dump_objfile() -> Tuple[str, ObjdumpCommand, ObjdumpCommand]: - if base_shift: +def dump_objfile( + start: str, end: Optional[str], config: Config, project: ProjectSettings +) -> Tuple[str, ObjdumpCommand, ObjdumpCommand]: + if config.base_shift: fail("--base-shift not compatible with -o") - if args.end is not None: + if end is not None: fail("end address not supported together with -o") - if args.start.startswith("0"): + if start.startswith("0"): fail("numerical start address not supported with -o; pass a function name") - objfile, _ = search_map_file(args.start) + objfile, _ = search_map_file(start, project) if not objfile: fail("Not able to find .o file for function.") - if args.make: - run_make(objfile) + if config.make: + run_make(objfile, project) if not os.path.isfile(objfile): fail(f"Not able to find .o file for function: {objfile} is not a file.") @@ -548,281 +956,49 @@ def dump_objfile() -> Tuple[str, ObjdumpCommand, ObjdumpCommand]: objdump_flags = ["-drz"] return ( objfile, - (objdump_flags, refobjfile, args.start), - (objdump_flags + maybe_get_objdump_source_flags(), objfile, args.start), + (objdump_flags, refobjfile, start), + (objdump_flags + maybe_get_objdump_source_flags(config), objfile, start), ) -def dump_binary() -> Tuple[str, ObjdumpCommand, ObjdumpCommand]: - if not baseimg or not myimg: +def dump_binary( + start: str, end: Optional[str], config: Config, project: ProjectSettings +) -> Tuple[str, ObjdumpCommand, ObjdumpCommand]: + if not project.baseimg or not project.myimg: fail("Missing myimg/baseimg in config.") - if args.make: - run_make(myimg) - start_addr = maybe_eval_int(args.start) + if config.make: + run_make(project.myimg, project) + start_addr = maybe_eval_int(start) if start_addr is None: - _, start_addr = search_map_file(args.start) + _, start_addr = search_map_file(start, project) if start_addr is None: fail("Not able to find function in map file.") - if args.end is not None: - end_addr = eval_int(args.end, "End address must be an integer expression.") + if end is not None: + end_addr = eval_int(end, "End address must be an integer expression.") else: - end_addr = start_addr + MAX_FUNCTION_SIZE_BYTES + end_addr = start_addr + config.max_function_size_bytes objdump_flags = ["-Dz", "-bbinary", "-EB"] flags1 = [ - f"--start-address={start_addr + base_shift}", - f"--stop-address={end_addr + base_shift}", + f"--start-address={start_addr + config.base_shift}", + f"--stop-address={end_addr + config.base_shift}", ] flags2 = [f"--start-address={start_addr}", f"--stop-address={end_addr}"] return ( - myimg, - (objdump_flags + flags1, baseimg, None), - (objdump_flags + flags2, myimg, None), + project.myimg, + (objdump_flags + flags1, project.baseimg, None), + (objdump_flags + flags2, project.myimg, None), ) -def ansi_ljust(s: str, width: int) -> str: - """Like s.ljust(width), but accounting for ANSI colors.""" - needed: int = width - ansiwrap.ansilen(s) - if needed > 0: - return s + " " * needed - else: - return s - - -if arch == "mips": - re_int = re.compile(r"[0-9]+") - re_comment = re.compile(r"<.*?>") - re_reg = re.compile( - r"\$?\b(a[0-3]|t[0-9]|s[0-8]|at|v[01]|f[12]?[0-9]|f3[01]|k[01]|fp|ra|zero)\b" - ) - re_sprel = re.compile(r"(?<=,)([0-9]+|0x[0-9a-f]+)\(sp\)") - re_large_imm = re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}") - re_imm = re.compile(r"(\b|-)([0-9]+|0x[0-9a-fA-F]+)\b(?!\(sp)|%(lo|hi)\([^)]*\)") - forbidden = set(string.ascii_letters + "_") - arch_flags = ["-m", "mips:4300"] - branch_likely_instructions = { - "beql", - "bnel", - "beqzl", - "bnezl", - "bgezl", - "bgtzl", - "blezl", - "bltzl", - "bc1tl", - "bc1fl", - } - branch_instructions = branch_likely_instructions.union( - { - "b", - "beq", - "bne", - "beqz", - "bnez", - "bgez", - "bgtz", - "blez", - "bltz", - "bc1t", - "bc1f", - } - ) - instructions_with_address_immediates = branch_instructions.union({"jal", "j"}) -elif arch == "aarch64": - re_int = re.compile(r"[0-9]+") - re_comment = re.compile(r"(<.*?>|//.*$)") - # GPRs and FP registers: X0-X30, W0-W30, [DSHQ]0..31 - # The zero registers and SP should not be in this list. - re_reg = re.compile(r"\$?\b([dshq][12]?[0-9]|[dshq]3[01]|[xw][12]?[0-9]|[xw]30)\b") - re_sprel = re.compile(r"sp, #-?(0x[0-9a-fA-F]+|[0-9]+)\b") - re_large_imm = re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}") - re_imm = re.compile(r"(?|//.*$)") - re_reg = re.compile(r"\$?\b([rf][0-9]+)\b") - re_sprel = re.compile(r"(?<=,)(-?[0-9]+|-?0x[0-9a-f]+)\(r1\)") - re_large_imm = re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}") - re_imm = re.compile(r"(\b|-)([0-9]+|0x[0-9a-fA-F]+)\b(?!\(r1)|[^@]*@(ha|h|lo)") - arch_flags = [] - forbidden = set(string.ascii_letters + "_") - branch_likely_instructions = set() - branch_instructions = { - "b", - "beq", - "beq+", - "beq-", - "bne", - "bne+", - "bne-", - "blt", - "blt+", - "blt-", - "ble", - "ble+", - "ble-", - "bdnz", - "bdnz+", - "bdnz-", - "bge", - "bge+", - "bge-", - "bgt", - "bgt+", - "bgt-", - } - instructions_with_address_immediates = branch_instructions.union({"bl"}) -else: - fail(f"Unknown architecture: {arch}") - - -def hexify_int(row: str, pat: Match[str]) -> str: - full = pat.group(0) - if len(full) <= 1: - # leave one-digit ints alone - return full - start, end = pat.span() - if start and row[start - 1] in forbidden: - return full - if end < len(row) and row[end] in forbidden: - return full - return hex(int(full)) - - -def parse_relocated_line(line: str) -> Tuple[str, str, str]: - try: - ind2 = line.rindex(",") - except ValueError: - try: - ind2 = line.rindex("\t") - except ValueError: - ind2 = line.rindex(" ") - before = line[: ind2 + 1] - after = line[ind2 + 1 :] - ind2 = after.find("(") - if ind2 == -1: - imm, after = after, "" - else: - imm, after = after[:ind2], after[ind2:] - if imm == "0x0": - imm = "0" - return before, imm, after - - -def process_mips_reloc(row: str, prev: str) -> str: - before, imm, after = parse_relocated_line(prev) - repl = row.split()[-1] - if imm != "0": - # MIPS uses relocations with addends embedded in the code as immediates. - # If there is an immediate, show it as part of the relocation. Ideally - # we'd show this addend in both %lo/%hi, but annoyingly objdump's output - # doesn't include enough information to pair up %lo's and %hi's... - # TODO: handle unambiguous cases where all addends for a symbol are the - # same, or show "+???". - mnemonic = prev.split()[0] - if mnemonic in instructions_with_address_immediates and not imm.startswith("0x"): - imm = "0x" + imm - repl += "+" + imm if int(imm, 0) > 0 else imm - if "R_MIPS_LO16" in row: - repl = f"%lo({repl})" - elif "R_MIPS_HI16" in row: - # Ideally we'd pair up R_MIPS_LO16 and R_MIPS_HI16 to generate a - # correct addend for each, but objdump doesn't give us the order of - # the relocations, so we can't find the right LO16. :( - repl = f"%hi({repl})" - elif "R_MIPS_26" in row: - # Function calls - pass - elif "R_MIPS_PC16" in row: - # Branch to glabel. This gives confusing output, but there's not much - # we can do here. - pass - else: - assert False, f"unknown relocation type '{row}' for line '{prev}'" - return before + repl + after - - -def process_ppc_reloc(row: str, prev: str) -> str: - assert any(r in row for r in ["R_PPC_REL24", "R_PPC_ADDR16", "R_PPC_EMB_SDA21"]), f"unknown relocation type '{row}' for line '{prev}'" - before, imm, after = parse_relocated_line(prev) - repl = row.split()[-1] - if "R_PPC_REL24" in row: - # function calls - pass - elif "R_PPC_ADDR16_HI" in row: - # absolute hi of addr - repl = f"{repl}@h" - elif "R_PPC_ADDR16_HA" in row: - # adjusted hi of addr - repl = f"{repl}@ha" - elif "R_PPC_ADDR16_LO" in row: - # lo of addr - repl = f"{repl}@l" - elif "R_PPC_ADDR16" in row: - # 16-bit absolute addr - if "+0x7" in repl: - # remove the very large addends as they are an artifact of (label-_SDA(2)_BASE_) - # computations and are unimportant in a diff setting. - if int(repl.split("+")[1],16) > 0x70000000: - repl = repl.split("+")[0] - elif "R_PPC_EMB_SDA21" in row: - # small data area - pass - return before + repl + after - - -def pad_mnemonic(line: str) -> str: - if "\t" not in line: - return line - mn, args = line.split("\t", 1) - return f"{mn:<7s} {args}" - - -class Line(NamedTuple): - mnemonic: str - diff_row: str - original: str - normalized_original: str - line_num: str - branch_target: Optional[str] - source_lines: List[str] - comment: Optional[str] - - class DifferenceNormalizer: + def __init__(self, config: Config) -> None: + self.config = config + def normalize(self, mnemonic: str, row: str) -> str: """This should be called exactly once for each line.""" row = self._normalize_arch_specific(mnemonic, row) - if args.ignore_large_imms: - row = re.sub(re_large_imm, "", row) + if self.config.ignore_large_imms: + row = re.sub(self.config.arch.re_large_imm, "", row) return row def _normalize_arch_specific(self, mnemonic: str, row: str) -> str: @@ -830,12 +1006,12 @@ class DifferenceNormalizer: class DifferenceNormalizerAArch64(DifferenceNormalizer): - def __init__(self) -> None: - super().__init__() + def __init__(self, config: Config) -> None: + super().__init__(config) self._adrp_pair_registers: Set[str] = set() def _normalize_arch_specific(self, mnemonic: str, row: str) -> str: - if args.ignore_addr_diffs: + if self.config.ignore_addr_diffs: row = self._normalize_adrp_differences(mnemonic, row) row = self._normalize_bl(mnemonic, row) return row @@ -867,28 +1043,276 @@ class DifferenceNormalizerAArch64(DifferenceNormalizer): # ldr xxx, [reg, ] if f", [{reg}" in row_parts[1]: self._adrp_pair_registers.remove(reg) - return normalize_imms(row) + return normalize_imms(row, AARCH64_SETTINGS) elif mnemonic == "add": for reg in self._adrp_pair_registers: # add reg, reg, if row_parts[1].startswith(f"{reg}, {reg}, "): self._adrp_pair_registers.remove(reg) - return normalize_imms(row) + return normalize_imms(row, AARCH64_SETTINGS) return row -def make_difference_normalizer() -> DifferenceNormalizer: - if arch == "aarch64": - return DifferenceNormalizerAArch64() - return DifferenceNormalizer() +@dataclass +class ArchSettings: + re_int: Pattern[str] + re_comment: Pattern[str] + re_reg: Pattern[str] + re_sprel: Pattern[str] + re_large_imm: Pattern[str] + re_imm: Pattern[str] + branch_instructions: Set[str] + instructions_with_address_immediates: Set[str] + forbidden: Set[str] = field(default_factory=lambda: set(string.ascii_letters + "_")) + arch_flags: List[str] = field(default_factory=list) + branch_likely_instructions: Set[str] = field(default_factory=set) + difference_normalizer: Type[DifferenceNormalizer] = DifferenceNormalizer -def process(lines: List[str]) -> List[Line]: - normalizer = make_difference_normalizer() +MIPS_BRANCH_LIKELY_INSTRUCTIONS = { + "beql", + "bnel", + "beqzl", + "bnezl", + "bgezl", + "bgtzl", + "blezl", + "bltzl", + "bc1tl", + "bc1fl", +} +MIPS_BRANCH_INSTRUCTIONS = MIPS_BRANCH_LIKELY_INSTRUCTIONS.union( + { + "b", + "beq", + "bne", + "beqz", + "bnez", + "bgez", + "bgtz", + "blez", + "bltz", + "bc1t", + "bc1f", + } +) + +AARCH64_BRANCH_INSTRUCTIONS = { + "bl", + "b", + "b.eq", + "b.ne", + "b.cs", + "b.hs", + "b.cc", + "b.lo", + "b.mi", + "b.pl", + "b.vs", + "b.vc", + "b.hi", + "b.ls", + "b.ge", + "b.lt", + "b.gt", + "b.le", + "cbz", + "cbnz", + "tbz", + "tbnz", +} + +PPC_BRANCH_INSTRUCTIONS = { + "b", + "beq", + "beq+", + "beq-", + "bne", + "bne+", + "bne-", + "blt", + "blt+", + "blt-", + "ble", + "ble+", + "ble-", + "bdnz", + "bdnz+", + "bdnz-", + "bge", + "bge+", + "bge-", + "bgt", + "bgt+", + "bgt-", +} + +MIPS_SETTINGS = ArchSettings( + re_int=re.compile(r"[0-9]+"), + re_comment=re.compile(r"<.*?>"), + re_reg=re.compile( + r"\$?\b(a[0-3]|t[0-9]|s[0-8]|at|v[01]|f[12]?[0-9]|f3[01]|k[01]|fp|ra|zero)\b" + ), + re_sprel=re.compile(r"(?<=,)([0-9]+|0x[0-9a-f]+)\(sp\)"), + re_large_imm=re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}"), + re_imm=re.compile(r"(\b|-)([0-9]+|0x[0-9a-fA-F]+)\b(?!\(sp)|%(lo|hi)\([^)]*\)"), + arch_flags=["-m", "mips:4300"], + branch_likely_instructions=MIPS_BRANCH_LIKELY_INSTRUCTIONS, + branch_instructions=MIPS_BRANCH_INSTRUCTIONS, + instructions_with_address_immediates=MIPS_BRANCH_INSTRUCTIONS.union({"jal", "j"}), +) + +AARCH64_SETTINGS = ArchSettings( + re_int=re.compile(r"[0-9]+"), + re_comment=re.compile(r"(<.*?>|//.*$)"), + # GPRs and FP registers: X0-X30, W0-W30, [DSHQ]0..31 + # The zero registers and SP should not be in this list. + re_reg=re.compile(r"\$?\b([dshq][12]?[0-9]|[dshq]3[01]|[xw][12]?[0-9]|[xw]30)\b"), + re_sprel=re.compile(r"sp, #-?(0x[0-9a-fA-F]+|[0-9]+)\b"), + re_large_imm=re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}"), + re_imm=re.compile(r"(?|//.*$)"), + re_reg=re.compile(r"\$?\b([rf][0-9]+)\b"), + re_sprel=re.compile(r"(?<=,)(-?[0-9]+|-?0x[0-9a-f]+)\(r1\)"), + re_large_imm=re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}"), + re_imm=re.compile(r"(\b|-)([0-9]+|0x[0-9a-fA-F]+)\b(?!\(r1)|[^@]*@(ha|h|lo)"), + branch_instructions=PPC_BRANCH_INSTRUCTIONS, + instructions_with_address_immediates=PPC_BRANCH_INSTRUCTIONS.union({"bl"}), +) + + +def hexify_int(row: str, pat: Match[str], arch: ArchSettings) -> str: + full = pat.group(0) + if len(full) <= 1: + # leave one-digit ints alone + return full + start, end = pat.span() + if start and row[start - 1] in arch.forbidden: + return full + if end < len(row) and row[end] in arch.forbidden: + return full + return hex(int(full)) + + +def parse_relocated_line(line: str) -> Tuple[str, str, str]: + for c in ",\t ": + if c in line: + ind2 = line.rindex(c) + break + else: + raise Exception(f"failed to parse relocated line: {line}") + before = line[: ind2 + 1] + after = line[ind2 + 1 :] + ind2 = after.find("(") + if ind2 == -1: + imm, after = after, "" + else: + imm, after = after[:ind2], after[ind2:] + if imm == "0x0": + imm = "0" + return before, imm, after + + +def process_mips_reloc(row: str, prev: str, arch: ArchSettings) -> str: + before, imm, after = parse_relocated_line(prev) + repl = row.split()[-1] + if imm != "0": + # MIPS uses relocations with addends embedded in the code as immediates. + # If there is an immediate, show it as part of the relocation. Ideally + # we'd show this addend in both %lo/%hi, but annoyingly objdump's output + # doesn't include enough information to pair up %lo's and %hi's... + # TODO: handle unambiguous cases where all addends for a symbol are the + # same, or show "+???". + mnemonic = prev.split()[0] + if ( + mnemonic in arch.instructions_with_address_immediates + and not imm.startswith("0x") + ): + imm = "0x" + imm + repl += "+" + imm if int(imm, 0) > 0 else imm + if "R_MIPS_LO16" in row: + repl = f"%lo({repl})" + elif "R_MIPS_HI16" in row: + # Ideally we'd pair up R_MIPS_LO16 and R_MIPS_HI16 to generate a + # correct addend for each, but objdump doesn't give us the order of + # the relocations, so we can't find the right LO16. :( + repl = f"%hi({repl})" + elif "R_MIPS_26" in row: + # Function calls + pass + elif "R_MIPS_PC16" in row: + # Branch to glabel. This gives confusing output, but there's not much + # we can do here. + pass + else: + assert False, f"unknown relocation type '{row}' for line '{prev}'" + return before + repl + after + + +def process_ppc_reloc(row: str, prev: str) -> str: + assert any( + r in row for r in ["R_PPC_REL24", "R_PPC_ADDR16", "R_PPC_EMB_SDA21"] + ), f"unknown relocation type '{row}' for line '{prev}'" + before, imm, after = parse_relocated_line(prev) + repl = row.split()[-1] + if "R_PPC_REL24" in row: + # function calls + pass + elif "R_PPC_ADDR16_HI" in row: + # absolute hi of addr + repl = f"{repl}@h" + elif "R_PPC_ADDR16_HA" in row: + # adjusted hi of addr + repl = f"{repl}@ha" + elif "R_PPC_ADDR16_LO" in row: + # lo of addr + repl = f"{repl}@l" + elif "R_PPC_ADDR16" in row: + # 16-bit absolute addr + if "+0x7" in repl: + # remove the very large addends as they are an artifact of (label-_SDA(2)_BASE_) + # computations and are unimportant in a diff setting. + if int(repl.split("+")[1], 16) > 0x70000000: + repl = repl.split("+")[0] + elif "R_PPC_EMB_SDA21" in row: + # small data area + pass + return before + repl + after + + +def pad_mnemonic(line: str) -> str: + if "\t" not in line: + return line + mn, args = line.split("\t", 1) + return f"{mn:<7s} {args}" + + +@dataclass +class Line: + mnemonic: str + diff_row: str + original: str + normalized_original: str + line_num: str + branch_target: Optional[str] + source_lines: List[str] + comment: Optional[str] + + +def process(lines: List[str], config: Config) -> List[Line]: + arch = config.arch + normalizer = arch.difference_normalizer(config) skip_next = False source_lines = [] - if not args.diff_obj: + if not config.diff_obj: lines = lines[7:] if lines and not lines[-1]: lines.pop() @@ -896,10 +1320,10 @@ def process(lines: List[str]) -> List[Line]: output: List[Line] = [] stop_after_delay_slot = False for row in lines: - if args.diff_obj and (">:" in row or not row): + if config.diff_obj and (">:" in row or not row): continue - if args.source and (row and row[0] != " "): + if config.source and not config.source_old_binutils and (row and row[0] != " "): source_lines.append(row) continue @@ -910,19 +1334,28 @@ def process(lines: List[str]) -> List[Line]: if "R_MIPS_" in row: # N.B. Don't transform the diff rows, they already ignore immediates # if output[-1].diff_row != "": - # output[-1] = output[-1].replace(diff_row=process_mips_reloc(row, output[-1].row_with_imm)) - new_original = process_mips_reloc(row, output[-1].original) - output[-1] = output[-1]._replace(original=new_original) + # output[-1] = output[-1].replace(diff_row=process_mips_reloc(row, output[-1].row_with_imm, arch)) + new_original = process_mips_reloc(row, output[-1].original, arch) + output[-1] = replace(output[-1], original=new_original) continue if "R_PPC_" in row: new_original = process_ppc_reloc(row, output[-1].original) - output[-1] = output[-1]._replace(original=new_original) + output[-1] = replace(output[-1], original=new_original) continue - m_comment = re.search(re_comment, row) + # match source lines here to avoid matching relocation lines + if ( + config.source + and config.source_old_binutils + and (row and not re.match(r"^ +[0-9a-f]+:\t", row)) + ): + source_lines.append(row) + continue + + m_comment = re.search(arch.re_comment, row) comment = m_comment[0] if m_comment else None - row = re.sub(re_comment, "", row) + row = re.sub(arch.re_comment, "", row) row = row.rstrip() tabs = row.split("\t") row = "\t".join(tabs[2:]) @@ -935,32 +1368,32 @@ def process(lines: List[str]) -> List[Line]: row_parts = [part.lstrip() for part in row.split(" ", 1)] mnemonic = row_parts[0].strip() - if mnemonic not in instructions_with_address_immediates: - row = re.sub(re_int, lambda m: hexify_int(row, m), row) + if mnemonic not in arch.instructions_with_address_immediates: + row = re.sub(arch.re_int, lambda m: hexify_int(row, m, arch), row) original = row normalized_original = normalizer.normalize(mnemonic, original) if skip_next: skip_next = False row = "" mnemonic = "" - if mnemonic in branch_likely_instructions: + if mnemonic in arch.branch_likely_instructions: skip_next = True - row = re.sub(re_reg, "", row) - row = re.sub(re_sprel, "addr(sp)", row) + row = re.sub(arch.re_reg, "", row) + row = re.sub(arch.re_sprel, "addr(sp)", row) row_with_imm = row - if mnemonic in instructions_with_address_immediates: + if mnemonic in arch.instructions_with_address_immediates: row = row.strip() row, _ = split_off_branch(row) row += "" else: - row = normalize_imms(row) + row = normalize_imms(row, arch) branch_target = None - if mnemonic in branch_instructions: - target = row_parts[1].strip().split(",")[-1] - if mnemonic in branch_likely_instructions: - target = hex(int(target, 16) - 4)[2:] - branch_target = target.strip() + if mnemonic in arch.branch_instructions: + target = int(row_parts[1].strip().split(",")[-1], 16) + if mnemonic in arch.branch_likely_instructions: + target -= 4 + branch_target = hex(target)[2:] output.append( Line( @@ -976,7 +1409,7 @@ def process(lines: List[str]) -> List[Line]: ) source_lines = [] - if args.stop_jrra and mnemonic == "jr" and row_parts[1].strip() == "ra": + if config.stop_jrra and mnemonic == "jr" and row_parts[1].strip() == "ra": stop_after_delay_slot = True elif stop_after_delay_slot: break @@ -984,34 +1417,12 @@ def process(lines: List[str]) -> List[Line]: return output -def format_single_line_diff(line1: str, line2: str, column_width: int) -> str: - return ansi_ljust(line1, column_width) + line2 +def normalize_imms(row: str, arch: ArchSettings) -> str: + return re.sub(arch.re_imm, "", row) -class SymbolColorer: - symbol_colors: Dict[str, str] - - def __init__(self, base_index: int) -> None: - self.color_index = base_index - self.symbol_colors = {} - - def color_symbol(self, s: str, t: Optional[str] = None) -> str: - try: - color = self.symbol_colors[s] - except: - color = COLOR_ROTATION[self.color_index % len(COLOR_ROTATION)] - self.color_index += 1 - self.symbol_colors[s] = color - t = t or s - return f"{color}{t}{Fore.RESET}" - - -def normalize_imms(row: str) -> str: - return re.sub(re_imm, "", row) - - -def normalize_stack(row: str) -> str: - return re.sub(re_sprel, "addr(sp)", row) +def normalize_stack(row: str, arch: ArchSettings) -> str: + return re.sub(arch.re_sprel, "addr(sp)", row) def split_off_branch(line: str) -> Tuple[str, str]: @@ -1021,28 +1432,6 @@ def split_off_branch(line: str) -> Tuple[str, str]: off = len(line) - len(parts[-1]) return line[:off], line[off:] -ColorFunction = Callable[[str], str] - -def color_fields(pat: Pattern[str], out1: str, out2: str, color1: ColorFunction, color2: Optional[ColorFunction]=None) -> Tuple[str, str]: - diffs = [of.group() != nf.group() for (of, nf) in zip(pat.finditer(out1), pat.finditer(out2))] - - it = iter(diffs) - def maybe_color(color: ColorFunction, s: str) -> str: - return color(s) if next(it, False) else f"{Style.RESET_ALL}{s}" - - out1 = pat.sub(lambda m: maybe_color(color1, m.group()), out1) - it = iter(diffs) - out2 = pat.sub(lambda m: maybe_color(color2 or color1, m.group()), out2) - - return out1, out2 - - -def color_branch_imms(br1: str, br2: str) -> Tuple[str, str]: - if br1 != br2: - br1 = f"{Fore.LIGHTBLUE_EX}{br1}{Style.RESET_ALL}" - br2 = f"{Fore.LIGHTBLUE_EX}{br2}{Style.RESET_ALL}" - return br1, br2 - def diff_sequences_difflib( seq1: List[str], seq2: List[str] @@ -1052,10 +1441,10 @@ def diff_sequences_difflib( def diff_sequences( - seq1: List[str], seq2: List[str] + seq1: List[str], seq2: List[str], algorithm: str ) -> List[Tuple[str, int, int, int, int]]: if ( - args.algorithm != "levenshtein" + algorithm != "levenshtein" or len(seq1) * len(seq2) > 4 * 10 ** 8 or len(seq1) + len(seq2) >= 0x110000 ): @@ -1077,17 +1466,21 @@ def diff_sequences( rem1 = remap(seq1) rem2 = remap(seq2) + import Levenshtein # type: ignore + return Levenshtein.opcodes(rem1, rem2) # type: ignore def diff_lines( lines1: List[Line], lines2: List[Line], + algorithm: str, ) -> List[Tuple[Optional[Line], Optional[Line]]]: ret = [] for (tag, i1, i2, j1, j2) in diff_sequences( [line.mnemonic for line in lines1], [line.mnemonic for line in lines2], + algorithm, ): for line1, line2 in itertools.zip_longest(lines1[i1:i2], lines2[j1:j2]): if tag == "replace": @@ -1104,41 +1497,33 @@ def diff_lines( return ret +@dataclass(frozen=True) class OutputLine: - base: Optional[str] - fmt2: str + base: Optional[Text] = field(compare=False) + fmt2: Text = field(compare=False) key2: Optional[str] - def __init__(self, base: Optional[str], fmt2: str, key2: Optional[str]) -> None: - self.base = base - self.fmt2 = fmt2 - self.key2 = key2 - def __eq__(self, other: object) -> bool: - if not isinstance(other, OutputLine): - return NotImplemented - return self.key2 == other.key2 - - def __hash__(self) -> int: - return hash(self.key2) - - -def do_diff(basedump: str, mydump: str) -> List[OutputLine]: +def do_diff(basedump: str, mydump: str, config: Config) -> List[OutputLine]: + if config.source: + import cxxfilt # type: ignore + arch = config.arch + fmt = config.formatter output: List[OutputLine] = [] - lines1 = process(basedump.split("\n")) - lines2 = process(mydump.split("\n")) + lines1 = process(basedump.split("\n"), config) + lines2 = process(mydump.split("\n"), config) - sc1 = SymbolColorer(0) - sc2 = SymbolColorer(0) - sc3 = SymbolColorer(4) - sc4 = SymbolColorer(4) - sc5 = SymbolColorer(0) - sc6 = SymbolColorer(0) + sc1 = symbol_formatter("base-reg", 0) + sc2 = symbol_formatter("my-reg", 0) + sc3 = symbol_formatter("base-stack", 4) + sc4 = symbol_formatter("my-stack", 4) + sc5 = symbol_formatter("base-branch", 0) + sc6 = symbol_formatter("my-branch", 0) bts1: Set[str] = set() bts2: Set[str] = set() - if args.show_branches: + if config.show_branches: for (lines, btset, sc) in [ (lines1, bts1, sc5), (lines2, bts2, sc6), @@ -1146,130 +1531,150 @@ def do_diff(basedump: str, mydump: str) -> List[OutputLine]: for line in lines: bt = line.branch_target if bt is not None: - btset.add(bt + ":") - sc.color_symbol(bt + ":") + text = f"{bt}:" + btset.add(text) + sc(text) - for (line1, line2) in diff_lines(lines1, lines2): - line_color1 = line_color2 = sym_color = Fore.RESET + for (line1, line2) in diff_lines(lines1, lines2, config.algorithm): + line_color1 = line_color2 = sym_color = BasicFormat.NONE line_prefix = " " + out1 = Text() if not line1 else Text(pad_mnemonic(line1.original)) + out2 = Text() if not line2 else Text(pad_mnemonic(line2.original)) if line1 and line2 and line1.diff_row == line2.diff_row: if line1.normalized_original == line2.normalized_original: - out1 = line1.original - out2 = line2.original + pass elif line1.diff_row == "": - out1 = f"{Style.BRIGHT}{Fore.LIGHTBLACK_EX}{line1.original}" - out2 = f"{Style.BRIGHT}{Fore.LIGHTBLACK_EX}{line2.original}" + out1 = out1.reformat(BasicFormat.DELAY_SLOT) + out2 = out2.reformat(BasicFormat.DELAY_SLOT) else: mnemonic = line1.original.split()[0] - out1, out2 = line1.original, line2.original - branch1 = branch2 = "" - if mnemonic in instructions_with_address_immediates: - out1, branch1 = split_off_branch(line1.original) - out2, branch2 = split_off_branch(line2.original) - branchless1 = out1 - branchless2 = out2 - out1, out2 = color_fields(re_imm, out1, out2, lambda s: f"{Fore.LIGHTBLUE_EX}{s}{Style.RESET_ALL}") + branch1 = branch2 = Text() + if mnemonic in arch.instructions_with_address_immediates: + out1, branch1 = map(Text, split_off_branch(out1.plain())) + out2, branch2 = map(Text, split_off_branch(out2.plain())) + branchless1 = out1.plain() + branchless2 = out2.plain() + out1, out2 = format_fields( + arch.re_imm, out1, out2, lambda _: BasicFormat.IMMEDIATE + ) same_relative_target = False if line1.branch_target is not None and line2.branch_target is not None: - relative_target1 = eval_line_num(line1.branch_target) - eval_line_num(line1.line_num) - relative_target2 = eval_line_num(line2.branch_target) - eval_line_num(line2.line_num) + relative_target1 = eval_line_num( + line1.branch_target + ) - eval_line_num(line1.line_num) + relative_target2 = eval_line_num( + line2.branch_target + ) - eval_line_num(line2.line_num) same_relative_target = relative_target1 == relative_target2 - if not same_relative_target: - branch1, branch2 = color_branch_imms(branch1, branch2) + if not same_relative_target and branch1.plain() != branch2.plain(): + branch1 = branch1.reformat(BasicFormat.IMMEDIATE) + branch2 = branch2.reformat(BasicFormat.IMMEDIATE) out1 += branch1 out2 += branch2 - if normalize_imms(branchless1) == normalize_imms(branchless2): + if normalize_imms(branchless1, arch) == normalize_imms( + branchless2, arch + ): if not same_relative_target: # only imms differences - sym_color = Fore.LIGHTBLUE_EX + sym_color = BasicFormat.IMMEDIATE line_prefix = "i" else: - out1, out2 = color_fields(re_sprel, out1, out2, sc3.color_symbol, sc4.color_symbol) - if normalize_stack(branchless1) == normalize_stack(branchless2): + out1, out2 = format_fields(arch.re_sprel, out1, out2, sc3, sc4) + if normalize_stack(branchless1, arch) == normalize_stack( + branchless2, arch + ): # only stack differences (luckily stack and imm # differences can't be combined in MIPS, so we # don't have to think about that case) - sym_color = Fore.YELLOW + sym_color = BasicFormat.STACK line_prefix = "s" else: # regs differences and maybe imms as well - out1, out2 = color_fields(re_reg, out1, out2, sc1.color_symbol, sc2.color_symbol) - line_color1 = line_color2 = sym_color = Fore.YELLOW + out1, out2 = format_fields(arch.re_reg, out1, out2, sc1, sc2) + line_color1 = line_color2 = sym_color = BasicFormat.REGISTER line_prefix = "r" elif line1 and line2: line_prefix = "|" - line_color1 = Fore.LIGHTBLUE_EX - line_color2 = Fore.LIGHTBLUE_EX - sym_color = Fore.LIGHTBLUE_EX - out1 = line1.original - out2 = line2.original + line_color1 = line_color2 = sym_color = BasicFormat.DIFF_CHANGE + out1 = out1.reformat(line_color1) + out2 = out2.reformat(line_color2) elif line1: line_prefix = "<" - line_color1 = sym_color = Fore.RED - out1 = line1.original - out2 = "" + line_color1 = sym_color = BasicFormat.DIFF_REMOVE + out1 = out1.reformat(line_color1) + out2 = Text() elif line2: line_prefix = ">" - line_color2 = sym_color = Fore.GREEN - out1 = "" - out2 = line2.original + line_color2 = sym_color = BasicFormat.DIFF_ADD + out1 = Text() + out2 = out2.reformat(line_color2) - if args.source and line2 and line2.comment: + if config.source and line2 and line2.comment: out2 += f" {line2.comment}" def format_part( - out: str, + out: Text, line: Optional[Line], - line_color: str, + line_color: Format, btset: Set[str], - sc: SymbolColorer, - ) -> Optional[str]: + sc: FormatFunction, + ) -> Optional[Text]: if line is None: return None - in_arrow = " " - out_arrow = "" - if args.show_branches: + in_arrow = Text(" ") + out_arrow = Text() + if config.show_branches: if line.line_num in btset: - in_arrow = sc.color_symbol(line.line_num, "~>") + line_color + in_arrow = Text("~>", sc(line.line_num)) if line.branch_target is not None: - out_arrow = " " + sc.color_symbol(line.branch_target + ":", "~>") - out = pad_mnemonic(out) - return f"{line_color}{line.line_num} {in_arrow} {out}{Style.RESET_ALL}{out_arrow}" + out_arrow = " " + Text("~>", sc(line.branch_target + ":")) + return ( + Text(line.line_num, line_color) + " " + in_arrow + " " + out + out_arrow + ) part1 = format_part(out1, line1, line_color1, bts1, sc5) part2 = format_part(out2, line2, line_color2, bts2, sc6) key2 = line2.original if line2 else None - mid = f"{sym_color}{line_prefix}" - if line2: for source_line in line2.source_lines: - color = Style.DIM - # File names and function names - if source_line and source_line[0] != "│": - color += Style.BRIGHT - # Function names - if source_line.endswith("():"): - # Underline. Colorama does not provide this feature, unfortunately. - color += "\u001b[4m" + line_format = BasicFormat.SOURCE_OTHER + if config.source_old_binutils: + if source_line and re.fullmatch(".*\.c(?:pp)?:\d+", source_line): + line_format = BasicFormat.SOURCE_FILENAME + elif source_line and source_line.endswith("():"): + line_format = BasicFormat.SOURCE_FUNCTION try: source_line = cxxfilt.demangle( source_line[:-3], external_only=False ) except: pass + else: + # File names and function names + if source_line and source_line[0] != "│": + line_format = BasicFormat.SOURCE_FILENAME + # Function names + if source_line.endswith("():"): + line_format = BasicFormat.SOURCE_FUNCTION + try: + source_line = cxxfilt.demangle( + source_line[:-3], external_only=False + ) + except: + pass output.append( OutputLine( None, - f" {color}{source_line}{Style.RESET_ALL}", + " " + Text(source_line, line_format), source_line, ) ) - fmt2 = mid + " " + (part2 or "") + fmt2 = Text(line_prefix, sym_color) + " " + (part2 or Text()) output.append(OutputLine(part1, fmt2, key2)) return output @@ -1290,13 +1695,14 @@ def chunk_diff(diff: List[OutputLine]) -> List[Union[List[OutputLine], OutputLin def format_diff( - old_diff: List[OutputLine], new_diff: List[OutputLine] -) -> Tuple[str, List[str]]: + old_diff: List[OutputLine], new_diff: List[OutputLine], config: Config +) -> Tuple[Optional[Tuple[str, ...]], List[Tuple[str, ...]]]: + fmt = config.formatter old_chunks = chunk_diff(old_diff) new_chunks = chunk_diff(new_diff) - output: List[Tuple[str, OutputLine, OutputLine]] = [] + output: List[Tuple[Text, OutputLine, OutputLine]] = [] assert len(old_chunks) == len(new_chunks), "same target" - empty = OutputLine("", "", None) + empty = OutputLine(Text(), Text(), None) for old_chunk, new_chunk in zip(old_chunks, new_chunks): if isinstance(old_chunk, list): assert isinstance(new_chunk, list) @@ -1308,13 +1714,13 @@ def format_diff( for (tag, i1, i2, j1, j2) in differ.get_opcodes(): if tag in ["equal", "replace"]: for i, j in zip(range(i1, i2), range(j1, j2)): - output.append(("", old_chunk[i], new_chunk[j])) + output.append((Text(), old_chunk[i], new_chunk[j])) if tag in ["insert", "replace"]: for j in range(j1 + i2 - i1, j2): - output.append(("", empty, new_chunk[j])) + output.append((Text(), empty, new_chunk[j])) if tag in ["delete", "replace"]: for i in range(i1 + j2 - j1, i2): - output.append(("", old_chunk[i], empty)) + output.append((Text(), old_chunk[i], empty)) else: assert isinstance(new_chunk, OutputLine) assert new_chunk.base @@ -1324,19 +1730,22 @@ def format_diff( output.append((new_chunk.base, old_chunk, new_chunk)) # TODO: status line, with e.g. approximate permuter score? - width = args.column_width - if args.threeway: - header_line = "TARGET".ljust(width) + " CURRENT".ljust(width) + " PREVIOUS" + header_line: Optional[Tuple[str, ...]] + diff_lines: List[Tuple[str, ...]] + if config.threeway: + header_line = ("TARGET", " CURRENT", " PREVIOUS") diff_lines = [ - ansi_ljust(base, width) - + ansi_ljust(new.fmt2, width) - + (old.fmt2 or "-" if old != new else "") + ( + fmt.apply(base), + fmt.apply(new.fmt2), + fmt.apply(old.fmt2) or "-" if old != new else "", + ) for (base, old, new) in output ] else: - header_line = "" + header_line = None diff_lines = [ - ansi_ljust(base, width) + new.fmt2 + (fmt.apply(base), fmt.apply(new.fmt2)) for (base, old, new) in output if base or new.key2 is not None ] @@ -1346,7 +1755,8 @@ def format_diff( def debounced_fs_watch( targets: List[str], outq: "queue.Queue[Optional[float]]", - debounce_delay: float, + config: Config, + project: ProjectSettings, ) -> None: import watchdog.events # type: ignore import watchdog.observers # type: ignore @@ -1368,10 +1778,10 @@ def debounced_fs_watch( def should_notify(self, path: str) -> bool: for target in self.file_targets: - if path == target: + if os.path.normpath(path) == target: return True - if args.make and any( - path.endswith(suffix) for suffix in FS_WATCH_EXTENSIONS + if config.make and any( + path.endswith(suffix) for suffix in project.source_extensions ): return True return False @@ -1390,7 +1800,7 @@ def debounced_fs_watch( if os.path.isdir(target): observer.schedule(event_handler, target, recursive=True) else: - file_targets.append(target) + file_targets.append(os.path.normpath(target)) target = os.path.dirname(target) or "." if target not in observed: observed.add(target) @@ -1400,7 +1810,7 @@ def debounced_fs_watch( t = listenq.get() more = True while more: - delay = t + debounce_delay - time.time() + delay = t + DEBOUNCE_DELAY - time.time() if delay > 0: time.sleep(delay) # consume entire queue @@ -1420,6 +1830,7 @@ def debounced_fs_watch( class Display: basedump: str mydump: str + config: Config emsg: Optional[str] last_diff_output: Optional[List[OutputLine]] pending_update: Optional[Tuple[str, bool]] @@ -1427,23 +1838,26 @@ class Display: watch_queue: "queue.Queue[Optional[float]]" less_proc: "Optional[subprocess.Popen[bytes]]" - def __init__(self, basedump: str, mydump: str) -> None: + def __init__(self, basedump: str, mydump: str, config: Config) -> None: + self.config = config self.basedump = basedump self.mydump = mydump self.emsg = None self.last_diff_output = None - def run_less(self) -> "Tuple[subprocess.Popen[bytes], subprocess.Popen[bytes]]": + def run_diff(self) -> str: if self.emsg is not None: - output = self.emsg - else: - diff_output = do_diff(self.basedump, self.mydump) - last_diff_output = self.last_diff_output or diff_output - if args.threeway != "base" or not self.last_diff_output: - self.last_diff_output = diff_output - header, diff_lines = format_diff(last_diff_output, diff_output) - header_lines = [header] if header else [] - output = "\n".join(header_lines + diff_lines[args.skip_lines :]) + return self.emsg + + diff_output = do_diff(self.basedump, self.mydump, self.config) + last_diff_output = self.last_diff_output or diff_output + if self.config.threeway != "base" or not self.last_diff_output: + self.last_diff_output = diff_output + header, diff_lines = format_diff(last_diff_output, diff_output, self.config) + return self.config.formatter.table(header, diff_lines[self.config.skip_lines :]) + + def run_less(self) -> "Tuple[subprocess.Popen[bytes], subprocess.Popen[bytes]]": + output = self.run_diff() # Pipe the output through 'tail' and only then to less, to ensure the # write call doesn't block. ('tail' has to buffer all its input before @@ -1525,15 +1939,47 @@ class Display: def main() -> None: + args = parser.parse_args() + + # Apply project-specific configuration. + settings: Dict[str, Any] = {} + diff_settings.apply(settings, args) # type: ignore + project = create_project_settings(settings) + + config = create_config(args, project) + + if config.algorithm == "levenshtein": + try: + import Levenshtein + except ModuleNotFoundError as e: + fail(MISSING_PREREQUISITES.format(e.name)) + + if config.source: + try: + import cxxfilt + except ModuleNotFoundError as e: + fail(MISSING_PREREQUISITES.format(e.name)) + + if config.threeway and not args.watch: + fail("Threeway diffing requires -w.") + if args.diff_elf_symbol: - make_target, basecmd, mycmd = dump_elf() - elif args.diff_obj: - make_target, basecmd, mycmd = dump_objfile() + make_target, basecmd, mycmd = dump_elf( + args.start, args.end, args.diff_elf_symbol, config, project + ) + elif config.diff_obj: + make_target, basecmd, mycmd = dump_objfile( + args.start, args.end, config, project + ) else: - make_target, basecmd, mycmd = dump_binary() + make_target, basecmd, mycmd = dump_binary(args.start, args.end, config, project) + + map_build_target_fn = getattr(diff_settings, "map_build_target", None) + if map_build_target_fn: + make_target = map_build_target_fn(make_target=make_target) if args.write_asm is not None: - mydump = run_objdump(mycmd) + mydump = run_objdump(mycmd, config, project) with open(args.write_asm, "w") as f: f.write(mydump) print(f"Wrote assembly to {args.write_asm}.") @@ -1543,13 +1989,15 @@ def main() -> None: with open(args.base_asm) as f: basedump = f.read() else: - basedump = run_objdump(basecmd) + basedump = run_objdump(basecmd, config, project) - mydump = run_objdump(mycmd) + mydump = run_objdump(mycmd, config, project) - display = Display(basedump, mydump) + display = Display(basedump, mydump, config) - if not args.watch: + if args.no_pager or args.format == "html": + print(display.run_diff()) + elif not args.watch: display.run_sync() else: if not args.make: @@ -1566,13 +2014,13 @@ def main() -> None: ) if watch_sources_for_target_fn: watch_sources = watch_sources_for_target_fn(make_target) - watch_sources = watch_sources or source_directories + watch_sources = watch_sources or project.source_directories if not watch_sources: fail("Missing source_directories config, don't know what to watch.") else: watch_sources = [make_target] q: "queue.Queue[Optional[float]]" = queue.Queue() - debounced_fs_watch(watch_sources, q, DEBOUNCE_DELAY) + debounced_fs_watch(watch_sources, q, config, project) display.run_async(q) last_build = 0.0 try: @@ -1585,7 +2033,7 @@ def main() -> None: last_build = time.time() if args.make: display.progress("Building...") - ret = run_make_capture_output(make_target) + ret = run_make_capture_output(make_target, project) if ret.returncode != 0: display.update( ret.stderr.decode("utf-8-sig", "replace") @@ -1593,10 +2041,11 @@ def main() -> None: error=True, ) continue - mydump = run_objdump(mycmd) + mydump = run_objdump(mycmd, config, project) display.update(mydump, error=False) except KeyboardInterrupt: display.terminate() -main() +if __name__ == "__main__": + main() diff --git a/diff_settings.py b/diff_settings.py index e9dd374f9e..fb2f1454eb 100644 --- a/diff_settings.py +++ b/diff_settings.py @@ -3,4 +3,4 @@ def apply(config, args): config['myimg'] = 'zelda_ocarina_mq_dbg.z64' config['baseimg'] = 'baserom.z64' config['makeflags'] = [] - config['source_directories'] = ['src', 'include', 'spec'] \ No newline at end of file + config['source_directories'] = ['src', 'include', 'spec'] diff --git a/tools/asm_processor/asm_processor.py b/tools/asm_processor/asm_processor.py index 302668d7de..6923752be4 100644 --- a/tools/asm_processor/asm_processor.py +++ b/tools/asm_processor/asm_processor.py @@ -269,6 +269,43 @@ class Section: assert self.sh_type == SHT_SYMTAB return self.symbol_entries[self.sh_info:] + def relocate_mdebug(self, original_offset): + assert self.sh_type == SHT_MIPS_DEBUG + new_data = bytearray(self.data) + shift_by = self.sh_offset - original_offset + + # Update the file-relative offsets in the Symbolic HDRR + hdrr_magic, hdrr_vstamp, hdrr_ilineMax, hdrr_cbLine, \ + hdrr_cbLineOffset, hdrr_idnMax, hdrr_cbDnOffset, hdrr_ipdMax, \ + hdrr_cbPdOffset, hdrr_isymMax, hdrr_cbSymOffset, hdrr_ioptMax, \ + hdrr_cbOptOffset, hdrr_iauxMax, hdrr_cbAuxOffset, hdrr_issMax, \ + hdrr_cbSsOffset, hdrr_issExtMax, hdrr_cbSsExtOffset, hdrr_ifdMax, \ + hdrr_cbFdOffset, hdrr_crfd, hdrr_cbRfdOffset, hdrr_iextMax, \ + hdrr_cbExtOffset = struct.unpack(">HHIIIIIIIIIIIIIIIIIIIIIII", self.data[0:0x60]) + + assert hdrr_magic == 0x7009 , "Invalid magic value for .mdebug symbolic header" + + hdrr_cbLineOffset += shift_by + hdrr_cbDnOffset += shift_by + hdrr_cbPdOffset += shift_by + hdrr_cbSymOffset += shift_by + hdrr_cbOptOffset += shift_by + hdrr_cbAuxOffset += shift_by + hdrr_cbSsOffset += shift_by + hdrr_cbSsExtOffset += shift_by + hdrr_cbFdOffset += shift_by + hdrr_cbRfdOffset += shift_by + hdrr_cbExtOffset += shift_by + + new_data[0:0x60] = struct.pack(">HHIIIIIIIIIIIIIIIIIIIIIII", hdrr_magic, hdrr_vstamp, hdrr_ilineMax, hdrr_cbLine, \ + hdrr_cbLineOffset, hdrr_idnMax, hdrr_cbDnOffset, hdrr_ipdMax, \ + hdrr_cbPdOffset, hdrr_isymMax, hdrr_cbSymOffset, hdrr_ioptMax, \ + hdrr_cbOptOffset, hdrr_iauxMax, hdrr_cbAuxOffset, hdrr_issMax, \ + hdrr_cbSsOffset, hdrr_issExtMax, hdrr_cbSsExtOffset, hdrr_ifdMax, \ + hdrr_cbFdOffset, hdrr_crfd, hdrr_cbRfdOffset, hdrr_iextMax, \ + hdrr_cbExtOffset) + + self.data = bytes(new_data) class ElfFile: def __init__(self, data): @@ -317,7 +354,7 @@ class ElfFile: s.late_init(self.sections) return s - def drop_irrelevant_sections(self): + def drop_mdebug_gptab(self): # We can only drop sections at the end, since otherwise section # references might be wrong. Luckily, these sections typically are. while self.sections[-1].sh_type in [SHT_MIPS_DEBUG, SHT_MIPS_GPTAB]: @@ -340,7 +377,11 @@ class ElfFile: for s in self.sections: if s.sh_type != SHT_NOBITS and s.sh_type != SHT_NULL: pad_out(s.sh_addralign) + old_offset = s.sh_offset s.sh_offset = outidx + if s.sh_type == SHT_MIPS_DEBUG and s.sh_offset != old_offset: + # The .mdebug section has moved, relocate offsets + s.relocate_mdebug(old_offset) write_out(s.data) pad_out(4) @@ -380,7 +421,7 @@ class Failure(Exception): class GlobalState: - def __init__(self, min_instr_count, skip_instr_count, use_jtbl_for_rodata): + def __init__(self, min_instr_count, skip_instr_count, use_jtbl_for_rodata, mips1): # A value that hopefully never appears as a 32-bit rodata constant (or we # miscompile late rodata). Increases by 1 in each step. self.late_rodata_hex = 0xE0123456 @@ -388,6 +429,7 @@ class GlobalState: self.min_instr_count = min_instr_count self.skip_instr_count = skip_instr_count self.use_jtbl_for_rodata = use_jtbl_for_rodata + self.mips1 = mips1 def next_late_rodata_hex(self): dummy_bytes = struct.pack('>I', self.late_rodata_hex) @@ -608,12 +650,14 @@ class GlobalAsmBlock: size = self.fn_section_sizes['.late_rodata'] // 4 skip_next = False needs_double = (self.late_rodata_alignment != 0) + extra_mips1_nop = False + jtbl_size = 11 if state.mips1 else 9 for i in range(size): if skip_next: skip_next = False continue - # Jump tables give 9 instructions for >= 5 words of rodata, and should be - # emitted when: + # Jump tables give 9 instructions (11 with -mips1) for >= 5 words of rodata, + # and should be emitted when: # - -O2 or -O2 -g3 are used, which give the right codegen # - we have emitted our first .float/.double (to ensure that we find the # created rodata in the binary) @@ -624,11 +668,12 @@ class GlobalAsmBlock: # - we have at least 10 more instructions to go in this function (otherwise our # function size computation will be wrong since the delay slot goes unused) if (not needs_double and state.use_jtbl_for_rodata and i >= 1 and - size - i >= 5 and num_instr - len(late_rodata_fn_output) >= 10): + size - i >= 5 and num_instr - len(late_rodata_fn_output) >= jtbl_size + 1): cases = " ".join("case {}:".format(case) for case in range(size - i)) late_rodata_fn_output.append("switch (*(volatile int*)0) { " + cases + " ; }") - late_rodata_fn_output.extend([""] * 8) + late_rodata_fn_output.extend([""] * (jtbl_size - 1)) jtbl_rodata_size = (size - i) * 4 + extra_mips1_nop = i != 2 break dummy_bytes = state.next_late_rodata_hex() late_rodata_dummy_bytes.append(dummy_bytes) @@ -638,12 +683,20 @@ class GlobalAsmBlock: fval, = struct.unpack('>d', dummy_bytes + dummy_bytes2) late_rodata_fn_output.append('*(volatile double*)0 = {};'.format(fval)) skip_next = True - needs_double = True + needs_double = False + if state.mips1: + # mips1 does not have ldc1/sdc1 + late_rodata_fn_output.append('') + late_rodata_fn_output.append('') + extra_mips1_nop = False else: fval, = struct.unpack('>f', dummy_bytes) late_rodata_fn_output.append('*(volatile float*)0 = {}f;'.format(fval)) + extra_mips1_nop = True late_rodata_fn_output.append('') late_rodata_fn_output.append('') + if state.mips1 and extra_mips1_nop: + late_rodata_fn_output.append('') text_name = None if self.fn_section_sizes['.text'] > 0 or late_rodata_fn_output: @@ -722,7 +775,7 @@ float_regexpr = re.compile(r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?f") def repl_float_hex(m): return str(struct.unpack(">I", struct.pack(">f", float(m.group(0).strip().rstrip("f"))))[0]) -def parse_source(f, opt, framepointer, input_enc, output_enc, out_dependencies, print_source=None): +def parse_source(f, opt, framepointer, mips1, input_enc, output_enc, out_dependencies, print_source=None): if opt in ['O2', 'O1']: if framepointer: min_instr_count = 6 @@ -751,7 +804,7 @@ def parse_source(f, opt, framepointer, input_enc, output_enc, out_dependencies, if opt in ['O2', 'g3'] and not framepointer: use_jtbl_for_rodata = True - state = GlobalState(min_instr_count, skip_instr_count, use_jtbl_for_rodata) + state = GlobalState(min_instr_count, skip_instr_count, use_jtbl_for_rodata, mips1) global_asm = None asm_functions = [] @@ -803,7 +856,7 @@ def parse_source(f, opt, framepointer, input_enc, output_enc, out_dependencies, out_dependencies.append(fname) include_src = StringIO() with open(fname, encoding=input_enc) as include_file: - parse_source(include_file, opt, framepointer, input_enc, output_enc, out_dependencies, include_src) + parse_source(include_file, opt, framepointer, mips1, input_enc, output_enc, out_dependencies, include_src) include_src.write('#line ' + str(line_no + 1) + ' "' + f.name + '"') output_lines[-1] = include_src.getvalue() include_src.close() @@ -831,7 +884,7 @@ def parse_source(f, opt, framepointer, input_enc, output_enc, out_dependencies, return asm_functions -def fixup_objfile(objfile_name, functions, asm_prelude, assembler, output_enc): +def fixup_objfile(objfile_name, functions, asm_prelude, assembler, output_enc, drop_mdebug_gptab): SECTIONS = ['.data', '.text', '.rodata', '.bss'] with open(objfile_name, 'rb') as f: @@ -927,9 +980,12 @@ def fixup_objfile(objfile_name, functions, asm_prelude, assembler, output_enc): with open(o_name, 'rb') as f: asm_objfile = ElfFile(f.read()) - # Remove some clutter from objdump output + # Remove clutter from objdump output for tests, and make the tests + # portable by avoiding absolute paths. Outside of tests .mdebug is + # useful for showing source together with asm, though. mdebug_section = objfile.find_section('.mdebug') - objfile.drop_irrelevant_sections() + if drop_mdebug_gptab: + objfile.drop_mdebug_gptab() # Unify reginfo sections target_reginfo = objfile.find_section('.reginfo') @@ -1176,9 +1232,11 @@ def run_wrapped(argv, outfile, functions): parser.add_argument('--post-process', dest='objfile', help="path to .o file to post-process") parser.add_argument('--assembler', dest='assembler', help="assembler command (e.g. \"mips-linux-gnu-as -march=vr4300 -mabi=32\")") parser.add_argument('--asm-prelude', dest='asm_prelude', help="path to a file containing a prelude to the assembly file (with .set and .macro directives, e.g.)") - parser.add_argument('--input-enc', default='latin1', help="Input encoding (default: latin1)") - parser.add_argument('--output-enc', default='latin1', help="Output encoding (default: latin1)") + parser.add_argument('--input-enc', default='latin1', help="input encoding (default: %(default)s)") + parser.add_argument('--output-enc', default='latin1', help="output encoding (default: %(default)s)") + parser.add_argument('--drop-mdebug-gptab', dest='drop_mdebug_gptab', action='store_true', help="drop mdebug and gptab sections") parser.add_argument('-framepointer', dest='framepointer', action='store_true') + parser.add_argument('-mips1', dest='mips1', action='store_true') parser.add_argument('-g3', dest='g3', action='store_true') group = parser.add_mutually_exclusive_group(required=True) group.add_argument('-O1', dest='opt', action='store_const', const='O1') @@ -1190,25 +1248,27 @@ def run_wrapped(argv, outfile, functions): if opt != 'O2': raise Failure("-g3 is only supported together with -O2") opt = 'g3' + if args.mips1 and (opt != 'O2' or args.framepointer): + raise Failure("-mips1 is only supported together with -O2") if args.objfile is None: with open(args.filename, encoding=args.input_enc) as f: deps = [] - functions = parse_source(f, opt=opt, framepointer=args.framepointer, input_enc=args.input_enc, output_enc=args.output_enc, out_dependencies=deps, print_source=outfile) + functions = parse_source(f, opt=opt, framepointer=args.framepointer, mips1=args.mips1, input_enc=args.input_enc, output_enc=args.output_enc, out_dependencies=deps, print_source=outfile) return functions, deps else: if args.assembler is None: raise Failure("must pass assembler command") if functions is None: with open(args.filename, encoding=args.input_enc) as f: - functions = parse_source(f, opt=opt, framepointer=args.framepointer, input_enc=args.input_enc, out_dependencies=[], output_enc=args.output_enc) + functions = parse_source(f, opt=opt, framepointer=args.framepointer, mips1=args.mips1, input_enc=args.input_enc, out_dependencies=[], output_enc=args.output_enc) if not functions: return asm_prelude = b'' if args.asm_prelude: with open(args.asm_prelude, 'rb') as f: asm_prelude = f.read() - fixup_objfile(args.objfile, functions, asm_prelude, args.assembler, args.output_enc) + fixup_objfile(args.objfile, functions, asm_prelude, args.assembler, args.output_enc, args.drop_mdebug_gptab) def run(argv, outfile=sys.stdout.buffer, functions=None): try: