diff --git a/diff.py b/diff.py
index 8dec9ef182..0e1da09deb 100755
--- a/diff.py
+++ b/diff.py
@@ -1,47 +1,71 @@
 #!/usr/bin/env python3
 import sys
-import re
-import os
-import ast
-import argparse
-import subprocess
-import difflib
-import string
-import itertools
-import threading
-import queue
-import time
-
 
 def fail(msg):
     print(msg, file=sys.stderr)
     sys.exit(1)
 
-
-MISSING_PREREQUISITES = (
-    "Missing prerequisite python module {}. "
-    "Run `python3 -m pip install --user colorama ansiwrap attrs watchdog python-Levenshtein` to install prerequisites (python-Levenshtein only needed for --algorithm=levenshtein)."
-)
-
-try:
-    import attr
-    from colorama import Fore, Style, Back
-    import ansiwrap
-    import watchdog
-except ModuleNotFoundError as e:
-    fail(MISSING_PREREQUISITES.format(e.name))
-
 # Prefer to use diff_settings.py from the current working directory
 sys.path.insert(0, ".")
 try:
     import diff_settings
 except ModuleNotFoundError:
     fail("Unable to find diff_settings.py in the same directory.")
+sys.path.pop(0)
 
-# ==== CONFIG ====
+# ==== COMMAND-LINE ====
+
+try:
+    import argcomplete  # type: ignore
+except ModuleNotFoundError:
+    argcomplete = None
+import argparse
 
 parser = argparse.ArgumentParser(description="Diff MIPS assembly.")
-parser.add_argument("start", help="Function name or address to start diffing from.")
+
+start_argument = parser.add_argument("start", help="Function name or address to start diffing from.")
+if argcomplete:
+    def complete_symbol(**kwargs):
+        prefix = kwargs["prefix"]
+        if prefix == "":
+            # skip reading the map file, which would
+            # result in a lot of useless completions
+            return []
+        parsed_args = kwargs["parsed_args"]
+        config = {}
+        diff_settings.apply(config, parsed_args)
+        mapfile = config.get("mapfile")
+        if not mapfile:
+            return []
+        completes = []
+        with open(mapfile) as f:
+            data = f.read()
+            # assume symbols are prefixed by a space character
+            search = f" {prefix}"
+            pos = data.find(search)
+            while pos != -1:
+                # skip the space character in the search string
+                pos += 1
+                # assume symbols are suffixed by either a space
+                # character or a (unix-style) line return
+                spacePos = data.find(" ", pos)
+                lineReturnPos = data.find("\n", pos)
+                if lineReturnPos == -1:
+                    endPos = spacePos
+                elif spacePos == -1:
+                    endPos = lineReturnPos
+                else:
+                    endPos = min(spacePos, lineReturnPos)
+                if endPos == -1:
+                    match = data[pos:]
+                    pos = -1
+                else:
+                    match = data[pos:endPos]
+                    pos = data.find(search, endPos)
+                completes.append(match)
+        return completes
+    start_argument.completer = complete_symbol
+
 parser.add_argument("end", nargs="?", help="Address to end diff at.")
 parser.add_argument(
     "-o",
@@ -49,6 +73,22 @@ parser.add_argument(
     action="store_true",
     help="Diff .o files rather than a whole binary. This makes it possible to see symbol names. (Recommended)",
 )
+parser.add_argument(
+    "-e",
+    "--elf",
+    dest="diff_elf_symbol",
+    help="Diff a given function in two ELFs, one being stripped and the other one non-stripped. Requires objdump from binutils 2.33+.",
+)
+parser.add_argument(
+    "--source",
+    action="store_true",
+    help="Show source code (if possible). Only works with -o and -e.",
+)
+parser.add_argument(
+    "--inlines",
+    action="store_true",
+    help="Show inline function calls (if possible). Only works with -o and -e.",
+)
 parser.add_argument(
     "--base-asm",
     dest="base_asm",
@@ -116,6 +156,14 @@ parser.add_argument(
     help="Automatically update when source/object files change. "
     "Recommended in combination with -m.",
 )
+parser.add_argument(
+    "-3",
+    "--threeway",
+    dest="threeway",
+    action="store_true",
+    help="Show a three-way diff between target asm, current asm, and asm "
+    "prior to -w rebuild. Requires -w.",
+)
 parser.add_argument(
     "--width",
     dest="column_width",
@@ -126,28 +174,70 @@ parser.add_argument(
 parser.add_argument(
     "--algorithm",
     dest="algorithm",
-    default="difflib",
+    default="levenshtein",
     choices=["levenshtein", "difflib"],
     help="Diff algorithm to use.",
 )
+parser.add_argument(
+    "--max-size",
+    "--max-lines",
+    dest="max_lines",
+    type=int,
+    default=1024,
+    help="The maximum length of the diff, in lines.",
+)
 
 # Project-specific flags, e.g. different versions/make arguments.
 if hasattr(diff_settings, "add_custom_arguments"):
-    diff_settings.add_custom_arguments(parser)
+    diff_settings.add_custom_arguments(parser)  # type: ignore
+
+if argcomplete:
+    argcomplete.autocomplete(parser)
+
+# ==== IMPORTS ====
+
+import re
+import os
+import ast
+import subprocess
+import difflib
+import string
+import itertools
+import threading
+import queue
+import time
+from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, Union
+
+
+MISSING_PREREQUISITES = (
+    "Missing prerequisite python module {}. "
+    "Run `python3 -m pip install --user colorama ansiwrap watchdog python-Levenshtein cxxfilt` to install prerequisites (cxxfilt only needed with --source)."
+)
+
+try:
+    from colorama import Fore, Style, Back  # type: ignore
+    import ansiwrap  # type: ignore
+    import watchdog  # type: ignore
+except ModuleNotFoundError as e:
+    fail(MISSING_PREREQUISITES.format(e.name))
+
+# ==== CONFIG ====
 
 args = parser.parse_args()
 
 # Set imgs, map file and make flags in a project-specific manner.
-config = {}
+config: Dict[str, Any] = {}
 diff_settings.apply(config, args)
 
+arch = config.get("arch", "mips")
 baseimg = config.get("baseimg", None)
 myimg = config.get("myimg", None)
 mapfile = config.get("mapfile", None)
 makeflags = config.get("makeflags", [])
 source_directories = config.get("source_directories", None)
+objdump_executable = config.get("objdump_executable", None)
 
-MAX_FUNCTION_SIZE_LINES = 4096
+MAX_FUNCTION_SIZE_LINES = args.max_lines
 MAX_FUNCTION_SIZE_BYTES = MAX_FUNCTION_SIZE_LINES * 4
 
 COLOR_ROTATION = [
@@ -163,7 +253,7 @@ COLOR_ROTATION = [
 ]
 
 BUFFER_CMD = ["tail", "-c", str(10 ** 9)]
-LESS_CMD = ["less", "-Ric"]
+LESS_CMD = ["less", "-SRic", "-#6"]
 
 DEBOUNCE_DELAY = 0.1
 FS_WATCH_EXTENSIONS = [".c", ".h"]
@@ -172,29 +262,34 @@ FS_WATCH_EXTENSIONS = [".c", ".h"]
 
 if args.algorithm == "levenshtein":
     try:
-        import Levenshtein
+        import Levenshtein  # type: ignore
     except ModuleNotFoundError as e:
         fail(MISSING_PREREQUISITES.format(e.name))
 
-binutils_prefix = None
-
-for binutils_cand in ["mips-linux-gnu-", "mips64-elf-"]:
+if args.source:
     try:
-        subprocess.check_call(
-            [binutils_cand + "objdump", "--version"],
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL,
-        )
-        binutils_prefix = binutils_cand
-        break
-    except subprocess.CalledProcessError:
-        pass
-    except FileNotFoundError:
-        pass
+        import cxxfilt  # type: ignore
+    except ModuleNotFoundError as e:
+        fail(MISSING_PREREQUISITES.format(e.name))
 
-if not binutils_prefix:
+if objdump_executable is None:
+    for objdump_cand in ["mips-linux-gnu-objdump", "mips64-elf-objdump"]:
+        try:
+            subprocess.check_call(
+                [objdump_cand, "--version"],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+            objdump_executable = objdump_cand
+            break
+        except subprocess.CalledProcessError:
+            pass
+        except FileNotFoundError:
+            pass
+
+if not objdump_executable:
     fail(
-        "Missing binutils; please ensure mips-linux-gnu-objdump or mips64-elf-objdump exist."
+        "Missing binutils; please ensure mips-linux-gnu-objdump or mips64-elf-objdump exist, or configure objdump_executable."
     )
 
 
@@ -210,6 +305,10 @@ def eval_int(expr, emsg=None):
         return None
 
 
+def eval_line_num(expr):
+    return int(expr.strip().replace(":", ""), 16)
+
+
 def run_make(target, capture_output=False):
     if capture_output:
         return subprocess.run(
@@ -235,10 +334,26 @@ def restrict_to_function(dump, fn_name):
     return "\n".join(out)
 
 
+def maybe_get_objdump_source_flags():
+    if not args.source:
+        return []
+
+    flags = [
+        "--source",
+        "--source-comment=| ",
+        "-l",
+    ]
+
+    if args.inlines:
+        flags.append("--inlines")
+
+    return flags
+
+
 def run_objdump(cmd):
     flags, target, restrict = cmd
     out = subprocess.check_output(
-        [binutils_prefix + "objdump"] + flags + [target], universal_newlines=True
+        [objdump_executable] + arch_flags + flags + [target], universal_newlines=True
     )
     if restrict is not None:
         return restrict_to_function(out, restrict)
@@ -291,6 +406,36 @@ def search_map_file(fn_name):
     return None, None
 
 
+def dump_elf():
+    if not baseimg or not myimg:
+        fail("Missing myimg/baseimg in config.")
+    if base_shift:
+        fail("--base-shift not compatible with -e")
+
+    start_addr = eval_int(args.start, "Start address must be an integer expression.")
+
+    if args.end is not None:
+        end_addr = eval_int(args.end, "End address must be an integer expression.")
+    else:
+        end_addr = start_addr + MAX_FUNCTION_SIZE_BYTES
+
+    flags1 = [
+        f"--start-address={start_addr}",
+        f"--stop-address={end_addr}",
+    ]
+
+    flags2 = [
+        f"--disassemble={args.diff_elf_symbol}",
+    ]
+
+    objdump_flags = ["-drz", "-j", ".text"]
+    return (
+        myimg,
+        (objdump_flags + flags1, baseimg, None),
+        (objdump_flags + flags2 + maybe_get_objdump_source_flags(), myimg, None),
+    )
+
+
 def dump_objfile():
     if base_shift:
         fail("--base-shift not compatible with -o")
@@ -317,7 +462,7 @@ def dump_objfile():
     return (
         objfile,
         (objdump_flags, refobjfile, args.start),
-        (objdump_flags, objfile, args.start),
+        (objdump_flags + maybe_get_objdump_source_flags(), objfile, args.start),
     )
 
 
@@ -357,29 +502,47 @@ def ansi_ljust(s, width):
         return s
 
 
-re_int = re.compile(r"[0-9]+")
-re_comments = re.compile(r"<.*?>")
-re_regs = re.compile(r"\$?\b(a[0-3]|t[0-9]|s[0-8]|at|v[01]|f[12]?[0-9]|f3[01]|fp)\b")
-re_sprel = re.compile(r",([1-9][0-9]*|0x[1-9a-f][0-9a-f]*)\(sp\)")
-re_large_imm = re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}")
-re_imm = re.compile(r"(\b|-)([0-9]+|0x[0-9a-fA-F]+)\b(?!\(sp)|%(lo|hi)\([^)]*\)")
-forbidden = set(string.ascii_letters + "_")
-branch_likely_instructions = {
-    "beql",
-    "bnel",
-    "beqzl",
-    "bnezl",
-    "bgezl",
-    "bgtzl",
-    "blezl",
-    "bltzl",
-    "bc1tl",
-    "bc1fl",
-}
-branch_instructions = branch_likely_instructions.union(
-    {"b", "beq", "bne", "beqz", "bnez", "bgez", "bgtz", "blez", "bltz", "bc1t", "bc1f"}
-)
-jump_instructions = branch_instructions.union({"jal", "j"})
+if arch == "mips":
+    re_int = re.compile(r"[0-9]+")
+    re_comment = re.compile(r"<.*?>")
+    re_reg = re.compile(r"\$?\b(a[0-3]|t[0-9]|s[0-8]|at|v[01]|f[12]?[0-9]|f3[01]|k[01]|fp|ra)\b")
+    re_sprel = re.compile(r"(?<=,)([0-9]+|0x[0-9a-f]+)\(sp\)")
+    re_large_imm = re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}")
+    re_imm = re.compile(r"(\b|-)([0-9]+|0x[0-9a-fA-F]+)\b(?!\(sp)|%(lo|hi)\([^)]*\)")
+    forbidden = set(string.ascii_letters + "_")
+    arch_flags = ["-m", "mips:4300"]
+    branch_likely_instructions = {
+        "beql",
+        "bnel",
+        "beqzl",
+        "bnezl",
+        "bgezl",
+        "bgtzl",
+        "blezl",
+        "bltzl",
+        "bc1tl",
+        "bc1fl",
+    }
+    branch_instructions = branch_likely_instructions.union(
+        {"b", "beq", "bne", "beqz", "bnez", "bgez", "bgtz", "blez", "bltz", "bc1t", "bc1f"}
+    )
+    instructions_with_address_immediates = branch_instructions.union({"jal", "j"})
+elif arch == "aarch64":
+    re_int = re.compile(r"[0-9]+")
+    re_comment = re.compile(r"(<.*?>|//.*$)")
+    # GPRs and FP registers: X0-X30, W0-W30, [DSHQ]0..31
+    # The zero registers and SP should not be in this list.
+    re_reg = re.compile(r"\$?\b([dshq][12]?[0-9]|[dshq]3[01]|[xw][12]?[0-9]|[xw]30)\b")
+    re_sprel = re.compile(r"sp, #-?(0x[0-9a-fA-F]+|[0-9]+)\b")
+    re_large_imm = re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}")
+    re_imm = re.compile(r"(?<!sp, )#-?(0x[0-9a-fA-F]+|[0-9]+)\b")
+    arch_flags = []
+    forbidden = set(string.ascii_letters + "_")
+    branch_likely_instructions = set()
+    branch_instructions = {"bl", "b", "b.eq", "b.ne", "b.cs", "b.hs", "b.cc", "b.lo", "b.mi", "b.pl", "b.vs", "b.vc", "b.hi", "b.ls", "b.ge", "b.lt", "b.gt", "b.le", "cbz", "cbnz", "tbz", "tbnz"}
+    instructions_with_address_immediates = branch_instructions.union({"adrp"})
+else:
+    fail("Unknown architecture.")
 
 
 def hexify_int(row, pat):
@@ -412,11 +575,18 @@ def parse_relocated_line(line):
     return before, imm, after
 
 
-def process_reloc(row, prev):
+def process_mips_reloc(row, prev):
     before, imm, after = parse_relocated_line(prev)
     repl = row.split()[-1]
     if imm != "0":
-        if before.strip() == "jal" and not imm.startswith("0x"):
+        # MIPS uses relocations with addends embedded in the code as immediates.
+        # If there is an immediate, show it as part of the relocation. Ideally
+        # we'd show this addend in both %lo/%hi, but annoyingly objdump's output
+        # doesn't include enough information to pair up %lo's and %hi's...
+        # TODO: handle unambiguous cases where all addends for a symbol are the
+        # same, or show "+???".
+        mnemonic = prev.split()[0]
+        if mnemonic in instructions_with_address_immediates and not imm.startswith("0x"):
             imm = "0x" + imm
         repl += "+" + imm if int(imm, 0) > 0 else imm
     if "R_MIPS_LO16" in row:
@@ -431,38 +601,63 @@ def process_reloc(row, prev):
     return before + repl + after
 
 
+def pad_mnemonic(line):
+    if "\t" not in line:
+        return line
+    mn, args = line.split("\t", 1)
+    return f"{mn:<7s} {args}"
+
+
+class Line(NamedTuple):
+    mnemonic: str
+    diff_row: str
+    original: str
+    line_num: str
+    branch_target: Optional[str]
+    source_lines: List[str]
+    comment: Optional[str]
+
+
 def process(lines):
-    mnemonics = []
-    diff_rows = []
-    rows_with_imms = []
     skip_next = False
-    originals = []
-    line_nums = []
-    branch_targets = []
+    source_lines = []
     if not args.diff_obj:
         lines = lines[7:]
         if lines and not lines[-1]:
             lines.pop()
 
+    output = []
+    stop_after_delay_slot = False
     for row in lines:
         if args.diff_obj and (">:" in row or not row):
             continue
 
-        if "R_MIPS_" in row:
-            # N.B. Don't transform the diff rows, they already ignore immediates
-            # if diff_rows[-1] != '<delay-slot>':
-            # diff_rows[-1] = process_reloc(row, rows_with_imms[-1])
-            originals[-1] = process_reloc(row, originals[-1])
+        if args.source and (row and row[0] != " "):
+            source_lines.append(row)
             continue
 
-        row = re.sub(re_comments, "", row)
+        if "R_AARCH64_" in row:
+            # TODO: handle relocation
+            continue
+
+        if "R_MIPS_" in row:
+            # N.B. Don't transform the diff rows, they already ignore immediates
+            # if output[-1].diff_row != "<delay-slot>":
+            # output[-1] = output[-1].replace(diff_row=process_mips_reloc(row, output[-1].row_with_imm))
+            new_original = process_mips_reloc(row, output[-1].original)
+            output[-1] = output[-1]._replace(original=new_original)
+            continue
+
+        m_comment = re.search(re_comment, row)
+        comment = m_comment[0] if m_comment else None
+        row = re.sub(re_comment, "", row)
         row = row.rstrip()
         tabs = row.split("\t")
         row = "\t".join(tabs[2:])
         line_num = tabs[0].strip()
         row_parts = row.split("\t", 1)
         mnemonic = row_parts[0].strip()
-        if mnemonic not in jump_instructions:
+        if mnemonic not in instructions_with_address_immediates:
             row = re.sub(re_int, lambda s: hexify_int(row, s), row)
         original = row
         if skip_next:
@@ -471,42 +666,46 @@ def process(lines):
             mnemonic = "<delay-slot>"
         if mnemonic in branch_likely_instructions:
             skip_next = True
-        row = re.sub(re_regs, "<reg>", row)
-        row = re.sub(re_sprel, ",addr(sp)", row)
+        row = re.sub(re_reg, "<reg>", row)
+        row = re.sub(re_sprel, "addr(sp)", row)
         row_with_imm = row
-        if mnemonic in jump_instructions:
+        if mnemonic in instructions_with_address_immediates:
             row = row.strip()
             row, _ = split_off_branch(row)
             row += "<imm>"
         else:
-            row = re.sub(re_imm, "<imm>", row)
+            row = normalize_imms(row)
 
-        mnemonics.append(mnemonic)
-        rows_with_imms.append(row_with_imm)
-        diff_rows.append(row)
-        originals.append(original)
-        line_nums.append(line_num)
+        branch_target = None
         if mnemonic in branch_instructions:
             target = row_parts[1].strip().split(",")[-1]
             if mnemonic in branch_likely_instructions:
                 target = hex(int(target, 16) - 4)[2:]
-            branch_targets.append(target)
-        else:
-            branch_targets.append(None)
+            branch_target = target.strip()
+
+        output.append(
+            Line(
+                mnemonic=mnemonic,
+                diff_row=row,
+                original=original,
+                line_num=line_num,
+                branch_target=branch_target,
+                source_lines=source_lines,
+                comment=comment,
+            )
+        )
+        source_lines = []
+
         if args.stop_jrra and mnemonic == "jr" and row_parts[1].strip() == "ra":
+            stop_after_delay_slot = True
+        elif stop_after_delay_slot:
             break
 
-    # Cleanup whitespace
-    originals = [original.strip() for original in originals]
-    originals = [
-        "".join(f"{o:<8s}" for o in original.split("\t")) for original in originals
-    ]
-    # return diff_rows, diff_rows, line_nums
-    return mnemonics, diff_rows, originals, line_nums, branch_targets
+    return output
 
 
 def format_single_line_diff(line1, line2, column_width):
-    return f"{ansi_ljust(line1,column_width)}{ansi_ljust(line2,column_width)}"
+    return f"{ansi_ljust(line1,column_width)}{line2}"
 
 
 class SymbolColorer:
@@ -535,10 +734,14 @@ def normalize_imms(row):
     return re.sub(re_imm, "<imm>", row)
 
 
+def normalize_stack(row):
+    return re.sub(re_sprel, "addr(sp)", row)
+
+
 def split_off_branch(line):
     parts = line.split(",")
     if len(parts) < 2:
-        parts = line.split()
+        parts = line.split(None, 1)
     off = len(line) - len(parts[-1])
     return line[:off], line[off:]
 
@@ -600,21 +803,30 @@ def diff_sequences(seq1, seq2):
     return Levenshtein.opcodes(seq1, seq2)
 
 
-def do_diff(basedump, mydump):
-    asm_lines1 = basedump.split("\n")
-    asm_lines2 = mydump.split("\n")
+class OutputLine:
+    base: Optional[str]
+    fmt2: str
+    key2: str
 
-    output = []
+    def __init__(self, base: Optional[str], fmt2: str, key2: str) -> None:
+        self.base = base
+        self.fmt2 = fmt2
+        self.key2 = key2
 
-    # TODO: status line?
-    # output.append(sha1sum(mydump))
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, OutputLine):
+            return NotImplemented
+        return self.key2 == other.key2
 
-    mnemonics1, asm_lines1, originals1, line_nums1, branch_targets1 = process(
-        asm_lines1
-    )
-    mnemonics2, asm_lines2, originals2, line_nums2, branch_targets2 = process(
-        asm_lines2
-    )
+    def __hash__(self) -> int:
+        return hash(self.key2)
+
+
+def do_diff(basedump: str, mydump: str) -> List[OutputLine]:
+    output: List[OutputLine] = []
+
+    lines1 = process(basedump.split("\n"))
+    lines2 = process(mydump.split("\n"))
 
     sc1 = SymbolColorer(0)
     sc2 = SymbolColorer(0)
@@ -622,141 +834,227 @@ def do_diff(basedump, mydump):
     sc4 = SymbolColorer(4)
     sc5 = SymbolColorer(0)
     sc6 = SymbolColorer(0)
-    bts1 = set()
-    bts2 = set()
+    bts1: Set[str] = set()
+    bts2: Set[str] = set()
 
     if args.show_branches:
-        for (bts, btset, sc) in [
-            (branch_targets1, bts1, sc5),
-            (branch_targets2, bts2, sc6),
+        for (lines, btset, sc) in [
+            (lines1, bts1, sc5),
+            (lines2, bts2, sc6),
         ]:
-            for bt in bts:
+            for line in lines:
+                bt = line.branch_target
                 if bt is not None:
                     btset.add(bt + ":")
                     sc.color_symbol(bt + ":")
 
-    for (tag, i1, i2, j1, j2) in diff_sequences(mnemonics1, mnemonics2):
-        lines1 = asm_lines1[i1:i2]
-        lines2 = asm_lines2[j1:j2]
-
-        for k, (line1, line2) in enumerate(itertools.zip_longest(lines1, lines2)):
+    for (tag, i1, i2, j1, j2) in diff_sequences(
+        [line.mnemonic for line in lines1], [line.mnemonic for line in lines2]
+    ):
+        for line1, line2 in itertools.zip_longest(lines1[i1:i2], lines2[j1:j2]):
             if tag == "replace":
                 if line1 is None:
                     tag = "insert"
                 elif line2 is None:
                     tag = "delete"
-
-            try:
-                original1 = originals1[i1 + k]
-                line_num1 = line_nums1[i1 + k]
-            except:
-                original1 = ""
-                line_num1 = ""
-            try:
-                original2 = originals2[j1 + k]
-                line_num2 = line_nums2[j1 + k]
-            except:
-                original2 = ""
-                line_num2 = ""
+            elif tag == "insert":
+                assert line1 is None
+            elif tag == "delete":
+                assert line2 is None
 
             line_color1 = line_color2 = sym_color = Fore.RESET
             line_prefix = " "
-            if line1 == line2:
-                if maybe_normalize_large_imms(original1) == maybe_normalize_large_imms(
-                    original2
-                ):
-                    out1 = f"{original1}"
-                    out2 = f"{original2}"
-                elif line1 == "<delay-slot>":
-                    out1 = f"{Style.DIM}{original1}"
-                    out2 = f"{Style.DIM}{original2}"
+            if line1 and line2 and line1.diff_row == line2.diff_row:
+                if maybe_normalize_large_imms(
+                    line1.original
+                ) == maybe_normalize_large_imms(line2.original):
+                    out1 = line1.original
+                    out2 = line2.original
+                elif line1.diff_row == "<delay-slot>":
+                    out1 = f"{Style.BRIGHT}{Fore.LIGHTBLACK_EX}{line1.original}"
+                    out2 = f"{Style.BRIGHT}{Fore.LIGHTBLACK_EX}{line2.original}"
                 else:
-                    mnemonic = original1.split()[0]
-                    out1, out2 = original1, original2
+                    mnemonic = line1.original.split()[0]
+                    out1, out2 = line1.original, line2.original
                     branch1 = branch2 = ""
-                    if mnemonic in jump_instructions:
-                        out1, branch1 = split_off_branch(original1)
-                        out2, branch2 = split_off_branch(original2)
+                    if mnemonic in instructions_with_address_immediates:
+                        out1, branch1 = split_off_branch(line1.original)
+                        out2, branch2 = split_off_branch(line2.original)
                     branchless1 = out1
                     branchless2 = out2
                     out1, out2 = color_imms(out1, out2)
-                    branch1, branch2 = color_branch_imms(branch1, branch2)
+
+                    same_relative_target = False
+                    if line1.branch_target is not None and line2.branch_target is not None:
+                        relative_target1 = eval_line_num(line1.branch_target) - eval_line_num(line1.line_num)
+                        relative_target2 = eval_line_num(line2.branch_target) - eval_line_num(line2.line_num)
+                        same_relative_target = relative_target1 == relative_target2
+
+                    if not same_relative_target:
+                        branch1, branch2 = color_branch_imms(branch1, branch2)
+
                     out1 += branch1
                     out2 += branch2
                     if normalize_imms(branchless1) == normalize_imms(branchless2):
-                        # only imms differences
-                        sym_color = Fore.LIGHTBLUE_EX
-                        line_prefix = "i"
+                        if not same_relative_target:
+                            # only imms differences
+                            sym_color = Fore.LIGHTBLUE_EX
+                            line_prefix = "i"
                     else:
-                        # regs differences and maybe imms as well
-                        line_color1 = line_color2 = sym_color = Fore.YELLOW
-                        line_prefix = "r"
                         out1 = re.sub(
-                            re_regs, lambda s: sc1.color_symbol(s.group()), out1
+                            re_sprel, lambda s: sc3.color_symbol(s.group()), out1,
                         )
                         out2 = re.sub(
-                            re_regs, lambda s: sc2.color_symbol(s.group()), out2
+                            re_sprel, lambda s: sc4.color_symbol(s.group()), out2,
                         )
-                        out1 = re.sub(
-                            re_sprel, lambda s: sc3.color_symbol(s.group()), out1
-                        )
-                        out2 = re.sub(
-                            re_sprel, lambda s: sc4.color_symbol(s.group()), out2
-                        )
-                        out1 = f"{Fore.YELLOW}{out1}{Style.RESET_ALL}"
-                        out2 = f"{Fore.YELLOW}{out2}{Style.RESET_ALL}"
-            elif tag in ["replace", "equal"]:
+                        if normalize_stack(branchless1) == normalize_stack(branchless2):
+                            # only stack differences (luckily stack and imm
+                            # differences can't be combined in MIPS, so we
+                            # don't have to think about that case)
+                            sym_color = Fore.YELLOW
+                            line_prefix = "s"
+                        else:
+                            # regs differences and maybe imms as well
+                            out1 = re.sub(
+                                re_reg, lambda s: sc1.color_symbol(s.group()), out1
+                            )
+                            out2 = re.sub(
+                                re_reg, lambda s: sc2.color_symbol(s.group()), out2
+                            )
+                            line_color1 = line_color2 = sym_color = Fore.YELLOW
+                            line_prefix = "r"
+            elif line1 and line2:
                 line_prefix = "|"
                 line_color1 = Fore.LIGHTBLUE_EX
                 line_color2 = Fore.LIGHTBLUE_EX
                 sym_color = Fore.LIGHTBLUE_EX
-                out1 = f"{Fore.LIGHTBLUE_EX}{original1}{Style.RESET_ALL}"
-                out2 = f"{Fore.LIGHTBLUE_EX}{original2}{Style.RESET_ALL}"
-            elif tag == "delete":
+                out1 = line1.original
+                out2 = line2.original
+            elif line1:
                 line_prefix = "<"
-                line_color1 = line_color2 = sym_color = Fore.RED
-                out1 = f"{Fore.RED}{original1}{Style.RESET_ALL}"
+                line_color1 = sym_color = Fore.RED
+                out1 = line1.original
                 out2 = ""
-            elif tag == "insert":
+            elif line2:
                 line_prefix = ">"
-                line_color1 = line_color2 = sym_color = Fore.GREEN
+                line_color2 = sym_color = Fore.GREEN
                 out1 = ""
-                out2 = f"{Fore.GREEN}{original2}{Style.RESET_ALL}"
+                out2 = line2.original
 
-            in_arrow1 = "  "
-            in_arrow2 = "  "
-            out_arrow1 = ""
-            out_arrow2 = ""
-            line_num1 = line_num1 if out1 else ""
-            line_num2 = line_num2 if out2 else ""
+            if args.source and line2 and line2.comment:
+                out2 += f" {line2.comment}"
 
-            if args.show_branches and out1:
-                if line_num1 in bts1:
-                    in_arrow1 = sc5.color_symbol(line_num1, "~>")
-                if branch_targets1[i1 + k] is not None:
-                    out_arrow1 = " " + sc5.color_symbol(
-                        branch_targets1[i1 + k] + ":", "~>"
-                    )
-            if args.show_branches and out2:
-                if line_num2 in bts2:
-                    in_arrow2 = sc6.color_symbol(line_num2, "~>")
-                if branch_targets2[j1 + k] is not None:
-                    out_arrow2 = " " + sc6.color_symbol(
-                        branch_targets2[j1 + k] + ":", "~>"
-                    )
+            def format_part(out: str, line: Optional[Line], line_color: str, btset: Set[str], sc: SymbolColorer) -> Optional[str]:
+                if line is None:
+                    return None
+                in_arrow = "  "
+                out_arrow = ""
+                if args.show_branches:
+                    if line.line_num in btset:
+                        in_arrow = sc.color_symbol(line.line_num, "~>") + line_color
+                    if line.branch_target is not None:
+                        out_arrow = " " + sc.color_symbol(line.branch_target + ":", "~>")
+                out = pad_mnemonic(out)
+                return f"{line_color}{line.line_num} {in_arrow} {out}{Style.RESET_ALL}{out_arrow}"
 
-            if sym_color == line_color2:
-                line_color2 = ""
-            out1 = f"{line_color1}{line_num1} {in_arrow1} {out1}{Style.RESET_ALL}{out_arrow1}"
-            out2 = f"{sym_color}{line_prefix} {line_color2}{line_num2} {in_arrow2} {out2}{Style.RESET_ALL}{out_arrow2}"
-            output.append(format_single_line_diff(out1, out2, args.column_width))
+            part1 = format_part(out1, line1, line_color1, bts1, sc5)
+            part2 = format_part(out2, line2, line_color2, bts2, sc6)
+            key2 = line2.original if line2 else ""
 
-    return output[args.skip_lines :]
+            mid = f"{sym_color}{line_prefix}"
+
+            if line2:
+                for source_line in line2.source_lines:
+                    color = Style.DIM
+                    # File names and function names
+                    if source_line and source_line[0] != "|":
+                        color += Style.BRIGHT
+                        # Function names
+                        if source_line.endswith("():"):
+                            # Underline. Colorama does not provide this feature, unfortunately.
+                            color += "\u001b[4m"
+                            try:
+                                source_line = cxxfilt.demangle(
+                                    source_line[:-3], external_only=False
+                                )
+                            except:
+                                pass
+                    output.append(OutputLine(None, f"  {color}{source_line}{Style.RESET_ALL}", source_line))
+
+            fmt2 = mid + " " + (part2 or "")
+            output.append(OutputLine(part1, fmt2, key2))
+
+    return output
+
+
+def chunk_diff(diff: List[OutputLine]) -> List[Union[List[OutputLine], OutputLine]]:
+    cur_right: List[OutputLine] = []
+    chunks: List[Union[List[OutputLine], OutputLine]] = []
+    for output_line in diff:
+        if output_line.base is not None:
+            chunks.append(cur_right)
+            chunks.append(output_line)
+            cur_right = []
+        else:
+            cur_right.append(output_line)
+    chunks.append(cur_right)
+    return chunks
+
+
+def format_diff(old_diff: List[OutputLine], new_diff: List[OutputLine]) -> Tuple[str, List[str]]:
+    old_chunks = chunk_diff(old_diff)
+    new_chunks = chunk_diff(new_diff)
+    output: List[Tuple[str, OutputLine, OutputLine]] = []
+    assert len(old_chunks) == len(new_chunks), "same target"
+    empty = OutputLine("", "", "")
+    for old_chunk, new_chunk in zip(old_chunks, new_chunks):
+        if isinstance(old_chunk, list):
+            assert isinstance(new_chunk, list)
+            if not old_chunk and not new_chunk:
+                # Most of the time lines sync up without insertions/deletions,
+                # and there's no interdiffing to be done.
+                continue
+            differ = difflib.SequenceMatcher(a=old_chunk, b=new_chunk, autojunk=False)
+            for (tag, i1, i2, j1, j2) in differ.get_opcodes():
+                if tag in ["equal", "replace"]:
+                    for i, j in zip(range(i1, i2), range(j1, j2)):
+                        output.append(("", old_chunk[i], new_chunk[j]))
+                elif tag == "insert":
+                    for j in range(j1, j2):
+                        output.append(("", empty, new_chunk[j]))
+                else:
+                    for i in range(i1, i2):
+                        output.append(("", old_chunk[i], empty))
+        else:
+            assert isinstance(new_chunk, OutputLine)
+            # old_chunk.base and new_chunk.base have the same text since
+            # both diffs are based on the same target, but they might
+            # differ in color. Use the new version.
+            output.append((new_chunk.base or "", old_chunk, new_chunk))
+
+    # TODO: status line, with e.g. approximate permuter score?
+    width = args.column_width
+    if args.threeway:
+        header_line = "TARGET".ljust(width) + "  CURRENT".ljust(width) + "  PREVIOUS"
+        diff_lines = [
+            ansi_ljust(base, width)
+            + ansi_ljust(new.fmt2, width)
+            + (old.fmt2 or "-" if old != new else "")
+            for (base, old, new) in output
+        ]
+    else:
+        header_line = ""
+        diff_lines = [
+            ansi_ljust(base, width) + new.fmt2
+            for (base, old, new) in output
+            if base or new.key2
+        ]
+    return header_line, diff_lines
 
 
 def debounced_fs_watch(targets, outq, debounce_delay):
-    import watchdog.events
-    import watchdog.observers
+    import watchdog.events  # type: ignore
+    import watchdog.observers  # type: ignore
 
     class WatchEventHandler(watchdog.events.FileSystemEventHandler):
         def __init__(self, queue, file_targets):
@@ -827,12 +1125,18 @@ class Display:
         self.basedump = basedump
         self.mydump = mydump
         self.emsg = None
+        self.last_diff_output = None
 
     def run_less(self):
         if self.emsg is not None:
             output = self.emsg
         else:
-            output = "\n".join(do_diff(self.basedump, self.mydump))
+            diff_output = do_diff(self.basedump, self.mydump)
+            last_diff_output = self.last_diff_output or diff_output
+            self.last_diff_output = diff_output
+            header, diff_lines = format_diff(last_diff_output, diff_output)
+            header_lines = [header] if header else []
+            output = "\n".join(header_lines + diff_lines[args.skip_lines :])
 
         # Pipe the output through 'tail' and only then to less, to ensure the
         # write call doesn't block. ('tail' has to buffer all its input before
@@ -912,14 +1216,16 @@ class Display:
 
 
 def main():
-    if args.diff_obj:
+    if args.diff_elf_symbol:
+        make_target, basecmd, mycmd = dump_elf()
+    elif args.diff_obj:
         make_target, basecmd, mycmd = dump_objfile()
     else:
         make_target, basecmd, mycmd = dump_binary()
 
     if args.write_asm is not None:
         mydump = run_objdump(mycmd)
-        with open(args.write_asm) as f:
+        with open(args.write_asm, "w") as f:
             f.write(mydump)
         print(f"Wrote assembly to {args.write_asm}.")
         sys.exit(0)
@@ -980,4 +1286,4 @@ def main():
             display.terminate()
 
 
-main()
\ No newline at end of file
+main()