From 91a534cbc9cb4e0b7e5a1198731b1b66972e7b43 Mon Sep 17 00:00:00 2001
From: Dragorn421 <Dragorn421@users.noreply.github.com>
Date: Wed, 14 Aug 2024 10:05:36 +0200
Subject: [PATCH] Rewrite preprocess.py with bash and C (#2035)

* add C preprocess_pragmas and Bash preprocess

* "line return" -> newline

* align tools sources

* fix: handle files that are not newline-terminated

* use a temp directory with a same-basename file instead of a temp file

* macos compat

* remove debug code
---
 Makefile                   |   2 +-
 tools/.gitignore           |   1 +
 tools/Makefile             |  15 ++--
 tools/preprocess.py        | 110 --------------------------
 tools/preprocess.sh        |  83 ++++++++++++++++++++
 tools/preprocess_pragmas.c | 154 +++++++++++++++++++++++++++++++++++++
 6 files changed, 247 insertions(+), 118 deletions(-)
 delete mode 100755 tools/preprocess.py
 create mode 100755 tools/preprocess.sh
 create mode 100644 tools/preprocess_pragmas.c

diff --git a/Makefile b/Makefile
index 4a7ec5ac8b..c5eaeff79a 100644
--- a/Makefile
+++ b/Makefile
@@ -452,7 +452,7 @@ $(BUILD_DIR)/src/code/jpegdecoder.o: CC := $(CC_OLD)
 
 ifeq ($(PERMUTER),)  # permuter + preprocess.py misbehaves, permuter doesn't care about rodata diffs or bss ordering so just don't use it in that case
 # Handle encoding (UTF-8 -> EUC-JP) and custom pragmas
-$(BUILD_DIR)/src/%.o: CC := $(PYTHON) tools/preprocess.py -v $(VERSION) -- $(CC)
+$(BUILD_DIR)/src/%.o: CC := ./tools/preprocess.sh -v $(VERSION) -- $(CC)
 endif
 
 else
diff --git a/tools/.gitignore b/tools/.gitignore
index 4dff1be3ae..9583c693b9 100644
--- a/tools/.gitignore
+++ b/tools/.gitignore
@@ -4,6 +4,7 @@ elf2rom
 makeromfs
 mkdmadata
 mkldscript
+preprocess_pragmas
 reloc_prereq
 vtxdis
 yaz0
diff --git a/tools/Makefile b/tools/Makefile
index 62bc881e17..3cde87ebc2 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,5 +1,5 @@
 CFLAGS := -Wall -Wextra -pedantic -std=c99 -g -O2
-PROGRAMS := elf2rom makeromfs mkdmadata mkldscript reloc_prereq vtxdis
+PROGRAMS := elf2rom makeromfs mkdmadata mkldscript preprocess_pragmas reloc_prereq vtxdis
 
 ifeq ($(shell command -v clang >/dev/null 2>&1; echo $$?),0)
   CC := clang
@@ -33,12 +33,13 @@ distclean: clean
 
 .PHONY: all clean distclean
 
-elf2rom_SOURCES      := elf2rom.c elf32.c n64chksum.c util.c
-makeromfs_SOURCES    := makeromfs.c n64chksum.c util.c
-mkdmadata_SOURCES    := mkdmadata.c spec.c util.c
-mkldscript_SOURCES   := mkldscript.c spec.c util.c
-reloc_prereq_SOURCES := reloc_prereq.c spec.c util.c
-vtxdis_SOURCES       := vtxdis.c
+elf2rom_SOURCES            := elf2rom.c elf32.c n64chksum.c util.c
+makeromfs_SOURCES          := makeromfs.c n64chksum.c util.c
+mkdmadata_SOURCES          := mkdmadata.c spec.c util.c
+mkldscript_SOURCES         := mkldscript.c spec.c util.c
+preprocess_pragmas_SOURCES := preprocess_pragmas.c
+reloc_prereq_SOURCES       := reloc_prereq.c spec.c util.c
+vtxdis_SOURCES             := vtxdis.c
 
 
 define COMPILE =
diff --git a/tools/preprocess.py b/tools/preprocess.py
deleted file mode 100755
index c39bf835c8..0000000000
--- a/tools/preprocess.py
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/usr/bin/env python3
-
-# SPDX-FileCopyrightText: © 2024 ZeldaRET
-# SPDX-License-Identifier: CC0-1.0
-
-# Usage: preprocess.py [flags] -- [compile command minus input file...] [single input file]
-# Preprocess a C file to:
-# * Re-encode from UTF-8 to EUC-JP (the repo uses UTF-8 for text encoding, but
-#   the strings in the ROM are encoded in EUC-JP)
-# * Replace `#pragma increment_block_number` with fake structs for controlling BSS ordering
-
-import argparse
-from pathlib import Path
-import re
-import tempfile
-import subprocess
-import sys
-import typing
-
-
-def fail(message):
-    print(message, file=sys.stderr)
-    sys.exit(1)
-
-
-def process_file(
-    version: str,
-    filename: str,
-    input: typing.TextIO,
-    output: typing.TextIO,
-):
-    output.write(f'#line 1 "{filename}"\n')
-    # whether the current line follows a #pragma increment_block_number,
-    # including continuation lines (lines after a \-ending line)
-    in_pragma_incblocknum = False
-    # the line where the #pragma increment_block_number is
-    pragma_incblocknum_first_line_num = None
-    # all the lines from the #pragma increment_block_number line to the last
-    # continuation line, as a list[str]
-    pragma_incblocknum_lines = None
-    for i, line in enumerate(input, start=1):
-        if not in_pragma_incblocknum and line.startswith(
-            "#pragma increment_block_number"
-        ):
-            in_pragma_incblocknum = True
-            pragma_incblocknum_first_line_num = i
-            pragma_incblocknum_lines = []
-
-        if in_pragma_incblocknum:
-            if line.endswith("\\\n"):
-                pragma_incblocknum_lines.append(line)
-            else:
-                in_pragma_incblocknum = False
-                pragma_incblocknum_lines.append(line)
-                amount = 0
-                for s in pragma_incblocknum_lines:
-                    # Note if we had two versions like "abc-def-version" and "def-version"
-                    # then this code would find either given "def-version", but
-                    # thankfully we don't have such nested version names.
-                    m = re.search(rf"{version}:(\d+)\b", s)
-                    if m:
-                        amount = int(m.group(1))
-                        break
-
-                # Always generate at least one struct,
-                # so that fix_bss.py can know where the increment_block_number pragmas are
-                if amount == 0:
-                    amount = 256
-
-                # Write fake structs for BSS ordering
-                # pragma_incblocknum_first_line_num is used for symbol uniqueness, and
-                # also by fix_bss.py to locate the pragma these symbols originate from.
-                for j in range(amount):
-                    output.write(
-                        "struct increment_block_number_"
-                        f"{pragma_incblocknum_first_line_num:05}_{j:03};\n"
-                    )
-                output.write(f'#line {i + 1} "{filename}"\n')
-        else:
-            output.write(line)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-v", "--oot-version", help="Which version should be processed")
-    parser.add_argument(
-        "args",
-        nargs="+",
-    )
-
-    args = parser.parse_args()
-
-    filename = Path(args.args[-1])
-    with tempfile.TemporaryDirectory(prefix="oot_") as tmpdir:
-        tmpfile = Path(tmpdir) / filename.name
-
-        with open(filename, mode="r", encoding="utf-8") as input:
-            with open(tmpfile, mode="w", encoding="euc-jp") as output:
-                process_file(args.oot_version, filename, input, output)
-
-        compile_command = args.args[:-1] + ["-I", filename.parent, tmpfile]
-        process = subprocess.run(compile_command)
-        return process.returncode
-
-
-if __name__ == "__main__":
-    try:
-        sys.exit(main())
-    except KeyboardInterrupt:
-        sys.exit(1)
diff --git a/tools/preprocess.sh b/tools/preprocess.sh
new file mode 100755
index 0000000000..e29c751727
--- /dev/null
+++ b/tools/preprocess.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: © 2024 ZeldaRET
+# SPDX-License-Identifier: CC0-1.0
+
+# Usage: preprocess [flags] -- [compile command minus input file...] [single input file]
+# Flags: -v OOT_VERSION (required)
+# Preprocess a C file to:
+# * Re-encode from UTF-8 to EUC-JP
+#   (the repo uses UTF-8 for text encoding, but the strings in the ROM are encoded in EUC-JP)
+# * Replace `#pragma increment_block_number` (see preprocess_pragmas.c)
+
+set -e  # exit as soon as a command fails
+set -o pipefail  # a pipeline fails if any command in it fails, not only the last one
+
+if [ "${VERBOSE-}" ]  # a non-empty VERBOSE environment variable turns on command tracing
+then
+    set -x
+fi
+
+for i in `seq ${#@}`  # walk the argument positions 1..$#
+do
+    if [[ "${!i}" = '--' ]]  # ${!i} is the i-th positional argument (indirect expansion)
+    then
+        # flags before --
+        flags=("${@:1:$(($i - 1))}")
+        # compile command, between -- and the input source file (joined into a single string)
+        compilecmd="${@:$(($i + 1)):$((${#@} - $i - 1))}"
+        # The last argument, the input source file to be compiled
+        srcfile="${@: -1}"
+        break  # only the first -- is significant
+    fi
+done
+
+if [ "${VERBOSE-}" ]  # debug dump of the parsed command line
+then
+    echo flags="${flags[@]}"
+    echo compilecmd="$compilecmd"
+    echo srcfile="$srcfile"
+fi
+
+while getopts "v:" opt "${flags[@]}"  # parse only the arguments that precede --
+do
+    case $opt in
+        v)
+            OOT_VERSION=$OPTARG
+            ;;
+        ?)
+            echo "Error: Bad flags" >&2
+            exit 1
+            ;;
+    esac
+done
+
+if [[ "${!OPTIND}" != '--' ]]  # the first argument getopts did not consume must be --
+then
+    echo "Error: Positional arguments in flags not allowed" >&2
+    exit 1
+fi
+
+if [ -z "${OOT_VERSION-}" ]
+then
+    echo Missing -v >&2
+    exit 1  # a bare `exit` would return the status of the echo above, i.e. success
+fi
+
+# Create a temporary directory, and remove it on script exit
+# We use a temp dir instead of a temp file because ido_block_numbers.py and fix_bss.py
+# need the symbol table .T file from IDO, which is always named like the input file.
+# So we use a file named like the original input file, inside a temp dir.
+tempdir=$(mktemp -d)
+trap 'rm -rf "$tempdir"' EXIT  # single quotes: $tempdir is expanded (and stays quoted) when the trap runs
+tempfile="$tempdir/$(basename "$srcfile")"
+
+# Preprocess pragmas and re-encode from UTF-8 to EUC-JP
+{
+    printf '#line 1 "%s"\n' "$srcfile"  # linemarker
+    ./tools/preprocess_pragmas "$OOT_VERSION" "$srcfile" < "$srcfile"
+} | iconv -f UTF-8 -t EUC-JP > "$tempfile"
+
+# Also include the source file's directory to have the include path as if we compiled the original source.
+# Pass the processed temporary file for compilation. $compilecmd stays unquoted: it is one string that must be word-split.
+$compilecmd -I "$(dirname "$srcfile")" "$tempfile"
diff --git a/tools/preprocess_pragmas.c b/tools/preprocess_pragmas.c
new file mode 100644
index 0000000000..e31b863d07
--- /dev/null
+++ b/tools/preprocess_pragmas.c
@@ -0,0 +1,154 @@
+
+// SPDX-FileCopyrightText: © 2024 ZeldaRET
+// SPDX-License-Identifier: CC0-1.0
+
+// Usage: preprocess_pragmas OOT_VERSION filename < source.c
+// The filename argument is only used for linemarkers.
+// Preprocesses C source read from stdin and writes the result to stdout.
+// Replace `#pragma increment_block_number` with fake structs for controlling BSS ordering.
+// The names of these fake structs are expected to be increment_block_number_%d_%d with the first number indicating
+// the line number of the #pragma in the original source file. (this is for use by fix_bss.py)
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+const char str_pragma_increment_block_number[] = "#pragma increment_block_number";
+
+int main(int argc, char** argv) {
+    if (argc != 3) {
+        fprintf(stderr, "Usage: preprocess_pragmas OOT_VERSION filename < source.c\n");
+        return EXIT_FAILURE;
+    }
+    char* const version = argv[1]; // version name looked up in the pragma's version:amount items
+    const int len_version = strlen(version);
+    char* const filename = argv[2]; // only echoed in the emitted #line linemarkers
+
+    char buf[32 * 1024]; // holds complete lines; any single line must fit in it
+    char* const bufend = buf + sizeof(buf);
+    char* bufp = buf; // one past the last valid byte in buf
+    bool cont = true; // cleared once stdin is exhausted
+    int line_num = 1; // 1-based number of the line being processed
+    // whether the current line follows a #pragma increment_block_number,
+    // including continuation lines (lines after a \-ending line)
+    bool is_in_pragma = false;
+    // the line where the #pragma increment_block_number is
+    int pragma_line_number = 0;
+    // how many fake structs to write to replace the current pragma
+    int n_fake_structs = 0;
+
+    while (cont) {
+        size_t nread = fread(bufp, 1, bufend - bufp, stdin);
+        bufp += nread;
+        if (nread == 0) {
+            if (!feof(stdin)) {
+                perror("fread");
+                fprintf(stderr, "Failed to read from stdin\n");
+                return EXIT_FAILURE;
+            }
+            cont = false;
+            if (bufp == buf) {
+                // All lines processed
+                break;
+            } else {
+                // The buffer contains the last line and that line isn't terminated with a newline.
+                // Add a final newline and do one last iteration.
+                assert(bufp < bufend);
+                *bufp = '\n';
+                bufp++;
+            }
+        }
+
+        char* last_newline = NULL; // only complete lines, up to this newline, are processed this round
+        for (char* p = bufp - 1; p >= buf; p--) {
+            if (*p == '\n') {
+                last_newline = p;
+                break;
+            }
+        }
+        if (last_newline == NULL) {
+            // No newline, read more data.
+            // Assert there is space for it (there should be no line long enough to not fit in buf).
+            assert(bufp < bufend);
+            continue;
+        }
+
+        char* line = buf;
+        while (true) {
+            char* line_end = line; // advanced to the newline that terminates the current line
+            while (*line_end != '\n') {
+                line_end++;
+                assert(line_end <= last_newline);
+            }
+            if (!strncmp(line, str_pragma_increment_block_number, strlen(str_pragma_increment_block_number))) {
+                is_in_pragma = true;
+                pragma_line_number = line_num;
+                n_fake_structs = 0;
+            }
+            if (is_in_pragma) {
+                *line_end = '\0'; // terminate the line so strstr/strtol cannot run into the next one
+                char* version_amount_item = strstr(line, version);
+                if (version_amount_item != NULL) {
+                    if (version_amount_item[len_version] != ':') {
+                        fprintf(stderr, "Found version %s in pragma line but no :amount attached\n", version);
+                        fprintf(stderr, "%s\n", line);
+                        return EXIT_FAILURE;
+                    }
+                    char* version_amount_str_start = &version_amount_item[len_version + 1];
+                    char* version_amount_str_end;
+                    long amount = strtol(version_amount_str_start, &version_amount_str_end, 10);
+                    if (version_amount_str_start == version_amount_str_end) {
+                        fprintf(stderr, "Found version %s in pragma line but no amount integer\n", version);
+                        fprintf(stderr, "%s\n", line);
+                        return EXIT_FAILURE;
+                    }
+                    n_fake_structs = (int)amount;
+                }
+            } else {
+                char* p = line;
+                size_t sz = line_end + 1 - line; // copy the line through, newline included
+                while (sz != 0) {
+                    size_t nwritten = fwrite(p, 1, sz, stdout);
+                    if (nwritten == 0) {
+                        fprintf(stderr, "Failed to write to stdout\n");
+                        return EXIT_FAILURE;
+                    }
+                    p += nwritten;
+                    sz -= nwritten;
+                }
+            }
+            // An empty line cannot end with a backslash; test that first so line_end[-1] never reads before buf.
+            if (is_in_pragma && (line_end == line || line_end[-1] != '\\')) {
+                is_in_pragma = false;
+
+                // Always generate at least one struct,
+                // so that fix_bss.py can know where the increment_block_number pragmas are
+                if (n_fake_structs <= 0) { // also covers a negative amount, which strtol accepts
+                    n_fake_structs = 256;
+                }
+
+                // Write fake structs for BSS ordering
+                // pragma_line_number is used for symbol uniqueness,
+                // and also by fix_bss.py to locate the pragma these symbols originate from.
+                for (int i = 0; i < n_fake_structs; i++)
+                    fprintf(stdout, "struct increment_block_number_%05d_%03d;\n", pragma_line_number, i);
+                fprintf(stdout, "#line %d \"%s\"\n", line_num + 1, filename);
+            }
+            line_num++;
+            if (line_end == last_newline)
+                break;
+            line = line_end + 1;
+        }
+        assert(bufp <= bufend);
+        assert(bufp > last_newline);
+        char* next_incomplete_line_start = last_newline + 1; // unprocessed tail, moved to the front of buf
+        ptrdiff_t next_incomplete_line_sz = bufp - next_incomplete_line_start;
+        assert(next_incomplete_line_sz >= 0);
+        memmove(buf, next_incomplete_line_start, next_incomplete_line_sz);
+        bufp = buf + next_incomplete_line_sz;
+    }
+    // Flush explicitly so that a failed buffered write (fprintf above, full disk, closed pipe) fails the run.
+    return (fflush(stdout) == 0 && !ferror(stdout)) ? EXIT_SUCCESS : EXIT_FAILURE;
+}