Rewrite preprocess.py with bash and C (#2035)

* add C preprocess_pragmas and Bash preprocess * "line return" -> newline * align tools sources * fix: handle files that are not newline-terminated * use a temp directory with a same-basename file instead of a temp file * macos compat * remove debug code
2025-08-07 06:40:15 +00:00 · 2024-08-14 10:05:36 +02:00 · 2024-08-14 10:05:36 +02:00 · 91a534cbc9
commit 91a534cbc9
parent f6338bab1f
6 changed files with 247 additions and 118 deletions
--- a/2
+++ b/2
@ -452,7 +452,7 @@ $(BUILD_DIR)/src/code/jpegdecoder.o: CC := $(CC_OLD)
 ifeq ($(PERMUTER),)  # permuter + preprocess.py misbehaves, permuter doesn't care about rodata diffs or bss ordering so just don't use it in that case
 # Handle encoding (UTF-8 -> EUC-JP) and custom pragmas
-$(BUILD_DIR)/src/%.o: CC := $(PYTHON) tools/preprocess.py -v $(VERSION) -- $(CC)
+$(BUILD_DIR)/src/%.o: CC := ./tools/preprocess.sh -v $(VERSION) -- $(CC)
 endif
 else
--- a/tools/.gitignore
+++ b/tools/.gitignore
@ -4,6 +4,7 @@ elf2rom
 makeromfs
 mkdmadata
 mkldscript
 preprocess_pragmas
 reloc_prereq
 vtxdis
 yaz0
--- a/tools/Makefile
+++ b/tools/Makefile
@ -1,5 +1,5 @@
 CFLAGS := -Wall -Wextra -pedantic -std=c99 -g -O2
-PROGRAMS := elf2rom makeromfs mkdmadata mkldscript reloc_prereq vtxdis
+PROGRAMS := elf2rom makeromfs mkdmadata mkldscript preprocess_pragmas reloc_prereq vtxdis
 ifeq ($(shell command -v clang >/dev/null 2>&1; echo $$?),0)
  CC := clang
@ -33,12 +33,13 @@ distclean: clean
 .PHONY: all clean distclean
-elf2rom_SOURCES      := elf2rom.c elf32.c n64chksum.c util.c
+elf2rom_SOURCES            := elf2rom.c elf32.c n64chksum.c util.c
-makeromfs_SOURCES    := makeromfs.c n64chksum.c util.c
+makeromfs_SOURCES          := makeromfs.c n64chksum.c util.c
-mkdmadata_SOURCES    := mkdmadata.c spec.c util.c
+mkdmadata_SOURCES          := mkdmadata.c spec.c util.c
-mkldscript_SOURCES   := mkldscript.c spec.c util.c
+mkldscript_SOURCES         := mkldscript.c spec.c util.c
-reloc_prereq_SOURCES := reloc_prereq.c spec.c util.c
+preprocess_pragmas_SOURCES := preprocess_pragmas.c
-vtxdis_SOURCES       := vtxdis.c
+reloc_prereq_SOURCES       := reloc_prereq.c spec.c util.c
 vtxdis_SOURCES             := vtxdis.c
 define COMPILE =
--- a/tools/preprocess.py
+++ b/tools/preprocess.py
@ -1,110 +0,0 @@
 #!/usr/bin/env python3
 # SPDX-FileCopyrightText: © 2024 ZeldaRET
 # SPDX-License-Identifier: CC0-1.0
 # Usage: preprocess.py [flags] -- [compile command minus input file...] [single input file]
 # Preprocess a C file to:
 # * Re-encode from UTF-8 to EUC-JP (the repo uses UTF-8 for text encoding, but
 #   the strings in the ROM are encoded in EUC-JP)
 # * Replace `#pragma increment_block_number` with fake structs for controlling BSS ordering
 import argparse
 from pathlib import Path
 import re
 import tempfile
 import subprocess
 import sys
 import typing
 def fail(message):
    """Report *message* on standard error, then abort with exit status 1.

    Never returns: terminates by raising ``SystemExit(1)``.
    """
    sys.stderr.write(str(message) + "\n")
    raise SystemExit(1)
 def process_file(
    version: str,
    filename: str,
    input: typing.TextIO,
    output: typing.TextIO,
 ):
    """Copy C source from *input* to *output*, expanding block-number pragmas.

    A ``#line 1 "<filename>"`` marker is written first. Every
    ``#pragma increment_block_number`` line (together with its ``\\``-continued
    lines) is replaced by fake struct declarations followed by a ``#line``
    marker that restores the original line numbering. All other lines are
    copied unchanged.

    Args:
        version: Version name whose ``<version>:<amount>`` item in the pragma
            selects how many fake structs to write.
        filename: Name used in the emitted ``#line`` markers.
        input: Text stream to read the source from.
        output: Text stream to write the processed source to.
    """
    # The version name is matched literally (names may contain regex
    # metacharacters such as "."), and must not be the tail of a longer name:
    # given "abc-def-version:1 def-version:2", version "def-version" selects 2.
    version_amount_re = re.compile(
        r"(?<![\w.-])" + re.escape(str(version)) + r":(\d+)\b"
    )
    output.write(f'#line 1 "{filename}"\n')
    # whether the current line belongs to a #pragma increment_block_number,
    # including continuation lines (lines after a \-ending line)
    in_pragma = False
    # the line where the #pragma increment_block_number is
    pragma_first_line_num = None
    # all the lines from the #pragma increment_block_number line to the last
    # continuation line, as a list[str]
    pragma_lines = None
    for i, line in enumerate(input, start=1):
        if not in_pragma and line.startswith("#pragma increment_block_number"):
            in_pragma = True
            pragma_first_line_num = i
            pragma_lines = []
        if not in_pragma:
            output.write(line)
            continue
        pragma_lines.append(line)
        if line.endswith("\\\n"):
            # The pragma continues on the next line
            continue
        in_pragma = False
        # The first line carrying an item for this version decides the amount
        amount = 0
        for s in pragma_lines:
            m = version_amount_re.search(s)
            if m:
                amount = int(m.group(1))
                break
        # Always generate at least one struct,
        # so that fix_bss.py can know where the increment_block_number pragmas are
        if amount == 0:
            amount = 256
        # Write fake structs for BSS ordering
        # pragma_first_line_num is used for symbol uniqueness, and
        # also by fix_bss.py to locate the pragma these symbols originate from.
        for j in range(amount):
            output.write(
                f"struct increment_block_number_{pragma_first_line_num:05}_{j:03};\n"
            )
        output.write(f'#line {i + 1} "{filename}"\n')
 def main():
    """Preprocess the last command-line argument and compile the result.

    The positional arguments are a compile command followed by a single input
    file. The input is read as UTF-8, run through process_file and written as
    EUC-JP to a file of the same name inside a temporary directory; the compile
    command is then run on that file with the original directory added as an
    include path.

    Returns:
        The return code of the compile command.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("-v", "--oot-version", help="Which version should be processed")
    cli.add_argument("args", nargs="+")
    parsed = cli.parse_args()

    *compiler_args, source_arg = parsed.args
    source_path = Path(source_arg)

    with tempfile.TemporaryDirectory(prefix="oot_") as scratch_dir:
        # Same basename as the original, just in a scratch directory
        processed_path = Path(scratch_dir) / source_path.name
        with open(source_path, mode="r", encoding="utf-8") as src, open(
            processed_path, mode="w", encoding="euc-jp"
        ) as dst:
            process_file(parsed.oot_version, source_path, src, dst)

        completed = subprocess.run(
            compiler_args + ["-I", source_path.parent, processed_path]
        )
        return completed.returncode
 if __name__ == "__main__":
    # Script entry point: exit with main()'s return value,
    # or with status 1 if the run is interrupted from the keyboard.
    try:
        exit_status = main()
    except KeyboardInterrupt:
        exit_status = 1
    sys.exit(exit_status)
--- a/tools/preprocess.sh
+++ b/tools/preprocess.sh
@ -0,0 +1,83 @@
 #!/bin/bash
 # SPDX-FileCopyrightText: © 2024 ZeldaRET
 # SPDX-License-Identifier: CC0-1.0
 # Usage: preprocess [flags] -- [compile command minus input file...] [single input file]
 # Flags: -v OOT_VERSION (required)
 # Preprocess a C file to:
 # * Re-encode from UTF-8 to EUC-JP
 #   (the repo uses UTF-8 for text encoding, but the strings in the ROM are encoded in EUC-JP)
 # * Replace `#pragma increment_block_number` (see preprocess_pragmas)
 # Exit status: 1 on usage errors, otherwise the status of the failing step or of the compile command.
 set -e
 set -o pipefail
 if [ "${VERBOSE-}" ]
 then
    set -x
 fi
 # Split the arguments at the first `--`.
 # Arrays are used so that every argument reaches the compiler exactly as it was passed,
 # even if it contains spaces or glob characters.
 flags=()
 compilecmd=()
 srcfile=
 # `--` cannot be the last argument: the input source file must follow it
 for ((i = 1; i < $#; i++))
 do
    if [[ "${!i}" = '--' ]]
    then
        # flags before --
        flags=("${@:1:$((i - 1))}")
        # compile command, between -- and the input source file
        compilecmd=("${@:$((i + 1)):$(($# - i - 1))}")
        # The last argument, the input source file to be compiled
        srcfile="${@: -1}"
        break
    fi
 done
 if [ "${VERBOSE-}" ]
 then
    echo flags="${flags[*]}"
    echo compilecmd="${compilecmd[*]}"
    echo srcfile="$srcfile"
 fi
 if [ -z "$srcfile" ] || [ ${#compilecmd[@]} -eq 0 ]
 then
    echo "Error: Expected -- followed by a compile command and an input file" >&2
    exit 1
 fi
 # Only run getopts when there are flags: given no arguments to parse,
 # getopts would fall back to parsing the script's own positional parameters.
 if [ ${#flags[@]} -gt 0 ]
 then
    while getopts "v:" opt "${flags[@]}"
    do
        case $opt in
            v)
                OOT_VERSION=$OPTARG
                ;;
            ?)
                echo "Error: Bad flags" >&2
                exit 1
                ;;
        esac
    done
    # getopts stops at the first non-option argument, everything in flags must have been consumed
    if [ $((OPTIND - 1)) -ne ${#flags[@]} ]
    then
        echo "Error: Positional arguments in flags not allowed" >&2
        exit 1
    fi
 fi
 if [ -z "${OOT_VERSION-}" ]
 then
    echo "Missing -v" >&2
    # A bare `exit` would return the status of the echo above (success)
    exit 1
 fi
 # Create a temporary directory, and remove it on script exit
 # We use a temp dir instead of a temp file because ido_block_numbers.py and fix_bss.py
 # need the symbol table .T file from IDO, which is always named like the input file.
 # So we use a file named like the original input file, inside a temp dir.
 tempdir=$(mktemp -d)
 # Install the cleanup right away so that a failure below cannot leak the directory.
 # Single quotes: $tempdir is expanded (and quoted) when the trap runs.
 trap 'rm -rf "$tempdir"' EXIT
 tempfile="$tempdir/$(basename "$srcfile")"
 # Preprocess pragmas and re-encode from UTF-8 to EUC-JP
 {
    printf '#line 1 "%s"\n' "$srcfile"  # linemarker
    ./tools/preprocess_pragmas "$OOT_VERSION" "$srcfile" < "$srcfile"
 } | iconv -f UTF-8 -t EUC-JP > "$tempfile"
 # Also include the source file's directory to have the include path as if we compiled the original source.
 # Pass the processed temporary file for compilation.
 "${compilecmd[@]}" -I "$(dirname "$srcfile")" "$tempfile"
--- a/tools/preprocess_pragmas.c
+++ b/tools/preprocess_pragmas.c
@ -0,0 +1,154 @@
 // SPDX-FileCopyrightText: © 2024 ZeldaRET
 // SPDX-License-Identifier: CC0-1.0
 // Usage: preprocess_pragmas OOT_VERSION filename < source.c
 // The filename argument is only used for linemarkers.
 // Preprocess C source on stdin, writes to stdout
 // Replace `#pragma increment_block_number` with fake structs for controlling BSS ordering.
 // The names of these fake structs are expected to be increment_block_number_%d_%d with the first number indicating
 // the line number of the #pragma in the original source file. (this is for use by fix_bss.py)
 #include <assert.h>
 #include <ctype.h>
 #include <limits.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 // Text a line must start with to be treated as an increment_block_number pragma
 // (main compares it against the beginning of each input line).
 const char str_pragma_increment_block_number[] = "#pragma increment_block_number";
 // Returns whether `c` can be part of a version name such as "gc-eu-mq" or "ntsc-1.2".
 static bool is_version_name_char(char c) {
    return isalnum((unsigned char)c) || c == '-' || c == '_' || c == '.';
 }
 // Finds `version` in the NUL-terminated `line` as a whole name, i.e. not as a part of a longer
 // version name (looking for "gc-eu" must not stop on "gc-eu-mq:128").
 // Returns a pointer to the start of the name, or NULL if there is no such occurrence.
 static char* find_version_item(char* line, const char* version, size_t len_version) {
    for (char* p = strstr(line, version); p != NULL; p = strstr(p + 1, version)) {
        bool starts_name = (p == line) || !is_version_name_char(p[-1]);
        bool ends_name = !is_version_name_char(p[len_version]);
        if (starts_name && ends_name) {
            return p;
        }
    }
    return NULL;
 }
 // Reads C source from stdin and writes it to stdout, replacing each `#pragma increment_block_number`
 // (including its \-continued lines) with fake struct declarations followed by a #line marker.
 // argv[1] is the version whose `version:amount` item selects the number of structs,
 // argv[2] is the filename written in the #line markers.
 // Returns EXIT_SUCCESS, or EXIT_FAILURE on bad usage, I/O errors or malformed pragmas.
 int main(int argc, char** argv) {
    if (argc != 3) {
        fprintf(stderr, "Usage: preprocess_pragmas OOT_VERSION filename < source.c\n");
        return EXIT_FAILURE;
    }
    char* const version = argv[1];
    const size_t len_version = strlen(version);
    char* const filename = argv[2];
    if (len_version == 0) {
        // An empty version would match everywhere
        fprintf(stderr, "OOT_VERSION must not be empty\n");
        return EXIT_FAILURE;
    }
    const size_t len_pragma = strlen(str_pragma_increment_block_number);
    char buf[32 * 1024];
    char* const bufend = buf + sizeof(buf);
    // end of the data currently held in buf
    char* bufp = buf;
    bool cont = true;
    int line_num = 1;
    // whether the current line follows a #pragma increment_block_number,
    // including continuation lines (lines after a \-ending line)
    bool is_in_pragma = false;
    // the line where the #pragma increment_block_number is
    int pragma_line_number = 0;
    // how many fake structs to write to replace the current pragma
    int n_fake_structs = 0;
    while (cont) {
        size_t nread = fread(bufp, 1, bufend - bufp, stdin);
        bufp += nread;
        if (nread == 0) {
            if (!feof(stdin)) {
                perror("fread");
                fprintf(stderr, "Failed to read from stdin\n");
                return EXIT_FAILURE;
            }
            cont = false;
            if (bufp == buf) {
                // All lines processed
                break;
            }
            // The buffer contains the last line and that line isn't terminated with a newline.
            // Add a final newline and do one last iteration.
            if (bufp >= bufend) {
                fprintf(stderr, "Line %d is too long (more than %zu bytes)\n", line_num, sizeof(buf));
                return EXIT_FAILURE;
            }
            *bufp = '\n';
            bufp++;
        }
        // Find the last newline in the buffer: only complete lines are processed.
        char* last_newline = NULL;
        for (char* p = bufp; p > buf; p--) {
            if (p[-1] == '\n') {
                last_newline = p - 1;
                break;
            }
        }
        if (last_newline == NULL) {
            // No newline, read more data. There must be space left for it:
            // a line that does not fit in buf cannot be processed.
            if (bufp >= bufend) {
                fprintf(stderr, "Line %d is too long (more than %zu bytes)\n", line_num, sizeof(buf));
                return EXIT_FAILURE;
            }
            continue;
        }
        char* line = buf;
        while (true) {
            char* line_end = memchr(line, '\n', (size_t)(last_newline + 1 - line));
            assert(line_end != NULL);
            if (strncmp(line, str_pragma_increment_block_number, len_pragma) == 0) {
                is_in_pragma = true;
                pragma_line_number = line_num;
                n_fake_structs = 0;
            }
            if (is_in_pragma) {
                // Terminate the line so it can be searched as a string.
                // Pragma lines are not copied to the output, so the newline isn't needed anymore.
                *line_end = '\0';
                char* version_amount_item = find_version_item(line, version, len_version);
                if (version_amount_item != NULL) {
                    if (version_amount_item[len_version] != ':') {
                        fprintf(stderr, "Found version %s in pragma line but no :amount attached\n", version);
                        fprintf(stderr, "%s\n", line);
                        return EXIT_FAILURE;
                    }
                    char* version_amount_str_start = &version_amount_item[len_version + 1];
                    char* version_amount_str_end;
                    long amount = strtol(version_amount_str_start, &version_amount_str_end, 10);
                    if (version_amount_str_start == version_amount_str_end) {
                        fprintf(stderr, "Found version %s in pragma line but no amount integer\n", version);
                        fprintf(stderr, "%s\n", line);
                        return EXIT_FAILURE;
                    }
                    if (amount < 0 || amount > INT_MAX) {
                        fprintf(stderr, "Found version %s in pragma line but the amount is out of range\n", version);
                        fprintf(stderr, "%s\n", line);
                        return EXIT_FAILURE;
                    }
                    n_fake_structs = (int)amount;
                }
            } else {
                // Not part of a pragma: copy the line, newline included, to the output
                char* p = line;
                size_t sz = line_end + 1 - line;
                while (sz != 0) {
                    size_t nwritten = fwrite(p, 1, sz, stdout);
                    if (nwritten == 0) {
                        fprintf(stderr, "Failed to write to stdout\n");
                        return EXIT_FAILURE;
                    }
                    p += nwritten;
                    sz -= nwritten;
                }
            }
            // The pragma ends on the first line that doesn't end with a backslash.
            // An empty line has no last character to look at (and may sit at the very start of buf).
            if (is_in_pragma && (line_end == line || line_end[-1] != '\\')) {
                is_in_pragma = false;
                // Always generate at least one struct,
                // so that fix_bss.py can know where the increment_block_number pragmas are
                if (n_fake_structs == 0) {
                    n_fake_structs = 256;
                }
                // Write fake structs for BSS ordering
                // pragma_line_number is used for symbol uniqueness,
                // and also by fix_bss.py to locate the pragma these symbols originate from.
                for (int i = 0; i < n_fake_structs; i++) {
                    fprintf(stdout, "struct increment_block_number_%05d_%03d;\n", pragma_line_number, i);
                }
                fprintf(stdout, "#line %d \"%s\"\n", line_num + 1, filename);
            }
            line_num++;
            if (line_end == last_newline) {
                break;
            }
            line = line_end + 1;
        }
        // Move the incomplete line following the last newline (if any) to the start of the buffer
        assert(bufp <= bufend);
        assert(bufp > last_newline);
        char* next_incomplete_line_start = last_newline + 1;
        ptrdiff_t next_incomplete_line_sz = bufp - next_incomplete_line_start;
        assert(next_incomplete_line_sz >= 0);
        memmove(buf, next_incomplete_line_start, next_incomplete_line_sz);
        bufp = buf + next_incomplete_line_sz;
    }
    // Report output errors that only show up when the buffered data is actually written
    if (fflush(stdout) != 0 || ferror(stdout)) {
        fprintf(stderr, "Failed to write to stdout\n");
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
 }