Reworked text extraction + add JP text extraction (#1980)

* Reworked text extraction + add JP text extraction * Format * Suggested changes * Correct address for gc-us sJpnMessageEntryTable Co-authored-by: cadmic <cadmic24@gmail.com> --------- Co-authored-by: cadmic <cadmic24@gmail.com>
2025-08-23 23:41:24 +00:00 · 2024-07-03 03:42:52 +01:00 · 2024-07-03 03:42:52 +01:00 · baf1e8c174
commit baf1e8c174
parent 264581ff3f
19 changed files with 1326 additions and 707 deletions
--- a/tools/msgenc.py
+++ b/tools/msgenc.py
@ -3,41 +3,104 @@
 #   message_data_static text encoder
 #

-import argparse, ast, re
-import sys
+import argparse, ast, re, sys
+from typing import Dict, Optional

-def read_charmap(path):
+def read_charmap(path : str, wchar : bool) -> Dict[str,str]:
    with open(path) as infile:
        charmap = infile.read()
-
    charmap = ast.literal_eval(charmap)
-    charmap = { repr(k)[1:-1] : chr(v) for k,v in charmap.items() }

-    return charmap
+    out_charmap = {}
+    for k,v in charmap.items():
+        v = v[wchar]
+        if v is None:
+            v = 0
+        assert isinstance(k, str)
+        assert v in (range(0xFFFF + 1) if wchar else range(0xFF + 1))
+
+        k = repr(k)[1:-1]
+
+        if wchar:
+            u = (v >> 8) & 0xFF
+            l = (v >> 0) & 0xFF
+            out_charmap[k] = f"0x{u:02X}, 0x{l:02X},"
+        else:
+            out_charmap[k] = f"0x{v:02X},"
+
+    return out_charmap

 # From https://stackoverflow.com/questions/241327/remove-c-and-c-comments-using-python
-def remove_comments(text):
-    def replacer(match):
-        s = match.group(0)
-        if s.startswith('/'):
-            return " " # note: a space and not an empty string
+def remove_comments(text : str) -> str:
+    def replacer(match : re.Match) -> str:
+        string : str = match.group(0)
+        if string.startswith("/"):
+            return " "  # note: a space and not an empty string
        else:
-            return s
+            return string

    pattern = re.compile(
-        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
-        re.DOTALL | re.MULTILINE
+        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', re.DOTALL | re.MULTILINE
    )
    return re.sub(pattern, replacer, text)

-def convert_text(text, charmap):
-    def cvt_str(m):
-        string = m.group(0)
+def convert_text(text : str, encoding : str, charmap : Dict[str, str]) -> str:
+    def cvt_str(match : re.Match) -> str:
+        string : str = match.group(0)

-        for orig,char in charmap.items():
-            string = string.replace(orig, char)
+        # strip quotes
+        string = string[1:-1]

-        return string
+        def cvt_escape(s : str):
+            # Convert escape sequences such as "\\\"" to "\""
+            return s.encode("ascii").decode("unicode-escape")
+
+        run_start = 0
+
+        def emit(text : Optional[str], advance : int):
+            nonlocal out, string, i, run_start
+            # flush text
+            to_flush = string[run_start:i]
+            if len(string[run_start:i]) != 0:
+                out += ",".join(f"0x{b:02X}" for b in to_flush.encode(encoding))
+                out += ","
+            if text is None:
+                return
+            # emit + advance source pos
+            out += text
+            i += advance
+            # start new run
+            run_start = i
+
+        out = ""
+
+        i = 0
+        while i != len(string):
+            # check charmap
+            for k in charmap.keys():
+                if string.startswith(k, i):
+                    # is in charmap, emit the mapped sequence
+                    emit(charmap[k], len(k))
+                    break
+            else:
+                if string[i] == "\\" and string[i + 1] != "\\":
+                    # is already escaped, emit the escape sequence verbatim
+                    if string[i + 1] == "x":
+                        # \x**
+                        emit("0" + string[i + 1 : i + 4] + ",", 4)
+                    else:
+                        # \*
+                        e = cvt_escape(string[i : i + 2]).encode(encoding)
+                        assert len(e) == 1
+                        emit(f"0x{e[0]:02X},", 2)
+                else:
+                    # increment pos, accumulating text that requires encoding
+                    i += 1
+
+        # emit remaining accumulated text
+        emit(None, 0)
+
+        return out

    # Naive string matcher, assumes single line strings and no comments, handles escaped quotations
    string_regex = re.compile(r'"((?:[^\\"\n]|\\.)*)"')
@ -50,16 +113,23 @@ def convert_text(text, charmap):
    return text

 def main():
-    parser = argparse.ArgumentParser(description="Encode message_data_static text headers")
+    parser = argparse.ArgumentParser(
+        description="Encode message_data_static text headers"
+    )
    parser.add_argument(
        "input",
        help="path to file to be encoded, or - for stdin",
    )
    parser.add_argument(
-        "--output",
-        "-o",
+        "output",
        help="path to write encoded file, or - for stdout",
+    )
+    parser.add_argument(
+        "--encoding",
+        help="encoding (jpn or nes)",
        required=True,
+        type=str,
+        choices=("jpn", "nes"),
    )
    parser.add_argument(
        "--charmap",
@ -68,7 +138,12 @@ def main():
    )
    args = parser.parse_args()

-    charmap = read_charmap(args.charmap)
+    wchar,encoding = {
+        "jpn" : (True, "SHIFT-JIS"),
+        "nes" : (False, "raw-unicode-escape"),
+    }[args.encoding]
+
+    charmap = read_charmap(args.charmap, wchar)

    text = ""
    if args.input == "-":
@ -78,12 +153,12 @@ def main():
            text = infile.read()

    text = remove_comments(text)
-    text = convert_text(text, charmap)
+    text = convert_text(text, encoding, charmap)

    if args.output == "-":
-        sys.stdout.buffer.write(text.encode("raw_unicode_escape"))
+        sys.stdout.buffer.write(text.encode("utf-8"))
    else:
-        with open(args.output, "w", encoding="raw_unicode_escape") as outfile:
+        with open(args.output, "w") as outfile:
            outfile.write(text)

 if __name__ == "__main__":