#!/usr/bin/env python3 # # message_data_static text encoder # import argparse, ast, re, sys from typing import Dict, Optional def read_charmap(path : str, wchar : bool) -> Dict[str,str]: with open(path) as infile: charmap = infile.read() charmap = ast.literal_eval(charmap) out_charmap = {} for k,v in charmap.items(): assert isinstance(k, str) assert isinstance(v, int) and v in range(0xFFFF + 1) k = repr(k)[1:-1] if wchar or v > 0xFF: # split value across two bytes u = (v >> 8) & 0xFF l = (v >> 0) & 0xFF out_charmap[k] = f"0x{u:02X},0x{l:02X}," else: out_charmap[k] = f"0x{v:02X}," return out_charmap # From https://stackoverflow.com/questions/241327/remove-c-and-c-comments-using-python def remove_comments(text : str) -> str: def replacer(match : re.Match) -> str: string : str = match.group(0) if string.startswith("/"): return " " # note: a space and not an empty string else: return string pattern = re.compile( r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', re.DOTALL | re.MULTILINE ) return re.sub(pattern, replacer, text) def convert_text(text : str, encoding : str, charmap : Dict[str, str]) -> str: def cvt_str(match : re.Match) -> str: string : str = match.group(0) # strip quotes string = string[1:-1] def cvt_escape(s : str): # Convert escape sequences such as "\\\"" to "\"" return s.encode("ascii").decode("unicode-escape") run_start = 0 def emit(text : Optional[str], advance : int): nonlocal out, string, i, run_start # flush text to_flush = string[run_start:i] if len(string[run_start:i]) != 0: out += ",".join(f"0x{b:02X}" for b in to_flush.encode(encoding, "replace")) out += "," if text is None: return # emit + advance source pos out += text i += advance # start new run run_start = i out = "" i = 0 while i != len(string): # check charmap for k in charmap.keys(): if string.startswith(k, i): # is in charmap, emit the mapped sequence emit(charmap[k], len(k)) break else: if string[i] == "\\" and string[i + 1] != "\\": # is already escaped, emit the escape sequence verbatim if string[i + 1] == "x": # \x** emit("0" + string[i + 1 : i + 4] + ",", 4) else: # \* e = cvt_escape(string[i : i + 2]).encode(encoding) assert len(e) == 1 emit(f"0x{e[0]:02X},", 2) else: # increment pos, accumulating text that requires encoding i += 1 # emit remaining accumulated text emit(None, 0) return out # Naive string matcher, assumes single line strings and no comments, handles escaped quotations string_regex = re.compile(r'"((?:[^\\"\n]|\\.)*)"') # Collapse escaped newlines text = text.replace("\\\n", "") # Encode according to charmap text = re.sub(string_regex, cvt_str, text) return text def main(): parser = argparse.ArgumentParser( description="Encode message_data_static text headers" ) parser.add_argument( "input", help="path to file to be encoded, or - for stdin", ) parser.add_argument( "output", help="path to write encoded file, or - for stdout", ) parser.add_argument( "--encoding", help="base text encoding", required=True, type=str, choices=("utf-8", "SHIFT-JIS"), ) parser.add_argument( "--charmap", help="path to charmap file specifying custom encoding elements", required=True, ) parser.add_argument( "--wchar", help="force wide encoding", required=False, action="store_true" ) args = parser.parse_args() charmap = read_charmap(args.charmap, args.wchar) text = "" if args.input == "-": text = sys.stdin.read() else: with open(args.input, "r") as infile: text = infile.read() text = remove_comments(text) text = convert_text(text, args.encoding, charmap) if args.output == "-": sys.stdout.buffer.write(text.encode("utf-8")) else: with open(args.output, "w") as outfile: outfile.write(text) if __name__ == "__main__": main()