Tool for extracting strings from a Unity IL2CPP global-metadata.dat file and reinserting variable-length replacements; it can keep the overall string table size unchanged by trimming or padding disposable strings (those marked with 1 in the 4th CSV column).
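The CSV produced by the extract command has four columns per row: original string, replacement string, string index, disposable flag. A hypothetical row (shown comma-separated for illustration; the actual delimiter and quoting are whatever filetranslate's write_csv_list/read_csv_list use) that replaces string 1042 and marks it disposable:

    Attack,Angriff,1042,1

Rows with an empty 2nd column keep their original string, and rows whose 1st column starts with // are skipped unless the replacement also starts with //.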
#!/usr/bin/env python3
import argparse
import json
import os
import shutil
import sys

from filetranslate.service_fn import read_csv_list, write_csv_list

MAGIC_BYTES = b'\xAF\x1B\xB1\xFA'
LOOKUP_TABLE_DEFINITION_OFFSET = 8
LOOKUP_TABLE_SIZE_DEFINITION_OFFSET = 12
STRINGLITERAL_DATA_DEFINITION_OFFSET = 16
STRINGLITERAL_DATA_SIZE_DEFINITION_OFFSET = 20

IGNORE_NON_UTF8 = False
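
# Header layout assumed here (little-endian uint32 fields of the
# Il2CppGlobalMetadataHeader): magic/sanity value at offset 0, string literal
# lookup table offset at 8 and its byte size at 12, string literal data blob
# offset at 16 and its byte size at 20. Each lookup table entry is a pair of
# uint32s: string length in bytes, then offset into the data blob.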

class LookupTableEntry:
    length: int
    index: int

    def __init__(self, length: int, index: int):
        self.length = length
        self.index = index


class StringLiteral:
    index: int
    data: bytes
    disposable: bool
    valid: bool

    def __init__(self, index: int, data: bytes, disposable: bool = False, valid_utf8: bool = True):
        self.index = index
        self.data = data
        self.disposable = disposable
        self.valid = valid_utf8

    def to_dict(self):
        return {
            'index': self.index,
            'value': self.data.decode("utf-8", "ignore"),
            'disposable': self.disposable
        }

    @staticmethod
    def from_dict(d: dict):
        if 'index' not in d or 'value' not in d:
            raise Exception('Invalid StringLiteral object')
        disposable = d.get('disposable', False)
        # Encode string to bytes when importing from JSON
        return StringLiteral(d['index'], d['value'].encode("utf-8"), disposable)

    def get_string_value(self):
        # Helper method to get string representation when needed
        return self.data.decode("utf-8", "ignore")

    def set_string_value(self, string_value):
        # Helper method to set from string when needed
        self.data = string_value.encode("utf-8")

    def __iter__(self):
        yield self.index
        yield self.get_string_value()
        yield self.disposable

    def __getitem__(self, key):
        if key == 0 or key == 'index':
            return self.index
        elif key == 1 or key == 'value':
            return self.get_string_value()
        elif key == 2 or key == 'disposable':
            return self.disposable
        elif key == 'valid':
            return self.valid
        else:
            raise IndexError(f"Invalid index: {key}")

    def __len__(self):
        return 3

class StringLiteralManager:
    def __init__(self, filepath=None):
        self.filepath = filepath
        self.lookup_table = []
        self.stringliterals = []
        self.original_sizes = []
        self.original_total_size = 0
        if filepath:
            self.extract()

    def extract(self, filepath=None):
        """Extract string literals from a global-metadata.dat file"""
        if filepath:
            self.filepath = filepath
        if not self.filepath:
            raise ValueError("No filepath specified")
        with open(self.filepath, "rb") as f:
            # Validate magic bytes
            if f.read(4) != MAGIC_BYTES:
                raise Exception("Invalid global-metadata file")
            # Get offsets and sizes
            f.seek(LOOKUP_TABLE_DEFINITION_OFFSET)
            lookup_table_offset = int.from_bytes(f.read(4), byteorder="little")
            f.seek(LOOKUP_TABLE_SIZE_DEFINITION_OFFSET)
            lookup_table_size = int.from_bytes(f.read(4), byteorder="little")
            f.seek(STRINGLITERAL_DATA_DEFINITION_OFFSET)
            stringliteral_data_offset = int.from_bytes(f.read(4), byteorder="little")
            # Extract lookup table: each entry is two little-endian uint32s,
            # the string length and its offset into the string data blob
            f.seek(lookup_table_offset)
            bytes_read = 0
            while bytes_read < lookup_table_size:
                length = int.from_bytes(f.read(4), byteorder="little")
                index = int.from_bytes(f.read(4), byteorder="little")
                self.lookup_table.append(LookupTableEntry(length, index))
                bytes_read += 8
            # Extract string literals
            for idx, entry in enumerate(self.lookup_table):
                f.seek(stringliteral_data_offset + entry.index)
                literal_data = f.read(entry.length)
                if IGNORE_NON_UTF8:
                    self.stringliterals.append(StringLiteral(idx, literal_data))
                else:
                    try:
                        # Allow common whitespace controls and the ideographic space U+3000
                        allowed_chars = {'\n', '\t', '\r', '\u3000'}
                        if any(not c.isprintable() and c not in allowed_chars for c in literal_data.decode("utf-8")):
                            raise ValueError("non-printable character")
                        self.stringliterals.append(StringLiteral(idx, literal_data))
                    except (UnicodeDecodeError, ValueError):
                        #print(f"Invalid text in literal {literal_data} @ {idx}")
                        self.stringliterals.append(StringLiteral(idx, literal_data, valid_utf8=False))
        # Save original string sizes for balancing
        self.original_sizes = [len(s.data) for s in self.stringliterals]
        self.original_total_size = sum(self.original_sizes)
        return self
    def dump_csv(self, output_path):
        """Export string literals to CSV file"""
        array_form = []
        for sl in self.stringliterals:
            if not sl.valid:
                print("Invalid literal:", sl.data)
                continue
            # Decode bytes to string only when dumping to CSV;
            # columns: original string, translation (empty), index, disposable flag (empty)
            array_form.append([sl.get_string_value(), '', sl.index, ''])
        write_csv_list(output_path, array_form)
        return output_path

    def dump_json(self, output_path):
        """Export string literals to JSON file"""
        with open(output_path, "w", encoding="utf-8") as f:
            string_data = [sl.to_dict() for sl in self.stringliterals if sl.valid]
            f.write(json.dumps(string_data, indent=2, ensure_ascii=False))
        return output_path
    def load_modified_strings(self, filepath):
        """Load modified strings from a CSV or JSON file"""
        # Set to track which indices were updated
        updated_indices = set()
        if filepath.endswith('.json'):
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
                for entry in data:
                    idx = entry['index']
                    if 0 <= idx < len(self.stringliterals):
                        value = entry['value']
                        disposable = entry.get('disposable', False)
                        # Convert to bytes when setting
                        self.stringliterals[idx].data = value.encode("utf-8")
                        self.stringliterals[idx].disposable = disposable
                        updated_indices.add(idx)
                    else:
                        print(f"Warning: Index {idx} out of range, skipping")
        elif filepath.endswith('.csv'):
            reader = read_csv_list(filepath)
            for row in reader:
                if len(row) < 3:
                    continue
                try:
                    idx = int(row[2])
                    if 0 <= idx < len(self.stringliterals):
                        new_value = row[1]
                        disposable = len(row) >= 4 and row[3] == '1'
                        self.stringliterals[idx].disposable = disposable
                        # Skip rows commented out with // unless the replacement is also commented
                        if row[0].startswith("//") and not new_value.startswith("//"):
                            continue
                        # Only update if a new value is provided
                        if new_value:
                            self.stringliterals[idx].data = new_value.encode("utf-8")
                            updated_indices.add(idx)
                    else:
                        print(f"Warning: Index {idx} out of range, skipping")
                except (ValueError, KeyError) as e:
                    print(f"Warning: Invalid row in CSV, skipping: {e}")
        else:
            raise ValueError("Unsupported file format. Use .json or .csv")
        # Report on modification stats
        print(f"Modified {len(updated_indices)} strings out of {len(self.stringliterals)} total")
        print(f"Kept {len(self.stringliterals) - len(updated_indices)} original strings")
        return updated_indices
    def balance_string_sizes(self):
        """Balance string sizes to match the original total size"""
        # Calculate total sizes before and after modifications
        original_size = self.original_total_size
        new_size = sum(len(s.data) for s in self.stringliterals)
        # If the new size is larger, we need to trim disposable strings
        if new_size > original_size:
            excess_bytes = new_size - original_size
            print(f"New strings exceed original size by {excess_bytes} bytes")
            # First, identify all disposable strings
            disposable_strings = []
            for idx, string_literal in enumerate(self.stringliterals):
                if string_literal.disposable:
                    disposable_strings.append((idx, len(string_literal.data)))
            # Sort disposable strings by length (descending) to trim larger strings first
            disposable_strings.sort(key=lambda x: x[1], reverse=True)
            # Trim disposable strings to fit within the original size
            bytes_trimmed = 0
            for idx, size in disposable_strings:
                if bytes_trimmed >= excess_bytes:
                    break
                string_literal = self.stringliterals[idx]
                max_trim = min(size - 1, excess_bytes - bytes_trimmed)  # Keep at least 1 byte
                if max_trim > 0:
                    # Trim the string, backing off until the cut lands on a valid UTF-8 boundary
                    new_length = size - max_trim
                    while new_length > 0:
                        try:
                            trimmed = string_literal.data[:new_length].decode("utf-8")
                            string_literal.data = trimmed.encode("utf-8")
                            break
                        except UnicodeDecodeError:
                            new_length -= 1
                    if new_length > 0:
                        bytes_trimmed += size - len(string_literal.data)
                        print(f"Trimmed string {idx} from {size} to {len(string_literal.data)} bytes")
            # If we still couldn't trim enough, warn the user
            if bytes_trimmed < excess_bytes:
                print(f"Warning: Could only trim {bytes_trimmed} of {excess_bytes} excess bytes")
                print("The resulting file may be larger than the original")
        # If the new size is smaller, we can pad a disposable string to match the original size
        elif new_size < original_size:
            deficit_bytes = original_size - new_size
            print(f"New strings are {deficit_bytes} bytes smaller than original")
            # Identify all disposable strings
            disposable_strings = [idx for idx, sl in enumerate(self.stringliterals) if sl.disposable]
            if disposable_strings:
                # Pad the last (highest-index) disposable string to absorb the whole deficit
                last_idx = disposable_strings[-1]
                if deficit_bytes > 0:
                    string_literal = self.stringliterals[last_idx]
                    # Decode to string, add spaces, then encode back to bytes
                    string_value = string_literal.get_string_value()
                    padded_value = string_value + ' ' * deficit_bytes
                    string_literal.data = padded_value.encode("utf-8")
                    actual_padding = len(string_literal.data) - len(string_value.encode("utf-8"))
                    print(f"Added a total of {actual_padding} bytes of padding")
            else:
                print("No disposable strings found for padding")
        return self
    def update_string_offsets(self):
        """Update string offsets in the lookup table"""
        # Update lengths based on new string values
        for idx, string_literal in enumerate(self.stringliterals):
            self.lookup_table[idx].length = len(string_literal.data)
        # Recalculate indices
        index = 0
        for entry in self.lookup_table:
            entry.index = index
            index += entry.length
        return self
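
    # Note: after the repack above, entry.index values assume the strings are
    # laid out back to back; patch() below writes each string at
    # stringliteral_data_offset + entry.index, so update_string_offsets() must
    # run after any length changes and before patch().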
    def patch(self, output_filepath):
        """Create a patched metadata file with modified strings"""
        # Create a copy of the original file
        shutil.copy2(self.filepath, output_filepath)
        # Calculate total size after modifications
        new_total_size = sum(len(s.data) for s in self.stringliterals)
        in_place_replacement = new_total_size <= self.original_total_size
        with open(output_filepath, "rb+") as f:
            # Get string literals offset from header
            f.seek(STRINGLITERAL_DATA_DEFINITION_OFFSET)
            stringliteral_data_offset = int.from_bytes(f.read(4), byteorder="little")
            if in_place_replacement:
                # In-place replacement: write strings at their original location
                print("Performing in-place string replacement")
                # Update lookup table first (lengths might have changed)
                f.seek(LOOKUP_TABLE_DEFINITION_OFFSET)
                lookup_table_offset = int.from_bytes(f.read(4), byteorder="little")
                f.seek(lookup_table_offset)
                for entry in self.lookup_table:
                    f.write(entry.length.to_bytes(4, byteorder="little"))
                    f.write(entry.index.to_bytes(4, byteorder="little"))
                # Write strings at their original locations
                for string_literal in self.stringliterals:
                    entry = self.lookup_table[string_literal.index]
                    f.seek(stringliteral_data_offset + entry.index)
                    f.write(string_literal.data)
            else:
                # Append mode: add strings at the end of the file
                print("Appending strings to end of file (balancing was insufficient)")
                # Get end of file position for appending strings
                f.seek(0, os.SEEK_END)
                new_strings_offset = f.tell()
                # Append all strings at the end
                for entry in self.stringliterals:
                    f.write(entry.data)
                # Update lookup table
                f.seek(LOOKUP_TABLE_DEFINITION_OFFSET)
                lookup_table_offset = int.from_bytes(f.read(4), byteorder="little")
                f.seek(lookup_table_offset)
                for entry in self.lookup_table:
                    f.write(entry.length.to_bytes(4, byteorder="little"))
                    f.write(entry.index.to_bytes(4, byteorder="little"))
                # Update string data offset in header
                f.seek(STRINGLITERAL_DATA_DEFINITION_OFFSET)
                f.write(new_strings_offset.to_bytes(4, byteorder="little"))
        return output_filepath

def main():
    parser = argparse.ArgumentParser(
        description='Extract and reinsert strings in Unity IL2CPP global-metadata.dat'
    )
    subparsers = parser.add_subparsers(dest='command', help='Command to execute')
    # Extract command (nargs='?' makes the positionals optional so the defaults apply)
    extract_parser = subparsers.add_parser('extract', help='Extract strings from global-metadata.dat')
    extract_parser.add_argument('input', nargs='?', default='global-metadata.dat', help='Path to global-metadata.dat file')
    extract_parser.add_argument('output', nargs='?', default='global-metadata_strings.csv', help='Path to output CSV or JSON file')
    # Reinsert command
    reinsert_parser = subparsers.add_parser('reinsert', help='Reinsert strings into global-metadata.dat')
    reinsert_parser.add_argument('input_metadata', nargs='?', default='global-metadata.dat', help='Path to original global-metadata.dat file')
    reinsert_parser.add_argument('input_strings', nargs='?', default='global-metadata_strings.csv', help='Path to CSV or JSON file with modified strings')
    reinsert_parser.add_argument('output_metadata', nargs='?', default='global-metadata.dat_patched', help='Path to output modified global-metadata.dat file')
    args = parser.parse_args()
    if args.command == 'extract':
        manager = StringLiteralManager(args.input)
        if args.output.endswith('.json'):
            manager.dump_json(args.output)
        else:
            manager.dump_csv(args.output)
        print(f"Successfully extracted {len(manager.stringliterals)} strings to {args.output}")
    elif args.command == 'reinsert':
        manager = StringLiteralManager(args.input_metadata)
        manager.load_modified_strings(args.input_strings)
        manager.balance_string_sizes()
        manager.update_string_offsets()
        manager.patch(args.output_metadata)
        print(f"Successfully patched metadata file: {args.output_metadata}")
    else:
        parser.print_help()
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
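
Typical command-line workflow (the script filename below is an assumption; use whatever name you saved this gist under):

    python il2cpp_metadata_strings.py extract global-metadata.dat global-metadata_strings.csv
    python il2cpp_metadata_strings.py reinsert global-metadata.dat global-metadata_strings.csv global-metadata.dat_patched

Between the two steps, fill the 2nd CSV column with replacement strings and put 1 in the 4th column of any strings that may be safely trimmed or padded to keep the string table size unchanged.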