Skip to content

Instantly share code, notes, and snippets.

@UserUnknownFactor
Last active June 27, 2025 05:01
Show Gist options
  • Save UserUnknownFactor/571fa194799ba94440ecfbd289795390 to your computer and use it in GitHub Desktop.
Save UserUnknownFactor/571fa194799ba94440ecfbd289795390 to your computer and use it in GitHub Desktop.
Tool for extracting and modifying strings in a Unity IL2CPP global-metadata.dat file, supporting variable-length replacement strings; it can keep the overall string table size the same by trimming/padding disposable strings (those marked with 1 in the 4th CSV column)
#!/usr/bin/env python3
import argparse
import json
import os
import shutil
import sys
from filetranslate.service_fn import read_csv_list, write_csv_list
MAGIC_BYTES = b'\xAF\x1B\xB1\xFA'  # IL2CPP global-metadata signature (header "sanity" field)
# Byte offsets into the metadata header where the string-literal section is described.
LOOKUP_TABLE_DEFINITION_OFFSET = 8  # header field: offset of the (length, index) lookup table
LOOKUP_TABLE_SIZE_DEFINITION_OFFSET = 12  # header field: size in bytes of the lookup table
STRINGLITERAL_DATA_DEFINITION_OFFSET = 16  # header field: offset of the raw string data
STRINGLITERAL_DATA_SIZE_DEFINITION_OFFSET = 20  # header field: size in bytes of the string data
IGNORE_NON_UTF8 = False  # when True, skip the printable/UTF-8 validity check on extraction
class LookupTableEntry:
    """A single lookup-table record: byte length and offset of one literal."""

    length: int
    index: int

    def __init__(self, length: int, index: int):
        # `index` is a byte offset relative to the string-literal data section.
        self.length, self.index = length, index
class StringLiteral:
    """One string literal: raw UTF-8 bytes plus bookkeeping flags.

    The value is kept as bytes internally; text conversion only happens
    at the import/export boundary (CSV/JSON).
    """

    index: int
    data: bytes
    disposable: bool
    valid: bool

    def __init__(self, index: int, data: bytes, disposable: bool = False, valid_utf8: bool = True):
        self.index = index
        self.data = data
        self.disposable = disposable
        self.valid = valid_utf8

    def to_dict(self):
        """Serialize to a JSON-friendly dict, decoding the bytes leniently."""
        return {'index': self.index,
                'value': self.data.decode("utf-8", "ignore"),
                'disposable': self.disposable}

    @staticmethod
    def from_dict(d: dict):
        """Build a StringLiteral from a dict in to_dict() form."""
        if not ('index' in d and 'value' in d):
            raise Exception('Invalid StringLiteral object')
        # Re-encode the text to bytes when importing from JSON
        return StringLiteral(d['index'], d['value'].encode("utf-8"), d.get('disposable', False))

    def get_string_value(self):
        """Return the literal decoded as text (undecodable bytes are dropped)."""
        return self.data.decode("utf-8", "ignore")

    def set_string_value(self, string_value):
        """Replace the stored bytes with the UTF-8 encoding of *string_value*."""
        self.data = string_value.encode("utf-8")

    def __iter__(self):
        # Unpacks as (index, text, disposable) — matches the row order used elsewhere.
        return iter((self.index, self.get_string_value(), self.disposable))

    def __getitem__(self, key):
        # Supports both positional (0..2) and named access.
        if key in (0, 'index'):
            return self.index
        if key in (1, 'value'):
            return self.get_string_value()
        if key in (2, 'disposable'):
            return self.disposable
        if key == 'valid':
            return self.valid
        raise IndexError(f"Invalid index: {key}")

    def __len__(self):
        # Only the three exported fields count; `valid` is internal.
        return 3
class StringLiteralManager:
    """Extracts, rebalances and patches the string-literal section of a
    Unity IL2CPP global-metadata.dat file.

    Typical workflow:
        extract() -> dump_csv()/dump_json()            (export)
        load_modified_strings() -> balance_string_sizes()
            -> update_string_offsets() -> patch()      (reinsert)
    """

    def __init__(self, filepath=None):
        self.filepath = filepath
        self.lookup_table = []      # LookupTableEntry list, parallel to stringliterals
        self.stringliterals = []    # StringLiteral list, one per lookup entry
        self.original_sizes = []    # on-disk byte length of each literal
        self.original_total_size = 0
        if filepath:
            self.extract()

    def extract(self, filepath=None):
        """Extract string literals from a global-metadata.dat file.

        Populates lookup_table, stringliterals and the original size
        bookkeeping used later by balance_string_sizes(). Returns self.
        """
        if filepath:
            self.filepath = filepath
        if not self.filepath:
            raise ValueError("No filepath specified")
        with open(self.filepath, "rb") as f:
            # Validate magic bytes
            if f.read(4) != MAGIC_BYTES:
                raise Exception("Invalid global-metadata file")
            # Read the header fields describing the string-literal section
            f.seek(LOOKUP_TABLE_DEFINITION_OFFSET)
            lookup_table_offset = int.from_bytes(f.read(4), byteorder="little")
            f.seek(LOOKUP_TABLE_SIZE_DEFINITION_OFFSET)
            lookup_table_size = int.from_bytes(f.read(4), byteorder="little")
            f.seek(STRINGLITERAL_DATA_DEFINITION_OFFSET)
            stringliteral_data_offset = int.from_bytes(f.read(4), byteorder="little")
            # The lookup table is a flat array of (length, index) uint32 pairs
            f.seek(lookup_table_offset)
            bytes_read = 0
            while bytes_read < lookup_table_size:
                length = int.from_bytes(f.read(4), byteorder="little")
                index = int.from_bytes(f.read(4), byteorder="little")
                self.lookup_table.append(LookupTableEntry(length, index))
                bytes_read += 8
            # Non-printable whitespace we still accept in valid literals.
            # NOTE: fixed from '\x3000' — \x takes exactly two hex digits, so
            # the old literal was the 3-char string "000" and U+3000
            # (ideographic space) was never actually whitelisted.
            allowed_chars = {'\n', '\t', '\r', '\u3000'}
            # Extract string literals
            for idx, entry in enumerate(self.lookup_table):
                f.seek(stringliteral_data_offset + entry.index)
                literal_data = f.read(entry.length)
                if IGNORE_NON_UTF8:
                    self.stringliterals.append(StringLiteral(idx, literal_data))
                else:
                    try:
                        if any(not c.isprintable() and c not in allowed_chars for c in literal_data.decode("utf-8")):
                            raise ValueError("non-printable characters in literal")
                        self.stringliterals.append(StringLiteral(idx, literal_data))
                    except (UnicodeDecodeError, ValueError):
                        # Keep the literal (offsets must stay intact) but mark it
                        # invalid so exports skip it.
                        #print(f"Invalid text in literal {literal_data} @ {idx}")
                        self.stringliterals.append(StringLiteral(idx, literal_data, valid_utf8=False))
        # Save original string sizes for balancing
        self.original_sizes = [len(s.data) for s in self.stringliterals]
        self.original_total_size = sum(self.original_sizes)
        return self

    def dump_csv(self, output_path):
        """Export valid string literals to a CSV file; returns output_path."""
        array_form = []
        for sl in self.stringliterals:
            if not sl.valid:
                print("Invalid literal:", sl.data)
                continue
            # Decode bytes to string only when dumping to CSV.
            # Columns: original, translation, index, disposable-flag.
            array_form.append([sl.get_string_value(), '', sl.index, ''])
        write_csv_list(output_path, array_form)
        return output_path

    def dump_json(self, output_path):
        """Export valid string literals to a JSON file; returns output_path."""
        with open(output_path, "w", encoding="utf-8") as f:
            string_data = [sl.to_dict() for sl in self.stringliterals if sl.valid]
            f.write(json.dumps(string_data, indent=2, ensure_ascii=False))
        return output_path

    def load_modified_strings(self, filepath):
        """Load modified strings from a CSV or JSON file.

        Returns the set of indices that were actually updated.
        Raises ValueError for unsupported file extensions.
        """
        # Track which indices were updated for the summary report
        updated_indices = set()
        if filepath.endswith('.json'):
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
            for entry in data:
                idx = entry['index']
                if idx >= 0 and idx < len(self.stringliterals):
                    value = entry['value']
                    disposable = entry.get('disposable', False)
                    # Convert to bytes when setting
                    self.stringliterals[idx].data = value.encode("utf-8")
                    self.stringliterals[idx].disposable = disposable
                    updated_indices.add(idx)
                else:
                    print(f"Warning: Index {idx} out of range, skipping")
        elif filepath.endswith('.csv'):
            reader = read_csv_list(filepath)
            for row in reader:
                if len(row) < 3: continue
                try:
                    idx = int(row[2])
                    if idx >= 0 and idx < len(self.stringliterals):
                        new_value = row[1]
                        # 4th column '1' marks a string as disposable (may be
                        # trimmed/padded by balance_string_sizes())
                        disposable = len(row) >= 4 and row[3] == '1'
                        self.stringliterals[idx].disposable = disposable
                        # Commented-out original with no real translation: skip
                        if row[0].startswith("//") and not new_value.startswith("//"): continue
                        # Only update if new value is provided
                        if new_value:
                            self.stringliterals[idx].data = new_value.encode("utf-8")
                            updated_indices.add(idx)
                    else:
                        print(f"Warning: Index {idx} out of range, skipping")
                except (ValueError, KeyError) as e:
                    print(f"Warning: Invalid row in CSV, skipping: {e}")
        else:
            raise ValueError("Unsupported file format. Use .json or .csv")
        # Report on modification stats
        print(f"Modified {len(updated_indices)} strings out of {len(self.stringliterals)} total")
        print(f"Kept {len(self.stringliterals) - len(updated_indices)} original strings")
        return updated_indices

    def balance_string_sizes(self):
        """Balance string sizes to match the original total size.

        Trims disposable strings when the new data is too large, or pads
        the last disposable string with spaces when it is too small, so
        the string table can be rewritten in place. Returns self.
        """
        original_size = self.original_total_size
        new_size = sum(len(sl.data) for sl in self.stringliterals)
        if new_size > original_size:
            excess_bytes = new_size - original_size
            print(f"New strings exceed original size by {excess_bytes} bytes")
            # Collect disposable strings, largest first, so we trim as few as possible
            disposable_strings = [(idx, len(sl.data))
                                  for idx, sl in enumerate(self.stringliterals)
                                  if sl.disposable]
            disposable_strings.sort(key=lambda x: x[1], reverse=True)
            bytes_trimmed = 0
            for idx, size in disposable_strings:
                if bytes_trimmed >= excess_bytes:
                    break
                string_literal = self.stringliterals[idx]
                max_trim = min(size - 1, excess_bytes - bytes_trimmed)  # Keep at least 1 byte
                if max_trim > 0:
                    new_length = size - max_trim
                    # Back off until the cut lands on a valid UTF-8 boundary
                    while new_length > 0:
                        try:
                            trimmed = string_literal.data[:new_length].decode("utf-8")
                            string_literal.data = trimmed.encode("utf-8")
                            break
                        except UnicodeDecodeError:
                            new_length -= 1
                    if new_length > 0:
                        bytes_trimmed += size - len(string_literal.data)
                        print(f"Trimmed string {idx} from {size} to {len(string_literal.data)} bytes")
            # If we still couldn't trim enough, warn the user
            if bytes_trimmed < excess_bytes:
                print(f"Warning: Could only trim {bytes_trimmed} of {excess_bytes} excess bytes")
                print("The resulting file may be larger than the original")
        elif new_size < original_size:
            deficit_bytes = original_size - new_size
            print(f"New strings are {deficit_bytes} bytes smaller than original")
            disposable_strings = [idx for idx, sl in enumerate(self.stringliterals)
                                  if sl.disposable]
            if disposable_strings:
                # Pad the disposable string with the highest index with spaces
                last_idx = max(disposable_strings)
                string_literal = self.stringliterals[last_idx]
                # Decode to string, add spaces, then encode back to bytes
                string_value = string_literal.get_string_value()
                padded_value = string_value + ' ' * deficit_bytes
                string_literal.data = padded_value.encode("utf-8")
                actual_padding = len(string_literal.data) - len(string_value.encode("utf-8"))
                print(f"Added a total of {actual_padding} bytes of padding")
            else:
                print("No disposable strings found for padding")
        return self

    def update_string_offsets(self):
        """Recompute lengths and offsets in the lookup table.

        Must be called after any data modification and before patch().
        Returns self.
        """
        # Update lengths based on new string values
        for idx, string_literal in enumerate(self.stringliterals):
            self.lookup_table[idx].length = len(string_literal.data)
        # Recalculate offsets: literals are packed back-to-back from 0
        index = 0
        for entry in self.lookup_table:
            entry.index = index
            index += entry.length
        return self

    def patch(self, output_filepath):
        """Write a patched copy of the metadata file with modified strings.

        Rewrites in place when the new data fits in the original section;
        otherwise appends the data at EOF and redirects the header offset.
        Returns output_filepath.
        """
        # Work on a copy so the original file stays untouched
        shutil.copy2(self.filepath, output_filepath)
        new_total_size = sum(len(s.data) for s in self.stringliterals)
        in_place_replacement = new_total_size <= self.original_total_size
        with open(output_filepath, "rb+") as f:
            # Get string literals offset from header
            f.seek(STRINGLITERAL_DATA_DEFINITION_OFFSET)
            stringliteral_data_offset = int.from_bytes(f.read(4), byteorder="little")
            if in_place_replacement:
                # In-place replacement: write strings at their original location
                print("Performing in-place string replacement")
                # Update lookup table first (lengths might have changed)
                f.seek(LOOKUP_TABLE_DEFINITION_OFFSET)
                lookup_table_offset = int.from_bytes(f.read(4), byteorder="little")
                f.seek(lookup_table_offset)
                for entry in self.lookup_table:
                    f.write(entry.length.to_bytes(4, byteorder="little"))
                    f.write(entry.index.to_bytes(4, byteorder="little"))
                # Write strings at their (recalculated) locations
                for string_literal in self.stringliterals:
                    entry = self.lookup_table[string_literal.index]
                    f.seek(stringliteral_data_offset + entry.index)
                    f.write(string_literal.data)
            else:
                # Append mode: add strings at the end of the file
                print("Appending strings to end of file (balancing was insufficient)")
                f.seek(0, os.SEEK_END)
                new_strings_offset = f.tell()
                # Append all strings back-to-back at EOF (matches the
                # offsets produced by update_string_offsets())
                for entry in self.stringliterals:
                    f.write(entry.data)
                # Update lookup table
                f.seek(LOOKUP_TABLE_DEFINITION_OFFSET)
                lookup_table_offset = int.from_bytes(f.read(4), byteorder="little")
                f.seek(lookup_table_offset)
                for entry in self.lookup_table:
                    f.write(entry.length.to_bytes(4, byteorder="little"))
                    f.write(entry.index.to_bytes(4, byteorder="little"))
                # Redirect the header's string-data offset to the appended block
                f.seek(STRINGLITERAL_DATA_DEFINITION_OFFSET)
                f.write(new_strings_offset.to_bytes(4, byteorder="little"))
        return output_filepath
def main():
    """CLI entry point: `extract` dumps strings, `reinsert` patches them back.

    Returns a process exit code: 0 on success, 1 when no command is given.
    """
    parser = argparse.ArgumentParser(
        description='Extract and reinsert strings in Unity IL2CPP global-metadata.dat'
    )
    subparsers = parser.add_subparsers(dest='command', help='Command to execute')
    # Extract command.
    # NOTE: nargs='?' makes the positionals optional so the declared defaults
    # actually apply — argparse silently ignores `default` on required positionals.
    extract_parser = subparsers.add_parser('extract', help='Extract strings from global-metadata.dat')
    extract_parser.add_argument('input', nargs='?', default='global-metadata.dat', help='Path to global-metadata.dat file')
    extract_parser.add_argument('output', nargs='?', default='global-metadata_strings.csv', help='Path to output CSV or JSON file')
    # Reinsert command
    reinsert_parser = subparsers.add_parser('reinsert', help='Reinsert strings into global-metadata.dat')
    reinsert_parser.add_argument('input_metadata', nargs='?', default='global-metadata.dat', help='Path to original global-metadata.dat file')
    reinsert_parser.add_argument('input_strings', nargs='?', default='global-metadata_strings.csv', help='Path to CSV or JSON file with modified strings')
    reinsert_parser.add_argument('output_metadata', nargs='?', default='global-metadata.dat_patched', help='Path to output modified global-metadata.dat file')
    args = parser.parse_args()
    if args.command == 'extract':
        manager = StringLiteralManager(args.input)
        # Output format is chosen from the file extension (JSON vs CSV)
        if args.output.endswith('.json'):
            manager.dump_json(args.output)
        else:
            manager.dump_csv(args.output)
        print(f"Successfully extracted {len(manager.stringliterals)} strings to {args.output}")
    elif args.command == 'reinsert':
        manager = StringLiteralManager(args.input_metadata)
        manager.load_modified_strings(args.input_strings)
        manager.balance_string_sizes()
        manager.update_string_offsets()
        manager.patch(args.output_metadata)
        print(f"Successfully patched metadata file: {args.output_metadata}")
    else:
        # No subcommand given: show usage and signal failure
        parser.print_help()
        return 1
    return 0
if __name__ == "__main__":
sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment