Python tool to translate strings in an x86 .exe by repointing their references to a newly created .trans PE section
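Typical round trip, sketched from the functions the two scripts below define (file names are just their defaults): extract strings to a CSV, fill in the second column with translations, then patch:

    strings = extract_strings_from_exe('Original.exe')   # rows of [text, '', file_offset]
    write_csv_list('Original_strings.csv', strings)      # fill in column 2 with translations
    patch_exe_with_translations('Original.exe', 'Original_strings.csv', 'translation_out\\Original.exe')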
import struct
import pefile
import capstone
from tqdm import tqdm

# Disassembly caching is optional; fall back gracefully if diskcache is missing
CACHE_DIR = "__pycache__"
cache = None
ENABLE_CACHE = True
try:
    from diskcache import Cache
    cache = Cache(CACHE_DIR)
except ImportError:
    ENABLE_CACHE = False

from filetranslate.service_fn import read_csv_list, write_csv_list
def patch_exe_with_translations(exe_path, csv_path, output_path, allowed_sections=(b'.text',)):
    # Read translations from CSV; each row is [original, translation, file_offset]
    translations = read_csv_list(csv_path)
    for row in translations:
        if len(row) >= 3:
            offset_str = row[2]
            if offset_str:
                row[2] = int(offset_str, 16) if offset_str.startswith('0x') else int(offset_str)
    if not translations:
        print(f"Empty {csv_path}")
        return False
    print(f"Using {csv_path} with {len(translations)} lines to translate {exe_path}...")
    # Parse the PE file
    pe = pefile.PE(exe_path)

    # Initialize the disassembler for 32-bit x86
    md = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_32)
    md.detail = True
    md.skipdata = True

    # Place the new section after the last one, aligned up to SectionAlignment
    last_section = pe.sections[-1]
    new_section_offset = (last_section.VirtualAddress +
        last_section.Misc_VirtualSize +
        pe.OPTIONAL_HEADER.SectionAlignment - 1) & ~(pe.OPTIONAL_HEADER.SectionAlignment - 1)

    # Accumulators for the new section and the patch lists
    new_section_data = bytearray()
    va_of_new_strings = {}     # maps original string RVA -> new string RVA
    patch_locations = []       # (file_offset, new_va) tuples
    patch_locations_byte = []  # (file_offset, new_byte) tuples
    reference_map = {}         # immediate VA -> list of referencing instructions
print("Preprocessing sections...") | |
for sn, section in enumerate(pe.sections): | |
if not any(section.Name.startswith(allowed) for allowed in allowed_sections): | |
continue | |
section_data = section.get_data() | |
#section_va = section.VirtualAddress + pe.OPTIONAL_HEADER.ImageBase | |
section_offset = section.PointerToRawData | |
reference_map = None | |
if ENABLE_CACHE and cache: | |
reference_map = cache.get(f"cached_disasm_of_{section.Name}") | |
if reference_map is None and ENABLE_CACHE: | |
reference_map = {} | |
has_prior_push = False | |
for insn in md.disasm(section_data, 0): | |
if '.byte' == insn.mnemonic: | |
continue | |
for op in insn.operands: | |
if op.type == capstone.CS_OP_IMM: | |
imm_pos = insn.imm_offset | |
if imm_pos is not None: | |
if op.imm not in reference_map: | |
reference_map[op.imm] = [] | |
inst_offset_in_section = insn.address + imm_pos | |
imm_file_offset = section_offset + inst_offset_in_section | |
#for section in pe.sections: | |
#if section.PointerToRawData <= string_offset < (section.PointerToRawData + section.SizeOfRawData): | |
#pass | |
reference_map[op.imm].append((insn.address, imm_file_offset, sn, has_prior_push and insn.mnemonic == 'push')) | |
has_prior_push = insn.mnemonic == 'push' and insn.imm_size == 1 | |
cache.set(f"cached_disasm_of_{section.Name}", reference_map) | |
print(f"Collected {sum(len(v) for v in reference_map.values())} operand references.") | |
    # Add all translations to the new section
    current_offset = 0
    image_base = pe.OPTIONAL_HEADER.ImageBase
    for i, row in enumerate(tqdm(translations, desc="Processing translations"), start=1):
        if len(row) < 3:
            print(f"Error in row {row}")
            continue
        original = row[0]
        translation = row[1]
        # Skip rows commented out in the source column only
        if original.startswith('//') and not translation.startswith('//'):
            continue
        string_offset = row[2]
        if not string_offset:
            print(f"Error in row {row}")
            continue

        # Convert the file offset to an RVA via the section that contains it
        string_rva = None
        for section in pe.sections:
            if section.PointerToRawData <= string_offset < (section.PointerToRawData + section.SizeOfRawData):
                string_rva = string_offset - section.PointerToRawData + section.VirtualAddress
                break
        if string_rva is None:
            print(f"Warning: Could not map file offset {string_offset} to RVA")
            continue

        # Store the mapping from the original string RVA to the new string RVA
        va_of_new_strings[string_rva] = new_section_offset + current_offset

        # Append the translated string (UTF-16LE, double-NUL terminated) to the new section
        encoded_translation = translation.encode('utf-16le') + b'\0\0'
        new_section_data.extend(encoded_translation)
        current_offset += len(encoded_translation)

        # Patch every collected reference to this string's VA
        for insn_offset, imm_file_offset, ref_sn, has_prior_push in reference_map.get(
                string_rva + image_base, []):
            section_data = pe.sections[ref_sn].get_data()
            new_va = image_base + va_of_new_strings[string_rva]
            patch_locations.append((imm_file_offset, new_va))
            # Fix string lengths pushed to the stack right before the string address.
            # NOTE: DANGER ZONE - byte-pattern heuristic for: push LEN; push STR => 6A XX 68 XX XX XX XX
            tl_len = len(translation)
            if has_prior_push and section_data[insn_offset - 2] == 0x6A:
                if section_data[insn_offset - 1] == len(original):
                    # Cap at 127 because a short push immediate sign-extends its operand
                    if tl_len > 127:
                        print(f"\nNeed to fix line {i} = {translation}")
                    patch_locations_byte.append((imm_file_offset - 2, min(127, tl_len)))
            if (insn_offset < len(section_data) and section_data[insn_offset] == 0x68
                    and section_data[insn_offset - 3] == 0x6A):
                if section_data[insn_offset - 2] == len(original):
                    if tl_len > 127:
                        print(f"\nNeed to fix line {i} = {translation}")
                    patch_locations_byte.append((imm_file_offset - 3, min(127, tl_len)))
    # Align the section size to the file alignment
    aligned_size = (len(new_section_data) + pe.OPTIONAL_HEADER.FileAlignment - 1) & ~(pe.OPTIONAL_HEADER.FileAlignment - 1)
    new_section_data.extend(b'\0' * (aligned_size - len(new_section_data)))

    # Calculate where the new section will be placed in the file
    new_section_raw_pointer = (
        last_section.PointerToRawData +
        last_section.SizeOfRawData +
        pe.OPTIONAL_HEADER.FileAlignment - 1) & ~(pe.OPTIONAL_HEADER.FileAlignment - 1)

    # Create a new section header directly after the existing ones
    new_section = pefile.SectionStructure(pe.__IMAGE_SECTION_HEADER_format__)
    new_section.set_file_offset(pe.sections[-1].get_file_offset() + pe.sections[-1].sizeof())

    # Set the section properties
    new_section_name = b'.trans'
    new_section.Name = new_section_name[:8] + b'\0' * max(8 - len(new_section_name), 0)
    new_section.Misc = len(new_section_data)  # VirtualSize
    new_section.VirtualAddress = new_section_offset
    new_section.SizeOfRawData = aligned_size
    new_section.PointerToRawData = new_section_raw_pointer
    new_section.PointerToRelocations = 0
    new_section.PointerToLinenumbers = 0
    new_section.NumberOfRelocations = 0
    new_section.NumberOfLinenumbers = 0
    new_section.Characteristics = 0x40000040  # IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA

    # Update the PE header
    pe.FILE_HEADER.NumberOfSections += 1
    pe.OPTIONAL_HEADER.SizeOfImage = (
        new_section_offset + aligned_size +
        pe.OPTIONAL_HEADER.SectionAlignment - 1) & ~(pe.OPTIONAL_HEADER.SectionAlignment - 1)
    # Read the original exe into a mutable buffer
    with open(exe_path, 'rb') as f:
        exe_data = bytearray(f.read())

    # Apply the collected patches to the original data
    for file_offset, new_va in patch_locations:
        struct.pack_into('<I', exe_data, file_offset, new_va)
    for file_offset, new_byte in patch_locations_byte:
        exe_data[file_offset] = new_byte

    # Write the modified exe
    with open(output_path, 'wb') as f:
        # Locate the section table
        section_table_offset = pe.DOS_HEADER.e_lfanew + 4 + pe.FILE_HEADER.sizeof() + pe.FILE_HEADER.SizeOfOptionalHeader
        # Update NumberOfSections in the file header
        struct.pack_into('<H', exe_data, pe.DOS_HEADER.e_lfanew + 6, pe.FILE_HEADER.NumberOfSections)
        # Update SizeOfImage in the optional header (56 bytes into it)
        image_size_offset = pe.DOS_HEADER.e_lfanew + 4 + pe.FILE_HEADER.sizeof() + 56
        struct.pack_into('<I', exe_data, image_size_offset, pe.OPTIONAL_HEADER.SizeOfImage)
        # Write everything up to the section table
        f.write(exe_data[:section_table_offset])
        # Write all original section headers
        for section in pe.sections:
            f.write(section.__pack__())
        # Write our new section header (it overwrites padding after the original table)
        new_section_header = new_section.__pack__()
        f.write(new_section_header)
        # The original section table ends after NumberOfSections - 1 forty-byte headers
        original_section_table_end = section_table_offset + (pe.FILE_HEADER.NumberOfSections - 1) * 40
        # Write the data between the section table and our new section
        if new_section_raw_pointer > original_section_table_end:
            f.write(exe_data[original_section_table_end + len(new_section_header):new_section_raw_pointer])
        else:
            # Seek to where our section should start
            f.seek(new_section_raw_pointer)
        # Write our new section data
        f.write(new_section_data)

    if pe.OPTIONAL_HEADER.CheckSum:
        # Recalculate the checksum with pefile
        try:
            patched_pe = pefile.PE(output_path)
            patched_pe.OPTIONAL_HEADER.CheckSum = patched_pe.generate_checksum()
            patched_pe.write(filename=output_path)
        except Exception as e:
            print(f"Warning: Could not recalculate checksum: {e}")

    print(f"Patched {len(patch_locations)} references to {len(va_of_new_strings)} strings")
    return True
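# By default only .text is scanned for references; allowed_sections can widen the
# scan. A hedged example (the extra section name here is illustrative only):
#   patch_exe_with_translations('Original.exe', 'Original_strings.csv',
#                               'translation_out\\Original.exe',
#                               allowed_sections=(b'.text', b'.code'))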
def find_instruction_references(exe_path, target_addresses, output_csv=None):
    """
    Find all instructions in the code section that reference specific addresses.

    Args:
        exe_path: Path to the executable
        target_addresses: List of virtual addresses to find references to
        output_csv: Optional path to save results to CSV
    """
    pe = pefile.PE(exe_path)
    # Initialize the disassembler
    md = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_32)
    md.detail = True
    results = []
    # Process each code section
    for section in [s for s in pe.sections if s.Name.startswith(b'.text')]:
        section_data = section.get_data()
        section_va = section.VirtualAddress + pe.OPTIONAL_HEADER.ImageBase
        # Disassemble the section
        for insn in md.disasm(section_data, section_va):
            # Check each operand for references to target addresses
            for op in insn.operands:
                if hasattr(op, 'imm') and op.imm in target_addresses:
                    # Calculate the file offset of the instruction
                    file_offset = section.PointerToRawData + (insn.address - section_va)
                    results.append({
                        'address': insn.address,
                        'file_offset': file_offset,
                        'mnemonic': insn.mnemonic,
                        'op_str': insn.op_str,
                        'bytes': ' '.join(f'{b:02x}' for b in insn.bytes),
                        'target_address': op.imm
                    })
    # Optionally save to CSV
    if output_csv and results:
        headers = ['address', 'file_offset', 'mnemonic', 'op_str', 'bytes', 'target_address']
        csv_data = [headers] + [[str(row[h]) for h in headers] for row in results]
        write_csv_list(output_csv, csv_data)
    return results
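# find_instruction_references is a standalone debugging helper that main() never
# calls; a minimal sketch of using it (the addresses are hypothetical):
#   refs = find_instruction_references('Original.exe', [0x465F20, 0x466034], 'string_refs.csv')
#   for r in refs:
#       print(f"{r['address']:#x}: {r['mnemonic']} {r['op_str']} -> {r['target_address']:#x}")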
def main():
    exe_path = 'Original.exe'
    csv_path = exe_path.replace('.exe', '_strings.csv')
    output_path = f'translation_out\\{exe_path}'
    if patch_exe_with_translations(exe_path, csv_path, output_path):
        print(f"Successfully patched {exe_path} with translations to {output_path}")

if __name__ == "__main__":
    main()
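# For reference, the patcher's CSV rows are [original, translation, file_offset];
# offsets may be decimal or 0x-prefixed hex, and a row whose original starts with
# '//' is skipped unless the translation column is commented out too. Illustrative
# rows (the contents are made up):
#   ["こんにちは", "Hello", "0x4A3F20"]   # hex file offset
#   ["はい", "Yes", "4871600"]            # decimal file offset
#   ["//スキップ", "", "0x4A4000"]        # disabled row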
Companion script: extracts candidate strings from the executable and builds or updates the translation CSV.
import pefile
import re
import struct

from filetranslate.service_fn import read_csv_list, write_csv_list

def extract_strings_from_exe(exe_path):
    # Load the PE file
    pe = pefile.PE(exe_path)
    # Result rows have the shape [string, '', file_offset]
    results = []
    found_strings = set()  # To avoid duplicates
    # Helper function to validate string quality
    def is_valid_string(s, min_length=1):
        if len(s) < min_length:
            return False
        # All characters should be printable or whitespace
        if not all(c.isprintable() or c.isspace() for c in s):
            return False
        # For longer strings, apply more heuristics
        if len(s) > 8:
            # Reject strings that are more than 50% special characters
            special_char_count = sum(not c.isalnum() and not c.isspace() for c in s)
            if special_char_count / len(s) > 0.5:
                return False
        return True
    # PART 1: Find all strings in data sections
    for section in pe.sections:
        # Skip code sections - focus on data sections
        if section.Name.startswith(b'.text'):
            continue
        section_data = section.get_data()
        section_offset = section.PointerToRawData

        # Find UTF-16LE strings delimited by double-NUL terminators
        unicode_pattern = re.compile(b'\0\0[\s\S]{4,512}\0\0', re.DOTALL)
        for match in unicode_pattern.finditer(section_data):
            start = match.start() + 2  # Skip the leading terminator
            end = match.end() - 2      # Exclude the trailing null terminator
            try:
                string_value = section_data[start:end].decode('utf-16le')
                if is_valid_string(string_value):
                    file_offset = section_offset + start
                    if string_value not in found_strings:
                        results.append([string_value, '', file_offset])
                        found_strings.add(string_value)
            except UnicodeDecodeError:
                pass

        # Find ASCII strings
        ascii_pattern = re.compile(b'[^\x00-\x1F\x7F-\xFF]{4,}?\x00', re.DOTALL)
        for match in ascii_pattern.finditer(section_data):
            start = match.start()
            end = match.end() - 1  # Exclude the null terminator
            try:
                string_value = section_data[start:end].decode('ascii')
                if is_valid_string(string_value):
                    file_offset = section_offset + start
                    if string_value not in found_strings:
                        results.append([string_value, '', file_offset])
                        found_strings.add(string_value)
            except UnicodeDecodeError:
                pass
    # PART 2: Find strings by reference patterns in code sections
    # Byte patterns for instructions that carry a 32-bit immediate/displacement
    instr_patterns = [
        {'pattern': re.compile(b'\x68(....)', re.DOTALL), 'operand_offset': 1, 'size': 4},  # PUSH imm32 (68 xx xx xx xx)
        {'pattern': re.compile(b'[\xB8-\xBF](....)', re.DOTALL), 'operand_offset': 1, 'size': 4},  # MOV reg, imm32 (B8+r xx xx xx xx)
        {'pattern': re.compile(b'\x8D[\x05\x0D\x15\x1D\x25\x2D\x35\x3D](....)', re.DOTALL), 'operand_offset': 2, 'size': 4},  # LEA reg, [disp32] (8D /r xx xx xx xx)
    ]
    def parse_potential_string_reference(match, pattern_info, section):
        string_va = struct.unpack('<I', match.group(1))[0]
        # Check whether the immediate points at a valid string
        string_rva = string_va - pe.OPTIONAL_HEADER.ImageBase
        try:
            file_offset = pe.get_offset_from_rva(string_rva)
            string_data = pe.get_data(string_rva, 1024)
            if not string_data or len(string_data) < 2:
                return None
            # Find the end of a UTF-16LE string; the double NUL can land on any
            # byte boundary, so try three cut points around it
            null_pos = string_data.find(b'\0\0')
            if null_pos >= 2:
                for cut in (null_pos, null_pos + 1, null_pos + 2):
                    try:
                        string_value = string_data[:cut].decode('utf-16le')
                        if is_valid_string(string_value):
                            return string_value, file_offset
                    except UnicodeDecodeError:
                        pass
            # Try ASCII if not UTF-16LE
            null_pos = string_data.find(b'\x00')
            if null_pos > 0:
                string_value = string_data[:null_pos].decode('ascii', errors='ignore')
                if is_valid_string(string_value):
                    return string_value, file_offset
        except Exception:
            pass
        return None
    # Search for string references in code sections
    for section in [s for s in pe.sections if s.Name.startswith(b'.text')]:
        section_data = section.get_data()
        # Process each instruction pattern
        for pattern_info in instr_patterns:
            for match in pattern_info['pattern'].finditer(section_data):
                result = parse_potential_string_reference(match, pattern_info, section)
                if result:
                    string_value, file_offset = result
                    string_value = string_value.replace('\r', '')
                    if string_value not in found_strings:
                        results.append([string_value, '', file_offset])
                        found_strings.add(string_value)
    return results
def validate_string(string):
    # Accepts Japanese scripts, CJK ideographs, full-width forms, Latin, digits and common punctuation
    valid_pattern = re.compile(r'[\u3041-\u3096\u30A0-\u30FF\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A\u2E80-\u2FD5\uFF5F-\uFF9F\u3000-\u303F\u31F0-\u31FF\u3220-\u3243\u3280-\u337F\uFF01-\uFF5E\u2026-\u203Ba-zA-Z\d\s.,!?()\-\[\!@#\$%\^&\*:;\n\'\"()_\+=,\.\/?\\\|\[\]`~]+')
    return bool(valid_pattern.match(string))
# Usage example:
file_path = 'Original.exe'
csv_path = 'Original_strings.csv'
strings = extract_strings_from_exe(file_path)
# Carry over translations from an existing CSV so re-extraction doesn't lose work
tled = read_csv_list(csv_path)
for i, item1 in enumerate(strings):
    for item in tled:
        if item1[0] == item[0]:
            strings[i][1] = item[1]
            break
write_csv_list(csv_path, strings)
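# validate_string above is defined but never called; one optional use, sketched
# here as my own wiring rather than part of the original flow, is to drop rows
# that don't look like translatable text before the write_csv_list call:
#   strings = [row for row in strings if validate_string(row[0])]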
Dependencies:

pefile
diskcache
tqdm
capstone
filetranslate
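These should install with the usual pip install pefile diskcache tqdm capstone filetranslate, assuming filetranslate (the gist author's helper library, which provides read_csv_list and write_csv_list) is available on PyPI or installed from source.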