@UserUnknownFactor
Last active June 27, 2025
Python tool to translate strings in x86 executables by redirecting their references into a newly created .trans PE section
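The patcher expects a CSV of (original, translation, file_offset) rows, where the offset may be decimal or 0x-prefixed hex (inferred from the parsing code below). A hypothetical example row, with illustrative values only:

はじめる,Start,0x4F200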
import struct
import pefile
import re
import capstone
from tqdm import tqdm
CACHE_DIR = "__pycache__"
cache = None
ENABLE_CACHE = True
try:
    from diskcache import Cache
    cache = Cache(CACHE_DIR)
except ImportError:
    ENABLE_CACHE = False
from filetranslate.service_fn import read_csv_list, write_csv_list
def patch_exe_with_translations(exe_path, csv_path, output_path, allowed_sections=[b'.text']):
    # Read translations from CSV; row format: [original, translation, file_offset]
    translations = read_csv_list(csv_path)
    for row in translations:
        if len(row) >= 3:
            offset_str = row[2]
            if offset_str:
                row[2] = int(offset_str, 16) if offset_str.startswith('0x') else int(offset_str)
    if not translations:
        print(f"Empty {csv_path}")
        return False
    print(f"Using {csv_path} with {len(translations)} lines to translate {exe_path}...")

    # Parse the PE file
    pe = pefile.PE(exe_path)

    # Initialize disassembler for 32-bit x86
    md = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_32)
    md.detail = True
    md.skipdata = True

    # Calculate where to add the new section
    last_section = pe.sections[-1]
    new_section_offset = (last_section.VirtualAddress +
                          last_section.Misc_VirtualSize +
                          pe.OPTIONAL_HEADER.SectionAlignment - 1) & ~(pe.OPTIONAL_HEADER.SectionAlignment - 1)
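    # The expression above is the usual align-up idiom: for a power-of-two
    # alignment, (value + alignment - 1) & ~(alignment - 1) rounds value up to
    # the next multiple of the alignment. E.g. with a SectionAlignment of
    # 0x1000, 0x1234 rounds up to 0x2000 while 0x2000 stays 0x2000.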
    # Create the new section
    new_section_data = bytearray()
    va_of_new_strings = {}  # Maps original string RVA to new string RVA
    patch_locations = []  # List of (file_offset, new_va) tuples
    patch_locations_byte = []  # List of (file_offset, new_byte) tuples
    reference_map = {}

    print("Preprocessing sections...")
    for sn, section in enumerate(pe.sections):
        if not any(section.Name.startswith(allowed) for allowed in allowed_sections):
            continue
        section_data = section.get_data()
        section_offset = section.PointerToRawData
        section_map = None
        if ENABLE_CACHE and cache:
            section_map = cache.get(f"cached_disasm_of_{section.Name}")
        if section_map is None:
            section_map = {}
            has_prior_push = False
            for insn in md.disasm(section_data, 0):
                if '.byte' == insn.mnemonic:  # skipdata placeholder, not a real instruction
                    continue
                for op in insn.operands:
                    if op.type == capstone.CS_OP_IMM:
                        imm_pos = insn.imm_offset
                        if imm_pos is not None:
                            if op.imm not in section_map:
                                section_map[op.imm] = []
                            imm_file_offset = section_offset + insn.address + imm_pos
                            section_map[op.imm].append((insn.address, imm_file_offset, sn,
                                                        has_prior_push and insn.mnemonic == 'push'))
                has_prior_push = insn.mnemonic == 'push' and insn.imm_size == 1
            if ENABLE_CACHE and cache:
                cache.set(f"cached_disasm_of_{section.Name}", section_map)
        # Merge this section's references so several code sections can be handled
        for imm, refs in section_map.items():
            reference_map.setdefault(imm, []).extend(refs)
    print(f"Collected {sum(len(v) for v in reference_map.values())} operand references.")
    # Add all translations to the new section
    current_offset = 0
    i = 0
    image_base = pe.OPTIONAL_HEADER.ImageBase
    for row in tqdm(translations, desc="Processing translations"):
        i += 1
        if len(row) < 3:
            print(f"Error in row {row}")
            continue
        original = row[0]
        translation = row[1]
        if original.startswith('//') and not translation.startswith('//'):
            continue
        string_offset = row[2]
        if not string_offset:
            print(f"Error in row {row}")
            continue
        # Convert file offset to RVA by finding the section containing this offset
        string_rva = None
        for section in pe.sections:
            if section.PointerToRawData <= string_offset < (section.PointerToRawData + section.SizeOfRawData):
                string_rva = string_offset - section.PointerToRawData + section.VirtualAddress
                break
        if string_rva is None:
            print(f"Warning: Could not map file offset {string_offset} to RVA")
            continue
        # Store mapping from original string RVA to new string RVA
        va_of_new_strings[string_rva] = new_section_offset + current_offset
        # Add the translated string to our new section
        encoded_translation = translation.encode('utf-16le') + b'\0\0'
        new_section_data.extend(encoded_translation)
        current_offset += len(encoded_translation)
        # Retarget every reference to the original string's VA
        for insn_offset_in_section, imm_file_offset, sn, has_prior_push in reference_map.get(
                string_rva + image_base, []):
            section_data = pe.sections[sn].get_data()
            new_va = image_base + va_of_new_strings[string_rva]
            patch_locations.append((imm_file_offset, new_va))
            # Fix string lengths pushed to the stack right before the string address
            # NOTE: DANGER ZONE
            # push LEN; push STR => 6A XX 68 XX XX XX XX
            if (has_prior_push and insn_offset_in_section >= 2 and
                    section_data[insn_offset_in_section - 2] == 0x6A):
                if section_data[insn_offset_in_section - 1] == len(original):
                    tl_len = len(translation)
                    if tl_len > 127:
                        print(f"\nNeed to fix line {i} = {translation}")
                    patch_locations_byte.append((imm_file_offset - 2, min(127, tl_len)))
            # Same pattern with one extra byte between the two pushes:
            # 6A XX ?? 68 XX XX XX XX
            if (insn_offset_in_section >= 3 and insn_offset_in_section < len(section_data) and
                    section_data[insn_offset_in_section] == 0x68 and
                    section_data[insn_offset_in_section - 3] == 0x6A):
                if section_data[insn_offset_in_section - 2] == len(original):
                    tl_len = len(translation)
                    if tl_len > 127:
                        print(f"\nNeed to fix line {i} = {translation}")
                    patch_locations_byte.append((imm_file_offset - 3, min(127, tl_len)))
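    # Worked example (illustrative bytes): for the call-site sequence
    #   6A 05 68 A8 C1 40 00    ; push 5 (length); push 0x40C1A8 (string VA)
    # the imm8 operand 05 must equal len(original); it is then overwritten with
    # min(127, len(translation)). The cap is 127 because push imm8 (opcode 6A)
    # sign-extends its operand, so values 0x80-0xFF would push a negative length.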
    # Align the section size to file alignment
    aligned_size = (len(new_section_data) + pe.OPTIONAL_HEADER.FileAlignment - 1) & ~(pe.OPTIONAL_HEADER.FileAlignment - 1)
    new_section_data.extend(b'\0' * (aligned_size - len(new_section_data)))

    # Calculate where the new section will be placed in the file
    new_section_raw_pointer = (
        last_section.PointerToRawData +
        last_section.SizeOfRawData +
        pe.OPTIONAL_HEADER.FileAlignment - 1) & ~(pe.OPTIONAL_HEADER.FileAlignment - 1)

    # Create a new section header
    new_section = pefile.SectionStructure(pe.__IMAGE_SECTION_HEADER_format__)
    new_section.set_file_offset(pe.sections[-1].get_file_offset() + pe.sections[-1].sizeof())

    # Set the section properties
    new_section_name = b'.trans'
    new_section.Name = new_section_name[:8] + b'\0' * max(8 - len(new_section_name), 0)
    new_section.Misc = len(new_section_data)  # virtual size
    new_section.VirtualAddress = new_section_offset
    new_section.SizeOfRawData = aligned_size
    new_section.PointerToRawData = new_section_raw_pointer
    new_section.PointerToRelocations = 0
    new_section.PointerToLinenumbers = 0
    new_section.NumberOfRelocations = 0
    new_section.NumberOfLinenumbers = 0
    new_section.Characteristics = 0x40000040  # IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA

    # Update the PE header
    pe.FILE_HEADER.NumberOfSections += 1
    pe.OPTIONAL_HEADER.SizeOfImage = (
        new_section_offset + aligned_size +
        pe.OPTIONAL_HEADER.SectionAlignment - 1) & ~(pe.OPTIONAL_HEADER.SectionAlignment - 1)
    # Make a copy of the original exe
    with open(exe_path, 'rb') as f:
        exe_data = bytearray(f.read())

    # Apply patches to the original data
    for file_offset, new_va in patch_locations:
        struct.pack_into('<I', exe_data, file_offset, new_va)
    for file_offset, new_byte in patch_locations_byte:
        exe_data[file_offset] = new_byte

    # Write the modified exe
    with open(output_path, 'wb') as f:
        # Calculate where the section table is
        section_table_offset = pe.DOS_HEADER.e_lfanew + 4 + pe.FILE_HEADER.sizeof() + pe.FILE_HEADER.SizeOfOptionalHeader
        # Update NumberOfSections in the file header (e_lfanew + 4-byte signature + 2-byte Machine)
        struct.pack_into('<H', exe_data, pe.DOS_HEADER.e_lfanew + 6, pe.FILE_HEADER.NumberOfSections)
        # Update SizeOfImage (at offset 56 in the PE32 optional header)
        image_size_offset = pe.DOS_HEADER.e_lfanew + 4 + pe.FILE_HEADER.sizeof() + 56
        struct.pack_into('<I', exe_data, image_size_offset, pe.OPTIONAL_HEADER.SizeOfImage)
        # Write everything up to the section table
        f.write(exe_data[:section_table_offset])
        # Write all original section headers
        for section in pe.sections:
            f.write(section.__pack__())
        # Write our new section header
        new_section_header = new_section.__pack__()
        f.write(new_section_header)
        # Write everything between the section table and our new section, skipping
        # as many original bytes as the new header added so every remaining byte
        # keeps its original file offset (NumberOfSections was already incremented,
        # hence the minus one when sizing the original table)
        original_section_table_end = section_table_offset + (pe.FILE_HEADER.NumberOfSections - 1) * 40
        if new_section_raw_pointer > original_section_table_end:
            f.write(exe_data[original_section_table_end + len(new_section_header):new_section_raw_pointer])
        else:
            # Seek to where our section should start
            f.seek(new_section_raw_pointer)
        # Write our new section data
        f.write(new_section_data)

    if pe.OPTIONAL_HEADER.CheckSum:
        # Recalculate the checksum with pefile
        try:
            patched_pe = pefile.PE(output_path)
            patched_pe.OPTIONAL_HEADER.CheckSum = patched_pe.generate_checksum()
            patched_pe.write(filename=output_path)
        except Exception as e:
            print(f"Warning: Could not recalculate checksum: {e}")

    print(f"Patched {len(patch_locations)} references to {len(va_of_new_strings)} strings")
    return True
def find_instruction_references(exe_path, target_addresses, output_csv=None):
    """
    Find all instructions in the code section that reference specific addresses.

    Args:
        exe_path: Path to the executable
        target_addresses: List of virtual addresses to find references to
        output_csv: Optional path to save results to CSV
    """
    pe = pefile.PE(exe_path)

    # Initialize disassembler
    md = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_32)
    md.detail = True

    results = []
    # Process each code section
    for section in [s for s in pe.sections if s.Name.startswith(b'.text')]:
        section_data = section.get_data()
        section_va = section.VirtualAddress + pe.OPTIONAL_HEADER.ImageBase
        # Disassemble the section
        for insn in md.disasm(section_data, section_va):
            # Check each operand for references to target addresses
            for op in insn.operands:
                if hasattr(op, 'imm') and op.imm in target_addresses:
                    # Calculate file offset of the instruction
                    file_offset = section.PointerToRawData + (insn.address - section_va)
                    results.append({
                        'address': insn.address,
                        'file_offset': file_offset,
                        'mnemonic': insn.mnemonic,
                        'op_str': insn.op_str,
                        'bytes': ' '.join(f'{b:02x}' for b in insn.bytes),
                        'target_address': op.imm
                    })

    # Optionally save to CSV
    if output_csv and results:
        headers = ['address', 'file_offset', 'mnemonic', 'op_str', 'bytes', 'target_address']
        csv_data = [headers] + [[str(row[h]) for h in headers] for row in results]
        write_csv_list(output_csv, csv_data)
    return results
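# Example usage (hypothetical address; not called by main()):
#   refs = find_instruction_references('Original.exe', [0x40C1A8], 'refs.csv')
#   for r in refs:
#       print(hex(r['address']), r['mnemonic'], r['op_str'])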
def main():
    exe_path = 'Original.exe'
    csv_path = exe_path.replace('.exe', '_strings.csv')
    output_path = f'translation_out\\{exe_path}'
    if patch_exe_with_translations(exe_path, csv_path, output_path):
        print(f"Successfully patched {exe_path} with translations to {output_path}")


if __name__ == "__main__":
    main()
# ---- Second file: string extractor ----
import pefile
import re
import struct
from filetranslate.service_fn import read_csv_list, write_csv_list


def extract_strings_from_exe(exe_path):
    # Load the PE file
    pe = pefile.PE(exe_path)

    # Results are [string, '', file_offset] rows
    results = []
    found_strings = set()  # To avoid duplicates

    # Helper function to validate string quality
    def is_valid_string(s, min_length=1):
        if len(s) < min_length:
            return False
        # All characters should be printable or whitespace
        if not all(c.isprintable() or c.isspace() for c in s):
            return False
        # For longer strings, apply more heuristics
        if len(s) > 8:
            # Reject strings that are more than 50% special characters
            special_char_count = sum(not c.isalnum() and not c.isspace() for c in s)
            if special_char_count / len(s) > 0.5:
                return False
        return True
    # PART 1: Find all strings in data sections
    for section in pe.sections:
        # Skip code sections - focus on data sections
        if section.Name.startswith(b'.text'):
            continue
        section_data = section.get_data()
        section_offset = section.PointerToRawData

        # Find UTF-16LE strings anchored between double-null terminators
        unicode_pattern = re.compile(rb'\x00\x00[\s\S]{4,512}\x00\x00', re.DOTALL)
        for match in unicode_pattern.finditer(section_data):
            start = match.start() + 2  # Skip the leading terminator
            end = match.end() - 2  # Exclude the trailing null terminator
            try:
                string_value = section_data[start:end].decode('utf-16le')
                if is_valid_string(string_value):
                    file_offset = section_offset + start
                    if string_value not in found_strings:
                        results.append([string_value, '', file_offset])
                        found_strings.add(string_value)
            except UnicodeDecodeError:
                pass

        # Find ASCII strings
        ascii_pattern = re.compile(rb'[^\x00-\x1F\x7F-\xFF]{4,}?\x00', re.DOTALL)
        for match in ascii_pattern.finditer(section_data):
            start = match.start()
            end = match.end() - 1  # Exclude null terminator
            try:
                string_value = section_data[start:end].decode('ascii')
                if is_valid_string(string_value):
                    file_offset = section_offset + start
                    if string_value not in found_strings:
                        results.append([string_value, '', file_offset])
                        found_strings.add(string_value)
            except UnicodeDecodeError:
                pass
    # PART 2: Find strings by reference patterns in code sections
    # Define instruction byte patterns to search for
    instr_patterns = [
        {'pattern': re.compile(b'\x68(....)', re.DOTALL), 'operand_offset': 1, 'size': 4},  # PUSH imm32 (68 xx xx xx xx)
        {'pattern': re.compile(b'[\xB8-\xBF](....)', re.DOTALL), 'operand_offset': 1, 'size': 4},  # MOV reg, imm32 (B8+r xx xx xx xx)
        {'pattern': re.compile(b'\x8D[\x05\x0D\x15\x1D\x25\x2D\x35\x3D](....)', re.DOTALL), 'operand_offset': 2, 'size': 4}  # LEA reg, [disp32] (8D xx xx xx xx xx)
    ]
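    # These regexes scan raw section bytes rather than disassembling, so any
    # byte that happens to equal 0x68, 0xB8-0xBF or 0x8D can produce a false
    # match; parse_potential_string_reference() below filters candidates by
    # checking that the operand decodes to a valid in-image string.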
    def parse_potential_string_reference(match, pattern_info, section):
        string_va = struct.unpack('<I', match.group(1))[0]
        # Check if this points to a valid string
        string_rva = string_va - pe.OPTIONAL_HEADER.ImageBase
        try:
            file_offset = pe.get_offset_from_rva(string_rva)
            string_data = pe.get_data(string_rva, 1024)
            if not string_data or len(string_data) < 2:
                return None
            # Find end of UTF-16LE string; retry with one or two extra bytes in
            # case the double-null terminator is misaligned (stray ASCII ending)
            null_pos = string_data.find(b'\0\0')
            if null_pos >= 2:
                for extra in (0, 1, 2):
                    try:
                        string_value = string_data[:null_pos + extra].decode('utf-16le')
                        if is_valid_string(string_value):
                            return string_value, file_offset
                    except UnicodeDecodeError:
                        pass
            # Try ASCII if not UTF-16LE
            null_pos = string_data.find(b'\x00')
            if null_pos > 0:
                string_value = string_data[:null_pos].decode('ascii', errors='ignore')
                if is_valid_string(string_value):
                    return string_value, file_offset
        except Exception:
            pass
        return None
    # Search for string references in code sections
    for section in [s for s in pe.sections if s.Name.startswith(b'.text')]:
        section_data = section.get_data()
        # Process each instruction pattern
        for pattern_info in instr_patterns:
            for match in pattern_info['pattern'].finditer(section_data):
                result = parse_potential_string_reference(match, pattern_info, section)
                if result:
                    string_value, file_offset = result
                    string_value = string_value.replace('\r', '')
                    if string_value not in found_strings:
                        results.append([string_value, '', file_offset])
                        found_strings.add(string_value)
    return results


def validate_string(string):
    # Accept Japanese scripts (hiragana, katakana, CJK, full-width forms),
    # basic Latin, digits, whitespace and common punctuation
    valid_pattern = re.compile(r'[\u3041-\u3096\u30A0-\u30FF\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A\u2E80-\u2FD5\uFF5F-\uFF9F\u3000-\u303F\u31F0-\u31FF\u3220-\u3243\u3280-\u337F\uFF01-\uFF5E\u2026-\u203Ba-zA-Z\d\s.,!?()\-\[\[\!@#\$%\^&\*:;\n\'\"()_\+=,\.\/?\\\|\[\]`~]+')
    return bool(valid_pattern.match(string))
# Usage example:
file_path = 'Original.exe'
csv_path = 'Original_strings.csv'
strings = extract_strings_from_exe(file_path)
# Merge any translations already present in the CSV back into the fresh list
tled = read_csv_list(csv_path)
for i, item1 in enumerate(strings):
    for item in tled:
        if item1[0] == item[0]:
            strings[i][1] = item[1]
            break
write_csv_list(csv_path, strings)
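# Typical workflow (as the two scripts suggest): run this extractor first to
# produce Original_strings.csv, fill in the translation column, then run the
# patcher above to rebuild the exe with the .trans section.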
diskcache
tqdm
capstone
filetranslate
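These look like the gist's pip requirements; assuming so, they can be installed with
pip install diskcache tqdm capstone filetranslate
(filetranslate is the author's own helper package and may need to be installed from its repository if it is not on PyPI; diskcache is optional, since the patcher falls back to no caching when it is missing.)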