# Source: GitHub gist hauzerlee/d18d3c2fa04507fc9e44c1ec0dafc8d3
# Description: OLE file research.
# NOTE: GitHub flagged this file as containing hidden or bidirectional Unicode
# text that may be interpreted or compiled differently than it appears; review
# it in an editor that reveals hidden Unicode characters.
import imghdr
import os
import struct

import click
import filetype
import olefile
@click.command()
@click.argument("file_name", type=click.Path(exists=True))
@click.argument("output_directory", type=click.Path())
def extract_images(file_name: str, output_directory: str) -> None:
    """Extract embedded images from an OLE file or from every file in a directory.

    FILE_NAME is an existing file or directory; a directory is walked
    recursively. OUTPUT_DIRECTORY receives the extracted images and is created
    when it does not exist yet. A file that fails to process is reported and
    skipped, so one bad document does not abort the whole run.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)

    if os.path.isfile(file_name):
        file_paths = [file_name]
    elif os.path.isdir(file_name):
        # Lazy generator: the tree is walked while files are being processed.
        file_paths = (
            os.path.join(root, name)
            for root, _dirs, names in os.walk(file_name)
            for name in names
        )
    else:
        # Exists but is neither a regular file nor a directory.
        click.echo("Invalid file or directory.")
        return

    for file_path in file_paths:
        try:
            process_file(file_path, output_directory)
        except Exception as e:
            # Top-level boundary: name the failing file, then keep going.
            click.echo(f"Failed to process {file_path}: {e}")
def process_file(file_path: str, output_directory: str) -> None:
    """Carve embedded images out of one OLE2 document into *output_directory*.

    The stream expected to hold picture data is chosen from the file
    extension (``Data``, ``Workbook`` or ``Pictures``), read fully into memory
    and scanned for JPEG, BMP and PNG signatures. Each candidate is validated
    in memory before anything is written, so a false signature hit neither
    creates a file nor stops the scan. Images are saved as
    ``<source file name>_<n>.<ext>``; a target that already exists is left
    untouched.

    Args:
        file_path: Path of the document to scan.
        output_directory: Existing directory that receives the images.

    Raises:
        OSError: If an image file cannot be written. Errors raised by olefile
            while opening or reading the document propagate unchanged.
    """
    if not olefile.isOleFile(file_path):
        file_info = filetype.guess(file_path)
        file_format = "Unknown" if file_info is None else file_info.mime
        click.echo(
            f"Invalid file format: {file_path}. Expected OLE2 structured storage file, found: {file_format}."
        )
        return

    # Extension -> name of the OLE stream that is scanned for images.
    file_stream_dict = {
        "doc": "Data",
        "dot": "Data",
        "dotm": "Data",
        "xlt": "Workbook",
        "xlsm": "Workbook",
        "pps": "Pictures",
        "pptm": "Pictures",
        "wps": "Data",
        "wpt": "Data",
        "dps": "Pictures",
        "dpt": "Pictures",
        "et": "Workbook",
        "ett": "Workbook",
    }
    file_ext = os.path.splitext(file_path)[1][1:].lower()
    stream_name = file_stream_dict.get(file_ext)
    if stream_name is None:
        # Without this guard openstream(None) fails with an opaque error.
        click.echo(f"Unsupported file extension: {file_path}.")
        return

    ole = olefile.OleFileIO(file_path)
    try:
        if not ole.exists(stream_name):
            click.echo(f"No '{stream_name}' stream found in {file_path}.")
            return
        req_data = ole.openstream(stream_name).read()
    finally:
        # Release the handle even when reading the stream raises.
        ole.close()

    # Extension -> (start signature, end signature or None, name imghdr reports).
    # One entry per signature: aliases such as "jpg"/"dib" would extract every
    # image twice. TIFF is not carved because it has no end-of-image marker,
    # so a signature scan cannot tell where the file stops.
    # NOTE: imghdr is deprecated since Python 3.11 and removed in 3.13.
    image_formats = {
        "jpeg": (b"\xff\xd8", b"\xff\xd9", "jpeg"),
        "bmp": (b"\x42\x4D", None, "bmp"),
        # The PNG terminator is the IEND chunk type followed by its fixed CRC.
        "png": (
            b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
            b"\x49\x45\x4E\x44\xAE\x42\x60\x82",
            "png",
        ),
    }

    source_name = os.path.basename(file_path)
    image_count = 0
    for ext, (start_marker, end_marker, imghdr_name) in image_formats.items():
        start = 0
        while True:
            start = req_data.find(start_marker, start)
            if start == -1:
                break

            if end_marker is None:
                # BMP has no trailer; its header stores the file size instead.
                # Bytes 6-9 hold two reserved fields that must be zero, which
                # rejects stray "BM" byte pairs before any data is copied.
                if req_data[start + 6 : start + 10] != b"\x00\x00\x00\x00":
                    start += 1
                    continue
                image_data = process_bmp_image(req_data[start:])
            else:
                end_pos = req_data.find(end_marker, start + len(start_marker))
                if end_pos == -1:
                    # No terminator remains, so no later start can be complete.
                    break
                # NOTE(review): the first JPEG end marker may belong to an
                # embedded thumbnail, which would truncate the image.
                image_data = req_data[start : end_pos + len(end_marker)]

            # imghdr only inspects the header; 32 bytes is what it reads from a file.
            if imghdr.what(None, h=image_data[:32]) != imghdr_name:
                # False signature hit: resume one byte further on.
                start += 1
                continue

            image_count += 1
            # Validated data is never empty, so the scan always advances.
            start += len(image_data)
            output_file = os.path.join(
                output_directory, f"{source_name}_{image_count}.{ext}"
            )
            if os.path.exists(output_file):
                continue
            with open(output_file, "wb") as f:
                f.write(image_data)
            click.echo(
                f"Extracted {ext.upper()} image {image_count} from {file_path} to {output_file}"
            )
def process_bmp_image(image_data: bytes) -> bytes:
    """Trim *image_data* to the file size declared in its BMP header.

    Args:
        image_data: Bytes that start at a suspected BMP file header and may
            carry unrelated trailing data.

    Returns:
        The first ``size`` bytes, where ``size`` is the little-endian 32-bit
        value stored at offset 2. The input is returned unchanged when it does
        not start with ``BM`` or is too short to contain the size field.
    """
    # Without the magic and a complete size field there is nothing to trim by;
    # unpacking a short header would raise struct.error.
    if len(image_data) < 6 or image_data[:2] != b"\x42\x4D":
        return image_data
    (declared_size,) = struct.unpack_from("<I", image_data, 2)
    # Slicing clamps, so a declared size beyond the data keeps everything.
    return image_data[:declared_size]
def is_valid_bmp(file_path: str) -> bool:
    """Return True when both reserved fields of a BMP file header are zero.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True when the two 16-bit values at offsets 6 and 8 are both zero;
        False when either is non-zero or the file is too short to hold them.
    """
    with open(file_path, "rb") as f:
        f.seek(6)
        reserved = f.read(4)
    if len(reserved) < 4:
        # Truncated header: unpacking would raise struct.error.
        return False
    # Unpack as two unsigned shorts and compare against the integer 0.
    return all(field == 0 for field in struct.unpack("<HH", reserved))
# Script entry point: run the click command only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    extract_images()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment