Created
August 26, 2024 09:27
-
-
Save sudoaza/62e8b8b9e4cb3fe505582b61db9eee62 to your computer and use it in GitHub Desktop.
Quick-and-dirty script that parses a PDF file and extracts its objects and images.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import re
import sys
import zlib

from PIL import Image
def parse_args(argv=None):
    """Parse the command-line arguments of the script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is what the script entry
            point relies on.

    Returns:
        The ``argparse.Namespace`` with a single ``pdf_file`` attribute.
    """
    parser = argparse.ArgumentParser(
        description='Parse a PDF file and extract its objects and images')
    parser.add_argument('pdf_file', help='The PDF file to analyze')
    return parser.parse_args(argv)
def _read_stream(pdf_data, obj_idx, arguments):
    """Return the raw (still encoded) stream bytes of the object at obj_idx."""
    # The keyword line may end in LF, CRLF or (out of spec) a lone CR.
    keyword = re.compile(rb'[\r\n]stream(?:\r\n|\n|\r)').search(pdf_data, obj_idx)
    if keyword is None:
        return b''
    start = keyword.end()

    # Prefer a direct integer /Length: it is the only way to read binary data
    # that happens to contain the bytes "endstream" or "endobj".  The
    # lookahead rejects indirect references such as "/Length 12 0 R".
    length = re.search(rb'/Length\s+(\d+)\b(?!\s+\d+\s+R\b)', arguments)
    if length:
        end = start + int(length.group(1))
        # Only trust the declared length if "endstream" really follows it.
        if pdf_data[end:end + 16].lstrip().startswith(b'endstream'):
            return pdf_data[start:end]

    end = pdf_data.find(b'endstream', start)
    if end == -1:
        # Unterminated stream: keep whatever is left rather than drop it.
        return pdf_data[start:]
    data = pdf_data[start:end]
    # The end-of-line marker before "endstream" is not part of the data.
    for eol in (b'\r\n', b'\n', b'\r'):
        if data.endswith(eol):
            return data[:-len(eol)]
    return data


def get_objects(pdf_data):
    """Collect the indirect objects found in a PDF file.

    This is a line-oriented scan, not a real PDF parser: an object is
    recognised only when its ``N G obj`` header and its dictionary each sit on
    a line of their own, followed by either ``endobj`` or ``stream``.

    Args:
        pdf_data: The complete file content as bytes.

    Returns:
        A list of ``(definition, arguments, stream)`` byte-string tuples, where
        ``definition`` is the ``N G obj`` line, ``arguments`` is the line that
        follows it, and ``stream`` is the raw stream content (``b''`` when the
        object has no stream).

    Raises:
        ValueError: If the third line of an object is neither ``endobj`` nor
            ``stream``.
    """
    objects = []
    for match in re.finditer(rb'(\d+) (\d+) obj\b', pdf_data):
        obj_idx = match.start()
        # Search in place; slicing pdf_data first would copy the remainder of
        # the file once per object.
        endobj_idx = pdf_data.find(b'endobj', obj_idx)
        if endobj_idx == -1:
            print('Unterminated object', match.group(0))
            continue
        obj_data = pdf_data[obj_idx:endobj_idx + len(b'endobj')]
        lines = obj_data.splitlines()
        if len(lines) < 3:
            print('No data', *lines)
            continue
        definition, arguments, marker = lines[:3]
        if marker == b'endobj':
            objects.append((definition, arguments, b''))
        elif marker == b'stream':
            # obj_data may have been cut short by an "endobj" inside the
            # binary data, so the stream is read from the full file instead.
            stream = _read_stream(pdf_data, obj_idx, arguments)
            objects.append((definition, arguments, stream))
        else:
            raise ValueError('Unknown object type')
    return objects
def _decode_stream(arguments, stream):
    """Undo the stream filter when possible.

    Returns a ``(data, decoded)`` pair; ``decoded`` is False when ``data`` is
    still in its encoded form.
    """
    if b'/Filter' not in arguments:
        return stream, True
    if b'/FlateDecode' in arguments:
        try:
            return zlib.decompress(stream), True
        except zlib.error as err:
            # Corrupt data or a filter chain applied before Flate.
            print('Could not inflate stream:', err)
            return stream, False
    if b'/LZWDecode' in arguments:
        # LZW is not deflate, so zlib cannot decode it.
        print('LZWDecode is not supported')
        return stream, False
    print('Unknown compression method')
    return stream, False


def _save_image(definition, arguments, stream, decoded):
    """Write one image stream to the current directory as .jpg or .bmp."""
    # latin-1 maps every byte to a character, so decoding cannot fail.
    basename = re.sub(r'\W', '_', definition.decode('latin-1'))

    # JPEG data is already a complete image file; write it out untouched.
    if b'/DCTDecode' in arguments and stream.startswith(b'\xff\xd8'):
        filename = basename + '.jpg'
        with open(filename, 'wb') as jpeg:
            jpeg.write(stream)
        print('Image saved as', filename)
        return
    if not decoded:
        print('Cannot extract image from an undecoded stream', definition)
        return

    s_arguments = arguments.decode('latin-1')
    width = re.search(r'/Width\s+(\d+)', s_arguments)
    height = re.search(r'/Height\s+(\d+)', s_arguments)
    bits = re.search(r'/BitsPerComponent\s+(\d+)', s_arguments)
    color_space = re.search(r'/ColorSpace\s*/(\w+)', s_arguments)
    if not (width and height and bits and color_space):
        print('Unsupported image dictionary', definition)
        return

    # Map (PDF colour space, bits per component) to a Pillow mode.
    modes = {
        ('DeviceRGB', 8): 'RGB',
        ('DeviceCMYK', 8): 'CMYK',
        ('DeviceGray', 8): 'L',
        ('DeviceGray', 1): '1',
    }
    mode = modes.get((color_space.group(1), int(bits.group(1))))
    if mode is None:
        print('Unsupported image format', definition)
        return

    size = (int(width.group(1)), int(height.group(1)))
    try:
        image = Image.frombytes(mode, size, stream)
    except ValueError as err:
        # Raised when the data size does not match mode and size, e.g. when
        # the stream still carries predictor bytes.
        print('Could not build image:', err)
        return
    if mode == 'CMYK':
        # Pillow cannot write CMYK as BMP.
        image = image.convert('RGB')
    filename = basename + '.bmp'
    image.save(filename)
    print('Image saved as', filename)


def parse_pdf(pdf_data):
    """Decode every object of a PDF and dump its image streams to disk.

    For each object returned by ``get_objects`` the stream is decompressed
    when its filter is supported. Streams whose dictionary declares
    ``/Subtype /Image`` are written to the current directory, named after the
    object header. The header, dictionary line and first 50 stream bytes of
    every object are printed.

    Args:
        pdf_data: The complete file content as bytes.
    """
    for definition, arguments, stream in get_objects(pdf_data):
        decoded = True
        if stream:
            stream, decoded = _decode_stream(arguments, stream)
        # Require the exact /Subtype so that names such as /ImageMask or
        # /ImageB do not trigger image extraction.
        if stream and re.search(rb'/Subtype\s*/Image\b', arguments):
            _save_image(definition, arguments, stream, decoded)
        print(definition, arguments, stream[:50])
def main():
    """Read the PDF named on the command line and process it.

    Exits with the message 'Not a PDF file' when the file does not start with
    the ``%PDF`` signature.
    """
    args = parse_args()
    with open(args.pdf_file, 'rb') as f:
        pdf_data = f.read()
    # An explicit check instead of ``assert``, which is removed under -O.
    if not pdf_data.startswith(b'%PDF'):
        sys.exit('Not a PDF file')
    parse_pdf(pdf_data)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment