Created
July 15, 2022 16:32
-
-
Save sapher/033830caf4cf95b172e9c1dde10484f5 to your computer and use it in GitHub Desktop.
Parse SUPER U monmagasin U ticket
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import argparse
import json
import re
import sys
# NOTE(review): `read` is unused in this file, and asyncore was removed from
# the standard library in Python 3.12 -- confirm and drop this import.
from asyncore import read

from PyPDF2 import PdfReader
# A price as printed on the ticket: digits, comma, digits, whitespace, euro sign.
PRICE_RE = re.compile(r"(\d+,\d+)\s€")
# Everything from the first price to the end of the line (stripped to get the name).
PRICE_TAIL_RE = re.compile(r"(\s\d+,\d+\s€.*)")
# A quantity marker such as " 2 x" that precedes the unit price.
QUANTITY_RE = re.compile(r"(\s\d+\sx)")


def to_float(number):
    """Convert a decimal string that uses a comma separator into a float.

    Args:
        number: Text such as ``"1,50"``; every comma is replaced by a dot.

    Returns:
        The parsed value as a ``float``.

    Raises:
        ValueError: If the text is not a valid number after the replacement.
    """
    return float(number.replace(',', '.'))


def extract_ticket_lines(text):
    """Return the whitespace-normalised lines of the ticket body.

    Blank lines are dropped. Lines are kept from the first one containing
    "ticket" (case-insensitive) up to, but excluding, the first one containing
    "===". The first two kept lines are then discarded.

    Args:
        text: Raw text extracted from one PDF page.

    Returns:
        A list of lines, each with runs of whitespace collapsed to one space.
    """
    ticket_found = False
    kept = []
    for raw_line in text.split('\n'):
        if not raw_line.strip():
            continue
        if "ticket" in raw_line.lower():
            ticket_found = True
        if "===" in raw_line:
            # Once the separator is seen nothing after it is ever kept.
            break
        if ticket_found:
            kept.append(' '.join(raw_line.split()))
    # The first two kept lines (starting with the "ticket" line) are skipped.
    return kept[2:]


def merge_wrapped_lines(lines):
    """Build one line per priced entry, joining a wrapped name to its price.

    Only lines containing "€" produce an entry. When the line directly above
    has no "€" it is prepended to the price line. Lines without "€" never
    produce an entry on their own.

    Args:
        lines: Ticket body lines as returned by ``extract_ticket_lines``.

    Returns:
        A list with one string per priced entry.
    """
    merged = []
    for index, line in enumerate(lines):
        if "€" not in line:
            continue
        # The first line has no predecessor; indexing with -1 would wrap
        # around and wrongly prepend the *last* line of the ticket.
        previous = lines[index - 1] if index > 0 else None
        if previous is not None and "€" not in previous:
            merged.append(f"{previous} {line}")
        else:
            merged.append(line)
    return merged


def parse_product(line):
    """Parse one priced entry into a product dictionary.

    The name is the text before the first price, with any " <n> x" quantity
    marker removed. With one price on the line it is used as both unit and
    total price; with two, the first is the unit price and the second the
    total. Any other count is reported on stderr and only ``name`` is set.

    Args:
        line: A single entry as returned by ``merge_wrapped_lines``.

    Returns:
        A dict with ``name`` and, when recognised, ``unit_price`` and
        ``total_price``.
    """
    name = PRICE_TAIL_RE.sub('', line)
    product = {'name': QUANTITY_RE.sub('', name)}

    prices = PRICE_RE.findall(line)
    if len(prices) == 1:
        product['unit_price'] = to_float(prices[0])
        product['total_price'] = to_float(prices[0])
    elif len(prices) == 2:
        product['unit_price'] = to_float(prices[0])
        product['total_price'] = to_float(prices[1])
    else:
        # Diagnostics go to stderr so stdout stays valid JSON.
        print('not handled', file=sys.stderr)
    return product


def parse_ticket_text(text):
    """Parse the text of one ticket page into a list of product dicts."""
    entries = merge_wrapped_lines(extract_ticket_lines(text))
    return [parse_product(entry) for entry in entries]


def main(argv=None):
    """Parse the ticket PDF named on the command line and print it as JSON.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Parse super u ticket to json")
    parser.add_argument('filename', help="ticket pdf filename")
    args = parser.parse_args(argv)

    reader = PdfReader(args.filename)

    products = []
    for page in reader.pages:
        # `extract_text` replaces the deprecated camelCase `extractText`,
        # which raises in PyPDF2 3.x.
        products.extend(parse_ticket_text(page.extract_text()))

    # The script previously built the product list and discarded it.
    print(json.dumps(products, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Just use main.py <ticket.pdf>