Created
October 31, 2021 09:42
-
-
Save momijiame/2d3e0ad7cf92864fdda1ff8719af9372 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
"""PDF にした Amazon の領収書から「注文日」「注文番号」「請求総額」を取り出すやつ | |
# 下準備 | |
$ pip install pymupdf click | |
# 使い方 | |
$ python amznreceipt.py -f <pdf-filepath> | |
""" | |
from __future__ import annotations | |
import fitz | |
import click | |
def _is_digital_order(page_text: str) -> bool: | |
return 'デジタル注⽂概要' in page_text | |
def _seek_normal_order(page_text: str) -> tuple[str, str, int]: | |
"""通常の注文から情報を取り出す""" | |
lines = page_text.split('\n') | |
order_date = order_id = billing_amount = None | |
for line in lines: | |
if '注文日: ' in line: | |
order_date = line[line.rfind('注文日: ') + 5:] | |
if '注⽂番号 ' in line: | |
order_id = line[line.rfind('注⽂番号 ') + 5:] | |
if 'ご請求額:¥' in line: | |
billing_amount = int(line[line.rfind('ご請求額:¥') + 6:].replace(',', '')) | |
return order_date, order_id, billing_amount | |
def _seek_digital_order(page_text: str) -> tuple[str, str, int]: | |
"""デジタル注文から情報を取り出す""" | |
lines = page_text.split('\n') | |
order_date = order_id = billing_amount = None | |
for idx, line in enumerate(lines): | |
if '注⽂⽇: ' in line: | |
order_date = line[line.rfind('注⽂⽇: ') + 5:] | |
if '注⽂番号: ' in line: | |
order_id = line[line.rfind('注⽂番号: ') + 6:] | |
if '総計:' in line: | |
# 次の行に総計があることを仮定する | |
next_line = lines[idx + 1] | |
billing_amount = int(next_line.replace('¥', '').replace(',', '')) | |
return order_date, order_id, billing_amount | |
# CLI entry point: for every page of the given receipt PDF, print one
# comma-separated line holding the order date, order ID and billed amount.
# (Documented with comments rather than a docstring so that the generated
# --help text stays unchanged.)
@click.command()
@click.option('--filepath', '-f',
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True),
              required=True,
              help='Amazon receipt PDF file')
def main(filepath: str):
    # Open the PDF; the context manager closes it when done.
    with fitz.open(filepath) as document:
        for page in document:
            # Text content of the current page.
            text = page.get_text()
            # Digital and regular orders are laid out differently, so pick
            # the parser that matches the page.
            if _is_digital_order(text):
                fields = _seek_digital_order(text)
            else:
                fields = _seek_normal_order(text)
            # Emit the three fields as one comma-separated line.
            print(','.join(map(str, fields)))
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment