Last active
February 23, 2025 20:31
-
-
Save brunoamaral/258e73a488a75c8fb60a16b99119933b to your computer and use it in GitHub Desktop.
paperless-ngx, post consume script to fill in a custom field with the total for an invoice
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import requests | |
import sys | |
import re | |
# Connection settings for the Paperless-ngx REST API, taken from the environment.
# The URL falls back to a local instance; the token has no fallback and is None when unset.
PAPERLESS_API_URL = os.environ.get("PAPERLESS_API_URL", "http://localhost:8000/api")
PAPERLESS_API_TOKEN = os.environ.get("SCRIPT_PAPERLESS_API_TOKEN")

# Header sent with every API request made by this script.
# NOTE(review): the value is sent with the "Basic" scheme, so the environment
# variable presumably holds base64-encoded "user:password" credentials rather
# than an API token — confirm against the server's authentication settings.
HEADERS = {"authorization": f"Basic {PAPERLESS_API_TOKEN}"}
def get_document(document_id, timeout=30):
    """
    Fetch a document record from the Paperless-ngx API.

    Args:
        document_id (int | str): ID of the document to fetch.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict | None: The decoded JSON body of the response when the server
        answers with HTTP 200, otherwise None (non-200 status, network
        failure, timeout or undecodable body). Errors are printed, not raised.
    """
    url = f"{PAPERLESS_API_URL}/documents/{document_id}/"
    try:
        # Without a timeout a stalled server would block this script forever.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: Unable to retrieve text for document {document_id}. Status code: {response.status_code}")
            return None
        return response.json()
    except Exception as e:
        # Deliberately broad: callers rely on "None means failure" and this
        # function never raising, whatever goes wrong during the request.
        print(f"Error: {e}")
        return None
def _detect_currency(text):
    """Return "EUR" or "USD" if *text* contains that currency's symbol or code, else None."""
    if "€" in text or "EUR" in text:
        return "EUR"
    if "$" in text or "USD" in text:
        return "USD"
    return None


def extract_invoice_total(content, regex_patterns):
    """
    Extract the total amount and currency from invoice text.

    The patterns are tried in order and the first one that matches wins, so
    specific patterns must be listed before generic ones. Matching is
    case-insensitive.

    Args:
        content (str): The invoice text content. None or empty yields no match.
        regex_patterns (list of str): Regular expressions whose first capture
            group is the total amount.

    Returns:
        tuple: (amount, currency). ``amount`` is the raw captured string (not
        normalized); ``currency`` is "EUR", "USD" or "Unknown". Returns
        (None, None) if no pattern matches.
    """
    if not content:
        return None, None

    for pattern in regex_patterns:
        match = re.search(pattern, content, re.IGNORECASE)
        if not match:
            continue
        amount = match.group(1)
        # Prefer the currency written next to the total itself. Looking only
        # at the whole document misfires on unrelated text, e.g. an address
        # containing "EUROPE" would turn a "$" total into EUR.
        currency = (
            _detect_currency(match.group(0))
            or _detect_currency(content)
            or "Unknown"
        )
        return amount, currency

    return None, None
def format_total_value(total, currency="EUR"):
    """
    Format an extracted total as ``<currency><amount with two decimals>``.

    Both "1,234.56" and "1.234,56" styles are accepted: when a comma and a
    dot are both present, whichever comes last is the decimal separator and
    the other one is a thousands separator. A separator that occurs more than
    once is a thousands separator. A single comma is read as a decimal comma.

    Args:
        total (str): The extracted total value.
        currency (str): The currency code to prepend (default is EUR).

    Returns:
        str | None: Formatted total value (e.g., EUR123.45), or None if the
        value is missing or cannot be parsed as a number.
    """
    if total is None:
        return None

    # Drop all whitespace, plus any trailing separator the regex may have
    # captured from sentence punctuation ("Total 12.50.").
    cleaned = "".join(str(total).split()).rstrip(".,")

    last_comma = cleaned.rfind(",")
    last_dot = cleaned.rfind(".")
    if last_comma != -1 and last_dot != -1:
        if last_comma > last_dot:
            # 1.234,56 -> dots group thousands, comma is the decimal mark.
            cleaned = cleaned.replace(".", "").replace(",", ".")
        else:
            # 1,234.56 -> commas group thousands.
            cleaned = cleaned.replace(",", "")
    elif cleaned.count(",") > 1:
        cleaned = cleaned.replace(",", "")
    elif cleaned.count(".") > 1:
        cleaned = cleaned.replace(".", "")
    else:
        cleaned = cleaned.replace(",", ".")

    try:
        return f"{currency}{float(cleaned):.2f}"
    except ValueError:
        print(f"Error: Unable to format total value '{total}'.")
        return None
def update_document_custom_field(document_id, value, field_id=1, timeout=30):
    """
    Update or add a document's custom field using the Paperless-ngx API.

    The document is fetched first so that its other custom fields are sent
    back unchanged in the PATCH payload alongside the new value.

    Args:
        document_id (int): The ID of the document to update.
        value (str): The value to set for the custom field.
        field_id (int): Primary key of the custom field to set (default 1).
        timeout (float): Seconds to wait for each HTTP request.

    Returns:
        bool: True if the update was successful, False otherwise. Errors are
        printed, not raised.
    """
    url = f"{PAPERLESS_API_URL}/documents/{document_id}/"
    try:
        # Get the current document details
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: Unable to fetch document {document_id} for update. Status: {response.status_code}")
            return False

        # "or []" also covers a custom_fields key that is present but null.
        custom_fields = response.json().get("custom_fields") or []

        existing = [entry for entry in custom_fields if entry.get("field") == field_id]
        if existing:
            for entry in existing:
                entry["value"] = value
        else:
            custom_fields.append({"field": field_id, "value": value})

        response = requests.patch(
            url,
            headers=HEADERS,
            json={"custom_fields": custom_fields},
            timeout=timeout,
        )
        if response.status_code == 200:
            print(f"Document {document_id} updated successfully with custom field.")
            return True

        print(f"Failed to update document {document_id}. Status: {response.status_code}, Response: {response.text}")
        return False
    except Exception as e:
        # Deliberately broad: this function reports failure through its
        # return value and must not raise into the caller.
        print(f"Error updating document {document_id}: {e}")
        return False
# Regular expressions that locate the invoice total, one per known layout.
# Each captures the amount in group 1. Order matters: the list is meant to be
# scanned front to back with the first hit winning, so vendor-specific
# patterns come before the generic fallback.
#
# NOTE(review): in the BOLT pattern "EUR?" makes only the "R" optional, so the
# text must contain at least "EU" before the amount — confirm that is intended.
# NOTE(review): the second "total a pagar" pattern matches a subset of what the
# one before it matches, so with first-hit-wins scanning it is never reached.
patterns = [
    r"total due\s+\$([\d.,]+)",  # DigitalOcean (specific)
    r"total\s+\$([\d.,]+)\s*(USD)?",  # Cloudflare (optional "USD")
    r"total da fatura\s+EUR?\s*([\d.,]+)",  # BOLT
    r"total a pagar\s+([\d.,]+)\s*€?",  # Endesa/Continente
    r"total a pagar\s+([\d.,]+)",  # Generic for "total a pagar"
    r"total\s+€\s*([\d.,]+)",  # VENDUS invoices
    r"total\s+([\d.,]+)",  # Generic fallback
]
def main():
    """
    Main entry point for the post-consumption script.

    Resolves the document ID (first command-line argument, else the
    DOCUMENT_ID environment variable), fetches the document, and — for
    documents of type 2 only — extracts the invoice total from the text and
    stores it in the document's custom field. Every failure is reported on
    stdout and ends the run without raising.
    """
    # Indexing sys.argv[1] directly raises IndexError when no argument is
    # given, which would make the environment fallback unreachable.
    document_id = sys.argv[1] if len(sys.argv) > 1 else os.getenv('DOCUMENT_ID')
    if not document_id:
        print("Error: No document ID provided (pass it as the first argument or set DOCUMENT_ID).")
        return

    document = get_document(document_id)
    if document is None:
        print(f"Failed to retrieve text for document {document_id}.")
        return

    # Only document type 2 is processed.
    # NOTE(review): presumably the ID of the "invoice" document type on the
    # author's instance — confirm for your installation.
    if document.get('document_type') != 2:
        return

    total, currency = extract_invoice_total(document.get('content') or "", patterns)
    print(total)
    if total is None:
        return

    formatted_total = format_total_value(total, currency)
    if formatted_total is None:
        # Formatting already reported the problem; do not write a null value.
        return

    update_document_custom_field(document_id, formatted_total)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment