Skip to content

Instantly share code, notes, and snippets.

@brunoamaral
Last active February 23, 2025 20:31
Show Gist options
  • Save brunoamaral/258e73a488a75c8fb60a16b99119933b to your computer and use it in GitHub Desktop.
Save brunoamaral/258e73a488a75c8fb60a16b99119933b to your computer and use it in GitHub Desktop.
paperless-ngx post-consume script that fills in a custom field with the total of an invoice
#!/usr/bin/env python3
import os
import requests
import sys
import re
# Connection settings for the Paperless-ngx API, taken from the environment.
# The base URL falls back to a local instance; the token has no fallback and
# is None when SCRIPT_PAPERLESS_API_TOKEN is unset.
PAPERLESS_API_URL = os.environ.get("PAPERLESS_API_URL", "http://localhost:8000/api")
PAPERLESS_API_TOKEN = os.environ.get("SCRIPT_PAPERLESS_API_TOKEN")

# Header set sent with every API request.
# NOTE(review): the "Basic" scheme normally carries base64 "user:password"
# credentials — confirm that SCRIPT_PAPERLESS_API_TOKEN holds that form.
HEADERS = dict(authorization=f"Basic {PAPERLESS_API_TOKEN}")
def get_document(document_id, timeout=30):
    """
    Fetch a document record from the Paperless-ngx API.

    Args:
        document_id (int | str): ID of the document to fetch.
        timeout (float): Seconds to wait for the server before giving up.
    Returns:
        dict | None: The decoded JSON body of the response when the server
        answers with HTTP 200; None when the request fails, the status is
        anything else, or the body is not valid JSON.
    """
    url = f"{PAPERLESS_API_URL}/documents/{document_id}/"
    try:
        # Without a timeout a stalled server would block the hook forever.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: Unable to retrieve text for document {document_id}. Status code: {response.status_code}")
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout problems; ValueError
        # covers a body that cannot be decoded as JSON.
        print(f"Error: {e}")
        return None
def _detect_currency(text):
    """Return "EUR" or "USD" if *text* carries one of their markers, else None."""
    if "€" in text or "EUR" in text:
        return "EUR"
    if "$" in text or "USD" in text:
        return "USD"
    return None


def extract_invoice_total(content, regex_patterns):
    """
    Extract the total amount and currency from the content field using a list of regular expressions.

    Patterns are tried in order (case-insensitively) and the first one that
    yields an amount containing at least one digit wins.

    Args:
        content (str): The invoice text content.
        regex_patterns (list of str): List of regular expressions to match total amounts.
            Each pattern must capture the amount in group 1.
    Returns:
        tuple: A tuple containing the matched total amount (str) and currency (str)
        or (None, None) if no match is found. The currency is "EUR", "USD" or
        "Unknown".
    """
    if not content:
        return None, None
    for pattern in regex_patterns:
        match = re.search(pattern, content, re.IGNORECASE)
        if not match:
            continue
        # The character class used for amounts also swallows a sentence-ending
        # "." or ",", which would later make the value unparsable.
        amount = match.group(1).rstrip(".,")
        if not any(ch.isdigit() for ch in amount):
            # Only punctuation was captured; let a later pattern try.
            continue
        # Prefer the currency marker next to the matched total. The match was
        # case-insensitive, so upper-case it before looking for "EUR"/"USD".
        # Only fall back to the whole document when the matched text has no
        # marker; otherwise a stray "EUR" elsewhere would override a "$" total.
        currency = (
            _detect_currency(match.group(0).upper())
            or _detect_currency(content)
            or "Unknown"
        )
        return amount, currency
    return None, None
def _parse_amount(raw):
    """Convert an amount string with "," / "." separators to a float.

    Raises ValueError when the text cannot be read as a number.
    """
    # Drop every whitespace character and any trailing separator.
    text = "".join(raw.split()).rstrip(".,")
    last_comma = text.rfind(",")
    last_dot = text.rfind(".")
    if last_comma != -1 and last_dot != -1:
        # Both separators present: the right-most one is the decimal mark,
        # the other one groups thousands ("1.234,56" or "1,234.56").
        decimal_sep, group_sep = (",", ".") if last_comma > last_dot else (".", ",")
        integer_part, _, fraction = text.rpartition(decimal_sep)
        text = integer_part.replace(group_sep, "") + "." + fraction
    elif text.count(",") > 1 or text.count(".") > 1:
        # One separator repeated can only be thousands grouping, and then
        # every group after the first must have exactly three digits.
        separator = "," if "," in text else "."
        groups = text.split(separator)
        if any(len(group) != 3 for group in groups[1:]):
            raise ValueError(f"ambiguous amount: {raw!r}")
        text = "".join(groups)
    else:
        # A single separator is read as the decimal mark.
        text = text.replace(",", ".")
    return float(text)


def format_total_value(total, currency="EUR"):
    """
    Format the total amount to comply with Paperless-ngx requirements.

    Args:
        total (str): The extracted total value. May use "," or "." as decimal
            mark and the other one as thousands separator.
        currency (str): The currency code to prepend (default is EUR). It is
            only prepended when it is a three-letter upper-case ASCII code;
            any other value (for example "Unknown" or None) is omitted.
    Returns:
        str: Formatted total value (e.g., EUR123.45), or None when the total
        cannot be parsed as a number.
    """
    try:
        amount = _parse_amount(total)
    except (ValueError, AttributeError, TypeError):
        print(f"Error: Unable to format total value '{total}'.")
        return None
    # NOTE(review): Paperless-ngx monetary fields are understood to accept an
    # optional three-letter currency code followed by the number — confirm
    # against the server version in use.
    has_code = (
        isinstance(currency, str)
        and len(currency) == 3
        and currency.isascii()
        and currency.isalpha()
        and currency.isupper()
    )
    prefix = currency if has_code else ""
    return f"{prefix}{amount:.2f}"
def update_document_custom_field(document_id, value, field_id=1, timeout=30):
    """
    Update or add a document's custom field using the Paperless-ngx API.

    The document is fetched first so that its existing custom fields are
    sent back unchanged alongside the new or updated one.

    Args:
        document_id (int): The ID of the document to update.
        value (str): The value to set for the custom field.
        field_id (int): Primary key of the custom field to set (default 1).
        timeout (float): Seconds to wait for each HTTP request.
    Returns:
        bool: True if the update was successful, False otherwise.
    """
    url = f"{PAPERLESS_API_URL}/documents/{document_id}/"
    try:
        # Get the current document details
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: Unable to fetch document {document_id} for update. Status: {response.status_code}")
            return False

        # Treat a missing or null "custom_fields" entry as an empty list.
        custom_fields = response.json().get("custom_fields") or []
        existing = [entry for entry in custom_fields if entry.get("field") == field_id]
        if existing:
            # Update existing field
            for entry in existing:
                entry["value"] = value
        else:
            # Add new field
            custom_fields.append({"field": field_id, "value": value})

        # Send the PATCH request with the complete custom field list
        response = requests.patch(
            url,
            headers=HEADERS,
            json={"custom_fields": custom_fields},
            timeout=timeout,
        )
        if response.status_code == 200:
            print(f"Document {document_id} updated successfully with custom field.")
            return True
        print(f"Failed to update document {document_id}. Status: {response.status_code}, Response: {response.text}")
        return False
    except (requests.RequestException, ValueError, AttributeError) as e:
        # Network failures, undecodable JSON, or a body that is not a JSON
        # object are all reported the same way.
        print(f"Error updating document {document_id}: {e}")
        return False
# List of regex patterns for matching different invoice formats.
# They are tried in order, so specific formats come before generic ones.
# Each pattern captures the amount in group 1 and starts with a word boundary
# so that "Subtotal ..." lines are not mistaken for the total.
patterns = [
    r"\btotal due\s+\$([\d.,]+)",  # DigitalOcean (specific)
    r"\btotal\s+\$([\d.,]+)\s*(USD)?",  # Cloudflare (optional "USD")
    r"\btotal da fatura\s+(?:EUR)?\s*([\d.,]+)",  # BOLT (optional "EUR")
    r"\btotal a pagar\s+([\d.,]+)\s*€?",  # Endesa/Continente
    r"\btotal a pagar\s+([\d.,]+)",  # Generic for "total a pagar"
    r"\btotal\s+€\s*([\d.,]+)",  # VENDUS invoices
    r"\btotal\s+([\d.,]+)",  # Generic fallback
]
def main():
    """
    Main entry point for the post-consumption script.

    Reads the document ID from the first command-line argument, falling back
    to the DOCUMENT_ID environment variable. If the document has document
    type 2, extracts the invoice total from its content and stores the
    formatted value in the document's custom field.
    """
    # Indexing sys.argv[1] directly raises IndexError when no argument is
    # given, which would make the environment fallback unreachable.
    document_id = sys.argv[1] if len(sys.argv) > 1 else os.getenv("DOCUMENT_ID")
    if not document_id:
        print("Error: No document ID provided (argument or DOCUMENT_ID environment variable).")
        return

    document = get_document(document_id)
    if document is None:
        print(f"Failed to retrieve text for document {document_id}.")
        return

    # Only documents of type 2 are processed.
    # NOTE(review): presumably 2 is the "invoice" document type on the
    # author's instance — confirm the ID for your installation.
    if document.get("document_type") != 2:
        return

    total, currency = extract_invoice_total(document.get("content") or "", patterns)
    print(total)
    if total is None:
        return

    formatted_total = format_total_value(total, currency)
    if formatted_total is None:
        # Formatting already reported the problem; do not write None to the field.
        return
    update_document_custom_field(document_id, formatted_total)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment