brunoamaral · February 23, 2025 20:31
diff --git a/find-total.py b/find-total.py
 #!/usr/bin/env python3

 import os
 import requests
 import sys
 import re 
 # Environment variables for Paperless API
 # Base URL of the API; a local default is used when the variable is unset.
 PAPERLESS_API_URL = os.environ.get("PAPERLESS_API_URL", "http://localhost:8000/api")
 # Credential read from SCRIPT_PAPERLESS_API_TOKEN; None when the variable is unset.
 PAPERLESS_API_TOKEN = os.environ.get("SCRIPT_PAPERLESS_API_TOKEN")
 # Header dict passed to every request made by this script.
 # NOTE(review): the value is sent with the "Basic" scheme, and becomes the
 # literal "Basic None" when the variable is unset - confirm this is the
 # scheme the server expects for this credential.
 HEADERS = dict(authorization=f"Basic {PAPERLESS_API_TOKEN}")

 def get_document(document_id, timeout=10):
 	"""
 	Fetch a document's JSON representation from the Paperless-ngx API.

 	Args:
 		document_id (int or str): ID interpolated into the documents endpoint URL.
 		timeout (float): Seconds to wait for the server before giving up, so a
 			stalled request cannot block the script indefinitely.

 	Returns:
 		The decoded JSON body on HTTP 200, otherwise None (non-200 status,
 		request failure, or a body that is not valid JSON). Failures are
 		reported with print().
 	"""
 	url = f"{PAPERLESS_API_URL}/documents/{document_id}/"
 	try:
 		response = requests.get(url, headers=HEADERS, timeout=timeout)
 		if response.status_code != 200:
 			print(f"Error: Unable to retrieve text for document {document_id}. Status code: {response.status_code}")
 			return None
 		return response.json()
 	except (requests.RequestException, ValueError) as e:
 		# RequestException covers connection errors and timeouts; ValueError
 		# covers a response body that cannot be decoded as JSON.
 		print(f"Error: {e}")
 		return None

 def extract_invoice_total(content, regex_patterns):
 	"""
 	Extract the total amount and currency from an invoice's text.

 	Patterns are tried in order and the first one that matches wins, so
 	specific patterns should be listed before generic ones.

 	Args:
 		content (str): The invoice text content.
 		regex_patterns (list of str): Regular expressions whose first capture
 			group is the amount. Matching is case-insensitive.

 	Returns:
 		tuple: (amount, currency), where amount is the raw captured string and
 		currency is "EUR", "USD" or "Unknown"; (None, None) if content is
 		empty/None or no pattern matches.
 	"""
 	if not content:
 		return None, None

 	for pattern in regex_patterns:
 		match = re.search(pattern, content, re.IGNORECASE)
 		if match is None:
 			continue
 		# Trust a currency marker inside the matched "total" text first; only
 		# fall back to the whole document when the match itself carries none.
 		# Otherwise a "$" total is misreported as EUR whenever "EUR" or a
 		# euro sign appears anywhere else in the text.
 		currency = (
 			_detect_currency(match.group(0))
 			or _detect_currency(content)
 			or "Unknown"
 		)
 		return match.group(1), currency
 	return None, None


 def _detect_currency(text):
 	"""Return "EUR" or "USD" if text contains a marker for it, else None."""
 	if "€" in text or "EUR" in text:
 		return "EUR"
 	if "$" in text or "USD" in text:
 		return "USD"
 	return None

 def format_total_value(total, currency="EUR"):
 	"""
 	Format an extracted amount as "<currency><amount with two decimals>".

 	Separator handling:
 	- both "." and "," present: whichever occurs last is the decimal
 	  separator, the other is a thousands separator ("1,234.56", "1.234,56");
 	- one kind of separator occurring several times: thousands separators
 	  ("1.234.567");
 	- a single separator: treated as the decimal separator ("12,5" -> 12.50).

 	Args:
 		total (str): The extracted total value.
 		currency (str): The currency code to prepend (default is EUR).

 	Returns:
 		str: Formatted total value (e.g., EUR123.45), or None if the value
 		cannot be interpreted as a number.
 	"""
 	if not isinstance(total, str):
 		print(f"Error: Unable to format total value '{total}'.")
 		return None

 	# A character class such as [\d.,]+ can swallow sentence punctuation that
 	# follows the number ("12.50."), so trailing separators are dropped.
 	cleaned = total.strip().rstrip(".,")
 	last_dot = cleaned.rfind(".")
 	last_comma = cleaned.rfind(",")

 	if last_dot != -1 and last_comma != -1:
 		if last_comma > last_dot:
 			cleaned = cleaned.replace(".", "").replace(",", ".")
 		else:
 			cleaned = cleaned.replace(",", "")
 	elif cleaned.count(".") > 1 or cleaned.count(",") > 1:
 		cleaned = cleaned.replace(".", "").replace(",", "")
 	else:
 		cleaned = cleaned.replace(",", ".")

 	try:
 		return f"{currency}{float(cleaned):.2f}"
 	except ValueError:
 		print(f"Error: Unable to format total value '{total}'.")
 		return None

 def update_document_custom_field(document_id, value, field_id=1, timeout=10):
 	"""
 	Update or add a document's custom field using the Paperless-ngx API.

 	The document is fetched first so that its existing custom fields are sent
 	back unchanged together with the new/updated entry.

 	Args:
 		document_id (int): The ID of the document to update.
 		value (str): The value to set for the custom field.
 		field_id (int): Primary key of the custom field to set (default 1,
 			the value previously hard-coded here).
 		timeout (float): Seconds to wait for each HTTP request.

 	Returns:
 		bool: True if the update was successful, False otherwise.
 	"""
 	url = f"{PAPERLESS_API_URL}/documents/{document_id}/"
 	try:
 		response = requests.get(url, headers=HEADERS, timeout=timeout)
 		if response.status_code != 200:
 			print(f"Error: Unable to fetch document {document_id} for update. Status: {response.status_code}")
 			return False

 		# "or []" also covers a document whose custom_fields is null.
 		custom_fields = response.json().get("custom_fields") or []

 		# Overwrite the entry if the field is already attached, else append it.
 		field_found = False
 		for entry in custom_fields:
 			if entry.get("field") == field_id:
 				entry["value"] = value
 				field_found = True
 		if not field_found:
 			custom_fields.append({"field": field_id, "value": value})

 		response = requests.patch(
 			url,
 			headers=HEADERS,
 			json={"custom_fields": custom_fields},
 			timeout=timeout,
 		)
 		if response.status_code == 200:
 			print(f"Document {document_id} updated successfully with custom field.")
 			return True

 		print(f"Failed to update document {document_id}. Status: {response.status_code}, Response: {response.text}")
 		return False
 	except (requests.RequestException, ValueError, AttributeError) as e:
 		# RequestException: network failure or timeout; ValueError: body is not
 		# valid JSON; AttributeError: JSON does not have the expected dict shape.
 		print(f"Error updating document {document_id}: {e}")
 		return False


 # Regex patterns for matching different invoice formats. Each pattern captures
 # the amount in group 1. They are tried in order and the first match wins, so
 # vendor-specific patterns must stay ahead of the generic fallback.
 patterns = [
 	r"total due\s+\$([\d.,]+)",                  # DigitalOcean (specific)
 	r"total\s+\$([\d.,]+)\s*(?:USD)?",           # Cloudflare (optional "USD")
 	# "(?:EUR)?" makes the whole currency code optional; the previous "EUR?"
 	# required "EU" and only made the final "R" optional.
 	r"total da fatura\s+(?:EUR)?\s*([\d.,]+)",   # BOLT
 	# Also covers the plain "total a pagar <amount>" case, since the euro sign
 	# is optional.
 	r"total a pagar\s+([\d.,]+)\s*€?",           # Endesa/Continente
 	r"total\s+€\s*([\d.,]+)",                    # VENDUS invoices
 	r"total\s+([\d.,]+)",                        # Generic fallback
 ]
 def main():
 	"""
 	Main entry point for the post-consumption script.

 	Resolves the document ID, fetches the document, and - when the document
 	has document type 2 - extracts its total and stores the formatted value
 	through update_document_custom_field(). Every failure path prints a
 	message and returns without raising.
 	"""
 	# The first command-line argument wins; otherwise fall back to the
 	# DOCUMENT_ID environment variable. (sys.argv[1] is never None, so the
 	# fallback was previously unreachable and a missing argument raised
 	# IndexError.)
 	document_id = sys.argv[1] if len(sys.argv) > 1 else os.getenv('DOCUMENT_ID')
 	if not document_id:
 		print("Error: No document ID given (pass it as an argument or set DOCUMENT_ID).")
 		return

 	document = get_document(document_id)
 	if document is None:
 		print(f"Failed to retrieve text for document {document_id}.")
 		return

 	# NOTE(review): 2 is presumably the primary key of the "invoice" document
 	# type on the author's instance - confirm before reusing elsewhere.
 	if document.get('document_type') != 2:
 		print(f"Document {document_id} is not of document type 2; nothing to do.")
 		return

 	total, currency = extract_invoice_total(document.get('content') or "", patterns)
 	print(total)
 	if total is None:
 		return

 	formatted_total = format_total_value(total, currency)
 	if formatted_total is None:
 		# Formatting already reported the problem; do not overwrite the field
 		# with an empty value.
 		return
 	update_document_custom_field(document_id, formatted_total)


 if __name__ == "__main__":
 	main()
	#!/usr/bin/env python3

	import os
	import requests
	import sys
	import re
	# Connection settings for the Paperless API, read from the environment.
	# Base URL of the API; a local default is used when the variable is unset.
	PAPERLESS_API_URL = os.environ.get("PAPERLESS_API_URL", "http://localhost:8000/api")
	# Credential read from SCRIPT_PAPERLESS_API_TOKEN; None when the variable is unset.
	PAPERLESS_API_TOKEN = os.environ.get("SCRIPT_PAPERLESS_API_TOKEN")
	# Header dict passed to every request made by this script.
	# NOTE(review): the value is sent with the "Basic" scheme, and becomes the
	# literal "Basic None" when the variable is unset - confirm this is the
	# scheme the server expects for this credential.
	HEADERS = dict(authorization=f"Basic {PAPERLESS_API_TOKEN}")

	def get_document(document_id, timeout=10):
		"""
		Fetch a document's JSON representation from the Paperless-ngx API.

		Args:
			document_id (int or str): ID interpolated into the documents endpoint URL.
			timeout (float): Seconds to wait for the server before giving up, so a
				stalled request cannot block the script indefinitely.

		Returns:
			The decoded JSON body on HTTP 200, otherwise None (non-200 status,
			request failure, or a body that is not valid JSON). Failures are
			reported with print().
		"""
		url = f"{PAPERLESS_API_URL}/documents/{document_id}/"
		try:
			response = requests.get(url, headers=HEADERS, timeout=timeout)
			if response.status_code != 200:
				print(f"Error: Unable to retrieve text for document {document_id}. Status code: {response.status_code}")
				return None
			return response.json()
		except (requests.RequestException, ValueError) as e:
			# RequestException covers connection errors and timeouts; ValueError
			# covers a response body that cannot be decoded as JSON.
			print(f"Error: {e}")
			return None

	def extract_invoice_total(content, regex_patterns):
		"""
		Extract the total amount and currency from an invoice's text.

		Patterns are tried in order and the first one that matches wins, so
		specific patterns should be listed before generic ones.

		Args:
			content (str): The invoice text content.
			regex_patterns (list of str): Regular expressions whose first capture
				group is the amount. Matching is case-insensitive.

		Returns:
			tuple: (amount, currency), where amount is the raw captured string and
			currency is "EUR", "USD" or "Unknown"; (None, None) if content is
			empty/None or no pattern matches.
		"""
		if not content:
			return None, None

		for pattern in regex_patterns:
			match = re.search(pattern, content, re.IGNORECASE)
			if match is None:
				continue
			# Trust a currency marker inside the matched "total" text first; only
			# fall back to the whole document when the match itself carries none.
			# Otherwise a "$" total is misreported as EUR whenever "EUR" or a
			# euro sign appears anywhere else in the text.
			currency = (
				_detect_currency(match.group(0))
				or _detect_currency(content)
				or "Unknown"
			)
			return match.group(1), currency
		return None, None


	def _detect_currency(text):
		"""Return "EUR" or "USD" if text contains a marker for it, else None."""
		if "€" in text or "EUR" in text:
			return "EUR"
		if "$" in text or "USD" in text:
			return "USD"
		return None

	def format_total_value(total, currency="EUR"):
		"""
		Format an extracted amount as "<currency><amount with two decimals>".

		Separator handling:
		- both "." and "," present: whichever occurs last is the decimal
		  separator, the other is a thousands separator ("1,234.56", "1.234,56");
		- one kind of separator occurring several times: thousands separators
		  ("1.234.567");
		- a single separator: treated as the decimal separator ("12,5" -> 12.50).

		Args:
			total (str): The extracted total value.
			currency (str): The currency code to prepend (default is EUR).

		Returns:
			str: Formatted total value (e.g., EUR123.45), or None if the value
			cannot be interpreted as a number.
		"""
		if not isinstance(total, str):
			print(f"Error: Unable to format total value '{total}'.")
			return None

		# A character class such as [\d.,]+ can swallow sentence punctuation that
		# follows the number ("12.50."), so trailing separators are dropped.
		cleaned = total.strip().rstrip(".,")
		last_dot = cleaned.rfind(".")
		last_comma = cleaned.rfind(",")

		if last_dot != -1 and last_comma != -1:
			if last_comma > last_dot:
				cleaned = cleaned.replace(".", "").replace(",", ".")
			else:
				cleaned = cleaned.replace(",", "")
		elif cleaned.count(".") > 1 or cleaned.count(",") > 1:
			cleaned = cleaned.replace(".", "").replace(",", "")
		else:
			cleaned = cleaned.replace(",", ".")

		try:
			return f"{currency}{float(cleaned):.2f}"
		except ValueError:
			print(f"Error: Unable to format total value '{total}'.")
			return None

	def update_document_custom_field(document_id, value, field_id=1, timeout=10):
		"""
		Update or add a document's custom field using the Paperless-ngx API.

		The document is fetched first so that its existing custom fields are sent
		back unchanged together with the new/updated entry.

		Args:
			document_id (int): The ID of the document to update.
			value (str): The value to set for the custom field.
			field_id (int): Primary key of the custom field to set (default 1,
				the value previously hard-coded here).
			timeout (float): Seconds to wait for each HTTP request.

		Returns:
			bool: True if the update was successful, False otherwise.
		"""
		url = f"{PAPERLESS_API_URL}/documents/{document_id}/"
		try:
			response = requests.get(url, headers=HEADERS, timeout=timeout)
			if response.status_code != 200:
				print(f"Error: Unable to fetch document {document_id} for update. Status: {response.status_code}")
				return False

			# "or []" also covers a document whose custom_fields is null.
			custom_fields = response.json().get("custom_fields") or []

			# Overwrite the entry if the field is already attached, else append it.
			field_found = False
			for entry in custom_fields:
				if entry.get("field") == field_id:
					entry["value"] = value
					field_found = True
			if not field_found:
				custom_fields.append({"field": field_id, "value": value})

			response = requests.patch(
				url,
				headers=HEADERS,
				json={"custom_fields": custom_fields},
				timeout=timeout,
			)
			if response.status_code == 200:
				print(f"Document {document_id} updated successfully with custom field.")
				return True

			print(f"Failed to update document {document_id}. Status: {response.status_code}, Response: {response.text}")
			return False
		except (requests.RequestException, ValueError, AttributeError) as e:
			# RequestException: network failure or timeout; ValueError: body is not
			# valid JSON; AttributeError: JSON does not have the expected dict shape.
			print(f"Error updating document {document_id}: {e}")
			return False


	# Regex patterns for matching different invoice formats. Each pattern captures
	# the amount in group 1. They are tried in order and the first match wins, so
	# vendor-specific patterns must stay ahead of the generic fallback.
	patterns = [
		r"total due\s+\$([\d.,]+)",                  # DigitalOcean (specific)
		r"total\s+\$([\d.,]+)\s*(?:USD)?",           # Cloudflare (optional "USD")
		# "(?:EUR)?" makes the whole currency code optional; the previous "EUR?"
		# required "EU" and only made the final "R" optional.
		r"total da fatura\s+(?:EUR)?\s*([\d.,]+)",   # BOLT
		# Also covers the plain "total a pagar <amount>" case, since the euro sign
		# is optional.
		r"total a pagar\s+([\d.,]+)\s*€?",           # Endesa/Continente
		r"total\s+€\s*([\d.,]+)",                    # VENDUS invoices
		r"total\s+([\d.,]+)",                        # Generic fallback
	]
	def main():
		"""
		Main entry point for the post-consumption script.

		Resolves the document ID, fetches the document, and - when the document
		has document type 2 - extracts its total and stores the formatted value
		through update_document_custom_field(). Every failure path prints a
		message and returns without raising.
		"""
		# The first command-line argument wins; otherwise fall back to the
		# DOCUMENT_ID environment variable. (sys.argv[1] is never None, so the
		# fallback was previously unreachable and a missing argument raised
		# IndexError.)
		document_id = sys.argv[1] if len(sys.argv) > 1 else os.getenv('DOCUMENT_ID')
		if not document_id:
			print("Error: No document ID given (pass it as an argument or set DOCUMENT_ID).")
			return

		document = get_document(document_id)
		if document is None:
			print(f"Failed to retrieve text for document {document_id}.")
			return

		# NOTE(review): 2 is presumably the primary key of the "invoice" document
		# type on the author's instance - confirm before reusing elsewhere.
		if document.get('document_type') != 2:
			print(f"Document {document_id} is not of document type 2; nothing to do.")
			return

		total, currency = extract_invoice_total(document.get('content') or "", patterns)
		print(total)
		if total is None:
			return

		formatted_total = format_total_value(total, currency)
		if formatted_total is None:
			# Formatting already reported the problem; do not overwrite the field
			# with an empty value.
			return
		update_document_custom_field(document_id, formatted_total)


	if __name__ == "__main__":
		main()