Created
June 3, 2025 13:48
-
-
Save up1/8b74930a0ac9fd190e4c769e77cd88d7 to your computer and use it in GitHub Desktop.
Mistral OCR with PDF file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64
import os
import sys

from mistralai import Mistral
def encode_pdf(pdf_path):
    """Return the contents of the file at *pdf_path* as base64 text.

    The file is opened in binary mode and read in full, so the whole
    document is held in memory while it is being encoded.

    Args:
        pdf_path: Path of the file to read.

    Returns:
        The base64 encoding of the file's bytes as a ``str``, or ``None``
        if the file could not be read or encoded. A message describing the
        failure is written to standard error in that case.
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            return base64.b64encode(pdf_file.read()).decode("utf-8")
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.", file=sys.stderr)
        return None
    except Exception as e:
        # Deliberately broad: this helper is best-effort and reports every
        # failure to the caller through the ``None`` return value.
        print(f"Error: {e}", file=sys.stderr)
        return None
def create_markdown_file(ocr_response, output_filename = "output.md"):
    """Write the markdown of every page in *ocr_response* to a file.

    Args:
        ocr_response: Object exposing a ``pages`` iterable whose items each
            have a ``markdown`` string attribute.
        output_filename: Path of the file to create or overwrite.

    The pages are written in order, separated by a blank line so that the
    last line of one page is not fused with the first line of the next.
    """
    # Gather the text before opening the file, so a malformed response does
    # not truncate an existing output file.
    page_texts = [page.markdown for page in ocr_response.pages]

    # Explicit UTF-8: OCR output can contain characters outside the platform
    # default encoding, which would otherwise raise UnicodeEncodeError.
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write("\n\n".join(page_texts))
def main():
    """Run OCR on ``doc-scan.pdf`` and save the result as markdown.

    Reads the API key from the ``MISTRAL_API_KEY`` environment variable,
    sends the base64-encoded PDF to the ``mistral-ocr-latest`` model as a
    data URL, and passes the response to ``create_markdown_file``.

    Returns:
        ``0`` on success, ``1`` if the API key is missing or empty or the
        PDF could not be encoded.
    """
    # Single lookup; an empty value is treated the same as an unset one.
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        print(
            "Error: MISTRAL_API_KEY environment variable is not set.",
            file=sys.stderr,
        )
        return 1

    # Path to your pdf
    pdf_path = "doc-scan.pdf"

    # API client initialization
    client = Mistral(api_key=api_key)

    # Getting the base64 string; encode_pdf returns None on failure
    base64_pdf = encode_pdf(pdf_path)
    if base64_pdf is None:
        print("Error: Failed to encode the PDF file.", file=sys.stderr)
        return 1

    # Process the OCR request, passing the PDF inline as a data URL
    # NOTE(review): include_image_base64=True presumably asks for embedded
    # image data, but create_markdown_file only writes page.markdown, so
    # that data is not saved anywhere - confirm the flag is needed.
    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{base64_pdf}",
        },
        include_image_base64=True,
    )

    # Write the OCR response to the markdown file
    create_markdown_file(ocr_response)
    print("OCR processing complete. Markdown file created.")
    return 0


if __name__ == "__main__":
    # sys.exit rather than the site-provided exit(), which is not available
    # when Python runs without the site module.
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment