Last active
February 11, 2026 13:15
-
-
Save diversen/88605dabf9608ab66268af363a8b838a to your computer and use it in GitHub Desktop.
Mistral OCR
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env -S uv run --script --no-project | |
| # /// script | |
| # requires-python = "==3.11.*" | |
| # dependencies = [ | |
| # "mistralai>=1.0.0", | |
| # ] | |
| # /// | |
| # | |
| # usage: mistral-ocr.py input.jpg output-dir/ | |
| import argparse | |
| import base64 | |
| import json | |
| import os | |
| from pathlib import Path | |
| from mistralai import Mistral | |
| from typing import Literal | |
def mime_type_for_image(path: Path) -> str:
    """Return the MIME type implied by the file extension of *path*.

    The extension is compared case-insensitively. Any extension that is not
    listed in the table below maps to ``"application/octet-stream"``.
    """
    mime_by_extension = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".tif": "image/tiff",
        ".tiff": "image/tiff",
        ".bmp": "image/bmp",
    }
    extension = path.suffix.lower()
    return mime_by_extension.get(extension, "application/octet-stream")
def image_data_url(path: Path) -> str:
    """Read the file at *path* and return its contents as a base64 data URL.

    The result has the form ``data:<mime>;base64,<payload>``, where the MIME
    type comes from ``mime_type_for_image``.
    """
    content_type = mime_type_for_image(path)
    encoded = base64.b64encode(path.read_bytes())
    return "data:" + content_type + ";base64," + encoded.decode("ascii")
def ocr_image(
    client: Mistral,
    image_path: Path,
    model: str = "mistral-ocr-latest",
    table_format: Literal["markdown", "html"] = "html",
    include_image_base64: bool = True,
) -> dict:
    """Run OCR on one image file and return the response as a plain dict.

    The image is sent inline as a base64 data URL through
    ``client.ocr.process``. The response is converted to a dict by
    round-tripping its ``model_dump_json()`` output through ``json.loads``.

    Raises:
        TypeError: If the response has no usable ``model_dump_json`` and is
            not already a dict.
    """
    request = {
        "model": model,
        "document": {"type": "image_url", "image_url": image_data_url(image_path)},
        "table_format": table_format,
        "include_image_base64": include_image_base64,
    }
    response = client.ocr.process(**request)
    try:
        return json.loads(response.model_dump_json())
    except AttributeError:
        # No model_dump_json available: accept a dict response unchanged.
        if not isinstance(response, dict):
            raise TypeError("Unexpected OCR response type; cannot convert to JSON dict.")
        return response
def extract_markdown(ocr_json: dict, separator: str = "\n\n---\n\n") -> str:
    """Join the per-page Markdown of an OCR result into one document.

    Pages whose ``"markdown"`` value is missing or falsy are skipped. Each
    remaining page is stripped of surrounding whitespace, the pages are
    joined with *separator*, and a single trailing newline is appended.
    Returns an empty string when no page contributes any text.
    """
    chunks = [
        text.strip()
        for page in ocr_json.get("pages", [])
        if (text := page.get("markdown", ""))
    ]
    if not chunks:
        return ""
    return separator.join(chunks) + "\n"
def save_embedded_images(ocr_json: dict, output_dir: Path) -> list[Path]:
    """
    Saves pages[*].images[*].image_base64 (data URL) to output_dir / images[*].id.

    output_dir is created if it does not exist. An image entry is skipped when:
      - its id or image_base64 is missing, empty, or not a string;
      - its id is not a plain file name (it contains a directory component,
        is absolute, or is "." / ".."), so a response can never cause a write
        outside output_dir;
      - its image_base64 is not of the form data:<mime>;base64,<payload>.

    A payload that is not valid base64 is not skipped; the error raised by
    base64.b64decode propagates to the caller.

    Returns list of written file paths, in the order they were encountered.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []
    # "or []" tolerates an explicit null for "pages" / "images" in the JSON.
    for page in ocr_json.get("pages", []) or []:
        for img in page.get("images", []) or []:
            img_id = img.get("id")
            data_url = img.get("image_base64")
            if not img_id or not data_url:
                continue
            if not isinstance(img_id, str) or not isinstance(data_url, str):
                continue
            # SECURITY: the id comes from the remote response and is used as a
            # file name. Accept it only if it is a single path component, so
            # ids such as "../x" or "/etc/x" cannot escape output_dir.
            if img_id in {".", ".."} or Path(img_id).name != img_id:
                continue
            # Expect: data:<mime>;base64,<payload>
            if not data_url.startswith("data:") or ";base64," not in data_url:
                continue
            _, b64_payload = data_url.split(";base64,", 1)
            out_path = output_dir / img_id
            out_path.write_bytes(base64.b64decode(b64_payload))
            written.append(out_path)
    return written
| def parse_args() -> argparse.Namespace: | |
| p = argparse.ArgumentParser( | |
| description="Run Mistral OCR on an input image and write JSON, Markdown, and embedded images." | |
| ) | |
| p.add_argument( | |
| "image", | |
| type=Path, | |
| help="Path to the input image file.", | |
| ) | |
| p.add_argument( | |
| "out_dir", | |
| type=Path, | |
| default=Path("output"), | |
| help="Output directory where ocr.json, single.md, and embedded images will be saved.", | |
| ) | |
| p.add_argument( | |
| "--model", | |
| default="mistral-ocr-latest", | |
| help='OCR model name (default: "mistral-ocr-latest").', | |
| ) | |
| p.add_argument( | |
| "--table-format", | |
| default="markdown", | |
| choices=["html", "markdown"], | |
| help='Table extraction format (default: "markdown").', | |
| ) | |
| p.add_argument( | |
| "--no-embedded-images", | |
| action="store_true", | |
| help="Do not request embedded images in the OCR response (faster/smaller JSON).", | |
| ) | |
| p.add_argument( | |
| "--json-name", | |
| default="ocr.json", | |
| help='Filename for OCR JSON output (default: "ocr.json").', | |
| ) | |
| p.add_argument( | |
| "--md-name", | |
| default="single.md", | |
| help='Filename for Markdown output (default: "single.md").', | |
| ) | |
| return p.parse_args() | |
def main() -> None:
    """Run the OCR pipeline for the image named on the command line.

    Steps: run OCR on the input image, write the raw response as JSON, save
    any embedded images, then write the combined Markdown. All outputs go
    into the requested output directory.

    Raises:
        RuntimeError: If MISTRAL_API_KEY is unset or empty.
        FileNotFoundError: If the input image path does not exist.
        ValueError: If the input image path is not a regular file.
    """
    args = parse_args()
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise RuntimeError("MISTRAL_API_KEY is not set in the environment.")

    image_path: Path = args.image
    out_dir: Path = args.out_dir

    # Validate the input before creating anything on disk, so a bad path
    # does not leave an empty output directory behind.
    if not image_path.exists():
        raise FileNotFoundError(f"Input image not found: {image_path}")
    if not image_path.is_file():
        raise ValueError(f"Input path is not a file: {image_path}")
    out_dir.mkdir(parents=True, exist_ok=True)

    client = Mistral(api_key=api_key)
    json_out = out_dir / args.json_name
    md_out = out_dir / args.md_name
    include_images = not args.no_embedded_images

    # 1) OCR -> JSON
    ocr_json = ocr_image(
        client,
        image_path,
        model=args.model,
        table_format=args.table_format,
        include_image_base64=include_images,
    )

    # 2) Write JSON
    json_out.write_text(
        json.dumps(ocr_json, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # 3) Save embedded images next to markdown (same dir)
    saved: list[Path] = []
    if include_images:
        saved = save_embedded_images(ocr_json, output_dir=out_dir)

    # 4) Write markdown (relative image links that use an image id as the
    #    target will resolve in the same dir)
    md_out.write_text(extract_markdown(ocr_json), encoding="utf-8")

    print(f"Wrote {json_out}")
    print(f"Wrote {md_out}")
    if include_images:
        print(f"Saved {len(saved)} embedded images into {out_dir}/")
# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment