Skip to content

Instantly share code, notes, and snippets.

@diversen
Last active February 11, 2026 13:15
Show Gist options
  • Select an option

  • Save diversen/88605dabf9608ab66268af363a8b838a to your computer and use it in GitHub Desktop.

Select an option

Save diversen/88605dabf9608ab66268af363a8b838a to your computer and use it in GitHub Desktop.
Mistral OCR
#!/usr/bin/env -S uv run --script --no-project
# /// script
# requires-python = "==3.11.*"
# dependencies = [
# "mistralai>=1.0.0",
# ]
# ///
#
# usage: mistral-ocr.py input.jpg output-dir/
import argparse
import base64
import json
import os
from pathlib import Path
from mistralai import Mistral
from typing import Literal
def mime_type_for_image(path: Path) -> str:
    """Return the MIME type implied by the file extension of *path*.

    The extension is compared case-insensitively. Only the extensions listed
    in the table below are recognised; anything else (including a missing
    extension) yields ``"application/octet-stream"``.
    """
    mime_by_suffix = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".tif": "image/tiff",
        ".tiff": "image/tiff",
        ".bmp": "image/bmp",
    }
    return mime_by_suffix.get(path.suffix.lower(), "application/octet-stream")
def image_data_url(path: Path) -> str:
    """Read the file at *path* and return it as a base64 ``data:`` URL.

    The MIME type in the URL comes from ``mime_type_for_image``; the payload
    is the whole file content, base64-encoded.
    """
    encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    content_type = mime_type_for_image(path)
    return f"data:{content_type};base64,{encoded}"
def ocr_image(
    client: Mistral,
    image_path: Path,
    model: str = "mistral-ocr-latest",
    table_format: Literal["markdown", "html"] = "html",
    include_image_base64: bool = True,
) -> dict:
    """Send one image through ``client.ocr.process`` and return the result as a dict.

    Args:
        client: Mistral client whose ``ocr.process`` method is called.
        image_path: Image file to upload; it is inlined as a base64 data URL.
        model: Value forwarded as ``model``.
        table_format: Value forwarded as ``table_format``.
        include_image_base64: Value forwarded as ``include_image_base64``.

    Returns:
        The response converted to plain JSON-compatible data. A response that
        offers ``model_dump_json()`` is round-tripped through JSON; a response
        that is already a ``dict`` is returned unchanged.

    Raises:
        TypeError: If the response is neither of the two supported shapes.
    """
    ocr_response = client.ocr.process(
        model=model,
        document={"type": "image_url", "image_url": image_data_url(image_path)},
        table_format=table_format,
        include_image_base64=include_image_base64,
    )

    # Check for the serialiser explicitly instead of catching AttributeError
    # around the call: an AttributeError raised *inside* serialisation is a
    # real bug and must not be reported as an "unexpected response type".
    dump_json = getattr(ocr_response, "model_dump_json", None)
    if callable(dump_json):
        return json.loads(dump_json())
    if isinstance(ocr_response, dict):
        return ocr_response
    raise TypeError("Unexpected OCR response type; cannot convert to JSON dict.")
def extract_markdown(ocr_json: dict, separator: str = "\n\n---\n\n") -> str:
    """Concatenate the per-page ``markdown`` fields of an OCR result.

    Pages whose ``markdown`` value is missing or falsy are skipped. Each kept
    page is stripped of surrounding whitespace, the pages are joined with
    *separator*, and a single trailing newline is appended. When no page
    contributes any text the result is the empty string.
    """
    page_texts = (page.get("markdown", "") for page in ocr_json.get("pages", []))
    chunks: list[str] = [text.strip() for text in page_texts if text]
    if not chunks:
        return ""
    return separator.join(chunks) + "\n"
def save_embedded_images(ocr_json: dict, output_dir: Path) -> list[Path]:
    """
    Save pages[*].images[*].image_base64 (data URL) into output_dir.

    Each image is written to output_dir / <basename of images[*].id>. Only the
    final path component of the id is used, so an id such as "../x.png" or an
    absolute path cannot place a file outside output_dir.

    Entries are skipped (not treated as errors) when the id or payload is
    missing, the payload is not a "data:<mime>;base64,<payload>" URL, the id
    has no usable file name, or the base64 payload cannot be decoded.

    output_dir is created if it does not exist.
    Returns list of written file paths, in the order they were written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []
    for page in ocr_json.get("pages", []):
        for img in page.get("images", []) or []:
            img_id = img.get("id")
            data_url = img.get("image_base64")
            if not img_id or not data_url:
                continue
            # Expect: data:<mime>;base64,<payload>
            if not data_url.startswith("data:") or ";base64," not in data_url:
                continue
            # The id comes from the service response; keep only its basename
            # so directory components cannot redirect the write elsewhere.
            file_name = Path(str(img_id)).name
            if file_name in {"", ".", ".."}:
                continue
            _, b64_payload = data_url.split(";base64,", 1)
            try:
                raw = base64.b64decode(b64_payload)
            except ValueError:
                # binascii.Error is a ValueError subclass; a corrupt payload
                # is skipped like the other malformed entries above.
                continue
            out_path = output_dir / file_name
            out_path.write_bytes(raw)
            written.append(out_path)
    return written
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(
description="Run Mistral OCR on an input image and write JSON, Markdown, and embedded images."
)
p.add_argument(
"image",
type=Path,
help="Path to the input image file.",
)
p.add_argument(
"out_dir",
type=Path,
default=Path("output"),
help="Output directory where ocr.json, single.md, and embedded images will be saved.",
)
p.add_argument(
"--model",
default="mistral-ocr-latest",
help='OCR model name (default: "mistral-ocr-latest").',
)
p.add_argument(
"--table-format",
default="markdown",
choices=["html", "markdown"],
help='Table extraction format (default: "markdown").',
)
p.add_argument(
"--no-embedded-images",
action="store_true",
help="Do not request embedded images in the OCR response (faster/smaller JSON).",
)
p.add_argument(
"--json-name",
default="ocr.json",
help='Filename for OCR JSON output (default: "ocr.json").',
)
p.add_argument(
"--md-name",
default="single.md",
help='Filename for Markdown output (default: "single.md").',
)
return p.parse_args()
def main() -> None:
    """Run the CLI: OCR one image and write JSON, Markdown and images.

    Reads the API key from the ``MISTRAL_API_KEY`` environment variable,
    sends the image through ``ocr_image``, then writes into the output
    directory: the raw OCR JSON, any embedded images (unless disabled), and
    the combined Markdown.

    Raises:
        RuntimeError: If ``MISTRAL_API_KEY`` is unset or empty.
        FileNotFoundError: If the input image path does not exist.
        ValueError: If the input image path is not a regular file.
    """
    args = parse_args()

    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise RuntimeError("MISTRAL_API_KEY is not set in the environment.")

    image_path: Path = args.image
    out_dir: Path = args.out_dir

    # Validate the input before touching the filesystem, so a bad invocation
    # does not leave an empty output directory behind.
    if not image_path.exists():
        raise FileNotFoundError(f"Input image not found: {image_path}")
    if not image_path.is_file():
        raise ValueError(f"Input path is not a file: {image_path}")

    out_dir.mkdir(parents=True, exist_ok=True)

    client = Mistral(api_key=api_key)
    json_out = out_dir / args.json_name
    md_out = out_dir / args.md_name
    include_images = not args.no_embedded_images

    # 1) OCR -> JSON
    ocr_json = ocr_image(
        client,
        image_path,
        model=args.model,
        table_format=args.table_format,
        include_image_base64=include_images,
    )

    # 2) Write JSON
    json_out.write_text(
        json.dumps(ocr_json, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # 3) Save embedded images next to markdown (same dir)
    saved: list[Path] = []
    if include_images:
        saved = save_embedded_images(ocr_json, output_dir=out_dir)

    # 4) Write markdown (image links like ![](img-0.jpeg) will resolve in same dir)
    md_out.write_text(extract_markdown(ocr_json), encoding="utf-8")

    print(f"Wrote {json_out}")
    print(f"Wrote {md_out}")
    if include_images:
        print(f"Saved {len(saved)} embedded images into {out_dir}/")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment