diversen · February 11, 2026 13:15
diff --git a/mistral-ocr.py b/mistral-ocr.py
 #!/usr/bin/env -S uv run --script --no-project
 # /// script
 # requires-python = "==3.11.*"
 # dependencies = [
 #   "mistralai>=1.0.0",
 # ]
 # ///
 #
 # usage: mistral-ocr.py input.jpg output-dir/


 import argparse
 import base64
 import json
 import os
 from pathlib import Path

 from mistralai import Mistral
 from typing import Literal


 def mime_type_for_image(path: Path) -> str:
    """Return the MIME type implied by the file extension of *path*.

    The extension is compared case-insensitively. Any extension that is not
    one of the image types listed below yields ``application/octet-stream``.
    """
    # Extension -> MIME type for the image formats this script recognises.
    known_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".tif": "image/tiff",
        ".tiff": "image/tiff",
        ".bmp": "image/bmp",
    }
    extension = path.suffix.lower()
    return known_types.get(extension, "application/octet-stream")


 def image_data_url(path: Path) -> str:
    """Encode the file at *path* as a base64 ``data:`` URL.

    The MIME type is taken from ``mime_type_for_image``; the whole file is
    read into memory and base64-encoded.
    """
    content_type = mime_type_for_image(path)
    payload = base64.b64encode(path.read_bytes())
    return "data:" + content_type + ";base64," + payload.decode("ascii")


 def ocr_image(
    client: Mistral,
    image_path: Path,
    model: str = "mistral-ocr-latest",
    table_format: Literal["markdown", "html"] = "html",
    include_image_base64: bool = True,
 ) -> dict:
    """Run OCR on a single image file and return the response as a dict.

    The image is sent inline as a base64 data URL (built by
    ``image_data_url``) through ``client.ocr.process``.

    Args:
        client: Mistral client used to issue the OCR request.
        image_path: Path of the image file to upload.
        model: OCR model name passed through to the API.
        table_format: Passed through as ``table_format``.
        include_image_base64: Passed through as ``include_image_base64``.

    Returns:
        The OCR response as plain JSON-compatible data.

    Raises:
        TypeError: If the response neither exposes ``model_dump_json`` nor
            is a ``dict``.
    """
    data_url = image_data_url(image_path)

    ocr_response = client.ocr.process(
        model=model,
        document={"type": "image_url", "image_url": data_url},
        table_format=table_format,
        include_image_base64=include_image_base64,
    )

    # Look the serialiser up explicitly instead of wrapping the call in
    # ``except AttributeError``: an AttributeError raised *inside*
    # model_dump_json() must propagate rather than be misreported as an
    # unexpected response type.
    # NOTE(review): presumably the SDK returns a pydantic model here - confirm.
    dump_json = getattr(ocr_response, "model_dump_json", None)
    if dump_json is not None:
        return json.loads(dump_json())
    if isinstance(ocr_response, dict):
        return ocr_response
    raise TypeError("Unexpected OCR response type; cannot convert to JSON dict.")


 def extract_markdown(ocr_json: dict, separator: str = "\n\n---\n\n") -> str:
    """Join the markdown of every page of an OCR result into one document.

    Args:
        ocr_json: OCR result; ``ocr_json["pages"][i]["markdown"]`` is read.
        separator: Text inserted between consecutive pages.

    Returns:
        The stripped page texts joined by *separator* and terminated by a
        single newline, or ``""`` when no page has any text. Pages whose
        markdown is missing, ``None``, or whitespace-only are skipped so
        they do not produce empty sections.
    """
    # ``or []`` also covers an explicit ``"pages": null`` in the JSON.
    pages = ocr_json.get("pages") or []
    # Strip before filtering so whitespace-only pages are dropped too.
    stripped = ((page.get("markdown") or "").strip() for page in pages)
    parts = [text for text in stripped if text]
    return (separator.join(parts) + "\n") if parts else ""


 def save_embedded_images(ocr_json: dict, output_dir: Path) -> list[Path]:
    """
    Saves pages[*].images[*].image_base64 (data URL) into output_dir.

    Each image is written to output_dir / <final path component of
    images[*].id>. Only the final component of the id is used so that an id
    containing directory parts (e.g. "../x" or an absolute path) cannot
    write outside output_dir. Entries with a missing id or payload, a value
    that is not a "data:<mime>;base64,<payload>" URL, or an id with no
    usable file name are skipped.

    Returns list of written file paths, in the order they were written.
    An invalid base64 payload propagates the error from base64.b64decode.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []

    # ``or []`` also covers an explicit null for "pages" / "images".
    for page in ocr_json.get("pages") or []:
        for img in page.get("images") or []:
            img_id = img.get("id")
            data_url = img.get("image_base64")
            if not img_id or not data_url:
                continue

            # Expect: data:<mime>;base64,<payload>
            if not data_url.startswith("data:"):
                continue
            _, marker, b64_payload = data_url.partition(";base64,")
            if not marker:
                continue

            # Keep only the file-name part of the id (no directory escape).
            file_name = Path(img_id).name
            if file_name in {"", ".", ".."}:
                continue

            out_path = output_dir / file_name
            out_path.write_bytes(base64.b64decode(b64_payload))
            written.append(out_path)

    return written


 def parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(
        description="Run Mistral OCR on an input image and write JSON, Markdown, and embedded images."
    )
    p.add_argument(
        "image",
        type=Path,
        help="Path to the input image file.",
    )
    p.add_argument(
        "out_dir",
        type=Path,
        default=Path("output"),
        help="Output directory where ocr.json, single.md, and embedded images will be saved.",
    )
    p.add_argument(
        "--model",
        default="mistral-ocr-latest",
        help='OCR model name (default: "mistral-ocr-latest").',
    )
    p.add_argument(
        "--table-format",
        default="markdown",
        choices=["html", "markdown"],
        help='Table extraction format (default: "markdown").',
    )
    p.add_argument(
        "--no-embedded-images",
        action="store_true",
        help="Do not request embedded images in the OCR response (faster/smaller JSON).",
    )
    p.add_argument(
        "--json-name",
        default="ocr.json",
        help='Filename for OCR JSON output (default: "ocr.json").',
    )
    p.add_argument(
        "--md-name",
        default="single.md",
        help='Filename for Markdown output (default: "single.md").',
    )
    return p.parse_args()


 def main() -> None:
    """Command-line entry point: OCR one image and write the results.

    Parses the arguments, requires ``MISTRAL_API_KEY``, validates the input
    path, then writes the OCR JSON, any embedded images (unless
    ``--no-embedded-images`` was given) and the combined Markdown into the
    output directory, printing a short summary.

    Raises:
        RuntimeError: ``MISTRAL_API_KEY`` is unset or empty.
        FileNotFoundError: The input path does not exist.
        ValueError: The input path exists but is not a file.
    """
    args = parse_args()

    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise RuntimeError("MISTRAL_API_KEY is not set in the environment.")

    image_path: Path = args.image
    out_dir: Path = args.out_dir

    # Validate the input before creating anything on disk, so a bad input
    # path does not leave an empty output directory behind.
    if not image_path.exists():
        raise FileNotFoundError(f"Input image not found: {image_path}")
    if not image_path.is_file():
        raise ValueError(f"Input path is not a file: {image_path}")

    out_dir.mkdir(parents=True, exist_ok=True)

    client = Mistral(api_key=api_key)

    json_out = out_dir / args.json_name
    md_out = out_dir / args.md_name

    include_images = not args.no_embedded_images

    # 1) OCR -> JSON
    ocr_json = ocr_image(
        client,
        image_path,
        model=args.model,
        table_format=args.table_format,
        include_image_base64=include_images,
    )

    # 2) Write JSON
    json_out.write_text(
        json.dumps(ocr_json, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # 3) Save embedded images next to markdown (same dir)
    saved: list[Path] = []
    if include_images:
        saved = save_embedded_images(ocr_json, output_dir=out_dir)

    # 4) Write markdown (image links like ![](img-0.jpeg) will resolve in same dir)
    md_out.write_text(extract_markdown(ocr_json), encoding="utf-8")

    print(f"Wrote {json_out}")
    print(f"Wrote {md_out}")
    if include_images:
        print(f"Saved {len(saved)} embedded images into {out_dir}/")


 # Run the CLI only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env -S uv run --script --no-project
	# /// script
	# requires-python = "==3.11.*"
	# dependencies = [
	# "mistralai>=1.0.0",
	# ]
	# ///
	#
	# usage: mistral-ocr.py input.jpg output-dir/


	import argparse
	import base64
	import json
	import os
	from pathlib import Path

	from mistralai import Mistral
	from typing import Literal


	def mime_type_for_image(path: Path) -> str:
	    """Return the MIME type implied by the file extension of *path*.

	    The extension is compared case-insensitively. Any extension that is
	    not one of the image types listed below yields
	    ``application/octet-stream``.
	    """
	    # Extension -> MIME type for the image formats this script recognises.
	    known_types = {
	        ".jpg": "image/jpeg",
	        ".jpeg": "image/jpeg",
	        ".png": "image/png",
	        ".gif": "image/gif",
	        ".webp": "image/webp",
	        ".tif": "image/tiff",
	        ".tiff": "image/tiff",
	        ".bmp": "image/bmp",
	    }
	    return known_types.get(path.suffix.lower(), "application/octet-stream")


	def image_data_url(path: Path) -> str:
	    """Encode the file at *path* as a base64 ``data:`` URL.

	    The MIME type is taken from ``mime_type_for_image``; the whole file is
	    read into memory and base64-encoded.
	    """
	    mime = mime_type_for_image(path)
	    b64 = base64.b64encode(path.read_bytes()).decode("ascii")
	    return f"data:{mime};base64,{b64}"


	def ocr_image(
	    client: Mistral,
	    image_path: Path,
	    model: str = "mistral-ocr-latest",
	    table_format: Literal["markdown", "html"] = "html",
	    include_image_base64: bool = True,
	) -> dict:
	    """Run OCR on a single image file and return the response as a dict.

	    The image is sent inline as a base64 data URL (built by
	    ``image_data_url``) through ``client.ocr.process``.

	    Args:
	        client: Mistral client used to issue the OCR request.
	        image_path: Path of the image file to upload.
	        model: OCR model name passed through to the API.
	        table_format: Passed through as ``table_format``.
	        include_image_base64: Passed through as ``include_image_base64``.

	    Returns:
	        The OCR response as plain JSON-compatible data.

	    Raises:
	        TypeError: If the response neither exposes ``model_dump_json`` nor
	            is a ``dict``.
	    """
	    base64_image = image_data_url(image_path)

	    ocr_response = client.ocr.process(
	        model=model,
	        document={"type": "image_url", "image_url": base64_image},
	        table_format=table_format,
	        include_image_base64=include_image_base64,
	    )

	    # Look the serialiser up explicitly instead of wrapping the call in
	    # ``except AttributeError``: an AttributeError raised *inside*
	    # model_dump_json() must propagate rather than be misreported as an
	    # unexpected response type.
	    # NOTE(review): presumably the SDK returns a pydantic model - confirm.
	    dump_json = getattr(ocr_response, "model_dump_json", None)
	    if dump_json is not None:
	        return json.loads(dump_json())
	    if isinstance(ocr_response, dict):
	        return ocr_response
	    raise TypeError("Unexpected OCR response type; cannot convert to JSON dict.")


	def extract_markdown(ocr_json: dict, separator: str = "\n\n---\n\n") -> str:
	    """Join the markdown of every page of an OCR result into one document.

	    Args:
	        ocr_json: OCR result; ``ocr_json["pages"][i]["markdown"]`` is read.
	        separator: Text inserted between consecutive pages.

	    Returns:
	        The stripped page texts joined by *separator* and terminated by a
	        single newline, or ``""`` when no page has any text. Pages whose
	        markdown is missing, ``None``, or whitespace-only are skipped so
	        they do not produce empty sections.
	    """
	    parts: list[str] = []
	    # ``or []`` also covers an explicit ``"pages": null`` in the JSON.
	    for page in ocr_json.get("pages") or []:
	        # Strip before testing so whitespace-only pages are dropped too.
	        md = (page.get("markdown") or "").strip()
	        if md:
	            parts.append(md)
	    return (separator.join(parts) + "\n") if parts else ""


	def save_embedded_images(ocr_json: dict, output_dir: Path) -> list[Path]:
	    """
	    Saves pages[*].images[*].image_base64 (data URL) into output_dir.

	    Each image is written to output_dir / <final path component of
	    images[*].id>. Only the final component of the id is used so that an
	    id containing directory parts (e.g. "../x" or an absolute path) cannot
	    write outside output_dir. Entries with a missing id or payload, a
	    value that is not a "data:<mime>;base64,<payload>" URL, or an id with
	    no usable file name are skipped.

	    Returns list of written file paths, in the order they were written.
	    An invalid base64 payload propagates the error from base64.b64decode.
	    """
	    output_dir.mkdir(parents=True, exist_ok=True)
	    written: list[Path] = []

	    # ``or []`` also covers an explicit null for "pages" / "images".
	    for page in ocr_json.get("pages") or []:
	        for img in page.get("images") or []:
	            img_id = img.get("id")
	            data_url = img.get("image_base64")
	            if not img_id or not data_url:
	                continue

	            # Expect: data:<mime>;base64,<payload>
	            if not data_url.startswith("data:") or ";base64," not in data_url:
	                continue

	            # Keep only the file-name part of the id (no directory escape).
	            file_name = Path(img_id).name
	            if file_name in {"", ".", ".."}:
	                continue

	            _, b64_payload = data_url.split(";base64,", 1)
	            raw = base64.b64decode(b64_payload)

	            out_path = output_dir / file_name
	            out_path.write_bytes(raw)
	            written.append(out_path)

	    return written


	def parse_args() -> argparse.Namespace:
	p = argparse.ArgumentParser(
	description="Run Mistral OCR on an input image and write JSON, Markdown, and embedded images."
	)
	p.add_argument(
	"image",
	type=Path,
	help="Path to the input image file.",
	)
	p.add_argument(
	"out_dir",
	type=Path,
	default=Path("output"),
	help="Output directory where ocr.json, single.md, and embedded images will be saved.",
	)
	p.add_argument(
	"--model",
	default="mistral-ocr-latest",
	help='OCR model name (default: "mistral-ocr-latest").',
	)
	p.add_argument(
	"--table-format",
	default="markdown",
	choices=["html", "markdown"],
	help='Table extraction format (default: "markdown").',
	)
	p.add_argument(
	"--no-embedded-images",
	action="store_true",
	help="Do not request embedded images in the OCR response (faster/smaller JSON).",
	)
	p.add_argument(
	"--json-name",
	default="ocr.json",
	help='Filename for OCR JSON output (default: "ocr.json").',
	)
	p.add_argument(
	"--md-name",
	default="single.md",
	help='Filename for Markdown output (default: "single.md").',
	)
	return p.parse_args()


	def main() -> None:
	    """Command-line entry point: OCR one image and write the results.

	    Parses the arguments, requires ``MISTRAL_API_KEY``, validates the input
	    path, then writes the OCR JSON, any embedded images (unless
	    ``--no-embedded-images`` was given) and the combined Markdown into the
	    output directory, printing a short summary.

	    Raises:
	        RuntimeError: ``MISTRAL_API_KEY`` is unset or empty.
	        FileNotFoundError: The input path does not exist.
	        ValueError: The input path exists but is not a file.
	    """
	    args = parse_args()

	    api_key = os.environ.get("MISTRAL_API_KEY")
	    if not api_key:
	        raise RuntimeError("MISTRAL_API_KEY is not set in the environment.")

	    image_path: Path = args.image
	    out_dir: Path = args.out_dir

	    # Validate the input before creating anything on disk, so a bad input
	    # path does not leave an empty output directory behind.
	    if not image_path.exists():
	        raise FileNotFoundError(f"Input image not found: {image_path}")
	    if not image_path.is_file():
	        raise ValueError(f"Input path is not a file: {image_path}")

	    out_dir.mkdir(parents=True, exist_ok=True)

	    client = Mistral(api_key=api_key)

	    json_out = out_dir / args.json_name
	    md_out = out_dir / args.md_name

	    include_images = not args.no_embedded_images

	    # 1) OCR -> JSON
	    ocr_json = ocr_image(
	        client,
	        image_path,
	        model=args.model,
	        table_format=args.table_format,
	        include_image_base64=include_images,
	    )

	    # 2) Write JSON
	    json_out.write_text(
	        json.dumps(ocr_json, ensure_ascii=False, indent=2),
	        encoding="utf-8",
	    )

	    # 3) Save embedded images next to markdown (same dir)
	    saved: list[Path] = []
	    if include_images:
	        saved = save_embedded_images(ocr_json, output_dir=out_dir)

	    # 4) Write markdown (image links like ![](img-0.jpeg) will resolve in same dir)
	    md_out.write_text(extract_markdown(ocr_json), encoding="utf-8")

	    print(f"Wrote {json_out}")
	    print(f"Wrote {md_out}")
	    if include_images:
	        print(f"Saved {len(saved)} embedded images into {out_dir}/")


	# Run the CLI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()
No results found