hathibelagal-dev · October 27, 2025 09:44 · iqiancheng · Oct 27, 2025
diff --git a/deepseekocr.ipynb b/deepseekocr.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "authorship_tag": "ABX9TyN1b8Rb+9emWJvuMAK42Lc3",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/hathibelagal-dev/6f2d35a1bf01b2222a0e57cf5c493416/deepseekocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Optional\n",
        "This cell is optional. It word-wraps cell output so that long lines produced by the OCR process stay readable."
      ],
      "metadata": {
        "id": "LJVecLhurpri"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from IPython.display import HTML, display\n",
        "\n",
        "def set_css():\n",
        "  display(HTML('''\n",
        "  <style>\n",
        "    pre {\n",
        "      white-space: pre-wrap;\n",
        "    }\n",
        "  </style>\n",
        "  '''))\n",
        "get_ipython().events.register('pre_run_cell', set_css)"
      ],
      "metadata": {
        "id": "H01_mrj5EImo"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Setup"
      ],
      "metadata": {
        "id": "x6gOGtW0mEwm"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install --no-deps -q bitsandbytes"
      ],
      "metadata": {
        "id": "axaHlCqp8vMO"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install addict transformers==4.46.3 tokenizers==0.20.3 pdf2image"
      ],
      "metadata": {
        "id": "lRq1Gp3R5veF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!apt install poppler-utils"
      ],
      "metadata": {
        "id": "eNwbNdtIlKQX"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!mkdir -p /content/outputs /content/pdf_pages/"
      ],
      "metadata": {
        "id": "tHhuMouI6tH7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### PDF to Images"
      ],
      "metadata": {
        "id": "nTeG5pyemIf7"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from pdf2image import convert_from_path\n",
        "\n",
        "pdf_file = 'test.pdf'\n",
        "images = convert_from_path(pdf_file)\n",
        "\n",
        "for i, image in enumerate(images):\n",
        "    image.save(f'/content/pdf_pages/page_{i+1}.jpg', 'JPEG')"
      ],
      "metadata": {
        "id": "fg65Tr_kkBdw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from IPython.display import Image as _I\n",
        "_I(\"/content/pdf_pages/page_1.jpg\", width=640)"
      ],
      "metadata": {
        "id": "2OTEhoXyBbTN"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### OCR"
      ],
      "metadata": {
        "id": "X5ETqD2OmMHM"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "4urzCKTC5ArJ"
      },
      "outputs": [],
      "source": [
        "from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig\n",
        "import torch\n",
        "\n",
        "model_name = 'deepseek-ai/DeepSeek-OCR'"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "qc = BitsAndBytesConfig(\n",
        "    load_in_4bit=True,\n",
        "    bnb_4bit_use_double_quant=True,\n",
        "    bnb_4bit_quant_type=\"nf4\",\n",
        "    bnb_4bit_compute_dtype=torch.float\n",
        ")"
      ],
      "metadata": {
        "id": "2Qf2ctNZ82Ic"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
        "model = AutoModel.from_pretrained(\n",
        "    model_name, trust_remote_code=True,\n",
        "    use_safetensors=True, device_map=\"auto\",\n",
        "    quantization_config=qc, torch_dtype=torch.float\n",
        ")\n",
        "model = model.eval()"
      ],
      "metadata": {
        "id": "z45-aaPY5F0u"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "page_number = 5 #@param {type:\"integer\"}\n",
        "\n",
        "prompt = \"<image>\\nParse the figure.\"\n",
        "image_file = f'/content/pdf_pages/page_{page_number}.jpg'\n",
        "output_path = f'/content/outputs/page_{page_number}'"
      ],
      "metadata": {
        "id": "HijaQwdYmiR7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%%time\n",
        "model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path = output_path, base_size = 1024, image_size = 1024, crop_mode=False, save_results = True, test_compress = True)"
      ],
      "metadata": {
        "id": "gHpzukbw8g7t"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"gpuType": "T4",
	"authorship_tag": "ABX9TyN1b8Rb+9emWJvuMAK42Lc3",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	},
	"accelerator": "GPU"
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/hathibelagal-dev/6f2d35a1bf01b2222a0e57cf5c493416/deepseekocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"### Optional\n",
	"This cell is optional. It word-wraps cell output so that long lines produced by the OCR process stay readable."
	],
	"metadata": {
	"id": "LJVecLhurpri"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"from IPython.display import HTML, display\n",
	"\n",
	"def set_css():\n",
	" display(HTML('''\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" '''))\n",
	"get_ipython().events.register('pre_run_cell', set_css)"
	],
	"metadata": {
	"id": "H01_mrj5EImo"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"### Setup"
	],
	"metadata": {
	"id": "x6gOGtW0mEwm"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"!pip install --no-deps -q bitsandbytes"
	],
	"metadata": {
	"id": "axaHlCqp8vMO"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"!pip install addict transformers==4.46.3 tokenizers==0.20.3 pdf2image"
	],
	"metadata": {
	"id": "lRq1Gp3R5veF"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"!apt install poppler-utils"
	],
	"metadata": {
	"id": "eNwbNdtIlKQX"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"!mkdir -p /content/outputs /content/pdf_pages/"
	],
	"metadata": {
	"id": "tHhuMouI6tH7"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"### PDF to Images"
	],
	"metadata": {
	"id": "nTeG5pyemIf7"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"from pdf2image import convert_from_path\n",
	"\n",
	"pdf_file = 'test.pdf'\n",
	"images = convert_from_path(pdf_file)\n",
	"\n",
	"for i, image in enumerate(images):\n",
	" image.save(f'/content/pdf_pages/page_{i+1}.jpg', 'JPEG')"
	],
	"metadata": {
	"id": "fg65Tr_kkBdw"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"from IPython.display import Image as _I\n",
	"_I(\"/content/pdf_pages/page_1.jpg\", width=640)"
	],
	"metadata": {
	"id": "2OTEhoXyBbTN"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"### OCR"
	],
	"metadata": {
	"id": "X5ETqD2OmMHM"
	}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "4urzCKTC5ArJ"
	},
	"outputs": [],
	"source": [
	"from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig\n",
	"import torch\n",
	"\n",
	"model_name = 'deepseek-ai/DeepSeek-OCR'"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"qc = BitsAndBytesConfig(\n",
	" load_in_4bit=True,\n",
	" bnb_4bit_use_double_quant=True,\n",
	" bnb_4bit_quant_type=\"nf4\",\n",
	" bnb_4bit_compute_dtype=torch.float\n",
	")"
	],
	"metadata": {
	"id": "2Qf2ctNZ82Ic"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
	"model = AutoModel.from_pretrained(\n",
	" model_name, trust_remote_code=True,\n",
	" use_safetensors=True, device_map=\"auto\",\n",
	" quantization_config=qc, torch_dtype=torch.float\n",
	")\n",
	"model = model.eval()"
	],
	"metadata": {
	"id": "z45-aaPY5F0u"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"page_number = 5 #@param {type:\"integer\"}\n",
	"\n",
	"prompt = \"<image>\\nParse the figure.\"\n",
	"image_file = f'/content/pdf_pages/page_{page_number}.jpg'\n",
	"output_path = f'/content/outputs/page_{page_number}'"
	],
	"metadata": {
	"id": "HijaQwdYmiR7"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"%%time\n",
	"model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path = output_path, base_size = 1024, image_size = 1024, crop_mode=False, save_results = True, test_compress = True)"
	],
	"metadata": {
	"id": "gHpzukbw8g7t"
	},
	"execution_count": null,
	"outputs": []
	}
	]
	}