Skip to content

Instantly share code, notes, and snippets.

@hathibelagal-dev
Last active October 27, 2025 09:44
Show Gist options
  • Save hathibelagal-dev/6f2d35a1bf01b2222a0e57cf5c493416 to your computer and use it in GitHub Desktop.
Save hathibelagal-dev/6f2d35a1bf01b2222a0e57cf5c493416 to your computer and use it in GitHub Desktop.
deepseekocr.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyN1b8Rb+9emWJvuMAK42Lc3",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/hathibelagal-dev/6f2d35a1bf01b2222a0e57cf5c493416/deepseekocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"### Optional\n",
"This is for word-wrapping the output of OCR process"
],
"metadata": {
"id": "LJVecLhurpri"
}
},
{
"cell_type": "code",
"source": [
"from IPython.display import HTML, display\n",
"\n",
"def set_css():\n",
" display(HTML('''\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" '''))\n",
"get_ipython().events.register('pre_run_cell', set_css)"
],
"metadata": {
"id": "H01_mrj5EImo"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Setup"
],
"metadata": {
"id": "x6gOGtW0mEwm"
}
},
{
"cell_type": "code",
"source": [
"!pip install --no-deps -q bitsandbytes"
],
"metadata": {
"id": "axaHlCqp8vMO"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!pip install addict transformers==4.46.3 tokenizers==0.20.3 pdf2image"
],
"metadata": {
"id": "lRq1Gp3R5veF"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!apt install poppler-utils"
],
"metadata": {
"id": "eNwbNdtIlKQX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!mkdir -p /content/outputs /content/pdf_pages/"
],
"metadata": {
"id": "tHhuMouI6tH7"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### PDF to Images"
],
"metadata": {
"id": "nTeG5pyemIf7"
}
},
{
"cell_type": "code",
"source": [
"from pdf2image import convert_from_path\n",
"\n",
"pdf_file = 'test.pdf'\n",
"images = convert_from_path(pdf_file)\n",
"\n",
"for i, image in enumerate(images):\n",
" image.save(f'/content/pdf_pages/page_{i+1}.jpg', 'JPEG')"
],
"metadata": {
"id": "fg65Tr_kkBdw"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from IPython.display import Image as _I\n",
"_I(\"/content/pdf_pages/page_1.jpg\", width=640)"
],
"metadata": {
"id": "2OTEhoXyBbTN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### OCR"
],
"metadata": {
"id": "X5ETqD2OmMHM"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4urzCKTC5ArJ"
},
"outputs": [],
"source": [
"from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig\n",
"import torch\n",
"\n",
"model_name = 'deepseek-ai/DeepSeek-OCR'"
]
},
{
"cell_type": "code",
"source": [
"qc = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_compute_dtype=torch.float\n",
")"
],
"metadata": {
"id": "2Qf2ctNZ82Ic"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
"model = AutoModel.from_pretrained(\n",
" model_name, trust_remote_code=True,\n",
" use_safetensors=True, device_map=\"auto\",\n",
" quantization_config=qc, torch_dtype=torch.float\n",
")\n",
"model = model.eval()"
],
"metadata": {
"id": "z45-aaPY5F0u"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"page_number = 5 #@param {type:\"integer\"}\n",
"\n",
"prompt = \"<image>\\nParse the figure.\"\n",
"image_file = f'/content/pdf_pages/page_{page_number}.jpg'\n",
"output_path = f'/content/outputs/page_{page_number}'"
],
"metadata": {
"id": "HijaQwdYmiR7"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%%time\n",
"model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path = output_path, base_size = 1024, image_size = 1024, crop_mode=False, save_results = True, test_compress = True)"
],
"metadata": {
"id": "gHpzukbw8g7t"
},
"execution_count": null,
"outputs": []
}
]
}
@iqiancheng
Copy link

wget -O test.pdf https://www.arxiv.org/pdf/2510.18234

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment