Created
July 4, 2025 16:36
-
-
Save jadsongmatos/941b5f0518021c0fb1ccdad972af64f6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "908eb30d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/jadson/anaconda3/envs/spacy/lib/python3.12/site-packages/doctr/file_utils.py:86: DeprecationWarning: Support for TensorFlow in DocTR is deprecated and will be removed in the next major release (v1.0.0). Please switch to the PyTorch backend.\n", | |
" warnings.warn(\n", | |
"2025-07-04 12:13:23.033776: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", | |
"2025-07-04 12:13:23.177404: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", | |
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", | |
"E0000 00:00:1751642003.252893 73028 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", | |
"E0000 00:00:1751642003.267469 73028 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", | |
"W0000 00:00:1751642003.416950 73028 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", | |
"W0000 00:00:1751642003.417010 73028 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", | |
"W0000 00:00:1751642003.417013 73028 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", | |
"W0000 00:00:1751642003.417016 73028 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", | |
"2025-07-04 12:13:23.429989: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", | |
"To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", | |
"/home/jadson/anaconda3/envs/spacy/lib/python3.12/site-packages/google/protobuf/internal/well_known_types.py:91: DeprecationWarning: datetime.datetime.utcfromtimestamp() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.fromtimestamp(timestamp, datetime.UTC).\n", | |
" _EPOCH_DATETIME_NAIVE = datetime.datetime.utcfromtimestamp(0)\n" | |
] | |
} | |
], | |
"source": [ | |
"import os\n", | |
"\n", | |
"# Let's pick the desired backend\n", | |
"# NOTE(review): presumably this has to be set before doctr is imported for the\n", | |
"# backend choice to take effect -- confirm against the doctr docs.\n", | |
"os.environ['USE_TF'] = '1'\n", | |
"\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"from doctr.io import DocumentFile\n", | |
"from doctr.models import ocr_predictor" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "e22bb7bb", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK\n", | |
"Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3050 Laptop GPU, compute capability 8.6\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"I0000 00:00:1751642006.965868 73028 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1208 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6\n", | |
"2025-07-04 12:13:28.276017: E tensorflow/core/util/util.cc:131] oneDNN supports DT_HALF only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.\n", | |
"I0000 00:00:1751642008.339816 73028 cuda_dnn.cc:529] Loaded cuDNN version 90300\n", | |
"WARNING:tensorflow:Model's `__init__()` arguments contain non-serializable objects. Please implement a `get_config()` method in the subclassed Model for proper saving and loading. Defaulting to empty config.\n", | |
"WARNING:tensorflow:Model's `__init__()` arguments contain non-serializable objects. Please implement a `get_config()` method in the subclassed Model for proper saving and loading. Defaulting to empty config.\n", | |
"DEBUG:tensorflow:Layer lstm will use cuDNN kernels when running on GPU.\n", | |
"DEBUG:tensorflow:Layer lstm_1 will use cuDNN kernels when running on GPU.\n", | |
"WARNING:tensorflow:Model's `__init__()` arguments contain non-serializable objects. Please implement a `get_config()` method in the subclassed Model for proper saving and loading. Defaulting to empty config.\n", | |
"WARNING:tensorflow:Model's `__init__()` arguments contain non-serializable objects. Please implement a `get_config()` method in the subclassed Model for proper saving and loading. Defaulting to empty config.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"OCRPredictor(\n", | |
" (det_predictor): DetectionPredictor(\n", | |
" (pre_processor): PreProcessor(\n", | |
" (resize): Resize(output_size=(1024, 1024), method='bilinear', preserve_aspect_ratio=True, symmetric_pad=True)\n", | |
" (normalize): Normalize(mean=[0.7979999780654907, 0.7850000262260437, 0.7720000147819519], std=[0.2639999985694885, 0.27489998936653137, 0.28700000047683716])\n", | |
" )\n", | |
" (model): DBNet(\n", | |
" (feat_extractor): IntermediateLayerGetter()\n", | |
" (fpn): FeaturePyramidNetwork(channels=128)\n", | |
" (probability_head): <tf_keras.src.engine.sequential.Sequential object at 0x7f294426e840>\n", | |
" (threshold_head): <tf_keras.src.engine.sequential.Sequential object at 0x7f2938509580>\n", | |
" (postprocessor): DBPostProcessor(bin_thresh=0.3, box_thresh=0.1)\n", | |
" )\n", | |
" )\n", | |
" (reco_predictor): RecognitionPredictor(\n", | |
" (pre_processor): PreProcessor(\n", | |
" (resize): Resize(output_size=(32, 128), method='bilinear', preserve_aspect_ratio=True, symmetric_pad=False)\n", | |
" (normalize): Normalize(mean=[0.6940000057220459, 0.6949999928474426, 0.6930000185966492], std=[0.29899999499320984, 0.29600000381469727, 0.3009999990463257])\n", | |
" )\n", | |
" (model): SAR(\n", | |
" (feat_extractor): <doctr.models.classification.resnet.tensorflow.ResNet object at 0x7f29385926c0>\n", | |
" (encoder): SAREncoder()\n", | |
" (decoder): SARDecoder()\n", | |
" (postprocessor): SARPostProcessor(vocab_size=126)\n", | |
" )\n", | |
" )\n", | |
" (doc_builder): DocumentBuilder(resolve_lines=True, resolve_blocks=False, paragraph_break=0.035, export_as_straight_boxes=False)\n", | |
")\n" | |
] | |
} | |
], | |
"source": [ | |
"import tensorflow as tf\n", | |
"\n", | |
"from tensorflow.keras import mixed_precision\n", | |
"# Switch the global Keras dtype policy to mixed_float16 before the model is built.\n", | |
"mixed_precision.set_global_policy('mixed_float16')\n", | |
"\n", | |
"# Load the OCR model\n", | |
"# Alternative configuration (disabled):\n", | |
"#predictor = ocr_predictor(det_arch=\"db_resnet50\",reco_arch=\"crnn_vgg16_bn\", pretrained=True).model.half()\n", | |
"predictor = ocr_predictor(det_arch=\"db_resnet50\",reco_arch=\"sar_resnet31\", pretrained=True)\n", | |
"\n", | |
"# Display the architecture\n", | |
"print(predictor)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "bd2aafec", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/jadson/anaconda3/envs/spacy/lib/python3.12/site-packages/PyPDF2/__init__.py:21: DeprecationWarning: PyPDF2 is deprecated. Please move to the pypdf library instead.\n", | |
" warnings.warn(\n" | |
] | |
} | |
], | |
"source": [ | |
"from reportlab.pdfgen import canvas\n", | |
"from PyPDF2 import PdfReader, PdfWriter\n", | |
"from PyPDF2.generic import NameObject\n", | |
"from PyPDF2._page import ContentStream # para manipular streams\n", | |
"from io import BytesIO\n", | |
"\n", | |
"def create_ocr_pdf(ocr_data, w: float, h: float):\n", | |
"    \"\"\"\n", | |
"    Build an in-memory PDF that carries the OCR text as an invisible, positioned layer.\n", | |
"\n", | |
"    Args:\n", | |
"        ocr_data: Mapping with a 'pages' list; each page holds 'blocks' -> 'lines' ->\n", | |
"            'words', and every word has a 'value' string and a 'geometry' pair\n", | |
"            ((x1, y1), (x2, y2)). Coordinates are assumed to be relative (0-1)\n", | |
"            with a top-left origin -- TODO confirm against the OCR exporter.\n", | |
"        w: Page width in PDF points.\n", | |
"        h: Page height in PDF points.\n", | |
"\n", | |
"    Returns:\n", | |
"        A BytesIO rewound to offset 0 containing one PDF page per entry of\n", | |
"        ocr_data['pages'].\n", | |
"    \"\"\"\n", | |
"    font_name = \"Helvetica\"\n", | |
"    packet = BytesIO()\n", | |
"    c = canvas.Canvas(packet, pagesize=(w, h))\n", | |
"\n", | |
"    for page_data in ocr_data['pages']:\n", | |
"        for block in page_data['blocks']:\n", | |
"            for line in block['lines']:\n", | |
"                for word in line['words']:\n", | |
"                    text = word['value']\n", | |
"                    (x1, y1), (x2, y2) = word['geometry']\n", | |
"                    font_size = (y2 - y1) * h\n", | |
"                    # Empty strings and degenerate boxes have nothing to draw and\n", | |
"                    # would make the width ratio below meaningless.\n", | |
"                    if not text or font_size <= 0:\n", | |
"                        continue\n", | |
"\n", | |
"                    # The PDF origin is bottom-left: flip the vertical axis and use\n", | |
"                    # the bottom edge of the word box as the text baseline.\n", | |
"                    absolute_x = x1 * w\n", | |
"                    baseline_y = (1 - y1) * h - font_size\n", | |
"\n", | |
"                    # Stretch or squeeze the glyphs so the selectable text spans the\n", | |
"                    # detected box instead of Helvetica's natural advance width.\n", | |
"                    box_width = (x2 - x1) * w\n", | |
"                    natural_width = c.stringWidth(text, font_name, font_size)\n", | |
"                    if box_width > 0 and natural_width > 0:\n", | |
"                        horiz_scale = 100.0 * box_width / natural_width\n", | |
"                    else:\n", | |
"                        horiz_scale = 100.0\n", | |
"\n", | |
"                    text_obj = c.beginText(absolute_x, baseline_y)\n", | |
"                    text_obj.setFont(font_name, font_size)\n", | |
"                    # Render mode 3 paints neither fill nor stroke, so the text stays\n", | |
"                    # searchable and selectable without being drawn.\n", | |
"                    text_obj.setTextRenderMode(3)\n", | |
"                    # Always emitted: the scale is text state that would otherwise\n", | |
"                    # leak from the previous word into this one.\n", | |
"                    text_obj.setHorizScale(horiz_scale)\n", | |
"                    text_obj.textOut(text)\n", | |
"                    c.drawText(text_obj)\n", | |
"        c.showPage()\n", | |
"\n", | |
"    c.save()\n", | |
"    packet.seek(0)\n", | |
"    return packet\n", | |
"\n", | |
"def remove_text_streams(page, reader):\n", | |
"    \"\"\"\n", | |
"    Strip text objects from a page's content stream, in place.\n", | |
"\n", | |
"    Every operation from a BT operator through the next ET operator is discarded;\n", | |
"    operations outside those spans are kept in their original order, so images\n", | |
"    and vector graphics are preserved. A page without a /Contents entry is left\n", | |
"    untouched.\n", | |
"\n", | |
"    Args:\n", | |
"        page: Page object whose /Contents entry is rewritten.\n", | |
"        reader: Object handed to ContentStream together with the page content\n", | |
"            (the caller passes the PdfReader that owns the page).\n", | |
"\n", | |
"    Returns:\n", | |
"        None.\n", | |
"    \"\"\"\n", | |
"    if \"/Contents\" not in page:\n", | |
"        return\n", | |
"\n", | |
"    stream = ContentStream(page[\"/Contents\"], reader)\n", | |
"    kept_ops = []\n", | |
"    inside_text = False\n", | |
"\n", | |
"    for operands, operator in stream.operations:\n", | |
"        if operator == b\"BT\":\n", | |
"            # Opening of a text object: drop it and everything up to ET.\n", | |
"            inside_text = True\n", | |
"            continue\n", | |
"        if inside_text:\n", | |
"            # Still inside the text object; ET closes it and is dropped as well.\n", | |
"            inside_text = operator != b\"ET\"\n", | |
"            continue\n", | |
"        kept_ops.append((operands, operator))\n", | |
"\n", | |
"    stream.operations = kept_ops\n", | |
"    page[NameObject(\"/Contents\")] = stream\n", | |
"\n", | |
"def merge_pdfs(original_page, overlay_stream):\n", | |
"    \"\"\"\n", | |
"    Merge the first page of an overlay PDF onto ``original_page``.\n", | |
"\n", | |
"    Args:\n", | |
"        original_page: Page object that receives the overlay; modified in place.\n", | |
"        overlay_stream: Stream or path accepted by PdfReader holding the overlay PDF.\n", | |
"\n", | |
"    Returns:\n", | |
"        ``original_page``, unchanged when the overlay PDF has no pages.\n", | |
"    \"\"\"\n", | |
"    overlay_pages = PdfReader(overlay_stream).pages\n", | |
"    if len(overlay_pages) > 0:\n", | |
"        # Only the first overlay page is used; any further pages are ignored.\n", | |
"        original_page.merge_page(overlay_pages[0])\n", | |
"    return original_page" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "bf7767c0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Páginas totais: 260, sendo processadas: 260\n", | |
"Página 1 OK\n", | |
"Página 2 OK\n", | |
"Página 3 OK\n", | |
"Página 4 OK\n", | |
"Página 5 OK\n", | |
"Página 6 OK\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2025-07-04 11:28:32.012670: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 916.62MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.\n", | |
"2025-07-04 11:28:32.155030: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.54GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Página 7 OK\n", | |
"Página 8 OK\n", | |
"Página 9 OK\n", | |
"Página 10 OK\n", | |
"Página 11 OK\n", | |
"Página 12 OK\n", | |
"Página 13 OK\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2025-07-04 11:28:49.489062: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 910.06MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.\n", | |
"2025-07-04 11:28:49.637849: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.54GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Página 14 OK\n", | |
"Página 15 OK\n", | |
"Página 16 OK\n", | |
"Página 17 OK\n", | |
"Página 18 OK\n", | |
"Página 19 OK\n", | |
"Página 20 OK\n", | |
"Página 21 OK\n", | |
"Página 22 OK\n", | |
"Página 23 OK\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2025-07-04 11:29:16.821190: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 913.88MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Página 24 OK\n", | |
"Página 25 OK\n", | |
"Página 26 OK\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2025-07-04 11:29:24.411722: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 934.88MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Página 27 OK\n", | |
"Página 28 OK\n", | |
"Página 29 OK\n", | |
"Página 30 OK\n", | |
"Página 31 OK\n", | |
"Página 32 OK\n", | |
"Página 33 OK\n", | |
"Página 34 OK\n", | |
"Página 35 OK\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2025-07-04 11:29:43.917924: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 919.12MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Página 36 OK\n", | |
"Página 37 OK\n", | |
"Página 38 OK\n", | |
"Página 39 OK\n", | |
"Página 40 OK\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2025-07-04 11:29:53.689526: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 945.38MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Página 41 OK\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2025-07-04 11:29:56.220805: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 898.12MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Página 42 OK\n", | |
"Página 43 OK\n", | |
"Página 44 OK\n", | |
"Página 45 OK\n", | |
"Página 46 OK\n", | |
"Página 47 OK\n", | |
"Página 48 OK\n", | |
"Página 49 OK\n", | |
"Página 50 OK\n", | |
"Página 51 OK\n", | |
"Página 52 OK\n", | |
"Página 53 OK\n", | |
"Página 54 OK\n", | |
"Página 55 OK\n", | |
"Página 56 OK\n", | |
"Página 57 OK\n", | |
"Página 58 OK\n", | |
"Página 59 OK\n", | |
"Página 60 OK\n", | |
"Página 61 OK\n", | |
"Página 62 OK\n", | |
"Página 63 OK\n", | |
"Página 64 OK\n", | |
"Página 65 OK\n", | |
"Página 66 OK\n", | |
"Página 67 OK\n", | |
"Página 68 OK\n", | |
"Página 69 OK\n", | |
"Página 70 OK\n", | |
"Página 71 OK\n", | |
"Página 72 OK\n", | |
"Página 73 OK\n", | |
"Página 74 OK\n", | |
"Página 75 OK\n", | |
"Página 76 OK\n", | |
"Página 77 OK\n", | |
"Página 78 OK\n", | |
"Página 79 OK\n", | |
"Página 80 OK\n", | |
"Página 81 OK\n", | |
"Página 82 OK\n", | |
"Página 83 OK\n", | |
"Página 84 OK\n", | |
"Página 85 OK\n", | |
"Página 86 OK\n", | |
"Página 87 OK\n", | |
"Página 88 OK\n", | |
"Página 89 OK\n", | |
"Página 90 OK\n", | |
"Página 91 OK\n", | |
"Página 92 OK\n", | |
"Página 93 OK\n", | |
"Página 94 OK\n", | |
"Página 95 OK\n", | |
"Página 96 OK\n", | |
"Página 97 OK\n", | |
"Página 98 OK\n", | |
"Página 99 OK\n", | |
"Página 100 OK\n", | |
"Página 101 OK\n", | |
"Página 102 OK\n", | |
"Página 103 OK\n", | |
"Página 104 OK\n", | |
"Página 105 OK\n", | |
"Página 106 OK\n", | |
"Página 107 OK\n", | |
"Página 108 OK\n", | |
"Página 109 OK\n", | |
"Página 110 OK\n", | |
"Página 111 OK\n", | |
"Página 112 OK\n", | |
"Página 113 OK\n", | |
"Página 114 OK\n", | |
"Página 115 OK\n", | |
"Página 116 OK\n", | |
"Página 117 OK\n", | |
"Página 118 OK\n", | |
"Página 119 OK\n", | |
"Página 120 OK\n", | |
"Página 121 OK\n", | |
"Página 122 OK\n", | |
"Página 123 OK\n", | |
"Página 124 OK\n", | |
"Página 125 OK\n", | |
"Página 126 OK\n", | |
"Página 127 OK\n", | |
"Página 128 OK\n", | |
"Página 129 OK\n", | |
"Página 130 OK\n", | |
"Página 131 OK\n", | |
"Página 132 OK\n", | |
"Página 133 OK\n", | |
"Página 134 OK\n", | |
"Página 135 OK\n", | |
"Página 136 OK\n", | |
"Página 137 OK\n", | |
"Página 138 OK\n", | |
"Página 139 OK\n", | |
"Página 140 OK\n", | |
"Página 141 OK\n", | |
"Página 142 OK\n", | |
"Página 143 OK\n", | |
"Página 144 OK\n", | |
"Página 145 OK\n", | |
"Página 146 OK\n", | |
"Página 147 OK\n", | |
"Página 148 OK\n", | |
"Página 149 OK\n", | |
"Página 150 OK\n", | |
"Página 151 OK\n", | |
"Página 152 OK\n", | |
"Página 153 OK\n", | |
"Página 154 OK\n", | |
"Página 155 OK\n", | |
"Página 156 OK\n", | |
"Página 157 OK\n", | |
"Página 158 OK\n", | |
"Página 159 OK\n", | |
"Página 160 OK\n", | |
"Página 161 OK\n", | |
"Página 162 OK\n", | |
"Página 163 OK\n", | |
"Página 164 OK\n", | |
"Página 165 OK\n", | |
"Página 166 OK\n", | |
"Página 167 OK\n", | |
"Página 168 OK\n", | |
"Página 169 OK\n", | |
"Página 170 OK\n", | |
"Página 171 OK\n", | |
"Página 172 OK\n", | |
"Página 173 OK\n", | |
"Página 174 OK\n", | |
"Página 175 OK\n", | |
"Página 176 OK\n", | |
"Página 177 OK\n", | |
"Página 178 OK\n", | |
"Página 179 OK\n", | |
"Página 180 OK\n", | |
"Página 181 OK\n", | |
"Página 182 OK\n", | |
"Página 183 OK\n", | |
"Página 184 OK\n", | |
"Página 185 OK\n", | |
"Página 186 OK\n", | |
"Página 187 OK\n", | |
"Página 188 OK\n", | |
"Página 189 OK\n", | |
"Página 190 OK\n", | |
"Página 191 OK\n", | |
"Página 192 OK\n", | |
"Página 193 OK\n", | |
"Página 194 OK\n", | |
"Página 195 OK\n", | |
"Página 196 OK\n", | |
"Página 197 OK\n", | |
"Página 198 OK\n", | |
"Página 199 OK\n", | |
"Página 200 OK\n", | |
"Página 201 OK\n", | |
"Página 202 OK\n", | |
"Página 203 OK\n", | |
"Página 204 OK\n", | |
"Página 205 OK\n", | |
"Página 206 OK\n", | |
"Página 207 OK\n", | |
"Página 208 OK\n", | |
"Página 209 OK\n", | |
"Página 210 OK\n", | |
"Página 211 OK\n", | |
"Página 212 OK\n", | |
"Página 213 OK\n", | |
"Página 214 OK\n", | |
"Página 215 OK\n", | |
"Página 216 OK\n", | |
"Página 217 OK\n", | |
"Página 218 OK\n", | |
"Página 219 OK\n", | |
"Página 220 OK\n", | |
"Página 221 OK\n", | |
"Página 222 OK\n", | |
"Página 223 OK\n", | |
"Página 224 OK\n", | |
"Página 225 OK\n", | |
"Página 226 OK\n", | |
"Página 227 OK\n", | |
"Página 228 OK\n", | |
"Página 229 OK\n", | |
"Página 230 OK\n", | |
"Página 231 OK\n", | |
"Página 232 OK\n", | |
"Página 233 OK\n", | |
"Página 234 OK\n", | |
"Página 235 OK\n", | |
"Página 236 OK\n", | |
"Página 237 OK\n", | |
"Página 238 OK\n", | |
"Página 239 OK\n", | |
"Página 240 OK\n", | |
"Página 241 OK\n", | |
"Página 242 OK\n", | |
"Página 243 OK\n", | |
"Página 244 OK\n", | |
"Página 245 OK\n", | |
"Página 246 OK\n", | |
"Página 247 OK\n", | |
"Página 248 OK\n", | |
"Página 249 OK\n", | |
"Página 250 OK\n", | |
"Página 251 OK\n", | |
"Página 252 OK\n", | |
"Página 253 OK\n", | |
"Página 254 OK\n", | |
"Página 255 OK\n", | |
"Página 256 OK\n", | |
"Página 257 OK\n", | |
"Página 258 OK\n", | |
"Página 259 OK\n", | |
"Página 260 OK\n", | |
"Salvo: output.pdf\n" | |
] | |
} | |
], | |
"source": [ | |
"input_pdf = \"DEWALT Carpentry and Framing Complete Handbook.pdf\"\n", | |
"doctr_pdf = DocumentFile.from_pdf(input_pdf)\n", | |
"reader = PdfReader(input_pdf)\n", | |
"# Process only the pages both libraries agree on.\n", | |
"num_pages = min(len(reader.pages), len(doctr_pdf))\n", | |
"\n", | |
"print(f\"Páginas totais: {len(reader.pages)}, sendo processadas: {num_pages}\")\n", | |
"\n", | |
"writer = PdfWriter()\n", | |
"\n", | |
"for i in range(num_pages):\n", | |
"    page = reader.pages[i]\n", | |
"\n", | |
"    # 1) Run OCR first: the page's original text must survive untouched when no\n", | |
"    #    replacement overlay can be produced.\n", | |
"    result = predictor([doctr_pdf[i]])\n", | |
"    ocr_data = result.export()\n", | |
"    if not ocr_data.get(\"pages\"):\n", | |
"        print(f\"[Aviso] OCR falhou na página {i+1}, mantendo o page sem overlay.\")\n", | |
"        writer.add_page(page)\n", | |
"        continue\n", | |
"\n", | |
"    # 2) Only now strip the existing text, keeping images and graphics.\n", | |
"    remove_text_streams(page, reader)\n", | |
"\n", | |
"    # 3) Page dimensions as plain floats for the overlay canvas.\n", | |
"    w = float(page.mediabox.width)\n", | |
"    h = float(page.mediabox.height)\n", | |
"\n", | |
"    # 4) Build the invisible text overlay and merge it onto the page.\n", | |
"    overlay = create_ocr_pdf(ocr_data, w, h)\n", | |
"    merged = merge_pdfs(page, overlay)\n", | |
"    writer.add_page(merged)\n", | |
"    print(f\"Página {i+1} OK\")\n", | |
"\n", | |
"# Write everything to a single output.pdf.\n", | |
"output_pdf = \"output.pdf\"\n", | |
"with open(output_pdf, \"wb\") as f:\n", | |
"    writer.write(f)\n", | |
"print(f\"Salvo: {output_pdf}\")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "spacy", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.12.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment