这个是油猴脚本。安装了暴力猴插件之后,点击上面这个文件的 Raw 按钮,会提示安装。
- pdf.js 可以提取所有的 TextItem 包括文本和包围盒
- 根据包围盒可以大致判断一下是否换行了
- 如果一行中包含公式,这一行就会被拆成很多个 TextItem,这样的行称为 complex 行
- 多个连续的 complex 行变成了 complex 块
- 如果有 Claude 3 Haiku 的账号(需要先在本地修改脚本,填入 API 地址和 key),会对 complex 块做一次基于图片的 OCR,用来清洗带公式的嘈杂文本
| // ==UserScript== | |
| // @name 拷贝 PDF 中的文本 | |
| // @description 方便粘贴到 chatgpt 进行问答 | |
| // @namespace github.com/taowen | |
| // @match *://*/*pdf* | |
| // @version 1.0.0 | |
| // @author taowen | |
| // @license MIT | |
| // @grant GM.registerMenuCommand | |
| // @grant GM_setClipboard | |
| // @grant GM.getValue | |
| // @grant GM.setValue | |
| // @grant GM.xmlHttpRequest | |
| // ==/UserScript== | |
// Menu command: extract the text of the PDF at the current URL with pdf.js,
// regroup the text items into lines, and copy the result to the clipboard.
GM.registerMenuCommand("复制 Pdf 为 MarkDown", async () => {
  // Append one finished line. Lines whose first item is taller than 11 are
  // emitted as "## " headings (presumably 11 is the body-text height in PDF
  // units for the papers this was tuned on — TODO confirm). Empty lines are
  // skipped so pages without text do not add blank entries.
  const flushLine = (lines, text, height) => {
    if (!text) {
      return;
    }
    lines.push(height > 11 ? '## ' + text : text);
  };

  let loadingTask;
  try {
    const PDFJS = await import('https://unpkg.com/pdfjs-dist/build/pdf.min.mjs');
    PDFJS.GlobalWorkerOptions.workerSrc = 'https://unpkg.com/pdfjs-dist/build/pdf.worker.min.mjs';
    loadingTask = PDFJS.getDocument(window.location.href);
    const doc = await loadingTask.promise;
    const lines = [];
    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
      const page = await doc.getPage(pageNumber);
      const textContent = await page.getTextContent();
      let currentLineY = 0;
      let currentLineText = '';
      let currentLineHeight = 0;
      for (const item of textContent.items) {
        // Zero-height items carry no visible glyphs.
        if (item.height === 0) {
          continue;
        }
        // transform[5] is the vertical translation of the item.
        const y = item.transform[5];
        // A different baseline starts a new line only when the item is long
        // (> 4 chars) or clearly lower on the page (> 11.5 units); short items
        // on a slightly different baseline (e.g. sub/superscripts) stay on
        // the current line.
        const startsNewLine = y !== currentLineY && (item.str.length > 4 || currentLineY - y > 11.5);
        if (startsNewLine) {
          flushLine(lines, currentLineText, currentLineHeight);
          currentLineText = item.str;
          currentLineY = y;
          currentLineHeight = item.height;
        } else {
          currentLineText += item.str;
        }
      }
      // Flush the last line of the page through the same heading logic as
      // every other line.
      flushLine(lines, currentLineText, currentLineHeight);
    }
    const allText = lines.join('\n');
    GM_setClipboard(allText);
    alert('copied ' + allText.length + ' characters');
  } catch (err) {
    // Surface failures (blocked import, fetch/CORS error, invalid PDF)
    // instead of leaving a silent unhandled rejection.
    console.error('failed to copy pdf text', err);
    alert('failed to copy pdf text: ' + (err && err.message ? err.message : err));
  } finally {
    // Release the pdf.js worker and any pending network requests.
    if (loadingTask) {
      await loadingTask.destroy();
    }
  }
});
/**
 * Ask Claude 3 Haiku to transcribe a cropped page image into Markdown/LaTeX.
 *
 * The script ships without credentials. Until both an endpoint URL and an API
 * key are supplied (by editing the defaults of the third parameter locally, or
 * by passing `options`), the function alerts the user and throws, exactly as
 * the unconfigured script always did.
 *
 * @param {string} imageBase64 - PNG data URL (`data:image/png;base64,...`).
 * @param {string} referenceText - Noisy text extracted from the same region,
 *   sent along with the image as a hint.
 * @param {{url?: string, apiKey?: string}} [options] - Messages endpoint URL
 *   and API key; both default to the empty string (unconfigured).
 * @returns {Promise<string>} Text of the first content part of the reply.
 *   Rejects on network errors, non-2xx responses, and malformed replies.
 * @throws {Error} Synchronously, when unconfigured or when the input is not a
 *   PNG data URL.
 */
function ocr(imageBase64, referenceText, { url = '', apiKey = '' } = {}) {
  if (!url || !apiKey) {
    alert('请本地修改 user script,填入 Claude 账号');
    throw new Error('请本地修改 user script,填入 Claude 账号');
  }
  const pngPrefix = 'data:image/png;base64,';
  if (!imageBase64.startsWith(pngPrefix)) {
    throw new Error('expect png');
  }
  // The API takes the bare base64 payload, without the data-URL prefix.
  const imageData = imageBase64.substring(pngPrefix.length);
  return new Promise((resolve, reject) => {
    GM.xmlHttpRequest({
      method: 'POST',
      url,
      headers: {
        "x-api-key": apiKey,
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json"
      },
      data: JSON.stringify({
        model: "claude-3-haiku-20240307",
        max_tokens: 4000,
        temperature: 0,
        messages: [{
          role: 'user', content: [
            {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": imageData}},
            {"type": "text", "text": `<referenceText>${referenceText}</referenceText>\n` + 'Markdown latex can have single $ or double $$. Transcribe this paper to markdown with latex exactly.'}
          ]}, {
          // Prefilled assistant turn: the reply continues inside an open
          // ```markdown fence, which the caller strips afterwards.
          role: 'assistant',
          content: [{ type: 'text', text: 'Here is the text transcribed to Markdown:\n```markdown'}]
        }],
      }),
      onload: function(response) {
        // Anything thrown inside this callback would otherwise leave the
        // promise pending forever, so convert every failure into a rejection.
        try {
          if (response.status < 200 || response.status >= 300) {
            throw new Error('failed to ocr: HTTP ' + response.status + ' ' + response.responseText);
          }
          const result = JSON.parse(response.responseText);
          const markdown = result['content'][0]['text'];
          if (typeof markdown !== 'string') {
            throw new Error('failed to ocr: reply has no text content');
          }
          resolve(markdown);
        } catch (err) {
          console.error('failed to ocr', response);
          reject(err);
        }
      },
      onerror: function(response) {
        console.error('failed to ocr', response);
        reject(new Error('failed to ocr'));
      }
    });
  });
}
/**
 * Debug helper: print an image into the browser console by logging a single
 * space styled with the image as its CSS background.
 *
 * @param {string} url - Image URL (data URLs work).
 * @param {number} [size=50] - Display size as a percentage of the image's
 *   own dimensions.
 */
function logImage(url, size = 50) {
  const image = new Image();
  image.src = url;
  image.onload = () => {
    // Padding stretches the one-character log entry to the scaled image size.
    const verticalPadding = image.height / 100 * size;
    const horizontalPadding = image.width / 100 * size;
    const declarations = [
      'font-size: 1px;',
      `padding: ${verticalPadding}px ${horizontalPadding}px;`,
      `background: url(${url}) no-repeat;`,
      'background-size: contain;',
    ];
    console.log('%c ', declarations.join(' '));
  };
}
/**
 * Read a Blob into a base64 data URL (`data:<type>;base64,...`).
 *
 * @param {Blob} blob - Blob to encode.
 * @returns {Promise<string>} The data URL. Rejects if the read fails, so
 *   callers never wait forever on an unreadable blob.
 */
function blobToBase64(blob) {
  const fileReader = new FileReader();
  return new Promise((resolve, reject) => {
    fileReader.onload = () => {
      resolve(fileReader.result);
    };
    fileReader.onerror = () => {
      reject(fileReader.error || new Error('failed to read blob'));
    };
    fileReader.readAsDataURL(blob);
  });
}
/**
 * Crop a rectangular region out of an image and return it as a data URL.
 *
 * @param {CanvasImageSource} img - Source image.
 * @param {{x: number, y: number, width: number, height: number}} boundingBox
 *   Region to copy, in source-image pixels. Values may be fractional.
 * @returns {Promise<string>} Data URL of the cropped region, encoded with
 *   the default type of `convertToBlob`.
 */
async function cropImage(img, boundingBox) {
  const { x, y, width, height } = boundingBox;
  // Canvas dimensions are integers: round up so the final partial pixel
  // row/column is not clipped, and keep at least 1px because convertToBlob
  // rejects on a zero-sized canvas.
  const canvasWidth = Math.max(1, Math.ceil(width));
  const canvasHeight = Math.max(1, Math.ceil(height));
  const offscreen = new OffscreenCanvas(canvasWidth, canvasHeight);
  const ctx = offscreen.getContext('2d');
  ctx.drawImage(img, x, y, width, height, 0, 0, width, height);
  return await blobToBase64(await offscreen.convertToBlob());
}
/**
 * Append the text of one complex block to `result`.
 *
 * Blocks of at most 4 lines are appended verbatim. Larger blocks are cleaned
 * by OCR: the union of the rectangles of every line except the last is
 * cropped from the page image and transcribed, the transcription (with
 * anything from the last ``` fence onward removed) is appended, and the last
 * line is then appended as plain text.
 *
 * @param {string[]} result - Output list, mutated in place.
 * @param {CanvasImageSource} img - Rendered page image the rects refer to.
 * @param {Array<[boolean, string, Array<{x: number, y: number, width: number, height: number}>]>} complexBlock
 *   Lines as `[isComplex, text, rects]` tuples.
 * @returns {Promise<void>}
 */
async function processComplexBlock(result, img, complexBlock) {
  // Small blocks are not worth an OCR round trip.
  if (complexBlock.length <= 4) {
    complexBlock.forEach((entry) => {
      result.push(entry[1]);
    });
    return;
  }

  const trailingLine = complexBlock[complexBlock.length - 1];
  const ocrLines = complexBlock.slice(0, complexBlock.length - 1);
  const referenceText = ocrLines.map((entry) => entry[1] + '\n').join('');

  // Union of all rectangles belonging to the lines sent to OCR.
  const bounds = { left: Infinity, top: Infinity, right: -Infinity, bottom: -Infinity };
  for (const entry of ocrLines) {
    for (const rect of entry[2]) {
      bounds.left = Math.min(bounds.left, rect.x);
      bounds.top = Math.min(bounds.top, rect.y);
      bounds.right = Math.max(bounds.right, rect.x + rect.width);
      bounds.bottom = Math.max(bounds.bottom, rect.y + rect.height);
    }
  }

  const patch = await cropImage(img, {
    x: bounds.left,
    y: bounds.top,
    width: bounds.right - bounds.left,
    height: bounds.bottom - bounds.top,
  });
  logImage(patch);
  console.log(referenceText);

  const transcription = await ocr(patch, referenceText);
  // Cut at the last ``` fence, if there is one.
  const fenceIndex = transcription.lastIndexOf('```');
  const unfenced = fenceIndex === -1 ? transcription : transcription.substring(0, fenceIndex);
  const patchText = unfenced.trim();
  console.log(patchText);

  result.push(patchText);
  result.push(trailingLine[1]);
}
// Menu command: like the plain copy command, but each page is also rendered
// to an image so that runs of "complex" lines (lines made of many text items,
// which the author's notes attribute to formulas) can be cropped and
// transcribed through processComplexBlock / ocr.
GM.registerMenuCommand("复制 Pdf 为 MarkDown (用 haiku 清洗)", async () => {
  // Pages are rendered at 4x; every rect below is expressed in these pixels.
  const RENDER_SCALE = 4;
  // Lines consisting only of letters, bullets, dots, commas and whitespace
  // are never treated as complex.
  const PLAIN_TEXT_LINE = /^[A-Za-z•.,\s]+$/;

  let loadingTask;
  try {
    const PDFJS = await import('https://unpkg.com/pdfjs-dist/build/pdf.min.mjs');
    PDFJS.GlobalWorkerOptions.workerSrc = 'https://unpkg.com/pdfjs-dist/build/pdf.worker.min.mjs';
    loadingTask = PDFJS.getDocument(window.location.href);
    const doc = await loadingTask.promise;
    const result = [];
    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
      const lines = [];
      const page = await doc.getPage(pageNumber);
      const textContent = await page.getTextContent();

      // Render the page and decode it into an Image that cropImage can read.
      const viewport = page.getViewport({ scale: RENDER_SCALE });
      const canvas = document.createElement('canvas');
      canvas.width = viewport.width;
      canvas.height = viewport.height;
      await page.render({
        canvasContext: canvas.getContext('2d'),
        viewport,
      }).promise;
      const imgUrl = canvas.toDataURL('image/jpeg');
      const img = new Image();
      await new Promise((resolve, reject) => {
        img.onload = () => {
          resolve(img);
        };
        // Without a rejection path a failed decode would hang the command.
        img.onerror = () => {
          reject(new Error('failed to decode rendered page ' + pageNumber));
        };
        img.src = imgUrl;
      });

      // Pass 1: group text items into [isComplex, text, rects] lines.
      let currentLineY = 0;
      let currentLineText = '';
      let currentLineHeight = 0;
      let currentLineRects = [];
      for (const item of textContent.items) {
        if (item.height === 0) {
          continue;
        }
        const y = item.transform[5];
        if (y !== currentLineY && (item.str.length > 4 || currentLineY - y > 11.5)) {
          if (currentLineText) {
            const isComplex = currentLineRects.length > 4
              && !currentLineText.includes('i.e .')
              && !currentLineText.includes('e.g .')
              && !currentLineText.includes('↓')
              && !currentLineText.includes('↑')
              && !PLAIN_TEXT_LINE.test(currentLineText);
            if (!isComplex && currentLineHeight > 11) {
              lines.push([false, '## ' + currentLineText, currentLineRects]);
            } else {
              // The line just before a complex line is pulled into the block
              // as well.
              if (isComplex && lines.length) {
                lines[lines.length - 1][0] = true;
              }
              lines.push([isComplex, currentLineText, currentLineRects]);
            }
          }
          currentLineText = '';
          currentLineY = y;
          currentLineHeight = item.height;
          currentLineRects = [];
        }
        currentLineText = currentLineText + ' ' + item.str;
        // Text-space box -> top-left-origin image pixels. The 2.7 / 4.4
        // height factors look empirically tuned for the 4x render (they pad
        // the box vertically) — NOTE(review): confirm before changing
        // RENDER_SCALE.
        const rect = {
          x: item.transform[4] * RENDER_SCALE,
          y: viewport.height - item.transform[5] * RENDER_SCALE - item.height * 2.7,
          width: item.width * RENDER_SCALE,
          height: item.height * 4.4,
        };
        currentLineRects.push(rect);
      }
      // The page's last line is always pushed as simple, with a dummy rect;
      // pass 2 relies on it to close any block still open at the page end.
      lines.push([false, currentLineText, [{ x: 0, y: 0, width: 0, height: 0 }]]);

      // Pass 2: merge neighbouring complex lines into blocks and emit text.
      let complexBlock = [];
      let continuousSimple = 0;
      for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
        const line = lines[lineIndex];
        const [isComplex, text, rects] = line;
        if (isComplex) {
          continuousSimple = 0;
        } else {
          continuousSimple += 1;
        }
        if (complexBlock.length === 0) {
          if (isComplex) {
            complexBlock = [line];
          } else {
            result.push(text);
          }
        } else {
          // A block keeps growing while fewer than 3 simple lines in a row
          // were seen, the page is not finished, and the line starts within
          // 800px of the block's first line (further away is presumably
          // another column — TODO confirm).
          const extendsBlock = continuousSimple < 3
            && lineIndex < lines.length - 1
            && Math.abs(rects[0].x - complexBlock[0][2][0].x) < 800;
          if (extendsBlock) {
            complexBlock.push(line);
          } else {
            await processComplexBlock(result, img, complexBlock);
            if (isComplex) {
              // A complex line that closed the previous block starts a new
              // block instead of being emitted as plain text.
              complexBlock = [line];
            } else {
              complexBlock = [];
              result.push(text);
            }
          }
        }
      }
    }
    const allText = result.join('\n');
    GM_setClipboard(allText);
    alert('copied ' + allText.length + ' characters');
  } finally {
    // Release the pdf.js worker and any pending network requests, also when
    // ocr() aborts the run.
    if (loadingTask) {
      await loadingTask.destroy();
    }
  }
});