taowen · September 15, 2025 01:45
diff --git a/extractTextFromPdf.md b/extractTextFromPdf.md
diff --git a/extractTextFromPdf.user.js b/extractTextFromPdf.user.js
 // ==UserScript==
 // @name        拷贝 PDF 中的文本
 // @description 方便粘贴到 chatgpt 进行问答
 // @namespace   github.com/taowen
 // @match       *://*/*pdf*
 // @version     1.0.0
 // @author      taowen
 // @license     MIT
 // @grant       GM.registerMenuCommand
 // @grant       GM_setClipboard
 // @grant       GM.getValue
 // @grant       GM.setValue
 // @grant       GM.xmlHttpRequest
 // ==/UserScript==
 // Menu command: read the text layer of the PDF at the current URL with pdf.js,
 // regroup the fragments into lines and copy the result to the clipboard as
 // rough Markdown.
 GM.registerMenuCommand("复制 Pdf 为 MarkDown", async () => {
   const pdfjs = await import('https://unpkg.com/pdfjs-dist/build/pdf.min.mjs');
   pdfjs.GlobalWorkerOptions.workerSrc = 'https://unpkg.com/pdfjs-dist/build/pdf.worker.min.mjs';
   const pdf = await pdfjs.getDocument(window.location.href).promise;
   const markdownLines = [];
   for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
     const page = await pdf.getPage(pageNumber);
     const { items } = await page.getTextContent();
     // State of the line currently being assembled; reset for every page.
     let lineY = 0;
     let lineText = '';
     let lineHeight = 0;
     for (const item of items) {
       // Fragments reported with zero height are ignored.
       if (item.height === 0) {
         continue;
       }
       const y = item.transform[5];
       // A fragment on a different y only opens a new line when it is longer
       // than 4 characters or sits more than 11.5 units below the current
       // line; anything else is glued onto the current line
       // (presumably to keep sub/superscripts inline -- TODO confirm).
       const startsNewLine = y !== lineY && (item.str.length > 4 || lineY - y > 11.5);
       if (!startsNewLine) {
         lineText += item.str;
         continue;
       }
       if (lineText) {
         // A line whose first fragment is taller than 11 units becomes a heading.
         markdownLines.push(lineHeight > 11 ? '## ' + lineText : lineText);
       }
       lineText = item.str;
       lineY = y;
       lineHeight = item.height;
     }
     // The line still pending at the end of the page is emitted verbatim,
     // without the heading check and even when it is empty.
     markdownLines.push(lineText);
   }
   const allText = markdownLines.join('\n');
   GM_setClipboard(allText);
   alert('copied ' + allText.length + ' characters');
 });

 /**
  * Asks a Claude model to transcribe a PNG crop of a PDF page into Markdown
  * with LaTeX.
  *
  * The request carries the image, the text-layer content of the same region
  * as a `<referenceText>` hint, and a prefilled assistant turn that opens a
  * markdown code fence, so the reply is the continuation of that fence.
  *
  * @param {string} imageBase64 - PNG data URL (`data:image/png;base64,...`).
  * @param {string} referenceText - Text extracted for the cropped region.
  * @param {{url?: string, apiKey?: string}} [credentials] - Endpoint URL and
  *   API key. Both default to empty: edit the defaults locally or pass them in.
  * @returns {Promise<string>} Text of the first content block of the response.
  *   Rejects when the request fails, the HTTP status is not 2xx, or the body
  *   is not the expected JSON shape.
  * @throws {Error} Synchronously when the credentials are missing (after
  *   alerting the user) or when the image is not a PNG data URL.
  */
 function ocr(imageBase64, referenceText, { url = '', apiKey = '' } = {}) {
   // Only refuse to run while the credentials are still blank, instead of
   // unconditionally throwing and leaving the request code unreachable.
   if (!url || !apiKey) {
     const notConfigured = '请本地修改 user script，填入 Claude 账号';
     alert(notConfigured);
     throw new Error(notConfigured);
   }
   const pngPrefix = 'data:image/png;base64,';
   if (!imageBase64.startsWith(pngPrefix)) {
     throw new Error('expect png');
   }
   // The request body wants the bare base64 payload, not the data URL.
   const pngData = imageBase64.substring(pngPrefix.length);
   return new Promise((resolve, reject) => {
     GM.xmlHttpRequest({
       method: 'POST',
       url,
       headers: {
         "x-api-key": apiKey,
         "anthropic-version": "2023-06-01",
         "Content-Type": "application/json"
       },
       data: JSON.stringify({
         model: "claude-3-haiku-20240307",
         max_tokens: 4000,
         temperature: 0,
         messages: [{
           role: 'user', content: [
             {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": pngData}},
             {"type": "text", "text": `<referenceText>${referenceText}</referenceText>\n` + 'Markdown latex can have single $ or double $$. Transcribe this paper to markdown with latex exactly.'}
           ]}, {
           role: 'assistant',
           content: [{ type: 'text', text: 'Here is the text transcribed to Markdown:\n```markdown'}]
         }],
       }),
       onload(response) {
         // Anything thrown in here would otherwise escape the callback and
         // leave the promise pending forever, so convert it into a rejection.
         try {
           if (response.status < 200 || response.status >= 300) {
             throw new Error('HTTP ' + response.status);
           }
           const result = JSON.parse(response.responseText);
           resolve(result['content'][0]['text']);
         } catch (err) {
           console.error('failed to ocr', response);
           reject(new Error('failed to ocr: ' + err.message));
         }
       },
       onerror(response) {
         console.error('failed to ocr', response);
         reject(new Error('failed to ocr'));
       },
     });
   });
 }

 /**
  * Debug helper: prints an image into the console by logging a single space
  * styled with the image as its CSS background.
  *
  * @param {string} url - Image URL (data URLs work too).
  * @param {number} [size=50] - Display scale as a percentage of the image's
  *   own dimensions.
  */
 function logImage(url, size = 50) {
   const image = new Image();
   image.src = url;
   image.onload = () => {
     // The padding is what gives the otherwise empty log entry its size.
     const verticalPad = image.height / 100 * size;
     const horizontalPad = image.width / 100 * size;
     const css = `font-size: 1px; padding: ${verticalPad}px ${horizontalPad}px; background: url(${url}) no-repeat; background-size: contain;`;
     console.log('%c ', css);
   };
 }

 /**
  * Reads a Blob and returns its content as a base64 data URL.
  *
  * @param {Blob} blob - Data to encode.
  * @returns {Promise<string>} Resolves with the `data:` URL produced by
  *   `FileReader.readAsDataURL`; rejects if the read fails.
  */
 function blobToBase64(blob) {
   const fileReader = new FileReader();
   return new Promise((resolve, reject) => {
     fileReader.onload = () => {
       resolve(fileReader.result);
     };
     // Without an error handler a failed read would leave the promise
     // pending forever and stall every caller awaiting it.
     fileReader.onerror = () => {
       reject(fileReader.error ?? new Error('failed to read blob'));
     };
     fileReader.readAsDataURL(blob);
   });
 }

 /**
  * Cuts a rectangle out of an image and returns it as a base64 data URL.
  *
  * @param {CanvasImageSource} img - Source image.
  * @param {{x: number, y: number, width: number, height: number}} boundingBox -
  *   Region to copy, in source-image pixels.
  * @returns {Promise<string>} Data URL of the cropped region, encoded with the
  *   default `convertToBlob()` settings.
  */
 async function cropImage(img, boundingBox) {
   const { x: left, y: top, width, height } = boundingBox;
   const canvas = new OffscreenCanvas(width, height);
   // Copy the region 1:1 into the top-left corner of the scratch canvas.
   canvas.getContext('2d').drawImage(
     img,
     left, top, width, height,
     0, 0, width, height,
   );
   const blob = await canvas.convertToBlob();
   return await blobToBase64(blob);
 }

 /**
  * Appends the text for one run of lines to `result`.
  *
  * Runs of at most 4 lines are copied through using their extracted text.
  * Longer runs are re-transcribed: every line except the last contributes its
  * rectangles to one bounding box, that region is cropped from the page image
  * and sent to `ocr` with the extracted text as a hint. The transcription
  * (cut at its last code fence and trimmed) is appended, followed by the
  * extracted text of the last line.
  *
  * @param {string[]} result - Output list, appended to in place.
  * @param {CanvasImageSource} img - Rendered page image to crop from.
  * @param {Array<[boolean, string, Array<{x: number, y: number, width: number, height: number}>]>} complexBlock -
  *   Lines as `[isComplex, text, rects]` entries.
  * @returns {Promise<void>}
  */
 async function processComplexBlock(result, img, complexBlock) {
   if (complexBlock.length <= 4) {
     for (const entry of complexBlock) {
       result.push(entry[1]);
     }
     return;
   }

   // The last line is excluded from both the crop and the reference text.
   const ocrLines = complexBlock.slice(0, -1);
   const referenceText = ocrLines.map((entry) => entry[1] + '\n').join('');

   // Union of every fragment rectangle in the lines being transcribed.
   const bounds = { left: Infinity, top: Infinity, right: -Infinity, bottom: -Infinity };
   for (const entry of ocrLines) {
     for (const rect of entry[2]) {
       bounds.left = Math.min(bounds.left, rect.x);
       bounds.top = Math.min(bounds.top, rect.y);
       bounds.right = Math.max(bounds.right, rect.x + rect.width);
       bounds.bottom = Math.max(bounds.bottom, rect.y + rect.height);
     }
   }

   const patch = await cropImage(img, {
     x: bounds.left,
     y: bounds.top,
     width: bounds.right - bounds.left,
     height: bounds.bottom - bounds.top,
   });
   logImage(patch);
   console.log(referenceText);

   const transcription = await ocr(patch, referenceText);
   // Drop the closing code fence (and anything after it) if there is one.
   const fenceAt = transcription.lastIndexOf('```');
   const patchText = (fenceAt === -1 ? transcription : transcription.substring(0, fenceAt)).trim();
   console.log(patchText);
   result.push(patchText, complexBlock[complexBlock.length - 1][1]);
 }

 // Menu command: like the plain copy command, but each page is also rendered to
 // an image so that runs of "complex" lines can be cropped out and
 // re-transcribed through processComplexBlock. The combined text is copied to
 // the clipboard.
 GM.registerMenuCommand("复制 Pdf 为 MarkDown (用 haiku 清洗)", async () => {
  const PDFJS = await import('https://unpkg.com/pdfjs-dist/build/pdf.min.mjs');
  PDFJS.GlobalWorkerOptions.workerSrc = 'https://unpkg.com/pdfjs-dist/build/pdf.worker.min.mjs';
  const doc = await PDFJS.getDocument(window.location.href).promise;
  // Output lines for the whole document.
  let result = [];
  for (let i = 1; i < doc.numPages + 1; i++) {
    // Per-page entries of the form [isComplex, text, rects].
    const lines = [];
    const page = await doc.getPage(i)
    const textContent = await page.getTextContent();
    // Render at 4x; the rect computation below multiplies by the same factor.
    const viewport = page.getViewport({ scale: 4 });

    var canvas = document.createElement('canvas');

    canvas.width = viewport.width;
    canvas.height = viewport.height;

    await page.render({
      canvasContext: canvas.getContext('2d'),
      viewport,
    }).promise;
    // Round-trip the rendered page through a data URL into an Image, which is
    // what gets passed on for cropping.
    const imgUrl = canvas.toDataURL('image/jpeg');
    const img = new Image();
    await new Promise(resolve => {
        img.onload = () => {
            resolve(img);
        };
        img.src = imgUrl;
    });


    // State of the line currently being assembled.
    let currentLineY = 0;
    let currentLineText = '';
    let currentLineHeight = 0;
    let currentLineRects = [];
    for (let item of textContent.items) {
      // Fragments reported with zero height are ignored.
      if(item.height === 0) {
        continue;
      }
      const y = item.transform[5]
      // A fragment on a different y only opens a new line when it is longer
      // than 4 characters or sits more than 11.5 units below the current line.
      if (y !== currentLineY && (item.str.length > 4 || currentLineY - y > 11.5)) {
        if (currentLineText) {
          // "Complex" = more than 4 fragments on the line, none of the listed
          // marker substrings, and not made up solely of letters, bullets,
          // '.', ',' and whitespace.
          const isComplex = currentLineRects.length > 4 && !currentLineText.includes('i.e .') && !currentLineText.includes('e.g .') && !currentLineText.includes('↓') && !currentLineText.includes('↑') && !/^[A-Za-z•.,\s]+$/.test(currentLineText)
          if (!isComplex && currentLineHeight > 11) {
            // Tall, simple lines become headings.
            lines.push([false, '## ' + currentLineText, currentLineRects]);
          } else {
            if (isComplex && lines.length) {
              // A complex line also marks the line right before it as complex.
              lines[lines.length - 1][0] = true;
            }
            lines.push([isComplex, currentLineText, currentLineRects]);
          }
        }
        currentLineText = '';
        currentLineY = y;
        currentLineHeight = item.height;
        currentLineRects = [];
      }
      // Every fragment is appended with a leading space (the line text
      // therefore always starts with one).
      currentLineText = currentLineText + ' ' + item.str;
      // Fragment rectangle in canvas pixels: positions and width are scaled by
      // 4 and y is measured from the top of the viewport. The 2.7 / 4.4
      // height factors look like empirical padding -- TODO confirm.
      const rect = { x: item.transform[4] * 4, y: viewport.height - item.transform[5] * 4 - item.height * 2.7, width: item.width * 4, height: item.height * 4.4 };
      currentLineRects.push(rect);
    }
    // The line still pending at the end of the page is always stored as
    // non-complex, with a zero-sized placeholder rect.
    lines.push([false, currentLineText, [{ x:0, y:0, width:0, height: 0}]]);

    // Second pass: gather lines into blocks for processComplexBlock.
    let complexBlock = []
    // Number of non-complex lines seen in a row.
    let continuousSimple = 0
    // NOTE: this `i` shadows the page index of the enclosing loop.
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i]
      const [isComplex, text, rects] = line;
      if (isComplex) {
        continuousSimple = 0;
      } else {
        continuousSimple += 1;
      }
      if (complexBlock.length === 0) {
        // No open block: a complex line opens one, a simple line is emitted.
        if (isComplex) {
          complexBlock = [line];
        } else {
          result.push(text);
        }
      } else {
        // The open block keeps growing while fewer than 3 simple lines have
        // appeared in a row, this is not the page's last line, and the line
        // starts within 800 px (horizontally) of the block's first fragment.
        if (continuousSimple < 3 && i < lines.length - 1 && Math.abs(rects[0].x - complexBlock[0][2][0].x) < 800) {
          complexBlock.push(line);
        } else {
          // Otherwise the block is flushed and the current line is emitted as
          // plain text.
          // NOTE(review): that also happens when the current line is itself
          // complex -- it does not open a new block. Confirm this is intended.
          await processComplexBlock(result, img, complexBlock);
          complexBlock = [];
          result.push(text);
        }
      }
    }
  }
  const allText = result.join('\n');
  GM_setClipboard(allText);
  alert('copied ' + allText.length + ' characters');
 });
	// ==UserScript==
	// @name 拷贝 PDF 中的文本
	// @description 方便粘贴到 chatgpt 进行问答
	// @namespace github.com/taowen
	// @match *://*/*pdf*
	// @version 1.0.0
	// @author taowen
	// @license MIT
	// @grant GM.registerMenuCommand
	// @grant GM_setClipboard
	// @grant GM.getValue
	// @grant GM.setValue
	// @grant GM.xmlHttpRequest
	// ==/UserScript==
	// Menu command: read the text layer of the PDF at the current URL with pdf.js,
	// regroup the fragments into lines and copy the result to the clipboard as
	// rough Markdown.
	GM.registerMenuCommand("复制 Pdf 为 MarkDown", async () => {
	  const PDFJS = await import('https://unpkg.com/pdfjs-dist/build/pdf.min.mjs');
	  PDFJS.GlobalWorkerOptions.workerSrc = 'https://unpkg.com/pdfjs-dist/build/pdf.worker.min.mjs';
	  const doc = await PDFJS.getDocument(window.location.href).promise;
	  const lines = [];
	  for (let i = 1; i < doc.numPages + 1; i++) {
	    const page = await doc.getPage(i);
	    const textContent = await page.getTextContent();
	    // State of the line currently being assembled; reset for every page.
	    let currentLineY = 0;
	    let currentLineText = '';
	    let currentLineHeight = 0;
	    for (const item of textContent.items) {
	      // Fragments reported with zero height are ignored.
	      if (item.height === 0) {
	        continue;
	      }
	      const y = item.transform[5];
	      // A fragment on a different y only opens a new line when it is longer
	      // than 4 characters or sits more than 11.5 units below the current
	      // line. (The `||` here had been mangled into `\|\|`, a syntax error.)
	      if (y !== currentLineY && (item.str.length > 4 || currentLineY - y > 11.5)) {
	        if (currentLineText) {
	          // A line whose first fragment is taller than 11 units becomes a heading.
	          if (currentLineHeight > 11) {
	            lines.push('## ' + currentLineText);
	          } else {
	            lines.push(currentLineText);
	          }
	        }
	        currentLineText = item.str;
	        currentLineY = y;
	        currentLineHeight = item.height;
	      } else {
	        currentLineText += item.str;
	      }
	    }
	    // The line still pending at the end of the page is emitted verbatim.
	    lines.push(currentLineText);
	  }
	  const allText = lines.join('\n');
	  GM_setClipboard(allText);
	  alert('copied ' + allText.length + ' characters');
	});

	/**
	 * Asks a Claude model to transcribe a PNG crop of a PDF page into Markdown
	 * with LaTeX.
	 *
	 * The request carries the image, the text-layer content of the same region
	 * as a `<referenceText>` hint, and a prefilled assistant turn that opens a
	 * markdown code fence, so the reply is the continuation of that fence.
	 *
	 * @param {string} imageBase64 - PNG data URL (`data:image/png;base64,...`).
	 * @param {string} referenceText - Text extracted for the cropped region.
	 * @param {{url?: string, apiKey?: string}} [credentials] - Endpoint URL and
	 *   API key. Both default to empty: edit the defaults locally or pass them in.
	 * @returns {Promise<string>} Text of the first content block of the response.
	 *   Rejects when the request fails, the HTTP status is not 2xx, or the body
	 *   is not the expected JSON shape.
	 * @throws {Error} Synchronously when the credentials are missing (after
	 *   alerting the user) or when the image is not a PNG data URL.
	 */
	function ocr(imageBase64, referenceText, { url = '', apiKey = '' } = {}) {
	  // Only refuse to run while the credentials are still blank, instead of
	  // unconditionally throwing and leaving the request code unreachable.
	  if (!url || !apiKey) {
	    const notConfigured = '请本地修改 user script，填入 Claude 账号';
	    alert(notConfigured);
	    throw new Error(notConfigured);
	  }
	  const pngPrefix = 'data:image/png;base64,';
	  if (!imageBase64.startsWith(pngPrefix)) {
	    throw new Error('expect png');
	  }
	  // The request body wants the bare base64 payload, not the data URL.
	  const pngData = imageBase64.substring(pngPrefix.length);
	  return new Promise((resolve, reject) => {
	    GM.xmlHttpRequest({
	      method: 'POST',
	      url,
	      headers: {
	        "x-api-key": apiKey,
	        "anthropic-version": "2023-06-01",
	        "Content-Type": "application/json"
	      },
	      data: JSON.stringify({
	        model: "claude-3-haiku-20240307",
	        max_tokens: 4000,
	        temperature: 0,
	        messages: [{
	          role: 'user', content: [
	            {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": pngData}},
	            {"type": "text", "text": `<referenceText>${referenceText}</referenceText>\n` + 'Markdown latex can have single $ or double $$. Transcribe this paper to markdown with latex exactly.'}
	          ]}, {
	          role: 'assistant',
	          content: [{ type: 'text', text: 'Here is the text transcribed to Markdown:\n```markdown'}]
	        }],
	      }),
	      onload(response) {
	        // Anything thrown in here would otherwise escape the callback and
	        // leave the promise pending forever, so convert it into a rejection.
	        try {
	          if (response.status < 200 || response.status >= 300) {
	            throw new Error('HTTP ' + response.status);
	          }
	          const result = JSON.parse(response.responseText);
	          resolve(result['content'][0]['text']);
	        } catch (err) {
	          console.error('failed to ocr', response);
	          reject(new Error('failed to ocr: ' + err.message));
	        }
	      },
	      onerror(response) {
	        console.error('failed to ocr', response);
	        reject(new Error('failed to ocr'));
	      },
	    });
	  });
	}

	/**
	 * Debug helper: prints an image into the console by logging a single space
	 * styled with the image as its CSS background.
	 *
	 * @param {string} url - Image URL (data URLs work too).
	 * @param {number} [size=50] - Display scale as a percentage of the image's
	 *   own dimensions.
	 */
	function logImage(url, size = 50) {
	  const image = new Image();
	  image.src = url;
	  image.onload = function() {
	    // The multiplication signs had been lost (`100size`), which is a syntax
	    // error; the padding is the image dimension scaled by size percent.
	    const style = [
	      'font-size: 1px;',
	      'padding: ' + this.height / 100 * size + 'px ' + this.width / 100 * size + 'px;',
	      'background: url(' + url + ') no-repeat;',
	      'background-size: contain;'
	    ].join(' ');
	    console.log('%c ', style);
	  };
	}

	/**
	 * Reads a Blob and returns its content as a base64 data URL.
	 *
	 * @param {Blob} blob - Data to encode.
	 * @returns {Promise<string>} Resolves with the `data:` URL produced by
	 *   `FileReader.readAsDataURL`; rejects if the read fails.
	 */
	function blobToBase64(blob) {
	  const fileReader = new FileReader();
	  return new Promise((resolve, reject) => {
	    fileReader.onload = () => {
	      resolve(fileReader.result);
	    };
	    // Without an error handler a failed read would leave the promise
	    // pending forever and stall every caller awaiting it.
	    fileReader.onerror = () => {
	      reject(fileReader.error ?? new Error('failed to read blob'));
	    };
	    fileReader.readAsDataURL(blob);
	  });
	}

	/**
	 * Cuts a rectangle out of an image and returns it as a base64 data URL.
	 *
	 * @param {CanvasImageSource} img - Source image.
	 * @param {{x: number, y: number, width: number, height: number}} boundingBox -
	 *   Region to copy, in source-image pixels.
	 * @returns {Promise<string>} Data URL of the cropped region, encoded with the
	 *   default `convertToBlob()` settings.
	 */
	async function cropImage(img, boundingBox) {
	  const { x: left, y: top, width, height } = boundingBox;
	  const canvas = new OffscreenCanvas(width, height);
	  // Copy the region 1:1 into the top-left corner of the scratch canvas.
	  canvas.getContext('2d').drawImage(
	    img,
	    left, top, width, height,
	    0, 0, width, height,
	  );
	  const blob = await canvas.convertToBlob();
	  return await blobToBase64(blob);
	}

	/**
	 * Appends the text for one run of lines to `result`.
	 *
	 * Runs of at most 4 lines are copied through using their extracted text.
	 * Longer runs are re-transcribed: every line except the last contributes its
	 * rectangles to one bounding box, that region is cropped from the page image
	 * and sent to `ocr` with the extracted text as a hint. The transcription
	 * (cut at its last code fence and trimmed) is appended, followed by the
	 * extracted text of the last line.
	 *
	 * @param {string[]} result - Output list, appended to in place.
	 * @param {CanvasImageSource} img - Rendered page image to crop from.
	 * @param {Array<[boolean, string, Array<{x: number, y: number, width: number, height: number}>]>} complexBlock -
	 *   Lines as `[isComplex, text, rects]` entries.
	 * @returns {Promise<void>}
	 */
	async function processComplexBlock(result, img, complexBlock) {
	  if (complexBlock.length <= 4) {
	    for (const entry of complexBlock) {
	      result.push(entry[1]);
	    }
	    return;
	  }

	  // The last line is excluded from both the crop and the reference text.
	  const ocrLines = complexBlock.slice(0, -1);
	  const referenceText = ocrLines.map((entry) => entry[1] + '\n').join('');

	  // Union of every fragment rectangle in the lines being transcribed.
	  const bounds = { left: Infinity, top: Infinity, right: -Infinity, bottom: -Infinity };
	  for (const entry of ocrLines) {
	    for (const rect of entry[2]) {
	      bounds.left = Math.min(bounds.left, rect.x);
	      bounds.top = Math.min(bounds.top, rect.y);
	      bounds.right = Math.max(bounds.right, rect.x + rect.width);
	      bounds.bottom = Math.max(bounds.bottom, rect.y + rect.height);
	    }
	  }

	  const patch = await cropImage(img, {
	    x: bounds.left,
	    y: bounds.top,
	    width: bounds.right - bounds.left,
	    height: bounds.bottom - bounds.top,
	  });
	  logImage(patch);
	  console.log(referenceText);

	  const transcription = await ocr(patch, referenceText);
	  // Drop the closing code fence (and anything after it) if there is one.
	  const fenceAt = transcription.lastIndexOf('```');
	  const patchText = (fenceAt === -1 ? transcription : transcription.substring(0, fenceAt)).trim();
	  console.log(patchText);
	  result.push(patchText, complexBlock[complexBlock.length - 1][1]);
	}

	// Menu command: like the plain copy command, but each page is also rendered to
	// an image so that runs of "complex" lines can be cropped out and
	// re-transcribed through processComplexBlock. The combined text is copied to
	// the clipboard.
	GM.registerMenuCommand("复制 Pdf 为 MarkDown (用 haiku 清洗)", async () => {
	  const PDFJS = await import('https://unpkg.com/pdfjs-dist/build/pdf.min.mjs');
	  PDFJS.GlobalWorkerOptions.workerSrc = 'https://unpkg.com/pdfjs-dist/build/pdf.worker.min.mjs';
	  const doc = await PDFJS.getDocument(window.location.href).promise;
	  // Output lines for the whole document.
	  const result = [];
	  for (let i = 1; i < doc.numPages + 1; i++) {
	    // Per-page entries of the form [isComplex, text, rects].
	    const lines = [];
	    const page = await doc.getPage(i);
	    const textContent = await page.getTextContent();
	    // Render at 4x; the rect computation below multiplies by the same factor.
	    const viewport = page.getViewport({ scale: 4 });

	    const canvas = document.createElement('canvas');
	    canvas.width = viewport.width;
	    canvas.height = viewport.height;

	    await page.render({
	      canvasContext: canvas.getContext('2d'),
	      viewport,
	    }).promise;
	    // Round-trip the rendered page through a data URL into an Image, which
	    // is what gets passed on for cropping.
	    const imgUrl = canvas.toDataURL('image/jpeg');
	    const img = new Image();
	    await new Promise(resolve => {
	      img.onload = () => {
	        resolve(img);
	      };
	      img.src = imgUrl;
	    });

	    // State of the line currently being assembled.
	    let currentLineY = 0;
	    let currentLineText = '';
	    let currentLineHeight = 0;
	    let currentLineRects = [];
	    for (const item of textContent.items) {
	      // Fragments reported with zero height are ignored.
	      if (item.height === 0) {
	        continue;
	      }
	      const y = item.transform[5];
	      // A fragment on a different y only opens a new line when it is longer
	      // than 4 characters or sits more than 11.5 units below the current
	      // line. (The `||` here had been mangled into `\|\|`, a syntax error.)
	      if (y !== currentLineY && (item.str.length > 4 || currentLineY - y > 11.5)) {
	        if (currentLineText) {
	          // "Complex" = more than 4 fragments on the line, none of the
	          // listed marker substrings, and not made up solely of letters,
	          // bullets, '.', ',' and whitespace.
	          const isComplex = currentLineRects.length > 4 && !currentLineText.includes('i.e .') && !currentLineText.includes('e.g .') && !currentLineText.includes('↓') && !currentLineText.includes('↑') && !/^[A-Za-z•.,\s]+$/.test(currentLineText);
	          if (!isComplex && currentLineHeight > 11) {
	            // Tall, simple lines become headings.
	            lines.push([false, '## ' + currentLineText, currentLineRects]);
	          } else {
	            if (isComplex && lines.length) {
	              // A complex line also marks the line right before it as complex.
	              lines[lines.length - 1][0] = true;
	            }
	            lines.push([isComplex, currentLineText, currentLineRects]);
	          }
	        }
	        currentLineText = '';
	        currentLineY = y;
	        currentLineHeight = item.height;
	        currentLineRects = [];
	      }
	      // Every fragment is appended with a leading space.
	      currentLineText = currentLineText + ' ' + item.str;
	      // Fragment rectangle in canvas pixels: positions and width are scaled
	      // by 4 and y is measured from the top of the viewport. The 2.7 / 4.4
	      // height factors look like empirical padding -- TODO confirm.
	      const rect = { x: item.transform[4] * 4, y: viewport.height - item.transform[5] * 4 - item.height * 2.7, width: item.width * 4, height: item.height * 4.4 };
	      currentLineRects.push(rect);
	    }
	    // The line still pending at the end of the page is always stored as
	    // non-complex, with a zero-sized placeholder rect.
	    lines.push([false, currentLineText, [{ x:0, y:0, width:0, height: 0}]]);

	    // Second pass: gather lines into blocks for processComplexBlock.
	    let complexBlock = [];
	    // Number of non-complex lines seen in a row.
	    let continuousSimple = 0;
	    // NOTE: this `i` shadows the page index of the enclosing loop.
	    for (let i = 0; i < lines.length; i++) {
	      const line = lines[i];
	      const [isComplex, text, rects] = line;
	      if (isComplex) {
	        continuousSimple = 0;
	      } else {
	        continuousSimple += 1;
	      }
	      if (complexBlock.length === 0) {
	        // No open block: a complex line opens one, a simple line is emitted.
	        if (isComplex) {
	          complexBlock = [line];
	        } else {
	          result.push(text);
	        }
	      } else {
	        // The open block keeps growing while fewer than 3 simple lines have
	        // appeared in a row, this is not the page's last line, and the line
	        // starts within 800 px (horizontally) of the block's first fragment.
	        if (continuousSimple < 3 && i < lines.length - 1 && Math.abs(rects[0].x - complexBlock[0][2][0].x) < 800) {
	          complexBlock.push(line);
	        } else {
	          // Otherwise the block is flushed and the current line is emitted
	          // as plain text.
	          // NOTE(review): that also happens when the current line is itself
	          // complex -- it does not open a new block. Confirm this is intended.
	          await processComplexBlock(result, img, complexBlock);
	          complexBlock = [];
	          result.push(text);
	        }
	      }
	    }
	  }
	  const allText = result.join('\n');
	  GM_setClipboard(allText);
	  alert('copied ' + allText.length + ' characters');
	});
No results found