Last active
August 10, 2023 04:07
-
-
Save eduwass/1088102195dbc25c9d3b167c8c968d00 to your computer and use it in GitHub Desktop.
pdf to text using pdfjs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<html>
<head></head>
<body>
<input type="file" id="pdf-upload" accept=".pdf" />
<h1>Text will go here</h1>
<div class="pdf-text" id="pdf-text"></div>
<script src="https://mozilla.github.io/pdf.js/build/pdf.js"></script>
<script>
// Wire up the file picker: when the user selects a file, read it into memory
// as bytes and hand those bytes to extractText().
document
  .getElementById("pdf-upload")
  .addEventListener("change", function () {
    // `this` is the <input type="file"> element the listener is attached to.
    const file = this.files[0];
    if (!file) {
      // The selection was cleared or cancelled; there is nothing to read.
      return;
    }

    const reader = new FileReader();
    reader.onload = function () {
      const pdfData = new Uint8Array(reader.result);
      // extractText() returns a promise; report a failure (for example a file
      // that is not a valid PDF) instead of leaving an unhandled rejection.
      extractText(pdfData).catch(function (error) {
        console.error("Failed to extract text from PDF:", error);
      });
    };
    reader.onerror = function () {
      // Surface read failures rather than silently doing nothing.
      console.error("Failed to read the selected file:", reader.error);
    };
    reader.readAsArrayBuffer(file);
  });

// Tell pdf.js which URL to load its worker script from.
pdfjsLib.GlobalWorkerOptions.workerSrc =
  "https://mozilla.github.io/pdf.js/build/pdf.worker.js";
/**
 * Append text to the element with id "pdf-text" one word at a time, pausing
 * after each word.
 *
 * @param {string} text - Text to display; it is split on runs of whitespace.
 * @param {number} [delayMs=100] - Pause after each word, in milliseconds.
 * @returns {Promise<void>} Resolves once every word has been appended.
 */
async function appendTextWordByWord(text, delayMs = 100) {
  const output = document.getElementById("pdf-text");
  // Leading/trailing whitespace (or an empty string) makes split() yield empty
  // strings; drop them so blank "words" are neither appended nor waited on.
  const words = text.split(/\s+/).filter((word) => word !== "");
  for (const word of words) {
    // The text comes from an arbitrary PDF, so insert it as a text node rather
    // than through innerHTML: markup-like content is shown literally instead
    // of being parsed, and the container is not re-parsed on every word.
    output.append(`${word} `);
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
}
/**
 * Load a PDF with pdf.js and stream the text of each page, in page order,
 * to appendTextWordByWord().
 *
 * @param {Uint8Array} pdfData - Raw bytes of the PDF document.
 * @returns {Promise<void>} Resolves after the last page has been appended;
 *   rejects if loading the document or reading a page fails.
 */
async function extractText(pdfData) {
  const loadingTask = pdfjsLib.getDocument({ data: pdfData });
  const pdf = await loadingTask.promise;

  // pdf.js page numbers are 1-based.
  for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
    const page = await pdf.getPage(pageNumber);
    const textContent = await page.getTextContent();
    const pageText = textContent.items
      .map((item) => {
        // Items without a string `str` contribute no text.
        const str = typeof item.str === "string" ? item.str : "";
        // An item flagged with hasEOL ends a line; emit a newline so the last
        // word of that line is not glued to the first word of the next one.
        return item.hasEOL ? `${str}\n` : str;
      })
      .join("");
    // Pages are processed sequentially so their text appears in order.
    await appendTextWordByWord(pageText);
  }
}
// Log a marker once the inline script has run to this point.
console.log("start");
</script>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment