Created
March 15, 2021 08:00
-
-
Save rileypeterson/723a8650affec02098fd5146f47bf488 to your computer and use it in GitHub Desktop.
Redact phrases of text from a PDF using Python and (only) PyPDF2. As written it processes only the first page; extending it to multiple pages is a trivial modification.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import PyPDF2 | |
from PyPDF2 import PdfFileReader, PdfFileWriter | |
from PyPDF2.generic import StreamObject, DecodedStreamObject | |
from PyPDF2 import filters | |
# User configuration: edit the phrases/text you want redacted and the
# input/output file paths.
phrases = ["123-45-6789", "YOUR_SECRET_PASSWORD", "YOUR_SECRET_ID"]
read_path = "/path/to/yourpdf.pdf"
write_path = "/path/to/yourpdf-redacted.pdf"
def replace_w_stars(text, phrase):
    """Return *text* with every occurrence of *phrase* masked by asterisks.

    Each occurrence is replaced by a run of ``*`` exactly as long as
    *phrase*, so the length of the text does not change.
    """
    return text.replace(phrase, "*" * len(phrase))
def redact(text, phrases=phrases):
    """Mask every phrase in *phrases* wherever it occurs in *text*.

    Args:
        text: The content to scrub, either ``str`` or ``bytes``.
        phrases: Strings to mask. Defaults to the module-level ``phrases``
            list as bound when this function is defined.

    Returns:
        The scrubbed content as ``str`` (``bytes`` input is decoded first).
    """
    if isinstance(text, bytes):
        # latin-1 maps each byte value to exactly one code point, so any
        # byte sequence decodes without error and re-encodes to the same
        # bytes. The default UTF-8 codec raises on arbitrary binary data.
        # NOTE(review): PyPDF2 1.x appears to re-encode str stream data as
        # latin-1 downstream; confirm against the installed version.
        text = text.decode("latin-1")
    for phrase in phrases:
        text = replace_w_stars(text, phrase)
    return text
class EncodedStreamObject(StreamObject):
    """Encoded stream whose decoded content is passed through ``redact``.

    Intended as a replacement for ``PyPDF2.generic.EncodedStreamObject``:
    ``getData`` decodes the stream with ``filters.decodeStreamData``,
    redacts the result and caches it in ``decodedSelf``.
    """

    def __init__(self):
        super().__init__()
        # Cache for the decoded, redacted stream; filled by getData().
        self.decodedSelf = None

    def getData(self):
        """Return the decoded stream data with the configured phrases masked."""
        # Compare against None rather than truthiness: the cached object is
        # dict-like, so one holding no entries would be falsy and the stream
        # would be decoded and redacted again on every call.
        if self.decodedSelf is not None:
            return self.decodedSelf.getData()

        # First access: decode, redact, and build the cached decoded object.
        decoded = DecodedStreamObject()
        decoded._data = redact(filters.decodeStreamData(self))
        for key, value in self.items():
            # Copy every entry except the three that are skipped here.
            if key not in ("/Length", "/Filter", "/DecodeParms"):
                decoded[key] = value
        self.decodedSelf = decoded
        return decoded._data
# Overload with redaction version
PyPDF2.generic.EncodedStreamObject = EncodedStreamObject

# Read in the PDF. The input handle stays open until the output has been
# written, as in the original flow, and both handles are now closed even if
# an exception is raised along the way.
with open(read_path, "rb") as f:
    r = PdfFileReader(f)

    # Force the redaction by merging the first page with itself
    page = r.getPage(0)
    page.mergePage(r.getPage(0))

    # Write out the result
    with open(write_path, "wb") as f_out:
        w = PdfFileWriter()
        w.addPage(page)
        w.write(f_out)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for your quick reply, Riley. What I am saying is that the content of the `text` variable is encoded or maybe encrypted. Just a more comprehensive example:
When I apply the `extract_text()` method to the page without overwriting the class `EncodedStreamObject`, I correctly get 'This is a very standard text. \n '. The `text` variable has the following content: