Created
March 15, 2021 08:00
-
-
Save rileypeterson/723a8650affec02098fd5146f47bf488 to your computer and use it in GitHub Desktop.
Redact phrases of text from a PDF using Python and (only) PyPDF2. Would need to modify (trivial) for multiple pages.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import StreamObject, DecodedStreamObject
from PyPDF2 import filters
# User settings: change the phrases/text you want redacted and the file paths.
# Each phrase is matched literally (exact, case-sensitive substring) and every
# occurrence is replaced by the same number of "*" characters.
phrases = ["123-45-6789", "YOUR_SECRET_PASSWORD", "YOUR_SECRET_ID"]
read_path = "/path/to/yourpdf.pdf"
write_path = "/path/to/yourpdf-redacted.pdf"
def replace_w_stars(text: str, phrase: str) -> str:
    """Return *text* with every occurrence of *phrase* masked by asterisks.

    Each match is replaced by ``"*" * len(phrase)``, so the result always has
    the same length as the input. Matching is a literal, case-sensitive
    substring search; an empty *phrase* leaves *text* unchanged.
    """
    return text.replace(phrase, "*" * len(phrase))
def redact(text, phrases=phrases):
    """Mask every phrase in *phrases* wherever it appears in *text*.

    Args:
        text: Decoded stream data, as ``str`` or ``bytes``.
        phrases: Iterable of literal strings to mask. Defaults to the
            module-level ``phrases`` list as it exists when this function
            is defined.

    Returns:
        A ``str`` in which each occurrence of each phrase has been replaced
        by the same number of ``*`` characters.
    """
    if isinstance(text, bytes):
        # Content streams are arbitrary bytes, not guaranteed UTF-8, so the
        # default decode() can raise UnicodeDecodeError. latin-1 maps every
        # byte 0-255 to exactly one code point, so decoding never fails and
        # a later latin-1 encode restores the untouched bytes exactly.
        # NOTE(review): PyPDF2 1.x appears to re-encode str stream data as
        # latin-1 (utils.b_) -- confirm for the installed version.
        text = text.decode("latin-1")
    for phrase in phrases:
        text = replace_w_stars(text, phrase)
    return text
class EncodedStreamObject(StreamObject):
    """Encoded PDF stream whose decoded data is passed through ``redact``.

    Intended to be assigned over ``PyPDF2.generic.EncodedStreamObject`` so
    that streams decoded through it come back with the configured phrases
    masked.
    """

    def __init__(self):
        super().__init__()
        # Holds the redacted DecodedStreamObject once getData() has run.
        self.decodedSelf = None

    def getData(self):
        """Return the decoded and redacted stream data, decoding only once."""
        # Compare against None explicitly: the cached object is a dictionary
        # and is empty (hence falsy) whenever the stream dictionary holds only
        # the keys skipped below, which would otherwise defeat the cache and
        # re-run decode + redact on every call.
        if self.decodedSelf is not None:
            # cached version of decoded object
            return self.decodedSelf.getData()

        # create decoded object
        decoded = DecodedStreamObject()
        decoded._data = redact(filters.decodeStreamData(self))
        for key, value in self.items():
            # These entries describe the encoded form only, so they are not
            # copied onto the decoded object.
            if key not in ("/Length", "/Filter", "/DecodeParms"):
                decoded[key] = value
        self.decodedSelf = decoded
        return decoded._data
# Overload with redaction version
PyPDF2.generic.EncodedStreamObject = EncodedStreamObject

# Read in the PDF. The input file stays open until the writer has finished,
# because the reader is still backed by this file handle while pages are
# being written. The `with` blocks close both files even if an error occurs.
with open(read_path, "rb") as f:
    r = PdfFileReader(f)
    w = PdfFileWriter()
    for page_number in range(r.getNumPages()):
        page = r.getPage(page_number)
        # Force the redaction by merging the page with itself
        page.mergePage(r.getPage(page_number))
        w.addPage(page)
    # Write out the result
    with open(write_path, "wb") as f_out:
        w.write(f_out)
Thanks for your quick reply riley, what I am saying is that the content of the text
variable is encoded or maybe encrypted.
Just a more comprehensive example:
- I created a .docx file, wrote "This is a very standard text." inside and saved it as PDF
- If I apply
extract_text()
method to the page without overwriting class EncodedStreamObject
, I correctly get 'This is a very standard text. \n '.
- If I overwrite the class and place a breakpoint on line 21 above, the
text
variable has the following content:
/Span <</MCID 0/Lang (en-GB)>> BDC q
0.000008871 0 595.32 841.92 re
W* n
BT
/F1 11.04 Tf
1 0 0 1 56.64 760.68 Tm
/GS7 gs
0 g
/GS8 gs
0 G
[(T)] TJ
ET
Q
q
0.000008871 0 595.32 841.92 re
W* n
BT
/F1 11.04 Tf
1 0 0 1 62.04 760.68 Tm
0 g
0 G
[(h)3(is is a )8(v)-4(er)10(y)-3( )] TJ
ET
Q
q
0.000008871 0 595.32 841.92 re
W* n
BT
/F1 11.04 Tf
1 0 0 1 116.06 760.68 Tm
0 g
0 G
[(s)11(tand)5(ard)5( t)-3(ex)7(t.)] TJ
ET
Q
q
0.000008871 0 595.32 841.92 re
W* n
BT
/F1 11.04 Tf
1 0 0 1 178.46 760.68 Tm
0 g
0 G
[( )] TJ
ET
Q
EMC /Span <</MCID 1/Lang (en-GB)>> BDC q
0.000008871 0 595.32 841.92 re
W* n
BT
/F1 11.04 Tf
1 0 0 1 56.64 738.1 Tm
0 g
0 G
[( )] TJ
ET
Q
EMC
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You're replacing line 7 with the actual phrases you want to redact, right? I would check if you can find your phrases within the
text
variable at your breakpoint (e.g. assert phrases[0] in text
). Also, I would see if adding
'595.0000'
as a phrase results in it being redacted (replaced with *
) from your particular PDF, since that is a phrase which definitely does appear in the text
variable based on your post. Let me know the results and hopefully we can figure out why it's not working.