Last active
February 2, 2024 22:15
-
-
Save tiarno/8a2995e70cee42f01e79 to your computer and use it in GitHub Desktop.
find PDF font info with PyPDF2, example code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from PyPDF2 import PdfFileReader | |
from pprint import pprint | |
def walk(obj, fnt, emb, _seen=None):
    '''
    Recursively collect the fonts used and the fonts embedded in a PDF object tree.

    If there is a key called '/BaseFont', that is a font that is used in the document.
    If there is a key called '/FontName' and another key in the same dictionary object
    that is called '/FontFilex' (where x is null, 2, or 3), then that fontname is
    embedded.

    Both dictionary-like objects (anything with a ``keys`` method) and
    list/tuple objects are descended into, so font dictionaries that are only
    reachable through an array (for example '/DescendantFonts') are found too.

    Parameters:
        obj   -- the object to inspect; scalars are ignored.
        fnt   -- set that receives the names of fonts used (mutated in place).
        emb   -- set that receives the names of fonts embedded (mutated in place).
        _seen -- internal bookkeeping of containers already visited; callers
                 should leave it at its default.

    Returns the tuple ``(fnt, emb)``, i.e. the same two sets that were passed in.
    '''
    if _seen is None:
        _seen = {}

    is_mapping = hasattr(obj, 'keys')
    if not is_mapping and not isinstance(obj, (list, tuple)):
        # Scalars (names, numbers, strings, ...) cannot contain font entries.
        return fnt, emb

    # Guard against reference cycles (a child pointing back at an ancestor).
    # The object itself is stored alongside its id so that it stays alive for
    # the duration of the walk and the id cannot be reused by another object.
    if id(obj) in _seen:
        return fnt, emb
    _seen[id(obj)] = obj

    if is_mapping:
        fontkeys = ('/FontFile', '/FontFile2', '/FontFile3')
        if '/BaseFont' in obj:
            fnt.add(obj['/BaseFont'])
        # A font descriptor counts as embedded only if it carries a font file.
        if '/FontName' in obj and any(key in obj for key in fontkeys):
            emb.add(obj['/FontName'])
        children = [obj[key] for key in obj.keys()]
    else:
        children = obj

    for child in children:
        walk(child, fnt, emb, _seen)
    return fnt, emb
if __name__ == '__main__':
    # Report every font used in the PDF and, separately, those that are used
    # but not embedded.
    fname = 'myfile.pdf'
    pdf = PdfFileReader(fname)
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.getObject()
        # updated via this answer:
        # https://stackoverflow.com/questions/60876103/use-pypdf2-to-detect-non-embedded-fonts-in-pdf-file-generated-by-google-docs/60895334#60895334
        # in order to handle lists inside objects. Thanks misingnoglic !
        # untested code since I don't have such a PDF to play with.
        # Array objects are detected by duck typing on ``list`` so that no
        # extra PyPDF2 name has to be imported.
        if isinstance(obj, list):
            for i in obj:
                if hasattr(i, 'keys'):
                    # walk() adds to the two sets in place.
                    walk(i, fonts, embedded)
        elif '/Resources' in obj:
            # Index (rather than .get) so the lookup goes through the
            # object's own __getitem__.
            walk(obj['/Resources'], fonts, embedded)
        # Pages without a direct '/Resources' entry are skipped.
    unembedded = fonts - embedded
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(unembedded)
Hi! Is there a way to find bold words or bold phrases within a page, along with information about the font used?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@mteam88 here: