Last active
February 2, 2024 22:15
-
-
Save tiarno/8a2995e70cee42f01e79 to your computer and use it in GitHub Desktop.
Find PDF font info with PyPDF2 (example code)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pprint import pprint

import PyPDF2
from PyPDF2 import PdfFileReader
def walk(obj, fnt, emb, _seen=None):
    '''
    Recursively collect font names from a PDF object tree.

    If there is a key called 'BaseFont', that is a font that is used in the
    document. If there is a key called 'FontName' and another key in the same
    dictionary object that is called 'FontFilex' (where x is null, 2, or 3),
    then that fontname is embedded.

    Args:
        obj: the object to inspect. Anything with a ``keys`` method is treated
            as a dictionary and any list/tuple as an array; every other value
            is ignored.
        fnt: set that receives every '/BaseFont' value found (updated in
            place).
        emb: set that receives every '/FontName' value that sits next to a
            font-file key (updated in place).
        _seen: internal bookkeeping for the recursion; callers should not
            pass it.

    Returns:
        The same ``(fnt, emb)`` pair that was passed in, whatever the type of
        ``obj``, so the result can always be handed to ``set.union``.
    '''
    if _seen is None:
        # Maps id(container) -> container. Keeping the container referenced
        # guarantees its id cannot be reused while the walk is running.
        _seen = {}

    # NOTE(review): PyPDF2 dictionaries appear to resolve indirect references
    # on item access, but items pulled out of an array may still be
    # unresolved references -- confirm against the installed version. Both
    # the old (getObject) and new (get_object) spellings are tried.
    resolve = getattr(obj, 'get_object', None) or getattr(obj, 'getObject', None)
    if callable(resolve):
        obj = resolve()

    is_array = isinstance(obj, (list, tuple))
    if not is_array and not hasattr(obj, 'keys'):
        # Scalars (names, numbers, strings, ...) carry no font information.
        return fnt, emb

    # Skip containers that were already visited so that cyclic references
    # cannot recurse forever.
    if id(obj) in _seen:
        return fnt, emb
    _seen[id(obj)] = obj

    if is_array:
        # Fonts can hide behind arrays (e.g. a '/DescendantFonts' entry).
        for item in obj:
            walk(item, fnt, emb, _seen)
        return fnt, emb

    font_file_keys = ('/FontFile', '/FontFile2', '/FontFile3')
    if '/BaseFont' in obj:
        fnt.add(obj['/BaseFont'])
    # A font name only counts as embedded when the same dictionary also
    # holds one of the font-file keys.
    if '/FontName' in obj and any(key in obj for key in font_file_keys):
        emb.add(obj['/FontName'])

    for key in obj.keys():
        walk(obj[key], fnt, emb, _seen)
    return fnt, emb  # return the sets for each page
if __name__ == '__main__':
    # Report every font used in the PDF and, separately, the ones that are
    # used but not embedded.
    fname = 'myfile.pdf'
    pdf = PdfFileReader(fname)

    # walk() adds to these two sets in place, so its return value does not
    # need to be merged back in (it can be None for non-dictionary input).
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.getObject()
        # updated via this answer:
        # https://stackoverflow.com/questions/60876103/use-pypdf2-to-detect-non-embedded-fonts-in-pdf-file-generated-by-google-docs/60895334#60895334
        # in order to handle lists inside objects. Thanks misingnoglic !
        # untested code since I don't have such a PDF to play with.
        if isinstance(obj, PyPDF2.generic.ArrayObject):
            for item in obj:
                if hasattr(item, 'keys'):
                    walk(item, fonts, embedded)
        else:
            walk(obj['/Resources'], fonts, embedded)

    unembedded = fonts - embedded
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(unembedded)
@mteam88 here is a version that runs on Python 3 with the newer PyPDF2 API (`PdfReader`, `get_object`):
from PyPDF2 import PdfReader
from pprint import pprint
import PyPDF2
def walk(obj, fnt, emb, _seen=None):
    '''
    Recursively collect font names from a PDF object tree.

    If there is a key called 'BaseFont', that is a font that is used in the
    document. If there is a key called 'FontName' and another key in the same
    dictionary object that is called 'FontFilex' (where x is null, 2, or 3),
    then that fontname is embedded.

    Args:
        obj: the object to inspect. Anything with a ``keys`` method is treated
            as a dictionary and any list/tuple as an array; every other value
            is ignored.
        fnt: set that receives every '/BaseFont' value found (updated in
            place).
        emb: set that receives every '/FontName' value that sits next to a
            font-file key (updated in place).
        _seen: internal bookkeeping for the recursion; callers should not
            pass it.

    Returns:
        The same ``(fnt, emb)`` pair that was passed in, whatever the type of
        ``obj``, so the result can always be handed to ``set.union``.
    '''
    if _seen is None:
        # Maps id(container) -> container. Keeping the container referenced
        # guarantees its id cannot be reused while the walk is running.
        _seen = {}

    # NOTE(review): PyPDF2 dictionaries appear to resolve indirect references
    # on item access, but items pulled out of an array may still be
    # unresolved references -- confirm against the installed version. Both
    # the new (get_object) and old (getObject) spellings are tried.
    resolve = getattr(obj, 'get_object', None) or getattr(obj, 'getObject', None)
    if callable(resolve):
        obj = resolve()

    is_array = isinstance(obj, (list, tuple))
    if not is_array and not hasattr(obj, 'keys'):
        # Scalars (names, numbers, strings, ...) carry no font information.
        return fnt, emb

    # Skip containers that were already visited so that cyclic references
    # cannot recurse forever.
    if id(obj) in _seen:
        return fnt, emb
    _seen[id(obj)] = obj

    if is_array:
        # Fonts can hide behind arrays (e.g. a '/DescendantFonts' entry).
        for item in obj:
            walk(item, fnt, emb, _seen)
        return fnt, emb

    font_file_keys = ('/FontFile', '/FontFile2', '/FontFile3')
    if '/BaseFont' in obj:
        fnt.add(obj['/BaseFont'])
    # A font name only counts as embedded when the same dictionary also
    # holds one of the font-file keys.
    if '/FontName' in obj and any(key in obj for key in font_file_keys):
        emb.add(obj['/FontName'])

    for key in obj.keys():
        walk(obj[key], fnt, emb, _seen)
    return fnt, emb  # return the sets for each page
if __name__ == '__main__':
    # Report every font used in the PDF and, separately, the ones that are
    # used but not embedded.
    fname = 'myfile.pdf'
    pdf = PdfReader(fname)

    # walk() adds to these two sets in place, so its return value does not
    # need to be merged back in (it can be None for non-dictionary input).
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.get_object()
        # updated via this answer:
        # https://stackoverflow.com/questions/60876103/use-pypdf2-to-detect-non-embedded-fonts-in-pdf-file-generated-by-google-docs/60895334#60895334
        # in order to handle lists inside objects. Thanks misingnoglic !
        # untested code since I don't have such a PDF to play with.
        if isinstance(obj, PyPDF2.generic.ArrayObject):
            for item in obj:
                if hasattr(item, 'keys'):
                    walk(item, fonts, embedded)
        else:
            walk(obj['/Resources'], fonts, embedded)

    unembedded = fonts - embedded
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(unembedded)
Hi! Is there a way to find the bold words or phrases within a page, together with information about the font used?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I would love this in Python 3.