Skip to content

Instantly share code, notes, and snippets.

@dnwe
Created January 8, 2017 10:42
Show Gist options
  • Save dnwe/8da9b5262e4615dbc512cdcef5ff5830 to your computer and use it in GitHub Desktop.
Save dnwe/8da9b5262e4615dbc512cdcef5ff5830 to your computer and use it in GitHub Desktop.
Pure Python script to count the number of pages in a directory of PDF files
#!/usr/bin/env python2
import os
import sys
import re
import fnmatch
import logging
# Send log records to stdout, prefixed with a left-aligned, padded level name.
logging.basicConfig(
    format='%(levelname)-8s %(message)s',
    level=logging.INFO,
    stream=sys.stdout,
)

# Heuristic page matcher: a "/Type /Page" entry whose next character is not
# "s" (or which ends the line), so "/Type /Pages" entries are not counted.
rxcountpages = re.compile(
    r"/Type\s*/Page([^s]|$)",
    flags=re.MULTILINE | re.DOTALL,
)
def count_pages(pdf_name):
    '''Return the number of page entries found in the PDF at ``pdf_name``.

    The file is read in binary mode and decoded as ISO-8859-1 before it is
    searched.  That encoding maps every byte to exactly one character, so
    the decode cannot fail, and it lets the text pattern ``rxcountpages`` be
    used on Python 3, where matching a text pattern against raw bytes raises
    ``TypeError``.

    The result is simply the number of ``rxcountpages`` matches in the file
    content, and it is also logged at INFO level.
    NOTE(review): presumably pages stored inside compressed streams are not
    seen by this textual scan -- confirm against real-world PDFs.
    '''
    with open(pdf_name, mode='rb') as f:
        content = f.read().decode('ISO-8859-1')
    page_count = len(rxcountpages.findall(content))
    logging.info('%s has %d', pdf_name, page_count)
    return page_count
def main():
    '''Total the page counts of all PDFs under the given directory.

    Expects exactly one command-line argument, the directory to scan.  The
    directory is walked recursively, ``count_pages`` is called for each file
    whose name ends in ``.pdf`` (in any letter case), and the grand total is
    printed between two rules of dashes.

    Exits with status 1, after writing a message to stderr, when the
    argument count is wrong or the argument is not an existing directory.
    '''
    if len(sys.argv) != 2:
        # write(), not writelines(): the message is one string, not a list.
        sys.stderr.write('Usage: {} <directory>\n'.format(sys.argv[0]))
        sys.exit(1)
    check_dir = sys.argv[1]
    # os.walk() silently yields nothing for a missing path, which would
    # otherwise be reported as a misleading total of 0 pages.
    if not os.path.isdir(check_dir):
        sys.stderr.write('{} is not a directory\n'.format(check_dir))
        sys.exit(1)
    logging.info('Counting pages of any PDFs in %s', check_dir)
    total_pages = 0
    for path, _subdirs, files in os.walk(check_dir):
        for name in files:
            # Lower-case the name first so "REPORT.PDF" is picked up on
            # case-sensitive filesystems too.
            if not fnmatch.fnmatchcase(name.lower(), '*.pdf'):
                continue
            total_pages += count_pages(os.path.join(path, name))
    print('-' * 80)
    print('Total pages in {}: {}'.format(check_dir, total_pages))
    print('-' * 80)
# Run the page counter only when this file is executed as a script.
if __name__ == "__main__":
    main()
@clairton
Copy link

clairton commented Aug 2, 2024

Awesome! I needed to use f.read().decode('ISO-8859-1'):

#!/usr/bin/env python2

import os
import sys
import re
import fnmatch
import logging

# Send log records to stdout, prefixed with a left-aligned, padded level name.
logging.basicConfig(
    format='%(levelname)-8s %(message)s',
    level=logging.INFO,
    stream=sys.stdout,
)

# Heuristic page matcher: a "/Type /Page" entry whose next character is not
# "s" (or which ends the line), so "/Type /Pages" entries are not counted.
rxcountpages = re.compile(
    r"/Type\s*/Page([^s]|$)",
    flags=re.MULTILINE | re.DOTALL,
)

def count_pages(pdf_name):
    '''Return how many times ``rxcountpages`` matches in the given PDF.

    The file is opened in binary mode, its whole content is decoded as
    ISO-8859-1, and the number of pattern matches is logged at INFO level
    and returned.
    '''
    with open(pdf_name, mode='rb') as pdf_file:
        text = pdf_file.read().decode('ISO-8859-1')
        matches = rxcountpages.findall(text)
        page_count = len(matches)
        logging.info('%s has %d', pdf_name, int(page_count))
    return page_count


def main():
    '''Total the page counts of all PDFs under the given directory.

    Expects exactly one command-line argument, the directory to scan.  The
    directory is walked recursively, ``count_pages`` is called for each file
    whose name ends in ``.pdf`` (in any letter case), and the grand total is
    printed between two rules of dashes.

    Exits with status 1, after writing a message to stderr, when the
    argument count is wrong or the argument is not an existing directory.
    '''
    if len(sys.argv) != 2:
        # write(), not writelines(): the message is one string, not a list.
        sys.stderr.write('Usage: {} <directory>\n'.format(sys.argv[0]))
        sys.exit(1)
    check_dir = sys.argv[1]
    # os.walk() silently yields nothing for a missing path, which would
    # otherwise be reported as a misleading total of 0 pages.
    if not os.path.isdir(check_dir):
        sys.stderr.write('{} is not a directory\n'.format(check_dir))
        sys.exit(1)
    logging.info('Counting pages of any PDFs in %s', check_dir)
    total_pages = 0
    for path, _subdirs, files in os.walk(check_dir):
        for name in files:
            # Lower-case the name first so "REPORT.PDF" is picked up on
            # case-sensitive filesystems too.
            if not fnmatch.fnmatchcase(name.lower(), '*.pdf'):
                continue
            total_pages += count_pages(os.path.join(path, name))
    print('-' * 80)
    print('Total pages in {}: {}'.format(check_dir, total_pages))
    print('-' * 80)

# Run the page counter only when this file is executed as a script.
if __name__ == '__main__':
    main()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment