Framework for basic Python table extraction from multiple PDFs, using pandas, os, and pdfplumber
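The script below is a notebook export that batches this work over many files; the core pdfplumber call it relies on looks like this (a minimal sketch, with 'sample.pdf' as a hypothetical placeholder path):

import pdfplumber

# open one PDF and print every row of every table pdfplumber detects
with pdfplumber.open('sample.pdf') as pdf:
    for page in pdf.pages:
        for table in page.extract_tables():
            for row in table:
                print(row)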
#!/usr/bin/env python
# coding: utf-8

# In[18]:

import pdfplumber as pp
import pandas as pd
import os
import fnmatch
import glob
# In[19]:

pdf_dir = 'pdfs/'
output_dir = 'extracts/'


# In[20]:

# count the PDFs in the source directory
print(len(fnmatch.filter(os.listdir(pdf_dir), '*.pdf')))
# In[23]:

# wildcard match list, used to split the PDFs into datasets
dataset = ['Hires', 'Resignations', 'Firings']


# In[28]:
for data in dataset:
    all_files = glob.glob(pdf_dir + '*' + data + '*.pdf', recursive=True)
    print(all_files)
    file_cnt = 1  # used to log progress
    for file in all_files:
        # split off the extension for the validity check;
        # the root (path minus extension) doubles as a per-PDF ID
        file_name, ext = os.path.splitext(file)
        # extract data from valid PDFs
        if ext == '.pdf':
            # open the pdf; the with-block closes it when done
            with pp.open(file) as pdf:
                for page in pdf.pages:
                    tables = page.extract_tables()
                    for table in tables:
                        # prepend the filename to the header row and to every
                        # data row -- it acts as a unique ID for each PDF
                        table[0].insert(0, file_name)
                        # start at 1 so the header row is skipped
                        row_cnt = 1
                        while row_cnt < len(table):
                            # add school name from filename
                            table[row_cnt].insert(0, file_name)
                            row_cnt += 1
                        # csv name comes from the dataset name
                        csv_name = output_dir + data
                        # create dataframe, dropping the header row from the data
                        df = pd.DataFrame(table[1:], columns=table[0])
                        # create a new csv for the first pdf, then append afterwards
                        # !!!!!! remember to delete old files from the output folder
                        # (or change output_dir), or this check will append rows to
                        # your previous scrape -- see the cleanup cell after the script
                        if os.path.exists(csv_name + '.csv'):
                            # existing file: append df without the header
                            df.to_csv(csv_name + '.csv', mode='a', index=False, header=False)
                        else:
                            # new file: create a csv with the header
                            df.to_csv(csv_name + '.csv', index=False)
            # log progress
            print(str(file_cnt) + " PDFs scraped")
            print(file)
            file_cnt += 1
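Because the append branch above will extend any CSVs left over from an earlier run, clearing the output folder first is the safest habit. A minimal sketch of that cleanup, assuming the same output_dir and imports as above:

# In[ ]:

# optional cleanup: delete stale CSVs from the output folder before a fresh
# run so the append branch cannot mix in rows from a previous scrape
for stale in glob.glob(output_dir + '*.csv'):
    os.remove(stale)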