Created
March 31, 2025 15:59
-
-
Save rcsmit/ca410fb8f9f80e37e110a928a9ba1a08 to your computer and use it in GitHub Desktop.
read_pdf_convert_to_xls.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read a PDF and convert its text to a table, saved as CSV and XLSX.
# Tested on Bijlage B - Wob-deelbesluit 'Vaccinaties en medicatie oktober 2020'.
# Not fully tested.
# Inspired by https://x.com/Transparangst/status/1906717209423974689
def _insert_column_markers(text):
    """Return *text* with '#' inserted at every column boundary.

    The '#' character is used later as the column separator when each line
    is split into cells.
    """
    # A newline followed by six digits: put a separator right after the digits.
    # NOTE(review): the very first line of a page has no preceding newline at
    # this point, so it never gets this separator -- confirm that is intended.
    text = re.sub(r'(\n\d{6})', r'\1#', text)
    # Wrap each status label in separators. The two-word labels come first in
    # the alternation so that e.g. "Deels Openbaar" is wrapped as a whole and
    # not only its trailing "Openbaar".
    text = re.sub(
        r'(Reeds Openbaar|Deels Openbaar|Niet Openbaar|Openbaar)',
        r'#\1#',
        text,
    )
    # Drop the blank that would otherwise lead the cell after a separator.
    text = text.replace("# ", "#")
    # Entries chained with "; " each get their own cell.
    for label_start in ("10.", "11.", "buiten verzoek"):
        text = text.replace(f"; {label_start}", f"#{label_start}")
    return text


def _add_label_flags(df):
    """Add boolean columns telling which labels occur in columns 3 to 8.

    A flag is True when one of the cells in positional columns 3..8 of that
    row is exactly equal to the label (e.g. "10.2.a"). *df* is modified in
    place and also returned.
    """
    # Take the slice once, before any flag column is appended. Re-slicing
    # after each assignment would pull the new boolean columns into 3:9
    # whenever the parsed table has fewer than nine columns.
    label_cells = df.iloc[:, 3:9].copy()

    def row_has(label):
        # Exact cell match; missing cells (None/NaN) compare as False.
        return label_cells.eq(label).any(axis=1)

    for letter in "abcdefg":
        df[f"101{letter}"] = row_has(f"10.1.{letter}")
        df[f"102{letter}"] = row_has(f"10.2.{letter}")
    df["BuitenVerzoek"] = row_has("buiten verzoek")
    df["111concept"] = row_has("11.1, concept")
    return df


def read_directly_from_pdf(
    pdf_path="C:/Users/rcxsm/Downloads/vac_med_okt_2020.pdf",
    csv_path="output.csv",
    xlsx_path="output.xlsx",
):
    """Extract the text of a PDF, split it into a table and save the table.

    Every page is read with ``PdfReader``, '#' separators are inserted at the
    column boundaries, each text line becomes one row, and boolean flag
    columns are added for the labels found in columns 3 to 8. The result is
    printed and written to *csv_path* and *xlsx_path*.

    NOTE(review): this function relies on ``re``, ``pd`` (pandas) and
    ``PdfReader`` (``pip install PyPDF2``) being imported at module level; no
    import lines are visible in the reviewed part of the file -- confirm.

    Args:
        pdf_path: Location of the PDF to read.
        csv_path: Destination of the CSV export.
        xlsx_path: Destination of the Excel export.

    Returns:
        None. The outputs are the two files and the printed progress/table.
    """
    reader = PdfReader(pdf_path)
    number_of_pages = len(reader.pages)

    page_texts = []
    # Count from 1 so the progress line reads "1/N" .. "N/N".
    for page_number, page in enumerate(reader.pages, start=1):
        # Fall back to an empty string should a page yield no text at all.
        raw_text = page.extract_text() or ""
        page_texts.append(_insert_column_markers(raw_text))
        print (f"Reading page {page_number}/{number_of_pages}")

    # Each page is preceded by a newline so pages never share a line.
    all_text = "".join(f"\n{page_text}" for page_text in page_texts)

    # Split text into rows and columns using '#' as a separator.
    rows = [line.split('#') for line in all_text.splitlines()]
    df = pd.DataFrame(rows)
    print(df)

    _add_label_flags(df)

    df.to_csv(csv_path, index=False)
    df.to_excel(xlsx_path, index=False)
def main():
    """Script entry point: run the PDF conversion."""
    read_directly_from_pdf()
if __name__ == "__main__":
    # Run the conversion only when executed as a script, not when imported.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment