Created
February 17, 2022 20:01
-
-
Save reuf/363bab430d4cdf74aa9054ba4daff579 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import tabula | |
#pages=[212,213] | |
#df = tabula.read_pdf('Knjiga_mrtvih.pdf', pages = '212')[0] | |
#df.columns = df.columns.str.replace('\r', ' ') | |
#data = df.dropna() | |
#print(df) | |
#data.to_excel('data.xlsx') | |
from tabula import read_pdf
import pandas as pd
import openpyxl
import glob
import os

# Changes into the current directory, which is effectively a no-op: relative
# file names and glob patterns used by the (commented-out) steps below
# resolve against the directory the script is launched from.
os.chdir("./")
# for filename in glob.glob("*.xlsx"): | |
# print(filename) | |
# table = pd.read_excel(filename, skiprows=list(range(1)))
# table.to_excel(filename, index=False) | |
# empty data frame for the new output excel file with the merged excel files | |
# # Excel (.xlsx) files in the current directory
# file_list = glob.glob("*.xlsx") | |
# for file in file_list: | |
# print(file) | |
# table = pd.read_excel(file, header=None) | |
# table.set_axis( | |
# ["last_name", "first_name", "fathers_name", "date_of_birth", "place_of_birth", "nationality", "status", | |
# "formation", "date_of_death", "municipality_of_death"], axis=1, inplace=True) | |
# table.to_excel(file, index=False) | |
# # list of excel files we want to merge. | |
# # pd.read_excel(file_path) reads the | |
# # excel data into pandas dataframe. | |
# excl_list = [] | |
# | |
# for file in file_list: | |
# excl_list.append(pd.read_excel(file)) | |
# | |
# # concatenate all DataFrames in the list | |
# # into a single DataFrame, returns new | |
# # DataFrame. | |
# excl_merged = pd.concat(excl_list, ignore_index=True) | |
# | |
# # exports the dataframe into excel file | |
# # with specified name. | |
# excl_merged.to_excel('Srebrenica.xlsx', index=False) | |
# excl_list = [] | |
# | |
# for filename in glob.glob("*.xlsx"): | |
# print(filename) | |
# excl_list.append(pd.read_excel(filename)) | |
# | |
# excl_merged = pd.DataFrame() | |
# | |
# for excl_file in excl_list: | |
# # appends the data into the excl_merged | |
# # dataframe. | |
# excl_merged = excl_merged.append(excl_file, ignore_index=True) | |
# | |
# # exports the dataframe into excel file with | |
# # specified name. | |
# excl_merged.to_excel('Sarajevo.xlsx', index=False, header=False) | |
# table = pd.read_excel(filename) | |
# print(len(list(table.columns))) | |
# if (len(list(table.columns)) > 10): | |
# print(len(list(table.columns))) | |
# table.drop(table.columns[9], axis=1, inplace=True) | |
# table.to_excel(filename, index=False) | |
# if( 'Prezime' in list(table.columns)[0]): | |
# print(filename) | |
# print(list(table.columns)[0]) | |
# table.to_excel(filename, index=False, header=False) | |
# print(str(table.iat[0,0]) + str(table.iat[0,1])) | |
# if (table.iat[0,0].contains("Prezime")): | |
# print(filename) | |
# table.dropna(how='all', axis=1, inplace=True) | |
# table.to_excel(filename, index=False) | |
# for page in range(499, 495): | |
# table = read_pdf("Knjiga_mrtvih.pdf", pages='499', pandas_options={'header': None})[0] | |
# table.to_excel("page-"+str(page)+".xlsx", index=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment