reuf · February 17, 2022 20:01
diff --git a/pandaPDFTableExtractExcelCleanup.py b/pandaPDFTableExtractExcelCleanup.py
 #import tabula
 #pages=[212,213]
 #df = tabula.read_pdf('Knjiga_mrtvih.pdf', pages = '212')[0]

 #df.columns = df.columns.str.replace('\r', ' ')
 #data = df.dropna()
 #print(df)
 #data.to_excel('data.xlsx')

 from tabula import read_pdf
 import pandas as pd
 import openpyxl
 import glob, os
 # "./" is the current working directory, so this call leaves the process
 # where it already is; the relative file names used in the notes below
 # (e.g. "*.xlsx", "Knjiga_mrtvih.pdf") resolve against that directory.
 os.chdir("./")
 # for filename in glob.glob("*.xlsx"):
 #     print(filename)
 #     table = pd.read_excel(filename, skiprows=list(range(1)))
 #     table.to_excel(filename, index=False)

 # empty data frame for the new output excel file with the merged excel files




 # # excel (.xlsx) files in the path
 # file_list = glob.glob("*.xlsx")
 # for file in file_list:
 #     print(file)
 #     table = pd.read_excel(file, header=None)
 #     table.set_axis(
 #         ["last_name", "first_name", "fathers_name", "date_of_birth", "place_of_birth", "nationality", "status",
 #          "formation", "date_of_death", "municipality_of_death"], axis=1, inplace=True)
 #     table.to_excel(file, index=False)








 # # list of excel files we want to merge.
 # # pd.read_excel(file_path) reads the
 # # excel data into pandas dataframe.
 # excl_list = []
 #
 # for file in file_list:
 #     excl_list.append(pd.read_excel(file))
 #
 # # concatenate all DataFrames in the list
 # # into a single DataFrame, returns new
 # # DataFrame.
 # excl_merged = pd.concat(excl_list, ignore_index=True)
 #
 # # exports the dataframe into excel file
 # # with specified name.
 # excl_merged.to_excel('Srebrenica.xlsx', index=False)








 # excl_list = []
 #
 # for filename in glob.glob("*.xlsx"):
 #     print(filename)
 #     excl_list.append(pd.read_excel(filename))
 #
 # excl_merged = pd.DataFrame()
 #
 # for excl_file in excl_list:
 #     # appends the data into the excl_merged
 #     # dataframe.
 #     excl_merged = excl_merged.append(excl_file, ignore_index=True)
 #
 # # exports the dataframe into excel file with
 # # specified name.
 # excl_merged.to_excel('Sarajevo.xlsx', index=False, header=False)
    # table = pd.read_excel(filename)
    # print(len(list(table.columns)))
    # if (len(list(table.columns)) > 10):
    #     print(len(list(table.columns)))
    #     table.drop(table.columns[9], axis=1, inplace=True)
    #     table.to_excel(filename, index=False)
    # if( 'Prezime' in list(table.columns)[0]):
    #     print(filename)
    #     print(list(table.columns)[0])
        # table.to_excel(filename, index=False, header=False)
    # print(str(table.iat[0,0]) + str(table.iat[0,1]))
    # if (table.iat[0,0].contains("Prezime")):
    #     print(filename)
    # table.dropna(how='all', axis=1, inplace=True)
    # table.to_excel(filename, index=False)

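 # NOTE: disabled draft -- range(499, 495) is empty, and pages='499' ignores the loop variable `page`.
 # for page in range(499, 495):
 #     table = read_pdf("Knjiga_mrtvih.pdf", pages='499', pandas_options={'header': None})[0]
 #     table.to_excel("page-"+str(page)+".xlsx", index=False)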
	#import tabula
	#pages=[212,213]
	#df = tabula.read_pdf('Knjiga_mrtvih.pdf', pages = '212')[0]

	#df.columns = df.columns.str.replace('\r', ' ')
	#data = df.dropna()
	#print(df)
	#data.to_excel('data.xlsx')

	from tabula import read_pdf
	import pandas as pd
	import openpyxl
	import glob, os
	# "./" is the current working directory, so this call leaves the process
	# where it already is; the relative file names used in the notes below
	# (e.g. "*.xlsx", "Knjiga_mrtvih.pdf") resolve against that directory.
	os.chdir("./")
	# for filename in glob.glob("*.xlsx"):
	# print(filename)
	# table = pd.read_excel(filename, skiprows=list(range(1)))
	# table.to_excel(filename, index=False)

	# empty data frame for the new output excel file with the merged excel files




	# # excel (.xlsx) files in the path
	# file_list = glob.glob("*.xlsx")
	# for file in file_list:
	# print(file)
	# table = pd.read_excel(file, header=None)
	# table.set_axis(
	# ["last_name", "first_name", "fathers_name", "date_of_birth", "place_of_birth", "nationality", "status",
	# "formation", "date_of_death", "municipality_of_death"], axis=1, inplace=True)
	# table.to_excel(file, index=False)








	# # list of excel files we want to merge.
	# # pd.read_excel(file_path) reads the
	# # excel data into pandas dataframe.
	# excl_list = []
	#
	# for file in file_list:
	# excl_list.append(pd.read_excel(file))
	#
	# # concatenate all DataFrames in the list
	# # into a single DataFrame, returns new
	# # DataFrame.
	# excl_merged = pd.concat(excl_list, ignore_index=True)
	#
	# # exports the dataframe into excel file
	# # with specified name.
	# excl_merged.to_excel('Srebrenica.xlsx', index=False)








	# excl_list = []
	#
	# for filename in glob.glob("*.xlsx"):
	# print(filename)
	# excl_list.append(pd.read_excel(filename))
	#
	# excl_merged = pd.DataFrame()
	#
	# for excl_file in excl_list:
	# # appends the data into the excl_merged
	# # dataframe.
	# excl_merged = excl_merged.append(excl_file, ignore_index=True)
	#
	# # exports the dataframe into excel file with
	# # specified name.
	# excl_merged.to_excel('Sarajevo.xlsx', index=False, header=False)
	# table = pd.read_excel(filename)
	# print(len(list(table.columns)))
	# if (len(list(table.columns)) > 10):
	# print(len(list(table.columns)))
	# table.drop(table.columns[9], axis=1, inplace=True)
	# table.to_excel(filename, index=False)
	# if( 'Prezime' in list(table.columns)[0]):
	# print(filename)
	# print(list(table.columns)[0])
	# table.to_excel(filename, index=False, header=False)
	# print(str(table.iat[0,0]) + str(table.iat[0,1]))
	# if (table.iat[0,0].contains("Prezime")):
	# print(filename)
	# table.dropna(how='all', axis=1, inplace=True)
	# table.to_excel(filename, index=False)

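	# NOTE: disabled draft -- range(499, 495) is empty, and pages='499' ignores the loop variable `page`.
	# for page in range(499, 495):
	# table = read_pdf("Knjiga_mrtvih.pdf", pages='499', pandas_options={'header': None})[0]
	# table.to_excel("page-"+str(page)+".xlsx", index=False)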