Created
July 31, 2022 08:22
-
-
Save quangtuan202/f73755d0076bbd426059c00a12700d2e to your computer and use it in GitHub Desktop.
combine excel files v1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import os | |
import pyxlsb | |
# Specify the file path | |
path="D:/data/raw_files" | |
# Initialize a empty list | |
df_list=[] | |
# Use os.listdir() and for loop to loop through files | |
for file_name in os.listdir(path): | |
# Print file name, this line is optional | |
print(" File name is : "+file_name) | |
# Check if file is a valid excel file, if not, ignore the file. | |
# Sometimes Excel creates a temporary hidden file in the same folder whose name starts with "~" symbol | |
# Please pay attention to lower() method call,this method returns a string where all characters are lower case, the file_name will be a lowercase string. | |
if "~" in file_name or not file_name.lower().endswith(("xls","xlsx,xlsb")): | |
continue | |
# Initialize an empty DataFrame object | |
df=pd.DataFrame() | |
# Read Excel files using Pandas, if your file name has .xlsb extension, you have to use "pyxlsb" as reading engine | |
if file_name.lower().endswith("xlsb"): | |
df=pd.read_excel(path+"/"+file_name,header=None,engine="pyxlsb") | |
else: | |
df=pd.read_excel(path+"/"+file_name,header=None) | |
# Find the row number of the header since the row number is not the same. I assume that the row number is within the range(0,30) | |
# In my case, I wanted to find the row number at which the value of the cell at first column contains "year" | |
header_row_num=0 | |
for i in range(30): | |
if 'year' in str(df.iat[i,0]).lower(): | |
print(f"i={i} val="+str(df.iat[i,0])) | |
header_row_num=i | |
# Break the loop if the value is found | |
break | |
# Select the row found as header | |
header=df.iloc[header_row_num] | |
# Select DataFrame from header row + 1 | |
df=df.iloc[header_row_num+1:] | |
# Remove columns without name | |
new_header=[x for x in header if not str(x).startswith("Unnamed:")] | |
# Set new columns name | |
df.columns=new_header | |
df_list.append(df) | |
# Merge data frames | |
df_merge=pd.concat(df_list) | |
# Export to excel | |
df_merge.to_excel("D:/data/result"+"/"+"combined.xlsx") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment