Skip to content

Instantly share code, notes, and snippets.

@quangtuan202
Created July 31, 2022 08:22
Show Gist options
  • Save quangtuan202/f73755d0076bbd426059c00a12700d2e to your computer and use it in GitHub Desktop.
Save quangtuan202/f73755d0076bbd426059c00a12700d2e to your computer and use it in GitHub Desktop.
combine excel files v1
import pandas as pd
import os
import pyxlsb
# Specify the file path
path="D:/data/raw_files"
# Initialize a empty list
df_list=[]
# Use os.listdir() and for loop to loop through files
for file_name in os.listdir(path):
# Print file name, this line is optional
print(" File name is : "+file_name)
# Check if file is a valid excel file, if not, ignore the file.
# Sometimes Excel creates a temporary hidden file in the same folder whose name starts with "~" symbol
# Please pay attention to lower() method call,this method returns a string where all characters are lower case, the file_name will be a lowercase string.
if "~" in file_name or not file_name.lower().endswith(("xls","xlsx,xlsb")):
continue
# Initialize an empty DataFrame object
df=pd.DataFrame()
# Read Excel files using Pandas, if your file name has .xlsb extension, you have to use "pyxlsb" as reading engine
if file_name.lower().endswith("xlsb"):
df=pd.read_excel(path+"/"+file_name,header=None,engine="pyxlsb")
else:
df=pd.read_excel(path+"/"+file_name,header=None)
# Find the row number of the header since the row number is not the same. I assume that the row number is within the range(0,30)
# In my case, I wanted to find the row number at which the value of the cell at first column contains "year"
header_row_num=0
for i in range(30):
if 'year' in str(df.iat[i,0]).lower():
print(f"i={i} val="+str(df.iat[i,0]))
header_row_num=i
# Break the loop if the value is found
break
# Select the row found as header
header=df.iloc[header_row_num]
# Select DataFrame from header row + 1
df=df.iloc[header_row_num+1:]
# Remove columns without name
new_header=[x for x in header if not str(x).startswith("Unnamed:")]
# Set new columns name
df.columns=new_header
df_list.append(df)
# Merge data frames
df_merge=pd.concat(df_list)
# Export to excel
df_merge.to_excel("D:/data/result"+"/"+"combined.xlsx")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment