quangtuan202 · July 31, 2022 08:22
diff --git a/combine_excel.py b/combine_excel.py
 import pandas as pd
 import os
 import pyxlsb

 # Specify the file path
 path="D:/data/raw_files"

 # Initialize a empty list
 df_list=[]

 # Use os.listdir() and for loop to loop through files 
 for file_name in os.listdir(path):
    # Print file name, this line is optional
    print(" File name is : "+file_name) 
    # Check if file is a valid excel file, if not, ignore the file. 
    # Sometimes Excel creates a temporary hidden file in the same folder whose name starts with "~" symbol
    # Please pay attention to lower() method call,this method returns a string where all characters are lower case, the file_name will be a lowercase string.
    if "~" in file_name or not file_name.lower().endswith(("xls","xlsx,xlsb")):
        continue
    # Initialize an empty DataFrame object
    df=pd.DataFrame()
    
    # Read Excel files using Pandas, if your file name has .xlsb extension, you have to use "pyxlsb" as reading engine
    if file_name.lower().endswith("xlsb"):
        df=pd.read_excel(path+"/"+file_name,header=None,engine="pyxlsb")
    else:
        df=pd.read_excel(path+"/"+file_name,header=None)
   
    # Find the row number of the header since the row number is not the same. I assume that the row number is within the range(0,30) 
    # In my case, I wanted to find the row number at which the value of the cell at first column contains "year"
    header_row_num=0
    for i in range(30):
        if 'year' in str(df.iat[i,0]).lower():
            print(f"i={i} val="+str(df.iat[i,0]))
            header_row_num=i
            # Break the loop if the value is found
            break
    
    # Select the row found as header
    header=df.iloc[header_row_num]
    # Select DataFrame from header row + 1
    df=df.iloc[header_row_num+1:]
    # Remove columns without name
    new_header=[x for x in header if not str(x).startswith("Unnamed:")]
    # Set new columns name
    df.columns=new_header
    df_list.append(df)

 # Merge data frames
 df_merge=pd.concat(df_list)
 # Export to excel
 df_merge.to_excel("D:/data/result"+"/"+"combined.xlsx")
	import pandas as pd
	import os
	import pyxlsb

	# Specify the file path
	path="D:/data/raw_files"

	# Initialize a empty list
	df_list=[]

	# Use os.listdir() and for loop to loop through files
	for file_name in os.listdir(path):
	# Print file name, this line is optional
	print(" File name is : "+file_name)
	# Check if file is a valid excel file, if not, ignore the file.
	# Sometimes Excel creates a temporary hidden file in the same folder whose name starts with "~" symbol
	# Please pay attention to lower() method call,this method returns a string where all characters are lower case, the file_name will be a lowercase string.
	if "~" in file_name or not file_name.lower().endswith(("xls","xlsx,xlsb")):
	continue
	# Initialize an empty DataFrame object
	df=pd.DataFrame()

	# Read Excel files using Pandas, if your file name has .xlsb extension, you have to use "pyxlsb" as reading engine
	if file_name.lower().endswith("xlsb"):
	df=pd.read_excel(path+"/"+file_name,header=None,engine="pyxlsb")
	else:
	df=pd.read_excel(path+"/"+file_name,header=None)

	# Find the row number of the header since the row number is not the same. I assume that the row number is within the range(0,30)
	# In my case, I wanted to find the row number at which the value of the cell at first column contains "year"
	header_row_num=0
	for i in range(30):
	if 'year' in str(df.iat[i,0]).lower():
	print(f"i={i} val="+str(df.iat[i,0]))
	header_row_num=i
	# Break the loop if the value is found
	break

	# Select the row found as header
	header=df.iloc[header_row_num]
	# Select DataFrame from header row + 1
	df=df.iloc[header_row_num+1:]
	# Remove columns without name
	new_header=[x for x in header if not str(x).startswith("Unnamed:")]
	# Set new columns name
	df.columns=new_header
	df_list.append(df)

	# Merge data frames
	df_merge=pd.concat(df_list)
	# Export to excel
	df_merge.to_excel("D:/data/result"+"/"+"combined.xlsx")