Skip to content

Instantly share code, notes, and snippets.

@terryjbates
Created January 19, 2025 11:14
Show Gist options
  • Save terryjbates/b539694d37bc2e6ce98f14248fa113be to your computer and use it in GitHub Desktop.
Save terryjbates/b539694d37bc2e6ce98f14248fa113be to your computer and use it in GitHub Desktop.
def parse_html_table(file_path) -> "pd.DataFrame":
    """Parse the first ``<table>`` of an HTML file into a DataFrame.

    The file is read as UTF-8; bytes that cannot be decoded are silently
    dropped (``errors='ignore'``) so a badly encoded file still parses.

    The first ``<tr>`` is treated as the header row only when every cell in
    it is a ``<th>``. Otherwise the table is considered header-less: all rows
    are kept as data and pandas assigns its default integer column labels.
    Both ``<td>`` and ``<th>`` cells of body rows are kept, so a leading
    row-header cell stays aligned with its column.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        A DataFrame with one row per body ``<tr>`` and the cell text
        (unstripped, as returned by ``.text``) as values.

    Raises:
        ValueError: If the document contains no ``<table>`` element, or
            (raised by pandas) if a body row has more cells than there are
            header cells.
    """
    # The built-in open() accepts the same encoding/errors arguments that
    # codecs.open() did, without needing the legacy codecs API.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        soup = BeautifulSoup(f, 'html.parser')

    table = soup.find('table')  # Adjust the selector if needed
    if table is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"No <table> element found in {file_path!r}")

    rows = table.find_all('tr')

    # Only the first row can supply headers; <th> cells found in later rows
    # are row headers and must not inflate the column list.
    first_cells = rows[0].find_all(['td', 'th']) if rows else []
    has_header = bool(first_cells) and all(
        cell.name == 'th' for cell in first_cells
    )
    headers = [cell.text for cell in first_cells] if has_header else []
    body_rows = rows[1:] if has_header else rows

    data = [
        [cell.text for cell in row.find_all(['td', 'th'])]
        for row in body_rows
    ]

    # columns=None lets pandas number the columns of a header-less table;
    # passing an empty list there would raise for any non-empty data.
    return pd.DataFrame(data, columns=headers or None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment