Last active
November 6, 2024 11:05
-
-
Save zachschillaci27/887c272cbdda63316d9a02a770d15040 to your computer and use it in GitHub Desktop.
Extract URLs from hyperlinks in Excel using openpyxl and pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import openpyxl | |
def _get_link_if_exists(cell) -> str | None: | |
try: | |
return cell.hyperlink.target | |
except AttributeError: | |
return None | |
def extract_hyperlinks_from_xlsx( | |
file_name: str, sheet_name: str, columns_to_parse: list[str], row_header: int = 1 | |
) -> pd.DataFrame: | |
df = pd.read_excel(file_name, sheet_name) | |
ws = openpyxl.load_workbook(file_name)[sheet_name] | |
for column in columns_to_parse: | |
row_offset = row_header + 1 | |
column_index = list(df.columns).index(column) + 1 | |
df[column] = [ | |
_get_link_if_exists(ws.cell(row=row_offset + i, column=column_index)) | |
for i in range(len(df[column])) | |
] | |
return df |
Excellent, thank you
Thanks!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you!