Last active
April 17, 2024 14:48
-
-
Save Marinell0/98e141ac3c01cac9254cb325cf97f210 to your computer and use it in GitHub Desktop.
Filter lines from a CSV file inside a zip archive. The file is streamed row by row, so only the rows that match the pattern are handed to pandas for parsing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import zipfile | |
import io | |
import re | |
from typing import Iterator | |
def rows_with_index(pattern: re.Pattern[str], sep: str, file: io.BufferedIOBase) -> Iterator[str]:
    """
    Lazily yield the rows of a binary file that match a regex, prefixed with their row number.

    Each yielded string has the form ``"<row_index><sep><row>"``, where
    ``row_index`` is the zero-based position of the row among all rows read
    from ``file`` (matching or not). The row text is yielded as read, so it
    keeps its trailing line ending when it has one.

    Parameters
    ----------
    pattern: re.Pattern[str]
        compiled regex pattern; it is applied with ``match``, i.e. anchored
        at the start of the row
    sep: str
        separator placed between the row index and the row text
    file: io.BufferedIOBase
        binary file object to read; it is decoded through ``io.TextIOWrapper``
        with that wrapper's default encoding

    Yields
    ------
    str
        the index-prefixed text of every matching row
    """
    # enumerate counts every row, so the prefix is the row's position in the
    # original file rather than its position among the matches.
    for row_index, row in enumerate(io.TextIOWrapper(file)):
        if pattern.match(row):
            yield f"{row_index}{sep}{row}"
def filter_lines_from_zipped_csv(path: str, pattern: re.Pattern[str], sep: str = ",", column_names: list[str] | None = None) -> pd.DataFrame:
    """
    Filter lines from a zipped csv file (with one file inside) using a regex pattern.

    The pattern needs to be compiled generating the re.Pattern object.
    Only the first entry listed in the archive is read.
    If no lines are found, an empty dataframe is returned.

    Parameters
    ----------
    path: str
        path to the zipped csv file
    pattern: re.Pattern[str]
        compiled regex pattern
    sep: str, default ","
        separator of the csv file
    column_names: list[str] | None, default None
        names passed to ``pandas.read_csv`` as ``names``; because the first
        parsed column is the prepended row index (``index_col=0``), the first
        name labels that index column

    Returns
    -------
    pd.DataFrame
        the matching rows, indexed by the row number prepended by
        ``rows_with_index``
    """
    with zipfile.ZipFile(path, 'r') as zip_file:
        with zip_file.open(zip_file.namelist()[0]) as file:
            # The row generator is lazy, so it must be consumed while the
            # archive member is still open.
            filtered_csv = "\n".join(rows_with_index(pattern, sep, file))

    try:
        return pd.read_csv(
            io.StringIO(filtered_csv),
            sep=sep,
            index_col=0,
            header=None,
            names=column_names,
            lineterminator='\n',
        )
    except pd.errors.EmptyDataError:
        # read_csv refuses completely empty input when it cannot infer any
        # columns; honour the documented contract and return an empty frame.
        return pd.DataFrame(columns=column_names)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment