Marinell0 · April 17, 2024 14:48
diff --git a/filter_lines_from_zip_csv.py b/filter_lines_from_zip_csv.py
 import pandas as pd
 import zipfile
 import io
 import re

 from typing import Iterator

 def rows_with_index(pattern, sep, file) -> Iterator[str]:
    """
    Yield the lines of a binary file that match a pattern, prefixed with their line number.

    The binary stream is decoded with ``io.TextIOWrapper`` using its default
    settings. Every line is counted (starting at 0), whether it matches or
    not, so the prefix is the position of the line in the original file.
    Parameters
    ----------
        pattern:
            compiled regex; applied with ``pattern.match``, i.e. anchored at
            the start of each line
        sep:
            text inserted between the line number and the line
        file:
            binary file object to read the lines from
    Yields
    ------
        str
            ``"<line number><sep><line>"``; the line keeps its trailing
            newline when it has one
    """
    decoded_lines = io.TextIOWrapper(file)
    for line_number, line in enumerate(decoded_lines):
        if not pattern.match(line):
            continue
        yield f"{line_number}{sep}{line}"

 def filter_lines_from_zipped_csv(path: str, pattern: re.Pattern[str], sep: str = ",", column_names: list[str] | None = None) -> pd.DataFrame:
    """
    Filter lines from a zipped csv file (with one file inside) using a regex pattern.
    The pattern needs to be compiled generating the re.Pattern object.
    Only the first member of the archive is read. No line is treated as a
    header: a header line ends up in the result only if it matches the pattern.
    The index of the returned dataframe is the zero-based line number each row
    had in the original file.
    If no lines are found, an empty dataframe is returned.
    Parameters
    ----------
        path: str
            path to the zipped csv file
        pattern: re.Pattern[str]
            compiled regex pattern
        sep: str, default ","
            separator of the csv file
        column_names: list[str] | None, default None
            names passed to ``pandas.read_csv`` for the columns of the result
    Returns
    -------
        pd.DataFrame
            the matching rows, indexed by their original line number
    """
    with zipfile.ZipFile(path, 'r') as zip_file:
        with zip_file.open(zip_file.namelist()[0]) as file:
            rows = rows_with_index(pattern, sep, file)
            # Each row may still carry its own line terminator; strip it before
            # joining so the text does not contain spurious blank lines. The
            # generator must be consumed while the archive member is open.
            csv_text = "\n".join(row.rstrip("\n") for row in rows)

    try:
        return pd.read_csv(io.StringIO(csv_text), sep=sep, index_col=0, header=None, names=column_names, lineterminator='\n')
    except pd.errors.EmptyDataError:
        # Nothing matched and pandas had no columns to infer: honour the
        # documented contract instead of propagating the parser error.
        return pd.DataFrame(columns=column_names)
	import pandas as pd
	import zipfile
	import io
	import re

	from typing import Iterator

	def rows_with_index(pattern, sep, file) -> Iterator[str]:
	    """
	    Yield the lines of a binary file that match a pattern, prefixed with their line number.

	    The binary stream is decoded with ``io.TextIOWrapper`` using its default
	    settings. Every line is counted (starting at 0), whether it matches or
	    not, so the prefix is the position of the line in the original file.
	    Parameters
	    ----------
	    pattern:
	        compiled regex; applied with ``pattern.match``, i.e. anchored at
	        the start of each line
	    sep:
	        text inserted between the line number and the line
	    file:
	        binary file object to read the lines from
	    Yields
	    ------
	    str
	        ``"<line number><sep><line>"``; the line keeps its trailing
	        newline when it has one
	    """
	    for row_index, row in enumerate(io.TextIOWrapper(file)):
	        if pattern.match(row):
	            yield f"{row_index}{sep}{row}"

	def filter_lines_from_zipped_csv(path: str, pattern: re.Pattern[str], sep: str = ",", column_names: list[str] | None = None) -> pd.DataFrame:
	    """
	    Filter lines from a zipped csv file (with one file inside) using a regex pattern.
	    The pattern needs to be compiled generating the re.Pattern object.
	    Only the first member of the archive is read. No line is treated as a
	    header: a header line ends up in the result only if it matches the pattern.
	    The index of the returned dataframe is the zero-based line number each row
	    had in the original file.
	    If no lines are found, an empty dataframe is returned.
	    Parameters
	    ----------
	    path: str
	        path to the zipped csv file
	    pattern: re.Pattern[str]
	        compiled regex pattern
	    sep: str, default ","
	        separator of the csv file
	    column_names: list[str] | None, default None
	        names passed to ``pandas.read_csv`` for the columns of the result
	    Returns
	    -------
	    pd.DataFrame
	        the matching rows, indexed by their original line number
	    """
	    with zipfile.ZipFile(path, 'r') as zip_file:
	        with zip_file.open(zip_file.namelist()[0]) as file:
	            rows = rows_with_index(pattern, sep, file)
	            # Each row may still carry its own line terminator; strip it before
	            # joining so the text does not contain spurious blank lines. The
	            # generator must be consumed while the archive member is open.
	            csv_text = "\n".join(row.rstrip("\n") for row in rows)

	    try:
	        return pd.read_csv(io.StringIO(csv_text), sep=sep, index_col=0, header=None, names=column_names, lineterminator='\n')
	    except pd.errors.EmptyDataError:
	        # Nothing matched and pandas had no columns to infer: honour the
	        # documented contract instead of propagating the parser error.
	        return pd.DataFrame(columns=column_names)