Last active
April 11, 2025 20:04
-
-
Save u8sand/4618e05a0181eb65c0b431c87df8a1cc to your computer and use it in GitHub Desktop.
Cascade remove all invalid records from a datapackage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import sys | |
import click | |
import pathlib | |
import logging | |
import shutil | |
import frictionless | |
# Send INFO-and-above log records to stderr.
logging.basicConfig(level=logging.INFO, stream=sys.stderr)
@click.command(help='''
  Prune a datapackage by removing any rows with validation errors
''')
@click.version_option()
@click.argument('datapackage', type=click.Path(file_okay=True, dir_okay=False, exists=True))
def main(datapackage):
    """Repeatedly validate the datapackage and drop rows that fail validation.

    Each pass validates the whole package, rewrites every resource that has
    row-level errors without the offending rows, and validates again. Removing
    a row can invalidate other rows (hence the loop), so this continues until
    the report is clean.

    Args:
        datapackage: Path to the datapackage descriptor file. Resource paths
            are resolved relative to its parent directory.

    Raises:
        click.ClickException: If errors remain but a pass removed no rows
            (for example, the remaining errors are not tied to a row), since
            further passes could never make progress.
    """
    pkg = frictionless.Package(datapackage)
    base_dir = pathlib.Path(datapackage).parent

    logging.info('pkg.validate()')
    validation = pkg.validate(limit_errors=float('inf'))

    while validation.stats['errors'] > 0:
        logging.info(f" -> {validation.stats['errors']} errors")
        removed_this_pass = 0

        for task in validation.tasks:
            if not task.errors:
                continue

            # Only row/cell-level errors carry a row number; other error kinds
            # (header, schema, resource, ...) cannot be fixed by dropping rows.
            error_rows = {
                row_number
                for row_number in (
                    getattr(error, 'row_number', None) for error in task.errors
                )
                if row_number is not None
            }
            if not error_rows:
                continue

            logging.info(f"removing {len(error_rows)} rows from {task.name}")
            with pkg.get_resource(task.name) as rc:
                target = base_dir / rc.path
                tmp_path = target.with_suffix('.tmp')
                # newline='' lets the csv module control line endings itself,
                # avoiding doubled carriage returns on Windows.
                with tmp_path.open('w', newline='') as fw:
                    writer = csv.DictWriter(
                        fw,
                        fieldnames=[field.name for field in rc.schema.fields],
                        delimiter='\t',
                    )
                    writer.writeheader()
                    # NOTE(review): numbering from 2 assumes the header is row 1
                    # and that reported row numbers count it -- confirm against
                    # frictionless for files with blank or skipped lines.
                    for i, row in enumerate(rc.row_stream, start=2):
                        if i in error_rows:
                            removed_this_pass += 1
                            continue
                        writer.writerow(row.to_dict())

            # Replace the original only after both the source resource and the
            # temp file are closed.
            shutil.move(tmp_path, target)

        if not removed_this_pass:
            # Without this guard the loop would spin forever on errors that
            # row removal cannot resolve.
            raise click.ClickException(
                f"{validation.stats['errors']} validation errors remain "
                "that cannot be resolved by removing rows"
            )

        logging.info('pkg.validate()')
        validation = pkg.validate(limit_errors=float('inf'))
# Invoke the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Allows you to pip install this gist with:
#  pip install git+https://gist.github.com/4618e05a0181eb65c0b431c87df8a1cc.git
[project]
name = 'frictionless_prune'
version = '1.1.0'
description = ''
authors = [
  {name = "Daniel J. B. Clarke", email = "[email protected]"}
]
requires-python = ">=3.9"
dependencies = [
  "frictionless",
  "click"
]

# Console entry point, declared in the standard [project.scripts] table to
# match the PEP 621 [project] metadata above.
[project.scripts]
frictionless_prune = 'frictionless_prune:main'

[build-system]
requires = ["poetry-core>=2.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment