Skip to content

Instantly share code, notes, and snippets.

@u8sand
Last active April 11, 2025 20:04
Show Gist options
  • Save u8sand/4618e05a0181eb65c0b431c87df8a1cc to your computer and use it in GitHub Desktop.
Save u8sand/4618e05a0181eb65c0b431c87df8a1cc to your computer and use it in GitHub Desktop.
Cascade remove all invalid records from a datapackage
import csv
import sys
import click
import pathlib
import logging
import shutil
import frictionless
# Send progress messages to stderr at INFO level.
logging.basicConfig(level=logging.INFO, stream=sys.stderr)
@click.command(help='''
Prune a datapackage by removing any rows with validation errors
''')
@click.version_option()
@click.argument('datapackage', type=click.Path(file_okay=True, dir_okay=False, exists=True))
def main(datapackage):
    """Repeatedly validate a datapackage and drop every row that has an error.

    Each pass validates the whole package, collects the row numbers of all
    row-level errors per resource, and rewrites that resource's file in place
    (as tab-separated values) without those rows. Removing a row can
    invalidate rows in other resources (e.g. dangling foreign keys), so the
    validate/remove cycle repeats until the package reports no errors.

    Args:
        datapackage: Path to the datapackage descriptor file. Resource paths
            are resolved relative to the descriptor's parent directory.

    Raises:
        click.ClickException: If errors remain but a pass removed no rows
            (e.g. only header/schema-level errors are left), since another
            pass could not make progress.
    """
    pkg = frictionless.Package(datapackage)
    base_dir = pathlib.Path(datapackage).parent
    logging.info('pkg.validate()')
    validation = pkg.validate(limit_errors=float('inf'))
    while validation.stats['errors'] > 0:
        logging.info(f" -> {validation.stats['errors']} errors")
        rows_removed = 0
        for task in validation.tasks:
            if not task.errors:
                continue
            # Only row-level errors carry a row number; other errors
            # (header, schema, ...) cannot be fixed by dropping a row.
            error_rows = {
                row_number
                for row_number in (
                    getattr(error, 'row_number', None) for error in task.errors
                )
                if row_number is not None
            }
            if not error_rows:
                continue
            logging.info(f"removing {len(error_rows)} rows from {task.name}")
            with pkg.get_resource(task.name) as rc:
                target = base_dir / rc.path
                tmp_path = target.with_suffix('.tmp')
                # newline='' lets the csv module control line endings itself.
                with tmp_path.open('w', newline='') as fw:
                    writer = csv.DictWriter(
                        fw,
                        fieldnames=[field.name for field in rc.schema.fields],
                        delimiter='\t',
                    )
                    writer.writeheader()
                    for row in rc.row_stream:
                        # Match on the row's own number (the same numbering
                        # the errors use) rather than a positional counter.
                        if row.row_number in error_rows:
                            rows_removed += 1
                            continue
                        writer.writerow(row.to_dict())
            # Replace the original only after the resource has been closed.
            shutil.move(tmp_path, target)
        if rows_removed == 0:
            # Nothing could be pruned; re-validating would loop forever.
            raise click.ClickException(
                f"{validation.stats['errors']} validation errors remain "
                "that cannot be fixed by removing rows"
            )
        logging.info('pkg.validate()')
        validation = pkg.validate(limit_errors=float('inf'))
# Invoke the CLI only when run as a script, not when imported.
if __name__ == '__main__':
    main()
# Allows you to pip install this gist with:
# pip install git+https://gist.github.com/4618e05a0181eb65c0b431c87df8a1cc.git
# Package metadata for the frictionless_prune script.
[project]
name = 'frictionless_prune'
version = '1.1.0'
description = ''
authors = [
{name = "Daniel J. B. Clarke", email = "[email protected]"}
]
requires-python = ">=3.9"
# Runtime dependencies imported by the script.
dependencies = [
"frictionless",
"click"
]
# Console entry point: the `frictionless_prune` command calls `main`
# in the `frictionless_prune` module.
[tool.poetry.scripts]
frictionless_prune = 'frictionless_prune:main'
# Build with poetry-core (2.x series).
[build-system]
requires = ["poetry-core>=2.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment