cas-- · April 3, 2025 14:36
diff --git a/s3_tar_extract.py b/s3_tar_extract.py
 #! /usr/bin/env -S uv run
 # /// script
 # requires-python = ">=3.12"
 # dependencies = [
 #     "boto3",
 #     "click",
 #     "zstandard",
 # ]
 # ///
 """
 Stream and extract a tar file from S3 using minimal memory and disk.

 This is a Python alternative to using AWS cli:

    aws s3 cp s3://example-bucket/file.tar.gz - | tar -xz -C $(mktemp -d)
    aws s3 cp s3://example-bucket/file.tar.zst - | tar --zstd -x -C $(mktemp -d)

 Requires: [uv](https://docs.astral.sh/uv/getting-started/installation/)

 Usage:

    ./s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d)
    uv run s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d)
 """

 import click
 import boto3
 import tarfile

 import zstandard

 import os


 def s3_streaming_body(s3_path):
    """Return a file-like streaming body for the S3 object at ``s3_path``.

    Args:
        s3_path (str): Object location as ``s3://<bucket>/<key>``. The
            leading ``s3://`` scheme is optional.

    Returns:
        The ``Body`` entry of the ``get_object`` response.

    Raises:
        ValueError: If ``s3_path`` lacks a bucket name or an object key.
    """
    # Strip only the leading scheme: str.replace() would also rewrite any
    # later "s3://" occurrence that is legitimately part of the object key.
    bucket_name, _, object_key = s3_path.removeprefix("s3://").partition("/")
    if not bucket_name or not object_key:
        raise ValueError(
            f"Expected an S3 path like s3://bucket/key, got {s3_path!r}"
        )

    # The path is validated above so a malformed argument fails before any
    # AWS client is created or request is made.
    s3_client = boto3.client("s3")
    s3_response = s3_client.get_object(Bucket=bucket_name, Key=object_key)

    return s3_response["Body"]


 def extract_tar_gz(s3_path, extract_path):
    """Stream a gzip-compressed tar file from S3 and extract it.

    Args:
        s3_path (str): S3 location of the tar.gz file, passed to
            ``s3_streaming_body``.
        extract_path (str): Local directory to extract contents to.
    """
    streaming_body = s3_streaming_body(s3_path)

    try:
        # "r|gz" reads the archive as a forward-only stream, so members are
        # unpacked as bytes arrive instead of after a full download.
        with tarfile.open(fileobj=streaming_body, mode="r|gz") as tar:
            tar.extractall(path=extract_path, filter="tar")
    finally:
        # tarfile does not close a caller-supplied fileobj, so release the
        # S3 response stream explicitly, even when extraction fails.
        streaming_body.close()


 def extract_tar_zstd(s3_path, extract_path):
    """Stream a zstd-compressed tar file from S3 and extract it.

    Args:
        s3_path (str): S3 location of the tar.zst file, passed to
            ``s3_streaming_body``.
        extract_path (str): Local directory to extract contents to.
    """
    streaming_body = s3_streaming_body(s3_path)

    try:
        dctx = zstandard.ZstdDecompressor()
        # Decompress lazily as tarfile pulls bytes. read_across_frames=True
        # keeps decoding past the first zstd frame, so archives written as
        # several concatenated frames are not silently truncated.
        with dctx.stream_reader(
            streaming_body, read_across_frames=True
        ) as reader:
            # Plain "r|": the reader already yields uncompressed tar bytes.
            with tarfile.open(fileobj=reader, mode="r|") as tar:
                tar.extractall(path=extract_path, filter="tar")
    finally:
        # Release the S3 response stream even when extraction fails.
        streaming_body.close()


 @click.command()
 @click.argument("s3_path")
 @click.argument("extract_path")
 def extract(s3_path, extract_path):
    """Stream and extract tar file from S3 using minimal memory and disk."""
    # Pick the extractor first so an unsupported file type is rejected
    # before the destination directory is created as a side effect.
    if s3_path.endswith(".tar.gz"):
        extractor = extract_tar_gz
    elif s3_path.endswith(".tar.zst"):
        extractor = extract_tar_zstd
    else:
        raise ValueError("Tar file must end with .tar.gz or .tar.zst")

    # exist_ok=True lets callers pass an already-existing directory
    # (for example one produced by mktemp -d).
    os.makedirs(extract_path, exist_ok=True)

    click.echo(f"Extracting {s3_path} to {extract_path}...")
    extractor(s3_path, extract_path)
    click.echo("Done extracting.")


 # Invoke the click command only when this file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    extract()
	#! /usr/bin/env -S uv run
	# /// script
	# requires-python = ">=3.12"
	# dependencies = [
	# "boto3",
	# "click",
	# "zstandard",
	# ]
	# ///
	"""
	Stream and extract a tar file from S3 using minimal memory and disk.

	This is a Python alternative to using AWS cli:

	aws s3 cp s3://example-bucket/file.tar.gz - | tar -xz -C $(mktemp -d)
	aws s3 cp s3://example-bucket/file.tar.zst - | tar --zstd -x -C $(mktemp -d)

	Requires: [uv](https://docs.astral.sh/uv/getting-started/installation/)

	Usage:

	./s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d)
	uv run s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d)
	"""

	import click
	import boto3
	import tarfile

	import zstandard

	import os


	def s3_streaming_body(s3_path):
	    """Return a file-like streaming body for the S3 object at ``s3_path``.

	    Args:
	        s3_path (str): Object location as ``s3://<bucket>/<key>``. The
	            leading ``s3://`` scheme is optional.

	    Returns:
	        The ``Body`` entry of the ``get_object`` response.

	    Raises:
	        ValueError: If ``s3_path`` lacks a bucket name or an object key.
	    """
	    # Strip only the leading scheme: str.replace() would also rewrite any
	    # later "s3://" occurrence that is legitimately part of the object key.
	    bucket_name, _, object_key = s3_path.removeprefix("s3://").partition("/")
	    if not bucket_name or not object_key:
	        raise ValueError(
	            f"Expected an S3 path like s3://bucket/key, got {s3_path!r}"
	        )

	    # The path is validated above so a malformed argument fails before any
	    # AWS client is created or request is made.
	    s3_client = boto3.client("s3")
	    s3_response = s3_client.get_object(Bucket=bucket_name, Key=object_key)

	    return s3_response["Body"]


	def extract_tar_gz(s3_path, extract_path):
	    """Stream a gzip-compressed tar file from S3 and extract it.

	    Args:
	        s3_path (str): S3 location of the tar.gz file, passed to
	            ``s3_streaming_body``.
	        extract_path (str): Local directory to extract contents to.
	    """
	    streaming_body = s3_streaming_body(s3_path)

	    try:
	        # "r|gz" reads the archive as a forward-only stream, so members are
	        # unpacked as bytes arrive instead of after a full download. The
	        # mode must contain a bare "|"; a backslash before it is rejected.
	        with tarfile.open(fileobj=streaming_body, mode="r|gz") as tar:
	            tar.extractall(path=extract_path, filter="tar")
	    finally:
	        # tarfile does not close a caller-supplied fileobj, so release the
	        # S3 response stream explicitly, even when extraction fails.
	        streaming_body.close()


	def extract_tar_zstd(s3_path, extract_path):
	    """Stream a zstd-compressed tar file from S3 and extract it.

	    Args:
	        s3_path (str): S3 location of the tar.zst file, passed to
	            ``s3_streaming_body``.
	        extract_path (str): Local directory to extract contents to.
	    """
	    streaming_body = s3_streaming_body(s3_path)

	    try:
	        dctx = zstandard.ZstdDecompressor()
	        # Decompress lazily as tarfile pulls bytes. read_across_frames=True
	        # keeps decoding past the first zstd frame, so archives written as
	        # several concatenated frames are not silently truncated.
	        with dctx.stream_reader(
	            streaming_body, read_across_frames=True
	        ) as reader:
	            # Plain "r|": the reader already yields uncompressed tar bytes.
	            with tarfile.open(fileobj=reader, mode="r|") as tar:
	                tar.extractall(path=extract_path, filter="tar")
	    finally:
	        # Release the S3 response stream even when extraction fails.
	        streaming_body.close()


	@click.command()
	@click.argument("s3_path")
	@click.argument("extract_path")
	def extract(s3_path, extract_path):
	    """Stream and extract tar file from S3 using minimal memory and disk."""
	    # Pick the extractor first so an unsupported file type is rejected
	    # before the destination directory is created as a side effect.
	    if s3_path.endswith(".tar.gz"):
	        extractor = extract_tar_gz
	    elif s3_path.endswith(".tar.zst"):
	        extractor = extract_tar_zstd
	    else:
	        raise ValueError("Tar file must end with .tar.gz or .tar.zst")

	    # exist_ok=True lets callers pass an already-existing directory
	    # (for example one produced by mktemp -d).
	    os.makedirs(extract_path, exist_ok=True)

	    click.echo(f"Extracting {s3_path} to {extract_path}...")
	    extractor(s3_path, extract_path)
	    click.echo("Done extracting.")


	# Invoke the click command only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    extract()