Created
April 3, 2025 14:36
-
-
Save cas--/314dde7ed51e7b3e9183eef4ee76bc02 to your computer and use it in GitHub Desktop.
Stream and extract S3 tar file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env -S uv run | |
# /// script | |
# requires-python = ">=3.12" | |
# dependencies = [ | |
# "boto3", | |
# "click", | |
# "zstandard", | |
# ] | |
# /// | |
""" | |
Stream and extract a tar file from S3 using minimal memory and disk. | |
This is a Python alternative to using AWS cli: | |
aws s3 cp s3://example-bucket/file.tar.gz - | tar -xz -C $(mktemp -d) | |
aws s3 cp s3://example-bucket/file.tar.zst - | tar --zstd -x -C $(mktemp -d) | |
Requires: [uv](https://docs.astral.sh/uv/getting-started/installation/) | |
Usage: | |
./s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d) | |
uv run s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d) | |
""" | |
import click | |
import boto3 | |
import tarfile | |
import zstandard | |
import os | |
def _split_s3_path(s3_path):
    """Split an ``s3://bucket/key`` path into a ``(bucket, key)`` tuple."""
    # Strip the scheme only when it is a prefix; ``str.replace`` would also
    # remove any "s3://" that happens to appear inside the object key.
    remainder = s3_path.removeprefix("s3://")
    bucket_name, _, object_key = remainder.partition("/")
    if not bucket_name or not object_key:
        raise ValueError(
            f"Expected an S3 path like s3://bucket/key, got {s3_path!r}"
        )
    return bucket_name, object_key


def s3_streaming_body(s3_path):
    """Return a file-like streaming body object from an S3 object.

    Args:
        s3_path (str): Object location, e.g. ``s3://bucket/path/to/key``.
            A path without the ``s3://`` scheme (``bucket/key``) is accepted
            as well.

    Returns:
        The ``"Body"`` entry of the ``get_object`` response. The caller is
        responsible for closing it.

    Raises:
        ValueError: If ``s3_path`` has no bucket or no object key.
    """
    # Validate before creating a client so malformed paths fail fast and
    # without needing AWS configuration.
    bucket_name, object_key = _split_s3_path(s3_path)
    s3_client = boto3.client("s3")
    s3_response = s3_client.get_object(Bucket=bucket_name, Key=object_key)
    return s3_response["Body"]
def extract_tar_gz(s3_path, extract_path):
    """Stream a gzip-compressed tar file from S3 and extract it.

    Args:
        s3_path (str): S3 location of the ``.tar.gz`` object, as accepted by
            ``s3_streaming_body``.
        extract_path (str): Local directory to extract contents to.
    """
    streaming_body = s3_streaming_body(s3_path)
    try:
        # "r|gz" is tarfile's stream mode: the archive is read sequentially
        # without seeking, which is what a network response body supports.
        with tarfile.open(fileobj=streaming_body, mode="r|gz") as tar:
            tar.extractall(path=extract_path, filter="tar")
    finally:
        # tarfile does not close a caller-supplied fileobj, so release the
        # underlying HTTP response explicitly, even if extraction fails.
        streaming_body.close()
def extract_tar_zstd(s3_path, extract_path):
    """Stream a zstd-compressed tar file from S3 and extract it.

    Args:
        s3_path (str): S3 location of the ``.tar.zst`` object, as accepted by
            ``s3_streaming_body``.
        extract_path (str): Local directory to extract contents to.
    """
    streaming_body = s3_streaming_body(s3_path)
    try:
        # Create a stream reader that decompresses as it reads, so neither
        # the compressed nor the decompressed archive is held in memory.
        # NOTE(review): stream_reader appears to stop at the end of the first
        # zstd frame by default; multi-frame archives may need
        # read_across_frames=True -- confirm against the zstandard docs.
        dctx = zstandard.ZstdDecompressor()
        with dctx.stream_reader(streaming_body) as reader:
            # Plain "r|" stream mode: tarfile sees already-decompressed
            # bytes and reads them sequentially without seeking.
            with tarfile.open(fileobj=reader, mode="r|") as tar:
                tar.extractall(path=extract_path, filter="tar")
    finally:
        # Release the underlying HTTP response even if extraction fails.
        streaming_body.close()
@click.command()
@click.argument("s3_path")
@click.argument("extract_path")
def extract(s3_path, extract_path):
    """Stream and extract tar file from S3 using minimal memory and disk."""
    # The docstring above doubles as the click help text, so details live in
    # comments instead.
    #
    # Choose the extractor from the file suffix *before* touching the
    # filesystem or printing progress, so an unsupported path fails without
    # leaving an empty directory or a misleading "Extracting..." message.
    if s3_path.endswith(".tar.gz"):
        extractor = extract_tar_gz
    elif s3_path.endswith(".tar.zst"):
        extractor = extract_tar_zstd
    else:
        raise ValueError("Tar file must end with .tar.gz or .tar.zst")

    os.makedirs(extract_path, exist_ok=True)
    click.echo(f"Extracting {s3_path} to {extract_path}...")
    extractor(s3_path, extract_path)
    click.echo("Done extracting.")
# Run the click command only when executed as a script, not when imported.
if __name__ == "__main__":
    extract()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment