Skip to content

Instantly share code, notes, and snippets.

@cas--
Created April 3, 2025 14:36
Show Gist options
  • Save cas--/314dde7ed51e7b3e9183eef4ee76bc02 to your computer and use it in GitHub Desktop.
Save cas--/314dde7ed51e7b3e9183eef4ee76bc02 to your computer and use it in GitHub Desktop.
Stream and extract S3 tar file
#! /usr/bin/env -S uv run
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "boto3",
# "click",
# "zstandard",
# ]
# ///
"""
Stream and extract a tar file from S3 using minimal memory and disk.
This is a Python alternative to using the AWS CLI:
aws s3 cp s3://example-bucket/file.tar.gz - | tar -xz -C $(mktemp -d)
aws s3 cp s3://example-bucket/file.tar.zst - | tar --zstd -x -C $(mktemp -d)
Requires: [uv](https://docs.astral.sh/uv/getting-started/installation/)
Usage:
./s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d)
uv run s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d)
"""
import click
import boto3
import tarfile
import zstandard
import os
def s3_streaming_body(s3_path):
    """Return a file-like streaming body object from an S3 object.

    Args:
        s3_path (str): Object location as ``s3://bucket/key``. The ``s3://``
            scheme prefix is optional.

    Returns:
        The ``Body`` entry of the ``get_object`` response, which the
        extractors in this file pass to ``tarfile`` as a file object.

    Raises:
        ValueError: If ``s3_path`` does not contain both a bucket name and
            an object key.
    """
    # Strip only a leading scheme: str.replace would also remove any
    # "s3://" that happens to occur later inside the object key.
    bucket_name, _, object_key = s3_path.removeprefix("s3://").partition("/")
    if not bucket_name or not object_key:
        raise ValueError(
            f"Invalid S3 path {s3_path!r}: expected s3://<bucket>/<key>"
        )

    s3_client = boto3.client("s3")
    s3_response = s3_client.get_object(Bucket=bucket_name, Key=object_key)
    return s3_response["Body"]
def extract_tar_gz(s3_path, extract_path):
    """Stream a tar.gz file from S3 and extract it.

    Args:
        s3_path (str): S3 location of the tar.gz file, e.g.
            ``s3://example-bucket/file.tar.gz``
        extract_path (str): Local directory to extract contents to
    """
    streaming_body = s3_streaming_body(s3_path)
    try:
        # "r|gz" treats the body as a forward-only gzip stream, so the
        # archive is unpacked while it downloads instead of being buffered.
        with tarfile.open(fileobj=streaming_body, mode="r|gz") as tar:
            tar.extractall(path=extract_path, filter="tar")
    finally:
        # tarfile does not close a caller-supplied fileobj, so release the
        # S3 response stream explicitly, even when extraction fails.
        streaming_body.close()
def extract_tar_zstd(s3_path, extract_path):
    """Stream a zstd-compressed tar file from S3 and extract it.

    Args:
        s3_path (str): S3 location of the zstd-compressed tar file, e.g.
            ``s3://example-bucket/file.tar.zst``
        extract_path (str): Local directory to extract contents to
    """
    streaming_body = s3_streaming_body(s3_path)
    try:
        dctx = zstandard.ZstdDecompressor()
        # Decompress lazily as tarfile reads. read_across_frames=True keeps
        # reading past the end of the first zstd frame, so archives made of
        # several concatenated frames are not silently truncated.
        with dctx.stream_reader(
            streaming_body, read_across_frames=True
        ) as reader:
            # "r|" = uncompressed forward-only tar stream; the zstd layer
            # is already handled by the reader.
            with tarfile.open(fileobj=reader, mode="r|") as tar:
                tar.extractall(path=extract_path, filter="tar")
    finally:
        # Release the S3 response stream even when extraction fails.
        streaming_body.close()
@click.command()
@click.argument("s3_path")
@click.argument("extract_path")
def extract(s3_path, extract_path):
    """Stream and extract tar file from S3 using minimal memory and disk."""
    # Choose the extractor from the file extension BEFORE touching the
    # filesystem, so an unsupported archive name does not leave behind a
    # freshly created (empty) target directory.
    if s3_path.endswith(".tar.gz"):
        extractor = extract_tar_gz
    elif s3_path.endswith(".tar.zst"):
        extractor = extract_tar_zstd
    else:
        raise ValueError("Tar file must end with .tar.gz or .tar.zst")

    os.makedirs(extract_path, exist_ok=True)
    click.echo(f"Extracting {s3_path} to {extract_path}...")
    extractor(s3_path, extract_path)
    click.echo("Done extracting.")
if __name__ == "__main__":
    # Executed as a script: hand control to the click command.
    extract()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment