Skip to content

Instantly share code, notes, and snippets.

@smothiki
Created February 18, 2026 14:33
Show Gist options
  • Select an option

  • Save smothiki/fd540f515951f680b3ac7e98f92533fd to your computer and use it in GitHub Desktop.

Select an option

Save smothiki/fd540f515951f680b3ac7e98f92533fd to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Script to download model metadata files from S3 and create structured JSON output.
Uses AWS CLI to download modelmetadata.json and metadata.yaml files from a single S3 path.
Usage examples:
# Basic usage for NGC
python download_model_metadata.py \
--repo-type NGC \
--s3-path "s3://my-bucket/models/model1/" \
--output "ngc_model_metadata.json"
# With AWS profile and region
python download_model_metadata.py \
--repo-type HuggingFace \
--s3-path "s3://my-bucket/hf-models/" \
--aws-profile my-profile \
--aws-region us-west-2 \
--output "hf_model_metadata.json"
# With custom S3 endpoint and CA bundle
python download_model_metadata.py \
--repo-type NGC \
--s3-path "s3://my-bucket/models/" \
--endpoint-url "https://s3.example.com" \
--ca-bundle "/path/to/ca-bundle.crt"
"""
import json
import subprocess
import argparse
import sys
import os
import tempfile
from pathlib import Path
def run_aws_command(command, env=None):
"""Run AWS CLI command and return the result."""
try:
result = subprocess.run(
command,
shell=True,
capture_output=True,
text=True,
check=True,
env=env
)
return result.stdout.strip()
except subprocess.CalledProcessError as e:
print(f"Error running AWS command: {command}", file=sys.stderr)
print(f"Error: {e.stderr}", file=sys.stderr)
sys.exit(1)
def download_file_from_s3(s3_path, local_path, endpoint_url=None, ca_bundle=None, env=None):
"""Download a file from S3 using AWS CLI."""
command_parts = ["aws s3 cp"]
if endpoint_url:
command_parts.append(f"--endpoint-url {endpoint_url}")
if ca_bundle:
command_parts.append(f"--ca-bundle {ca_bundle}")
command_parts.extend([s3_path, local_path])
command = " ".join(command_parts)
print(f"Downloading: {s3_path} -> {local_path}", file=sys.stderr)
run_aws_command(command, env=env)
if not os.path.exists(local_path):
raise FileNotFoundError(f"Failed to download file: {s3_path}")
return local_path
def read_file_content(file_path):
"""Read and return the content of a file."""
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
except Exception as e:
print(f"Error reading file {file_path}: {e}", file=sys.stderr)
sys.exit(1)
def create_model_metadata_json(repo_type, manifest_yaml_content, model_info_content):
"""Create the structured JSON with the specified format."""
# Map repo type to the appropriate metadata field
repo_type_upper = repo_type.upper()
if repo_type_upper == "NGC":
metadata_key = "ngc_metadata"
elif repo_type_upper == "HUGGINGFACE":
metadata_key = "huggingface_metadata"
elif repo_type_upper == "MLFLOW":
metadata_key = "mlflow_metadata"
else:
# Default to using the repo type as lowercase with _metadata suffix
metadata_key = f"{repo_type.lower()}_metadata"
result = {
"model_repo_type": repo_type_upper,
metadata_key: {
"manifest_yaml": manifest_yaml_content,
"model_info": model_info_content
}
}
return result
def main():
parser = argparse.ArgumentParser(
description="Download model metadata files from S3 and create structured JSON"
)
parser.add_argument(
"--repo-type",
required=True,
help="Repository type (e.g., NGC, HuggingFace, MLFlow)"
)
parser.add_argument(
"--s3-path",
required=True,
help="S3 path containing both metadata.yaml and modelmetadata.json files (e.g., s3://bucket/path/to/model/)"
)
parser.add_argument(
"--output",
"-o",
default="model_metadata_output.json",
help="Output JSON file path (default: model_metadata_output.json)"
)
parser.add_argument(
"--temp-dir",
help="Temporary directory for downloads (default: system temp)"
)
parser.add_argument(
"--aws-profile",
help="AWS profile to use (optional)"
)
parser.add_argument(
"--aws-region",
help="AWS region to use (optional)"
)
parser.add_argument(
"--endpoint-url",
help="S3 endpoint URL for custom S3-compatible storage (e.g., https://s3.example.com)"
)
parser.add_argument(
"--ca-bundle",
help="Path to CA bundle file for SSL verification (e.g., /path/to/ca-bundle.crt)"
)
args = parser.parse_args()
# Set AWS profile and region if provided
env = os.environ.copy()
if args.aws_profile:
env['AWS_PROFILE'] = args.aws_profile
if args.aws_region:
env['AWS_DEFAULT_REGION'] = args.aws_region
# Create temporary directory for downloads
temp_dir = args.temp_dir or tempfile.mkdtemp()
temp_path = Path(temp_dir)
temp_path.mkdir(parents=True, exist_ok=True)
try:
print(f"Using temporary directory: {temp_dir}", file=sys.stderr)
# Ensure S3 path ends with /
s3_base_path = args.s3_path.rstrip('/') + '/'
# Construct S3 paths for both files
metadata_yaml_s3 = s3_base_path + "metadata.yaml"
model_metadata_s3 = s3_base_path + "modelmetadata.json"
print(f"S3 base path: {s3_base_path}", file=sys.stderr)
print(f"Looking for files:", file=sys.stderr)
print(f" - {metadata_yaml_s3}", file=sys.stderr)
print(f" - {model_metadata_s3}", file=sys.stderr)
# Define local file paths
metadata_yaml_local = temp_path / "metadata.yaml"
model_metadata_local = temp_path / "modelmetadata.json"
# Download files from S3
download_file_from_s3(
metadata_yaml_s3,
str(metadata_yaml_local),
endpoint_url=args.endpoint_url,
ca_bundle=args.ca_bundle,
env=env
)
download_file_from_s3(
model_metadata_s3,
str(model_metadata_local),
endpoint_url=args.endpoint_url,
ca_bundle=args.ca_bundle,
env=env
)
# Read file contents
print("Reading downloaded files...", file=sys.stderr)
manifest_yaml_content = read_file_content(metadata_yaml_local)
model_info_content = read_file_content(model_metadata_local)
# Create structured JSON
print(f"Creating structured JSON for repo type: {args.repo_type}", file=sys.stderr)
result_json = create_model_metadata_json(
args.repo_type,
manifest_yaml_content,
model_info_content
)
# Print JSON to stdout
json_output = json.dumps(result_json, indent=2, ensure_ascii=False)
print(json_output)
# Write output JSON to file
output_path = Path(args.output)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(json_output)
print(f"\nSuccessfully created: {output_path}", file=sys.stderr)
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
finally:
# Clean up temporary files if we created the temp directory
if not args.temp_dir:
import shutil
try:
shutil.rmtree(temp_dir)
print(f"Cleaned up temporary directory: {temp_dir}", file=sys.stderr)
except Exception as e:
print(f"Warning: Could not clean up temp directory {temp_dir}: {e}", file=sys.stderr)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment