Created
February 18, 2026 14:33
-
-
Save smothiki/fd540f515951f680b3ac7e98f92533fd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Script to download model metadata files from S3 and create structured JSON output. | |
| Uses AWS CLI to download modelmetadata.json and metadata.yaml files from a single S3 path. | |
| Usage examples: | |
| # Basic usage for NGC | |
| python download_model_metadata.py \ | |
| --repo-type NGC \ | |
| --s3-path "s3://my-bucket/models/model1/" \ | |
| --output "ngc_model_metadata.json" | |
| # With AWS profile and region | |
| python download_model_metadata.py \ | |
| --repo-type HuggingFace \ | |
| --s3-path "s3://my-bucket/hf-models/" \ | |
| --aws-profile my-profile \ | |
| --aws-region us-west-2 \ | |
| --output "hf_model_metadata.json" | |
| # With custom S3 endpoint and CA bundle | |
| python download_model_metadata.py \ | |
| --repo-type NGC \ | |
| --s3-path "s3://my-bucket/models/" \ | |
| --endpoint-url "https://s3.example.com" \ | |
| --ca-bundle "/path/to/ca-bundle.crt" | |
| """ | |
import argparse
import json
import os
import shlex
import subprocess
import sys
import tempfile
from pathlib import Path
def run_aws_command(command, env=None):
    """Run an AWS CLI command line and return its stripped stdout.

    Args:
        command: Full command line as a single string (e.g. "aws s3 cp ...").
        env: Optional environment mapping for the subprocess (used to pass
            AWS_PROFILE / AWS_DEFAULT_REGION).

    Returns:
        The command's stdout with surrounding whitespace stripped.

    Exits the process with status 1 if the command returns non-zero.
    """
    try:
        # Tokenize with shlex and run with shell=False: this keeps the
        # string-based interface but removes the shell-injection risk of
        # interpolating S3 paths into a shell=True command line.
        result = subprocess.run(
            shlex.split(command),
            capture_output=True,
            text=True,
            check=True,
            env=env,
        )
        return result.stdout.strip()
    except subprocess.CalledProcessError as e:
        print(f"Error running AWS command: {command}", file=sys.stderr)
        print(f"Error: {e.stderr}", file=sys.stderr)
        sys.exit(1)
def download_file_from_s3(s3_path, local_path, endpoint_url=None, ca_bundle=None, env=None):
    """Download a single object from S3 using the AWS CLI.

    Args:
        s3_path: Full s3:// URI of the object to fetch.
        local_path: Destination path on the local filesystem.
        endpoint_url: Optional custom S3-compatible endpoint URL.
        ca_bundle: Optional path to a CA bundle for SSL verification.
        env: Optional environment mapping for the subprocess.

    Returns:
        local_path, once the file exists on disk.

    Raises:
        FileNotFoundError: If the CLI succeeded but the file is missing.
    """
    command_parts = ["aws", "s3", "cp"]
    if endpoint_url:
        command_parts += ["--endpoint-url", endpoint_url]
    if ca_bundle:
        command_parts += ["--ca-bundle", ca_bundle]
    command_parts += [s3_path, local_path]
    # Quote every token so paths containing spaces or shell metacharacters
    # survive the trip through the string-based command runner intact.
    command = " ".join(shlex.quote(part) for part in command_parts)
    # Progress goes to stderr so stdout remains pure JSON for piping.
    print(f"Downloading: {s3_path} -> {local_path}", file=sys.stderr)
    run_aws_command(command, env=env)
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"Failed to download file: {s3_path}")
    return local_path
def read_file_content(file_path):
    """Return the entire UTF-8 text content of *file_path*.

    Exits the process with status 1 if the file cannot be read.
    """
    try:
        return Path(file_path).read_text(encoding="utf-8")
    except Exception as e:
        print(f"Error reading file {file_path}: {e}", file=sys.stderr)
        sys.exit(1)
def create_model_metadata_json(repo_type, manifest_yaml_content, model_info_content):
    """Build the structured output document for one model.

    Args:
        repo_type: Repository type label (e.g. "NGC", "HuggingFace", "MLFlow").
        manifest_yaml_content: Raw text of the downloaded metadata.yaml.
        model_info_content: Raw text of the downloaded modelmetadata.json.

    Returns:
        dict with "model_repo_type" (upper-cased) and a
        "<repo_type_lowercase>_metadata" key holding both file contents.
    """
    # Every previously special-cased repo type (NGC, HUGGINGFACE, MLFLOW)
    # mapped to exactly f"{repo_type.lower()}_metadata" — the same rule as
    # the fallback — so the per-type branches were redundant and are gone.
    metadata_key = f"{repo_type.lower()}_metadata"
    return {
        "model_repo_type": repo_type.upper(),
        metadata_key: {
            "manifest_yaml": manifest_yaml_content,
            "model_info": model_info_content,
        },
    }
def main():
    """CLI entry point: download metadata.yaml and modelmetadata.json from a
    single S3 prefix, combine them into one structured JSON document, print
    it to stdout, and write it to the --output file.

    Exits with status 1 on any download/read failure. All progress messages
    go to stderr so stdout stays pure JSON for piping.
    """
    parser = argparse.ArgumentParser(
        description="Download model metadata files from S3 and create structured JSON"
    )
    parser.add_argument(
        "--repo-type",
        required=True,
        help="Repository type (e.g., NGC, HuggingFace, MLFlow)"
    )
    parser.add_argument(
        "--s3-path",
        required=True,
        help="S3 path containing both metadata.yaml and modelmetadata.json files (e.g., s3://bucket/path/to/model/)"
    )
    parser.add_argument(
        "--output",
        "-o",
        default="model_metadata_output.json",
        help="Output JSON file path (default: model_metadata_output.json)"
    )
    parser.add_argument(
        "--temp-dir",
        help="Temporary directory for downloads (default: system temp)"
    )
    parser.add_argument(
        "--aws-profile",
        help="AWS profile to use (optional)"
    )
    parser.add_argument(
        "--aws-region",
        help="AWS region to use (optional)"
    )
    parser.add_argument(
        "--endpoint-url",
        help="S3 endpoint URL for custom S3-compatible storage (e.g., https://s3.example.com)"
    )
    parser.add_argument(
        "--ca-bundle",
        help="Path to CA bundle file for SSL verification (e.g., /path/to/ca-bundle.crt)"
    )
    args = parser.parse_args()
    # Set AWS profile and region if provided — passed to the AWS CLI via the
    # subprocess environment rather than extra command-line flags.
    env = os.environ.copy()
    if args.aws_profile:
        env['AWS_PROFILE'] = args.aws_profile
    if args.aws_region:
        env['AWS_DEFAULT_REGION'] = args.aws_region
    # Create temporary directory for downloads. When the user supplies
    # --temp-dir we use (and keep) it; otherwise we mkdtemp and clean up.
    temp_dir = args.temp_dir or tempfile.mkdtemp()
    temp_path = Path(temp_dir)
    temp_path.mkdir(parents=True, exist_ok=True)
    try:
        print(f"Using temporary directory: {temp_dir}", file=sys.stderr)
        # Ensure S3 path ends with / (normalize any trailing-slash variant)
        s3_base_path = args.s3_path.rstrip('/') + '/'
        # Construct S3 paths for both files — the file names are fixed by
        # convention: metadata.yaml and modelmetadata.json under the prefix.
        metadata_yaml_s3 = s3_base_path + "metadata.yaml"
        model_metadata_s3 = s3_base_path + "modelmetadata.json"
        print(f"S3 base path: {s3_base_path}", file=sys.stderr)
        print(f"Looking for files:", file=sys.stderr)
        print(f"  - {metadata_yaml_s3}", file=sys.stderr)
        print(f"  - {model_metadata_s3}", file=sys.stderr)
        # Define local file paths
        metadata_yaml_local = temp_path / "metadata.yaml"
        model_metadata_local = temp_path / "modelmetadata.json"
        # Download files from S3 (each call exits the process on CLI failure)
        download_file_from_s3(
            metadata_yaml_s3,
            str(metadata_yaml_local),
            endpoint_url=args.endpoint_url,
            ca_bundle=args.ca_bundle,
            env=env
        )
        download_file_from_s3(
            model_metadata_s3,
            str(model_metadata_local),
            endpoint_url=args.endpoint_url,
            ca_bundle=args.ca_bundle,
            env=env
        )
        # Read file contents (raw text; no YAML/JSON parsing is performed)
        print("Reading downloaded files...", file=sys.stderr)
        manifest_yaml_content = read_file_content(metadata_yaml_local)
        model_info_content = read_file_content(model_metadata_local)
        # Create structured JSON
        print(f"Creating structured JSON for repo type: {args.repo_type}", file=sys.stderr)
        result_json = create_model_metadata_json(
            args.repo_type,
            manifest_yaml_content,
            model_info_content
        )
        # Print JSON to stdout (the only stdout output of the script)
        json_output = json.dumps(result_json, indent=2, ensure_ascii=False)
        print(json_output)
        # Write output JSON to file
        output_path = Path(args.output)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(json_output)
        print(f"\nSuccessfully created: {output_path}", file=sys.stderr)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    finally:
        # Clean up temporary files only if we created the temp directory
        # ourselves; a user-supplied --temp-dir is left untouched.
        if not args.temp_dir:
            import shutil
            try:
                shutil.rmtree(temp_dir)
                print(f"Cleaned up temporary directory: {temp_dir}", file=sys.stderr)
            except Exception as e:
                # Best-effort cleanup: a leftover temp dir is not fatal.
                print(f"Warning: Could not clean up temp directory {temp_dir}: {e}", file=sys.stderr)
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment