Created
February 18, 2026 14:33
-
-
Save smothiki/fd540f515951f680b3ac7e98f92533fd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Script to download model metadata files from S3 and create structured JSON output. | |
| Uses AWS CLI to download modelmetadata.json and metadata.yaml files from a single S3 path. | |
| Usage examples: | |
| # Basic usage for NGC | |
| python download_model_metadata.py \ | |
| --repo-type NGC \ | |
| --s3-path "s3://my-bucket/models/model1/" \ | |
| --output "ngc_model_metadata.json" | |
| # With AWS profile and region | |
| python download_model_metadata.py \ | |
| --repo-type HuggingFace \ | |
| --s3-path "s3://my-bucket/hf-models/" \ | |
| --aws-profile my-profile \ | |
| --aws-region us-west-2 \ | |
| --output "hf_model_metadata.json" | |
| # With custom S3 endpoint and CA bundle | |
| python download_model_metadata.py \ | |
| --repo-type NGC \ | |
| --s3-path "s3://my-bucket/models/" \ | |
| --endpoint-url "https://s3.example.com" \ | |
| --ca-bundle "/path/to/ca-bundle.crt" | |
| """ | |
import argparse
import json
import os
import shlex
import subprocess
import sys
import tempfile
from pathlib import Path
def run_aws_command(command, env=None):
    """Run an AWS CLI command line and return its stripped stdout.

    Args:
        command: Full command line as a single string (e.g. "aws s3 cp ...").
        env: Optional environment mapping for the subprocess (used to pass
            AWS_PROFILE / AWS_DEFAULT_REGION).

    Returns:
        The command's stdout with surrounding whitespace stripped.

    Exits the process with status 1 if the command returns non-zero.
    """
    try:
        # Tokenize with shlex and run with shell=False: this keeps the
        # string-based interface but removes the shell-injection risk of
        # interpolating S3 paths into a shell=True command line.
        result = subprocess.run(
            shlex.split(command),
            capture_output=True,
            text=True,
            check=True,
            env=env,
        )
        return result.stdout.strip()
    except subprocess.CalledProcessError as e:
        print(f"Error running AWS command: {command}", file=sys.stderr)
        print(f"Error: {e.stderr}", file=sys.stderr)
        sys.exit(1)
def download_file_from_s3(s3_path, local_path, endpoint_url=None, ca_bundle=None, env=None):
    """Download a single object from S3 using the AWS CLI.

    Args:
        s3_path: Full s3:// URI of the object to fetch.
        local_path: Destination path on the local filesystem.
        endpoint_url: Optional custom S3-compatible endpoint URL.
        ca_bundle: Optional path to a CA bundle for SSL verification.
        env: Optional environment mapping for the subprocess.

    Returns:
        local_path, once the file exists on disk.

    Raises:
        FileNotFoundError: If the CLI succeeded but the file is missing.
    """
    command_parts = ["aws", "s3", "cp"]
    if endpoint_url:
        command_parts += ["--endpoint-url", endpoint_url]
    if ca_bundle:
        command_parts += ["--ca-bundle", ca_bundle]
    command_parts += [s3_path, local_path]
    # Quote every token so paths containing spaces or shell metacharacters
    # survive the trip through the string-based command runner intact.
    command = " ".join(shlex.quote(part) for part in command_parts)
    # Progress goes to stderr so stdout remains pure JSON for piping.
    print(f"Downloading: {s3_path} -> {local_path}", file=sys.stderr)
    run_aws_command(command, env=env)
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"Failed to download file: {s3_path}")
    return local_path
def read_file_content(file_path):
    """Return the entire UTF-8 text content of *file_path*.

    Exits the process with status 1 if the file cannot be read.
    """
    try:
        return Path(file_path).read_text(encoding="utf-8")
    except Exception as e:
        print(f"Error reading file {file_path}: {e}", file=sys.stderr)
        sys.exit(1)
def create_model_metadata_json(repo_type, manifest_yaml_content, model_info_content):
    """Build the structured output document for one model.

    Args:
        repo_type: Repository type label (e.g. "NGC", "HuggingFace", "MLFlow").
        manifest_yaml_content: Raw text of the downloaded metadata.yaml.
        model_info_content: Raw text of the downloaded modelmetadata.json.

    Returns:
        dict with "model_repo_type" (upper-cased) and a
        "<repo_type_lowercase>_metadata" key holding both file contents.
    """
    # Every previously special-cased repo type (NGC, HUGGINGFACE, MLFLOW)
    # mapped to exactly f"{repo_type.lower()}_metadata" — the same rule as
    # the fallback — so the per-type branches were redundant and are gone.
    metadata_key = f"{repo_type.lower()}_metadata"
    return {
        "model_repo_type": repo_type.upper(),
        metadata_key: {
            "manifest_yaml": manifest_yaml_content,
            "model_info": model_info_content,
        },
    }
def main():
    """CLI entry point: download metadata.yaml and modelmetadata.json from a
    single S3 prefix, combine them into one structured JSON document, print
    it to stdout, and write it to the --output file.

    Exits with status 1 on any download/read failure. All progress messages
    go to stderr so stdout stays pure JSON for piping.
    """
    parser = argparse.ArgumentParser(
        description="Download model metadata files from S3 and create structured JSON"
    )
    parser.add_argument(
        "--repo-type",
        required=True,
        help="Repository type (e.g., NGC, HuggingFace, MLFlow)"
    )
    parser.add_argument(
        "--s3-path",
        required=True,
        help="S3 path containing both metadata.yaml and modelmetadata.json files (e.g., s3://bucket/path/to/model/)"
    )
    parser.add_argument(
        "--output",
        "-o",
        default="model_metadata_output.json",
        help="Output JSON file path (default: model_metadata_output.json)"
    )
    parser.add_argument(
        "--temp-dir",
        help="Temporary directory for downloads (default: system temp)"
    )
    parser.add_argument(
        "--aws-profile",
        help="AWS profile to use (optional)"
    )
    parser.add_argument(
        "--aws-region",
        help="AWS region to use (optional)"
    )
    parser.add_argument(
        "--endpoint-url",
        help="S3 endpoint URL for custom S3-compatible storage (e.g., https://s3.example.com)"
    )
    parser.add_argument(
        "--ca-bundle",
        help="Path to CA bundle file for SSL verification (e.g., /path/to/ca-bundle.crt)"
    )
    args = parser.parse_args()
    # Set AWS profile and region if provided — passed to the AWS CLI via the
    # subprocess environment rather than extra command-line flags.
    env = os.environ.copy()
    if args.aws_profile:
        env['AWS_PROFILE'] = args.aws_profile
    if args.aws_region:
        env['AWS_DEFAULT_REGION'] = args.aws_region
    # Create temporary directory for downloads. When the user supplies
    # --temp-dir we use (and keep) it; otherwise we mkdtemp and clean up.
    temp_dir = args.temp_dir or tempfile.mkdtemp()
    temp_path = Path(temp_dir)
    temp_path.mkdir(parents=True, exist_ok=True)
    try:
        print(f"Using temporary directory: {temp_dir}", file=sys.stderr)
        # Ensure S3 path ends with / (normalize any trailing-slash variant)
        s3_base_path = args.s3_path.rstrip('/') + '/'
        # Construct S3 paths for both files — the file names are fixed by
        # convention: metadata.yaml and modelmetadata.json under the prefix.
        metadata_yaml_s3 = s3_base_path + "metadata.yaml"
        model_metadata_s3 = s3_base_path + "modelmetadata.json"
        print(f"S3 base path: {s3_base_path}", file=sys.stderr)
        print(f"Looking for files:", file=sys.stderr)
        print(f"  - {metadata_yaml_s3}", file=sys.stderr)
        print(f"  - {model_metadata_s3}", file=sys.stderr)
        # Define local file paths
        metadata_yaml_local = temp_path / "metadata.yaml"
        model_metadata_local = temp_path / "modelmetadata.json"
        # Download files from S3 (each call exits the process on CLI failure)
        download_file_from_s3(
            metadata_yaml_s3,
            str(metadata_yaml_local),
            endpoint_url=args.endpoint_url,
            ca_bundle=args.ca_bundle,
            env=env
        )
        download_file_from_s3(
            model_metadata_s3,
            str(model_metadata_local),
            endpoint_url=args.endpoint_url,
            ca_bundle=args.ca_bundle,
            env=env
        )
        # Read file contents (raw text; no YAML/JSON parsing is performed)
        print("Reading downloaded files...", file=sys.stderr)
        manifest_yaml_content = read_file_content(metadata_yaml_local)
        model_info_content = read_file_content(model_metadata_local)
        # Create structured JSON
        print(f"Creating structured JSON for repo type: {args.repo_type}", file=sys.stderr)
        result_json = create_model_metadata_json(
            args.repo_type,
            manifest_yaml_content,
            model_info_content
        )
        # Print JSON to stdout (the only stdout output of the script)
        json_output = json.dumps(result_json, indent=2, ensure_ascii=False)
        print(json_output)
        # Write output JSON to file
        output_path = Path(args.output)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(json_output)
        print(f"\nSuccessfully created: {output_path}", file=sys.stderr)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    finally:
        # Clean up temporary files only if we created the temp directory
        # ourselves; a user-supplied --temp-dir is left untouched.
        if not args.temp_dir:
            import shutil
            try:
                shutil.rmtree(temp_dir)
                print(f"Cleaned up temporary directory: {temp_dir}", file=sys.stderr)
            except Exception as e:
                # Best-effort cleanup: a leftover temp dir is not fatal.
                print(f"Warning: Could not clean up temp directory {temp_dir}: {e}", file=sys.stderr)
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment