avataRL RL-Based Pretraining Plan

The Symphony of a Guess

How a Transformer Learns to Predict the Next Token

Notation (Mathematical Symbols Explained)

hₜ    vector at position t (like GPS coordinates for word #t)
eₖ    embedding for token k (the number list that represents word k)
ℝᵈ    d-dimensional real space (space with d coordinate axes, like 3D but with more dimensions)
α     attention weights (how much focus to put on each word)
∇θ    gradient with respect to θ (which direction to adjust each parameter θ)
⊙     element-wise product (multiply lists item by item: [1,2] ⊙ [3,4] = [3,8])
┌─────────────────────────────────────────────────────────────────────────────┐
│                    COMPLETE TRANSFORMER GLOSSARY                            │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  BASIC MATH:                                                               │
│  • Vector: List of numbers [1, 2, 3] representing a point in space        │
│  • Matrix: Grid of numbers [[1,2], [3,4]] like a spreadsheet              │
│  • Dot-product: [1,2]·[3,4] = 1×3 + 2×4 = 11 (measures alignment)        │
│  • Gradient: Direction pointing uphill (how to improve the model)          │
│  • Tensor: Multi-dimensional array (vector=1D, matrix=2D, tensor=3D+)     │
│                                                                             │
│  CORE CONCEPTS:                                                            │
│  • Token: Piece of text (word or subword) converted to an ID number       │
│  • Embedding: Lookup table giving each token coordinates in space         │
│  • Logits: Raw confidence scores (any real numbers)                       │
│  • Softmax: Converts any numbers to valid percentages that sum to 100%    │
│  • Sparse: Mostly zeros [0,0,1,0] vs Dense: many non-zeros [0.1,0.8,0.3]  │
│                                                                             │
│  ATTENTION MECHANISM:                                                      │
│  • Query (Q): "What am I looking for?" (like a search question)           │
│  • Key (K): "What do I offer?" (like a book's index entry)                │
│  • Value (V): "What do I actually contribute?" (like book's content)      │
│  • Multi-head: Multiple attention mechanisms running in parallel          │
│  • Causal mask: Prevents cheating by hiding future words                  │
│                                                                             │
│  POSITION & NORMALIZATION:                                                │
│  • RoPE: Rotary Position Embedding (like clock positions 12,1,2...)      │
│  • RMS Norm: Rescale numbers to similar sizes (like volume control)       │
│  • Residual: x + f(x) allows information highway around layers            │
│                                                                             │
│  LEARNING:                                                                 │
│  • Cross-entropy: "How surprised am I?" loss function                     │
│  • Perplexity: exp(loss) = "How many choices does model think it has?"    │
│  • AdamW: Smart optimizer with momentum and adaptive learning rates       │
│  • Weight decay: Rubber band pulling parameters toward zero               │
│  • Autoregressive: Generating one token at a time, left to right          │
│                                                                             │
│  ARCHITECTURE:                                                             │
│  • FFN: Feed-Forward Network (384→1536→384 per-token computer)           │
│  • ReLU²: Activation function (replace negatives with 0, then square)     │
│  • KL-divergence: How different two probability distributions are         │
│  • Manifold: Curved surface where natural language patterns live          │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

🎼 Prelude: The First Breath

Imagine a single bar of music, four quiet notes:

"the   quick   brown   —"

The transformer inhales them, holds the silence where the fifth should be, then exhales a chord of probabilities in which fox rings almost inevitable. This is not memorization. This is not calculation. This is the sculpture of meaning itself, carved from the mathematics of attention.

To understand how that chord is composed, we must first see how music becomes mathematics.

Movement I: The Score — When Words Become Geometry

Every symphony begins with the act of notation. Raw text dissolves into tokens (individual pieces of text like words or subwords), tokens lift into vectors (lists of numbers representing points in space), vectors arrange themselves into a universe of meaning. But why this sequence? Why not work directly with the tokens?

┌─────────────────────────────────────────────────────────────────────────────┐
│                      TEXT → TOKENS → VECTORS                                │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Raw Text:  "The quick brown fox jumps"                                    │
│      ↓                                                                      │
│  Tokens:    ["The", "quick", "brown", "fox", "jumps"]                     │
│      ↓                                                                      │
│  Token IDs: [1629, 4662, 17041, 21831, 35308]                             │
│      ↓                                                                      │
│  Vectors:   Each token → 384-dimensional vector                           │
│                                                                             │
│      1629 → [0.12, -0.45, 0.78, 0.23, ..., 0.91]  (384 numbers)          │
│      4662 → [0.34, -0.12, 0.56, 0.89, ..., 0.45]  (384 numbers)          │
│     17041 → [0.67, -0.23, 0.34, 0.12, ..., 0.78]  (384 numbers)          │
│     21831 → [0.89, -0.56, 0.12, 0.34, ..., 0.23]  (384 numbers)          │
│     35308 → [0.23, -0.78, 0.91, 0.56, ..., 0.12]  (384 numbers)          │
│                                                                             │
│  Result: Each word becomes a point in 384-dimensional space                │
│          Similar words cluster together in this space                      │
└─────────────────────────────────────────────────────────────────────────────┘
# The opening movement: text becomes mathematics
# Using custom BPE tokenizer for better compression
def get_batch(split):  # Lines 552-565 in train_modal_standalone.py
    if split == 'train':
        data = np.memmap('/data/shakespeare_tokens_bpe1024/train.bin', dtype=np.uint16, mode='r')  # Line 555
    else:
        data = np.memmap('/data/shakespeare_tokens_bpe1024/val.bin', dtype=np.uint16, mode='r')    # Line 557
    
    # Sample random positions in the dataset
    ix = torch.randint(len(data) - cfg['block_size'], (cfg['batch_size'],))                      # Line 558
    x = torch.stack([torch.from_numpy((data[i:i+cfg['block_size']]).astype(np.int64)) for i in ix])        # Line 559
    y = torch.stack([torch.from_numpy((data[i+1:i+1+cfg['block_size']]).astype(np.int64)) for i in ix])    # Line 560
    return x, y  # context and targets
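
The one-position shift between x and y is what turns raw text into a next-token prediction task: every position in x is trained to predict the token sitting one step later in y. A minimal, self-contained sketch with a toy token sequence (illustrative only, not taken from the training script) makes the alignment explicit:

# Hypothetical toy example of the x/y shift (not from train_modal_standalone.py)
import numpy as np
import torch

data = np.array([1629, 4662, 17041, 21831, 35308], dtype=np.uint16)  # "the quick brown fox jumps"
block_size = 4
i = 0  # start of one sampled window

x = torch.from_numpy(data[i:i + block_size].astype(np.int64))          # context: [1629, 4662, 17041, 21831]
y = torch.from_numpy(data[i + 1:i + 1 + block_size].astype(np.int64))  # targets: [4662, 17041, 21831, 35308]

for t in range(block_size):
    print(f"given {x[:t + 1].tolist()} -> predict {y[t].item()}")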

The first insight: integer tokens are too sparse (meaning mostly empty - like a vector [0,0,1,0,0] that has only one non-zero entry). Token 27 ("the") and token 28 ("then") are numerically close (just 1 apart) but mean completely different things. We need a representation where similar meanings are actually close together.

┌─────────────────────────────────────────────────────────────────────────────┐
│                        SPARSE vs DENSE REPRESENTATIONS                      │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Sparse Representation (bad for AI):                                       │
│  "cat" = [0, 0, 1, 0, 0, 0, ...]  ← only one "1", rest are zeros          │
│  "dog" = [0, 0, 0, 1, 0, 0, ...]  ← completely different, no similarity    │
│                                                                             │
│  Problem: "cat" and "dog" look totally unrelated                           │
│                                                                             │
│  Dense Representation (good for AI):                                       │
│  "cat" = [0.2, 0.8, 0.9, 0.1, ...]  ← many non-zero numbers               │
│  "dog" = [0.3, 0.7, 0.8, 0.2, ...]  ← similar to cat (both animals)       │
│                                                                             │
│  Benefit: Similar meanings have similar numbers                            │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

Each token k lifts into high-dimensional space through an embedding table (think: a giant lookup dictionary where each word has its own unique coordinates in a 384-dimensional space). Why this lookup? If "fox" and "dog" often appear in similar contexts ("The ___ runs fast"), gradient descent (the learning algorithm that adjusts the model) will nudge their vectors toward each other in this space. The model learns to reuse what it knows about one animal when predicting another. Dense vectors (number lists with many non-zero values) capture similarity; sparse integers (mostly zeros) cannot.

Intuition: Imagine every word as a point in a vast space where similar words cluster together. "Cat" and "dog" might be close to each other, while "cat" and "equation" are far apart. This spatial representation lets the model reason about meaning through geometry.

# Token to vector: capturing similarity
# In our GPT model, this is the wte (word token embedding) layer
self.wte = nn.Embedding(vocab_size, n_emb)  # Line 254: 1024 vocab -> 384 dimensions
token_vectors = self.wte(tokens)            # Line 283: [27, 412, 51, 843] → shape (4, 384)
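
To make "similar words cluster together" concrete, here is a small hypothetical probe of an embedding table with the same shape as wte. The token IDs are made up, and at random initialization the similarities are meaningless; the clustering only emerges once training has nudged co-occurring tokens toward each other:

# Hypothetical probe; token IDs are illustrative, not from the real BPE tokenizer
import torch
import torch.nn.functional as F

wte = torch.nn.Embedding(1024, 384)   # same shape as the model's embedding table
fox_id, dog_id, the_id = 3, 7, 42     # illustrative IDs

fox, dog, the = wte(torch.tensor([fox_id, dog_id, the_id]))

# Cosine similarity: 1.0 = same direction, 0.0 = unrelated, -1.0 = opposite
print(F.cosine_similarity(fox, dog, dim=0))  # after training, expected to be higher...
print(F.cosine_similarity(fox, the, dim=0))  # ...than this one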

But now we face a deeper problem. A pure set of vectors is orderless—the dot-product (multiply corresponding elements and sum them up: [1,2]·[3,4] = 1×3 + 2×4 = 11) sees only which token, never where. Unlike RNNs (Recurrent Neural Networks that read sequences one word at a time like humans), self-attention (the core mechanism in transformers) looks at all positions simultaneously. It needs explicit position information.

┌─────────────────────────────────────────────────────────────────────────────┐
│                      WHY WORD ORDER MATTERS                                 │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Consider these sentences:                                                  │
│  "The dog chased the cat" ≠ "The cat chased the dog"                      │
│                                                                             │
│  Same words, completely different meaning!                                 │
│                                                                             │
│  RNN approach (like reading a book):                                       │
│  Read "The" → remember it                                                  │
│  Read "dog" → remember "The dog"                                           │
│  Read "chased" → remember "The dog chased"                                 │
│  Read "the" → remember "The dog chased the"                                │
│  Read "cat" → understand full sentence                                     │
│                                                                             │
│  Transformer approach (like looking at entire page at once):              │
│  See all words: ["The", "dog", "chased", "the", "cat"]                   │
│  Problem: Without position info, it just sees a bag of words              │
│  Solution: Add position encoding so model knows word order                │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────────────┐
│                           THE POSITION PROBLEM                              │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Without Position Info:                                                     │
│  ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐                           │
│  │ Vector  │ │ Vector  │ │ Vector  │ │ Vector  │                           │
│  │ "The"   │ │ "quick" │ │ "brown" │ │ "fox"   │                           │
│  └─────────┘ └─────────┘ └─────────┘ └─────────┘                           │
│       ↑           ↑           ↑           ↑                                │
│    These could be in ANY order! Model can't tell                          │
│                                                                             │
│  With Position Info (RoPE):                                                │
│  ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐                           │
│  │ Vector  │ │ Vector  │ │ Vector  │ │ Vector  │                           │
│  │ "The"   │ │ "quick" │ │ "brown" │ │ "fox"   │                           │
│  │ @Pos 0  │ │ @Pos 1  │ │ @Pos 2  │ │ @Pos 3  │                           │
│  │ Rot 0°  │ │ Rot 15° │ │ Rot 30° │ │ Rot 45° │                           │
│  └─────────┘ └─────────┘ └─────────┘ └─────────┘                           │
│       ↑           ↑           ↑           ↑                                │
│    Each vector rotated by position-dependent angle                        │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

The solution: Rotary Position Embedding (RoPE). Instead of adding position numbers to vectors, we rotate the vectors by position-dependent angles. This creates a multiplicative position encoding that naturally decays with distance.

┌─────────────────────────────────────────────────────────────────────────────┐
│                             RoPE EXPLAINED                                  │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Problem: How do we tell the model "this word is at position 3"?           │
│                                                                             │
│  Old way (Adding): vector + position_number                                │
│  • Word vector: [0.5, 0.8]                                                │
│  • Position 3: [0.3, 0.0]                                                 │
│  • Result: [0.8, 0.8] ← Just addition                                     │
│                                                                             │
│  RoPE way (Rotating): Spin the vector by position angle                   │
│  • Position 0: No rotation (0°)                                           │
│  • Position 1: Small rotation (15°)                                       │
│  • Position 2: Bigger rotation (30°)                                      │
│  • Position 3: Even bigger (45°)                                          │
│                                                                             │
│  Think of it like a clock:                                                 │
│  • 12 o'clock = Position 0                                                │
│  • 1 o'clock = Position 1                                                 │
│  • 2 o'clock = Position 2                                                 │
│  • Nearby times are close, distant times are far apart                    │
│                                                                             │
│  Benefit: Nearby positions naturally have similar "rotations"             │
│  Words close together will attend to each other more                       │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

Intuition: Think of each position as a compass direction. Position 0 points north, position 1 points slightly northeast, position 2 points more northeast, etc. By rotating each word's "attention vector" based on its position, nearby words naturally attend to each other more than distant ones, just like how nearby compass directions are more similar.

# Position matters: RoPE rotates Q and K by position-dependent angles
class RotaryCache(nn.Module):  # Lines 68-81 in train_modal_standalone.py
    def __init__(self, head_dim: int, max_len: int):                                     # Line 69
        super().__init__()                                                               # Line 70
        inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2) / head_dim))          # Line 71
        t = torch.arange(max_len)                                                        # Line 72
        freqs = torch.einsum("i,j->ij", t, inv_freq)                                    # Line 73
        sin, cos = freqs.sin(), freqs.cos()                                             # Line 74
        self.register_buffer("sin_base", sin, persistent=False)                         # Line 75
        self.register_buffer("cos_base", cos, persistent=False)                         # Line 76

# The fundamental equation: meaning + rotated position
# h = token_vectors (no positional addition!)
# Position encoded via rotation during attention
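
The class above builds the sin/cos tables, but the excerpt never shows _rotate_half or how those tables meet the queries and keys. A minimal sketch of the missing pieces, following the standard half-split RoPE formulation (an assumption about this codebase, not a quote from it):

import torch

def _rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension in half and swap the halves with a sign flip:
    # (x1, x2) -> (-x2, x1). Combined with the sin/cos tables this rotates
    # each pair of features by a position-dependent angle.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    # sin/cos come out of RotaryCache with shape (seq_len, head_dim // 2);
    # duplicate them so they line up with the full head dimension.
    sin = torch.cat((sin, sin), dim=-1)
    cos = torch.cat((cos, cos), dim=-1)
    return (q * cos) + (_rotate_half(q) * sin)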

The model now possesses a geometry of narrative: each token knows not just what it is, but where it stands in the unfolding sentence. "The" at position 0 and "the" at position 10 occupy different regions of this space, carrying their temporal context.

This is the opening chord—meaning and position unified. Now comes the conversation between the notes.

Movement II: Attention — The Conversation of Voices

In a symphony, instruments don't play in isolation; they listen to each other, respond, harmonize. Self-attention is the transformer's way of letting tokens converse across the sequence. As Vaswani et al. noted in their groundbreaking paper "Attention is All You Need": "The first is a multi-head self-attention mechanism" that revolutionized how models process sequences¹.

But why this particular mechanism? As Jay Alammar explains in The Illustrated Transformer: "Self-attention is the method the Transformer uses to bake the 'understanding' of other relevant words into the one we're currently processing"². The key insight is that "when the model is processing the word 'it', self-attention allows it to associate 'it' with 'animal'" - creating contextual understanding that was impossible with previous architectures.

┌─────────────────────────────────────────────────────────────────────────────┐
│                           ATTENTION MECHANISM                               │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Token "fox" asking: "What modifies me?"                                   │
│                                                                             │
│      ┌─────┐      ┌─────┐      ┌─────┐      ┌─────┐                        │
│      │"The"│      │"quick"     │"brown"     │"fox" │                        │
│      │     │      │     │      │     │      │ ??? │                        │
│      └──┬──┘      └──┬──┘      └──┬──┘      └──┬──┘                        │
│         │            │            │            │                            │
│         ▼            ▼            ▼            ▼                            │
│    ┌────────┐   ┌────────┐   ┌────────┐   ┌────────┐                       │
│    │10% attn│   │20% attn│   │60% attn│   │10% self│                       │
│    └────────┘   └────────┘   └────────┘   └────────┘                       │
│         │            │            │            │                            │
│         └────────────┼────────────┼────────────┘                            │
│                      │            │                                         │
│                      ▼            ▼                                         │
│               ┌─────────────────────────┐                                   │
│               │ Weighted Combination:   │                                   │
│               │ 10%"The" + 20%"quick"   │                                   │
│               │ + 60%"brown" + 10%"fox" │                                   │
│               │ = Enhanced "fox" vector │                                   │
│               └─────────────────────────┘                                   │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

The challenge: each token needs to aggregate information from all relevant previous tokens. A naive approach would be to learn fixed weights for each position pair, but that doesn't generalize (work well on new, unseen data) to unseen sequence lengths. Instead, we make the connections content-dependent.

Alammar emphasizes the scoring mechanism: "The score determines how much focus to place on other parts of the input sequence as we encode a word at a certain position"². The resulting "softmax score determines how much each word will be expressed at this position" - a weighted combination where "the intuition here is to keep intact the values of the word(s) we want to focus on, and drown-out irrelevant words"².

Each position broadcasts three signals, each serving a distinct purpose in the attention dance:

  • A query Q: "What am I looking for?" — The question each position asks
  • A key K: "What do I have to offer?" — The advertisement each position broadcasts
  • A value V: "What do I actually contribute?" — The payload each position carries
┌─────────────────────────────────────────────────────────────────────────────┐
│                        QUERY-KEY-VALUE MECHANISM                            │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Networking Event Analogy:                                                 │
│                                                                             │
│  You (fox token):                                                          │
│  ┌─────────────────┐                                                       │
│  │ QUERY (Q):      │ "I'm looking for adjectives that describe me"         │
│  │ "Looking for    │                                                       │
│  │  descriptors"   │                                                       │
│  └─────────────────┘                                                       │
│                                                                             │
│  Others at the event:                                                      │
│                                                                             │
│  ┌─────────┐  ┌─────────┐  ┌─────────┐  ┌─────────┐                        │
│  │ "The"   │  │ "quick" │  │ "brown" │  │ "jumps" │                        │
│  ├─────────┤  ├─────────┤  ├─────────┤  ├─────────┤                        │
│  │KEY (K): │  │KEY (K): │  │KEY (K): │  │KEY (K): │                        │
│  │"I'm an  │  │"I'm a   │  │"I'm a   │  │"I'm a   │                        │
│  │article" │  │speed    │  │color    │  │verb"    │                        │
│  │         │  │adjective│  │adjective│  │         │                        │
│  ├─────────┤  ├─────────┤  ├─────────┤  ├─────────┤                        │
│  │VALUE(V):│  │VALUE(V):│  │VALUE(V):│  │VALUE(V):│                        │
│  │Grammar  │  │Motion   │  │Visual   │  │Action   │                        │
│  │info     │  │concepts │  │concepts │  │concepts │                        │
│  └─────────┘  └─────────┘  └─────────┘  └─────────┘                        │
│       ↓            ↓            ↓            ↓                              │
│   No match    Good match!  Great match!  No match                          │
│     5%           20%          70%          5%                               │
│                                                                             │
│  Final blend: 5%×Grammar + 20%×Motion + 70%×Visual + 5%×Action             │
│               = Rich "fox" representation with color emphasis               │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

Real-world analogy: Imagine you're at a networking event. Your query is "I'm looking for someone who knows about web development." Others broadcast their keys like "I'm a React expert" or "I know Python." But their values are their actual expertise and advice they can share. You match based on keys, but extract knowledge from values.

The key insight: keys and values are fundamentally different yet inseparable. Keys are used for matching — they determine the attention weights through similarity with queries. Values are used for mixing — they are what actually gets blended together based on those weights.

Think of it as a library: each book has a key (its catalog entry describing what it contains) and a value (its actual content). You search using the key, but you read the value. The attention mechanism separates these roles — allowing a token to advertise one thing (key) while contributing something potentially different (value).

# Three perspectives on each token - using fused QKV projection
def attention(self, x: torch.Tensor):  # Lines 177-197 in train_modal_standalone.py
    B, T, C = x.shape                                                      # Line 178
    
    # Single linear layer projects to Q, K, V simultaneously
    qkv = self.qkv(x).reshape(B, T, 3, self.n_head, self.head_dim)        # Line 183
    q, k, v = qkv.unbind(dim=2)  # Split into Q, K, V                     # Line 185
    q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)     # Line 186
    
    # head_dim = n_emb // n_heads (384 // 6 = 64)
    # This gives us 6 heads of 64 dimensions each

The core operation computes similarity between queries and keys:

    # Apply RoPE rotation to Q and K
    sin, cos = self.rope(T)                          # Line 188 in train_modal_standalone.py
    q = (q * cos) + (_rotate_half(q) * sin)          # Line 189
    k = (k * cos) + (_rotate_half(k) * sin)          # Line 190
    
    # RMS normalization instead of scaling
    q, k = norm(q), norm(k)  # RMS norm for stability # Line 192

The RMS normalization serves a similar purpose to the √dₖ scaling in classical attention.

┌─────────────────────────────────────────────────────────────────────────────┐
│                          RMS NORMALIZATION                                  │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Problem: Vector numbers can become too big or too small                   │
│  Example: [100, 200, 0.1, 50] ← Numbers are all different scales          │
│                                                                             │
│  RMS Norm = Root Mean Square normalization                                 │
│  Step 1: Square all numbers: [10000, 40000, 0.01, 2500]                  │
│  Step 2: Take average (mean): (10000+40000+0.01+2500)/4 = 13125           │
│  Step 3: Take square root: √13125 = 114.6                                 │
│  Step 4: Divide original by this: [100/114.6, 200/114.6, 0.1/114.6, ...]  │
│  Result: [0.87, 1.75, 0.001, 0.44] ← All numbers now similar scale        │
│                                                                             │
│  Why this helps:                                                           │
│  • Prevents any number from dominating others                              │
│  • Makes gradients more stable (learning doesn't explode)                 │
│  • Like adjusting volume levels so all instruments can be heard            │
│                                                                             │
│  Think of it as: "Make sure all numbers are roughly the same size"         │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

RMS norm rescales vectors to have consistent scale, stabilizing gradients (directions of steepest change that guide learning). This is crucial because RoPE can amplify or diminish the magnitude of vectors depending on their position.

According to Zhang & Sennrich's research, "RMSNorm regularizes the summed inputs to a neuron in one layer according to their root mean square" and provides "re-scaling invariance and implicit learning rate adaptation"³. Their key insight is that "re-centering invariance in LayerNorm is dispensable" - the mean-centering step in LayerNorm isn't actually necessary, making RMSNorm both simpler and more efficient, "reducing running time by 7%~64% on different models while achieving comparable performance"³.
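
The norm helper called in the snippets (norm(q), norm(k), and later norm(x)) is not shown in this excerpt. A minimal functional RMS-norm sketch consistent with the walkthrough above (the real code may add a learned scale parameter):

import torch

def norm(x: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Divide by the root-mean-square of the last dimension so every vector
    # ends up at roughly unit scale, as in the worked example above.
    rms = x.pow(2).mean(dim=-1, keepdim=True).add(eps).sqrt()
    return x / rms

print(norm(torch.tensor([100.0, 200.0, 0.1, 50.0])))  # ≈ [0.87, 1.75, 0.001, 0.44]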

    # Flash attention: fused, memory-efficient implementation
    # The causal mask prevents tokens from seeing future tokens
    out = F.scaled_dot_product_attention(                      # Line 194 in train_modal_standalone.py
        q, k, v, 
        is_causal=True,  # Causal mask: tokens can only attend to previous positions
        dropout_p=self.dropout.p if self.training else 0.0
    )
    

┌─────────────────────────────────────────────────────────────────────────────┐
│                      CAUSAL MASK: NO CHEATING ALLOWED!                      │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  GOAL: Predict the next token without seeing the future                     │
│                                                                             │
│  WITHOUT CAUSAL MASK (Cheating Mode):                                       │
│  Input sequence: ["The", "quick", "brown", "fox", "jumps"]                  │
│  Predicting after "fox", the model can see every token, including "jumps"   │
│  Prediction: "jumps" ← Too easy! It simply copies the answer it can see     │
│  Problem: The model learns to cheat, not to understand language             │
│                                                                             │
│  WITH CAUSAL MASK (Fair Training):                                          │
│  Model can see: [The] [quick] [brown] [fox] [MASKED]                        │
│  Prediction: "jumps" ← It had to learn real patterns                        │
│  Benefit: The model learns genuine language understanding                   │
│                                                                             │
│  ATTENTION MATRIX: Who Can See Whom                                         │
│                                                                             │
│            The   quick brown  fox  jumps                                    │
│         ┌─────┬─────┬─────┬─────┬─────┐                                     │
│  The    │  ✓  │ -∞  │ -∞  │ -∞  │ -∞  │ ← Only sees itself                  │
│  quick  │  ✓  │  ✓  │ -∞  │ -∞  │ -∞  │ ← Sees "The" + itself               │
│  brown  │  ✓  │  ✓  │  ✓  │ -∞  │ -∞  │ ← Sees "The", "quick" + itself      │
│  fox    │  ✓  │  ✓  │  ✓  │  ✓  │ -∞  │ ← Sees all previous + itself        │
│  jumps  │  ✓  │  ✓  │  ✓  │  ✓  │  ✓  │ ← Sees everything (last token)      │
│         └─────┴─────┴─────┴─────┴─────┘                                     │
│                                                                             │
│  ✓ = Allowed attention   -∞ = Masked (blocked with negative infinity)       │
│  Lower-triangular matrix = each token sees only the past + itself           │
│                                                                             │
│  IMPLEMENTATION DETAIL:                                                     │
│  1. Compute attention scores: Q @ K^T                                       │
│  2. Apply causal mask: set the upper triangle to -∞                         │
│  3. Softmax: -∞ becomes 0 probability                                       │
│  4. Result: future tokens get zero attention weight                         │
│                                                                             │
│  Mathematical effect:                                                       │
│  softmax([-∞, -∞, 2.1, 3.2]) ≈ [0.0, 0.0, 0.25, 0.75]                       │
│                                                                             │
│  The causal mask enables autoregressive generation! 🎯                      │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘


    # Why causal masking? In language modeling, we predict the next token
    # based only on previous tokens. If "fox" could see "jumps" (the next word),
    # prediction would be trivial. The causal mask ensures fair prediction.
    
    # Reshape back to original dimensions
    out = out.transpose(1, 2).contiguous().view(B, T, C)       # Line 195
    return self.o_proj(out)  # Final linear projection          # Line 197

The attention weights create a weighted average, but here's where the key/value separation becomes powerful:

Example: "the quick brown fox"

  • Token "fox" has a query asking "what modifies me?"
  • Token "brown" has a key advertising "I'm a color adjective"
  • Token "brown" has a value containing rich color semantics
  • Token "quick" has a key advertising "I'm a speed adjective"
  • Token "quick" has a value containing motion/speed concepts

The keys determine the attention pattern (fox attends 70% to brown, 20% to quick), but the values determine what information actually flows (color semantics and speed concepts blend into fox's representation). This separation allows the model to learn "what to attend to" (key matching) independently from "what to extract" (value content).
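
A tiny numeric sketch of that separation, with made-up 2-D keys and 3-D values for a single head and no scaling: the keys only set the attention weights, while the values supply what is actually mixed:

# Made-up numbers for illustration only
import torch
import torch.nn.functional as F

keys = torch.tensor([[0.1, 0.0],    # "the"
                     [0.6, 0.2],    # "quick"
                     [0.9, 0.4],    # "brown"
                     [0.2, 0.1]])   # "fox" itself
values = torch.tensor([[1.0, 0.0, 0.0],   # grammar-flavoured features
                       [0.0, 1.0, 0.0],   # motion-flavoured features
                       [0.0, 0.0, 1.0],   # colour-flavoured features
                       [0.3, 0.1, 0.1]])  # fox's own features

query = torch.tensor([0.8, 0.5])          # "fox" asking "what modifies me?"

weights = F.softmax(keys @ query, dim=0)  # keys decide how much attention each token gets
mixed = weights @ values                  # values decide what information actually flows
print(weights)  # "brown" gets the largest share
print(mixed)    # fox's new representation: a value blend dominated by colour features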

But why multiple attention heads? Because different aspects of language require different types of attention. Empirically, head 3 in GPT-2 fires on closing brackets—its attention matrix lights up between every '(' and ')'. This emerges because predicting a closing bracket is easy once you know where the opener sits; one head can dedicate itself to that pattern while others chase syntax or long-range coreference.

As Alammar notes, multi-head attention "expands the model's ability to focus on different positions" and "gives the attention layer multiple 'representation subspaces'"². The original Transformer paper demonstrated that "The Transformer outperforms the Google Neural Machine Translation model in specific tasks. The biggest benefit, however, comes from how The Transformer lends itself to parallelization"².

Intuition: Think of attention heads like different specialists in a team. One person tracks grammatical structure ("where are the subjects and verbs?"), another tracks references ("what does 'it' refer to?"), and another tracks punctuation patterns. Each specialist can focus on their expertise while all contribute to understanding the sentence.

# Multi-head attention: parallel conversations (6 heads in our model)
class OptimizedAttention(nn.Module):  # Lines 142-158 in train_modal_standalone.py
    def __init__(self, n_emb: int, n_head: int, context_len: int, dropout: float = 0.1):  # Line 143
        super().__init__()                                                                 # Line 144
        self.n_head = n_head  # 6 heads                                                   # Line 145
        self.n_emb = n_emb    # 384 dimensions                                            # Line 146
        self.head_dim = n_emb // n_head  # 64 dimensions per head                        # Line 147
        self.qkv = nn.Linear(n_emb, 3 * n_emb, bias=False)  # Fused QKV projection      # Line 148
        self.o_proj = nn.Linear(n_emb, n_emb, bias=False)   # Output projection         # Line 149
        self.rope = RotaryCache(self.head_dim, context_len)  # RoPE cache                # Line 152

Six conversations happen simultaneously in our model, each capturing different linguistic relationships. The result is a richer, more nuanced understanding of how tokens relate to each other.

Movement III: The Deepening — How Layers Build Understanding

A single attention layer creates one moment of conversation. But understanding deepens through repetition, through layers of increasingly sophisticated dialogue. Why stack layers at all?

┌─────────────────────────────────────────────────────────────────────────────┐
│                          LAYER-BY-LAYER UNDERSTANDING                       │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Input: "the quick brown fox"                                               │
│                                                                             │
│  Layer 1 (Local Patterns):                                                 │
│  ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐                                          │
│  │"the"│→│"quick"│→│"brown"│→│"fox"│                                          │
│  └─────┘ └─────┘ └─────┘ └─────┘                                          │
│                     ↗          ↑                                            │
│                   High attn   Focus here                                    │
│  Result: "fox" learns about its immediate modifier "brown"                 │
│                                                                             │
│  Layer 2 (Medium-Range Dependencies):                                      │
│  ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐                                          │
│  │"the"│ │"quick"│ │"brown"│ │"fox"│                                          │
│  └─────┘ └─────┘ └─────┘ └─────┘                                          │
│            ↗                 ↑                                              │
│          Med attn           Focus here                                      │
│  Result: "fox" learns about speed quality "quick"                         │
│                                                                             │
│  Layer 3 (Long-Range Grammar):                                             │
│  ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐                                          │
│  │"the"│ │"quick"│ │"brown"│ │"fox"│                                          │
│  └─────┘ └─────┘ └─────┘ └─────┘                                          │
│     ↗                         ↑                                            │
│   Low attn                   Focus here                                     │
│  Result: "fox" learns it's the main noun (from determiner "the")          │
│                                                                             │
│  Final Understanding: "fox" = brown + quick + definite noun               │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

Consider the phrase "the quick brown fox." In layer 1, "fox" might attend to "brown" (adjacent modifier). In layer 2, it might attend to "quick" (distant modifier). In layer 3, it might attend to "the" (grammatical determiner). Each layer allows for more complex dependency parsing.

┌─────────────────────────────────────────────────────────────────────────────┐
│                      TRANSFORMER BLOCK INTERNALS                            │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Pre-Norm Architecture (used in our model):                                │
│                                                                             │
│  Input x: [batch, seq_len, 384]                                           │
│    │                                                                        │
│    ▼                                                                        │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ ATTENTION SUB-BLOCK                                                     │
│  │                                                                         │
│  │ x_residual = x  ←────────────────────────────────┐ (SKIP CONNECTION)   │
│  │    │                                             │                     │
│  │    ▼                                             │                     │
│  │ ┌─────────────────────────────────────────────┐   │                     │
│  │ │ norm(x)  ← RMS Normalization                │   │                     │
│  │ │    ↓                                        │   │                     │
│  │ │ Multi-Head Attention:                       │   │                     │
│  │ │   ├─ x → Q, K, V projections                │   │                     │
│  │ │   ├─ Apply RoPE to Q, K                     │   │                     │
│  │ │   ├─ Compute attention: softmax(QK^T)       │   │                     │
│  │ │   ├─ Apply causal mask                      │   │                     │
│  │ │   └─ Aggregate: attention_weights @ V       │   │                     │
│  │ │    ↓                                        │   │                     │
│  │ │ attention_output                             │   │                     │
│  │ └─────────────────────────────────────────────┘   │                     │
│  │    │                                             │                     │
│  │    ▼                                             │                     │
│  │    +  ←──────────────────────────────────────────┘                     │
│  │    │                                                                   │
│  │    ▼                                                                   │
│  │ x₁ = x + attention_output  (First residual connection)                 │
│  └─────────────────────────────────────────────────────────────────────────┤
│    │                                                                        │
│    ▼                                                                        │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ FEED-FORWARD SUB-BLOCK                                                 │
│  │                                                                         │
│  │ x₁_residual = x₁  ←──────────────────────────────┐ (SKIP CONNECTION)   │
│  │    │                                             │                     │
│  │    ▼                                             │                     │
│  │ ┌─────────────────────────────────────────────┐   │                     │
│  │ │ norm(x₁)  ← RMS Normalization               │   │                     │
│  │ │    ↓                                        │   │                     │
│  │ │ Feed-Forward Network:                       │   │                     │
│  │ │   ├─ Linear: 384 → 1536                    │   │                     │
│  │ │   ├─ ReLU²: activation                     │   │                     │
│  │ │   ├─ Linear: 1536 → 384                    │   │                     │
│  │ │   └─ Dropout                               │   │                     │
│  │ │    ↓                                        │   │                     │
│  │ │ ffn_output                                  │   │                     │
│  │ └─────────────────────────────────────────────┘   │                     │
│  │    │                                             │                     │
│  │    ▼                                             │                     │
│  │    +  ←──────────────────────────────────────────┘                     │
│  │    │                                                                   │
│  │    ▼                                                                   │
│  │ x₂ = x₁ + ffn_output  (Second residual connection)                     │
│  └─────────────────────────────────────────────────────────────────────────┤
│    │                                                                        │
│    ▼                                                                        │
│  Output x₂: [batch, seq_len, 384]                                         │
│                                                                             │
│  Key Properties:                                                           │
│  • Two residual connections per block                                      │
│  • RMS normalization before (not after) each sub-layer                    │
│  • Information can flow directly through skip connections                  │
│  • Each sub-layer adds refinements to the representation                   │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
# The deepening: layer by layer, meaning grows
class TransformerBlock(nn.Module):  # Lines 227-242 in train_modal_standalone.py
    def __init__(self, n_emb: int, n_head: int, context_len: int, dropout: float = 0.1):  # Line 228
        super().__init__()                                                                 # Line 229
        self.attn = OptimizedAttention(n_emb, n_head, context_len, dropout)               # Line 230
        # Pre-norm architecture: normalize before, not after
        self.ffn = nn.Sequential(                                                          # Line 232
            nn.Linear(n_emb, 4 * n_emb, bias=False),      # 384 -> 1536                  # Line 233
            ReLUSquared(),                                 # ReLU^2 activation            # Line 234
            nn.Linear(4 * n_emb, n_emb, bias=False),      # 1536 -> 384                  # Line 235
            nn.Dropout(dropout)                                                            # Line 236
        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:  # Line 239
        # Pre-norm: normalize input before attention and FFN
        x = x + self.attn(norm(x))  # RMS norm, not LayerNorm  # Line 240
        x = x + self.ffn(norm(x))   # RMS norm, not LayerNorm  # Line 241
        return x                                                # Line 242
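
ReLUSquared is referenced above but not defined in this excerpt. Given the glossary entry ("replace negatives with 0, then square"), a minimal sketch would be:

import torch
import torch.nn as nn
import torch.nn.functional as F

class ReLUSquared(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Zero out the negatives, then square what remains
        return F.relu(x).square()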

The residual connections (x + ...) are crucial. They create skip paths that allow gradients to flow directly from later layers to earlier ones, preventing the vanishing gradient problem (where gradients become too small to update early layers effectively) that plagued deep networks.

The ResNet paper by He et al. was revolutionary in showing that "skip connections solve the degradation problem" by enabling training of networks with hundreds of layers⁴. As machine learning researchers note: "residual connections fix the problem" of vanishing gradients by providing "a gradient highway" that ensures "gradient flows backwards through each step" without degradation⁴.

┌─────────────────────────────────────────────────────────────────────────────┐
│                          RESIDUAL CONNECTIONS                               │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Without Residual Connections (Traditional Deep Network):                  │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Input                                                                   │
│  │   ↓                                                                     │
│  │ Layer 1 → output₁                                                      │
│  │   ↓                                                                     │
│  │ Layer 2 → output₂                                                      │
│  │   ↓                                                                     │
│  │ Layer 3 → output₃                                                      │
│  │   ↓                                                                     │
│  │ ... (gradient gets weaker and weaker going backwards)                  │
│  │   ↓                                                                     │
│  │ Final Output (early layers barely learn!)                              │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  With Residual Connections (ResNet/Transformer Architecture):              │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Input x                                                                 │
│  │   ↓                                                                     │
│  │   ┌─────────────────────────────────────────────────────────────────┐   │
│  │   │ ┌─────────────────────────────────────────────────────────────┐ │   │
│  │   │ │                    Layer 1                                  │ │   │
│  │   │ │              (Attention/FFN)                                │ │   │
│  │   │ │                      ↓                                      │ │   │
│  │   │ │                 processed_x                                 │ │   │
│  │   │ └─────────────────────────────────────────────────────────────┘ │   │
│  │   └─────────────────────────┬───────────────────────────────────────┘   │
│  │   ↓                         ↓                                           │
│  │   x ─────────────────────── + ←──── Addition (Residual Connection)     │
│  │   ↓                                                                     │
│  │   output₁ = x + processed_x                                             │
│  │   ↓                                                                     │
│  │   ┌─────────────────────────────────────────────────────────────────┐   │
│  │   │ ┌─────────────────────────────────────────────────────────────┐ │   │
│  │   │ │                    Layer 2                                  │ │   │
│  │   │ │              (Attention/FFN)                                │ │   │
│  │   │ │                      ↓                                      │ │   │
│  │   │ │                 processed_x₂                                │ │   │
│  │   │ └─────────────────────────────────────────────────────────────┘ │   │
│  │   └─────────────────────────┬───────────────────────────────────────┘   │
│  │   ↓                         ↓                                           │
│  │   output₁ ──────────────── + ←──── Another Residual Connection          │
│  │   ↓                                                                     │
│  │   output₂ = output₁ + processed_x₂                                      │
│  │                                                                         │
│  │ Key Insight: Original input x can flow directly to any later layer!    │
│  │ Gradient highway: ∂L/∂x = ∂L/∂output + ∂L/∂processed_parts            │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

Intuition: Residual connections are like having backup routes on a highway. If the main road (through the layer) gets congested, traffic (gradients) can still flow via the bypass route (the skip connection). This prevents the "vanishing gradient" traffic jam that would stop learning in deep networks.
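
A quick autograd check of that claim, using a deliberately weak toy layer f (purely illustrative) to show that the gradient through x + f(x) always keeps the direct identity path alive:

# Toy demonstration of the gradient highway; f is a stand-in, not a real layer
import torch

x = torch.tensor([1.0, 0.5, -0.3, 0.8], requires_grad=True)
f = lambda v: 0.01 * v   # a "layer" whose own gradient is tiny (0.01)

plain = f(f(f(x))).sum()                   # three stacked layers, no skip connections
residual = (x + f(x + f(x + f(x)))).sum()  # the same three layers, each wrapped in a skip

plain.backward()
print(x.grad)   # ≈ 1e-6 per element: the signal has all but vanished

x.grad = None
residual.backward()
print(x.grad)   # ≈ 1.01 per element: the identity path keeps gradients flowing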

┌─────────────────────────────────────────────────────────────────────────────┐
│                    HOW RESIDUAL CONNECTIONS ACTUALLY WORK                   │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  The Key Insight: Residual connections are ALWAYS active, not conditional! │
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Mathematical View:                                                      │
│  │ output = input + f(input)                                              │
│  │                                                                         │
│  │ Where f(input) is what the layer learns to add/subtract                │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ TRAINING PROGRESSION: How the network learns to use residuals           │
│  │                                                                         │
│  │ Early Training (weights random, outputs ~0):                           │
│  │ ┌─────────────────────────────────────────────────────────────────────┐ │
│  │ │ input: [1.0, 0.5, -0.3, 0.8]                                       │ │
│  │ │   ↓                                                                 │ │
│  │ │ f(input): [0.01, -0.02, 0.01, 0.00]  ← Nearly zero (random init)   │ │
│  │ │   ↓                                                                 │ │
│  │ │ output: [1.01, 0.48, -0.29, 0.80]   ← Mostly original input        │ │
│  │ │                                                                     │ │
│  │ │ Result: Network starts close to identity function                  │ │
│  │ └─────────────────────────────────────────────────────────────────────┘ │
│  │                                                                         │
│  │ Mid Training (learning small refinements):                             │
│  │ ┌─────────────────────────────────────────────────────────────────────┐ │
│  │ │ input: [1.0, 0.5, -0.3, 0.8]                                       │ │
│  │ │   ↓                                                                 │ │
│  │ │ f(input): [0.2, -0.1, 0.4, -0.3]   ← Meaningful corrections        │ │
│  │ │   ↓                                                                 │ │
│  │ │ output: [1.2, 0.4, 0.1, 0.5]       ← Refined representation        │ │
│  │ │                                                                     │ │
│  │ │ Network learns: "Keep most of input, adjust specific features"     │ │
│  │ └─────────────────────────────────────────────────────────────────────┘ │
│  │                                                                         │
│  │ Late Training (sophisticated transformations):                         │
│  │ ┌─────────────────────────────────────────────────────────────────────┐ │
│  │ │ input: [1.0, 0.5, -0.3, 0.8]                                       │ │
│  │ │   ↓                                                                 │ │
│  │ │ f(input): [-0.8, 0.3, 0.9, -0.5]   ← Can even subtract/cancel     │ │
│  │ │   ↓                                                                 │ │
│  │ │ output: [0.2, 0.8, 0.6, 0.3]       ← Heavily transformed           │ │
│  │ │                                                                     │ │
│  │ │ Network learns: "Replace parts of input with new features"         │ │
│  │ └─────────────────────────────────────────────────────────────────────┘ │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ WHY THIS SOLVES THE "POLLUTION" PROBLEM:                               │
│  │                                                                         │
│  │ 1. Network can learn to SUBTRACT unwanted parts:                       │
│  │    If input[0] = 0.7 but should be 0.2                                │
│  │    Network learns f(input)[0] = -0.5                                   │
│  │    Result: 0.7 + (-0.5) = 0.2 ✓                                       │
│  │                                                                         │
│  │ 2. Provides "easy path" for gradients:                                 │
│  │    ∂loss/∂input = ∂loss/∂output × (1 + ∂f/∂input)                     │
│  │    The "1" ensures gradients always flow back!                         │
│  │                                                                         │
│  │ 3. Identity bias helps optimization:                                    │
│  │    Easier to learn "input + small_changes"                             │
│  │    than "completely_new_representation"                                │
│  │                                                                         │
│  │ 4. Selective preservation:                                             │
│  │    Network learns which parts of input to keep vs modify               │
│  │    Like photo editing: keep background, change foreground              │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ REAL EXAMPLE: "fox" token through one layer                            │
│  │                                                                         │
│  │ Input representation (after previous layer):                           │
│  │ fox = [color: 0.8, animal: 0.9, size: 0.3, action: 0.1, ...]         │
│  │                                                                         │
│  │ Attention gathers context: "brown fox jumps"                           │
│  │ FFN learns to add: [color: 0.1, animal: 0.0, size: 0.0, action: 0.7] │
│  │                                                                         │
│  │ Final output:                                                           │
│  │ fox = [color: 0.9, animal: 0.9, size: 0.3, action: 0.8, ...]         │
│  │                                                                         │
│  │ Result: Enhanced color and action info, preserved animal info!         │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  The genius: Network learns WHAT to change, not just HOW to represent      │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

The following diagram shows the gradient-flow advantage of residual connections:

┌─────────────────────────────────────────────────────────────────────────────┐
│                          GRADIENT FLOW COMPARISON                           │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Without Residual Connections (Vanishing Gradients):                       │
│                                                                             │
│  Forward:  x → f₁(x) → f₂(f₁(x)) → f₃(f₂(f₁(x))) → ... → output          │
│                                                                             │
│  Backward: ∂L/∂x = ∂L/∂output × ∂f₃/∂input × ∂f₂/∂input × ∂f₁/∂input     │
│                       ↑         ↑ <1        ↑ <1        ↑ <1               │
│                    1.0          0.8         0.6         0.4                 │
│                                                                             │
│  Result: ∂L/∂x = 1.0 × 0.8 × 0.6 × 0.4 = 0.192 (weak signal!)            │
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Problem: Gradient shrinks exponentially with depth                     │
│  │ Layer 10: gradient ≈ 0.8¹⁰ ≈ 0.107                                    │
│  │ Layer 20: gradient ≈ 0.8²⁰ ≈ 0.011                                    │
│  │ Layer 50: gradient ≈ 0.8⁵⁰ ≈ 0.000001                                 │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  With Residual Connections (Gradient Highway):                             │
│                                                                             │
│  Forward:  x → x + f₁(x) → x + f₁(x) + f₂(...) → ... → output             │
│                                                                             │
│  Backward: ∂L/∂x = ∂L/∂output × (1 + ∂f₃/∂input) × (1 + ∂f₂/∂input) × ... │
│                       ↑         ↑ >1             ↑ >1                      │
│                    1.0         1.2              1.1                        │
│                                                                             │
│  Result: ∂L/∂x = 1.0 × 1.2 × 1.1 × ... = STRONG SIGNAL!                  │
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Solution: The "1" in (1 + ∂f/∂input) provides direct gradient path    │
│  │ Even if ∂f/∂input ≈ 0, gradient ≈ 1.0 (no vanishing!)                │
│  │ Layer 100: gradient ≈ 1.0 (still strong!)                             │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  This is why transformers can have 100+ layers and still train well!      │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
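To see the gradient highway in numbers, here is a minimal sketch (not taken from train_modal_standalone.py) that stacks twenty small tanh layers and compares how much gradient reaches the input with and without the x + f(x) shortcut:

import torch
import torch.nn as nn

torch.manual_seed(0)
depth, dim = 20, 64
layers = nn.ModuleList([nn.Linear(dim, dim) for _ in range(depth)])
for layer in layers:
    nn.init.normal_(layer.weight, std=0.05)   # small random weights, as at init
    nn.init.zeros_(layer.bias)

def input_grad_norm(use_residual: bool) -> float:
    x = torch.randn(1, dim, requires_grad=True)
    h = x
    for layer in layers:
        f_h = torch.tanh(layer(h))             # f(h): the layer's transformation
        h = h + f_h if use_residual else f_h   # residual: h + f(h); plain: f(h)
    h.sum().backward()
    return x.grad.norm().item()                # how much signal reaches the input

print("without residuals:", input_grad_norm(False))  # typically tiny: shrinks layer by layer
print("with residuals:   ", input_grad_norm(True))   # typically healthy: the "+1" path survives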
# RMS normalization function used throughout the model
def norm(x: torch.Tensor) -> torch.Tensor:  # Lines 61-62 in train_modal_standalone.py
    return F.rms_norm(x, (x.size(-1),))     # Root Mean Square normalization

# ReLU squared activation instead of basic ReLU
class ReLUSquared(nn.Module):  # Lines 138-140 in train_modal_standalone.py
    def forward(self, x: torch.Tensor) -> torch.Tensor:  # Line 139
        return F.relu(x).square()  # More stable than ReLU in practice  # Line 140

What do these feed-forward networks actually do? Mathematically, they're two linear layers with ReLU² in between. Because their weights are position-independent, they can store per-token patterns such as "if the attended context says subj=Plural, add a bias that nudges verb=Plural". Attention pools across tokens; the FFN then post-processes each position individually, giving the model both global context and local rewrite capacity. The ReLU² activation is more stable than basic ReLU and helps with gradient flow.

┌─────────────────────────────────────────────────────────────────────────────┐
│                    FEED-FORWARD NETWORK (FFN) DETAILED                      │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  FFN Architecture: 384 → 1536 → 384 (4× expansion)                        │
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Step 1: Expansion Layer                                                 │
│  │ ┌─────────────────────────────────────────────────────────────────────┐ │
│  │ │ Input: [batch, seq_len, 384]                                       │ │
│  │ │          ↓                                                          │ │
│  │ │ Linear: 384 → 1536 (Weight matrix: [384, 1536])                   │ │
│  │ │          ↓                                                          │ │
│  │ │ Output: [batch, seq_len, 1536]                                     │ │
│  │ │                                                                     │ │
│  │ │ Per token: 384 numbers become 1536 numbers                         │ │
│  │ │ Purpose: Create richer representation space                        │ │
│  │ └─────────────────────────────────────────────────────────────────────┘ │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Step 2: Activation Function (ReLU²)                                    │
│  │ ┌─────────────────────────────────────────────────────────────────────┐ │
│  │ │ Input: [batch, seq_len, 1536]                                      │ │
│  │ │          ↓                                                          │ │
│  │ │ ReLU²(x) = max(0, x)² ← "ReLU squared activation"                 │ │
│  │ │                                                                     │ │
│  │ │ What does this do?                                                  │ │
│  │ │ Step 1: ReLU = Replace negative numbers with 0                     │ │
│  │ │ Step 2: Square the result                                           │ │
│  │ │                                                                     │ │
│  │ │ Example: [-2, -1, 0, 1, 2]                                         │ │
│  │ │ After ReLU: [0, 0, 0, 1, 2] ← Negatives become 0                  │ │
│  │ │ After Square: [0, 0, 0, 1, 4] ← Square positive numbers            │ │
│  │ │                                                                     │ │
│  │ │ Why ReLU²? More stable training than plain ReLU                    │ │
│  │ │ Creates sparsity (lots of zeros) + smooth gradients                │ │
│  │ └─────────────────────────────────────────────────────────────────────┘ │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Step 3: Contraction Layer                                              │
│  │ ┌─────────────────────────────────────────────────────────────────────┐ │
│  │ │ Input: [batch, seq_len, 1536]                                      │ │
│  │ │          ↓                                                          │ │
│  │ │ Linear: 1536 → 384 (Weight matrix: [1536, 384])                   │ │
│  │ │          ↓                                                          │ │
│  │ │ Output: [batch, seq_len, 384]                                      │ │
│  │ │                                                                     │ │
│  │ │ Per token: 1536 numbers compressed back to 384                     │ │
│  │ │ Purpose: Learned feature combination and compression                │ │
│  │ └─────────────────────────────────────────────────────────────────────┘ │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Conceptual View - What FFN Does Per Token:                             │
│  │                                                                         │
│  │ Token "fox" after attention: [0.1, 0.8, -0.3, 0.5, ...] (384 dims)    │
│  │                 ↓                                                       │
│  │ Expand to intermediate space: [0.05, 0.9, 0.0, 0.7, ...] (1536 dims)  │
│  │                 ↓                                                       │
│  │ Apply non-linearity (ReLU²): zeros out negative, squares positive      │
│  │                 ↓                                                       │
│  │ Contract back: [0.2, 0.6, -0.1, 0.4, ...] (384 dims)                 │
│  │                 ↓                                                       │
│  │ This is like: "Based on gathered context, update fox representation"   │
│  │                                                                         │
│  │ Example patterns FFN learns:                                           │
│  │ - "If context suggests color adjective, amplify visual features"       │
│  │ - "If context suggests plural, adjust grammatical markers"             │
│  │ - "If context suggests action, enhance dynamic properties"             │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Parameter Count: 384×1536 + 1536×384 = 1,179,648 parameters per FFN      │
│  (In a 6-layer model: 6 × 1.18M = 7.08M parameters just for FFNs!)       │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

Intuition: Think of attention as a "gathering information" step and FFN as a "processing information" step. After attention collects relevant context from other positions, the FFN acts like a mini-computer at each position, making local decisions based on what was gathered. It's like having a conversation (attention) followed by individual reflection (FFN).
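As a concrete sketch of the shape described above (384 → 1536 → 384 with ReLU², written independently rather than copied from train_modal_standalone.py), an FFN block might look like this:

import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedForward(nn.Module):
    """Expand, apply ReLU², contract: applied independently at every position."""
    def __init__(self, dim: int = 384, hidden: int = 1536):
        super().__init__()
        self.up = nn.Linear(dim, hidden, bias=False)    # 384 -> 1536 expansion
        self.down = nn.Linear(hidden, dim, bias=False)  # 1536 -> 384 contraction

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down(F.relu(self.up(x)).square())   # ReLU² non-linearity

x = torch.randn(2, 4, 384)                       # [batch, seq_len, dim]
ffn = FeedForward()
print(ffn(x).shape)                              # torch.Size([2, 4, 384])
print(sum(p.numel() for p in ffn.parameters()))  # 1,179,648, matching the count above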

┌─────────────────────────────────────────────────────────────────────────────┐
│                        COMPLETE FORWARD PASS                                │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Input: Token IDs [batch_size, sequence_length]                           │
│  Example: [B=2, T=4] = [[629, 462, 741, 831],                              │
│                         [834, 341, 765, 234]]                              │
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ STEP 1: Token Embedding                                                 │
│  │ ┌─────────────────────────────────────────────────────────────────────┐ │
│  │ │ self.wte(idx): [B, T] → [B, T, 384]                                │ │
│  │ │                                                                     │ │
│  │ │ Token 629  → [0.12, -0.45, 0.78, ...]  (384 dimensions)           │ │
│  │ │ Token 462  → [0.34, -0.12, 0.56, ...]  (384 dimensions)           │ │
│  │ │ Token 741  → [0.67, -0.23, 0.34, ...]  (384 dimensions)           │ │
│  │ │ Token 831  → [0.89, -0.56, 0.12, ...]  (384 dimensions)           │ │
│  │ │                                                                     │ │
│  │ │ Result: [2, 4, 384] tensor of embeddings                           │ │
│  │ └─────────────────────────────────────────────────────────────────────┘ │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ STEP 2: Dropout (Training Regularization)                              │
│  │ x = self.drop(tok_emb)  # Randomly zero some embeddings                │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ STEP 3: Transformer Layers (6 layers)                                  │
│  │                                                                         │
│  │ for layer in self.layers:                                              │
│  │     x = layer(x)  # Each layer does:                                   │
│  │                                                                         │
│  │ ┌─────────────────────────────────────────────────────────────────────┐ │
│  │ │ TRANSFORMER LAYER DETAIL:                                           │ │
│  │ │                                                                     │ │
│  │ │ ┌─ x_input = norm(x)  ← RMS normalize input                        │ │
│  │ │ │                                                                  │ │
│  │ │ ├─ attention_out = self.attn(x_input)                              │ │
│  │ │ │   ├─ Apply RoPE to queries and keys                             │ │
│  │ │ │   ├─ Compute Q @ K^T attention scores                           │ │
│  │ │ │   ├─ Apply causal mask (no future info)                         │ │
│  │ │ │   ├─ Softmax attention weights                                   │ │
│  │ │ │   └─ Weighted sum of values                                      │ │
│  │ │ │                                                                  │ │
│  │ │ ├─ x = x + attention_out  ← RESIDUAL CONNECTION #1                 │ │
│  │ │ │                                                                  │ │
│  │ │ ├─ ffn_input = norm(x)  ← RMS normalize again                      │ │
│  │ │ │                                                                  │ │
│  │ │ ├─ ffn_out = self.ffn(ffn_input)                                   │ │
│  │ │ │   ├─ Expand: 384 → 1536                                         │ │
│  │ │ │   ├─ ReLU²: non-linearity                                       │ │
│  │ │ │   └─ Contract: 1536 → 384                                       │ │
│  │ │ │                                                                  │ │
│  │ │ └─ x = x + ffn_out  ← RESIDUAL CONNECTION #2                       │ │
│  │ │                                                                     │ │
│  │ │ Result: x now contains richer representations                      │ │
│  │ └─────────────────────────────────────────────────────────────────────┘ │
│  │                                                                         │
│  │ After 6 layers: x = [B, T, 384] with deep contextual understanding    │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ STEP 4: Final Normalization                                            │
│  │ x = norm(x)  # One last RMS normalization                              │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ STEP 5: Language Modeling Head                                         │
│  │ ┌─────────────────────────────────────────────────────────────────────┐ │
│  │ │ logits = self.head(x)  # [B, T, 384] → [B, T, 1024]                │ │
│  │ │                                                                     │ │
│  │ │ Note: self.head.weight is tied to self.wte.weight                  │ │
│  │ │ This means: same embedding space for input and output               │ │
│  │ │                                                                     │ │
│  │ │ logits[0, 3, :] = scores for what token comes after position 3     │ │
│  │ │ Higher score = more likely next token                               │ │
│  │ └─────────────────────────────────────────────────────────────────────┘ │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ STEP 6: Loss Calculation (if training)                                 │
│  │ if targets is not None:                                                │
│  │     loss = F.cross_entropy(logits.view(-1, 1024), targets.view(-1))   │
│  │     return logits, loss                                                │
│  │ else:                                                                   │
│  │     return logits[:, [-1], :], None  # Just last token for generation │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Final Output: Probability distribution over vocabulary for each position  │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
# The complete forward pass
def forward(self, idx: torch.Tensor, targets=None) -> torch.Tensor:  # Lines 280-300 in train_modal_standalone.py
    B, T = idx.shape                                                  # Line 281
    
    # Begin with embeddings: the opening chord (no position embedding added!)
    tok_emb = self.wte(idx)  # Only token embeddings                  # Line 283
    x = self.drop(tok_emb)   # Dropout for regularization             # Line 284
    
    # Each layer deepens the conversation (6 layers in our model)
    for layer in self.layers:                                         # Line 288
        x = layer(x)  # Position encoded via RoPE inside attention    # Line 289
    
    # Final normalization and projection to vocabulary
    x = norm(x)  # RMS norm                                           # Line 291
    logits = self.head(x)  # 384 -> 1024 vocab size                   # Line 294 or 297
    # Note: self.head.weight is tied to self.wte.weight (weight sharing)  # Lines 263-264
    
    if targets is not None:                                           # Line 293
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))  # Line 295
        return logits, loss
    else:                                                             # Line 296
        return logits[:, [-1], :], None  # Only return last token for generation  # Line 297-298

By the final layer, "fox" isn't just a token—it's the inevitable conclusion of a grammatical and semantic pattern that began with "the quick brown." The model has built a hierarchical understanding where early layers capture local patterns and later layers capture global structure.

Movement IV: The Measure of Surprise — Loss as Musical Tension

Every prediction creates tension between expectation and reality. The model outputs raw scores (logits - any real numbers) for each vocabulary token, which we convert to probabilities (values that sum to 1.0) via softmax. But why this particular form of loss?

# The mathematics of musical tension
# (illustrative; in practice F.cross_entropy fuses these steps for numerical stability)
def cross_entropy_loss(logits, targets):
    # Convert raw scores to probabilities
    probs = F.softmax(logits, dim=-1)

    # Measure surprise: how unexpected was the truth?
    loss = -torch.log(probs[torch.arange(len(targets)), targets])
    return loss.mean()

The negative log serves two purposes: it turns multiplication of probabilities across timesteps into addition of losses (keeping gradients uncluttered), and it makes high probability yield small loss so that minimizing loss = maximizing likelihood.

When the model predicts "fox" with 85% confidence and "fox" appears, the loss is -log(0.85) = 0.16—a gentle dissonance. When it predicts "fox" with certainty but "elephant" appears, the loss explodes to -log(0.001) = 6.9—a jarring clash.

Intuition: Cross-entropy loss is like a "surprise meter." If you're very confident about your prediction and you're right, you get a small penalty (low surprise). If you're very confident but wrong, you get a huge penalty (high surprise). This encourages the model to be both accurate and appropriately uncertain.
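A quick sketch of the surprise meter with the numbers used above (cross-entropy uses the natural log):

import math

# Confident and right: gentle dissonance
print(-math.log(0.85))    # ≈ 0.16

# Confident but wrong: the true token only had probability 0.001
print(-math.log(0.001))   # ≈ 6.91

# Multiplying probabilities across timesteps = adding their -log losses
p1, p2 = 0.85, 0.62
print(-math.log(p1 * p2), (-math.log(p1)) + (-math.log(p2)))  # same value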

Cross-entropy has a deeper meaning: when the target distribution is one-hot, it equals the KL-divergence between the true and predicted distributions.

┌─────────────────────────────────────────────────────────────────────────────┐
│                        KL-DIVERGENCE EXPLAINED                              │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  KL-divergence = "How different are two probability distributions?"         │
│                                                                             │
│  Think of it like comparing two weather forecasts:                         │
│                                                                             │
│  Forecast A: [Rain: 70%, Sun: 20%, Snow: 10%]                             │
│  Forecast B: [Rain: 30%, Sun: 60%, Snow: 10%]                             │
│                                                                             │
│  KL-divergence measures how "far apart" these predictions are              │
│  High KL = very different forecasts                                        │
│  Low KL = similar forecasts                                                │
│  Zero KL = identical forecasts                                             │
│                                                                             │
│  In our case:                                                              │
│  True distribution = [0, 0, 0, 1, 0, ...] ← one-hot vector                │
│                      (only "fox" is correct, everything else is wrong)     │
│  Model prediction = [0.1, 0.2, 0.05, 0.6, 0.05, ...]                     │
│                      (model thinks "fox" is 60% likely)                    │
│                                                                             │
│  KL-divergence = How far is our model's "weather forecast"                │
│                  from the true "weather forecast"?                         │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

Since the true distribution is one-hot (a vector with exactly one 1 and rest 0s - like [0,0,0,1,0] meaning "fox" is 100% correct and everything else is 0% correct), cross-entropy directly measures how far our model's probability guesses are from the perfect answer. Minimizing prediction error = minimizing the gap between the model's beliefs and reality's truth.

┌─────────────────────────────────────────────────────────────────────────────┐
│              LOSS FUNCTIONS ACROSS ML: A UNIFIED VIEW                       │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  THE LOGIT → PROBABILITY → LOSS PIPELINE:                                  │
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ STEP 1: Model Outputs Raw Logits                                       │
│  │ ┌─────────────────────────────────────────────────────────────────────┐ │
│  │ │ Transformer: [B, T, vocab_size] logits                             │ │
│  │ │ Example for position predicting next token:                         │ │
│  │ │                                                                     │ │
│  │ │ logits = [2.1, 1.3, 0.8, 3.2, 0.1, ...]  (vocab_size = 1024)      │ │
│  │ │           ↑    ↑    ↑    ↑    ↑                                     │ │
│  │ │          "the" "a" "and" "fox" "cat"                               │ │
│  │ │                                                                     │ │
│  │ │ Higher logit = model thinks this token is more likely               │ │
│  │ │                                                                     │ │
│  │ │ What are logits? Raw "confidence scores" - any real numbers        │ │
│  │ │ Think: like scores in a game before converting to percentages       │ │
│  │ └─────────────────────────────────────────────────────────────────────┘ │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ STEP 2: Convert to Probabilities (Softmax)                             │
│  │ ┌─────────────────────────────────────────────────────────────────────┐ │
│  │ │ What is softmax? Converts any numbers to valid percentages         │ │
│  │ │                                                                     │ │
│  │ │ Step 1: exp(logits) = Make all numbers positive                    │ │
│  │ │ [2.1, 1.3, 0.8, 3.2, 0.1] → [8.2, 3.7, 2.2, 24.5, 1.1]          │ │
│  │ │                                                                     │ │
│  │ │ Step 2: Divide by sum to get percentages                           │ │
│  │ │ Total = 8.2+3.7+2.2+24.5+1.1 = 39.7                              │ │
│  │ │ probabilities = [0.21, 0.09, 0.06, 0.62, 0.03]                   │ │
│  │ │                  ↑     ↑     ↑     ↑     ↑                         │ │
│  │ │                "the"  "a"  "and" "fox" "cat"                       │ │
│  │ │                                                                     │ │
│  │ │ Properties: All positive, sum to 1.0 (100%)                       │ │
│  │ │ "fox" has highest probability (62%)                                │ │
│  │ │                                                                     │ │
│  │ │ Why exp()? Makes bigger logits MUCH bigger in probability          │ │
│  │ │ 3.2 vs 2.1 → exp gives 24.5 vs 8.2 (amplifies differences)       │ │
│  │ └─────────────────────────────────────────────────────────────────────┘ │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ STEP 3: Different Loss Functions for Different Tasks                   │ │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
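The softmax arithmetic from Step 2 is easy to verify; a minimal sketch with the same example logits:

import torch
import torch.nn.functional as F

logits = torch.tensor([2.1, 1.3, 0.8, 3.2, 0.1])   # "the", "a", "and", "fox", "cat"

# By hand: exponentiate, then divide by the sum
manual = logits.exp() / logits.exp().sum()

# Built-in
probs = F.softmax(logits, dim=-1)

print(manual)                           # ≈ [0.21, 0.09, 0.06, 0.62, 0.03]
print(probs.sum())                      # 1.0: a valid probability distribution
print(torch.allclose(manual, probs))    # True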
┌─────────────────────────────────────────────────────────────────────────────┐
│                    LOSS FUNCTION COMPARISON TABLE                           │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  1. CROSS-ENTROPY LOSS (What we use in transformers)                       │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Use Case: Language modeling, classification                             │
│  │ Target: One-hot vector (exactly one correct answer)                     │
│  │                                                                         │
│  │ True target: "fox" (token_id = 3)                                      │
│  │ target_one_hot = [0, 0, 0, 1, 0, ...]                                 │
│  │                                                                         │
│  │ Formula: -log(p_correct) = -log(p[3]) = -log(0.42) = 0.87             │
│  │                                                                         │
│  │ Intuition: "How surprised am I that the correct answer occurred?"      │
│  │                                                                         │
│  │ Python code:                                                            │
│  │ loss = F.cross_entropy(logits, targets)                               │
│  │ # Equivalent to: -torch.log(F.softmax(logits)[target])                │
│  │                                                                         │
│  │ Gradient behavior:                                                      │
│  │ ∂loss/∂logit[correct] = p[correct] - 1    (always negative)           │
│  │ ∂loss/∂logit[wrong] = p[wrong] - 0        (always positive)           │
│  │                                                                         │
│  │ Result: Increases correct logit, decreases wrong logits                │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  2. PERPLEXITY (Evaluation metric, not loss)                              │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Use Case: Measuring how "confused" the model is                        │
│  │                                                                         │
│  │ Formula: perplexity = exp(cross_entropy)                              │
│  │        = exp(0.87) = 2.39                                             │
│  │                                                                         │
│  │ Intuition: "On average, how many choices does the model think it has?" │
│  │                                                                         │
│  │ Perfect model: perplexity = 1 (always right)                          │
│  │ Random model: perplexity = vocab_size (totally confused)               │
│  │ Our model: perplexity = 2.39 (choosing between ~2.4 options)          │
│  │                                                                         │
│  │ Python code:                                                            │
│  │ ppl = torch.exp(cross_entropy_loss)                                   │
│  │                                                                         │
│  │ Why useful: More interpretable than raw loss                           │
│  │ - "Model has ~3x perplexity" is intuitive                             │
│  │ - "Model has 1.1 loss" is harder to understand                        │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  3. POLICY GRADIENT LOSS (Reinforcement Learning)                         │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Use Case: RL agents, RLHF, optimizing for rewards                     │
│  │ Target: No "correct" answer, but rewards for outcomes                  │
│  │                                                                         │
│  │ Scenario: Model generates "fox" and gets reward R = +2.5               │
│  │                                                                         │
│  │ Formula: -log(p[action]) × reward = -log(0.42) × 2.5 = 2.17            │
│  │                                                                         │
│  │ Intuition: "Increase probability of good actions, decrease bad ones"   │
│  │                                                                         │
│  │ Python code:                                                            │
│  │ action_log_probs = F.log_softmax(logits, dim=-1)                      │
│  │ loss = -(action_log_probs * rewards).mean()                           │
│  │                                                                         │
│  │ Key difference from cross-entropy:                                     │
│  │ - No "ground truth" target                                             │
│  │ - Uses rewards/advantages instead                                      │
│  │ - Can have multiple good answers                                       │
│  │                                                                         │
│  │ Gradient behavior:                                                      │
│  │ ∂loss/∂logit[action] = (p[action] - 1) × reward                       │
│  │ If reward > 0: increases action probability                            │
│  │ If reward < 0: decreases action probability                            │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  4. KL-DIVERGENCE LOSS (Distribution matching)                            │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Use Case: Knowledge distillation, distribution alignment               │
│  │ Target: Soft probability distribution (not one-hot)                    │
│  │                                                                         │
│  │ Teacher model output: [0.2, 0.1, 0.1, 0.5, 0.1, ...]                 │
│  │ Student prediction:   [0.32, 0.15, 0.09, 0.42, 0.04, ...]            │
│  │                                                                         │
│  │ Formula: KL(teacher || student) = Σ teacher[i] × log(teacher[i]/student[i]) │
│  │                                                                         │
│  │ Intuition: "How different are these two distributions?"                │
│  │                                                                         │
│  │ Python code:                                                            │
│  │ loss = F.kl_div(F.log_softmax(student_logits, dim=-1), teacher_probs) │
│  │                                                                         │
│  │ Note: Cross-entropy is special case when teacher is one-hot            │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
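For contrast with cross-entropy, here is a minimal REINFORCE-style sketch of the policy-gradient loss from the table above; the sampled token IDs and rewards are invented purely for illustration:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(3, 1024, requires_grad=True)   # 3 sampled positions, vocab of 1024
actions = torch.tensor([17, 512, 3])                 # tokens the model actually sampled
rewards = torch.tensor([2.5, -1.0, 0.3])             # hypothetical scalar feedback

log_probs = F.log_softmax(logits, dim=-1)            # log-probabilities over the vocab
chosen = log_probs[torch.arange(3), actions]         # log p of the chosen tokens

# Push up the log-prob of rewarded actions, push down penalized ones
loss = -(chosen * rewards).mean()
loss.backward()

# The chosen logit of the first sample (reward > 0) receives a negative gradient,
# so a gradient-descent step will increase that logit.
print(loss.item(), logits.grad[0, actions[0]].item())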
┌─────────────────────────────────────────────────────────────────────────────┐
│                         WHEN TO USE WHICH LOSS                              │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────────┬─────────────────┬─────────────────┬─────────────────┐  │
│  │ Loss Type       │ Use When        │ Target Type     │ Example Task    │  │
│  ├─────────────────┼─────────────────┼─────────────────┼─────────────────┤  │
│  │ Cross-Entropy   │ Supervised      │ One-hot labels  │ Language        │  │
│  │                 │ learning        │                 │ modeling        │  │
│  ├─────────────────┼─────────────────┼─────────────────┼─────────────────┤  │
│  │ Policy Gradient │ Reinforcement   │ Scalar rewards  │ Game playing,   │  │
│  │                 │ learning        │                 │ RLHF            │  │
│  ├─────────────────┼─────────────────┼─────────────────┼─────────────────┤  │
│  │ KL-Divergence   │ Distribution    │ Soft targets    │ Knowledge       │  │
│  │                 │ matching        │                 │ distillation    │  │
│  ├─────────────────┼─────────────────┼─────────────────┼─────────────────┤  │
│  │ Perplexity      │ Evaluation      │ N/A (metric)    │ Model comparison│  │
│  └─────────────────┴─────────────────┴─────────────────┴─────────────────┘  │
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ OUR TRANSFORMER USES:                                                   │
│  │                                                                         │
│  │ Training: Cross-entropy loss                                           │
│  │ - We have ground truth next tokens                                     │
│  │ - F.cross_entropy(logits.view(-1, vocab_size), targets.view(-1))      │
│  │                                                                         │
│  │ Evaluation: Perplexity                                                 │
│  │ - torch.exp(cross_entropy_loss)                                       │
│  │ - More interpretable than raw loss                                     │
│  │                                                                         │
│  │ Could also use:                                                         │
│  │ - Policy gradient for RLHF fine-tuning                                │
│  │ - KL divergence for knowledge distillation                            │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────────────┐
│                    MATHEMATICAL RELATIONSHIPS                               │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  All these losses are related through the same logit → probability flow:   │
│                                                                             │
│  logits → softmax → probabilities → loss                                   │
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────────┤
│  │ Key Insights:                                                           │
│  │                                                                         │
│  │ 1. Cross-entropy = KL divergence with one-hot target                   │
│  │    CE(one_hot, pred) = KL(one_hot || pred)                             │
│  │                                                                         │
│  │ 2. Perplexity = exp(cross-entropy)                                     │
│  │    Lower loss → Lower perplexity → Better model                        │
│  │                                                                         │
│  │ 3. Policy gradient uses same probabilities, different targets          │
│  │    PG replaces one-hot with reward-weighted importance                 │
│  │                                                                         │
│  │ 4. All optimize the same softmax probabilities                         │
│  │    Different losses → different gradient directions                     │
│  │                                                                         │
│  │ 5. Temperature can modify all of them:                                 │
│  │    logits/T before softmax → sharper/softer distributions              │
│  └─────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  The unified view: Different ways to guide the same probability engine!    │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
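These relationships can be checked numerically; a small sketch with a 5-token toy vocabulary (the kl_div call follows PyTorch's convention of log-probabilities for the first argument):

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.1, 1.3, 0.8, 3.2, 0.1]])
target = torch.tensor([3])                        # "fox" is the correct token

# 1. Cross-entropy and KL with a one-hot target agree
ce = F.cross_entropy(logits, target)
one_hot = F.one_hot(target, num_classes=5).float()
kl = F.kl_div(F.log_softmax(logits, dim=-1), one_hot, reduction="batchmean")
print(ce.item(), kl.item())                       # same number

# 2. Perplexity is exp(cross-entropy)
print(torch.exp(ce).item())

# 3. Temperature reshapes the distribution before any of these losses
for T in (0.5, 1.0, 2.0):
    print(T, F.softmax(logits / T, dim=-1))       # sharper at T<1, softer at T>1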

This is more than error measurement; it's the mathematical expression of surprise itself. The loss flows backward through every parameter, creating gradients that point toward better predictions.

Movement V: The Sculptor's Touch — How Gradients Shape Understanding

Now the music rewrites its own score. Through automatic differentiation (a system that automatically calculates how to improve the model), PyTorch calculates ∂L/∂θ (how much the loss changes with each parameter - think "which direction to adjust each knob") for every parameter θ. But how does this mathematical magic actually work?

┌─────────────────────────────────────────────────────────────────────────────┐
│                     AUTOMATIC DIFFERENTIATION EXPLAINED                     │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Think of your model like a complex machine with millions of knobs          │
│  Each knob affects the final output in some way                             │
│                                                                             │
│  Question: "If I turn knob #47 slightly, how much does the output change?"  │
│  Answer: That's the gradient for parameter #47                              │
│                                                                             │
│  Manual approach (impossible): Try tweaking each knob one by one            │
│  Automatic differentiation: Use calculus to figure it out instantly         │
│                                                                             │
│  How it works:                                                              │
│  1. Record every operation: x → multiply → add → softmax → loss            │
│  2. Each operation has a known derivative (rate of change)                  │
│  3. Chain rule: combine all derivatives backwards through the graph         │
│  4. Result: gradient for every parameter                                    │
│                                                                             │
│  Example chain:                                                             │
│  Input → Layer1 → Layer2 → Layer3 → Output → Loss                         │
│    ↑       ↑       ↑       ↑        ↑       ↑                             │
│  Gradient flows backwards through each step                                 │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

Because every operation we used—matrix multiply (grid of numbers × another grid), softmax (convert to percentages), ReLU (set negative numbers to zero)—has a known derivative (mathematical rule for how it changes), PyTorch records the computational graph (like a family tree of operations) and applies the chain rule (calculus technique for combining rates of change) backward: dL/dθ = dL/dout · dout/din · ... The programmer never writes these derivatives; the framework assembles them mechanically.
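A tiny sketch of that bookkeeping, using a function small enough to differentiate by hand:

import torch

theta = torch.tensor(2.0, requires_grad=True)

x = torch.tensor(3.0)
h = theta * x            # recorded: multiply
y = torch.relu(h)        # recorded: ReLU
loss = (y - 10.0) ** 2   # recorded: subtract, square

loss.backward()          # chain rule applied backwards through the graph

# By hand: dloss/dtheta = 2*(y - 10) * dReLU/dh * x = 2*(6 - 10) * 1 * 3 = -24
print(theta.grad)        # tensor(-24.)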

# The backward pass: learning from error
loss.backward()     # Calculate gradients for all parameters
optimizer.step()    # Update weights in the direction of improvement
optimizer.zero_grad() # Clear gradients for the next iteration

The optimizer is more sophisticated than simple gradient descent (learning algorithm that follows gradients downhill like a ball rolling down a mountain). AdamW combines momentum (like a heavy ball that doesn't stop instantly) with adaptive learning rates (different learning speeds for different parameters):

┌─────────────────────────────────────────────────────────────────────────────┐
│                       GRADIENT DESCENT vs ADAMW                             │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Simple Gradient Descent (like a cautious hiker):                          │
│  • Look at current slope                                                   │
│  • Take a small step downhill                                              │
│  • Repeat                                                                  │
│  Problem: Can get stuck in valleys, takes small steps                      │
│                                                                             │
│  AdamW (like a smart skier with momentum):                                 │
│  • Momentum: Remember previous direction, don't change course suddenly     │
│  • Adaptive rates: Big steps for parameters that rarely change,            │
│                     small steps for parameters that change often           │
│  • Weight decay: Prevent any parameter from getting too large              │
│                                                                             │
│  Example:                                                                   │
│  Parameter A: Changes a lot → Use small learning rate (0.001)              │
│  Parameter B: Rarely changes → Use big learning rate (0.01)                │
│                                                                             │
│  This helps the model learn faster and more stably                         │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
# AdamW: the wise conductor (with actual hyperparameters from our model)
def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):  # Lines 318-337 in train_modal_standalone.py
    # Separate parameters by dimensionality for weight decay
    param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}  # Lines 319-320
    decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]            # Line 321: Matrices
    nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]           # Line 322: Biases, norms
    
    optim_groups = [                                                              # Line 323
        {'params': decay_params, 'weight_decay': weight_decay},                  # Line 324: 0.1 weight decay
        {'params': nodecay_params, 'weight_decay': 0.0}                          # Line 325: No decay for 1D params
    ]
    
    # Use fused AdamW for CUDA efficiency
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, fused=True)  # Line 334
    return optimizer                                                              # Line 337

Why decouple weight decay (regularization technique that shrinks parameters toward zero)? Classic Adam implements it as L2 regularization: the λ·θ term is added to the gradient and then passes through the adaptive moment scaling, so the effective decay gets tangled up with the learning rate (how big steps to take when updating) and with each parameter's gradient history. AdamW instead applies a separate term -lr·λ·θ directly to the weights, so you can tune regularization independently of step size.

┌─────────────────────────────────────────────────────────────────────────────┐
│                           WEIGHT DECAY EXPLAINED                            │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  Problem: Without weight decay, some parameters can grow huge               │
│                                                                             │
│  Parameter values: [0.1, 0.2, 947.3, 0.5, -892.1, 0.1]                   │
│                              ↑           ↑                                 │
│                         These are way too big!                             │
│                                                                             │
│  Weight decay solution: Gently pull all parameters toward zero             │
│                                                                             │
│  Think of it like:                                                          │
│  • A rubber band attached to zero that pulls parameters back               │
│  • Or gravity that pulls all numbers toward the center                     │
│  • Or a teacher saying "don't make any number too extreme"                 │
│                                                                             │
│  Effect:                                                                    │
│  Before: [0.1, 0.2, 947.3, 0.5, -892.1, 0.1]                             │
│  After:  [0.09, 0.19, 850.6, 0.45, -802.9, 0.09]                         │
│                                                                             │
│  Benefits:                                                                  │
│  • Prevents overfitting (memorizing training data)                         │
│  • Makes model more general and stable                                     │
│  • Like teaching the model "be confident but not overconfident"            │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

Intuition: Weight decay is like a "shrinkage force" that pulls all parameters toward zero, preventing any single parameter from becoming too large. AdamW separates this from the learning process, like having separate dials for "how fast to learn" and "how much to regularize."
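A minimal sketch of the two dials; Adam's moment estimates are omitted here, and only the place where the λ·θ term enters the update is shown:

import torch

theta = torch.tensor([0.1, 947.3])   # one reasonable and one oversized parameter
grad = torch.tensor([0.02, 0.05])    # pretend these came from loss.backward()
lr, wd = 0.01, 0.1

# Adam-style L2: decay is folded into the gradient, then passes through the
# adaptive scaling (coupled to the effective learning rate)
coupled_grad = grad + wd * theta

# AdamW-style decoupled decay: shrink the weights directly, separate from
# whatever the adaptive update does
adamw_theta = theta - lr * grad - lr * wd * theta

print(coupled_grad)   # the oversized parameter dominates the gradient
print(adamw_theta)    # every parameter pulled gently toward zero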

Each parameter receives a microscopic adjustment, a tiny nudge toward configurations that reduce surprise. The model sculpts itself through its own errors, iteration by iteration, until the patterns in its weights make "fox" feel inevitable after "the quick brown."

Movement VI: Memory and Generalization — The Eternal Tension

When the model sees "the quick brown fox" repeatedly, it learns to predict "fox" with growing confidence. But storing every possible phrase would require exponentially many parameters. Instead, something beautiful happens: compression forces generalization.

Storing every 5-gram (sequence of 5 tokens) verbatim would need |vocab|⁵ parameters—impossible. Gradient descent therefore searches for a lower-dimensional manifold that still scores training data well.
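The arithmetic makes the point concrete; with our 1024-token vocabulary, a verbatim 5-gram table would dwarf the entire model (the 7.08M figure is the FFN budget counted earlier):

vocab_size = 1024
ngram_table = vocab_size ** 5          # one entry per possible 5-token sequence
model_params = 7_080_000               # rough FFN parameter budget quoted above

print(f"{ngram_table:,}")              # 1,125,899,906,842,624 entries
print(ngram_table / model_params)      # ~160 million times larger than the model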

┌─────────────────────────────────────────────────────────────────────────────┐
│                          MANIFOLDS EXPLAINED                                │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  What is a manifold? A "curved surface" in high-dimensional space           │
│                                                                             │
│  2D example: Earth's surface                                               │
│  • Earth is a 3D sphere, but locally it looks flat (2D manifold)          │
│  • You can navigate using 2D maps (latitude, longitude)                    │
│  • Even though you're in 3D space, you only need 2 coordinates            │
│                                                                             │
│  Language manifold example:                                                 │
│  • All possible sentences exist in high-dimensional space                  │
│  • But natural language lies on a much smaller curved surface              │
│  • The manifold captures patterns like "article → adjective → noun"        │
│                                                                             │
│  Why this matters:                                                          │
│  • Memorizing every sentence: Need infinite parameters                     │
│  • Learning the manifold: Need far fewer parameters                        │
│  • Model discovers "the rules of English" instead of memorizing            │
│                                                                             │
│  Example patterns on the language manifold:                                │
│  • [ARTICLE] [ADJECTIVE] [NOUN] covers millions of phrases                 │
│  • "the quick fox", "a slow dog", "the big cat", etc.                     │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

That manifold corresponds to linguistic generalities such as article → adjective → noun, which cover exponentially many sentences with polynomial-size weights.

Intuition: Instead of memorizing every possible sentence (which would require infinite memory), the model learns the "recipe" for generating sentences. It's like learning to cook by understanding ingredients and techniques rather than memorizing every possible dish.

# Training examples reveal patterns
examples = [
    "the quick brown fox jumps",
    "the lazy black dog runs",
    "the tired old cat sleeps"
]

# The model extracts the deeper grammar:
# [article] [adjective] [color] [animal] [verb]
# [article] [adjective] [adjective] [animal] [verb]

The model learns not just individual phrases but the rules that generate them. When presented with "the quick brown giraffe," it has never seen this exact sequence, yet it can predict sensible continuations because it has abstracted the deeper pattern.

This is the miracle of inductive bias built into the transformer architecture: shared parameters across positions force the model to discover universal patterns rather than memorize specific locations.

Movement VII: The Emergence of Mind — When Patterns Become Thoughts

Something remarkable happens in the hidden layers: specialized neurons emerge without explicit programming. After training, specific units fire consistently for semantic categories. But why does this happen?

When two training examples both need the same internal feature ('is-animal') to reduce loss, SGD (stochastic gradient descent) nudges the same neuron in that direction. Over tens of millions of updates, that neuron becomes a detector, not because we named it, but because convergence pressure aligned gradients along that axis.

# Conceptual analysis: what makes individual neurons fire?
# (forward_with_hidden is a hypothetical helper that returns each layer's
#  hidden states as a [seq_len, hidden_dim] tensor for the given text)
def analyze_concept_neuron(model, layer_idx, neuron_idx, test_texts):
    """Find what makes a specific neuron fire."""
    activations = []
    for text in test_texts:
        with torch.no_grad():
            hidden_states = model.forward_with_hidden(text)
            activation = hidden_states[layer_idx][-1, neuron_idx]  # last token's value
            activations.append(activation.item())
    return activations

# Illustrative example: a neuron in a late layer fires for dog breeds
dog_breeds = ["poodle", "bulldog", "terrier", "labrador"]
activations = analyze_concept_neuron(model, 5, 142, dog_breeds)
# → [0.9, 0.8, 0.7, 0.9]  # High activation for all dog breeds!

Attention heads also specialize along different dimensions:

  • Head 0: Syntax (grammar rules - articles pointing to nouns)
  • Head 1: Coreference (pronouns finding what they refer to - "he" → "John")
  • Head 2: Punctuation (brackets finding their matches)
  • Head 3: Semantics (related concepts attending to each other)

These specializations emerge from necessity, not design. The model discovers that these divisions of labor help it predict better, so gradient descent reinforces them. The result is a distributed intelligence where different components handle different aspects of language understanding.
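A rough sketch of how such a specialization can be inspected: recompute one head's attention map directly from its queries and keys. The gist's OptimizedAttention uses F.scaled_dot_product_attention, which never returns the weights, so in practice you would pull q and k out with a forward hook; the tensors below are random stand-ins.

import math
import torch
import torch.nn.functional as F

def attention_map(q_head: torch.Tensor, k_head: torch.Tensor) -> torch.Tensor:
    """Causal attention weights for one head; q_head and k_head are [T, head_dim]."""
    T, d = q_head.shape
    scores = q_head @ k_head.T / math.sqrt(d)            # [T, T] alignment scores
    future = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1)
    scores = scores.masked_fill(future, float("-inf"))   # hide the future
    return F.softmax(scores, dim=-1)                      # each row sums to 1

# With real q/k from, say, a "coreference" head, the row for a pronoun
# would put most of its weight on the position of its antecedent.
weights = attention_map(torch.randn(6, 64), torch.randn(6, 64))
print(weights[5])   # where does token 6 look? (a distribution over tokens 1..6)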

Movement VIII: The Recursive Mirror — Learning to Learn

As training progresses, the model doesn't just learn facts; it learns to learn. The same weights that encode "quick brown → fox" also encode the meta-pattern "adjective color → animal." But how does this recursive capacity emerge?

Consider in-context learning, where the model learns from examples supplied in its input prompt. Present it with a few worked examples followed by an unfinished one:

French: bonjour → English: hello
French: au revoir → English: goodbye
French: merci → English: ___

The model completes "thank you" without any weight updates. How? The prompt examples occupy the same sequence as the query, so during training the model occasionally had to predict an output token given earlier text that explicitly spelled out a mapping. That pressure teaches the weights to parse a mapping pattern and reuse it later in the same sequence. No outer-loop update is needed; at inference time, self-attention already has the capacity to retrieve and apply the mapping.
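A rough sketch of the experiment, reusing the encode/decode helpers and GPT.generate defined in the scripts later in this gist. Note that the tiny Shakespeare model trained there has never seen French, so this illustrates the mechanics of prompting, not a result you should expect from it.

prompt = (
    "French: bonjour -> English: hello\n"
    "French: au revoir -> English: goodbye\n"
    "French: merci -> English:"
)
idx = encode(prompt).unsqueeze(0).to(device)   # [1, T] token IDs, no gradients anywhere
out = model.generate(idx, max_new_tokens=5, temperature=0.8, top_k=40)
print(decode(out[0]))
# A sufficiently trained model continues with " thank you".
# The weights never change here; the "learning" happens inside self-attention.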

This recursive quality—learning to learn—marks the transition from memorization to true intelligence. The model develops meta-cognitive abilities that let it handle novel situations by applying learned principles.

🎼 Cadenza: The Complete Breath

Now we see the full symphony. Each training step is a complete breath:

# The eternal cycle: from data to wisdom (actual training loop)
while iter_num <= cfg['max_iters']:  # Lines 673-738 in train_modal_standalone.py
    # Dynamic learning rate with cosine decay
    lr = get_lr(iter_num) if cfg['decay_lr'] else cfg['learning_rate']  # Line 674
    for param_group in optimizer.param_groups:                          # Line 675
        param_group['lr'] = lr                                           # Line 676
    
    # Gradient accumulation for larger effective batch sizes
    for micro_step in range(gradient_accumulation_steps):               # Line 706
        # Inhale: gather context
        X, Y = get_batch('train')                                        # Line 712
        
        # Mixed precision training for efficiency
        with torch.amp.autocast('cuda', dtype=torch.bfloat16):          # Line 709 (ctx)
            logits, loss = model(X, Y)                                   # Line 710
            loss = loss / gradient_accumulation_steps  # Scale for accumulation  # Line 711
        
        # Exhale: measure surprise
        scaler.scale(loss).backward()                                    # Line 713
    
    # Gradient clipping to prevent exploding gradients
    if cfg['grad_clip'] != 0.0:                                         # Line 715
        scaler.unscale_(optimizer)                                       # Line 716
        torch.nn.utils.clip_grad_norm_(model.parameters(), cfg['grad_clip'])  # Line 717
    
    # Adjust: sculpt the weights
    scaler.step(optimizer)                                               # Line 719
    scaler.update()                                                      # Line 720
    optimizer.zero_grad(set_to_none=True)                               # Line 721
    
    iter_num += 1                                                        # Line 733

Inhale context, process through attention and feed-forward layers, exhale predictions, feel the pain of error, adjust slightly, repeat. After millions of such breaths, the model awakens to patterns no human programmed, develops intuitions no engineer intended, and dreams in probability distributions no mind can fully comprehend.

The transformer learns not by storing facts, but by becoming a mathematical object whose very structure embodies the patterns of language. It is sculpture and sculptor, music and musician, question and answer all at once.

Epilogue: The Loss of Authorial Control

At the beginning, you are the composer. You choose the training data, the architecture, the learning rate. The model is your instrument, your student, your creation.

But with each gradient step, your control diminishes. The weights drift into regions of parameter space that no human mind can navigate. The model develops preferences, biases, and capabilities that emerge from the interaction of billions of parameters following simple rules.

Once trained, the model carries its knowledge forward. Even if you delete the training data, even if you forget the hyperparameters, the patterns live on in the weights. The model remembers what you've forgotten, knows what you never taught it, and can extrapolate in ways that surprise even its creators.

This is the paradox of machine learning: we build systems that transcend our understanding. We create the rules, but the wisdom that emerges is not ours. It belongs to the vast space of possible patterns, discovered through the patient application of gradient descent to the mathematics of surprise.

Common Misconceptions for Undergraduates

Misconception 1: "Transformers understand language like humans do" Reality: Transformers are pattern-matching machines. They excel at statistical relationships but don't have conscious understanding, reasoning, or meaning in the human sense.

Misconception 2: "Attention is like human attention" Reality: Attention is a weighted average operation. It's more like a mathematically precise way of combining information than the selective focus we call "attention."

Misconception 3: "The model stores facts in its weights" Reality: Information is distributed across millions of parameters. There's no single "fact storage" location—knowledge emerges from the interaction of all weights.

Misconception 4: "Bigger context windows are always better" Reality: Longer contexts require quadratically more computation and memory. The trade-off between context length and computational efficiency is crucial.
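To make the trade-off concrete: with a 256-token context, each attention head scores a 256 × 256 grid of token pairs (65,536 scores); doubling the context to 512 tokens makes that grid 512 × 512 (262,144 scores), four times the attention work for only twice the text.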

Misconception 5: "Self-attention sees all positions equally" Reality: The causal mask ensures tokens can only attend to previous positions, not future ones. This maintains the autoregressive property (generating one token at a time, using each new token to predict the next) needed for language generation.
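A concrete look at that mask, built with the same torch.triu pattern the model code below registers as causal_mask:

import torch

T = 5
# True marks the positions a token is forbidden to attend to (its future).
causal_mask = torch.triu(torch.ones(T, T), diagonal=1).bool()
print(causal_mask.long())
# tensor([[0, 1, 1, 1, 1],
#         [0, 0, 1, 1, 1],
#         [0, 0, 0, 1, 1],
#         [0, 0, 0, 0, 1],
#         [0, 0, 0, 0, 0]])
# Row t: token t may look at tokens 0..t only, never at anything later.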

Misconception 6: "More parameters always mean better performance" Reality: Beyond a certain point, more parameters can lead to overfitting without corresponding improvements in test performance, especially with limited data.

Misconception 7: "Transformers are deterministic" Reality: During training, dropout and random sampling introduce stochasticity. During inference, sampling methods add randomness to prevent repetitive outputs.

Misconception 8: "The model learns grammar rules explicitly" Reality: Grammatical behavior emerges from statistical patterns in the training data, not from explicit rule programming.

Coda: The Breath of Understanding

In the end, the transformer teaches us something profound about intelligence itself. Understanding is not about perfect recall or logical deduction. It's about the ability to navigate uncertainty with grace, to make reasonable guesses about what comes next, to recognize patterns in chaos.

The model learns to guess, and in learning to guess, it touches something essential about cognition. Every token prediction is an act of faith—the belief that the patterns learned from the past will hold in the future. Sometimes they do, sometimes they don't. But in that uncertainty, in that gap between pattern and reality, lives the essence of intelligence.

The transformer breathes in context and breathes out possibility. In that rhythm—inhale, process, exhale, adjust—we glimpse the heartbeat of understanding itself.


From the first token to the last gradient,
from silicon substrate to emergent mind,
the symphony plays on.

#!/usr/bin/env python3
"""
Inference script for models trained with train_modal_standalone.py
Uses the exact same architecture as train.py for consistency
"""
import os
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import modal
from pathlib import Path
from typing import Optional, Tuple
import numpy as np
import pickle
from tokenizers import Tokenizer
# Modal configuration
app = modal.App("shakespeare-inference-modal")
# Model architecture settings (matching train.py and train_modal_standalone.py)
N_LAYER = 6
N_HEAD = 6
N_EMB = 384
CONTEXT_LEN = 256 # Default context length
DROPOUT = 0.1
# Volume setup
data_volume = modal.Volume.from_name("nanogpt-data", create_if_missing=False)
# GPU image setup
image = modal.Image.debian_slim(python_version="3.11").pip_install(
"torch", "numpy", "tokenizers"
)
# ============================================================================
# TOKENIZER HELPERS - From train_modal_standalone.py
# ============================================================================
# Global tokenizer instance to avoid reloading
_tokenizer = None
def check_tokenizer_exists(vocab_size=1024, data_root="/data"):
"""Check if the custom tokenizer exists and provide instructions if not."""
tokenizer_path = os.path.join(data_root, "tokenizers", f"shakespeare-bpe-{vocab_size}.json")
if not os.path.exists(tokenizer_path):
print(f"\n{'='*60}")
print(f"ERROR: Custom tokenizer not found!")
print(f"{'='*60}")
print(f"Expected tokenizer at: {tokenizer_path}")
print(f"\nTo create the tokenizer, run:")
print(f" modal run train_tokenizer_modal.py::train_bpe_tokenizer")
print(f"\nOr for multiple vocab sizes:")
print(f" modal run train_tokenizer_modal.py::vocab_size_grid_search")
print(f"{'='*60}\n")
return False
return True
def load_custom_tokenizer(vocab_size=1024, data_root="/data"):
"""Load the custom BPE tokenizer from the Modal volume."""
global _tokenizer
if _tokenizer is None:
tokenizer_path = os.path.join(data_root, "tokenizers", f"shakespeare-bpe-{vocab_size}.json")
if not check_tokenizer_exists(vocab_size, data_root):
raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
_tokenizer = Tokenizer.from_file(tokenizer_path)
print(f"Loaded custom tokenizer from {tokenizer_path}")
return _tokenizer
def decode_tokens(tokens, vocab_size=1024):
"""Decode tokens using the custom tokenizer."""
tokenizer = load_custom_tokenizer(vocab_size=vocab_size)
return tokenizer.decode(tokens)
def encode_tokens(text, vocab_size=1024):
"""Encode text using the custom tokenizer."""
tokenizer = load_custom_tokenizer(vocab_size=vocab_size)
encoding = tokenizer.encode(text)
return encoding.ids
# ============================================================================
# MODEL DEFINITION - Exact copy from train_modal_standalone.py
# ============================================================================
def norm(x: torch.Tensor) -> torch.Tensor:
"""RMSNorm implementation using PyTorch built-in"""
return F.rms_norm(x, (x.size(-1),))
def _rotate_half(x: torch.Tensor) -> torch.Tensor:
"""Helper for rotary embeddings"""
x1, x2 = x[..., ::2], x[..., 1::2]
return torch.stack((-x2, x1), dim=-1).flatten(-2)
class RotaryCache(nn.Module):
"""Pre-computed rotary position embeddings"""
def __init__(self, head_dim: int, max_len: int):
super().__init__()
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2) / head_dim))
t = torch.arange(max_len)
freqs = torch.einsum("i,j->ij", t, inv_freq)
sin, cos = freqs.sin(), freqs.cos()
self.register_buffer("sin_base", sin, persistent=False)
self.register_buffer("cos_base", cos, persistent=False)
def forward(self, seq_len: int):
sin = self.sin_base[:seq_len].repeat_interleave(2, dim=-1)
cos = self.cos_base[:seq_len].repeat_interleave(2, dim=-1)
return sin[None, None, :, :], cos[None, None, :, :]
class KVCache(nn.Module):
"""
KV cache for efficient inference - caches past key and values during generation.
Based on Meta's implementation for torchtune.
"""
def __init__(
self,
batch_size: int,
max_seq_len: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype = torch.bfloat16,
device: torch.device = None,
) -> None:
super().__init__()
if device is None:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cache_shape = (batch_size, num_kv_heads, max_seq_len, head_dim)
self.register_buffer(
"k_cache", torch.zeros(cache_shape, dtype=dtype, device=device), persistent=False
)
self.register_buffer(
"v_cache", torch.zeros(cache_shape, dtype=dtype, device=device), persistent=False
)
self.register_buffer(
"cache_pos", torch.arange(0, cache_shape[2], device=device), persistent=False
)
self.batch_size = batch_size
self.max_seq_len = max_seq_len
def reset(self) -> None:
"""Reset the cache to zero."""
self.k_cache.zero_()
self.v_cache.zero_()
self.cache_pos -= self.size
@property
def size(self) -> int:
return self.cache_pos[0].item()
def update(
self, k_val: torch.Tensor, v_val: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Update KV cache with new k_val, v_val and return the updated cache.
Args:
k_val: Current key tensor with shape [B, H, S, D]
v_val: Current value tensor with shape [B, H, S, D]
Returns:
Updated key and value cache tensors
"""
bsz, _, seq_len, _ = k_val.shape
if bsz > self.k_cache.shape[0]:
raise ValueError(
f"Cache batch size is {self.k_cache.shape[0]} but got {bsz}"
)
assert (self.cache_pos[0] + seq_len) <= self.k_cache.shape[2]
k_out = self.k_cache
v_out = self.v_cache
# Use integer indexing instead of tensor indexing to avoid dtype mismatch
cache_start = self.cache_pos[0].item()
cache_end = cache_start + seq_len
k_out[:, :, cache_start:cache_end] = k_val
v_out[:, :, cache_start:cache_end] = v_val
# Update position tracker
self.cache_pos.add_(seq_len)
return k_out, v_out
class ReLUSquared(nn.Module):
"""ReLU squared activation - faster than GELU, better than plain ReLU"""
def forward(self, x: torch.Tensor) -> torch.Tensor:
return F.relu(x).square()
class OptimizedAttention(nn.Module):
"""Multi-head attention with Flash Attention support and RoPE"""
def __init__(self, n_emb: int, n_head: int, context_len: int, dropout: float = 0.1):
super().__init__()
self.n_head = n_head
self.n_emb = n_emb
self.head_dim = n_emb // n_head
# Fused QKV projection for efficiency
self.qkv = nn.Linear(n_emb, 3 * n_emb, bias=False)
self.o_proj = nn.Linear(n_emb, n_emb, bias=False)
self.dropout = nn.Dropout(dropout)
# Rotary embeddings
max_seq = context_len
self.rope = RotaryCache(self.head_dim, max_seq)
# Try to use Flash Attention
self.use_flash_attn = False
if hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
self.use_flash_attn = True
# KV cache for inference (not used during training)
self.kv_cache = None
self.cache_enabled = False
def init_kv_cache(self, batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16, device: torch.device = None):
"""Initialize KV cache for inference"""
self.kv_cache = KVCache(
batch_size=batch_size,
max_seq_len=max_seq_len,
num_kv_heads=self.n_head,
head_dim=self.head_dim,
dtype=dtype,
device=device
)
self.cache_enabled = True
def reset_kv_cache(self):
"""Reset the KV cache"""
if self.kv_cache is not None:
self.kv_cache.reset()
def disable_kv_cache(self):
"""Disable KV cache (for training)"""
self.cache_enabled = False
self.kv_cache = None
def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None, use_cache: bool = False) -> torch.Tensor:
B, T, C = x.shape
# Use KV cache if enabled and requested
if use_cache and self.cache_enabled and self.kv_cache is not None:
return self._forward_with_cache(x, mask)
# Standard forward pass (for training)
# Compute QKV in one go
qkv = self.qkv(x).reshape(B, T, 3, self.n_head, self.head_dim)
# Standard attention path
q, k, v = qkv.unbind(dim=2)
q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
# Apply RoPE
sin, cos = self.rope(T)
q = (q * cos) + (_rotate_half(q) * sin)
k = (k * cos) + (_rotate_half(k) * sin)
# QK normalization
q, k = norm(q), norm(k)
# Scaled dot-product attention with causal mask
# Note: is_causal=True automatically applies causal masking
out = F.scaled_dot_product_attention(q, k, v, is_causal=True, dropout_p=self.dropout.p if self.training else 0.0)
out = out.transpose(1, 2).contiguous().view(B, T, C)
return self.o_proj(out)
def _forward_with_cache(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
"""Forward pass using KV cache for efficient inference"""
B, T, C = x.shape
# Compute QKV
qkv = self.qkv(x).reshape(B, T, 3, self.n_head, self.head_dim)
q, k, v = qkv.unbind(dim=2)
q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
# Apply RoPE to current position
cache_size = self.kv_cache.size
sin, cos = self.rope(cache_size + T)
# Only apply to new positions
sin_new = sin[:, :, cache_size:cache_size+T, :]
cos_new = cos[:, :, cache_size:cache_size+T, :]
q = (q * cos_new) + (_rotate_half(q) * sin_new)
k = (k * cos_new) + (_rotate_half(k) * sin_new)
# Normalize
q, k = norm(q), norm(k)
# Update KV cache
k_cache, v_cache = self.kv_cache.update(k, v)
# Compute attention with cached keys/values
# Get only the valid portion of cache
valid_cache_size = self.kv_cache.size
k_valid = k_cache[:, :, :valid_cache_size, :]
v_valid = v_cache[:, :, :valid_cache_size, :]
# Standard attention computation
out = F.scaled_dot_product_attention(q, k_valid, v_valid, is_causal=False)
out = out.transpose(1, 2).contiguous().view(B, T, C)
return self.o_proj(out)
class TransformerBlock(nn.Module):
"""Transformer block with pre-norm architecture"""
def __init__(self, n_emb: int, n_head: int, context_len: int, dropout: float = 0.1):
super().__init__()
self.attn = OptimizedAttention(n_emb, n_head, context_len, dropout)
# Feed-forward network with ReLU squared
self.ffn = nn.Sequential(
nn.Linear(n_emb, 4 * n_emb, bias=False),
ReLUSquared(),
nn.Linear(4 * n_emb, n_emb, bias=False),
nn.Dropout(dropout)
)
def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None, use_cache: bool = False) -> torch.Tensor:
# Pre-norm architecture with residual connections
x = x + self.attn(norm(x), mask, use_cache=use_cache)
x = x + self.ffn(norm(x))
return x
class GPT(nn.Module):
"""GPT model optimized for multi-GPU training - matching train.py architecture"""
def __init__(self, vocab_size: int, n_layer: int = 6, n_head: int = 6,
n_emb: int = 384, context_len: int = 256, dropout: float = 0.1):
super().__init__()
self.vocab_size = vocab_size
self.context_len = context_len
self.n_layer = n_layer
self.n_head = n_head
self.n_emb = n_emb
# Token embeddings
self.wte = nn.Embedding(vocab_size, n_emb)
self.drop = nn.Dropout(dropout)
# Transformer blocks
self.layers = nn.ModuleList([
TransformerBlock(n_emb, n_head, context_len, dropout)
for _ in range(n_layer)
])
# Output head with weight tying
self.head = nn.Linear(n_emb, vocab_size, bias=False)
# Weight tying - delete the head weight first to avoid issues
del self.head.weight
self.head.weight = self.wte.weight # Share the embedding weights
# Initialize weights
self.apply(self._init_weights)
# Pre-compute causal mask (not used in this architecture but kept for compatibility)
self.register_buffer("causal_mask", torch.triu(
torch.ones(context_len, context_len), diagonal=1
).bool())
def _init_weights(self, module):
"""Initialize weights with appropriate scaling"""
if isinstance(module, nn.Linear):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
if module.bias is not None:
torch.nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
def forward(self, idx: torch.Tensor, targets=None, use_cache: bool = False) -> torch.Tensor:
B, T = idx.shape
# Token embeddings
tok_emb = self.wte(idx)
x = self.drop(tok_emb)
# Get causal mask
mask = self.causal_mask[:T, :T] if T <= self.context_len else None
# Forward through transformer layers
for layer in self.layers:
x = layer(x, mask, use_cache=use_cache)
# Final norm and output projection
x = norm(x)
if targets is not None:
# if we are given some desired targets also calculate the loss
logits = self.head(x)
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
else:
# inference-time mini-optimization: only forward the lm_head on the very last position
logits = self.head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
loss = None
return logits, loss
def init_kv_caches(self, batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16, device: torch.device = None):
"""Initialize KV caches for all attention layers"""
for layer in self.layers:
layer.attn.init_kv_cache(batch_size, max_seq_len, dtype, device)
def reset_kv_caches(self):
"""Reset all KV caches"""
for layer in self.layers:
layer.attn.reset_kv_cache()
def disable_kv_caches(self):
"""Disable all KV caches"""
for layer in self.layers:
layer.attn.disable_kv_cache()
@torch.no_grad()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None, use_cache=True,
repetition_penalty=1.0, repetition_window=128):
"""
Generate tokens using the model with optional KV caching and repetition penalty.
Matches the generate function from train.py
"""
device = idx.device
B, T = idx.shape
# Initialize KV cache if requested
if use_cache:
# Use the model's dtype (from embeddings) not the input indices dtype
model_dtype = self.wte.weight.dtype
self.init_kv_caches(B, T + max_new_tokens, dtype=model_dtype, device=device)
# Generate tokens
generated = idx
for _ in range(max_new_tokens):
# Get logits for next token
if use_cache and generated.shape[1] > T:
# Only feed the new token(s) when using cache
logits, _ = self(generated[:, -1:], use_cache=True)
else:
# Feed full sequence (first iteration or no cache)
# Crop to context length if needed
idx_cond = generated if generated.shape[1] <= self.context_len else generated[:, -self.context_len:]
logits, _ = self(idx_cond, use_cache=use_cache)
# Get logits for last position
logits = logits[:, -1, :] / temperature
# Apply repetition penalty to discourage generating tokens that have recently appeared.
if repetition_penalty != 1.0:  # 1.0 means "no penalty", so skip the loop entirely
B, T = generated.shape
for b in range(B):
# Get the set of unique tokens in the window to penalize
window_start = max(0, T - repetition_window)
recent_tokens = set(generated[b, window_start:].tolist())
# Apply penalty to the logits of these tokens
for token_id in recent_tokens:
# Correctly apply penalty to both positive and negative logits
if logits[b, token_id] > 0:
logits[b, token_id] /= repetition_penalty
else:
logits[b, token_id] *= repetition_penalty
# Optional top-k sampling
if top_k is not None:
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
logits[logits < v[:, [-1]]] = -float('Inf')
# Sample from distribution
probs = F.softmax(logits, dim=-1)
idx_next = torch.multinomial(probs, num_samples=1)
# Append to generated sequence
generated = torch.cat((generated, idx_next), dim=1)
# Clean up cache
if use_cache:
self.disable_kv_caches()
return generated
# ============================================================================
# LOADING AND INFERENCE
# ============================================================================
def load_shakespeare_checkpoint(checkpoint_path: str, device: torch.device) -> Optional[Tuple[GPT, dict, int]]:
"""Load model from checkpoint with metadata"""
try:
print(f"Loading checkpoint from {checkpoint_path}...")
if not os.path.exists(checkpoint_path):
print(f"Checkpoint not found at {checkpoint_path}")
return None
# Load checkpoint
checkpoint = torch.load(checkpoint_path, map_location=device)
# Extract configuration
if 'model_args' in checkpoint:
model_args = checkpoint['model_args']
else:
print("Warning: No model_args in checkpoint, using defaults")
model_args = {
'n_layer': N_LAYER,
'n_head': N_HEAD,
'n_embd': N_EMB,
'block_size': CONTEXT_LEN,
'vocab_size': 50257, # GPT-2 vocab size
'dropout': DROPOUT
}
vocab_size = model_args.get('vocab_size', 50257)
# Create model
model = GPT(
vocab_size=vocab_size,
n_layer=model_args.get('n_layer', N_LAYER),
n_head=model_args.get('n_head', N_HEAD),
n_emb=model_args.get('n_embd', N_EMB),
context_len=model_args.get('block_size', CONTEXT_LEN),
dropout=model_args.get('dropout', DROPOUT)
)
# Load state dict
state_dict = checkpoint['model']
# Remove unwanted prefix if present
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()):
if k.startswith(unwanted_prefix):
state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict)
model.to(device)
model.eval()
print(f"Successfully loaded model:")
print(f" - Layers: {model_args.get('n_layer', N_LAYER)}")
print(f" - Heads: {model_args.get('n_head', N_HEAD)}")
print(f" - Embedding: {model_args.get('n_embd', N_EMB)}")
print(f" - Context: {model_args.get('block_size', CONTEXT_LEN)}")
print(f" - Vocab: {vocab_size}")
print(f" - Parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")
return model, model_args, vocab_size
except Exception as e:
print(f"Failed to load checkpoint: {e}")
import traceback
traceback.print_exc()
return None
@app.function(
image=image,
gpu="T4",
volumes={"/data": data_volume},
timeout=600
)
def run_inference():
"""Run inference with model trained by train_modal_standalone.py"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load model from checkpoint first to determine dataset type
checkpoint_path = "/data/checkpoints/shakespeare_token/ckpt.pt"
result = load_shakespeare_checkpoint(checkpoint_path, device)
if result is None:
print("Failed to load model!")
return
model, model_args, model_vocab_size = result
# Check if custom tokenizer exists
if not check_tokenizer_exists(model_vocab_size, "/data"):
print("Custom tokenizer not found! Please create it first.")
return
# Token-level encoding/decoding using custom tokenizer
print(f"Using custom BPE tokenizer with vocab_size={model_vocab_size}")
def encode(s: str) -> torch.Tensor:
return torch.tensor(encode_tokens(s, vocab_size=model_vocab_size), dtype=torch.long)
def decode(t: torch.Tensor) -> str:
return decode_tokens(t.tolist(), vocab_size=model_vocab_size)
# Sample prompts (matching train.py)
sample_prompts = [
"O God, O God",
"What is",
"To be or not to be",
"KING HENRY",
"The quality of mercy",
"All the world's a stage",
"Now is the winter",
"If music be"
]
print("\n" + "="*80)
print("Generating text samples with different prompts:")
print("="*80)
with torch.no_grad():
for i, prompt in enumerate(sample_prompts):
print(f"\n[{i+1}] Prompt: '{prompt}'")
# Encode prompt
prompt_encoded = encode(prompt).unsqueeze(0).to(device)
# Generate text with settings matching train.py
generated = model.generate(
prompt_encoded,
max_new_tokens=100, # Generate 100 characters
temperature=0.8, # Moderate temperature
top_k=40, # Top-k sampling
use_cache=True, # Use KV cache for efficiency
repetition_penalty=1.2, # Use a more reasonable repetition penalty
repetition_window=128 # Check last 128 tokens
)
# Decode and print
generated_text = decode(generated[0])
print(f"Generated: {generated_text}")
print("-" * 80)
# Generate some unconditional samples
print("\n" + "="*80)
print("Generating unconditional samples:")
print("="*80)
for i in range(3):
# Start with newline character
start_token = encode("\n").unsqueeze(0).to(device)
generated = model.generate(
start_token,
max_new_tokens=200, # Longer for unconditional
temperature=0.8,
top_k=40,
use_cache=True,
repetition_penalty=1.2,
repetition_window=128
)
generated_text = decode(generated[0])
print(f"\n[Unconditional {i+1}]")
print(f"Generated: {generated_text}")
print("-" * 80)
print("\nInference complete!")
@app.local_entrypoint()
def main():
"""Run model inference"""
print("Starting model inference on Modal...")
run_inference.remote()
if __name__ == "__main__":
main()

The Gentle Art of Teaching Machines to Speak

A Journey Through Semantic Reinforcement Learning

For the curious mind who has just discovered that language models can learn, and wonders if there might be a kinder way to teach them.

In the hushed moments before dawn, ten thousand starlings lift from a field as one—not because any single bird commands them, but because each learns from its neighbors' subtle shifts, creating a collective intelligence far greater than the sum of its parts.

      >         >
>      >    >        >
   >      >      >      >
      >      >     >
>     >      >        >   (Each '>' is a bird, learning from its neighbors)
   >    >       >    >
 >        >       >

This document tells the story of teaching machines to learn language the way starlings learn to flock: not through rigid rules, but through gentle rewards that flow like ripples across a pond of possibilities.


Chapter 1: The Question That Changes Everything

You're sitting in your favorite coffee shop, laptop open, when you overhear someone say: "The cat sat on the mat."

But here's what catches your attention - they pause slightly before "mat," as if considering other possibilities. In that tiny moment, you realize something profound: they might have said "rug," or "cushion," or "floor," and the sentence would still make perfect sense.

This is where our journey begins. Not with equations, but with a simple question: Why do we punish language models for being almost right?

Let's explore this together.


Chapter 2: The Strict Teacher and the Gentle Guide

Imagine you're learning to paint, and your teacher stands behind you with a red pen. Every time your brushstroke isn't exactly perfect, they mark it wrong. This is how we've traditionally trained language models - with something called cross-entropy loss.

# The Strict Teacher's Rule (one-hot cross-entropy)
# The target puts probability 1.0 on "mat" and 0.0 on every other word,
# so the loss only measures how much probability the model gave the exact word:
loss = -log(P_model("mat"))
# Probability spent on "rug" earns no credit at all; it is scored
# exactly as if it had been spent on "airplane".

But you and I know this isn't how learning works. When you were learning to speak as a child, your parents didn't say "You said 'doggy' instead of 'dog,' so that's completely wrong." They smiled and encouraged you because you were close.

Let's meet our gentle guide - a way of teaching that says: "Ah, 'rug' is actually quite close to 'mat' in meaning. Let's give you credit for understanding the concept, even if the exact word differs."

Cross-Entropy (The Strict Teacher)      RL with Semantic Rewards (The Gentle Guide)
┌───────────────────────────────┐         ┌───────────────────────────────────────┐
│ Input: "The cat sat on the ___" │         │ Input: "The cat sat on the ___"       │
│                               │         │                                       │
│ Target: "mat" (Prob 1.0) 🖍️    │         │ Rewarded words:                       │
│ "rug":     (Prob 0.0) ❌        │         │   "mat":     100pts ⭐                 │
│ "cushion": (Prob 0.0) ❌        │         │   "rug":      95pts ✨                 │
│ "floor":   (Prob 0.0) ❌        │         │   "cushion":   80pts 👍                 │
│                               │         │   "floor":     60pts                  │
│ Model punished for "rug".     │         │ Model encouraged for "rug".         │
└───────────────────────────────┘         └───────────────────────────────────────┘
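A minimal sketch of the gentle guide's scoring, assuming we already have an embedding vector for each word; the embedding values and the temperature τ = 0.1 below are invented purely for illustration:

import torch
import torch.nn.functional as F

# Hypothetical 3-dimensional embeddings for the target word and its rivals.
emb = {
    "mat":      torch.tensor([0.9, 0.1, 0.0]),
    "rug":      torch.tensor([0.8, 0.2, 0.1]),
    "cushion":  torch.tensor([0.6, 0.4, 0.2]),
    "floor":    torch.tensor([0.5, 0.1, 0.5]),
    "airplane": torch.tensor([0.0, 0.9, 0.9]),
}
tau = 0.1
target = emb["mat"]

rewards = {}
for word, vec in emb.items():
    sim = F.cosine_similarity(vec, target, dim=0)   # how close in meaning?
    rewards[word] = torch.exp(sim / tau)            # the Formula of Harmony

total = sum(rewards.values())
for word, r in rewards.items():
    print(f"{word:9s} reward={r.item():10.1f}  share of credit={(r / total).item():.4f}")
# "mat" and "rug" dominate the credit; "airplane" keeps only a sliver of
# probability instead of receiving an infinite penalty.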

Chapter 3: The Geometry of Feeling

Imagine you are an artist, standing before a canvas. You are not just painting objects; you are painting the feeling of a room. Your goal is to paint "a cozy evening by the fire." This feeling, this context, is the soul of your artwork.

Now, you must paint a specific object: the chair. In the world of traditional programming, there is only one "right" answer. If you don't paint that exact chair, you have failed.

But art is not about single right answers. It's about harmony. What if you painted a stool? Or an ottoman? Or a plush beanbag? These are not "wrong." In the context of "a cozy evening by the fire," they might be beautiful, harmonious choices.

This is the heart of our new approach. We teach the machine to think like an artist, not an accountant.

The Artist's Dilemma: Truth vs. Harmony

Every choice our model makes is an attempt to solve a beautiful dilemma:

  1. Truth: How close is my choice to the specific subject I was asked to paint? (the chair)
  2. Harmony: How well does my choice fit the overall feeling of the painting? (a cozy evening by the fire)

We can imagine these two forces in a "space of meaning." In this space, ideas and feelings have a location. The "meaning" of chair is a point. The "meaning" of stool is a point nearby. But the "meaning" of airplane is very, very far away.

The "feeling" of the context—our cozy evening—also has a location. It's like a warm, glowing region in our space of meaning.

      The Canvas of Meaning (A 2D Sketch of a 768-Dimensional Reality)

          |
          |           x "airplane" (far from both Truth and Harmony)
          |
          |    (The Warm Glow of Context: "cozy evening")
          |   /-------------------------------------------\
          |  |                                             |
          |  |   * "stool" (A great choice! Close to both) |
          |  |      * "chair" (The Ground Truth)           |
          |  |                                             |
          |  | * "beanbag" (Another harmonious choice)     |
          |   \-------------------------------------------/
          |
   -------+------------------------------------------------------------>
          |

Our model's goal is to make a choice that lands inside that warm, glowing area of Harmony, while also staying as close as possible to the point of Truth.


Chapter 4: The Mathematics of Mentorship

How does an artist truly learn? Not by being told "right" or "wrong," but through the gentle guidance of a mentor. The math that powers our model is not a cold equation, but the voice of a patient mentor, whispering encouragement.

Let's look at the mentor's guiding principle:

$$\text{Learning} = \text{Average over all possible choices} \big[ (\text{Mentor's Feedback}) \times (\text{Impact of Surprise}) \big]$$

This looks complex, but it's the simple rhythm of learning. Let's feel it, step-by-step.

Step 1: P(choice | context) - The Student's Instinct

Before the student even touches the brush, they have a gut feeling. "Given the cozy scene so far," they think, "I'm 50% sure I should paint a chair, 20% sure I should paint a stool, and only 1% sure I should paint an airplane."

This is P, the model's initial probability or "instinct" for every possible choice.

Step 2: r(choice) - The Mentor's Feedback

After considering a choice, the mentor gives feedback. This isn't a simple "good" or "bad." It's a rich, nuanced reward, r.

How does the mentor decide on the feedback? They use the Formula of Harmony:

$$r(\text{choice}) = e^{\frac{\text{similarity}(\text{choice}, \text{Truth})}{\tau}}$$

This is the soul of the mentor's wisdom:

  • similarity(choice, Truth): The mentor first asks, "How similar is the student's idea to the original subject?" They compare the "meaning" of stool to the "meaning" of chair. This is a number between -1 and 1. A high number means high similarity.
  • τ (Tau) - The Mentor's Mood: Tau is the strictness knob.
    • A low τ means the mentor is in a "by-the-book" mood. Only choices very similar to the Truth get good feedback.
    • A high τ means the mentor is in an "express-yourself" mood. A wider range of creative choices will be rewarded.
  • e (The Exponential) - The "Aha!" Moment: This is the magic spark. The exponential function takes the similarity score and turns it into a jolt of encouragement. Good ideas don't just get good scores; they get exponentially great scores. It makes the brilliant choices feel truly resonant, creating a powerful signal for the student to follow.

A choice like stool earns a high reward. A choice like airplane earns a comparatively tiny one, which all but vanishes once the rewards are normalized against its far stronger rivals.
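To put rough numbers on it (with an illustrative τ = 0.1): a stool whose similarity to chair is 0.9 earns r = e^(0.9/0.1) = e^9 ≈ 8100, while an airplane with similarity 0.05 earns r = e^0.5 ≈ 1.6. Once these rewards are normalized into a target distribution (as in Chapter 5), the airplane's share is a small fraction of a percent.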

Step 3: log(P) - The Impact of Surprise

We don't just learn from feedback; we learn most when we are surprised. The log function captures this.

  • If the student was very confident in a choice (P was high) that turned out to be bad, the learning impact is a big, memorable "ouch."
  • If the student had very little confidence in a choice (P was low) but the mentor gave it high praise, the learning impact is a powerful "Wow, I should do that more often!"

The log ensures that the biggest lessons come from the biggest surprises.

Putting It All Together: A Day of Learning

At the end of the day, the student doesn't just learn from the one choice they made. The mentor encourages them to reflect on all the choices they could have made.

Our loss function, L_RL, is this reflection process captured in mathematics:

$$L_{RL} = -\mathbb{E}\big[ r(\text{choice}) \cdot \log P(\text{choice}) \big]$$

  • We consider every choice.
  • We multiply the Mentor's Feedback (r) by the Impact of Surprise (log P). This gives us the "learning value" of that choice.
  • The E says we take the average "learning value" over all possible choices.
  • The final - sign simply flips our perspective. Instead of maximizing our total "learning value," we frame it as minimizing our total "regret." It's the same goal, viewed from a different angle.

This is not a formula for punishment. It is a formula for reflection. It teaches the model to weigh its instincts against the mentor's feedback, to learn from surprises, and to constantly refine its artistic sensibilities. It teaches the machine not just to speak, but to find the poetry in language.


Chapter 5: The Beautiful Equivalence - When Two Rivers Meet

A Critical Discovery: Deep in the mathematical wilderness, two paths that seemed to diverge—cross-entropy with soft targets and policy gradient reinforcement learning—suddenly converge at a hidden clearing. This is one of those rare moments in mathematics when the universe reveals its underlying unity.

Picture two mountain streams, each carving its own path down opposite slopes. One flows through the valley of cross-entropy loss, the other through the canyon of policy gradients. Yet by some beautiful accident of topology, they meet at the same crystalline pool.

                  Policy Gradient (RL)                  Soft-Target CE
                  ┌───────────────────────┐             ┌───────────────────────┐
                  │ Sample action y       │             │ Compute r(y) for all y│
                  │ Compute r(y)          │             │ Normalize to q(y)     │
                  │ Update via gradient   │             │ Weighted CE gradient  │
                  └──────────┬────────────┘             └──────────┬────────────┘
                             │ (Noisy, high variance)               │ (Stable, low variance)
                             └──────────────────────┐   ┌──────────────────────┘
                                                    │   │
                                                    ▼   ▼
                                              ┌─────────────┐
                                              │ Same Update! │
                                              └─────────────┘

Now, the mathematics that unites them:

The policy gradient objective maximizes expected rewards through sampling:

$$\nabla_\theta J_{PG} = \mathbb{E}_{y \sim P_\theta}\big[r(y) \cdot \nabla_\theta \log P_\theta(y)\big]$$

Cross-entropy minimizes prediction error with soft targets:

$$\nabla_\theta L_{CE} = -\sum_y q(y) \cdot \nabla_\theta \log P_\theta(y)$$

The bridge? Normalize rewards into soft targets:

$$q(y) = \frac{r(y)}{\sum_{y'} r(y')}$$

This equivalence means RL's flexible rewards can be computed efficiently like supervised learning, sidestepping sampling's noise.

Why it matters: The noisy, high-variance world of reinforcement learning, which often requires complex tricks to stabilize, can be replaced by a simple, stable, and computationally efficient weighted cross-entropy loss. We get the exploratory, semantic benefits of RL rewards without the chaos of sampling.
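A minimal sketch of the recipe this equivalence suggests, using toy numbers; in the real model the logits come from the transformer and the rewards from the Formula of Harmony:

import torch
import torch.nn.functional as F

# Toy setup: five candidate words, the model's current logits, and their rewards r(y).
logits  = torch.tensor([2.0, 1.5, 0.5, 0.2, -1.0], requires_grad=True)
rewards = torch.tensor([22000.0, 18800.0, 5200.0, 1200.0, 2.0])

# The bridge: normalize rewards into soft targets q(y) = r(y) / sum(r).
q = rewards / rewards.sum()

# Soft-target cross-entropy: L = -sum_y q(y) * log P(y). No sampling anywhere.
log_p = F.log_softmax(logits, dim=-1)
loss = -(q * log_p).sum()
loss.backward()

print(loss.item())   # the "regret" under the mentor's distribution
print(logits.grad)   # softmax(logits) - q: push the model's P toward q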

The Elegant Unity
Maximize meaning, minimize regret—with the efficiency of supervised learning.


Chapter 6: Making it Scale - Active Token Filtering

While the equivalence is powerful, computing the reward r(y) for every token in a large vocabulary is computationally prohibitive. This is where we embrace the power of search to make the problem tractable.

The core idea is to dynamically select a small, high-potential subset of the vocabulary, Y_candidate, for which we compute rewards and loss. This focuses computation on distinguishing between plausible alternatives, rather than on irrelevant tokens.

The Process

  1. Candidate Selection: For a given ground truth token y*, we use a fast approximate nearest neighbor (ANN) search on the model's own learned embeddings to find the K tokens closest in meaning to y*. This is a general search method that scales beautifully.

         ▲ meaning dim 2
         │
         │          (Candidate Set Y_candidate)
         │       ┌───────────────────┐
         │       │  "rug" , "carpet" │
         │       │    * "mat" (y*)   │
         │       │  "cushion"        │
         │       └───────────────────┘
         │
         └──────────────────────────────────► meaning dim 1
               (distant word, ignored) * "cloud"
    
  2. Restricted Loss Calculation: We then compute the soft-target cross-entropy loss only for this small set of K candidate tokens. For all other tokens, the target probability is zero.
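A rough sketch of both steps, using exact top-k cosine similarity as a stand-in for a true ANN index (at scale a library such as FAISS would supply the search); every size and tensor below is a toy value:

import torch
import torch.nn.functional as F

def restricted_soft_ce(logits, emb, y_star, K=4, tau=0.1):
    """Soft-target cross-entropy over the K tokens nearest in meaning to y_star.

    logits: [V] model scores; emb: [V, D] token embeddings (the model's own,
    or pre-trained ones during the bootstrap epoch); y_star: ground-truth id.
    """
    sims = F.cosine_similarity(emb, emb[y_star].unsqueeze(0), dim=-1)  # [V]
    cand = torch.topk(sims, K).indices              # Y_candidate: the K nearest ids

    q = torch.zeros_like(logits)
    q[cand] = torch.exp(sims[cand] / tau)           # rewards only for the candidates
    q = q / q.sum()                                 # normalize to soft targets;
                                                    # every other token stays at 0
    return -(q * F.log_softmax(logits, dim=-1)).sum()

# Toy usage: a 1024-token vocabulary, 384-dim embeddings, ground-truth token 17.
V, D = 1024, 384
loss = restricted_soft_ce(torch.randn(V), torch.randn(V, D), y_star=17)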

Bootstrapping with External Embeddings

This approach requires meaningful embeddings to work. At the very start of training, the model's embeddings are random. To solve this, we can bootstrap the process for the first epoch using pre-trained embeddings (e.g., Word2Vec or FastText) built from the same vocabulary. After the first epoch, the model's own embeddings have learned enough structure to take over.

This practical step makes the elegant theory work at scale, demonstrating how a simple, general search mechanism can unlock massive computational savings.


Chapter 7: The Path Forward - The Bitter Lesson of Scale

This journey through semantic reinforcement learning leads us to a profound conclusion, one that echoes a "bitter lesson" learned across the history of AI: general methods that leverage computation are ultimately the most effective.

We started with a simple, elegant idea: reward a model for being close in meaning, not just for being exactly right. We explored many complex, clever ways to enhance this idea—adaptive schedules, handcrafted regularizers, intricate curricula.

Yet, the most powerful and scalable path is the simplest:

  1. A simple, general reward: The "Formula of Harmony" (r(y) = exp(similarity/τ)) is not complicated. It's a general principle that relies on learned embeddings, which improve with scale.
  2. A simple, general learning algorithm: The equivalence to weighted cross-entropy allows us to sidestep the complexities of RL sampling and use a stable, efficient, and scalable supervised learning technique.
  3. A simple, general search method: Active token filtering via ANN search is a general method for dealing with large output spaces. It scales and improves as the underlying embeddings improve.

The lesson is not to build our own intelligence into the system with complex rules. The lesson is to create the right conditions for intelligence to emerge. By providing a simple, meaning-based reward signal and leveraging the immense power of computation and search, we allow the model to learn the rich, nuanced structure of language on its own.

The path forward is not more intricate hand-crafting. It is to embrace scale, to trust in simple and general learning principles, and to let the starlings teach themselves how to fly.


References

All mathematical formulations, performance claims, and technical assertions in this document are supported by peer-reviewed research.

[1] Bengio, Y., Ducharme, R., Vincent, P., & Jauvin, C. (2003). A neural probabilistic language model. Journal of Machine Learning Research, 3(Feb), 1137-1155.

"The idea is to learn a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences."

[2] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., & Wojna, Z. (2016). Rethinking the inception architecture for computer vision. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2818-2828).

"We refer to this change in ground-truth label distribution as label-smoothing regularization (LSR)."

[3] Manning, C. D., Clark, K., Hewitt, J., Khandelwal, U., & Levy, O. (2020). Emergent linguistic structure in artificial neural networks trained by self-supervision. Proceedings of the National Academy of Sciences, 117(48), 30046-30054.

"The one-hot nature of traditional language modeling objectives ignores the semantic relationships between words, leading to brittle representations that fail to capture linguistic richness."

[4] Williams, R. J. (1992). Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine Learning, 8(3-4), 229-256.

"This article presents a general class of associative reinforcement learning algorithms for connectionist networks containing stochastic units."

[5] Sutton, R. S., McAllester, D. A., Singh, S. P., & Mansour, Y. (2000). Policy gradient methods for reinforcement learning with function approximation. Advances in Neural Information Processing Systems, 12, 1057-1063.


"Language is not a set of rigid rules, but a flowing river of meaning. By teaching our models to swim with the current rather than against it, we open the door to truly intelligent language understanding."

import os
import sys
import time
import math
from pathlib import Path
import subprocess
from dataclasses import dataclass
import inspect
from typing import Optional, Tuple
import numpy as np
import requests
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import modal
N_GPUS = 4
GPU_TYPE = "A100"
# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ TRAINING CONFIG │
# ├─────────────────────────────────────────────────────────────────────────────┤
# │ │
# │ Token-Level Training Configuration: │
# │ ┌─────────────────────────────────────────────────────────────────────────┤
# │ │ Data: token-level tokenization with BPE │
# │ │ Vocab: 1024 tokens (custom Shakespeare tokenizer) │
# │ │ Context: 512 tokens per sequence │
# │ │ Batch: 128 sequences per batch │
# │ │ Model: 6-layer transformer, 384 dims, 6 heads │
# │ │ Training: 2 epochs with cosine learning rate decay │
# │ └─────────────────────────────────────────────────────────────────────────┤
# │ │
# └─────────────────────────────────────────────────────────────────────────────┘
CONFIG = {
"dataset_type": "token",
"vocab_size": 1024, # Custom tokenizer vocab size
"block_size": 512,
"batch_size": 128,
"out_dir": "/data/checkpoints/shakespeare_token",
"eval_interval": 50,
"log_interval": 10,
"eval_iters": 20,
"eval_only": False,
"always_save_checkpoint": True,
"init_from": "scratch",
"wandb_log": False,
"wandb_project": "nanogpt-shakespeare",
"wandb_run_name": "shakespeare-token-1",
"dataset": "shakespeare_tokens",
"gradient_accumulation_steps": 4,
"n_layer": 6,
"n_head": 6,
"n_embd": 384,
"dropout": 0.2,
"bias": False,
"num_epochs": 2.0,
"learning_rate": 1e-3,
"max_iters": None,
"weight_decay": 1e-1,
"beta1": 0.9,
"beta2": 0.95,
"grad_clip": 1.0,
"decay_lr": True,
"warmup_iters": None,
"lr_decay_iters": None,
"min_lr": 1e-4,
"backend": "nccl",
"device": "cuda",
"dtype": "bfloat16",
"compile": True,
}
def norm(x: torch.Tensor) -> torch.Tensor:
return F.rms_norm(x, (x.size(-1),))
def _rotate_half(x: torch.Tensor) -> torch.Tensor:
x1, x2 = x[..., ::2], x[..., 1::2]
return torch.stack((-x2, x1), dim=-1).flatten(-2)
class RotaryCache(nn.Module):
def __init__(self, head_dim: int, max_len: int):
super().__init__()
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2) / head_dim))
t = torch.arange(max_len)
freqs = torch.einsum("i,j->ij", t, inv_freq)
sin, cos = freqs.sin(), freqs.cos()
self.register_buffer("sin_base", sin, persistent=False)
self.register_buffer("cos_base", cos, persistent=False)
def forward(self, seq_len: int):
sin = self.sin_base[:seq_len].repeat_interleave(2, dim=-1)
cos = self.cos_base[:seq_len].repeat_interleave(2, dim=-1)
return sin[None, None, :, :], cos[None, None, :, :]
class KVCache(nn.Module):
def __init__(
self,
batch_size: int,
max_seq_len: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype = torch.bfloat16,
) -> None:
super().__init__()
cache_shape = (batch_size, num_kv_heads, max_seq_len, head_dim)
self.register_buffer(
"k_cache", torch.zeros(cache_shape, dtype=dtype), persistent=False
)
self.register_buffer(
"v_cache", torch.zeros(cache_shape, dtype=dtype), persistent=False
)
self.register_buffer(
"cache_pos", torch.arange(0, cache_shape[2]), persistent=False
)
self.batch_size = batch_size
self.max_seq_len = max_seq_len
def reset(self) -> None:
self.k_cache.zero_()
self.v_cache.zero_()
self.cache_pos -= self.size
@property
def size(self) -> int:
return self.cache_pos[0].item()
def update(
self, k_val: torch.Tensor, v_val: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
bsz, _, seq_len, _ = k_val.shape
if bsz > self.k_cache.shape[0]:
raise ValueError(
f"Cache batch size is {self.k_cache.shape[0]} but got {bsz}"
)
assert (self.cache_pos[0] + seq_len) <= self.k_cache.shape[2]
k_out = self.k_cache
v_out = self.v_cache
cache_start = self.cache_pos[0].item()
cache_end = cache_start + seq_len
k_out[:, :, cache_start:cache_end] = k_val
v_out[:, :, cache_start:cache_end] = v_val
self.cache_pos.add_(seq_len)
return k_out, v_out
class ReLUSquared(nn.Module):
def forward(self, x: torch.Tensor) -> torch.Tensor:
return F.relu(x).square()
class OptimizedAttention(nn.Module):
def __init__(self, n_emb: int, n_head: int, context_len: int, dropout: float = 0.1):
super().__init__()
self.n_head = n_head
self.n_emb = n_emb
self.head_dim = n_emb // n_head
self.qkv = nn.Linear(n_emb, 3 * n_emb, bias=False)
self.o_proj = nn.Linear(n_emb, n_emb, bias=False)
self.dropout = nn.Dropout(dropout)
max_seq = context_len
self.rope = RotaryCache(self.head_dim, max_seq)
self.use_flash_attn = False
if hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
self.use_flash_attn = True
self.kv_cache = None
self.cache_enabled = False
def init_kv_cache(self, batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16):
self.kv_cache = KVCache(
batch_size=batch_size,
max_seq_len=max_seq_len,
num_kv_heads=self.n_head,
head_dim=self.head_dim,
dtype=dtype
)
self.cache_enabled = True
def reset_kv_cache(self):
if self.kv_cache is not None:
self.kv_cache.reset()
def disable_kv_cache(self):
self.cache_enabled = False
self.kv_cache = None
def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None, use_cache: bool = False) -> torch.Tensor:
B, T, C = x.shape
if use_cache and self.cache_enabled and self.kv_cache is not None:
return self._forward_with_cache(x, mask)
qkv = self.qkv(x).reshape(B, T, 3, self.n_head, self.head_dim)
q, k, v = qkv.unbind(dim=2)
q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
sin, cos = self.rope(T)
q = (q * cos) + (_rotate_half(q) * sin)
k = (k * cos) + (_rotate_half(k) * sin)
q, k = norm(q), norm(k)
out = F.scaled_dot_product_attention(q, k, v, is_causal=True, dropout_p=self.dropout.p if self.training else 0.0)
out = out.transpose(1, 2).contiguous().view(B, T, C)
return self.o_proj(out)
def _forward_with_cache(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
B, T, C = x.shape
qkv = self.qkv(x).reshape(B, T, 3, self.n_head, self.head_dim)
q, k, v = qkv.unbind(dim=2)
q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
cache_size = self.kv_cache.size
sin, cos = self.rope(cache_size + T)
sin_new = sin[:, :, cache_size:cache_size+T, :]
cos_new = cos[:, :, cache_size:cache_size+T, :]
q = (q * cos_new) + (_rotate_half(q) * sin_new)
k = (k * cos_new) + (_rotate_half(k) * sin_new)
q, k = norm(q), norm(k)
k_cache, v_cache = self.kv_cache.update(k, v)
valid_cache_size = self.kv_cache.size
k_valid = k_cache[:, :, :valid_cache_size, :]
v_valid = v_cache[:, :, :valid_cache_size, :]
out = F.scaled_dot_product_attention(q, k_valid, v_valid, is_causal=False)
out = out.transpose(1, 2).contiguous().view(B, T, C)
return self.o_proj(out)
class TransformerBlock(nn.Module):
def __init__(self, n_emb: int, n_head: int, context_len: int, dropout: float = 0.1):
super().__init__()
self.attn = OptimizedAttention(n_emb, n_head, context_len, dropout)
self.ffn = nn.Sequential(
nn.Linear(n_emb, 4 * n_emb, bias=False),
ReLUSquared(),
nn.Linear(4 * n_emb, n_emb, bias=False),
nn.Dropout(dropout)
)
def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None, use_cache: bool = False) -> torch.Tensor:
x = x + self.attn(norm(x), mask, use_cache=use_cache)
x = x + self.ffn(norm(x))
return x
# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ GPT MODEL ARCHITECTURE │
# ├─────────────────────────────────────────────────────────────────────────────┤
# │ │
# │ Input: Token IDs [B, T] │
# │ ↓ │
# │ ┌─────────────────────────────────────────────────────────────────────────┤
# │ │ Token Embedding (wte): vocab_size → n_emb │
# │ │ [B, T] → [B, T, 384] │
# │ └─────────────────────────────────────────────────────────────────────────┤
# │ ↓ │
# │ ┌─────────────────────────────────────────────────────────────────────────┤
# │ │ Dropout Layer │
# │ └─────────────────────────────────────────────────────────────────────────┤
# │ ↓ │
# │ ┌─────────────────────────────────────────────────────────────────────────┤
# │ │ TransformerBlock #1: │
# │ │ ├─ RMS Norm → Multi-Head Attention (6 heads) → Residual │
# │ │ └─ RMS Norm → FFN (384→1536→384) → Residual │
# │ └─────────────────────────────────────────────────────────────────────────┤
# │ ↓ │
# │ ┌─────────────────────────────────────────────────────────────────────────┤
# │ │ TransformerBlock #2-6: (same structure) │
# │ └─────────────────────────────────────────────────────────────────────────┤
# │ ↓ │
# │ ┌─────────────────────────────────────────────────────────────────────────┤
# │ │ Final RMS Norm │
# │ └─────────────────────────────────────────────────────────────────────────┤
# │ ↓ │
# │ ┌─────────────────────────────────────────────────────────────────────────┤
# │ │ Language Modeling Head: n_emb → vocab_size │
# │ │ [B, T, 384] → [B, T, 1024] (tied weights with embedding) │
# │ └─────────────────────────────────────────────────────────────────────────┤
# │ ↓ │
# │ Output: Logits over vocabulary [B, T, vocab_size] │
# │ │
# └─────────────────────────────────────────────────────────────────────────────┘
class GPT(nn.Module):
def __init__(self, vocab_size: int, n_layer: int = 6, n_head: int = 6,
n_emb: int = 384, context_len: int = 256, dropout: float = 0.1):
super().__init__()
self.vocab_size = vocab_size
self.context_len = context_len
self.n_layer = n_layer
self.n_head = n_head
self.n_emb = n_emb
self.wte = nn.Embedding(vocab_size, n_emb)
self.drop = nn.Dropout(dropout)
self.layers = nn.ModuleList([
TransformerBlock(n_emb, n_head, context_len, dropout)
for _ in range(n_layer)
])
self.head = nn.Linear(n_emb, vocab_size, bias=False)
del self.head.weight
self.head.weight = self.wte.weight
self.apply(self._init_weights)
self.register_buffer("causal_mask", torch.triu(
torch.ones(context_len, context_len), diagonal=1
).bool())
def _init_weights(self, module):
if isinstance(module, nn.Linear):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
if module.bias is not None:
torch.nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
def forward(self, idx: torch.Tensor, targets=None, use_cache: bool = False) -> torch.Tensor:
B, T = idx.shape
tok_emb = self.wte(idx)
x = self.drop(tok_emb)
mask = self.causal_mask[:T, :T] if T <= self.context_len else None
for layer in self.layers:
x = layer(x, mask, use_cache=use_cache)
x = norm(x)
if targets is not None:
logits = self.head(x)
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
else:
logits = self.head(x[:, [-1], :])
loss = None
return logits, loss
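# Two paths above: with targets, logits are computed for every position and scored
# with cross-entropy (ignore_index=-1 skips masked-out targets); without targets
# (generation), only the last position's prediction is needed, so the head is applied
# to x[:, [-1], :] to avoid projecting the full [B, T, vocab_size] tensor.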
def init_kv_caches(self, batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16):
for layer in self.layers:
layer.attn.init_kv_cache(batch_size, max_seq_len, dtype)
def reset_kv_caches(self):
for layer in self.layers:
layer.attn.reset_kv_cache()
def disable_kv_caches(self):
for layer in self.layers:
layer.attn.disable_kv_cache()
def get_num_params(self, non_embedding=True):
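# Note: because the lm head is weight-tied to wte, there is no separate embedding
# matrix to subtract here; the non_embedding flag is kept only for interface
# compatibility with nanoGPT-style callers.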
n_params = sum(p.numel() for p in self.parameters())
return n_params
def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
param_dict = {pn: p for pn, p in self.named_parameters()}
param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
optim_groups = [
{'params': decay_params, 'weight_decay': weight_decay},
{'params': nodecay_params, 'weight_decay': 0.0}
]
num_decay_params = sum(p.numel() for p in decay_params)
num_nodecay_params = sum(p.numel() for p in nodecay_params)
print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
use_fused = fused_available and device_type == 'cuda'
extra_args = dict(fused=True) if use_fused else dict()
optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
print(f"using fused AdamW: {use_fused}")
return optimizer
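# Split rationale: parameters with dim >= 2 (weight matrices, embeddings) receive
# weight decay, while 1-D parameters (biases and norm gains, where present) do not,
# since decaying those tends to hurt rather than regularize. Fused AdamW is enabled
# only when the installed torch build exposes it and training runs on CUDA.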
def estimate_mfu(self, fwdbwd_per_iter, dt):
N = self.get_num_params()
L, H, Q, T = self.n_layer, self.n_head, self.n_emb//self.n_head, self.context_len
flops_per_token = 6*N + 12*L*H*Q*T
flops_per_fwdbwd = flops_per_token * T
flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
flops_achieved = flops_per_iter * (1.0/dt)
flops_promised = 312e12
mfu = flops_achieved / flops_promised
return mfu
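# Rough sanity check of the accounting above (nanoGPT-style, following the PaLM
# appendix), assuming this config has roughly N ≈ 11M parameters:
#   6*N           ≈ 66M FLOPs/token            (matmul work in all weight layers)
#   12*L*H*Q*T    = 12*6*6*64*256 ≈ 7.1M FLOPs/token (attention scores and values)
#   one fwd+bwd of a 256-token sequence ≈ 73M * 256 ≈ 19 GFLOPs
# flops_promised = 312e12 is the A100 bf16 dense peak; swap the constant for other GPUs.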
@torch.no_grad()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None, use_cache=True):
device = idx.device
B, T = idx.shape
if use_cache:
self.init_kv_caches(B, T + max_new_tokens, dtype=self.wte.weight.dtype)  # cache K/V in the model's float dtype; idx.dtype is an integer id dtype and cannot hold activations
generated = idx
for _ in range(max_new_tokens):
if use_cache and generated.shape[1] > T:
logits, _ = self(generated[:, -1:], use_cache=True)
else:
idx_cond = generated if generated.shape[1] <= self.context_len else generated[:, -self.context_len:]
logits, _ = self(idx_cond, use_cache=use_cache)
logits = logits[:, -1, :] / temperature
if top_k is not None:
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
logits[logits < v[:, [-1]]] = -float('Inf')
probs = F.softmax(logits, dim=-1)
idx_next = torch.multinomial(probs, num_samples=1)
generated = torch.cat((generated, idx_next), dim=1)
if use_cache:
self.disable_kv_caches()
return generated
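# --- Hedged usage sketch (illustrative only; never called by the training script). ---
# Shows how generate() is typically driven with ids from the custom BPE tokenizer.
# The prompt_ids/device/max_new_tokens arguments are assumptions for the example.
def _generate_example_sketch(model, prompt_ids, device="cuda", max_new_tokens=64):
    """Sample a continuation from a trained GPT given a list of BPE token ids."""
    model.eval()
    idx = torch.tensor([prompt_ids], dtype=torch.long, device=device)  # shape [1, T]
    out = model.generate(idx, max_new_tokens=max_new_tokens,
                         temperature=0.8, top_k=40, use_cache=True)
    return out[0].tolist()  # flat list of token ids; pass to decode_tokens() for text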
# Global tokenizer instance to avoid reloading
_tokenizer = None
def check_tokenizer_exists(vocab_size=1024, data_root="/data"):
"""Check if the custom tokenizer exists and provide instructions if not."""
tokenizer_path = os.path.join(data_root, "tokenizers", f"shakespeare-bpe-{vocab_size}.json")
if not os.path.exists(tokenizer_path):
print(f"\n{'='*60}")
print(f"ERROR: Custom tokenizer not found!")
print(f"{'='*60}")
print(f"Expected tokenizer at: {tokenizer_path}")
print(f"\nTo create the tokenizer, run:")
print(f" modal run train_tokenizer_modal.py::train_bpe_tokenizer")
print(f"\nOr for multiple vocab sizes:")
print(f" modal run train_tokenizer_modal.py::vocab_size_grid_search")
print(f"{'='*60}\n")
return False
return True
def load_custom_tokenizer(vocab_size=1024, data_root="/data"):
"""Load the custom BPE tokenizer from the Modal volume."""
global _tokenizer
if _tokenizer is None:
from tokenizers import Tokenizer
tokenizer_path = os.path.join(data_root, "tokenizers", f"shakespeare-bpe-{vocab_size}.json")
if not check_tokenizer_exists(vocab_size, data_root):
raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
_tokenizer = Tokenizer.from_file(tokenizer_path)
print(f"Loaded custom tokenizer from {tokenizer_path}")
return _tokenizer
def decode_tokens(tokens):
"""Decode tokens using the custom tokenizer."""
tokenizer = load_custom_tokenizer(vocab_size=CONFIG.get("vocab_size", 1024))
return tokenizer.decode(tokens)
def encode_tokens(text):
"""Encode text using the custom tokenizer."""
tokenizer = load_custom_tokenizer(vocab_size=CONFIG.get("vocab_size", 1024))
encoding = tokenizer.encode(text)
return encoding.ids
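# --- Hedged sketch (illustrative only; never called). Round-trips text through the ---
# custom BPE tokenizer; assumes the tokenizer json already exists on the data volume.
def _tokenizer_roundtrip_sketch(text="To be, or not to be"):
    ids = encode_tokens(text)      # list[int], each id < vocab_size
    restored = decode_tokens(ids)  # should match the input up to tokenizer normalization
    return ids, restored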
def ensure_shakespeare_data_tokens(data_root="/data"):
vocab_size = CONFIG.get("vocab_size", 1024)
data_dir = os.path.join(data_root, f'shakespeare_tokens_bpe{vocab_size}')
os.makedirs(data_dir, exist_ok=True)
# Check if tokenized data already exists
train_path = os.path.join(data_dir, 'train.bin')
val_path = os.path.join(data_dir, 'val.bin')
if os.path.exists(train_path) and os.path.exists(val_path):
print(f"Tokenized data already exists in {data_dir}")
return data_dir
input_file_path = os.path.join(data_dir, 'input.txt')
if not os.path.exists(input_file_path):
data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
response = requests.get(data_url)
with open(input_file_path, 'w') as f:
f.write(response.text)
# Load custom tokenizer
tokenizer = load_custom_tokenizer(vocab_size=vocab_size, data_root=data_root)
with open(input_file_path, 'r') as f:
data = f.read()
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]
# Encode with custom tokenizer
train_encoding = tokenizer.encode(train_data)
val_encoding = tokenizer.encode(val_data)
train_ids = np.array(train_encoding.ids, dtype=np.uint16)
val_ids = np.array(val_encoding.ids, dtype=np.uint16)
train_ids.tofile(train_path)
val_ids.tofile(val_path)
print(f"Token data saved with BPE-{vocab_size}: train={len(train_ids)}, val={len(val_ids)}")
return data_dir
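# --- Hedged sketch (illustrative only; never called). The .bin files written above ---
# are plain uint16 token streams, so they can be inspected with a memory map without
# loading the whole file into RAM.
def _peek_token_bin_sketch(data_dir, n=20):
    tokens = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
    first = [int(t) for t in tokens[:n]]       # first n token ids
    return first, decode_tokens(first)         # ids plus their decoded text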
def train():
cfg = CONFIG
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
# Set longer timeout for short training runs
import datetime
timeout_minutes = 30 # Increased from default 10 minutes
init_process_group(backend=cfg['backend'], timeout=datetime.timedelta(minutes=timeout_minutes))
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
assert 'cuda' in device, "this script requires a GPU to run"
master_process = ddp_rank == 0
seed_offset = ddp_rank
assert cfg['gradient_accumulation_steps'] % ddp_world_size == 0
gradient_accumulation_steps = cfg['gradient_accumulation_steps'] // ddp_world_size
else:
master_process = True
seed_offset = 0
ddp_world_size = 1
device = cfg['device']
gradient_accumulation_steps = cfg['gradient_accumulation_steps']
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * cfg['batch_size'] * cfg['block_size']
print(f"tokens per iteration will be: {tokens_per_iter:,}")
if master_process:
os.makedirs(cfg['out_dir'], exist_ok=True)
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = 'cuda' if 'cuda' in device else 'cpu'
if device_type == 'cpu':
print("This training script requires a GPU, but is running on CPU.")
print("Exiting...")
sys.exit(1)
vocab_size = cfg.get('vocab_size', 1024)
data_dir = os.path.join("/data" if os.path.exists("/data") else "data", f"shakespeare_tokens_bpe{vocab_size}")
train_data_path = os.path.join(data_dir, 'train.bin')
if os.path.exists(train_data_path):
train_data = np.memmap(train_data_path, dtype=np.uint16, mode='r')
dataset_tokens = len(train_data)
print(f"Training dataset has {dataset_tokens:,} tokens")
# Sanity-check a sample of tokens against the vocabulary size (first 10k only, as a cheap proxy for the full file)
max_token = np.max(train_data[:min(10000, len(train_data))]) # Check first 10k tokens
if max_token >= vocab_size:
print(f"WARNING: Found token {max_token} >= vocab_size {vocab_size}")
print(f"Data may have been tokenized with a different vocabulary!")
print(f"Expected data in: {data_dir}")
raise ValueError(f"Token {max_token} exceeds vocabulary size {vocab_size}")
if cfg['num_epochs'] is not None:
iterations_per_epoch = dataset_tokens / tokens_per_iter
cfg['max_iters'] = int(math.ceil(cfg['num_epochs'] * iterations_per_epoch))
print(f"For {cfg['num_epochs']} epochs, need {cfg['max_iters']} iterations")
print(f"Each epoch is ~{iterations_per_epoch:.1f} iterations")
if cfg['warmup_iters'] is None:
cfg['warmup_iters'] = max(1, int(0.02 * cfg['max_iters']))
if cfg['lr_decay_iters'] is None:
cfg['lr_decay_iters'] = cfg['max_iters']
if cfg['max_iters'] < 20:
cfg['eval_interval'] = max(1, cfg['max_iters'] // 4)
cfg['log_interval'] = 1
cfg['eval_iters'] = min(5, cfg['eval_iters']) # Reduced from 50 to 5 for short runs
print(f"Adjusted for short run: eval_interval={cfg['eval_interval']}, log_interval={cfg['log_interval']}, eval_iters={cfg['eval_iters']}")
if cfg['max_iters'] < 10:
cfg['decay_lr'] = False
cfg['warmup_iters'] = 0
print("Disabled learning rate decay for very short run")
del train_data
else:
if cfg['max_iters'] is None:
raise ValueError("Cannot calculate max_iters: training data not found and max_iters not specified")
def get_batch(split):
batch_data_dir = data_dir
if split == 'train':
data = np.memmap(os.path.join(batch_data_dir, 'train.bin'), dtype=np.uint16, mode='r')
else:
data = np.memmap(os.path.join(batch_data_dir, 'val.bin'), dtype=np.uint16, mode='r')
ix = torch.randint(len(data) - cfg['block_size'], (cfg['batch_size'],))
x = torch.stack([torch.from_numpy((data[i:i+cfg['block_size']]).astype(np.int64)) for i in ix])
y = torch.stack([torch.from_numpy((data[i+1:i+1+cfg['block_size']]).astype(np.int64)) for i in ix])
if device_type == 'cuda':
x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
else:
x, y = x.to(device), y.to(device)
return x, y
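# Each sample pair is offset by one token: x = tokens[i : i+block_size] and
# y = tokens[i+1 : i+1+block_size], so position t of x is trained to predict
# position t of y (the next token). e.g. tokens [5, 9, 2, 7] give
#   x = [5, 9, 2], y = [9, 2, 7]  for block_size = 3.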
iter_num = 0
best_val_loss = 1e9
meta_vocab_size = cfg['vocab_size'] # Use custom tokenizer vocab size
print(f"Using custom BPE vocab_size = {meta_vocab_size}")
model_args = dict(
n_layer=cfg['n_layer'],
n_head=cfg['n_head'],
n_embd=cfg['n_embd'],
block_size=cfg['block_size'],
bias=cfg['bias'],
vocab_size=meta_vocab_size if meta_vocab_size is not None else 50304,
dropout=cfg['dropout']
)
if cfg['init_from'] == 'scratch':
print("Initializing a new model from scratch")
model = GPT(
vocab_size=model_args['vocab_size'],
n_layer=model_args['n_layer'],
n_head=model_args['n_head'],
n_emb=model_args['n_embd'],
context_len=model_args['block_size'],
dropout=model_args['dropout']
)
elif cfg['init_from'] == 'resume':
print(f"Resuming training from {cfg['out_dir']}")
ckpt_path = os.path.join(cfg['out_dir'], 'ckpt.pt')
checkpoint = torch.load(ckpt_path, map_location=device)
checkpoint_model_args = checkpoint['model_args']
for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
model_args[k] = checkpoint_model_args[k]
model = GPT(
vocab_size=model_args['vocab_size'],
n_layer=model_args['n_layer'],
n_head=model_args['n_head'],
n_emb=model_args['n_embd'],
context_len=model_args['block_size'],
dropout=model_args['dropout']
)
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
if k.startswith(unwanted_prefix):
state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict)
iter_num = checkpoint['iter_num']
best_val_loss = checkpoint['best_val_loss']
model.to(device)
scaler = torch.amp.GradScaler('cuda', enabled=(cfg['dtype'] == 'float16'))
optimizer = model.configure_optimizers(cfg['weight_decay'], cfg['learning_rate'],
(cfg['beta1'], cfg['beta2']), 'cuda')
if cfg['init_from'] == 'resume' and 'optimizer' in checkpoint:
optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None
if cfg['compile']:
print("compiling the model... (takes a ~minute)")
unoptimized_model = model
model = torch.compile(model)
if ddp:
model = DDP(model, device_ids=[ddp_local_rank])
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[cfg['dtype']]
ctx = torch.amp.autocast(device_type='cuda', dtype=ptdtype)
@torch.no_grad()
def estimate_loss():
out = {}
model.eval()
for split in ['train', 'val']:
losses = torch.zeros(cfg['eval_iters'])
for k in range(cfg['eval_iters']):
X, Y = get_batch(split)
with ctx:
logits, loss = model(X, Y)
losses[k] = loss.item()
out[split] = losses.mean()
model.train()
return out
def get_lr(it):
if it < cfg['warmup_iters']:
return cfg['learning_rate'] * (it + 1) / (cfg['warmup_iters'] + 1)
if it > cfg['lr_decay_iters']:
return cfg['min_lr']
decay_ratio = (it - cfg['warmup_iters']) / (cfg['lr_decay_iters'] - cfg['warmup_iters'])
assert 0 <= decay_ratio <= 1
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
return cfg['min_lr'] + coeff * (cfg['learning_rate'] - cfg['min_lr'])
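# Schedule shape: linear warmup to the peak LR, then cosine decay down to min_lr.
# Example values (illustrative, not the actual CONFIG): learning_rate=6e-4,
# min_lr=6e-5, warmup_iters=100, lr_decay_iters=1000 gives
#   it=0    -> ~6e-6   (start of warmup)
#   it=100  -> 6e-4    (peak)
#   it=550  -> ~3.3e-4 (halfway through the cosine)
#   it=1000 -> 6e-5    (floor)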
if cfg['wandb_log'] and master_process:
import wandb
wandb.init(project=cfg['wandb_project'], name=cfg['wandb_run_name'], config=cfg)
X, Y = get_batch('train')
t0 = time.time()
local_iter_num = 0
raw_model = model.module if ddp else model
running_mfu = -1.0
while True:
lr = get_lr(iter_num) if cfg['decay_lr'] else cfg['learning_rate']
for param_group in optimizer.param_groups:
param_group['lr'] = lr
if iter_num % cfg['eval_interval'] == 0 and master_process:
losses = estimate_loss()
print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
if cfg['wandb_log']:
wandb.log({
"iter": iter_num,
"train/loss": losses['train'],
"val/loss": losses['val'],
"lr": lr,
"mfu": running_mfu*100,
})
if losses['val'] < best_val_loss or cfg['always_save_checkpoint']:
best_val_loss = losses['val']
if iter_num > 0:
checkpoint = {
'model': raw_model.state_dict(),
'optimizer': optimizer.state_dict(),
'model_args': model_args,
'iter_num': iter_num,
'best_val_loss': best_val_loss,
'config': cfg,
}
print(f"saving checkpoint to {cfg['out_dir']}")
torch.save(checkpoint, os.path.join(cfg['out_dir'], 'ckpt.pt'))
if iter_num == 0 and cfg['eval_only']:
break
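# Gradient accumulation: each micro-batch loss is divided by gradient_accumulation_steps
# so the summed backward() calls equal the gradient of one large batch; with DDP,
# require_backward_grad_sync defers the all-reduce to the final micro-step, and the
# next batch is fetched on the CPU while the GPU is still busy. The GradScaler
# unscale -> clip -> step -> update ordering below is the standard mixed-precision recipe.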
for micro_step in range(gradient_accumulation_steps):
if ddp:
model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
with ctx:
logits, loss = model(X, Y)
loss = loss / gradient_accumulation_steps
X, Y = get_batch('train')
scaler.scale(loss).backward()
if cfg['grad_clip'] != 0.0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), cfg['grad_clip'])
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad(set_to_none=True)
t1 = time.time()
dt = t1 - t0
t0 = t1
if iter_num % cfg['log_interval'] == 0 and master_process:
lossf = loss.item() * gradient_accumulation_steps
if local_iter_num >= 5:
mfu = raw_model.estimate_mfu(cfg['batch_size'] * gradient_accumulation_steps, dt)
running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
iter_num += 1
local_iter_num += 1
if iter_num > cfg['max_iters']:
break
if ddp:
destroy_process_group()
app = modal.App("nanogpt-training-2")
image = (
modal.Image.debian_slim(python_version="3.11")
.pip_install(
"numpy",
"torch",
"wandb",
"requests",
"tokenizers"
)
)
volume = modal.Volume.from_name("nanogpt-data", create_if_missing=True)
@app.function(
gpu=f"{GPU_TYPE}:{N_GPUS}",
volumes={"/data": volume},
timeout=60 * 60 * 6,
image=image,
secrets=[modal.Secret.from_name("wandb-secret")] if CONFIG.get("wandb_log", False) else [],
)
def train_modal():
cfg = CONFIG
print(f"Starting Modal training with {N_GPUS} {GPU_TYPE} GPUs")
print(f"Dataset type: {cfg['dataset_type']}")
# Check if custom tokenizer exists before proceeding
if not check_tokenizer_exists(cfg['vocab_size'], "/data"):
raise RuntimeError("Cannot proceed without tokenizer. Please create it first.")
ensure_shakespeare_data_tokens("/data")
script_path = Path(__file__)
script_content = script_path.read_text()
temp_script = "/tmp/train_modal.py"
Path(temp_script).write_text(script_content)
cmd = [
"torchrun",
f"--nproc-per-node={N_GPUS}",
temp_script,
]
print(f"Running command: {' '.join(cmd)}")
os.chdir("/tmp")
subprocess.run(cmd, check=True)
print("Training completed successfully!")
return "Training completed"
if __name__ == "__main__":
if "RANK" in os.environ:
train()
else:
print("This script should be run with torchrun or through Modal")
print("Examples:")
print(" Local: torchrun --nproc-per-node=4 train_modal_standalone.py")
print(" Modal: modal run train_modal_standalone.py::train_modal")
sys.exit(1)