LayerNorm Scaling implementation to mitigate the Curse of Depth in LLMs.
import math

import torch
import torch.nn as nn


class LayerNormScaling(nn.Module):
    """
    LayerNorm Scaling implementation to mitigate the Curse of Depth in LLMs.

    This module applies Layer Normalization and then scales the output by
    1/sqrt(layer_depth) to prevent the variance explosion issue in deeper
    transformer layers.

    Args:
        hidden_size (int): The size of the input and output features.
        layer_idx (int): The index of the current layer, starting from 1.
        eps (float, optional): A small value added to the denominator for
            numerical stability. Default: 1e-6.
    """

    def __init__(self, hidden_size, layer_idx, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps
        self.layer_idx = layer_idx
        # Scale factor based on layer depth
        self.scale_factor = 1.0 / math.sqrt(max(1, layer_idx))

    def forward(self, hidden_states):
        """
        Apply layer normalization with depth-based scaling.

        Args:
            hidden_states (torch.Tensor): Input tensor of shape
                [batch_size, seq_length, hidden_size]

        Returns:
            torch.Tensor: Normalized and scaled tensor of the same shape
        """
        # Standard Layer Normalization
        mean = hidden_states.mean(-1, keepdim=True)
        variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
        normalized = (hidden_states - mean) / torch.sqrt(variance + self.variance_epsilon)
        # Apply weight and bias
        output = self.weight * normalized + self.bias
        # Apply scaling based on layer depth to mitigate CoD
        output = output * self.scale_factor
        return output


class RMSNormScaling(nn.Module):
    """
    RMSNorm Scaling implementation (as used in LLaMA) to mitigate the Curse of Depth in LLMs.

    This module applies RMS Normalization and then scales the output by
    1/sqrt(layer_depth) to prevent the variance explosion issue in deeper
    transformer layers.

    Args:
        hidden_size (int): The size of the input and output features.
        layer_idx (int): The index of the current layer, starting from 1.
        eps (float, optional): A small value added to the denominator for
            numerical stability. Default: 1e-6.
    """

    def __init__(self, hidden_size, layer_idx, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.layer_idx = layer_idx
        # Scale factor based on layer depth
        self.scale_factor = 1.0 / math.sqrt(max(1, layer_idx))

    def forward(self, hidden_states):
        """
        Apply RMS normalization with depth-based scaling.

        Args:
            hidden_states (torch.Tensor): Input tensor of shape
                [batch_size, seq_length, hidden_size]

        Returns:
            torch.Tensor: Normalized and scaled tensor of the same shape
        """
        # RMS Normalization (as used in LLaMA)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        # Apply weight
        output = self.weight * hidden_states
        # Apply scaling based on layer depth to mitigate CoD
        output = output * self.scale_factor
        return output


# Example of how to use these in a transformer layer
class TransformerLayerWithScaling(nn.Module):
    """
    Example transformer layer using RMSNormScaling to mitigate the Curse of Depth.

    This is a simple implementation showing how to use the scaling in a
    Pre-LN transformer architecture.
    """

    def __init__(self, hidden_size, num_heads, ff_dim, layer_idx, dropout=0.1):
        super().__init__()
        self.layer_idx = layer_idx
        # Pre-LN with Scaling for Attention
        self.ln1 = RMSNormScaling(hidden_size, layer_idx)
        # Multi-head attention
        self.attention = nn.MultiheadAttention(
            hidden_size, num_heads, dropout=dropout, batch_first=True
        )
        # Pre-LN with Scaling for Feed-forward
        self.ln2 = RMSNormScaling(hidden_size, layer_idx)
        # Feed-forward network
        self.ff = nn.Sequential(
            nn.Linear(hidden_size, ff_dim),
            nn.SiLU(),  # SiLU/Swish activation as used in LLaMA
            nn.Linear(ff_dim, hidden_size),
            nn.Dropout(dropout),
        )

    def forward(self, x, attention_mask=None):
        # Apply Pre-LN with scaling for attention
        ln_out = self.ln1(x)
        attn_out, _ = self.attention(ln_out, ln_out, ln_out, key_padding_mask=attention_mask)
        x = x + attn_out
        # Apply Pre-LN with scaling for feed-forward
        ln_out = self.ln2(x)
        ff_out = self.ff(ln_out)
        x = x + ff_out
        return x


# Example of how to apply to an existing architecture like LLaMA
def apply_cod_mitigation_to_llama(model):
    """
    Apply the Curse of Depth mitigation to an existing LLaMA model.

    This function demonstrates how to retrofit existing models.

    Args:
        model: A LLaMA model instance

    Returns:
        The modified model with LayerNorm Scaling
    """
    # Get the transformer layers
    layers = model.model.layers
    # Replace the layer norms with scaled versions
    for i, layer in enumerate(layers):
        # Layer index starts from 1 in the paper
        layer_idx = i + 1
        # Convert input LayerNorm, preserving the trained norm weights
        if hasattr(layer, "input_layernorm"):
            src = layer.input_layernorm
            hidden_size = src.weight.shape[0]
            eps = src.variance_epsilon
            new_norm = RMSNormScaling(hidden_size, layer_idx, eps).to(
                device=src.weight.device, dtype=src.weight.dtype
            )
            new_norm.weight.data.copy_(src.weight.data)
            layer.input_layernorm = new_norm
        # Convert post-attention LayerNorm, preserving the trained norm weights
        if hasattr(layer, "post_attention_layernorm"):
            src = layer.post_attention_layernorm
            hidden_size = src.weight.shape[0]
            eps = src.variance_epsilon
            new_norm = RMSNormScaling(hidden_size, layer_idx, eps).to(
                device=src.weight.device, dtype=src.weight.dtype
            )
            new_norm.weight.data.copy_(src.weight.data)
            layer.post_attention_layernorm = new_norm
    return model

The Curse of Depth in Large Language Models: Overview and Implementation

Paper Overview

The paper "The Curse of Depth in Large Language Models" introduces an important concept that affects the efficiency and performance of modern language models. The authors identify a phenomenon where nearly half of the layers in LLMs are less effective than expected, calling this the "Curse of Depth" (CoD).

Key Findings

  1. The Problem: Deeper layers in LLMs like Llama, Mistral, DeepSeek, and Qwen contribute significantly less to the final output compared to earlier layers. This creates inefficiency, as training these models requires substantial computational resources.

  2. Root Cause: The authors identify Pre-Layer Normalization (Pre-LN) as the culprit. While Pre-LN stabilizes training, it causes the output variance to grow exponentially with model depth, so deeper transformer blocks act almost like identity mappings and barely transform their inputs in meaningful ways.

  3. The Solution: LayerNorm Scaling, which scales the output of each layer normalization by the inverse square root of its depth (1/√layer_depth). This simple modification keeps the variance growth under control; a short sketch of the rule follows this list.

  4. Results: Experiments across models from 130M to 1B parameters show that LayerNorm Scaling significantly improves pre-training performance compared to Pre-LN and carries these benefits through to supervised fine-tuning.
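A minimal sketch of the scaling rule referenced in item 3 above (tensor sizes are arbitrary, and F.layer_norm stands in for the module defined in the gist):

import math
import torch
import torch.nn.functional as F

batch, seq_len, hidden = 2, 16, 64
x = torch.randn(batch, seq_len, hidden)

for layer_idx in (1, 4, 16, 64):
    normed = F.layer_norm(x, (hidden,))     # standard Pre-LN normalization
    scaled = normed / math.sqrt(layer_idx)  # LayerNorm Scaling
    print(layer_idx, scaled.std().item())   # std shrinks roughly as 1/sqrt(layer_idx)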

Implementation

The gist provides a clean, standalone PyTorch implementation of LayerNorm Scaling that can be used as a drop-in replacement for standard LayerNorm.

How to Use the Implementation

The code provides two main classes:

  1. LayerNormScaling: A drop-in replacement for standard LayerNorm with depth scaling (a quick check of the drop-in claim follows this list)
  2. RMSNormScaling: A version for LLaMA-like models that use RMSNorm instead of LayerNorm
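As a quick check of the drop-in claim in item 1 (a minimal sketch; the import assumes the gist file is saved as layernorm_scaling.py): with layer_idx=1 the scale factor is 1/sqrt(1) = 1, so the module reproduces standard LayerNorm.

import torch
import torch.nn as nn
from layernorm_scaling import LayerNormScaling

hidden = 64
x = torch.randn(2, 8, hidden)

ln = nn.LayerNorm(hidden, eps=1e-6)
ln_scaled = LayerNormScaling(hidden, layer_idx=1, eps=1e-6)

# Both modules start from unit weight and zero bias, so the outputs match.
print(torch.allclose(ln(x), ln_scaled(x), atol=1e-5))  # True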

The implementation can be used in three ways:

1. For a New Model

When designing a new model, you can directly use the LayerNormScaling or RMSNormScaling classes:

from layernorm_scaling import RMSNormScaling

# In your transformer layer
self.input_layernorm = RMSNormScaling(hidden_size, layer_idx=current_layer_index)
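When stacking blocks, each block's norm receives its own 1-based index, so the first block is scaled by 1/sqrt(1) and the last by 1/sqrt(num_layers). A minimal sketch (num_layers and hidden_size are illustrative; the import again assumes the file name layernorm_scaling.py):

import torch.nn as nn
from layernorm_scaling import RMSNormScaling

num_layers, hidden_size = 12, 768

# One scaled norm per block, indexed from 1 as in the paper
input_norms = nn.ModuleList(
    [RMSNormScaling(hidden_size, layer_idx=i + 1) for i in range(num_layers)]
)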

2. To Retrofit an Existing Model

The apply_cod_mitigation_to_llama function shows how to convert an existing LLaMA model to use the scaling technique:

from transformers import LlamaForCausalLM
from layernorm_scaling import apply_cod_mitigation_to_llama

# Load your model
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

# Apply the CoD mitigation
model = apply_cod_mitigation_to_llama(model)
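A quick sanity check after the conversion (assuming the standard Hugging Face LlamaForCausalLM layout with model.model.layers) confirms that every decoder layer now carries the scaled norms with the expected depth index:

from layernorm_scaling import RMSNormScaling

for i, layer in enumerate(model.model.layers, start=1):
    assert isinstance(layer.input_layernorm, RMSNormScaling)
    assert isinstance(layer.post_attention_layernorm, RMSNormScaling)
    assert layer.input_layernorm.layer_idx == i

Note that swapping in the new scaling changes the activations of an already trained checkpoint, so some continued pre-training or fine-tuning is generally needed before evaluating a retrofitted model.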

3. As a Reference for Custom Implementations

The TransformerLayerWithScaling class demonstrates how to integrate the scaling into a transformer layer architecture.
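For instance, a small Pre-LN stack can be assembled from it as follows (a minimal sketch with arbitrary sizes; the import assumes the file name layernorm_scaling.py):

import torch
import torch.nn as nn
from layernorm_scaling import TransformerLayerWithScaling

hidden_size, num_heads, ff_dim, num_layers = 256, 4, 1024, 8

layers = nn.ModuleList(
    [
        TransformerLayerWithScaling(hidden_size, num_heads, ff_dim, layer_idx=i + 1)
        for i in range(num_layers)
    ]
)

x = torch.randn(2, 32, hidden_size)  # [batch_size, seq_length, hidden_size]
for layer in layers:
    x = layer(x)
print(x.shape)  # torch.Size([2, 32, 256])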

Benefits of LayerNorm Scaling

  1. Improved Performance: The paper shows consistent improvements in perplexity across model sizes
  2. Resource Efficiency: Makes better use of all layers, improving training efficiency
  3. Simple Implementation: Requires minimal code changes and no additional parameters
  4. Compatible with Existing Models: Can be retrofitted to already trained models

This implementation follows the paper's approach while providing flexibility for different use cases and model architectures.

Citation

Based on the original work:

@article{sun2025curse,
  title={The Curse of Depth in Large Language Models},
  author={Sun, Wenfang and Song, Xinyuan and Li, Pengxiang and Yin, Lu and Zheng, Yefeng and Liu, Shiwei},
  journal={arXiv preprint arXiv:2502.05795},
  year={2025}
}