
@lucidrains
Last active December 7, 2022 14:05
import torch

def unitwise_norm(x):
    if len(x.squeeze().shape) <= 1:
        # scalars, vectors, and biases: a single norm over the whole tensor
        dim = None
        keepdim = False
    elif len(x.shape) in (2, 3):
        # linear / embedding weights: one norm per output row
        dim = 1
        keepdim = True
    elif len(x.shape) == 4:
        # convolution kernels: one norm per output channel (pytorch convolution kernel is OIHW)
        dim = (1, 2, 3)
        keepdim = True
    else:
        raise ValueError(f'got a parameter with shape not in (1, 2, 3, 4) {x}')
    return x.norm(dim = dim, keepdim = keepdim, p = 2)

def adaptive_clip_grad_(parameters, clipping = 0.01, eps = 1e-3):
    parameters = [p for p in parameters if p.grad is not None]
    if len(parameters) == 0:
        return

    for p in parameters:
        # unit-wise norms of the parameter and its gradient
        param_norm = unitwise_norm(p).clamp_(min = eps)
        grad_norm = unitwise_norm(p.grad)
        max_norm = param_norm * clipping

        # rescale only the units whose gradient norm exceeds clipping * parameter norm
        trigger = grad_norm > max_norm
        clipped_grad = p.grad * (max_norm / grad_norm.clamp(min = 1e-6))
        new_grads = torch.where(trigger, clipped_grad, p.grad)
        p.grad.detach().copy_(new_grads)
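
A minimal usage sketch, assuming a standard PyTorch training loop: call adaptive_clip_grad_ on the parameters between loss.backward() and optimizer.step(). The model, optimizer, and loader below are placeholders, not part of the gist.

import torch
import torch.nn as nn

model = nn.Linear(128, 10)                                   # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr = 1e-2)   # placeholder optimizer

for x, y in loader:                                          # `loader` is assumed to yield (input, target) batches
    optimizer.zero_grad()
    loss = nn.functional.cross_entropy(model(x), y)
    loss.backward()
    adaptive_clip_grad_(model.parameters(), clipping = 0.01) # clip gradients unit-wise, in place
    optimizer.step()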
@sayakpaul

Hi @lucidrains.

Thank you for providing it!

Could you explain why the additional clamping is required in https://gist.github.com/lucidrains/0d6560077edac419ab5d3aa29e674d5c#file-adaptive_gradient_clip-py-L27?

@lucidrains
Author

@sayakpaul Hello! It was apparently done for extra insurance in the original DeepMind repo (they had a comment explaining it).
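
A small sketch of the failure mode that clamp guards against, using the same variable names as the gist: if a unit's gradient is exactly zero, grad_norm is zero, max_norm / grad_norm becomes inf, and the intermediate clipped_grad fills with NaN (0 * inf). The final result would still be fine, since torch.where falls back to p.grad wherever trigger is False, so the clamp acts as insurance that keeps the intermediate finite.

import torch

g = torch.zeros(3)                     # a unit whose gradient happens to be all zeros
max_norm = torch.tensor(0.01)

print(g * (max_norm / g.norm()))                     # tensor([nan, nan, nan]) -- 0 * inf
print(g * (max_norm / g.norm().clamp(min = 1e-6)))   # tensor([0., 0., 0.])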
