罗杰斯 (dhbrojas)
🐉 Code LLMs @ Zhipu.AI, THU
dhbrojas / wsd.py
Created July 28, 2025 12:24
Warmup Stable Decay LR
import math

def warmup_stable_decay(*, W: int, S: int, D: int, min_lr_scale_factor: float = 0.1):
    """
    Returns a lambda function for PyTorch's LambdaLR scheduler implementing the
    WSD learning rate schedule.
    Parameters:
    - W: The last step of the warmup phase.
    - S: The last step of the stable phase.
    - D: The last step of the decay phase.
    - min_lr_scale_factor: LR multiplier reached at the end of the decay phase.
    """
    def lr_lambda(step: int) -> float:
        if step < W:
            return step / max(1, W)  # linear warmup to the peak LR
        if step < S:
            return 1.0  # stable phase: hold the peak LR
        t = min(1.0, (step - S) / max(1, D - S))  # decay progress in [0, 1]
        # Decay shape is an assumption (cosine anneal down to min_lr_scale_factor).
        return min_lr_scale_factor + 0.5 * (1.0 - min_lr_scale_factor) * (1.0 + math.cos(math.pi * t))
    return lr_lambda
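A quick usage sketch (the stand-in module, LR, and step counts below are placeholders, not from the gist):

import torch
from torch.optim.lr_scheduler import LambdaLR

model = torch.nn.Linear(8, 8)  # stand-in module
opt = torch.optim.AdamW(model.parameters(), lr=3e-4)
sched = LambdaLR(opt, lr_lambda=warmup_stable_decay(W=1000, S=9000, D=10000))
# Call sched.step() once after every opt.step().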
from typing import Callable, Protocol

import torch
from torch import Tensor
from torch.nn import Linear, Module
from torch.nn.functional import silu

def compute_frequencies(
    *,
    dim: int,  # assumed parameter: rotary head dimension
    base: float = 10000.0,  # assumed parameter: RoPE frequency base
) -> Tensor:
    # Assumed completion: inverse frequencies as used by rotary position embeddings.
    return 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
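The preview cuts off at the signature; assuming the RoPE reading above, the frequencies become per-position rotation tables like so (the position count is a placeholder):

inv_freq = compute_frequencies(dim=128)                      # (dim / 2,)
angles = torch.outer(torch.arange(4096).float(), inv_freq)   # (positions, dim / 2)
cos, sin = angles.cos(), angles.sin()                        # RoPE rotation tables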
import torch
from tqdm import tqdm
from torch.nn import Module
from torch.nn.functional import cross_entropy
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
)

BATCH = 16
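The imports point to a causal-LM evaluation loop; a self-contained sketch of that shape (the model name, random stand-in data, and sequence length are assumptions):

model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained("Qwen/Qwen3-0.6B"))
model.eval()
total, count = 0.0, 0
with torch.no_grad():
    for _ in tqdm(range(4)):  # stand-in for a real dataloader
        ids = torch.randint(0, model.config.vocab_size, (BATCH, 512))
        logits = model(input_ids=ids).logits
        # Shift by one position for next-token prediction loss.
        loss = cross_entropy(logits[:, :-1].flatten(0, 1), ids[:, 1:].flatten())
        total, count = total + loss.item(), count + 1
print("mean loss:", total / count)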
dhbrojas / acc.py
Created July 21, 2025 11:05
Linear Gradient Accumulation Schedule
import torch

class GradientAccumulationSchedule:
    """
    A schedule that linearly increases the number of gradient accumulation
    steps throughout training to converge faster.
    """
    def __init__(self, *, min: int, max: int, steps: int, factor: int | None = None):
        # Assumed completion: ramp accumulation from `min` to `max` over `steps`
        # optimizer steps, optionally rounding down to a multiple of `factor`.
        self.lo, self.hi, self.total, self.factor = min, max, steps, factor

    def __call__(self, step: int) -> int:  # assumed interface: accumulation count at `step`
        n = round(self.lo + min(1.0, step / max(1, self.total)) * (self.hi - self.lo))
        return max(self.lo, n // self.factor * self.factor) if self.factor else n
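Usage sketch (the training-loop wiring is hypothetical): small effective batches early give more optimizer updates, large ones later give smoother gradients.

sched = GradientAccumulationSchedule(min=1, max=8, steps=10_000)
for step in range(10_000):
    accum = sched(step)
    # Run `accum` micro-batches of backward() before each optimizer.step().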
dhbrojas / config.json
Last active July 19, 2025 16:18
Minitron, LLM Training
{
  "architectures": ["Qwen3ForCausalLM"],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
dhbrojas / parquet.py
Created July 18, 2025 11:11
Parquet Streaming Reader
from typing import Any, Dict, List

import pyarrow.parquet as pq

class ParquetReader:
    def __init__(self, file: str, batch_size: int = 256):
        self.fp = pq.ParquetFile(file)
        self.batch_size = batch_size
        self.num_rows = self.fp.metadata.num_rows
        self.num_rows_read = 0

    def __iter__(self):  # assumed completion: stream batches without loading the whole file
        for batch in self.fp.iter_batches(batch_size=self.batch_size):
            self.num_rows_read += batch.num_rows
            yield batch.to_pydict()
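Usage sketch (the path and downstream handler are placeholders):

reader = ParquetReader("data.parquet", batch_size=512)
for columns in reader:  # each item is a dict of column name -> list of values
    handle(columns)     # hypothetical downstream function
print(reader.num_rows_read, "of", reader.num_rows, "rows read")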
import random

YEAR_SHIFT = 32

def encode(uid, year):
    return (year << YEAR_SHIFT) | uid

def decode(docid):
    # Inverse of encode (return order uid, year is assumed): the low 32 bits
    # hold the uid, the high bits the year.
    return docid & ((1 << YEAR_SHIFT) - 1), docid >> YEAR_SHIFT
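A round-trip check, using the otherwise-unused random import (uids must fit in YEAR_SHIFT bits for encode/decode to invert):

uid, year = random.getrandbits(YEAR_SHIFT), 2025
assert decode(encode(uid, year)) == (uid, year)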
dhbrojas / dataproc.py
Last active June 30, 2025 12:10
Data Processing for LLM Training
from abc import ABC, abstractmethod
from dataclasses import dataclass
from random import choices, randint
from typing import Any, Callable, Dict, Generic, List, TypeVar
import torch
from torch import Tensor
T = TypeVar("T")
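Only the imports survive in the preview; they suggest a generic, composable processing pipeline. A hedged sketch of one shape such an interface can take (the names are illustrative, not the gist's API):

class Processor(ABC, Generic[T]):
    """Hypothetical base stage: transforms one example of type T."""
    @abstractmethod
    def __call__(self, example: T) -> T: ...

@dataclass
class Truncate(Processor[List[int]]):
    """Illustrative concrete stage: clip token lists to a maximum length."""
    max_len: int = 2048
    def __call__(self, example: List[int]) -> List[int]:
        return example[: self.max_len]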
dhbrojas / mask.py
Created June 27, 2025 09:07
HuggingFace Compatible Attention Mask
class AttentionMask:
    """
    A (Batch, 1, Queries, Keys & Values) attention mask for attention between queries and keys/values.
    The mask is "additive" (also called "inverted"): a tensor of floating-point values
    that is added to the attention scores before the softmax operation.
    >>> 0 = Unmasked
    >>> dtype.min = Masked
    """
dhbrojas / collator.py
Last active June 30, 2025 11:13
Beautiful ARLM Sequence Packing & Padding
from dataclasses import dataclass
from typing import List, Iterator
@dataclass
class Sequence:
"""Contains a single token sequence"""
x: List[int]
y: List[int]
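Following the gist's title, a hedged sketch of greedy packing over Sequence objects (the packer, pad id, and the -100 ignore-label are assumptions; each sequence is assumed to fit within max_len):

def pack(seqs: List[Sequence], *, max_len: int, pad: int = 0) -> Iterator[Sequence]:
    # Hypothetical greedy packer: concatenate until max_len, then pad the remainder.
    x, y = [], []
    for s in seqs:
        if x and len(x) + len(s.x) > max_len:
            yield Sequence(x + [pad] * (max_len - len(x)), y + [-100] * (max_len - len(y)))
            x, y = [], []
        x, y = x + s.x, y + s.y
    if x:
        yield Sequence(x + [pad] * (max_len - len(x)), y + [-100] * (max_len - len(y)))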