罗杰斯 (dhbrojas)
🐉 Code LLMs @ Zhipu.AI, THU
dhbrojas / wsd.py
Created July 28, 2025 12:24
Warmup Stable Decay LR
import math

def warmup_stable_decay(*, W: int, S: int, D: int, min_lr_scale_factor: float = 0.1):
    """
    Returns a lambda function for PyTorch's LambdaLR scheduler implementing the
    WSD learning rate schedule.
    Parameters:
    - W: The last step of the warmup phase.
    - S: The last step of the stable phase.
    - D: The last step of the decay phase.
    - min_lr_scale_factor: LR multiplier reached at the end of the decay phase.
    """
    def lr_lambda(step: int) -> float:
        if step < W:
            return step / max(1, W)  # linear warmup to the peak LR
        if step < S:
            return 1.0  # stable phase: hold the peak LR
        t = min(1.0, (step - S) / max(1, D - S))  # decay progress in [0, 1]
        # Decay shape is an assumption (cosine anneal down to min_lr_scale_factor).
        return min_lr_scale_factor + 0.5 * (1.0 - min_lr_scale_factor) * (1.0 + math.cos(math.pi * t))
    return lr_lambda
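A quick usage sketch (the stand-in module, LR, and step counts below are placeholders, not from the gist):

import torch
from torch.optim.lr_scheduler import LambdaLR

model = torch.nn.Linear(8, 8)  # stand-in module
opt = torch.optim.AdamW(model.parameters(), lr=3e-4)
sched = LambdaLR(opt, lr_lambda=warmup_stable_decay(W=1000, S=9000, D=10000))
# Call sched.step() once after every opt.step().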
from typing import Callable, Protocol

import torch
from torch import Tensor
from torch.nn import Linear, Module
from torch.nn.functional import silu

def compute_frequencies(
    *,
    dim: int,  # assumed parameter: rotary head dimension
    base: float = 10000.0,  # assumed parameter: RoPE frequency base
) -> Tensor:
    # Assumed completion: inverse frequencies as used by rotary position embeddings.
    return 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
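The preview cuts off at the signature; assuming the RoPE reading above, the frequencies become per-position rotation tables like so (the position count is a placeholder):

inv_freq = compute_frequencies(dim=128)                      # (dim / 2,)
angles = torch.outer(torch.arange(4096).float(), inv_freq)   # (positions, dim / 2)
cos, sin = angles.cos(), angles.sin()                        # RoPE rotation tables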
import torch
from tqdm import tqdm
from torch.nn import Module
from torch.nn.functional import cross_entropy
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
)

BATCH = 16
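The imports point to a causal-LM evaluation loop; a self-contained sketch of that shape (the model name, random stand-in data, and sequence length are assumptions):

model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained("Qwen/Qwen3-0.6B"))
model.eval()
total, count = 0.0, 0
with torch.no_grad():
    for _ in tqdm(range(4)):  # stand-in for a real dataloader
        ids = torch.randint(0, model.config.vocab_size, (BATCH, 512))
        logits = model(input_ids=ids).logits
        # Shift by one position for next-token prediction loss.
        loss = cross_entropy(logits[:, :-1].flatten(0, 1), ids[:, 1:].flatten())
        total, count = total + loss.item(), count + 1
print("mean loss:", total / count)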
dhbrojas / acc.py
Created July 21, 2025 11:05
Linear Gradient Accumulation Schedule
import torch

class GradientAccumulationSchedule:
    """
    A schedule that linearly increases the number of gradient accumulation
    steps throughout training to converge faster.
    """
    def __init__(self, *, min: int, max: int, steps: int, factor: int | None = None):
        # Assumed completion: ramp accumulation from `min` to `max` over `steps`
        # optimizer steps, optionally rounding down to a multiple of `factor`.
        self.lo, self.hi, self.total, self.factor = min, max, steps, factor

    def __call__(self, step: int) -> int:  # assumed interface: accumulation count at `step`
        n = round(self.lo + min(1.0, step / max(1, self.total)) * (self.hi - self.lo))
        return max(self.lo, n // self.factor * self.factor) if self.factor else n
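Usage sketch (the training-loop wiring is hypothetical): small effective batches early give more optimizer updates, large ones later give smoother gradients.

sched = GradientAccumulationSchedule(min=1, max=8, steps=10_000)
for step in range(10_000):
    accum = sched(step)
    # Run `accum` micro-batches of backward() before each optimizer.step().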
dhbrojas / config.json
Last active July 19, 2025 16:18
Minitron, LLM Training
{
  "architectures": ["Qwen3ForCausalLM"],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
dhbrojas / parquet.py
Created July 18, 2025 11:11
Parquet Streaming Reader
from typing import Any, Dict, List

import pyarrow.parquet as pq

class ParquetReader:
    def __init__(self, file: str, batch_size: int = 256):
        self.fp = pq.ParquetFile(file)
        self.batch_size = batch_size
        self.num_rows = self.fp.metadata.num_rows
        self.num_rows_read = 0

    def __iter__(self):  # assumed completion: stream batches without loading the whole file
        for batch in self.fp.iter_batches(batch_size=self.batch_size):
            self.num_rows_read += batch.num_rows
            yield batch.to_pydict()
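Usage sketch (the path and downstream handler are placeholders):

reader = ParquetReader("data.parquet", batch_size=512)
for columns in reader:  # each item is a dict of column name -> list of values
    handle(columns)     # hypothetical downstream function
print(reader.num_rows_read, "of", reader.num_rows, "rows read")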
import random

YEAR_SHIFT = 32

def encode(uid, year):
    return (year << YEAR_SHIFT) | uid

def decode(docid):
    # Inverse of encode (return order uid, year is assumed): the low 32 bits
    # hold the uid, the high bits the year.
    return docid & ((1 << YEAR_SHIFT) - 1), docid >> YEAR_SHIFT
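A round-trip check, using the otherwise-unused random import (uids must fit in YEAR_SHIFT bits for encode/decode to invert):

uid, year = random.getrandbits(YEAR_SHIFT), 2025
assert decode(encode(uid, year)) == (uid, year)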
dhbrojas / dataproc.py
Last active June 30, 2025 12:10
Data Processing for LLM Training
from abc import ABC, abstractmethod
from dataclasses import dataclass
from random import choices, randint
from typing import Any, Callable, Dict, Generic, List, TypeVar
import torch
from torch import Tensor
T = TypeVar("T")
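Only the imports survive in the preview; they suggest a generic, composable processing pipeline. A hedged sketch of one shape such an interface can take (the names are illustrative, not the gist's API):

class Processor(ABC, Generic[T]):
    """Hypothetical base stage: transforms one example of type T."""
    @abstractmethod
    def __call__(self, example: T) -> T: ...

@dataclass
class Truncate(Processor[List[int]]):
    """Illustrative concrete stage: clip token lists to a maximum length."""
    max_len: int = 2048
    def __call__(self, example: List[int]) -> List[int]:
        return example[: self.max_len]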
dhbrojas / mask.py
Created June 27, 2025 09:07
HuggingFace Compatible Attention Mask
class AttentionMask:
    """
    A (Batch, 1, Queries, Keys & Values) attention mask for attention between queries and keys/values.
    The mask is "additive" (also called "inverted"): a tensor of floating-point values
    that is added to the attention scores before the softmax operation.
    >>> 0 = Unmasked
    >>> dtype.min = Masked
    """
dhbrojas / collator.py
Last active June 30, 2025 11:13
Beautiful ARLM Sequence Packing & Padding
from dataclasses import dataclass
from typing import List, Iterator
@dataclass
class Sequence:
"""Contains a single token sequence"""
x: List[int]
y: List[int]
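Following the gist's title, a hedged sketch of greedy packing over Sequence objects (the packer, pad id, and the -100 ignore-label are assumptions; each sequence is assumed to fit within max_len):

def pack(seqs: List[Sequence], *, max_len: int, pad: int = 0) -> Iterator[Sequence]:
    # Hypothetical greedy packer: concatenate until max_len, then pad the remainder.
    x, y = [], []
    for s in seqs:
        if x and len(x) + len(s.x) > max_len:
            yield Sequence(x + [pad] * (max_len - len(x)), y + [-100] * (max_len - len(y)))
            x, y = [], []
        x, y = x + s.x, y + s.y
    if x:
        yield Sequence(x + [pad] * (max_len - len(x)), y + [-100] * (max_len - len(y)))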