Skip to content

Instantly share code, notes, and snippets.

@huseinzol05
Last active July 23, 2025 06:35
Show Gist options
  • Save huseinzol05/5e3dc3f26633bcaed9f7328f0b6cab97 to your computer and use it in GitHub Desktop.
Save huseinzol05/5e3dc3f26633bcaed9f7328f0b6cab97 to your computer and use it in GitHub Desktop.
HuggingFace Transformers 4.51.3 Flash multipacking
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "fa9c5a2e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2025-07-23 14:23:00,522] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/bin/ld: cannot find -laio\n",
"collect2: error: ld returned 1 exit status\n",
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'\n",
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'\n",
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'\n",
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'\n",
"collect2: error: ld returned 1 exit status\n"
]
}
],
"source": [
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, AddedToken, Qwen3ForCausalLM\n",
"from streaming import LocalDataset\n",
"from streaming.base.format.mds.encodings import Encoding, _encodings\n",
"from cut_cross_entropy import linear_cross_entropy\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d59049af",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"32779"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B-Base')\n",
"\n",
"# Extra vocabulary, in insertion order: one start marker, 32768 speech codes,\n",
"# then interleaved pitch/rate levels (pitch_0, rate_0, pitch_1, rate_1, ...).\n",
"extra = [AddedToken('<|speech_start|>')]\n",
"extra += [AddedToken(f'<|speech_{i}|>') for i in range(32768)]\n",
"for i in range(5):\n",
"    extra += [AddedToken(f'<|pitch_{i}|>'), AddedToken(f'<|rate_{i}|>')]\n",
"\n",
"# Last expression of the cell: the return value of add_tokens is displayed as the output.\n",
"tokenizer.add_tokens(extra)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e648a52b",
"metadata": {},
"outputs": [],
"source": [
"class Model(Qwen3ForCausalLM):\n",
"    \"\"\"Qwen3 causal LM that computes its training loss with `linear_cross_entropy`.\n",
"\n",
"    With `labels`, the loss is computed from the backbone's last hidden state and the\n",
"    `lm_head` weight, and `{'loss': ...}` is returned. Without `labels`, the backbone\n",
"    output is returned unchanged (hidden states only, no logits).\n",
"    \"\"\"\n",
"\n",
"    # Packing metadata emitted by the collator that must reach the attention layers.\n",
"    _PACKING_KEYS = ('cu_seq_lens_q', 'cu_seq_lens_k', 'max_length_q', 'max_length_k')\n",
"\n",
"    def __init__(self, config):\n",
"        super().__init__(config)\n",
"\n",
"    def forward(self, input_ids, attention_mask = None, position_ids = None, labels = None, **kwargs):\n",
"        \"\"\"Run the backbone and, when `labels` is given, return the shifted LM loss.\n",
"\n",
"        Args:\n",
"            input_ids: token ids passed to the backbone.\n",
"            attention_mask: optional mask passed to the backbone.\n",
"            position_ids: optional position ids passed to the backbone.\n",
"            labels: optional targets; shifting is done inside `linear_cross_entropy`\n",
"                (`shift=True`), so they are given unshifted.\n",
"            **kwargs: only the packing keys in `_PACKING_KEYS` are forwarded to the\n",
"                backbone; any other extra keyword is ignored, as before.\n",
"\n",
"        Returns:\n",
"            `{'loss': tensor}` when `labels` is not None, otherwise the backbone output.\n",
"        \"\"\"\n",
"        # The original implementation silently dropped **kwargs, so the cumulative\n",
"        # sequence lengths built by the collator never reached the attention layers.\n",
"        packing_kwargs = {\n",
"            key: kwargs[key] for key in self._PACKING_KEYS if kwargs.get(key) is not None\n",
"        }\n",
"        # The max lengths may arrive as NumPy scalars; hand plain Python ints downstream.\n",
"        for key in ('max_length_q', 'max_length_k'):\n",
"            if key in packing_kwargs:\n",
"                packing_kwargs[key] = int(packing_kwargs[key])\n",
"\n",
"        # Call the module instance rather than `.forward` so registered hooks still run.\n",
"        super_out = self.model(\n",
"            input_ids = input_ids,\n",
"            position_ids = position_ids,\n",
"            attention_mask = attention_mask,\n",
"            output_hidden_states = True,\n",
"            **packing_kwargs,\n",
"        )\n",
"        if labels is None:\n",
"            return super_out\n",
"\n",
"        embeddings = super_out.last_hidden_state\n",
"        auto_shift_loss = linear_cross_entropy(\n",
"            embeddings.to(torch.bfloat16),\n",
"            self.lm_head.weight.to(torch.bfloat16),\n",
"            labels,\n",
"            shift=True,\n",
"            impl='cce_kahan_full_c'\n",
"        )\n",
"        return {'loss': auto_shift_loss}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2c2f678e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n"
]
}
],
"source": [
"# Load the base checkpoint in bfloat16 with Flash Attention 2, then move it to the GPU.\n",
"model = Model.from_pretrained(\n",
"    'Qwen/Qwen3-0.6B-Base',\n",
"    torch_dtype = torch.bfloat16,\n",
"    attn_implementation = 'flash_attention_2',\n",
")\n",
"model = model.cuda()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bbaeec27",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Embedding(184448, 1024)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Grow the embedding matrix to cover the tokens added to the tokenizer. New rows are not\n",
"# mean-initialised (mean_resizing=False) and the row count is padded to a multiple of 8.\n",
"model.resize_token_embeddings(len(tokenizer), mean_resizing=False, pad_to_multiple_of=8)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0520294d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wed Jul 23 14:23:06 2025 \n",
"+-----------------------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 |\n",
"|-----------------------------------------+------------------------+----------------------+\n",
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|=========================================+========================+======================|\n",
"| 0 NVIDIA GeForce RTX 3090 Ti Off | 00000000:01:00.0 Off | Off |\n",
"| 30% 41C P0 37W / 400W | 1824MiB / 24564MiB | 19% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
"| 1 NVIDIA GeForce RTX 3090 Ti Off | 00000000:08:00.0 Off | Off |\n",
"| 0% 49C P8 22W / 350W | 18MiB / 24564MiB | 0% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
"| 2 NVIDIA GeForce RTX 3090 Off | 00000000:09:00.0 Off | N/A |\n",
"| 0% 36C P8 25W / 300W | 4847MiB / 24576MiB | 0% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=========================================================================================|\n",
"| 0 N/A N/A 1886 G /usr/lib/xorg/Xorg 9MiB |\n",
"| 0 N/A N/A 4930 G /usr/bin/gnome-shell 6MiB |\n",
"| 0 N/A N/A 3157815 C /usr/bin/python3.10 1784MiB |\n",
"| 1 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n",
"| 2 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n",
"| 2 N/A N/A 3130035 C /usr/bin/python3.10 4824MiB |\n",
"+-----------------------------------------------------------------------------------------+\n"
]
}
],
"source": [
"# GPU memory snapshot after loading the model, before any forward pass.\n",
"!nvidia-smi"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d840eb0a",
"metadata": {},
"outputs": [],
"source": [
"class UInt32(Encoding):\n",
"    \"\"\"MDS column encoding that stores a NumPy array as raw uint32 bytes.\"\"\"\n",
"\n",
"    def encode(self, obj) -> bytes:\n",
"        \"\"\"Serialise `obj` as uint32 bytes.\n",
"\n",
"        The input is cast to uint32 first so the bytes always match what `decode`\n",
"        reads back; an array of another dtype used to be written with its own item\n",
"        size and was then misinterpreted on decode.\n",
"        \"\"\"\n",
"        return np.asarray(obj, dtype=np.uint32).tobytes()\n",
"\n",
"    def decode(self, data: bytes):\n",
"        \"\"\"Interpret `data` as a flat uint32 array (a view over the buffer, not a copy).\"\"\"\n",
"        return np.frombuffer(data, np.uint32)\n",
"\n",
"# Register the encoding under the name 'uint32' in the streaming registry.\n",
"_encodings['uint32'] = UInt32\n",
"\n",
"class DatasetFixed(torch.utils.data.Dataset):\n",
"    \"\"\"Map-style dataset over a local streaming directory that yields int64 arrays.\n",
"\n",
"    Example item:\n",
"        {'attention_mask': array([ 471, 775, 661, 1255, 568]),\n",
"         'input_ids': array([151644, 71, 810, ..., 160361, 182069, 151645]),\n",
"         'position_ids': array([ 0, 1, 2, ..., 565, 566, 567])}\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, local):\n",
"        self.dataset = LocalDataset(local=local)\n",
"\n",
"    def __len__(self):\n",
"        return len(self.dataset)\n",
"\n",
"    def __getitem__(self, idx):\n",
"        \"\"\"Return sample `idx` without the unused columns, every array cast to int64.\"\"\"\n",
"        sample = self.dataset[idx]\n",
"        for unused in ('audio', 'text', 'token_type_ids'):\n",
"            sample.pop(unused, None)\n",
"        # Build the converted arrays first, then write them back into the same dict.\n",
"        sample.update({name: values.astype(np.int64) for name, values in sample.items()})\n",
"        return sample"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "62a5f926",
"metadata": {},
"outputs": [],
"source": [
"# Open the pre-packed samples stored in the local 'packing-qwen3' directory\n",
"# (path is relative to the notebook's working directory).\n",
"dataset = DatasetFixed('packing-qwen3')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ec3522db",
"metadata": {},
"outputs": [],
"source": [
"def collator(batch):\n",
"    \"\"\"Flatten packed samples into one padding-free row plus varlen attention metadata.\n",
"\n",
"    Args:\n",
"        batch: list of samples (entries may be None and are skipped). Each sample is a\n",
"            dict of NumPy arrays `input_ids`, `position_ids` and `attention_mask`,\n",
"            where `attention_mask` holds the lengths of the documents packed in the\n",
"            sample rather than a 0/1 mask.\n",
"\n",
"    Returns:\n",
"        dict with `input_ids`, `position_ids` and `labels` of shape (1, total_tokens),\n",
"        int32 cumulative boundaries `cu_seq_lens_q` / `cu_seq_lens_k`, and the longest\n",
"        document length as a Python int in `max_length_q` / `max_length_k`.\n",
"\n",
"    Raises:\n",
"        ValueError: if the batch contains no usable sample.\n",
"    \"\"\"\n",
"    batch = [b for b in batch if b is not None]\n",
"    if not batch:\n",
"        raise ValueError('collator received no usable samples')\n",
"\n",
"    input_ids = np.concatenate([b['input_ids'] for b in batch])\n",
"    position_ids = np.concatenate([b['position_ids'] for b in batch])\n",
"    query_lens = np.concatenate([b['attention_mask'] for b in batch])\n",
"    cumsum = [0] + np.cumsum(query_lens).tolist()\n",
"\n",
"    # The loss shifts labels by one, so the label at the first position of a document\n",
"    # is the target of the previous document's last token. Mask it with the ignore\n",
"    # index so the model is not trained to predict across document boundaries.\n",
"    labels = input_ids.astype(np.int64)\n",
"    doc_starts = [start for start in cumsum[:-1] if start < len(labels)]\n",
"    labels[np.asarray(doc_starts, dtype=np.int64)] = -100\n",
"\n",
"    # Plain Python int: downstream kernels expect an integer, not a NumPy scalar.\n",
"    max_seqlen_q = int(query_lens.max())\n",
"    return {\n",
"        'input_ids': torch.tensor(input_ids)[None],\n",
"        'position_ids': torch.tensor(position_ids)[None],\n",
"        'labels': torch.tensor(labels)[None],\n",
"        'cu_seq_lens_q': torch.tensor(cumsum, dtype=torch.int32),\n",
"        'cu_seq_lens_k': torch.tensor(cumsum, dtype=torch.int32),\n",
"        'max_length_q': max_seqlen_q,\n",
"        'max_length_k': max_seqlen_q\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "1743bcc1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'attention_mask': array([ 471, 775, 661, 1255, 568]),\n",
" 'input_ids': array([151644, 71, 810, ..., 160361, 182069, 151645]),\n",
" 'position_ids': array([ 0, 1, 2, ..., 565, 566, 567])}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect one raw sample. Note that `attention_mask` holds per-document lengths\n",
"# (the collator turns them into cumulative boundaries), not a 0/1 mask.\n",
"dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "59d244d8",
"metadata": {},
"outputs": [],
"source": [
"# Collate the first three samples, then move every tensor entry of the batch to the GPU;\n",
"# non-tensor entries (the max lengths) are kept as they are.\n",
"input_ids = collator([dataset[i] for i in range(3)])\n",
"input_ids = {\n",
"    name: value.cuda() if isinstance(value, torch.Tensor) else value\n",
"    for name, value in input_ids.items()\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "aa27142f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'input_ids': tensor([[151644, 71, 810, ..., 155599, 158241, 151645]],\n",
" device='cuda:0'),\n",
" 'position_ids': tensor([[ 0, 1, 2, ..., 821, 822, 823]], device='cuda:0'),\n",
" 'labels': tensor([[151644, 71, 810, ..., 155599, 158241, 151645]],\n",
" device='cuda:0'),\n",
" 'cu_seq_lens_q': tensor([ 0, 471, 1246, 1907, 3162, 3730, 4983, 5917, 6944, 7853,\n",
" 8852, 9561, 10385], device='cuda:0', dtype=torch.int32),\n",
" 'cu_seq_lens_k': tensor([ 0, 471, 1246, 1907, 3162, 3730, 4983, 5917, 6944, 7853,\n",
" 8852, 9561, 10385], device='cuda:0', dtype=torch.int32),\n",
" 'max_length_q': 1255,\n",
" 'max_length_k': 1255}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Despite its name, `input_ids` is the whole collated batch dict built in the previous cell.\n",
"input_ids"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "350f5e51",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'loss': tensor(15.1131, device='cuda:0', grad_fn=<LinearCrossEntropyFunctionBackward>)}"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One forward pass on the packed batch; the batch contains `labels`, so the model\n",
"# returns only the loss.\n",
"model(**input_ids)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b7308f51",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wed Jul 23 14:24:33 2025 \n",
"+-----------------------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 |\n",
"|-----------------------------------------+------------------------+----------------------+\n",
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|=========================================+========================+======================|\n",
"| 0 NVIDIA GeForce RTX 3090 Ti Off | 00000000:01:00.0 Off | Off |\n",
"| 30% 38C P8 22W / 400W | 22814MiB / 24564MiB | 0% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
"| 1 NVIDIA GeForce RTX 3090 Ti Off | 00000000:08:00.0 Off | Off |\n",
"| 0% 49C P8 23W / 350W | 18MiB / 24564MiB | 0% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
"| 2 NVIDIA GeForce RTX 3090 Off | 00000000:09:00.0 Off | N/A |\n",
"| 0% 38C P8 24W / 300W | 4847MiB / 24576MiB | 0% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=========================================================================================|\n",
"| 0 N/A N/A 1886 G /usr/lib/xorg/Xorg 9MiB |\n",
"| 0 N/A N/A 4930 G /usr/bin/gnome-shell 6MiB |\n",
"| 0 N/A N/A 3157815 C /usr/bin/python3.10 22774MiB |\n",
"| 1 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n",
"| 2 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n",
"| 2 N/A N/A 3130035 C /usr/bin/python3.10 4824MiB |\n",
"+-----------------------------------------------------------------------------------------+\n"
]
}
],
"source": [
"# GPU memory snapshot after the forward pass, for comparison with the earlier snapshot.\n",
"!nvidia-smi"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "python3.10",
"language": "python",
"name": "python3.10"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.17"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment