Last active
July 23, 2025 06:35
-
-
Save huseinzol05/5e3dc3f26633bcaed9f7328f0b6cab97 to your computer and use it in GitHub Desktop.
HuggingFace Transformers 4.51.3 Flash multipacking
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "fa9c5a2e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[2025-07-23 14:23:00,522] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/usr/bin/ld: cannot find -laio\n", | |
"collect2: error: ld returned 1 exit status\n", | |
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'\n", | |
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'\n", | |
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'\n", | |
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'\n", | |
"collect2: error: ld returned 1 exit status\n" | |
] | |
} | |
], | |
"source": [ | |
"import torch\n", | |
"from transformers import AutoTokenizer, AutoModelForCausalLM, AddedToken, Qwen3ForCausalLM\n", | |
"from streaming import LocalDataset\n", | |
"from streaming.base.format.mds.encodings import Encoding, _encodings\n", | |
"from cut_cross_entropy import linear_cross_entropy\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "d59049af", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"32779" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Base Qwen3 tokenizer; the speech / prosody vocabulary is appended below.\n", | |
"tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B-Base')\n", | |
"\n", | |
"# Keep the original ordering (start marker, speech codes, then pitch/rate\n", | |
"# interleaved per level) - new ids are presumably assigned in list order.\n", | |
"extra = [\n", | |
"    AddedToken('<|speech_start|>'),\n", | |
"    *(AddedToken(f'<|speech_{i}|>') for i in range(32768)),\n", | |
"    *(\n", | |
"        AddedToken(name)\n", | |
"        for i in range(5)\n", | |
"        for name in (f'<|pitch_{i}|>', f'<|rate_{i}|>')\n", | |
"    ),\n", | |
"]\n", | |
"# Last expression of the cell, so its return value is what gets displayed.\n", | |
"tokenizer.add_tokens(extra)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "e648a52b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class Model(Qwen3ForCausalLM):\n", | |
"    \"\"\"Qwen3 causal LM whose training loss is computed with cut-cross-entropy.\n", | |
"\n", | |
"    The loss is computed directly from the final hidden states and the `lm_head`\n", | |
"    weight via `linear_cross_entropy`; `lm_head` is not applied as a layer here.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    # Keyword names of the packing metadata that is forwarded to the backbone\n", | |
"    # (the flash-attention kwargs of transformers: cumulative lengths + max length).\n", | |
"    _PACKING_KWARGS = ('cu_seq_lens_q', 'cu_seq_lens_k', 'max_length_q', 'max_length_k')\n", | |
"\n", | |
"    def forward(self, input_ids, attention_mask = None, position_ids = None, labels = None, **kwargs):\n", | |
"        \"\"\"Run the backbone and, when `labels` is given, return the training loss.\n", | |
"\n", | |
"        Args:\n", | |
"            input_ids: token ids passed to the backbone.\n", | |
"            attention_mask: optional mask, passed through unchanged.\n", | |
"            position_ids: optional position ids, passed through unchanged.\n", | |
"            labels: optional targets. They are passed unshifted because the loss\n", | |
"                is called with `shift=True`.\n", | |
"            **kwargs: only the keys listed in `_PACKING_KWARGS` are forwarded to\n", | |
"                the backbone; every other extra keyword is ignored, as before.\n", | |
"\n", | |
"        Returns:\n", | |
"            `{'loss': tensor}` when `labels` is not None, otherwise the backbone\n", | |
"            output object.\n", | |
"        \"\"\"\n", | |
"        # The packing metadata used to be swallowed by **kwargs and never reached\n", | |
"        # the attention layers; forward it explicitly (None values are skipped).\n", | |
"        packing = {\n", | |
"            key: kwargs[key]\n", | |
"            for key in self._PACKING_KWARGS\n", | |
"            if kwargs.get(key) is not None\n", | |
"        }\n", | |
"        # Call the module (not .forward) so registered module hooks still run.\n", | |
"        super_out = self.model(\n", | |
"            input_ids = input_ids,\n", | |
"            position_ids = position_ids,\n", | |
"            attention_mask = attention_mask,\n", | |
"            output_hidden_states = True,\n", | |
"            **packing,\n", | |
"        )\n", | |
"        if labels is None:\n", | |
"            return super_out\n", | |
"\n", | |
"        embeddings = super_out.last_hidden_state\n", | |
"        auto_shift_loss = linear_cross_entropy(\n", | |
"            embeddings.to(torch.bfloat16),\n", | |
"            self.lm_head.weight.to(torch.bfloat16),\n", | |
"            labels,\n", | |
"            shift=True,\n", | |
"            impl=\"cce_kahan_full_c\"\n", | |
"        )\n", | |
"        return {'loss': auto_shift_loss}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "2c2f678e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Load the checkpoint in bf16 with the flash-attention-2 backend selected,\n", | |
"# then move the whole module to the default CUDA device.\n", | |
"model = Model.from_pretrained(\n", | |
"    'Qwen/Qwen3-0.6B-Base',\n", | |
"    torch_dtype = torch.bfloat16,\n", | |
"    attn_implementation = 'flash_attention_2',\n", | |
")\n", | |
"model = model.cuda()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "bbaeec27", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Embedding(184448, 1024)" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Resize the embedding matrix to the extended tokenizer length, padded up to a\n", | |
"# multiple of 8, with mean_resizing switched off.\n", | |
"model.resize_token_embeddings(len(tokenizer), mean_resizing=False, pad_to_multiple_of=8)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "0520294d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Wed Jul 23 14:23:06 2025 \n", | |
"+-----------------------------------------------------------------------------------------+\n", | |
"| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 |\n", | |
"|-----------------------------------------+------------------------+----------------------+\n", | |
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", | |
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", | |
"| | | MIG M. |\n", | |
"|=========================================+========================+======================|\n", | |
"| 0 NVIDIA GeForce RTX 3090 Ti Off | 00000000:01:00.0 Off | Off |\n", | |
"| 30% 41C P0 37W / 400W | 1824MiB / 24564MiB | 19% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
"| 1 NVIDIA GeForce RTX 3090 Ti Off | 00000000:08:00.0 Off | Off |\n", | |
"| 0% 49C P8 22W / 350W | 18MiB / 24564MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
"| 2 NVIDIA GeForce RTX 3090 Off | 00000000:09:00.0 Off | N/A |\n", | |
"| 0% 36C P8 25W / 300W | 4847MiB / 24576MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
" \n", | |
"+-----------------------------------------------------------------------------------------+\n", | |
"| Processes: |\n", | |
"| GPU GI CI PID Type Process name GPU Memory |\n", | |
"| ID ID Usage |\n", | |
"|=========================================================================================|\n", | |
"| 0 N/A N/A 1886 G /usr/lib/xorg/Xorg 9MiB |\n", | |
"| 0 N/A N/A 4930 G /usr/bin/gnome-shell 6MiB |\n", | |
"| 0 N/A N/A 3157815 C /usr/bin/python3.10 1784MiB |\n", | |
"| 1 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n", | |
"| 2 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n", | |
"| 2 N/A N/A 3130035 C /usr/bin/python3.10 4824MiB |\n", | |
"+-----------------------------------------------------------------------------------------+\n" | |
] | |
} | |
], | |
"source": [ | |
"# GPU memory snapshot taken before the packed forward pass further down.\n", | |
"!nvidia-smi" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "d840eb0a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class UInt32(Encoding):\n", | |
"    \"\"\"Column encoding that stores an array as raw uint32 bytes.\"\"\"\n", | |
"\n", | |
"    def encode(self, obj) -> bytes:\n", | |
"        \"\"\"Serialize `obj` as uint32 bytes.\n", | |
"\n", | |
"        `decode` always reads the buffer back as uint32, so any other dtype is\n", | |
"        converted first instead of being written with a different item size.\n", | |
"\n", | |
"        Raises:\n", | |
"            ValueError: if a value cannot be represented as uint32.\n", | |
"        \"\"\"\n", | |
"        values = np.asarray(obj)\n", | |
"        if values.dtype != np.uint32:\n", | |
"            limit = np.iinfo(np.uint32).max\n", | |
"            if values.size and (values.min() < 0 or values.max() > limit):\n", | |
"                raise ValueError('UInt32 encoding requires values in [0, 2**32 - 1]')\n", | |
"            values = values.astype(np.uint32)\n", | |
"        return values.tobytes()\n", | |
"\n", | |
"    def decode(self, data: bytes):\n", | |
"        \"\"\"Interpret `data` as a flat uint32 array (a view over the buffer, not a copy).\"\"\"\n", | |
"        return np.frombuffer(data, np.uint32)\n", | |
"\n", | |
"# Register the encoding under the column type name 'uint32'.\n", | |
"_encodings['uint32'] = UInt32\n", | |
"\n", | |
"class DatasetFixed(torch.utils.data.Dataset):\n", | |
"    \"\"\"Wraps a `LocalDataset` and returns each sample as a dict of int64 arrays.\"\"\"\n", | |
"\n", | |
"    def __init__(self, local):\n", | |
"        self.dataset = LocalDataset(local=local)\n", | |
"\n", | |
"    def __len__(self):\n", | |
"        return len(self.dataset)\n", | |
"\n", | |
"    def __getitem__(self, idx):\n", | |
"        \"\"\"Fetch sample `idx`, drop the unused columns and cast the rest to int64.\n", | |
"\n", | |
"        Example of a returned sample:\n", | |
"\n", | |
"            {'attention_mask': array([ 471, 775, 661, 1255, 568]),\n", | |
"             'input_ids': array([151644, 71, 810, ..., 160361, 182069, 151645]),\n", | |
"             'position_ids': array([ 0, 1, 2, ..., 565, 566, 567])\n", | |
"            }\n", | |
"        \"\"\"\n", | |
"        sample = self.dataset[idx]\n", | |
"        for unused in ('audio', 'text', 'token_type_ids'):\n", | |
"            sample.pop(unused, None)\n", | |
"\n", | |
"        # Values are replaced under existing keys only, so iterating is safe.\n", | |
"        for name, column in sample.items():\n", | |
"            sample[name] = column.astype(np.int64)\n", | |
"\n", | |
"        return sample" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "62a5f926", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# 'packing-qwen3' is the local path that DatasetFixed hands to LocalDataset.\n", | |
"dataset = DatasetFixed('packing-qwen3')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "ec3522db", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def collator(batch):\n", | |
"    \"\"\"Flatten a list of packed samples into one row plus flash-attention metadata.\n", | |
"\n", | |
"    Args:\n", | |
"        batch: list of dicts holding numpy arrays under 'input_ids',\n", | |
"            'position_ids' and 'attention_mask'; `None` entries are skipped.\n", | |
"            'attention_mask' is read as the lengths of the segments packed into\n", | |
"            the sample, not as a 0/1 mask.\n", | |
"\n", | |
"    Returns:\n", | |
"        dict with 'input_ids', 'position_ids' and 'labels' of shape\n", | |
"        (1, total_tokens), int32 'cu_seq_lens_q' / 'cu_seq_lens_k' (cumulative\n", | |
"        segment boundaries starting at 0) and 'max_length_q' / 'max_length_k'\n", | |
"        as plain Python ints.\n", | |
"\n", | |
"    Raises:\n", | |
"        ValueError: if no usable sample is left, or if the segment lengths do\n", | |
"            not add up to the number of tokens.\n", | |
"    \"\"\"\n", | |
"    samples = [b for b in batch if b is not None]\n", | |
"    if not samples:\n", | |
"        raise ValueError('collator received no usable samples')\n", | |
"\n", | |
"    input_ids = np.concatenate([s['input_ids'] for s in samples])\n", | |
"    position_ids = np.concatenate([s['position_ids'] for s in samples])\n", | |
"    query_lens = np.concatenate([s['attention_mask'] for s in samples])\n", | |
"\n", | |
"    cumsum = [0] + np.cumsum(query_lens).tolist()\n", | |
"    # The last boundary must land exactly on the end of the flattened row.\n", | |
"    if cumsum[-1] != len(input_ids):\n", | |
"        raise ValueError(\n", | |
"            f'segment lengths sum to {cumsum[-1]} but the batch has {len(input_ids)} tokens'\n", | |
"        )\n", | |
"    # Hand out a plain int rather than a numpy scalar.\n", | |
"    max_seqlen = int(np.max(query_lens))\n", | |
"\n", | |
"    # NOTE(review): labels are an unmasked copy of input_ids. If the loss shifts\n", | |
"    # labels by one, the last token of a segment is trained to predict the first\n", | |
"    # token of the next segment - confirm that this is intended.\n", | |
"    return {\n", | |
"        'input_ids': torch.tensor(input_ids)[None],\n", | |
"        'position_ids': torch.tensor(position_ids)[None],\n", | |
"        # torch.tensor copies, so labels do not share storage with input_ids.\n", | |
"        'labels': torch.tensor(input_ids)[None],\n", | |
"        'cu_seq_lens_q': torch.tensor(cumsum, dtype=torch.int32),\n", | |
"        'cu_seq_lens_k': torch.tensor(cumsum, dtype=torch.int32),\n", | |
"        'max_length_q': max_seqlen,\n", | |
"        'max_length_k': max_seqlen\n", | |
"    }" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "1743bcc1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'attention_mask': array([ 471, 775, 661, 1255, 568]),\n", | |
" 'input_ids': array([151644, 71, 810, ..., 160361, 182069, 151645]),\n", | |
" 'position_ids': array([ 0, 1, 2, ..., 565, 566, 567])}" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Peek at one packed sample; its 'attention_mask' holds per-segment lengths, not a 0/1 mask.\n", | |
"dataset[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "59d244d8", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# NOTE: despite its name, `input_ids` holds the whole collated batch dict.\n", | |
"input_ids = collator([dataset[i] for i in range(3)])\n", | |
"# Only tensor entries are moved to the GPU; the other entries stay as they are.\n", | |
"input_ids.update({\n", | |
"    name: value.cuda()\n", | |
"    for name, value in input_ids.items()\n", | |
"    if isinstance(value, torch.Tensor)\n", | |
"})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "aa27142f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'input_ids': tensor([[151644, 71, 810, ..., 155599, 158241, 151645]],\n", | |
" device='cuda:0'),\n", | |
" 'position_ids': tensor([[ 0, 1, 2, ..., 821, 822, 823]], device='cuda:0'),\n", | |
" 'labels': tensor([[151644, 71, 810, ..., 155599, 158241, 151645]],\n", | |
" device='cuda:0'),\n", | |
" 'cu_seq_lens_q': tensor([ 0, 471, 1246, 1907, 3162, 3730, 4983, 5917, 6944, 7853,\n", | |
" 8852, 9561, 10385], device='cuda:0', dtype=torch.int32),\n", | |
" 'cu_seq_lens_k': tensor([ 0, 471, 1246, 1907, 3162, 3730, 4983, 5917, 6944, 7853,\n", | |
" 8852, 9561, 10385], device='cuda:0', dtype=torch.int32),\n", | |
" 'max_length_q': 1255,\n", | |
" 'max_length_k': 1255}" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Display the collated batch after its tensors were moved to the GPU.\n", | |
"input_ids" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "350f5e51", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'loss': tensor(15.1131, device='cuda:0', grad_fn=<LinearCrossEntropyFunctionBackward>)}" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Forward pass on the packed batch. 'labels' is present, so the result is the\n", | |
"# {'loss': ...} dict; the cu_seq_lens_* / max_length_* entries arrive in forward's **kwargs.\n", | |
"model(**input_ids)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "b7308f51", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Wed Jul 23 14:24:33 2025 \n", | |
"+-----------------------------------------------------------------------------------------+\n", | |
"| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 |\n", | |
"|-----------------------------------------+------------------------+----------------------+\n", | |
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", | |
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", | |
"| | | MIG M. |\n", | |
"|=========================================+========================+======================|\n", | |
"| 0 NVIDIA GeForce RTX 3090 Ti Off | 00000000:01:00.0 Off | Off |\n", | |
"| 30% 38C P8 22W / 400W | 22814MiB / 24564MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
"| 1 NVIDIA GeForce RTX 3090 Ti Off | 00000000:08:00.0 Off | Off |\n", | |
"| 0% 49C P8 23W / 350W | 18MiB / 24564MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
"| 2 NVIDIA GeForce RTX 3090 Off | 00000000:09:00.0 Off | N/A |\n", | |
"| 0% 38C P8 24W / 300W | 4847MiB / 24576MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
" \n", | |
"+-----------------------------------------------------------------------------------------+\n", | |
"| Processes: |\n", | |
"| GPU GI CI PID Type Process name GPU Memory |\n", | |
"| ID ID Usage |\n", | |
"|=========================================================================================|\n", | |
"| 0 N/A N/A 1886 G /usr/lib/xorg/Xorg 9MiB |\n", | |
"| 0 N/A N/A 4930 G /usr/bin/gnome-shell 6MiB |\n", | |
"| 0 N/A N/A 3157815 C /usr/bin/python3.10 22774MiB |\n", | |
"| 1 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n", | |
"| 2 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n", | |
"| 2 N/A N/A 3130035 C /usr/bin/python3.10 4824MiB |\n", | |
"+-----------------------------------------------------------------------------------------+\n" | |
] | |
} | |
], | |
"source": [ | |
"# Second snapshot, taken after the forward pass, to compare GPU memory with the earlier one.\n", | |
"!nvidia-smi" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "python3.10", | |
"language": "python", | |
"name": "python3.10" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.17" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment