Last active
July 23, 2025 06:35
-
-
Save huseinzol05/5e3dc3f26633bcaed9f7328f0b6cab97 to your computer and use it in GitHub Desktop.
HuggingFace Transformers 4.51.3 Flash multipacking
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "4f36b078", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[2025-07-23 14:33:37,504] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/usr/bin/ld: cannot find -laio\n", | |
"collect2: error: ld returned 1 exit status\n", | |
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'\n", | |
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'\n", | |
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'\n", | |
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'\n", | |
"collect2: error: ld returned 1 exit status\n" | |
] | |
} | |
], | |
"source": [ | |
"import torch\n", | |
"import torch.nn.functional as F\n", | |
"from transformers import AutoTokenizer, AutoModelForCausalLM, AddedToken, Qwen3ForCausalLM\n", | |
"from transformers import default_data_collator\n", | |
"from streaming import LocalDataset\n", | |
"from streaming.base.format.mds.encodings import Encoding, _encodings\n", | |
"from cut_cross_entropy import linear_cross_entropy\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "2f2a9eb2", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"32779" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B-Base')\n", | |
"\n", | |
"extra = [AddedToken('<|speech_start|>')]\n", | |
"for i in range(32768):\n", | |
" extra.append(AddedToken(f'<|speech_{i}|>'))\n", | |
"for i in range(5):\n", | |
" extra.append(AddedToken(f'<|pitch_{i}|>'))\n", | |
" extra.append(AddedToken(f'<|rate_{i}|>'))\n", | |
"tokenizer.add_tokens(extra)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "f8c2fe70", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class Model(Qwen3ForCausalLM):\n", | |
" def __init__(self, config):\n", | |
" super().__init__(config)\n", | |
" \n", | |
" def forward(self, input_ids, attention_mask = None, position_ids = None, labels = None, **kwargs):\n", | |
" super_out = self.model.forward(\n", | |
" input_ids = input_ids, \n", | |
" position_ids = position_ids, \n", | |
" attention_mask = attention_mask, \n", | |
" output_hidden_states = True,\n", | |
" )\n", | |
" if labels is not None:\n", | |
" embeddings = super_out.last_hidden_state\n", | |
" auto_shift_loss = linear_cross_entropy(\n", | |
" embeddings.to(torch.bfloat16), \n", | |
" self.lm_head.weight.to(torch.bfloat16), \n", | |
" labels, \n", | |
" shift=True,\n", | |
" impl=\"cce_kahan_full_c\"\n", | |
" )\n", | |
" return {'loss': auto_shift_loss}\n", | |
" return super_out" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "10f428ef", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"model = Model.from_pretrained(\n", | |
" 'Qwen/Qwen3-0.6B-Base', attn_implementation = 'sdpa',\n", | |
" torch_dtype = torch.bfloat16\n", | |
").cuda()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "aed214ff", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Embedding(184448, 1024)" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"model.resize_token_embeddings(len(tokenizer), mean_resizing=False, pad_to_multiple_of=8)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "4f2ed186", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Wed Jul 23 14:33:43 2025 \n", | |
"+-----------------------------------------------------------------------------------------+\n", | |
"| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 |\n", | |
"|-----------------------------------------+------------------------+----------------------+\n", | |
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", | |
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", | |
"| | | MIG M. |\n", | |
"|=========================================+========================+======================|\n", | |
"| 0 NVIDIA GeForce RTX 3090 Ti Off | 00000000:01:00.0 Off | Off |\n", | |
"| 30% 40C P0 53W / 400W | 1824MiB / 24564MiB | 27% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
"| 1 NVIDIA GeForce RTX 3090 Ti Off | 00000000:08:00.0 Off | Off |\n", | |
"| 0% 49C P8 23W / 350W | 18MiB / 24564MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
"| 2 NVIDIA GeForce RTX 3090 Off | 00000000:09:00.0 Off | N/A |\n", | |
"| 0% 47C P8 26W / 300W | 4847MiB / 24576MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
" \n", | |
"+-----------------------------------------------------------------------------------------+\n", | |
"| Processes: |\n", | |
"| GPU GI CI PID Type Process name GPU Memory |\n", | |
"| ID ID Usage |\n", | |
"|=========================================================================================|\n", | |
"| 0 N/A N/A 1886 G /usr/lib/xorg/Xorg 9MiB |\n", | |
"| 0 N/A N/A 4930 G /usr/bin/gnome-shell 6MiB |\n", | |
"| 0 N/A N/A 3160824 C /usr/bin/python3.10 1784MiB |\n", | |
"| 1 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n", | |
"| 2 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n", | |
"| 2 N/A N/A 3130035 C /usr/bin/python3.10 4824MiB |\n", | |
"+-----------------------------------------------------------------------------------------+\n" | |
] | |
} | |
], | |
"source": [ | |
"!nvidia-smi" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "bb0df87d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def pad_attention_mask_3d(attention_mask, max_size = 4096, value = 0.0):\n", | |
" maxlen = attention_mask.shape[-1]\n", | |
" return F.pad(\n", | |
" attention_mask,\n", | |
" (0, max_size - maxlen, 0, max_size - maxlen),\n", | |
" value = value,\n", | |
" )\n", | |
"\n", | |
"def block_diagonal_concat_inverted(*masks, dtype=torch.bfloat16):\n", | |
" total_size = sum(mask.size(0) for mask in masks)\n", | |
" combined_mask = torch.zeros(total_size, total_size, dtype=dtype)\n", | |
"\n", | |
" current_pos = 0\n", | |
"\n", | |
" for mask in masks:\n", | |
" size = mask.size(0)\n", | |
" combined_mask[current_pos:current_pos + size, current_pos:current_pos + size] = mask\n", | |
" current_pos += size\n", | |
"\n", | |
" min_value = torch.finfo(dtype).min if dtype.is_floating_point else torch.iinfo(dtype).min\n", | |
" inverted_mask = torch.where(combined_mask == 1, torch.tensor(0, dtype=dtype), min_value)\n", | |
" return inverted_mask.unsqueeze(0)\n", | |
"\n", | |
"class UInt32(Encoding):\n", | |
" def encode(self, obj) -> bytes:\n", | |
" return obj.tobytes()\n", | |
"\n", | |
" def decode(self, data: bytes):\n", | |
" return np.frombuffer(data, np.uint32)\n", | |
"\n", | |
"_encodings['uint32'] = UInt32\n", | |
"min_dtype = torch.finfo(torch.bfloat16).min\n", | |
"sequence_length = 4096\n", | |
"\n", | |
"class DatasetFixed(torch.utils.data.Dataset):\n", | |
" def __init__(self, local):\n", | |
" self.dataset = LocalDataset(local=local)\n", | |
"\n", | |
" def __getitem__(self, idx):\n", | |
" data = self.dataset[idx]\n", | |
" data.pop('audio', None)\n", | |
" data.pop('text', None)\n", | |
" data.pop('token_type_ids', None)\n", | |
"\n", | |
" for k in data.keys():\n", | |
" data[k] = data[k].astype(np.int64)\n", | |
"\n", | |
" data['input_ids'] = np.pad(data['input_ids'], \n", | |
" (0, sequence_length - data['input_ids'].shape[0]), \n", | |
" 'constant', constant_values=(0, tokenizer.pad_token_id))\n", | |
" data['position_ids'] = np.pad(data['position_ids'], \n", | |
" (0, sequence_length - data['position_ids'].shape[0]), \n", | |
" 'constant', constant_values=(0, 32000))\n", | |
" data['labels'] = data[\"input_ids\"].copy()\n", | |
" data['labels'][data['labels'] == tokenizer.pad_token_id] = -100\n", | |
"\n", | |
" masking = data.pop('attention_mask')\n", | |
" masks = []\n", | |
" for m in masking:\n", | |
" masks.append(torch.tril(torch.ones(m, m)))\n", | |
" attention_mask = block_diagonal_concat_inverted(*masks)\n", | |
" data['attention_mask'] = pad_attention_mask_3d(\n", | |
" attention_mask, sequence_length, min_dtype)\n", | |
" data['attention_mask'] = data['attention_mask']\n", | |
"\n", | |
" return data\n", | |
"\n", | |
" def __len__(self):\n", | |
" return len(self.dataset)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "1a0e2e29", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dataset = DatasetFixed('packing-qwen3')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "eb493e24", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"input_ids = default_data_collator([dataset[0], dataset[1]])\n", | |
"for k in input_ids.keys():\n", | |
" if isinstance(input_ids[k], torch.Tensor):\n", | |
" input_ids[k] = input_ids[k].cuda()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "40ddfbd6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'loss': tensor(14.9695, device='cuda:0', grad_fn=<LinearCrossEntropyFunctionBackward>)}" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"model(**input_ids)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "baecd701", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Wed Jul 23 14:33:44 2025 \n", | |
"+-----------------------------------------------------------------------------------------+\n", | |
"| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 |\n", | |
"|-----------------------------------------+------------------------+----------------------+\n", | |
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", | |
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", | |
"| | | MIG M. |\n", | |
"|=========================================+========================+======================|\n", | |
"| 0 NVIDIA GeForce RTX 3090 Ti Off | 00000000:01:00.0 Off | Off |\n", | |
"| 30% 43C P0 201W / 400W | 19792MiB / 24564MiB | 17% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
"| 1 NVIDIA GeForce RTX 3090 Ti Off | 00000000:08:00.0 Off | Off |\n", | |
"| 0% 49C P5 33W / 350W | 18MiB / 24564MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
"| 2 NVIDIA GeForce RTX 3090 Off | 00000000:09:00.0 Off | N/A |\n", | |
"| 0% 47C P8 26W / 300W | 4847MiB / 24576MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+------------------------+----------------------+\n", | |
" \n", | |
"+-----------------------------------------------------------------------------------------+\n", | |
"| Processes: |\n", | |
"| GPU GI CI PID Type Process name GPU Memory |\n", | |
"| ID ID Usage |\n", | |
"|=========================================================================================|\n", | |
"| 0 N/A N/A 1886 G /usr/lib/xorg/Xorg 9MiB |\n", | |
"| 0 N/A N/A 4930 G /usr/bin/gnome-shell 6MiB |\n", | |
"| 0 N/A N/A 3160824 C /usr/bin/python3.10 19752MiB |\n", | |
"| 1 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n", | |
"| 2 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n", | |
"| 2 N/A N/A 3130035 C /usr/bin/python3.10 4824MiB |\n", | |
"+-----------------------------------------------------------------------------------------+\n" | |
] | |
} | |
], | |
"source": [ | |
"!nvidia-smi" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "python3.10", | |
"language": "python", | |
"name": "python3.10" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.17" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment