Skip to content

Instantly share code, notes, and snippets.

@huseinzol05
Last active July 23, 2025 06:35
Show Gist options
  • Save huseinzol05/5e3dc3f26633bcaed9f7328f0b6cab97 to your computer and use it in GitHub Desktop.
Save huseinzol05/5e3dc3f26633bcaed9f7328f0b6cab97 to your computer and use it in GitHub Desktop.
HuggingFace Transformers 4.51.3 Flash multipacking
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "4f36b078",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2025-07-23 14:33:37,504] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/bin/ld: cannot find -laio\n",
"collect2: error: ld returned 1 exit status\n",
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'\n",
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'\n",
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'\n",
"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'\n",
"collect2: error: ld returned 1 exit status\n"
]
}
],
"source": [
"import torch\n",
"import torch.nn.functional as F\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, AddedToken, Qwen3ForCausalLM\n",
"from transformers import default_data_collator\n",
"from streaming import LocalDataset\n",
"from streaming.base.format.mds.encodings import Encoding, _encodings\n",
"from cut_cross_entropy import linear_cross_entropy\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2f2a9eb2",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"32779"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B-Base')\n",
"\n",
"extra = [AddedToken('<|speech_start|>')]\n",
"for i in range(32768):\n",
" extra.append(AddedToken(f'<|speech_{i}|>'))\n",
"for i in range(5):\n",
" extra.append(AddedToken(f'<|pitch_{i}|>'))\n",
" extra.append(AddedToken(f'<|rate_{i}|>'))\n",
"tokenizer.add_tokens(extra)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f8c2fe70",
"metadata": {},
"outputs": [],
"source": [
"class Model(Qwen3ForCausalLM):\n",
" def __init__(self, config):\n",
" super().__init__(config)\n",
" \n",
" def forward(self, input_ids, attention_mask = None, position_ids = None, labels = None, **kwargs):\n",
" super_out = self.model.forward(\n",
" input_ids = input_ids, \n",
" position_ids = position_ids, \n",
" attention_mask = attention_mask, \n",
" output_hidden_states = True,\n",
" )\n",
" if labels is not None:\n",
" embeddings = super_out.last_hidden_state\n",
" auto_shift_loss = linear_cross_entropy(\n",
" embeddings.to(torch.bfloat16), \n",
" self.lm_head.weight.to(torch.bfloat16), \n",
" labels, \n",
" shift=True,\n",
" impl=\"cce_kahan_full_c\"\n",
" )\n",
" return {'loss': auto_shift_loss}\n",
" return super_out"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "10f428ef",
"metadata": {},
"outputs": [],
"source": [
"model = Model.from_pretrained(\n",
" 'Qwen/Qwen3-0.6B-Base', attn_implementation = 'sdpa',\n",
" torch_dtype = torch.bfloat16\n",
").cuda()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "aed214ff",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Embedding(184448, 1024)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.resize_token_embeddings(len(tokenizer), mean_resizing=False, pad_to_multiple_of=8)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4f2ed186",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wed Jul 23 14:33:43 2025 \n",
"+-----------------------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 |\n",
"|-----------------------------------------+------------------------+----------------------+\n",
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|=========================================+========================+======================|\n",
"| 0 NVIDIA GeForce RTX 3090 Ti Off | 00000000:01:00.0 Off | Off |\n",
"| 30% 40C P0 53W / 400W | 1824MiB / 24564MiB | 27% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
"| 1 NVIDIA GeForce RTX 3090 Ti Off | 00000000:08:00.0 Off | Off |\n",
"| 0% 49C P8 23W / 350W | 18MiB / 24564MiB | 0% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
"| 2 NVIDIA GeForce RTX 3090 Off | 00000000:09:00.0 Off | N/A |\n",
"| 0% 47C P8 26W / 300W | 4847MiB / 24576MiB | 0% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=========================================================================================|\n",
"| 0 N/A N/A 1886 G /usr/lib/xorg/Xorg 9MiB |\n",
"| 0 N/A N/A 4930 G /usr/bin/gnome-shell 6MiB |\n",
"| 0 N/A N/A 3160824 C /usr/bin/python3.10 1784MiB |\n",
"| 1 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n",
"| 2 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n",
"| 2 N/A N/A 3130035 C /usr/bin/python3.10 4824MiB |\n",
"+-----------------------------------------------------------------------------------------+\n"
]
}
],
"source": [
"!nvidia-smi"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "bb0df87d",
"metadata": {},
"outputs": [],
"source": [
"def pad_attention_mask_3d(attention_mask, max_size = 4096, value = 0.0):\n",
" maxlen = attention_mask.shape[-1]\n",
" return F.pad(\n",
" attention_mask,\n",
" (0, max_size - maxlen, 0, max_size - maxlen),\n",
" value = value,\n",
" )\n",
"\n",
"def block_diagonal_concat_inverted(*masks, dtype=torch.bfloat16):\n",
" total_size = sum(mask.size(0) for mask in masks)\n",
" combined_mask = torch.zeros(total_size, total_size, dtype=dtype)\n",
"\n",
" current_pos = 0\n",
"\n",
" for mask in masks:\n",
" size = mask.size(0)\n",
" combined_mask[current_pos:current_pos + size, current_pos:current_pos + size] = mask\n",
" current_pos += size\n",
"\n",
" min_value = torch.finfo(dtype).min if dtype.is_floating_point else torch.iinfo(dtype).min\n",
" inverted_mask = torch.where(combined_mask == 1, torch.tensor(0, dtype=dtype), min_value)\n",
" return inverted_mask.unsqueeze(0)\n",
"\n",
"class UInt32(Encoding):\n",
" def encode(self, obj) -> bytes:\n",
" return obj.tobytes()\n",
"\n",
" def decode(self, data: bytes):\n",
" return np.frombuffer(data, np.uint32)\n",
"\n",
"_encodings['uint32'] = UInt32\n",
"min_dtype = torch.finfo(torch.bfloat16).min\n",
"sequence_length = 4096\n",
"\n",
"class DatasetFixed(torch.utils.data.Dataset):\n",
" def __init__(self, local):\n",
" self.dataset = LocalDataset(local=local)\n",
"\n",
" def __getitem__(self, idx):\n",
" data = self.dataset[idx]\n",
" data.pop('audio', None)\n",
" data.pop('text', None)\n",
" data.pop('token_type_ids', None)\n",
"\n",
" for k in data.keys():\n",
" data[k] = data[k].astype(np.int64)\n",
"\n",
" data['input_ids'] = np.pad(data['input_ids'], \n",
" (0, sequence_length - data['input_ids'].shape[0]), \n",
" 'constant', constant_values=(0, tokenizer.pad_token_id))\n",
" data['position_ids'] = np.pad(data['position_ids'], \n",
" (0, sequence_length - data['position_ids'].shape[0]), \n",
" 'constant', constant_values=(0, 32000))\n",
" data['labels'] = data[\"input_ids\"].copy()\n",
" data['labels'][data['labels'] == tokenizer.pad_token_id] = -100\n",
"\n",
" masking = data.pop('attention_mask')\n",
" masks = []\n",
" for m in masking:\n",
" masks.append(torch.tril(torch.ones(m, m)))\n",
" attention_mask = block_diagonal_concat_inverted(*masks)\n",
" data['attention_mask'] = pad_attention_mask_3d(\n",
" attention_mask, sequence_length, min_dtype)\n",
" data['attention_mask'] = data['attention_mask']\n",
"\n",
" return data\n",
"\n",
" def __len__(self):\n",
" return len(self.dataset)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1a0e2e29",
"metadata": {},
"outputs": [],
"source": [
"dataset = DatasetFixed('packing-qwen3')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "eb493e24",
"metadata": {},
"outputs": [],
"source": [
"input_ids = default_data_collator([dataset[0], dataset[1]])\n",
"for k in input_ids.keys():\n",
" if isinstance(input_ids[k], torch.Tensor):\n",
" input_ids[k] = input_ids[k].cuda()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "40ddfbd6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'loss': tensor(14.9695, device='cuda:0', grad_fn=<LinearCrossEntropyFunctionBackward>)}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model(**input_ids)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "baecd701",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wed Jul 23 14:33:44 2025 \n",
"+-----------------------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 |\n",
"|-----------------------------------------+------------------------+----------------------+\n",
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|=========================================+========================+======================|\n",
"| 0 NVIDIA GeForce RTX 3090 Ti Off | 00000000:01:00.0 Off | Off |\n",
"| 30% 43C P0 201W / 400W | 19792MiB / 24564MiB | 17% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
"| 1 NVIDIA GeForce RTX 3090 Ti Off | 00000000:08:00.0 Off | Off |\n",
"| 0% 49C P5 33W / 350W | 18MiB / 24564MiB | 0% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
"| 2 NVIDIA GeForce RTX 3090 Off | 00000000:09:00.0 Off | N/A |\n",
"| 0% 47C P8 26W / 300W | 4847MiB / 24576MiB | 0% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=========================================================================================|\n",
"| 0 N/A N/A 1886 G /usr/lib/xorg/Xorg 9MiB |\n",
"| 0 N/A N/A 4930 G /usr/bin/gnome-shell 6MiB |\n",
"| 0 N/A N/A 3160824 C /usr/bin/python3.10 19752MiB |\n",
"| 1 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n",
"| 2 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB |\n",
"| 2 N/A N/A 3130035 C /usr/bin/python3.10 4824MiB |\n",
"+-----------------------------------------------------------------------------------------+\n"
]
}
],
"source": [
"!nvidia-smi"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "python3.10",
"language": "python",
"name": "python3.10"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.17"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment