huseinzol05 · July 23, 2025 06:35
diff --git a/test-flash-multipacking.ipynb b/test-flash-multipacking.ipynb
diff --git a/test-sdpa-multipacking.ipynb b/test-sdpa-multipacking.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4f36b078",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-07-23 14:33:37,504] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/bin/ld: cannot find -laio\n",
      "collect2: error: ld returned 1 exit status\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'\n",
      "collect2: error: ld returned 1 exit status\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn.functional as F\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, AddedToken, Qwen3ForCausalLM\n",
    "from transformers import default_data_collator\n",
    "from streaming import LocalDataset\n",
    "from streaming.base.format.mds.encodings import Encoding, _encodings\n",
    "from cut_cross_entropy import linear_cross_entropy\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2f2a9eb2",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32779"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B-Base')\n",
    "\n",
    "extra = [AddedToken('<|speech_start|>')]\n",
    "for i in range(32768):\n",
    "    extra.append(AddedToken(f'<|speech_{i}|>'))\n",
    "for i in range(5):\n",
    "    extra.append(AddedToken(f'<|pitch_{i}|>'))\n",
    "    extra.append(AddedToken(f'<|rate_{i}|>'))\n",
    "tokenizer.add_tokens(extra)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f8c2fe70",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model(Qwen3ForCausalLM):\n",
    "    def __init__(self, config):\n",
    "        super().__init__(config)\n",
    "        \n",
    "    def forward(self, input_ids, attention_mask = None, position_ids = None, labels = None, **kwargs):\n",
    "        super_out = self.model.forward(\n",
    "            input_ids = input_ids, \n",
    "            position_ids = position_ids, \n",
    "            attention_mask = attention_mask, \n",
    "            output_hidden_states = True,\n",
    "        )\n",
    "        if labels is not None:\n",
    "            embeddings = super_out.last_hidden_state\n",
    "            auto_shift_loss = linear_cross_entropy(\n",
    "                embeddings.to(torch.bfloat16), \n",
    "                self.lm_head.weight.to(torch.bfloat16), \n",
    "                labels, \n",
    "                shift=True,\n",
    "                impl=\"cce_kahan_full_c\"\n",
    "            )\n",
    "            return {'loss': auto_shift_loss}\n",
    "        return super_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "10f428ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Model.from_pretrained(\n",
    "    'Qwen/Qwen3-0.6B-Base', attn_implementation = 'sdpa',\n",
    "    torch_dtype = torch.bfloat16\n",
    ").cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "aed214ff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Embedding(184448, 1024)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.resize_token_embeddings(len(tokenizer), mean_resizing=False, pad_to_multiple_of=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4f2ed186",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Jul 23 14:33:43 2025       \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |\n",
      "|-----------------------------------------+------------------------+----------------------+\n",
      "| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                        |               MIG M. |\n",
      "|=========================================+========================+======================|\n",
      "|   0  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:01:00.0 Off |                  Off |\n",
      "| 30%   40C    P0             53W /  400W |    1824MiB /  24564MiB |     27%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "|   1  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:08:00.0 Off |                  Off |\n",
      "|  0%   49C    P8             23W /  350W |      18MiB /  24564MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "|   2  NVIDIA GeForce RTX 3090        Off |   00000000:09:00.0 Off |                  N/A |\n",
      "|  0%   47C    P8             26W /  300W |    4847MiB /  24576MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "                                                                                         \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                              |\n",
      "|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |\n",
      "|        ID   ID                                                               Usage      |\n",
      "|=========================================================================================|\n",
      "|    0   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              9MiB |\n",
      "|    0   N/A  N/A      4930      G   /usr/bin/gnome-shell                            6MiB |\n",
      "|    0   N/A  N/A   3160824      C   /usr/bin/python3.10                          1784MiB |\n",
      "|    1   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
      "|    2   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
      "|    2   N/A  N/A   3130035      C   /usr/bin/python3.10                          4824MiB |\n",
      "+-----------------------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "bb0df87d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def pad_attention_mask_3d(attention_mask, max_size = 4096, value = 0.0):\n",
    "    maxlen = attention_mask.shape[-1]\n",
    "    return F.pad(\n",
    "        attention_mask,\n",
    "        (0, max_size - maxlen, 0, max_size - maxlen),\n",
    "        value = value,\n",
    "    )\n",
    "\n",
    "def block_diagonal_concat_inverted(*masks, dtype=torch.bfloat16):\n",
    "    total_size = sum(mask.size(0) for mask in masks)\n",
    "    combined_mask = torch.zeros(total_size, total_size, dtype=dtype)\n",
    "\n",
    "    current_pos = 0\n",
    "\n",
    "    for mask in masks:\n",
    "        size = mask.size(0)\n",
    "        combined_mask[current_pos:current_pos + size, current_pos:current_pos + size] = mask\n",
    "        current_pos += size\n",
    "\n",
    "    min_value = torch.finfo(dtype).min if dtype.is_floating_point else torch.iinfo(dtype).min\n",
    "    inverted_mask = torch.where(combined_mask == 1, torch.tensor(0, dtype=dtype), min_value)\n",
    "    return inverted_mask.unsqueeze(0)\n",
    "\n",
    "class UInt32(Encoding):\n",
    "    def encode(self, obj) -> bytes:\n",
    "        return obj.tobytes()\n",
    "\n",
    "    def decode(self, data: bytes):\n",
    "        return np.frombuffer(data, np.uint32)\n",
    "\n",
    "_encodings['uint32'] = UInt32\n",
    "min_dtype = torch.finfo(torch.bfloat16).min\n",
    "sequence_length = 4096\n",
    "\n",
    "class DatasetFixed(torch.utils.data.Dataset):\n",
    "    def __init__(self, local):\n",
    "        self.dataset = LocalDataset(local=local)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        data = self.dataset[idx]\n",
    "        data.pop('audio', None)\n",
    "        data.pop('text', None)\n",
    "        data.pop('token_type_ids', None)\n",
    "\n",
    "        for k in data.keys():\n",
    "            data[k] = data[k].astype(np.int64)\n",
    "\n",
    "        data['input_ids'] = np.pad(data['input_ids'], \n",
    "            (0, sequence_length - data['input_ids'].shape[0]), \n",
    "            'constant', constant_values=(0, tokenizer.pad_token_id))\n",
    "        data['position_ids'] = np.pad(data['position_ids'], \n",
    "            (0, sequence_length - data['position_ids'].shape[0]), \n",
    "            'constant', constant_values=(0, 32000))\n",
    "        data['labels'] = data[\"input_ids\"].copy()\n",
    "        data['labels'][data['labels'] == tokenizer.pad_token_id] = -100\n",
    "\n",
    "        masking = data.pop('attention_mask')\n",
    "        masks = []\n",
    "        for m in masking:\n",
    "            masks.append(torch.tril(torch.ones(m, m)))\n",
    "        attention_mask = block_diagonal_concat_inverted(*masks)\n",
    "        data['attention_mask'] = pad_attention_mask_3d(\n",
    "            attention_mask, sequence_length, min_dtype)\n",
    "        data['attention_mask'] = data['attention_mask']\n",
    "\n",
    "        return data\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1a0e2e29",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = DatasetFixed('packing-qwen3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "eb493e24",
   "metadata": {},
   "outputs": [],
   "source": [
    "input_ids = default_data_collator([dataset[0], dataset[1]])\n",
    "for k in input_ids.keys():\n",
    "    if isinstance(input_ids[k], torch.Tensor):\n",
    "        input_ids[k] = input_ids[k].cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "40ddfbd6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'loss': tensor(14.9695, device='cuda:0', grad_fn=<LinearCrossEntropyFunctionBackward>)}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model(**input_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "baecd701",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Jul 23 14:33:44 2025       \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |\n",
      "|-----------------------------------------+------------------------+----------------------+\n",
      "| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                        |               MIG M. |\n",
      "|=========================================+========================+======================|\n",
      "|   0  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:01:00.0 Off |                  Off |\n",
      "| 30%   43C    P0            201W /  400W |   19792MiB /  24564MiB |     17%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "|   1  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:08:00.0 Off |                  Off |\n",
      "|  0%   49C    P5             33W /  350W |      18MiB /  24564MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "|   2  NVIDIA GeForce RTX 3090        Off |   00000000:09:00.0 Off |                  N/A |\n",
      "|  0%   47C    P8             26W /  300W |    4847MiB /  24576MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "                                                                                         \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                              |\n",
      "|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |\n",
      "|        ID   ID                                                               Usage      |\n",
      "|=========================================================================================|\n",
      "|    0   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              9MiB |\n",
      "|    0   N/A  N/A      4930      G   /usr/bin/gnome-shell                            6MiB |\n",
      "|    0   N/A  N/A   3160824      C   /usr/bin/python3.10                         19752MiB |\n",
      "|    1   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
      "|    2   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
      "|    2   N/A  N/A   3130035      C   /usr/bin/python3.10                          4824MiB |\n",
      "+-----------------------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "!nvidia-smi"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python3.10",
   "language": "python",
   "name": "python3.10"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "4f36b078",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[2025-07-23 14:33:37,504] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/usr/bin/ld: cannot find -laio\n",
	"collect2: error: ld returned 1 exit status\n",
	"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'\n",
	"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'\n",
	"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'\n",
	"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'\n",
	"collect2: error: ld returned 1 exit status\n"
	]
	}
	],
	"source": [
	"import torch\n",
	"import torch.nn.functional as F\n",
	"from transformers import AutoTokenizer, AutoModelForCausalLM, AddedToken, Qwen3ForCausalLM\n",
	"from transformers import default_data_collator\n",
	"from streaming import LocalDataset\n",
	"from streaming.base.format.mds.encodings import Encoding, _encodings\n",
	"from cut_cross_entropy import linear_cross_entropy\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "2f2a9eb2",
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"32779"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B-Base')\n",
	"\n",
	"extra = [AddedToken('<\|speech_start\|>')]\n",
	"for i in range(32768):\n",
	" extra.append(AddedToken(f'<\|speech_{i}\|>'))\n",
	"for i in range(5):\n",
	" extra.append(AddedToken(f'<\|pitch_{i}\|>'))\n",
	" extra.append(AddedToken(f'<\|rate_{i}\|>'))\n",
	"tokenizer.add_tokens(extra)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "f8c2fe70",
	"metadata": {},
	"outputs": [],
	"source": [
	"class Model(Qwen3ForCausalLM):\n",
	" def __init__(self, config):\n",
	" super().__init__(config)\n",
	" \n",
	" def forward(self, input_ids, attention_mask = None, position_ids = None, labels = None, **kwargs):\n",
	" super_out = self.model.forward(\n",
	" input_ids = input_ids, \n",
	" position_ids = position_ids, \n",
	" attention_mask = attention_mask, \n",
	" output_hidden_states = True,\n",
	" )\n",
	" if labels is not None:\n",
	" embeddings = super_out.last_hidden_state\n",
	" auto_shift_loss = linear_cross_entropy(\n",
	" embeddings.to(torch.bfloat16), \n",
	" self.lm_head.weight.to(torch.bfloat16), \n",
	" labels, \n",
	" shift=True,\n",
	" impl=\"cce_kahan_full_c\"\n",
	" )\n",
	" return {'loss': auto_shift_loss}\n",
	" return super_out"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "10f428ef",
	"metadata": {},
	"outputs": [],
	"source": [
	"model = Model.from_pretrained(\n",
	" 'Qwen/Qwen3-0.6B-Base', attn_implementation = 'sdpa',\n",
	" torch_dtype = torch.bfloat16\n",
	").cuda()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "aed214ff",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Embedding(184448, 1024)"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"model.resize_token_embeddings(len(tokenizer), mean_resizing=False, pad_to_multiple_of=8)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "4f2ed186",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Wed Jul 23 14:33:43 2025 \n",
	"+-----------------------------------------------------------------------------------------+\n",
	"\| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 \|\n",
	"\|-----------------------------------------+------------------------+----------------------+\n",
	"\| GPU Name Persistence-M \| Bus-Id Disp.A \| Volatile Uncorr. ECC \|\n",
	"\| Fan Temp Perf Pwr:Usage/Cap \| Memory-Usage \| GPU-Util Compute M. \|\n",
	"\| \| \| MIG M. \|\n",
	"\|=========================================+========================+======================\|\n",
	"\| 0 NVIDIA GeForce RTX 3090 Ti Off \| 00000000:01:00.0 Off \| Off \|\n",
	"\| 30% 40C P0 53W / 400W \| 1824MiB / 24564MiB \| 27% Default \|\n",
	"\| \| \| N/A \|\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"\| 1 NVIDIA GeForce RTX 3090 Ti Off \| 00000000:08:00.0 Off \| Off \|\n",
	"\| 0% 49C P8 23W / 350W \| 18MiB / 24564MiB \| 0% Default \|\n",
	"\| \| \| N/A \|\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"\| 2 NVIDIA GeForce RTX 3090 Off \| 00000000:09:00.0 Off \| N/A \|\n",
	"\| 0% 47C P8 26W / 300W \| 4847MiB / 24576MiB \| 0% Default \|\n",
	"\| \| \| N/A \|\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	" \n",
	"+-----------------------------------------------------------------------------------------+\n",
	"\| Processes: \|\n",
	"\| GPU GI CI PID Type Process name GPU Memory \|\n",
	"\| ID ID Usage \|\n",
	"\|=========================================================================================\|\n",
	"\| 0 N/A N/A 1886 G /usr/lib/xorg/Xorg 9MiB \|\n",
	"\| 0 N/A N/A 4930 G /usr/bin/gnome-shell 6MiB \|\n",
	"\| 0 N/A N/A 3160824 C /usr/bin/python3.10 1784MiB \|\n",
	"\| 1 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB \|\n",
	"\| 2 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB \|\n",
	"\| 2 N/A N/A 3130035 C /usr/bin/python3.10 4824MiB \|\n",
	"+-----------------------------------------------------------------------------------------+\n"
	]
	}
	],
	"source": [
	"!nvidia-smi"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "bb0df87d",
	"metadata": {},
	"outputs": [],
	"source": [
	"def pad_attention_mask_3d(attention_mask, max_size = 4096, value = 0.0):\n",
	" maxlen = attention_mask.shape[-1]\n",
	" return F.pad(\n",
	" attention_mask,\n",
	" (0, max_size - maxlen, 0, max_size - maxlen),\n",
	" value = value,\n",
	" )\n",
	"\n",
	"def block_diagonal_concat_inverted(*masks, dtype=torch.bfloat16):\n",
	" total_size = sum(mask.size(0) for mask in masks)\n",
	" combined_mask = torch.zeros(total_size, total_size, dtype=dtype)\n",
	"\n",
	" current_pos = 0\n",
	"\n",
	" for mask in masks:\n",
	" size = mask.size(0)\n",
	" combined_mask[current_pos:current_pos + size, current_pos:current_pos + size] = mask\n",
	" current_pos += size\n",
	"\n",
	" min_value = torch.finfo(dtype).min if dtype.is_floating_point else torch.iinfo(dtype).min\n",
	" inverted_mask = torch.where(combined_mask == 1, torch.tensor(0, dtype=dtype), min_value)\n",
	" return inverted_mask.unsqueeze(0)\n",
	"\n",
	"class UInt32(Encoding):\n",
	" def encode(self, obj) -> bytes:\n",
	" return obj.tobytes()\n",
	"\n",
	" def decode(self, data: bytes):\n",
	" return np.frombuffer(data, np.uint32)\n",
	"\n",
	"_encodings['uint32'] = UInt32\n",
	"min_dtype = torch.finfo(torch.bfloat16).min\n",
	"sequence_length = 4096\n",
	"\n",
	"class DatasetFixed(torch.utils.data.Dataset):\n",
	" def __init__(self, local):\n",
	" self.dataset = LocalDataset(local=local)\n",
	"\n",
	" def __getitem__(self, idx):\n",
	" data = self.dataset[idx]\n",
	" data.pop('audio', None)\n",
	" data.pop('text', None)\n",
	" data.pop('token_type_ids', None)\n",
	"\n",
	" for k in data.keys():\n",
	" data[k] = data[k].astype(np.int64)\n",
	"\n",
	" data['input_ids'] = np.pad(data['input_ids'], \n",
	" (0, sequence_length - data['input_ids'].shape[0]), \n",
	" 'constant', constant_values=(0, tokenizer.pad_token_id))\n",
	" data['position_ids'] = np.pad(data['position_ids'], \n",
	" (0, sequence_length - data['position_ids'].shape[0]), \n",
	" 'constant', constant_values=(0, 32000))\n",
	" data['labels'] = data[\"input_ids\"].copy()\n",
	" data['labels'][data['labels'] == tokenizer.pad_token_id] = -100\n",
	"\n",
	" masking = data.pop('attention_mask')\n",
	" masks = []\n",
	" for m in masking:\n",
	" masks.append(torch.tril(torch.ones(m, m)))\n",
	" attention_mask = block_diagonal_concat_inverted(*masks)\n",
	" data['attention_mask'] = pad_attention_mask_3d(\n",
	" attention_mask, sequence_length, min_dtype)\n",
	" data['attention_mask'] = data['attention_mask']\n",
	"\n",
	" return data\n",
	"\n",
	" def __len__(self):\n",
	" return len(self.dataset)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "1a0e2e29",
	"metadata": {},
	"outputs": [],
	"source": [
	"dataset = DatasetFixed('packing-qwen3')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "eb493e24",
	"metadata": {},
	"outputs": [],
	"source": [
	"input_ids = default_data_collator([dataset[0], dataset[1]])\n",
	"for k in input_ids.keys():\n",
	" if isinstance(input_ids[k], torch.Tensor):\n",
	" input_ids[k] = input_ids[k].cuda()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "40ddfbd6",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'loss': tensor(14.9695, device='cuda:0', grad_fn=<LinearCrossEntropyFunctionBackward>)}"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"model(**input_ids)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "baecd701",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Wed Jul 23 14:33:44 2025 \n",
	"+-----------------------------------------------------------------------------------------+\n",
	"\| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 \|\n",
	"\|-----------------------------------------+------------------------+----------------------+\n",
	"\| GPU Name Persistence-M \| Bus-Id Disp.A \| Volatile Uncorr. ECC \|\n",
	"\| Fan Temp Perf Pwr:Usage/Cap \| Memory-Usage \| GPU-Util Compute M. \|\n",
	"\| \| \| MIG M. \|\n",
	"\|=========================================+========================+======================\|\n",
	"\| 0 NVIDIA GeForce RTX 3090 Ti Off \| 00000000:01:00.0 Off \| Off \|\n",
	"\| 30% 43C P0 201W / 400W \| 19792MiB / 24564MiB \| 17% Default \|\n",
	"\| \| \| N/A \|\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"\| 1 NVIDIA GeForce RTX 3090 Ti Off \| 00000000:08:00.0 Off \| Off \|\n",
	"\| 0% 49C P5 33W / 350W \| 18MiB / 24564MiB \| 0% Default \|\n",
	"\| \| \| N/A \|\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"\| 2 NVIDIA GeForce RTX 3090 Off \| 00000000:09:00.0 Off \| N/A \|\n",
	"\| 0% 47C P8 26W / 300W \| 4847MiB / 24576MiB \| 0% Default \|\n",
	"\| \| \| N/A \|\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	" \n",
	"+-----------------------------------------------------------------------------------------+\n",
	"\| Processes: \|\n",
	"\| GPU GI CI PID Type Process name GPU Memory \|\n",
	"\| ID ID Usage \|\n",
	"\|=========================================================================================\|\n",
	"\| 0 N/A N/A 1886 G /usr/lib/xorg/Xorg 9MiB \|\n",
	"\| 0 N/A N/A 4930 G /usr/bin/gnome-shell 6MiB \|\n",
	"\| 0 N/A N/A 3160824 C /usr/bin/python3.10 19752MiB \|\n",
	"\| 1 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB \|\n",
	"\| 2 N/A N/A 1886 G /usr/lib/xorg/Xorg 4MiB \|\n",
	"\| 2 N/A N/A 3130035 C /usr/bin/python3.10 4824MiB \|\n",
	"+-----------------------------------------------------------------------------------------+\n"
	]
	}
	],
	"source": [
	"!nvidia-smi"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "python3.10",
	"language": "python",
	"name": "python3.10"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.17"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}