huseinzol05 · July 23, 2025 06:35
diff --git a/test-flash-multipacking.ipynb b/test-flash-multipacking.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fa9c5a2e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-07-23 14:23:00,522] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/bin/ld: cannot find -laio\n",
      "collect2: error: ld returned 1 exit status\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'\n",
      "collect2: error: ld returned 1 exit status\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, AddedToken, Qwen3ForCausalLM\n",
    "from streaming import LocalDataset\n",
    "from streaming.base.format.mds.encodings import Encoding, _encodings\n",
    "from cut_cross_entropy import linear_cross_entropy\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d59049af",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32779"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B-Base')\n",
    "\n",
    "# Vocabulary extension: one start marker, 32768 speech codes, then\n",
    "# pitch/rate markers interleaved per level (pitch_0, rate_0, pitch_1, ...).\n",
    "speech_tokens = [AddedToken(f'<|speech_{code}|>') for code in range(32768)]\n",
    "prosody_tokens = []\n",
    "for level in range(5):\n",
    "    prosody_tokens += [AddedToken(f'<|pitch_{level}|>'), AddedToken(f'<|rate_{level}|>')]\n",
    "extra = [AddedToken('<|speech_start|>'), *speech_tokens, *prosody_tokens]\n",
    "# Last expression: the cell output is add_tokens' return value (32779 in the saved run).\n",
    "tokenizer.add_tokens(extra)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e648a52b",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model(Qwen3ForCausalLM):\n",
    "    \"\"\"Qwen3 causal LM that computes its training loss with `linear_cross_entropy`.\n",
    "\n",
    "    When `labels` are given, the loss is computed from the backbone's last hidden\n",
    "    state and the `lm_head` weight via `linear_cross_entropy`, without calling\n",
    "    `lm_head` itself.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, config):\n",
    "        super().__init__(config)\n",
    "\n",
    "    def forward(self, input_ids, attention_mask = None, position_ids = None, labels = None, **kwargs):\n",
    "        \"\"\"Run the backbone and, if `labels` is given, return the loss.\n",
    "\n",
    "        Args:\n",
    "            input_ids: token ids passed to the backbone.\n",
    "            attention_mask: optional mask, passed through unchanged.\n",
    "            position_ids: optional position ids, passed through unchanged.\n",
    "            labels: optional targets, passed unshifted because the loss call uses `shift=True`.\n",
    "            **kwargs: extra backbone arguments, e.g. the `cu_seq_lens_q`,\n",
    "                `cu_seq_lens_k`, `max_length_q` and `max_length_k` entries\n",
    "                that a packing collator produces.\n",
    "\n",
    "        Returns:\n",
    "            `{'loss': tensor}` when `labels` is given, otherwise the backbone output.\n",
    "        \"\"\"\n",
    "        # Forward the extra keyword arguments instead of dropping them, so the\n",
    "        # packing metadata reaches the attention layers. `output_hidden_states`\n",
    "        # stays forced to True, exactly as before.\n",
    "        backbone_kwargs = {**kwargs, 'output_hidden_states': True}\n",
    "        # Call the module (not `.forward`) so that registered module hooks run.\n",
    "        super_out = self.model(\n",
    "            input_ids = input_ids,\n",
    "            position_ids = position_ids,\n",
    "            attention_mask = attention_mask,\n",
    "            **backbone_kwargs,\n",
    "        )\n",
    "        if labels is not None:\n",
    "            embeddings = super_out.last_hidden_state\n",
    "            auto_shift_loss = linear_cross_entropy(\n",
    "                embeddings.to(torch.bfloat16),\n",
    "                self.lm_head.weight.to(torch.bfloat16),\n",
    "                labels,\n",
    "                shift=True,\n",
    "                impl=\"cce_kahan_full_c\"\n",
    "            )\n",
    "            return {'loss': auto_shift_loss}\n",
    "        return super_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2c2f678e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n"
     ]
    }
   ],
   "source": [
    "# Load the checkpoint in bf16 with the 'flash_attention_2' attention\n",
    "# implementation, then move the module to the GPU.\n",
    "model = Model.from_pretrained(\n",
    "    'Qwen/Qwen3-0.6B-Base',\n",
    "    attn_implementation = 'flash_attention_2',\n",
    "    torch_dtype = torch.bfloat16,\n",
    ")\n",
    "model = model.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bbaeec27",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Embedding(184448, 1024)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Resize the embedding matrix to the tokenizer's current length;\n",
    "# the cell output is the resized embedding module.\n",
    "model.resize_token_embeddings(len(tokenizer), mean_resizing=False, pad_to_multiple_of=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0520294d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Jul 23 14:23:06 2025       \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |\n",
      "|-----------------------------------------+------------------------+----------------------+\n",
      "| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                        |               MIG M. |\n",
      "|=========================================+========================+======================|\n",
      "|   0  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:01:00.0 Off |                  Off |\n",
      "| 30%   41C    P0             37W /  400W |    1824MiB /  24564MiB |     19%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "|   1  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:08:00.0 Off |                  Off |\n",
      "|  0%   49C    P8             22W /  350W |      18MiB /  24564MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "|   2  NVIDIA GeForce RTX 3090        Off |   00000000:09:00.0 Off |                  N/A |\n",
      "|  0%   36C    P8             25W /  300W |    4847MiB /  24576MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "                                                                                         \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                              |\n",
      "|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |\n",
      "|        ID   ID                                                               Usage      |\n",
      "|=========================================================================================|\n",
      "|    0   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              9MiB |\n",
      "|    0   N/A  N/A      4930      G   /usr/bin/gnome-shell                            6MiB |\n",
      "|    0   N/A  N/A   3157815      C   /usr/bin/python3.10                          1784MiB |\n",
      "|    1   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
      "|    2   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
      "|    2   N/A  N/A   3130035      C   /usr/bin/python3.10                          4824MiB |\n",
      "+-----------------------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d840eb0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "class UInt32(Encoding):\n",
    "    \"\"\"Codec that writes an array's raw bytes and reads them back as uint32.\"\"\"\n",
    "\n",
    "    def encode(self, obj) -> bytes:\n",
    "        \"\"\"Return the raw buffer of `obj` (presumably a NumPy array); no dtype conversion happens here.\"\"\"\n",
    "        raw = obj.tobytes()\n",
    "        return raw\n",
    "\n",
    "    def decode(self, data: bytes):\n",
    "        \"\"\"Interpret `data` as a flat array of uint32 values.\"\"\"\n",
    "        return np.frombuffer(data, dtype=np.uint32)\n",
    "\n",
    "# Register the codec under the 'uint32' name in the imported encodings table.\n",
    "_encodings['uint32'] = UInt32\n",
    "\n",
    "class DatasetFixed(torch.utils.data.Dataset):\n",
    "    \"\"\"Map-style dataset over a `LocalDataset` that yields int64 NumPy arrays.\n",
    "\n",
    "    Example item (values taken from the saved run of this notebook):\n",
    "        {'attention_mask': array([ 471,  775,  661, 1255,  568]),\n",
    "         'input_ids': array([151644,     71,    810, ..., 160361, 182069, 151645]),\n",
    "         'position_ids': array([  0,   1,   2, ..., 565, 566, 567])}\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, local):\n",
    "        \"\"\"Wrap `LocalDataset(local=local)`.\"\"\"\n",
    "        self.dataset = LocalDataset(local=local)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        \"\"\"Return sample `idx` without its unused columns, every value cast to int64.\"\"\"\n",
    "        data = self.dataset[idx]\n",
    "        for unused in ('audio', 'text', 'token_type_ids'):\n",
    "            data.pop(unused, None)\n",
    "\n",
    "        # Snapshot the keys first, then overwrite each value in place.\n",
    "        for name in list(data):\n",
    "            data[name] = data[name].astype(np.int64)\n",
    "\n",
    "        return data\n",
    "\n",
    "    def __len__(self):\n",
    "        \"\"\"Number of samples in the wrapped dataset.\"\"\"\n",
    "        return len(self.dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "62a5f926",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'packing-qwen3' is a relative path, resolved against the kernel's working directory.\n",
    "dataset = DatasetFixed('packing-qwen3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ec3522db",
   "metadata": {},
   "outputs": [],
   "source": [
    "def collator(batch):\n",
    "    \"\"\"Flatten packed samples into a single batch-of-one row plus segment offsets.\n",
    "\n",
    "    Each sample is a dict of arrays `input_ids`, `position_ids` and\n",
    "    `attention_mask` (expected to be 1-D). Here `attention_mask` holds the\n",
    "    lengths of the segments packed into the sample, not a 0/1 mask.\n",
    "    `None` samples are skipped.\n",
    "\n",
    "    Returns a dict with:\n",
    "        input_ids, position_ids, labels: tensors with a leading batch dim of 1.\n",
    "        cu_seq_lens_q, cu_seq_lens_k: int32 cumulative segment offsets, starting at 0.\n",
    "        max_length_q, max_length_k: length of the longest segment, as a Python int.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if no usable sample is left after dropping `None` entries.\n",
    "    \"\"\"\n",
    "    batch = [b for b in batch if b is not None]\n",
    "    if not batch:\n",
    "        raise ValueError('collator received no usable samples')\n",
    "\n",
    "    input_ids = np.concatenate([b['input_ids'] for b in batch])\n",
    "    position_ids = np.concatenate([b['position_ids'] for b in batch])\n",
    "    query_lens = np.concatenate([b['attention_mask'] for b in batch])\n",
    "    cumsum = [0] + np.cumsum(query_lens).tolist()\n",
    "\n",
    "    # `astype` returns a fresh int64 copy, so -100 fits and input_ids stays untouched.\n",
    "    labels = input_ids.astype(np.int64)\n",
    "    # The loss shifts labels by one position, so the first token of a segment\n",
    "    # would be predicted from the last token of the previous segment. Mark it\n",
    "    # with -100 (the usual ignore index) so segments do not leak into each other.\n",
    "    segment_starts = np.asarray(cumsum[:-1], dtype=np.int64)\n",
    "    labels[segment_starts[segment_starts < len(labels)]] = -100\n",
    "\n",
    "    cu_seq_lens_q = torch.tensor(cumsum, dtype=torch.int32)\n",
    "    cu_seq_lens_k = cu_seq_lens_q.clone()\n",
    "    # Plain Python int rather than a NumPy scalar.\n",
    "    max_seqlen_q = int(np.max(query_lens))\n",
    "    return {\n",
    "        'input_ids': torch.tensor(input_ids)[None],\n",
    "        'position_ids': torch.tensor(position_ids)[None],\n",
    "        'labels': torch.tensor(labels)[None],\n",
    "        'cu_seq_lens_q': cu_seq_lens_q,\n",
    "        'cu_seq_lens_k': cu_seq_lens_k,\n",
    "        'max_length_q': max_seqlen_q,\n",
    "        'max_length_k': max_seqlen_q\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1743bcc1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'attention_mask': array([ 471,  775,  661, 1255,  568]),\n",
       " 'input_ids': array([151644,     71,    810, ..., 160361, 182069, 151645]),\n",
       " 'position_ids': array([  0,   1,   2, ..., 565, 566, 567])}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "59d244d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: despite its name, `input_ids` is the whole collated batch dict.\n",
    "input_ids = collator([dataset[idx] for idx in range(3)])\n",
    "for key, value in input_ids.items():\n",
    "    # Only tensors are moved; non-tensor entries (max_length_*) stay as they are.\n",
    "    if isinstance(value, torch.Tensor):\n",
    "        input_ids[key] = value.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "aa27142f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': tensor([[151644,     71,    810,  ..., 155599, 158241, 151645]],\n",
       "        device='cuda:0'),\n",
       " 'position_ids': tensor([[  0,   1,   2,  ..., 821, 822, 823]], device='cuda:0'),\n",
       " 'labels': tensor([[151644,     71,    810,  ..., 155599, 158241, 151645]],\n",
       "        device='cuda:0'),\n",
       " 'cu_seq_lens_q': tensor([    0,   471,  1246,  1907,  3162,  3730,  4983,  5917,  6944,  7853,\n",
       "          8852,  9561, 10385], device='cuda:0', dtype=torch.int32),\n",
       " 'cu_seq_lens_k': tensor([    0,   471,  1246,  1907,  3162,  3730,  4983,  5917,  6944,  7853,\n",
       "          8852,  9561, 10385], device='cuda:0', dtype=torch.int32),\n",
       " 'max_length_q': 1255,\n",
       " 'max_length_k': 1255}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "input_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "350f5e51",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'loss': tensor(15.1131, device='cuda:0', grad_fn=<LinearCrossEntropyFunctionBackward>)}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model(**input_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "b7308f51",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Jul 23 14:24:33 2025       \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |\n",
      "|-----------------------------------------+------------------------+----------------------+\n",
      "| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                        |               MIG M. |\n",
      "|=========================================+========================+======================|\n",
      "|   0  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:01:00.0 Off |                  Off |\n",
      "| 30%   38C    P8             22W /  400W |   22814MiB /  24564MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "|   1  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:08:00.0 Off |                  Off |\n",
      "|  0%   49C    P8             23W /  350W |      18MiB /  24564MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "|   2  NVIDIA GeForce RTX 3090        Off |   00000000:09:00.0 Off |                  N/A |\n",
      "|  0%   38C    P8             24W /  300W |    4847MiB /  24576MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "                                                                                         \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                              |\n",
      "|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |\n",
      "|        ID   ID                                                               Usage      |\n",
      "|=========================================================================================|\n",
      "|    0   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              9MiB |\n",
      "|    0   N/A  N/A      4930      G   /usr/bin/gnome-shell                            6MiB |\n",
      "|    0   N/A  N/A   3157815      C   /usr/bin/python3.10                         22774MiB |\n",
      "|    1   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
      "|    2   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
      "|    2   N/A  N/A   3130035      C   /usr/bin/python3.10                          4824MiB |\n",
      "+-----------------------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "!nvidia-smi"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python3.10",
   "language": "python",
   "name": "python3.10"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
diff --git a/test-sdpa-multipacking.ipynb b/test-sdpa-multipacking.ipynb
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "fa9c5a2e",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[2025-07-23 14:23:00,522] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/usr/bin/ld: cannot find -laio\n",
	"collect2: error: ld returned 1 exit status\n",
	"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'\n",
	"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'\n",
	"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'\n",
	"/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'\n",
	"collect2: error: ld returned 1 exit status\n"
	]
	}
	],
	"source": [
	"import torch\n",
	"from transformers import AutoTokenizer, AutoModelForCausalLM, AddedToken, Qwen3ForCausalLM\n",
	"from streaming import LocalDataset\n",
	"from streaming.base.format.mds.encodings import Encoding, _encodings\n",
	"from cut_cross_entropy import linear_cross_entropy\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "d59049af",
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"32779"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B-Base')\n",
	"\n",
	"# Vocabulary extension: one start marker, 32768 speech codes, then\n",
	"# pitch/rate markers interleaved per level (pitch_0, rate_0, pitch_1, ...).\n",
	"extra = [AddedToken('<|speech_start|>')]\n",
	"for i in range(32768):\n",
	"    extra.append(AddedToken(f'<|speech_{i}|>'))\n",
	"for i in range(5):\n",
	"    extra.append(AddedToken(f'<|pitch_{i}|>'))\n",
	"    extra.append(AddedToken(f'<|rate_{i}|>'))\n",
	"# Last expression: the cell output is add_tokens' return value (32779 in the saved run).\n",
	"tokenizer.add_tokens(extra)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "e648a52b",
	"metadata": {},
	"outputs": [],
	"source": [
	"class Model(Qwen3ForCausalLM):\n",
	"    \"\"\"Qwen3 causal LM that computes its training loss with `linear_cross_entropy`.\n",
	"\n",
	"    When `labels` are given, the loss is computed from the backbone's last hidden\n",
	"    state and the `lm_head` weight via `linear_cross_entropy`, without calling\n",
	"    `lm_head` itself.\n",
	"    \"\"\"\n",
	"\n",
	"    def __init__(self, config):\n",
	"        super().__init__(config)\n",
	"\n",
	"    def forward(self, input_ids, attention_mask = None, position_ids = None, labels = None, **kwargs):\n",
	"        \"\"\"Run the backbone and, if `labels` is given, return the loss.\n",
	"\n",
	"        Args:\n",
	"            input_ids: token ids passed to the backbone.\n",
	"            attention_mask: optional mask, passed through unchanged.\n",
	"            position_ids: optional position ids, passed through unchanged.\n",
	"            labels: optional targets, passed unshifted because the loss call uses `shift=True`.\n",
	"            **kwargs: extra backbone arguments, e.g. the `cu_seq_lens_q`,\n",
	"                `cu_seq_lens_k`, `max_length_q` and `max_length_k` entries\n",
	"                that a packing collator produces.\n",
	"\n",
	"        Returns:\n",
	"            `{'loss': tensor}` when `labels` is given, otherwise the backbone output.\n",
	"        \"\"\"\n",
	"        # Forward the extra keyword arguments instead of dropping them, so the\n",
	"        # packing metadata reaches the attention layers. `output_hidden_states`\n",
	"        # stays forced to True, exactly as before.\n",
	"        backbone_kwargs = {**kwargs, 'output_hidden_states': True}\n",
	"        # Call the module (not `.forward`) so that registered module hooks run.\n",
	"        super_out = self.model(\n",
	"            input_ids = input_ids,\n",
	"            position_ids = position_ids,\n",
	"            attention_mask = attention_mask,\n",
	"            **backbone_kwargs,\n",
	"        )\n",
	"        if labels is not None:\n",
	"            embeddings = super_out.last_hidden_state\n",
	"            auto_shift_loss = linear_cross_entropy(\n",
	"                embeddings.to(torch.bfloat16),\n",
	"                self.lm_head.weight.to(torch.bfloat16),\n",
	"                labels,\n",
	"                shift=True,\n",
	"                impl=\"cce_kahan_full_c\"\n",
	"            )\n",
	"            return {'loss': auto_shift_loss}\n",
	"        return super_out"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "2c2f678e",
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n"
	]
	}
	],
	"source": [
	"# Load the checkpoint in bf16 with the 'flash_attention_2' attention\n",
	"# implementation, then move the module to the GPU.\n",
	"model = Model.from_pretrained(\n",
	"    'Qwen/Qwen3-0.6B-Base',\n",
	"    attn_implementation = 'flash_attention_2',\n",
	"    torch_dtype = torch.bfloat16,\n",
	")\n",
	"model = model.cuda()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "bbaeec27",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Embedding(184448, 1024)"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Resize the embedding matrix to the tokenizer's current length;\n",
	"# the cell output is the resized embedding module.\n",
	"model.resize_token_embeddings(len(tokenizer), mean_resizing=False, pad_to_multiple_of=8)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "0520294d",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Wed Jul 23 14:23:06 2025       \n",
	"+-----------------------------------------------------------------------------------------+\n",
	"| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |\n",
	"|-----------------------------------------+------------------------+----------------------+\n",
	"| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n",
	"| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n",
	"|                                         |                        |               MIG M. |\n",
	"|=========================================+========================+======================|\n",
	"|   0  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:01:00.0 Off |                  Off |\n",
	"| 30%   41C    P0             37W /  400W |    1824MiB /  24564MiB |     19%      Default |\n",
	"|                                         |                        |                  N/A |\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"|   1  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:08:00.0 Off |                  Off |\n",
	"|  0%   49C    P8             22W /  350W |      18MiB /  24564MiB |      0%      Default |\n",
	"|                                         |                        |                  N/A |\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"|   2  NVIDIA GeForce RTX 3090        Off |   00000000:09:00.0 Off |                  N/A |\n",
	"|  0%   36C    P8             25W /  300W |    4847MiB /  24576MiB |      0%      Default |\n",
	"|                                         |                        |                  N/A |\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"                                                                                         \n",
	"+-----------------------------------------------------------------------------------------+\n",
	"| Processes:                                                                              |\n",
	"|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |\n",
	"|        ID   ID                                                               Usage      |\n",
	"|=========================================================================================|\n",
	"|    0   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              9MiB |\n",
	"|    0   N/A  N/A      4930      G   /usr/bin/gnome-shell                            6MiB |\n",
	"|    0   N/A  N/A   3157815      C   /usr/bin/python3.10                          1784MiB |\n",
	"|    1   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
	"|    2   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
	"|    2   N/A  N/A   3130035      C   /usr/bin/python3.10                          4824MiB |\n",
	"+-----------------------------------------------------------------------------------------+\n"
	]
	}
	],
	"source": [
	"!nvidia-smi"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "d840eb0a",
	"metadata": {},
	"outputs": [],
	"source": [
	"class UInt32(Encoding):\n",
	"    \"\"\"Codec that writes an array's raw bytes and reads them back as uint32.\"\"\"\n",
	"\n",
	"    def encode(self, obj) -> bytes:\n",
	"        \"\"\"Return the raw buffer of `obj` (presumably a NumPy array); no dtype conversion happens here.\"\"\"\n",
	"        return obj.tobytes()\n",
	"\n",
	"    def decode(self, data: bytes):\n",
	"        \"\"\"Interpret `data` as a flat array of uint32 values.\"\"\"\n",
	"        return np.frombuffer(data, np.uint32)\n",
	"\n",
	"# Register the codec under the 'uint32' name in the imported encodings table.\n",
	"_encodings['uint32'] = UInt32\n",
	"\n",
	"class DatasetFixed(torch.utils.data.Dataset):\n",
	"    \"\"\"Map-style dataset over a `LocalDataset` that yields int64 NumPy arrays.\n",
	"\n",
	"    Example item (values taken from the saved run of this notebook):\n",
	"        {'attention_mask': array([ 471,  775,  661, 1255,  568]),\n",
	"         'input_ids': array([151644,     71,    810, ..., 160361, 182069, 151645]),\n",
	"         'position_ids': array([  0,   1,   2, ..., 565, 566, 567])}\n",
	"    \"\"\"\n",
	"\n",
	"    def __init__(self, local):\n",
	"        \"\"\"Wrap `LocalDataset(local=local)`.\"\"\"\n",
	"        self.dataset = LocalDataset(local=local)\n",
	"\n",
	"    def __getitem__(self, idx):\n",
	"        \"\"\"Return sample `idx` without its unused columns, every value cast to int64.\"\"\"\n",
	"        data = self.dataset[idx]\n",
	"        data.pop('audio', None)\n",
	"        data.pop('text', None)\n",
	"        data.pop('token_type_ids', None)\n",
	"\n",
	"        # Snapshot the keys first, then overwrite each value in place.\n",
	"        for k in list(data.keys()):\n",
	"            data[k] = data[k].astype(np.int64)\n",
	"\n",
	"        return data\n",
	"\n",
	"    def __len__(self):\n",
	"        \"\"\"Number of samples in the wrapped dataset.\"\"\"\n",
	"        return len(self.dataset)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "62a5f926",
	"metadata": {},
	"outputs": [],
	"source": [
	"# 'packing-qwen3' is a relative path, resolved against the kernel's working directory.\n",
	"dataset = DatasetFixed('packing-qwen3')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "ec3522db",
	"metadata": {},
	"outputs": [],
	"source": [
	"def collator(batch):\n",
	"    \"\"\"Flatten packed samples into a single batch-of-one row plus segment offsets.\n",
	"\n",
	"    Each sample is a dict of arrays `input_ids`, `position_ids` and\n",
	"    `attention_mask` (expected to be 1-D). Here `attention_mask` holds the\n",
	"    lengths of the segments packed into the sample, not a 0/1 mask.\n",
	"    `None` samples are skipped.\n",
	"\n",
	"    Returns a dict with:\n",
	"        input_ids, position_ids, labels: tensors with a leading batch dim of 1.\n",
	"        cu_seq_lens_q, cu_seq_lens_k: int32 cumulative segment offsets, starting at 0.\n",
	"        max_length_q, max_length_k: length of the longest segment, as a Python int.\n",
	"\n",
	"    Raises:\n",
	"        ValueError: if no usable sample is left after dropping `None` entries.\n",
	"    \"\"\"\n",
	"    batch = [b for b in batch if b is not None]\n",
	"    if not batch:\n",
	"        raise ValueError('collator received no usable samples')\n",
	"\n",
	"    input_ids = np.concatenate([b['input_ids'] for b in batch])\n",
	"    position_ids = np.concatenate([b['position_ids'] for b in batch])\n",
	"    query_lens = np.concatenate([b['attention_mask'] for b in batch])\n",
	"    cumsum = [0] + np.cumsum(query_lens).tolist()\n",
	"\n",
	"    # `astype` returns a fresh int64 copy, so -100 fits and input_ids stays untouched.\n",
	"    labels = input_ids.astype(np.int64)\n",
	"    # The loss shifts labels by one position, so the first token of a segment\n",
	"    # would be predicted from the last token of the previous segment. Mark it\n",
	"    # with -100 (the usual ignore index) so segments do not leak into each other.\n",
	"    segment_starts = np.asarray(cumsum[:-1], dtype=np.int64)\n",
	"    labels[segment_starts[segment_starts < len(labels)]] = -100\n",
	"\n",
	"    cu_seq_lens_q = torch.tensor(cumsum, dtype=torch.int32)\n",
	"    cu_seq_lens_k = cu_seq_lens_q.clone()\n",
	"    # Plain Python int rather than a NumPy scalar.\n",
	"    max_seqlen_q = int(np.max(query_lens))\n",
	"    return {\n",
	"        'input_ids': torch.tensor(input_ids)[None],\n",
	"        'position_ids': torch.tensor(position_ids)[None],\n",
	"        'labels': torch.tensor(labels)[None],\n",
	"        'cu_seq_lens_q': cu_seq_lens_q,\n",
	"        'cu_seq_lens_k': cu_seq_lens_k,\n",
	"        'max_length_q': max_seqlen_q,\n",
	"        'max_length_k': max_seqlen_q\n",
	"    }"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "1743bcc1",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'attention_mask': array([ 471, 775, 661, 1255, 568]),\n",
	" 'input_ids': array([151644, 71, 810, ..., 160361, 182069, 151645]),\n",
	" 'position_ids': array([ 0, 1, 2, ..., 565, 566, 567])}"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"dataset[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "59d244d8",
	"metadata": {},
	"outputs": [],
	"source": [
	"# NOTE: despite its name, `input_ids` is the whole collated batch dict.\n",
	"input_ids = collator([dataset[0], dataset[1], dataset[2]])\n",
	"for k in input_ids.keys():\n",
	"    # Only tensors are moved; non-tensor entries (max_length_*) stay as they are.\n",
	"    if isinstance(input_ids[k], torch.Tensor):\n",
	"        input_ids[k] = input_ids[k].cuda()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"id": "aa27142f",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'input_ids': tensor([[151644, 71, 810, ..., 155599, 158241, 151645]],\n",
	" device='cuda:0'),\n",
	" 'position_ids': tensor([[ 0, 1, 2, ..., 821, 822, 823]], device='cuda:0'),\n",
	" 'labels': tensor([[151644, 71, 810, ..., 155599, 158241, 151645]],\n",
	" device='cuda:0'),\n",
	" 'cu_seq_lens_q': tensor([ 0, 471, 1246, 1907, 3162, 3730, 4983, 5917, 6944, 7853,\n",
	" 8852, 9561, 10385], device='cuda:0', dtype=torch.int32),\n",
	" 'cu_seq_lens_k': tensor([ 0, 471, 1246, 1907, 3162, 3730, 4983, 5917, 6944, 7853,\n",
	" 8852, 9561, 10385], device='cuda:0', dtype=torch.int32),\n",
	" 'max_length_q': 1255,\n",
	" 'max_length_k': 1255}"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"input_ids"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"id": "350f5e51",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'loss': tensor(15.1131, device='cuda:0', grad_fn=<LinearCrossEntropyFunctionBackward>)}"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"model(**input_ids)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"id": "b7308f51",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Wed Jul 23 14:24:33 2025       \n",
	"+-----------------------------------------------------------------------------------------+\n",
	"| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |\n",
	"|-----------------------------------------+------------------------+----------------------+\n",
	"| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n",
	"| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n",
	"|                                         |                        |               MIG M. |\n",
	"|=========================================+========================+======================|\n",
	"|   0  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:01:00.0 Off |                  Off |\n",
	"| 30%   38C    P8             22W /  400W |   22814MiB /  24564MiB |      0%      Default |\n",
	"|                                         |                        |                  N/A |\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"|   1  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:08:00.0 Off |                  Off |\n",
	"|  0%   49C    P8             23W /  350W |      18MiB /  24564MiB |      0%      Default |\n",
	"|                                         |                        |                  N/A |\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"|   2  NVIDIA GeForce RTX 3090        Off |   00000000:09:00.0 Off |                  N/A |\n",
	"|  0%   38C    P8             24W /  300W |    4847MiB /  24576MiB |      0%      Default |\n",
	"|                                         |                        |                  N/A |\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"                                                                                         \n",
	"+-----------------------------------------------------------------------------------------+\n",
	"| Processes:                                                                              |\n",
	"|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |\n",
	"|        ID   ID                                                               Usage      |\n",
	"|=========================================================================================|\n",
	"|    0   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              9MiB |\n",
	"|    0   N/A  N/A      4930      G   /usr/bin/gnome-shell                            6MiB |\n",
	"|    0   N/A  N/A   3157815      C   /usr/bin/python3.10                         22774MiB |\n",
	"|    1   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
	"|    2   N/A  N/A      1886      G   /usr/lib/xorg/Xorg                              4MiB |\n",
	"|    2   N/A  N/A   3130035      C   /usr/bin/python3.10                          4824MiB |\n",
	"+-----------------------------------------------------------------------------------------+\n"
	]
	}
	],
	"source": [
	"!nvidia-smi"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "python3.10",
	"language": "python",
	"name": "python3.10"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.17"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}