This is a from-scratch, copy/paste runbook for getting:
- Model: nvidia/Qwen3.5-397B-A17B-NVFP4
- Runtime: SGLang
- GPUs: all 4 GPUs (TP=4)
- Python: 3.12 via uv
- PyTorch: nightly cu130
- Build threading cap: 16
from __future__ import annotations

from collections import defaultdict
from typing import Iterable

import torch
from torch import Tensor
from torch.optim import Optimizer
# NOTE(review): torch.optim._functional is a private torch API (leading
# underscore) — this import may break across torch versions; confirm pinning.
from torch.optim._functional import adamw as functional_adamw
# Build configuration for the Monte Carlo casino simulator.
cmake_minimum_required(VERSION 3.10)
project(MonteCarloCasino)

# Require C++17 with no compiler-extension fallback.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Set optimization flags for release builds.
# NOTE(review): -march=native / -mtune=native tie the binary to the build
# machine's CPU — confirm this is intended before distributing binaries.
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native")

# Find threads package
How to quantize a 70B model so it will fit on 2x 4090 GPUs:
I tried EXL2, AutoAWQ, and SqueezeLLM, and they all failed for different reasons (issues opened).
HQQ worked:
I rented a 4x GPU, 1TB RAM ($19/hr) instance on RunPod with a 1024GB container and 1024GB workspace disk.
I think you only need 2x GPUs with 80GB VRAM and 512GB+ system RAM, so I probably overpaid.
Note: you need to fill in the form to get access to the 70B Meta weights.
# Environment setup (run once):
#   conda create -n dbrx python=3.10 -y && conda activate dbrx
#   pip install torch transformers tiktoken flash_attn bitsandbytes
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Community mirror of DBRX-Instruct with quantization-compatible weights;
# named once so tokenizer and model stay in sync.
MODEL_ID = "SinclairSchneider/dbrx-instruct-quantization-fixed"

# trust_remote_code=True executes modeling code from the model repo —
# NOTE(review): only safe if you trust this Hugging Face repository.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# load_in_4bit (bitsandbytes) quantizes weights to 4-bit at load time;
# device_map="auto" shards the model across the available GPUs.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    load_in_4bit=True,
)

input_text = "What does it take to build a great LLM?"
# Collaboration between Claude-3 and GPT-4 to implement https://arxiv.org/pdf/2312.02116.pdf
# This is just the GMM decoder part of the model they propose (which is the new thing).
# This one was mainly generated by GPT-4.
# The AIs provided two implementations of the idea and revised each other's code.
# I tested that the unit tests pass but haven't tried it in a language model yet.
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F |
# Collaboration between Claude-3 and GPT-4 to implement https://arxiv.org/pdf/2312.02116.pdf
# This is just the GMM decoder part of the model they propose (which is the new thing).
# This one was mainly generated by Claude-3.
# The AIs provided two implementations of the idea and revised each other's code.
# I tested that the unit tests pass but haven't tried it in a language model yet.
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| import torch.nn.init as init | |
| import math | |
| #torch.autograd.set_detect_anomaly(True) | |
| class FeedForward(torch.nn.Module): | |
| def __init__(self, input_features, output_features): |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.init as init | |
| import torch.nn.functional as F | |
| # This layer is dropped into your pre-trained PyTorch model where nn.Linear is used | |
| class DoRALayer(nn.Module): | |
| def __init__(self, d_in, d_out, rank=4): | |
| super().__init__() |
#####################################################################
# Auto Z-Calibration
#####################################################################
[z_calibration]
probe_nozzle_x: 175.5
probe_nozzle_y: 257
# The X and Y coordinates (in mm) for clicking the nozzle on the
# Z endstop.
probe_switch_x: 169.3