@maxisoft
Created October 22, 2024 14:00
quick instructions on Win10 for reproducibility:
install visual studio 2022 pro with c++ dev/windows 10 sdk
install anaconda 2024.06
pytorch pytorch-cuda=12.4 (anaconda)
triton (v3.1.0-windows.post5) via wheels https://github.com/woct0rdho/triton-windows
install cuda 12.x compatible toolkit (12.6 used here)
in dir "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin\", symlink as admin "nvrtc-builtins64_124.dll" -> "nvrtc-builtins64_126.dll" (link -> target, matching the mklink command below)

Installation Steps

  1. Install Visual Studio 2022 Pro:

    • Download and install Visual Studio 2022 Pro from the official Microsoft website.
    • During installation, select the Desktop development with C++ workload and the Windows 10 SDK component.
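    • (Optional check, not part of the original gist; vswhere ships with the VS installer) Confirm the install location from a Command Prompt:
      "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath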
  2. Install Anaconda (2024.06 version):

    • Download and install the Anaconda individual installer (version 2024.06) from the official Anaconda website.
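    • (Optional check, not part of the original gist) Confirm the install from an Anaconda Prompt:
      conda --version && python --version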
  3. Install PyTorch with CUDA Support:

    • Open the Anaconda Prompt and run the following command:
      conda install pytorch pytorch-cuda=12.4 -c pytorch -c nvidia
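    • (Optional sanity check, not part of the original gist) Verify that PyTorch sees CUDA:
      python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"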
  4. Install Triton (v3.1.0-windows.post5):

    • Download the appropriate wheel file for Triton version 3.1.0-windows.post5 from the triton-windows repository.
    • Open the Anaconda Prompt, navigate to the download directory, and run:
      pip install <triton_wheel_filename.whl>
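    • For example, assuming Python 3.12 (the exact filename is an illustration and may differ; match the cp tag to your Python version):
      pip install triton-3.1.0-cp312-cp312-win_amd64.whl
    • (Optional check, not part of the original gist) Confirm the import works:
      python -c "import triton; print(triton.__version__)"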
  5. Install CUDA Toolkit (12.x compatible):

    • Download and install a CUDA 12.x Toolkit (this guide assumes 12.6, matching the paths in the next step) from the NVIDIA developer website.
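    • (Optional check, not part of the original gist; assumes the installer added the CUDA bin directory to PATH) Confirm the compiler tools are reachable:
      nvcc --version && ptxas --version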
  6. Create Symbolic Link (Admin Privileges Required):

    • Why: PyTorch and Triton builds targeting CUDA 12.4 load nvrtc-builtins64_124.dll by name, but the CUDA 12.6 Toolkit only ships nvrtc-builtins64_126.dll; the link lets the 12.4 name resolve to the 12.6 DLL.
    • Open a Command Prompt window as administrator.
    • Navigate to the C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin directory (adjust path if necessary).
    • Run the following command:
      mklink nvrtc-builtins64_124.dll nvrtc-builtins64_126.dll
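    • (Optional check, not part of the original gist; assumes the CUDA bin directory is on PATH) Confirm the new name resolves and loads:
      python -c "import ctypes; ctypes.CDLL('nvrtc-builtins64_124.dll')"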

Additional Notes:

  • This guide assumes you have an NVIDIA GPU with compute capability compatible with CUDA 12.x. Update commands accordingly if needed.
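  • (Optional check, not part of the original gist) Report the detected GPU and its compute capability:
    python -c "import torch; print(torch.cuda.get_device_name(0), torch.cuda.get_device_capability(0))"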
== Build Environment (pyMSVC dump) ============================
Machine architecture: x64
Build architecture: x64
== Python =====================================================
version: 3.12.4.final.0
architecture: x64
library: Python312.lib
libs: ['C:\\ProgramData\\anaconda3\\libs']
== Visual Studio Professional 2022 ============================
version: 17.8.34511.84
version (friendly): 2022
display version: 17.8.5
path: C:\Program Files\Microsoft Visual Studio\2022\Professional
== Visual C ===================================================
version: 14.38.33134
path: C:\Program Files\Microsoft Visual Studio\2022\Professional\VC
has cmake: True
has ninja: True
-- Tools ---------------------------------------------------
version: 14.38.33130
path: C:\Program Files\Microsoft Visual Studio\2022\Professional\VC\Tools\MSVC\14.38.33130
redist path: C:\Program Files\Microsoft Visual Studio\2022\Professional\VC\Redist\MSVC\14.38.33130\
-- F# ------------------------------------------------------
path: C:\Program Files\Microsoft Visual Studio\2022\Professional\Common7\IDE\CommonExtensions\Microsoft\FSharp\Tools
-- DLL -----------------------------------------------------
version: v143-14.38.33130.0
path: C:\Program Files\Microsoft Visual Studio\2022\Professional\VC\Redist\MSVC\14.38.33130\x64\Microsoft.VC143.CRT
== MSBuild ====================================================
version: 17.8.5.5502
path: C:\Program Files\Microsoft Visual Studio\2022\Professional\MSBuild\Current\Bin\MSBuild.exe
== Windows SDK ================================================
version: 10.0
sdk version: 10.0.22621.0
path: C:\Program Files (x86)\Windows Kits\10\
---------------
Nvidia CUDA 12.6: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin
---------------
pytorch 2.5.0 (cuda=12.4) installed via conda

Repro script:
import os
import subprocess
from functools import partial

import pyMSVC

# Set up the MSVC build environment so torch.compile can find the compiler
environment = pyMSVC.Environment()
print(environment)
os.environ.update(environment)  # adds MSVC's cl.exe to PATH

# check that the Visual Studio compiler is in the path
subprocess.check_call('cl', shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# add cuda binaries to path and check that ptxas is reachable
program_files = os.environ.get("ProgramFiles", r"C:\Program Files")
os.environ["PATH"] += os.pathsep + os.path.join(program_files, r"NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin")
subprocess.check_call('ptxas --version', shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# next is taken and adapted from https://github.com/pytorch-labs/attention-gym/tree/54755782172e89045bc908365f18ab75ae685708
import torch
from torch.nn.attention.flex_attention import (
    create_block_mask,
    flex_attention,
)

flex_attention = torch.compile(flex_attention, dynamic=False)

# Tanh Soft-Capping via a custom op lowered to the PTX tanh.approx.f32 instruction
@torch.library.custom_op("approx::tanh", mutates_args=())
def tanh_approx(inp: torch.Tensor) -> torch.Tensor:
    return torch.tanh(inp)

@tanh_approx.register_fake
def _(inp: torch.Tensor) -> torch.Tensor:
    return torch.tanh(inp)

# Some internal torch.compile details
from torch._inductor.lowering import make_pointwise, register_lowering
from torch._inductor.virtualized import ops

def tanh_approx_lowering(inp):
    fn = partial(ops.inline_asm_elementwise, asm="tanh.approx.f32 $0, $1;")
    return make_pointwise(fn)(inp)

register_lowering(torch.ops.approx.tanh)(tanh_approx_lowering)

class TanhApprox(torch.autograd.Function):
    generate_vmap_rule = True

    @staticmethod
    def forward(x):
        return torch.ops.approx.tanh(x)

    @staticmethod
    def setup_context(ctx, inputs, output):
        (x,) = inputs
        result = output
        ctx.save_for_backward(result)

    @staticmethod
    def backward(ctx, grad_output):
        (result,) = ctx.saved_tensors
        # d/dx tanh(x) = 1 - tanh(x)^2
        return grad_output * (1 - result * result)

tanh_approx = TanhApprox.apply

def tanh_soft_cap(score, b, h, q_idx, kv_idx):
    score = score / 2
    score = tanh_approx(score)
    return score * 2

# Causal mask relaxed to also allow each query to look up to 3 positions ahead
def causal_mask_but_look_4_back(b, h, q_idx, kv_idx):
    return (q_idx >= kv_idx) | (kv_idx - q_idx < 4)

NUM_HEADS = 2
MAX_SEQ_LEN = 32

def alibi_plus_tanh_score(score, b, h, q_idx, kv_idx):
    bias = q_idx - kv_idx
    scale = torch.exp2(-((h + 1) * 8.0 / NUM_HEADS))  # static per-head ALiBi slope
    return torch.where(torch.isfinite(score), tanh_approx(score + bias * scale) * 2, -float("inf"))

block_mask = create_block_mask(causal_mask_but_look_4_back, 1, NUM_HEADS, MAX_SEQ_LEN, MAX_SEQ_LEN, device="cuda")
query = torch.randn(1, NUM_HEADS, MAX_SEQ_LEN, 64, device="cuda", dtype=torch.float16)
query[:, :, -8:, :] = -float("inf")  # deliberately non-finite scores for the last 8 query positions
key = torch.randn(1, NUM_HEADS, MAX_SEQ_LEN, 64, device="cuda", dtype=torch.float16)
value = key  # reuse the keys as values (self-attention-style toy input)
output, lse = flex_attention(query, key, value, block_mask=block_mask,
                             score_mod=torch.compile(alibi_plus_tanh_score), return_lse=True)
print(lse)
print(output)
print(output.size())
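The final print should report torch.Size([1, 2, 32, 64]), i.e. (batch, NUM_HEADS, MAX_SEQ_LEN, head_dim). As an optional sanity check (not part of the original gist), you can compare the compiled approx::tanh against eager torch.tanh in the same session; tanh.approx.f32 is a fast hardware approximation, so expect a small nonzero error rather than bit-exact agreement:

# a minimal sketch, assuming the script above already registered approx::tanh
x = torch.randn(1024, device="cuda", dtype=torch.float32)
compiled_tanh = torch.compile(lambda t: torch.ops.approx.tanh(t))
max_err = (compiled_tanh(x) - torch.tanh(x)).abs().max().item()
print(f"max abs error vs torch.tanh: {max_err:.3e}")  # small but nonzero is expected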