Add torch.cuda.cudart().cudaProfilerStart()
and torch.cuda.cudart().cudaProfilerStop()
where profiling should start and stop.
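A minimal sketch of the placement, assuming a toy model and illustrative step counts; emit_nvtx() additionally labels each op with an NVTX range so kernels are attributable in the timeline:

import torch

model = torch.nn.Linear(4096, 4096).cuda()
x = torch.randn(64, 4096, device="cuda")

with torch.autograd.profiler.emit_nvtx():   # label ops with NVTX ranges
    for step in range(20):
        if step == 5:                       # skip warmup iterations
            torch.cuda.cudart().cudaProfilerStart()
        model(x).sum().backward()
        if step == 10:                      # a few steady-state steps suffice
            torch.cuda.cudart().cudaProfilerStop()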
Launch the profiler with:

CUDA_VISIBLE_DEVICES=0,1,2,3 \
nsys profile \
-w true \
-t cuda,nvtx,osrt,cudnn,cublas \
--capture-range=cudaProfilerApi \
--capture-range-end=stop \
-o <report_name> \
python <script>.py

(--capture-range=cudaProfilerApi is what makes nsys honor the cudaProfilerStart/Stop calls above; substitute your own report name and script.)
import functools
import torch
import math

def taylor_seer_approximation(WARMUP_STEPS=1, SKIP_INTERVAL_STEPS=1, compute_step_map=None, n_derivatives=2):
    """
    A decorator that approximates the forward pass of an nn.Module to reduce computation.

    Args:
        WARMUP_STEPS: number of steps to compute the actual forward pass before starting approximation
        SKIP_INTERVAL_STEPS: after warmup, recompute the actual forward pass every this many steps and approximate the rest
        compute_step_map: optional per-step override; truthy entries force a real forward pass at that step
        n_derivatives: order of the Taylor expansion used for the approximation
    """
"""
test performance and correctness of ring attention vs. single gpu attention
torchrun --nproc-per-node 4 ring_attn.py
using 4 H100s I get:
Rank 0 single gpu attention: 261.78 ms
Rank 0 ring attention: 73.34 ms
"""
import os
import math
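
# A minimal sketch of the ring attention pass being benchmarked (assumptions:
# torch.distributed is already initialized with NCCL via torchrun, each rank
# holds its sequence shard q, k, v of shape [B, H, S_local, D], and there is
# no causal mask; accumulation stays in the input dtype for brevity):
import torch
import torch.distributed as dist

def ring_attention(q, k, v):
    world, rank = dist.get_world_size(), dist.get_rank()
    scale = q.shape[-1] ** -0.5
    # running max / denominator / numerator of an online (log-sum-exp)
    # softmax, so partial results from each K/V block combine exactly
    m = torch.full(q.shape[:-1], float("-inf"), device=q.device, dtype=q.dtype)
    l = torch.zeros_like(m)
    acc = torch.zeros_like(q)

    for hop in range(world):
        s = q @ k.transpose(-2, -1) * scale            # [B, H, S_q, S_kv]
        m_blk = s.amax(dim=-1)
        p = torch.exp(s - m_blk.unsqueeze(-1))
        m_new = torch.maximum(m, m_blk)
        keep, add = torch.exp(m - m_new), torch.exp(m_blk - m_new)
        l = l * keep + p.sum(dim=-1) * add
        acc = acc * keep.unsqueeze(-1) + (p @ v) * add.unsqueeze(-1)
        m = m_new
        if hop < world - 1:
            # rotate K/V one hop around the ring; Q never moves
            k_next, v_next = torch.empty_like(k), torch.empty_like(v)
            ops = [dist.P2POp(dist.isend, k, (rank + 1) % world),
                   dist.P2POp(dist.isend, v, (rank + 1) % world),
                   dist.P2POp(dist.irecv, k_next, (rank - 1) % world),
                   dist.P2POp(dist.irecv, v_next, (rank - 1) % world)]
            for req in dist.batch_isend_irecv(ops):
                req.wait()
            k, v = k_next, v_next

    return acc / l.unsqueeze(-1)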
""" | |
test performance and correctness of ulysses parallel attention vs single gpu attention | |
torchrun --nproc-per-node 2 benchmark_attn.py | |
using two H100s I get: | |
Rank 0 single gpu attention: 1698.14 ms | |
Rank 0 ulysses attention: 912.84 ms | |
running pip install para-attn should install everything needed | |
""" |