Wei (w32zhong)
⛹️ Trying to keep up.
@w32zhong
w32zhong / Dockerfile
Last active February 28, 2025 16:34
Example dockerfile
FROM nvcr.io/nvidia/pytorch:23.11-py3
WORKDIR /workspace
# the requirements file must be ADDed before it can be installed
ADD requirements.txt r2.txt
RUN pip install -r r2.txt
# FlashAttention-2 compatibility copied from https://github.com/Dao-AILab/flash-attention/issues/836#issuecomment-1951433985
RUN pip install flash-attn==2.5.1.post1
RUN apt update && apt install -y tmux git-lfs
RUN pip install nvitop
ADD . myproject
WORKDIR /workspace/myproject
@w32zhong
w32zhong / grpo_demo.py
Created February 18, 2025 01:17 — forked from willccbb/grpo_demo.py
GRPO Llama-1B
# train_grpo.py
#
# See https://github.com/willccbb/verifiers for ongoing developments
#
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig
from trl import GRPOConfig, GRPOTrainer
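The preview stops at the imports. For context, a minimal sketch of the TRL GRPO API these imports feed into, assuming the documented GRPOTrainer interface; the model id, dataset, and reward function below are placeholders, not the gist's (the gist's own reward, judging by the `re` import, is regex-based):

# Placeholder reward: prefer completions near 20 characters.
def reward_len(completions, **kwargs):
    return [-abs(20 - len(c)) for c in completions]

training_args = GRPOConfig(output_dir="grpo-out", logging_steps=10)
trainer = GRPOTrainer(
    model="meta-llama/Llama-3.2-1B-Instruct",  # placeholder model id
    reward_funcs=reward_len,
    args=training_args,
    train_dataset=load_dataset("trl-lib/tldr", split="train"),  # placeholder dataset
)
trainer.train()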
@w32zhong
w32zhong / steps.md
Last active February 10, 2025 22:09

EAGLE v1 Replication

Set up the environment and run an inference test:

git clone --branch v1 --depth 1 https://github.com/SafeAILab/EAGLE.git EAGLE-v1
cd EAGLE-v1
wget https://raw.githubusercontent.com/w32zhong/EAGLE/refs/heads/eagle-v1-save/application/test_v1.py -O eagle/application/test_v1.py
pip install -e .
pip install transformers==4.36.2
pip install accelerate==0.21.0
pip install datasets==3.2.0
python eagle/application/test_v1.py  # run the downloaded inference test (invocation assumed; adjust model paths inside as needed)
@w32zhong
w32zhong / gpu_vram_estimate.py
Created October 5, 2024 17:14
GPU vram estimate for pre-training LLMs.
import math
def act_mem(layers, seqlen, h_dim, heads, precision=2, bs=1):
    """ Returns amount of GPU VRAM (in GiB) required to store
    intermediate activations for traditional Transformer blocks
    """
    mem_bytes = layers * precision * seqlen * bs * h_dim * (
        16 + 2/precision + 2*heads*seqlen/h_dim
        + heads*seqlen/(precision*h_dim)
    )
    return mem_bytes / 1024**3  # bytes -> GiB (the preview truncates before this return)
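A quick sanity check of act_mem with Llama-7B-like shapes (the numbers are illustrative, not from the gist):

# 32 layers, 4096-token context, hidden size 4096, 32 heads, fp16 (2 bytes/value), batch 1
print(round(act_mem(layers=32, seqlen=4096, h_dim=4096, heads=32, precision=2, bs=1), 1), "GiB")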
import time  # needed for the timing below

def test(method, bits, random_top_layer, quantize_top_layer, results={}):
    # `prompt`, `test_vanilla`, and `test_eagle` are defined elsewhere in
    # the gist; the listing truncates this preview mid-function.
    print(prompt)
    start_time = time.time()
    if method == 'vanilla':
        cnt_tokens = test_vanilla(bits)
    elif method == 'eagle':
        cnt_tokens = test_eagle(bits,
            random_top_layer=random_top_layer,
            quantize_top_layer=quantize_top_layer
        )
from tvm.script import ir as I
from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(var_A: T.handle, B: T.Buffer((768, 384), "int8"), Scale: T.Buffer((768, 3), "float16"), Zeros: T.Buffer((768, 3), "float16"), var_D: T.handle):
        T.func_attr({"dequantize_info": {"B_decode": {"decode_block": "B_decode", "fast_decoding": T.bool(False), "group_size": 256, "source_format": {"bits": 4, "format": "uint"}, "storage_dtype": "int8", "target_format": "float16", "with_scaling": T.bool(True), "with_zeros": T.bool(True), "zeros_mode": "rescale"}}, "dlight.tensorcore_prenormlized": T.bool(True), "opt_shapes": {"m": [2, 12]}, "tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        m = T.int32()
        A = T.match_buffer(var_A, (m, 768), "float16")
        D = T.match_buffer(var_D, (m, 768), "float16")
        # with T.block("root"):
        A_reindex_pad_shared_dyn = T.alloc_buffer((1, (m + 127) // 128 * 128, 768), "float16", scope="shared.dyn")
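The kernel body is cut off above. As a hedged sketch only: a prescheduled TVMScript module like this is typically compiled and invoked roughly as follows (target, device, and the dynamic dimension m are assumptions):

import numpy as np
import tvm

rt_mod = tvm.build(Module, target="cuda")  # requires a GPU-enabled TVM build
dev = tvm.cuda(0)
m = 4  # dynamic row count; the opt_shapes attr above hints at m in [2, 12]
A = tvm.nd.array(np.random.randn(m, 768).astype("float16"), dev)
B = tvm.nd.array(np.zeros((768, 384), dtype="int8"), dev)      # packed 4-bit weights
Scale = tvm.nd.array(np.ones((768, 3), dtype="float16"), dev)
Zeros = tvm.nd.array(np.zeros((768, 3), dtype="float16"), dev)
D = tvm.nd.array(np.empty((m, 768), dtype="float16"), dev)     # output buffer
rt_mod["main"](A, B, Scale, Zeros, D)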
%pip install ipympl
%matplotlib ipympl
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider

# draw initial triangle (one row of x- and one of y-coordinates, repeating the
# first vertex to close the shape; the preview truncates after the first row,
# so the second row here is an assumed completion)
triangle = np.array([
    [1, 4, 1, 1],
    [1, 1, 3, 1],  # assumed y-coordinates
])
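The preview never reaches the gist's widget code; a minimal sketch of how the imported Slider is typically wired up (axis placement and the scaling behavior are assumptions, not the gist's):

fig, ax = plt.subplots()
line, = ax.plot(triangle[0], triangle[1])
slider_ax = fig.add_axes([0.2, 0.02, 0.6, 0.03])
scale = Slider(slider_ax, 'scale', 0.1, 3.0, valinit=1.0)

def update(val):
    # redraw the triangle scaled about the origin
    line.set_data(triangle[0] * scale.val, triangle[1] * scale.val)
    fig.canvas.draw_idle()

scale.on_changed(update)
plt.show()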
@w32zhong
w32zhong / bark.py
Last active May 15, 2023 21:37
Bark
# pip install git+https://github.com/suno-ai/bark.git && pip uninstall -y torch torchvision torchaudio && pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
text_prompt = """
For all neural retrievers, we fine-tune them on top of a further-pretrained backbone using batched triplets.
Each triplet contains a query q, and a pair of positive and negative passages, p plus and p minus.
We use passages of other training instances as additional negatives, which is a common practice to get more training samples for free.
"""
from bark import SAMPLE_RATE, generate_audio, preload_models
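The preview ends at the import; Bark's documented flow from here is short (the output filename is illustrative):

from scipy.io.wavfile import write as write_wav

preload_models()                           # download and cache the Bark weights
audio_array = generate_audio(text_prompt)  # synthesize speech for the prompt above
write_wav("bark_out.wav", SAMPLE_RATE, audio_array)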
@w32zhong
w32zhong / build.sh
Last active May 11, 2023 02:53
Build PyTorch 2.0
# clone pytorch at b004c0b3c6a1ee39ba0b512a00d95e7f83852556 with all submodules.
git clone -b main --recursive https://github.com/pytorch/pytorch
cd pytorch
git checkout b004c0b3c6a1ee39ba0b512a00d95e7f83852556  # pin the commit named in the comment above
git submodule update --init --recursive                # resync submodules to the pinned commit
inotifywait --event create -rm /home/tk/anaconda3/envs/pytorch-ref/
conda deactivate
conda env remove -n pytorch-src
conda create -n pytorch-src python=3.11
conda create --name llama -c conda-forge python=3.8
conda activate llama
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
python -c 'import torch; print(torch.cuda.is_available())'
conda install -c conda-forge gxx_linux-64=10.4.0
conda install cuda -c nvidia/label/cuda-11.8.0
#pip install packaging flash-attn