Reproducing results from "Beat GPT-4o at Python by Searching with 100 Dumb LLaMAs"

See rune2e.sh (the last file below) for how to run the experiment end to end.
client.py
from datetime import datetime
import json
from pathlib import Path
from dataclasses import dataclass, asdict

import modal

image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "openai==1.38.0", "datasets==2.20.0"
)

app = modal.App("eval-infinite-monkeys", image=image)

volume = modal.Volume.from_name("humaneval", create_if_missing=True)
DATA_DIR = Path("/mnt/humaneval")

default_system_prompt = "Write the body for the Python function provided in the prompt below. Do not write anything else. Your output will be directly concatenated with the prompt and the resulting function executed against tests."

MINUTES = 60  # seconds
HOURS = 60 * MINUTES
@dataclass
class CompletionParams:
    model: str = None
    max_tokens: int = 1024
    temperature: float = 0.7
    top_p: float = 0.9
    frequency_penalty: float = 0
    presence_penalty: float = 0
    n: int = 1
    stop: str = None
    seed: int = None


@dataclass
class ClientParams:
    app_name: str = "example-infinite-monkeys"
    workspace: str = None
    api_key: str = "super-secret-token"

    @property
    def url(self):
        return f"https://{self.workspace}--{self.app_name}-serve.modal.run/v1"
@app.local_entrypoint()
def main(
    app_name: str = "example-infinite-monkeys",
    workspace: str = None,
    api_key: str = "super-secret-token",
    model: str = None,
    max_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 0.9,
    frequency_penalty: float = 0,
    presence_penalty: float = 0,
    n: int = 1,
    stop: str = None,
    seed: int = None,
    data_dir: str = "dev-llm",
    subsample: int = 1,
    system_prompt: str = default_system_prompt,
    dry_run: bool = True,
):
    if workspace is None:
        workspace = modal.config._profile

    client_params = ClientParams(app_name, workspace, api_key)

    completion_params = CompletionParams(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        n=n,
        stop=stop,
        seed=seed,
    )

    save_dataset.remote(path=data_dir, subsample=subsample)

    results = run_human_eval.remote(
        client_params=client_params,
        completion_params=completion_params,
        system_prompt=system_prompt,
        data_dir=data_dir,
        dry_run=dry_run,
    )
    if results:
        with open("/tmp/results.jsonl", "w") as f:
            f.writelines(json.dumps(result) + "\n" for result in results)
        print(f"results saved locally to {f.name}")
@app.function(volumes={DATA_DIR: volume}, timeout=1 * HOURS)
def run_human_eval(
    client_params: ClientParams,
    completion_params: CompletionParams,
    data_dir="dev-llm",
    system_prompt: str = default_system_prompt,
    dry_run=True,
):
    dataset = load_dataset(data_dir)

    timestamp = datetime.utcnow().isoformat() + "Z"
    output_dir = Path(DATA_DIR) / data_dir / f"run-{timestamp}"
    output_dir.mkdir(parents=True, exist_ok=True)

    handles = []
    for i, item in enumerate(dataset):
        handles.append(
            run_item.spawn(
                item,
                client_params,
                completion_params,
                system_prompt,
                output_dir,
                dry_run,
            )
        )

    # wait for every spawned item to finish, not just the first
    results = []
    for handle in handles:
        result = handle.get()
        if result is not None:
            results.append(result)
    if not dry_run:
        return results
@app.function(volumes={DATA_DIR: volume}, timeout=1 * HOURS)
def run_item(
    item: dict,
    client_params: ClientParams,
    completion_params: CompletionParams,
    system_prompt: str,
    output_dir: Path,
    dry_run: bool,
):
    client = create_client(client_params)
    if completion_params.model:
        print(
            Colors.BOLD,
            f"🧠: Using model {completion_params.model}. This may trigger a model load on first call!",
            Colors.END,
            sep="",
        )
    else:
        print(
            Colors.BOLD,
            f"🔎: Looking up available models on server at {client.base_url}. This may trigger a model load!",
            Colors.END,
            sep="",
        )
        model = client.models.list().data[0]
        model = model.id
        print(
            Colors.BOLD,
            f"🧠: Using {model}",
            Colors.END,
            sep="",
        )
        completion_params.model = model

    prompt = item["prompt"]
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    per_request = 250  # cap per API call; larger n is split into chunks below
    ct, completions = completion_params.n, []
    if not dry_run:
        while ct > 0:
            response = get_completion(
                client,
                messages=messages,
                **asdict(completion_params) | dict(n=min(ct, per_request)),
            )
            if response:
                completions += [
                    {
                        "task_id": item["task_id"],
                        "completion": choice.message.content,
                    }
                    for choice in response.choices
                ]
            ct -= per_request

    index = item["task_id"].split("/")[-1]
    output_path = output_dir / f"{index}.jsonl"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        f.writelines(json.dumps(completion) + "\n" for completion in completions)
    print(Colors.GREEN + f"Completions saved to {output_path}" + Colors.END)
class Colors:
    """ANSI color codes"""

    GREEN = "\033[0;32m"
    RED = "\033[0;31m"
    BLUE = "\033[0;34m"
    GRAY = "\033[0;90m"
    BOLD = "\033[1m"
    END = "\033[0m"


def get_completion(client, **kwargs):
    try:
        response = client.chat.completions.create(**kwargs)
        return response
    except Exception as e:
        print(Colors.RED, f"Error during API call: {e}", Colors.END, sep="")
        return None


def create_client(client_params: ClientParams):
    from openai import OpenAI

    client = OpenAI(api_key=client_params.api_key)
    client.base_url = client_params.url
    return client
@app.function(volumes={DATA_DIR: volume})
def save_dataset(path="dev-llm", subsample: int = 1):
    import datasets

    path = DATA_DIR / path

    ds = datasets.load_dataset(
        "openai/openai_humaneval",
        split=datasets.ReadInstruction("test", to=subsample, unit="%"),
    )

    ds.to_json(path / "data.jsonl")

    volume.commit()


def load_dataset(path="dev-llm"):
    import datasets

    path = DATA_DIR / path

    ds = datasets.load_dataset(path=str(path), data_files="data.jsonl")

    return ds["train"]
download_llama.py
import modal

MODELS_DIR = "/llamas"

DEFAULT_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DEFAULT_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"

volume = modal.Volume.from_name("llamas", create_if_missing=True)

image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        [
            "huggingface_hub",  # download models from the Hugging Face Hub
            "hf-transfer",  # download models faster with Rust
        ]
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)

MINUTES = 60
HOURS = 60 * MINUTES

app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])
@app.function(volumes={MODELS_DIR: volume}, timeout=4 * HOURS)
def download_model(model_name, model_revision, force_download=False):
    from huggingface_hub import snapshot_download

    volume.reload()

    snapshot_download(
        model_name,
        # nest under the model name so inference.py finds MODELS_DIR/MODEL_NAME
        local_dir=MODELS_DIR + "/" + model_name,
        ignore_patterns=[
            "*.pt",
            "*.bin",
            "*.pth",
            "original/*",
        ],  # Ensure safetensors
        revision=model_revision,
        force_download=force_download,
    )

    volume.commit()
@app.local_entrypoint()
def main(
    model_name: str = DEFAULT_NAME,
    model_revision: str = DEFAULT_REVISION,
    force_download: bool = False,
):
    download_model.remote(model_name, model_revision, force_download)
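
To confirm the weights landed in the MODELS_DIR/MODEL_NAME layout that inference.py expects, a hypothetical helper along these lines could be appended to this file (check_download is not part of the original):

# Hypothetical sanity check, not in the original: list the downloaded files
# to confirm the MODELS_DIR/MODEL_NAME layout that inference.py expects.
@app.function(volumes={MODELS_DIR: volume})
def check_download(model_name: str = DEFAULT_NAME):
    from pathlib import Path

    model_dir = Path(MODELS_DIR) / model_name
    assert model_dir.exists(), f"{model_dir} missing -- run download_model first"
    print(sorted(p.name for p in model_dir.iterdir()))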
eval.py
from pathlib import Path

import modal

app = modal.App("humaneval-sandbox")

volume = modal.Volume.from_name("humaneval", create_if_missing=True)

sandbox_image = (
    modal.Image.debian_slim()
    .apt_install("git")
    .run_commands(
        "git clone https://github.com/modal-labs/human-eval.git",
        "pip install -e human-eval",
    )
)

MINUTES = 60
@app.function(volumes={"/humaneval": volume}) | |
def run_humaneval(sample_file_path: str, problem_file_path: str): | |
with modal.Volume.ephemeral() as vol: | |
with vol.batch_upload() as batch: | |
batch.put_file(sample_file_path, "samples.jsonl") | |
batch.put_file(problem_file_path, "problems.jsonl") | |
print(f"Starting sandbox for {sample_file_path}") | |
sandbox = modal.Sandbox.create( | |
"bash", | |
"-c", | |
"evaluate_functional_correctness vol/samples.jsonl --problem_file=vol/problems.jsonl --n_workers=32", | |
image=sandbox_image, | |
volumes={"/vol": vol}, | |
timeout=5 * MINUTES, | |
cpu=32, | |
) | |
try: | |
sandbox.wait_for(4 * MINUTES) | |
print(f"Finished sandbox for {sample_file_path}") | |
except TimeoutError: | |
print("Sandbox timed out") | |
if sandbox.returncode == 0: | |
print(sandbox.stdout.read()) | |
data = b"" | |
for chunk in vol.read_file("samples.jsonl_results.jsonl"): | |
data += chunk | |
with open(f"{sample_file_path}_results.jsonl", "wb") as f: | |
f.write(data) | |
else: | |
print(f"Tests failed with code {sandbox.returncode}") | |
print(sandbox.stderr.read()) | |
@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES) | |
def find_missing_files(): | |
import os | |
volume.reload() | |
# Find all files matching /humaneval/{env}/{run}/{id}.jsonl | |
envs = [element for element in Path("/humaneval").iterdir() if element.is_dir()] | |
for env in envs: | |
print(f"looking in {env}") | |
problem_file = env / "data.jsonl" | |
pattern = "*/*.jsonl" | |
handles = [] | |
for file_path in env.glob(pattern): | |
# Skip files that end with _results.jsonl | |
if str(file_path).endswith("_results.jsonl"): | |
continue | |
print(f"Checking {file_path}") | |
# Check if the corresponding results file exists | |
results_file = f"{file_path}_results.jsonl" | |
if not os.path.exists(results_file): | |
# If it doesn't exist, run run_humaneval | |
handles.append(run_humaneval.spawn(file_path, problem_file)) | |
for handle in handles: | |
handle.get() | |
@app.local_entrypoint() | |
def main(): | |
find_missing_files.remote() |
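
For orientation, here is the layout of the shared "humaneval" volume that find_missing_files walks, as produced by client.py's save_dataset and run_item and extended by run_humaneval (the timestamp is illustrative):

# Illustrative layout of the "humaneval" volume (timestamp is made up):
#
# /humaneval/
#   dev-llm/                          # one directory per --data-dir ("env")
#     data.jsonl                      # problems, written by save_dataset
#     run-2025-01-01T00:00:00Z/       # one directory per generation run
#       0.jsonl                       # completions for HumanEval/0, from run_item
#       0.jsonl_results.jsonl         # graded results, written by run_humaneval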
inference.py
import modal

vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
    "vllm==0.5.3post1"
)

MODELS_DIR = "/llamas"
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
MODEL_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"

try:
    volume = modal.Volume.lookup("llamas", create_if_missing=False)
except modal.exception.NotFoundError:
    raise Exception("Download models first with modal run download_llama.py")

app = modal.App("example-infinite-monkeys")

N_GPU = 1  # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
TOKEN = "super-secret-token"  # auth token. for production use, replace with a modal.Secret

MINUTES = 60  # seconds
HOURS = 60 * MINUTES
@app.function(
    image=vllm_image,
    gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
    container_idle_timeout=5 * MINUTES,
    timeout=24 * HOURS,
    allow_concurrent_inputs=2,
    volumes={MODELS_DIR: volume},
    concurrency_limit=10,
)
@modal.asgi_app()
def serve():
    import fastapi
    import vllm.entrypoints.openai.api_server as api_server
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.entrypoints.logger import RequestLogger
    from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
    from vllm.entrypoints.openai.serving_completion import (
        OpenAIServingCompletion,
    )
    from vllm.usage.usage_lib import UsageContext

    volume.reload()  # ensure we have the latest version of the weights

    # create a fastAPI app that uses vLLM's OpenAI-compatible router
    web_app = fastapi.FastAPI(
        title=f"OpenAI-compatible {MODEL_NAME} server",
        description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
        version="0.0.1",
        docs_url="/docs",
    )

    # security: CORS middleware for external requests
    http_bearer = fastapi.security.HTTPBearer(
        scheme_name="Bearer Token",
        description="See code for authentication details.",
    )
    web_app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # security: inject dependency on authed routes
    async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
        if api_key.credentials != TOKEN:
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
                detail="Invalid authentication credentials",
            )
        return {"username": "authenticated_user"}

    router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])

    # wrap vllm's router in auth router
    router.include_router(api_server.router)
    # add authed vllm to our fastAPI app
    web_app.include_router(router)

    engine_args = AsyncEngineArgs(
        model=MODELS_DIR + "/" + MODEL_NAME,
        tensor_parallel_size=N_GPU,
        gpu_memory_utilization=0.90,
        max_model_len=2048,
        enforce_eager=False,  # capture the graph for faster inference, but slower cold starts (30s > 20s)
    )

    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER
    )

    model_config = get_model_config(engine)

    request_logger = RequestLogger(max_log_len=2048)

    api_server.openai_serving_chat = OpenAIServingChat(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_NAME],
        chat_template=None,
        response_role="assistant",
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
    api_server.openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_NAME],
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )

    return web_app
def get_model_config(engine):
    import asyncio

    try:  # adapted from vLLM source -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is not None and event_loop.is_running():
        # if the current process was launched by Ray Serve,
        # there is already a running event loop
        model_config = event_loop.run_until_complete(engine.get_model_config())
    else:
        # when using a single vLLM instance without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())

    return model_config
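
Once deployed, the server speaks the OpenAI chat API. A minimal sketch of a direct query, mirroring create_client and ClientParams.url from client.py ("your-workspace" is a placeholder for your Modal workspace name):

# Minimal sketch: query the deployed server with the OpenAI SDK.
# "your-workspace" is a placeholder; the URL scheme follows ClientParams.url.
from openai import OpenAI

client = OpenAI(api_key="super-secret-token")  # must match TOKEN above
client.base_url = "https://your-workspace--example-infinite-monkeys-serve.modal.run/v1"

model_id = client.models.list().data[0].id  # the server reports the loaded model
response = client.chat.completions.create(
    model=model_id,
    messages=[{"role": "user", "content": "def fib(n):"}],
    max_tokens=128,
    temperature=0.7,
)
print(response.choices[0].message.content)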
rune2e.sh
#!/bin/bash
set -euo pipefail
IFS=$'\n\t'

command -v modal >/dev/null 2>&1 || { echo >&2 "modal command not found. Install modal first! Aborting."; exit 1; }

echo 'downloading LLaMA 3.1 8B'
echo 'make sure to create a Secret called huggingface on Modal and accept the LLaMA 3.1 license'
modal run download_llama.py

echo 'deploying vLLM inference server'
modal deploy inference.py

echo 'running HumanEval generation'
modal run client.py --data-dir test --no-dry-run --n 1000 --subsample 100

echo 'running HumanEval evaluation'
modal run eval.py::find_missing_files

echo 'run "modal launch jupyter --volume humaneval" and upload the notebook to run the analysis'