@FrancescoJo
Created February 27, 2025 07:45
Run DeepSeek Models on my good old PC
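
A minimal FastAPI server that wraps a locally downloaded DeepSeek-R1-Distill-Llama-8B model with vLLM, followed by the pyproject.toml that pins its dependencies.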
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from vllm import LLM, SamplingParams

app = FastAPI()

model_path = "/home/deploy/workspace/DeepSeek-R1-Distill-Llama-8B"
# enforce_eager=True skips CUDA graph capture, saving GPU memory at some
# cost in speed -- a reasonable trade-off on an older card.
llm = LLM(
    model=model_path,
    gpu_memory_utilization=0.9,
    tensor_parallel_size=1,
    enforce_eager=True,
    max_model_len=16384,
)
sampling_params = SamplingParams(temperature=0.6, max_tokens=16384)

class QueryRequest(BaseModel):
    query: str

# A plain `def` endpoint: FastAPI runs it in a worker thread, so the
# blocking llm.generate() call does not stall the event loop.
@app.post("/generate/")
def generate_response(request: QueryRequest):
    try:
        response = llm.generate(request.query, sampling_params)
        result_text = response[0].outputs[0].text
        return {"response": result_text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
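
To try the endpoint, start the server with uvicorn, e.g. `uvicorn main:app --host 0.0.0.0 --port 8000` (assuming the script is saved as main.py; the gist does not show the filename), then POST a JSON body with a `query` field. A minimal stdlib client sketch, assuming the default port 8000:

# Quick smoke test against the running server.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8000/generate/",
    data=json.dumps({"query": "Why is the sky blue?"}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["response"])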
pyproject.toml:

# `package-mode = false` is a Poetry setting and belongs under [tool.poetry];
# as a bare top-level key it is not valid in pyproject.toml.
[tool.poetry]
package-mode = false

[project]
name = "hwan-deepseek-with-fastapi"
version = "0.1.0"
description = "Serve DeepSeek-R1-Distill-Llama-8B locally with vLLM and FastAPI"
requires-python = ">=3.12"
dependencies = [
    "uvicorn==0.34.0",
    "fastapi==0.115.8",
    "pydantic==2.10.6",
    "vllm==0.7.0",
]
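
With Poetry 2.x, which understands the standard [project] table alongside [tool.poetry], `poetry install` pulls the pinned dependencies; the server then starts with `uvicorn main:app` (again assuming the filename main.py).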