Skip to content

Instantly share code, notes, and snippets.

@g023
Created April 21, 2026 17:47
Show Gist options
  • Select an option

  • Save g023/18b228dac4a1422627b13f86bfc6872c to your computer and use it in GitHub Desktop.

Select an option

Save g023/18b228dac4a1422627b13f86bfc6872c to your computer and use it in GitHub Desktop.
python inference for deepseek
import requests
import json
from typing import List, Dict, Optional, Generator, Union
import os
import time
import random
# ==============================================================================
# DeepSeek API Configuration (Replaces Ollama Globals)
# ==============================================================================
# Size of the substring to check for repeats.
# NOTE(review): nothing visible in this file reads G_LOOP_SIZE (llm_stream
# hard-codes its own chunk sizes) -- confirm whether it is still needed.
G_LOOP_SIZE = 75
# Text appended to the last user message by chat_with_deepseek when non-empty.
# Left empty: DeepSeek does not disable thinking through a prompt suffix; that
# is controlled via the "thinking" object in the request payload.
G_APPEND_PROMPT = ""
# DeepSeek API base URL; the default used by _resolve_host when neither an
# explicit host nor the DEEPSEEK_HOST environment variable is provided.
G_HOST = "https://api.deepseek.com"
# ------------------------------------------------------------------------------
# DeepSeek Model Selection
# Use "deepseek-chat" for standard chat + optional thinking, or
# "deepseek-reasoner" for dedicated reasoning.
# ------------------------------------------------------------------------------
# Alternative: G_MODEL = "deepseek-chat"
# Currently active: the dedicated reasoning model.
G_MODEL = "deepseek-reasoner"
# ------------------------------------------------------------------------------
# Thinking Mode Configuration
# For 'deepseek-chat', set G_THINKING = True to enable thinking (it is sent as
# the payload's "thinking" object). For 'deepseek-reasoner' the flag is not
# sent; per the original author's note, that model always thinks.
# ------------------------------------------------------------------------------
G_THINKING = True
# ------------------------------------------------------------------------------
# API Limits & Controls
# ------------------------------------------------------------------------------
# Positive integer required. For deepseek-reasoner, max is 64K.
G_MAX_OUTPUT_TOKENS = 256
# Default sampling temperature placed into G_OPTIONS.
G_TEMP = 0.0
# DeepSeek Options Dictionary (Simplified & Compatible)
# Every key here is unpacked directly into the request payload.
# Unsupported Ollama options like 'num_ctx', 'repeat_penalty', 'mirostat' are removed.
# For 'deepseek-reasoner', temperature, top_p, presence_penalty and
# frequency_penalty have no effect.
G_OPTIONS = {
    "max_tokens": G_MAX_OUTPUT_TOKENS,
    "temperature": G_TEMP,
    "top_p": 0.9,
    "frequency_penalty": 1.0,
    "presence_penalty": 0.5
}
# ------------------------------------------------------------------------------
# Environment Variables
# ------------------------------------------------------------------------------
# Read from the DEEPSEEK_API_KEY environment variable. The fallback string is
# only a placeholder: set the variable or replace the string with a real key.
G_API_KEY = os.getenv("DEEPSEEK_API_KEY", "sk-your_key_here")
# Unique sentinel returned by _parse_stream_line to signal the end of a stream;
# consumers compare against it with `is`.
_STREAM_DONE = object()
def _resolve_host(host: Optional[str]) -> str:
    """Return the DeepSeek base URL to use, normalised for endpoint building.

    Candidates are tried in order: the explicit ``host`` override, the
    ``DEEPSEEK_HOST`` environment variable, then the module default ``G_HOST``.
    A candidate that is ``None``, empty, or blank after normalisation is
    treated as unset so the next candidate is used.

    Args:
        host: Optional per-call override of the base URL.

    Returns:
        The chosen base URL with surrounding whitespace and trailing slashes
        removed, ready to have a path such as ``/chat/completions`` appended.
    """
    for candidate in (host, os.getenv("DEEPSEEK_HOST")):
        # Strip whitespace first: a value like "https://host/ " (common in
        # .env files) would otherwise keep its slash and its trailing blank.
        cleaned = (candidate or "").strip().rstrip("/")
        if cleaned:
            return cleaned
    return G_HOST.strip().rstrip("/")
def _parse_stream_line(raw_line: Union[bytes, str]) -> Optional[Union[Dict, object]]:
    """Parse one line of a DeepSeek streaming (SSE) response.

    Args:
        raw_line: A single line of the response body. Bytes are decoded as
            UTF-8, with undecodable bytes replaced rather than raising.

    Returns:
        * the decoded JSON object (a ``dict``) for a ``data: {...}`` line or
          for a bare JSON line without the ``data:`` prefix;
        * the ``_STREAM_DONE`` sentinel for ``data: [DONE]``;
        * ``None`` for anything the caller should skip: blank lines, SSE
          comments (``:``-prefixed), non-data SSE fields, an empty ``data:``
          payload, JSON that fails to decode, and JSON that is not an object.
    """
    if isinstance(raw_line, bytes):
        text = raw_line.decode("utf-8", errors="replace")
    else:
        text = raw_line
    text = text.strip()

    # Blank separators and ":" comment lines (e.g. keep-alives) carry no data.
    if not text or text.startswith(":"):
        return None
    # Other SSE fields are metadata, not JSON payloads; skip them quietly
    # instead of reporting a spurious decode failure.
    if text.startswith(("event:", "id:", "retry:")):
        return None

    if text.startswith("data:"):
        body = text[5:].strip()
        if body == "[DONE]":
            return _STREAM_DONE
        if not body:
            # An empty data field is not an end-of-stream marker; treating it
            # as one would silently truncate the response.
            return None
    else:
        # Tolerate newline-delimited JSON without the SSE "data:" prefix.
        body = text

    try:
        parsed = json.loads(body)
    except json.JSONDecodeError as exc:
        print(f"Failed to decode chunk: {raw_line!r}, error: {exc}")
        return None
    if not isinstance(parsed, dict):
        # Consumers call .get() on every chunk, so only objects are usable.
        print(f"Ignoring non-object chunk: {raw_line!r}")
        return None
    return parsed
def chat_with_deepseek(
    messages: List[Dict[str, str]],
    model: str = G_MODEL,
    host: Optional[str] = None,
    stream: bool = False,
    options: Optional[Dict] = None,  # Will use G_OPTIONS if None
    thinking: bool = G_THINKING,
    **kwargs
) -> Union[Dict, Generator[Dict, None, None]]:
    """Send a conversation to DeepSeek's chat-completions endpoint.

    Keeps the call signature of ``chat_with_ollama`` so it can be used as a
    drop-in replacement.

    Args:
        messages: Conversation as a list of ``{"role": ..., "content": ...}``
            dicts. The list and its dicts are never modified.
        model: Model name placed in the payload.
        host: Optional base-URL override, resolved through ``_resolve_host``.
        stream: When true, a generator of parsed chunks is returned instead
            of a single response dict.
        options: Extra payload fields (``max_tokens``, ``temperature``, ...).
            Defaults to a copy of ``G_OPTIONS``. The keys are unpacked last,
            so they override ``model``/``messages``/``stream`` if present.
        thinking: Only sent (as the ``thinking`` object) when ``model`` is
            ``"deepseek-chat"``.
        **kwargs: Accepted for signature compatibility and ignored.

    Returns:
        The decoded JSON response (non-streaming), or the generator produced
        by ``_stream_response`` (streaming).

    Raises:
        ConnectionError: Non-streaming request could not reach the host.
        requests.exceptions.Timeout: Non-streaming request timed out.
        requests.exceptions.RequestException: Any other request failure,
            including HTTP error statuses.

        In streaming mode the request is only issued once the returned
        generator is iterated, so failures surface there as the underlying
        ``requests`` exceptions.
    """
    if options is None:
        options = G_OPTIONS.copy()

    # Append G_APPEND_PROMPT (legacy behavior, though DeepSeek doesn't use this).
    # Build a new list/dict so the caller's conversation is left untouched;
    # mutating it in place would append the suffix again on every retry.
    if G_APPEND_PROMPT and messages and messages[-1].get("role") == "user":
        last_message = dict(messages[-1])
        last_message["content"] = (last_message.get("content") or "") + f"{G_APPEND_PROMPT}"
        messages = [*messages[:-1], last_message]

    # --- Build DeepSeek API Payload ---
    payload = {
        "model": model,
        "messages": messages,
        "stream": stream,
        **options  # Unpack core options like max_tokens, temperature, etc.
    }
    # Conditionally add the "thinking" object for deepseek-chat.
    # Note: 'deepseek-reasoner' always thinks; this parameter is not needed.
    if model == "deepseek-chat":
        payload["thinking"] = {"type": "enabled" if thinking else "disabled"}

    # --- Resolve Host & Endpoint ---
    effective_host = _resolve_host(host)
    endpoint = f"{effective_host}/chat/completions"
    headers = {
        "Authorization": f"Bearer {G_API_KEY}",
        "Content-Type": "application/json"
    }

    # Print configuration for debugging (preserves original behavior)
    print(f"Using model: {model}")
    print(f"Temperature: {payload.get('temperature', 'N/A')}")
    print(f"Max Tokens: {payload.get('max_tokens', 'N/A')}")

    # --- Make the Request ---
    if stream:
        # Creating the generator performs no I/O, so there is nothing to
        # catch here; errors are raised while the caller iterates it.
        return _stream_response(endpoint, headers, payload)

    try:
        response = requests.post(endpoint, headers=headers, json=payload, timeout=600)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.ConnectionError as e:
        raise ConnectionError(f"Could not connect to DeepSeek at {effective_host}.") from e
    except requests.exceptions.Timeout as e:
        raise requests.exceptions.Timeout("Request timed out.") from e
    # Any other RequestException propagates unchanged.
def _stream_response(endpoint: str, headers: Dict, payload: Dict) -> Generator[Dict, None, None]:
    """Internal generator that POSTs ``payload`` and yields parsed stream chunks.

    Args:
        endpoint: Full URL of the chat-completions endpoint.
        headers: HTTP headers to send (authorization, content type).
        payload: Request body; sent with ``"stream": True`` forced on. The
            caller's dict is not modified.

    Yields:
        Each chunk dict returned by ``_parse_stream_line``.

    Raises:
        requests.exceptions.RequestException: On connection problems or an
            HTTP error status, raised when iteration starts.

    Iteration stops at the ``[DONE]`` marker, after a chunk with a truthy
    ``"done"`` key, or after a chunk whose first choice has a finish reason of
    ``stop``, ``length`` or ``content_filter`` (that chunk is still yielded).
    """
    body = {**payload, "stream": True}
    with requests.post(endpoint, headers=headers, json=body, stream=True, timeout=600) as response:
        response.raise_for_status()
        # Iterate raw bytes and let _parse_stream_line decode them as UTF-8.
        # With decode_unicode=True requests uses the encoding it guessed from
        # the headers (not necessarily UTF-8) and splits on Unicode line
        # separators, either of which can corrupt a JSON chunk.
        for line in response.iter_lines(decode_unicode=False):
            chunk = _parse_stream_line(line)
            if chunk is None:
                continue
            if chunk is _STREAM_DONE:
                break
            yield chunk
            if isinstance(chunk, dict):
                if chunk.get("done"):
                    break
                choices = chunk.get("choices") or []
                if choices:
                    finish_reason = choices[0].get("finish_reason")
                    if finish_reason in {"stop", "length", "content_filter"}:
                        break
def llm_nonstream(conv=None, thinking=True, options=None):
    """Run one non-streaming chat completion and return a summary dict.

    Args:
        conv: Conversation messages passed to ``chat_with_deepseek``.
            Defaults to an empty conversation.
        thinking: Forwarded to ``chat_with_deepseek``.
        options: Request options; defaults to a copy of ``G_OPTIONS``.

    Returns:
        A dict with:
          * ``"reasoning"``: the message's ``reasoning_content``, or the text
            before a trailing ``</think>`` tag when the reasoning is embedded
            in the content;
          * ``"content"``: the answer text;
          * ``"usage"``: ``reasoning_tokens``, ``content_tokens`` (the API's
            ``completion_tokens``) and ``total_tokens``;
          * ``"time_taken"``: seconds spent in the request.

        Any exception is printed, not raised; the dict is then returned with
        whatever fields had been filled in (empty defaults otherwise).
    """
    if conv is None:
        conv = []
    if options is None:
        options = G_OPTIONS.copy()
    ret_dict = {
        "reasoning": "",
        "content": "",
        "usage": {},
        "time_taken": 0,
    }
    print("\n--- (Non-Streaming) ---")
    try:
        time_start = time.time()
        response = chat_with_deepseek(
            messages=conv,
            model=G_MODEL,
            options=options,
            thinking=thinking
        )
        ret_dict["time_taken"] = time.time() - time_start
        message = response['choices'][0]['message']

        # DeepSeek separates reasoning and content. Either field may be
        # present but null, so normalise to "" before any string operations.
        reasoning = message.get('reasoning_content') or ''
        content = message.get('content') or ''

        # Fallback for older models that embed thinking in content with <think> tags
        if not reasoning and "</think>" in content:
            thought, answer = content.rsplit("</think>", 1)
            thought = thought.strip()
            if thought.startswith("<think>"):
                # Drop the opening tag so only the reasoning text remains.
                thought = thought[len("<think>"):].strip()
            reasoning, content = thought, answer.strip()

        ret_dict["reasoning"] = reasoning
        ret_dict["content"] = content

        # Use exact token counts returned by DeepSeek. Reasoning tokens are
        # looked up under completion_tokens_details first, then at the top
        # level of the usage object, else reported as 0.
        usage = response.get('usage') or {}
        details = usage.get('completion_tokens_details') or {}
        ret_dict["usage"] = {
            "reasoning_tokens": details.get('reasoning_tokens', usage.get('reasoning_tokens', 0)),
            "content_tokens": usage.get('completion_tokens', 0),
            "total_tokens": usage.get('total_tokens', 0)
        }
    except Exception as e:
        # Deliberate best effort: report the failure and return what we have.
        print(f"Error: {e}")
    return ret_dict
def _has_repetition(text):
    """Return True when ``text`` looks like the model got stuck repeating itself."""
    # 1. Line-level loop detection: a block of 1-5 consecutive non-blank
    #    lines that occurs at least four times.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    for block_size in range(1, 6):
        if len(lines) < block_size * 3:
            continue
        for i in range(len(lines) - block_size * 3 + 1):
            block = lines[i:i + block_size]
            matches = sum(
                1
                for j in range(len(lines) - block_size + 1)
                if lines[j:j + block_size] == block
            )
            if matches >= 4:
                return True

    # 2. Substring-level loop detection: a 75/100/150-character window,
    #    sampled every half window, that occurs at least four times.
    if len(text) >= 100:
        for chunk_size in (75, 100, 150):
            if len(text) < chunk_size * 4:
                continue
            for i in range(0, len(text) - chunk_size, chunk_size // 2):
                if text.count(text[i:i + chunk_size]) >= 4:
                    return True
    return False


def llm_stream(conv=None, thinking=True, options=None, retry_on_repeat=False, the_model=G_MODEL,
               max_retries=5):
    """Stream one chat completion, echoing it to stdout, and return a summary dict.

    Args:
        conv: Conversation messages passed to ``chat_with_deepseek``.
            Defaults to an empty conversation.
        thinking: Forwarded to ``chat_with_deepseek``.
        options: Request options; defaults to a copy of ``G_OPTIONS``. The
            caller's dict is copied, so retries never modify it.
        retry_on_repeat: When true, a response in which repetition is detected
            is discarded and the request is retried with a random temperature
            between 0.6 and 1.5.
        the_model: Model name to request.
        max_retries: Upper bound on such retries; ``None`` removes the bound.
            When the bound is reached the last response is returned as is.

    Returns:
        A dict with ``"reasoning"``, ``"content"``, ``"usage"``
        (``reasoning_tokens``, ``content_tokens``, ``total_tokens``) and
        ``"time_taken"`` in seconds.

        Any exception is printed, not raised; the dict is then returned
        immediately with empty ``"reasoning"``/``"content"``.
    """
    if conv is None:
        conv = []
    # Private copy: the retry path below rewrites options['temperature'].
    options = G_OPTIONS.copy() if options is None else dict(options)

    print("\n--- Streaming Example ---")
    print(f"Using model: {the_model}")
    print(f"Temperature: {options.get('temperature', G_TEMP)}")

    retries = 0
    while True:
        ret_dict = {
            "reasoning": "",
            "content": "",
            "usage": {},
            "time_taken": 0,
        }
        try:
            stream = chat_with_deepseek(
                messages=conv,
                model=the_model,
                stream=True,
                thinking=thinking,
                options=options,
            )
            print("Streaming response: ", end="")
            time_start = time.time()
            reason_str = ""
            response_str = ""
            in_reasoning = True
            for chunk in stream:
                choices = chunk.get('choices') or []
                first_choice = choices[0] if choices else {}
                delta = first_choice.get('delta') or {}

                # Handle dedicated reasoning_content field
                if delta.get('reasoning_content'):
                    reason_str += delta['reasoning_content']
                    print(delta['reasoning_content'], end="", flush=True)

                # Handle main content field
                if delta.get('content'):
                    if in_reasoning:
                        print("\n--- End of Reasoning, Start of Content ---")
                        in_reasoning = False
                    response_str += delta['content']
                    print(delta['content'], end="", flush=True)

                # A chunk may carry a complete message (top level or inside
                # the first choice) when content isn't streamed as deltas.
                # Store it in the accumulators: they are what is copied into
                # ret_dict after the loop.
                final_message = chunk.get('message') or first_choice.get('message')
                if final_message:
                    if not response_str and final_message.get('content'):
                        response_str = final_message['content']
                        print(response_str, end="", flush=True)
                    if not reason_str and final_message.get('reasoning_content'):
                        reason_str = final_message['reasoning_content']
                        print(reason_str, end="", flush=True)

                # Collect usage from whichever chunk carries it. The key can
                # be present with a null value, so test the value itself.
                usage = chunk.get('usage')
                if usage:
                    details = usage.get('completion_tokens_details') or {}
                    ret_dict["usage"] = {
                        "reasoning_tokens": details.get(
                            'reasoning_tokens', usage.get('reasoning_tokens', 0)
                        ),
                        "content_tokens": usage.get('completion_tokens', 0),
                        "total_tokens": usage.get('total_tokens', 0)
                    }
            print()  # Newline after stream

            # Fallback for </think> tags embedded in the content
            if not reason_str and "</think>" in response_str:
                thought, answer = response_str.rsplit("</think>", 1)
                thought = thought.strip()
                if thought.startswith("<think>"):
                    # Drop the opening tag so only the reasoning text remains.
                    thought = thought[len("<think>"):].strip()
                reason_str, response_str = thought, answer.strip()

            ret_dict["reasoning"] = reason_str
            ret_dict["content"] = response_str
            ret_dict["time_taken"] = time.time() - time_start
        except Exception as e:
            # Deliberate best effort: report the failure and stop.
            print(f"Error: {e}")
            return ret_dict

        # --- Repeat detection and bounded retry ---
        if not retry_on_repeat or not _has_repetition(response_str):
            break
        if max_retries is not None and retries >= max_retries:
            print(f"\n[!] Repeat detected, but retry limit ({max_retries}) reached; keeping last response.")
            break
        retries += 1
        # Retry with varied temperature
        options['temperature'] = round(random.uniform(0.6, 1.5), 2)
        print(f"\n[!] Repeat detected, retrying with temperature {options['temperature']}...")
    return ret_dict
# --- Example Usage (Unchanged) ---
if __name__ == "__main__":
    # --- IMPORTANT: Set your API Key before running ---
    # export DEEPSEEK_API_KEY="your_key_here"
    conversation = [
        {"role": "system", "content": "You are a helpful, concise assistant."},
        {"role": "user", "content": "Say hello in an alien language:"},
    ]

    # Stream one completion; the call echoes the response while it arrives
    # and hands back the collected text, usage counts and timing.
    ret_dict = llm_stream(conversation, thinking=G_THINKING)

    print("\n--- Reasoning ---\n")
    print(ret_dict["reasoning"])
    print("\n--- Content ---\n")
    print(ret_dict["content"])

    print("\n--- Token Counts and Timing Info ---\n")
    usage_info = ret_dict['usage']
    reasoning_count = usage_info.get('reasoning_tokens', 'N/A')
    content_count = usage_info.get('content_tokens', 'N/A')
    total_count = usage_info.get('total_tokens', 'N/A')
    print(f"Estimated Reasoning Tokens: {reasoning_count}")
    print(f"Estimated Content Tokens: {content_count}")
    print(f"Estimated Total Tokens: {total_count}")

    elapsed = ret_dict.get('time_taken', 'N/A')
    print(f"Total Time: {elapsed:.2f} seconds")
    # Throughput is only meaningful when some time was actually measured.
    if ret_dict.get('time_taken', 0) > 0:
        tokens_per_second = ret_dict['usage'].get('total_tokens', 0) / ret_dict['time_taken']
        print(f"Average Speed: {tokens_per_second:.2f} tokens/second")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment