Created
April 21, 2026 17:47
-
-
Save g023/18b228dac4a1422627b13f86bfc6872c to your computer and use it in GitHub Desktop.
Python inference for DeepSeek
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| import json | |
| from typing import List, Dict, Optional, Generator, Union | |
| import os | |
| import time | |
| import random | |
# ------------------------------------------------------------------------------
# DeepSeek API configuration (module-level defaults)
# ------------------------------------------------------------------------------

# Substring length intended for repeat checks. Not referenced by the functions
# visible in this file, which use their own window sizes.
G_LOOP_SIZE = 75

# Text appended to the last user message before sending. Empty means "append
# nothing"; thinking is toggled through the request's "thinking" object instead.
G_APPEND_PROMPT = ""

# Base URL of the DeepSeek API; the chat endpoint is built from it.
G_HOST = "https://api.deepseek.com"

# Model used by default. The alternative named by the original author is
# "deepseek-chat" (standard chat with optional thinking); "deepseek-reasoner"
# is the dedicated reasoning model.
G_MODEL = "deepseek-reasoner"

# Default thinking switch. The request only carries a "thinking" object when
# the model is "deepseek-chat"; for other models this flag is not sent.
G_THINKING = True

# Upper bound on generated tokens; must be a positive integer.
G_MAX_OUTPUT_TOKENS = 256

# Default sampling temperature.
G_TEMP = 0.0

# Default sampling options merged into every request payload.
# NOTE(review): the original author states that temperature, top_p and the two
# penalties have no effect on "deepseek-reasoner" -- confirm against the API docs.
G_OPTIONS = dict(
    max_tokens=G_MAX_OUTPUT_TOKENS,
    temperature=G_TEMP,
    top_p=0.9,
    frequency_penalty=1.0,
    presence_penalty=0.5,
)

# API key, read once at import time. Set DEEPSEEK_API_KEY in the environment;
# the fallback string is only a placeholder.
G_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "sk-your_key_here")

# Unique sentinel returned by the line parser to signal the end of a stream.
_STREAM_DONE = object()
| def _resolve_host(host: Optional[str]) -> str: | |
| """Resolve the DeepSeek host. Override, environment fallback, or default.""" | |
| return (host or os.getenv("DEEPSEEK_HOST") or G_HOST).rstrip('/') | |
| def _parse_stream_line(raw_line: Union[bytes, str]) -> Optional[Union[Dict, object]]: | |
| """Parse one SSE line from a DeepSeek streaming response.""" | |
| if isinstance(raw_line, bytes): | |
| line = raw_line.decode('utf-8', errors='replace') | |
| else: | |
| line = raw_line | |
| line = line.strip() | |
| if not line or line.startswith(':'): | |
| return None | |
| if line.startswith('data:'): | |
| payload = line[5:].strip() | |
| if not payload or payload == '[DONE]': | |
| return _STREAM_DONE | |
| else: | |
| payload = line | |
| try: | |
| return json.loads(payload) | |
| except json.JSONDecodeError as exc: | |
| print(f"Failed to decode chunk: {raw_line!r}, error: {exc}") | |
| return None | |
def chat_with_deepseek(
    messages: List[Dict[str, str]],
    model: str = G_MODEL,
    host: Optional[str] = None,
    stream: bool = False,
    options: Optional[Dict] = None,  # Falls back to a copy of G_OPTIONS
    thinking: bool = G_THINKING,
    **kwargs
) -> Union[Dict, Generator[Dict, None, None]]:
    """Send a conversation to the DeepSeek chat-completions endpoint.

    Args:
        messages: Conversation as a list of ``{"role": ..., "content": ...}``
            dicts. The list and its dicts are never modified.
        model: Model name placed in the request payload.
        host: Optional base URL override; resolved through ``_resolve_host``.
        stream: When true, return a generator of parsed stream chunks instead
            of a single response dict.
        options: Extra top-level payload fields (``max_tokens``,
            ``temperature``, ...). ``None`` means a copy of ``G_OPTIONS``.
        thinking: Only used when ``model == "deepseek-chat"``, where it selects
            ``{"type": "enabled"}`` or ``{"type": "disabled"}`` for the
            request's ``thinking`` object.
        **kwargs: Accepted and ignored. The original author's note says the
            signature mirrors an earlier ``chat_with_ollama`` helper.

    Returns:
        The decoded JSON response (non-streaming) or a generator yielding
        parsed chunks (streaming).

    Raises:
        ConnectionError: The non-streaming request could not connect.
        requests.exceptions.Timeout: The non-streaming request timed out.
        requests.exceptions.RequestException: Any other request failure,
            including HTTP error statuses from ``raise_for_status``.
        In streaming mode the request is only sent once the returned generator
        is iterated, so failures surface there as raw ``requests`` exceptions.
    """
    if options is None:
        options = G_OPTIONS.copy()

    # Legacy suffix support. Append to a copy of the last message so the
    # caller's conversation is untouched; mutating it in place would stack the
    # suffix again on every retry that reuses the same list.
    if G_APPEND_PROMPT and messages and messages[-1].get("role") == "user":
        last_message = dict(messages[-1])
        last_message["content"] = last_message["content"] + G_APPEND_PROMPT
        messages = [*messages[:-1], last_message]

    # --- Build DeepSeek API payload ---
    payload = {
        "model": model,
        "messages": messages,
        "stream": stream,
        **options  # Core options such as max_tokens, temperature, etc.
    }

    # The "thinking" object is only sent for deepseek-chat; other models get
    # no such field.
    if model == "deepseek-chat":
        payload["thinking"] = {"type": "enabled" if thinking else "disabled"}

    # --- Resolve host & endpoint ---
    effective_host = _resolve_host(host)
    endpoint = f"{effective_host}/chat/completions"
    headers = {
        "Authorization": f"Bearer {G_API_KEY}",
        "Content-Type": "application/json"
    }

    # Debug output of the effective configuration.
    print(f"Using model: {model}")
    print(f"Temperature: {payload.get('temperature', 'N/A')}")
    print(f"Max Tokens: {payload.get('max_tokens', 'N/A')}")

    # --- Make the request ---
    if stream:
        # Creating the generator performs no I/O, so there is nothing to catch
        # here; errors are raised while the caller iterates.
        return _stream_response(endpoint, headers, payload)

    try:
        response = requests.post(endpoint, headers=headers, json=payload, timeout=600)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.ConnectionError as e:
        raise ConnectionError(f"Could not connect to DeepSeek at {effective_host}.") from e
    except requests.exceptions.Timeout as e:
        raise requests.exceptions.Timeout("Request timed out.") from e
def _stream_response(endpoint: str, headers: Dict, payload: Dict) -> Generator[Dict, None, None]:
    """Yield parsed chunks from a streaming chat-completions request.

    The request is sent when iteration starts. Each line of the response body
    goes through ``_parse_stream_line``; skipped lines are dropped, and the
    stream ends on the done sentinel, on a chunk with a truthy ``"done"`` key,
    or after yielding a chunk whose first choice has a terminal
    ``finish_reason``.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or an HTTP error status.
    """
    # Force streaming on a copy so the caller's payload dict is not modified.
    payload = {**payload, "stream": True}
    with requests.post(endpoint, headers=headers, json=payload, stream=True, timeout=600) as response:
        response.raise_for_status()
        # Iterate raw bytes and let _parse_stream_line decode them as UTF-8.
        # With decode_unicode=True, requests decodes using the encoding guessed
        # from the headers (ISO-8859-1 for text/* without a charset), which
        # garbles non-ASCII output, and splits on Unicode line separators that
        # may legally appear inside a JSON string.
        for line in response.iter_lines():
            chunk = _parse_stream_line(line)
            if chunk is None:
                continue
            if chunk is _STREAM_DONE:
                break
            yield chunk

            if not isinstance(chunk, dict):
                continue
            if chunk.get("done"):
                break
            # NOTE(review): stopping at the first terminal finish_reason skips
            # any chunk the server sends afterwards (e.g. a trailing usage-only
            # chunk) -- confirm this matches the API's stream layout.
            choices = chunk.get("choices") or []
            if choices and choices[0].get("finish_reason") in {"stop", "length", "content_filter"}:
                break
def llm_nonstream(conv=None, thinking=True, options=None):
    """Run one non-streaming chat completion with the default model.

    Args:
        conv: Conversation messages passed to ``chat_with_deepseek``.
            ``None`` means an empty conversation.
        thinking: Forwarded to ``chat_with_deepseek``.
        options: Request options; ``None`` means a copy of ``G_OPTIONS``.

    Returns:
        A dict with ``reasoning`` and ``content`` strings, a ``usage`` dict
        (``reasoning_tokens``, ``content_tokens``, ``total_tokens``) and
        ``time_taken`` in seconds. Any failure is printed and whatever was
        filled in so far is returned; nothing is raised.
    """
    if conv is None:
        conv = []
    if options is None:
        options = G_OPTIONS.copy()
    ret_dict = {
        "reasoning": "",
        "content": "",
        "usage": {},
        "time_taken": 0,
    }
    print("\n--- (Non-Streaming) ---")
    try:
        time_start = time.time()
        response = chat_with_deepseek(
            messages=conv,
            model=G_MODEL,
            options=options,
            thinking=thinking
        )
        message = response['choices'][0]['message']
        ret_dict["time_taken"] = time.time() - time_start

        # Reasoning and answer arrive in separate fields. A JSON null becomes
        # "" so the string handling below cannot fail on None.
        ret_dict["reasoning"] = message.get('reasoning_content') or ''
        ret_dict["content"] = message.get('content') or ''

        # Fallback for models that embed reasoning in the content, terminated
        # by a </think> tag: everything before the last tag is reasoning.
        if not ret_dict["reasoning"] and "</think>" in ret_dict["content"]:
            reasoning_part, answer_part = ret_dict["content"].rsplit("</think>", 1)
            ret_dict["reasoning"] = reasoning_part.strip()
            ret_dict["content"] = answer_part.strip()

        # Token counts as reported by the API. The reasoning share is read from
        # completion_tokens_details when present and stays 0 otherwise.
        usage = response.get('usage') or {}
        details = usage.get('completion_tokens_details') or {}
        ret_dict["usage"] = {
            "reasoning_tokens": details.get('reasoning_tokens', 0),
            "content_tokens": usage.get('completion_tokens', 0),
            "total_tokens": usage.get('total_tokens', 0)
        }
    except Exception as e:
        # Deliberate best-effort: report the problem and return partial data.
        print(f"Error: {e}")
    return ret_dict
def llm_stream(conv=None, thinking=True, options=None, retry_on_repeat=False, the_model=G_MODEL):
    """Stream a chat completion, echoing it to stdout, and collect the result.

    Args:
        conv: Conversation messages passed to ``chat_with_deepseek``.
            ``None`` means an empty conversation.
        thinking: Forwarded to ``chat_with_deepseek``.
        options: Request options; ``None`` means a copy of ``G_OPTIONS``.
            The caller's dict is never modified.
        retry_on_repeat: When true, a response that looks like a repetition
            loop is discarded and the request is retried with a random
            temperature in [0.6, 1.5] until a non-repeating response arrives.
        the_model: Model name for the request.

    Returns:
        A dict with ``reasoning`` and ``content`` strings, a ``usage`` dict
        (``reasoning_tokens``, ``content_tokens``, ``total_tokens``) and
        ``time_taken`` in seconds. If the request or the stream fails, the
        error is printed and the result dict is returned as it stood at that
        point (text fields empty).
    """
    if conv is None:
        conv = []
    # Private copy: the retry path below overwrites 'temperature', which must
    # not leak into a dict owned by the caller.
    options = G_OPTIONS.copy() if options is None else dict(options)

    print("\n--- Streaming Example ---")
    print(f"Using model: {the_model}")
    print(f"Temperature: {options.get('temperature', G_TEMP)}")

    while True:
        ret_dict = {
            "reasoning": "",
            "content": "",
            "usage": {},
            "time_taken": 0,
        }
        try:
            stream = chat_with_deepseek(
                messages=conv,
                model=the_model,
                stream=True,
                thinking=thinking,
                options=options,
            )
            print("Streaming response: ", end="")
            time_start = time.time()
            reason_parts = []
            content_parts = []
            in_reasoning = True

            for chunk in stream:
                if not isinstance(chunk, dict):
                    # The parser may hand back any JSON value; only objects
                    # carry chat data.
                    continue

                choices = chunk.get('choices') or []
                first_choice = choices[0] if choices else {}
                delta = first_choice.get('delta') or {}

                # Dedicated reasoning field.
                reasoning_piece = delta.get('reasoning_content')
                if reasoning_piece:
                    reason_parts.append(reasoning_piece)
                    print(reasoning_piece, end="", flush=True)

                # Main answer field; print a separator before its first piece.
                content_piece = delta.get('content')
                if content_piece:
                    if in_reasoning:
                        print("\n--- End of Reasoning, Start of Content ---")
                        in_reasoning = False
                    content_parts.append(content_piece)
                    print(content_piece, end="", flush=True)

                # Some chunks carry a complete message instead of deltas, at
                # the top level or inside the first choice. Use it only for
                # parts that were not streamed, and store it in the same
                # accumulators so it survives to the final result.
                final_message = chunk.get('message') or first_choice.get('message')
                if final_message:
                    if not content_parts:
                        final_content = final_message.get('content') or ''
                        if final_content:
                            content_parts.append(final_content)
                        print(final_content, end="", flush=True)
                    if not reason_parts:
                        final_reasoning = final_message.get('reasoning_content') or ''
                        if final_reasoning:
                            reason_parts.append(final_reasoning)
                        print(final_reasoning, end="", flush=True)

                # Usage may be absent or null on intermediate chunks.
                usage = chunk.get('usage')
                if usage:
                    details = usage.get('completion_tokens_details') or {}
                    ret_dict["usage"] = {
                        "reasoning_tokens": usage.get(
                            'reasoning_tokens', details.get('reasoning_tokens', 0)
                        ),
                        "content_tokens": usage.get('completion_tokens', 0),
                        "total_tokens": usage.get('total_tokens', 0)
                    }

            print()  # Newline after stream

            reason_str = "".join(reason_parts)
            response_str = "".join(content_parts)

            # Fallback for reasoning embedded in the content before </think>.
            if not reason_str and "</think>" in response_str:
                reason_str, response_str = (
                    part.strip() for part in response_str.rsplit("</think>", 1)
                )

            ret_dict["reasoning"] = reason_str
            ret_dict["content"] = response_str
            ret_dict["time_taken"] = time.time() - time_start
        except Exception as e:
            # Deliberate best-effort: report the problem and stop retrying.
            print(f"Error: {e}")
            return ret_dict

        if not retry_on_repeat or not _has_repeated_output(response_str):
            break

        # Retry with a different temperature to break the loop.
        options['temperature'] = round(random.uniform(0.6, 1.5), 2)
        print(f"\n[!] Repeat detected, retrying with temperature {options['temperature']}...")

    return ret_dict


def _has_repeated_output(text):
    """Return True when ``text`` looks like the model got stuck in a loop."""
    # 1. Line level: some run of 1-5 consecutive non-empty lines occurs at
    #    least four times (overlapping occurrences count).
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    for block_size in range(1, 6):
        if len(lines) < block_size * 3:
            continue
        for i in range(len(lines) - block_size * 3 + 1):
            block = lines[i:i + block_size]
            matches = sum(
                1
                for j in range(len(lines) - block_size + 1)
                if lines[j:j + block_size] == block
            )
            if matches >= 4:
                return True

    # 2. Substring level: a 75/100/150-character window, sampled every half
    #    window, occurs at least four times (non-overlapping count).
    if len(text) >= 100:
        for chunk_size in [75, 100, 150]:
            if len(text) < chunk_size * 4:
                continue
            for i in range(0, len(text) - chunk_size, chunk_size // 2):
                if text.count(text[i:i + chunk_size]) >= 4:
                    return True

    return False
# --- Example usage ---
if __name__ == "__main__":
    # Set DEEPSEEK_API_KEY in the environment before running, e.g.
    #   export DEEPSEEK_API_KEY="your_key_here"
    conversation = [
        {"role": "system", "content": "You are a helpful, concise assistant."},
        {"role": "user", "content": "Say hello in an alien language:"},
    ]

    ret_dict = llm_stream(conversation, thinking=G_THINKING)

    # Collected text, one titled section per field.
    for title, field in (("Reasoning", "reasoning"), ("Content", "content")):
        print(f"\n--- {title} ---\n")
        print(ret_dict[field])

    # Token counts and timing.
    print("\n--- Token Counts and Timing Info ---\n")
    usage_info = ret_dict['usage']
    for label, key in (
        ("Reasoning", "reasoning_tokens"),
        ("Content", "content_tokens"),
        ("Total", "total_tokens"),
    ):
        print(f"Estimated {label} Tokens: {usage_info.get(key, 'N/A')}")
    print(f"Total Time: {ret_dict.get('time_taken', 'N/A'):.2f} seconds")

    # Throughput only makes sense when some time was measured.
    if ret_dict.get('time_taken', 0) > 0:
        tokens_per_second = usage_info.get('total_tokens', 0) / ret_dict['time_taken']
        print(f"Average Speed: {tokens_per_second:.2f} tokens/second")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment