import logging
from typing import Generator, List, Optional

from openai import OpenAI

logging.basicConfig(level=logging.INFO)

def request_generation(
    api_key: str,
    api_base: str,
    message: str,
    system_prompt: str,
    model_name: str,
    chat_history: Optional[List[dict]] = None,
    temperature: float = 0.3,
    frequency_penalty: float = 0.0,
    presence_penalty: float = 0.0,
    max_new_tokens: int = 1024,
    tools: Optional[List[dict]] = None,
    tool_choice: Optional[str] = None,
) -> Generator[str, None, None]:
    """
    Send a streaming chat request to an OpenAI-compatible backend using the
    official OpenAI client, yielding text incrementally.

    Output is buffered and flushed on newline boundaries (or roughly every
    150 characters) so multi-line constructs such as LaTeX are less likely
    to be split mid-expression when rendered downstream.
    """
    client = OpenAI(api_key=api_key, base_url=api_base)

    # Assemble the conversation: system prompt first, then any prior turns,
    # then the new user message.
    messages = [{"role": "system", "content": system_prompt}]
    if chat_history:
        messages.extend(chat_history)
    messages.append({"role": "user", "content": message})

    request_args = {
        "model": model_name,
        "messages": messages,
        "temperature": temperature,
        "frequency_penalty": frequency_penalty,
        "presence_penalty": presence_penalty,
        "max_tokens": max_new_tokens,
        "stream": True,
    }

    # Tool definitions and tool choice are optional; include them only
    # when provided.
    if tools:
        request_args["tools"] = tools
    if tool_choice:
        request_args["tool_choice"] = tool_choice

    logging.info(f"[Gateway] Request to {api_base} | Model: {model_name}")
    try:
        stream = client.chat.completions.create(**request_args)
        collected = ""  # full response accumulated across chunks
        buffer = ""     # pending text not yet yielded
        for chunk in stream:
            # Some OpenAI-compatible backends emit keep-alive or usage-only
            # chunks with an empty choices list; skip them.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content or ""
            collected += delta
            buffer += delta
            # Flush on newlines, or once the buffer grows past ~150 chars,
            # to keep LaTeX fragments intact.
            if "\n" in buffer or len(buffer) > 150:
                yield buffer
                buffer = ""
        if buffer:
            yield buffer
    except Exception as e:
        logging.exception("[Gateway] Streaming failed")
        yield f"Error: {e}"