|
|
|
|
|
|
|
|
|
|
|
import json
import logging
import os
import random
from datetime import datetime, timedelta

import gradio as gr
import httpx

from src.assets.css.reasoning import styles
|
|
|
|
|
# Host configurations parsed from the JSON text in the "auth" environment
# variable. The functions in this file read the keys "jarvis", "token",
# "model", "endpoint" and "error" from each entry.
_auth_json = os.getenv("auth")
if not _auth_json:
    # os.getenv returns None for an unset variable, and json.loads(None) would
    # abort the import with a TypeError. Start with no hosts instead and say so.
    logging.getLogger(__name__).warning(
        'Environment variable "auth" is not set; no hosts are configured.'
    )
auth = json.loads(_auth_json or "[]")

# Maps a host's "jarvis" value to the datetime until which that host is skipped.
busy = {}
|
|
|
def server():
    """
    Remove expired entries from the module-level 'busy' dictionary.

    Every entry whose expiry time is at or before the current local time is
    dropped, so the corresponding host is no longer treated as busy.
    Entries that have not expired yet are left untouched.

    Returns:
    - None. The 'busy' dictionary is modified in place.
    """
    now = datetime.now()

    # Work from a snapshot so the dictionary can shrink while we go through it.
    expired = [session for session, expiry in list(busy.items()) if expiry <= now]

    for session in expired:
        # pop() with a default rather than `del`: the chat interface in this
        # file is launched with concurrency_limit=2, so another request may
        # already have removed the same entry (assumes handlers can run in
        # parallel - confirm against the Gradio queue settings). A missing key
        # must not abort the request.
        busy.pop(session, None)
|
|
|
def setup(user_message, history): |
|
""" |
|
Append the current user message to the conversation history. |
|
|
|
Parameters: |
|
- user_message: The new message entered by the user. |
|
- history: A list of dictionaries containing previous conversation turns. |
|
|
|
Returns: |
|
- A new message list with the latest user message added. |
|
""" |
|
messages = history.copy() |
|
messages.append({"role": "user", "content": user_message}) |
|
return messages |
|
|
|
def connection(host, messages):
    """
    Stream a chat completion from one host and yield the reply as it grows.

    The conversation is POSTed to host["endpoint"] through an httpx client
    created with http2=True and with streaming enabled in the payload. Each
    non-empty response line is parsed as a JSON chunk. Reasoning text and
    answer text are accumulated separately, and every yield carries the full
    text received so far rather than only the newest fragment.

    Parameters:
    - host: A dictionary providing the "token", "model" and "endpoint" keys.
    - messages: The conversation to send, as a list of message dictionaries.

    Yields:
    - While reasoning text arrives: styles(reasoning=..., expanded=True).
    - While answer text arrives: the accumulated answer, prefixed with
      styles(reasoning=..., expanded=False) if any reasoning was received.

    Raises:
    - httpx.HTTPStatusError: Raised by raise_for_status() when the host
      responds with an error status.
    - KeyError / IndexError: If a parsed chunk has no choices[0]["delta"].
    """
    headers = {
        "Authorization": f"Bearer {host['token']}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": host["model"],
        "messages": messages,
        "stream": True,
        "temperature": 0.7,
        "top_p": 0.8,
        "top_k": 20,
        "repetition_penalty": 1.05,
    }

    # Running totals; each yield re-sends the whole text accumulated so far.
    reasoning = ""
    content = ""

    with httpx.Client(http2=True) as client:
        with client.stream("POST", host["endpoint"], headers=headers, json=payload) as resp:
            # Fail before reading the body so the caller can inspect the status.
            resp.raise_for_status()

            for line in resp.iter_lines():
                if not line:
                    continue

                # Server-sent events permit "data:" with or without one space
                # after the colon. Strip only the field name; json.loads
                # ignores leading whitespace, so both forms parse.
                raw = line[5:] if line.startswith("data:") else line
                try:
                    data = json.loads(raw)
                except json.JSONDecodeError:
                    # Lines that are not valid JSON carry no delta; skip them.
                    continue

                choice = data["choices"][0]
                delta = choice["delta"]

                reasoning_response = delta.get("reasoning")
                if reasoning_response:
                    reasoning += reasoning_response
                    yield styles(reasoning=reasoning, expanded=True)

                content_response = delta.get("content")
                if content_response:
                    content += content_response
                    if reasoning:
                        # Keep the reasoning visible, collapsed, above the answer.
                        yield styles(reasoning=reasoning, expanded=False) + content
                    else:
                        yield content

                # Stop reading as soon as the host reports a normal finish.
                if choice.get("finish_reason") == "stop":
                    return
|
|
|
def request(user_message, history):
    """
    Chat handler: forward the user's message to one of the configured hosts.

    Empty or whitespace-only input produces a single empty list and nothing
    else. Otherwise expired busy marks are cleared, the message list is built,
    and the hosts that are not marked busy are tried in random order until one
    of them streams a reply to completion.

    Parameters:
    - user_message: The text the user has just entered.
    - history: The earlier conversation turns.

    Yields:
    - Everything yielded by connection() for the host being tried, or a
      fixed busy notice once every candidate host has failed (or none exist).
    """
    # De Morgan form of "missing, empty, or only whitespace".
    if not (user_message and user_message.strip()):
        yield []
        return

    # Drop busy marks that have run out before choosing a host.
    server()

    conversation = setup(user_message, history)

    candidates = [entry for entry in auth if entry["jarvis"] not in busy]
    random.shuffle(candidates)

    for candidate in candidates:
        try:
            yield from connection(candidate, conversation)
        except httpx.HTTPStatusError as failure:
            # Only the status code configured under the host's "error" key
            # puts the host on the busy list, for one hour. Any other status
            # simply moves on to the next candidate.
            if failure.response.status_code == candidate.get("error"):
                busy[candidate["jarvis"]] = datetime.now() + timedelta(hours=1)
        except Exception:
            # Any other failure: try the next candidate.
            continue
        else:
            # The stream finished without an error; the reply is complete.
            return

    yield "The server is currently busy. Please wait a moment or try again later."
|
|
|
|
|
# Chat display component: message-dictionary format, with the copy button
# option turned on.
chat_window = gr.Chatbot(
    type="messages",
    show_copy_button=True,
    scale=1,
)

# Example prompts offered in the interface; one single-item list per example.
starter_prompts = [
    ["Please introduce yourself."],
    ["Give me a short introduction to large language model."],
    ["Please generate a highly complex code snippet on any topic."],
    ["Explain about quantum computers."],
]

# Build the chat interface around request() and start it right away.
interface = gr.ChatInterface(
    fn=request,
    type="messages",
    chatbot=chat_window,
    examples=starter_prompts,
    cache_examples=False,
    concurrency_limit=2,
)
interface.launch()