"""
Chat functionality handler for AI-Inferoxy AI Hub.
Handles chat completion requests with streaming responses.
"""
import os
import gradio as gr
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError
from requests.exceptions import ConnectionError, Timeout, RequestException
from hf_token_utils import get_proxy_token, report_token_status
from utils import (
validate_proxy_key,
format_error_message,
render_with_reasoning_toggle
)
# Timeout configuration for inference requests
INFERENCE_TIMEOUT = 120 # 2 minutes max for inference
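# Note: this bounds how long chat_respond() waits to obtain the response stream;
# a separate, shorter per-token timeout inside the streaming loop detects mid-stream stalls.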
def chat_respond(
message,
history: list[dict[str, str]],
system_message,
model_name,
provider_override,
max_tokens,
temperature,
top_p,
client_name: str | None = None,
):
"""
Chat completion function using AI-Inferoxy token management.
"""
# Validate proxy API key
is_valid, error_msg = validate_proxy_key()
if not is_valid:
yield error_msg
return
proxy_api_key = os.getenv("PROXY_KEY")
token_id = None
try:
# Get token from AI-Inferoxy proxy server with timeout handling
print(f"🔑 Chat: Requesting token from proxy...")
token, token_id = get_proxy_token(api_key=proxy_api_key)
print(f"✅ Chat: Got token: {token_id}")
        # Use the model and provider selected in the UI; fall back to "auto" when no provider is chosen
        model = model_name
        provider = provider_override or "auto"
        print(f"🤖 Chat: Using model='{model}', provider='{provider}'")
# Prepare messages first
messages = [{"role": "system", "content": system_message}]
messages.extend(history)
messages.append({"role": "user", "content": message})
print(f"💬 Chat: Prepared {len(messages)} messages, creating client...")
        # Create client with the resolved provider; the model is passed per-request in chat_completion
        client = InferenceClient(
            provider=provider,
            api_key=token
        )
print(f"🚀 Chat: Client created, starting inference with timeout...")
chat_completion_kwargs = {
"model": model,
"messages": messages,
"max_tokens": max_tokens,
"stream": True,
"temperature": temperature,
"top_p": top_p,
}
response = ""
print(f"📡 Chat: Making streaming request with {INFERENCE_TIMEOUT}s timeout...")
# Create streaming function for timeout handling
def create_stream():
return client.chat_completion(**chat_completion_kwargs)
# Execute with timeout using ThreadPoolExecutor
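        # chat_completion() can block while the request is set up, so it runs in a
        # worker thread and future.result(timeout=...) caps that initial wait; once
        # the stream is obtained, gaps between chunks are bounded separately by
        # token_timeout inside the loop below.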
with ThreadPoolExecutor(max_workers=1) as executor:
future = executor.submit(create_stream)
try:
# Get the stream with timeout
stream = future.result(timeout=INFERENCE_TIMEOUT)
print(f"🔄 Chat: Got stream, starting to iterate...")
# Track streaming time to detect hangs
last_token_time = time.time()
token_timeout = 30 # 30 seconds between tokens
                for chunk in stream:
                    current_time = time.time()
                    # Check if we've been waiting too long for a token
                    if current_time - last_token_time > token_timeout:
                        raise TimeoutError(f"No response received for {token_timeout} seconds during streaming")
                    choices = chunk.choices
                    token_content = ""
                    if choices and choices[0].delta.content:
                        token_content = choices[0].delta.content
                        last_token_time = current_time  # Reset timer when we get content
                    response += token_content
                    yield response
            except FutureTimeoutError:
                future.cancel()  # Best-effort cancel; a call that is already running cannot be interrupted
                raise TimeoutError(f"Chat request timed out after {INFERENCE_TIMEOUT} seconds")
# Report successful token usage
if token_id:
report_token_status(token_id, "success", api_key=proxy_api_key, client_name=client_name)
except ConnectionError as e:
# Handle proxy connection errors
error_msg = f"Cannot connect to AI-Inferoxy server: {str(e)}"
print(f"🔌 Chat connection error: {error_msg}")
if token_id:
report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
yield format_error_message("Connection Error", "Unable to connect to the proxy server. Please check if it's running.")
except TimeoutError as e:
# Handle timeout errors
error_msg = f"Request timed out: {str(e)}"
print(f"⏰ Chat timeout: {error_msg}")
if token_id:
report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
yield format_error_message("Timeout Error", "The request took too long. The server may be overloaded. Please try again.")
except HfHubHTTPError as e:
# Handle HuggingFace API errors
error_msg = str(e)
print(f"🤗 Chat HF error: {error_msg}")
if token_id:
report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
# Provide more user-friendly error messages
if "401" in error_msg:
yield format_error_message("Authentication Error", "Invalid or expired API token. The proxy will provide a new token on retry.")
elif "402" in error_msg:
yield format_error_message("Quota Exceeded", "API quota exceeded. The proxy will try alternative providers.")
elif "429" in error_msg:
yield format_error_message("Rate Limited", "Too many requests. Please wait a moment and try again.")
else:
yield format_error_message("HuggingFace API Error", error_msg)
except Exception as e:
# Handle all other errors
error_msg = str(e)
print(f"❌ Chat unexpected error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
yield format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")
def handle_chat_submit(message, history, system_msg, model_name, provider, max_tokens, temperature, top_p, hf_token: gr.OAuthToken = None, hf_profile: gr.OAuthProfile = None):
"""
Handle chat submission and manage conversation history with streaming.
"""
if not message.strip():
yield history, ""
return
# Require sign-in: if no token present, prompt login
access_token = getattr(hf_token, "token", None) if hf_token is not None else None
username = getattr(hf_profile, "username", None) if hf_profile is not None else None
if not access_token:
assistant_response = format_error_message("Access Required", "Please sign in with Hugging Face (sidebar Login button).")
current_history = history + [{"role": "assistant", "content": assistant_response}]
yield current_history, ""
return
# Add user message to history
history = history + [{"role": "user", "content": message}]
# Generate response with streaming
response_generator = chat_respond(
message,
history[:-1], # Don't include the current message in history for the function
system_msg,
model_name,
provider,
max_tokens,
temperature,
top_p,
client_name=username
)
# Stream the assistant response token by token
assistant_response = ""
for partial_response in response_generator:
assistant_response = render_with_reasoning_toggle(partial_response, True)
# Update history with the current partial response and yield it
current_history = history + [{"role": "assistant", "content": assistant_response}]
yield current_history, ""
def handle_chat_retry(history, system_msg, model_name, provider, max_tokens, temperature, top_p, hf_token: gr.OAuthToken = None, hf_profile: gr.OAuthProfile = None, retry_data=None):
"""
Retry the assistant response for the selected message.
Works with gr.Chatbot.retry() which provides retry_data.index for the message.
"""
# Require sign-in: if no token present, prompt login
access_token = getattr(hf_token, "token", None) if hf_token is not None else None
username = getattr(hf_profile, "username", None) if hf_profile is not None else None
if not access_token:
assistant_response = format_error_message("Access Required", "Please sign in with Hugging Face (sidebar Login button).")
current_history = (history or []) + [{"role": "assistant", "content": assistant_response}]
yield current_history
return
# Guard: empty history
if not history:
yield history
return
# Determine which assistant message index to retry
retry_index = None
try:
retry_index = getattr(retry_data, "index", None)
except Exception:
retry_index = None
if retry_index is None:
# Fallback to last assistant message
retry_index = len(history) - 1
# Trim history up to the message being retried (exclude that assistant msg)
trimmed_history = list(history[:retry_index])
# Find the most recent user message before retry_index
last_user_idx = None
for idx in range(retry_index - 1, -1, -1):
if trimmed_history[idx].get("role") == "user":
last_user_idx = idx
break
# Nothing to retry if no prior user message
if last_user_idx is None:
yield history
return
# Message to retry and prior conversation context (before that user msg)
message = trimmed_history[last_user_idx].get("content", "")
prior_history = trimmed_history[:last_user_idx]
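    # Worked example: with history = [u0, a0, u1, a1] and retry_index = 3,
    # trimmed_history = [u0, a0, u1], the retried message is u1 (index 2),
    # and prior_history = [u0, a0] is sent as the conversation context.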
if not message.strip():
yield history
return
# Stream a new assistant response
response_generator = chat_respond(
message,
prior_history,
system_msg,
model_name,
provider,
max_tokens,
temperature,
top_p,
client_name=username
)
assistant_response = ""
for partial_response in response_generator:
assistant_response = render_with_reasoning_toggle(partial_response, True)
current_history = trimmed_history + [{"role": "assistant", "content": assistant_response}]
yield current_history
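# ---------------------------------------------------------------------------
# Minimal wiring sketch (illustrative only: the component names below, such as
# system_msg, model_name and provider, are assumptions; the real UI is defined
# in the app module, not here):
#
#     with gr.Blocks() as demo:
#         chatbot = gr.Chatbot(type="messages")
#         msg = gr.Textbox()
#         msg.submit(
#             handle_chat_submit,
#             inputs=[msg, chatbot, system_msg, model_name, provider,
#                     max_tokens, temperature, top_p],
#             outputs=[chatbot, msg],
#         )
#         chatbot.retry(
#             handle_chat_retry,
#             inputs=[chatbot, system_msg, model_name, provider,
#                     max_tokens, temperature, top_p],
#             outputs=[chatbot],
#         )
# ---------------------------------------------------------------------------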