Spaces:

nazdridoy
/

inferoxy-hub

Running

App Files Files Community

inferoxy-hub / chat_handler.py

nazdridoy

feat(handlers): pass Hugging Face profile to handlers

52fc803 verified 13 days ago

raw

history blame contribute delete

11 kB

	"""
	Chat functionality handler for AI-Inferoxy AI Hub.
	Handles chat completion requests with streaming responses.
	"""

	import os
	import gradio as gr
	import time
	import threading
	from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
	from huggingface_hub import InferenceClient
	from huggingface_hub.errors import HfHubHTTPError
	from requests.exceptions import ConnectionError, Timeout, RequestException
	from hf_token_utils import get_proxy_token, report_token_status
	from utils import (
	validate_proxy_key,
	format_error_message,
	render_with_reasoning_toggle
	)

	# Timeout configuration for inference requests
	INFERENCE_TIMEOUT = 120 # 2 minutes max for inference


	def chat_respond(
	message,
	history: list[dict[str, str]],
	system_message,
	model_name,
	provider_override,
	max_tokens,
	temperature,
	top_p,
	client_name: str \| None = None,
	):
	"""
	Chat completion function using AI-Inferoxy token management.
	"""
	# Validate proxy API key
	is_valid, error_msg = validate_proxy_key()
	if not is_valid:
	yield error_msg
	return

	proxy_api_key = os.getenv("PROXY_KEY")

	token_id = None
	try:
	# Get token from AI-Inferoxy proxy server with timeout handling
	print(f"🔑 Chat: Requesting token from proxy...")
	token, token_id = get_proxy_token(api_key=proxy_api_key)
	print(f"✅ Chat: Got token: {token_id}")

	# Enforce explicit provider selection via dropdown
	model = model_name
	provider = provider_override or "auto"

	print(f"🤖 Chat: Using model='{model}', provider='{provider if provider else 'auto'}'")

	# Prepare messages first
	messages = [{"role": "system", "content": system_message}]
	messages.extend(history)
	messages.append({"role": "user", "content": message})

	print(f"💬 Chat: Prepared {len(messages)} messages, creating client...")

	# Create client with provider (auto if none specified) and always pass model
	client = InferenceClient(
	provider=provider if provider else "auto",
	api_key=token
	)

	print(f"🚀 Chat: Client created, starting inference with timeout...")

	chat_completion_kwargs = {
	"model": model,
	"messages": messages,
	"max_tokens": max_tokens,
	"stream": True,
	"temperature": temperature,
	"top_p": top_p,
	}

	response = ""

	print(f"📡 Chat: Making streaming request with {INFERENCE_TIMEOUT}s timeout...")

	# Create streaming function for timeout handling
	def create_stream():
	return client.chat_completion(**chat_completion_kwargs)

	# Execute with timeout using ThreadPoolExecutor
	with ThreadPoolExecutor(max_workers=1) as executor:
	future = executor.submit(create_stream)

	try:
	# Get the stream with timeout
	stream = future.result(timeout=INFERENCE_TIMEOUT)
	print(f"🔄 Chat: Got stream, starting to iterate...")

	# Track streaming time to detect hangs
	last_token_time = time.time()
	token_timeout = 30 # 30 seconds between tokens

	for message in stream:
	current_time = time.time()

	# Check if we've been waiting too long for a token
	if current_time - last_token_time > token_timeout:
	raise TimeoutError(f"No response received for {token_timeout} seconds during streaming")

	choices = message.choices
	token_content = ""
	if len(choices) and choices[0].delta.content:
	token_content = choices[0].delta.content
	last_token_time = current_time # Reset timer when we get content

	response += token_content
	yield response

	except FutureTimeoutError:
	future.cancel() # Cancel the running task
	raise TimeoutError(f"Chat request timed out after {INFERENCE_TIMEOUT} seconds")

	# Report successful token usage
	if token_id:
	report_token_status(token_id, "success", api_key=proxy_api_key, client_name=client_name)

	except ConnectionError as e:
	# Handle proxy connection errors
	error_msg = f"Cannot connect to AI-Inferoxy server: {str(e)}"
	print(f"🔌 Chat connection error: {error_msg}")
	if token_id:
	report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
	yield format_error_message("Connection Error", "Unable to connect to the proxy server. Please check if it's running.")

	except TimeoutError as e:
	# Handle timeout errors
	error_msg = f"Request timed out: {str(e)}"
	print(f"⏰ Chat timeout: {error_msg}")
	if token_id:
	report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
	yield format_error_message("Timeout Error", "The request took too long. The server may be overloaded. Please try again.")

	except HfHubHTTPError as e:
	# Handle HuggingFace API errors
	error_msg = str(e)
	print(f"🤗 Chat HF error: {error_msg}")
	if token_id:
	report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)

	# Provide more user-friendly error messages
	if "401" in error_msg:
	yield format_error_message("Authentication Error", "Invalid or expired API token. The proxy will provide a new token on retry.")
	elif "402" in error_msg:
	yield format_error_message("Quota Exceeded", "API quota exceeded. The proxy will try alternative providers.")
	elif "429" in error_msg:
	yield format_error_message("Rate Limited", "Too many requests. Please wait a moment and try again.")
	else:
	yield format_error_message("HuggingFace API Error", error_msg)

	except Exception as e:
	# Handle all other errors
	error_msg = str(e)
	print(f"❌ Chat unexpected error: {error_msg}")
	if token_id:
	report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
	yield format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")


	def handle_chat_submit(message, history, system_msg, model_name, provider, max_tokens, temperature, top_p, hf_token: gr.OAuthToken = None, hf_profile: gr.OAuthProfile = None):
	"""
	Handle chat submission and manage conversation history with streaming.
	"""
	if not message.strip():
	yield history, ""
	return

	# Require sign-in: if no token present, prompt login
	access_token = getattr(hf_token, "token", None) if hf_token is not None else None
	username = getattr(hf_profile, "username", None) if hf_profile is not None else None
	if not access_token:
	assistant_response = format_error_message("Access Required", "Please sign in with Hugging Face (sidebar Login button).")
	current_history = history + [{"role": "assistant", "content": assistant_response}]
	yield current_history, ""
	return

	# Add user message to history
	history = history + [{"role": "user", "content": message}]

	# Generate response with streaming
	response_generator = chat_respond(
	message,
	history[:-1], # Don't include the current message in history for the function
	system_msg,
	model_name,
	provider,
	max_tokens,
	temperature,
	top_p,
	client_name=username
	)

	# Stream the assistant response token by token
	assistant_response = ""
	for partial_response in response_generator:
	assistant_response = render_with_reasoning_toggle(partial_response, True)
	# Update history with the current partial response and yield it
	current_history = history + [{"role": "assistant", "content": assistant_response}]
	yield current_history, ""


	def handle_chat_retry(history, system_msg, model_name, provider, max_tokens, temperature, top_p, hf_token: gr.OAuthToken = None, hf_profile: gr.OAuthProfile = None, retry_data=None):
	"""
	Retry the assistant response for the selected message.
	Works with gr.Chatbot.retry() which provides retry_data.index for the message.
	"""
	# Require sign-in: if no token present, prompt login
	access_token = getattr(hf_token, "token", None) if hf_token is not None else None
	username = getattr(hf_profile, "username", None) if hf_profile is not None else None
	if not access_token:
	assistant_response = format_error_message("Access Required", "Please sign in with Hugging Face (sidebar Login button).")
	current_history = (history or []) + [{"role": "assistant", "content": assistant_response}]
	yield current_history
	return
	# Guard: empty history
	if not history:
	yield history
	return

	# Determine which assistant message index to retry
	retry_index = None
	try:
	retry_index = getattr(retry_data, "index", None)
	except Exception:
	retry_index = None

	if retry_index is None:
	# Fallback to last assistant message
	retry_index = len(history) - 1

	# Trim history up to the message being retried (exclude that assistant msg)
	trimmed_history = list(history[:retry_index])

	# Find the most recent user message before retry_index
	last_user_idx = None
	for idx in range(retry_index - 1, -1, -1):
	if trimmed_history[idx].get("role") == "user":
	last_user_idx = idx
	break

	# Nothing to retry if no prior user message
	if last_user_idx is None:
	yield history
	return

	# Message to retry and prior conversation context (before that user msg)
	message = trimmed_history[last_user_idx].get("content", "")
	prior_history = trimmed_history[:last_user_idx]

	if not message.strip():
	yield history
	return

	# Stream a new assistant response
	response_generator = chat_respond(
	message,
	prior_history,
	system_msg,
	model_name,
	provider,
	max_tokens,
	temperature,
	top_p,
	client_name=username
	)

	assistant_response = ""
	for partial_response in response_generator:
	assistant_response = render_with_reasoning_toggle(partial_response, True)
	current_history = trimmed_history + [{"role": "assistant", "content": assistant_response}]
	yield current_history