import gradio as gr
import datetime
import requests
import os
import asyncio
import logging
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse

# Initialize FastAPI
app = FastAPI()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
API_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B"
headers = {
    "Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}",
    "Content-Type": "application/json"
}


def format_chat_response(response_text, prompt_tokens=0, completion_tokens=0):
    """Wrap raw model output in an OpenAI-style chat.completion envelope."""
    return {
        "id": f"chatcmpl-{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}",
        "object": "chat.completion",
        "created": int(datetime.datetime.now().timestamp()),
        "model": "Qwen/Qwen2.5-Coder-32B",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": response_text
            },
            "finish_reason": "stop"
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens
        }
    }


async def query_model(payload):
    """POST the payload to the inference API without blocking the event loop."""
    try:
        # requests is synchronous, so run it in a worker thread.
        response = await asyncio.to_thread(
            requests.post, API_URL, headers=headers, json=payload
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Request failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/status")
async def status():
    try:
        response_text = "it's working"
        return JSONResponse(content=format_chat_response(response_text))
    except Exception as e:
        logger.error(f"Status check failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/v1/chat/completions")
async def chat_completion(request: Request):
    try:
        data = await request.json()
        messages = data.get("messages", [])
        if not messages:
            raise HTTPException(status_code=400, detail="Messages are required")

        # NOTE: assumes the inference endpoint accepts chat-style
        # {"messages": [...]} inputs for this model.
        payload = {
            "inputs": {
                "messages": messages
            },
            "parameters": {
                "max_new_tokens": data.get("max_tokens", 2048),
                "temperature": data.get("temperature", 0.7),
                "top_p": data.get("top_p", 0.95),
                "do_sample": True
            }
        }

        response = await query_model(payload)
        if isinstance(response, dict) and "error" in response:
            raise HTTPException(status_code=500, detail=response["error"])

        response_text = response[0]["generated_text"]
        return JSONResponse(content=format_chat_response(response_text))
    except HTTPException as e:
        logger.error(f"Chat completion failed: {e.detail}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


def generate_response(messages):
    """Synchronous helper used by the Gradio UI."""
    payload = {
        "inputs": {
            "messages": messages
        },
        "parameters": {
            "max_new_tokens": 2048,
            "temperature": 0.7,
            "top_p": 0.95,
            "do_sample": True
        }
    }
    try:
        response = requests.post(API_URL, headers=headers, json=payload)
        response.raise_for_status()
        result = response.json()
        if isinstance(result, dict) and "error" in result:
            return f"Error: {result['error']}"
        return result[0]["generated_text"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Request failed: {e}")
        return f"Error: {e}"


def chat_interface(message, history):
    """Callback for gr.ChatInterface(type="messages"): receives the latest user
    message plus the prior history, and returns the assistant reply as a string."""
    try:
        return generate_response(history + [{"role": "user", "content": message}])
    except Exception as e:
        return f"Error: {e}"


# Create Gradio interface
def gradio_app():
    return gr.ChatInterface(chat_interface, type="messages")


# Mount Gradio on top of the FastAPI app
app = gr.mount_gradio_app(app, gradio_app(), path="/")

# For running with uvicorn directly
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)