"""OpenAI-style chat completion endpoint plus a Gradio chat UI.

Both front ends forward the conversation to the Hugging Face Inference API
for the model named in ``MODEL_ID``.
"""

import asyncio
import datetime
import json
import os

import gradio as gr
import requests
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

# Initialize FastAPI
app = FastAPI()

# Configuration
MODEL_ID = "Qwen/Qwen2.5-Coder-32B"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
headers = {
    "Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}",
    "Content-Type": "application/json"
}
# Upper bound for one upstream call so a stalled connection cannot hang a
# request (or a worker thread) forever.
REQUEST_TIMEOUT_SECONDS = 120


def format_chat_response(response_text, prompt_tokens=0, completion_tokens=0):
    """Wrap generated text in a ``chat.completion``-shaped dictionary.

    Args:
        response_text: Assistant reply placed in ``choices[0].message.content``.
        prompt_tokens: Value reported as ``usage.prompt_tokens``.
        completion_tokens: Value reported as ``usage.completion_tokens``.

    Returns:
        A dict with ``id``, ``object``, ``created``, ``model``, ``choices``
        and ``usage`` keys.
    """
    # Read the clock once so "id" and "created" always describe the same instant.
    now = datetime.datetime.now()
    return {
        "id": f"chatcmpl-{now.strftime('%Y%m%d%H%M%S')}",
        "object": "chat.completion",
        "created": int(now.timestamp()),
        "model": MODEL_ID,
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": response_text
            },
            "finish_reason": "stop"
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens
        }
    }


def _build_payload(messages, max_new_tokens=2048, temperature=0.7, top_p=0.95):
    """Build the JSON body sent to the inference endpoint."""
    # NOTE(review): the request shape ("inputs" holding a "messages" list) is
    # kept exactly as before; confirm it matches what the endpoint expects.
    return {
        "inputs": {
            "messages": messages
        },
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "do_sample": True
        }
    }


def _post_inference(payload):
    """POST ``payload`` to ``API_URL`` (blocking) and return the decoded JSON."""
    response = requests.post(
        API_URL,
        headers=headers,
        json=payload,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    return response.json()


def _extract_generated_text(result):
    """Return ``result[0]["generated_text"]``.

    Raises:
        ValueError: If ``result`` does not have that shape. The message
            includes the offending value so the failure is diagnosable.
    """
    try:
        return result[0]["generated_text"]
    except (KeyError, IndexError, TypeError) as exc:
        raise ValueError(
            f"Unexpected response from inference API: {result!r}"
        ) from exc


async def query_model(payload):
    """Send ``payload`` to the inference API without blocking the event loop.

    ``requests`` is synchronous, so the call runs in the default executor
    instead of directly inside the coroutine.

    Returns:
        The decoded JSON body of the upstream response.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _post_inference, payload)


@app.post("/v1/chat/completions")
async def chat_completion(request: Request):
    """Handle ``POST /v1/chat/completions``.

    Reads ``messages``, ``max_tokens``, ``temperature`` and ``top_p`` from the
    JSON request body, forwards them upstream, and returns the reply formatted
    by ``format_chat_response``. Any failure is reported as HTTP 500 with an
    ``{"error": ...}`` body.
    """
    try:
        data = await request.json()
        payload = _build_payload(
            data.get("messages", []),
            max_new_tokens=data.get("max_tokens", 2048),
            temperature=data.get("temperature", 0.7),
            top_p=data.get("top_p", 0.95),
        )

        response = await query_model(payload)

        # The upstream API reports failures as a dict carrying an "error" key.
        if isinstance(response, dict) and "error" in response:
            return JSONResponse(
                status_code=500,
                content={"error": response["error"]}
            )

        response_text = _extract_generated_text(response)
        return JSONResponse(
            content=format_chat_response(response_text)
        )
    except Exception as e:
        # Top-level boundary: convert any failure into a JSON error response.
        return JSONResponse(
            status_code=500,
            content={"error": str(e)}
        )


def generate_response(messages):
    """Query the model synchronously with the default sampling parameters.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.

    Returns:
        The generated text, or ``"Error: <detail>"`` when the upstream API
        answers with an error object.

    Raises:
        ValueError: If the upstream response has an unexpected shape.
    """
    result = _post_inference(_build_payload(messages))

    if isinstance(result, dict) and "error" in result:
        return f"Error: {result['error']}"

    return _extract_generated_text(result)


def chat_interface(message, chat_history):
    """Produce the assistant reply for the Gradio chat UI.

    ``gr.ChatInterface`` expects ``fn`` to return the reply text and maintains
    the history itself, so this function returns a string and leaves
    ``chat_history`` untouched.

    Args:
        message: The text the user just submitted.
        chat_history: Earlier turns as ``(user_text, assistant_text)`` pairs.

    Returns:
        The assistant reply; an empty string for a blank message; or
        ``"Error: <detail>"`` if the request fails.
    """
    if not message.strip():
        return ""

    try:
        # Format the message history in the OpenAI style
        messages = []
        for turn in chat_history:
            messages.append({"role": "user", "content": turn[0]})
            if turn[1] is not None:
                messages.append({"role": "assistant", "content": turn[1]})

        # Add the current message
        messages.append({"role": "user", "content": message})

        return generate_response(messages)
    except Exception as e:
        # Show the failure in the chat window instead of crashing the UI.
        return f"Error: {str(e)}"


# Create Gradio interface
# NOTE(review): retry_btn / undo_btn / clear_btn appear to be accepted only by
# some Gradio major versions -- confirm against the pinned Gradio release.
demo = gr.ChatInterface(
    fn=chat_interface,
    title="Qwen2.5-Coder-32B Chat",
    description="Chat with Qwen2.5-Coder-32B model via Hugging Face Inference API",
    examples=["Hello! Can you help me with coding?",
              "Write a simple Python function to calculate factorial"],
    retry_btn="Retry",
    undo_btn="Undo last message",
    clear_btn="Clear conversation",
)

# Mount both FastAPI and Gradio
app = gr.mount_gradio_app(app, demo, path="/")

# For running with uvicorn directly
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)