import asyncio
import datetime
import json
import os
import uuid

import gradio as gr
import requests
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

# Initialize the FastAPI application; the REST route defined below is
# registered on it and the Gradio UI is mounted onto it at the end of the file.
app = FastAPI()

# Configuration
API_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B"

# Only send an Authorization header when a token is actually configured.
# Formatting a missing token into the f-string would otherwise produce the
# literal (and invalid) header value "Bearer None".
_hf_api_token = os.getenv("HF_API_TOKEN")
headers = {"Content-Type": "application/json"}
if _hf_api_token:
    headers["Authorization"] = f"Bearer {_hf_api_token}"

def format_chat_response(response_text, prompt_tokens=0, completion_tokens=0):
    """Wrap generated text in an OpenAI-style ``chat.completion`` payload.

    Args:
        response_text: The assistant's reply to place in the single choice.
        prompt_tokens: Token count reported for the prompt.
        completion_tokens: Token count reported for the completion.

    Returns:
        A dict with ``id``, ``object``, ``created``, ``model``, ``choices``
        and ``usage`` keys. ``usage.total_tokens`` is the sum of the two
        counts passed in.
    """
    # A second-resolution timestamp is not unique: two completions produced
    # within the same second would share an id. Use a random UUID instead.
    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
    created = int(datetime.datetime.now().timestamp())

    return {
        "id": completion_id,
        "object": "chat.completion",
        "created": created,
        "model": "Qwen/Qwen2.5-Coder-32B",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": response_text,
            },
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }

async def query_model(payload, timeout=120):
    """POST *payload* to the inference endpoint and return the decoded JSON.

    ``requests`` is a blocking client. Calling it directly inside a coroutine
    would stall the event loop (and every other request being served) for
    the full duration of the inference call, so the request is executed in
    the loop's default thread-pool executor instead.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait on the upstream server before ``requests``
            raises a timeout error, so a stalled upstream cannot hang the
            request forever.

    Returns:
        The parsed JSON body of the upstream response.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        ValueError: If the upstream body is not valid JSON.
    """
    def _post():
        return requests.post(API_URL, headers=headers, json=payload, timeout=timeout)

    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(None, _post)
    return response.json()

@app.post("/v1/chat/completions")
async def chat_completion(request: Request):
    """Serve an OpenAI-style chat completion by proxying to the model endpoint.

    Reads ``messages`` plus the optional ``max_tokens``, ``temperature`` and
    ``top_p`` fields from the JSON body, forwards them through
    ``query_model`` and wraps the generated text with
    ``format_chat_response``.

    Returns:
        JSONResponse:
            * 200 with the chat completion payload on success.
            * 400 when the body is not a JSON object or ``messages`` is not a
              non-empty list.
            * 502 when the upstream reply does not contain generated text.
            * 500 with the error message when the upstream reports an error
              or any other failure occurs.
    """
    try:
        # Client mistakes are reported as 4xx instead of being folded into
        # the generic 500 handler at the bottom.
        try:
            data = await request.json()
        except ValueError:
            return JSONResponse(
                status_code=400,
                content={"error": "Request body must be valid JSON"}
            )

        if not isinstance(data, dict):
            return JSONResponse(
                status_code=400,
                content={"error": "Request body must be a JSON object"}
            )

        messages = data.get("messages")
        if not isinstance(messages, list) or not messages:
            return JSONResponse(
                status_code=400,
                content={"error": "'messages' must be a non-empty list"}
            )

        payload = {
            "inputs": {
                "messages": messages
            },
            "parameters": {
                "max_new_tokens": data.get("max_tokens", 2048),
                "temperature": data.get("temperature", 0.7),
                "top_p": data.get("top_p", 0.95),
                "do_sample": True
            }
        }

        response = await query_model(payload)

        # The upstream signals failures as a JSON object with an "error" key.
        if isinstance(response, dict) and "error" in response:
            return JSONResponse(
                status_code=500,
                content={"error": response["error"]}
            )

        # Anything other than a non-empty list of {"generated_text": ...}
        # is an upstream problem; say so explicitly rather than leaking a
        # bare KeyError/IndexError message such as "0".
        try:
            response_text = response[0]["generated_text"]
        except (IndexError, KeyError, TypeError):
            return JSONResponse(
                status_code=502,
                content={"error": "Unexpected response format from inference API"}
            )

        return JSONResponse(
            content=format_chat_response(response_text)
        )
    except Exception as e:
        # Last-resort boundary so the endpoint always answers with JSON.
        return JSONResponse(
            status_code=500,
            content={"error": str(e)}
        )

def generate_response(messages, timeout=120):
    """Send a chat history to the inference endpoint and return the reply text.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        timeout: Seconds to wait on the upstream server. Without a timeout
            ``requests`` waits indefinitely, so a stalled upstream would hang
            the calling worker forever.

    Returns:
        The ``generated_text`` of the first result, or a string starting with
        ``"Error: "`` when the upstream reply is an object with an ``error``
        key.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        ValueError: If the upstream body is not valid JSON.
        IndexError, KeyError, TypeError: If the decoded reply is not a
            non-empty list whose first item has ``generated_text``.
    """
    payload = {
        "inputs": {
            "messages": messages
        },
        "parameters": {
            "max_new_tokens": 2048,
            "temperature": 0.7,
            "top_p": 0.95,
            "do_sample": True
        }
    }

    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    result = response.json()

    if isinstance(result, dict) and "error" in result:
        return f"Error: {result['error']}"

    return result[0]["generated_text"]

def chat_interface(message, chat_history):
    """Produce the assistant's reply for one ``gr.ChatInterface`` turn.

    ``gr.ChatInterface`` calls its ``fn`` with the new user message and the
    prior history, and expects the reply *text* back; it keeps the history
    up to date itself. Returning the history list (or appending to it here)
    therefore corrupts what the UI displays, so this function returns only
    the reply string and leaves ``chat_history`` untouched.

    Args:
        message: The user's new message.
        chat_history: Earlier turns as ``(user, assistant)`` pairs; either
            element may be ``None``. Not modified.

    Returns:
        The assistant's reply, a short prompt when ``message`` is blank, or a
        string starting with ``"Error: "`` if generating the reply failed.
    """
    if not message.strip():
        return "Please enter a message."

    try:
        # Rebuild the conversation in the OpenAI role/content style.
        messages = []
        for turn in chat_history:
            if turn[0] is not None:
                messages.append({"role": "user", "content": turn[0]})
            if turn[1] is not None:
                messages.append({"role": "assistant", "content": turn[1]})

        # Add the current message
        messages.append({"role": "user", "content": message})

        return generate_response(messages)
    except Exception as e:
        # Show the failure in the chat window instead of crashing the UI.
        return f"Error: {str(e)}"

# Gradio chat UI; every user turn is routed through chat_interface.
demo = gr.ChatInterface(
    fn=chat_interface,
    examples=[
        "Hello! Can you help me with coding?",
        "Write a simple Python function to calculate factorial",
    ],
    description="Chat with Qwen2.5-Coder-32B model via Hugging Face Inference API",
    title="Qwen2.5-Coder-32B Chat",
    clear_btn="Clear conversation",
    undo_btn="Undo last message",
    retry_btn="Retry",
)

# Serve the Gradio UI at the root path of the same FastAPI application, so a
# single server process exposes both the UI and the REST route.
app = gr.mount_gradio_app(app, demo, path="/")

# Allow starting the server by executing this file directly.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, port=7860, host="0.0.0.0")