from typing import Any, Dict, Optional

import os

import torch

# Disable xformers for CPU compatibility with Stella models
os.environ["XFORMERS_DISABLED"] = "1"

import gradio as gr
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from sentence_transformers import SentenceTransformer


# Device detection - use GPU if available, otherwise CPU
def get_device():
    if torch.cuda.is_available():
        print("🚀 GPU detected - using CUDA for acceleration")
        return "cuda"
    print("💻 Using CPU for inference")
    return "cpu"


DEVICE = get_device()

# Available models and whether each one requires trust_remote_code
MODELS = {
    "nomic-ai/nomic-embed-text-v1.5": {"trust_remote_code": True},
    "nomic-ai/nomic-embed-text-v1": {"trust_remote_code": True},
    "mixedbread-ai/mxbai-embed-large-v1": {"trust_remote_code": False},
    "BAAI/bge-m3": {"trust_remote_code": False},
    "sentence-transformers/all-MiniLM-L6-v2": {"trust_remote_code": False},
    "sentence-transformers/all-mpnet-base-v2": {"trust_remote_code": False},
    "Snowflake/snowflake-arctic-embed-m": {"trust_remote_code": False},
    "Snowflake/snowflake-arctic-embed-l": {"trust_remote_code": False},
    "Snowflake/snowflake-arctic-embed-m-long": {"trust_remote_code": True},
    "Snowflake/snowflake-arctic-embed-m-v2.0": {"trust_remote_code": False},
    "BAAI/bge-large-en-v1.5": {"trust_remote_code": False},
    "BAAI/bge-base-en-v1.5": {"trust_remote_code": False},
    "BAAI/bge-small-en-v1.5": {"trust_remote_code": False},
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": {"trust_remote_code": False},
    "ibm-granite/granite-embedding-30m-english": {"trust_remote_code": False},
    "ibm-granite/granite-embedding-278m-multilingual": {"trust_remote_code": False},
    "Qwen/Qwen3-Embedding-0.6B": {"trust_remote_code": False},
    "Qwen/Qwen3-Embedding-4B": {"trust_remote_code": False},
    "Qwen/Qwen3-Embedding-8B": {"trust_remote_code": False},
    "dunzhang/stella_en_400M_v5": {"trust_remote_code": True},
    "dunzhang/stella_en_1.5B_v5": {"trust_remote_code": True},
    "infgrad/stella-base-en-v2": {"trust_remote_code": True},
    "nvidia/NV-Embed-v2": {"trust_remote_code": True},
    "Alibaba-NLP/gte-Qwen2-7B-instruct": {"trust_remote_code": False},
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct": {"trust_remote_code": False},
    "intfloat/multilingual-e5-large-instruct": {"trust_remote_code": False},
    "intfloat/multilingual-e5-large": {"trust_remote_code": False},
    "BAAI/bge-en-icl": {"trust_remote_code": False},
}

# Model cache - keep only one model loaded at a time
current_model = None
current_model_name = "nomic-ai/nomic-embed-text-v1.5"  # initialize with the default model


def load_model(model_name: str):
    global current_model, current_model_name

    # If the requested model is already loaded, return it
    if current_model is not None and current_model_name == model_name:
        return current_model

    # Unload the previous model, if any, and release its memory before
    # loading a new one
    if current_model is not None:
        del current_model
        current_model = None
        if DEVICE == "cuda":
            torch.cuda.empty_cache()

    # Models outside the predefined table are always loaded with
    # trust_remote_code=False for security
    trust_remote_code = MODELS.get(model_name, {}).get("trust_remote_code", False)
    try:
        print(f"Loading model '{model_name}' on {DEVICE}")
        current_model = SentenceTransformer(
            model_name,
            trust_remote_code=trust_remote_code,
            device=DEVICE,
        )
        current_model_name = model_name
        print(f"✅ Model '{model_name}' loaded successfully on {DEVICE}")
    except Exception as e:
        raise ValueError(f"Failed to load model '{model_name}': {str(e)}") from e

    return current_model
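# A minimal sketch of the single-slot cache behavior above (illustrative
# comment only; the model names are arbitrary picks from MODELS):
#
#   m1 = load_model("BAAI/bge-small-en-v1.5")  # loads bge-small
#   m2 = load_model("BAAI/bge-small-en-v1.5")  # cache hit: m2 is m1, no reload
#   m3 = load_model("BAAI/bge-m3")             # evicts bge-small, loads bge-m3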
# Load the default model; this reference stays resident even when the cache
# slot above is swapped to another model
model = load_model(current_model_name)

# Create FastAPI app
fastapi_app = FastAPI()


def embed(document: str, model_name: Optional[str] = None):
    if model_name:
        try:
            selected_model = load_model(model_name)
            return selected_model.encode(document)
        except Exception as e:
            raise ValueError(f"Error with model '{model_name}': {str(e)}") from e
    return model.encode(document)


# FastAPI endpoints
@fastapi_app.post("/embed")
async def embed_text(data: Dict[str, Any]):
    """Direct API endpoint for text embedding that bypasses the Gradio queue."""
    try:
        text = data.get("text", "")
        model_name = data.get("model", current_model_name)

        if not text:
            return JSONResponse(
                status_code=400,
                content={"error": "No text provided"}
            )

        # Any model name is accepted, but only predefined models may use
        # trust_remote_code=True
        if model_name not in MODELS:
            trust_remote_code = False
        else:
            trust_remote_code = MODELS[model_name].get("trust_remote_code", False)

        # Generate embedding
        embedding = embed(text, model_name)

        return JSONResponse(
            content={
                "embedding": embedding.tolist(),
                "dim": len(embedding),
                "model": model_name,
                "trust_remote_code": trust_remote_code,
                "predefined": model_name in MODELS,
            }
        )
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"error": str(e)}
        )


@fastapi_app.get("/models")
async def list_models():
    """List available embedding models."""
    return JSONResponse(
        content={
            "models": list(MODELS.keys()),
            "default": current_model_name,
        }
    )


with gr.Blocks(title="Multi-Model Text Embeddings", css="""
    .json-holder {
        max-height: 400px !important;
        overflow-y: auto !important;
    }
    .json-holder .wrap {
        max-height: 400px !important;
        overflow-y: auto !important;
    }
""") as app:
    gr.Markdown("# Multi-Model Text Embeddings")
    gr.Markdown(
        "Generate embeddings for your text using 28+ state-of-the-art embedding models, "
        "including top MTEB performers such as NV-Embed-v2, gte-Qwen2-7B-instruct, Nomic, "
        "BGE, Snowflake, IBM Granite, Qwen3, Stella, and more."
    )
    gr.Markdown(f"**Device**: {DEVICE.upper()} {'🚀' if DEVICE == 'cuda' else '💻'}")

    # Model selector dropdown (also accepts custom model names)
    model_dropdown = gr.Dropdown(
        choices=list(MODELS.keys()),
        value=current_model_name,
        label="Select Embedding Model",
        info="Choose from predefined models or enter any Hugging Face model name",
        allow_custom_value=True,
    )

    # Input text box
    text_input = gr.Textbox(
        label="Enter text to embed",
        placeholder="Type or paste your text here...",
    )

    # Output component that displays the embedding
    output = gr.JSON(label="Text Embedding", elem_classes=["json-holder"])

    # Submit button exposed under a stable API name
    submit_btn = gr.Button("Generate Embedding", variant="primary")

    # Handle both the button click and Enter in the text box
    submit_btn.click(embed, inputs=[text_input, model_dropdown], outputs=output, api_name="predict")
    text_input.submit(embed, inputs=[text_input, model_dropdown], outputs=output)

    # API usage guide
    gr.Markdown("## API Usage")
    gr.Markdown("""
You can use this API in two ways: via the direct FastAPI endpoint or through Gradio clients.

**Security Note**: Only predefined models allow `trust_remote_code=True`. Any other Hugging Face model is loaded with `trust_remote_code=False` for security.

### List Available Models
```bash
curl https://ipepe-nomic-embeddings.hf.space/models
```
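Example response (abridged here for readability; the full list mirrors the predefined models table, and `default` is the currently loaded model):

```json
{
  "models": ["nomic-ai/nomic-embed-text-v1.5", "nomic-ai/nomic-embed-text-v1", "..."],
  "default": "nomic-ai/nomic-embed-text-v1.5"
}
```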
### Direct API Endpoint (No Queue!)
```bash
# Default model (nomic-ai/nomic-embed-text-v1.5)
curl -X POST https://ipepe-nomic-embeddings.hf.space/embed \\
  -H "Content-Type: application/json" \\
  -d '{"text": "Your text to embed goes here"}'

# With a predefined model (trust_remote_code allowed)
curl -X POST https://ipepe-nomic-embeddings.hf.space/embed \\
  -H "Content-Type: application/json" \\
  -d '{"text": "Your text to embed goes here", "model": "sentence-transformers/all-MiniLM-L6-v2"}'

# With any other Hugging Face model (loaded with trust_remote_code=False for security)
curl -X POST https://ipepe-nomic-embeddings.hf.space/embed \\
  -H "Content-Type: application/json" \\
  -d '{"text": "Your text to embed goes here", "model": "intfloat/e5-base-v2"}'
```

Response format:
```json
{
  "embedding": [0.123, -0.456, ...],
  "dim": 384,
  "model": "sentence-transformers/all-MiniLM-L6-v2",
  "trust_remote_code": false,
  "predefined": true
}
```

### Python Example (Direct API)
```python
import requests

# List available models
models = requests.get("https://ipepe-nomic-embeddings.hf.space/models").json()
print(models["models"])

# Generate an embedding with a specific model
response = requests.post(
    "https://ipepe-nomic-embeddings.hf.space/embed",
    json={
        "text": "Your text to embed goes here",
        "model": "BAAI/bge-small-en-v1.5"
    }
)
result = response.json()
embedding = result["embedding"]
```

### Python Example (Gradio Client)
```python
from gradio_client import Client

client = Client("ipepe/nomic-embeddings")
result = client.predict(
    "Your text to embed goes here",
    "nomic-ai/nomic-embed-text-v1.5",  # model selection
    api_name="/predict"
)
print(result)  # the embedding array
```

### Available Models
- `nomic-ai/nomic-embed-text-v1.5` (default) - High-performing open embedding model with a large token context
- `nomic-ai/nomic-embed-text-v1` - Previous version of the Nomic embedding model
- `mixedbread-ai/mxbai-embed-large-v1` - State-of-the-art large embedding model from mixedbread.ai
- `BAAI/bge-m3` - Multi-functional, multi-lingual, multi-granularity embedding model
- `sentence-transformers/all-MiniLM-L6-v2` - Fast, small embedding model for general use
- `sentence-transformers/all-mpnet-base-v2` - Balanced-performance embedding model
- `Snowflake/snowflake-arctic-embed-m` - Medium-sized Arctic embedding model
- `Snowflake/snowflake-arctic-embed-l` - Large Arctic embedding model
- `Snowflake/snowflake-arctic-embed-m-long` - Medium Arctic model optimized for long context
- `Snowflake/snowflake-arctic-embed-m-v2.0` - Latest Arctic embedding with multilingual support
- `BAAI/bge-large-en-v1.5` - Large BGE embedding model for English
- `BAAI/bge-base-en-v1.5` - Base BGE embedding model for English
- `BAAI/bge-small-en-v1.5` - Small BGE embedding model for English
- `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` - Multilingual paraphrase model
- `ibm-granite/granite-embedding-30m-english` - IBM Granite 30M English embedding model
- `ibm-granite/granite-embedding-278m-multilingual` - IBM Granite 278M multilingual embedding model
- `Qwen/Qwen3-Embedding-0.6B` - Compact Qwen3 embedding model
- `Qwen/Qwen3-Embedding-4B` - Mid-sized Qwen3 embedding model
- `Qwen/Qwen3-Embedding-8B` - Largest Qwen3 embedding model
- `dunzhang/stella_en_400M_v5` - Stella 400M English embedding model
- `dunzhang/stella_en_1.5B_v5` - Stella 1.5B English embedding model
- `infgrad/stella-base-en-v2` - Earlier base-sized Stella English model
- `nvidia/NV-Embed-v2` - NVIDIA's large generalist embedding model, a top MTEB performer
- `Alibaba-NLP/gte-Qwen2-7B-instruct` - Large GTE instruction-tuned embedding model
- `Alibaba-NLP/gte-Qwen2-1.5B-instruct` - Smaller GTE instruction-tuned embedding model
- `intfloat/multilingual-e5-large-instruct` - Instruction-tuned multilingual E5 model
- `intfloat/multilingual-e5-large` - Multilingual E5 embedding model
- `BAAI/bge-en-icl` - English BGE model with in-context-learning support
""")

if __name__ == "__main__":
    # Mount the Gradio UI onto the FastAPI app
    app = gr.mount_gradio_app(fastapi_app, app, path="/")

    # Run with Uvicorn (Gradio uses this internally)
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
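# Quick local smoke test (hypothetical; assumes this file is saved as app.py
# and the server is running on the default port configured above):
#
#   python app.py
#   curl -X POST http://localhost:7860/embed \
#        -H "Content-Type: application/json" \
#        -d '{"text": "hello world"}'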