import gradio as gr
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
    BlipProcessor,
    BlipForConditionalGeneration,
)
from PIL import Image
import whisper
import subprocess
import uuid
import os

# ----------- Load GoEmotions Model (28 emotion classes) -----------
model_name = "joeddav/distilbert-base-uncased-go-emotions-student"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=None,  # get scores for all emotions
    function_to_apply="sigmoid",
)

# ----------- Load Summarizer -----------
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# ----------- Load Captioning Model -----------
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# ----------- Load Whisper Model -----------
whisper_model = whisper.load_model("base")

# ----------- Define Emotion Categories -----------
EMOTION_LABELS = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral",
]

POSITIVE = {
    "admiration", "amusement", "approval", "caring", "excitement",
    "gratitude", "joy", "love", "optimism", "pride", "relief",
}
NEGATIVE = {
    "anger", "annoyance", "disappointment", "disapproval", "disgust",
    "embarrassment", "fear", "grief", "nervousness", "remorse", "sadness",
}
SURPRISE_UNCERTAINTY = {"surprise", "confusion", "curiosity", "realization"}
DESIRE = {"desire"}
NEUTRAL = {"neutral"}

CATEGORY_MAP = {
    "Positive": POSITIVE,
    "Negative": NEGATIVE,
    "Surprise/Uncertainty": SURPRISE_UNCERTAINTY,
    "Desire": DESIRE,
    "Neutral": NEUTRAL,
}

# ----------- Text Classification -----------
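# Strategy: sum the per-emotion sigmoid scores within each coarse category,
# pick the dominant category, then report emotions scoring above 0.8 (or the
# top two in that category if none clear the threshold).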
def process_text(text):
    if not text.strip():
        return "Please enter some text."

    # Summarize only if text is longer than 1000 words
    if len(text.split()) > 1000:
        summary = summarizer(text, max_length=200, min_length=100, do_sample=False)[0]["summary_text"]
    else:
        summary = text

    print("Summary:", summary)

    preds = classifier(summary, truncation=True, max_length=512)[0]
    label_scores = {pred["label"]: pred["score"] for pred in preds}

    print("Raw classifier preds:", preds)
    print("Label scores dict:", label_scores)

    category_totals = {}
    for cat_name, emotions in CATEGORY_MAP.items():
        category_totals[cat_name] = sum(label_scores.get(e, 0) for e in emotions)

    best_category = max(category_totals, key=category_totals.get)

    emotions_in_cat = [(e, label_scores.get(e, 0)) for e in CATEGORY_MAP[best_category]]
    emotions_in_cat.sort(key=lambda x: x[1], reverse=True)

    threshold = 0.8
    strong_emotions = [(e, s) for e, s in emotions_in_cat if s > threshold]

    # Add the summary or original text here:
    out = f"**Summary/Text:**\n{summary}\n\n**Dominant Category:** {best_category}\n\n**Emotions in this category:**\n"

    if strong_emotions:
        for emotion, score in strong_emotions:
            out += f"{emotion}: {score:.4f}\n"
    else:
        top_two = emotions_in_cat[:2]
        for emotion, score in top_two:
            out += f"{emotion}: {score:.4f}\n"

    return out.strip()


# ----------- Image Processing -----------
def image_to_text(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = caption_processor(images=image, return_tensors="pt")
    out = caption_model.generate(**inputs)
    return caption_processor.decode(out[0], skip_special_tokens=True)


def process_image(image_path):
    caption = image_to_text(image_path)
    summary = summarizer(caption, max_length=60, min_length=5, do_sample=False)[0]["summary_text"]
    return process_text(summary)


# ----------- Audio/Video Transcription -----------
def extract_audio(video_path):
    audio_path = f"/tmp/{uuid.uuid4().hex}.mp3"
    subprocess.run(["ffmpeg", "-y", "-i", video_path, "-q:a", "0", "-map", "a", audio_path], check=True)
    return audio_path


def transcribe_audio(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]


def process_audio(audio_path):
    text = transcribe_audio(audio_path)
    return process_text(text)


def process_video(video_path):
    audio_path = extract_audio(video_path)
    text = transcribe_audio(audio_path)
    os.remove(audio_path)
    return process_text(text)


# ----------- Gradio Interfaces -----------
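# Each gr.Interface wires one of the processing functions above to its own
# input widget; gr.TabbedInterface at the end combines them into one tabbed app.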
text_input = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=7, placeholder="Enter text...", label="Text Input"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Text Emotion Classifier (GoEmotions, Category Based)",
    description="Enter text to detect nuanced emotions grouped by dominant category.",
)

image_input = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Image Emotion Classifier",
    description="Upload an image. Model will caption it, summarize, and predict emotions grouped by category.",
)

audio_input = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Audio Emotion Classifier",
    description="Upload audio. Model will transcribe, summarize, and detect emotions grouped by category.",
)

video_input = gr.Interface(
    fn=process_video,
    inputs=gr.File(file_types=[".mp4", ".mov", ".avi"], label="Upload Video"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Video Emotion Classifier",
    description="Upload video. Model will extract audio, transcribe, summarize, and detect emotions grouped by category.",
)

gr.TabbedInterface(
    [text_input, image_input, audio_input, video_input],
    ["Text", "Image", "Audio", "Video"],
).launch()