import gradio as gr
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
    BlipProcessor,
    BlipForConditionalGeneration,
)
from PIL import Image
import whisper
import subprocess
import uuid
import os

# ----------- Load GoEmotions Model (28 emotion classes) -----------
model_name = "joeddav/distilbert-base-uncased-go-emotions-student"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=None,  # get scores for all emotions
    function_to_apply="sigmoid",
)

# ----------- Load Summarizer -----------
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# ----------- Load Captioning Model -----------
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# ----------- Load Whisper Model -----------
whisper_model = whisper.load_model("base")

# ----------- Define Emotion Categories -----------
EMOTION_LABELS = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral",
]

POSITIVE = {
    "admiration", "amusement", "approval", "caring", "excitement",
    "gratitude", "joy", "love", "optimism", "pride", "relief",
}
NEGATIVE = {
    "anger", "annoyance", "disappointment", "disapproval", "disgust",
    "embarrassment", "fear", "grief", "nervousness", "remorse", "sadness",
}
SURPRISE_UNCERTAINTY = {"surprise", "confusion", "curiosity", "realization"}
DESIRE = {"desire"}
NEUTRAL = {"neutral"}

CATEGORY_MAP = {
    "Positive": POSITIVE,
    "Negative": NEGATIVE,
    "Surprise/Uncertainty": SURPRISE_UNCERTAINTY,
    "Desire": DESIRE,
    "Neutral": NEUTRAL,
}

# ----------- Text Classification -----------
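# Strategy: sum the per-emotion sigmoid scores within each coarse category,
# pick the dominant category, then report emotions scoring above 0.8 (or the
# top two in that category if none clear the threshold).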
def process_text(text):
    if not text.strip():
        return "Please enter some text."

    # Summarize only if text is longer than 1000 words
    if len(text.split()) > 1000:
        summary = summarizer(text, max_length=200, min_length=100, do_sample=False)[0]["summary_text"]
    else:
        summary = text

    print("Summary:", summary)

    preds = classifier(summary, truncation=True, max_length=512)[0]
    label_scores = {pred["label"]: pred["score"] for pred in preds}

    print("Raw classifier preds:", preds)
    print("Label scores dict:", label_scores)

    category_totals = {}
    for cat_name, emotions in CATEGORY_MAP.items():
        category_totals[cat_name] = sum(label_scores.get(e, 0) for e in emotions)

    best_category = max(category_totals, key=category_totals.get)

    emotions_in_cat = [(e, label_scores.get(e, 0)) for e in CATEGORY_MAP[best_category]]
    emotions_in_cat.sort(key=lambda x: x[1], reverse=True)

    threshold = 0.8
    strong_emotions = [(e, s) for e, s in emotions_in_cat if s > threshold]

    # Add the summary or original text here:
    out = f"**Summary/Text:**\n{summary}\n\n**Dominant Category:** {best_category}\n\n**Emotions in this category:**\n"

    if strong_emotions:
        for emotion, score in strong_emotions:
            out += f"{emotion}: {score:.4f}\n"
    else:
        top_two = emotions_in_cat[:2]
        for emotion, score in top_two:
            out += f"{emotion}: {score:.4f}\n"

    return out.strip()


# ----------- Image Processing -----------
def image_to_text(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = caption_processor(images=image, return_tensors="pt")
    out = caption_model.generate(**inputs)
    return caption_processor.decode(out[0], skip_special_tokens=True)


def process_image(image_path):
    caption = image_to_text(image_path)
    summary = summarizer(caption, max_length=60, min_length=5, do_sample=False)[0]["summary_text"]
    return process_text(summary)


# ----------- Audio/Video Transcription -----------
def extract_audio(video_path):
    audio_path = f"/tmp/{uuid.uuid4().hex}.mp3"
    subprocess.run(["ffmpeg", "-y", "-i", video_path, "-q:a", "0", "-map", "a", audio_path], check=True)
    return audio_path


def transcribe_audio(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]


def process_audio(audio_path):
    text = transcribe_audio(audio_path)
    return process_text(text)


def process_video(video_path):
    audio_path = extract_audio(video_path)
    text = transcribe_audio(audio_path)
    os.remove(audio_path)
    return process_text(text)


# ----------- Gradio Interfaces -----------
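# Each gr.Interface wires one of the processing functions above to its own
# input widget; gr.TabbedInterface at the end combines them into one tabbed app.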
text_input = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=7, placeholder="Enter text...", label="Text Input"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Text Emotion Classifier (GoEmotions, Category Based)",
    description="Enter text to detect nuanced emotions grouped by dominant category.",
)

image_input = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Image Emotion Classifier",
    description="Upload an image. Model will caption it, summarize, and predict emotions grouped by category.",
)

audio_input = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Audio Emotion Classifier",
    description="Upload audio. Model will transcribe, summarize, and detect emotions grouped by category.",
)

video_input = gr.Interface(
    fn=process_video,
    inputs=gr.File(file_types=[".mp4", ".mov", ".avi"], label="Upload Video"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Video Emotion Classifier",
    description="Upload video. Model will extract audio, transcribe, summarize, and detect emotions grouped by category.",
)

gr.TabbedInterface(
    [text_input, image_input, audio_input, video_input],
    ["Text", "Image", "Audio", "Video"],
).launch()