import os
import subprocess
import tempfile
import uuid

import gradio as gr
import whisper
from PIL import Image
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
    BlipProcessor,
    BlipForConditionalGeneration,
)
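
# ---- Models ----
# GoEmotions is a multi-label dataset: the classifier applies a sigmoid to each
# of the 28 label logits independently, and top_k=None returns every label's score.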
model_name = "joeddav/distilbert-base-uncased-go-emotions-student"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=None,
    function_to_apply="sigmoid",
)
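
# Abstractive summarizer used to condense long inputs before classification.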
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
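
# BLIP generates a one-sentence natural-language caption for an input image.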
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
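
# Whisper "base" trades some accuracy for speed; larger checkpoints such as
# "small" or "medium" transcribe more reliably if latency is acceptable.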
whisper_model = whisper.load_model("base")
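
# Full GoEmotions label set, kept for reference; the classification logic below
# works from the category sets rather than this list.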
EMOTION_LABELS = [
    "admiration",
    "amusement",
    "anger",
    "annoyance",
    "approval",
    "caring",
    "confusion",
    "curiosity",
    "desire",
    "disappointment",
    "disapproval",
    "disgust",
    "embarrassment",
    "excitement",
    "fear",
    "gratitude",
    "grief",
    "joy",
    "love",
    "nervousness",
    "optimism",
    "pride",
    "realization",
    "relief",
    "remorse",
    "sadness",
    "surprise",
    "neutral",
]
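
# Coarse sentiment buckets used to aggregate the fine-grained label scores.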
POSITIVE = {
    "admiration",
    "amusement",
    "approval",
    "caring",
    "excitement",
    "gratitude",
    "joy",
    "love",
    "optimism",
    "pride",
    "relief",
}

NEGATIVE = {
    "anger",
    "annoyance",
    "disappointment",
    "disapproval",
    "disgust",
    "embarrassment",
    "fear",
    "grief",
    "nervousness",
    "remorse",
    "sadness",
}

SURPRISE_UNCERTAINTY = {
    "surprise",
    "confusion",
    "curiosity",
    "realization",
}

DESIRE = {"desire"}

NEUTRAL = {"neutral"}

CATEGORY_MAP = {
    "Positive": POSITIVE,
    "Negative": NEGATIVE,
    "Surprise/Uncertainty": SURPRISE_UNCERTAINTY,
    "Desire": DESIRE,
    "Neutral": NEUTRAL,
}
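

# Each modality below is reduced to plain text and routed through process_text:
#   image -> BLIP caption -> summarize -> classify
#   audio -> Whisper transcript -> classify
#   video -> ffmpeg audio extraction -> Whisper transcript -> classify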
def process_text(text):
    if not text.strip():
        return "Please enter some text."

    # Condense long inputs first; truncation=True guards against texts that
    # exceed the summarizer's 1024-token input limit.
    if len(text.split()) > 1000:
        summary = summarizer(
            text, max_length=200, min_length=100, do_sample=False, truncation=True
        )[0]["summary_text"]
    else:
        summary = text

    print("Summary:", summary)

    preds = classifier(summary, truncation=True, max_length=512)[0]
    label_scores = {pred["label"]: pred["score"] for pred in preds}

    print("Raw classifier preds:", preds)
    print("Label scores dict:", label_scores)

    # Sum the per-label sigmoid scores within each coarse category and take the
    # category with the largest total.
    category_totals = {}
    for cat_name, emotions in CATEGORY_MAP.items():
        category_totals[cat_name] = sum(label_scores.get(e, 0) for e in emotions)

    best_category = max(category_totals, key=category_totals.get)

    emotions_in_cat = [(e, label_scores.get(e, 0)) for e in CATEGORY_MAP[best_category]]
    emotions_in_cat.sort(key=lambda x: x[1], reverse=True)

    # Report emotions whose independent sigmoid score clears the threshold,
    # falling back to the two highest-scoring emotions in the category.
    threshold = 0.8
    strong_emotions = [(e, s) for e, s in emotions_in_cat if s > threshold]

    out = (
        f"**Summary/Text:**\n{summary}\n\n"
        f"**Dominant Category:** {best_category}\n\n"
        f"**Emotions in this category:**\n"
    )
    if strong_emotions:
        for emotion, score in strong_emotions:
            out += f"{emotion}: {score:.4f}\n"
    else:
        for emotion, score in emotions_in_cat[:2]:
            out += f"{emotion}: {score:.4f}\n"

    return out.strip()
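
# Example: process_text("I love this!") returns a markdown-formatted string
# naming the dominant category and the top-scoring emotions inside it.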


def image_to_text(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = caption_processor(images=image, return_tensors="pt")
    out = caption_model.generate(**inputs)
    return caption_processor.decode(out[0], skip_special_tokens=True)


def process_image(image_path):
    caption = image_to_text(image_path)
    # Captions are a single sentence, so this summarization pass mostly
    # normalizes phrasing rather than shortening anything.
    summary = summarizer(caption, max_length=60, min_length=5, do_sample=False)[0]["summary_text"]
    return process_text(summary)


def extract_audio(video_path):
    # Write the extracted audio track to a uniquely named temp file; using
    # tempfile.gettempdir() instead of a hard-coded /tmp keeps this portable.
    audio_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.mp3")
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_path, "-q:a", "0", "-map", "a", audio_path],
        check=True,
    )
    return audio_path


def transcribe_audio(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]


def process_audio(audio_path):
    text = transcribe_audio(audio_path)
    return process_text(text)


def process_video(video_path):
    audio_path = extract_audio(video_path)
    try:
        text = transcribe_audio(audio_path)
    finally:
        # Remove the temporary audio file even if transcription fails.
        os.remove(audio_path)
    return process_text(text)
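

# ---- Gradio UI: one tab per input modality ----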
text_input = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=7, placeholder="Enter text...", label="Text Input"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Text Emotion Classifier (GoEmotions, Category Based)",
    description="Enter text to detect nuanced emotions grouped by dominant category.",
)

image_input = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Image Emotion Classifier",
    description="Upload an image. The model will caption it, summarize the caption, and predict emotions grouped by category.",
)

audio_input = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Audio Emotion Classifier",
    description="Upload audio. The model will transcribe it, summarize, and detect emotions grouped by category.",
)

video_input = gr.Interface(
    fn=process_video,
    inputs=gr.File(file_types=[".mp4", ".mov", ".avi"], label="Upload Video"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Video Emotion Classifier",
    description="Upload a video. The model will extract the audio, transcribe it, summarize, and detect emotions grouped by category.",
)

demo = gr.TabbedInterface(
    [text_input, image_input, audio_input, video_input],
    ["Text", "Image", "Audio", "Video"],
)

if __name__ == "__main__":
    demo.launch()