import gradio as gr
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
    BlipProcessor,
    BlipForConditionalGeneration,
)
from PIL import Image
import whisper
import subprocess
import uuid
import os
# ----------- Load GoEmotions Model (28 emotion classes) -----------
model_name = "joeddav/distilbert-base-uncased-go-emotions-student"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
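# GoEmotions is a multi-label dataset, so the classifier below applies a sigmoid to the
# logits and uses top_k=None to get an independent score for each of the 28 labels
# rather than a single softmax-normalised prediction.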
classifier = pipeline(
"text-classification",
model=model,
tokenizer=tokenizer,
top_k=None, # get scores for all emotions
function_to_apply="sigmoid",
)
# ----------- Load Summarizer -----------
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
# ----------- Load Captioning Model -----------
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# ----------- Load Whisper Model -----------
whisper_model = whisper.load_model("base")
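# "base" is a small, fast Whisper checkpoint; larger checkpoints such as "small" or
# "medium" transcribe more accurately at the cost of download size and inference time.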
# ----------- Define Emotion Categories -----------
EMOTION_LABELS = [
"admiration",
"amusement",
"anger",
"annoyance",
"approval",
"caring",
"confusion",
"curiosity",
"desire",
"disappointment",
"disapproval",
"disgust",
"embarrassment",
"excitement",
"fear",
"gratitude",
"grief",
"joy",
"love",
"nervousness",
"optimism",
"pride",
"realization",
"relief",
"remorse",
"sadness",
"surprise",
"neutral",
]
POSITIVE = {
"admiration",
"amusement",
"approval",
"caring",
"excitement",
"gratitude",
"joy",
"love",
"optimism",
"pride",
"relief",
}
NEGATIVE = {
"anger",
"annoyance",
"disappointment",
"disapproval",
"disgust",
"embarrassment",
"fear",
"grief",
"nervousness",
"remorse",
"sadness",
}
SURPRISE_UNCERTAINTY = {
"surprise",
"confusion",
"curiosity",
"realization",
}
DESIRE = {"desire"}
NEUTRAL = {"neutral"}
CATEGORY_MAP = {
"Positive": POSITIVE,
"Negative": NEGATIVE,
"Surprise/Uncertainty": SURPRISE_UNCERTAINTY,
"Desire": DESIRE,
"Neutral": NEUTRAL,
}
# ----------- Text Classification -----------
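# process_text() sums the per-label sigmoid scores within each category and keeps the
# category with the largest total; e.g. hypothetical scores of 0.6 for "joy" and 0.3
# for "love" would together contribute 0.9 to "Positive".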
def process_text(text):
    if not text.strip():
        return "Please enter some text."
    # Summarize only if the text is longer than 1000 words
    if len(text.split()) > 1000:
        summary = summarizer(
            text, max_length=200, min_length=100, do_sample=False, truncation=True
        )[0]["summary_text"]
    else:
        summary = text
    print("Summary:", summary)
    preds = classifier(summary, truncation=True, max_length=512)[0]
    label_scores = {pred["label"]: pred["score"] for pred in preds}
    print("Raw classifier preds:", preds)
    print("Label scores dict:", label_scores)
    # Sum the per-label scores within each coarse category and keep the largest total
    category_totals = {}
    for cat_name, emotions in CATEGORY_MAP.items():
        category_totals[cat_name] = sum(label_scores.get(e, 0) for e in emotions)
    best_category = max(category_totals, key=category_totals.get)
    emotions_in_cat = [(e, label_scores.get(e, 0)) for e in CATEGORY_MAP[best_category]]
    emotions_in_cat.sort(key=lambda x: x[1], reverse=True)
    # Report emotions above the threshold, falling back to the top two otherwise
    threshold = 0.8
    strong_emotions = [(e, s) for e, s in emotions_in_cat if s > threshold]
    # Include the summary (or original text) in the output
    out = f"**Summary/Text:**\n{summary}\n\n**Dominant Category:** {best_category}\n\n**Emotions in this category:**\n"
    if strong_emotions:
        for emotion, score in strong_emotions:
            out += f"{emotion}: {score:.4f}\n"
    else:
        top_two = emotions_in_cat[:2]
        for emotion, score in top_two:
            out += f"{emotion}: {score:.4f}\n"
    return out.strip()
# ----------- Image Processing -----------
def image_to_text(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = caption_processor(images=image, return_tensors="pt")
    out = caption_model.generate(**inputs)
    return caption_processor.decode(out[0], skip_special_tokens=True)
def process_image(image_path):
    caption = image_to_text(image_path)
    # The BLIP caption is typically a single short sentence; it is passed through the
    # summarizer before emotion classification
    summary = summarizer(caption, max_length=60, min_length=5, do_sample=False)[0]["summary_text"]
    return process_text(summary)
# ----------- Audio/Video Transcription -----------
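# Note: extract_audio() shells out to ffmpeg, so the ffmpeg binary must be available on
# the host (for a Hugging Face Space this usually means listing it in packages.txt).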
def extract_audio(video_path):
    audio_path = f"/tmp/{uuid.uuid4().hex}.mp3"
    subprocess.run(["ffmpeg", "-y", "-i", video_path, "-q:a", "0", "-map", "a", audio_path], check=True)
    return audio_path
def transcribe_audio(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]
def process_audio(audio_path):
    text = transcribe_audio(audio_path)
    return process_text(text)
def process_video(video_path):
    audio_path = extract_audio(video_path)
    text = transcribe_audio(audio_path)
    os.remove(audio_path)
    return process_text(text)
# ----------- Gradio Interfaces -----------
text_input = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=7, placeholder="Enter text...", label="Text Input"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Text Emotion Classifier (GoEmotions, Category Based)",
    description="Enter text to detect nuanced emotions grouped by dominant category.",
)
image_input = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Image Emotion Classifier",
    description="Upload an image. The model will caption it, summarize, and predict emotions grouped by category.",
)
audio_input = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Audio Emotion Classifier",
    description="Upload audio. The model will transcribe, summarize, and detect emotions grouped by category.",
)
video_input = gr.Interface(
    fn=process_video,
    inputs=gr.File(file_types=[".mp4", ".mov", ".avi"], label="Upload Video"),
    outputs=gr.Textbox(label="Emotion Output"),
    title="Video Emotion Classifier",
    description="Upload a video. The model will extract audio, transcribe, summarize, and detect emotions grouped by category.",
)
gr.TabbedInterface(
    [text_input, image_input, audio_input, video_input],
    ["Text", "Image", "Audio", "Video"],
).launch()