import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
import time
import os
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.tokenize import word_tokenize
import re
# Download necessary NLTK data
try:
    # Make the download more reliable by specifying a download directory
    nltk_data_dir = '/home/user/nltk_data'
    os.makedirs(nltk_data_dir, exist_ok=True)
    # Download all required resources
    nltk.download('punkt', download_dir=nltk_data_dir)
    nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)
    # Set the data path to include our custom directory
    nltk.data.path.insert(0, nltk_data_dir)
except Exception as e:
    print(f"NLTK download issue: {e}")
    # Fallback: simple download to the default location if the directory approach fails
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
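# Note (assumption, not from the original code): newer NLTK releases ship the Punkt
# sentence/word tokenizer data as a separate 'punkt_tab' resource. If word_tokenize
# still raises LookupError at runtime, downloading it the same way may help:
# nltk.download('punkt_tab', download_dir=nltk_data_dir)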
# Add error handling around model loading
try:
    # Load Whisper for ASR
    asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
    # Load Grammar Scoring Model (CoLA)
    cola_model = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-CoLA")
    cola_tokenizer = AutoTokenizer.from_pretrained("textattack/roberta-base-CoLA")
    grammar_pipeline = pipeline("text-classification", model=cola_model, tokenizer=cola_tokenizer)
    # Load Grammar Correction Model (T5)
    correction_pipeline = pipeline("text2text-generation", model="vennify/t5-base-grammar-correction")
    # Add sentiment analysis
    sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    # Add fluency analysis (using BERT)
    fluency_pipeline = pipeline("text-classification", model="textattack/bert-base-uncased-CoLA")
    # Set variable to track loaded models
    MODELS_LOADED = True
except Exception as e:
    print(f"Error loading models: {e}")
    # Set variable to track failed model loading
    MODELS_LOADED = False
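# Caveat (assumption to verify): depending on how the CoLA checkpoints define id2label,
# the classifiers above may emit generic labels such as "LABEL_0"/"LABEL_1" rather than
# "acceptable"/"unacceptable". The string checks further down ('"acceptable" in ...') only
# work with the latter, so inspect the labels once and map them if necessary.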
# Common English filler words to detect
FILLER_WORDS = ["um", "uh", "like", "you know", "actually", "basically", "literally",
                "sort of", "kind of", "i mean", "so", "well", "right", "okay", "yeah"]
def count_filler_words(text):
    """Count filler words in the text"""
    text = text.lower()
    count = 0
    for word in FILLER_WORDS:
        count += len(re.findall(r'\b' + word + r'\b', text))
    return count, count / max(len(text.split()), 1)  # Count and ratio
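# Illustrative example: count_filler_words("um, I think, um, yes") returns (2, 0.4),
# since "um" matches twice out of five whitespace-separated tokens.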
def calculate_speaking_rate(text, duration):
    """Calculate words per minute"""
    if duration <= 0:
        return 0
    words = len(text.split())
    return (words / duration) * 60  # Words per minute
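# Illustrative example: a 50-word transcript over a 20-second clip gives
# calculate_speaking_rate(text, 20) == 150.0 words per minute.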
def analyze_vocabulary_richness(text):
    """Analyze vocabulary richness"""
    # Prefer NLTK's word_tokenize, but fall back to a simple regex to avoid NLTK data issues
    try:
        # Try using word_tokenize first
        words = word_tokenize(text.lower())
    except LookupError:
        # Fallback to simple regex-based tokenization if the NLTK data is missing
        words = re.findall(r'\b\w+\b', text.lower())
    if not words:
        return 0, {}
    # Vocabulary richness (unique words / total words)
    unique_words = set(words)
    richness = len(unique_words) / len(words)
    # Use simple POS tagging, or skip it if NLTK fails
    try:
        pos_tags = nltk.pos_tag(words)
        pos_counts = {}
        for _, tag in pos_tags:
            pos_counts[tag] = pos_counts.get(tag, 0) + 1
    except Exception:
        # Return simplified counts if POS tagging fails
        pos_counts = {"WORD": len(words), "UNIQUE": len(unique_words)}
    return richness, pos_counts
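# Note: the richness value is a type-token ratio, which naturally drops as transcripts get
# longer, so comparisons are most meaningful between recordings of similar length.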
def analyze_sentence_complexity(text):
    """Analyze sentence complexity with error handling"""
    try:
        # Simple sentence splitting by punctuation
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        if not sentences:
            return 0, 0
        # Average words per sentence
        words_per_sentence = [len(s.split()) for s in sentences]
        avg_words = sum(words_per_sentence) / len(sentences)
        # Sentence length variation (standard deviation)
        sentence_length_variation = np.std(words_per_sentence) if len(sentences) > 1 else 0
        return avg_words, sentence_length_variation
    except Exception:
        # In case of any error, fall back to a rough estimate: total words divided by the
        # number of terminal punctuation marks (at least 1)
        word_count = len(text.split())
        return word_count / max(1, text.count('.') + text.count('!') + text.count('?')), 0
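# Illustrative example: analyze_sentence_complexity("I went home. It was late!")
# returns (3.5, 0.5): two sentences of 3 and 4 words, mean 3.5, population std 0.5.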
def create_detailed_feedback(transcription, grammar_score, corrected_text,
                             sentiment, fluency, filler_ratio, speaking_rate,
                             vocabulary_richness, avg_words_per_sentence):
    """Create detailed feedback based on all metrics"""
    feedback = []
    # Grammar feedback
    if "acceptable" in grammar_score.lower():
        feedback.append("✅ Your grammar is good!")
    else:
        feedback.append("❌ Your grammar needs improvement. Check the corrections provided.")
    # Fluency feedback
    if fluency > 0.7:
        feedback.append("✅ Your speech flows naturally.")
    else:
        feedback.append("❌ Work on making your speech more fluid and natural.")
    # Filler words feedback
    if filler_ratio > 0.1:
        feedback.append(f"❌ You used too many filler words ({filler_ratio:.1%} of your words).")
    else:
        feedback.append("✅ Good job minimizing filler words!")
    # Speaking rate feedback
    if 120 <= speaking_rate <= 160:
        feedback.append(f"✅ Your speaking pace is good ({speaking_rate:.0f} words/min).")
    elif speaking_rate < 120:
        feedback.append(f"❌ Try speaking a bit faster ({speaking_rate:.0f} words/min is slower than ideal).")
    else:
        feedback.append(f"❌ Try speaking a bit slower ({speaking_rate:.0f} words/min is faster than ideal).")
    # Vocabulary feedback
    if vocabulary_richness > 0.6:
        feedback.append("✅ Excellent vocabulary diversity!")
    elif vocabulary_richness > 0.4:
        feedback.append("✅ Good vocabulary usage.")
    else:
        feedback.append("❌ Try using more varied vocabulary.")
    # Sentence complexity feedback
    if 10 <= avg_words_per_sentence <= 20:
        feedback.append("✅ Good sentence structure and length.")
    elif avg_words_per_sentence < 10:
        feedback.append("❌ Try using more complex sentences occasionally.")
    else:
        feedback.append("❌ Your sentences are quite long. Consider varying your sentence length.")
    # Overall sentiment feedback
    if sentiment == "POSITIVE":
        feedback.append("✅ Your tone is positive and engaging.")
    else:
        feedback.append("ℹ️ Your tone is neutral/negative. Consider if this matches your intent.")
    return "\n".join(feedback)
def process_audio(audio):
    if audio is None:
        return "No audio provided.", "", "", "", None, ""
    start_time = time.time()
    # Check if models loaded properly
    if 'MODELS_LOADED' in globals() and not MODELS_LOADED:
        return ("Models failed to load. Please check the logs for details.",
                "Error", "Error", "Unable to process audio due to model loading issues.",
                None, "## Error\nThe required models couldn't be loaded. Please check the system configuration.")
    try:
        # Get audio duration (assuming audio[1] contains the sample rate)
        sample_rate = 16000  # Default if we can't determine
        if isinstance(audio, tuple) and len(audio) > 1:
            sample_rate = audio[1]
        # File uploads need different handling than raw sample arrays
        duration = 0
        if isinstance(audio, str):
            # This is a file path
            try:
                import librosa
                y, sr = librosa.load(audio, sr=None)
                duration = librosa.get_duration(y=y, sr=sr)
            except Exception as e:
                print(f"Error getting duration: {e}")
                # Estimate duration based on file size
                try:
                    file_size = os.path.getsize(audio)
                    # Rough estimate: 16 kHz, 16-bit audio is about 32 KB per second
                    duration = file_size / 32000
                except Exception:
                    duration = 10  # Default to 10 seconds if we can't determine
        else:
            # Assume a tuple of (samples, sample_rate)
            try:
                duration = len(audio[0]) / sample_rate if sample_rate > 0 else 0
            except Exception:
                duration = 10  # Default duration
        # Step 1: Transcription
        try:
            transcription_result = asr_pipeline(audio)
            transcription = transcription_result["text"]
        except Exception as e:
            print(f"Transcription error: {e}")
            return ("Error in speech recognition. Please try again.",
                    "Error", "Error", "There was an error processing your audio.",
                    None, f"## Error\nError in speech recognition: {str(e)[:100]}...")
        if not transcription or transcription.strip() == "":
            return ("No speech detected. Please speak louder or check your microphone.",
                    "N/A", "N/A", "No speech detected in the audio.",
                    None, "## No Speech Detected\nPlease try recording again with clearer speech.")
        # Step 2: Grammar Scoring
        try:
            score_output = grammar_pipeline(transcription)[0]
            label = score_output["label"]
            confidence = score_output["score"]
            grammar_score = f"{label} ({confidence:.2f})"
        except Exception as e:
            print(f"Grammar scoring error: {e}")
            label = "UNKNOWN"
            confidence = 0.5
            grammar_score = "Could not analyze grammar"
        # Step 3: Grammar Correction
        try:
            corrected = correction_pipeline(transcription, max_length=128)[0]["generated_text"]
        except Exception as e:
            print(f"Grammar correction error: {e}")
            corrected = transcription
        # Step 4: Sentiment Analysis
        try:
            sentiment_result = sentiment_pipeline(transcription)[0]
            sentiment = sentiment_result["label"]
            sentiment_score = sentiment_result["score"]
        except Exception as e:
            print(f"Sentiment analysis error: {e}")
            sentiment = "NEUTRAL"
            sentiment_score = 0.5
        # Step 5: Fluency Analysis
        try:
            fluency_result = fluency_pipeline(transcription)[0]
            fluency_score = fluency_result["score"] if fluency_result["label"] == "acceptable" else 1 - fluency_result["score"]
        except Exception as e:
            print(f"Fluency analysis error: {e}")
            fluency_score = 0.5
        # Step 6: Filler Words Analysis
        try:
            filler_count, filler_ratio = count_filler_words(transcription)
        except Exception as e:
            print(f"Filler word analysis error: {e}")
            filler_count, filler_ratio = 0, 0
        # Step 7: Speaking Rate
        try:
            speaking_rate = calculate_speaking_rate(transcription, duration)
        except Exception as e:
            print(f"Speaking rate calculation error: {e}")
            speaking_rate = 0
        # Step 8: Vocabulary Richness
        try:
            vocab_richness, pos_counts = analyze_vocabulary_richness(transcription)
        except Exception as e:
            print(f"Vocabulary analysis error: {e}")
            vocab_richness, pos_counts = 0.5, {"N/A": 1}
        # Step 9: Sentence Complexity
        try:
            avg_words, sentence_variation = analyze_sentence_complexity(transcription)
        except Exception as e:
            print(f"Sentence complexity analysis error: {e}")
            avg_words, sentence_variation = 0, 0
        # Create feedback
        try:
            feedback = create_detailed_feedback(
                transcription, grammar_score, corrected, sentiment,
                fluency_score, filler_ratio, speaking_rate, vocab_richness, avg_words
            )
        except Exception as e:
            print(f"Feedback creation error: {e}")
            feedback = "Error generating detailed feedback."
        # Create metrics visualization (radar chart)
        try:
            # A radar chart needs a polar axes; on a plain Cartesian axes the angle values
            # would just be drawn as an ordinary line plot
            fig, ax = plt.subplots(figsize=(10, 6), subplot_kw={'projection': 'polar'})
            # Define metrics for the radar chart
            categories = ['Grammar', 'Fluency', 'Vocabulary', 'Speaking Rate', 'Clarity']
            # Normalize scores between 0 and 1
            grammar_norm = confidence if label == "acceptable" else 1 - confidence
            speaking_rate_norm = max(0, min(1, 1 - abs((speaking_rate - 140) / 100)))  # Optimal around 140 wpm
            values = [
                grammar_norm,
                fluency_score,
                vocab_richness,
                speaking_rate_norm,
                1 - filler_ratio  # Lower filler ratio is better
            ]
            # One angle per category, then close the loop so the polygon joins back to the
            # start; angles must be computed before appending, so lengths stay consistent
            angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
            values += values[:1]
            angles += angles[:1]
            ax.plot(angles, values, linewidth=2, linestyle='solid')
            ax.fill(angles, values, alpha=0.25)
            ax.set_yticklabels([])
            ax.set_xticks(angles[:-1])
            ax.set_xticklabels(categories)
            ax.grid(True)
            plt.title('Speaking Performance Metrics', size=15, color='navy', y=1.1)
        except Exception as e:
            print(f"Visualization error: {e}")
            # Create a simple error figure
            fig, ax = plt.subplots(figsize=(6, 3))
            ax.text(0.5, 0.5, "Error creating visualization",
                    horizontalalignment='center', verticalalignment='center')
            ax.axis('off')
        # Create detailed analysis text
        processing_time = time.time() - start_time
        try:
            pos_counts_str = ', '.join([f"{k}: {v}" for k, v in sorted(pos_counts.items(), key=lambda x: x[1], reverse=True)[:5]])
        except Exception:
            pos_counts_str = "N/A"
        detailed_analysis = f"""
## Detailed Speech Analysis

**Processing Time:** {processing_time:.2f} seconds

**Audio Duration:** {duration:.2f} seconds

### Metrics:
- **Grammar Score:** {confidence:.2f} ({label})
- **Fluency Score:** {fluency_score:.2f}
- **Speaking Rate:** {speaking_rate:.1f} words per minute
- **Vocabulary Richness:** {vocab_richness:.2f} (higher is better)
- **Filler Words:** {filler_count} occurrences ({filler_ratio:.1%} of speech)
- **Avg Words Per Sentence:** {avg_words:.1f}
- **Sentiment:** {sentiment} ({sentiment_score:.2f})

### Word Types Used:
{pos_counts_str}
"""
        return transcription, grammar_score, corrected, feedback, fig, detailed_analysis
    except Exception as e:
        print(f"Unexpected error in process_audio: {e}")
        return ("An unexpected error occurred during processing.",
                "Error", "Error", "There was an unexpected error processing your audio.",
                None, f"## Unexpected Error\n\nAn error occurred: {str(e)[:200]}...")
# Create theme
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
).set(
    button_primary_background_fill="*primary_500",
    button_primary_background_fill_hover="*primary_600",
    button_primary_text_color="white",
    block_title_text_weight="600",
    block_border_width="2px",
    block_shadow="0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1)",
)
with gr.Blocks(theme=theme, css=""" | |
.container { max-width: 1000px; margin: auto; } | |
.header { text-align: center; margin-bottom: 20px; } | |
.header h1 { color: #1e40af; font-size: 2.5rem; } | |
.header p { color: #6b7280; font-size: 1.1rem; } | |
.footer { text-align: center; margin-top: 30px; color: #6b7280; } | |
.tips-box { background-color: #f0f9ff; border-radius: 10px; padding: 15px; margin: 10px 0; } | |
.score-card { border: 2px solid #dbeafe; border-radius: 10px; padding: 10px; } | |
""") as demo: | |
gr.HTML(""" | |
<div class="header"> | |
<h1>๐๏ธ Advanced ENGLISH Speaking Assessment</h1> | |
<p>Record or upload your speech to receive comprehensive feedback on your English speaking skills</p> | |
</div> | |
""") | |
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="🎤 Speak or Upload Audio"
            )
            with gr.Accordion("Speaking Tips", open=False):
                gr.HTML("""
                    <div class="tips-box">
                        <h4>Tips for Better Results:</h4>
                        <ul>
                            <li>Speak clearly and at a moderate pace</li>
                            <li>Minimize background noise</li>
                            <li>Try to speak for at least 20-30 seconds</li>
                            <li>Avoid filler words like "um", "uh", "like"</li>
                            <li>Practice with both prepared and impromptu topics</li>
                        </ul>
                    </div>
                """)
            submit_btn = gr.Button("Analyze Speech", variant="primary")
    with gr.Row():
        with gr.Column():
            transcription_output = gr.Textbox(label="📝 Transcription", lines=3)
            corrected_output = gr.Textbox(label="✏️ Grammar Correction", lines=3)
            grammar_score_output = gr.Textbox(label="✅ Grammar Score")
    with gr.Row():
        with gr.Column():
            metrics_chart = gr.Plot(label="Performance Metrics")
        with gr.Column():
            feedback_output = gr.Textbox(label="💬 Feedback", lines=8)
    with gr.Accordion("Detailed Analysis", open=False):
        detailed_analysis = gr.Markdown()
    gr.HTML("""
        <div class="footer">
            <p>This tool provides an assessment of your spoken English. For professional evaluation, consult a qualified language instructor.</p>
        </div>
    """)
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[
            transcription_output,
            grammar_score_output,
            corrected_output,
            feedback_output,
            metrics_chart,
            detailed_analysis
        ]
    )

if __name__ == "__main__":
    demo.launch()