aiqcamp committed on
Commit 0dbafab · verified · 1 Parent(s): bbf01ee

Update app.py

Files changed (1):
  1. app.py +345 -21

app.py CHANGED
@@ -1,6 +1,23 @@
-
 import gradio as gr
 from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

 # Load Whisper for ASR
 asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
@@ -13,36 +30,343 @@ grammar_pipeline = pipeline("text-classification", model=cola_model, tokenizer=c
 # Load Grammar Correction Model (T5)
 correction_pipeline = pipeline("text2text-generation", model="vennify/t5-base-grammar-correction")

 def process_audio(audio):
     if audio is None:
-        return "No audio provided.", "", ""
-
     # Step 1: Transcription
-    transcription = asr_pipeline(audio)["text"]
-
     # Step 2: Grammar Scoring
     score_output = grammar_pipeline(transcription)[0]
     label = score_output["label"]
     confidence = score_output["score"]
-
     # Step 3: Grammar Correction
     corrected = correction_pipeline(transcription, max_length=128)[0]["generated_text"]

-    return transcription, f"{label} ({confidence:.2f})", corrected
-
-demo = gr.Interface(
-    fn=process_audio,
-    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath", label="🎤 Speak or Upload Audio (.wav)"),
-    outputs=[
-        gr.Textbox(label="📝 Transcription"),
-        gr.Textbox(label="✅ Grammar Score"),
-        gr.Textbox(label="✍️ Grammar Correction")
-    ],
-    title="🎙️ Voice Grammar Scorer",
-    description="Record or upload a WAV file. This app transcribes your voice, scores its grammar, and suggests corrections.",
 )

-if __name__ == "__main__":
-    demo.launch()

-
 
 
 
 import gradio as gr
+import numpy as np
+import matplotlib.pyplot as plt
+import time
 from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+import nltk
+from nltk.tokenize import word_tokenize
+import re
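+# (pandas and CountVectorizer appear unused in the rest of this diff)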
+
+# Download necessary NLTK data
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt')
+try:
+    nltk.data.find('taggers/averaged_perceptron_tagger')
+except LookupError:
+    nltk.download('averaged_perceptron_tagger')
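+# Note: newer NLTK releases may look for 'punkt_tab' and 'averaged_perceptron_tagger_eng'
+# instead of the names above (assumption; adjust the downloads if these lookups fail).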

 # Load Whisper for ASR
 asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
 
 # Load Grammar Correction Model (T5)
 correction_pipeline = pipeline("text2text-generation", model="vennify/t5-base-grammar-correction")

+# Add sentiment analysis
+sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+
+# Add fluency analysis (using BERT)
+fluency_pipeline = pipeline("text-classification", model="textattack/bert-base-uncased-CoLA")
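+# Note: this CoLA checkpoint may report labels as "LABEL_0"/"LABEL_1" rather than
+# "acceptable"/"unacceptable" depending on its config; the fluency scoring in
+# process_audio below assumes a literal "acceptable" label (unverified assumption).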
+
+# Common English filler words to detect
+FILLER_WORDS = ["um", "uh", "like", "you know", "actually", "basically", "literally",
+                "sort of", "kind of", "i mean", "so", "well", "right", "okay", "yeah"]
+
+def count_filler_words(text):
+    """Count filler words in the text"""
+    text = text.lower()
+    count = 0
+    for word in FILLER_WORDS:
+        count += len(re.findall(r'\b' + word + r'\b', text))
+    return count, count / max(len(text.split()), 1)  # Count and ratio
+
+def calculate_speaking_rate(text, duration):
+    """Calculate words per minute"""
+    if duration <= 0:
+        return 0
+    words = len(text.split())
+    return (words / duration) * 60  # Words per minute
+
+def analyze_vocabulary_richness(text):
+    """Analyze vocabulary richness"""
+    words = word_tokenize(text.lower())
+    if not words:
+        # Return an empty dict so pos_counts.items() stays safe on empty input
+        return 0.0, {}
+
+    # Vocabulary richness (unique words / total words)
+    unique_words = set(words)
+    richness = len(unique_words) / len(words)
+
+    # POS tagging to see variety of word types used
+    pos_tags = nltk.pos_tag(words)
+    pos_counts = {}
+    for _, tag in pos_tags:
+        pos_counts[tag] = pos_counts.get(tag, 0) + 1
+
+    return richness, pos_counts
+
+def analyze_sentence_complexity(text):
+    """Analyze sentence complexity"""
+    sentences = re.split(r'[.!?]+', text)
+    sentences = [s.strip() for s in sentences if s.strip()]
+
+    if not sentences:
+        return 0, 0
+
+    # Average words per sentence
+    words_per_sentence = [len(s.split()) for s in sentences]
+    avg_words = sum(words_per_sentence) / len(sentences)
+
+    # Sentence length variation (standard deviation)
+    sentence_length_variation = np.std(words_per_sentence) if len(sentences) > 1 else 0
+
+    return avg_words, sentence_length_variation
+
+def create_detailed_feedback(transcription, grammar_score, corrected_text,
+                             sentiment, fluency, filler_ratio, speaking_rate,
+                             vocabulary_richness, avg_words_per_sentence):
+    """Create detailed feedback based on all metrics"""
+    feedback = []
+
+    # Grammar feedback ("unacceptable" also contains "acceptable", so check the prefix)
+    if grammar_score.lower().startswith("acceptable"):
+        feedback.append("✅ Your grammar is good!")
+    else:
+        feedback.append("❗ Your grammar needs improvement. Check the corrections provided.")
+
+    # Fluency feedback
+    if fluency > 0.7:
+        feedback.append("✅ Your speech flows naturally.")
+    else:
+        feedback.append("❗ Work on making your speech more fluid and natural.")
+
+    # Filler words feedback
+    if filler_ratio > 0.1:
+        feedback.append(f"❗ You used too many filler words ({filler_ratio:.1%} of your words).")
+    else:
+        feedback.append("✅ Good job minimizing filler words!")
+
+    # Speaking rate feedback
+    if 120 <= speaking_rate <= 160:
+        feedback.append(f"✅ Your speaking pace is good ({speaking_rate:.0f} words/min).")
+    elif speaking_rate < 120:
+        feedback.append(f"❗ Try speaking a bit faster ({speaking_rate:.0f} words/min is slower than ideal).")
+    else:
+        feedback.append(f"❗ Try speaking a bit slower ({speaking_rate:.0f} words/min is faster than ideal).")
+
+    # Vocabulary feedback
+    if vocabulary_richness > 0.6:
+        feedback.append("✅ Excellent vocabulary diversity!")
+    elif vocabulary_richness > 0.4:
+        feedback.append("✅ Good vocabulary usage.")
+    else:
+        feedback.append("❗ Try using more varied vocabulary.")
+
+    # Sentence complexity feedback
+    if 10 <= avg_words_per_sentence <= 20:
+        feedback.append("✅ Good sentence structure and length.")
+    elif avg_words_per_sentence < 10:
+        feedback.append("❗ Try using more complex sentences occasionally.")
+    else:
+        feedback.append("❗ Your sentences are quite long. Consider varying your sentence length.")
+
+    # Overall sentiment feedback
+    if sentiment == "POSITIVE":
+        feedback.append("✅ Your tone is positive and engaging.")
+    else:
+        feedback.append("ℹ️ Your tone is neutral/negative. Consider if this matches your intent.")
+
+    return "\n".join(feedback)
+
 def process_audio(audio):
     if audio is None:
+        # One value per output component: transcription, grammar score, correction,
+        # feedback, metrics chart, detailed analysis
+        return "No audio provided.", "", "", "", None, ""
+
+    start_time = time.time()
+
+    # Get audio duration (assuming audio[1] contains the sample rate)
+    sample_rate = 16000  # Default if we can't determine
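+    # Note: with gr.Audio(type="filepath") this callback receives a file path (str), so the
+    # tuple branch below is only a fallback; librosa is an extra runtime dependency loaded
+    # lazily here (both points are assumptions about the setup, not guarantees).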
+    if isinstance(audio, tuple) and len(audio) > 1:
+        sample_rate = audio[1]
+
+    # For file uploads, we need to handle differently
+    if isinstance(audio, str):
+        # This is a file path
+        import librosa
+        y, sr = librosa.load(audio, sr=None)
+        duration = librosa.get_duration(y=y, sr=sr)
+    else:
+        # Assuming a tuple with (samples, sample_rate)
+        try:
+            duration = len(audio[0]) / sample_rate if sample_rate > 0 else 0
+        except Exception:
+            duration = 0
+
     # Step 1: Transcription
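+    # Note: with whisper-large-v3, clips longer than roughly 30 seconds may need
+    # chunk_length_s or return_timestamps=True on this call (assumption; depends on
+    # the installed transformers version).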
+    transcription_result = asr_pipeline(audio)
+    transcription = transcription_result["text"]
+
     # Step 2: Grammar Scoring
     score_output = grammar_pipeline(transcription)[0]
     label = score_output["label"]
     confidence = score_output["score"]
+    grammar_score = f"{label} ({confidence:.2f})"
+
     # Step 3: Grammar Correction
     corrected = correction_pipeline(transcription, max_length=128)[0]["generated_text"]
+
+    # Step 4: Sentiment Analysis
+    sentiment_result = sentiment_pipeline(transcription)[0]
+    sentiment = sentiment_result["label"]
+    sentiment_score = sentiment_result["score"]
+
+    # Step 5: Fluency Analysis
+    fluency_result = fluency_pipeline(transcription)[0]
+    fluency_score = fluency_result["score"] if fluency_result["label"] == "acceptable" else 1 - fluency_result["score"]
+
+    # Step 6: Filler Words Analysis
+    filler_count, filler_ratio = count_filler_words(transcription)
+
+    # Step 7: Speaking Rate
+    speaking_rate = calculate_speaking_rate(transcription, duration)
+
+    # Step 8: Vocabulary Richness
+    vocab_richness, pos_counts = analyze_vocabulary_richness(transcription)
+
+    # Step 9: Sentence Complexity
+    avg_words, sentence_variation = analyze_sentence_complexity(transcription)
+
+    # Create feedback
+    feedback = create_detailed_feedback(
+        transcription, grammar_score, corrected, sentiment,
+        fluency_score, filler_ratio, speaking_rate, vocab_richness, avg_words
+    )
+
+    # Create metrics visualization (a radar chart needs a polar projection)
+    fig, ax = plt.subplots(figsize=(10, 6), subplot_kw={"projection": "polar"})
+
+    # Define metrics for radar chart
+    categories = ['Grammar', 'Fluency', 'Vocabulary', 'Speaking Rate', 'Clarity']
+
+    # Normalize scores between 0 and 1
+    grammar_norm = confidence if label == "acceptable" else 1 - confidence
+    speaking_rate_norm = max(0, min(1, 1 - abs((speaking_rate - 140) / 100)))  # Optimal around 140 wpm
+
+    values = [
+        grammar_norm,
+        fluency_score,
+        vocab_richness,
+        speaking_rate_norm,
+        1 - filler_ratio  # Lower filler ratio is better
+    ]
+
+    # Convert to radians before closing the loop, so angles and values stay the same length
+    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
+
+    # Complete the loop for the radar chart
+    values += values[:1]
+    angles += angles[:1]
+
+    ax.plot(angles, values, linewidth=2, linestyle='solid')
+    ax.fill(angles, values, alpha=0.25)
+    ax.set_yticklabels([])
+    ax.set_xticks(angles[:-1])
+    ax.set_xticklabels(categories)
+    ax.grid(True)
+    plt.title('Speaking Performance Metrics', size=15, color='navy', y=1.1)
+
+    # Create detailed analysis text
+    processing_time = time.time() - start_time
+    detailed_analysis = f"""
+## Detailed Speech Analysis
+
+**Processing Time:** {processing_time:.2f} seconds
+**Audio Duration:** {duration:.2f} seconds

+### Metrics:
+- **Grammar Score:** {confidence:.2f} ({label})
+- **Fluency Score:** {fluency_score:.2f}
+- **Speaking Rate:** {speaking_rate:.1f} words per minute
+- **Vocabulary Richness:** {vocab_richness:.2f} (higher is better)
+- **Filler Words:** {filler_count} occurrences ({filler_ratio:.1%} of speech)
+- **Avg Words Per Sentence:** {avg_words:.1f}
+- **Sentiment:** {sentiment} ({sentiment_score:.2f})
+
+### Word Types Used:
+{', '.join([f"{k}: {v}" for k, v in sorted(pos_counts.items(), key=lambda x: x[1], reverse=True)[:5]])}
+"""
+
+    # Return values in the same order as the outputs list wired to the click handler below
+    return transcription, grammar_score, corrected, feedback, fig, detailed_analysis
+
+# Create theme
+theme = gr.themes.Soft(
+    primary_hue="blue",
+    secondary_hue="indigo",
+).set(
+    button_primary_background_fill="*primary_500",
+    button_primary_background_fill_hover="*primary_600",
+    button_primary_text_color="white",
+    block_title_text_weight="600",
+    block_border_width="2px",
+    block_shadow="0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1)",
 )

+with gr.Blocks(theme=theme, css="""
+    .container { max-width: 1000px; margin: auto; }
+    .header { text-align: center; margin-bottom: 20px; }
+    .header h1 { color: #1e40af; font-size: 2.5rem; }
+    .header p { color: #6b7280; font-size: 1.1rem; }
+    .footer { text-align: center; margin-top: 30px; color: #6b7280; }
+    .tips-box { background-color: #f0f9ff; border-radius: 10px; padding: 15px; margin: 10px 0; }
+    .score-card { border: 2px solid #dbeafe; border-radius: 10px; padding: 10px; }
+""") as demo:
+    gr.HTML("""
+    <div class="header">
+        <h1>🎙️ Advanced English Speaking Assessment</h1>
+        <p>Record or upload your speech to receive comprehensive feedback on your English speaking skills</p>
+    </div>
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
+                type="filepath",
+                label="🎤 Speak or Upload Audio"
+            )
+
+            with gr.Accordion("Speaking Tips", open=False):
+                gr.HTML("""
+                <div class="tips-box">
+                    <h4>Tips for Better Results:</h4>
+                    <ul>
+                        <li>Speak clearly and at a moderate pace</li>
+                        <li>Minimize background noise</li>
+                        <li>Try to speak for at least 20-30 seconds</li>
+                        <li>Avoid filler words like "um", "uh", "like"</li>
+                        <li>Practice with both prepared and impromptu topics</li>
+                    </ul>
+                </div>
+                """)
+
+            submit_btn = gr.Button("Analyze Speech", variant="primary")
+
+    with gr.Row():
+        with gr.Column():
+            transcription_output = gr.Textbox(label="📝 Transcription", lines=3)
+            corrected_output = gr.Textbox(label="✍️ Grammar Correction", lines=3)
+            grammar_score_output = gr.Textbox(label="✅ Grammar Score")
+
+    with gr.Row():
+        with gr.Column():
+            metrics_chart = gr.Plot(label="Performance Metrics")
+        with gr.Column():
+            feedback_output = gr.Textbox(label="💬 Feedback", lines=8)
+
+    with gr.Accordion("Detailed Analysis", open=False):
+        detailed_analysis = gr.Markdown()
+
+    gr.HTML("""
+    <div class="footer">
+        <p>This tool provides an assessment of your spoken English. For professional evaluation, consult a qualified language instructor.</p>
+    </div>
+    """)
+
+    submit_btn.click(
+        fn=process_audio,
+        inputs=[audio_input],
+        # Outputs must be components, listed in the same order as process_audio's return values
+        outputs=[
+            transcription_output,
+            grammar_score_output,
+            corrected_output,
+            feedback_output,
+            metrics_chart,
+            detailed_analysis
+        ]
+    )

+if __name__ == "__main__":
+    demo.launch()