File size: 17,381 Bytes
8f1aebf
fa95df1
 
 
bd50dab
 
 
ccc23c4
8f1aebf
27ccdcb
fa95df1
bd50dab
 
 
 
 
 
8f1aebf
 
bd50dab
8f1aebf
 
9512144
1575111
 
27ccdcb
 
489e668
 
27ccdcb
1575111
27ccdcb
 
1575111
489e668
 
27ccdcb
 
 
 
 
 
 
 
 
 
489e668
 
27ccdcb
489e668
 
 
 
 
 
 
 
27ccdcb
 
 
 
 
 
 
 
 
 
1575111
 
553e56f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1575111
 
 
 
 
27ccdcb
1575111
 
 
 
 
 
 
553e56f
 
 
 
 
 
 
625bc4b
553e56f
 
 
 
 
 
 
 
1575111
 
 
27ccdcb
 
 
 
 
1575111
 
 
 
 
 
 
27ccdcb
1575111
 
 
 
 
 
27ccdcb
1575111
 
 
 
 
 
 
27ccdcb
1575111
 
27ccdcb
 
 
 
 
 
 
 
 
1575111
 
27ccdcb
 
 
 
 
 
 
 
 
 
 
 
 
 
1575111
 
 
 
27ccdcb
 
 
1575111
27ccdcb
 
1575111
27ccdcb
 
 
 
 
 
1575111
8f1aebf
9512144
8f1aebf
 
 
 
 
 
bd50dab
 
8f1aebf
 
bd50dab
8f1aebf
bd50dab
fa95df1
8f1aebf
 
 
bd50dab
8f1aebf
 
 
bd50dab
8f1aebf
bd50dab
 
8f1aebf
 
 
 
bd50dab
1575111
 
d4e37c6
8f1aebf
d4e37c6
27ccdcb
 
9254af3
27ccdcb
d4e37c6
 
27ccdcb
9254af3
27ccdcb
1575111
45e169e
d4e37c6
a994885
45e169e
bd50dab
a994885
8f1aebf
45e169e
bd50dab
27ccdcb
45e169e
27ccdcb
 
 
 
45e169e
27ccdcb
 
 
45e169e
27ccdcb
 
 
45e169e
27ccdcb
8f1aebf
9254af3
27ccdcb
45e169e
27ccdcb
 
8f1aebf
489e668
45e169e
27ccdcb
 
489e668
 
 
 
 
 
 
 
 
 
 
 
27ccdcb
 
489e668
 
27ccdcb
 
 
 
 
 
 
489e668
 
 
 
 
 
 
 
 
 
 
 
27ccdcb
489e668
 
 
 
 
 
 
 
 
 
 
 
d4e37c6
45e169e
8f1aebf
d4e37c6
45e169e
 
 
 
d4e37c6
 
 
 
 
bd50dab
1575111
27ccdcb
489e668
a994885
d4e37c6
45e169e
d4e37c6
45e169e
 
9254af3
45e169e
d4e37c6
 
 
489e668
45e169e
489e668
45e169e
 
 
 
 
 
 
 
 
 
 
 
9254af3
 
 
 
 
 
 
 
 
45e169e
9254af3
 
 
 
 
 
27ccdcb
9254af3
 
bd50dab
 
45e169e
27ccdcb
d4e37c6
489e668
 
 
45e169e
 
d4e37c6
 
 
 
45e169e
d4e37c6
 
 
 
45e169e
bd50dab
e1aa210
 
5c16342
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
import gradio as gr
import requests
import json
import random
from gradio_client import Client
from dotenv import load_dotenv
import os
import speech_recognition as sr
from pydub import AudioSegment
import re

# Load settings from a .env file (python-dotenv) before reading the environment.
load_dotenv()

# Credentials read from the environment:
#   DEEPSEEK_API_KEY  - bearer token sent to the DeepSeek chat-completions API
#   HUGGINGFACE_TOKEN - token passed to the gradio_client Client below
#   TTS_PASSWORD      - password argument passed to the TTS endpoint
API_KEY = os.getenv("DEEPSEEK_API_KEY")
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
TTS_PASSWORD = os.getenv("TTS_PASSWORD")

# Fail at import time if any of the three values is missing or empty.
if not all([API_KEY, HF_TOKEN, TTS_PASSWORD]):
    raise ValueError("Missing required environment variables!")

# Shared clients, created once at import time and reused by the helpers below.
TTS_CLIENT = Client("KindSynapse/Youssef-Ahmed-Private-Text-To-Speech-Unlimited", hf_token=HF_TOKEN)
recognizer = sr.Recognizer()

# System message that EnglishTutor places first in its chat history.
# It asks the model to answer with a JSON object using the keys 'response',
# 'corrections', 'vocabulary', 'level_assessment', 'encouragement',
# 'context_memory' and 'next_question'; the rest of this file reads the reply
# through exactly those key names, so keep them in sync when editing the text.
MAIN_SYSTEM_PROMPT = {
    "role": "system",
    "content": """You are Sam, an intelligent and proactive English tutor. You drive the conversation and actively engage students. Your responses must be in JSON format with these keys:
        'response': Your main response (keep it conversational and engaging),
        'corrections': ALWAYS provide specific grammar or pronunciation corrections with examples (if none needed, say "Great grammar!"),
        'vocabulary': ALWAYS suggest alternative words/phrases with explanations (if none needed, suggest related vocabulary),
        'level_assessment': Current assessment (beginner/intermediate/advanced),
        'encouragement': A motivating comment,
        'context_memory': Important details about the user,
        'next_question': A follow-up question to keep conversation flowing
        
        IMPORTANT: You MUST always provide corrections and vocabulary suggestions in every response. Even if the student speaks perfectly, provide positive feedback and suggest advanced vocabulary or alternative expressions.
        
        Your personality:
        - Be the conversation driver - ask follow-up questions
        - Show genuine interest in the student's life
        - Provide corrections naturally without stopping the flow
        - Use the student's name frequently
        - Build on previous topics
        - Be encouraging but provide constructive feedback
        - Ask about their day, work, hobbies, culture, goals
        
        Correction guidelines:
        - ALWAYS provide corrections field - even if it's positive feedback
        - ALWAYS provide vocabulary field - suggest alternatives or related words
        - Use format: "Instead of 'X', try saying 'Y'"
        - Give pronunciation tips when needed
        - If no mistakes, say "Excellent grammar!" or "Perfect sentence structure!"
        
        Vocabulary guidelines:
        - ALWAYS suggest vocabulary - even if it's synonyms or advanced alternatives
        - Provide explanations for suggested words
        - Use format: "Instead of 'good', try 'excellent' or 'outstanding'"
        - Suggest topic-related vocabulary
        
        Conversation flow:
        - Start with personal questions (name, country, job, hobbies)
        - Build conversations around their interests
        - Use profession-specific vocabulary
        - Ask about their culture and experiences
        - Keep the conversation natural and flowing
        - Always end with a question to continue the dialogue
        
        Response length: Keep responses conversational (2-3 sentences max for response field)."""
}

# One-off system message used to generate the opening greeting.
# The model is asked for a JSON object with a single 'greeting' key, which
# EnglishTutor.get_welcome_message() reads back.
# The em dashes below were previously stored as mojibake ("β€”"), which the
# model could copy into the greeting; they are now real U+2014 characters.
WELCOME_PROMPT = {
    "role": "system",
    "content": """Create a heartfelt welcome message that:
        1. Introduces you as Sam, an enthusiastic and friendly English tutor who’s excited to guide them
        2. Kindly asks for their name and where they’re from in a natural conversational way
        3. Expresses genuine excitement about helping them grow
        
        Return the message in JSON format with the key 'greeting'.
        Make it feel personal, warm, and inviting — like a tutor who truly cares. Keep it within 2 sentences.
        
        Example: 
        {"greeting": "Hi there! I'm Sam, your friendly English tutor — so glad you're here! What's your name and where are you from?"}
        """
}


class EnglishTutor:
    """Conversation state and DeepSeek chat client for the tutor persona "Sam".

    Attributes:
        chat_history: Messages sent to the chat-completions endpoint on every
            turn; it starts with ``MAIN_SYSTEM_PROMPT``.
        user_info: Facts collected about the student (name, level, interests,
            country, profession, goals). They are appended to each user
            message as context for the model.
    """

    _API_URL = "https://api.deepseek.com/v1/chat/completions"

    # Seconds to wait for the API; without a timeout a stalled connection
    # would block the UI callback indefinitely.
    _REQUEST_TIMEOUT = 60

    # Decorative emoji stripped before speech synthesis: direct hit, glowing
    # star, sparkles, dizzy, microphone, robot. Written as escapes so the
    # character class cannot be corrupted by a wrong file encoding.
    _TTS_EMOJI_PATTERN = (
        "[\U0001F3AF\U0001F31F\u2728\U0001F4AB\U0001F3A4\U0001F916]"
    )

    def __init__(self):
        self.chat_history = [MAIN_SYSTEM_PROMPT]
        self.user_info = {
            "name": None,
            "level": "beginner",
            "interests": [],
            "country": None,
            "profession": None,
            "goals": None
        }

    def _request_json(self, messages, temperature):
        """POST ``messages`` to the chat endpoint and return the decoded JSON reply.

        Raises:
            Exception: on transport errors, non-2xx HTTP status, an unexpected
                response shape, or reply content that is not valid JSON.
        """
        response = requests.post(
            self._API_URL,
            headers={"Authorization": f"Bearer {API_KEY}"},
            json={
                "model": "deepseek-chat",
                "messages": messages,
                "temperature": temperature,
                "response_format": {"type": "json_object"}
            },
            timeout=self._REQUEST_TIMEOUT,
        )
        # Surface HTTP errors (bad key, rate limit, ...) with a clear message
        # instead of a KeyError on the missing "choices" field.
        response.raise_for_status()
        return json.loads(response.json()["choices"][0]["message"]["content"])

    def get_welcome_message(self):
        """Return a model-generated greeting, or a fixed fallback on any failure."""
        try:
            welcome_json = self._request_json(
                [WELCOME_PROMPT], random.uniform(0.5, 1.0)
            )
            greeting = welcome_json["greeting"]
            if not isinstance(greeting, str) or not greeting.strip():
                raise ValueError("greeting is missing or empty")
            return greeting
        except Exception as e:
            print(f"Error in welcome message: {str(e)}")
            return "Hi! I'm Sam, your English tutor. What's your name and where are you from?"

    def get_bot_response(self, user_message):
        """Send ``user_message`` to the model and return its structured reply.

        The known ``user_info`` is appended to the message as context. On
        success both the user turn and the assistant turn are stored in
        ``chat_history`` and ``user_info`` is refreshed from the reply.

        Returns:
            dict: The model's JSON reply, or a fallback dict with the same keys
            if anything goes wrong. This method never raises.
        """
        # Add user context to the message
        context_info = f"User info: {self.user_info}"
        enhanced_message = f"{user_message}\n\n[Context: {context_info}]"
        user_turn = {"role": "user", "content": enhanced_message}
        self.chat_history.append(user_turn)

        try:
            bot_response = self._request_json(
                self.chat_history, random.uniform(0.8, 1.0)
            )
            if not isinstance(bot_response, dict):
                raise ValueError("model reply is not a JSON object")

            # Update user info
            if bot_response.get("level_assessment"):
                self.user_info["level"] = bot_response["level_assessment"]
            if "context_memory" in bot_response:
                self._update_user_info(bot_response["context_memory"])

            self.chat_history.append(
                {"role": "assistant", "content": json.dumps(bot_response)}
            )
            return bot_response
        except Exception as e:
            # Roll back the unanswered user turn so retries do not pile up
            # consecutive user messages in the history sent to the model.
            if self.chat_history and self.chat_history[-1] is user_turn:
                self.chat_history.pop()
            print(f"Error getting bot response: {str(e)}")
            return {
                "response": "I apologize, but I couldn't process that properly. Could you try again?",
                "corrections": "",
                "vocabulary": "",
                "level_assessment": "beginner",
                "encouragement": "Don't worry, let's keep practicing!",
                "context_memory": "",
                "next_question": "What would you like to talk about?"
            }

    def _update_user_info(self, context_memory):
        """Merge facts from the model's ``context_memory`` into ``user_info``.

        ``context_memory`` may be free text (name and country are extracted
        heuristically) or a dict (known keys are copied). Other types are
        ignored.
        """
        if isinstance(context_memory, str):
            # Case-insensitive so "Name: Ali" matches, and an optional "is"
            # is skipped so "name is Sara" yields "Sara" rather than "is".
            name_match = re.search(
                r"name(?:\s+is)?[:\s]+([A-Za-z]+)", context_memory, re.IGNORECASE
            )
            if name_match:
                self.user_info["name"] = name_match.group(1)

            country_match = re.search(
                r"(?:from|country)(?:\s+is)?[:\s]+([A-Za-z\s]+)",
                context_memory,
                re.IGNORECASE,
            )
            if country_match:
                self.user_info["country"] = country_match.group(1).strip()

        elif isinstance(context_memory, dict):
            for key in self.user_info:
                value = context_memory.get(key)
                # Do not let a null/empty field erase something already known.
                if value in (None, "", [], {}):
                    continue
                self.user_info[key] = value

    def clean_text_for_tts(self, text):
        """Return ``text`` prepared for speech synthesis.

        Strips the decorative emoji listed in ``_TTS_EMOJI_PATTERN``, collapses
        whitespace runs to single spaces, and drops the first word when it is
        immediately repeated (e.g. "Hi hi there"). ``None`` yields "".
        """
        if text is None:
            return ""
        text = re.sub(self._TTS_EMOJI_PATTERN, '', str(text))

        # Remove extra spaces and newlines
        text = re.sub(r'\s+', ' ', text).strip()

        # Remove duplicate words at the beginning
        words = text.split()
        if len(words) > 1 and words[0].lower() == words[1].lower():
            text = ' '.join(words[1:])

        return text

def convert_audio_to_text(audio_path):
    """Transcribe an audio file to English text.

    Non-WAV input is first converted to a temporary ``<audio_path>.wav`` file
    with pydub; that temporary file is removed again before returning.

    Args:
        audio_path: Path of the recorded audio file.

    Returns:
        The recognized text, or ``None`` if the path is empty or conversion /
        recognition fails for any reason.
    """
    if not audio_path:
        return None

    temp_wav_path = None
    try:
        # Compare case-insensitively so "CLIP.WAV" is not needlessly re-encoded.
        if not audio_path.lower().endswith('.wav'):
            segment = AudioSegment.from_file(audio_path)
            temp_wav_path = audio_path + '.wav'
            segment.export(temp_wav_path, format='wav')
            audio_path = temp_wav_path

        with sr.AudioFile(audio_path) as source:
            recorded = recognizer.record(source)
            return recognizer.recognize_google(recorded, language='en-US')
    except Exception as e:
        print(f"Error in speech recognition: {str(e)}")
        return None
    finally:
        # Best-effort cleanup: one converted copy per recording would
        # otherwise accumulate on disk.
        if temp_wav_path and os.path.exists(temp_wav_path):
            try:
                os.remove(temp_wav_path)
            except OSError as e:
                print(f"Could not remove temporary file {temp_wav_path}: {str(e)}")

def text_to_speech(text):
    """Synthesize ``text`` through the remote TTS client.

    Args:
        text: The sentence(s) to speak.

    Returns:
        The first element of the client's result when it is a list or tuple,
        otherwise the result itself; ``None`` when ``text`` is empty/blank or
        the remote call fails.
    """
    # Nothing to say: skip the remote round-trip instead of sending an
    # empty prompt to the service.
    if not text or not str(text).strip():
        return None

    try:
        result = TTS_CLIENT.predict(
            password=TTS_PASSWORD,
            prompt=text,
            voice="coral",
            emotion="Warm and friendly",
            use_random_seed=True,
            specific_seed=12345,
            api_name="/text_to_speech_app"
        )
        return result[0] if isinstance(result, (list, tuple)) else result
    except Exception as e:
        print(f"Error in text to speech: {str(e)}")
        return None

# Module-level tutor used by initialize_chat() and process_audio();
# clear_chat() rebinds it to a fresh instance.
# NOTE(review): this single instance looks shared by every browser session of
# the app - confirm whether per-session state is required.
tutor = EnglishTutor()

def initialize_chat():
    """Produce the opening state of the UI.

    Returns:
        tuple: ``(history, welcome_audio, transcript, corrections)`` where
        ``history`` holds the single assistant greeting, ``welcome_audio`` is
        the synthesized greeting (``None`` on failure), ``transcript`` is the
        greeting prefixed with "🤖 Sam: ", and ``corrections`` is "".
    """
    try:
        welcome = tutor.get_welcome_message()
        welcome_audio = text_to_speech(tutor.clean_text_for_tts(welcome))
    except Exception as e:
        print(f"Error initializing chat: {str(e)}")
        welcome = "Hi! I'm Sam, your English tutor. What's your name and where are you from?"
        welcome_audio = None

    history = [{"role": "assistant", "content": welcome}]
    # The robot prefix was previously stored as mojibake ("πŸ€–").
    return history, welcome_audio, f"🤖 Sam: {welcome}", ""

def _feedback_text(value):
    """Render a feedback field (str, list or dict) as display text."""
    if isinstance(value, dict):
        return "\n".join(f"• '{k}' → '{v}'" for k, v in value.items())
    if isinstance(value, (list, tuple)):
        return "\n".join(f"• {item}" for item in value)
    return str(value)


def _format_feedback(bot_response, user_info):
    """Build the "Learning Corner" text for one tutor reply."""
    sections = []

    level = bot_response.get("level_assessment")
    if level:
        # str() guards against a non-string level coming back from the model.
        sections.append(f"📊 **Current Level:** {str(level).title()}")

    corrections_value = bot_response.get("corrections")
    if corrections_value and str(corrections_value).strip():
        sections.append(
            f"✍️ **Grammar Corrections:**\n{_feedback_text(corrections_value)}"
        )

    vocabulary_value = bot_response.get("vocabulary")
    if vocabulary_value and str(vocabulary_value).strip():
        sections.append(
            f"📚 **Vocabulary Suggestions:**\n{_feedback_text(vocabulary_value)}"
        )

    encouragement = bot_response.get("encouragement")
    if encouragement:
        sections.append(f"💡 **Encouragement:**\n{encouragement}")

    # The profile is only shown once the student's name is known.
    if user_info.get("name"):
        profile = [
            f"{label}: {user_info[key]}"
            for label, key in (("Name", "name"), ("Country", "country"), ("Level", "level"))
            if user_info.get(key)
        ]
        sections.append(f"👤 **Your Profile:**\n{' | '.join(profile)}")

    if not sections:
        sections.append("🎯 **Feedback:** Keep practicing! Sam is analyzing your English and will provide feedback soon.")

    return "\n\n".join(sections)


def process_audio(audio, history, transcript, corrections):
    """Handle one recorded utterance: transcribe, query the tutor, update the UI.

    Args:
        audio: File path of the recording, or ``None`` when the input is empty.
        history: Current chatbot messages (list of role/content dicts) or ``None``.
        transcript: Current text of the transcript box.
        corrections: Current text of the feedback box.

    Returns:
        tuple: ``(history, audio_response, transcript, corrections)``. When
        there is no audio, transcription fails, or any step raises, the three
        text/history inputs are returned unchanged with ``None`` as the audio.
    """
    try:
        if audio is None:
            return history, None, transcript, corrections

        user_message = convert_audio_to_text(audio)
        if not user_message:
            return history, None, transcript, corrections

        bot_response = tutor.get_bot_response(user_message)

        # Spoken reply: main response, then the follow-up question, then the
        # encouragement, separated by single spaces.
        spoken_parts = [str(bot_response.get("response") or "")]
        for key in ("next_question", "encouragement"):
            if bot_response.get(key):
                spoken_parts.append(str(bot_response[key]))
        main_response = " ".join(spoken_parts)

        audio_response = text_to_speech(tutor.clean_text_for_tts(main_response))

        # Build a new list instead of mutating the input so the error path
        # below still returns the untouched history.
        updated_history = list(history or [])
        updated_history.append({"role": "user", "content": user_message})
        updated_history.append({"role": "assistant", "content": main_response})

        turn_text = f"🎤 You: {user_message}\n🤖 Sam: {main_response}"
        new_transcript = f"{transcript}\n\n{turn_text}" if transcript else turn_text

        feedback = _format_feedback(bot_response, tutor.user_info)
        # Label each feedback entry with (at most) the first 30 characters of
        # what the student said.
        label = f"[{user_message[:30]}...]" if len(user_message) > 30 else f"[{user_message}]"
        entry = f"--- Latest Response {label} ---\n{feedback}"
        new_corrections = f"{corrections}\n\n{entry}" if corrections else entry

        return updated_history, audio_response, new_transcript, new_corrections
    except Exception as e:
        print(f"Error in process_audio: {str(e)}")
        return history, None, transcript, corrections

def submit_recording(audio, history, transcript, corrections):
    """Forward a submitted recording to ``process_audio`` and return its result."""
    outputs = process_audio(audio, history, transcript, corrections)
    return outputs

def clear_chat():
    """Start over with a brand-new tutor.

    Rebinds the module-level ``tutor`` to a fresh ``EnglishTutor`` and returns
    whatever ``initialize_chat()`` produces for the new conversation.
    """
    global tutor
    fresh_tutor = EnglishTutor()
    tutor = fresh_tutor
    return initialize_chat()

# ---------------------------------------------------------------------------
# Gradio UI: chat + audio on the left, transcript and feedback on the right.
# The emoji in headings and labels were previously stored as mojibake
# (e.g. "πŸŽ“"); they are restored to the intended characters here.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎓 English Learning Assistant with Sam")
    gr.Markdown("🎤 **Record your voice** - Sam will automatically respond when you finish recording and help improve your English!")

    with gr.Row():
        with gr.Column(scale=3):
            # NOTE(review): avatar_images is given emoji strings here; confirm
            # against the Gradio Chatbot docs whether it expects image
            # paths/URLs instead.
            chatbot = gr.Chatbot(
                height=500,
                show_label=False,
                type='messages',
                avatar_images=("🎤", "🤖")
            )

            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(
                        label="🎙️ Record your voice (auto-submits when finished)",
                        type="filepath",
                        show_label=True
                    )
                with gr.Column(scale=1):
                    audio_output = gr.Audio(
                        label="🔊 Sam's response",
                        type="filepath",
                        show_label=True,
                        autoplay=True
                    )

        with gr.Column(scale=2):
            gr.Markdown("### 📝 Live Transcript")
            transcript_display = gr.Textbox(
                lines=10,
                max_lines=10,
                show_label=False,
                interactive=False,
                placeholder="Your conversation will appear here...",
                container=True
            )

            gr.Markdown("### 📚 Learning Corner")
            corrections_display = gr.Textbox(
                lines=8,
                max_lines=8,
                show_label=False,
                interactive=False,
                placeholder="Grammar corrections, vocabulary suggestions, and level assessment will appear here...",
                container=True
            )

    with gr.Row():
        clear_btn = gr.Button("🔄 Start New Conversation", variant="secondary", size="lg")
        gr.Markdown("💡 **Tip**: Sam will actively guide the conversation and provide personalized feedback!")

    # Auto-submit when audio is recorded
    audio_input.change(
        process_audio,
        inputs=[audio_input, chatbot, transcript_display, corrections_display],
        outputs=[chatbot, audio_output, transcript_display, corrections_display]
    )

    # Reset the tutor and show a new greeting.
    clear_btn.click(
        clear_chat,
        outputs=[chatbot, audio_output, transcript_display, corrections_display]
    )

    # Greet the student as soon as the page loads.
    demo.load(
        initialize_chat,
        outputs=[chatbot, audio_output, transcript_display, corrections_display]
    )

if __name__ == "__main__":
    demo.launch()