import os
import tempfile

import gradio as gr
import speech_recognition as sr
import numpy as np
import google.generativeai as genai
from gtts import gTTS
from langdetect import detect
from pydub import AudioSegment


# ---------- Configure Gemini ----------
def configure_genai(api_key: str):
    """Return (model, error); exactly one of the two is None."""
    try:
        genai.configure(api_key=api_key)
        return genai.GenerativeModel("gemini-1.5-flash"), None
    except Exception as e:
        return None, f"Failed to configure Generative AI: {e}"


# ---------- Speech to Text ----------
def recognize_audio(audio_path: str) -> str:
    r = sr.Recognizer()
    try:
        with sr.AudioFile(audio_path) as source:
            audio_data = r.record(source)
        # Urdu (Pakistan); Google's recognizer copes reasonably with mixed-in English.
        return r.recognize_google(audio_data, language="ur-PK")
    except sr.UnknownValueError:
        return "Sorry, I could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
    except Exception as e:
        return f"Error during recognition: {e}"


# ---------- LLM Response ----------
def generate_response(model, history_msgs):
    try:
        resp = model.generate_content(history_msgs)
        return resp.text
    except Exception as e:
        return f"Error generating response: {e}"


# ---------- TTS → Autoplay Value ----------
def tts_to_autoplay(text: str, lang: str = "ur"):
    """
    Convert text → MP3 (gTTS) → (sample_rate, float32 samples) for Gradio Audio autoplay.
    """
    # Create the temp file, then write after the handle is closed (safer on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        mp3_path = fp.name
    gTTS(text=text, lang=lang).save(mp3_path)

    seg = AudioSegment.from_file(mp3_path)  # MP3 decoding requires ffmpeg
    os.remove(mp3_path)

    seg = seg.set_channels(1).set_frame_rate(16000)
    samples = np.array(seg.get_array_of_samples()).astype(np.float32)
    # Scale integer PCM to [-1.0, 1.0] (e.g. 16-bit audio divides by 32768).
    max_val = float(1 << (8 * seg.sample_width - 1))
    samples = samples / max_val
    return seg.frame_rate, samples


# ---------- Text Chat Logic ----------
def chat_with_model(message, history, api_key):
    history = history or []
    if not api_key:
        return history + [{"role": "assistant", "content": "Please enter your Gemini API key."}], ""
    model, err = configure_genai(api_key)
    if err:
        return history + [{"role": "assistant", "content": err}], ""

    history.append({"role": "user", "content": message})
    # Gradio's "messages" format uses the role "assistant"; Gemini only accepts
    # "user" and "model", so map the role when building the request.
    messages = [
        {"role": "model" if m["role"] == "assistant" else "user", "parts": [m["content"]]}
        for m in history
    ]
    response = generate_response(model, messages)
    history.append({"role": "assistant", "content": response})
    return history, ""  # Clear the input box


# ---------- Voice Chat Logic ----------
def voice_chat_with_model(audio_path, history, api_key):
    history = history or []

    # Resetting the recorder below re-fires .change with None; return no-op
    # updates here so that second call does not cut off the playing response.
    if audio_path is None:
        return history, gr.update(), gr.update()

    if not api_key:
        return history + [{"role": "assistant", "content": "Please enter your Gemini API key."}], gr.update(), gr.update(value=None)
    model, err = configure_genai(api_key)
    if err:
        return history + [{"role": "assistant", "content": err}], gr.update(), gr.update(value=None)

    # ASR
    user_text = recognize_audio(audio_path)
    if user_text.startswith(("Sorry", "Could not request", "Error")):
        return history + [{"role": "assistant", "content": user_text}], gr.update(), gr.update(value=None)

    # Update chat
    history.append({"role": "user", "content": user_text})

    # LLM (map Gradio's "assistant" role to Gemini's "model")
    messages = [
        {"role": "model" if m["role"] == "assistant" else "user", "parts": [m["content"]]}
        for m in history
    ]
    response = generate_response(model, messages)

    # Detect language for TTS; fall back to the app's two supported voices,
    # since gTTS rejects language codes it does not know.
    try:
        lang = detect(response)
    except Exception:
        lang = "en"
    if lang not in ("ur", "en"):
        lang = "en"

    autoplay_audio = tts_to_autoplay(response, lang=lang)
    history.append({"role": "assistant", "content": response})
    return history, autoplay_audio, gr.update(value=None)  # Reset the recorder


# ---------- UI ----------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# 🤖 Urdu/English Voice Assistant")
            with gr.Accordion("⚙️ Settings", open=False):
                api_key_input = gr.Textbox(
                    label="Gemini API Key",
                    type="password",
                    placeholder="Enter your Gemini API key",
                )

        with gr.Column(scale=2):
            # Chat Mode
            with gr.Tab("💬 Chat Mode"):
                chatbot = gr.Chatbot(type="messages")
                msg = gr.Textbox(label="Type your message", placeholder="Say something...")
                clear = gr.Button("Clear Chat")

                msg.submit(chat_with_model, [msg, chatbot, api_key_input], [chatbot, msg])
                clear.click(lambda: [], None, chatbot, queue=False)

            # Voice Mode
            with gr.Tab("🎤 Voice Mode"):
                voice_chatbot = gr.Chatbot(type="messages")
                record_btn = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Record and Send")
                voice_clear = gr.Button("Clear Voice Chat")
                # Autoplay player with visible pause/play controls
                autoplay_player = gr.Audio(label="🔊 AI Response Playback (Pause/Play)", autoplay=True)

                record_btn.change(
                    voice_chat_with_model,
                    [record_btn, voice_chatbot, api_key_input],
                    [voice_chatbot, autoplay_player, record_btn],
                )
                voice_clear.click(lambda: [], None, voice_chatbot, queue=False)

demo.launch()
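
# Setup note (assumed pip package names, inferred from the imports above):
#   pip install gradio SpeechRecognition gTTS google-generativeai langdetect pydub numpy
# pydub's MP3 decoding in tts_to_autoplay additionally expects the ffmpeg
# binary to be available on PATH.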