import gradio as gr
import speech_recognition as sr
from gtts import gTTS
import os
import tempfile
import google.generativeai as genai
from langdetect import detect
from pydub import AudioSegment
import numpy as np
# ---------- Configure Gemini ----------
def configure_genai(api_key: str):
    try:
        genai.configure(api_key=api_key)
        return genai.GenerativeModel("gemini-1.5-flash"), None
    except Exception as e:
        return None, f"Failed to configure Generative AI: {e}"
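# configure_genai returns a (model, error) pair so callers can surface
# configuration failures in the chat UI instead of raising, e.g.:
#   model, err = configure_genai(api_key)
#   if err: ...  # show err as an assistant message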
# ---------- Speech to Text ----------
def recognize_audio(audio_path: str) -> str:
    r = sr.Recognizer()
    try:
        with sr.AudioFile(audio_path) as source:
            audio_data = r.record(source)
        return r.recognize_google(audio_data, language="ur-PK")
    except sr.UnknownValueError:
        return "Sorry, I could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
    except Exception as e:
        return f"Error during recognition: {e}"
# ---------- LLM Response ----------
def generate_response(model, history_msgs):
    try:
        resp = model.generate_content(history_msgs)
        return resp.text
    except Exception as e:
        return f"Error generating response: {e}"
# ---------- TTS → Autoplay Value ----------
def tts_to_autoplay(text: str, lang: str = "ur"):
    """
    Convert text → MP3 (gTTS) → (sample_rate, float32 NumPy array)
    for Gradio Audio autoplay.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        gTTS(text=text, lang=lang).save(fp.name)
        mp3_path = fp.name
    seg = AudioSegment.from_file(mp3_path)
    os.remove(mp3_path)  # Clean up the temporary MP3 once it is decoded
    seg = seg.set_channels(1).set_frame_rate(16000)
    samples = np.array(seg.get_array_of_samples()).astype(np.float32)
    # Normalize to [-1.0, 1.0] based on the sample width (16-bit -> 32768)
    max_val = float(1 << (8 * seg.sample_width - 1))
    samples = samples / max_val
    return seg.frame_rate, samples
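# Example (assumes lang is a code gTTS supports, such as "ur" or "en"):
#   rate, samples = tts_to_autoplay("ہیلو", lang="ur")
#   # -> (16000, float32 array in [-1.0, 1.0])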
# ---------- Text Chat Logic ----------
def chat_with_model(message, history, api_key):
    history = history or []
    if not api_key:
        return history + [{"role": "assistant", "content": "Please enter your Gemini API key."}], ""
    model, err = configure_genai(api_key)
    if err:
        return history + [{"role": "assistant", "content": err}], ""
    history.append({"role": "user", "content": message})
    # Gemini rejects the role "assistant", so map it to "model" before
    # sending the history to the API.
    messages = [
        {"role": "model" if m["role"] == "assistant" else m["role"], "parts": [m["content"]]}
        for m in history
    ]
    response = generate_response(model, messages)
    history.append({"role": "assistant", "content": response})
    return history, ""  # Clear the input box
# ---------- Voice Chat Logic ----------
def voice_chat_with_model(audio_path, history, api_key):
    history = history or []
    if not api_key:
        return history + [{"role": "assistant", "content": "Please enter your Gemini API key."}], None, None
    model, err = configure_genai(api_key)
    if err:
        return history + [{"role": "assistant", "content": err}], None, None
    if audio_path is None:
        # Clearing the recorder programmatically re-fires .change with None;
        # treat that as a no-op so the playback audio is not reset.
        return history, gr.update(), gr.update()
    # ASR
    user_text = recognize_audio(audio_path)
    if user_text.startswith(("Sorry", "Could not request", "Error")):
        return history + [{"role": "assistant", "content": user_text}], None, None
    # Update chat
    history.append({"role": "user", "content": user_text})
    # LLM (map Gradio's "assistant" role to Gemini's "model" role)
    messages = [
        {"role": "model" if m["role"] == "assistant" else m["role"], "parts": [m["content"]]}
        for m in history
    ]
    response = generate_response(model, messages)
    # Detect the response language for TTS
    try:
        lang = detect(response)
    except Exception:
        lang = "en"
    try:
        autoplay_audio = tts_to_autoplay(response, lang=lang)
    except Exception:
        # langdetect can return codes gTTS does not support; fall back to English
        autoplay_audio = tts_to_autoplay(response, lang="en")
    history.append({"role": "assistant", "content": response})
    return history, autoplay_audio, gr.update(value=None)  # Reset the recorder
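# The three return values map onto [voice_chatbot, autoplay_player, record_btn]
# in the .change() wiring below; the audio_path guard above absorbs the extra
# .change event fired when the recorder is cleared.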
# ---------- UI ----------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# 🎤 Urdu/English Voice Assistant")
            with gr.Accordion("⚙️ Settings", open=False):
                api_key_input = gr.Textbox(
                    label="Gemini API Key",
                    type="password",
                    placeholder="Enter your Gemini API key"
                )
        with gr.Column(scale=2):
            # Chat Mode
            with gr.Tab("💬 Chat Mode"):
                chatbot = gr.Chatbot(type="messages")
                msg = gr.Textbox(label="Type your message", placeholder="Say something...")
                clear = gr.Button("Clear Chat")
                msg.submit(chat_with_model, [msg, chatbot, api_key_input], [chatbot, msg])
                clear.click(lambda: [], None, chatbot, queue=False)
            # Voice Mode
            with gr.Tab("🎤 Voice Mode"):
                voice_chatbot = gr.Chatbot(type="messages")
                record_btn = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Record and Send")
                voice_clear = gr.Button("Clear Voice Chat")
                # Autoplay player with a visible pause/play control
                autoplay_player = gr.Audio(label="🔊 AI Response Playback (Pause/Play)", autoplay=True)
                record_btn.change(
                    voice_chat_with_model,
                    [record_btn, voice_chatbot, api_key_input],
                    [voice_chatbot, autoplay_player, record_btn],
                )
                voice_clear.click(lambda: [], None, voice_chatbot, queue=False)

demo.launch()