# voice-chatbot / app.py
# (Hugging Face Spaces page header captured along with the source — author
#  Nomi78600, commit d1664d5 "updateds" — kept as a comment so the module parses.)
import os
import tempfile

import google.generativeai as genai
import gradio as gr
import numpy as np
import speech_recognition as sr
from gtts import gTTS
from langdetect import detect
from pydub import AudioSegment
# ---------- Configure Gemini ----------
def configure_genai(api_key: str):
    """Initialise the Gemini SDK with *api_key*.

    Returns:
        (model, None) on success, or (None, error_message) when the SDK
        raises during configuration or model construction.
    """
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel("gemini-1.5-flash")
    except Exception as exc:
        return None, f"Failed to configure Generative AI: {exc}"
    return model, None
# ---------- Speech to Text ----------
def recognize_audio(audio_path: str) -> str:
    """Transcribe the audio file at *audio_path* (Urdu locale, Google ASR).

    Failures are reported as sentinel strings ("Sorry...", "Could not
    request...", "Error...") rather than exceptions — callers dispatch on
    these prefixes.
    """
    recognizer = sr.Recognizer()
    # Load the whole clip first; any read/format problem maps to the generic
    # "Error during recognition" sentinel, same as the recognition phase.
    try:
        with sr.AudioFile(audio_path) as source:
            captured = recognizer.record(source)
    except Exception as e:
        return f"Error during recognition: {e}"
    try:
        return recognizer.recognize_google(captured, language="ur-PK")
    except sr.UnknownValueError:
        return "Sorry, I could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
    except Exception as e:
        return f"Error during recognition: {e}"
# ---------- LLM Response ----------
def generate_response(model, history_msgs):
try:
resp = model.generate_content(history_msgs)
return resp.text
except Exception as e:
return f"Error generating response: {e}"
# ---------- TTS β†’ Autoplay Value ----------
def tts_to_autoplay(text: str, lang: str = "ur"):
"""
Convert text β†’ MP3 β†’ NumPy float32 for Gradio Audio autoplay.
"""
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
gTTS(text=text, lang=lang).save(fp.name)
mp3_path = fp.name
seg = AudioSegment.from_file(mp3_path)
seg = seg.set_channels(1).set_frame_rate(16000)
samples = np.array(seg.get_array_of_samples()).astype(np.float32)
max_val = float(1 << (8 * seg.sample_width - 1))
samples = samples / max_val
return seg.frame_rate, samples
# ---------- Text Chat Logic ----------
def chat_with_model(message, history, api_key):
    """
    Handle one text turn: append the user message, query Gemini, append the reply.

    Args:
        message: The user's typed message.
        history: Chat history as a list of {"role", "content"} dicts, or None.
        api_key: Gemini API key from the settings box.

    Returns:
        (updated_history, ""): the empty string clears the input textbox.
    """
    # Normalize BEFORE the early returns — the original did `history + [...]`
    # first, which crashes with TypeError when the Chatbot hands back None.
    history = history or []
    if not api_key:
        return history + [{"role": "assistant", "content": "Please enter your Gemini API key."}], ""
    model, err = configure_genai(api_key)
    if err:
        return history + [{"role": "assistant", "content": err}], ""
    history.append({"role": "user", "content": message})
    # Gemini accepts only "user"/"model" roles; the Gradio history stores
    # replies under "assistant", which the API rejects — map it over.
    messages = [
        {"role": "model" if m["role"] == "assistant" else m["role"], "parts": [m["content"]]}
        for m in history
    ]
    response = generate_response(model, messages)
    history.append({"role": "assistant", "content": response})
    return history, ""  # Clear input box
# ---------- Voice Chat Logic ----------
def voice_chat_with_model(audio_path, history, api_key):
    """
    Handle one voice turn: transcribe, query Gemini, synthesize the spoken reply.

    Args:
        audio_path: Path to the recorded clip, or None (recorder reset event).
        history: Chat history as a list of {"role", "content"} dicts, or None.
        api_key: Gemini API key from the settings box.

    Returns:
        (updated_history, autoplay_value_or_None, recorder_update_or_None)
    """
    # Normalize BEFORE the early returns — the original did `history + [...]`
    # first, which crashes with TypeError when the Chatbot hands back None.
    history = history or []
    if not api_key:
        return history + [{"role": "assistant", "content": "Please enter your Gemini API key."}], None, None
    model, err = configure_genai(api_key)
    if err:
        return history + [{"role": "assistant", "content": err}], None, None
    if audio_path is None:
        # Resetting the recorder re-fires this change handler with no audio.
        return history, None, None
    # ASR — recognize_audio signals failure via sentinel string prefixes.
    user_text = recognize_audio(audio_path)
    if user_text.startswith(("Sorry", "Could not request", "Error")):
        return history + [{"role": "assistant", "content": user_text}], None, None
    history.append({"role": "user", "content": user_text})
    # Gemini accepts only "user"/"model" roles; the Gradio history stores
    # replies under "assistant", which the API rejects — map it over.
    messages = [
        {"role": "model" if m["role"] == "assistant" else m["role"], "parts": [m["content"]]}
        for m in history
    ]
    response = generate_response(model, messages)
    # Detect the reply's language for TTS; fall back to English on failure
    # (bare `except:` in the original also swallowed KeyboardInterrupt).
    try:
        lang = detect(response)
    except Exception:
        lang = "en"
    autoplay_audio = tts_to_autoplay(response, lang=lang)
    history.append({"role": "assistant", "content": response})
    return history, autoplay_audio, gr.update(value=None)  # Reset recorder
# ---------- UI ----------
# Two-column layout: settings on the left, chat/voice tabs on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Row():
        # Left column: title plus a collapsible panel holding the API key.
        with gr.Column(scale=1):
            gr.Markdown("# πŸ€– Urdu/English Voice Assistant")
            with gr.Accordion("βš™οΈ Settings", open=False):
                api_key_input = gr.Textbox(
                    label="Gemini API Key",
                    type="password",
                    placeholder="Enter your Gemini API key"
                )
        # Right column: the two interaction modes, one tab each.
        with gr.Column(scale=2):
            # Chat Mode
            with gr.Tab("πŸ’¬ Chat Mode"):
                chatbot = gr.Chatbot(type="messages")
                msg = gr.Textbox(label="Type your message", placeholder="Say something...")
                clear = gr.Button("Clear Chat")
                # Enter submits; the handler returns (history, "") so the
                # second output clears the textbox.
                msg.submit(chat_with_model, [msg, chatbot, api_key_input], [chatbot, msg])
                clear.click(lambda: [], None, chatbot, queue=False)
            # Voice Mode
            with gr.Tab("🎀 Voice Mode"):
                voice_chatbot = gr.Chatbot(type="messages")
                record_btn = gr.Audio(sources=["microphone"], type="filepath", label="🎀 Record and Send")
                voice_clear = gr.Button("Clear Voice Chat")
                # Autoplay player with visible pause/play
                autoplay_player = gr.Audio(label="πŸ”Š AI Response Playback (Pause/Play)", autoplay=True)
                # Fires on every recorder value change. The handler's third
                # output resets the recorder to None, which re-triggers this
                # event with audio_path=None (a no-op in voice_chat_with_model).
                record_btn.change(
                    voice_chat_with_model,
                    [record_btn, voice_chatbot, api_key_input],
                    [voice_chatbot, autoplay_player, record_btn],
                )
                voice_clear.click(lambda: [], None, voice_chatbot, queue=False)
# NOTE(review): launched unconditionally on import (no __main__ guard) —
# standard for Hugging Face Spaces app.py files.
demo.launch()