Spaces:
Running
Running
File size: 5,455 Bytes
55a0315 629b8f6 55a0315 9cc19a2 629b8f6 55a0315 9cc19a2 629b8f6 55a0315 629b8f6 9cc19a2 55a0315 9cc19a2 55a0315 9cc19a2 629b8f6 55a0315 d1664d5 629b8f6 9cc19a2 629b8f6 9cc19a2 629b8f6 9cc19a2 629b8f6 9cc19a2 629b8f6 9cc19a2 629b8f6 55a0315 629b8f6 9cc19a2 629b8f6 9cc19a2 55a0315 9cc19a2 55a0315 629b8f6 55a0315 9dcdb56 55a0315 629b8f6 55a0315 9dcdb56 55a0315 9cc19a2 55a0315 b8e97b7 55a0315 629b8f6 55a0315 77cd3ef 9cc19a2 55a0315 9cc19a2 55a0315 d1664d5 6547d10 9cc19a2 55a0315 629b8f6 34898d9 629b8f6 34898d9 9cc19a2 34898d9 7f12b65 34898d9 9cc19a2 9dcdb56 34898d9 9cc19a2 34898d9 7f12b65 9cc19a2 34898d9 d1664d5 3f6bd1b a8bd3b3 629b8f6 a8bd3b3 34898d9 55a0315 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import os
import tempfile

import google.generativeai as genai
import gradio as gr
import numpy as np
import speech_recognition as sr
from gtts import gTTS
from langdetect import detect
from pydub import AudioSegment
# ---------- Configure Gemini ----------
def configure_genai(api_key: str):
    """Initialise the Gemini client and return ``(model, error)``.

    On success the second element is None; on failure the first element is
    None and the second carries a human-readable error string, matching the
    error-as-text convention used throughout this module.
    """
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel("gemini-1.5-flash")
    except Exception as exc:
        return None, f"Failed to configure Generative AI: {exc}"
    return model, None
# ---------- Speech to Text ----------
def recognize_audio(audio_path: str) -> str:
    """Transcribe an audio file via Google's free speech API (Urdu locale).

    Returns the transcript on success; otherwise returns a human-readable
    error string — callers detect failure by these sentinel prefixes
    ("Sorry", "Could not request", "Error").
    """
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_path) as source:
            captured = recognizer.record(source)
        return recognizer.recognize_google(captured, language="ur-PK")
    except sr.UnknownValueError:
        return "Sorry, I could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
    except Exception as e:
        return f"Error during recognition: {e}"
# ---------- LLM Response ----------
def generate_response(model, history_msgs):
    """Send the accumulated conversation to Gemini and return the reply text.

    Any failure (network, API, blocked content) is converted into an error
    string rather than raised, matching this module's error-as-text style.
    """
    try:
        # Keep .text inside the try: it can raise for blocked responses.
        return model.generate_content(history_msgs).text
    except Exception as exc:
        return f"Error generating response: {exc}"
# ---------- TTS -> Autoplay Value ----------
def tts_to_autoplay(text: str, lang: str = "ur"):
    """Synthesize *text* to speech and return it in Gradio autoplay form.

    Pipeline: gTTS -> temporary MP3 -> pydub decode -> mono 16 kHz ->
    float32 NumPy samples normalized to [-1.0, 1.0).

    Args:
        text: The text to speak.
        lang: gTTS language code (default Urdu).

    Returns:
        ``(sample_rate, samples)`` tuple accepted by ``gr.Audio``.
    """
    # gTTS can only save complete files, so go through a named temp path.
    # Close the handle before writing (re-opening an open NamedTemporaryFile
    # fails on Windows), and always remove it afterwards — the original
    # leaked one MP3 per call (delete=False, never unlinked).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        mp3_path = fp.name
    try:
        gTTS(text=text, lang=lang).save(mp3_path)
        seg = AudioSegment.from_file(mp3_path)
    finally:
        os.remove(mp3_path)
    seg = seg.set_channels(1).set_frame_rate(16000)
    samples = np.array(seg.get_array_of_samples()).astype(np.float32)
    # Scale integer PCM by its full-scale value, derived from sample width.
    max_val = float(1 << (8 * seg.sample_width - 1))
    return seg.frame_rate, samples / max_val
# ---------- Text Chat Logic ----------
def chat_with_model(message, history, api_key):
    """Handle one text turn: append the user message, query Gemini, append reply.

    Args:
        message: The user's typed message.
        history: Chatbot history in Gradio "messages" format
            (list of {"role", "content"} dicts), or None on the first turn.
        api_key: Gemini API key from the settings box.

    Returns:
        ``(updated_history, "")`` — the empty string clears the input textbox.
    """
    # Normalise first: the original crashed (None + list) on early returns.
    history = history or []
    if not api_key:
        return history + [{"role": "assistant", "content": "Please enter your Gemini API key."}], ""
    model, err = configure_genai(api_key)
    if err:
        return history + [{"role": "assistant", "content": err}], ""
    history.append({"role": "user", "content": message})
    # Fix: the Gemini API rejects the role "assistant" — it expects "model",
    # so passing stored history verbatim broke every multi-turn conversation.
    messages = [
        {"role": "model" if m["role"] == "assistant" else m["role"], "parts": [m["content"]]}
        for m in history
    ]
    response = generate_response(model, messages)
    history.append({"role": "assistant", "content": response})
    return history, ""  # Clear input box
# ---------- Voice Chat Logic ----------
def voice_chat_with_model(audio_path, history, api_key):
    """Handle one voice turn: ASR -> Gemini -> TTS autoplay audio.

    Args:
        audio_path: Path to the recorded audio file, or None if nothing
            was recorded.
        history: Chatbot history in Gradio "messages" format, or None.
        api_key: Gemini API key from the settings box.

    Returns:
        ``(updated_history, autoplay_audio, recorder_reset)`` where
        autoplay_audio is a (rate, samples) tuple or None, and
        recorder_reset clears the microphone widget.
    """
    # Normalise first: the original crashed (None + list) on early returns.
    history = history or []
    if not api_key:
        return history + [{"role": "assistant", "content": "Please enter your Gemini API key."}], None, None
    if audio_path is None:
        # Nothing recorded — skip model configuration entirely.
        return history, None, None
    model, err = configure_genai(api_key)
    if err:
        return history + [{"role": "assistant", "content": err}], None, None
    # ASR; recognize_audio signals failure via sentinel string prefixes.
    user_text = recognize_audio(audio_path)
    if user_text.startswith(("Sorry", "Could not request", "Error")):
        return history + [{"role": "assistant", "content": user_text}], None, None
    history.append({"role": "user", "content": user_text})
    # Fix: the Gemini API rejects the role "assistant" — it expects "model",
    # so passing stored history verbatim broke every multi-turn conversation.
    messages = [
        {"role": "model" if m["role"] == "assistant" else m["role"], "parts": [m["content"]]}
        for m in history
    ]
    response = generate_response(model, messages)
    # Pick the TTS voice from the reply's detected language; default English.
    try:
        lang = detect(response)
    except Exception:  # Fix: bare except also swallowed SystemExit/KeyboardInterrupt
        lang = "en"
    autoplay_audio = tts_to_autoplay(response, lang=lang)
    history.append({"role": "assistant", "content": response})
    return history, autoplay_audio, gr.update(value=None)  # Reset recorder
# ---------- UI ----------
# Build the two-tab Gradio UI: a typed chat tab and a voice chat tab.
# NOTE(review): several label strings below look mojibake-garbled
# (UTF-8 emoji decoded as Latin-1) — confirm the intended characters.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# π€ Urdu/English Voice Assistant")
            # Settings panel holding the Gemini API key (collapsed by default).
            with gr.Accordion("βοΈ Settings", open=False):
                api_key_input = gr.Textbox(
                    label="Gemini API Key",
                    type="password",
                    placeholder="Enter your Gemini API key"
                )
        with gr.Column(scale=2):
            # Chat Mode
            with gr.Tab("π¬ Chat Mode"):
                chatbot = gr.Chatbot(type="messages")
                msg = gr.Textbox(label="Type your message", placeholder="Say something...")
                clear = gr.Button("Clear Chat")
                # Enter submits: updates the history and clears the textbox.
                msg.submit(chat_with_model, [msg, chatbot, api_key_input], [chatbot, msg])
                clear.click(lambda: [], None, chatbot, queue=False)
            # Voice Mode
            with gr.Tab("π€ Voice Mode"):
                voice_chatbot = gr.Chatbot(type="messages")
                record_btn = gr.Audio(sources=["microphone"], type="filepath", label="π€ Record and Send")
                voice_clear = gr.Button("Clear Voice Chat")
                # Autoplay player with visible pause/play
                autoplay_player = gr.Audio(label="π AI Response Playback (Pause/Play)", autoplay=True)
                # Fires when a recording finishes; third output resets the mic.
                record_btn.change(
                    voice_chat_with_model,
                    [record_btn, voice_chatbot, api_key_input],
                    [voice_chatbot, autoplay_player, record_btn],
                )
                voice_clear.click(lambda: [], None, voice_chatbot, queue=False)

demo.launch()
|