Spaces:
Running
Running
File size: 5,455 Bytes
55a0315 629b8f6 55a0315 9cc19a2 629b8f6 55a0315 9cc19a2 629b8f6 55a0315 629b8f6 9cc19a2 55a0315 9cc19a2 55a0315 9cc19a2 629b8f6 55a0315 d1664d5 629b8f6 9cc19a2 629b8f6 9cc19a2 629b8f6 9cc19a2 629b8f6 9cc19a2 629b8f6 9cc19a2 629b8f6 55a0315 629b8f6 9cc19a2 629b8f6 9cc19a2 55a0315 9cc19a2 55a0315 629b8f6 55a0315 9dcdb56 55a0315 629b8f6 55a0315 9dcdb56 55a0315 9cc19a2 55a0315 b8e97b7 55a0315 629b8f6 55a0315 77cd3ef 9cc19a2 55a0315 9cc19a2 55a0315 d1664d5 6547d10 9cc19a2 55a0315 629b8f6 34898d9 629b8f6 34898d9 9cc19a2 34898d9 7f12b65 34898d9 9cc19a2 9dcdb56 34898d9 9cc19a2 34898d9 7f12b65 9cc19a2 34898d9 d1664d5 3f6bd1b a8bd3b3 629b8f6 a8bd3b3 34898d9 55a0315 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import os
import tempfile

import google.generativeai as genai
import gradio as gr
import numpy as np
import speech_recognition as sr
from gtts import gTTS
from langdetect import detect
from pydub import AudioSegment
# ---------- Configure Gemini ----------
def configure_genai(api_key: str):
    """Initialise the Gemini client and return ``(model, error)``.

    On success the second element is None; on failure the first element is
    None and the second carries a human-readable error string, matching the
    error-as-text convention used throughout this module.
    """
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel("gemini-1.5-flash")
    except Exception as exc:
        return None, f"Failed to configure Generative AI: {exc}"
    return model, None
# ---------- Speech to Text ----------
def recognize_audio(audio_path: str) -> str:
    """Transcribe an audio file via Google's free speech API (Urdu locale).

    Returns the transcript on success; otherwise returns a human-readable
    error string — callers detect failure by these sentinel prefixes
    ("Sorry", "Could not request", "Error").
    """
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_path) as source:
            captured = recognizer.record(source)
        return recognizer.recognize_google(captured, language="ur-PK")
    except sr.UnknownValueError:
        return "Sorry, I could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
    except Exception as e:
        return f"Error during recognition: {e}"
# ---------- LLM Response ----------
def generate_response(model, history_msgs):
    """Send the accumulated conversation to Gemini and return the reply text.

    Any failure (network, API, blocked content) is converted into an error
    string rather than raised, matching this module's error-as-text style.
    """
    try:
        # Keep .text inside the try: it can raise for blocked responses.
        return model.generate_content(history_msgs).text
    except Exception as exc:
        return f"Error generating response: {exc}"
# ---------- TTS -> Autoplay Value ----------
def tts_to_autoplay(text: str, lang: str = "ur"):
    """Synthesize *text* to speech and return it in Gradio autoplay form.

    Pipeline: gTTS -> temporary MP3 -> pydub decode -> mono 16 kHz ->
    float32 NumPy samples normalized to [-1.0, 1.0).

    Args:
        text: The text to speak.
        lang: gTTS language code (default Urdu).

    Returns:
        ``(sample_rate, samples)`` tuple accepted by ``gr.Audio``.
    """
    # gTTS can only save complete files, so go through a named temp path.
    # Close the handle before writing (re-opening an open NamedTemporaryFile
    # fails on Windows), and always remove it afterwards — the original
    # leaked one MP3 per call (delete=False, never unlinked).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        mp3_path = fp.name
    try:
        gTTS(text=text, lang=lang).save(mp3_path)
        seg = AudioSegment.from_file(mp3_path)
    finally:
        os.remove(mp3_path)
    seg = seg.set_channels(1).set_frame_rate(16000)
    samples = np.array(seg.get_array_of_samples()).astype(np.float32)
    # Scale integer PCM by its full-scale value, derived from sample width.
    max_val = float(1 << (8 * seg.sample_width - 1))
    return seg.frame_rate, samples / max_val
# ---------- Text Chat Logic ----------
def chat_with_model(message, history, api_key):
    """Handle one text turn: append the user message, query Gemini, append reply.

    Args:
        message: The user's typed message.
        history: Chatbot history in Gradio "messages" format
            (list of {"role", "content"} dicts), or None on the first turn.
        api_key: Gemini API key from the settings box.

    Returns:
        ``(updated_history, "")`` — the empty string clears the input textbox.
    """
    # Normalise first: the original crashed (None + list) on early returns.
    history = history or []
    if not api_key:
        return history + [{"role": "assistant", "content": "Please enter your Gemini API key."}], ""
    model, err = configure_genai(api_key)
    if err:
        return history + [{"role": "assistant", "content": err}], ""
    history.append({"role": "user", "content": message})
    # Fix: the Gemini API rejects the role "assistant" — it expects "model",
    # so passing stored history verbatim broke every multi-turn conversation.
    messages = [
        {"role": "model" if m["role"] == "assistant" else m["role"], "parts": [m["content"]]}
        for m in history
    ]
    response = generate_response(model, messages)
    history.append({"role": "assistant", "content": response})
    return history, ""  # Clear input box
# ---------- Voice Chat Logic ----------
def voice_chat_with_model(audio_path, history, api_key):
    """Handle one voice turn: ASR -> Gemini -> TTS autoplay audio.

    Args:
        audio_path: Path to the recorded audio file, or None if nothing
            was recorded.
        history: Chatbot history in Gradio "messages" format, or None.
        api_key: Gemini API key from the settings box.

    Returns:
        ``(updated_history, autoplay_audio, recorder_reset)`` where
        autoplay_audio is a (rate, samples) tuple or None, and
        recorder_reset clears the microphone widget.
    """
    # Normalise first: the original crashed (None + list) on early returns.
    history = history or []
    if not api_key:
        return history + [{"role": "assistant", "content": "Please enter your Gemini API key."}], None, None
    if audio_path is None:
        # Nothing recorded — skip model configuration entirely.
        return history, None, None
    model, err = configure_genai(api_key)
    if err:
        return history + [{"role": "assistant", "content": err}], None, None
    # ASR; recognize_audio signals failure via sentinel string prefixes.
    user_text = recognize_audio(audio_path)
    if user_text.startswith(("Sorry", "Could not request", "Error")):
        return history + [{"role": "assistant", "content": user_text}], None, None
    history.append({"role": "user", "content": user_text})
    # Fix: the Gemini API rejects the role "assistant" — it expects "model",
    # so passing stored history verbatim broke every multi-turn conversation.
    messages = [
        {"role": "model" if m["role"] == "assistant" else m["role"], "parts": [m["content"]]}
        for m in history
    ]
    response = generate_response(model, messages)
    # Pick the TTS voice from the reply's detected language; default English.
    try:
        lang = detect(response)
    except Exception:  # Fix: bare except also swallowed SystemExit/KeyboardInterrupt
        lang = "en"
    autoplay_audio = tts_to_autoplay(response, lang=lang)
    history.append({"role": "assistant", "content": response})
    return history, autoplay_audio, gr.update(value=None)  # Reset recorder
# ---------- UI ----------
# Build the two-tab Gradio UI: a typed chat tab and a voice chat tab.
# NOTE(review): several label strings below look mojibake-garbled
# (UTF-8 emoji decoded as Latin-1) — confirm the intended characters.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# π€ Urdu/English Voice Assistant")
            # Settings panel holding the Gemini API key (collapsed by default).
            with gr.Accordion("βοΈ Settings", open=False):
                api_key_input = gr.Textbox(
                    label="Gemini API Key",
                    type="password",
                    placeholder="Enter your Gemini API key"
                )
        with gr.Column(scale=2):
            # Chat Mode
            with gr.Tab("π¬ Chat Mode"):
                chatbot = gr.Chatbot(type="messages")
                msg = gr.Textbox(label="Type your message", placeholder="Say something...")
                clear = gr.Button("Clear Chat")
                # Enter submits: updates the history and clears the textbox.
                msg.submit(chat_with_model, [msg, chatbot, api_key_input], [chatbot, msg])
                clear.click(lambda: [], None, chatbot, queue=False)
            # Voice Mode
            with gr.Tab("π€ Voice Mode"):
                voice_chatbot = gr.Chatbot(type="messages")
                record_btn = gr.Audio(sources=["microphone"], type="filepath", label="π€ Record and Send")
                voice_clear = gr.Button("Clear Voice Chat")
                # Autoplay player with visible pause/play
                autoplay_player = gr.Audio(label="π AI Response Playback (Pause/Play)", autoplay=True)
                # Fires when a recording finishes; third output resets the mic.
                record_btn.change(
                    voice_chat_with_model,
                    [record_btn, voice_chatbot, api_key_input],
                    [voice_chatbot, autoplay_player, record_btn],
                )
                voice_clear.click(lambda: [], None, voice_chatbot, queue=False)

demo.launch()
|