"""Voice assistant demo: SpeechBrain ASR -> Gemini -> SpeechBrain TTS, served with Gradio.

The user records or uploads audio, which is transcribed with a SpeechBrain
ASR model. The transcript is sent to Gemini, and Gemini's text reply is
synthesized back to speech with Tacotron2 + HiFi-GAN.
"""

import logging
import os
import tempfile

import google.generativeai as genai
import gradio as gr
import soundfile as sf
import torch
from dotenv import load_dotenv
from speechbrain.pretrained import EncoderDecoderASR, Tacotron2, HIFIGAN

logger = logging.getLogger(__name__)

# Sample rate (Hz) used when writing the synthesized reply to disk.
TTS_SAMPLE_RATE = 22050

# Load API key
load_dotenv()
_gemini_api_key = os.getenv("GEMINI_API_KEY")
if not _gemini_api_key:
    # Do not abort here: the UI should still come up, and the failure is
    # reported to the user when a request actually reaches Gemini.
    logger.warning("GEMINI_API_KEY is not set; Gemini requests will fail.")
genai.configure(api_key=_gemini_api_key)

# Correct model name for Gemini
gemini = genai.GenerativeModel("models/gemini-1.5-flash")
# You can also try "models/gemini-1.5-pro" if needed

# Load SpeechBrain models
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-transformer-transformerlm-librispeech",
    savedir="tmp_asr",
)
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmp_tts",
)
hifigan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="tmp_hifigan",
)


def _write_reply_wav(waveform):
    """Write *waveform* to a uniquely named temporary WAV file and return its path.

    A unique file per call keeps concurrent requests from overwriting each
    other's audio, which a single shared "reply.wav" would do.

    Args:
        waveform: 1-D torch tensor of audio samples produced by the vocoder.

    Returns:
        Path of the WAV file that was written.
    """
    # detach + cpu so the conversion also works for tensors that live on a
    # GPU or are still attached to the autograd graph.
    samples = waveform.detach().cpu().numpy()
    with tempfile.NamedTemporaryFile(
        prefix="reply_", suffix=".wav", delete=False
    ) as tmp:
        reply_path = tmp.name
    # The handle is closed before writing so soundfile can reopen the path
    # on every platform.
    sf.write(reply_path, samples, TTS_SAMPLE_RATE)
    return reply_path


# Voice Agent Function
def voice_agent(audio_path):
    """Transcribe the user's audio, ask Gemini, and speak the reply.

    Args:
        audio_path: Path to the recorded/uploaded audio file, or ``None``
            when Gradio has no audio to pass.

    Returns:
        A ``(text, audio_file)`` tuple. ``text`` is Gemini's reply, or a
        message starting with "❌" when something went wrong. ``audio_file``
        is the path to the synthesized WAV reply, or ``None`` on failure.
    """
    if audio_path is None:
        return "❌ No audio received.", None

    try:
        # Transcribe speech
        user_input = asr_model.transcribe_file(audio_path).strip()
        if not user_input:
            # Nothing was recognized; do not send an empty prompt to Gemini.
            return "❌ No speech detected in the audio.", None

        # Gemini response
        gemini_response = gemini.generate_content(user_input)
        reply_text = gemini_response.text.strip()
        if not reply_text:
            # There is nothing to synthesize from an empty reply.
            return "❌ Gemini returned an empty reply.", None

        # Convert reply to speech
        with torch.no_grad():  # inference only; no gradients needed
            mel_output, _, _ = tacotron2.encode_text(reply_text)
            waveform = hifigan.decode_batch(mel_output).squeeze()

        return reply_text, _write_reply_wav(waveform)
    except Exception as e:
        # Top-level UI boundary: log the traceback for the operator and show
        # the user a readable message instead of crashing the request.
        logger.exception("voice_agent failed")
        return f"❌ Error: {str(e)}", None


# Gradio UI
iface = gr.Interface(
    fn=voice_agent,
    inputs=gr.Audio(type="filepath", label="🎙️ Record or Upload Your Voice"),
    outputs=[
        gr.Text(label="🤖 Gemini's Reply"),
        gr.Audio(label="🔊 AI Voice Reply"),
    ],
    title="🧠 Voice AI Agent: SpeechBrain + Gemini",
    description="Talk to the AI! Free voice assistant using SpeechBrain + Gemini. Entirely open-source and runs on Hugging Face.",
    live=True,
)

# Only start the server when executed as a script, so importing this module
# (e.g. for tooling or tests) does not block on a running web server.
if __name__ == "__main__":
    iface.launch()