import gradio as gr
import torch
import soundfile as sf
from speechbrain.pretrained import EncoderDecoderASR, Tacotron2, HIFIGAN
import google.generativeai as genai
import os
from dotenv import load_dotenv
# Load API key
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Gemini model (you can also try "models/gemini-1.5-pro" if needed)
gemini = genai.GenerativeModel("models/gemini-1.5-flash")
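# NOTE (assumption): GEMINI_API_KEY is expected either in a local `.env` file
# next to this script, e.g.
#   GEMINI_API_KEY=your-key-here
# or, when deployed on Hugging Face Spaces, as a repository secret with the
# same name, which the Space exposes as an environment variable at runtime.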
# Load SpeechBrain models
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-transformer-transformerlm-librispeech",
    savedir="tmp_asr",
)
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmp_tts",
)
hifigan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="tmp_hifigan",
)
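# Pipeline roles: EncoderDecoderASR transcribes the recorded audio to text,
# Tacotron2 turns the reply text into a mel spectrogram, and HiFi-GAN vocodes
# that spectrogram into a waveform. The first launch downloads the pretrained
# checkpoints into the tmp_* directories, so it can take a few minutes.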
# Voice agent: speech in -> Gemini reply -> speech out
def voice_agent(audio_path):
    if audio_path is None:
        return "❌ No audio received.", None
    try:
        # Transcribe the user's speech
        user_input = asr_model.transcribe_file(audio_path)

        # Generate a text reply with Gemini
        gemini_response = gemini.generate_content(user_input)
        reply_text = gemini_response.text.strip()

        # Convert the reply to speech (Tacotron2 mel spectrogram -> HiFi-GAN waveform)
        mel_output, _, _ = tacotron2.encode_text(reply_text)
        waveform = hifigan.decode_batch(mel_output).squeeze()
        sf.write("reply.wav", waveform.detach().cpu().numpy(), 22050)  # 22.05 kHz matches the LJSpeech TTS models

        return reply_text, "reply.wav"
    except Exception as e:
        return f"❌ Error: {str(e)}", None
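# Optional local smoke test (hypothetical file name; leave commented out on Spaces):
# if __name__ == "__main__":
#     print(voice_agent("sample.wav"))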
# Gradio UI
iface = gr.Interface(
    fn=voice_agent,
    inputs=gr.Audio(type="filepath", label="🎙️ Record or Upload Your Voice"),
    outputs=[
        gr.Text(label="🤖 Gemini's Reply"),
        gr.Audio(label="🔊 AI Voice Reply"),
    ],
    title="🧠 Voice AI Agent: SpeechBrain + Gemini",
    description="Talk to the AI! SpeechBrain (open-source) handles speech recognition and synthesis, Gemini generates the replies, and the app runs on Hugging Face Spaces.",
    live=True,
)

iface.launch()
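# Suggested requirements.txt for the Space (a sketch based only on the imports
# above; pin versions as needed for your environment):
#   gradio
#   torch
#   soundfile
#   speechbrain
#   google-generativeai
#   python-dotenv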