# bilulu's picture
# Update app.py
# 2130d3f verified
import gradio as gr
import torch
import soundfile as sf
from speechbrain.pretrained import EncoderDecoderASR, Tacotron2, HIFIGAN
import google.generativeai as genai
import os
from dotenv import load_dotenv
# Read environment settings (python-dotenv picks up a .env file when one
# exists) and hand the GEMINI_API_KEY value to the Gemini client.
load_dotenv()
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

# Gemini model that answers the transcribed request.
# "models/gemini-1.5-pro" can be tried instead if needed.
gemini = genai.GenerativeModel("models/gemini-1.5-flash")
# Pretrained SpeechBrain pipelines, each loaded via from_hparams with its
# own savedir.

# Speech -> text.
asr_model = EncoderDecoderASR.from_hparams(
    savedir="tmp_asr",
    source="speechbrain/asr-transformer-transformerlm-librispeech",
)

# Text -> mel spectrogram.
tacotron2 = Tacotron2.from_hparams(
    savedir="tmp_tts",
    source="speechbrain/tts-tacotron2-ljspeech",
)

# Mel spectrogram -> waveform.
hifigan = HIFIGAN.from_hparams(
    savedir="tmp_hifigan",
    source="speechbrain/tts-hifigan-ljspeech",
)
# Voice Agent Function
def voice_agent(audio_path):
    """Run one voice round-trip: transcribe, ask Gemini, synthesize the reply.

    Args:
        audio_path: Path of the recorded/uploaded audio file, or ``None``
            when no audio was delivered.

    Returns:
        A ``(text, wav_path)`` tuple. ``text`` is Gemini's reply, or a
        message starting with "❌" when something went wrong. ``wav_path``
        is the path of a WAV file containing the spoken reply, or ``None``
        when no audio was produced.
    """
    if audio_path is None:
        return "❌ No audio received.", None

    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    import tempfile

    try:
        # Transcribe speech
        user_input = asr_model.transcribe_file(audio_path)
        if not user_input or not user_input.strip():
            # Nothing recognizable was said; avoid sending an empty prompt.
            return "❌ Error: no speech could be recognized.", None

        # Gemini response
        gemini_response = gemini.generate_content(user_input)
        reply_text = gemini_response.text.strip()
        if not reply_text:
            # Nothing to synthesize; report it rather than feeding the TTS
            # model an empty string.
            return "❌ Error: Gemini returned an empty reply.", None

        # Convert reply to speech
        mel_output, _, _ = tacotron2.encode_text(reply_text)
        waveform = hifigan.decode_batch(mel_output).squeeze()

        # Each call gets its own output file so that concurrent requests do
        # not overwrite one another's audio. The handle is closed before
        # soundfile reopens the path by name.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            wav_path = tmp.name

        # detach()/cpu() make the conversion to NumPy safe even when the
        # tensor lives on an accelerator or is attached to the autograd graph.
        # 22050 Hz is the rate the original code writes; presumably it matches
        # the LJSpeech-trained vocoder output.
        sf.write(wav_path, waveform.detach().cpu().numpy(), 22050)
        return reply_text, wav_path
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"❌ Error: {e}", None
# Gradio UI
# Wire voice_agent into a one-input / two-output Gradio interface.
# The emoji in the labels and title were mis-decoded (mojibake) and are
# restored to the intended characters here.
iface = gr.Interface(
    fn=voice_agent,
    inputs=gr.Audio(type="filepath", label="🎙️ Record or Upload Your Voice"),
    outputs=[
        gr.Text(label="🤖 Gemini's Reply"),
        gr.Audio(label="🔊 AI Voice Reply"),
    ],
    title="🧠 Voice AI Agent: SpeechBrain + Gemini",
    description="Talk to the AI! Free voice assistant using SpeechBrain + Gemini. Entirely open-source and runs on Hugging Face.",
    # live=True asks Gradio to re-run the function when the input changes.
    live=True,
)
iface.launch()