Spaces:
Sleeping
Sleeping
File size: 1,970 Bytes
da091fd 2130d3f da091fd 2130d3f da091fd 2130d3f 4ad7181 2130d3f da091fd 92f5864 2130d3f 4ad7181 da091fd 92f5864 bcb3ccc 2130d3f 92f5864 2130d3f 92f5864 4ad7181 2130d3f da091fd 2130d3f da091fd 2130d3f da091fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import logging
import os
import tempfile

import google.generativeai as genai
import gradio as gr
import soundfile as sf
import torch
from dotenv import load_dotenv
from speechbrain.pretrained import EncoderDecoderASR, Tacotron2, HIFIGAN
# --- Gemini client ----------------------------------------------------------
# Read variables from a local .env file into the environment, then pass the
# Gemini key to the client library (the value is None when the variable is
# not set).
load_dotenv()
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

# Model that answers the transcribed request.
# "models/gemini-1.5-pro" can be tried instead if needed.
gemini = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

# --- SpeechBrain models -----------------------------------------------------
# Three pretrained models, each loaded with its own `savedir`:
#   asr_model  - used to transcribe the incoming audio file
#   tacotron2  - used to encode the reply text into a mel spectrogram
#   hifigan    - used to decode that spectrogram into a waveform
asr_model = EncoderDecoderASR.from_hparams(
    savedir="tmp_asr",
    source="speechbrain/asr-transformer-transformerlm-librispeech",
)
tacotron2 = Tacotron2.from_hparams(
    savedir="tmp_tts",
    source="speechbrain/tts-tacotron2-ljspeech",
)
hifigan = HIFIGAN.from_hparams(
    savedir="tmp_hifigan",
    source="speechbrain/tts-hifigan-ljspeech",
)
# Voice Agent Function
def voice_agent(audio_path):
    """Answer one spoken request: transcribe it, ask Gemini, speak the reply.

    Args:
        audio_path: Path of the recorded or uploaded audio file, or ``None``
            when no audio was supplied.

    Returns:
        A ``(text, audio_file)`` pair. ``text`` is Gemini's reply, or a
        message describing what went wrong. ``audio_file`` is the path of a
        WAV file holding the synthesized reply, or ``None`` when no audio
        could be produced.
    """
    # NOTE(review): the leading "β" in the two user-facing messages below
    # looks like a mis-encoded emoji. It is kept verbatim until the intended
    # character is confirmed.
    if audio_path is None:
        return "β No audio received.", None

    try:
        # Transcribe speech
        user_input = asr_model.transcribe_file(audio_path)

        # Gemini response
        gemini_response = gemini.generate_content(user_input)
        reply_text = gemini_response.text.strip()

        # Convert reply to speech
        mel_output, _, _ = tacotron2.encode_text(reply_text)
        waveform = hifigan.decode_batch(mel_output).squeeze()
        # detach()/cpu() make the conversion work even if the tensor lives on
        # another device or is attached to the autograd graph.
        samples = waveform.detach().cpu().numpy()

        # Write each reply to its own file: a single shared "reply.wav" would
        # be overwritten when two requests overlap. The file is left on disk
        # (delete=False) because it is read after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            reply_path = tmp.name
        # 22050 is the sample rate written to the WAV header; presumably the
        # output rate of the LJSpeech vocoder - TODO confirm.
        sf.write(reply_path, samples, 22050)
        return reply_text, reply_path
    except Exception as e:
        # UI boundary: show the error to the user instead of crashing the
        # request, but keep the traceback in the logs for debugging.
        logging.getLogger(__name__).exception("voice_agent failed")
        return f"β Error: {str(e)}", None
# Gradio UI
# The input component hands voice_agent a file path; the two output
# components receive the reply text and the reply audio, in that order.
audio_input = gr.Audio(type="filepath", label="ποΈ Record or Upload Your Voice")
reply_outputs = [
    gr.Text(label="π€ Gemini's Reply"),
    gr.Audio(label="π AI Voice Reply"),
]

iface = gr.Interface(
    title="π§ Voice AI Agent: SpeechBrain + Gemini",
    description="Talk to the AI! Free voice assistant using SpeechBrain + Gemini. Entirely open-source and runs on Hugging Face.",
    fn=voice_agent,
    inputs=audio_input,
    outputs=reply_outputs,
    live=True,
)

# Start the web app as soon as the script runs.
iface.launch()
|