Spaces:

bilulu
/

Project-gemini-voicebot

Sleeping

File size: 1,970 Bytes

da091fd
 
 
 
 
 
 
 
2130d3f
da091fd
 
 
2130d3f
 
da091fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2130d3f
4ad7181
 
2130d3f
da091fd
92f5864
2130d3f
4ad7181
da091fd
92f5864
 
 
bcb3ccc
2130d3f
92f5864
 
 
 
 
2130d3f
92f5864
 
 
4ad7181
2130d3f
da091fd
 
2130d3f
da091fd
 
 
 
2130d3f
 
da091fd

import gradio as gr
import torch
import soundfile as sf
from speechbrain.pretrained import EncoderDecoderASR, Tacotron2, HIFIGAN
import google.generativeai as genai
import os
from dotenv import load_dotenv

# Read variables from a .env file (if one exists) and pass the Gemini key
# found under GEMINI_API_KEY to the client library.
load_dotenv()
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

# Gemini model that answers the transcribed request.
# "models/gemini-1.5-pro" can be substituted here if needed.
gemini = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

# SpeechBrain models, created in pipeline order:
# speech -> text (ASR), text -> mel spectrogram (Tacotron2),
# mel spectrogram -> waveform (HiFi-GAN vocoder).
# Each model uses its own `savedir`.
asr_model = EncoderDecoderASR.from_hparams(
    savedir="tmp_asr",
    source="speechbrain/asr-transformer-transformerlm-librispeech",
)
tacotron2 = Tacotron2.from_hparams(
    savedir="tmp_tts",
    source="speechbrain/tts-tacotron2-ljspeech",
)
hifigan = HIFIGAN.from_hparams(
    savedir="tmp_hifigan",
    source="speechbrain/tts-hifigan-ljspeech",
)

# Voice Agent Function
def voice_agent(audio_path):
    """Answer a spoken request with Gemini and speak the answer back.

    The recording is transcribed with ``asr_model``, the transcript is sent
    to ``gemini``, and the reply text is synthesised with ``tacotron2`` and
    ``hifigan`` into a WAV file.

    Args:
        audio_path: Path of the recorded/uploaded audio file, or ``None``
            when no audio was provided.

    Returns:
        A ``(text, wav_path)`` tuple. On success ``text`` is Gemini's reply
        and ``wav_path`` is the path of the synthesised WAV file. On any
        failure ``text`` is a message starting with "❌" and ``wav_path``
        is ``None``; exceptions are reported this way rather than raised.
    """
    # Imported locally so this function does not depend on the file's
    # import block being edited.
    import tempfile

    # Sample rate passed to the WAV writer; presumably the output rate of
    # the LJSpeech TTS models loaded at module level - confirm against the
    # model cards before changing.
    sample_rate = 22050

    if audio_path is None:
        return "❌ No audio received.", None

    try:
        # Transcribe speech
        user_input = asr_model.transcribe_file(audio_path)
        if not user_input or not user_input.strip():
            # Nothing to ask Gemini: report it instead of sending an
            # empty prompt.
            return "❌ No speech detected in the audio.", None

        # Gemini response
        gemini_response = gemini.generate_content(user_input)
        reply_text = gemini_response.text.strip()
        if not reply_text:
            # Nothing to synthesise.
            return "❌ Gemini returned an empty reply.", None

        # Convert reply to speech
        mel_output, _, _ = tacotron2.encode_text(reply_text)
        waveform = hifigan.decode_batch(mel_output).squeeze()

        # Each request gets its own output file so that concurrent requests
        # cannot overwrite one another's audio.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            reply_path = tmp.name

        # detach()/cpu() make the conversion safe even if the tensor is on
        # an accelerator or attached to the autograd graph.
        sf.write(reply_path, waveform.detach().cpu().numpy(), sample_rate)

        return reply_text, reply_path

    except Exception as e:
        # UI boundary: surface the failure to the user as text instead of
        # raising.
        return f"❌ Error: {e}", None


# Gradio UI: one audio input (handed to the callback as a file path) and two
# outputs that receive voice_agent's return values in order - the reply text
# and the reply audio.
_voice_input = gr.Audio(label="🎙️ Record or Upload Your Voice", type="filepath")
_reply_outputs = [
    gr.Text(label="🤖 Gemini's Reply"),
    gr.Audio(label="🔊 AI Voice Reply"),
]

iface = gr.Interface(
    title="🧠 Voice AI Agent: SpeechBrain + Gemini",
    description="Talk to the AI! Free voice assistant using SpeechBrain + Gemini. Entirely open-source and runs on Hugging Face.",
    fn=voice_agent,
    inputs=_voice_input,
    outputs=_reply_outputs,
    live=True,
)

# Start the web app.
iface.launch()