File size: 1,970 Bytes
da091fd
 
 
 
 
 
 
 
2130d3f
da091fd
 
 
2130d3f
 
da091fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2130d3f
4ad7181
 
2130d3f
da091fd
92f5864
2130d3f
4ad7181
da091fd
92f5864
 
 
bcb3ccc
2130d3f
92f5864
 
 
 
 
2130d3f
92f5864
 
 
4ad7181
2130d3f
da091fd
 
2130d3f
da091fd
 
 
 
2130d3f
 
da091fd
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
import tempfile

import google.generativeai as genai
import gradio as gr
import soundfile as sf
import torch
from dotenv import load_dotenv
from speechbrain.pretrained import EncoderDecoderASR, Tacotron2, HIFIGAN

# Configure the Gemini SDK with GEMINI_API_KEY; load_dotenv() runs first so
# the lookup happens after any .env handling it performs.
load_dotenv()
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

# Gemini model that answers the transcribed prompt
# ("models/gemini-1.5-pro" can be substituted if needed).
gemini = genai.GenerativeModel("models/gemini-1.5-flash")


def _load_speechbrain(model_cls, repo_id, cache_dir):
    """Build `model_cls` via from_hparams with the given source and savedir."""
    return model_cls.from_hparams(source=repo_id, savedir=cache_dir)


# SpeechBrain pipeline used by voice_agent:
#   asr_model  - transcribes the incoming audio file to text
#   tacotron2  - encodes the reply text into a mel output
#   hifigan    - decodes that mel output into a waveform
asr_model = _load_speechbrain(
    EncoderDecoderASR,
    "speechbrain/asr-transformer-transformerlm-librispeech",
    "tmp_asr",
)
tacotron2 = _load_speechbrain(
    Tacotron2,
    "speechbrain/tts-tacotron2-ljspeech",
    "tmp_tts",
)
hifigan = _load_speechbrain(
    HIFIGAN,
    "speechbrain/tts-hifigan-ljspeech",
    "tmp_hifigan",
)

# Voice Agent Function
def voice_agent(audio_path):
    """Run one voice round-trip: transcribe the audio, ask Gemini, speak the reply.

    Args:
        audio_path: Path of the recorded/uploaded audio file, or None when no
            audio was delivered.

    Returns:
        A ``(text, audio_file)`` pair. On success ``text`` is Gemini's reply
        and ``audio_file`` is the path of a WAV file holding the synthesized
        reply. On any failure ``text`` is a "❌ ..." message and
        ``audio_file`` is None; exceptions are reported this way rather than
        raised.
    """
    if audio_path is None:
        return "❌ No audio received.", None

    try:
        # Transcribe speech
        user_input = asr_model.transcribe_file(audio_path)
        if not user_input or not user_input.strip():
            # Nothing intelligible was recognized; don't send an empty prompt.
            return "❌ No speech detected.", None

        # Gemini response
        gemini_response = gemini.generate_content(user_input)
        reply_text = gemini_response.text.strip()
        if not reply_text:
            # There is nothing to synthesize from an empty reply.
            return "❌ Gemini returned an empty reply.", None

        # Convert reply to speech
        mel_output, _, _ = tacotron2.encode_text(reply_text)
        waveform = hifigan.decode_batch(mel_output).squeeze()

        # Give every request its own output file: a fixed "reply.wav" would be
        # overwritten when two users talk to the app at the same time.
        # The handle is closed before writing so the path can be reopened on
        # any platform. NOTE: delete=False means these files are not removed
        # by this function.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            reply_path = tmp.name

        # .detach().cpu() keeps .numpy() valid even if the tensor lives on an
        # accelerator or is attached to the autograd graph.
        # 22050 is the sample rate written to the file; presumably it matches
        # the LJSpeech TTS models -- confirm if the models are swapped.
        sf.write(reply_path, waveform.detach().cpu().numpy(), 22050)

        return reply_text, reply_path

    except Exception as e:
        # UI boundary: surface the failure in the text output instead of
        # crashing the request.
        return f"❌ Error: {e}", None


# Gradio UI
# One audio input, handed to voice_agent as a file path, and two outputs that
# receive voice_agent's (reply text, reply audio file) pair.
# The emoji in the three component labels were stored as mojibake and are
# restored here so the labels render correctly.
iface = gr.Interface(
    fn=voice_agent,
    inputs=gr.Audio(type="filepath", label="🎙️ Record or Upload Your Voice"),
    outputs=[
        gr.Text(label="🤖 Gemini's Reply"),
        gr.Audio(label="🔊 AI Voice Reply"),
    ],
    title="🧠 Voice AI Agent: SpeechBrain + Gemini",
    description="Talk to the AI! Free voice assistant using SpeechBrain + Gemini. Entirely open-source and runs on Hugging Face.",
    live=True,  # rerun voice_agent automatically whenever the input changes
)

iface.launch()