import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
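
# Pipeline stages (wired together in process_audio below):
#   mic input -> pyannote VAD -> DAC RVQ codec round-trip -> Ultravox ASR+LLM
#   -> audio-diffusion enhancement -> Dia TTS reply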

# Configuration: HF token for gated models; device_map="auto" lets accelerate
# place large models across the available GPUs
HF_TOKEN   = os.environ["HF_TOKEN"]
device_map = "auto"

# 1. RVQ Codec
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")

# 2. Pyannote VAD
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN
)

# 3. Ultravox ASR+LLM
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16
)

# 4. Audio Diffusion (matching UNet)
diff_pipe = DiffusionPipeline.from_pretrained(
    "timbre-labs/audio-diffusion-large-44khz",  # compatible conditioned UNet
    torch_dtype=torch.float16
).to("cuda" if torch.cuda.is_available() else "cpu")

# 5. Dia TTS with sharding
dia = Dia.from_pretrained(
    "nari-labs/Dia-1.6B",
    device_map=device_map,
    torch_dtype=torch.float16,
    trust_remote_code=True
)

def process_audio(audio):
    sr, array = audio
    if torch.is_tensor(array):
        array = array.numpy()
    # Gradio's numpy microphone input is int16; convert to float32 in [-1, 1]
    array = array.astype(np.float32)
    if np.abs(array).max() > 1.0:
        array /= 32768.0

    # VAD (speech-region detection; result not yet used downstream)
    vad_pipe({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})

    # RVQ round-trip: DAC expects (batch, channels, time); encode() returns
    # (z, codes, latents, commitment_loss, codebook_loss) and decode() takes z
    x = torch.tensor(array).reshape(1, 1, -1)
    if torch.cuda.is_available():
        x = x.to("cuda")
    with torch.no_grad():
        z, codes, *_ = rvq.encode(x)
        decoded = rvq.decode(z).squeeze().cpu().numpy()

    # Ultravox ASR+LLM (its custom pipeline takes an "audio" key, not "array")
    out  = ultravox_pipe({"audio": decoded, "sampling_rate": sr})
    text = out if isinstance(out, str) else out.get("text", "")

    # Diffusion enhancement (enhanced audio not yet wired into the response)
    pros = diff_pipe(raw_audio=decoded)["audios"][0]

    # TTS; Dia may return a tensor or a numpy array depending on version
    tts = dia.generate(f"[emotion:neutral] {text}")
    tts_np = tts.squeeze().cpu().numpy() if torch.is_tensor(tts) else np.asarray(tts).squeeze()
    peak = np.max(np.abs(tts_np)) if tts_np.size else 0.0
    if peak > 0:
        tts_np = tts_np / peak * 0.95

    # Dia generates 44.1 kHz audio, so return that rate rather than the mic rate
    return (44100, tts_np), text

with gr.Blocks(title="Maya AI πŸ“ˆ") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
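    # Note: on Gradio 4.x, gr.Audio takes sources=["microphone"] instead of source=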
    audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

if __name__ == "__main__":
    demo.launch()