Spaces:
Runtime error
Runtime error
File size: 2,384 Bytes
6e55da8 c5ef34e 653911d c11bb04 7f25911 036f56f c11bb04 1a24747 5adc99b 7f25911 501663c 5adc99b 7f25911 501663c 1a24747 7f25911 036f56f 7f25911 036f56f 501663c 1a24747 7f25911 1a24747 55c39a0 1a24747 501663c d9c827c 7f25911 55c39a0 7f25911 55c39a0 501663c 7f25911 55c39a0 ee439d6 7f25911 653911d 1a24747 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
# --- Environment & model setup ----------------------------------------------
# HF token is required for gated models (pyannote). Fail fast with an
# actionable message instead of a bare KeyError when it is missing.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError(
        "HF_TOKEN environment variable is not set; it is required to load "
        "the gated pyannote voice-activity-detection model."
    )
device_map = "auto"

# 1. RVQ Codec (descript-audio-codec), eval mode; GPU only when available.
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")

# 2. Pyannote VAD (correct loader)
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN
)

# 3. Ultravox ASR+LLM (remote code; sharded across devices via device_map)
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16
)

# 4. Audio Diffusion (matching UNet). Only move to GPU when one exists —
#    the previous unconditional .to("cuda") crashed on CPU-only hosts.
diff_pipe = DiffusionPipeline.from_pretrained(
    "timbre-labs/audio-diffusion-large-44khz",  # compatible conditioned UNet
    torch_dtype=torch.float16
)
if torch.cuda.is_available():
    diff_pipe = diff_pipe.to("cuda")

# 5. Dia TTS with sharding
dia = Dia.from_pretrained(
    "nari-labs/Dia-1.6B",
    device_map=device_map,
    torch_dtype=torch.float16,
    trust_remote_code=True
)
def process_audio(audio):
    """Run the full voice-to-voice pipeline: VAD -> RVQ codec -> ASR/LLM -> TTS.

    Parameters
    ----------
    audio : tuple
        Gradio ``type="numpy"`` payload ``(sample_rate, samples)``; samples
        may arrive as a numpy array or a torch tensor.

    Returns
    -------
    tuple
        ``((sample_rate, synthesized_audio), generated_text)`` matching the
        two Gradio outputs (audio player, textbox).
    """
    sr, array = audio
    if torch.is_tensor(array):
        array = array.numpy()
    # Fix: previously hard-coded "cuda", which crashed on CPU-only hosts.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Voice-activity detection. The segmentation result is discarded; the
    # call is kept for parity with the original pipeline behaviour.
    vad_pipe({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})

    # RVQ encode/decode round-trip acts as a neural-codec cleanup pass.
    # NOTE(review): dac's encode() may return a tuple (z, codes, latents, ...)
    # rather than codes alone — confirm decode() accepts this value directly.
    x = torch.tensor(array).unsqueeze(0).to(device)
    codes = rvq.encode(x)
    decoded = rvq.decode(codes).squeeze().cpu().numpy()

    # ASR + LLM: transcribe/respond; fall back to empty text if absent.
    out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
    text = out.get("text", "")

    # Diffusion enhancement. NOTE(review): `pros` is computed but never used
    # in the returned audio — dead work unless wired into the output.
    pros = diff_pipe(raw_audio=decoded)["audios"][0]

    # TTS synthesis, then peak-normalize to 95% full scale.
    tts = dia.generate(f"[emotion:neutral] {text}")
    tts_np = tts.squeeze().cpu().numpy()
    if tts_np.size:
        peak = np.max(np.abs(tts_np))
        if peak > 0:  # guard: silent output would otherwise divide by zero
            tts_np = tts_np / peak * 0.95
    return (sr, tts_np), text
# --- Gradio UI wiring --------------------------------------------------------
with gr.Blocks(title="Maya AI π") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    # NOTE(review): `source=` was renamed to `sources=["microphone"]` in
    # Gradio 4.x; this keyword only exists on Gradio 3.x — confirm the pinned
    # gradio version (a likely cause of a startup "Runtime error" on Spaces).
    audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    # Button drives the end-to-end pipeline: mic audio in -> (audio, text) out.
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])
# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|