# Maya-AI / app.py
import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
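# Pipeline stages: Pyannote VAD -> DAC residual vector quantization (RVQ) round-trip
# -> Ultravox ASR+LLM -> audio diffusion enhancement -> Dia text-to-speech.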
# Environment variables
HF_TOKEN = os.environ["HF_TOKEN"]
device_map = "auto"
device = "cuda" if torch.cuda.is_available() else "cpu"
# 1. RVQ codec (descript-audio-codec, 44.1 kHz variant)
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
rvq = rvq.to(device)
# 2. Pyannote voice activity detection
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,
)
# 3. Ultravox ASR+LLM
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)
# 4. Audio diffusion enhancement
diff_pipe = DiffusionPipeline.from_pretrained(
    "timbre-labs/audio-diffusion-large-44khz",
    torch_dtype=torch.float16,
).to(device)
# 5. Dia TTS (Dia's own loader takes compute_dtype rather than transformers-style kwargs)
dia = Dia.from_pretrained(
    "nari-labs/Dia-1.6B",
    compute_dtype="float16",
)
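# The DAC codec loaded above is the 44.1 kHz variant, while browser microphones
# typically capture at 48 kHz, so the input is resampled before encoding. This is
# a minimal linear-interpolation sketch; a dedicated resampler (e.g. torchaudio or
# librosa) would be preferable if it is available in the Space.
def resample_to_44k(array, sr, target_sr=44100):
    """Resample a mono float waveform to target_sr; no-op if already there."""
    if sr == target_sr:
        return array, sr
    duration = array.shape[0] / sr
    new_len = int(round(duration * target_sr))
    old_t = np.linspace(0.0, duration, num=array.shape[0], endpoint=False)
    new_t = np.linspace(0.0, duration, num=new_len, endpoint=False)
    return np.interp(new_t, old_t, array).astype(np.float32), target_sr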
def process_audio(audio):
    sr, array = audio
    if torch.is_tensor(array):
        array = array.numpy()
    # Gradio delivers int16 PCM from the microphone; convert to float32 in [-1, 1]
    if np.issubdtype(array.dtype, np.integer):
        array = array.astype(np.float32) / 32768.0
    # Collapse stereo to mono and match the codec's 44.1 kHz sample rate
    if array.ndim > 1:
        array = array.mean(axis=-1)
    array, sr = resample_to_44k(array, sr)
    # VAD: returns an Annotation of speech regions (result currently unused)
    vad_pipe({"waveform": torch.from_numpy(array).unsqueeze(0), "sample_rate": sr})
    # RVQ round-trip: DAC expects (batch, channels, time); decode takes the
    # continuous latents z, not the discrete codes
    x = rvq.preprocess(torch.from_numpy(array).unsqueeze(0).unsqueeze(0).to(device), sr)
    z, codes, *_ = rvq.encode(x)
    decoded = rvq.decode(z).squeeze().detach().cpu().numpy()
    # Ultravox ASR+LLM: the pipeline takes the audio plus chat turns
    turns = [{"role": "system", "content": "You are Maya, a friendly conversational agent."}]
    out = ultravox_pipe({"audio": decoded, "turns": turns, "sampling_rate": sr}, max_new_tokens=256)
    text = out if isinstance(out, str) else out.get("text", "")
    # Diffusion enhancement (output currently unused downstream)
    pros = diff_pipe(raw_audio=decoded)["audios"][0]
    # Dia TTS: prompts use [S1]/[S2] speaker tags
    tts = dia.generate(f"[S1] {text}")
    tts_np = tts.squeeze().cpu().numpy() if torch.is_tensor(tts) else np.asarray(tts).squeeze()
    if tts_np.size:
        tts_np = tts_np / np.max(np.abs(tts_np)) * 0.95
    return (sr, tts_np), text
with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    # Gradio 4.x expects sources=[...] rather than the older source= keyword
    audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])
if __name__ == "__main__":
    demo.launch()