import os

import gradio as gr
import numpy as np
import torch
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model

# Configuration
HF_TOKEN = os.environ["HF_TOKEN"]  # required for the gated pyannote pipeline
device_map = "auto"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. RVQ codec (Descript Audio Codec, 44.1 kHz)
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
rvq = rvq.to(device)

# 2. Pyannote VAD (correct loader)
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,
)

# 3. Ultravox ASR+LLM
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)

# 4. Audio Diffusion (conditioned UNet matching the 44.1 kHz codec)
diff_pipe = DiffusionPipeline.from_pretrained(
    "timbre-labs/audio-diffusion-large-44khz",
    torch_dtype=torch.float16,
).to(device)

# 5. Dia TTS with sharding
dia = Dia.from_pretrained(
    "nari-labs/Dia-1.6B",
    device_map=device_map,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)


def process_audio(audio):
    sr, array = audio
    if torch.is_tensor(array):
        array = array.numpy()
    if array.ndim > 1:
        array = array.mean(axis=1)  # downmix stereo to mono

    # Gradio microphone input is int16; convert to float32 in [-1, 1]
    array = array.astype(np.float32)
    if np.max(np.abs(array)) > 1.0:
        array /= 32768.0

    # VAD: speech-region annotation (result not yet used downstream)
    waveform = torch.from_numpy(array).unsqueeze(0)  # (channel, time)
    vad_pipe({"waveform": waveform, "sample_rate": sr})

    # RVQ encode/decode round trip; DAC expects (batch, channel, time) at 44.1 kHz
    with torch.no_grad():
        x = torch.from_numpy(array).reshape(1, 1, -1).to(device)
        x = rvq.preprocess(x, sr)
        z, codes, latents, _, _ = rvq.encode(x)
        decoded = rvq.decode(z).squeeze().cpu().numpy()

    # Ultravox ASR + LLM on the codec-reconstructed audio
    out = ultravox_pipe({"audio": decoded, "sampling_rate": sr})
    text = out if isinstance(out, str) else out.get("text", "")

    # Diffusion enhancement (enhanced audio is not used in the final response yet)
    pros = diff_pipe(raw_audio=decoded)["audios"][0]

    # Dia TTS; generate() may return a tensor or a NumPy array depending on version
    tts = dia.generate(f"[emotion:neutral] {text}")
    tts_np = tts.squeeze().cpu().numpy() if torch.is_tensor(tts) else np.asarray(tts).squeeze()
    if tts_np.size:
        tts_np = tts_np / np.max(np.abs(tts_np)) * 0.95  # peak-normalize to avoid clipping

    return (sr, tts_np), text


with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    # Gradio 4+ uses sources=["microphone"]; on 3.x use source="microphone"
    audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

if __name__ == "__main__":
    demo.launch()