"""Gradio demo for NeuTTS-Air.

Given a reference audio clip, its transcript, and new text, the app
generates speech for the new text with NeuTTS-Air. When a custom reference
clip is supplied without a transcript, the transcript is obtained through
Groq's transcription API.
"""

# NOTE(review): `spaces` is imported first, exactly as in the original file;
# presumably the ordering matters for the GPU decorator -- confirm before
# reordering.
import spaces

import logging
import os
import sys

# The `neuttsair` package is resolved through this extra search-path entry,
# so the append has to run before the import below.
sys.path.append("neutts-air")
from neuttsair.neutts import NeuTTSAir  # noqa: E402

import numpy as np  # noqa: E402
import gradio as gr  # noqa: E402
from groq import Groq  # noqa: E402

SAMPLES_PATH = os.path.join(os.getcwd(), "neutts-air", "samples")
DEFAULT_REF_TEXT = "So I'm live on radio. And I say, well, my dear friend James here clearly, and the whole room just froze. Turns out I'd completely misspoken and mentioned our other friend."
DEFAULT_REF_PATH = os.path.join(SAMPLES_PATH, "dave.wav")
DEFAULT_GEN_TEXT = "My name is Dave, and um, I'm from London."

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    stream=sys.stdout,
)

# Single shared model instance, created once at import time.
tts = NeuTTSAir(
    backbone_repo="neuphonic/neutts-air",
    backbone_device="cpu",
    codec_repo="neuphonic/neucodec",
    codec_device="cpu",
)


def transcribe(file_path: str) -> str:
    """Transcribe an audio file through Groq's transcription endpoint.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The transcribed text as reported by the service. It may be empty,
        in which case a warning is logged.
    """
    # NOTE(review): Groq() is built without arguments, so credentials are
    # presumably picked up from the environment -- confirm deployment config.
    client = Groq()
    with open(file_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            file=(file_path, audio_file.read()),
            model="whisper-large-v3-turbo",
            temperature=0,
            response_format="verbose_json",
        )

    # Truthiness covers both an empty string and a missing (None) text.
    if not transcription.text:
        # `logging.warn` is a deprecated alias; `warning` is the supported name.
        logging.warning("Error while transcripting the reference audio.")
    return transcription.text


@spaces.GPU()
def infer(
    gen_text: str,
    ref_text: str = DEFAULT_REF_TEXT,
    ref_audio_path: str = DEFAULT_REF_PATH,
) -> tuple[int, np.ndarray]:
    """
    Generates speech using NeuTTS-Air given a reference audio and text, and new text to synthesize.

    Args:
        gen_text (str): The new text to synthesize.
        ref_text (str): The text corresponding to the reference audio. When it
            is empty, or still the default while a different reference audio
            is supplied, the reference audio is transcribed automatically.
        ref_audio_path (str): The file path to the reference audio.

    Returns:
        tuple [int, np.ndarray]: A tuple containing the sample rate (24000) and the generated audio waveform as a numpy array.

    Raises:
        gr.Error: If no text to synthesize or no reference audio is provided.
    """
    # gr.Error subclasses Exception, so existing handlers still catch it,
    # while Gradio can surface the message to the user.
    if not gen_text:
        raise gr.Error("Please insert the new text to synthesize.")

    # Fail early with a readable message instead of crashing later when the
    # missing path reaches open() or the encoder.
    if not ref_audio_path:
        raise gr.Error("Please provide a reference audio.")

    # A custom clip paired with the default transcript means the transcript
    # cannot describe that clip: discard it so it gets transcribed below.
    if ref_audio_path != DEFAULT_REF_PATH and ref_text == DEFAULT_REF_TEXT:
        ref_text = ""

    if not ref_text:
        ref_text = transcribe(ref_audio_path)

    logging.info("Using reference: %s", ref_audio_path)
    gr.Info("Starting inference request!")
    gr.Info("Encoding reference...")
    ref_codes = tts.encode_reference(ref_audio_path)

    gr.Info(f"Generating audio for input text: {gen_text}")
    wav = tts.infer(gen_text, ref_codes, ref_text)

    return (24_000, wav)


demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(label="Text to Generate", value=DEFAULT_GEN_TEXT),
        gr.Textbox(label="Reference Text (Optional)", value=DEFAULT_REF_TEXT),
        gr.Audio(type="filepath", label="Reference Audio", value=DEFAULT_REF_PATH),
    ],
    outputs=gr.Audio(type="numpy", label="Generated Speech"),
    title="NeuTTS-Air☁️",
    description="Upload a reference audio sample, provide the reference text, and enter new text to synthesize.",
)

if __name__ == "__main__":
    demo.queue(max_size=10).launch(
        allowed_paths=[SAMPLES_PATH],
        mcp_server=False,
        inbrowser=True,
    )