neutts-air

Running

App Files Files Community

playmak3r committed on Oct 18

Commit

f4b1b5b

1 Parent(s): 743b8f5

Add transcription functionality and improve input handling in infer function

Browse files

Files changed (2) hide show

app.py +28 -5
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ sys.path.append("neutts-air")
 from neuttsair.neutts import NeuTTSAir
 import numpy as np
 import gradio as gr
 SAMPLES_PATH = os.path.join(os.getcwd(), "neutts-air", "samples")
@@ -24,23 +25,45 @@ tts = NeuTTSAir(
     codec_device="cpu"
 )
 @spaces.GPU()
 def infer(
-    ref_text: str,
-    ref_audio_path: str,
     gen_text: str,
 ) -> tuple[int, np.ndarray]:
     """
     Generates speech using NeuTTS-Air given a reference audio and text, and new text to synthesize.
     Args:
         ref_text (str): The text corresponding to the reference audio.
         ref_audio_path (str): The file path to the reference audio.
-        gen_text (str): The new text to synthesize.
     Returns:
         tuple [int, np.ndarray]: A tuple containing the sample rate (24000) and the generated audio waveform as a numpy array.
     """
     logging.info(f"Using reference: {ref_audio_path}")
     gr.Info("Starting inference request!")
     gr.Info("Encoding reference...")
@@ -54,9 +77,9 @@ def infer(
 demo = gr.Interface(
     fn=infer,
     inputs=[
-        gr.Textbox(label="Reference Text", value=DEFAULT_REF_TEXT),
-        gr.Audio(type="filepath", label="Reference Audio", value=DEFAULT_REF_PATH),
         gr.Textbox(label="Text to Generate", value=DEFAULT_GEN_TEXT),
     ],
     outputs=gr.Audio(type="numpy", label="Generated Speech"),
     title="NeuTTS-Air☁️",

 from neuttsair.neutts import NeuTTSAir
 import numpy as np
 import gradio as gr
+from groq import Groq
 SAMPLES_PATH = os.path.join(os.getcwd(), "neutts-air", "samples")
     codec_device="cpu"
 )
+def transcribe(file_path: str):
+    client = Groq()
+    with open(file_path, "rb") as file:
+        transcription = client.audio.transcriptions.create(
+            file=(file_path, file.read()),
+            model="whisper-large-v3-turbo",
+            temperature=0,
+            response_format="verbose_json",
+        )
+        if len(transcription.text) <= 0: logging.warn("Error while transcripting the reference audio.")
+        return transcription.text
 @spaces.GPU()
 def infer(
     gen_text: str,
+    ref_text: str = DEFAULT_REF_TEXT,
+    ref_audio_path: str = DEFAULT_REF_PATH,
 ) -> tuple[int, np.ndarray]:
     """
     Generates speech using NeuTTS-Air given a reference audio and text, and new text to synthesize.
     Args:
+        gen_text (str): The new text to synthesize.
         ref_text (str): The text corresponding to the reference audio.
         ref_audio_path (str): The file path to the reference audio.
     Returns:
         tuple [int, np.ndarray]: A tuple containing the sample rate (24000) and the generated audio waveform as a numpy array.
     """
+    if gen_text is None or not len(gen_text):
+        raise Exception("Please insert the new text to synthesize.")
+    if ref_audio_path != DEFAULT_REF_PATH and ref_text == DEFAULT_REF_TEXT:
+        ref_text = ""
+    if not len(ref_text):
+        ref_text = transcribe(ref_audio_path)
     logging.info(f"Using reference: {ref_audio_path}")
     gr.Info("Starting inference request!")
     gr.Info("Encoding reference...")
 demo = gr.Interface(
     fn=infer,
     inputs=[
         gr.Textbox(label="Text to Generate", value=DEFAULT_GEN_TEXT),
+        gr.Textbox(label="Reference Text (Optional)", value=DEFAULT_REF_TEXT),
+        gr.Audio(type="filepath", label="Reference Audio", value=DEFAULT_REF_PATH),
     ],
     outputs=gr.Audio(type="numpy", label="Generated Speech"),
     title="NeuTTS-Air☁️",

requirements.txt CHANGED Viewed

@@ -16,4 +16,6 @@ transformers==4.56.1
 vector-quantize-pytorch==1.17.8
 resemble-perth==1.0.1
 accelerate==1.10.1
-gradio

 vector-quantize-pytorch==1.17.8
 resemble-perth==1.0.1
 accelerate==1.10.1
+gradio
+groq