playmak3r committed on
Commit
f4b1b5b
·
1 Parent(s): 743b8f5

Add transcription functionality and improve input handling in infer function

Browse files
Files changed (2) hide show
  1. app.py +28 -5
  2. requirements.txt +3 -1
app.py CHANGED
@@ -4,6 +4,7 @@ sys.path.append("neutts-air")
4
  from neuttsair.neutts import NeuTTSAir
5
  import numpy as np
6
  import gradio as gr
 
7
 
8
 
9
  SAMPLES_PATH = os.path.join(os.getcwd(), "neutts-air", "samples")
@@ -24,23 +25,45 @@ tts = NeuTTSAir(
24
  codec_device="cpu"
25
  )
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  @spaces.GPU()
28
  def infer(
29
- ref_text: str,
30
- ref_audio_path: str,
31
  gen_text: str,
 
 
32
  ) -> tuple[int, np.ndarray]:
33
  """
34
  Generates speech using NeuTTS-Air given a reference audio and text, and new text to synthesize.
35
 
36
  Args:
 
37
  ref_text (str): The text corresponding to the reference audio.
38
  ref_audio_path (str): The file path to the reference audio.
39
- gen_text (str): The new text to synthesize.
40
  Returns:
41
  tuple [int, np.ndarray]: A tuple containing the sample rate (24000) and the generated audio waveform as a numpy array.
42
  """
43
 
 
 
 
 
 
 
 
44
  logging.info(f"Using reference: {ref_audio_path}")
45
  gr.Info("Starting inference request!")
46
  gr.Info("Encoding reference...")
@@ -54,9 +77,9 @@ def infer(
54
  demo = gr.Interface(
55
  fn=infer,
56
  inputs=[
57
- gr.Textbox(label="Reference Text", value=DEFAULT_REF_TEXT),
58
- gr.Audio(type="filepath", label="Reference Audio", value=DEFAULT_REF_PATH),
59
  gr.Textbox(label="Text to Generate", value=DEFAULT_GEN_TEXT),
 
 
60
  ],
61
  outputs=gr.Audio(type="numpy", label="Generated Speech"),
62
  title="NeuTTS-Air☁️",
 
4
  from neuttsair.neutts import NeuTTSAir
5
  import numpy as np
6
  import gradio as gr
7
+ from groq import Groq
8
 
9
 
10
  SAMPLES_PATH = os.path.join(os.getcwd(), "neutts-air", "samples")
 
25
  codec_device="cpu"
26
  )
27
 
28
+
29
+
30
def transcribe(file_path: str) -> str:
    """Transcribe an audio file with Groq's hosted Whisper model.

    Args:
        file_path: Path to the audio file to transcribe.

    Returns:
        The transcription text reported by the API. A warning is logged
        when that text is empty or missing.
    """
    # NOTE(review): Groq() is built without arguments, so it presumably reads
    # its API key from the environment (GROQ_API_KEY) -- confirm in deployment.
    client = Groq()

    # Read the audio up front so the file handle is closed before the
    # (potentially slow) network request is made.
    with open(file_path, "rb") as audio_file:
        audio_bytes = audio_file.read()

    transcription = client.audio.transcriptions.create(
        # Send only the base name: the extension is kept for the upload,
        # but the local directory layout is not leaked to the API.
        file=(os.path.basename(file_path), audio_bytes),
        model="whisper-large-v3-turbo",
        temperature=0,
        response_format="verbose_json",
    )

    text = transcription.text
    # Truthiness covers both "" and None; logging.warn is a deprecated alias.
    if not text:
        logging.warning("Error while transcripting the reference audio.")
    return text
42
+
43
  @spaces.GPU()
44
  def infer(
 
 
45
  gen_text: str,
46
+ ref_text: str = DEFAULT_REF_TEXT,
47
+ ref_audio_path: str = DEFAULT_REF_PATH,
48
  ) -> tuple[int, np.ndarray]:
49
  """
50
  Generates speech using NeuTTS-Air given a reference audio and text, and new text to synthesize.
51
 
52
  Args:
53
+ gen_text (str): The new text to synthesize.
54
  ref_text (str): The text corresponding to the reference audio.
55
  ref_audio_path (str): The file path to the reference audio.
 
56
  Returns:
57
  tuple [int, np.ndarray]: A tuple containing the sample rate (24000) and the generated audio waveform as a numpy array.
58
  """
59
 
60
+ if gen_text is None or not len(gen_text):
61
+ raise Exception("Please insert the new text to synthesize.")
62
+ if ref_audio_path != DEFAULT_REF_PATH and ref_text == DEFAULT_REF_TEXT:
63
+ ref_text = ""
64
+ if not len(ref_text):
65
+ ref_text = transcribe(ref_audio_path)
66
+
67
  logging.info(f"Using reference: {ref_audio_path}")
68
  gr.Info("Starting inference request!")
69
  gr.Info("Encoding reference...")
 
77
  demo = gr.Interface(
78
  fn=infer,
79
  inputs=[
 
 
80
  gr.Textbox(label="Text to Generate", value=DEFAULT_GEN_TEXT),
81
+ gr.Textbox(label="Reference Text (Optional)", value=DEFAULT_REF_TEXT),
82
+ gr.Audio(type="filepath", label="Reference Audio", value=DEFAULT_REF_PATH),
83
  ],
84
  outputs=gr.Audio(type="numpy", label="Generated Speech"),
85
  title="NeuTTS-Air☁️",
requirements.txt CHANGED
@@ -16,4 +16,6 @@ transformers==4.56.1
16
  vector-quantize-pytorch==1.17.8
17
  resemble-perth==1.0.1
18
  accelerate==1.10.1
19
- gradio
 
 
 
16
  vector-quantize-pytorch==1.17.8
17
  resemble-perth==1.0.1
18
  accelerate==1.10.1
19
+ gradio
20
+
21
+ groq