Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -7,9 +7,118 @@ import json
|
|
7 |
import shutil
|
8 |
from pathlib import Path
|
9 |
import numpy as np
|
|
|
|
|
10 |
|
11 |
# ===== NEUTTS IMPORTS =====
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# ===== CONFIGURATION =====
|
15 |
CONFIG_FILE = "voice_profiles.json"
|
@@ -26,7 +135,18 @@ class VoiceProfileManager:
|
|
26 |
if os.path.exists(self.config_file):
|
27 |
with open(self.config_file, 'r') as f:
|
28 |
return json.load(f)
|
29 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def save_profiles(self):
|
32 |
with open(self.config_file, 'w') as f:
|
@@ -53,11 +173,11 @@ def download_default_samples():
|
|
53 |
samples = {
|
54 |
"dave": {
|
55 |
"audio": "https://github.com/neophonic/neutts-air/raw/main/samples/dave.wav",
|
56 |
-
"text": "
|
57 |
},
|
58 |
"andrea": {
|
59 |
"audio": "https://github.com/neophonic/neutts-air/raw/main/samples/andrea.wav",
|
60 |
-
"text": "
|
61 |
}
|
62 |
}
|
63 |
|
@@ -67,332 +187,12 @@ def download_default_samples():
|
|
67 |
|
68 |
if not os.path.exists(audio_path):
|
69 |
try:
|
70 |
-
|
|
|
71 |
with open(audio_path, 'wb') as f:
|
72 |
f.write(response.content)
|
73 |
|
74 |
-
response = requests.get(urls["text"])
|
75 |
with open(text_path, 'w') as f:
|
76 |
-
f.write(
|
77 |
|
78 |
-
print(f
|
79 |
-
except Exception as e:
|
80 |
-
print(f"❌ Failed to download {name}: {e}")
|
81 |
-
|
82 |
-
# ===== TTS ENGINE =====
|
83 |
-
class TTSEngine:
    """Owns the NeuTTS-Air model (created on first use) and the voice profiles.

    Constructing an engine builds a ``VoiceProfileManager`` and calls
    ``download_default_samples()``; the model itself is only created when
    ``initialize_tts`` is first called.
    """

    def __init__(self):
        self.tts = None  # NeuTTSAir instance, created on demand by initialize_tts()
        self.voice_manager = VoiceProfileManager()
        download_default_samples()

    def initialize_tts(self):
        """Create the NeuTTSAir model on first call and return the cached instance."""
        if self.tts is None:
            print("🚀 Initializing NeuTTS Q4 GGUF...")
            self.tts = NeuTTSAir(
                backbone_repo="neuphonic/neutts-air-q4-gguf",
                backbone_device="cpu",
                codec_repo="neuphonic/neucodec",
                codec_device="cpu"
            )
        return self.tts

    def generate_speech(self, text, voice_name):
        """Synthesize ``text`` with the stored voice profile ``voice_name``.

        Returns:
            ``(wav, None)`` on success, or ``(None, message)`` when the text is
            blank, the profile is unknown, or any step raises.
        """
        # Nothing to synthesize: report it instead of asking the model to
        # generate speech for an empty prompt.
        if not text or not text.strip():
            return None, "❌ No text provided for speech generation"

        try:
            tts = self.initialize_tts()
            profile = self.voice_manager.get_profile(voice_name)

            if not profile:
                return None, f"❌ Voice profile '{voice_name}' not found"

            ref_codes = tts.encode_reference(profile["audio_path"])
            ref_text = profile["text"]

            wav = tts.infer(text, ref_codes, ref_text)
            return wav, None

        except Exception as e:
            # Deliberate catch-all: callers expect an error string, not a raise.
            return None, f"❌ Generation error: {str(e)}"
|
116 |
-
|
117 |
-
# ===== SCRIPT PARSING =====
|
118 |
-
def parse_conversation_script(script_text):
    """Parse a ``Speaker: dialogue`` script into a list of line dictionaries.

    Each non-blank line is split on its first ``:`` into a speaker label and
    the spoken text. Lines without a colon, or with an empty label, are
    attributed to ``"Speaker A"``. Lines that carry no spoken text (for
    example a bare ``"Host:"``) are dropped so they never reach synthesis.

    Args:
        script_text: The raw script; ``None`` or an empty string is accepted.

    Returns:
        A list of ``{"speaker": str, "text": str}`` dictionaries in script order.
    """
    if not script_text:
        return []

    entries = []
    for raw_line in script_text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Split on the first colon only, so times such as "10:30" stay in the text.
        label, separator, dialogue = line.partition(':')
        if separator:
            speaker = label.strip() or "Speaker A"
            text = dialogue.strip()
        else:
            # Default to Speaker A if no label
            speaker = "Speaker A"
            text = line

        if not text:
            continue

        entries.append({
            "speaker": speaker,
            "text": text
        })
    return entries
|
136 |
-
|
137 |
-
def generate_script_from_prompt(prompt, style="conversational"):
    """Build a six-line podcast script about ``prompt`` from a fixed template.

    Args:
        prompt: Topic inserted into the first line of the template. Leading and
            trailing whitespace is removed when it is a string.
        style: ``"conversational"``, ``"interview"`` or ``"debate"``. Matching
            ignores case and surrounding whitespace; ``None`` or an unknown
            value falls back to ``"conversational"``.

    Returns:
        The script as one string with lines separated by ``"\\n"``.
    """
    # Simple template-based generation
    templates = {
        "conversational": [
            "Host: Welcome to our podcast! Today we're discussing {prompt}",
            "Co-host: That's right! It's a fascinating topic that affects many people.",
            "Host: Let's start with the basics. What should our audience know about this?",
            "Co-host: Well, first of all, it's important to understand the key concepts.",
            "Host: And what about the practical applications? How can people use this in their daily lives?",
            "Co-host: Great question! There are several ways to apply this knowledge effectively."
        ],
        "interview": [
            "Interviewer: Thanks for joining us today to talk about {prompt}",
            "Guest: Happy to be here! It's a topic I'm very passionate about.",
            "Interviewer: Could you share some background on how you got involved in this field?",
            "Guest: Absolutely. It all started several years ago when I first discovered this area.",
            "Interviewer: What are the most exciting developments you're seeing right now?",
            "Guest: There are some incredible advancements happening that will change everything."
        ],
        "debate": [
            "Moderator: Welcome to our debate on {prompt}",
            "Proponent: I believe this is one of the most important issues of our time.",
            "Opponent: While I respect that view, I have some serious concerns about the approach.",
            "Proponent: Let me address those concerns with some concrete evidence.",
            "Opponent: The evidence is compelling, but we must consider the broader implications.",
            "Moderator: Let's hear from both sides about potential solutions."
        ]
    }

    # Normalise the style key so "Interview" or " debate " select the intended
    # template instead of silently falling back to the default.
    style_key = style.strip().lower() if isinstance(style, str) else "conversational"
    template = templates.get(style_key, templates["conversational"])

    topic = prompt.strip() if isinstance(prompt, str) else prompt
    return "\n".join(line.format(prompt=topic) for line in template)
|
170 |
-
|
171 |
-
# ===== MAIN GENERATION FUNCTIONS =====
|
172 |
-
# Module-level engine instance; created at import time, so the TTSEngine
# constructor (profile manager + default sample download) runs on import.
tts_engine = TTSEngine()
|
173 |
-
|
174 |
-
def clone_voice(voice_name, upload_audio, reference_text):
    """Register a new voice profile from an uploaded reference recording.

    The uploaded file is copied into ``SAMPLE_DIR`` under the voice name and the
    profile is stored through ``tts_engine.voice_manager.add_profile``.

    Args:
        voice_name: Name for the new profile. Only its final path component is
            used, so directory parts such as ``"../"`` cannot redirect the copy.
        upload_audio: Path of the uploaded audio file.
        reference_text: Transcript of what is spoken in the audio.

    Returns:
        The status string from ``add_profile`` on success, otherwise a message
        starting with ``"❌"``.
    """
    if not voice_name or not upload_audio:
        return "❌ Please provide a voice name and audio file"

    # The name comes straight from a UI text box and ends up in a file path:
    # keep only the last path component and reject names that reduce to nothing.
    safe_name = Path(str(voice_name).strip().replace("\\", "/")).name
    if not safe_name or safe_name in {".", ".."}:
        return "❌ Please provide a voice name and audio file"

    try:
        # Save uploaded audio (creating the sample directory if it is missing)
        Path(SAMPLE_DIR).mkdir(parents=True, exist_ok=True)
        audio_ext = Path(upload_audio).suffix
        audio_path = f"{SAMPLE_DIR}/{safe_name}{audio_ext}"
        shutil.copy2(upload_audio, audio_path)

        # Save voice profile
        result = tts_engine.voice_manager.add_profile(safe_name, audio_path, reference_text)
        return result
    except Exception as e:
        return f"❌ Error cloning voice: {str(e)}"
|
190 |
-
|
191 |
-
# Label fragments that identify the second / first speaker. The B-side list is
# checked first because "co-host" also contains the A-side fragment "host".
_SPEAKER_B_KEYWORDS = ("co-host", "cohost", "guest")
_SPEAKER_A_KEYWORDS = ("host", "interviewer")


def _voice_for_label(label, speaker_a, speaker_b):
    """Return the voice for a recognised speaker label, or None if unrecognised."""
    normalized = label.strip().lower()
    words = normalized.split()
    # A bare "a"/"b" only counts as a whole word ("Speaker B"), never as a
    # letter inside another word.
    if "b" in words or any(key in normalized for key in _SPEAKER_B_KEYWORDS):
        return speaker_b
    if "a" in words or any(key in normalized for key in _SPEAKER_A_KEYWORDS):
        return speaker_a
    return None


def generate_podcast(script_input, speaker_a, speaker_b, prompt_input, script_style):
    """Generate a complete podcast with two speakers.

    When no script is supplied but a prompt is, a script is generated from the
    prompt first. Every parsed line is synthesized with the voice chosen for its
    speaker label, lines are joined with a 0.5 second pause, and the audio and
    script are written next to each other in the temp directory.

    Args:
        script_input: Script text in ``Speaker: dialogue`` form (may be empty).
        speaker_a: Voice profile name for the host / first speaker.
        speaker_b: Voice profile name for the co-host / guest / second speaker.
        prompt_input: Topic used to generate a script when none is given.
        script_style: Template style passed to the script generator.

    Returns:
        Always a 3-tuple ``(audio_file, script_file, status)``; the two paths
        are ``None`` whenever ``status`` reports an error.
    """
    try:
        # Generate script if prompt is provided
        if prompt_input and (not script_input or not script_input.strip()):
            script_input = generate_script_from_prompt(prompt_input, script_style)

        if not script_input or not script_input.strip():
            return None, None, "❌ Please provide either a script or a prompt"

        # Parse conversation
        conversation = parse_conversation_script(script_input)
        if not conversation:
            return None, None, "❌ Could not parse script"

        current_sample_rate = 24000
        segments = []
        # Unrecognised labels get a voice on first appearance (alternating A, B)
        # and keep it, so one speaker never switches voice mid-podcast.
        fallback_voices = {}

        for line in conversation:
            speaker = line["speaker"]
            text = line["text"]

            # Choose voice based on speaker label or A/B assignment
            voice = _voice_for_label(speaker, speaker_a, speaker_b)
            if voice is None:
                key = speaker.strip().lower()
                if key not in fallback_voices:
                    fallback_voices[key] = speaker_a if len(fallback_voices) % 2 == 0 else speaker_b
                voice = fallback_voices[key]

            print(f"🎙️ {speaker} ({voice}): {text}")

            # Generate speech
            wav, error = tts_engine.generate_speech(text, voice)
            if error:
                return None, None, error

            # Add a small pause between speakers
            if segments:
                segments.append(np.zeros(int(0.5 * current_sample_rate)))  # 0.5 second pause
            segments.append(wav)

        # Join once at the end instead of re-copying the growing array per line.
        combined_audio = np.concatenate(segments)

        # Save final audio; the temp file is only used to reserve a unique name
        # and is closed before the audio is written to that path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            audio_file = f.name
        sf.write(audio_file, combined_audio, current_sample_rate)

        # Save script next to the audio; replace only the trailing ".wav".
        script_file = audio_file[:-len(".wav")] + "_script.txt"
        with open(script_file, 'w', encoding="utf-8") as f:
            f.write(script_input)

        return audio_file, script_file, "✅ Podcast generated successfully!"

    except Exception as e:
        return None, None, f"❌ Error: {str(e)}"
|
252 |
-
|
253 |
-
# ===== GRADIO UI =====
|
254 |
-
# Custom stylesheet passed to gr.Blocks below.
css = """
.container { max-width: 1400px; margin: 0 auto; }
.header { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%); color: white; padding: 30px; border-radius: 12px; margin-bottom: 25px; text-align: center; border: 3px solid #1E90FF; }
.section { border: 2px solid #32CD32; border-radius: 10px; padding: 20px; margin-bottom: 20px; background: white; }
.output-section { background: linear-gradient(135deg, #F0FFF0 0%, #F0F8FF 100%); border: 2px dashed #1E90FF; border-radius: 10px; padding: 20px; margin-top: 20px; }
.btn-primary { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%) !important; border: 2px solid #1E90FF !important; color: white !important; font-weight: bold !important; }
.btn-secondary { background: linear-gradient(135deg, #FFA500 0%, #FF6347 100%) !important; border: 2px solid #FF6347 !important; color: white !important; }
.tab { background: #f0f8ff; padding: 15px; border-radius: 8px; margin: 10px 0; }
"""

# Three-tab UI: voice cloning, podcast creation, and generated output.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.HTML("""
    <div class="header">
    <h1>🎙️ 2nd-Host AI - Complete Podcast Studio</h1>
    <h3>Voice Cloning • 2-Speaker Podcasts • Script Generation • Export</h3>
    </div>
    """)

    # Initialize voice manager
    voice_manager = VoiceProfileManager()
    available_voices = voice_manager.list_profiles()

    with gr.Tab("🎭 Voice Cloning"):
        gr.Markdown("### Clone New Voices")
        with gr.Row():
            with gr.Column():
                voice_name = gr.Textbox(label="Voice Name", placeholder="e.g., 'David', 'Sarah', 'Expert'")
                upload_audio = gr.Audio(label="Reference Audio", type="filepath")
                reference_text = gr.Textbox(
                    label="Reference Text",
                    value="Hey there, this is my voice for cloning.",
                    placeholder="Text spoken in the reference audio"
                )
                clone_btn = gr.Button("🎯 Clone Voice", variant="primary")

            with gr.Column():
                clone_status = gr.Textbox(label="Cloning Status", interactive=False)
                available_voices_display = gr.Dropdown(
                    label="Available Voices",
                    choices=available_voices,
                    value=available_voices[0] if available_voices else None
                )
                refresh_btn = gr.Button("🔄 Refresh Voices")

    with gr.Tab("🎬 Podcast Studio"):
        gr.Markdown("### Create 2-Speaker Podcast")

        with gr.Row():
            with gr.Column():
                # Script input
                script_input = gr.Textbox(
                    label="Podcast Script",
                    lines=6,
                    placeholder="""Format: Speaker: Dialogue
Example:
Host: Welcome to our show!
Co-host: Thanks for having me!
Host: Let's discuss AI voice technology...
Co-host: It's revolutionizing content creation!""",
                    value=""
                )

                # Script generation
                prompt_input = gr.Textbox(
                    label="Or Generate from Prompt",
                    placeholder="e.g., 'The future of AI in education'"
                )
                script_style = gr.Radio(
                    choices=["conversational", "interview", "debate"],
                    label="Script Style",
                    value="conversational"
                )
                generate_script_btn = gr.Button("📝 Generate Script", variant="secondary")

            with gr.Column():
                # Speaker selection
                speaker_a = gr.Dropdown(
                    choices=available_voices,
                    label="🎤 Speaker A (Host)",
                    value=available_voices[0] if available_voices else None
                )
                speaker_b = gr.Dropdown(
                    choices=available_voices,
                    label="🎤 Speaker B (Co-host/Guest)",
                    value=available_voices[1] if len(available_voices) > 1 else available_voices[0] if available_voices else None
                )

                generate_btn = gr.Button("🚀 Generate Podcast", variant="primary", size="lg")

    with gr.Tab("📤 Output"):
        gr.Markdown("### Generated Podcast")
        with gr.Row():
            with gr.Column():
                audio_output = gr.Audio(label="🎧 Podcast Audio", type="filepath")
                script_output = gr.File(label="📄 Script File", file_types=[".txt"])

            with gr.Column():
                generation_status = gr.Textbox(label="Generation Status", lines=3)
                # NOTE(review): no click handler is attached to this button in
                # this section; confirm whether one is intended.
                download_btn = gr.Button("💾 Download All", variant="primary")

    # ===== EVENT HANDLERS =====
    def refresh_voices():
        """Reload the profile list and return updates for all three voice dropdowns.

        Order matches the ``outputs`` lists below:
        (available_voices_display, speaker_a, speaker_b).
        """
        voice_manager = VoiceProfileManager()
        voices = voice_manager.list_profiles()
        first_voice = voices[0] if voices else None
        second_voice = voices[1] if len(voices) > 1 else first_voice
        return (
            gr.Dropdown(choices=voices, value=first_voice),
            gr.Dropdown(choices=voices, value=first_voice),
            gr.Dropdown(choices=voices, value=second_voice),
        )

    def handle_clone_voice(voice_name, audio_path, text):
        """Clone the voice, then refresh every voice dropdown so it shows up."""
        result = clone_voice(voice_name, audio_path, text)
        return result, *refresh_voices()

    def handle_generate_script(prompt, style):
        """Fill the script box from the prompt, or with a message if it is empty."""
        if not prompt:
            return "❌ Please enter a prompt"
        script = generate_script_from_prompt(prompt, style)
        return script

    def handle_generate_podcast(script, speaker_a, speaker_b, prompt, style):
        """Forward the form values to generate_podcast."""
        return generate_podcast(script, speaker_a, speaker_b, prompt, style)

    # Connect events
    clone_btn.click(
        handle_clone_voice,
        inputs=[voice_name, upload_audio, reference_text],
        # The "Available Voices" dropdown is refreshed too; it was previously
        # left out and stayed stale after cloning.
        outputs=[clone_status, available_voices_display, speaker_a, speaker_b]
    )

    refresh_btn.click(
        refresh_voices,
        outputs=[available_voices_display, speaker_a, speaker_b]
    )

    generate_script_btn.click(
        handle_generate_script,
        inputs=[prompt_input, script_style],
        outputs=[script_input]
    )

    generate_btn.click(
        handle_generate_podcast,
        inputs=[script_input, speaker_a, speaker_b, prompt_input, script_style],
        outputs=[audio_output, script_output, generation_status]
    )

if __name__ == "__main__":
    demo.launch(share=True)
|
|
|
7 |
import shutil
|
8 |
from pathlib import Path
|
9 |
import numpy as np
|
10 |
+
import re
|
11 |
+
from typing import Generator
|
12 |
|
13 |
# ===== NEUTTS IMPORTS =====
|
14 |
+
# Resolve NeuTTSAir: prefer the installed package, then retry with an extra
# site-packages path, and finally fall back to a local implementation built
# from the underlying components. NEUTTS_AVAILABLE records the outcome.
try:
    # Try multiple import approaches for NeuTTS
    try:
        # Approach 1: Direct import from the structure
        from neutts import NeuTTSAir
    except ImportError:
        try:
            # Approach 2: Import from the module directly
            import sys
            sys.path.append('/usr/local/lib/python3.10/site-packages')
            from neutts import NeuTTSAir
        except ImportError:
            # Approach 3: Use the components directly
            from phonemizer.backend import EspeakBackend
            import perth
            from neucodec import NeuCodec
            from llama_cpp import Llama
            # torch must be bound at module level: the class below names it in
            # a signature annotation (evaluated when the class is defined) and
            # in _infer_gguf. Without this the class definition raised
            # NameError and the outer handler reported NeuTTS as unavailable.
            import torch

            # Define NeuTTSAir class manually
            class NeuTTSAir:
                """Local fallback: GGUF backbone + NeuCodec + phonemizer + watermarker."""

                def __init__(self, backbone_repo="neuphonic/neutts-air-q4-gguf", backbone_device="cpu", codec_repo="neuphonic/neucodec", codec_device="cpu"):
                    self.sample_rate = 24_000
                    self.max_context = 2048
                    self.hop_length = 480

                    print("🧠 Loading phonemizer...")
                    self.phonemizer = EspeakBackend(language="en-us", preserve_punctuation=True, with_stress=True)
                    self._load_backbone(backbone_repo, backbone_device)
                    self._load_codec(codec_repo, codec_device)
                    self.watermarker = perth.PerthImplicitWatermarker()
                    print("✅ NeuTTS-Air initialized!")

                def _load_backbone(self, backbone_repo, backbone_device):
                    """Load the GGUF backbone; ``backbone_device`` is not used here (n_gpu_layers=0)."""
                    print(f"🔧 Loading Q4 GGUF backbone: {backbone_repo}")
                    self.backbone = Llama.from_pretrained(
                        repo_id=backbone_repo,
                        filename="*.gguf",
                        n_ctx=self.max_context,
                        n_gpu_layers=0,
                        verbose=False,
                        use_mlock=False,
                        n_threads=2,
                        low_vram=True
                    )

                def _load_codec(self, codec_repo, codec_device):
                    """Load the codec, switch it to eval mode and move it to ``codec_device``."""
                    print(f"🔧 Loading codec: {codec_repo}")
                    self.codec = NeuCodec.from_pretrained(codec_repo)
                    self.codec.eval().to(codec_device)

                def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
                    """Generate watermarked speech for ``text`` in the reference voice."""
                    output_str = self._infer_gguf(ref_codes, ref_text, text)
                    wav = self._decode(output_str)
                    watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=24000)
                    return watermarked_wav

                def encode_reference(self, ref_audio_path: str | Path):
                    """Load the reference audio as 16 kHz mono and encode it to codec codes."""
                    import librosa
                    wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
                    wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)
                    with torch.no_grad():
                        ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
                    return ref_codes.numpy() if isinstance(ref_codes, torch.Tensor) else ref_codes

                def _decode(self, codes: str):
                    """Extract ``<|speech_N|>`` tokens from ``codes`` and decode them to audio.

                    Raises:
                        ValueError: if the string contains no speech tokens.
                    """
                    speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
                    if len(speech_ids) > 0:
                        with torch.no_grad():
                            codes_tensor = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(self.codec.device)
                            recon = self.codec.decode_code(codes_tensor).cpu().numpy()
                        return recon[0, 0, :]
                    else:
                        raise ValueError("No speech tokens found")

                def _to_phones(self, text: str) -> str:
                    """Phonemize ``text`` and collapse whitespace to single spaces."""
                    phones = self.phonemizer.phonemize([text])
                    return " ".join(phones[0].split())

                def _infer_gguf(self, ref_codes: list, ref_text: str, input_text: str) -> str:
                    """Prompt the backbone with reference phones/codes and return its raw text output."""
                    ref_text_phones = self._to_phones(ref_text)
                    input_text_phones = self._to_phones(input_text)

                    if isinstance(ref_codes, (torch.Tensor, np.ndarray)):
                        ref_codes = ref_codes.tolist()

                    codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])

                    prompt = f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text_phones} {input_text_phones}<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"

                    output = self.backbone(
                        prompt,
                        max_tokens=self.max_context,
                        temperature=1.0,
                        top_k=50,
                        stop=["<|SPEECH_GENERATION_END|>"],
                        echo=False
                    )

                    return output["choices"][0]["text"]

    NEUTTS_AVAILABLE = True
    print("✅ NeuTTS-Air loaded successfully!")

except Exception as e:
    # Any failure above (missing package, failed fallback) leaves the app
    # running with NeuTTS marked as unavailable.
    NEUTTS_AVAILABLE = False
    print(f"❌ NeuTTS-Air import failed: {e}")
|
122 |
|
123 |
# ===== CONFIGURATION =====
|
124 |
CONFIG_FILE = "voice_profiles.json"
|
|
|
135 |
if os.path.exists(self.config_file):
|
136 |
with open(self.config_file, 'r') as f:
|
137 |
return json.load(f)
|
138 |
+
return {
|
139 |
+
"dave": {
|
140 |
+
"audio_path": "samples/dave.wav",
|
141 |
+
"text": "Hey there, this is Dave speaking.",
|
142 |
+
"created_at": "default"
|
143 |
+
},
|
144 |
+
"andrea": {
|
145 |
+
"audio_path": "samples/andrea.wav",
|
146 |
+
"text": "Hello, my name is Andrea.",
|
147 |
+
"created_at": "default"
|
148 |
+
}
|
149 |
+
}
|
150 |
|
151 |
def save_profiles(self):
|
152 |
with open(self.config_file, 'w') as f:
|
|
|
173 |
samples = {
|
174 |
"dave": {
|
175 |
"audio": "https://github.com/neophonic/neutts-air/raw/main/samples/dave.wav",
|
176 |
+
"text": "Hey there, this is Dave speaking."
|
177 |
},
|
178 |
"andrea": {
|
179 |
"audio": "https://github.com/neophonic/neutts-air/raw/main/samples/andrea.wav",
|
180 |
+
"text": "Hello, my name is Andrea."
|
181 |
}
|
182 |
}
|
183 |
|
|
|
187 |
|
188 |
if not os.path.exists(audio_path):
|
189 |
try:
|
190 |
+
print(f"📥 Downloading {name} sample...")
|
191 |
+
response = requests.get(urls["audio"], timeout=60)
|
192 |
with open(audio_path, 'wb') as f:
|
193 |
f.write(response.content)
|
194 |
|
|
|
195 |
with open(text_path, 'w') as f:
|
196 |
+
f.write(urls["text"])
|
197 |
|
198 |
+
print(f
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|