Actual-Innocence committed on
Commit
ab5e85c
·
verified ·
1 Parent(s): dff00d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -328
app.py CHANGED
@@ -7,9 +7,118 @@ import json
7
  import shutil
8
  from pathlib import Path
9
  import numpy as np
 
 
10
 
11
  # ===== NEUTTS IMPORTS =====
12
- from neuttsair.neutts import NeuTTSAir
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # ===== CONFIGURATION =====
15
  CONFIG_FILE = "voice_profiles.json"
@@ -26,7 +135,18 @@ class VoiceProfileManager:
26
  if os.path.exists(self.config_file):
27
  with open(self.config_file, 'r') as f:
28
  return json.load(f)
29
- return {}
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def save_profiles(self):
32
  with open(self.config_file, 'w') as f:
@@ -53,11 +173,11 @@ def download_default_samples():
53
  samples = {
54
  "dave": {
55
  "audio": "https://github.com/neophonic/neutts-air/raw/main/samples/dave.wav",
56
- "text": "https://raw.githubusercontent.com/neophonic/neutts-air/main/samples/dave.txt"
57
  },
58
  "andrea": {
59
  "audio": "https://github.com/neophonic/neutts-air/raw/main/samples/andrea.wav",
60
- "text": "https://raw.githubusercontent.com/neophonic/neutts-air/main/samples/andrea.txt"
61
  }
62
  }
63
 
@@ -67,332 +187,12 @@ def download_default_samples():
67
 
68
  if not os.path.exists(audio_path):
69
  try:
70
- response = requests.get(urls["audio"])
 
71
  with open(audio_path, 'wb') as f:
72
  f.write(response.content)
73
 
74
- response = requests.get(urls["text"])
75
  with open(text_path, 'w') as f:
76
- f.write(response.text)
77
 
78
- print(f"✅ Downloaded {name} sample")
79
- except Exception as e:
80
- print(f"❌ Failed to download {name}: {e}")
81
-
82
- # ===== TTS ENGINE =====
83
class TTSEngine:
    """Pair a lazily created NeuTTSAir model with the voice-profile store."""

    def __init__(self):
        # The model itself is only built on first use (see initialize_tts).
        self.tts = None
        self.voice_manager = VoiceProfileManager()
        download_default_samples()

    def initialize_tts(self):
        """Return the cached NeuTTSAir instance, constructing it on first call."""
        if self.tts is not None:
            return self.tts

        print("🚀 Initializing NeuTTS Q4 GGUF...")
        model_options = {
            "backbone_repo": "neuphonic/neutts-air-q4-gguf",
            "backbone_device": "cpu",
            "codec_repo": "neuphonic/neucodec",
            "codec_device": "cpu",
        }
        self.tts = NeuTTSAir(**model_options)
        return self.tts

    def generate_speech(self, text, voice_name):
        """Synthesize ``text`` with the stored profile called ``voice_name``.

        Returns a ``(wav, error)`` pair: ``(wav, None)`` on success, or
        ``(None, message)`` when the profile is missing or any step raises.
        """
        try:
            engine = self.initialize_tts()
            profile = self.voice_manager.get_profile(voice_name)
            if profile:
                reference_codes = engine.encode_reference(profile["audio_path"])
                reference_text = profile["text"]
                return engine.infer(text, reference_codes, reference_text), None
            return None, f"❌ Voice profile '{voice_name}' not found"
        except Exception as exc:
            # Failures are reported to the caller as a message, never raised.
            return None, f"❌ Generation error: {exc!s}"
116
-
117
- # ===== SCRIPT PARSING =====
118
def parse_conversation_script(script_text):
    """Split a script into ``{"speaker", "text"}`` entries, one per non-blank line.

    A line is divided at its first colon into speaker label and dialogue.
    Non-empty lines without a colon are attributed to "Speaker A".
    """
    parsed = []
    for raw in script_text.strip().split('\n'):
        entry = raw.strip()
        label, colon, spoken = entry.partition(':')
        if colon:
            parsed.append({"speaker": label.strip(), "text": spoken.strip()})
        elif entry:
            # No label present: fall back to the first speaker.
            parsed.append({"speaker": "Speaker A", "text": entry})
    return parsed
136
-
137
def generate_script_from_prompt(prompt, style="conversational"):
    """Build a six-line podcast script about ``prompt`` from a fixed template.

    ``style`` selects "conversational", "interview" or "debate"; any other
    value falls back to the conversational template. The result is the
    template lines joined with newlines, with ``{prompt}`` substituted.
    """
    scripts_by_style = {
        "conversational": (
            "Host: Welcome to our podcast! Today we're discussing {prompt}",
            "Co-host: That's right! It's a fascinating topic that affects many people.",
            "Host: Let's start with the basics. What should our audience know about this?",
            "Co-host: Well, first of all, it's important to understand the key concepts.",
            "Host: And what about the practical applications? How can people use this in their daily lives?",
            "Co-host: Great question! There are several ways to apply this knowledge effectively.",
        ),
        "interview": (
            "Interviewer: Thanks for joining us today to talk about {prompt}",
            "Guest: Happy to be here! It's a topic I'm very passionate about.",
            "Interviewer: Could you share some background on how you got involved in this field?",
            "Guest: Absolutely. It all started several years ago when I first discovered this area.",
            "Interviewer: What are the most exciting developments you're seeing right now?",
            "Guest: There are some incredible advancements happening that will change everything.",
        ),
        "debate": (
            "Moderator: Welcome to our debate on {prompt}",
            "Proponent: I believe this is one of the most important issues of our time.",
            "Opponent: While I respect that view, I have some serious concerns about the approach.",
            "Proponent: Let me address those concerns with some concrete evidence.",
            "Opponent: The evidence is compelling, but we must consider the broader implications.",
            "Moderator: Let's hear from both sides about potential solutions.",
        ),
    }

    if style in scripts_by_style:
        chosen = scripts_by_style[style]
    else:
        chosen = scripts_by_style["conversational"]
    return "\n".join(row.format(prompt=prompt) for row in chosen)
170
-
171
- # ===== MAIN GENERATION FUNCTIONS =====
172
- tts_engine = TTSEngine()
173
-
174
def clone_voice(voice_name, upload_audio, reference_text):
    """Register a new voice profile from an uploaded reference recording.

    The upload is copied into ``SAMPLE_DIR`` as ``<voice_name><ext>`` and the
    profile is stored through ``tts_engine.voice_manager.add_profile``.

    Args:
        voice_name: Name for the new profile; also used as the file stem.
        upload_audio: Filesystem path of the uploaded reference audio.
        reference_text: Transcript of what is spoken in the reference audio.

    Returns:
        The status value returned by ``add_profile`` on success, otherwise a
        "❌ ..." message describing why the voice was not cloned.
    """
    if not voice_name or not upload_audio:
        return "❌ Please provide a voice name and audio file"

    # voice_name comes straight from the UI and becomes part of a file path;
    # refuse anything that could point outside SAMPLE_DIR.
    if "/" in voice_name or "\\" in voice_name or voice_name in (".", ".."):
        return "❌ Voice name must not contain path separators"

    try:
        # Save uploaded audio, keeping the upload's original extension.
        audio_ext = Path(upload_audio).suffix
        audio_path = f"{SAMPLE_DIR}/{voice_name}{audio_ext}"
        shutil.copy2(upload_audio, audio_path)

        # Save voice profile
        return tts_engine.voice_manager.add_profile(voice_name, audio_path, reference_text)
    except Exception as e:
        # Reported to the UI as a status string rather than raised.
        return f"❌ Error cloning voice: {str(e)}"
190
-
191
def _speaker_slot(label):
    """Classify a speaker label as slot "a", slot "b", or None if unrecognized."""
    name = label.lower()
    words = name.split()
    # Second-speaker keywords are tested first because "co-host" contains
    # "host". Single letters must be whole words so that "Speaker B" is not
    # claimed by the "a" inside "speaker".
    if "co-host" in name or "cohost" in name or "guest" in name or "b" in words:
        return "b"
    if "host" in name or "interviewer" in name or "a" in words:
        return "a"
    return None


def generate_podcast(script_input, speaker_a, speaker_b, prompt_input, script_style):
    """Generate a complete two-speaker podcast and write it to temp files.

    Args:
        script_input: Script text in "Speaker: dialogue" lines; may be empty.
        speaker_a: Voice profile name used for the host / first speaker.
        speaker_b: Voice profile name used for the co-host / guest.
        prompt_input: Topic used to generate a script when none is supplied.
        script_style: Template style passed to ``generate_script_from_prompt``.

    Returns:
        A 3-tuple ``(audio_file, script_file, status)``. On any failure the
        two paths are ``None`` and ``status`` carries the error message, so
        every path matches the three outputs the UI binds to this function.
    """
    try:
        # Generate script if prompt is provided
        if prompt_input and (not script_input or script_input.strip() == ""):
            script_input = generate_script_from_prompt(prompt_input, script_style)

        if not script_input or script_input.strip() == "":
            return None, None, "❌ Please provide either a script or a prompt"

        # Parse conversation
        conversation = parse_conversation_script(script_input)
        if not conversation:
            return None, None, "❌ Could not parse script"

        current_sample_rate = 24000
        pause = np.zeros(int(0.5 * current_sample_rate))  # 0.5 second pause
        voices = {"a": speaker_a, "b": speaker_b}
        fallback_slots = {}  # unrecognized label -> slot, fixed on first use
        segments = []

        for line in conversation:
            speaker = line["speaker"]
            text = line["text"]
            if not text:
                # e.g. a bare "Host:" line; nothing to synthesize.
                continue

            slot = _speaker_slot(speaker)
            if slot is None:
                # Unknown labels alternate A/B in order of first appearance,
                # so one label keeps one voice for the whole script.
                key = speaker.lower()
                if key not in fallback_slots:
                    fallback_slots[key] = "a" if len(fallback_slots) % 2 == 0 else "b"
                slot = fallback_slots[key]
            voice = voices[slot]

            print(f"🎙️ {speaker} ({voice}): {text}")

            # Generate speech
            wav, error = tts_engine.generate_speech(text, voice)
            if error:
                return None, None, error

            # Collect pieces and join once at the end instead of
            # re-concatenating the growing array on every line.
            if segments:
                segments.append(pause)
            segments.append(wav)

        if not segments:
            return None, None, "❌ Script contains no spoken text"
        combined_audio = segments[0] if len(segments) == 1 else np.concatenate(segments)

        # Reserve a temp path, then write after the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            audio_file = f.name
        sf.write(audio_file, combined_audio, current_sample_rate)

        # Save script next to the audio; swap only the trailing ".wav".
        script_file = audio_file[: -len(".wav")] + "_script.txt"
        with open(script_file, 'w', encoding="utf-8") as f:
            f.write(script_input)

        return audio_file, script_file, "✅ Podcast generated successfully!"

    except Exception as e:
        return None, None, f"❌ Error: {str(e)}"
252
-
253
- # ===== GRADIO UI =====
254
# Stylesheet handed to gr.Blocks; the class names are referenced by the
# header HTML below.
css = """
.container { max-width: 1400px; margin: 0 auto; }
.header { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%); color: white; padding: 30px; border-radius: 12px; margin-bottom: 25px; text-align: center; border: 3px solid #1E90FF; }
.section { border: 2px solid #32CD32; border-radius: 10px; padding: 20px; margin-bottom: 20px; background: white; }
.output-section { background: linear-gradient(135deg, #F0FFF0 0%, #F0F8FF 100%); border: 2px dashed #1E90FF; border-radius: 10px; padding: 20px; margin-top: 20px; }
.btn-primary { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%) !important; border: 2px solid #1E90FF !important; color: white !important; font-weight: bold !important; }
.btn-secondary { background: linear-gradient(135deg, #FFA500 0%, #FF6347 100%) !important; border: 2px solid #FF6347 !important; color: white !important; }
.tab { background: #f0f8ff; padding: 15px; border-radius: 8px; margin: 10px 0; }
"""

# NOTE(review): container nesting and whitespace inside the string literals
# were reconstructed from a whitespace-mangled copy — confirm against the
# original file before relying on exact layout.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.HTML("""
<div class="header">
<h1>🎙️ 2nd-Host AI - Complete Podcast Studio</h1>
<h3>Voice Cloning • 2-Speaker Podcasts • Script Generation • Export</h3>
</div>
""")

    # Initialize voice manager
    voice_manager = VoiceProfileManager()
    available_voices = voice_manager.list_profiles()

    with gr.Tab("🎭 Voice Cloning"):
        gr.Markdown("### Clone New Voices")
        with gr.Row():
            with gr.Column():
                voice_name = gr.Textbox(label="Voice Name", placeholder="e.g., 'David', 'Sarah', 'Expert'")
                upload_audio = gr.Audio(label="Reference Audio", type="filepath")
                reference_text = gr.Textbox(
                    label="Reference Text",
                    value="Hey there, this is my voice for cloning.",
                    placeholder="Text spoken in the reference audio"
                )
                clone_btn = gr.Button("🎯 Clone Voice", variant="primary")

            with gr.Column():
                clone_status = gr.Textbox(label="Cloning Status", interactive=False)
                available_voices_display = gr.Dropdown(
                    label="Available Voices",
                    choices=available_voices,
                    value=available_voices[0] if available_voices else None
                )
                refresh_btn = gr.Button("🔄 Refresh Voices")

    with gr.Tab("🎬 Podcast Studio"):
        gr.Markdown("### Create 2-Speaker Podcast")

        with gr.Row():
            with gr.Column():
                # Script input
                script_input = gr.Textbox(
                    label="Podcast Script",
                    lines=6,
                    placeholder="""Format: Speaker: Dialogue
Example:
Host: Welcome to our show!
Co-host: Thanks for having me!
Host: Let's discuss AI voice technology...
Co-host: It's revolutionizing content creation!""",
                    value=""
                )

                # Script generation
                prompt_input = gr.Textbox(
                    label="Or Generate from Prompt",
                    placeholder="e.g., 'The future of AI in education'"
                )
                script_style = gr.Radio(
                    choices=["conversational", "interview", "debate"],
                    label="Script Style",
                    value="conversational"
                )
                generate_script_btn = gr.Button("📝 Generate Script", variant="secondary")

            with gr.Column():
                # Speaker selection
                speaker_a = gr.Dropdown(
                    choices=available_voices,
                    label="🎤 Speaker A (Host)",
                    value=available_voices[0] if available_voices else None
                )
                speaker_b = gr.Dropdown(
                    choices=available_voices,
                    label="🎤 Speaker B (Co-host/Guest)",
                    value=available_voices[1] if len(available_voices) > 1 else available_voices[0] if available_voices else None
                )

        generate_btn = gr.Button("🚀 Generate Podcast", variant="primary", size="lg")

    with gr.Tab("📤 Output"):
        gr.Markdown("### Generated Podcast")
        with gr.Row():
            with gr.Column():
                audio_output = gr.Audio(label="🎧 Podcast Audio", type="filepath")
                script_output = gr.File(label="📄 Script File", file_types=[".txt"])

            with gr.Column():
                generation_status = gr.Textbox(label="Generation Status", lines=3)
                # NOTE(review): no click handler is attached to this button
                # anywhere in this block.
                download_btn = gr.Button("💾 Download All", variant="primary")

    # ===== EVENT HANDLERS =====
    def refresh_voices():
        """Re-read the stored profiles and rebuild all three voice dropdowns.

        Returns updates for, in order: the "Available Voices" list, speaker A
        and speaker B. The first was previously never refreshed and went
        stale as soon as a voice was cloned.
        """
        voices = VoiceProfileManager().list_profiles()
        first = voices[0] if voices else None
        second = voices[1] if len(voices) > 1 else first
        return (
            gr.Dropdown(choices=voices, value=first),
            gr.Dropdown(choices=voices, value=first),
            gr.Dropdown(choices=voices, value=second),
        )

    def handle_clone_voice(voice_name, audio_path, text):
        """Clone the voice, then return its status plus refreshed dropdowns."""
        result = clone_voice(voice_name, audio_path, text)
        return result, *refresh_voices()

    def handle_generate_script(prompt, style):
        """Return a template script for ``prompt``, or an error message.

        NOTE(review): the message is written into the script textbox because
        that is the only output bound to this handler.
        """
        if not prompt:
            return "❌ Please enter a prompt"
        return generate_script_from_prompt(prompt, style)

    def handle_generate_podcast(script, speaker_a, speaker_b, prompt, style):
        """Forward the UI values unchanged to ``generate_podcast``."""
        return generate_podcast(script, speaker_a, speaker_b, prompt, style)

    # Connect events
    clone_btn.click(
        handle_clone_voice,
        inputs=[voice_name, upload_audio, reference_text],
        outputs=[clone_status, available_voices_display, speaker_a, speaker_b]
    )

    refresh_btn.click(
        refresh_voices,
        outputs=[available_voices_display, speaker_a, speaker_b]
    )

    generate_script_btn.click(
        handle_generate_script,
        inputs=[prompt_input, script_style],
        outputs=[script_input]
    )

    generate_btn.click(
        handle_generate_podcast,
        inputs=[script_input, speaker_a, speaker_b, prompt_input, script_style],
        outputs=[audio_output, script_output, generation_status]
    )

if __name__ == "__main__":
    demo.launch(share=True)
 
7
  import shutil
8
  from pathlib import Path
9
  import numpy as np
10
+ import re
11
+ from typing import Generator
12
 
13
  # ===== NEUTTS IMPORTS =====
14
try:
    # Try multiple import approaches for NeuTTS
    try:
        # Approach 1: Direct import from the structure
        from neutts import NeuTTSAir
    except ImportError:
        try:
            # Approach 2: Import from the module directly
            import sys
            sys.path.append('/usr/local/lib/python3.10/site-packages')
            from neutts import NeuTTSAir
        except ImportError:
            # Approach 3: Use the components directly
            # torch must be bound in this scope: it appears in a signature
            # annotation below (evaluated while the class body runs) and in
            # _infer_gguf. Without it this fallback always died with
            # NameError and NEUTTS_AVAILABLE ended up False.
            import torch
            from phonemizer.backend import EspeakBackend
            import perth
            from neucodec import NeuCodec
            from llama_cpp import Llama

            # Define NeuTTSAir class manually
            class NeuTTSAir:
                """Local stand-in for ``neutts.NeuTTSAir`` built from its parts.

                Combines an espeak phonemizer, a GGUF backbone loaded through
                llama_cpp, a NeuCodec codec and a perth watermarker.
                """

                def __init__(self, backbone_repo="neuphonic/neutts-air-q4-gguf", backbone_device="cpu", codec_repo="neuphonic/neucodec", codec_device="cpu"):
                    """Load the phonemizer, backbone, codec and watermarker."""
                    self.sample_rate = 24_000
                    self.max_context = 2048
                    self.hop_length = 480

                    print("🧠 Loading phonemizer...")
                    self.phonemizer = EspeakBackend(language="en-us", preserve_punctuation=True, with_stress=True)
                    self._load_backbone(backbone_repo, backbone_device)
                    self._load_codec(codec_repo, codec_device)
                    self.watermarker = perth.PerthImplicitWatermarker()
                    print("✅ NeuTTS-Air initialized!")

                def _load_backbone(self, backbone_repo, backbone_device):
                    """Load the GGUF backbone from ``backbone_repo``.

                    ``backbone_device`` is accepted but not used here: the
                    model is loaded with ``n_gpu_layers=0``.
                    """
                    print(f"🔧 Loading Q4 GGUF backbone: {backbone_repo}")
                    self.backbone = Llama.from_pretrained(
                        repo_id=backbone_repo,
                        filename="*.gguf",
                        n_ctx=self.max_context,
                        n_gpu_layers=0,
                        verbose=False,
                        use_mlock=False,
                        n_threads=2,
                        low_vram=True
                    )

                def _load_codec(self, codec_repo, codec_device):
                    """Load the codec, switch it to eval mode and move it to ``codec_device``."""
                    print(f"🔧 Loading codec: {codec_repo}")
                    self.codec = NeuCodec.from_pretrained(codec_repo)
                    self.codec.eval().to(codec_device)

                def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
                    """Synthesize ``text`` in the reference voice and watermark it.

                    Raises:
                        ValueError: if the backbone output has no speech tokens.
                    """
                    output_str = self._infer_gguf(ref_codes, ref_text, text)
                    wav = self._decode(output_str)
                    watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=24000)
                    return watermarked_wav

                def encode_reference(self, ref_audio_path: str | Path):
                    """Encode a reference recording into codec codes.

                    The audio is loaded as 16 kHz mono before encoding; a
                    tensor result is converted to a NumPy array.
                    """
                    import librosa
                    wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
                    wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)
                    with torch.no_grad():
                        ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
                    return ref_codes.numpy() if isinstance(ref_codes, torch.Tensor) else ref_codes

                def _decode(self, codes: str):
                    """Decode the ``<|speech_N|>`` tokens found in ``codes`` to audio.

                    Raises:
                        ValueError: if ``codes`` contains no speech tokens.
                    """
                    speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
                    if len(speech_ids) > 0:
                        with torch.no_grad():
                            codes_tensor = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(self.codec.device)
                            recon = self.codec.decode_code(codes_tensor).cpu().numpy()
                        return recon[0, 0, :]
                    else:
                        raise ValueError("No speech tokens found")

                def _to_phones(self, text: str) -> str:
                    """Phonemize ``text`` and collapse whitespace to single spaces."""
                    phones = self.phonemizer.phonemize([text])
                    return " ".join(phones[0].split())

                def _infer_gguf(self, ref_codes: list, ref_text: str, input_text: str) -> str:
                    """Run the backbone on a reference-primed prompt; return its raw text output."""
                    ref_text_phones = self._to_phones(ref_text)
                    input_text_phones = self._to_phones(input_text)

                    if isinstance(ref_codes, (torch.Tensor, np.ndarray)):
                        ref_codes = ref_codes.tolist()

                    codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])

                    # The prompt ends with the reference codes, so generation
                    # continues from them.
                    prompt = f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text_phones} {input_text_phones}<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"

                    output = self.backbone(
                        prompt,
                        max_tokens=self.max_context,
                        temperature=1.0,
                        top_k=50,
                        stop=["<|SPEECH_GENERATION_END|>"],
                        echo=False
                    )

                    return output["choices"][0]["text"]

    NEUTTS_AVAILABLE = True
    print("✅ NeuTTS-Air loaded successfully!")

except Exception as e:
    # Any failure above (missing package, load error) disables NeuTTS
    # instead of stopping module import.
    NEUTTS_AVAILABLE = False
    print(f"❌ NeuTTS-Air import failed: {e}")
122
 
123
  # ===== CONFIGURATION =====
124
  CONFIG_FILE = "voice_profiles.json"
 
135
  if os.path.exists(self.config_file):
136
  with open(self.config_file, 'r') as f:
137
  return json.load(f)
138
+ return {
139
+ "dave": {
140
+ "audio_path": "samples/dave.wav",
141
+ "text": "Hey there, this is Dave speaking.",
142
+ "created_at": "default"
143
+ },
144
+ "andrea": {
145
+ "audio_path": "samples/andrea.wav",
146
+ "text": "Hello, my name is Andrea.",
147
+ "created_at": "default"
148
+ }
149
+ }
150
 
151
  def save_profiles(self):
152
  with open(self.config_file, 'w') as f:
 
173
  samples = {
174
  "dave": {
175
  "audio": "https://github.com/neophonic/neutts-air/raw/main/samples/dave.wav",
176
+ "text": "Hey there, this is Dave speaking."
177
  },
178
  "andrea": {
179
  "audio": "https://github.com/neophonic/neutts-air/raw/main/samples/andrea.wav",
180
+ "text": "Hello, my name is Andrea."
181
  }
182
  }
183
 
 
187
 
188
  if not os.path.exists(audio_path):
189
  try:
190
+ print(f"📥 Downloading {name} sample...")
191
+ response = requests.get(urls["audio"], timeout=60)
192
  with open(audio_path, 'wb') as f:
193
  f.write(response.content)
194
 
 
195
  with open(text_path, 'w') as f:
196
+ f.write(urls["text"])
197
 
198
+ print(f