Spaces:

Actual-Innocence
/

2nd-Host-Ai

Runtime error

App Files

xet

Community

Actual-Innocence committed 4 days ago

Commit

ab5e85c

verified ·

1 Parent(s): dff00d4

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -328

app.py CHANGED Viewed

@@ -7,9 +7,118 @@ import json
 import shutil
 from pathlib import Path
 import numpy as np
 # ===== NEUTTS IMPORTS =====
-from neuttsair.neutts import NeuTTSAir
 # ===== CONFIGURATION =====
 CONFIG_FILE = "voice_profiles.json"
@@ -26,7 +135,18 @@ class VoiceProfileManager:
         if os.path.exists(self.config_file):
             with open(self.config_file, 'r') as f:
                 return json.load(f)
-        return {}
     def save_profiles(self):
         with open(self.config_file, 'w') as f:
@@ -53,11 +173,11 @@ def download_default_samples():
     samples = {
         "dave": {
             "audio": "https://github.com/neophonic/neutts-air/raw/main/samples/dave.wav",
-            "text": "https://raw.githubusercontent.com/neophonic/neutts-air/main/samples/dave.txt"
         },
         "andrea": {
             "audio": "https://github.com/neophonic/neutts-air/raw/main/samples/andrea.wav",
-            "text": "https://raw.githubusercontent.com/neophonic/neutts-air/main/samples/andrea.txt"
         }
     }
@@ -67,332 +187,12 @@ def download_default_samples():
         if not os.path.exists(audio_path):
             try:
-                response = requests.get(urls["audio"])
                 with open(audio_path, 'wb') as f:
                     f.write(response.content)
-                response = requests.get(urls["text"])
                 with open(text_path, 'w') as f:
-                    f.write(response.text)
-                print(f"✅ Downloaded {name} sample")
-            except Exception as e:
-                print(f"❌ Failed to download {name}: {e}")
-# ===== TTS ENGINE =====
class TTSEngine:
    """Owns the (lazily created) NeuTTSAir model and the voice-profile store."""

    def __init__(self):
        # The model itself is created on first use by initialize_tts().
        self.tts = None
        self.voice_manager = VoiceProfileManager()
        download_default_samples()

    def initialize_tts(self):
        """Return the NeuTTSAir instance, constructing it on the first call."""
        if self.tts is not None:
            return self.tts
        print("🚀 Initializing NeuTTS Q4 GGUF...")
        model_options = {
            "backbone_repo": "neuphonic/neutts-air-q4-gguf",
            "backbone_device": "cpu",
            "codec_repo": "neuphonic/neucodec",
            "codec_device": "cpu",
        }
        self.tts = NeuTTSAir(**model_options)
        return self.tts

    def generate_speech(self, text, voice_name):
        """Synthesise `text` in the voice stored under `voice_name`.

        Returns a `(wav, error)` pair: `(wav, None)` on success, or
        `(None, message)` when the profile is unknown or anything raises.
        """
        try:
            engine = self.initialize_tts()
            profile = self.voice_manager.get_profile(voice_name)
            if profile:
                reference_codes = engine.encode_reference(profile["audio_path"])
                return engine.infer(text, reference_codes, profile["text"]), None
            return None, f"❌ Voice profile '{voice_name}' not found"
        except Exception as exc:
            # Errors are reported to the caller as text rather than raised.
            return None, f"❌ Generation error: {str(exc)}"
-# ===== SCRIPT PARSING =====
def parse_conversation_script(script_text):
    """Parse a script into a list of ``{"speaker", "text"}`` entries.

    Each non-blank line is split on its first colon into a speaker label and
    the dialogue. Lines without a colon, or with an empty label, are
    attributed to "Speaker A". Lines that carry no dialogue (for example a
    bare ``"Host:"``) are dropped so no empty text reaches the synthesiser.

    Args:
        script_text: The full script, one utterance per line.

    Returns:
        A list of dicts in script order, each with "speaker" and "text" keys.
    """
    entries = []
    for raw_line in script_text.strip().split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue
        label, colon, dialogue = stripped.partition(':')
        if colon:
            speaker, text = label.strip(), dialogue.strip()
        else:
            speaker, text = "", stripped
        if not text:
            # A label with nothing after it has nothing to speak.
            continue
        # Default to Speaker A if no usable label was given.
        entries.append({"speaker": speaker or "Speaker A", "text": text})
    return entries
def generate_script_from_prompt(prompt, style="conversational"):
    """Build a six-line podcast script about `prompt` from a fixed template.

    `style` selects the "conversational", "interview" or "debate" template;
    any other value falls back to "conversational". The result is the
    template's lines joined with newlines, with `{prompt}` filled in.
    """
    # Simple template-based generation
    scripts_by_style = {
        "conversational": (
            "Host: Welcome to our podcast! Today we're discussing {prompt}",
            "Co-host: That's right! It's a fascinating topic that affects many people.",
            "Host: Let's start with the basics. What should our audience know about this?",
            "Co-host: Well, first of all, it's important to understand the key concepts.",
            "Host: And what about the practical applications? How can people use this in their daily lives?",
            "Co-host: Great question! There are several ways to apply this knowledge effectively.",
        ),
        "interview": (
            "Interviewer: Thanks for joining us today to talk about {prompt}",
            "Guest: Happy to be here! It's a topic I'm very passionate about.",
            "Interviewer: Could you share some background on how you got involved in this field?",
            "Guest: Absolutely. It all started several years ago when I first discovered this area.",
            "Interviewer: What are the most exciting developments you're seeing right now?",
            "Guest: There are some incredible advancements happening that will change everything.",
        ),
        "debate": (
            "Moderator: Welcome to our debate on {prompt}",
            "Proponent: I believe this is one of the most important issues of our time.",
            "Opponent: While I respect that view, I have some serious concerns about the approach.",
            "Proponent: Let me address those concerns with some concrete evidence.",
            "Opponent: The evidence is compelling, but we must consider the broader implications.",
            "Moderator: Let's hear from both sides about potential solutions.",
        ),
    }
    if style in scripts_by_style:
        chosen = scripts_by_style[style]
    else:
        chosen = scripts_by_style["conversational"]
    rendered = []
    for template_line in chosen:
        rendered.append(template_line.format(prompt=prompt))
    return "\n".join(rendered)
-# ===== MAIN GENERATION FUNCTIONS =====
-tts_engine = TTSEngine()
def clone_voice(voice_name, upload_audio, reference_text):
    """Register a cloned voice from an uploaded reference recording.

    The uploaded file is copied into SAMPLE_DIR under the voice's name
    (keeping the upload's extension) and a profile is added through
    ``tts_engine.voice_manager.add_profile``.

    Args:
        voice_name: Name for the new voice, as typed by the user.
        upload_audio: Filesystem path of the uploaded reference audio.
        reference_text: Transcript of the reference audio.

    Returns:
        The status value returned by ``add_profile``, or an error message
        string starting with "❌" when the input is unusable or copying fails.
    """
    if not voice_name or not voice_name.strip() or not upload_audio:
        return "❌ Please provide a voice name and audio file"
    # The name arrives straight from a UI textbox. Keep only its final path
    # component so a value such as "../app" cannot write outside SAMPLE_DIR.
    file_stem = Path(voice_name.strip().replace("\\", "/")).name
    if file_stem in ("", ".", ".."):
        return "❌ Please provide a valid voice name"
    try:
        # Save uploaded audio
        audio_ext = Path(upload_audio).suffix
        audio_path = f"{SAMPLE_DIR}/{file_stem}{audio_ext}"
        Path(SAMPLE_DIR).mkdir(parents=True, exist_ok=True)
        shutil.copy2(upload_audio, audio_path)
        # Save voice profile (registered under the name exactly as given)
        return tts_engine.voice_manager.add_profile(voice_name, audio_path, reference_text)
    except Exception as e:
        return f"❌ Error cloning voice: {str(e)}"
def _pick_voice(speaker, index, speaker_a, speaker_b):
    """Map a script speaker label to one of the two configured voices."""
    label = speaker.lower()
    words = label.replace("-", " ").split()
    # Voice-B keywords are tested first because "co-host" contains "host".
    if any(key in label for key in ("co-host", "cohost", "guest")):
        return speaker_b
    if any(key in label for key in ("host", "interviewer")):
        return speaker_a
    # A bare letter only counts as a whole word ("Speaker A"), not as a
    # character that happens to occur inside some other label.
    if "a" in words:
        return speaker_a
    if "b" in words:
        return speaker_b
    # Default assignment: alternate voices line by line.
    return speaker_a if index % 2 == 0 else speaker_b


def generate_podcast(script_input, speaker_a, speaker_b, prompt_input, script_style):
    """Generate a complete podcast with two speakers.

    When no script is supplied but a prompt is, a script is first generated
    with ``generate_script_from_prompt``. Every parsed line is synthesised
    with the voice chosen for its speaker, and the clips are joined with a
    0.5 second pause between them.

    Args:
        script_input: "Speaker: dialogue" script text; may be empty.
        speaker_a: Voice profile name used for host-type speakers.
        speaker_b: Voice profile name used for co-host/guest-type speakers.
        prompt_input: Topic used to generate a script when none is given.
        script_style: Template style passed to the script generator.

    Returns:
        Always a 3-tuple ``(audio_file, script_file, status)``. On failure
        both paths are ``None`` and ``status`` carries the error message.
    """
    try:
        has_script = bool(script_input and script_input.strip())
        # Generate script if prompt is provided
        if prompt_input and not has_script:
            script_input = generate_script_from_prompt(prompt_input, script_style)
            has_script = bool(script_input and script_input.strip())
        if not has_script:
            return None, None, "❌ Please provide either a script or a prompt"

        # Parse conversation
        conversation = parse_conversation_script(script_input)
        if not conversation:
            return None, None, "❌ Could not parse script"

        # Generate audio for each line
        current_sample_rate = 24000
        pause = np.zeros(int(0.5 * current_sample_rate))  # 0.5 second pause
        segments = []
        for i, line in enumerate(conversation):
            speaker = line["speaker"]
            text = line["text"]
            voice = _pick_voice(speaker, i, speaker_a, speaker_b)
            print(f"🎙️ {speaker} ({voice}): {text}")
            wav, error = tts_engine.generate_speech(text, voice)
            if error:
                return None, None, error
            if segments:
                # Add a small pause between speakers
                segments.append(pause)
            segments.append(wav)
        # Join once at the end instead of re-copying the audio on every line.
        combined_audio = segments[0] if len(segments) == 1 else np.concatenate(segments)

        # Reserve a temp path, then write after the handle is closed so the
        # file is not open twice.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            audio_file = f.name
        sf.write(audio_file, combined_audio, current_sample_rate)

        # Save script next to the audio (only the trailing ".wav" is swapped)
        script_file = audio_file[:-len(".wav")] + "_script.txt"
        with open(script_file, 'w', encoding="utf-8") as f:
            f.write(script_input)
        return audio_file, script_file, "✅ Podcast generated successfully!"
    except Exception as e:
        return None, None, f"❌ Error: {str(e)}"
-# ===== GRADIO UI =====
# Stylesheet passed to gr.Blocks below.
css = """
.container { max-width: 1400px; margin: 0 auto; }
.header { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%); color: white; padding: 30px; border-radius: 12px; margin-bottom: 25px; text-align: center; border: 3px solid #1E90FF; }
.section { border: 2px solid #32CD32; border-radius: 10px; padding: 20px; margin-bottom: 20px; background: white; }
.output-section { background: linear-gradient(135deg, #F0FFF0 0%, #F0F8FF 100%); border: 2px dashed #1E90FF; border-radius: 10px; padding: 20px; margin-top: 20px; }
.btn-primary { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%) !important; border: 2px solid #1E90FF !important; color: white !important; font-weight: bold !important; }
.btn-secondary { background: linear-gradient(135deg, #FFA500 0%, #FF6347 100%) !important; border: 2px solid #FF6347 !important; color: white !important; }
.tab { background: #f0f8ff; padding: 15px; border-radius: 8px; margin: 10px 0; }
"""


def _default_voices(voices):
    """Return the (first, second) default selections for a list of voice names."""
    first = voices[0] if voices else None
    second = voices[1] if len(voices) > 1 else first
    return first, second


with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.HTML("""
    <div class="header">
        <h1>🎙️ 2nd-Host AI - Complete Podcast Studio</h1>
        <h3>Voice Cloning • 2-Speaker Podcasts • Script Generation • Export</h3>
    </div>
    """)
    # Initialize voice manager
    voice_manager = VoiceProfileManager()
    available_voices = voice_manager.list_profiles()
    default_a, default_b = _default_voices(available_voices)
    with gr.Tab("🎭 Voice Cloning"):
        gr.Markdown("### Clone New Voices")
        with gr.Row():
            with gr.Column():
                voice_name = gr.Textbox(label="Voice Name", placeholder="e.g., 'David', 'Sarah', 'Expert'")
                upload_audio = gr.Audio(label="Reference Audio", type="filepath")
                reference_text = gr.Textbox(
                    label="Reference Text",
                    value="Hey there, this is my voice for cloning.",
                    placeholder="Text spoken in the reference audio"
                )
                clone_btn = gr.Button("🎯 Clone Voice", variant="primary")
            with gr.Column():
                clone_status = gr.Textbox(label="Cloning Status", interactive=False)
                available_voices_display = gr.Dropdown(
                    label="Available Voices",
                    choices=available_voices,
                    value=default_a
                )
                refresh_btn = gr.Button("🔄 Refresh Voices")
    with gr.Tab("🎬 Podcast Studio"):
        gr.Markdown("### Create 2-Speaker Podcast")
        with gr.Row():
            with gr.Column():
                # Script input
                script_input = gr.Textbox(
                    label="Podcast Script",
                    lines=6,
                    placeholder="""Format: Speaker: Dialogue
Example:
Host: Welcome to our show!
Co-host: Thanks for having me!
Host: Let's discuss AI voice technology...
Co-host: It's revolutionizing content creation!""",
                    value=""
                )
                # Script generation
                prompt_input = gr.Textbox(
                    label="Or Generate from Prompt",
                    placeholder="e.g., 'The future of AI in education'"
                )
                script_style = gr.Radio(
                    choices=["conversational", "interview", "debate"],
                    label="Script Style",
                    value="conversational"
                )
                generate_script_btn = gr.Button("📝 Generate Script", variant="secondary")
            with gr.Column():
                # Speaker selection
                speaker_a = gr.Dropdown(
                    choices=available_voices,
                    label="🎤 Speaker A (Host)",
                    value=default_a
                )
                speaker_b = gr.Dropdown(
                    choices=available_voices,
                    label="🎤 Speaker B (Co-host/Guest)",
                    value=default_b
                )
                generate_btn = gr.Button("🚀 Generate Podcast", variant="primary", size="lg")
    with gr.Tab("📤 Output"):
        gr.Markdown("### Generated Podcast")
        with gr.Row():
            with gr.Column():
                audio_output = gr.Audio(label="🎧 Podcast Audio", type="filepath")
                script_output = gr.File(label="📄 Script File", file_types=[".txt"])
            with gr.Column():
                generation_status = gr.Textbox(label="Generation Status", lines=3)
                # NOTE(review): no click handler is attached to this button
                # anywhere in the visible code.
                download_btn = gr.Button("💾 Download All", variant="primary")

    # ===== EVENT HANDLERS =====
    def refresh_voices():
        """Reload the profiles and return updates for all three voice dropdowns."""
        voices = VoiceProfileManager().list_profiles()
        first, second = _default_voices(voices)
        return (
            gr.Dropdown(choices=voices, value=first),   # Available Voices
            gr.Dropdown(choices=voices, value=first),   # Speaker A
            gr.Dropdown(choices=voices, value=second),  # Speaker B
        )

    def handle_clone_voice(voice_name, audio_path, text):
        """Clone the voice, then refresh every dropdown so the new voice shows up."""
        result = clone_voice(voice_name, audio_path, text)
        return result, *refresh_voices()

    def handle_generate_script(prompt, style):
        """Fill the script box from the prompt, or with an error if it is empty."""
        if not prompt:
            return "❌ Please enter a prompt"
        script = generate_script_from_prompt(prompt, style)
        return script

    def handle_generate_podcast(script, speaker_a, speaker_b, prompt, style):
        """Forward the UI values to generate_podcast."""
        return generate_podcast(script, speaker_a, speaker_b, prompt, style)

    # Connect events. The "Available Voices" dropdown is included in the
    # outputs so that cloning or refreshing also updates it.
    clone_btn.click(
        handle_clone_voice,
        inputs=[voice_name, upload_audio, reference_text],
        outputs=[clone_status, available_voices_display, speaker_a, speaker_b]
    )
    refresh_btn.click(
        refresh_voices,
        outputs=[available_voices_display, speaker_a, speaker_b]
    )
    generate_script_btn.click(
        handle_generate_script,
        inputs=[prompt_input, script_style],
        outputs=[script_input]
    )
    generate_btn.click(
        handle_generate_podcast,
        inputs=[script_input, speaker_a, speaker_b, prompt_input, script_style],
        outputs=[audio_output, script_output, generation_status]
    )

if __name__ == "__main__":
    demo.launch(share=True)

 import shutil
 from pathlib import Path
 import numpy as np
+import re
+from typing import Generator
 # ===== NEUTTS IMPORTS =====
# Resolve NeuTTSAir. Outcome: NEUTTS_AVAILABLE is True and the name NeuTTSAir
# is bound, or NEUTTS_AVAILABLE is False and the failure has been printed.
try:
    # Try multiple import approaches for NeuTTS
    try:
        # Approach 1: Direct import from the structure
        from neutts import NeuTTSAir
    except ImportError:
        try:
            # Approach 2: retry after adding a hard-coded site-packages path
            import sys
            sys.path.append('/usr/local/lib/python3.10/site-packages')
            from neutts import NeuTTSAir
        except ImportError:
            # Approach 3: Use the components directly
            from phonemizer.backend import EspeakBackend
            import perth
            from neucodec import NeuCodec
            from llama_cpp import Llama
            # torch is named in a method annotation (evaluated when the class
            # body runs) and in _infer_gguf, so it must be bound at module
            # scope here; importing it only inside individual methods left
            # both references unresolved.
            import torch

            # Define NeuTTSAir class manually
            class NeuTTSAir:
                """Fallback NeuTTS-Air pipeline: phonemizer + GGUF backbone + codec + watermark."""

                def __init__(self, backbone_repo="neuphonic/neutts-air-q4-gguf", backbone_device="cpu", codec_repo="neuphonic/neucodec", codec_device="cpu"):
                    self.sample_rate = 24_000
                    self.max_context = 2048
                    self.hop_length = 480
                    print("🧠 Loading phonemizer...")
                    self.phonemizer = EspeakBackend(language="en-us", preserve_punctuation=True, with_stress=True)
                    self._load_backbone(backbone_repo, backbone_device)
                    self._load_codec(codec_repo, codec_device)
                    self.watermarker = perth.PerthImplicitWatermarker()
                    print("✅ NeuTTS-Air initialized!")

                def _load_backbone(self, backbone_repo, backbone_device):
                    """Load the GGUF backbone; `backbone_device` is accepted but not used here."""
                    print(f"🔧 Loading Q4 GGUF backbone: {backbone_repo}")
                    self.backbone = Llama.from_pretrained(
                        repo_id=backbone_repo,
                        filename="*.gguf",
                        n_ctx=self.max_context,
                        n_gpu_layers=0,
                        verbose=False,
                        use_mlock=False,
                        n_threads=2,
                        low_vram=True
                    )

                def _load_codec(self, codec_repo, codec_device):
                    """Load the codec, switch it to eval mode and move it to `codec_device`."""
                    print(f"🔧 Loading codec: {codec_repo}")
                    self.codec = NeuCodec.from_pretrained(codec_repo)
                    self.codec.eval().to(codec_device)

                def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
                    """Generate speech for `text`, decode it and return the watermarked waveform."""
                    output_str = self._infer_gguf(ref_codes, ref_text, text)
                    wav = self._decode(output_str)
                    watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sample_rate)
                    return watermarked_wav

                def encode_reference(self, ref_audio_path: str | Path):
                    """Load the reference audio (16 kHz, mono) and return its codec codes."""
                    import librosa  # imported lazily; only needed for reference encoding
                    wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
                    wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)
                    with torch.no_grad():
                        ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
                    return ref_codes.numpy() if isinstance(ref_codes, torch.Tensor) else ref_codes

                def _decode(self, codes: str):
                    """Turn `<|speech_N|>` tokens in `codes` into a waveform.

                    Raises:
                        ValueError: if the text contains no speech tokens.
                    """
                    speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
                    if not speech_ids:
                        raise ValueError("No speech tokens found")
                    with torch.no_grad():
                        codes_tensor = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(self.codec.device)
                        recon = self.codec.decode_code(codes_tensor).cpu().numpy()
                    return recon[0, 0, :]

                def _to_phones(self, text: str) -> str:
                    """Phonemize `text` and normalise whitespace to single spaces."""
                    phones = self.phonemizer.phonemize([text])
                    return " ".join(phones[0].split())

                def _infer_gguf(self, ref_codes: list, ref_text: str, input_text: str) -> str:
                    """Build the prompt from the reference and run the backbone; return the raw text."""
                    ref_text_phones = self._to_phones(ref_text)
                    input_text_phones = self._to_phones(input_text)
                    if isinstance(ref_codes, (torch.Tensor, np.ndarray)):
                        ref_codes = ref_codes.tolist()
                    codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
                    prompt = f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text_phones} {input_text_phones}<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
                    output = self.backbone(
                        prompt,
                        max_tokens=self.max_context,
                        temperature=1.0,
                        top_k=50,
                        stop=["<|SPEECH_GENERATION_END|>"],
                        echo=False
                    )
                    return output["choices"][0]["text"]
    NEUTTS_AVAILABLE = True
    print("✅ NeuTTS-Air loaded successfully!")
except Exception as e:
    # Deliberately broad: any failure while resolving NeuTTS is reported and
    # recorded in the flag instead of aborting module import.
    NEUTTS_AVAILABLE = False
    print(f"❌ NeuTTS-Air import failed: {e}")
 # ===== CONFIGURATION =====
 CONFIG_FILE = "voice_profiles.json"
         if os.path.exists(self.config_file):
             with open(self.config_file, 'r') as f:
                 return json.load(f)
+        return {
+            "dave": {
+                "audio_path": "samples/dave.wav",
+                "text": "Hey there, this is Dave speaking.",
+                "created_at": "default"
+            },
+            "andrea": {
+                "audio_path": "samples/andrea.wav",
+                "text": "Hello, my name is Andrea.",
+                "created_at": "default"
+            }
+        }
     def save_profiles(self):
         with open(self.config_file, 'w') as f:
     samples = {
         "dave": {
             "audio": "https://github.com/neophonic/neutts-air/raw/main/samples/dave.wav",
+            "text": "Hey there, this is Dave speaking."
         },
         "andrea": {
             "audio": "https://github.com/neophonic/neutts-air/raw/main/samples/andrea.wav",
+            "text": "Hello, my name is Andrea."
         }
     }
         if not os.path.exists(audio_path):
             try:
+                print(f"📥 Downloading {name} sample...")
+                response = requests.get(urls["audio"], timeout=60)
                 with open(audio_path, 'wb') as f:
                     f.write(response.content)
                 with open(text_path, 'w') as f:
+                    f.write(urls["text"])
+                print(f