"""2nd-Host AI: a Gradio front end for cloning voices and rendering
two-speaker podcasts with NeuTTS Air.

The module is a script: importing it creates the sample directory, builds the
global ``tts_engine`` (which fetches the default sample voices) and constructs
the Gradio ``demo``. Running it as ``__main__`` launches the UI.
"""

import json
import os
import re
import shutil
import tempfile
from pathlib import Path

import gradio as gr
import numpy as np
import requests
import soundfile as sf

# ===== NEUTTS IMPORTS =====
from neuttsair.neutts import NeuTTSAir

# ===== CONFIGURATION =====
CONFIG_FILE = "voice_profiles.json"
SAMPLE_DIR = "samples"
# Sample rate used when writing the final podcast file.
OUTPUT_SAMPLE_RATE = 24000
# Silence inserted between consecutive script lines, in seconds.
PAUSE_SECONDS = 0.5
# Upper bound for each sample download so start-up cannot hang forever.
DOWNLOAD_TIMEOUT_SECONDS = 30

os.makedirs(SAMPLE_DIR, exist_ok=True)


# ===== VOICE PROFILE MANAGEMENT =====
class VoiceProfileManager:
    """Persist named voice profiles (reference audio + transcript) as JSON."""

    def __init__(self, config_file=CONFIG_FILE):
        """Load the profiles stored in *config_file* (empty if it is absent)."""
        self.config_file = config_file
        self.profiles = self.load_profiles()

    def load_profiles(self):
        """Return the profile mapping from disk, or ``{}`` if no file exists."""
        if os.path.exists(self.config_file):
            with open(self.config_file, "r", encoding="utf-8") as f:
                return json.load(f)
        return {}

    def save_profiles(self):
        """Write the in-memory profiles back to the config file."""
        with open(self.config_file, "w", encoding="utf-8") as f:
            json.dump(self.profiles, f, indent=2)

    def add_profile(self, name, audio_path, text):
        """Create or overwrite profile *name* and persist it.

        Returns a human-readable status message for the UI.
        """
        self.profiles[name] = {
            "audio_path": audio_path,
            "text": text,
            "created_at": str(np.datetime64("now")),
        }
        self.save_profiles()
        return f"✅ Voice profile '{name}' saved!"

    def get_profile(self, name):
        """Return the profile dict for *name*, or ``None`` if unknown."""
        return self.profiles.get(name)

    def list_profiles(self):
        """Return the names of all stored profiles."""
        return list(self.profiles.keys())


# ===== SAMPLE MANAGEMENT =====
def _is_readable_audio(path):
    """Return True if *path* can be opened as an audio file by soundfile."""
    try:
        sf.info(path)
    except RuntimeError:
        # soundfile reports unreadable/unknown formats with RuntimeError
        # (or a subclass of it).
        return False
    return True


def _fetch_to_file(url, dest_path, *, as_text=False):
    """Download *url* to *dest_path*, raising on any HTTP or network error."""
    response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
    # Without this an HTTP error page would be saved as if it were the sample.
    response.raise_for_status()
    if as_text:
        with open(dest_path, "w", encoding="utf-8") as f:
            f.write(response.text)
    else:
        with open(dest_path, "wb") as f:
            f.write(response.content)


def download_default_samples():
    """Download default sample voices.

    Missing or unreadable sample files are (re-)fetched; failures are logged
    and skipped so the application can still start offline.

    Returns:
        dict mapping sample name to ``{"audio_path": ..., "text_path": ...}``
        for every sample whose audio is readable and whose transcript exists.
    """
    # NOTE: the organisation is spelled "neuphonic", matching the model repos
    # used by TTSEngine below.
    samples = {
        "dave": {
            "audio": "https://github.com/neuphonic/neutts-air/raw/main/samples/dave.wav",
            "text": "https://raw.githubusercontent.com/neuphonic/neutts-air/main/samples/dave.txt",
        },
        "andrea": {
            "audio": "https://github.com/neuphonic/neutts-air/raw/main/samples/andrea.wav",
            "text": "https://raw.githubusercontent.com/neuphonic/neutts-air/main/samples/andrea.txt",
        },
    }

    available = {}
    for name, urls in samples.items():
        audio_path = f"{SAMPLE_DIR}/{name}.wav"
        text_path = f"{SAMPLE_DIR}/{name}.txt"

        needs_audio = not (os.path.exists(audio_path) and _is_readable_audio(audio_path))
        # If the audio is bad, the transcript next to it came from the same
        # failed run and cannot be trusted either: replace the pair together.
        needs_text = needs_audio or not os.path.exists(text_path)

        try:
            if needs_audio:
                _fetch_to_file(urls["audio"], audio_path)
            if needs_text:
                _fetch_to_file(urls["text"], text_path, as_text=True)
        except (requests.RequestException, OSError) as e:
            print(f"❌ Failed to download {name}: {e}")
            continue

        if needs_audio or needs_text:
            print(f"✅ Downloaded {name} sample")
        if _is_readable_audio(audio_path):
            available[name] = {"audio_path": audio_path, "text_path": text_path}
    return available


# ===== TTS ENGINE =====
class TTSEngine:
    """Lazily constructed NeuTTS Air model plus the voice profiles it uses."""

    def __init__(self):
        """Load profiles and make the default sample voices selectable."""
        self.tts = None
        self.voice_manager = VoiceProfileManager()
        # Register downloaded samples so they appear as voices; never
        # overwrite a profile the user already saved under the same name.
        for name, paths in download_default_samples().items():
            if self.voice_manager.get_profile(name) is None:
                transcript = Path(paths["text_path"]).read_text(
                    encoding="utf-8", errors="replace"
                ).strip()
                self.voice_manager.add_profile(name, paths["audio_path"], transcript)

    def initialize_tts(self):
        """Create the NeuTTS Air model on first use and return it."""
        if self.tts is None:
            print("🚀 Initializing NeuTTS Q4 GGUF...")
            self.tts = NeuTTSAir(
                backbone_repo="neuphonic/neutts-air-q4-gguf",
                backbone_device="cpu",
                codec_repo="neuphonic/neucodec",
                codec_device="cpu",
            )
        return self.tts

    def generate_speech(self, text, voice_name):
        """Synthesize *text* in the voice of profile *voice_name*.

        Returns:
            ``(wav, None)`` on success or ``(None, error_message)`` on failure.
            Exceptions are converted to messages because this is the boundary
            between the model and the UI.
        """
        if not text or not text.strip():
            return None, "❌ Generation error: empty text"
        try:
            tts = self.initialize_tts()
            profile = self.voice_manager.get_profile(voice_name)
            if not profile:
                return None, f"❌ Voice profile '{voice_name}' not found"

            ref_codes = tts.encode_reference(profile["audio_path"])
            ref_text = profile["text"]
            wav = tts.infer(text, ref_codes, ref_text)
            return wav, None
        except Exception as e:
            return None, f"❌ Generation error: {str(e)}"


# ===== SCRIPT PARSING =====
def parse_conversation_script(script_text):
    """Parse script with speaker labels.

    Each non-empty line is read as ``Speaker: dialogue``. Lines without a
    label (no colon, or nothing before the colon) are attributed to
    "Speaker A". Lines that carry a label but no dialogue are dropped, since
    there is nothing to speak.

    Returns:
        list of ``{"speaker": str, "text": str}`` dicts in script order.
    """
    lines = []
    for raw_line in script_text.strip().splitlines():
        line = raw_line.strip()
        if not line:
            continue

        label, colon, dialogue = line.partition(":")
        if not colon:
            # Default to Speaker A if no label
            lines.append({"speaker": "Speaker A", "text": line})
            continue

        label, dialogue = label.strip(), dialogue.strip()
        if not dialogue:
            continue
        lines.append({"speaker": label or "Speaker A", "text": dialogue})
    return lines


def generate_script_from_prompt(prompt, style="conversational"):
    """Generate a podcast script from a prompt.

    Simple template-based generation: the prompt is inserted into a fixed
    six-line dialogue. Unknown *style* values fall back to "conversational".
    """
    templates = {
        "conversational": [
            "Host: Welcome to our podcast! Today we're discussing {prompt}",
            "Co-host: That's right! It's a fascinating topic that affects many people.",
            "Host: Let's start with the basics. What should our audience know about this?",
            "Co-host: Well, first of all, it's important to understand the key concepts.",
            "Host: And what about the practical applications? How can people use this in their daily lives?",
            "Co-host: Great question! There are several ways to apply this knowledge effectively.",
        ],
        "interview": [
            "Interviewer: Thanks for joining us today to talk about {prompt}",
            "Guest: Happy to be here! It's a topic I'm very passionate about.",
            "Interviewer: Could you share some background on how you got involved in this field?",
            "Guest: Absolutely. It all started several years ago when I first discovered this area.",
            "Interviewer: What are the most exciting developments you're seeing right now?",
            "Guest: There are some incredible advancements happening that will change everything.",
        ],
        "debate": [
            "Moderator: Welcome to our debate on {prompt}",
            "Proponent: I believe this is one of the most important issues of our time.",
            "Opponent: While I respect that view, I have some serious concerns about the approach.",
            "Proponent: Let me address those concerns with some concrete evidence.",
            "Opponent: The evidence is compelling, but we must consider the broader implications.",
            "Moderator: Let's hear from both sides about potential solutions.",
        ],
    }

    template = templates.get(style, templates["conversational"])
    return "\n".join(line.format(prompt=prompt) for line in template)


# ===== MAIN GENERATION FUNCTIONS =====
tts_engine = TTSEngine()


def clone_voice(voice_name, upload_audio, reference_text):
    """Clone a voice from uploaded audio.

    Copies the uploaded file into ``SAMPLE_DIR`` and stores a profile for it.

    Returns:
        A status message for the UI (success or a description of the error).
    """
    voice_name = (voice_name or "").strip()
    if not voice_name or not upload_audio:
        return "❌ Please provide a voice name and audio file"

    # The name comes from a (possibly public) web form: keep path separators
    # and ".." out of the file name so the copy cannot escape SAMPLE_DIR.
    file_stem = re.sub(r"[^\w\-. ]", "_", voice_name).strip(". ")
    if not file_stem:
        return "❌ Please provide a voice name and audio file"

    try:
        # Save uploaded audio
        audio_ext = Path(upload_audio).suffix
        audio_path = f"{SAMPLE_DIR}/{file_stem}{audio_ext}"
        shutil.copy2(upload_audio, audio_path)

        # Save voice profile
        return tts_engine.voice_manager.add_profile(voice_name, audio_path, reference_text)
    except Exception as e:
        return f"❌ Error cloning voice: {str(e)}"


def _select_voice(speaker, speaker_a, speaker_b, fallback_voices):
    """Pick the voice for *speaker*.

    Co-host/guest/"B" labels map to *speaker_b*; host/interviewer/"A" labels
    map to *speaker_a*. Any other label is assigned a voice on first sight
    (alternating A, B, A, ...) and keeps it for the rest of the script;
    *fallback_voices* is the dict that remembers those assignments.
    """
    label = speaker.lower()
    # "a"/"b" only count as stand-alone words ("Speaker A"), not as letters
    # inside other words.
    words = set(re.findall(r"[a-z0-9]+", label))

    # Check co-host before host: "co-host" contains "host".
    if re.search(r"co[\s-]?host", label) or "guest" in label or "b" in words:
        return speaker_b
    if "host" in label or "interviewer" in label or "a" in words:
        return speaker_a

    if label not in fallback_voices:
        fallback_voices[label] = speaker_a if len(fallback_voices) % 2 == 0 else speaker_b
    return fallback_voices[label]


def generate_podcast(script_input, speaker_a, speaker_b, prompt_input, script_style):
    """Generate a complete podcast with two speakers.

    If no script is given but a prompt is, a script is generated from the
    prompt first. Every script line is synthesized with the voice chosen for
    its speaker and the clips are joined with a short pause in between.

    Returns:
        ``(audio_path, script_path, status)``; on failure both paths are
        ``None`` and *status* describes the problem.
    """
    try:
        # Generate script if prompt is provided
        has_script = bool(script_input and script_input.strip())
        if prompt_input and not has_script:
            script_input = generate_script_from_prompt(prompt_input, script_style)

        if not script_input or not script_input.strip():
            return None, None, "❌ Please provide either a script or a prompt"

        # Parse conversation
        conversation = parse_conversation_script(script_input)
        if not conversation:
            return None, None, "❌ Could not parse script"

        # Generate audio for each line
        segments = []
        fallback_voices = {}
        for line in conversation:
            speaker = line["speaker"]
            text = line["text"]
            voice = _select_voice(speaker, speaker_a, speaker_b, fallback_voices)

            print(f"🎙️ {speaker} ({voice}): {text}")

            # Generate speech
            wav, error = tts_engine.generate_speech(text, voice)
            if error:
                return None, None, error

            # assumes the engine returns a 1-D sample array — TODO confirm
            wav = np.asarray(wav)
            if segments:
                # Add a small pause between speakers
                pause = np.zeros(int(PAUSE_SECONDS * OUTPUT_SAMPLE_RATE), dtype=wav.dtype)
                segments.append(pause)
            segments.append(wav)

        combined_audio = np.concatenate(segments)

        # Save final audio. Reserve the name, close the handle, then write:
        # writing to a still-open temp file fails on some platforms.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            audio_file = f.name
        sf.write(audio_file, combined_audio, OUTPUT_SAMPLE_RATE)

        # Save script
        audio_path = Path(audio_file)
        script_file = str(audio_path.with_name(audio_path.stem + "_script.txt"))
        with open(script_file, "w", encoding="utf-8") as f:
            f.write(script_input)

        return audio_file, script_file, "✅ Podcast generated successfully!"

    except Exception as e:
        return None, None, f"❌ Error: {str(e)}"


def _default_voice_pair(voices):
    """Return the default (speaker A, speaker B) selection for *voices*."""
    first = voices[0] if voices else None
    second = voices[1] if len(voices) > 1 else first
    return first, second


# ===== GRADIO UI =====
css = """
.container {
    max-width: 1400px;
    margin: 0 auto;
}
.header {
    background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%);
    color: white;
    padding: 30px;
    border-radius: 12px;
    margin-bottom: 25px;
    text-align: center;
    border: 3px solid #1E90FF;
}
.section {
    border: 2px solid #32CD32;
    border-radius: 10px;
    padding: 20px;
    margin-bottom: 20px;
    background: white;
}
.output-section {
    background: linear-gradient(135deg, #F0FFF0 0%, #F0F8FF 100%);
    border: 2px dashed #1E90FF;
    border-radius: 10px;
    padding: 20px;
    margin-top: 20px;
}
.btn-primary {
    background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%) !important;
    border: 2px solid #1E90FF !important;
    color: white !important;
    font-weight: bold !important;
}
.btn-secondary {
    background: linear-gradient(135deg, #FFA500 0%, #FF6347 100%) !important;
    border: 2px solid #FF6347 !important;
    color: white !important;
}
.tab {
    background: #f0f8ff;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
}
"""

with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.HTML("""
    <div class="header">
        <h1>🎙️ 2nd-Host AI - Complete Podcast Studio</h1>
        <p>Voice Cloning • 2-Speaker Podcasts • Script Generation • Export</p>
    </div>
    """)

    # Initialize voice manager
    voice_manager = VoiceProfileManager()
    available_voices = voice_manager.list_profiles()
    default_voice_a, default_voice_b = _default_voice_pair(available_voices)

    with gr.Tab("🎭 Voice Cloning"):
        gr.Markdown("### Clone New Voices")

        with gr.Row():
            with gr.Column():
                voice_name = gr.Textbox(
                    label="Voice Name",
                    placeholder="e.g., 'David', 'Sarah', 'Expert'",
                )
                upload_audio = gr.Audio(label="Reference Audio", type="filepath")
                reference_text = gr.Textbox(
                    label="Reference Text",
                    value="Hey there, this is my voice for cloning.",
                    placeholder="Text spoken in the reference audio",
                )
                clone_btn = gr.Button("🎯 Clone Voice", variant="primary")

            with gr.Column():
                clone_status = gr.Textbox(label="Cloning Status", interactive=False)
                available_voices_display = gr.Dropdown(
                    label="Available Voices",
                    choices=available_voices,
                    value=default_voice_a,
                )
                refresh_btn = gr.Button("🔄 Refresh Voices")

    with gr.Tab("🎬 Podcast Studio"):
        gr.Markdown("### Create 2-Speaker Podcast")

        with gr.Row():
            with gr.Column():
                # Script input
                script_input = gr.Textbox(
                    label="Podcast Script",
                    lines=6,
                    placeholder="""Format: Speaker: Dialogue

Example:
Host: Welcome to our show!
Co-host: Thanks for having me!
Host: Let's discuss AI voice technology...
Co-host: It's revolutionizing content creation!""",
                    value="",
                )

                # Script generation
                prompt_input = gr.Textbox(
                    label="Or Generate from Prompt",
                    placeholder="e.g., 'The future of AI in education'",
                )
                script_style = gr.Radio(
                    choices=["conversational", "interview", "debate"],
                    label="Script Style",
                    value="conversational",
                )
                generate_script_btn = gr.Button("📝 Generate Script", variant="secondary")

            with gr.Column():
                # Speaker selection
                speaker_a = gr.Dropdown(
                    choices=available_voices,
                    label="🎤 Speaker A (Host)",
                    value=default_voice_a,
                )
                speaker_b = gr.Dropdown(
                    choices=available_voices,
                    label="🎤 Speaker B (Co-host/Guest)",
                    value=default_voice_b,
                )
                generate_btn = gr.Button("🚀 Generate Podcast", variant="primary", size="lg")

    with gr.Tab("📤 Output"):
        gr.Markdown("### Generated Podcast")

        with gr.Row():
            with gr.Column():
                audio_output = gr.Audio(label="🎧 Podcast Audio", type="filepath")
                script_output = gr.File(label="📄 Script File", file_types=[".txt"])

            with gr.Column():
                generation_status = gr.Textbox(label="Generation Status", lines=3)
                # NOTE(review): this button has no click handler in this file.
                download_btn = gr.Button("💾 Download All", variant="primary")

    # ===== EVENT HANDLERS =====
    def refresh_voices():
        """Re-read profiles from disk and rebuild all three voice dropdowns.

        Returns updates for, in order: the "Available Voices" list,
        Speaker A and Speaker B.
        """
        voices = VoiceProfileManager().list_profiles()
        first, second = _default_voice_pair(voices)
        return (
            gr.Dropdown(choices=voices, value=first),
            gr.Dropdown(choices=voices, value=first),
            gr.Dropdown(choices=voices, value=second),
        )

    def handle_clone_voice(voice_name, audio_path, text):
        """Clone the voice, then refresh every dropdown that lists voices."""
        result = clone_voice(voice_name, audio_path, text)
        return (result, *refresh_voices())

    def handle_generate_script(prompt, style):
        """Fill the script box from the prompt.

        Raises:
            gr.Error: if the prompt is empty. Raising (instead of returning
                the message) keeps the error text out of the script box,
                where it would otherwise be treated as dialogue.
        """
        if not prompt:
            raise gr.Error("Please enter a prompt")
        return generate_script_from_prompt(prompt, style)

    def handle_generate_podcast(script, speaker_a, speaker_b, prompt, style):
        """Forward the form values to :func:`generate_podcast`."""
        return generate_podcast(script, speaker_a, speaker_b, prompt, style)

    # Connect events
    clone_btn.click(
        handle_clone_voice,
        inputs=[voice_name, upload_audio, reference_text],
        outputs=[clone_status, available_voices_display, speaker_a, speaker_b],
    )

    refresh_btn.click(
        refresh_voices,
        outputs=[available_voices_display, speaker_a, speaker_b],
    )

    generate_script_btn.click(
        handle_generate_script,
        inputs=[prompt_input, script_style],
        outputs=[script_input],
    )

    generate_btn.click(
        handle_generate_podcast,
        inputs=[script_input, speaker_a, speaker_b, prompt_input, script_style],
        outputs=[audio_output, script_output, generation_status],
    )

if __name__ == "__main__":
    demo.launch(share=True)