# NOTE: scraped page header, not part of the program.
# Hugging Face Spaces status at capture time: "Runtime error".
import json
import os
import shutil
import tempfile
from pathlib import Path

import gradio as gr
import numpy as np
import requests
import soundfile as sf

# ===== NEUTTS IMPORTS =====
from neuttsair.neutts import NeuTTSAir
# ===== CONFIGURATION =====
# JSON file in which the voice profiles are persisted.
CONFIG_FILE = "voice_profiles.json"
# Directory (relative to the working directory) holding reference samples.
SAMPLE_DIR = "samples"
# Make sure the sample directory exists; an existing directory is not an error.
Path(SAMPLE_DIR).mkdir(parents=True, exist_ok=True)
# ===== VOICE PROFILE MANAGEMENT =====
class VoiceProfileManager:
    """Store named voice profiles in a JSON file.

    ``self.profiles`` maps a profile name to a dict with the keys
    ``"audio_path"``, ``"text"`` and ``"created_at"``. The whole mapping is
    rewritten to ``config_file`` every time a profile is added.
    """

    def __init__(self, config_file=CONFIG_FILE):
        """Remember *config_file* and load whatever profiles it contains."""
        self.config_file = config_file
        self.profiles = self.load_profiles()

    def load_profiles(self):
        """Return the profiles stored on disk, or ``{}`` if none are usable.

        A missing file yields an empty mapping. A file that cannot be decoded
        or whose top-level value is not a JSON object is reported on stdout
        and also treated as empty, so a damaged file does not prevent the
        manager from being constructed.
        """
        try:
            with open(self.config_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            return {}
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            print(f"Warning: could not read {self.config_file}: {exc}")
            return {}
        if not isinstance(data, dict):
            print(f"Warning: {self.config_file} does not contain a JSON object")
            return {}
        return data

    def save_profiles(self):
        """Write ``self.profiles`` to ``config_file`` as indented JSON."""
        # Write to a sibling file first and swap it in, so an interrupted
        # write cannot leave a truncated profile file behind.
        tmp_path = f"{self.config_file}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(self.profiles, f, indent=2)
        os.replace(tmp_path, self.config_file)

    def add_profile(self, name, audio_path, text):
        """Create or overwrite profile *name*, persist it, and return a status string."""
        self.profiles[name] = {
            "audio_path": audio_path,
            "text": text,
            "created_at": str(np.datetime64('now')),
        }
        self.save_profiles()
        return f"β Voice profile '{name}' saved!"

    def get_profile(self, name):
        """Return the profile dict for *name*, or ``None`` if it is unknown."""
        return self.profiles.get(name)

    def list_profiles(self):
        """Return the profile names as a list."""
        return list(self.profiles)
# ===== SAMPLE MANAGEMENT =====
def download_default_samples():
    """Download the default sample voices into ``SAMPLE_DIR``.

    For every sample, the reference audio (``<name>.wav``) and its transcript
    (``<name>.txt``) are fetched independently, so a transcript that failed
    earlier is retried even when the audio is already present. A file is only
    written after the server answered with a success status; failures are
    reported on stdout and never raised.
    """
    # NOTE(review): the organisation was spelled "neophonic" here while the
    # model repositories used in this file are under "neuphonic"; the URLs now
    # use the latter. Confirm that both sample names exist upstream.
    samples = {
        "dave": {
            "audio": "https://github.com/neuphonic/neutts-air/raw/main/samples/dave.wav",
            "text": "https://raw.githubusercontent.com/neuphonic/neutts-air/main/samples/dave.txt",
        },
        "andrea": {
            "audio": "https://github.com/neuphonic/neutts-air/raw/main/samples/andrea.wav",
            "text": "https://raw.githubusercontent.com/neuphonic/neutts-air/main/samples/andrea.txt",
        },
    }
    for name, urls in samples.items():
        targets = (
            (urls["audio"], f"{SAMPLE_DIR}/{name}.wav"),
            (urls["text"], f"{SAMPLE_DIR}/{name}.txt"),
        )
        try:
            fetched_any = False
            for url, path in targets:
                if os.path.exists(path):
                    continue
                response = requests.get(url, timeout=30)
                # Without this, an HTTP error page would be saved as the sample.
                response.raise_for_status()
                # Raw bytes for both files: no re-encoding of the transcript.
                with open(path, 'wb') as f:
                    f.write(response.content)
                fetched_any = True
            if fetched_any:
                print(f"β Downloaded {name} sample")
        except (requests.RequestException, OSError) as e:
            print(f"β Failed to download {name}: {e}")
# ===== TTS ENGINE =====
class TTSEngine:
    """Wrap a lazily created ``NeuTTSAir`` model plus the voice profile store."""

    def __init__(self):
        """Set up the profile manager and fetch the default samples.

        The model itself is not created here; see ``initialize_tts``.
        """
        self.tts = None
        self.voice_manager = VoiceProfileManager()
        download_default_samples()

    def initialize_tts(self):
        """Return the model, constructing it on first use."""
        if self.tts is not None:
            return self.tts
        print("π Initializing NeuTTS Q4 GGUF...")
        self.tts = NeuTTSAir(
            backbone_repo="neuphonic/neutts-air-q4-gguf",
            backbone_device="cpu",
            codec_repo="neuphonic/neucodec",
            codec_device="cpu",
        )
        return self.tts

    def generate_speech(self, text, voice_name):
        """Synthesize *text* with the stored voice *voice_name*.

        Returns ``(wav, None)`` on success and ``(None, message)`` when the
        profile is unknown or anything raises along the way.
        """
        try:
            engine = self.initialize_tts()
            voice = self.voice_manager.get_profile(voice_name)
            if voice:
                codes = engine.encode_reference(voice["audio_path"])
                transcript = voice["text"]
                return engine.infer(text, codes, transcript), None
            return None, f"β Voice profile '{voice_name}' not found"
        except Exception as err:
            return None, f"β Generation error: {err!s}"
# ===== SCRIPT PARSING =====
def parse_conversation_script(script_text):
    """Parse a script with ``Speaker: dialogue`` lines.

    Each non-blank line becomes ``{"speaker": ..., "text": ...}``. The text
    before the first colon is the speaker label; a line without a colon, or
    with an empty label, is attributed to ``"Speaker A"``. Lines that carry no
    dialogue (blank lines, or a bare ``"Host:"``) are skipped so that no empty
    utterance is produced. ``None`` or an empty script yields ``[]``.
    """
    default_speaker = "Speaker A"
    entries = []
    for raw_line in (script_text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue
        label, colon, dialogue = line.partition(":")
        if colon:
            speaker = label.strip() or default_speaker
            text = dialogue.strip()
        else:
            # Default to Speaker A if no label
            speaker, text = default_speaker, line
        if not text:
            continue
        entries.append({"speaker": speaker, "text": text})
    return entries
def generate_script_from_prompt(prompt, style="conversational"):
    """Build a six-line podcast script about *prompt* from a fixed template.

    *style* selects the template ("conversational", "interview" or "debate");
    any other value uses the conversational one. The lines are joined with
    newlines and ``{prompt}`` is substituted into them.
    """
    # Simple template-based generation
    conversational = (
        "Host: Welcome to our podcast! Today we're discussing {prompt}",
        "Co-host: That's right! It's a fascinating topic that affects many people.",
        "Host: Let's start with the basics. What should our audience know about this?",
        "Co-host: Well, first of all, it's important to understand the key concepts.",
        "Host: And what about the practical applications? How can people use this in their daily lives?",
        "Co-host: Great question! There are several ways to apply this knowledge effectively.",
    )
    interview = (
        "Interviewer: Thanks for joining us today to talk about {prompt}",
        "Guest: Happy to be here! It's a topic I'm very passionate about.",
        "Interviewer: Could you share some background on how you got involved in this field?",
        "Guest: Absolutely. It all started several years ago when I first discovered this area.",
        "Interviewer: What are the most exciting developments you're seeing right now?",
        "Guest: There are some incredible advancements happening that will change everything.",
    )
    debate = (
        "Moderator: Welcome to our debate on {prompt}",
        "Proponent: I believe this is one of the most important issues of our time.",
        "Opponent: While I respect that view, I have some serious concerns about the approach.",
        "Proponent: Let me address those concerns with some concrete evidence.",
        "Opponent: The evidence is compelling, but we must consider the broader implications.",
        "Moderator: Let's hear from both sides about potential solutions.",
    )
    by_style = {
        "conversational": conversational,
        "interview": interview,
        "debate": debate,
    }
    chosen = by_style[style] if style in by_style else conversational
    return "\n".join(entry.format(prompt=prompt) for entry in chosen)
# ===== MAIN GENERATION FUNCTIONS =====
# Module-wide engine instance used by the generation functions below.
# Constructing it runs TTSEngine.__init__ at import time.
tts_engine = TTSEngine()
def clone_voice(voice_name, upload_audio, reference_text):
    """Clone a voice from uploaded audio.

    Copies *upload_audio* into ``SAMPLE_DIR`` and registers a profile under
    *voice_name* (surrounding whitespace removed) with *reference_text* as its
    transcript. Returns a status string in every case; errors are reported in
    the string rather than raised.
    """
    missing_msg = "β Please provide a voice name and audio file"
    name = (voice_name or "").strip()
    if not name or not upload_audio:
        return missing_msg

    # The name comes straight from the UI and ends up in a file path. Keep
    # only harmless characters so that something like "../x" cannot point
    # outside SAMPLE_DIR; separators become "_" and edge dots are dropped.
    safe_stem = "".join(
        ch if ch.isalnum() or ch in " ._-" else "_" for ch in name
    ).strip(" .")
    if not safe_stem:
        return missing_msg

    try:
        # Save uploaded audio
        audio_ext = Path(upload_audio).suffix
        audio_path = f"{SAMPLE_DIR}/{safe_stem}{audio_ext}"
        # Copying a file onto itself raises; skip when already in place.
        if Path(upload_audio).resolve() != Path(audio_path).resolve():
            shutil.copy2(upload_audio, audio_path)
        # Save voice profile
        return tts_engine.voice_manager.add_profile(
            name, audio_path, (reference_text or "").strip()
        )
    except Exception as e:
        return f"β Error cloning voice: {str(e)}"
def _select_voice(speaker, speaker_a, speaker_b, assigned):
    """Pick the voice for a speaker label.

    Co-host/guest labels and labels whose last word is "b" use *speaker_b*;
    host/interviewer labels and labels whose last word is "a" use
    *speaker_a*. Any other label is given the two voices in turn by order of
    first appearance and remembered in *assigned*, so one speaker keeps one
    voice for the whole script.
    """
    label = speaker.strip().lower()
    words = label.split()
    last_word = words[-1] if words else ""
    # "co-host" contains "host", so the B-side labels must be tested first.
    if "co-host" in label or "cohost" in label or "guest" in label or last_word == "b":
        return speaker_b
    if "host" in label or "interviewer" in label or last_word == "a":
        return speaker_a
    if label not in assigned:
        assigned[label] = speaker_a if len(assigned) % 2 == 0 else speaker_b
    return assigned[label]


def generate_podcast(script_input, speaker_a, speaker_b, prompt_input, script_style):
    """Generate a complete podcast with two speakers.

    When *script_input* is blank and *prompt_input* is given, a script is
    first generated from the prompt in *script_style*. Every parsed line is
    synthesized with the voice chosen for its speaker and the clips are joined
    with half a second of silence between them.

    Always returns a 3-tuple ``(audio_file, script_file, status)``. On any
    failure the two paths are ``None`` and *status* holds the message.
    """
    sample_rate = 24000
    pause_seconds = 0.5
    try:
        # Generate script if prompt is provided
        if prompt_input and not (script_input and script_input.strip()):
            script_input = generate_script_from_prompt(prompt_input, script_style)
        if not script_input or not script_input.strip():
            return None, None, "β Please provide either a script or a prompt"

        # Parse conversation
        conversation = parse_conversation_script(script_input)
        if not conversation:
            return None, None, "β Could not parse script"

        # Generate audio for each line
        segments = []
        assigned = {}
        for line in conversation:
            speaker = line["speaker"]
            text = line["text"]
            voice = _select_voice(speaker, speaker_a, speaker_b, assigned)
            print(f"ποΈ {speaker} ({voice}): {text}")

            # Generate speech
            wav, error = tts_engine.generate_speech(text, voice)
            if error:
                return None, None, error

            wav = np.asarray(wav)
            if segments:
                # Add a small pause between speakers
                pause = np.zeros(int(pause_seconds * sample_rate), dtype=wav.dtype)
                segments.append(pause)
            segments.append(wav)

        # One concatenation at the end instead of re-copying the growing
        # result after every line.
        combined_audio = np.concatenate(segments)

        # Save final audio. The temporary file is only used to reserve a
        # name and is closed before soundfile writes to it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            audio_file = tmp.name
        sf.write(audio_file, combined_audio, sample_rate)

        # Save script next to the audio as "<stem>_script.txt"
        audio_path = Path(audio_file)
        script_file = str(audio_path.with_name(audio_path.stem + "_script.txt"))
        with open(script_file, 'w', encoding="utf-8") as f:
            f.write(script_input)

        return audio_file, script_file, "β Podcast generated successfully!"
    except Exception as e:
        return None, None, f"β Error: {str(e)}"
# ===== GRADIO UI =====
# Custom stylesheet handed to gr.Blocks(css=...): page container, gradient
# header banner, bordered sections, output area, button variants and tabs.
css = """
.container { max-width: 1400px; margin: 0 auto; }
.header { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%); color: white; padding: 30px; border-radius: 12px; margin-bottom: 25px; text-align: center; border: 3px solid #1E90FF; }
.section { border: 2px solid #32CD32; border-radius: 10px; padding: 20px; margin-bottom: 20px; background: white; }
.output-section { background: linear-gradient(135deg, #F0FFF0 0%, #F0F8FF 100%); border: 2px dashed #1E90FF; border-radius: 10px; padding: 20px; margin-top: 20px; }
.btn-primary { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%) !important; border: 2px solid #1E90FF !important; color: white !important; font-weight: bold !important; }
.btn-secondary { background: linear-gradient(135deg, #FFA500 0%, #FF6347 100%) !important; border: 2px solid #FF6347 !important; color: white !important; }
.tab { background: #f0f8ff; padding: 15px; border-radius: 8px; margin: 10px 0; }
"""
# Build the three-tab interface (voice cloning, podcast studio, output) and
# wire its buttons to the generation functions defined above.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.HTML("""
    <div class="header">
        <h1>ποΈ 2nd-Host AI - Complete Podcast Studio</h1>
        <h3>Voice Cloning β’ 2-Speaker Podcasts β’ Script Generation β’ Export</h3>
    </div>
    """)

    def _default_voices(voices):
        """Return the default (first, second) selections for a voice list.

        The second selection falls back to the first when only one voice
        exists; both are ``None`` for an empty list.
        """
        first = voices[0] if voices else None
        second = voices[1] if len(voices) > 1 else first
        return first, second

    # Initialize voice manager
    voice_manager = VoiceProfileManager()
    available_voices = voice_manager.list_profiles()
    default_a, default_b = _default_voices(available_voices)

    with gr.Tab("π Voice Cloning"):
        gr.Markdown("### Clone New Voices")
        with gr.Row():
            with gr.Column():
                voice_name = gr.Textbox(label="Voice Name", placeholder="e.g., 'David', 'Sarah', 'Expert'")
                upload_audio = gr.Audio(label="Reference Audio", type="filepath")
                reference_text = gr.Textbox(
                    label="Reference Text",
                    value="Hey there, this is my voice for cloning.",
                    placeholder="Text spoken in the reference audio"
                )
                clone_btn = gr.Button("π― Clone Voice", variant="primary")
            with gr.Column():
                clone_status = gr.Textbox(label="Cloning Status", interactive=False)
                available_voices_display = gr.Dropdown(
                    label="Available Voices",
                    choices=available_voices,
                    value=default_a
                )
                refresh_btn = gr.Button("π Refresh Voices")

    with gr.Tab("π¬ Podcast Studio"):
        gr.Markdown("### Create 2-Speaker Podcast")
        with gr.Row():
            with gr.Column():
                # Script input
                script_input = gr.Textbox(
                    label="Podcast Script",
                    lines=6,
                    placeholder="""Format: Speaker: Dialogue
Example:
Host: Welcome to our show!
Co-host: Thanks for having me!
Host: Let's discuss AI voice technology...
Co-host: It's revolutionizing content creation!""",
                    value=""
                )
                # Script generation
                prompt_input = gr.Textbox(
                    label="Or Generate from Prompt",
                    placeholder="e.g., 'The future of AI in education'"
                )
                script_style = gr.Radio(
                    choices=["conversational", "interview", "debate"],
                    label="Script Style",
                    value="conversational"
                )
                generate_script_btn = gr.Button("π Generate Script", variant="secondary")
            with gr.Column():
                # Speaker selection
                speaker_a = gr.Dropdown(
                    choices=available_voices,
                    label="π€ Speaker A (Host)",
                    value=default_a
                )
                speaker_b = gr.Dropdown(
                    choices=available_voices,
                    label="π€ Speaker B (Co-host/Guest)",
                    value=default_b
                )
                generate_btn = gr.Button("π Generate Podcast", variant="primary", size="lg")

    with gr.Tab("π€ Output"):
        gr.Markdown("### Generated Podcast")
        with gr.Row():
            with gr.Column():
                audio_output = gr.Audio(label="π§ Podcast Audio", type="filepath")
                script_output = gr.File(label="π Script File", file_types=[".txt"])
            with gr.Column():
                generation_status = gr.Textbox(label="Generation Status", lines=3)
                # NOTE(review): no click handler is attached to this button
                # anywhere in this block.
                download_btn = gr.Button("πΎ Download All", variant="primary")

    # ===== EVENT HANDLERS =====
    def refresh_voices():
        """Re-read the profiles from disk and rebuild all three voice dropdowns.

        Returns updates for, in order: the "Available Voices" list, Speaker A
        and Speaker B.
        """
        voices = VoiceProfileManager().list_profiles()
        first, second = _default_voices(voices)
        return (
            gr.Dropdown(choices=voices, value=first),
            gr.Dropdown(choices=voices, value=first),
            gr.Dropdown(choices=voices, value=second),
        )

    def handle_clone_voice(voice_name, audio_path, text):
        """Clone the voice, then refresh every dropdown that lists voices."""
        result = clone_voice(voice_name, audio_path, text)
        return result, *refresh_voices()

    def handle_generate_script(prompt, style):
        """Return a generated script for the script textbox.

        NOTE(review): the "please enter a prompt" message is written into the
        script box itself because that box is the only bound output.
        """
        if not prompt:
            return "β Please enter a prompt"
        return generate_script_from_prompt(prompt, style)

    def handle_generate_podcast(script, speaker_a, speaker_b, prompt, style):
        """Run podcast generation and return (audio, script file, status)."""
        result = generate_podcast(script, speaker_a, speaker_b, prompt, style)
        # Three outputs are bound below; widen a two-value (audio, status)
        # result so the output count always matches.
        if len(result) == 2:
            audio, status = result
            return audio, None, status
        return result

    # Connect events
    clone_btn.click(
        handle_clone_voice,
        inputs=[voice_name, upload_audio, reference_text],
        outputs=[clone_status, available_voices_display, speaker_a, speaker_b]
    )
    refresh_btn.click(
        refresh_voices,
        outputs=[available_voices_display, speaker_a, speaker_b]
    )
    generate_script_btn.click(
        handle_generate_script,
        inputs=[prompt_input, script_style],
        outputs=[script_input]
    )
    generate_btn.click(
        handle_generate_podcast,
        inputs=[script_input, speaker_a, speaker_b, prompt_input, script_style],
        outputs=[audio_output, script_output, generation_status]
    )
# Start the web app only when this file is executed as a script;
# share=True asks Gradio for a public share link as well.
if __name__ == "__main__":
    demo.launch(share=True)