2nd-Host-Ai / app.py
Actual-Innocence's picture
Update app.py
aac2889 verified
raw
history blame
16.1 kB
import gradio as gr
import os
import tempfile
import requests
import soundfile as sf
import json
import shutil
from pathlib import Path
import numpy as np
# ===== NEUTTS IMPORTS =====
from neuttsair.neutts import NeuTTSAir
# ===== CONFIGURATION =====
# JSON file in which VoiceProfileManager persists the voice profiles
# (relative path, so it is resolved against the current working directory).
CONFIG_FILE = "voice_profiles.json"
# Directory that receives the downloaded default samples and the reference
# audio copied in by clone_voice().
SAMPLE_DIR = "samples"
# Created at import time so later writes into SAMPLE_DIR find the directory.
os.makedirs(SAMPLE_DIR, exist_ok=True)
# ===== VOICE PROFILE MANAGEMENT =====
class VoiceProfileManager:
    """Persist named voice profiles in a JSON file.

    ``profiles`` maps a voice name to a dict with the keys ``audio_path``
    (reference recording), ``text`` (transcript of that recording) and
    ``created_at`` (string form of ``np.datetime64('now')``).
    """

    def __init__(self, config_file=CONFIG_FILE):
        self.config_file = config_file
        self.profiles = self.load_profiles()

    def load_profiles(self):
        """Return the profiles stored in ``config_file``.

        A missing file yields ``{}``.  A file that cannot be read, is not
        valid JSON (for example empty or truncated) or does not contain a
        JSON object is reported, moved aside to ``<config_file>.corrupt`` so
        its content is not overwritten by the next save, and ``{}`` is
        returned so the application can still start.
        """
        if not os.path.exists(self.config_file):
            return {}
        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            self._set_aside_unreadable_file(str(e))
            return {}
        if not isinstance(data, dict):
            self._set_aside_unreadable_file("top-level JSON value is not an object")
            return {}
        return data

    def _set_aside_unreadable_file(self, reason):
        """Report an unusable profile file and keep a copy of it."""
        print(f"❌ Could not load voice profiles from {self.config_file}: {reason}")
        try:
            os.replace(self.config_file, f"{self.config_file}.corrupt")
        except OSError:
            # Best effort only: failing to keep the copy must not stop start-up.
            pass

    def save_profiles(self):
        """Write ``profiles`` to ``config_file``.

        The data is written to a temporary sibling file first and then moved
        into place, so an interrupted write cannot leave a half-written
        profile file behind.
        """
        tmp_path = f"{self.config_file}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(self.profiles, f, indent=2)
        os.replace(tmp_path, self.config_file)

    def add_profile(self, name, audio_path, text):
        """Create or overwrite the profile ``name``, save, and return a status message."""
        self.profiles[name] = {
            "audio_path": audio_path,
            "text": text,
            "created_at": str(np.datetime64('now')),
        }
        self.save_profiles()
        return f"βœ… Voice profile '{name}' saved!"

    def get_profile(self, name):
        """Return the profile dict for ``name``, or ``None`` if it is unknown."""
        return self.profiles.get(name)

    def list_profiles(self):
        """Return the profile names as a list."""
        return list(self.profiles.keys())
# ===== SAMPLE MANAGEMENT =====
def _download_file(url, dest_path, binary):
    """Fetch ``url`` into ``dest_path``; raise on HTTP or I/O failure."""
    response = requests.get(url, timeout=30)
    # Without this check an HTTP error page would be stored as the sample.
    response.raise_for_status()
    # Write next to the target and move into place, so a failed or partial
    # download never leaves a file that looks like a finished sample.
    tmp_path = f"{dest_path}.part"
    if binary:
        with open(tmp_path, 'wb') as f:
            f.write(response.content)
    else:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
    os.replace(tmp_path, dest_path)


def download_default_samples():
    """Download the default sample voices into ``SAMPLE_DIR``.

    For every sample voice the reference recording (``<name>.wav``) and its
    transcript (``<name>.txt``) are fetched.  Each file is checked on its own,
    so a transcript that failed to download earlier is retried even when the
    audio is already present.  Failures are printed, never raised: missing
    samples must not prevent the application from starting.
    """
    # The organisation name is "neuphonic", matching the model repositories
    # used by TTSEngine.
    samples = {
        "dave": {
            "audio": "https://github.com/neuphonic/neutts-air/raw/main/samples/dave.wav",
            "text": "https://raw.githubusercontent.com/neuphonic/neutts-air/main/samples/dave.txt",
        },
        "andrea": {
            "audio": "https://github.com/neuphonic/neutts-air/raw/main/samples/andrea.wav",
            "text": "https://raw.githubusercontent.com/neuphonic/neutts-air/main/samples/andrea.txt",
        },
    }
    for name, urls in samples.items():
        wanted = (
            (urls["audio"], f"{SAMPLE_DIR}/{name}.wav", True),
            (urls["text"], f"{SAMPLE_DIR}/{name}.txt", False),
        )
        missing = [item for item in wanted if not os.path.exists(item[1])]
        if not missing:
            continue
        try:
            for url, dest_path, binary in missing:
                _download_file(url, dest_path, binary)
            print(f"βœ… Downloaded {name} sample")
        except (requests.RequestException, OSError) as e:
            print(f"❌ Failed to download {name}: {e}")
# ===== TTS ENGINE =====
class TTSEngine:
    """Wrap a lazily created NeuTTSAir model together with the voice profiles."""

    def __init__(self):
        # The model is created on demand by initialize_tts().
        self.tts = None
        self.voice_manager = VoiceProfileManager()
        download_default_samples()

    def initialize_tts(self):
        """Return the NeuTTSAir instance, creating it on the first call."""
        if self.tts is not None:
            return self.tts
        print("πŸš€ Initializing NeuTTS Q4 GGUF...")
        model_options = {
            "backbone_repo": "neuphonic/neutts-air-q4-gguf",
            "backbone_device": "cpu",
            "codec_repo": "neuphonic/neucodec",
            "codec_device": "cpu",
        }
        self.tts = NeuTTSAir(**model_options)
        return self.tts

    def generate_speech(self, text, voice_name):
        """Synthesize ``text`` with the voice stored under ``voice_name``.

        Returns ``(wav, None)`` on success, where ``wav`` is whatever
        ``infer`` produced, and ``(None, message)`` when the profile is
        unknown or any step raises.
        """
        try:
            engine = self.initialize_tts()
            profile = self.voice_manager.get_profile(voice_name)
            if profile:
                reference_codes = engine.encode_reference(profile["audio_path"])
                transcript = profile["text"]
                return engine.infer(text, reference_codes, transcript), None
            return None, f"❌ Voice profile '{voice_name}' not found"
        except Exception as exc:
            return None, f"❌ Generation error: {str(exc)}"
# ===== SCRIPT PARSING =====
def parse_conversation_script(script_text):
    """Parse a script into a list of ``{"speaker", "text"}`` dicts.

    Every non-empty line is one utterance.  ``Label: dialogue`` lines are
    split at the first colon; lines without a colon, and lines whose label is
    empty, are attributed to ``"Speaker A"``.  Lines that carry no dialogue
    (for example a bare ``Host:``) are skipped, because there is nothing to
    speak.  ``None`` or an empty script yields ``[]``.
    """
    if not script_text:
        return []
    default_speaker = "Speaker A"
    entries = []
    for raw_line in script_text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        label, colon, dialogue = line.partition(':')
        if colon:
            speaker = label.strip() or default_speaker
            text = dialogue.strip()
        else:
            speaker, text = default_speaker, line
        if text:
            entries.append({"speaker": speaker, "text": text})
    return entries
def generate_script_from_prompt(prompt, style="conversational"):
    """Build a six-line two-speaker script about ``prompt`` from a fixed template.

    ``style`` picks the template ("conversational", "interview" or "debate");
    any other value falls back to "conversational".  The lines are joined with
    newlines and ``{prompt}`` is filled in via ``str.format``.
    """
    conversational_lines = [
        "Host: Welcome to our podcast! Today we're discussing {prompt}",
        "Co-host: That's right! It's a fascinating topic that affects many people.",
        "Host: Let's start with the basics. What should our audience know about this?",
        "Co-host: Well, first of all, it's important to understand the key concepts.",
        "Host: And what about the practical applications? How can people use this in their daily lives?",
        "Co-host: Great question! There are several ways to apply this knowledge effectively.",
    ]
    interview_lines = [
        "Interviewer: Thanks for joining us today to talk about {prompt}",
        "Guest: Happy to be here! It's a topic I'm very passionate about.",
        "Interviewer: Could you share some background on how you got involved in this field?",
        "Guest: Absolutely. It all started several years ago when I first discovered this area.",
        "Interviewer: What are the most exciting developments you're seeing right now?",
        "Guest: There are some incredible advancements happening that will change everything.",
    ]
    debate_lines = [
        "Moderator: Welcome to our debate on {prompt}",
        "Proponent: I believe this is one of the most important issues of our time.",
        "Opponent: While I respect that view, I have some serious concerns about the approach.",
        "Proponent: Let me address those concerns with some concrete evidence.",
        "Opponent: The evidence is compelling, but we must consider the broader implications.",
        "Moderator: Let's hear from both sides about potential solutions.",
    ]
    script_templates = {
        "conversational": conversational_lines,
        "interview": interview_lines,
        "debate": debate_lines,
    }
    # Unknown styles use the conversational template.
    if style in script_templates:
        chosen = script_templates[style]
    else:
        chosen = conversational_lines
    return "\n".join(entry.format(prompt=prompt) for entry in chosen)
# ===== MAIN GENERATION FUNCTIONS =====
# Module-wide engine used by clone_voice() and generate_podcast(). Constructing
# it creates a VoiceProfileManager and calls download_default_samples(); the
# NeuTTSAir model itself is only created on the first initialize_tts() call.
tts_engine = TTSEngine()
def clone_voice(voice_name, upload_audio, reference_text):
    """Register an uploaded recording as a new voice profile.

    The recording at ``upload_audio`` is copied to
    ``SAMPLE_DIR/<voice_name><original extension>`` and stored, together with
    ``reference_text``, under ``voice_name`` (surrounding whitespace removed).

    Returns a status message in every case; errors are reported in the
    message rather than raised.
    """
    name = (voice_name or "").strip()
    if not name or not upload_audio:
        return "❌ Please provide a voice name and audio file"
    # The name comes from the web form and becomes part of a file path, so it
    # must not be able to point outside SAMPLE_DIR.
    separators = {sep for sep in (os.sep, os.altsep, "/", "\\") if sep}
    if name in {".", ".."} or any(sep in name for sep in separators):
        return "❌ Voice name must not contain path separators"
    try:
        # Save uploaded audio
        audio_ext = Path(upload_audio).suffix
        audio_path = f"{SAMPLE_DIR}/{name}{audio_ext}"
        shutil.copy2(upload_audio, audio_path)
        # Save voice profile
        return tts_engine.voice_manager.add_profile(name, audio_path, reference_text)
    except Exception as e:
        return f"❌ Error cloning voice: {str(e)}"
def _select_voice(speaker, speaker_a, speaker_b, fallback_voices):
    """Map a speaker label to ``speaker_a`` or ``speaker_b``.

    Labels are compared word by word, so "Speaker B" is not mistaken for an
    "a" label and "Co-host" is not mistaken for "Host".  Unrecognised labels
    receive the two voices alternately in order of first appearance and keep
    that voice for the rest of the script (remembered in ``fallback_voices``).
    """
    label = speaker.lower()
    words = set(label.replace("-", " ").replace("_", " ").split())
    # Co-host must be tested before host: "co-host" also contains "host".
    if "cohost" in words or {"co", "host"} <= words or words & {"guest", "b"}:
        return speaker_b
    if words & {"host", "interviewer", "a"}:
        return speaker_a
    if label not in fallback_voices:
        fallback_voices[label] = speaker_a if len(fallback_voices) % 2 == 0 else speaker_b
    return fallback_voices[label]


def generate_podcast(script_input, speaker_a, speaker_b, prompt_input, script_style):
    """Generate a complete podcast with two speakers.

    When no script is given but a prompt is, a script is first generated from
    the prompt in ``script_style``.  Every script line is synthesized with the
    voice chosen for its speaker label, and the clips are joined with a
    0.5 second pause in between.

    Returns a 3-tuple ``(audio_file, script_file, status_message)``.  On any
    failure both paths are ``None`` and the message describes the problem, so
    the tuple always matches the three output components of the UI.
    """
    try:
        has_script = bool(script_input and script_input.strip())
        # Generate script if prompt is provided
        if prompt_input and not has_script:
            script_input = generate_script_from_prompt(prompt_input, script_style)
            has_script = bool(script_input and script_input.strip())
        if not has_script:
            return None, None, "❌ Please provide either a script or a prompt"

        # Parse conversation
        conversation = parse_conversation_script(script_input)
        if not conversation:
            return None, None, "❌ Could not parse script"

        # Generate audio for each line
        current_sample_rate = 24000
        segments = []
        fallback_voices = {}
        for line in conversation:
            speaker = line["speaker"]
            text = line["text"]
            if not text:
                # Nothing to speak for a label without dialogue.
                continue
            voice = _select_voice(speaker, speaker_a, speaker_b, fallback_voices)
            print(f"πŸŽ™οΈ {speaker} ({voice}): {text}")

            wav, error = tts_engine.generate_speech(text, voice)
            if error:
                return None, None, error
            wav = np.asarray(wav)
            if segments:
                # 0.5 second pause between speakers, in the clip's own dtype
                # so the pause does not change the dtype of the result.
                segments.append(np.zeros(int(0.5 * current_sample_rate), dtype=wav.dtype))
            segments.append(wav)

        if not segments:
            return None, None, "❌ Script contains no dialogue to speak"
        # One concatenation at the end instead of re-copying the growing
        # array for every line.
        combined_audio = np.concatenate(segments)

        # Reserve a temp file name, then close the handle before writing:
        # an open NamedTemporaryFile cannot be reopened by name on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            audio_file = f.name
        sf.write(audio_file, combined_audio, current_sample_rate)

        # Swap only the trailing ".wav"; str.replace would also rewrite a
        # ".wav" occurring elsewhere in the temp path.
        script_file = audio_file[:-len(".wav")] + "_script.txt"
        with open(script_file, 'w', encoding='utf-8') as f:
            f.write(script_input)
        return audio_file, script_file, "βœ… Podcast generated successfully!"
    except Exception as e:
        return None, None, f"❌ Error: {str(e)}"
# ===== GRADIO UI =====
# Custom stylesheet handed to gr.Blocks(css=...). The "header" class is used
# by the HTML banner at the top of the page; the remaining classes are not
# attached to any component in the code shown in this file.
css = """
.container { max-width: 1400px; margin: 0 auto; }
.header { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%); color: white; padding: 30px; border-radius: 12px; margin-bottom: 25px; text-align: center; border: 3px solid #1E90FF; }
.section { border: 2px solid #32CD32; border-radius: 10px; padding: 20px; margin-bottom: 20px; background: white; }
.output-section { background: linear-gradient(135deg, #F0FFF0 0%, #F0F8FF 100%); border: 2px dashed #1E90FF; border-radius: 10px; padding: 20px; margin-top: 20px; }
.btn-primary { background: linear-gradient(135deg, #32CD32 0%, #1E90FF 100%) !important; border: 2px solid #1E90FF !important; color: white !important; font-weight: bold !important; }
.btn-secondary { background: linear-gradient(135deg, #FFA500 0%, #FF6347 100%) !important; border: 2px solid #FF6347 !important; color: white !important; }
.tab { background: #f0f8ff; padding: 15px; border-radius: 8px; margin: 10px 0; }
"""
def _default_voice_pair(voices):
    """Return the default (speaker A, speaker B) selection for ``voices``.

    A is the first voice; B is the second voice, or the first when only one
    exists.  Both are ``None`` when there are no voices.
    """
    first = voices[0] if voices else None
    second = voices[1] if len(voices) > 1 else first
    return first, second


# Page layout: three tabs (voice cloning, podcast studio, output) followed by
# the event handlers, which must be registered inside the Blocks context.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.HTML("""
<div class="header">
<h1>πŸŽ™οΈ 2nd-Host AI - Complete Podcast Studio</h1>
<h3>Voice Cloning β€’ 2-Speaker Podcasts β€’ Script Generation β€’ Export</h3>
</div>
""")
    # Initialize voice manager
    voice_manager = VoiceProfileManager()
    available_voices = voice_manager.list_profiles()
    default_a, default_b = _default_voice_pair(available_voices)

    with gr.Tab("🎭 Voice Cloning"):
        gr.Markdown("### Clone New Voices")
        with gr.Row():
            with gr.Column():
                voice_name = gr.Textbox(label="Voice Name", placeholder="e.g., 'David', 'Sarah', 'Expert'")
                upload_audio = gr.Audio(label="Reference Audio", type="filepath")
                reference_text = gr.Textbox(
                    label="Reference Text",
                    value="Hey there, this is my voice for cloning.",
                    placeholder="Text spoken in the reference audio"
                )
                clone_btn = gr.Button("🎯 Clone Voice", variant="primary")
            with gr.Column():
                clone_status = gr.Textbox(label="Cloning Status", interactive=False)
                available_voices_display = gr.Dropdown(
                    label="Available Voices",
                    choices=available_voices,
                    value=default_a
                )
                refresh_btn = gr.Button("πŸ”„ Refresh Voices")

    with gr.Tab("🎬 Podcast Studio"):
        gr.Markdown("### Create 2-Speaker Podcast")
        with gr.Row():
            with gr.Column():
                # Script input
                script_input = gr.Textbox(
                    label="Podcast Script",
                    lines=6,
                    placeholder="""Format: Speaker: Dialogue
Example:
Host: Welcome to our show!
Co-host: Thanks for having me!
Host: Let's discuss AI voice technology...
Co-host: It's revolutionizing content creation!""",
                    value=""
                )
                # Script generation
                prompt_input = gr.Textbox(
                    label="Or Generate from Prompt",
                    placeholder="e.g., 'The future of AI in education'"
                )
                script_style = gr.Radio(
                    choices=["conversational", "interview", "debate"],
                    label="Script Style",
                    value="conversational"
                )
                generate_script_btn = gr.Button("πŸ“ Generate Script", variant="secondary")
            with gr.Column():
                # Speaker selection
                speaker_a = gr.Dropdown(
                    choices=available_voices,
                    label="🎀 Speaker A (Host)",
                    value=default_a
                )
                speaker_b = gr.Dropdown(
                    choices=available_voices,
                    label="🎀 Speaker B (Co-host/Guest)",
                    value=default_b
                )
                generate_btn = gr.Button("πŸš€ Generate Podcast", variant="primary", size="lg")

    with gr.Tab("πŸ“€ Output"):
        gr.Markdown("### Generated Podcast")
        with gr.Row():
            with gr.Column():
                audio_output = gr.Audio(label="🎧 Podcast Audio", type="filepath")
                script_output = gr.File(label="πŸ“„ Script File", file_types=[".txt"])
            with gr.Column():
                generation_status = gr.Textbox(label="Generation Status", lines=3)
                # NOTE(review): no click handler is attached to this button.
                download_btn = gr.Button("πŸ’Ύ Download All", variant="primary")

    # ===== EVENT HANDLERS =====
    def refresh_voices():
        """Re-read the saved profiles and rebuild every voice dropdown.

        Returns updates for speaker A, speaker B and the "Available Voices"
        list, in that order.
        """
        voices = VoiceProfileManager().list_profiles()
        first, second = _default_voice_pair(voices)
        return (
            gr.Dropdown(choices=voices, value=first),
            gr.Dropdown(choices=voices, value=second),
            gr.Dropdown(choices=voices, value=first),
        )

    def handle_clone_voice(voice_name, audio_path, text):
        """Clone the voice, then refresh the dropdowns so it can be selected."""
        result = clone_voice(voice_name, audio_path, text)
        return result, *refresh_voices()

    def handle_generate_script(prompt, style):
        """Return a generated script for the script textbox.

        An empty prompt is reported through gr.Error instead of being written
        into the script box, where it would replace the user's script and be
        treated as dialogue by the podcast generator.
        """
        if not prompt:
            raise gr.Error("Please enter a prompt")
        return generate_script_from_prompt(prompt, style)

    def handle_generate_podcast(script, speaker_a, speaker_b, prompt, style):
        """Forward the form values to generate_podcast()."""
        return generate_podcast(script, speaker_a, speaker_b, prompt, style)

    # Connect events
    clone_btn.click(
        handle_clone_voice,
        inputs=[voice_name, upload_audio, reference_text],
        outputs=[clone_status, speaker_a, speaker_b, available_voices_display]
    )
    refresh_btn.click(
        refresh_voices,
        outputs=[speaker_a, speaker_b, available_voices_display]
    )
    generate_script_btn.click(
        handle_generate_script,
        inputs=[prompt_input, script_style],
        outputs=[script_input]
    )
    generate_btn.click(
        handle_generate_podcast,
        inputs=[script_input, speaker_a, speaker_b, prompt_input, script_style],
        outputs=[audio_output, script_output, generation_status]
    )

if __name__ == "__main__":
    # share=True asks Gradio to also create a public share link.
    demo.launch(share=True)