"""echoAI - Streamlit front-end for the IndexTTS zero-shot TTS model.

Flow: download checkpoints on first run, load the model once (cached by
Streamlit), then let the user upload a reference voice clip and synthesize
arbitrary text in that voice.
"""

import os
import sys
import time

import streamlit as st
import torch
from huggingface_hub import snapshot_download

# Make the vendored `indextts` package importable regardless of CWD.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
sys.path.append(os.path.join(current_dir, "indextts"))

from indextts.infer import IndexTTS
from tools.i18n.i18n import I18nAuto

# Initialize internationalization
i18n = I18nAuto(language="en")  # Changed to English

# GPU configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# App configuration
st.set_page_config(page_title="echoAI - IndexTTS", layout="wide")

# Create necessary directories
os.makedirs("outputs/tasks", exist_ok=True)
os.makedirs("prompts", exist_ok=True)

# Download checkpoints if not exists
if not os.path.exists("checkpoints"):
    snapshot_download("IndexTeam/IndexTTS-1.5", local_dir="checkpoints")


# Load TTS model with GPU support
@st.cache_resource
def load_model():
    """Load and cache the IndexTTS model (moved to GPU when available).

    Cached by Streamlit so the model is constructed only once per server
    process, not on every script rerun.
    """
    tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")
    tts.load_normalizer()
    if DEVICE == "cuda":
        tts.model.to(DEVICE)  # Move model to GPU if available
    return tts


tts = load_model()


# Inference function with device awareness
def infer(voice_path, text, output_path=None):
    """Synthesize `text` in the voice of `voice_path` and return the wav path.

    If `output_path` is not given, a timestamped file under outputs/ is used.
    """
    if not output_path:
        output_path = os.path.join("outputs", f"spk_{int(time.time())}.wav")
    # Ensure input is on correct device
    tts.infer(voice_path, text, output_path)
    return output_path


# Streamlit UI
st.title("echoAI - IndexTTS")
st.markdown("""

An Industrial-Level Controllable and Efficient Zero-Shot Text-To-Speech System

""", unsafe_allow_html=True)

# Device status indicator
st.sidebar.markdown(f"**Device:** {DEVICE.upper()}")

# Main interface
with st.container():
    st.header("Audio Generation")  # Translated
    col1, col2 = st.columns(2)

    with col1:
        uploaded_audio = st.file_uploader(
            "Upload reference audio",  # Translated
            type=["wav", "mp3", "ogg"],
            accept_multiple_files=False,
        )
        input_text = st.text_area(
            "Input target text",  # Translated
            height=150,
            placeholder="Enter text to synthesize...",
        )
        generate_btn = st.button("Generate Speech")  # Translated

    with col2:
        if generate_btn and uploaded_audio and input_text:
            with st.spinner("Generating audio..."):
                # Save uploaded audio so IndexTTS can read it from disk.
                audio_path = os.path.join("prompts", uploaded_audio.name)
                with open(audio_path, "wb") as f:
                    f.write(uploaded_audio.getbuffer())

                # Perform inference
                try:
                    output_path = infer(audio_path, input_text)
                    st.audio(output_path, format="audio/wav")
                    st.success("Generation complete!")

                    # Download button
                    # FIX: the original call was missing its closing
                    # parenthesis, which made the whole file a SyntaxError.
                    with open(output_path, "rb") as f:
                        st.download_button(
                            "Download Result",  # Translated
                            f,
                            file_name=os.path.basename(output_path),
                        )
                except Exception as e:
                    st.error(f"Error: {str(e)}")
        elif generate_btn:
            st.warning("Please upload an audio file and enter text first!")  # Translated

# Sidebar with additional info
# NOTE(review): newlines in these markdown strings were restored — the
# collapsed source had the list items run together on one line, which does
# not render as a markdown list.
with st.sidebar:
    st.header("About echoAI")
    st.markdown("""
### Key Features:
- Zero-shot voice cloning
- Industrial-grade TTS
- Efficient synthesis
- Controllable output
""")
    st.markdown("---")
    st.markdown("""
### Usage Instructions:
1. Upload a reference audio clip
2. Enter target text
3. Click 'Generate Speech'
""")

if __name__ == "__main__":
    # Cleanup old files if needed
    pass