import streamlit as st
import os
import time
import sys
import torch
from huggingface_hub import snapshot_download

# Make the repo root and the bundled `indextts` package importable regardless
# of the working directory the app is launched from.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
sys.path.append(os.path.join(current_dir, "indextts"))

# These imports rely on the sys.path entries added above.
from indextts.infer import IndexTTS
from tools.i18n.i18n import I18nAuto
# Initialize internationalization helper (UI strings below are English already).
i18n = I18nAuto(language="en") # Changed to English

# GPU configuration: prefer CUDA when a GPU is visible, else fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# App configuration (must run before any other st.* call in a Streamlit app).
st.set_page_config(page_title="echoAI - IndexTTS", layout="wide")

# Create necessary directories: generated audio goes under outputs/,
# uploaded reference clips under prompts/.
os.makedirs("outputs/tasks", exist_ok=True)
os.makedirs("prompts", exist_ok=True)

# Download model checkpoints on first run only; the existence of the
# checkpoints/ directory is used as the "already downloaded" marker.
if not os.path.exists("checkpoints"):
    snapshot_download("IndexTeam/IndexTTS-1.5", local_dir="checkpoints")
# Load TTS model with GPU support
@st.cache_resource  # cache so the model is built once per Streamlit server, not per rerun
def load_model():
    """Build the IndexTTS engine from ./checkpoints and move it to DEVICE.

    Returns the ready-to-use IndexTTS instance.
    """
    tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")
    # Presumably prepares the text normalizer used before synthesis —
    # TODO(review): confirm this method exists on the installed IndexTTS version.
    tts.load_normalizer()
    if DEVICE == "cuda":
        # NOTE(review): assumes IndexTTS exposes a torch Module at `.model`;
        # some versions manage device placement internally — verify.
        tts.model.to(DEVICE) # Move model to GPU if available
    return tts

# Module-level singleton used by infer() below.
tts = load_model()
def infer(voice_path, text, output_path=None):
    """Synthesize *text* in the voice of the reference clip at *voice_path*.

    Any falsy *output_path* (None or "") is replaced with a timestamped
    wav file under outputs/. Returns the path of the generated file.
    """
    target = output_path or os.path.join("outputs", f"spk_{int(time.time())}.wav")
    # Delegate to the cached module-level IndexTTS engine.
    tts.infer(voice_path, text, target)
    return target
# Streamlit UI: page title and one-line tagline.
st.title("echoAI - IndexTTS")
st.markdown("""
An Industrial-Level Controllable and Efficient Zero-Shot Text-To-Speech System
""", unsafe_allow_html=True)

# Device status indicator so the user can see whether CUDA or CPU is in use.
st.sidebar.markdown(f"**Device:** {DEVICE.upper()}")
# Main interface: left column collects inputs, right column shows the result.
with st.container():
    st.header("Audio Generation") # Translated
    col1, col2 = st.columns(2)

    with col1:
        uploaded_audio = st.file_uploader(
            "Upload reference audio", # Translated
            type=["wav", "mp3", "ogg"],
            accept_multiple_files=False
        )
        input_text = st.text_area(
            "Input target text", # Translated
            height=150,
            placeholder="Enter text to synthesize..."
        )
        generate_btn = st.button("Generate Speech") # Translated

    with col2:
        if generate_btn and uploaded_audio and input_text:
            with st.spinner("Generating audio..."):
                # Persist the uploaded reference clip so the TTS engine can
                # read it from disk.
                audio_path = os.path.join("prompts", uploaded_audio.name)
                with open(audio_path, "wb") as f:
                    f.write(uploaded_audio.getbuffer())

                # Perform inference; surface any model error in the UI rather
                # than crashing the app.
                try:
                    output_path = infer(audio_path, input_text)
                    st.audio(output_path, format="audio/wav")
                    st.success("Generation complete!")
                    # Download button for the generated wav.
                    with open(output_path, "rb") as f:
                        st.download_button(
                            "Download Result", # Translated
                            f,
                            file_name=os.path.basename(output_path),
                            mime="audio/wav",  # explicit type so browsers save a proper .wav
                        )  # BUGFIX: this closing paren was missing (SyntaxError)
                except Exception as e:
                    st.error(f"Error: {str(e)}")
        elif generate_btn:
            st.warning("Please upload an audio file and enter text first!") # Translated
# Sidebar with additional info: feature list and short usage instructions.
with st.sidebar:
    st.header("About echoAI")
    st.markdown("""
    ### Key Features:
    - Zero-shot voice cloning
    - Industrial-grade TTS
    - Efficient synthesis
    - Controllable output
    """)
    st.markdown("---")
    st.markdown("""
    ### Usage Instructions:
    1. Upload a reference audio clip
    2. Enter target text
    3. Click 'Generate Speech'
    """)
# Placeholder entry guard: Streamlit executes this file top-to-bottom, so the
# app runs on import; nothing extra is needed here yet.
if __name__ == "__main__":
    # Cleanup old files if needed
    pass