import streamlit as st
import os
import time
import sys
import torch
from huggingface_hub import snapshot_download

# Make the repo root and the bundled `indextts` package importable regardless
# of the working directory the app is launched from.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
sys.path.append(os.path.join(current_dir, "indextts"))

# These imports rely on the sys.path entries added above.
from indextts.infer import IndexTTS
from tools.i18n.i18n import I18nAuto
# Initialize internationalization helper (UI strings below are English already).
i18n = I18nAuto(language="en") # Changed to English

# GPU configuration: prefer CUDA when a GPU is visible, else fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# App configuration (must run before any other st.* call in a Streamlit app).
st.set_page_config(page_title="echoAI - IndexTTS", layout="wide")

# Create necessary directories: generated audio goes under outputs/,
# uploaded reference clips under prompts/.
os.makedirs("outputs/tasks", exist_ok=True)
os.makedirs("prompts", exist_ok=True)

# Download model checkpoints on first run only; the existence of the
# checkpoints/ directory is used as the "already downloaded" marker.
if not os.path.exists("checkpoints"):
    snapshot_download("IndexTeam/IndexTTS-1.5", local_dir="checkpoints")
# Load TTS model with GPU support
@st.cache_resource  # cache so the model is built once per Streamlit server, not per rerun
def load_model():
    """Build the IndexTTS engine from ./checkpoints and move it to DEVICE.

    Returns the ready-to-use IndexTTS instance.
    """
    tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")
    # Presumably prepares the text normalizer used before synthesis —
    # TODO(review): confirm this method exists on the installed IndexTTS version.
    tts.load_normalizer()
    if DEVICE == "cuda":
        # NOTE(review): assumes IndexTTS exposes a torch Module at `.model`;
        # some versions manage device placement internally — verify.
        tts.model.to(DEVICE) # Move model to GPU if available
    return tts

# Module-level singleton used by infer() below.
tts = load_model()
def infer(voice_path, text, output_path=None):
    """Synthesize *text* in the voice of the reference clip at *voice_path*.

    Any falsy *output_path* (None or "") is replaced with a timestamped
    wav file under outputs/. Returns the path of the generated file.
    """
    target = output_path or os.path.join("outputs", f"spk_{int(time.time())}.wav")
    # Delegate to the cached module-level IndexTTS engine.
    tts.infer(voice_path, text, target)
    return target
# Streamlit UI: page title and one-line tagline.
st.title("echoAI - IndexTTS")
st.markdown("""
An Industrial-Level Controllable and Efficient Zero-Shot Text-To-Speech System
""", unsafe_allow_html=True)

# Device status indicator so the user can see whether CUDA or CPU is in use.
st.sidebar.markdown(f"**Device:** {DEVICE.upper()}")
# Main interface: left column collects inputs, right column shows the result.
with st.container():
    st.header("Audio Generation") # Translated
    col1, col2 = st.columns(2)

    with col1:
        uploaded_audio = st.file_uploader(
            "Upload reference audio", # Translated
            type=["wav", "mp3", "ogg"],
            accept_multiple_files=False
        )
        input_text = st.text_area(
            "Input target text", # Translated
            height=150,
            placeholder="Enter text to synthesize..."
        )
        generate_btn = st.button("Generate Speech") # Translated

    with col2:
        if generate_btn and uploaded_audio and input_text:
            with st.spinner("Generating audio..."):
                # Persist the uploaded reference clip so the TTS engine can
                # read it from disk.
                audio_path = os.path.join("prompts", uploaded_audio.name)
                with open(audio_path, "wb") as f:
                    f.write(uploaded_audio.getbuffer())

                # Perform inference; surface any model error in the UI rather
                # than crashing the app.
                try:
                    output_path = infer(audio_path, input_text)
                    st.audio(output_path, format="audio/wav")
                    st.success("Generation complete!")
                    # Download button for the generated wav.
                    with open(output_path, "rb") as f:
                        st.download_button(
                            "Download Result", # Translated
                            f,
                            file_name=os.path.basename(output_path),
                            mime="audio/wav",  # explicit type so browsers save a proper .wav
                        )  # BUGFIX: this closing paren was missing (SyntaxError)
                except Exception as e:
                    st.error(f"Error: {str(e)}")
        elif generate_btn:
            st.warning("Please upload an audio file and enter text first!") # Translated
# Sidebar with additional info: feature list and short usage instructions.
with st.sidebar:
    st.header("About echoAI")
    st.markdown("""
    ### Key Features:
    - Zero-shot voice cloning
    - Industrial-grade TTS
    - Efficient synthesis
    - Controllable output
    """)
    st.markdown("---")
    st.markdown("""
    ### Usage Instructions:
    1. Upload a reference audio clip
    2. Enter target text
    3. Click 'Generate Speech'
    """)
# Placeholder entry guard: Streamlit executes this file top-to-bottom, so the
# app runs on import; nothing extra is needed here yet.
if __name__ == "__main__":
    # Cleanup old files if needed
    pass