Update webui.py
Browse fileschanged overall streamlit code
webui.py
CHANGED
@@ -1,133 +1,190 @@
|
|
1 |
-
import streamlit as st
|
2 |
import os
|
3 |
import time
|
4 |
-
import
|
5 |
-
import torch
|
6 |
from huggingface_hub import snapshot_download
|
|
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
sys.path.append(os.path.join(current_dir, "indextts"))
|
11 |
-
|
12 |
from indextts.infer import IndexTTS
|
13 |
-
from tools.i18n.i18n import I18nAuto
|
14 |
-
|
15 |
-
# Initialize internationalization
|
16 |
-
i18n = I18nAuto(language="en") # Changed to English
|
17 |
-
|
18 |
-
# GPU configuration
|
19 |
-
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
20 |
-
|
21 |
-
# App configuration
|
22 |
-
st.set_page_config(page_title="echoAI - IndexTTS", layout="wide")
|
23 |
-
|
24 |
-
# Create necessary directories
|
25 |
-
os.makedirs("outputs/tasks", exist_ok=True)
|
26 |
-
os.makedirs("prompts", exist_ok=True)
|
27 |
-
|
28 |
-
# Download checkpoints if not exists
|
29 |
-
if not os.path.exists("checkpoints"):
|
30 |
-
snapshot_download("IndexTeam/IndexTTS-1.5", local_dir="checkpoints")
|
31 |
|
32 |
-
#
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
tts.load_normalizer()
|
37 |
-
|
38 |
-
tts.model.to(DEVICE) # Move model to GPU if available
|
39 |
return tts
|
40 |
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
-
# Inference function with device awareness
|
44 |
-
def infer(voice_path, text, output_path=None):
|
45 |
-
if not output_path:
|
46 |
-
output_path = os.path.join("outputs", f"spk_{int(time.time())}.wav")
|
47 |
-
|
48 |
-
# Ensure input is on correct device
|
49 |
-
tts.infer(voice_path, text, output_path)
|
50 |
return output_path
|
51 |
|
|
|
52 |
# Streamlit UI
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
<
|
60 |
-
|
61 |
-
""
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
#
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Standard library ---
import os
import shutil  # reserved for cleaning up old generated files; unused in this version
import time

# --- Third-party ---
import streamlit as st
from huggingface_hub import snapshot_download

# --- Local package ---
# Ensure 'indextts' is correctly installed or available in your environment/requirements.txt
from indextts.infer import IndexTTS

# ------------------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------------------

# Storage locations, relative to the root directory of the Spaces repository.
CHECKPOINT_DIR = "checkpoints"  # downloaded model weights/config
OUTPUT_DIR = "outputs"          # generated .wav files
PROMPTS_DIR = "prompts"         # uploaded reference audio

# Create the directories up front so later open()/write calls cannot fail with
# a missing-directory error. Hugging Face Spaces provides a writable filesystem.
for _dir in (CHECKPOINT_DIR, OUTPUT_DIR, PROMPTS_DIR):
    os.makedirs(_dir, exist_ok=True)

# Hugging Face Hub repository that holds the IndexTTS checkpoint, and the
# config filename expected inside CHECKPOINT_DIR after download.
MODEL_REPO = "IndexTeam/IndexTTS-1.5"
CFG_FILENAME = "config.yaml"
# ------------------------------------------------------------------------------
# Model loading (cached so it only runs once per resource identifier)
# ------------------------------------------------------------------------------

# @st.cache_resource is Streamlit's recommended way to cache large objects such
# as ML models: the decorated function body runs once per server process and
# the same object is reused on every subsequent script re-run. This prevents
# re-loading the model on every user interaction.
@st.cache_resource(show_spinner=False)
def load_tts_model():
    """Download the model snapshot and initialize the IndexTTS engine.

    Returns:
        IndexTTS: a ready-to-use TTS engine with its normalizer loaded.

    Cached via st.cache_resource so the download/initialization only happens
    once per process.
    """
    # NOTE(review): this st.write() executes before st.set_page_config()
    # further down the script on first load — Streamlit requires
    # set_page_config to be the first Streamlit command; confirm this does
    # not raise on the deployed Streamlit version.
    st.write("⏳ Loading model... This may take a moment.")
    # Download the model snapshot if not already present. Since
    # huggingface_hub v0.23 files are written directly into local_dir and the
    # deprecated local_dir_use_symlinks flag is ignored, so it is omitted here.
    snapshot_download(
        repo_id=MODEL_REPO,
        local_dir=CHECKPOINT_DIR,
    )
    # Initialize the TTS engine. The underlying IndexTTS library is expected
    # to pick the GPU when available — presumably via its own device probing;
    # verify against the indextts documentation.
    tts = IndexTTS(
        model_dir=CHECKPOINT_DIR,
        cfg_path=os.path.join(CHECKPOINT_DIR, CFG_FILENAME)
    )
    # Load the text normalizer / auxiliary data required before inference.
    tts.load_normalizer()
    st.write("✅ Model loaded!")
    return tts
|
# Load the TTS model via the cached loader.
# This line is executed on each Streamlit script re-run, but the function body
# only runs the first time (or if the cached resource is invalidated, e.g. by
# a code change to the function).
tts = load_tts_model()
# ------------------------------------------------------------------------------
# Inference function
# ------------------------------------------------------------------------------

def run_inference(reference_audio_path: str, text: str) -> str:
    """Run TTS inference conditioned on the uploaded reference audio.

    Args:
        reference_audio_path: Path to the saved reference audio file on disk.
        text: Target text to synthesize.

    Returns:
        Path to the generated .wav file inside OUTPUT_DIR.

    Raises:
        FileNotFoundError: If the reference audio does not exist on disk.
    """
    if not os.path.exists(reference_audio_path):
        raise FileNotFoundError(f"Reference audio not found at {reference_audio_path}")

    # Nanosecond-resolution timestamp: the previous int(time.time()) naming
    # collided (and silently overwrote output) when two generations happened
    # within the same second, e.g. with concurrent users on a shared Space.
    output_filename = f"generated_{time.time_ns()}.wav"
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    # Perform the TTS inference; duration depends on the IndexTTS library,
    # the text length and the available hardware.
    tts.infer(reference_audio_path, text, output_path)

    # Optional future work: prune old files in outputs/ and prompts/ if the
    # Space's disk fills up (e.g. remove files older than N hours/days).
    return output_path
# ------------------------------------------------------------------------------
# Streamlit UI
# ------------------------------------------------------------------------------

st.set_page_config(page_title="IndexTTS Demo", layout="wide")

# Centered title plus a link to the IndexTTS paper.
st.markdown(
    """
    <h1 style="text-align: center;">IndexTTS: Zero-Shot Controllable & Efficient TTS</h1>
    <p style="text-align: center;">
        <a href="https://arxiv.org/abs/2502.05512" target="_blank">
            View the paper on arXiv (2502.05512)
        </a>
    </p>
    """,
    unsafe_allow_html=True
)

# Sidebar: show where files live on the Space's filesystem.
st.sidebar.header("Settings")
with st.sidebar.expander("🗂️ Output Directories"):
    st.write(f"- Checkpoints: `{CHECKPOINT_DIR}`")
    st.write(f"- Generated audio: `{OUTPUT_DIR}`")
    st.write(f"- Uploaded prompts: `{PROMPTS_DIR}`")
    st.info("These directories are located within your Space's persistent storage.")


# Step 1: upload the voice-conditioning reference audio.
st.header("1. Upload Reference Audio")
ref_audio_file = st.file_uploader(
    label="Upload a reference audio (wav or mp3)",
    type=["wav", "mp3"],
    help="This audio will condition the voice characteristics.",
    key="ref_audio_uploader"
)

ref_path = None  # Path of the saved reference audio once one is uploaded.

if ref_audio_file:
    # Persist the upload into the prompts directory so the TTS engine can
    # read it back from disk.
    ref_path = os.path.join(PROMPTS_DIR, ref_audio_file.name)
    with open(ref_path, "wb") as sink:
        # getbuffer() avoids an extra copy for large uploads.
        sink.write(ref_audio_file.getbuffer())
    st.success(f"Saved reference audio: `{ref_audio_file.name}`")
    # Echo the uploaded audio back so the user can verify it.
    st.audio(ref_path, format="audio/wav")


# Step 2: collect the text to synthesize.
st.header("2. Enter Text to Synthesize")
text_input = st.text_area(
    label="Enter the text you want to convert to speech",
    placeholder="Type your sentence here...",
    key="text_input_area"
)

generate_button = st.button("Generate Speech", key="generate_tts_button")

# ------------------------------------------------------------------------------
# Trigger Inference and Display Results
# ------------------------------------------------------------------------------

# Runs only on the script re-run triggered by the button click.
if generate_button:
    if not ref_path or not os.path.exists(ref_path):
        st.error("Please upload a reference audio first.")
    elif not text_input or not text_input.strip():
        st.error("Please enter some text to synthesize.")
    else:
        with st.spinner("🚀 Generating speech..."):
            try:
                generated_wav = run_inference(ref_path, text_input)
                # Guard against the backend silently producing nothing.
                if os.path.exists(generated_wav):
                    st.success("🎉 Done! Here’s your generated audio:")
                    st.audio(generated_wav, format="audio/wav")
                else:
                    st.error("Generation failed: Output file was not created.")
            except Exception as e:
                st.error(f"An error occurred during inference: {e}")
                # To surface the full traceback in the app while debugging:
                # st.exception(e)

# Footer.
st.markdown("---")
st.markdown("Demo powered by [IndexTTS](https://arxiv.org/abs/2502.05512) and built with Streamlit.")