Add examples table

Files changed:
- app.py (+28, -92)
- requirements.txt (+1, -1)
- util.py (+3, -3)
app.py
CHANGED

@@ -39,14 +39,12 @@ token_ = os.getenv('HF_TOKEN')
 
 # Model configurations
 models_configs = {
-    '…
-    '…
+    'base': Config(),
+    'female': Config(
         model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
-        temperature=0.2
     ),
-    '…
+    'male': Config(
         model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
-        temperature=0.2
     )
 }
 
@@ -61,31 +59,11 @@ for model_name, config in models_configs.items():
 print("All models loaded!")
 
 
-
-# def initialize_models():
-#     """Initialize models globally to avoid reloading"""
-#     global models
-
-#     # if player is None:
-#     #     print("Initializing NeMo Audio Player...")
-#     #     player = NemoAudioPlayer(Config())
-#     #     print("NeMo Audio Player initialized!")
-
-#     if not models:
-#         print("Loading TTS models...")
-#         for model_name, config in models_configs.items():
-#             print(f"Loading {model_name}...")
-#             models[model_name] = KaniModel(config, player, token_)
-#             print(f"{model_name} loaded!")
-#         print("All models loaded!")
-
 @spaces.GPU
 def generate_speech_gpu(text, model_choice):
     """
     Generate speech from text using the selected model on GPU
     """
-    # Initialize models if not already done
-    # initialize_models()
 
     if not text.strip():
         return None, "Please enter text for speech generation."
@@ -114,16 +92,8 @@ def generate_speech_gpu(text, model_choice):
         print(f"Error during generation: {str(e)}")
         return None, f"❌ Error during generation: {str(e)}"
 
-# def validate_input(text, model_choice):
-#     """Quick validation without GPU"""
-#     if not text.strip():
-#         return "⚠️ Please enter text for speech generation."
-#     if not model_choice:
-#         return "⚠️ Please select a model."
-#     return f"✅ Ready to generate with {model_choice}"
-
 # Create Gradio interface
-with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
+with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
     gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
     gr.Markdown("Select a model and enter text to generate high-quality speech")
 
@@ -137,20 +107,18 @@ with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as d
             )
 
             text_input = gr.Textbox(
-                label="…
-                placeholder="Enter text …
+                label="Text",
+                placeholder="Enter your text ...",
                 lines=3,
                 max_lines=10
             )
 
             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
-
-            # Quick validation button (CPU only)
-            # validate_btn = gr.Button("🔍 Validate Input", variant="secondary")
+
 
         with gr.Column(scale=1):
             audio_output = gr.Audio(
-                label="Generated …
+                label="Generated Audio",
                 type="numpy"
             )
 
@@ -168,64 +136,32 @@ with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as d
         outputs=[audio_output, time_report_output]
     )
 
-
-    gr.Markdown("## 🎯 Demo Examples")
+    gr.Markdown("## Examples")
 
     def play_demo(text):
         return (22050, demo_examples[text]), 'DEMO'
 
     with gr.Row():
-        for text in list(demo_examples.keys())[:4]:
-            gr.Button(text).click(lambda t=text: play_demo(t), outputs=[audio_output, time_report_output])
-
-    with gr.Row():
-        for text in list(demo_examples.keys())[4:8]:
-            gr.Button(text).click(lambda t=text: play_demo(t), outputs=[audio_output, time_report_output])
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    #     "Welcome to the world of artificial intelligence.",
-    #     "This is a demonstration of neural text-to-speech synthesis.",
-    #     "Zero GPU makes high-quality speech generation accessible to everyone!"
-    # ]
-
-    # gr.Examples(
-    #     examples=examples,
-    #     inputs=text_input,
-    #     label="Click on an example to use it"
-    # )
-
-    # # Information section
-    # with gr.Accordion("ℹ️ Model Information", open=False):
-    #     gr.Markdown("""
-    #     **Available Models:**
-    #     - **Base Model**: Default pre-trained model for general use
-    #     - **Female Voice**: Optimized for female voice characteristics
-    #     - **Male Voice**: Optimized for male voice characteristics
-
-    #     **Features:**
-    #     - Powered by NVIDIA NeMo Toolkit
-    #     - High-quality 22kHz audio output
-    #     - Zero GPU acceleration for fast inference
-    #     - Support for long text sequences
-    #     """)
+        examples = [
+            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male"],
+            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male"],
+            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male"],
+            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female"],
+            ["Holy fu- Oh my God! Don't you understand how dangerous it is, huh?", "male"],
+            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female"],
+            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "female"],
+            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female"],
+        ]
+
+
+        gr.Examples(
+            examples=examples,
+            inputs=[text_input, model_dropdown],
+            outputs=audio_output,
+            fn=lambda t=text_input: play_demo(t), outputs=[audio_output, time_report_output],
+            cache_examples=True,
+        )
 
 if __name__ == "__main__":
     demo.launch(
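A note on the new gr.Examples block above: as committed, the call passes the `outputs` keyword twice (once as `outputs=audio_output`, again on the `fn=` line), which Python rejects as a repeated keyword argument, and `fn=lambda t=text_input: play_demo(t)` defaults `t` to the Textbox component itself even though gr.Examples supplies one value per listed input. Below is a minimal sketch of how this wiring could look instead, assuming `demo_examples` still maps each sentence to a prefetched waveform (as `play_demo`'s indexing implies) and that the snippet sits inside the same gr.Blocks/gr.Row context; it is an illustration, not the committed code:

    # Sketch: gr.Examples calls fn with one argument per input component,
    # so the handler accepts the example's text and voice values.
    def play_demo(text, voice=None):
        # 22050 Hz matches the sample rate used by play_demo in app.py.
        return (22050, demo_examples[text]), "DEMO"

    gr.Examples(
        examples=examples,                      # list of [text, voice] rows
        inputs=[text_input, model_dropdown],    # one column per input component
        outputs=[audio_output, time_report_output],
        fn=play_demo,                           # run once per row when caching
        cache_examples=True,                    # precompute outputs at startup
    )

With cache_examples=True, Gradio evaluates fn for every row when the Space builds, so clicking an example fills the inputs and plays the cached audio without a GPU call.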
    	
requirements.txt
CHANGED

@@ -1,5 +1,5 @@
 torch==2.8.0
 librosa==0.11.0
-nemo_toolkit[…
+nemo_toolkit[tts]==2.4.0
 numpy==1.26.4
 gradio>=4.0.0
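The narrowed extra suggests the Space only needs NeMo's text-to-speech stack; reproducing the environment outside the Space would presumably be `pip install "nemo_toolkit[tts]==2.4.0"` alongside the other pinned packages, rather than installing the full toolkit.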
    	
util.py
CHANGED

@@ -197,7 +197,7 @@ class KaniModel:
         model_request = point_2 - point_1
         player_time = point_3 - point_2
         total_time = point_3 - point_1
-        report = f"…
+        report = f"SPEECH TOKENS: {model_request:.2f}\n CODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
         return report
 
     def run_model(self, text: str):
@@ -256,9 +256,9 @@ class Demo:
         return arr
 
     def __call__(self):
-        examples = …
+        examples = []
         for idx, (sentence, url) in enumerate(zip(self.sentences, self.urls), start=1):
             filename = f"{idx}.wav"
             filepath = self.download_audio(url, filename)
-            examples[sentence…
+            examples.append([sentence, self.get_audio(filepath)])
         return examples
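The Demo.__call__ change above swaps a dict keyed by sentence for a list of [sentence, audio] rows, the row-per-example shape that gr.Examples consumes; the old dict form matched the previous per-button demo in app.py, which looked clips up via demo_examples[text]. A short annotated sketch of the new flow, assuming download_audio saves a clip locally and returns its path and get_audio loads it as a waveform array (both as used elsewhere in util.py):

    def __call__(self):
        # Build one [sentence, audio] row per prefetched clip, in the order
        # the sentences and URLs are listed on the Demo instance.
        examples = []
        for idx, (sentence, url) in enumerate(zip(self.sentences, self.urls), start=1):
            filename = f"{idx}.wav"                                 # 1.wav, 2.wav, ...
            filepath = self.download_audio(url, filename)           # assumed: local file path
            examples.append([sentence, self.get_audio(filepath)])   # assumed: NumPy waveform
        return examples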
 
			
