Spaces:

nineninesix
/

Kyrgyz-Speech-To-Text

Running on Zero

File size: 2,464 Bytes

import gradio as gr
import spaces
from utils import InitModels, ModelConfigs


class KyrgyzSTTApp:
    """Gradio application wrapping the Kyrgyz speech-to-text models.

    The app owns an ``InitModels`` manager, asks it to initialize every
    model at construction time, and exposes a Gradio ``Interface`` whose
    callback is :meth:`transcribe`.
    """

    def __init__(self):
        """Create the model manager and initialize the models up front."""
        self.model_manager = InitModels()
        self._initialize_models()

    def _initialize_models(self) -> None:
        """Ask the model manager to initialize all available models."""
        self.model_manager.initialize_all_models()

    @spaces.GPU
    def transcribe(self, audio, model_name: str) -> str:
        """Transcribe an audio file with the selected model.

        Args:
            audio: Path to the audio file, as produced by the
                ``gr.Audio(type="filepath")`` input. Gradio passes ``None``
                when the user submits without recording/uploading anything.
            model_name: Name of the model to use (one of the dropdown choices).

        Returns:
            The transcribed text.

        Raises:
            gr.Error: If no audio or no model name was supplied, so the UI
                shows a readable message instead of a pipeline traceback.
        """
        # Reject empty submissions before touching any model state.
        if not audio:
            raise gr.Error("Please record or upload an audio file first.")
        if not model_name:
            raise gr.Error("Please select a model.")

        # Imported lazily, as in the original code, so torch is only touched
        # inside the GPU-decorated call.
        import torch

        # Move the model to the GPU when one is visible, otherwise stay on CPU.
        model = self.model_manager.get_model(model_name)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)

        pipe = self.model_manager.get_pipeline(model_name)
        tokenizer = self.model_manager.get_tokenizer(model_name)

        # Point the pipeline at the (possibly relocated) model and its device
        # so inputs are placed on the same device as the weights.
        pipe.model = model
        pipe.device = torch.device(device)

        text = pipe(audio)["text"]

        # Debug aid: log the transcript after a tokenizer encode/decode
        # round trip.
        print(tokenizer.decode(tokenizer(text).input_ids))

        return text

    def create_interface(self) -> gr.Interface:
        """Build the Gradio interface for the app.

        Returns:
            A ``gr.Interface`` with an audio input, a model dropdown and a
            transcript textbox, wired to :meth:`transcribe`.
        """
        model_choices = list(ModelConfigs.get_all_configs().keys())

        # Prefer the second configured model (the original code labels it
        # "Medium"); fall back gracefully instead of raising IndexError when
        # fewer than two models are configured.
        if len(model_choices) > 1:
            default_model = model_choices[1]
        elif model_choices:
            default_model = model_choices[0]
        else:
            default_model = None

        iface = gr.Interface(
            fn=self.transcribe,
            inputs=[
                gr.Audio(type="filepath", label="Audio Input"),
                gr.Dropdown(
                    choices=model_choices,
                    value=default_model,
                    label="Select Model"
                )
            ],
            outputs=gr.Textbox(
                label="Transcript",
                lines=5,
                show_copy_button=True
            ),
            title="Kyrgyz Speech-to-Text",
            description="Multi-language speech recognition for Kyrgyz, English, and Russian.\nAudio must be up to 30 seconds long!",
            theme=gr.themes.Ocean()
        )

        return iface

    def launch(self) -> None:
        """Create the Gradio interface and launch it."""
        iface = self.create_interface()
        iface.launch()


if __name__ == "__main__":
    # Script entry point: build the app (which initializes the models) and
    # start the Gradio server.
    KyrgyzSTTApp().launch()