# Simonlob's picture
# Update app.py
# 752db35 verified
import gradio as gr
import spaces
from utils import InitModels, ModelConfigs
class KyrgyzSTTApp:
    """Kyrgyz Speech-to-Text application.

    Wires an ``InitModels`` manager (which supplies the models, pipelines and
    tokenizers) to a Gradio interface with an audio input and a model picker.
    """

    def __init__(self):
        """Create the model manager and initialize every model up front."""
        self.model_manager = InitModels()
        self._initialize_models()

    def _initialize_models(self) -> None:
        """Initialize all available models through the model manager."""
        self.model_manager.initialize_all_models()

    @spaces.GPU
    def transcribe(self, audio, model_name: str) -> str:
        """Transcribe audio using the selected model.

        Args:
            audio: Audio file path, as produced by the ``gr.Audio`` input
                (``type="filepath"``). ``None`` when the user submitted the
                form without recording or uploading anything.
            model_name: Name of the model to use; passed to the model
                manager to look up the model, pipeline and tokenizer.

        Returns:
            Transcribed text.

        Raises:
            gr.Error: If no audio was provided.
        """
        # Fail early with a readable message instead of letting the pipeline
        # raise an opaque error on a missing input.
        if audio is None:
            raise gr.Error("Please record or upload an audio clip first.")

        # Function-local import, kept where the original placed it.
        import torch

        # Pick the GPU when one is visible, otherwise fall back to CPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Get model and move it to the selected device.
        model = self.model_manager.get_model(model_name)
        model.to(device)

        # Get pipeline and tokenizer for the same model name.
        pipe = self.model_manager.get_pipeline(model_name)
        tokenizer = self.model_manager.get_tokenizer(model_name)

        # Point the pipeline at the moved model and its device so inference
        # runs where the weights now live.
        pipe.model = model
        pipe.device = torch.device(device)

        text = pipe(audio)["text"]

        # Debug output: round-trip the transcript through the tokenizer and
        # print the decoded result to stdout.
        print(tokenizer.decode(tokenizer(text).input_ids))
        return text

    def create_interface(self) -> gr.Interface:
        """Create and configure the Gradio interface.

        Returns:
            A ``gr.Interface`` bound to :meth:`transcribe`, with an audio
            input, a model dropdown and a transcript textbox.
        """
        model_choices = list(ModelConfigs.get_all_configs().keys())

        # Prefer the second configured model (the original default, noted
        # there as "Medium"); fall back to the first one, or to no
        # preselection, when fewer models are configured so that building
        # the UI never raises IndexError.
        if len(model_choices) > 1:
            default_model = model_choices[1]
        elif model_choices:
            default_model = model_choices[0]
        else:
            default_model = None

        iface = gr.Interface(
            fn=self.transcribe,
            inputs=[
                gr.Audio(type="filepath", label="Audio Input"),
                gr.Dropdown(
                    choices=model_choices,
                    value=default_model,
                    label="Select Model",
                ),
            ],
            outputs=gr.Textbox(
                label="Transcript",
                lines=5,
                show_copy_button=True,
            ),
            title="Kyrgyz Speech-to-Text",
            description="Multi-language speech recognition for Kyrgyz, English, and Russian.\nAudio must be up to 30 seconds long!",
            theme=gr.themes.Ocean(),
        )
        return iface

    def launch(self) -> None:
        """Build the Gradio interface and launch it."""
        iface = self.create_interface()
        iface.launch()
# Script entry point: build the application (which initializes the models)
# and start serving the Gradio interface.
if __name__ == "__main__":
    KyrgyzSTTApp().launch()