import gc

import librosa
import torch
from transformers import pipeline

from model_api import clear_gpu_cache, get_device_and_dtype


def transcribe_audio(
    audio_path: str,
    device: str = "cuda",
    torch_dtype: torch.dtype = torch.float16,
) -> str:
    """Transcribe an audio file to text with the ``openai/whisper-small`` model.

    A fresh ``automatic-speech-recognition`` pipeline is built on every call,
    configured with 25-second chunks and a 2-second stride, and released
    again before the function returns.

    Args:
        audio_path: Path to the audio file; passed to the pipeline unchanged.
        device: Device handed to the pipeline (e.g. "cuda" or "cpu").
        torch_dtype: Torch data type used for model computations.

    Returns:
        The value of the ``"text"`` entry of the pipeline result, or an empty
        string if building the pipeline or running it raised an exception
        (the error is printed, not re-raised).
    """
    # Bind the name before the ``try`` so the cleanup in ``finally`` is valid
    # even when ``pipeline(...)`` itself fails. Without this, ``del pipe``
    # raised UnboundLocalError, which replaced the "" fallback and skipped
    # ``clear_gpu_cache()``.
    pipe = None
    try:
        pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
            device=device,
            chunk_length_s=25,
            stride_length_s=2,
            torch_dtype=torch_dtype,
        )

        # Timestamps are requested from the pipeline, but only the combined
        # text is returned to the caller.
        result = pipe(audio_path, return_timestamps=True)
        return result["text"]
    except Exception as e:
        # Best-effort contract: report the failure and return an empty string.
        print(f"Error during transcription: {e}")
        return ""
    finally:
        # Drop this frame's reference to the pipeline before asking the
        # project helper to clear the GPU cache.
        del pipe
        clear_gpu_cache()


if __name__ == "__main__":
    selected_device, selected_dtype = get_device_and_dtype()
    result = transcribe_audio(
        "/workspaces/Video_Analyser/app_srv/downloads/45677153-510d-4f47-95ee-c1b4b0843433/audio.mp3.mp3",
        selected_device,
        selected_dtype,
    )
    print(result)