from transformers import pipeline import gradio as gr import torch # Updated model options with 2 new models MODEL_OPTIONS = { "Whisper Tiny (Fastest)": "openai/whisper-tiny", "Whisper Base (Balanced)": "openai/whisper-base", "Whisper Small (Better Accuracy)": "openai/whisper-small", "Whisper Medium (High Accuracy)": "openai/whisper-medium", "Whisper Large (Highest Accuracy)": "openai/whisper-large", # New model "Whisper Large-v2 (Latest)": "openai/whisper-large-v2" # New model } # Language codes for Whisper LANGUAGE_CODES = { "Auto-detect": None, "English": "en", "Spanish": "es", "French": "fr", "German": "de", "Italian": "it", "Portuguese": "pt", "Russian": "ru", "Chinese": "zh", "Japanese": "ja", "Korean": "ko", "Arabic": "ar", "Hindi": "hi", "Dutch": "nl" } def transcribe_audio(audio_file, model_choice, task_choice, language_choice, timestamp_choice, beam_size): # Initialize the pipeline with selected model model_name = MODEL_OPTIONS[model_choice] task = "translate" if task_choice == "Translate to English" else "transcribe" language = LANGUAGE_CODES[language_choice] # Create pipeline pipe = pipeline( "automatic-speech-recognition", model=model_name, chunk_length_s=30, device=0 if torch.cuda.is_available() else -1 ) # Generate kwargs for the pipeline generate_kwargs = { "task": task, "num_beams": beam_size } if language and task == "transcribe": generate_kwargs["language"] = language # Process audio file if timestamp_choice: result = pipe( audio_file, generate_kwargs=generate_kwargs, return_timestamps=True ) timestamp_text = "\n".join([ f"[{chunk['timestamp'][0]:.2f}s -> {chunk['timestamp'][1]:.2f}s] {chunk['text']}" for chunk in result.get("chunks", []) ]) return result["text"], timestamp_text, gr.update(visible=True) else: result = pipe( audio_file, generate_kwargs=generate_kwargs, return_timestamps=False ) return result["text"], "", gr.update(visible=False) with gr.Blocks() as demo: gr.Markdown("# 🎵 Audio Transcription & Translation") gr.Markdown("Upload an audio file or use your microphone to transcribe or translate speech.") with gr.Row(): with gr.Column(): audio_input = gr.Audio( label="Audio Input", type="filepath" ) # Updated model selection with new models model_choice = gr.Dropdown( choices=list(MODEL_OPTIONS.keys()), value="Whisper Tiny (Fastest)", label="Model Selection" ) task_choice = gr.Radio( choices=["Transcribe", "Translate to English"], value="Transcribe", label="Task" ) # Extended language options language_choice = gr.Dropdown( choices=list(LANGUAGE_CODES.keys()), value="Auto-detect", label="Language (for transcription)" ) # New features timestamp_choice = gr.Checkbox( label="Include Timestamps", value=False ) beam_size = gr.Slider( minimum=1, maximum=10, value=1, step=1, label="Beam Size (Higher = Better Accuracy but Slower)" ) with gr.Column(): text_output = gr.Textbox( lines=15, label="Transcription", interactive=False ) # New output for timestamps timestamp_output = gr.Textbox( lines=8, label="Timestamps (if enabled)", interactive=False, visible=False ) transcribe_btn = gr.Button("Transcribe Audio", variant="primary") transcribe_btn.click( transcribe_audio, inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size], outputs=[text_output, timestamp_output, timestamp_output] ) gr.Examples( examples=[ ["example_audio_1.wav", "Whisper Tiny (Fastest)", "Transcribe", "Auto-detect", False, 1], ["example_audio_2.wav", "Whisper Base (Balanced)", "Transcribe", "English", False, 1], ["example_audio_3.wav", "Whisper Small (Better Accuracy)", "Translate to English", "Auto-detect", False, 1], ["example_audio_4.wav", "Whisper Large (Highest Accuracy)", "Transcribe", "Spanish", True, 3] ], inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size], ) gr.Markdown("### Features") gr.Markdown("- **Model Selection**: Choose from 6 different Whisper models with speed/accuracy tradeoffs") gr.Markdown("- **Task Options**: Transcribe audio in original language or translate to English") gr.Markdown("- **Language Selection**: Auto-detect or specify input language for better accuracy") gr.Markdown("- **Multiple Input Methods**: Upload audio files or record with microphone") gr.Markdown("- **Timestamps**: Option to include word-level timestamps") gr.Markdown("- **Beam Search**: Adjustable beam size for better accuracy") gr.Markdown("### Model Information") gr.Markdown(""" | Model | Parameters | Speed | Best For | |-------|------------|-------|----------| | Whisper Tiny | 39M | Fastest | Quick transcriptions, low resources | | Whisper Base | 74M | Fast | Balanced performance | | Whisper Small | 244M | Medium | Better accuracy | | Whisper Medium | 769M | Slow | High accuracy transcriptions | | Whisper Large | 1.5B | Slower | Very high accuracy | | Whisper Large-v2 | 1.5B | Slower | Latest improvements | """) gr.Markdown("- **Supported Formats**: WAV, MP3, M4A, FLAC") gr.Markdown("- **Note**: First transcription may take 10-60 seconds (model loading)") if __name__ == "__main__": demo.launch()