from transformers import pipeline
import gradio as gr
import torch

# Updated model options with 2 new models
MODEL_OPTIONS = {
    "Whisper Tiny (Fastest)": "openai/whisper-tiny",
    "Whisper Base (Balanced)": "openai/whisper-base",
    "Whisper Small (Better Accuracy)": "openai/whisper-small",
    "Whisper Medium (High Accuracy)": "openai/whisper-medium",
    "Whisper Large (Highest Accuracy)": "openai/whisper-large",  # New model
    "Whisper Large-v2 (Latest)": "openai/whisper-large-v2"       # New model
}

# Language codes for Whisper
LANGUAGE_CODES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Dutch": "nl"
}

def transcribe_audio(audio_file, model_choice, task_choice, language_choice, timestamp_choice, beam_size):
    # Initialize the pipeline with selected model
    model_name = MODEL_OPTIONS[model_choice]
    task = "translate" if task_choice == "Translate to English" else "transcribe"
    language = LANGUAGE_CODES[language_choice]
    
    # Create pipeline
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=0 if torch.cuda.is_available() else -1
    )
    
    # Generate kwargs for the pipeline
    generate_kwargs = {
        "task": task,
        "num_beams": beam_size
    }
    if language and task == "transcribe":
        generate_kwargs["language"] = language
    
    # Process audio file
    if timestamp_choice:
        result = pipe(
            audio_file,
            generate_kwargs=generate_kwargs,
            return_timestamps=True
        )
        timestamp_text = "\n".join([
            f"[{chunk['timestamp'][0]:.2f}s -> {chunk['timestamp'][1]:.2f}s] {chunk['text']}"
            for chunk in result.get("chunks", [])
        ])
        return result["text"], timestamp_text, gr.update(visible=True)
    else:
        result = pipe(
            audio_file,
            generate_kwargs=generate_kwargs,
            return_timestamps=False
        )
        return result["text"], "", gr.update(visible=False)

with gr.Blocks() as demo:
    gr.Markdown("# 🎵 Audio Transcription & Translation")
    gr.Markdown("Upload an audio file or use your microphone to transcribe or translate speech.")
    
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Audio Input",
                type="filepath"
            )
            
            # Updated model selection with new models
            model_choice = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value="Whisper Tiny (Fastest)",
                label="Model Selection"
            )
            
            task_choice = gr.Radio(
                choices=["Transcribe", "Translate to English"],
                value="Transcribe",
                label="Task"
            )
            
            # Extended language options
            language_choice = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="Auto-detect",
                label="Language (for transcription)"
            )
            
            # New features
            timestamp_choice = gr.Checkbox(
                label="Include Timestamps",
                value=False
            )
            
            beam_size = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                label="Beam Size (Higher = Better Accuracy but Slower)"
            )
        
        with gr.Column():
            text_output = gr.Textbox(
                lines=15,
                label="Transcription",
                interactive=False
            )
            
            # New output for timestamps
            timestamp_output = gr.Textbox(
                lines=8,
                label="Timestamps (if enabled)",
                interactive=False,
                visible=False
            )
    
    transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
    
    transcribe_btn.click(
        transcribe_audio,
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
        outputs=[text_output, timestamp_output, timestamp_output]
    )
    
    gr.Examples(
        examples=[
            ["example_audio_1.wav", "Whisper Tiny (Fastest)", "Transcribe", "Auto-detect", False, 1],
            ["example_audio_2.wav", "Whisper Base (Balanced)", "Transcribe", "English", False, 1],
            ["example_audio_3.wav", "Whisper Small (Better Accuracy)", "Translate to English", "Auto-detect", False, 1],
            ["example_audio_4.wav", "Whisper Large (Highest Accuracy)", "Transcribe", "Spanish", True, 3]
        ],
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
    )
    
    gr.Markdown("### Features")
    gr.Markdown("- **Model Selection**: Choose from 6 different Whisper models with speed/accuracy tradeoffs")
    gr.Markdown("- **Task Options**: Transcribe audio in original language or translate to English")
    gr.Markdown("- **Language Selection**: Auto-detect or specify input language for better accuracy")
    gr.Markdown("- **Multiple Input Methods**: Upload audio files or record with microphone")
    gr.Markdown("- **Timestamps**: Option to include word-level timestamps")
    gr.Markdown("- **Beam Search**: Adjustable beam size for better accuracy")
    
    gr.Markdown("### Model Information")
    gr.Markdown("""
    | Model | Parameters | Speed | Best For |
    |-------|------------|-------|----------|
    | Whisper Tiny | 39M | Fastest | Quick transcriptions, low resources |
    | Whisper Base | 74M | Fast | Balanced performance |
    | Whisper Small | 244M | Medium | Better accuracy |
    | Whisper Medium | 769M | Slow | High accuracy transcriptions |
    | Whisper Large | 1.5B | Slower | Very high accuracy |
    | Whisper Large-v2 | 1.5B | Slower | Latest improvements |
    """)
    
    gr.Markdown("- **Supported Formats**: WAV, MP3, M4A, FLAC")
    gr.Markdown("- **Note**: First transcription may take 10-60 seconds (model loading)")

if __name__ == "__main__":
    demo.launch()