#
# ----- Prerequisites -----
# 1. Install required Python libraries:
#        pip install gradio transformers torch gtts langdetect
#
# 2. Install ffmpeg on your system.
#    - (Mac)     brew install ffmpeg
#    - (Ubuntu)  sudo apt install ffmpeg
#    - (Windows) choco install ffmpeg
#

import gradio as gr
import subprocess
import os
import shutil
import uuid
from transformers import pipeline
from gtts import gTTS
from langdetect import detect, DetectorFactory

# Ensure deterministic language detection results
DetectorFactory.seed = 0

# --- 1. Load the model only once ---
# Loading at module level is more efficient: the model is not reloaded on
# every function call.
print("Loading Whisper model, this may take a moment...")
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",  # "tiny" for speed; switch to base, small, etc. for accuracy
        device="cpu"  # Use "cuda:0" if you have a GPU and a CUDA build of torch
    )
    print("Whisper model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # Leave the pipeline unset; translate_video() checks for this and aborts.
    asr_pipeline = None


def translate_video(video_path):
    """
    Translates the audio of a video file to English, yielding detailed
    output as each step completes.
    """
    if not asr_pipeline:
        gr.Warning("The speech recognition model is not available. The application cannot proceed.")
        yield "Model not loaded.", None, None, None, None
        return

    # Create a unique temporary directory for this run
    temp_dir = f"temp_{uuid.uuid4()}"
    os.makedirs(temp_dir, exist_ok=True)

    try:
        gr.Info("Step 1/5: Extracting audio from video...")
        audio_path = os.path.join(temp_dir, "audio.wav")
        # Use ffmpeg to extract audio:
        #   -y                 overwrite existing files
        #   -vn                drop the video stream
        #   -acodec pcm_s16le  standard codec for .wav
        #   -ar 16000          the sample rate Whisper expects
        #   -ac 1              downmix to mono
        command = [
            "ffmpeg", "-i", video_path, "-y", "-vn",
            "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
            audio_path
        ]
        subprocess.run(command, check=True, capture_output=True, text=True)

        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed: ffmpeg did not produce an audio file.")

        # --- 2. Transcribe the original audio to text ---
        gr.Info("Step 2/5: Transcribing original audio...")
        transcription_result = asr_pipeline(
            audio_path,
            return_timestamps=True,  # Required for audio longer than 30 seconds
            generate_kwargs={"task": "transcribe"}
        )
        original_transcript = transcription_result["text"].strip()

        if not original_transcript:
            gr.Warning("No speech was detected in the video.")
            yield "No speech detected.", "N/A", "N/A", None, video_path
            return

        yield "", original_transcript, "", None, video_path

        # --- 3. Detect the language of the original transcript ---
        gr.Info("Step 3/5: Detecting language...")
        try:
            detected_language_code = detect(original_transcript)
            # langdetect returns an ISO 639-1 code; you can map it to a full
            # language name if desired -- see the sketch just below.
        except Exception:
            detected_language_code = "Unknown"
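        # A minimal sketch of the mapping suggested above (illustrative only:
        # LANG_NAMES is our own name, and the table is deliberately small;
        # unmapped codes fall back to the raw code).
        LANG_NAMES = {
            "es": "Spanish", "fr": "French", "de": "German",
            "ja": "Japanese", "zh-cn": "Chinese (Simplified)",
        }
        detected_language_code = LANG_NAMES.get(detected_language_code, detected_language_code)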
        summary_markdown = f"""
## Translation Details
- **Detected Language**: `{detected_language_code}`
---
"""
        yield summary_markdown, original_transcript, "", None, video_path

        # --- 4. Translate the audio into English ---
        gr.Info("Step 4/5: Translating audio to English...")
        translation_result = asr_pipeline(
            audio_path,
            return_timestamps=True,
            # Whisper's "translate" task always targets English; the `language`
            # generate_kwarg specifies the *source* language, so it is left
            # unset here to let the model auto-detect it.
            generate_kwargs={"task": "translate"}
        )
        translated_text = translation_result["text"].strip()

        # Append the translation to the summary markdown
        summary_markdown += f"""
### Translated Text (English)
{translated_text}
"""
        yield summary_markdown, original_transcript, translated_text, None, video_path

        # --- 5. Convert translated text to speech ---
        gr.Info("Step 5/5: Generating translated audio...")
        tts = gTTS(translated_text, lang='en')
        translated_audio_path = os.path.join(temp_dir, "translated_audio.mp3")
        tts.save(translated_audio_path)

        yield summary_markdown, original_transcript, translated_text, translated_audio_path, video_path

    except subprocess.CalledProcessError as e:
        error_message = f"ffmpeg error: {e.stderr}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    except Exception as e:
        error_message = f"An unexpected error occurred: {str(e)}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    finally:
        # Clean up the temporary directory. Gradio copies yielded files into
        # its own cache, so deleting them here is safe.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)


# --- Create the Gradio interface ---
iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Upload Your Video", sources=['upload']),
    outputs=[
        gr.Markdown(label="Summary"),
        gr.Textbox(label="Original Transcript", interactive=False, lines=5),
        gr.Textbox(label="Translated Text (English)", interactive=False, lines=5),
        gr.Audio(label="Translated Audio (English)"),
        gr.Video(label="Original Video"),
    ],
    title="Enhanced Video Translator",
    description="Upload a video to transcribe its audio, detect the language, and translate it to English. Provides the original transcript, the translated text, and translated audio.",
    allow_flagging="never",
    # To show examples, place video files in a folder named 'examples' next to
    # this script and list them here, e.g.:
    # examples=[[os.path.join(os.path.dirname(__file__), "examples/example_video_1.mp4")]],
)

if __name__ == "__main__":
    iface.launch()
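# Note: translate_video is a generator, so partial results stream to the UI as
# each step finishes. Gradio 4+ queues generator functions automatically; on
# older 3.x releases you may need to enable the queue explicitly, e.g.:
#
#   iface.queue().launch()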