# Video-Translate / app.py
#
# ----- Prerequisites -----
# 1. Install required Python libraries:
# pip install gradio transformers torch gtts langdetect
#
# 2. Install ffmpeg on your system.
# - (Mac) brew install ffmpeg
# - (Ubuntu) sudo apt install ffmpeg
# - (Windows) choco install ffmpeg
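# - (Hugging Face Spaces) add a line reading "ffmpeg" to a packages.txt file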
#
import gradio as gr
import subprocess
import os
import shutil
import uuid
from transformers import pipeline
from gtts import gTTS
from langdetect import detect, DetectorFactory
# Ensure deterministic language detection results
DetectorFactory.seed = 0
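# Optional fail-fast check (a minimal sketch; shutil.which is stdlib): warn at
# startup if ffmpeg is missing from PATH, since audio extraction depends on it.
if shutil.which("ffmpeg") is None:
    print("Warning: ffmpeg was not found on PATH; audio extraction will fail.")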
# --- 1. Load the model only once ---
# This is more efficient as it won't reload the model on every function call.
print("Loading Whisper model, this may take a moment...")
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",  # Using tiny for speed; can be changed to base, small, etc.
        device="cpu"  # Use "cuda:0" if you have a GPU and torch with CUDA
    )
    print("Whisper model loaded successfully.")
except Exception as e:
print(f"Error loading model: {e}")
# Exit or handle the error appropriately if the model is critical
asr_pipeline = None
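# A minimal sketch for picking the device automatically (assumes the torch
# dependency listed in the prerequisites; uncomment to use):
# import torch
# asr_device = "cuda:0" if torch.cuda.is_available() else "cpu"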
def translate_video(video_path):
"""
Translates the audio of a video file to English and provides detailed output.
"""
    if not asr_pipeline:
        gr.Warning("The speech recognition model is not available. The application cannot proceed.")
        yield "Model not loaded.", None, None, None, None
        return
    # Create a unique temporary directory for this run
    temp_dir = f"temp_{uuid.uuid4()}"
    os.makedirs(temp_dir, exist_ok=True)
    try:
        gr.Info("Step 1/5: Extracting audio from video...")
        audio_path = os.path.join(temp_dir, "audio.wav")
        # Use ffmpeg to extract audio. -y overwrites existing files. -i is input.
        # -vn disables video recording. -acodec pcm_s16le is standard for .wav.
        # -ar 16000 is the sample rate Whisper expects; -ac 1 downmixes to mono.
        command = [
            "ffmpeg", "-i", video_path, "-y",
            "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
            audio_path
        ]
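        # Equivalent shell invocation, for reference (file names are placeholders):
        #   ffmpeg -i input.mp4 -y -vn -acodec pcm_s16le -ar 16000 -ac 1 audio.wav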
        subprocess.run(command, check=True, capture_output=True, text=True)
        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed. ffmpeg did not produce an audio file.")
        # --- 2. Transcribe the original audio to text ---
        gr.Info("Step 2/5: Transcribing original audio...")
        transcription_result = asr_pipeline(
            audio_path,
            return_timestamps=True,  # Needed so the pipeline can chunk audio longer than 30 seconds
            generate_kwargs={"task": "transcribe"}
        )
        original_transcript = transcription_result["text"].strip()
        if not original_transcript:
            gr.Warning("No speech was detected in the video.")
            yield "No speech detected.", "N/A", "N/A", None, video_path
            return
        # Stream the transcript to the UI while the remaining steps run.
        yield "", original_transcript, "", None, video_path
        # --- 3. Detect the language of the original transcript ---
        gr.Info("Step 3/5: Detecting language...")
        try:
            detected_language_code = detect(original_transcript)
            # The two-letter code can be expanded to a full language name with a
            # dictionary; see the sketch below.
        except Exception:
            detected_language_code = "Unknown"
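        # A minimal sketch of that lookup (the mapping is illustrative and
        # incomplete; uncomment to show full names in the summary):
        # lang_map = {"es": "Spanish", "fr": "French", "de": "German", "ja": "Japanese"}
        # detected_language_code = lang_map.get(detected_language_code, detected_language_code)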
        summary_markdown = f"""
## Translation Details
- **Detected Language**: `{detected_language_code}`
---
"""
        yield summary_markdown, original_transcript, "", None, video_path
        # --- 4. Translate the audio into English ---
        gr.Info("Step 4/5: Translating audio to English...")
        translation_result = asr_pipeline(
            audio_path,
            return_timestamps=True,
            generate_kwargs={"task": "translate"}  # Whisper's translate task always targets English
        )
        translated_text = translation_result["text"].strip()
        # Append the translation to the summary markdown
        summary_markdown += f"""
### Translated Text (English)
{translated_text}
"""
        yield summary_markdown, original_transcript, translated_text, None, video_path
        # --- 5. Convert translated text to speech ---
        gr.Info("Step 5/5: Generating translated audio...")
        tts = gTTS(translated_text, lang='en')
        translated_audio_path = os.path.join(temp_dir, "translated_audio.mp3")
        tts.save(translated_audio_path)
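        # Note: gTTS synthesizes speech via Google's online TTS endpoint, so
        # this step requires network access at runtime.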
        # Final update: the summary and text again, plus the generated audio file.
        yield summary_markdown, original_transcript, translated_text, translated_audio_path, video_path
    except subprocess.CalledProcessError as e:
        error_message = f"ffmpeg error: {e.stderr}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    except Exception as e:
        error_message = f"An unexpected error occurred: {str(e)}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    finally:
        # Clean up the temporary directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
# --- Create the Gradio interface ---
iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Upload Your Video", sources=['upload']),
    outputs=[
        gr.Markdown(label="Summary"),
        gr.Textbox(label="Original Transcript", interactive=False, lines=5),
        gr.Textbox(label="Translated Text (English)", interactive=False, lines=5),
        gr.Audio(label="Translated Audio (English)"),
        gr.Video(label="Original Video"),
    ],
    title="Enhanced Video Translator",
    description="Upload a video to transcribe its audio, detect the language, and translate it to English. Provides original transcript, translated text, and translated audio.",
    allow_flagging="never",
    examples=[
        # You can place video files in a folder named 'examples' next to your script
        # and they will show up here, e.g.:
        # [os.path.join(os.path.dirname(__file__), "examples/example_video_1.mp4")],
    ]
)
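# Intermediate `yield` updates stream through Gradio's queue. Recent Gradio
# versions enable it by default; older 3.x releases need it turned on
# explicitly, and calling it is harmless either way.
iface.queue()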
if __name__ == "__main__":
    iface.launch()