#
# ----- Prerequisites -----
# 1. Install required Python libraries:
#    pip install gradio transformers torch gtts langdetect
#
# 2. Install ffmpeg on your system.
#    - (Mac)     brew install ffmpeg
#    - (Ubuntu)  sudo apt install ffmpeg
#    - (Windows) choco install ffmpeg
#
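# 3. Make sure the machine has internet access: the Whisper checkpoint is
#    downloaded from the Hugging Face Hub on the first run (later runs reuse
#    the local cache).
#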
import gradio as gr
import subprocess
import os
import shutil
import uuid
from transformers import pipeline
from gtts import gTTS
from langdetect import detect, DetectorFactory

# Ensure deterministic language detection results
DetectorFactory.seed = 0

# --- 1. Load the model only once ---
# This is more efficient as it won't reload the model on every function call.
print("Loading Whisper model, this may take a moment...")
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",  # Using tiny for speed, can be changed to base, small, etc.
        device="cpu"  # Use "cuda:0" if you have a GPU and torch with CUDA
    )
    print("Whisper model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # Exit or handle the error appropriately if the model is critical
    asr_pipeline = None

def translate_video(video_path):
    """
    Translates the audio of a video file to English and provides detailed output.

    Implemented as a generator so Gradio can stream intermediate results
    (transcript, detected language, translation) to the UI as they become ready.
    """
    if not asr_pipeline:
        gr.Warning("The speech recognition model is not available. The application cannot proceed.")
        # Yield (not return) so the message reaches the UI; this function is a generator.
        yield "Model not loaded.", None, None, None, None
        return

    # Create a unique temporary directory for this run
    temp_dir = f"temp_{uuid.uuid4()}"
    os.makedirs(temp_dir, exist_ok=True)

    try:
        gr.Info("Step 1/5: Extracting audio from video...")
        audio_path = os.path.join(temp_dir, "audio.wav")
        # Use ffmpeg to extract audio. -y overwrites existing files. -i is input.
        # -vn disables video recording. -acodec pcm_s16le is standard for .wav
        # -ar 16000 is the sample rate Whisper expects.
        command = [
            "ffmpeg", "-i", video_path, "-y",
            "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
            audio_path
        ]
        subprocess.run(command, check=True, capture_output=True, text=True)
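        # check=True makes subprocess.run raise CalledProcessError on a non-zero
        # ffmpeg exit code; its captured stderr is reported in the handler below.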
        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed. ffmpeg did not produce an audio file.")

        # --- 2. Transcribe the original audio to text ---
        gr.Info("Step 2/5: Transcribing original audio...")
        transcription_result = asr_pipeline(
            audio_path,
            return_timestamps=True,  # Required by Whisper for audio longer than 30 seconds
            generate_kwargs={"task": "transcribe"}
        )
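        # With return_timestamps=True the pipeline returns a dict with the full
        # 'text' plus per-segment 'chunks'; only the text is used here.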
        original_transcript = transcription_result["text"].strip()
        if not original_transcript:
            gr.Warning("No speech was detected in the video.")
            yield "No speech detected.", "N/A", "N/A", None, video_path
            return

        # Stream the original transcript to the UI before the slower steps run
        yield "", original_transcript, "", None, video_path

        # --- 3. Detect the language of the original transcript ---
        gr.Info("Step 3/5: Detecting language...")
        try:
            detected_language_code = detect(original_transcript)
            # You can expand this with a dictionary for full language names if desired
            # e.g., lang_map = {'es': 'Spanish', 'fr': 'French', ...}
        except Exception:
            detected_language_code = "Unknown"
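        # langdetect returns ISO 639-1 codes such as 'es' or 'fr'; detection can
        # be unreliable on very short transcripts.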
        summary_markdown = f"""
## Translation Details
- **Detected Language**: `{detected_language_code}`

---
"""
        yield summary_markdown, original_transcript, "", None, video_path

        # --- 4. Translate the audio into English ---
        gr.Info("Step 4/5: Translating audio to English...")
        translation_result = asr_pipeline(
            audio_path,
            return_timestamps=True,
            generate_kwargs={"task": "translate"}  # Whisper's translate task always targets English; the source language is auto-detected
        )
        translated_text = translation_result["text"].strip()

        # Append the translation to the summary markdown
        summary_markdown += f"""
### Translated Text (English)
{translated_text}
"""
        yield summary_markdown, original_transcript, translated_text, None, video_path

        # --- 5. Convert translated text to speech ---
        gr.Info("Step 5/5: Generating translated audio...")
        tts = gTTS(translated_text, lang='en')
        translated_audio_path = os.path.join(temp_dir, "translated_audio.mp3")
        tts.save(translated_audio_path)
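        # gTTS synthesizes speech via Google Translate's text-to-speech service,
        # so this step also requires an internet connection.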

        # Final update: summary and texts are unchanged, but the translated audio is now available
        yield summary_markdown, original_transcript, translated_text, translated_audio_path, video_path
    except subprocess.CalledProcessError as e:
        error_message = f"ffmpeg error: {e.stderr}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    except Exception as e:
        error_message = f"An unexpected error occurred: {str(e)}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    finally:
        # Clean up the temporary directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

# --- Create the Gradio interface ---
iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Upload Your Video", sources=['upload']),
    outputs=[
        gr.Markdown(label="Summary"),
        gr.Textbox(label="Original Transcript", interactive=False, lines=5),
        gr.Textbox(label="Translated Text (English)", interactive=False, lines=5),
        gr.Audio(label="Translated Audio (English)"),
        gr.Video(label="Original Video"),
    ],
    title="Enhanced Video Translator",
    description="Upload a video to transcribe its audio, detect the language, and translate it to English. Provides the original transcript, translated text, and translated audio.",
    allow_flagging="never",
    examples=[
        # You can place video files in a folder named 'examples' next to your script
        # and they will show up here.
        # [os.path.join(os.path.dirname(__file__), "examples/example_video_1.mp4")],
    ]
)
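# Because translate_video is a generator, Gradio streams each yielded update to
# the UI. Recent Gradio releases enable the request queue by default; on older
# versions you may need to call iface.queue() before launch().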

if __name__ == "__main__":
    iface.launch()