#
# ----- Prerequisites -----
# 1. Install required Python libraries:
#    pip install gradio transformers torch gtts langdetect
#
# 2. Install ffmpeg on your system.
#    - (Mac)     brew install ffmpeg
#    - (Ubuntu)  sudo apt install ffmpeg
#    - (Windows) choco install ffmpeg
#
import gradio as gr
import subprocess
import os
import shutil
import uuid
from transformers import pipeline
from gtts import gTTS
from langdetect import detect, DetectorFactory
# Ensure deterministic language detection results
DetectorFactory.seed = 0
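
# Fail fast if ffmpeg is missing (a small optional check, not required by the
# pipeline itself): shutil.which returns None when a binary is not on PATH.
if shutil.which("ffmpeg") is None:
    print("Warning: ffmpeg was not found on PATH; audio extraction will fail.")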
# --- 1. Load the model only once ---
# This is more efficient as it won't reload the model on every function call.
print("Loading Whisper model, this may take a moment...")
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",  # Using tiny for speed, can be changed to base, small, etc.
        device="cpu",  # Use "cuda:0" if you have a GPU and torch with CUDA
    )
    print("Whisper model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # The app can still launch, but translate_video() will refuse to run.
    asr_pipeline = None
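
# Optional: to auto-select a GPU when a CUDA build of torch is available, one
# approach (an assumption, not part of the original setup) is:
#   import torch
#   device = 0 if torch.cuda.is_available() else "cpu"
# and then pass device=device to pipeline() above.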
def translate_video(video_path):
"""
Translates the audio of a video file to English and provides detailed output.
"""
if not asr_pipeline:
gr.Warning("The speech recognition model is not available. The application cannot proceed.")
return "Model not loaded.", None, None, None, None
    # Create a unique temporary directory for this run
    temp_dir = f"temp_{uuid.uuid4()}"
    os.makedirs(temp_dir, exist_ok=True)
    try:
        gr.Info("Step 1/5: Extracting audio from video...")
        audio_path = os.path.join(temp_dir, "audio.wav")
        # Use ffmpeg to extract audio. -y overwrites existing files. -i is input.
        # -vn disables video recording. -acodec pcm_s16le is standard for .wav.
        # -ar 16000 is the sample rate Whisper expects. -ac 1 downmixes to mono.
        command = [
            "ffmpeg", "-i", video_path, "-y",
            "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
            audio_path,
        ]
        subprocess.run(command, check=True, capture_output=True, text=True)
        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed. ffmpeg did not produce an audio file.")
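        # For reference, the argument list above corresponds to this shell
        # invocation (file names are illustrative):
        #   ffmpeg -i input.mp4 -y -vn -acodec pcm_s16le -ar 16000 -ac 1 audio.wav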
        # --- 2. Transcribe the original audio to text ---
        gr.Info("Step 2/5: Transcribing original audio...")
        transcription_result = asr_pipeline(
            audio_path,
            return_timestamps=True,  # Needed for long-form (>30s) audio with this pipeline
            generate_kwargs={"task": "transcribe"}
        )
        original_transcript = transcription_result["text"].strip()
        if not original_transcript:
            gr.Warning("No speech was detected in the video.")
            yield "No speech detected.", "N/A", "N/A", None, video_path
            return
        # Stream the transcript to the UI while the remaining steps run
        yield "", original_transcript, "", None, video_path
        # --- 3. Detect the language of the original transcript ---
        gr.Info("Step 3/5: Detecting language...")
        try:
            detected_language_code = detect(original_transcript)
            # You can expand this with a dictionary for full language names
            # if desired; see the sketch below.
        except Exception:
            detected_language_code = "Unknown"
summary_markdown = f"""
## Translation Details
- **Detected Language**: `{detected_language_code}`
---
"""
yield summary_markdown, original_transcript, "", None, video_path
        # --- 4. Translate the audio into English ---
        gr.Info("Step 4/5: Translating audio to English...")
        # Note: Whisper's translate task always targets English. Do not pass
        # language="en" here; the language kwarg declares the *source* language
        # and would mislabel non-English audio.
        translation_result = asr_pipeline(
            audio_path,
            return_timestamps=True,
            generate_kwargs={"task": "translate"}
        )
        translated_text = translation_result["text"].strip()
        # Append the translated text to the summary markdown
        summary_markdown += f"""
### Translated Text (English)
{translated_text}
"""
        yield summary_markdown, original_transcript, translated_text, None, video_path
        # --- 5. Convert translated text to speech ---
        gr.Info("Step 5/5: Generating translated audio...")
        tts = gTTS(translated_text, lang='en')
        translated_audio_path = os.path.join(temp_dir, "translated_audio.mp3")
        tts.save(translated_audio_path)
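        # Note: gTTS synthesizes speech via Google Translate's web TTS endpoint,
        # so this step requires network access.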
        # Final yield with every output, including the synthesized audio.
        # (The summary already contains the translated text from step 4.)
        yield summary_markdown, original_transcript, translated_text, translated_audio_path, video_path
    except subprocess.CalledProcessError as e:
        error_message = f"ffmpeg error: {e.stderr}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    except Exception as e:
        error_message = f"An unexpected error occurred: {str(e)}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    finally:
        # Clean up the temporary directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
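
# Example of driving the generator directly (a hypothetical local test,
# assuming a file named sample.mp4 exists next to this script):
#   for outputs in translate_video("sample.mp4"):
#       summary, original, translated, audio, video = outputs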
# --- Create the Gradio interface ---
iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Upload Your Video", sources=['upload']),
    outputs=[
        gr.Markdown(label="Summary"),
        gr.Textbox(label="Original Transcript", interactive=False, lines=5),
        gr.Textbox(label="Translated Text (English)", interactive=False, lines=5),
        gr.Audio(label="Translated Audio (English)"),
        gr.Video(label="Original Video"),
    ],
    title="Enhanced Video Translator",
    description="Upload a video to transcribe its audio, detect the language, and translate it to English. Provides the original transcript, the translated text, and translated audio.",
    allow_flagging="never",
    examples=[
        # Place video files in a folder named 'examples' next to this script
        # and list them here, e.g.:
        # [os.path.join(os.path.dirname(__file__), "examples/example_video_1.mp4")],
    ],
)
if __name__ == "__main__":
    iface.launch()
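
# Tip: iface.launch(share=True) serves a temporary public URL, which can be
# handy for quick remote testing.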