#
# ----- Prerequisites -----
# 1. Install required Python libraries:
#    pip install gradio transformers torch gtts langdetect
#
# 2. Install ffmpeg on your system.
#    - (Mac)     brew install ffmpeg
#    - (Ubuntu)  sudo apt install ffmpeg
#    - (Windows) choco install ffmpeg
#
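# 3. Make sure the machine has internet access: the Whisper checkpoint is
#    downloaded from the Hugging Face Hub on the first run (later runs reuse
#    the local cache).
#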
import gradio as gr
import subprocess
import os
import shutil
import uuid
from transformers import pipeline
from gtts import gTTS
from langdetect import detect, DetectorFactory

# Ensure deterministic language detection results
DetectorFactory.seed = 0

# --- 1. Load the model only once ---
# This is more efficient as it won't reload the model on every function call.
print("Loading Whisper model, this may take a moment...")
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",  # Using tiny for speed, can be changed to base, small, etc.
        device="cpu"  # Use "cuda:0" if you have a GPU and torch with CUDA
    )
    print("Whisper model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # Exit or handle the error appropriately if the model is critical
    asr_pipeline = None

def translate_video(video_path):
    """
    Translates the audio of a video file to English and provides detailed output.

    Implemented as a generator so Gradio can stream intermediate results
    (transcript, detected language, translation) to the UI as they become ready.
    """
    if not asr_pipeline:
        gr.Warning("The speech recognition model is not available. The application cannot proceed.")
        # Yield (not return) so the message reaches the UI; this function is a generator.
        yield "Model not loaded.", None, None, None, None
        return

    # Create a unique temporary directory for this run
    temp_dir = f"temp_{uuid.uuid4()}"
    os.makedirs(temp_dir, exist_ok=True)

    try:
        gr.Info("Step 1/5: Extracting audio from video...")
        audio_path = os.path.join(temp_dir, "audio.wav")
        # Use ffmpeg to extract audio. -y overwrites existing files. -i is input.
        # -vn disables video recording. -acodec pcm_s16le is standard for .wav
        # -ar 16000 is the sample rate Whisper expects.
        command = [
            "ffmpeg", "-i", video_path, "-y",
            "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
            audio_path
        ]
        subprocess.run(command, check=True, capture_output=True, text=True)
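        # check=True makes subprocess.run raise CalledProcessError on a non-zero
        # ffmpeg exit code; its captured stderr is reported in the handler below.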
        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed. ffmpeg did not produce an audio file.")

        # --- 2. Transcribe the original audio to text ---
        gr.Info("Step 2/5: Transcribing original audio...")
        transcription_result = asr_pipeline(
            audio_path,
            return_timestamps=True,  # Required by Whisper for audio longer than 30 seconds
            generate_kwargs={"task": "transcribe"}
        )
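        # With return_timestamps=True the pipeline returns a dict with the full
        # 'text' plus per-segment 'chunks'; only the text is used here.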
        original_transcript = transcription_result["text"].strip()
        if not original_transcript:
            gr.Warning("No speech was detected in the video.")
            yield "No speech detected.", "N/A", "N/A", None, video_path
            return

        # Stream the original transcript to the UI before the slower steps run
        yield "", original_transcript, "", None, video_path

        # --- 3. Detect the language of the original transcript ---
        gr.Info("Step 3/5: Detecting language...")
        try:
            detected_language_code = detect(original_transcript)
            # You can expand this with a dictionary for full language names if desired
            # e.g., lang_map = {'es': 'Spanish', 'fr': 'French', ...}
        except Exception:
            detected_language_code = "Unknown"
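        # langdetect returns ISO 639-1 codes such as 'es' or 'fr'; detection can
        # be unreliable on very short transcripts.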
        summary_markdown = f"""
## Translation Details
- **Detected Language**: `{detected_language_code}`

---
"""
        yield summary_markdown, original_transcript, "", None, video_path

        # --- 4. Translate the audio into English ---
        gr.Info("Step 4/5: Translating audio to English...")
        translation_result = asr_pipeline(
            audio_path,
            return_timestamps=True,
            generate_kwargs={"task": "translate"}  # Whisper's translate task always targets English; the source language is auto-detected
        )
        translated_text = translation_result["text"].strip()

        # Append the translation to the summary markdown
        summary_markdown += f"""
### Translated Text (English)
{translated_text}
"""
        yield summary_markdown, original_transcript, translated_text, None, video_path

        # --- 5. Convert translated text to speech ---
        gr.Info("Step 5/5: Generating translated audio...")
        tts = gTTS(translated_text, lang='en')
        translated_audio_path = os.path.join(temp_dir, "translated_audio.mp3")
        tts.save(translated_audio_path)
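        # gTTS synthesizes speech via Google Translate's text-to-speech service,
        # so this step also requires an internet connection.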

        # Final update: summary and texts are unchanged, but the translated audio is now available
        yield summary_markdown, original_transcript, translated_text, translated_audio_path, video_path
    except subprocess.CalledProcessError as e:
        error_message = f"ffmpeg error: {e.stderr}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    except Exception as e:
        error_message = f"An unexpected error occurred: {str(e)}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    finally:
        # Clean up the temporary directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

# --- Create the Gradio interface ---
iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Upload Your Video", sources=['upload']),
    outputs=[
        gr.Markdown(label="Summary"),
        gr.Textbox(label="Original Transcript", interactive=False, lines=5),
        gr.Textbox(label="Translated Text (English)", interactive=False, lines=5),
        gr.Audio(label="Translated Audio (English)"),
        gr.Video(label="Original Video"),
    ],
    title="Enhanced Video Translator",
    description="Upload a video to transcribe its audio, detect the language, and translate it to English. Provides the original transcript, translated text, and translated audio.",
    allow_flagging="never",
    examples=[
        # You can place video files in a folder named 'examples' next to your script
        # and they will show up here.
        # [os.path.join(os.path.dirname(__file__), "examples/example_video_1.mp4")],
    ]
)
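# Because translate_video is a generator, Gradio streams each yielded update to
# the UI. Recent Gradio releases enable the request queue by default; on older
# versions you may need to call iface.queue() before launch().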

if __name__ == "__main__":
    iface.launch()