#
# ----- Prerequisites -----
# 1. Install required Python libraries:
#    pip install gradio transformers torch gtts langdetect
#
# 2. Install ffmpeg on your system.
#    - (Mac)     brew install ffmpeg
#    - (Ubuntu)  sudo apt install ffmpeg
#    - (Windows) choco install ffmpeg
#
import gradio as gr
import subprocess
import os
import shutil
import uuid
from transformers import pipeline
from gtts import gTTS
from langdetect import detect, DetectorFactory

# Ensure deterministic language detection results
DetectorFactory.seed = 0
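
# Fail fast with a clear message if ffmpeg is missing, since the audio
# extraction below shells out to it. This check is a convenience sketch
# added here, not something the libraries above require.
if shutil.which("ffmpeg") is None:
    raise RuntimeError("ffmpeg not found on PATH. See the prerequisites above.")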

# --- 1. Load the model only once ---
# This is more efficient as it won't reload the model on every function call.
print("Loading Whisper model, this may take a moment...")
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny", # Using tiny for speed, can be changed to base, small, etc.
        device="cpu" # Use "cuda:0" if you have a GPU and torch with CUDA
    )
    print("Whisper model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # Exit or handle the error appropriately if the model is critical
    asr_pipeline = None
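
# The device above is hardcoded to CPU. A minimal sketch of automatic
# selection (torch is already installed as a dependency of the pipeline):
#
#   import torch
#   device = 0 if torch.cuda.is_available() else "cpu"
#
# then pass device=device to pipeline() instead of the literal "cpu".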

def translate_video(video_path):
    """
    Translates the audio of a video file to English and provides detailed output.
    """
    if not asr_pipeline:
        gr.Warning("The speech recognition model is not available. The application cannot proceed.")
        yield "Model not loaded.", None, None, None, None
        return

    # Create a unique temporary directory for this run
    temp_dir = f"temp_{uuid.uuid4()}"
    os.makedirs(temp_dir, exist_ok=True)
    
    try:
        gr.Info("Step 1/5: Extracting audio from video...")
        
        audio_path = os.path.join(temp_dir, "audio.wav")
        
        # Use ffmpeg to extract audio. -y overwrites existing files. -i is input.
        # -vn disables video recording. -acodec pcm_s16le is standard for .wav
        # -ar 16000 is the sample rate Whisper expects.
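        # Equivalent shell invocation, for reference:
        #   ffmpeg -i input.mp4 -y -vn -acodec pcm_s16le -ar 16000 -ac 1 audio.wav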
        command = [
            "ffmpeg", "-i", video_path, "-y",
            "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
            audio_path
        ]
        subprocess.run(command, check=True, capture_output=True, text=True)

        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed. ffmpeg did not produce an audio file.")


        # --- 2. Transcribe the original audio to text ---
        gr.Info("Step 2/5: Transcribing original audio...")
        transcription_result = asr_pipeline(
            audio_path,
            return_timestamps=True, # Required by the Whisper pipeline for long-form (>30 s) audio
            generate_kwargs={"task": "transcribe"}
        )
        original_transcript = transcription_result["text"].strip()

        if not original_transcript:
            gr.Warning("No speech was detected in the video.")
            yield "No speech detected.", "N/A", "N/A", None, video_path
            return
        # First progressive update: show the transcript as soon as it is ready
        yield "", original_transcript, "", None, video_path

        # --- 3. Detect the language of the original transcript ---
        gr.Info("Step 3/5: Detecting language...")
        try:
            detected_language_code = detect(original_transcript)
            # detect() returns an ISO 639-1 code; a small code-to-name
            # mapping is sketched below.
        except Exception:
            detected_language_code = "Unknown"
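
        # A minimal version of the mapping mentioned above (an illustrative
        # subset of ISO 639-1 codes; unknown codes fall through unchanged):
        lang_map = {"en": "English", "es": "Spanish", "fr": "French",
                    "de": "German", "it": "Italian", "ja": "Japanese"}
        detected_language_name = lang_map.get(detected_language_code, detected_language_code)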
        # Build the summary without leading indentation so the Markdown
        # renders as headings rather than an indented code block.
        summary_markdown = (
            "## Translation Details\n"
            f"- **Detected Language**: `{detected_language_name}` (`{detected_language_code}`)\n\n"
            "---\n"
        )
        yield summary_markdown, original_transcript, "", None, video_path

        # --- 4. Translate the audio into English ---
        gr.Info("Step 4/5: Translating audio to English...")
        # Whisper's "translate" task always targets English. The "language"
        # generate kwarg names the *source* language, so forcing it to "en"
        # would wrongly assert the audio is already English; leave it unset
        # and let the model auto-detect the source.
        translation_result = asr_pipeline(
            audio_path,
            return_timestamps=True, # Required for long-form (>30 s) audio
            generate_kwargs={"task": "translate"}
        )
        translated_text = translation_result["text"].strip()
        
        # Append the translated text to the summary
        summary_markdown += (
            "\n### Translated Text (English)\n"
            f"{translated_text}\n"
        )

        yield summary_markdown, original_transcript, translated_text, None, video_path
        
        # --- 5. Convert translated text to speech ---
        gr.Info("Step 5/5: Generating translated audio...")
        tts = gTTS(translated_text, lang='en')
        translated_audio_path = os.path.join(temp_dir, "translated_audio.mp3")
        tts.save(translated_audio_path)
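        # gTTS also accepts slow=True for slower speech and a tld argument
        # (e.g. tld="co.uk") to select a regional voice, if desired.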

        # Gradio copies yielded file paths into its own cache before resuming
        # the generator, so the temp dir can safely be removed in finally below.
        yield summary_markdown, original_transcript, translated_text, translated_audio_path, video_path

    except subprocess.CalledProcessError as e:
        error_message = f"ffmpeg error: {e.stderr}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    except Exception as e:
        error_message = f"An unexpected error occurred: {str(e)}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    finally:
        # Clean up the temporary directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)


# --- Create the Gradio interface ---
iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Upload Your Video", sources=['upload']),
    outputs=[
        gr.Markdown(label="Summary"),
        gr.Textbox(label="Original Transcript", interactive=False, lines=5),
        gr.Textbox(label="Translated Text (English)", interactive=False, lines=5),
        gr.Audio(label="Translated Audio (English)"),
        gr.Video(label="Original Video"),
    ],
    title="Enhanced Video Translator",
    description="Upload a video to transcribe its audio, detect the language, and translate it to English. Provides original transcript, translated text, and translated audio.",
    allow_flagging="never",
    # To show examples, place video files in a folder named 'examples' next
    # to this script, then uncomment and pass them here (an empty examples
    # list can error in some Gradio versions, so the argument is omitted):
    # examples=[[os.path.join(os.path.dirname(__file__), "examples/example_video_1.mp4")]],
)

if __name__ == "__main__":
    iface.launch()
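
# To run (assuming this file is saved as app.py):
#   python app.py
# Gradio will print a local URL (http://127.0.0.1:7860 by default).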