#
# ----- Prerequisites -----
# 1. Install required Python libraries:
#    pip install gradio transformers torch gtts langdetect
#
# 2. Install ffmpeg on your system.
#    - (Mac)     brew install ffmpeg
#    - (Ubuntu)  sudo apt install ffmpeg
#    - (Windows) choco install ffmpeg
#
import gradio as gr
import subprocess
import os
import shutil
import uuid
from transformers import pipeline
from gtts import gTTS
from langdetect import detect, DetectorFactory

# Ensure deterministic language detection results
DetectorFactory.seed = 0
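
# Fail fast with a clear message if ffmpeg is missing, since the audio
# extraction below shells out to it. This check is a convenience sketch
# added here, not something the libraries above require.
if shutil.which("ffmpeg") is None:
    raise RuntimeError("ffmpeg not found on PATH. See the prerequisites above.")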

# --- 1. Load the model only once ---
# This is more efficient as it won't reload the model on every function call.
print("Loading Whisper model, this may take a moment...")
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny", # Using tiny for speed, can be changed to base, small, etc.
        device="cpu" # Use "cuda:0" if you have a GPU and torch with CUDA
    )
    print("Whisper model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # Exit or handle the error appropriately if the model is critical
    asr_pipeline = None
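
# The device above is hardcoded to CPU. A minimal sketch of automatic
# selection (torch is already installed as a dependency of the pipeline):
#
#   import torch
#   device = 0 if torch.cuda.is_available() else "cpu"
#
# then pass device=device to pipeline() instead of the literal "cpu".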

def translate_video(video_path):
    """
    Translates the audio of a video file to English and provides detailed output.
    """
    if not asr_pipeline:
        gr.Warning("The speech recognition model is not available. The application cannot proceed.")
        yield "Model not loaded.", None, None, None, None
        return

    # Create a unique temporary directory for this run
    temp_dir = f"temp_{uuid.uuid4()}"
    os.makedirs(temp_dir, exist_ok=True)
    
    try:
        gr.Info("Step 1/5: Extracting audio from video...")
        
        audio_path = os.path.join(temp_dir, "audio.wav")
        
        # Use ffmpeg to extract audio. -y overwrites existing files. -i is input.
        # -vn disables video recording. -acodec pcm_s16le is standard for .wav
        # -ar 16000 is the sample rate Whisper expects.
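        # Equivalent shell invocation, for reference:
        #   ffmpeg -i input.mp4 -y -vn -acodec pcm_s16le -ar 16000 -ac 1 audio.wav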
        command = [
            "ffmpeg", "-i", video_path, "-y",
            "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
            audio_path
        ]
        subprocess.run(command, check=True, capture_output=True, text=True)

        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed. ffmpeg did not produce an audio file.")


        # --- 2. Transcribe the original audio to text ---
        gr.Info("Step 2/5: Transcribing original audio...")
        transcription_result = asr_pipeline(
            audio_path,
            return_timestamps=True, # Required by the Whisper pipeline for long-form (>30 s) audio
            generate_kwargs={"task": "transcribe"}
        )
        original_transcript = transcription_result["text"].strip()

        if not original_transcript:
            gr.Warning("No speech was detected in the video.")
            yield "No speech detected.", "N/A", "N/A", None, video_path
            return
        # First progressive update: show the transcript as soon as it is ready
        yield "", original_transcript, "", None, video_path

        # --- 3. Detect the language of the original transcript ---
        gr.Info("Step 3/5: Detecting language...")
        try:
            detected_language_code = detect(original_transcript)
            # detect() returns an ISO 639-1 code; a small code-to-name
            # mapping is sketched below.
        except Exception:
            detected_language_code = "Unknown"
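
        # A minimal version of the mapping mentioned above (an illustrative
        # subset of ISO 639-1 codes; unknown codes fall through unchanged):
        lang_map = {"en": "English", "es": "Spanish", "fr": "French",
                    "de": "German", "it": "Italian", "ja": "Japanese"}
        detected_language_name = lang_map.get(detected_language_code, detected_language_code)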
        # Build the summary without leading indentation so the Markdown
        # renders as headings rather than an indented code block.
        summary_markdown = (
            "## Translation Details\n"
            f"- **Detected Language**: `{detected_language_name}` (`{detected_language_code}`)\n\n"
            "---\n"
        )
        yield summary_markdown, original_transcript, "", None, video_path

        # --- 4. Translate the audio into English ---
        gr.Info("Step 4/5: Translating audio to English...")
        # Whisper's "translate" task always targets English. The "language"
        # generate kwarg names the *source* language, so forcing it to "en"
        # would wrongly assert the audio is already English; leave it unset
        # and let the model auto-detect the source.
        translation_result = asr_pipeline(
            audio_path,
            return_timestamps=True, # Required for long-form (>30 s) audio
            generate_kwargs={"task": "translate"}
        )
        translated_text = translation_result["text"].strip()
        
        # Append the translated text to the summary
        summary_markdown += (
            "\n### Translated Text (English)\n"
            f"{translated_text}\n"
        )

        yield summary_markdown, original_transcript, translated_text, None, video_path
        
        # --- 5. Convert translated text to speech ---
        gr.Info("Step 5/5: Generating translated audio...")
        tts = gTTS(translated_text, lang='en')
        translated_audio_path = os.path.join(temp_dir, "translated_audio.mp3")
        tts.save(translated_audio_path)
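        # gTTS also accepts slow=True for slower speech and a tld argument
        # (e.g. tld="co.uk") to select a regional voice, if desired.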

        # Gradio copies yielded file paths into its own cache before resuming
        # the generator, so the temp dir can safely be removed in finally below.
        yield summary_markdown, original_transcript, translated_text, translated_audio_path, video_path

    except subprocess.CalledProcessError as e:
        error_message = f"ffmpeg error: {e.stderr}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    except Exception as e:
        error_message = f"An unexpected error occurred: {str(e)}"
        gr.Warning(error_message)
        yield error_message, None, None, None, None
    finally:
        # Clean up the temporary directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)


# --- Create the Gradio interface ---
iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Upload Your Video", sources=['upload']),
    outputs=[
        gr.Markdown(label="Summary"),
        gr.Textbox(label="Original Transcript", interactive=False, lines=5),
        gr.Textbox(label="Translated Text (English)", interactive=False, lines=5),
        gr.Audio(label="Translated Audio (English)"),
        gr.Video(label="Original Video"),
    ],
    title="Enhanced Video Translator",
    description="Upload a video to transcribe its audio, detect the language, and translate it to English. Provides original transcript, translated text, and translated audio.",
    allow_flagging="never",
    # To show examples, place video files in a folder named 'examples' next
    # to this script, then uncomment and pass them here (an empty examples
    # list can error in some Gradio versions, so the argument is omitted):
    # examples=[[os.path.join(os.path.dirname(__file__), "examples/example_video_1.mp4")]],
)

if __name__ == "__main__":
    iface.launch()
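
# To run (assuming this file is saved as app.py):
#   python app.py
# Gradio will print a local URL (http://127.0.0.1:7860 by default).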