# Scraped page header ("Spaces: Running") - not part of the program.
import datetime
import logging
import os
import tempfile

import google.generativeai as genai
import gradio as gr
import srt
from gtts import gTTS
from moviepy.audio.io.AudioFileClip import AudioFileClip
from moviepy.video.VideoClip import TextClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.tools.subtitles import SubtitlesClip
# Suppress moviepy logs
logging.getLogger("moviepy").setLevel(logging.ERROR)

# Configure Gemini API. The key is read from the environment at import time,
# so a missing GEMINI_API_KEY raises KeyError before the UI is built.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Create the Gemini model
generation_config = {
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 40,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}
model = genai.GenerativeModel(
    model_name="gemini-2.5-flash-preview-05-20",
    generation_config=generation_config,
)
# List of all supported languages. The first entry ("Auto Detect") is a
# sentinel for the source-language dropdown; the "Translate To" dropdown
# skips it by slicing from index 1.
SUPPORTED_LANGUAGES = [
    "Auto Detect", "English", "Chinese", "German", "Spanish", "Russian", "Korean",
    "French", "Japanese", "Portuguese", "Turkish", "Polish", "Catalan", "Dutch",
    "Arabic", "Swedish", "Italian", "Indonesian", "Hindi", "Finnish", "Vietnamese",
    "Hebrew", "Ukrainian", "Greek", "Malay", "Czech", "Romanian", "Danish",
    "Hungarian", "Tamil", "Norwegian", "Thai", "Urdu", "Croatian", "Bulgarian",
    "Lithuanian", "Latin", "Maori", "Malayalam", "Welsh", "Slovak", "Telugu",
    "Persian", "Latvian", "Bengali", "Serbian", "Azerbaijani", "Slovenian",
    "Kannada", "Estonian", "Macedonian", "Breton", "Basque", "Icelandic",
    "Armenian", "Nepali", "Mongolian", "Bosnian", "Kazakh", "Albanian",
    "Swahili", "Galician", "Marathi", "Punjabi", "Sinhala", "Khmer", "Shona",
    "Yoruba", "Somali", "Afrikaans", "Occitan", "Georgian", "Belarusian",
    "Tajik", "Sindhi", "Gujarati", "Amharic", "Yiddish", "Lao", "Uzbek",
    "Faroese", "Haitian Creole", "Pashto", "Turkmen", "Nynorsk", "Maltese",
    "Sanskrit", "Luxembourgish", "Burmese", "Tibetan", "Tagalog", "Malagasy",
    "Assamese", "Tatar", "Hawaiian", "Lingala", "Hausa", "Bashkir", "Javanese",
    "Sundanese",
]
# Language code mapping for gTTS (display name -> gTTS language tag).
# gTTS follows Google Translate's legacy tags for Hebrew ("iw") and
# Javanese ("jw") rather than the ISO 639-1 "he" / "jv".
# NOTE(review): not every code below is necessarily offered by gTTS;
# verify against gtts.lang.tts_langs() before relying on a language.
LANGUAGE_CODES = {
    "English": "en", "Chinese": "zh", "German": "de", "Spanish": "es",
    "Russian": "ru", "Korean": "ko", "French": "fr", "Japanese": "ja",
    "Portuguese": "pt", "Turkish": "tr", "Polish": "pl", "Catalan": "ca",
    "Dutch": "nl", "Arabic": "ar", "Swedish": "sv", "Italian": "it",
    "Indonesian": "id", "Hindi": "hi", "Finnish": "fi", "Vietnamese": "vi",
    "Hebrew": "iw", "Ukrainian": "uk", "Greek": "el", "Malay": "ms",
    "Czech": "cs", "Romanian": "ro", "Danish": "da", "Hungarian": "hu",
    "Tamil": "ta", "Norwegian": "no", "Thai": "th", "Urdu": "ur",
    "Croatian": "hr", "Bulgarian": "bg", "Lithuanian": "lt", "Latin": "la",
    "Maori": "mi", "Malayalam": "ml", "Welsh": "cy", "Slovak": "sk",
    "Telugu": "te", "Persian": "fa", "Latvian": "lv", "Bengali": "bn",
    "Serbian": "sr", "Azerbaijani": "az", "Slovenian": "sl", "Kannada": "kn",
    "Estonian": "et", "Macedonian": "mk", "Breton": "br", "Basque": "eu",
    "Icelandic": "is", "Armenian": "hy", "Nepali": "ne", "Mongolian": "mn",
    "Bosnian": "bs", "Kazakh": "kk", "Albanian": "sq", "Swahili": "sw",
    "Galician": "gl", "Marathi": "mr", "Punjabi": "pa", "Sinhala": "si",
    "Khmer": "km", "Shona": "sn", "Yoruba": "yo", "Somali": "so",
    "Afrikaans": "af", "Occitan": "oc", "Georgian": "ka", "Belarusian": "be",
    "Tajik": "tg", "Sindhi": "sd", "Gujarati": "gu", "Amharic": "am",
    "Yiddish": "yi", "Lao": "lo", "Uzbek": "uz", "Faroese": "fo",
    "Haitian Creole": "ht", "Pashto": "ps", "Turkmen": "tk", "Nynorsk": "nn",
    "Maltese": "mt", "Sanskrit": "sa", "Luxembourgish": "lb", "Burmese": "my",
    "Tibetan": "bo", "Tagalog": "tl", "Malagasy": "mg", "Assamese": "as",
    "Tatar": "tt", "Hawaiian": "haw", "Lingala": "ln", "Hausa": "ha",
    "Bashkir": "ba", "Javanese": "jw", "Sundanese": "su",
}
def extract_audio_from_video(video_file):
    """Extract the audio track of a video and save it as a WAV file.

    Args:
        video_file: Path to the input video.

    Returns:
        Path of the WAV file written to the system temp directory
        (``extracted_audio.wav``, written at a 16000 fps sample rate).

    Raises:
        ValueError: If the video has no audio track.
    """
    audio_file = os.path.join(tempfile.gettempdir(), "extracted_audio.wav")
    video = VideoFileClip(video_file)
    try:
        if video.audio is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError("The video has no audio track to transcribe.")
        video.audio.write_audiofile(audio_file, fps=16000, logger=None)
    finally:
        # Release the reader held by the clip even when extraction fails.
        video.close()
    return audio_file
def transcribe_audio_with_gemini(audio_file):
    """Transcribe audio using Gemini with a magic prompt for accurate timestamps.

    Args:
        audio_file: Path to a WAV file; its bytes are sent inline to the model.

    Returns:
        The model's reply with surrounding whitespace stripped. The prompt asks
        for one ``[HH:MM:SS] sentence`` entry per line.
    """
    with open(audio_file, "rb") as f:
        audio_data = f.read()
    audio_blob = {
        'mime_type': 'audio/wav',
        'data': audio_data,
    }
    prompt = """
You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language.
Include timestamps for each sentence in the following format:
[HH:MM:SS] Sentence 1
[HH:MM:SS] Sentence 2
...
Ensure the timestamps are accurate and correspond to the start of each sentence.
Respond only with the transcription and timestamps. Do not add explanations or extra text.
"""
    # Send the instructions and the audio in one request: a separate
    # prompt-only chat turn costs an extra model call whose reply is discarded.
    response = model.generate_content([prompt, audio_blob])
    return response.text.strip()
def generate_subtitles(transcription, default_duration=5):
    """Generate SRT subtitles from transcription with timestamps.

    Each non-blank line becomes one subtitle. A line of the form
    ``[timestamp] text`` starts at that timestamp; a line without a leading
    bracket, or whose bracket content is not a valid timestamp, is kept whole
    and starts at 0 seconds.

    Args:
        transcription: Transcript text, one entry per line.
        default_duration: Seconds a subtitle stays on screen when the next
            entry does not start sooner.

    Returns:
        The subtitles serialized as SRT text by ``srt.compose``.
    """
    entries = []  # (start_seconds, text) in transcript order
    for raw_line in transcription.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        start_seconds = 0
        text = line
        if line.startswith("[") and "]" in line:
            # partition keeps any later "]" characters inside the text.
            stamp, _, remainder = line[1:].partition("]")
            try:
                start_seconds = time_to_seconds(stamp.strip())
                text = remainder.strip()
            except ValueError:
                # Bracket content such as "[music]" is not a timestamp;
                # keep the whole line as text at the default start.
                pass
        entries.append((start_seconds, text))

    srt_subtitles = []
    for index, (start_seconds, text) in enumerate(entries, start=1):
        end_seconds = start_seconds + default_duration
        if index < len(entries):
            next_start = entries[index][0]
            # End when the next entry begins so consecutive cues do not overlap.
            if start_seconds < next_start < end_seconds:
                end_seconds = next_start
        srt_subtitles.append(
            srt.Subtitle(
                index=index,
                start=datetime.timedelta(seconds=start_seconds),
                end=datetime.timedelta(seconds=end_seconds),
                content=text,
            )
        )
    return srt.compose(srt_subtitles)
def time_to_seconds(time_str):
    """Convert a clock string to seconds.

    Accepts ``HH:MM:SS``, ``MM:SS`` or ``SS``. The fields may carry a
    fractional part written with ``.`` or ``,`` (as SRT does).

    Args:
        time_str: The timestamp text, without surrounding brackets.

    Returns:
        Total seconds: an ``int`` when every field is an integer, otherwise
        a ``float``.

    Raises:
        ValueError: If there are more than three fields or a field is not
            numeric.
    """
    parts = time_str.strip().replace(",", ".").split(":")
    if len(parts) > 3:
        raise ValueError(f"Invalid timestamp: {time_str!r}")
    total = 0
    for part in parts:
        # int() keeps the result integral for plain HH:MM:SS input.
        value = float(part) if "." in part else int(part)
        total = total * 60 + value
    return total
def seconds_to_time(seconds):
    """Convert seconds to HH:MM:SS.

    Args:
        seconds: Number of seconds; a fractional part is truncated.

    Returns:
        The time as a zero-padded ``HH:MM:SS`` string (hours may exceed two
        digits).
    """
    # Truncate first: formatting floats with ":02" would yield e.g. "0.0".
    minutes, ss = divmod(int(seconds), 60)
    hh, mm = divmod(minutes, 60)
    return f"{hh:02}:{mm:02}:{ss:02}"
def translate_srt(srt_text, target_language):
    """Translate an SRT file while preserving timestamps.

    Args:
        srt_text: Subtitles in SRT format.
        target_language: Name of the language to translate into.

    Returns:
        The model's reply, with a surrounding Markdown code fence removed
        when the model added one.
    """
    prompt = f"""
Translate the following SRT subtitles into {target_language}.
Preserve the SRT format (timestamps and structure).
Translate only the text after the timestamp.
Do not add explanations or extra text.
Ensure the translation is accurate and culturally appropriate.
Here is the SRT file:
{srt_text}
"""
    response = model.generate_content(prompt)
    return _strip_code_fence(response.text)


def _strip_code_fence(text):
    """Return *text* without a wrapping ``` fence; unfenced text is unchanged."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return text
    # Drop the opening fence line (which may carry a language tag).
    lines = stripped.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines.pop()
    return "\n".join(lines)
def generate_tts_audio(srt_text, language):
    """Generate TTS audio from SRT text.

    Args:
        srt_text: Subtitles in SRT format; all subtitle texts are joined with
            spaces and spoken as one passage.
        language: Display name looked up in ``LANGUAGE_CODES``; names that
            are not in the mapping (e.g. "Auto Detect") fall back to English.

    Returns:
        Path of the MP3 written to the system temp directory
        (``tts_audio.mp3``), or ``None`` when the subtitles contain no text.
    """
    # Extract all text from SRT
    all_text = " ".join(sub.content for sub in srt.parse(srt_text)).strip()
    if not all_text:
        # Nothing to speak; skip synthesis rather than let gTTS fail on
        # empty input.
        return None
    # Get language code
    lang_code = LANGUAGE_CODES.get(language, "en")
    # Generate TTS
    tts = gTTS(text=all_text, lang=lang_code, slow=False)
    audio_file = os.path.join(tempfile.gettempdir(), "tts_audio.mp3")
    tts.save(audio_file)
    return audio_file
def add_subtitles_to_video(video_file, srt_file, output_file):
    """Add subtitles to video and return the path to the new video.

    Args:
        video_file: Path to the source video.
        srt_file: Path to the SRT file to overlay.
        output_file: Path the composited video is written to.

    Returns:
        ``output_file``.
    """
    def make_text_clip(txt):
        # Rendering style for one subtitle line.
        return TextClip(txt, font='Arial', fontsize=24, color='white')

    # Create subtitle clip
    subtitles = SubtitlesClip(srt_file, make_text_clip)
    # Load video
    video = VideoFileClip(video_file)
    try:
        # Composite video with subtitles
        result = CompositeVideoClip([
            video,
            subtitles.set_position(('center', 'bottom')),
        ])
        try:
            # Write output
            result.write_videofile(output_file, codec='libx264', audio_codec='aac', threads=4)
        finally:
            result.close()
    finally:
        # Always release the source clip, even if rendering fails.
        video.close()
    return output_file
def process_video(video_file, language="Auto Detect", translate_to=None, add_tts=False, add_subtitles=False):
    """Process a video file with full options.

    Args:
        video_file: Path to the uploaded video.
        language: Source-language name; used as the TTS language when no
            translation is requested.
        translate_to: Target-language name, or ``None`` / ``"None"`` to skip
            translation.
        add_tts: Whether to synthesize speech from the (translated) subtitles.
        add_subtitles: Whether to render a video with the subtitles overlaid.

    Returns:
        A 5-tuple ``(original_srt_path, translated_srt_path, tts_audio_path,
        output_video_path, status_text)``; entries for steps that were not
        requested are ``None``.

    Raises:
        gr.Error: If no video was provided.
    """
    if not video_file:
        raise gr.Error("Please upload a video file first.")
    wants_translation = bool(translate_to) and translate_to != "None"

    # Extract audio from the video
    audio_file = extract_audio_from_video(video_file)
    try:
        # Transcribe audio using Gemini
        transcription = transcribe_audio_with_gemini(audio_file)
    finally:
        # Clean up the temporary audio even when transcription fails.
        os.remove(audio_file)

    # Generate subtitles
    subtitles = generate_subtitles(transcription)
    # Save original subtitles
    original_srt_file = os.path.join(tempfile.gettempdir(), "original_subtitles.srt")
    with open(original_srt_file, "w", encoding="utf-8") as f:
        f.write(subtitles)

    # Translate subtitles if requested
    translated_srt_file = None
    translated_subtitles = None
    if wants_translation:
        translated_subtitles = translate_srt(subtitles, translate_to)
        translated_srt_file = os.path.join(tempfile.gettempdir(), "translated_subtitles.srt")
        with open(translated_srt_file, "w", encoding="utf-8") as f:
            f.write(translated_subtitles)

    # Generate TTS audio if requested
    tts_audio_file = None
    if add_tts:
        target_lang = translate_to if wants_translation else language
        tts_source = translated_subtitles if wants_translation else subtitles
        tts_audio_file = generate_tts_audio(tts_source, target_lang)

    # Create video with subtitles if requested
    output_video_file = None
    if add_subtitles:
        srt_to_use = translated_srt_file if translated_srt_file else original_srt_file
        output_video_file = os.path.join(tempfile.gettempdir(), "output_video.mp4")
        add_subtitles_to_video(video_file, srt_to_use, output_video_file)

    return original_srt_file, translated_srt_file, tts_audio_file, output_video_file, "Detected Language: Auto"
# Define the Gradio interface
# NOTE(review): the scraped source lost its indentation; the nesting of rows,
# columns and the tab below is reconstructed from the order of the layout
# calls - confirm against the deployed app.
with gr.Blocks(title="AutoSubGen Pro - AI Video Subtitle Generator") as demo:
    # Header
    with gr.Column():
        gr.Markdown("# 🎥 AutoSubGen Pro")
        gr.Markdown("### Advanced AI-Powered Video Subtitle Generator")
        gr.Markdown("Generate, translate, and add subtitles with text-to-speech audio to your videos.")
    # Main content
    with gr.Tab("Generate Subtitles"):
        gr.Markdown("### Upload a video file to process")
        with gr.Row():
            video_input = gr.Video(label="Upload Video File", scale=2)
            with gr.Column():
                language_dropdown = gr.Dropdown(
                    choices=SUPPORTED_LANGUAGES,
                    label="Source Language",
                    value="Auto Detect",
                )
                # "Auto Detect" is not a valid translation target, so it is
                # replaced by "None" (meaning: do not translate).
                translate_to_dropdown = gr.Dropdown(
                    choices=["None"] + SUPPORTED_LANGUAGES[1:],
                    label="Translate To",
                    value="None",
                )
                tts_checkbox = gr.Checkbox(label="Generate Text-to-Speech Audio")
                subtitles_checkbox = gr.Checkbox(label="Add Subtitles to Video")
        generate_button = gr.Button("Process Video", variant="primary")
        with gr.Row():
            with gr.Column():
                original_subtitle_output = gr.File(label="Original Subtitles (SRT)")
                translated_subtitle_output = gr.File(label="Translated Subtitles (SRT)")
            with gr.Column():
                # Hidden until the matching checkbox is ticked.
                tts_audio_output = gr.Audio(label="Text-to-Speech Audio", visible=False)
                video_output = gr.Video(label="Video with Subtitles", visible=False)
        detected_language_output = gr.Textbox(label="Detected Language")

        # Show/hide outputs based on checkboxes
        # (this combined helper is not wired to any event in this file; the
        # per-checkbox handlers below do the toggling).
        def toggle_outputs(tts, subs):
            return [
                gr.Audio(visible=tts),
                gr.Video(visible=subs),
            ]

        tts_checkbox.change(
            fn=lambda x: gr.Audio(visible=x),
            inputs=tts_checkbox,
            outputs=tts_audio_output,
        )
        subtitles_checkbox.change(
            fn=lambda x: gr.Video(visible=x),
            inputs=subtitles_checkbox,
            outputs=video_output,
        )
        # Link button to function
        generate_button.click(
            process_video,
            inputs=[video_input, language_dropdown, translate_to_dropdown, tts_checkbox, subtitles_checkbox],
            outputs=[original_subtitle_output, translated_subtitle_output, tts_audio_output, video_output, detected_language_output],
        )
# Launch the interface only when run as a script, so importing this module
# (e.g. from a test or another tool) does not start a server.
if __name__ == "__main__":
    demo.launch(share=True)