import gradio as gr
import librosa
import numpy as np
import moviepy.editor as mpy
import torch

from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline

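# Demo limits and video layout: clip length cap, output resolution, frame rate,
# and the margins/line height used when drawing the subtitle text.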
max_duration = 60
fps = 25
video_width = 640
video_height = 480
margin_left = 20
margin_right = 20
margin_top = 20
line_height = 44

background_image = Image.open("background.png")
font = ImageFont.truetype("Lato-Regular.ttf", 40)
text_color = (255, 200, 200)
highlight_color = (255, 255, 255)

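# Load the Whisper checkpoint: fp16 on GPU when one is available, otherwise
# fall back to the default CPU pipeline.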
checkpoint = "openai/whisper-large-v2"

if torch.cuda.is_available() and torch.cuda.device_count() > 0:
    from transformers import (
        AutomaticSpeechRecognitionPipeline,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )
    model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to("cuda").half()
    processor = WhisperProcessor.from_pretrained(checkpoint)
    pipe = AutomaticSpeechRecognitionPipeline(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        batch_size=8,
        torch_dtype=torch.float16,
        device="cuda:0",
    )
else:
    pipe = pipeline(model=checkpoint)

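# Cross-attention heads used to align tokens to timestamps; these values are
# the alignment heads for the large-v2 checkpoint.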
pipe.model.generation_config.alignment_heads = [[10, 12], [13, 17], [16, 11], [16, 12], [16, 13], [17, 15], [17, 16], [18, 4], [18, 11], [18, 19], [19, 11], [21, 2], [21, 3], [22, 3], [22, 9], [22, 12], [23, 5], [23, 7], [23, 13], [25, 5], [26, 1], [26, 12], [27, 15]]

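# Mutable state shared between predict() and make_frame().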
chunks = []

start_chunk = 0
last_draws = []
last_image = None

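# Frame callback for MoviePy: returns the frame (as an RGB NumPy array) to show
# at time t, with the word being spoken at t highlighted.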
def make_frame(t):
    global chunks, start_chunk, last_draws, last_image

    image = background_image.copy()
    draw = ImageDraw.Draw(image)

    space_length = draw.textlength(" ", font)
    x = margin_left
    y = margin_top

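    # Lay out the words left to right, wrapping at the right margin; once the
    # page is full, later frames start their layout from the current word.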
    draws = []
    for i in range(start_chunk, len(chunks)):
        chunk = chunks[i]
        chunk_start = chunk["timestamp"][0]
        chunk_end = chunk["timestamp"][1]
        if chunk_start > t: break
        if chunk_end is None: chunk_end = max_duration

        word = chunk["text"]
        word_length = draw.textlength(word + " ", font) - space_length

        if x + word_length >= video_width - margin_right:
            x = margin_left
            y += line_height

            # Page is full: remember where to restart and stop laying out words.
            if y >= margin_top + line_height * 7:
                start_chunk = i
                break

        highlight = (chunk_start <= t < chunk_end)
        draws.append([x, y, word, word_length, highlight])

        x += word_length + space_length

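    # Redraw only when the visible words (or the highlight) changed; otherwise
    # reuse the previously rendered frame.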
    if draws != last_draws:
        for x, y, word, word_length, highlight in draws:
            if highlight:
                color = highlight_color
                draw.rectangle([x, y + line_height, x + word_length, y + line_height + 4], fill=color)
            else:
                color = text_color

            draw.text((x, y), word, fill=color, font=font)

        last_image = np.array(image)
        last_draws = draws

    return last_image

def predict(audio_path, lang):
    global chunks, start_chunk, last_draws, last_image

    start_chunk = 0
    last_draws = []
    last_image = None

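    # Load the audio as mono and keep only the first max_duration seconds.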
    audio_data, sr = librosa.load(audio_path, mono=True)
    duration = librosa.get_duration(y=audio_data, sr=sr)
    duration = min(max_duration, duration)
    audio_data = audio_data[:int(duration * sr)]

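    # Resample to the rate the feature extractor expects (16 kHz for Whisper),
    # then transcribe with word-level timestamps.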
    audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)

    pipe.model.config.forced_decoder_ids = (
        pipe.tokenizer.get_decoder_prompt_ids(
            language=lang,
            task="transcribe",
        )
    )
    output = pipe(
        audio_inputs,
        chunk_length_s=30,
        stride_length_s=[4, 2],
        return_timestamps="word",
        generate_kwargs={
            "penalty_alpha": 0.6,
            "top_k": 5,
        },
    )
    chunks = output["chunks"]

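    # Render the video: MoviePy calls make_frame once per output frame, then the
    # original audio is muxed back in.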
    clip = mpy.VideoClip(make_frame, duration=duration)
    audio_clip = mpy.AudioFileClip(audio_path).set_duration(duration)
    clip = clip.set_audio(audio_clip)
    clip.write_videofile("my_video.mp4", fps=fps, codec="libx264", audio_codec="aac")
    return "my_video.mp4"

title = "Word-level timestamps with Whisper"

description = """
This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted. It can even do music lyrics!

This demo uses the <b>openai/whisper-large-v2</b> checkpoint.

Since it's only a demo, the output is limited to the first 60 seconds of audio.
To use this on longer audio, <a href="https://huggingface.co/spaces/Matthijs/whisper_word_timestamps/settings?duplicate=true">duplicate the space</a>
and in <b>app.py</b> change the value of `max_duration`.
"""
article = """
<div style='margin:20px auto;'>

<p>Credits:</p>

<ul>
<li>Shakespeare's "Henry V" speech from <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)</li>
<li>"Here's to the Crazy Ones" speech by Steve Jobs</li>
<li>"Stupid People" comedy routine by Bill Engvall</li>
<li>"BeOS, It's The OS" song by The Cotton Squares</li>
<li>Lato font by Łukasz Dziedzic (licensed under Open Font License)</li>
<li>Whisper model by OpenAI</li>
</ul>

</div>
"""

examples = [
    ["examples/steve_jobs_crazy_ones.mp3", "en"],
    ["examples/henry5.wav", "en"],
    ["examples/stupid_people.mp3", "en"],
    ["examples/beos_song.mp3", "en"],
]

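# Gradio UI: audio upload and a language dropdown in, the rendered video out.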
gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(label="Upload Audio", source="upload", type="filepath"),
        gr.Dropdown(
            ["en", "de", "it", "fr", "zh"], label="Lang", info="Select a language!", max_choices=1
        ),
    ],
    outputs=[
        gr.Video(label="Output Video"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()