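"""End-to-end SHuBERT inference script: read a video, extract MediaPipe holistic
landmarks, crop the hand and face regions, compute DINOv2 embeddings and pose
features, and decode an English translation with a ByT5-based SLT model."""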
import os
import json
import subprocess

import torch
import torch.nn as nn
import numpy as np
import decord
import cv2

from kpe_mediapipe import video_holistic
from crop_hands import HandExtractor
from crop_face import FaceExtractor
from dinov2_features import extract_embeddings_from_frames
from body_features import process_pose_landmarks
# from shubert import SignHubertModel, SignHubertConfig
from inference import test

class SHuBERTProcessor:
    def __init__(self, config):
        self.config = config

    def process_video(self, video_path):
        # Optional pre-processing (disabled): re-encode the input to 15 fps with ffmpeg.
        # output_file = f"{output_path}/{os.path.basename(video_path)}"
        # cmd = [
        #     'ffmpeg',
        #     '-i', video_path,
        #     '-filter:v', 'fps=15',
        #     '-c:v', 'libx264',
        #     '-preset', 'medium',  # Balance between speed and quality
        #     '-crf', '23',         # Quality level (lower is better)
        #     '-y',                 # Overwrite output file if it exists
        #     output_file
        # ]
        # try:
        #     subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        #     print(f"Saved to {output_file} at 15 fps")
        # except subprocess.CalledProcessError as e:
        #     print(f"Error re-encoding video {video_path}: {e}")
        # Step 1: Read the video frames. stride=1 keeps the native frame rate;
        # the commented lines show how to subsample toward a target FPS instead.
        signer_video = decord.VideoReader(video_path)
        signer_video_fps = signer_video.get_avg_fps()
        # target_fps = 12
        # stride = max(1, int(round(signer_video_fps / target_fps)))
        stride = 1
        index_list = list(range(0, len(signer_video), stride))
        signer_video = signer_video.get_batch(index_list)
        signer_video = signer_video.asnumpy()
        # Step 2: Extract pose using kpe_mediapipe
        landmarks = video_holistic(
            video_input=signer_video,
            face_model_path=self.config['mediapipe_face_model_path'],
            hand_model_path=self.config['mediapipe_hands_model_path'],
        )

        # Step 3: Extract stream features
        hand_extractor = HandExtractor()
        left_hand_frames, right_hand_frames = hand_extractor.extract_hand_frames(signer_video, landmarks)
        left_hand_embeddings = extract_embeddings_from_frames(left_hand_frames, self.config['dino_hands_model_path'])
        right_hand_embeddings = extract_embeddings_from_frames(right_hand_frames, self.config['dino_hands_model_path'])
        del left_hand_frames, right_hand_frames

        face_extractor = FaceExtractor()
        face_frames = face_extractor.extract_face_frames(signer_video, landmarks)
        face_embeddings = extract_embeddings_from_frames(face_frames, self.config['dino_face_model_path'])
        del face_frames, signer_video

        pose_embeddings = process_pose_landmarks(landmarks)
        del landmarks
        # Step 4: Translate the stream features into English text
        output_text = test(face_embeddings,
                           left_hand_embeddings,
                           right_hand_embeddings,
                           pose_embeddings,
                           self.config['slt_model_config'],
                           self.config['slt_model_checkpoint'],
                           self.config['slt_tokenizer_checkpoint'],
                           self.config['temp_dir'])
        return output_text


if __name__ == "__main__":
    config = {
        'yolov8_model_path': '/share/data/pals/shester/inference/models/yolov8n.pt',
        'dino_face_model_path': '/share/data/pals/shester/inference/models/dinov2face.pth',
        'dino_hands_model_path': '/share/data/pals/shester/inference/models/dinov2hand.pth',
        'mediapipe_face_model_path': '/share/data/pals/shester/inference/models/face_landmarker_v2_with_blendshapes.task',
        'mediapipe_hands_model_path': '/share/data/pals/shester/inference/models/hand_landmarker.task',
        'shubert_model_path': '/share/data/pals/shester/inference/models/checkpoint_836_400000.pt',
        'temp_dir': '/share/data/pals/shester/inference',
        'slt_model_config': '/share/data/pals/shester/inference/models/byt5_base/config.json',
        'slt_model_checkpoint': '/share/data/pals/shester/inference/models/checkpoint-11625',
        'slt_tokenizer_checkpoint': '/share/data/pals/shester/inference/models/byt5_base',
    }

    # input_clip = "/share/data/pals/shester/datasets/openasl/clips_bbox/J-0KHhPS_m4.029676-029733.mp4"
    # input_clip = "/share/data/pals/shester/inference/recordings/sabrin30fps.mp4"
    input_clip = "/share/data/pals/shester/inference/recordings/sample_sabrina.mp4"

    processor = SHuBERTProcessor(config)
    output_text = processor.process_video(input_clip)
    print(f"The English translation is: {output_text}")

# /home-nfs/shesterg/.cache/torch/hub/facebookresearch_dinov2_main/dinov2/layers/attention.py
# /home-nfs/shesterg/.cache/torch/hub/facebookresearch_dinov2_main/dinov2/layers/block.py