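# NOTE(review): the next three lines look like repository-page header text
# (avatar caption, commit message, short commit hash) rather than code; they
# are kept here as comments so the module can be parsed.
# ShesterG's picture
# updated fps
# 8ff74f5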
import os
import torch
import numpy as np
import decord
import torch.nn as nn
import json
import cv2
from kpe_mediapipe import video_holistic
from crop_hands import HandExtractor
from crop_face import FaceExtractor
from dinov2_features import extract_embeddings_from_frames
from body_features import process_pose_landmarks
# from shubert import SignHubertModel, SignHubertConfig
from inference import test
import subprocess
class SHuBERTProcessor:
    """Run the video-to-text pipeline on a single clip.

    The pipeline decodes the video, extracts landmarks, crops hand and face
    regions, embeds each stream, and passes the per-stream features to the
    ``test`` inference routine.

    ``config`` is a dict-like mapping. :meth:`process_video` reads these keys:

    * ``mediapipe_face_model_path``, ``mediapipe_hands_model_path``
    * ``dino_hands_model_path``, ``dino_face_model_path``
    * ``slt_model_config``, ``slt_model_checkpoint``,
      ``slt_tokenizer_checkpoint``
    * ``temp_dir``
    * ``target_fps`` (optional) -- when present and not ``None``, frames are
      subsampled with a fixed stride so the effective rate is close to this
      value; when absent, every decoded frame is used.
    """

    def __init__(self, config):
        """Store ``config``; nothing is loaded at construction time."""
        self.config = config

    @staticmethod
    def _frame_stride(source_fps, target_fps):
        """Return the frame step that brings ``source_fps`` near ``target_fps``.

        Returns 1 (keep every frame) when ``target_fps`` is ``None``, when the
        source rate is not a positive number, or when the source is already
        at or below the target rate.

        Raises:
            ValueError: if ``target_fps`` is given but not positive.
        """
        if target_fps is None:
            return 1
        if target_fps <= 0:
            raise ValueError(f"target_fps must be positive, got {target_fps!r}")
        # ``not x > 0`` also catches NaN, which some containers report.
        if not source_fps > 0:
            return 1
        return max(1, int(round(source_fps / target_fps)))

    def process_video(self, video_path):
        """Process the video at ``video_path`` and return the inference output.

        Args:
            video_path: Path of the video file, passed to
                ``decord.VideoReader``.

        Returns:
            Whatever ``test`` returns for the extracted features; the script
            entry point of this file prints it as the English translation.
        """
        config = self.config

        # Step 1: decode the clip, optionally subsampling frames.
        reader = decord.VideoReader(video_path)
        stride = self._frame_stride(reader.get_avg_fps(), config.get('target_fps'))
        frame_indices = list(range(0, len(reader), stride))
        frames = reader.get_batch(frame_indices).asnumpy()
        del reader  # only the decoded frames are needed from here on

        # Step 2: extract pose landmarks using kpe_mediapipe.
        landmarks = video_holistic(
            video_input=frames,
            face_model_path=config['mediapipe_face_model_path'],
            hand_model_path=config['mediapipe_hands_model_path'],
        )

        # Step 3: extract per-stream features. Intermediate crops are released
        # as soon as they have been embedded to keep peak memory down.
        left_crops, right_crops = HandExtractor().extract_hand_frames(frames, landmarks)
        hands_model_path = config['dino_hands_model_path']
        left_hand_embeddings = extract_embeddings_from_frames(left_crops, hands_model_path)
        right_hand_embeddings = extract_embeddings_from_frames(right_crops, hands_model_path)
        del left_crops, right_crops

        face_crops = FaceExtractor().extract_face_frames(frames, landmarks)
        face_embeddings = extract_embeddings_from_frames(
            face_crops, config['dino_face_model_path']
        )
        del face_crops, frames

        pose_embeddings = process_pose_landmarks(landmarks)
        del landmarks

        # Step 4: run inference on the four feature streams.
        return test(
            face_embeddings,
            left_hand_embeddings,
            right_hand_embeddings,
            pose_embeddings,
            config['slt_model_config'],
            config['slt_model_checkpoint'],
            config['slt_tokenizer_checkpoint'],
            config['temp_dir'],
        )
if __name__ == "__main__":
    # Clip to run through the pipeline; earlier choices are kept for reference.
    # input_clip = "/share/data/pals/shester/datasets/openasl/clips_bbox/J-0KHhPS_m4.029676-029733.mp4"
    # input_clip = "/share/data/pals/shester/inference/recordings/sabrin30fps.mp4"
    input_clip = "/share/data/pals/shester/inference/recordings/sample_sabrina.mp4"

    # Paths to model files and working directories on the shared filesystem.
    # NOTE(review): 'yolov8_model_path' and 'shubert_model_path' are not read
    # by SHuBERTProcessor.process_video in this file -- confirm whether other
    # code still needs them.
    config = dict(
        yolov8_model_path='/share/data/pals/shester/inference/models/yolov8n.pt',
        dino_face_model_path='/share/data/pals/shester/inference/models/dinov2face.pth',
        dino_hands_model_path='/share/data/pals/shester/inference/models/dinov2hand.pth',
        mediapipe_face_model_path='/share/data/pals/shester/inference/models/face_landmarker_v2_with_blendshapes.task',
        mediapipe_hands_model_path='/share/data/pals/shester/inference/models/hand_landmarker.task',
        shubert_model_path='/share/data/pals/shester/inference/models/checkpoint_836_400000.pt',
        temp_dir='/share/data/pals/shester/inference',
        slt_model_config='/share/data/pals/shester/inference/models/byt5_base/config.json',
        slt_model_checkpoint='/share/data/pals/shester/inference/models/checkpoint-11625',
        slt_tokenizer_checkpoint='/share/data/pals/shester/inference/models/byt5_base',
    )

    # Run the pipeline on the chosen clip and report the result.
    processor = SHuBERTProcessor(config)
    output_text = processor.process_video(input_clip)
    print(f"The English translation is: {output_text}")
# /home-nfs/shesterg/.cache/torch/hub/facebookresearch_dinov2_main/dinov2/layers/attention.py
# /home-nfs/shesterg/.cache/torch/hub/facebookresearch_dinov2_main/dinov2/layers/block.py