import os
import copy
import math
import time
import pprint
from tqdm import tqdm, trange
import numpy as np

import torch
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader

from baselines.crossmodal_moment_localization.config import TestOptions
from baselines.crossmodal_moment_localization.model_xml import XML
from baselines.crossmodal_moment_localization.start_end_dataset import \
    start_end_collate, StartEndEvalDataset, prepare_batch_inputs
from baselines.clip_alignment_with_language.inference import \
    get_submission_top_n, post_processing_vcmr_nms, post_processing_svmr_nms
from utils.basic_utils import save_json, load_json
from utils.tensor_utils import find_max_triples_from_upper_triangle_product
from standalone_eval.eval import eval_retrieval

import logging
from ndcg_iou_topk import calculate_ndcg_iou

# Module-level logger used by setup_model()/start_inference(); the basicConfig format is a simple default.
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)


def compute_context_info(model, eval_dataset, opt):
    """Encode every video/subtitle context in the eval corpus; run under torch.no_grad().
    Estimated memory: 2200 (videos) * 100 (frm) * 500 (hsz) * 4 (bytes) * 2 (video/sub) * 2 (layers)
    = 1.76e9 bytes, roughly 1.6 GiB.
    """
    model.eval()
    # Switch the dataset to context (video) mode if it supports mode switching;
    # compute_query2ctx_info() switches it back to "query" mode before the query pass.
    if hasattr(eval_dataset, "set_data_mode"):
        eval_dataset.set_data_mode("context")

    context_dataloader = DataLoader(eval_dataset,
                                    collate_fn=start_end_collate,
                                    batch_size=opt.eval_context_bsz,
                                    num_workers=opt.num_workers,
                                    shuffle=False,
                                    pin_memory=opt.pin_memory)

    metas = []
    video_feat1 = []
    video_feat2 = []
    video_mask = []
    sub_feat1 = []
    sub_feat2 = []
    sub_mask = []
    for idx, batch in tqdm(enumerate(context_dataloader),
                           desc="Computing query2video scores",
                           total=len(context_dataloader)):
        metas.extend(batch[0])
        model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory)

        _video_feat1, _video_feat2, _sub_feat1, _sub_feat2 = model.encode_context(
            model_inputs["video_feat"], model_inputs["video_mask"],
            model_inputs["sub_feat"], model_inputs["sub_mask"])
        if "video" in opt.ctx_mode:
            video_feat1.append(_video_feat1)
            video_feat2.append(_video_feat2)
            video_mask.append(model_inputs["video_mask"])
        if "sub" in opt.ctx_mode:
            sub_feat1.append(_sub_feat1)
            sub_feat2.append(_sub_feat2)
            sub_mask.append(model_inputs["sub_mask"])

    def cat_tensor(tensor_list):
        """Concatenate batches along dim 0, right-padding each batch to the longest sequence length."""
        if len(tensor_list) == 0:
            return None
        else:
            seq_l = [e.shape[1] for e in tensor_list]
            b_sizes = [e.shape[0] for e in tensor_list]
            b_sizes_cumsum = np.cumsum([0] + b_sizes)
            if len(tensor_list[0].shape) == 3:
                hsz = tensor_list[0].shape[2]
                res_tensor = tensor_list[0].new_zeros(sum(b_sizes), max(seq_l), hsz)
            elif len(tensor_list[0].shape) == 2:
                res_tensor = tensor_list[0].new_zeros(sum(b_sizes), max(seq_l))
            else:
                raise ValueError("Only support 2/3 dimensional tensors")
            for i, e in enumerate(tensor_list):
                res_tensor[b_sizes_cumsum[i]:b_sizes_cumsum[i+1], :seq_l[i]] = e
            return res_tensor

    return metas, dict(
        video_feat1=cat_tensor(video_feat1),
        video_feat2=cat_tensor(video_feat2),
        video_mask=cat_tensor(video_mask),
        sub_feat1=cat_tensor(sub_feat1),
        sub_feat2=cat_tensor(sub_feat2),
        sub_mask=cat_tensor(sub_mask),
    )
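

# The dict returned by compute_context_info() holds the whole corpus in padded form:
#   video_feat1/video_feat2: (N_videos, L_max, hsz), zero-padded past each video's own length,
#                            where L_max is the longest context in the corpus (at most max_ctx_l)
#   video_mask:              (N_videos, L_max), typically 1 for real clips and 0 for padding
#   sub_feat1/sub_feat2/sub_mask follow the same layout for the subtitle stream;
# entries are None when the corresponding stream is disabled via opt.ctx_mode.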


def index_if_not_none(input_tensor, indices):
    """Index into input_tensor with indices, passing None through unchanged."""
    if input_tensor is None:
        return input_tensor
    else:
        return input_tensor[indices]


def generate_min_max_length_mask(array_shape, min_l, max_l):
    """The last two dimensions form an upper-triangular matrix with the upper-right corner masked;
    below is the case for a 4x4 input with min_l=1 and max_l=3:
    [[0, 1, 1, 0],
     [0, 0, 1, 1],
     [0, 0, 0, 1],
     [0, 0, 0, 0]]

    Args:
        array_shape: tuple of ints, e.g. np.ndarray.shape; the last two dimensions should be equal
        min_l: int, minimum length of a predicted span
        max_l: int, maximum length of a predicted span

    Returns:
        np.ndarray of shape (1, ..., 1, L, L), broadcastable against arrays of shape array_shape,
        with 1 for valid (st, ed) index pairs and 0 elsewhere.
    """
    single_dims = (1, ) * (len(array_shape) - 2)
    mask_shape = single_dims + array_shape[-2:]
    extra_length_mask_array = np.ones(mask_shape, dtype=np.float32)      # (1, ..., 1, L, L)
    mask_triu = np.triu(extra_length_mask_array, k=min_l)                # 1 where col - row >= min_l
    mask_triu_reversed = 1 - np.triu(extra_length_mask_array, k=max_l)   # 1 where col - row < max_l
    final_prob_mask = mask_triu * mask_triu_reversed                     # 1 where min_l <= col - row < max_l
    return final_prob_mask
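

# Illustrative sketch (not used by the pipeline): reproduces the 4x4 example from the docstring
# above, which corresponds to min_l=1 and max_l=3.
def _demo_min_max_length_mask():
    mask = generate_min_max_length_mask((4, 4), min_l=1, max_l=3)
    # mask == [[0, 1, 1, 0],
    #          [0, 0, 1, 1],
    #          [0, 0, 0, 1],
    #          [0, 0, 0, 0]]
    return mask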


def get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, query_metas, video2idx,
                                  clip_length, min_pred_l, max_pred_l, max_before_nms):
    """
    Args:
        svmr_gt_st_probs: np.ndarray (N_queries, L), start probabilities, value range [0, 1]
        svmr_gt_ed_probs: np.ndarray (N_queries, L), end probabilities, value range [0, 1]
        query_metas: list(dict), one meta dict per query
        video2idx: dict, {vid_name: video_idx}
        clip_length: float, how long each clip is in seconds
        min_pred_l: int, minimum number of clips per predicted span
        max_pred_l: int, maximum number of clips per predicted span
        max_before_nms: int, keep the top-max_before_nms predictions for each query

    Returns:
        list(dict), one entry per query with its ranked moment predictions
    """
    svmr_res = []
    query_vid_names = [e["vid_name"] for e in query_metas]

    # (N_queries, L, L) joint probability of each (st, ed) pair via outer product
    st_ed_prob_product = np.einsum("bm,bn->bmn", svmr_gt_st_probs, svmr_gt_ed_probs)

    # zero out spans shorter than min_pred_l or longer than max_pred_l clips
    valid_prob_mask = generate_min_max_length_mask(st_ed_prob_product.shape, min_l=min_pred_l, max_l=max_pred_l)
    st_ed_prob_product *= valid_prob_mask

    batched_sorted_triples = find_max_triples_from_upper_triangle_product(
        st_ed_prob_product, top_n=max_before_nms, prob_thd=None)
    for i, q_vid_name in tqdm(enumerate(query_vid_names),
                              desc="[SVMR] Loop over queries to generate predictions",
                              total=len(query_vid_names)):
        q_m = query_metas[i]
        video_idx = video2idx[q_vid_name]
        _sorted_triples = batched_sorted_triples[i]
        _sorted_triples[:, 1] += 1  # make the end index exclusive
        _sorted_triples[:, :2] = _sorted_triples[:, :2] * clip_length  # clip indices -> seconds

        # each prediction row is [video_idx, st (seconds), ed (seconds), score]
        cur_ranked_predictions = [[video_idx, ] + row for row in _sorted_triples.tolist()]
        cur_query_pred = dict(
            query_id=q_m["query_id"],
            desc=q_m["desc"],
            predictions=cur_ranked_predictions
        )
        svmr_res.append(cur_query_pred)
    return svmr_res
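

# Illustrative sketch (not called anywhere): how a single query's best (st, ed) span can be read
# off the masked outer product above, assuming 1D start/end probability vectors. The repo helper
# find_max_triples_from_upper_triangle_product performs the batched top-n version of this.
def _demo_best_span_from_probs(st_probs, ed_probs, min_pred_l=2, max_pred_l=16, clip_length=1.5):
    """st_probs, ed_probs: 1D np.ndarray of length L; returns (st_sec, ed_sec, score)."""
    prob_product = np.outer(st_probs, ed_probs)  # (L, L); entry [m, n] scores the span m..n
    mask = generate_min_max_length_mask(prob_product.shape, min_l=min_pred_l, max_l=max_pred_l)
    prob_product = prob_product * mask
    st_idx, ed_idx = np.unravel_index(np.argmax(prob_product), prob_product.shape)
    # same index-to-seconds convention as the function above: end index +1, then scale by clip_length
    return float(st_idx) * clip_length, float(ed_idx + 1) * clip_length, float(prob_product[st_idx, ed_idx])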


def load_external_vr_res2(external_vr_res_path, top_n_vr_videos=5):
    """Return a mapping from query_id to its top retrieved video info."""
    external_vr_res = load_json(external_vr_res_path)
    external_vr_res = get_submission_top_n(external_vr_res, top_n=top_n_vr_videos)["VR"]
    query2video = {e["query_id"]: e["predictions"] for e in external_vr_res}
    return query2video
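

# Note on the external VR file consumed above and in compute_query2ctx_info() below: each entry in
# external_vr_res["VR"] is expected to carry a "query_id" and a "predictions" list where, judging
# from how it is indexed below, prediction[0] is the video index from video2idx and prediction[3]
# is the retrieval score. This layout is inferred from the consuming code, not a documented schema.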


def compute_query2ctx_info(model, eval_dataset, opt, video_metas, ctx_info,
                           max_before_nms=1000, max_n_videos=100, maxtopk=40):
    """Use val set to do evaluation, remember to run with torch.no_grad().
    Estimated size: 20,000 (query) * 500 (hsz) * 4 (bytes) / (1024**2) = 38.15 MB
    max_n_videos: int, use the top max_n_videos videos per query for computing VCMR/VR results
    """
    video2idx = eval_dataset.video2idx

    if opt.external_inference_vr_res_path is not None:
        video_idx2meta_idx = {video2idx[m["vid_name"]]: i for i, m in enumerate(video_metas)}
        external_query2video = \
            load_external_vr_res2(opt.external_inference_vr_res_path, top_n_vr_videos=max_n_videos)

        external_query2video_meta_idx = \
            {k: [video_idx2meta_idx[e[0]] for e in v] for k, v in external_query2video.items()}
    else:
        external_query2video = None
        external_query2video_meta_idx = None

    model.eval()
    eval_dataset.set_data_mode("query")

    query_eval_loader = DataLoader(eval_dataset,
                                   collate_fn=start_end_collate,
                                   batch_size=opt.eval_query_bsz,
                                   num_workers=opt.num_workers,
                                   shuffle=False,
                                   pin_memory=opt.pin_memory)
    n_total_videos = len(video_metas)
    n_total_query = len(eval_dataset)
    bsz = opt.eval_query_bsz

    flat_st_ed_scores_sorted_indices = np.empty((n_total_query, max_before_nms), dtype=int)
    flat_st_ed_sorted_scores = np.zeros((n_total_query, max_before_nms), dtype=np.float32)
    sorted_q2c_indices = np.empty((n_total_query, max_n_videos), dtype=int)
    sorted_q2c_scores = np.empty((n_total_query, max_n_videos), dtype=np.float32)

    query_metas = []
    for idx, batch in tqdm(
            enumerate(query_eval_loader), desc="Computing q embedding", total=len(query_eval_loader)):
        _query_metas = batch[0]
        query_metas.extend(batch[0])
        model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory)

        _query_context_scores, _st_probs, _ed_probs = \
            model.get_pred_from_raw_query(model_inputs["query_feat"], model_inputs["query_mask"],
                                          ctx_info["video_feat1"], ctx_info["video_feat2"],
                                          ctx_info["video_mask"],
                                          ctx_info["sub_feat1"], ctx_info["sub_feat2"],
                                          ctx_info["sub_mask"],
                                          cross=True)

        # sharpen query-to-video scores; a larger q2c_alpha puts more weight on video retrieval
        # in the joint moment score below
        _query_context_scores = torch.exp(opt.q2c_alpha * _query_context_scores)

        # normalize start/end scores into probabilities over clip positions, (n_q, n_videos, max_ctx_l)
        _st_probs = F.softmax(_st_probs, dim=-1)
        _ed_probs = F.softmax(_ed_probs, dim=-1)

        if external_query2video is None:
            # keep the top max_n_videos videos per query by the model's own retrieval scores
            _sorted_q2c_scores, _sorted_q2c_indices = \
                torch.topk(_query_context_scores, max_n_videos, dim=1, largest=True)
        else:
            # use the externally supplied video retrieval results instead
            relevant_video_info = [external_query2video[qm["query_id"]] for qm in _query_metas]
            _sorted_q2c_indices = _query_context_scores.new(
                [[video_idx2meta_idx[sub_e[0]] for sub_e in e] for e in relevant_video_info]).long()
            _sorted_q2c_scores = _query_context_scores.new(
                [[sub_e[3] for sub_e in e] for e in relevant_video_info])
            _sorted_q2c_scores = torch.exp(opt.q2c_alpha * _sorted_q2c_scores)

        sorted_q2c_indices[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_indices.cpu().numpy()
        sorted_q2c_scores[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_scores.cpu().numpy()

        # gather start/end probabilities only for the retained videos, (n_q, max_n_videos, max_ctx_l)
        row_indices = torch.arange(0, len(_st_probs), device=opt.device).unsqueeze(1)
        _st_probs = _st_probs[row_indices, _sorted_q2c_indices]
        _ed_probs = _ed_probs[row_indices, _sorted_q2c_indices]

        # joint score of (video, st, ed): st_prob * video_score * ed_prob, shape (n_q, max_n_videos, L, L)
        _st_ed_scores = torch.einsum("qvm,qv,qvn->qvmn", _st_probs, _sorted_q2c_scores, _ed_probs)
        valid_prob_mask = generate_min_max_length_mask(
            _st_ed_scores.shape, min_l=opt.min_pred_l, max_l=opt.max_pred_l)
        _st_ed_scores *= torch.from_numpy(
            valid_prob_mask).to(_st_ed_scores.device)

        # flatten the (video, st, ed) axes and keep the top max_before_nms scores per query
        _n_q = _st_ed_scores.shape[0]
        _flat_st_ed_scores = _st_ed_scores.reshape(_n_q, -1)
        _flat_st_ed_sorted_scores, _flat_st_ed_scores_sorted_indices = \
            torch.sort(_flat_st_ed_scores, dim=1, descending=True)

        flat_st_ed_sorted_scores[idx * bsz:(idx + 1) * bsz] = \
            _flat_st_ed_sorted_scores[:, :max_before_nms].cpu().numpy()
        flat_st_ed_scores_sorted_indices[idx * bsz:(idx + 1) * bsz] = \
            _flat_st_ed_scores_sorted_indices[:, :max_before_nms].cpu().numpy()

        if opt.debug:
            break

    vcmr_res = {}
    for i, (_flat_st_ed_scores_sorted_indices, _flat_st_ed_sorted_scores) in tqdm(
            enumerate(zip(flat_st_ed_scores_sorted_indices, flat_st_ed_sorted_scores)),
            desc="[VCMR] Loop over queries to generate predictions", total=n_total_query):

        # decode each flat index back into (local video rank, start clip index, end clip index)
        video_meta_indices_local, pred_st_indices, pred_ed_indices = \
            np.unravel_index(_flat_st_ed_scores_sorted_indices,
                             shape=(max_n_videos, opt.max_ctx_l, opt.max_ctx_l))

        # map local video ranks back to indices into video_metas
        video_meta_indices = sorted_q2c_indices[i, video_meta_indices_local]

        pred_st_in_seconds = pred_st_indices.astype(np.float32) * opt.clip_length
        pred_ed_in_seconds = pred_ed_indices.astype(np.float32) * opt.clip_length + opt.clip_length
        cur_vcmr_predictions = []
        for j, (v_meta_idx, v_score) in enumerate(zip(video_meta_indices, _flat_st_ed_sorted_scores)):
            video_idx = video2idx[video_metas[v_meta_idx]["vid_name"]]
            cur_vcmr_predictions.append(
                {
                    "video_name": video_metas[v_meta_idx]["vid_name"],
                    "timestamp": [float(pred_st_in_seconds[j]), float(pred_ed_in_seconds[j])],
                    "model_scores": float(v_score)
                }
            )
        query_id = query_metas[i]["query_id"]
        vcmr_res[query_id] = cur_vcmr_predictions[:maxtopk]
    return vcmr_res
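

# Illustrative sketch (not called anywhere): how a flat index stored in
# flat_st_ed_scores_sorted_indices is decoded back into (video, start clip, end clip),
# mirroring the np.unravel_index call inside compute_query2ctx_info(). clip_length=1.5 is
# only a placeholder default here.
def _demo_decode_flat_index(flat_index, n_videos, max_ctx_l, clip_length=1.5):
    video_meta_idx, st_idx, ed_idx = np.unravel_index(flat_index, (n_videos, max_ctx_l, max_ctx_l))
    st_sec = float(st_idx) * clip_length
    ed_sec = float(ed_idx) * clip_length + clip_length  # end of the ed-th clip
    return video_meta_idx, st_sec, ed_sec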


def get_eval_res(model, eval_dataset, context_data, opt, maxtopk):
    """Compute context and query embeddings, then the VCMR predictions for every query."""
    video_metas, context_info = compute_context_info(model, context_data, opt)
    eval_res = compute_query2ctx_info(model, eval_dataset, opt, video_metas, context_info,
                                      max_before_nms=opt.max_before_nms,
                                      max_n_videos=opt.max_vcmr_video, maxtopk=maxtopk)
    return eval_res


POST_PROCESSING_MMS_FUNC = {
    "SVMR": post_processing_svmr_nms,
    "VCMR": post_processing_vcmr_nms
}


def eval_epoch(model, eval_dataset, context_data, logger, opt, max_after_nms, iou_thds, topks):
    """max_after_nms: always set to 100, since the eval script only evaluates the top-100.
    (Currently unused here; the NDCG evaluation below keeps max(topks) predictions per query.)
    """
    model.eval()
    pred_data = get_eval_res(model, eval_dataset, context_data, opt, max(topks))

    gt_data = eval_dataset.ground_truth
    average_ndcg = calculate_ndcg_iou(gt_data, pred_data, iou_thds, topks)
    return average_ndcg, pred_data


def setup_model(opt):
    """Load model from checkpoint and move it to the specified device."""
    checkpoint = torch.load(opt.ckpt_filepath)
    loaded_model_cfg = checkpoint["model_cfg"]
    loaded_model_cfg["stack_conv_predictor_conv_kernel_sizes"] = -1
    model = XML(loaded_model_cfg)
    model.load_state_dict(checkpoint["model"])
    logger.info("Loaded model saved at epoch {} from checkpoint: {}"
                .format(checkpoint["epoch"], opt.ckpt_filepath))

    if opt.device.type == "cuda":
        logger.info("CUDA enabled.")
        model.to(opt.device)
        if len(opt.device_ids) > 1:
            logger.info("Use multi GPU: {}".format(opt.device_ids))
            model = torch.nn.DataParallel(model, device_ids=opt.device_ids)
    return model


def start_inference():
    logger.info("Setup config, data and model...")
    opt = TestOptions().parse()
    cudnn.benchmark = False
    cudnn.deterministic = True

    assert opt.eval_path is not None
    eval_dataset = StartEndEvalDataset(
        dset_name=opt.dset_name,
        eval_split_name=opt.eval_split_name,
        data_path=opt.eval_path,
        desc_bert_path_or_handler=opt.desc_bert_path,
        sub_bert_path_or_handler=opt.sub_bert_path,
        max_desc_len=opt.max_desc_l,
        max_ctx_len=opt.max_ctx_l,
        corpus_path=opt.corpus_path,
        vid_feat_path_or_handler=opt.vid_feat_path,
        clip_length=opt.clip_length,
        ctx_mode=opt.ctx_mode,
        data_mode="query",
        h5driver=opt.h5driver,
        data_ratio=opt.data_ratio,
        normalize_vfeat=not opt.no_norm_vfeat,
        normalize_tfeat=not opt.no_norm_tfeat
    )

    model = setup_model(opt)
    save_submission_filename = "inference_{}_{}_{}_predictions_{}.json".format(
        opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks))
    logger.info("Starting inference...")
    with torch.no_grad():
        # eval_epoch() above expects (model, eval_dataset, context_data, logger, opt,
        # max_after_nms, iou_thds, topks); here the eval dataset also serves as the context
        # (corpus) data. The iou_thds/topks defaults below are assumptions; replace them with
        # the corresponding fields of your config if they exist there.
        iou_thds = getattr(opt, "iou_threshold", (0.3, 0.5, 0.7))
        topks = getattr(opt, "ndcg_topk", (10, 20, 40))
        average_ndcg, pred_data = eval_epoch(model, eval_dataset, eval_dataset, logger, opt,
                                             max_after_nms=100, iou_thds=iou_thds, topks=topks)
    save_json(pred_data, save_submission_filename)
    logger.info("Saved predictions to {}".format(save_submission_filename))
    logger.info("average_ndcg \n{}".format(pprint.pformat(average_ndcg, indent=4)))


if __name__ == '__main__':
    start_inference()