diff --git a/General-Bench-Closeset/.gitkeep b/General-Bench-Closeset/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/General-Bench-Openset/.gitkeep b/General-Bench-Openset/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/README.md b/README.md index 8bcdcfb6fe51209003d5ac8026b871c6f49a5c03..abc866bef82dcebabe30966aef0905a967d663d1 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +<<<<<<< HEAD --- title: README emoji: 🌍 @@ -131,3 +132,82 @@ If you find our benchmark useful in your research, please kindly consider citing ``` +======= +# GenBench Scoring System - User Guide + +This system is used to evaluate large models on the General-Bench multimodal task suite; it performs prediction, per-task scoring, and final score computation. + +## Environment Setup + +- Python 3.9 or later +- It is recommended to install the dependencies in advance (e.g., pandas, numpy, openpyxl) +- For Video Generation evaluation, install the dependencies following the steps in video_generation_evaluation/README.md +- For Video Comprehension evaluation, install the dependencies following the README.md of [sa2va](https://github.com/magic-research/Sa2VA). + +## Dataset Download + +- **Open Set (public dataset)**: download all the data from [HuggingFace General-Bench-Openset](https://huggingface.co/datasets/General-Level/General-Bench-Openset), extract it, and place it in the `General-Bench-Openset/` directory. +- **Close Set (private dataset)**: download all the data from [HuggingFace General-Bench-Closeset](https://huggingface.co/datasets/General-Level/General-Bench-Closeset), extract it, and place it in the `General-Bench-Closeset/` directory. + +## One-Click Run + +Run the main script `run.sh` directly to complete the whole pipeline: + +```bash +bash run.sh +``` + +This command performs, in order: +1. Generate the prediction results for each modality +2. Compute the score of each task +3. Compute the final Level score + +## Step-by-Step Run (Optional) + +To run only some of the steps, use the `--step` argument: + +- Run only step 1 (generate predictions): + ```bash + bash run.sh --step 1 + ``` +- Run only steps 1 and 2: + ```bash + bash run.sh --step 12 + ``` +- Run only steps 2 and 3: + ```bash + bash run.sh --step 23 + ``` +- Without the argument, all steps are executed by default (equivalent to `--step 123`) + +- Step 1: generate prediction.json, saved in the same directory as each dataset's annotation.json +- Step 2: compute the score of each task, saved to outcome/{model_name}_result.xlsx +- Step 3: compute the Level score of the evaluated models + +> **Note:** +> - When using the **Close Set (private dataset)**, run only step 1 (i.e., `bash run.sh --step 1`) and submit the generated prediction.json to the system. +> - When using the **Open Set (public dataset)**, run step 1, step 2, and step 3 in order (i.e., `bash run.sh --step 123`) to complete the full evaluation. + +## Viewing Results + +- Predictions (prediction.json) are written to each task's dataset folder, alongside annotation.json. +- Scoring results (e.g., Qwen2.5-7B-Instruct_result.xlsx) are written to the outcome/ directory. +- The final Level score is printed directly in the terminal. + +## Directory Layout + +- `General-Bench-Openset/`: public dataset directory +- `General-Bench-Closeset/`: private dataset directory +- `outcome/`: output directory +- `references/`: reference template directory +- `run.sh`: main run script (users are recommended to use only this script) + +## FAQ + +- If a dependency is missing, install the corresponding Python package indicated by the error message. +- To customize the model or data paths, edit the corresponding variables in the `run.sh` script. + +--- + +For further help, contact the system maintainer or consult the detailed developer documentation. +>>>>>>> 6f59817 (submit NLP Video Audio)
diff --git a/README_Evaluate.md b/README_Evaluate.md new file mode 100644 index 0000000000000000000000000000000000000000..f2c87294a4c4672091f44f4db24e59a2acbc888c --- /dev/null +++ b/README_Evaluate.md @@ -0,0 +1,77 @@ +# GenBench Scoring System - User Guide + +This system is used to evaluate large models on the General-Bench multimodal task suite; it performs prediction, per-task scoring, and final score computation. + +## Environment Setup + +- Python 3.9 or later +- It is recommended to install the dependencies in advance (e.g., pandas, numpy, openpyxl) +- For Video Generation evaluation, install the dependencies following the steps in video_generation_evaluation/README.md +- For Video Comprehension evaluation, install the dependencies following the README.md of [sa2va](https://github.com/magic-research/Sa2VA). + +## Dataset Download + +- **Open Set (public dataset)**: download all the data from [HuggingFace General-Bench-Openset](https://huggingface.co/datasets/General-Level/General-Bench-Openset), extract it, and place it in the `General-Bench-Openset/` directory. +- **Close Set (private dataset)**: download all the data from [HuggingFace General-Bench-Closeset](https://huggingface.co/datasets/General-Level/General-Bench-Closeset), extract it, and place it in the `General-Bench-Closeset/` directory. + +## One-Click Run + +Run the main script `run.sh` directly to complete the whole pipeline: + +```bash
+bash run.sh +``` + +This command performs, in order: +1. Generate the prediction results for each modality +2. Compute the score of each task +3. Compute the final Level score + +## Step-by-Step Run (Optional) + +To run only some of the steps, use the `--step` argument: + +- Run only step 1 (generate predictions): + ```bash + bash run.sh --step 1 + ``` +- Run only steps 1 and 2: + ```bash + bash run.sh --step 12 + ``` +- Run only steps 2 and 3: + ```bash + bash run.sh --step 23 + ``` +- Without the argument, all steps are executed by default (equivalent to `--step 123`) + +- Step 1: generate prediction.json, saved in the same directory as each dataset's annotation.json +- Step 2: compute the score of each task, saved to outcome/{model_name}_result.xlsx +- Step 3: compute the Level score of the evaluated models + +> **Note:** +> - When using the **Close Set (private dataset)**, run only step 1 (i.e., `bash run.sh --step 1`) and submit the generated prediction.json to the system. +> - When using the **Open Set (public dataset)**, run step 1, step 2, and step 3 in order (i.e., `bash run.sh --step 123`) to complete the full evaluation. + +## Viewing Results + +- Predictions (prediction.json) are written to each task's dataset folder, alongside annotation.json. +- Scoring results (e.g., Qwen2.5-7B-Instruct_result.xlsx) are written to the outcome/ directory. +- The final Level score is printed directly in the terminal. + +## Directory Layout + +- `General-Bench-Openset/`: public dataset directory +- `General-Bench-Closeset/`: private dataset directory +- `outcome/`: output directory +- `references/`: reference template directory +- `run.sh`: main run script (users are recommended to use only this script) + +## FAQ + +- If a dependency is missing, install the corresponding Python package indicated by the error message. +- To customize the model or data paths, edit the corresponding variables in the `run.sh` script. + +--- + +For further help, contact the system maintainer or consult the detailed developer documentation. \ No newline at end of file
diff --git a/outcome/Qwen2.5-7B-Instruct_result.xlsx b/outcome/Qwen2.5-7B-Instruct_result.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..3fd70bf5bcd4f09782144608d8c70ebe05324eac Binary files /dev/null and b/outcome/Qwen2.5-7B-Instruct_result.xlsx differ
diff --git a/outcome/emu2-32b_result.xlsx b/outcome/emu2-32b_result.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..f64513d78af54daaf01d732e1d1625e3681130a7 Binary files /dev/null and b/outcome/emu2-32b_result.xlsx differ
diff --git a/outcome/test_result.xlsx b/outcome/test_result.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..89ffac0c50b6807bf9a3cabcd223ca2053fe7e4d Binary files /dev/null and b/outcome/test_result.xlsx differ
diff --git a/predictors/audio_predict_comprehension.py b/predictors/audio_predict_comprehension.py new file mode 100644 index 0000000000000000000000000000000000000000..1dbadb6804c2bf431356f2cc1edaed47124bdd28 --- /dev/null +++ b/predictors/audio_predict_comprehension.py @@ -0,0 +1,1252 @@ +from email.mime import audio +import json +import os +from pandas import read_json +from regex import B, D +import tqdm +from typing import List, Dict, Any +import nltk +from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction +from dataclasses import dataclass +from abc import ABC, abstractmethod +from rouge_score import rouge_scorer +import math +import time +from urllib.request import urlopen +import librosa +from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + + +def read_json(file_path: str) -> Dict[str, Any]: + with open(file_path, "r") as f: + data = json.load(f) + return data + + +def exact_match_accuracy(predictions: List[str], references: List[str]) -> float: + correct = 0 + for pred, ref in zip(predictions, references): + if isinstance(ref, str): + ref = [ref] + if isinstance(ref, int): + ref = [ref] + is_match_this_turn = False + for r in ref: + if pred.strip() == r.strip(): + is_match_this_turn = True + if is_match_this_turn: + correct += 1 + return correct / len(predictions) if predictions else 0.0 + + +def blur_match_accuracy(predictions: List[str], references: List[str]) -> float: + correct = 0 + for pred, ref in zip(predictions, references): + # if isinstance(ref, int): + # if == ref: + if str(ref) in
str(pred).strip().lower(): + correct += 1 + return correct / len(predictions) if predictions else 0.0 + + +def calculate_f1(predictions: List[str], references: List[str]) -> float: + def compute_f1(pred: str, ref: str) -> float: + pred_tokens = pred.strip().split() + ref_tokens = ref.strip().split() + + common_tokens = set(pred_tokens) & set(ref_tokens) + num_common = len(common_tokens) + + if num_common == 0: + return 0.0 + + precision = num_common / len(pred_tokens) + recall = num_common / len(ref_tokens) + + return 2 * precision * recall / (precision + recall) + + total_f1 = 0.0 + for pred, ref in zip(predictions, references): + if isinstance(ref, str): + ref = [ref] + max_f1 = 0.0 + for r in ref: + max_f1 = max(compute_f1(pred, r), max_f1) + total_f1 += max_f1 + + return total_f1 / len(predictions) if predictions else 0.0 + + +def rouge_evaluation(predictions: List[str], references: List[str]) -> Dict[str, float]: + scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True) + rouge1_scores, rouge2_scores, rougel_scores = [], [], [] + for pred, ref in zip(predictions, references): + if isinstance(ref, str): + ref = [ref] + rouge1, rouge2, rougeL = 0, 0, 0 + for r in ref: + scores = scorer.score(r, pred) + rouge1 = max(scores['rouge1'].fmeasure, rouge1) + rouge2 = max(scores['rouge2'].fmeasure, rouge2) + rougeL = max(scores['rougeL'].fmeasure, rougeL) + rouge1_scores.append(rouge1) + rouge2_scores.append(rouge2) + rougel_scores.append(rougeL) + return { + 'rouge1': sum(rouge1_scores) / len(rouge1_scores), + 'rouge2': sum(rouge2_scores) / len(rouge2_scores), + 'rougeL': sum(rougel_scores) / len(rougel_scores), + } + + +def bleu_evaluation(predictions: List[str], references: List[str]) -> Dict[str, float]: + smoothie = SmoothingFunction().method4 + bleu1_scores, bleu2_scores, bleu3_scores, bleu4_scores = [], [], [], [] + + for pred, ref in zip(predictions, references): + hypothesis = nltk.word_tokenize(pred) + if isinstance(ref, str): + ref = [ref] + bleu1, bleu2, bleu3, bleu4 = 0, 0, 0, 0 + for r in ref: + reference = [nltk.word_tokenize(r)] + bleu1 = max(sentence_bleu(reference, hypothesis, weights=(1, 0, 0, 0), smoothing_function=smoothie), bleu1) + bleu2 = max(sentence_bleu(reference, hypothesis, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie), bleu2) + bleu3 = max(sentence_bleu(reference, hypothesis, weights=(1/3, 1/3, 1/3, 0), smoothing_function=smoothie), bleu3) + bleu4 = max(sentence_bleu(reference, hypothesis, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie), bleu4) + + bleu1_scores.append(bleu1) + bleu2_scores.append(bleu2) + bleu3_scores.append(bleu3) + bleu4_scores.append(bleu4) + + return { + 'bleu1': sum(bleu1_scores) / len(bleu1_scores) if bleu1_scores else 0.0, + 'bleu2': sum(bleu2_scores) / len(bleu2_scores) if bleu2_scores else 0.0, + 'bleu3': sum(bleu3_scores) / len(bleu3_scores) if bleu3_scores else 0.0, + 'bleu4': sum(bleu4_scores) / len(bleu4_scores) if bleu4_scores else 0.0, + } + + +def mean_absolute_error(predictions: List[float], references: List[float]) -> float: + if not predictions: + return 0.0 + error_sum = 0.0 + for p, r in zip(predictions, references): + error_sum += abs(p - r) + return error_sum / len(predictions) + + +def mean_squared_error(predictions: List[float], references: List[float]) -> float: + if not predictions: + return 0.0 + error_sum = 0.0 + for p, r in zip(predictions, references): + error_sum += (p - r) ** 2 + return error_sum / len(predictions) + + +def root_mean_squared_error(predictions: 
List[float], references: List[float]) -> float: + return math.sqrt(mean_squared_error(predictions, references)) + + +def post_process_output(output: str) -> str: + cnt = 0 + for d in output: + if d['gt'] in d['response'].strip().lower(): + cnt += 1 + acc = round(cnt / len(output), 4) + print(f"Accuracy: {acc}") + return acc + + +def evaluation_accuracy(predictions: List[str]) -> Dict[str, float]: + correct = 0 + for pred in predictions: + if pred == '1': + correct += 1 + return correct / len(predictions) if predictions else 0.0 + + +class AudioComprehensionModel: + def __init__(self, model_name: str): + self.model_name = model_name + self.load_model() + + def load_model(self): + if 'qwen-audio-chat' in self.model_name.lower(): + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map='cuda', trust_remote_code=True).eval() + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + self.tokenizer.padding_side = 'left' + self.tokenizer.pad_token_id = self.tokenizer.eod_id + elif 'qwen2' in self.model_name.lower(): + self.processor = AutoProcessor.from_pretrained(self.model_name) + print(self.processor.chat_template) + self.model = Qwen2AudioForConditionalGeneration.from_pretrained(self.model_name, device_map="auto").eval() + + elif 'new_model_name' in self.model_name.lower(): + # support to load self-build models here + pass + + else: + raise ValueError(f"Unsupported model name: {self.model_name}") + + def generate(self, prompt: str, max_new_tokens=256, audio_path: str=None) -> str: + + if "qwen-audio-chat" in self.model_name.lower(): + query = self.tokenizer.from_list_format([ + {'audio': audio_path}, # Either a local path or an url + {'text': prompt} # The query, + ]) + response, history = self.model.chat(self.tokenizer, query=query, history=None) + return response + + elif "qwen2" in self.model_name.lower(): + conversation = [ + {'role': 'system', 'content': 'You are a helpful assistant.'}, + {"role": "user", "content": [ + {"type": "audio", "audio": audio_path}, + {"type": "text", "text": prompt}, + ]}, + ] + text = self.processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + audios = [] + for message in conversation: + if isinstance(message["content"], list): + for ele in message["content"]: + if ele["type"] == "audio": + audios.append( + librosa.load( + ele['audio'], + sr=self.processor.feature_extractor.sampling_rate)[0] + ) + # print(text) + inputs = self.processor(text=text, audios=audios, return_tensors="pt", padding=True) + inputs.input_ids = inputs.input_ids.to("cuda") + inputs = inputs.to("cuda") + # print(inputs) + # exit(0) + generate_ids = self.model.generate(**inputs, max_length=300) + generate_ids = generate_ids[:, inputs.input_ids.size(1):] + + response = self.processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + return response + + elif "new" in self.model_name.lower(): + # support to generate response based on self-build models here + pass + + else: + raise ValueError(f"Unsupported model name: {self.model_name}") + + + +@dataclass +class Instance: + input: Dict[str, Any] + output: Dict[str, Any] + id: str + + +class BaseTask(ABC): + def __init__(self, task_data: Dict[str, Any], model: AudioComprehensionModel, audio_dir: str = None, output_dir: str = None, task_name: str = None): + self.task_data = read_json(task_data) + self.model = model + self.audio_dir = audio_dir # should include the audios files + self.data = 
self._parse_data(self.task_data) + self.choice_candidate = self._get_choice_candidate(self.task_data) + self.task_name = os.path.dirname(task_data).split("/")[-1] if task_name is None else task_name + self.output_dir = output_dir + os.makedirs(self.output_dir, exist_ok=True) if self.output_dir else None + + self.references = [] + self.predictions = [] + + def save_predictions(self, audio_paths): + results = [] + for gt, response, audio_path in zip(self.references, self.predictions, audio_paths): + results.append({ + 'gt': gt, + 'response': response, + 'audio_path': audio_path, + }) + time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime()) + results_file = os.path.join(self.output_dir, f'{self.task_name }_{time_prefix}.json') if self.output_dir else f'{self.task_name }_{time_prefix}.json' + json.dump(results, open(results_file, 'w')) + + @abstractmethod + def _get_choice_candidate(self): + pass + + @abstractmethod + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + pass + + @abstractmethod + def evaluate(self) -> Dict[str, float]: + pass + + @abstractmethod + def run_inference(self): + pass + + +class EvaluationTask(BaseTask): + """ + Used to determine whether the results generated by the model are correct + """ + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return task_data + + def _get_choice_candidate(self, data: List[Instance]) -> List[str]: + return ["None"] + + def save_predictions(self, audio_paths): + results = [] + for gt, response, audio_path in zip(self.references, self.predictions, audio_paths): + results.append({ + 'gt': gt[0], + 'response': gt[1], + 'audio_path': audio_path, + 'llm_prediction': response, + }) + time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime()) + results_file = os.path.join(self.output_dir, f'{self.task_name }_{time_prefix}.json') if self.output_dir else f'{self.task_name }_{time_prefix}.json' + json.dump(results, open(results_file, 'w')) + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + prompt = " will provide you with a Ground-truth label and a Prediction label. The label can either be a single string or a list of multiple labels. I need you to compare these two labels on a semantic level.\nSpecifically, I want you to evaluate whether the Prediction label semantically matches, is partially aligned, includes, or describes the Ground-truth label (or the semantic meaning represented by the list of labels). If any of these conditions are satisfied, consider it a match.\n\nHere are some examples of successful matches:\n\nGround-truth label: \"rain\"\nPrediction label: \"The sound in the audio is rain falling\"\n(This is considered a match.)\nGround-truth label: [\"decrease\", \"volume\", \"none\"]\nPrediction label: \"The intent in the audio is to adjust the volume\"(This is also considered a match.)\nIf the labels successfully match, assign a score of 1. 
If they do not match, assign a score of 0.**Imporant!!!, only output the score (0 or 1), no explanation.** \n\nGround-truth label:{}\nPrediction label:{}" + gt = inst["gt"] + response = inst["response"] + prompt = prompt.format(gt, response) + try: + response = self.model.generate(prompt) + # print(response) + except Exception as e: + response = "None" + continue + + self.predictions.append(response) + self.references.append([inst["gt"], inst["response"]]) + audio_paths.append(inst["audio_path"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = evaluation_accuracy(self.predictions) + return {"accuracy": acc} + + +class AccentSexClassification(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: List[Instance]) -> List[str]: + return ['female', 'male'] + + def save_predictions(self, audio_paths): + results = [] + for gt, response, audio_path in zip(self.references, self.predictions, audio_paths): + results.append({ + 'gt': gt, + 'response': response, + 'audio_path': audio_path, + }) + time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime()) + results_file = os.path.join(self.output_dir, f'{self.task_name }_{time_prefix}.json') if self.output_dir else f'{self.task_name }_{time_prefix}.json' + json.dump(results, open(results_file, 'w')) + + def run_inference(self): + self.predictions = [] + self.references = [] + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then answer the question by directly choose a choice from choice candidates. Questions: {question}, Candidate choices: {self.choice_candidate}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except: + print("error audio {}".format(inst.input["audio_file"])) + continue + self.predictions.append(response) + self.references.append(inst.output["text"]) + audio_paths.append(inst.input["audio_file"]) + + self.save_predictions(audio_paths) + + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class AcousticSceneClassification(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: List[Instance]) -> List[str]: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + print(f"Choice candidates: {self.choice_candidate}") + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the input music and then determine the category of the acoustic scene. The candidate scene category are {self.choice_candidate}. 
Please output **only one category** from the provided candidate categories, and **DO NOT** output any other words.\nQuestions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"].strip().lower()) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class AnimalSoundDetection(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data) -> List[str]: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + print(f"Choice candidates: {self.choice_candidate}") + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then answer the question by directly choose a choice from choice candidates, without other words. Questions: {question}, Candidate choices: {self.choice_candidate}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"].strip().lower()) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class AudioCaptions(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: List[Instance]) -> List[str]: + return ["None"] + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then answer the question. 
Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"]) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + bleu = bleu_evaluation(self.predictions, self.references) + return {"bleu1": bleu['bleu1']} + + +class AudioCaptionsClotho(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: List[Instance]) -> List[str]: + return ["None"] + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then answer the question. Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"]) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = bleu_evaluation(self.predictions, self.references) + return {"accuracy": acc} + + +class AudioQA(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data) -> List[str]: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then answer the question by directly choose a choice from choice candidates. 
Questions: {question}, Candidate choices: {self.choice_candidate}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"]) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class BirdSoundDetection(BaseTask): + + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: List[Instance]) -> List[str]: + return ["Yes", "No"] + + def save_predictions(self, audio_paths): + results = [] + for gt, response, audio_path in zip(self.references, self.predictions, audio_paths): + results.append({ + 'gt': gt, + 'response': response, + 'audio_path': audio_path, + }) + time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime()) + results_file = os.path.join(self.output_dir, f'{self.task_name }_{time_prefix}.json') if self.output_dir else f'{self.task_name }_{time_prefix}.json' + json.dump(results, open(results_file, 'w')) + + def run_inference(self): + self.predictions = [] + self.references = [] + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then answer the question by directly choose a choice from choice candidates. Questions: {question}, Candidate choices: {self.choice_candidate}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append("Yes" if inst.output["text"] == 1 else "No") + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class EnvironmentSoundRecognition(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data) -> List[str]: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then answer the question by directly choose a choice from choice candidates. 
Questions: {question}, Candidate choices: {self.choice_candidate}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print(f"error {e}") + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"]) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = blur_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class IntentClassification(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + intent_label = data['intent_label'] + return intent_label + + def run_inference(self): + audio_paths = [] + candidate_actions = ','.join([k for k in self.choice_candidate['action'].keys() if not k[0].isdigit()]) + candidate_objects = ','.join([k for k in self.choice_candidate['object'].keys() if not k[0].isdigit()]) + candidate_locations = ','.join([k for k in self.choice_candidate['location'].keys() if not k[0].isdigit()]) + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then detect the intention. The intention triplet includes three parts: action, object, and location. The candicate actions are {candidate_actions}, candidate objects are {candidate_objects}, and candidate locations are {candidate_locations}. Please answer the questions only use the provided candidate actions, objects, and locations. 
Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(' '.join([self.choice_candidate['action'][inst.output["text"].split()[0]], self.choice_candidate['object'][inst.output["text"].split()[1]], self.choice_candidate['location'][inst.output["text"].split()[2]]])) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +def post_process_intent_output(): + data_path = '/m2v_intern/wushengqiong/model/audio-test/predictions/understanding/IntentClassification_250102204424.json' + intent_label = read_json('/m2v_intern/wushengqiong/model/audio-test/understanding/IntentClassification/annotation.json')['intent_label'] + action = intent_label['action'] + object = intent_label['object'] + location = intent_label['location'] + + data = read_json(data_path) + + results = [] + for d in data: + results.append({ + 'gt': [action[d['gt'].split()[0]], object[d['gt'].split()[1]], location[d['gt'].split()[2]]], + 'response': d['response'], + 'audio_path': d['audio_path'], + }) + json.dump(results, open('/m2v_intern/wushengqiong/model/audio-test/predictions/understanding/IntentClassification_250102204424_1.json', 'w')) + + +class MusicGenreClassification(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"].replace('\\', '/')) + question = inst.input["prompt"] + prompt = f"Please listen to the input music and then determine the genre of the music. The candidate genres are {self.choice_candidate}. 
Please output **only one genre** from the provided candidate genres, and **DO NOT** output any other words.\nQuestions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"]) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class MusicInstrumentClassification(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + # candidate_instruments = ','.join([k for k in self.choice_candidate.keys() if not k[0].isdigit()]) + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the music and then detect the instrument of the music. The candidate instruments are {self.choice_candidate}. Please output **only the most appropriate music instrument** from the provided candidate music instruments, and **DO NOT** output any other words. Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"]) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class MusicInstrumentSourceAnalysis(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the music and then detect the instrucment source of the music. The candidate sources are {self.choice_candidate}. Please output **only the most appropriate music source** from the provided candidate music sources, and **DO NOT** output any other words. 
Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"]) + audio_paths.append(inst.input["audio_file"].strip().lower()) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class MusicPitchAnalysis(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + choices = [] + for item in data['data']: + choices.append(item['output']["text"]) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the music and then detect the pitch score of the music. The 0-based MIDI pitch is in the range [0, 127]. Please output **only the most appropriate pitch score in a number** from the provided range, and **DO NOT** output any other words. Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"]) + audio_paths.append(inst.input["audio_file"].strip().lower()) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class NoteQualitiesAnalysis(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + choices = [] + for item in data['data']: + choices.append(','.join(item['output']["text"]).strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the music and then detect the note quality of the given music. The candidate annotation is {self.choice_candidate}. Please output **the qualities which are present in this note** from the provided candidate music note quality candidate categories, and **DO NOT** output any other words. 
Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(','.join(inst.output["text"])) + audio_paths.append(inst.input["audio_file"].strip().lower()) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class OpenAQA(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then answer the question. Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"]) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = bleu_evaluation(self.predictions, self.references) + return {"accuracy": acc} + + +class SoundEventClassification(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the music and then detect the happening event of the given audio. The candidate annotation is {self.choice_candidate}. Please output **only one event** from the provided candidate events,, and **DO NOT** output any other words. 
Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"]) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class SpeechCommand(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"].replace('\\', '/')) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then detect the speech command of the given audio. The candidate annotation is {self.choice_candidate}. Please output **only one command** from the provided candidate commands, and **DO NOT** output any other words. Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"].strip().lower()) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class SpeechEmotionRecognition(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then detect the emotion of the given audio. The candidate annotation is {self.choice_candidate}. Please output **only one emotion** from the provided candidate emotions, and **DO NOT** output any other words. 
Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"].strip().lower()) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class VocalSoundClassification(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then detect the vocal sound category of the given audio. The candidate annotation is {self.choice_candidate}. Please output **only one vocal sound category** from the provided candidate vocal sounds, and **DO NOT** output any other words. Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"].strip().lower()) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +class VocalTechniqueDetection(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def _get_choice_candidate(self, data: Dict) -> Dict: + choices = [] + for item in data['data']: + choices.append(item['output']["text"].strip().lower()) + choices = list(set(choices)) + return choices + + def run_inference(self): + audio_paths = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"].replace('\\', '/')) + question = inst.input["prompt"] + prompt = f"Please listen to the audio and then detect the vocal technique of the given audio. The candidate annotations are scales, arpeggios, long tones, and excerpts. Please output **only one vocal technique** from the provided candidate vocal techniques, and **DO NOT** output any other words. 
Questions: {question}\nAnswer:" + try: + response = self.model.generate(prompt, audio_path=audio_path) + except Exception as e: + print("Error audio: {}".format(inst.input["audio_file"])) + response = "None" + continue + self.predictions.append(response) + self.references.append(inst.output["text"].strip().lower()) + audio_paths.append(inst.input["audio_file"]) + self.save_predictions(audio_paths) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + + +def log_performance_csv(model_name, task_name, metric, score, root_path, output_file='prediction.json'): + import csv + file_exists = os.path.isfile(os.path.join(root_path, output_file)) + + row_data = { + 'model': model_name, + 'task': task_name, + 'metric': metric, + 'score': str(score), + } + + with open(os.path.join(root_path, output_file), mode='a', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=row_data.keys()) + if not file_exists: + writer.writeheader() + + writer.writerow(row_data) + + +def log_performance_json(model_name, task_name, metric, score, root_path, output_file='prediction.json'): + import json + log_data = { + 'model': model_name, + 'task': task_name, + 'metric': metric, + 'score': str(score), + } + + log_file_path = os.path.join(root_path, output_file) + + if os.path.exists(log_file_path): + with open(log_file_path, 'r') as f: + existing_data = json.load(f) + else: + existing_data = [] + + existing_data.append(log_data) + + with open(log_file_path, 'w', encoding='utf-8') as f: + json.dump(existing_data, f, indent=4) + + +def log_performance_detail(model_name, task_name, metrics, root_path, output_file='performance_log.csv'): + import csv + file_path = os.path.join(root_path, output_file) + file_exists = os.path.isfile(file_path) + + # Retrieve the main indicator values from the metrics dictionary + metric_value = None + if isinstance(metrics, dict): + # Select metrics based on priority + for key in ['accuracy', 'f1', 'micro_f1', 'bleu4', 'rougeL', 'code_bleu', 'MAE']: + if key in metrics: + metric_value = metrics[key] + break + if metric_value is None and len(metrics) > 0: + # If no priority metric is found, use the first metric + metric_value = list(metrics.values())[0] + else: + metric_value = metrics + + # Simplify the file name, keeping only the last part + model_name = model_name.split('/')[-1] + + if file_exists: + # Read existing data + rows = [] + tasks = set() + with open(file_path, 'r', newline='', encoding='utf-8') as f: + reader = csv.reader(f) + header = next(reader, ['task', model_name]) # If the file is empty, use the default header + if len(header) == 1: # If there is only the task column, add the model column + header.append(model_name) + rows.append(header) + + # Read existing data and update + for row in reader: + if row[0] == task_name: # If the same task is found, update the value + row = [task_name, str(metric_value)] + tasks.add(row[0]) + rows.append(row) + + # If it is a new task, add a new row + if task_name not in tasks: + rows.append([task_name, str(metric_value)]) + else: + # Create a new file + rows = [ + ['task', model_name], + [task_name, str(metric_value)] + ] + + # Write all data + with open(file_path, 'w', newline='', encoding='utf-8') as f: + writer = csv.writer(f) + writer.writerows(rows) + + +if __name__ == "__main__": + + import argparse + # Parse command line arguments + parser = argparse.ArgumentParser(description="Run audio understanding tasks") + 
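+    # Example invocation (illustrative only; adjust model, paths, and task names to your local layout):
+    #   python predictors/audio_predict_comprehension.py \
+    #       -m Qwen2-Audio-7B-Instruct \
+    #       -d ./audio/understanding/ \
+    #       -o ./audio/predictions/understanding/ \
+    #       -t AcousticSceneClassification AudioQA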
parser.add_argument('-m', '--model_name', type=str, required=True, help='Name of the audio understanding model to use') + parser.add_argument('-d', '--data_dir', type=str, default='./audio/understanding/', help='Directory containing task data') + parser.add_argument('-o', '--output_dir', type=str, default='./audio/predictions/understanding/', help='Directory to save predictions') + parser.add_argument('-r', '--root_path', type=str, default='./', help='Root path for logging performance') + parser.add_argument('-t', '--task_names', type=str, nargs='+', + help='List of task names to run (default: AccentClassification AccentSexClassification AcousticSceneClassification)') + args = parser.parse_args() + + # model_name = 'Qwen2-Audio-7B-Instruct' + # data_dir = './understanding/' + # output_dir = f'./predictions/understanding/{model_name}' + # root_path = './' + + model = AudioComprehensionModel(model_name=args.model_name) + + + task_name_list = [ + 'AccentClassification', 'AccentSexClassification', 'AcousticSceneClassification', + 'AnimalSoundClassification', 'AudioCaptioning', 'AudioCaptioningClotho', + 'AudioQA', 'BirdSoundDetection', 'EnvironmentSoundRecognition', + 'IntentClassification', 'MusicGenreClassification', + 'MusicInstrumentClassification', 'MusicInstrumentSourceAnalysis', + 'MusicPitchAnalysis', 'NoteQualitiesAnalysis', 'OpenAQA', + 'SingerIdentification', 'SoundEventClassification', + 'SpeakerIdentification', 'SpeechCommand', + 'SpeechEmotionRecognition', 'VocalSoundClassification', + 'VocalTechniqueDetection' + ] + if args.task_names is None or len(args.task_names) == 0: + args.task_names = task_name_list + + for task_name in args.task_names: # os.listdir(data_dir): + + # Dynamically get the class by its name + if task_name in globals(): # Ensure the class is defined in the current scope + task_class = globals()[task_name] + else: + # Optionally, handle cases where the class is not found + print(f"Task {task_name} is not defined in the current scope.") + continue + + # Initialize the task class + import glob + json_file_list = glob.glob(os.path.join(args.data_dir, task_name, "*.json")) + if len(json_file_list) == 0: + print(f"No JSON files found for task: {task_name}") + continue + elif len(json_file_list) > 1: + print(f"Multiple JSON files found for task: {task_name}, using the first one: {json_file_list[0]}") + task_annotation_data = json_file_list[0] + else: + task_annotation_data = json_file_list[0] + task = task_class( + task_data=task_annotation_data, + model=model, + audio_dir=os.path.join(args.data_dir, task_name, 'audios'), + output_dir=args.output_dir + ) + + # Run inference for the task + # This should generate audio files based on the task's data + print(f"Running inference for task: {task_name}") + task.run_inference() + # if you want to save the predictions, you need to rewrite the save_predictions() in each Task class depending on your need, and call task.save_predictions() after task.run_inference() or inside the run_inference method. 
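+        # A minimal sketch (illustrative, not part of the original pipeline) of such an override,
+        # assuming the default `gt` / `response` / `audio_path` fields are all your task needs:
+        #
+        #     def save_predictions(self, audio_paths):
+        #         results = [
+        #             {"gt": gt, "response": pred, "audio_path": path}
+        #             for gt, pred, path in zip(self.references, self.predictions, audio_paths)
+        #         ]
+        #         time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
+        #         out_file = os.path.join(self.output_dir or '.', f'{self.task_name}_{time_prefix}.json')
+        #         with open(out_file, 'w') as f:
+        #             json.dump(results, f, ensure_ascii=False, indent=2)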
+ + + # Evaluate the task, return a dictionary of metrics + # For example, {'FAD_score': 0.123} + eval_results = task.evaluate() + print("Task name: ", task_name, "Evaluation results:", eval_results) + log_performance_json( + model_name=args.model_name, + task_name=task_name, + metric=list(eval_results.keys())[0].split('_')[0], # CLAP_score + score=eval_results[list(eval_results.keys())[0]], # e.g., 0.123 + root_path=args.data_dir) + + # or you can run the tasks one by one like below: + # task_name = 'AcousticSceneClassification' + # task = AcousticSceneClassification( + # task_data=os.path.join(data_dir, f"{task_name}/annotation.json"), + # model=model, + # audio_dir=os.path.join(data_dir, f"{task_name}/audios"), + # output_dir=output_dir) + # task.run_inference() + # print(task.evaluate()) + + + diff --git a/predictors/audio_predict_generation.py b/predictors/audio_predict_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..5bc50d975bfa35e5778718f13d35eb6cd6d21645 --- /dev/null +++ b/predictors/audio_predict_generation.py @@ -0,0 +1,1245 @@ +from email.mime import audio +import json +import os +from pyexpat import model +from regex import B, D +import tqdm +from typing import List, Dict, Any +import nltk +from dataclasses import dataclass +from abc import ABC, abstractmethod +import math +import time +from urllib.request import urlopen +import librosa +import torch +from torch import nn +import numpy as np +from encodec import EncodecModel +import laion_clap +import resampy +import soundfile as sf +from scipy import linalg +from multiprocessing.dummy import Pool as ThreadPool +import copy +import pickle +from collections import defaultdict + + + +def read_json(file_path: str) -> Dict[str, Any]: + with open(file_path, "r") as f: + data = json.load(f) + return data + + +# ================================================ FAD related functions ================================================ +# These functions are used to calculate the FAD score + + +def load_audio_task(fname, sample_rate, channels, dtype="float32"): + if dtype not in ['float64', 'float32', 'int32', 'int16']: + raise ValueError(f"dtype not supported: {dtype}") + + wav_data, sr = sf.read(fname, dtype=dtype) + # For integer type PCM input, convert to [-1.0, +1.0] + if dtype == 'int16': + wav_data = wav_data / 32768.0 + elif dtype == 'int32': + wav_data = wav_data / float(2**31) + + # Convert to mono + assert channels in [1, 2], "channels must be 1 or 2" + if len(wav_data.shape) > channels: + wav_data = np.mean(wav_data, axis=1) + + if sr != sample_rate: + wav_data = resampy.resample(wav_data, sr, sample_rate) + + return wav_data + + +class FrechetAudioDistance: + def __init__( + self, + ckpt_dir=None, + model_name="clap", + submodel_name="630k-audioset", # only for CLAP + sample_rate=16000, + channels=1, + use_pca=False, # only for VGGish + use_activation=False, # only for VGGish + verbose=False, + audio_load_worker=8, + enable_fusion=False, # only for CLAP + ): + """ + Initialize FAD + + -- ckpt_dir: folder where the downloaded checkpoints are stored + -- model_name: one between vggish, pann, clap or encodec + -- submodel_name: only for clap models - determines which checkpoint to use. + options: ["630k-audioset", "630k", "music_audioset", "music_speech", "music_speech_audioset"] + -- sample_rate: one between [8000, 16000, 32000, 48000]. 
depending on the model set the sample rate to use + -- channels: number of channels in an audio track + -- use_pca: whether to apply PCA to the vggish embeddings + -- use_activation: whether to use the output activation in vggish + -- enable_fusion: whether to use fusion for clap models (valid depending on the specific submodel used) + """ + assert model_name in ["vggish", "clap", "encodec"], "model_name must be either 'vggish', 'pann', 'clap' or 'encodec'" + if model_name == "vggish": + assert sample_rate == 16000, "sample_rate must be 16000" + elif model_name == "clap": + assert sample_rate == 48000, "sample_rate must be 48000" + assert submodel_name in ["630k-audioset", "630k", "music_audioset", "music_speech", "music_speech_audioset"] + elif model_name == "encodec": + assert sample_rate in [24000, 48000], "sample_rate must be 24000 or 48000" + if sample_rate == 48000: + assert channels == 2, "channels must be 2 for 48khz encodec model" + self.model_name = model_name + self.submodel_name = submodel_name + self.sample_rate = sample_rate + self.channels = channels + self.verbose = verbose + self.device = torch.device( + 'cuda') if torch.cuda.is_available() else torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu') + if self.device == torch.device('mps') and self.model_name == "clap": + if self.verbose: + print("[Frechet Audio Distance] CLAP does not support MPS device yet, because:") + print("[Frechet Audio Distance] The operator 'aten::upsample_bicubic2d.out' is not currently implemented for the MPS device.") + print("[Frechet Audio Distance] Using CPU device instead.") + self.device = torch.device('cpu') + if self.verbose: + print("[Frechet Audio Distance] Using device: {}".format(self.device)) + self.audio_load_worker = audio_load_worker + self.enable_fusion = enable_fusion + if ckpt_dir is not None: + os.makedirs(ckpt_dir, exist_ok=True) + torch.hub.set_dir(ckpt_dir) + self.ckpt_dir = ckpt_dir + else: + # by default `ckpt_dir` is `torch.hub.get_dir()` + self.ckpt_dir = torch.hub.get_dir() + self.__get_model(model_name=model_name, use_pca=use_pca, use_activation=use_activation) + + def __get_model(self, model_name="vggish", use_pca=False, use_activation=False): + """ + Get ckpt and set model for the specified model_name + + Params: + -- model_name: one between vggish, pann or clap + -- use_pca: whether to apply PCA to the vggish embeddings + -- use_activation: whether to use the output activation in vggish + """ + # vggish + if model_name == "vggish": + # S. 
Hershey et al., "CNN Architectures for Large-Scale Audio Classification", ICASSP 2017 + self.model = torch.hub.load(repo_or_dir='harritaylor/torchvggish', model='vggish') + if not use_pca: + self.model.postprocess = False + if not use_activation: + self.model.embeddings = nn.Sequential(*list(self.model.embeddings.children())[:-1]) + self.model.device = self.device + # clap + elif model_name == "clap": + # choose the right checkpoint and model + if self.submodel_name == "630k-audioset": + if self.enable_fusion: + download_name = "630k-audioset-fusion-best.pt" + else: + download_name = "630k-audioset-best.pt" + elif self.submodel_name == "630k": + if self.enable_fusion: + download_name = "630k-fusion-best.pt" + else: + download_name = "630k-best.pt" + elif self.submodel_name == "music_audioset": + download_name = "music_audioset_epoch_15_esc_90.14.pt" + elif self.submodel_name == "music_speech": + download_name = "music_speech_epoch_15_esc_89.25.pt" + elif self.submodel_name == "music_speech_audioset": + download_name = "music_speech_audioset_epoch_15_esc_89.98.pt" + + model_path = os.path.join(self.ckpt_dir, download_name) + + # download checkpoint + if not (os.path.exists(model_path)): + if self.verbose: + print("[Frechet Audio Distance] Downloading {}...".format(model_path)) + torch.hub.download_url_to_file( + url=f"https://huggingface.co/lukewys/laion_clap/resolve/main/{download_name}", + dst=model_path + ) + # init model and load checkpoint + if self.submodel_name in ["630k-audioset", "630k"]: + self.model = laion_clap.CLAP_Module(enable_fusion=self.enable_fusion, + device=self.device) + elif self.submodel_name in ["music_audioset", "music_speech", "music_speech_audioset"]: + self.model = laion_clap.CLAP_Module(enable_fusion=self.enable_fusion, + amodel='HTSAT-base', + device=self.device) + self.model.load_ckpt(model_path) + + # init model and load checkpoint + if self.submodel_name in ["630k-audioset", "630k"]: + self.model = laion_clap.CLAP_Module(enable_fusion=self.enable_fusion, + device=self.device) + elif self.submodel_name in ["music_audioset", "music_speech", "music_speech_audioset"]: + self.model = laion_clap.CLAP_Module(enable_fusion=self.enable_fusion, + amodel='HTSAT-base', + device=self.device) + self.model.load_ckpt(model_path) + + # encodec + elif model_name == "encodec": + # choose the right model based on sample_rate + # weights are loaded from the encodec repo: https://github.com/facebookresearch/encodec/ + if self.sample_rate == 24000: + self.model = EncodecModel.encodec_model_24khz() + elif self.sample_rate == 48000: + self.model = EncodecModel.encodec_model_48khz() + # 24kbps is the max bandwidth supported by both versions + # these models use 32 residual quantizers + self.model.set_target_bandwidth(24.0) + + self.model.to(self.device) + self.model.eval() + + def get_embeddings(self, x, sr): + """ + Get embeddings using VGGish, PANN, CLAP or EnCodec models. + Params: + -- x : a list of np.ndarray audio samples + -- sr : sampling rate. 
+ """ + embd_lst = [] + try: + for audio in tqdm(x, disable=(not self.verbose)): + if self.model_name == "vggish": + embd = self.model.forward(audio, sr) + elif self.model_name == "clap": + audio = torch.tensor(audio).float().unsqueeze(0) + embd = self.model.get_audio_embedding_from_data(audio, use_tensor=True) + elif self.model_name == "encodec": + # add two dimensions + audio = torch.tensor( + audio).float().unsqueeze(0).unsqueeze(0).to(self.device) + # if SAMPLE_RATE is 48000, we need to make audio stereo + if self.model.sample_rate == 48000: + if audio.shape[-1] != 2: + if self.verbose: + print( + "[Frechet Audio Distance] Audio is mono, converting to stereo for 48khz model..." + ) + audio = torch.cat((audio, audio), dim=1) + else: + # transpose to (batch, channels, samples) + audio = audio[:, 0].transpose(1, 2) + + if self.verbose: + print( + "[Frechet Audio Distance] Audio shape: {}".format( + audio.shape + ) + ) + + with torch.no_grad(): + # encodec embedding (before quantization) + embd = self.model.encoder(audio) + embd = embd.squeeze(0) + + if self.verbose: + print( + "[Frechet Audio Distance] Embedding shape: {}".format( + embd.shape + ) + ) + + if embd.device != torch.device("cpu"): + embd = embd.cpu() + + if torch.is_tensor(embd): + embd = embd.detach().numpy() + + embd_lst.append(embd) + except Exception as e: + print("[Frechet Audio Distance] get_embeddings throw an exception: {}".format(str(e))) + + return np.concatenate(embd_lst, axis=0) + + def calculate_embd_statistics(self, embd_lst): + if isinstance(embd_lst, list): + embd_lst = np.array(embd_lst) + mu = np.mean(embd_lst, axis=0) + sigma = np.cov(embd_lst, rowvar=False) + return mu, sigma + + def calculate_frechet_distance(self, mu1, sigma1, mu2, sigma2, eps=1e-6): + """ + Adapted from: https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/fid_score.py + + Numpy implementation of the Frechet Distance. + The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) + and X_2 ~ N(mu_2, C_2) is + d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). + Stable version by Dougal J. Sutherland. + Params: + -- mu1 : Numpy array containing the activations of a layer of the + inception net (like returned by the function 'get_predictions') + for generated samples. + -- mu2 : The sample mean over activations, precalculated on an + representative data set. + -- sigma1: The covariance matrix over activations for generated samples. + -- sigma2: The covariance matrix over activations, precalculated on an + representative data set. + Returns: + -- : The Frechet Distance. 
+ """ + + mu1 = np.atleast_1d(mu1) + mu2 = np.atleast_1d(mu2) + + sigma1 = np.atleast_2d(sigma1) + sigma2 = np.atleast_2d(sigma2) + + assert mu1.shape == mu2.shape, \ + 'Training and test mean vectors have different lengths' + assert sigma1.shape == sigma2.shape, \ + 'Training and test covariances have different dimensions' + + diff = mu1 - mu2 + + # Product might be almost singular + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2).astype(complex), disp=False) + if not np.isfinite(covmean).all(): + msg = ('fid calculation produces singular product; ' + 'adding %s to diagonal of cov estimates') % eps + print(msg) + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset).astype(complex)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + m = np.max(np.abs(covmean.imag)) + raise ValueError('Imaginary component {}'.format(m)) + covmean = covmean.real + + tr_covmean = np.trace(covmean) + + return (diff.dot(diff) + np.trace(sigma1) + + np.trace(sigma2) - 2 * tr_covmean) + + def __load_audio_files(self, dir, dtype="float32"): + task_results = [] + + pool = ThreadPool(self.audio_load_worker) + pbar = tqdm(total=len(os.listdir(dir)), disable=(not self.verbose)) + + def update(*a): + pbar.update() + + if self.verbose: + print("[Frechet Audio Distance] Loading audio from {}...".format(dir)) + for fname in os.listdir(dir): + res = pool.apply_async( + load_audio_task, + args=(os.path.join(dir, fname), self.sample_rate, self.channels, dtype), + callback=update, + ) + task_results.append(res) + pool.close() + pool.join() + + return [k.get() for k in task_results] + + def score(self, + background_dir, + eval_dir, + background_embds_path=None, + eval_embds_path=None, + dtype="float32" + ): + """ + Computes the Frechet Audio Distance (FAD) between two directories of audio files. + + Parameters: + - background_dir (str): Path to the directory containing background audio files. + - eval_dir (str): Path to the directory containing evaluation audio files. + - background_embds_path (str, optional): Path to save/load background audio embeddings (e.g., /folder/bkg_embs.npy). If None, embeddings won't be saved. + - eval_embds_path (str, optional): Path to save/load evaluation audio embeddings (e.g., /folder/test_embs.npy). If None, embeddings won't be saved. + - dtype (str, optional): Data type for loading audio. Default is "float32". + + Returns: + - float: The Frechet Audio Distance (FAD) score between the two directories of audio files. 
+ """ + try: + # Load or compute background embeddings + if background_embds_path is not None and os.path.exists(background_embds_path): + if self.verbose: + print(f"[Frechet Audio Distance] Loading embeddings from {background_embds_path}...") + embds_background = np.load(background_embds_path) + else: + audio_background = self.__load_audio_files(background_dir, dtype=dtype) + embds_background = self.get_embeddings(audio_background, sr=self.sample_rate) + if background_embds_path: + os.makedirs(os.path.dirname(background_embds_path), exist_ok=True) + np.save(background_embds_path, embds_background) + + # Load or compute eval embeddings + if eval_embds_path is not None and os.path.exists(eval_embds_path): + if self.verbose: + print(f"[Frechet Audio Distance] Loading embeddings from {eval_embds_path}...") + embds_eval = np.load(eval_embds_path) + else: + audio_eval = self.__load_audio_files(eval_dir, dtype=dtype) + embds_eval = self.get_embeddings(audio_eval, sr=self.sample_rate) + if eval_embds_path: + os.makedirs(os.path.dirname(eval_embds_path), exist_ok=True) + np.save(eval_embds_path, embds_eval) + + # Check if embeddings are empty + if len(embds_background) == 0: + print("[Frechet Audio Distance] background set dir is empty, exiting...") + return -1 + if len(embds_eval) == 0: + print("[Frechet Audio Distance] eval set dir is empty, exiting...") + return -1 + + # Compute statistics and FAD score + mu_background, sigma_background = self.calculate_embd_statistics(embds_background) + mu_eval, sigma_eval = self.calculate_embd_statistics(embds_eval) + + fad_score = self.calculate_frechet_distance( + mu_background, + sigma_background, + mu_eval, + sigma_eval + ) + + return fad_score + except Exception as e: + print(f"[Frechet Audio Distance] An error occurred: {e}") + return -1 + + +def calculate_fad_score(background_dir, eval_dir, background_embds_path=None, eval_embds_path=None, dtype="float32", ckpt_dir=None, model_name="clap", submodel_name="630k-audioset", sample_rate=16000, channels=1, use_pca=False, use_activation=False, verbose=False, audio_load_worker=8, enable_fusion=False): + """ + Calculate the Frechet Audio Distance (FAD) score between two directories of audio files. + + Parameters: + - background_dir: Directory containing background audio files. + - eval_dir: Directory containing evaluation audio files. + - background_embds_path: Path to save/load background audio embeddings. + - eval_embds_path: Path to save/load evaluation audio embeddings. + - dtype: Data type for loading audio files (default is "float32"). + - ckpt_dir: Directory where the model checkpoints are stored. + - model_name: Name of the model to use (default is "clap"). + - submodel_name: Submodel name for CLAP (default is "630k-audioset"). + - sample_rate: Sample rate for audio files (default is 16000). + - channels: Number of channels in the audio files (default is 1). + - use_pca: Whether to apply PCA to VGGish embeddings (default is False). + - use_activation: Whether to use output activation in VGGish (default is False). + - verbose: Whether to print verbose output (default is False). + - audio_load_worker: Number of workers for loading audio files (default is 8). + - enable_fusion: Whether to enable fusion for CLAP models (default is False). + + Returns: + - FAD score as a float. 
+ """ + + fad = FrechetAudioDistance( + ckpt_dir=ckpt_dir, + model_name=model_name, + submodel_name=submodel_name, + sample_rate=sample_rate, + channels=channels, + use_pca=use_pca, + use_activation=use_activation, + verbose=verbose, + audio_load_worker=audio_load_worker, + enable_fusion=enable_fusion + ) + + return { + "FAD_score": fad.score(background_dir, eval_dir, background_embds_path, eval_embds_path, dtype) + } + + + + + +# ================================================ CLAP related functions ================================================ +# These functions are used to calculate the CLAP score + + +# quantization +def int16_to_float32(x): + return (x / 32767.0).astype('float32') + + +def float32_to_int16(x): + x = np.clip(x, a_min=-1., a_max=1.) + return (x * 32767.).astype('int16') + + +def calculate_cosine_similarity(embeddings1, embeddings2): + dot_product = np.dot(embeddings1, embeddings2) + norm1 = np.linalg.norm(embeddings1) + norm2 = np.linalg.norm(embeddings2) + return dot_product / (norm1 * norm2) if norm1 and norm2 else 0.0 + + +def calculate_clap_score(clap_checkpoint=None, model_id=-1, verbose=True, audio_file_list=None, text_file_list=None): + """Load the pretrained checkpoint of CLAP model + + Parameters + ---------- + ckpt: str + if ckpt is specified, the model will load this ckpt, otherwise the model will download the ckpt from zenodo. \n + For fusion model, it will download the 630k+audioset fusion model (id=3). For non-fusion model, it will download the 630k+audioset model (id=1). + model_id: + if model_id is specified, you can download our best ckpt, as: + id = 0 --> 630k non-fusion ckpt \n + id = 1 --> 630k+audioset non-fusion ckpt \n + id = 2 --> 630k fusion ckpt \n + id = 3 --> 630k+audioset fusion ckpt \n + Note that if your model is specied as non-fusion model but you download a fusion model ckpt, you will face an error. + """ + model = laion_clap.CLAP_Module(enable_fusion=False) + model.load_ckpt(ckpt = clap_checkpoint, model_id = model_id, verbose=verbose) # download the default pretrained checkpoint. + audio_embeddings = [] + for file in audio_file_list: + audio, sr = librosa.load(file, sr=16000) + audio = int16_to_float32(audio) + embeddings = laion_clap.get_audio_embedding(audio) + audio_embeddings.append(embeddings) + + text_embeddings = [] + for file in text_file_list: + if os.path.exists(file): + with open(file, 'r') as f: + text = f.read() + else: + text = file + embeddings = laion_clap.get_text_embedding(text) + text_embeddings.append(embeddings) + + # Compute similarity scores + scores = [] + for audio_emb, text_emb in zip(audio_embeddings, text_embeddings): + score = calculate_cosine_similarity(audio_emb, text_emb) + scores.append(score) + + # compute the average score + if len(scores) > 0: + average_score = sum(scores) / len(scores) + else: + average_score = 0.0 + + return {"CLAP_score": average_score, "scores": scores} + + +# ================================================ CIDEr (Consensus-based Image Description Evaluation) related functions ================================================ +# These functions are used to calculate the CIDEr score + + +import whisper # a tool from OpenAI for speech recognition + + +def speech_to_text(model_name="turbo", audio_file="audio.mp3"): + """ + Convert speech to text using a speech recognition model. 
+ """ + model = whisper.load_model(model_name) + + # load audio and pad/trim it to fit 30 seconds + audio = whisper.load_audio(audio_file) + audio = whisper.pad_or_trim(audio) + + # make log-Mel spectrogram and move to the same device as the model + mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device) + + # detect the spoken language + _, probs = model.detect_language(mel) + print(f"Detected language: {max(probs, key=probs.get)}") + + # decode the audio + options = whisper.DecodingOptions() + result = whisper.decode(model, mel, options) + + # print the recognized text + print(result.text) + return result.text + + +def precook(s, n=4, out=False): + """ + Takes a string as input and returns an object that can be given to + either cook_refs or cook_test. This is optional: cook_refs and cook_test + can take string arguments as well. + :param s: string : sentence to be converted into ngrams + :param n: int : number of ngrams for which representation is calculated + :return: term frequency vector for occuring ngrams + """ + words = s.split() + counts = defaultdict(int) + for k in range(1,n+1): + for i in range(len(words)-k+1): + ngram = tuple(words[i:i+k]) + counts[ngram] += 1 + return counts + +def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" + '''Takes a list of reference sentences for a single segment + and returns an object that encapsulates everything that BLEU + needs to know about them. + :param refs: list of string : reference sentences for some image + :param n: int : number of ngrams for which (ngram) representation is calculated + :return: result (list of dict) + ''' + return [precook(ref, n) for ref in refs] + +def cook_test(test, n=4): + '''Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it. + :param test: list of string : hypothesis sentence for some image + :param n: int : number of ngrams for which (ngram) representation is calculated + :return: result (dict) + ''' + return precook(test, n, True) + + +# https://github.com/ramavedantam/cider/blob/master/pyciderevalcap/cider/cider_scorer.py +class CiderScorer(object): + """CIDEr scorer. + """ + + def copy(self): + ''' copy the refs.''' + new = CiderScorer(n=self.n) + new.ctest = copy.copy(self.ctest) + new.crefs = copy.copy(self.crefs) + return new + + def __init__(self, test=None, refs=None, n=4, sigma=6.0): + ''' singular instance ''' + self.n = n + self.sigma = sigma + self.crefs = [] + self.ctest = [] + self.document_frequency = defaultdict(float) + self.cook_append(test, refs) + self.ref_len = None + + def cook_append(self, test, refs): + '''called by constructor and __iadd__ to avoid creating new instances.''' + + if refs is not None: + self.crefs.append(cook_refs(refs)) + if test is not None: + self.ctest.append(cook_test(test)) ## N.B.: -1 + else: + self.ctest.append(None) # lens of crefs and ctest have to match + + def size(self): + assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) + return len(self.crefs) + + def __iadd__(self, other): + '''add an instance (e.g., from another sentence).''' + + if type(other) is tuple: + ## avoid creating new CiderScorer instances + self.cook_append(other[0], other[1]) + else: + self.ctest.extend(other.ctest) + self.crefs.extend(other.crefs) + + return self + + def compute_doc_freq(self): + ''' + Compute term frequency for reference data. 
+ This will be used to compute idf (inverse document frequency later) + The term frequency is stored in the object + :return: None + ''' + for refs in self.crefs: + # refs, k ref captions of one image + for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]): + self.document_frequency[ngram] += 1 + # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) + + def compute_cider(self, df_mode="corpus"): + def counts2vec(cnts): + """ + Function maps counts of ngram to vector of tfidf weights. + The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. + The n-th entry of array denotes length of n-grams. + :param cnts: + :return: vec (array of dict), norm (array of float), length (int) + """ + vec = [defaultdict(float) for _ in range(self.n)] + length = 0 + norm = [0.0 for _ in range(self.n)] + for (ngram,term_freq) in cnts.iteritems(): + # give word count 1 if it doesn't appear in reference corpus + df = np.log(max(1.0, self.document_frequency[ngram])) + # ngram index + n = len(ngram)-1 + # tf (term_freq) * idf (precomputed idf) for n-grams + vec[n][ngram] = float(term_freq)*(self.ref_len - df) + # compute norm for the vector. the norm will be used for + # computing similarity + norm[n] += pow(vec[n][ngram], 2) + + if n == 1: + length += term_freq + norm = [np.sqrt(n) for n in norm] + return vec, norm, length + + def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): + ''' + Compute the cosine similarity of two vectors. + :param vec_hyp: array of dictionary for vector corresponding to hypothesis + :param vec_ref: array of dictionary for vector corresponding to reference + :param norm_hyp: array of float for vector corresponding to hypothesis + :param norm_ref: array of float for vector corresponding to reference + :param length_hyp: int containing length of hypothesis + :param length_ref: int containing length of reference + :return: array of score for each n-grams cosine similarity + ''' + delta = float(length_hyp - length_ref) + # measure consine similarity + val = np.array([0.0 for _ in range(self.n)]) + for n in range(self.n): + # ngram + for (ngram,count) in vec_hyp[n].iteritems(): + val[n] += vec_hyp[n][ngram] * vec_ref[n][ngram] + + if (norm_hyp[n] != 0) and (norm_ref[n] != 0): + val[n] /= (norm_hyp[n]*norm_ref[n]) + + assert(not math.isnan(val[n])) + return val + + # compute log reference length + if df_mode == "corpus": + self.ref_len = np.log(float(len(self.crefs))) + elif df_mode == "coco-val-df": + # if coco option selected, use length of coco-val set + self.ref_len = np.log(float(40504)) + + scores = [] + for test, refs in zip(self.ctest, self.crefs): + # compute vector for test captions + vec, norm, length = counts2vec(test) + # compute vector for ref captions + score = np.array([0.0 for _ in range(self.n)]) + for ref in refs: + vec_ref, norm_ref, length_ref = counts2vec(ref) + score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) + # change by vrama91 - mean of ngram scores, instead of sum + score_avg = np.mean(score) + # divide by number of references + score_avg /= len(refs) + # multiply score by 10 + score_avg *= 10.0 + # append score of an image to the score list + scores.append(score_avg) + return scores + + def compute_score(self, df_mode, option=None, verbose=0): + # compute idf + if df_mode == "corpus": + self.compute_doc_freq() + # assert to check document frequency + assert(len(self.ctest) >= max(self.document_frequency.values())) + # import json for now and write the corresponding 
files + else: + self.document_frequency = pickle.load(open(os.path.join('data', df_mode + '.p'),'r')) + # compute cider score + score = self.compute_cider(df_mode) + # debug + # print score + return np.mean(np.array(score)), np.array(score) + + +# https://github.com/ramavedantam/cider/blob/master/pyciderevalcap/cider/cider.py +class Cider: + """ + Main Class to compute the CIDEr metric + + """ + def __init__(self, n=4, df="corpus"): + """ + Initialize the CIDEr scoring function + : param n (int): n-gram size + : param df (string): specifies where to get the IDF values from + takes values 'corpus', 'coco-train' + : return: None + """ + # set cider to sum over 1 to 4-grams + self._n = n + self._df = df + + def compute_score(self, gts, res): + """ + Main function to compute CIDEr score + : param gts (dict) : {image:tokenized reference sentence} + : param res (dict) : {image:tokenized candidate sentence} + : return: cider (float) : computed CIDEr score for the corpus + """ + + cider_scorer = CiderScorer(n=self._n) + + for res_id in res: + + hypo = res_id['caption'] + ref = gts[res_id['image_id']] + + # Sanity check. + assert(type(hypo) is list) + assert(len(hypo) == 1) + assert(type(ref) is list) + assert(len(ref) > 0) + cider_scorer += (hypo[0], ref) + + (score, scores) = cider_scorer.compute_score(self._df) + + return score, scores + + def method(self): + return "CIDEr" + + +def calculate_CIDEr_score(audio_file_list=None, text_file_list=None): + # convert audio files to text using speech-to-text + if audio_file_list is None or text_file_list is None: + raise ValueError("Both audio_file_list and text_file_list must be provided.") + if len(audio_file_list) != len(text_file_list): + raise ValueError("audio_file_list and text_file_list must have the same length.") + # Load the CIDEr scorer + cider_scorer = Cider(n=4, df="corpus") + # Prepare the ground truth and results + gts = {} + res = [] + from spacy.tokenizer import Tokenizer + from spacy.lang.en import English + nlp = English() + # Create a blank Tokenizer with just the English vocab + tokenizer = Tokenizer(nlp.vocab) + + for audio_file, text_file in zip(audio_file_list, text_file_list): + # Convert audio to text + text = speech_to_text(audio_file=audio_file) + + gts[audio_file] = [tokenizer(text).words] # Tokenize the text + + with open(text_file, 'r') as f: + reference_text = f.read().strip() + # Tokenize the reference text + text = tokenizer(reference_text).words + res.append({ + 'image_id': audio_file, + 'caption': [text] + }) + # Compute the CIDEr score + score, scores = cider_scorer.compute_score(gts, res) + return { + "CIDEr_score": score, + "scores": scores + } + + + + + + + +# ================================================ WER (Word Error Rate) related functions ================================================ +# These functions are used to calculate the WER + +# pip install werpy + +import werpy +def calculate_wer(audio_file_list: list, text_file_list: list) -> float: + """Calculate the Word Error Rate (WER) between a reference and a hypothesis. + Args: + audio_file_list (list): List of audio files to be transcribed. + text_file_list (list): List of text files containing the reference transcriptions. 
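# Sketch of the inputs expected by Cider.compute_score() above: gts maps an id
# to a list of reference captions (plain strings, tokenised by precook's
# .split()), and res is a list of {"image_id", "caption"} entries holding
# exactly one hypothesis string each.  Note that CiderScorer still calls
# .iteritems(), so it needs .items() before it will run under Python 3.
gts = {
    "sample_001.wav": ["a dog barks in the distance"],
}
res = [
    {"image_id": "sample_001.wav", "caption": ["a dog is barking far away"]},
]
# score, per_item_scores = Cider(n=4, df="corpus").compute_score(gts, res)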
+ """ + if len(audio_file_list) != len(text_file_list): + raise ValueError("audio_file_list and text_file_list must have the same length.") + + total_wer = 0.0 + for audio_file, text_file in zip(audio_file_list, text_file_list): + # Convert audio to text using speech-to-text + transcribed_text = speech_to_text(audio_file=audio_file) + + # Read the reference text from the file + with open(text_file, 'r') as f: + reference_text = f.read().strip() + + # Calculate WER + wer_score = werpy.wer(reference_text, transcribed_text) + total_wer += wer_score + + average_wer = total_wer / len(audio_file_list) + return {"WER_score": average_wer} + + + + +# ================================================ MCD (Mel Cepstral Distortion ) related functions ================================================ +# These functions are used to calculate the MCD + +# pip install -U pymcd +from pymcd.mcd import Calculate_MCD + +def calculate_mcd(reference_audio_list: str, generated_audio_list: str) -> float: + """Calculate the Mel Cepstral Distortion (MCD) between two audio files. + + Args: + reference_audio (str): Path to the reference audio file. + generated_audio (str): Path to the generated audio file. + + Returns: + float: The MCD score. + """ + # instance of MCD class + # three different modes "plain", "dtw" and "dtw_sl" for the above three MCD metrics + mcd_toolbox = Calculate_MCD(MCD_mode="plain") + + # two inputs w.r.t. reference (ground-truth) and synthesized speeches, respectively + mcd_scores = [] + for ref_audio, gen_audio in zip(reference_audio_list, generated_audio_list): + # calculate MCD score + mcd_score = mcd_toolbox.calculate_mcd(ref_audio, gen_audio) + mcd_scores.append(mcd_score) + # calculate average MCD score + mcd_score = sum(mcd_scores) / len(mcd_scores) + if mcd_score is None: + raise ValueError("MCD score could not be calculated. 
Please check the audio files.") + + return {"MCD_score": mcd_score, "mcd_scores": mcd_scores} + + + +class AudioGenerationModel: + def __init__(self, model_name: str): + self.model_name = model_name + + def __init__(self, model_name: str): + self.model_name = model_name + self.load_model() + + def load_model(self): + # Placeholder for loading the model + pass + + def generate(self, input_text: str) -> np.ndarray: + # Placeholder for audio generation logic + # This should return the generated audio as a numpy array or a file path + pass + + + +@dataclass +class Instance: + input: Dict[str, Any] + output: Dict[str, Any] + id: str + + +class BaseTask(ABC): + def __init__(self, task_data: Dict[str, Any], model: AudioGenerationModel, audio_dir: str = None, output_dir: str = None, task_name: str = None): + self.task_data = read_json(task_data) + self.model = model + self.audio_dir = audio_dir # should include the audios files + self.data = self._parse_data(self.task_data) + self.task_name = os.path.dirname(task_data).split("/")[-1] if task_name is None else task_name + self.output_dir = output_dir + os.makedirs(self.output_dir, exist_ok=True) if self.output_dir else None + + self.references = [] + self.predictions = [] + + def save_predictions(self, audio_paths): + results = [] + for gt, response, audio_path in zip(self.references, self.predictions, audio_paths): + results.append({ + 'gt': gt, + 'response': response, + 'audio_path': audio_path, + }) + time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime()) + results_file = os.path.join(self.output_dir, f'{self.task_name }_{time_prefix}.json') if self.output_dir else f'{self.task_name }_{time_prefix}.json' + json.dump(results, open(results_file, 'w')) + + @abstractmethod + def _get_choice_candidate(self): + pass + + @abstractmethod + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + pass + + @abstractmethod + def evaluate(self) -> Dict[str, float]: + pass + + @abstractmethod + def run_inference(self): + pass + + +class SingleCaptionToAudio(BaseTask): + def __init__(self, task_data: Dict[str, Any], model: AudioGenerationModel, audio_dir: str = None, output_dir: str = None, task_name: str = None): + super().__init__(task_data, model, audio_dir, output_dir, task_name) + self._get_choice_candidate() + + def _get_choice_candidate(self): + # Placeholder for getting choice candidates + pass + + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def save_predictions(self, audio_paths): + results = [] + for gt, response, audio_path in zip(self.references, self.predictions, audio_paths): + results.append({ + 'gt': gt, + 'response': response, + 'audio_path': audio_path, + }) + time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime()) + results_file = os.path.join(self.output_dir, f'{self.task_name }_{time_prefix}.json') if self.output_dir else f'{self.task_name }_{time_prefix}.json' + json.dump(results, open(results_file, 'w')) + + + def evaluate(self) -> Dict[str, float]: + self.predictions = [] + self.references = [] + for inst in tqdm.tqdm(self.data): + audio_path = os.path.join(self.audio_dir, inst.input["audio_file"]) + prompt = inst.input["prompt"] + try: + response = self.model.generate(prompt, audio_path=audio_path) + except: + print("error audio {}".format(inst.input["audio_file"])) + continue + # response is the generated audio file path + self.predictions.append(response) + 
self.references.append(prompt) + # self.save_predictions(audio_paths) + + def run_inference(self): + clap_score = calculate_clap_score(self.predictions, self.references) + return clap_score + + +class VideoToAudio(BaseTask): + def __init__(self, task_data: Dict[str, Any], model: AudioGenerationModel, audio_dir: str = None, output_dir: str = None, task_name: str = None): + super().__init__(task_data, model, audio_dir, output_dir, task_name) + self._get_choice_candidate() + + def _get_choice_candidate(self): + # Placeholder for getting choice candidates + pass + + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def evaluate(self) -> Dict[str, float]: + self.predictions = [] + self.references = [] + for inst in tqdm.tqdm(self.data): + video_path = os.path.join(self.audio_dir, inst.input["video_file"]) + prompt = inst.input["prompt"] + try: + response = self.model.generate(prompt, video_path=video_path) + except: + print("error video {}".format(inst.input["video_file"])) + continue + # response is the generated audio file path + self.predictions.append(response) + self.references.append(prompt) + + def run_inference(self): + fad_score = calculate_fad_score( + background_dir=self.audio_dir, + eval_dir=self.output_dir + ) + return fad_score + + +class ImageToSpeech(BaseTask): + def __init__(self, task_data: Dict[str, Any], model: AudioGenerationModel, audio_dir: str = None, output_dir: str = None, task_name: str = None): + super().__init__(task_data, model, audio_dir, output_dir, task_name) + self._get_choice_candidate() + + def _get_choice_candidate(self): + # Placeholder for getting choice candidates + pass + + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def evaluate(self) -> Dict[str, float]: + # Placeholder for evaluation logic + self.predictions = [] + self.references = [] + for inst in tqdm.tqdm(self.data): + image_path = os.path.join(self.audio_dir, inst.input["image_file"]) + prompt = inst.input["prompt"] + try: + response = self.model.generate(prompt, image_path=image_path) + except: + print("error image {}".format(inst.input["image_file"])) + continue + # response is the generated audio file path + self.predictions.append(response) + self.references.append(prompt) + + def run_inference(self): + CIDEr_score = calculate_CIDEr_score( + audio_file_list=self.predictions, + text_file_list=self.references + ) + return CIDEr_score + + +def log_performance_csv(model_name, task_name, metric, score, root_path, output_file='prediction.json'): + import csv + file_exists = os.path.isfile(os.path.join(root_path, output_file)) + + row_data = { + 'model': model_name, + 'task': task_name, + 'metric': metric, + 'score': str(score), + } + + with open(os.path.join(root_path, output_file), mode='a', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=row_data.keys()) + if not file_exists: + writer.writeheader() + + writer.writerow(row_data) + + +def log_performance_json(model_name, task_name, metric, score, root_path, output_file='prediction.json'): + import json + log_data = { + 'model': model_name, + 'task': task_name, + 'metric': metric, + 'score': str(score), + } + + log_file_path = os.path.join(root_path, output_file) + + if os.path.exists(log_file_path): + with open(log_file_path, 'r') as f: + existing_data = json.load(f) + else: + 
existing_data = [] + + existing_data.append(log_data) + + with open(log_file_path, 'w', encoding='utf-8') as f: + json.dump(existing_data, f, indent=4) + + + + +if __name__ == "__main__": + import argparse + # Parse command line arguments + parser = argparse.ArgumentParser(description="Run audio generation tasks") + parser.add_argument('-m', '--model_name', type=str, required=True, help='Name of the audio generation model to use') + parser.add_argument('-d', '--data_dir', type=str, default='./audio/generation/', help='Directory containing task data') + parser.add_argument('-o', '--output_dir', type=str, default='./audio/predictions/generation/', help='Directory to save predictions for each task') + parser.add_argument('-r', '--root_path', type=str, default='./', help='Root path for logging performance') + parser.add_argument('-t', '--task_names', type=str, nargs='+', + help='List of task names to run (for example: SingleCaptionToAudio VideoToAudio ImageToSpeech)') + args = parser.parse_args() + + # Initialize the model + model = AudioGenerationModel(model_name=args.model_name) + # data_dir = './generation/' + # output_dir = f'./predictions/generation/{args.model_name}' + # root_path = './' + + task_name_list = [ + 'SingleCaptionToAudio', 'VideoToAudio', 'ImageToSpeech', + # Add more task names as needed + ] + + if args.task_names is None or len(args.task_names) == 0: + args.task_names = task_name_list + + for task_name in args.task_names: # os.listdir(data_dir): + + # Dynamically get the class by its name + if task_name in globals(): # Ensure the class is defined in the current scope + task_class = globals()[task_name] + else: + # Optionally, handle cases where the class is not found + print(f"Task {task_name} is not defined in the current scope.") + continue + + # Initialize the task class + import glob + json_file_list = glob.glob(os.path.join(args.data_dir, task_name, "*.json")) + if len(json_file_list) == 0: + print(f"No JSON files found for task: {task_name}") + continue + elif len(json_file_list) > 1: + print(f"Multiple JSON files found for task: {task_name}, using the first one: {json_file_list[0]}") + task_annotation_data = json_file_list[0] + else: + task_annotation_data = json_file_list[0] + print(f"Using task annotation data: {task_annotation_data}") + task = task_class( + task_data=task_annotation_data, + model=model, + audio_dir=os.path.join(args.data_dir, task_name, 'audios'), + output_dir=args.output_dir + ) + + # Run inference for the task + # This should generate audio files based on the task's data + print(f"Running inference for task: {task_name}") + task.run_inference() + # if you want to save the predictions, you need to rewrite the save_predictions() in each Task class depending on your need, and call task.save_predictions() after task.run_inference() or inside the run_inference method. 
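# Minimal illustration of one record appended to <data_dir>/prediction.json by
# log_performance_json() above; the model and task names are placeholders, and
# the score is stored as a string because the function applies str(score).
import json

record = {
    "model": "MyAudioGenModel",
    "task": "SingleCaptionToAudio",
    "metric": "FAD",
    "score": "0.123",
}
print(json.dumps([record], indent=4))   # the log file holds a list of such records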
+ + + # Evaluate the task, return a dictionary of metrics + # For example, {'FAD_score': 0.123} + eval_results = task.evaluate() + print("Task name: ", task_name, "Evaluation results:", eval_results) + log_performance_json( + model_name=args.model_name, + task_name=task_name, + metric=list(eval_results.keys())[0].split('_')[0], # FAD_score + score=eval_results[list(eval_results.keys())[0]], # e.g., 0.123 + root_path=args.data_dir) + + # or you can run the tasks one by one like below: + # task_name = 'SingleCaptionToAudio' + # task = SingleCaptionToAudio( + # task_data=os.path.join(data_dir, f"{task_name}/annotation.json"), + # model=model, + # audio_dir=os.path.join(data_dir, f"{task_name}/audios"), + # output_dir=output_dir) + # task.run_inference() + # print(task.evaluate()) + + diff --git a/predictors/nlp_predictor.py b/predictors/nlp_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..b2e8e14a4838a8930011c92233a89b729e2b5ac3 --- /dev/null +++ b/predictors/nlp_predictor.py @@ -0,0 +1,1024 @@ +import json +import os +import tqdm +from typing import List, Dict, Any +import nltk +import re +from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction +from dataclasses import dataclass +from abc import ABC, abstractmethod +from transformers import pipeline +from rouge_score import rouge_scorer +from codebleu import calc_codebleu +import math +import numpy as np +import jieba + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel + + +class LLMModel: + def __init__(self, model_name: str): + self.model_name = model_name + self.is_time_series = False + self.timesfm_model = None # timesfm时序模型 + + if "timesfm" in model_name.lower(): + import timesfm + self.is_time_series = True + self.tfm = timesfm.TimesFm( + hparams=timesfm.TimesFmHparams( + backend="gpu", + per_core_batch_size=32, + ), + checkpoint=timesfm.TimesFmCheckpoint( + huggingface_repo_id=model_name), + ) + + elif "qwen" in model_name.lower() or "gemma" in model_name.lower() or "internlm" in model_name.lower() or "vicuna" in model_name.lower() or "gpt" in model_name.lower(): + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto") + self.copied_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto") + self.model = self.model.eval() + + elif "chatglm" in model_name.lower(): + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16, device_map="auto") + self.model = self.model.eval() + + else: + self.pipeline = pipeline("text-generation", model=model_name, device_map="auto", trust_remote_code=True) + + + def generate(self, prompt: str, max_new_tokens=256) -> str: + if self.is_time_series: + raise NotImplementedError("This model is a time-series model. 
Please call generate_for_timeseries() instead of generate().") + + if "vicuna" in self.model_name.lower() or "gpt" in self.model_name.lower(): + inputs = self.tokenizer(prompt, return_tensors="pt") + generate_ids = self.model.generate(inputs.input_ids.cuda(), max_new_tokens=max_new_tokens, pad_token_id=self.tokenizer.eos_token_id) + output = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + return output + + elif "llama" in self.model_name.lower(): + self.messages = [ + {"role": "system", "content": "You are a helpful and useful AI assistant."}, + {"role": "user", "content":prompt } + ] + prompt = self.pipeline.tokenizer.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) + terminators = [ + self.pipeline.tokenizer.eos_token_id, + self.pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>") + ] + output = self.pipeline(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1, + pad_token_id = self.pipeline.tokenizer.eos_token_id, + return_full_text=False, eos_token_id=terminators) + return output[0]["generated_text"] + + elif "qwen" in self.model_name.lower(): + self.messages = [ + {"role": "system", "content": "You are a helpful and useful AI assistant."}, + {"role": "user", "content": prompt} + ] + prompt = self.tokenizer.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) + model_inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda") + generated_ids = self.model.generate(model_inputs.input_ids, max_new_tokens=max_new_tokens, pad_token_id=self.tokenizer.eos_token_id) + generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] + response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + return response + + elif "gemma" in self.model_name.lower(): + self.messages = [ + {"role": "user", "content": prompt} + ] + prompt = self.tokenizer.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) + model_inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda") + generated_ids = self.model.generate(model_inputs.input_ids, max_new_tokens=max_new_tokens, pad_token_id=self.tokenizer.eos_token_id) + generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] + response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + return response + + elif "chatglm" in self.model_name.lower() or "internlm" in self.model_name.lower(): + response, _ = self.model.chat(self.tokenizer, prompt, history=[]) + return response + + def generate_for_timeseries( + self, + series_data: List[float], + horizon: int = 1, + freq: int = 0 + ) -> List[float]: + if self.is_time_series and self.tfm is not None: + forecast_input = [series_data] + frequency_input = [freq] + + point_forecast, _ = self.tfm.forecast( + forecast_input, + freq=frequency_input + ) + + forecast_result = point_forecast[0] + if horizon < len(forecast_result): + forecast_result = forecast_result[:horizon] + return forecast_result.tolist() + + else: + prompt = ( + "You are a time-series forecasting assistant.\n" + f"The historical data points are: {series_data}.\n" + f"Please predict the next {horizon} future data point(s) directly without other words based on the historical trend.\n\n" + "Format your answer as a list of floats, e.g. 
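# Hedged usage sketch of the LLMModel wrapper defined above: any model name
# containing "qwen" is loaded with AutoModelForCausalLM and answered through
# the chat-template branch of generate().  The checkpoint id is an assumption.
from predictors.nlp_predictor import LLMModel

llm = LLMModel("Qwen/Qwen2.5-7B-Instruct")   # assumed HF checkpoint id
print(llm.generate("Name three prime numbers.", max_new_tokens=32))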
`[3.1415, 2.7182]`.\n" + "Answer:" + ) + + raw_response = self.generate(prompt, max_new_tokens=64) + import re + pattern = r"\[([\d\.\,\s\-eE]+)\]" + match = re.search(pattern, raw_response) + if not match: + print("Warning: LLM output not in expected format, fallback to 0.0") + return [0.0] * horizon + + numbers_str = match.group(1) + raw_nums = re.split(r"[\s,]+", numbers_str.strip()) + parsed_vals = [] + for val in raw_nums: + try: + parsed_vals.append(float(val)) + except ValueError: + continue + + # 如果预测数量不够 horizon,就做填充或截断 + if len(parsed_vals) < horizon: + # 填充 + while len(parsed_vals) < horizon: + parsed_vals.append(parsed_vals[-1] if parsed_vals else 0.0) + elif len(parsed_vals) > horizon: + parsed_vals = parsed_vals[:horizon] + + return parsed_vals + + +@dataclass +class Instance: + input: Dict[str, Any] + output: Dict[str, Any] + id: str + +class BaseTask(ABC): + def __init__(self, task_data: Dict[str, Any], model: LLMModel): + self.task_data = task_data + self.model = model + self.data = self._parse_data(task_data) + + @abstractmethod + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + pass + + @abstractmethod + def run_inference(self): + pass + + +class MultipleChoiceQA(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output={}, id=d["id"]) + for d in task_data["data"]] + + def run_inference(self): + self.predictions = [] + for inst in tqdm.tqdm(self.data): + question = inst.input["question"] + options = inst.input["options"] + options_chars = [chr(65 + i) for i in range(len(options))] + prompt = f"Question: {question}\nOptions:\n" + for i, opt in enumerate(options): + prompt += options_chars[i] + ". " + opt + "\n" + + if self.task_data["task"] == "Causal Reasoning": + prompt += f"{question}\nPlease substitute yourself into the above scenario and select the most likely cause and effect outcome. " + prompt += r'Please answer the question and output it strictly in the following format: "The final answer is $\boxed{your choice}$" at the end of the sentence.' + response = self.model.generate(prompt, max_new_tokens=256) + pred = None + if "answer" not in response: + pred = "A" + else: + pattern = "answer" + response = re.split(pattern, response, flags=re.IGNORECASE)[-1] + for opt in options_chars: + if opt in response: + pred = opt + break + if pred is None: + pred = "A" + + self.predictions.append(pred) + + +class OpenQA(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output={}, id=d["id"]) + for d in task_data["data"]] + + def run_inference(self): + self.predictions = [] + for inst in tqdm.tqdm(self.data): + prompt = "" + question = inst.input["question"] + + if "context" in inst.input.keys(): + context = inst.input["context"] + prompt += f"Given the context: {context}\n" + + if self.task_data["task"] == "Temporal Reasoning": + prompt += f"{question}\nAccroding to the provided context, how long does it take for the event? 
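# Standalone check of the fallback parser above: pulling the forecast floats
# out of a free-form LLM reply.
import re

raw_response = "Sure, here is the forecast: [3.1415, 2.7182]"
match = re.search(r"\[([\d\.\,\s\-eE]+)\]", raw_response)
values = [float(v) for v in re.split(r"[\s,]+", match.group(1).strip())]
print(values)   # [3.1415, 2.7182]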
Please give a direct answer without other words" + elif self.task_data["task"] == "Medical Question Answering": + prompt += f"Please answer the question in a short pargraph: {question}" + elif self.task_data["task"] == "Multilingual Question Answering": + prompt += f"Please directly answer the question using the language in the question: {question}" + elif self.task_data["task"] == "Table Question Answering": + table = inst.input["table"] + prompt += f"Please read the content of the table below carefully and then directly answer the question without other words:\n{table}\n\nQuestion: {question}\nAnswer:" + else: + prompt += f"Please directly answer the question in a short sentence: {question}" + if self.task_data["task"] == "Document-Level Causal": + prompt += f"\nIf the context does not contain an answer to the question, simply output \"None of the above\"." + + response = self.model.generate(prompt, max_new_tokens=256) + pred = response.strip() + self.predictions.append(pred) + + +class SummarizationTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + if "document_list" in d: + instance = Instance( + input={"document_list": d["document_list"]}, + output={}, + id=d["id"] + ) + elif d.get("input") and "highlights" in d.get("output", {}): + instance = Instance( + input={"document": d["document"]}, + output={}, + id=d["id"] + ) + else: + instance = Instance( + input={"document": d["document"]}, + output={}, + id=d["id"] + ) + instances.append(instance) + return instances + + def run_inference(self): + self.predictions = [] + for inst in tqdm.tqdm(self.data): + if "document_list" in inst.input: + doc_list = inst.input["document_list"] + combined_docs = "\n".join(doc_list) + + prompt = ( + "You are a multi-document summarization assistant.\n" + "Please read the following documents, and then summarize them in a concise paragraph:\n\n" + f"{combined_docs}\n\n" + "Summary:" + ) + else: + doc = inst.input["document"] + prompt = ( + "Please summarize the following document in a short sentence\n" + f"{doc}\n" + "Summary:" + ) + + pred = self.model.generate(prompt, max_new_tokens=256) + + if "Summary:" in pred: + pred = pred.split("Summary:")[-1].strip() + else: + pred = pred.strip() + + self.predictions.append(pred) + + +class TranslationTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input={ + "source_lang": d["in"], + "target_lang": d["out"], + "text": d["input"] + }, + output={}, + id=d["id"]) + for d in task_data["data"]] + + def run_inference(self): + self.predictions = [] + for inst in tqdm.tqdm(self.data): + source_lang = inst.input["source_lang"] + target_lang = inst.input["target_lang"] + text = inst.input["text"] + + prompt = (f"Please directly Translate the following text from {source_lang} to {target_lang}.\n" + f"Text: {text}\n" + f"Translation:") + pred = self.model.generate(prompt, max_new_tokens=256) + if "Translation:" in pred: + pred = pred.split("Translation:")[-1].strip() + else: + pred = pred.strip() + + self.predictions.append(pred) + + +class StoryGenerationTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + instances.append( + Instance( + input=d["input"], + output={}, + id=d["id"] + ) + ) + return instances + + def run_inference(self): + self.predictions = [] + for inst in tqdm.tqdm(self.data): + prompt_text = inst.input["prompt"] + prompt = f"Please write 
a story based on the following prompt:\n{prompt_text}\nStory:" + pred = self.model.generate(prompt, max_new_tokens=512) + if "Story:" in pred: + pred = pred.split("Story:")[-1].strip() + + self.predictions.append(pred) + + +class DialogueGenerationTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + dialog_list = d.get("dialog", []) + if not dialog_list: + continue + + instances.append( + Instance( + input={"dialog": dialog_list}, + output={}, + id=d["id"] + ) + ) + return instances + + def run_inference(self): + self.predictions = [] + + for inst in tqdm.tqdm(self.data): + dialog_context = inst.input["dialog"] + prompt = "Below is a multi-turn conversation. Please continue the dialogue for the last turn.\n\n" + for turn_idx, turn in enumerate(dialog_context): + prompt += f"Turn {turn_idx + 1}: {turn}\n" + prompt += "\nNow please respond in one short answer:\n" + + pred = self.model.generate(prompt, max_new_tokens=128).strip() + self.predictions.append(pred) + + +class CodeGenerationTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + instance_id = d["id"] + language = d["language"] + goal = d["goal"] + context = d.get("context", []) + + instances.append( + Instance( + input={ + "language": language, + "goal": goal, + "context": context + }, + output={}, + id=instance_id + ) + ) + return instances + + def run_inference(self): + self.predictions = [] + self.languages = [] + + for inst in tqdm.tqdm(self.data): + language = inst.input["language"] + goal = inst.input["goal"] + context = inst.input["context"] + + prompt = f"You are an AI developer. Your goal is: {goal}\n" + prompt += f"Please write {language} code that solves the described task.\n\n" + + for c_item in context: + c_type = c_item["type"] + c_content = c_item["content"] + if c_type == "description": + prompt += f"Description:\n{c_content}\n\n" + elif c_type == "example": + prompt += "Examples:\n" + for ex in c_content: + prompt += f"- Input: {ex['input']}, Expected Output: {ex['output']}\n" + prompt += "\n" + else: + prompt += f"{c_type.capitalize()}:\n{c_content}\n\n" + + prompt += ( + "Now, please output ONLY the final code solution (without additional explanations, comments or text)." + "\nCode:\n" + ) + + pred_code = self.model.generate(prompt, max_new_tokens=256).strip() + if "Code:" in pred_code: + pred_code = pred_code.split("Code:", 1)[-1].strip() + + self.predictions.append(pred_code) + self.languages.append(language) + + +class CodeRepairTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + instance_id = d["id"] + input_part = d["input"] + + prompt = input_part["prompt"] + source_code = input_part["sourceCode"] + instances.append( + Instance( + input={ + "prompt": prompt, + "sourceCode": source_code + }, + output={}, + id=instance_id + ) + ) + return instances + + def run_inference(self): + self.predictions = [] + + for inst in tqdm.tqdm(self.data): + prompt = inst.input["prompt"] + source_code = inst.input["sourceCode"] + final_prompt = ( + f"{prompt}\n" + f"{source_code}\n\n" + "Now, please output ONLY the final code solution (without additional explanations, comments or text)." 
+ "Refined Code:" + ) + + pred_code = self.model.generate(final_prompt, max_new_tokens=256).strip() + if "Refined Code:" in pred_code: + pred_code = pred_code.split("Refined Code:", 1)[-1].strip() + + self.predictions.append(pred_code) + + +class CodeDefectDetectionTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + instances.append( + Instance( + input={"func": d["func"]}, + output={}, + id=d["id"] + ) + ) + return instances + + def run_inference(self): + self.predictions = [] + + for inst in tqdm.tqdm(self.data): + code_snippet = inst.input["func"] + prompt = ( + "You are a code reviewer. Below is a piece of code or function:\n" + f"{code_snippet}\n\n" + "Please review carefully and determine if it contains a grammatical or logical defect. " + "For example, the code below has defect:\n" + "static void show_packets(AVFormatContext *format_ctx)\n\n{\n\n AVPacket packet;\n\n\n\n av_init_packet(&packet);\n\n probe_array_header(\"packets\", 0);\n\n while (!av_read_frame(format_ctx, &packet))\n\n show_packet(format_ctx, &packet);\n\n probe_array_footer(\"packets\", 0);\n\n}\n" + "For another example, the code below has no defect:\n" + "static void visitor_output_setup_internal(TestOutputVisitorData *output_data,\n\n bool is_human)\n\n{\n\n output_data->human = is_human;\n\n output_data->sov = string_output_visitor_new(is_human);\n\n g_assert(output_data->sov);\n\n output_data->ov = string_output_get_visitor(output_data->sov);\n\n g_assert(output_data->ov);\n\n}\n" + "Output only 'No defect' if it does NOT contain a grammatical or logical defect, " + "or ouput only 'Defect' if it DOES contain a defect.\n" + "Answer:" + ) + + response = self.model.generate(prompt, max_new_tokens=16).strip() + + if "no defect" in response.lower(): + pred = "0" + else: + pred = "1" + + self.predictions.append(pred) + + +class TextToSQLTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + instances.append( + Instance( + input={ + "context": d["input"]["context"], + "question": d["input"]["question"], + }, + output={}, + id=d["id"] + ) + ) + return instances + + def run_inference(self): + self.predictions = [] + + for inst in tqdm.tqdm(self.data): + schema_context = inst.input["context"] + question = inst.input["question"] + + prompt = ( + "Below is a database schema:\n" + f"{schema_context}\n" + "Given the schema, please write a valid SQL query that answers the following question without other words.\n" + f"Question: {question}\n" + "SQL:" + ) + + response = self.model.generate(prompt, max_new_tokens=256) + if "SQL:" in response: + pred_sql = response.split("SQL:", 1)[-1].strip() + else: + pred_sql = response.strip() + + self.predictions.append(pred_sql) + + +class CodeExplanationTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + code_snippet = d["code"] + instance_id = d["id"] + + instances.append( + Instance( + input={"code": code_snippet}, + output={}, + id=instance_id + ) + ) + return instances + + def run_inference(self): + self.predictions = [] + + for inst in tqdm.tqdm(self.data): + code_snippet = inst.input["code"] + prompt = ( + "You are a code explainer. " + "Please read the following code snippet and provide a concise, clear explanation in natural language:. 
For example:\n" + "Code:\nboolean equalsResidueRing ( Object obj ) { if ( !( obj instanceof ResidueRing ) ) { return false ; } ResidueRing < C > otherRing = null ; try { otherRing = ( ResidueRing < C > ) obj ; } catch ( ClassCastException e ) { return false ; } if ( otherRing == null ) { return false ; } if ( ! ring . equals ( otherRing . ring ) ) { return false ; } return modul . equals ( otherRing . modul ) ; }" + "Explanation: compares this ResidueRing with another object.\n\n" + "Now please explain the code below without other words:\n" + f"{code_snippet}\n" + "Explanation:" + ) + + pred_explanation = self.model.generate(prompt, max_new_tokens=256).strip() + self.predictions.append(pred_explanation) + + +class MathematicalProofGenerationTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + statement = d["statement"] + + instances.append( + Instance( + input={ + "statement": statement + }, + output={}, + id=d["id"] + ) + ) + return instances + + def run_inference(self): + self.predictions = [] + + for inst in tqdm.tqdm(self.data): + statement = inst.input["statement"] + + prompt = ( + "You are a mathematical assistant. " + "Please provide a clear, step-by-step proof for the following statement:\n" + f"Statement: {statement}\n\n" + "Ensure you include the final conclusion as well. Proof:" + ) + + pred_proof = self.model.generate(prompt, max_new_tokens=512).strip() + self.predictions.append(pred_proof) + + +class MathematicalWordProblemSolvingTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + problem_text = d["problem"]["text"] + constraints = d["problem"].get("constraints", []) + + instances.append( + Instance( + input={ + "problem_text": problem_text, + "constraints": constraints + }, + output={}, + id=d["id"] + ) + ) + return instances + + def run_inference(self): + self.predictions_steps = [] + self.predictions_final = [] + + for inst in tqdm.tqdm(self.data): + problem_text = inst.input["problem_text"] + constraints = inst.input["constraints"] + constraints_str = "" + if constraints: + constraints_str = "\nConstraints:\n" + "\n".join(constraints) + + prompt = ( + "You are a math problem solver. Please solve the following word problem step by step. 
" + "Finally, provide the final numeric or short answer in a separate line labeled as 'Final Answer:'.\n\n" + f"Problem:\n{problem_text}{constraints_str}\n\n" + "Solution (step-by-step) + Final Answer:\n" + ) + + response = self.model.generate(prompt, max_new_tokens=512).strip() + + steps_part, final_part = response, "" + if "Final Answer:" in response: + parts = response.split("Final Answer:", 1) + steps_part = parts[0].strip() + final_part = parts[1].strip() + + self.predictions_steps.append(steps_part) + self.predictions_final.append(final_part) + + +class ParaphraseGenerationTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + instances.append( + Instance( + input={"originalSentence": d["input"]["originalSentence"]}, + output={}, + id=d["id"] + ) + ) + return instances + + def run_inference(self): + self.predictions = [] + for inst in tqdm.tqdm(self.data): + original_sentence = inst.input["originalSentence"] + + prompt = ( + "Please rewrite the following sentence in a different way but keep the same meaning:\n" + f"{original_sentence}\n" + "Paraphrase:" + ) + + pred = self.model.generate(prompt, max_new_tokens=128) + + if "Paraphrase:" in pred: + pred = pred.split("Paraphrase:")[-1].strip() + + self.predictions.append(pred.strip()) + + +class GrammarCorrectionTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [ + Instance( + input=d["input"], + output={}, + id=d["id"] + ) + for d in task_data["data"] + ] + + def run_inference(self): + self.predictions = [] + + for inst in tqdm.tqdm(self.data): + error_type = inst.input["Error Type"] + ungrammatical_sentence = inst.input["Ungrammatical Statement"] + + prompt = ( + f"You are a grammar correction assistant.\n" + f"There is a sentence with the following error type: {error_type}.\n" + f"Please rewrite the sentence in correct standard English without any other word.\n\n" + f"Ungrammatical Sentence: {ungrammatical_sentence}\n\n" + f"Rewritten Sentence:" + ) + + corrected = self.model.generate(prompt, max_new_tokens=128).strip() + if "Rewritten Sentence:" in corrected: + corrected = corrected.split("Rewritten Sentence:")[-1].strip() + + self.predictions.append(corrected) + + +class TextStyleTransferTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + instances.append( + Instance( + input={ + "text": d["input"]["text"], + "style": d["input"]["style"] + }, + output={}, + id=d["id"] + ) + ) + return instances + + def run_inference(self): + self.predictions = [] + + for inst in tqdm.tqdm(self.data): + text = inst.input["text"] + style = inst.input["style"] + + prompt = ( + "You are a style transfer assistant.\n" + "Below is a piece of text and a target style.\n" + f"Text: {text}\n" + f"Style: {style}\n\n" + "Please rewrite the above text to match the target style more accurately, " + "while keeping the original meaning intact.\n" + "Answer:" + ) + + pred = self.model.generate(prompt, max_new_tokens=256).strip() + if "Answer:" in pred: + pred = pred.split("Answer:")[-1].strip() + + self.predictions.append(pred) + + +class TableToTextGenerationTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + instance_id = d["id"] + table_data = d["input"]["table"] + instances.append( + Instance( + input={"table": table_data}, + output={}, + id=instance_id + ) + ) + return 
instances + + def run_inference(self): + self.predictions = [] + + for inst in tqdm.tqdm(self.data): + table_data = inst.input["table"] + + prompt = "Below is a table. Please generate a coherent description that summarizes the table's content.\n\n" + for table_idx, table_item in enumerate(table_data): + header = table_item["header"] + rows = table_item["rows"] + prompt += f"Table {table_idx+1}:\nHeader: {header}\nRows:\n" + for r_idx, row in enumerate(rows): + prompt += f"{r_idx+1}. {row}\n" + prompt += "\n" + + prompt += "Now write a concise text describing the above table:\n" + + pred_text = self.model.generate(prompt, max_new_tokens=512) + pred_text = pred_text.strip() + + self.predictions.append(pred_text) + + +class TimeSeriesForecastingTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + instances = [] + for d in task_data["data"]: + time_series = d["input"]["data"] + instances.append( + Instance( + input={"time_series": time_series}, + output={}, + id=d["id"] + ) + ) + return instances + + def run_inference(self): + self.predictions = [] + for inst in tqdm.tqdm(self.data): + series_data = inst.input["time_series"] + pred_values = self.model.generate_for_timeseries(series_data, horizon=1, freq=0) + predicted = pred_values[0] if pred_values else 0.0 + self.predictions.append(predicted) + + +class ClassificationTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output={}, id=d["id"]) + for d in task_data["data"]] + + def run_inference(self): + self.predictions = [] + for inst in tqdm.tqdm(self.data): + if 'stance_detection' in self.task_data['task']: + tweets = inst.input["tweets"] + target = inst.input["target"] + prompt = inst.input["prompt"].replace("<<>>", target).replace("<<>>", tweets) + elif 'aspect_sentiment_classification' in self.task_data['task']: + raw_text = inst.input["raw_text"] + target = inst.input["target"] + prompt = inst.input["prompt"].replace("<<>>", raw_text).replace("<<>>", target) + 'Please direct return the category name without any other words.' + elif 'target_oriented_opinion_words_extraction' in self.task_data['task']: + raw_text = inst.input["raw_text"] + aspect = inst.input["aspect"] + prompt = inst.input["prompt"].replace("<<>>", raw_text).replace("<<>>", aspect) + 'Please direct return the opinion word without any other words.' + else: + raw_text = inst.input["raw_text"] + prompt = inst.input["prompt"].replace("<<>>", raw_text) + 'Please return the desired result directly, without any other explanation.' + response = self.model.generate(prompt, max_new_tokens=64) + self.predictions.append(response.lower()) + + +class MultiLabelClassificationTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output={}, id=d["id"]) + for d in task_data["data"]] + + def run_inference(self): + self.predictions = [] + for inst in tqdm.tqdm(self.data): + raw_text = inst.input["raw_text"] + prompt = inst.input["prompt"].replace("<<>>", raw_text) + prompt = prompt + " Please return the desired result directly, without any other explanation." + " Split the result by commas instead of \\n." + response = self.model.generate(prompt, max_new_tokens=64) + self.predictions.append('

'.join(response.lower().split(', '))) + + +class ChoiceTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output={}, id=d["id"]) + for d in task_data["data"]] + + def run_inference(self): + self.predictions = [] + for inst in tqdm.tqdm(self.data): + raw_text = inst.input["raw_text"] + prompt = inst.input["prompt"].replace("<<>>", raw_text) + 'Please return the desired result directly, without any other explanation.' + response = self.model.generate(prompt, max_new_tokens=64) + if len(response.strip()) > 1: + if "A" in response.strip(): + response = "A" + elif "B" in response.strip(): + response = "B" + elif "C" in response.strip(): + response = "C" + elif "D" in response.strip(): + response = "D" + self.predictions.append(response.lower()) + + +class NERTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output={}, id=d["id"]) + for d in task_data["data"]] + + def run_inference(self): + self.predictions = [] + for inst in tqdm.tqdm(self.data): + text = inst.input["raw_text"] + prompt = inst.input["prompt"].replace("<<>>", text) + response = self.model.generate(prompt, max_new_tokens=128) + self.predictions.append('

'.join(response.lower().split(', '))) + + +def save_predictions(task_obj: BaseTask, task_directory: str): + save_path = os.path.join(task_directory, "prediction.json") + records = [] + if isinstance(task_obj, MathematicalWordProblemSolvingTask): + for idx, inst in enumerate(task_obj.data): + records.append({ + "id": inst.id, + "prediction_steps": task_obj.predictions_steps[idx], + "prediction_final": task_obj.predictions_final[idx] + }) + elif isinstance(task_obj, TimeSeriesForecastingTask): + for idx, inst in enumerate(task_obj.data): + records.append({ + "id": inst.id, + "prediction": float(task_obj.predictions[idx]) + }) + else: + for idx, inst in enumerate(task_obj.data): + pred_val = task_obj.predictions[idx] + if isinstance(pred_val, (np.floating, np.integer)): + pred_val = float(pred_val) + records.append({"id": inst.id, "prediction": pred_val}) + with open(save_path, "w", encoding="utf-8") as fp: + json.dump(records, fp, ensure_ascii=False, indent=2) + + +TASK_MAPPING = { + "MultipleChoiceQA": MultipleChoiceQA, + "OpenQA": OpenQA, + "Summarization": SummarizationTask, + "Story Generation": StoryGenerationTask, + "Translation": TranslationTask, + "Dialogue": DialogueGenerationTask, + "Code Generation": CodeGenerationTask, + "Code Defect Detection": CodeDefectDetectionTask, + "Code Repair": CodeRepairTask, + "Code Explanation": CodeExplanationTask, + "Proof": MathematicalProofGenerationTask, + "Mathematical Word Problem Solving": MathematicalWordProblemSolvingTask, + "Text to SQL": TextToSQLTask, + "Paraphrase Generation": ParaphraseGenerationTask, + "Grammar Correction": GrammarCorrectionTask, + "Table-to-Text Generation": TableToTextGenerationTask, + "Time Series": TimeSeriesForecastingTask, + "Text Style Transfer": TextStyleTransferTask, + "classification": ClassificationTask, + "multi label classification": MultiLabelClassificationTask, + "ner": NERTask, + "extraction": MultiLabelClassificationTask, + "relation extraction": MultiLabelClassificationTask, + "event detection": MultiLabelClassificationTask, + "parsing": MultiLabelClassificationTask, + "multiple choice": ChoiceTask, +} + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="NLP Predictor") + parser.add_argument("--dataset_dir", required=True) + parser.add_argument("--model_name", required=True) + args = parser.parse_args() + + data_root = os.path.abspath(args.dataset_dir) + model = LLMModel(args.model_name) + + task_dirs = sorted([d for d in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, d))]) + + for idx, task_folder in enumerate(task_dirs, start=1): + folder_path = os.path.join(data_root, task_folder) + annotation_path = os.path.join(folder_path, "annotation.json") + + with open(annotation_path, "r", encoding="utf-8") as f: + task_data = json.load(f) + + task_type = task_data.get("type") + task_name = task_data.get("task", task_folder) + print(f"\nTask {idx}/{len(task_dirs)}: {task_name} (Type = {task_type})") + + task_class = TASK_MAPPING.get(task_type, OpenQA) + task = task_class(task_data, model) + + task.run_inference() + save_predictions(task, folder_path) diff --git a/predictors/video_comprehension_flow_matching_tracking.py b/predictors/video_comprehension_flow_matching_tracking.py new file mode 100644 index 0000000000000000000000000000000000000000..55e696c4456e4af46755be103ed0c9b4f4114d51 --- /dev/null +++ b/predictors/video_comprehension_flow_matching_tracking.py @@ -0,0 +1,562 @@ +import tqdm +from typing import List, Dict, Any +from dataclasses 
import dataclass +from abc import ABC, abstractmethod +from PIL import Image +import numpy as np +import cv2 +from typing import Tuple +import os +import json +import argparse + +import torch +from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, CLIPImageProcessor, + CLIPVisionModel, GenerationConfig) + +def exact_match_accuracy(predictions: List[str], references: List[str]) -> float: + correct = 0 + for pred, ref in zip(predictions, references): + if isinstance(ref, str): + ref = [ref] + is_match_this_turn = False + for r in ref: + if pred.strip() == r.strip(): + is_match_this_turn = True + if is_match_this_turn: + correct += 1 + return correct / len(predictions) if predictions else 0.0 + + +def bbox_to_corners(bbox): + """将(x_min, y_min, w, h)格式转换为(x_min, y_min, x_max, y_max)格式""" + x_min, y_min, w, h = bbox + return (x_min, y_min, x_min + w, y_min + h) + + +def calculate_iou(bbox1, bbox2): + """计算两个边界框的交并比(IoU/Jaccard Index)""" + # 转换为对角坐标格式 + bbox1 = bbox_to_corners(bbox1) + bbox2 = bbox_to_corners(bbox2) + + # 计算交集区域的坐标 + x1 = max(bbox1[0], bbox2[0]) + y1 = max(bbox1[1], bbox2[1]) + x2 = min(bbox1[2], bbox2[2]) + y2 = min(bbox1[3], bbox2[3]) + + # 计算交集面积 + intersection_area = max(0, x2 - x1) * max(0, y2 - y1) + + # 计算两个边界框的面积 + bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) + bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) + + # 计算并集面积 + union_area = bbox1_area + bbox2_area - intersection_area + + # 计算IoU + if union_area == 0: + return 0.0 + return intersection_area / union_area + + +def calculate_j_metric(pred_bboxes, gt_bboxes): + """计算J指标(Jaccard Index)""" + if len(pred_bboxes) != len(gt_bboxes): + raise ValueError("预测边界框和真实边界框数量不一致") + + iou_values = [] + for pred, gt in zip(pred_bboxes, gt_bboxes): + iou = calculate_iou(pred, gt) + iou_values.append(iou) + + # 返回平均Jaccard Index + return sum(iou_values) / len(iou_values) if iou_values else 0.0 + + +def calculate_f1_score(pred_bboxes, gt_bboxes, threshold=0.5): + """计算F1 Score(F指标)""" + if len(pred_bboxes) == 0 and len(gt_bboxes) == 0: + return 1.0 # 特殊情况:没有检测也没有真实目标,视为完全正确 + + true_positives = 0 + false_positives = 0 + false_negatives = 0 + + # 标记已匹配的真实边界框 + gt_matched = [False] * len(gt_bboxes) + + # 计算每对边界框的IoU + iou_matrix = [] + for i, pred in enumerate(pred_bboxes): + row = [] + for j, gt in enumerate(gt_bboxes): + row.append(calculate_iou(pred, gt)) + iou_matrix.append(row) + + # 贪心匹配:将每个预测边界框匹配到IoU最高的真实边界框 + for i in range(len(pred_bboxes)): + if not iou_matrix: + break + + # 找到当前行的最大值及其索引 + max_iou = max(iou_matrix[i]) if iou_matrix[i] else 0 + j = iou_matrix[i].index(max_iou) if iou_matrix[i] else -1 + + if max_iou >= threshold: + true_positives += 1 + gt_matched[j] = True + else: + false_positives += 1 + + # 计算假阴性 + false_negatives = sum(1 for matched in gt_matched if not matched) + + # 计算精确率和召回率 + precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0 + recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0 + + # 计算F1 Score + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0 + return f1 + + +def calculate_j_and_f_metrics(pred_bboxes, gt_bboxes, iou_threshold=0.5): + """计算J指标和F指标""" + # 计算J指标 + j_metric = calculate_j_metric(pred_bboxes, gt_bboxes) + + # 计算F指标 + f_metric = calculate_f1_score(pred_bboxes, gt_bboxes, threshold=iou_threshold) + + return { + "J_metric": j_metric, + "F_metric": f_metric + } + +def 
read_flow(file_path: str) -> np.ndarray: + if file_path.endswith('.flo'): + return read_flow_flo(file_path) + elif file_path.endswith(('.png', '.jpg', '.jpeg')): + return read_flow_png(file_path) + else: + raise NotImplementedError + + +def read_flow_flo(file_path: str) -> np.ndarray: + with open(file_path, 'rb') as f: + + magic = np.fromfile(f, np.float32, count=1) + if 202021.25 != magic: + raise NotImplementedError + + w = np.fromfile(f, np.int32, count=1)[0] + h = np.fromfile(f, np.int32, count=1)[0] + + flow = np.fromfile(f, np.float32, count=2 * w * h) + flow = flow.reshape(h, w, 2) + + return flow + + +def read_flow_png(file_path: str) -> np.ndarray: + img = cv2.imread(file_path, cv2.IMREAD_UNCHANGED).astype(np.float32) + + # 确保图像有足够的通道 + if len(img.shape) != 3 or img.shape[2] < 2: + raise NotImplementedError + + u = (img[:, :, 2] - 32768.0) / 64.0 # R + v = (img[:, :, 1] - 32768.0) / 64.0 # G + + flow = np.stack([u, v], axis=2) + + return flow + + +def calculate_epe(flow_gt: np.ndarray, flow_pred: np.ndarray) -> Tuple[float, np.ndarray]: + if flow_gt.shape != flow_pred.shape: + raise NotImplementedError + + diff = flow_gt - flow_pred + epe_map = np.sqrt(np.sum(diff ** 2, axis=2)) + + mean_epe = np.mean(epe_map) + + return mean_epe, epe_map + +class Sa2VAModel: + def __init__(self, model_name="ByteDance/Sa2VA-4B"): + self.model_name = model_name + + model = AutoModel.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + use_flash_attn=True, + trust_remote_code=True, + ).eval().cuda() + + tokenizer = AutoTokenizer.from_pretrained( + model_name, + trust_remote_code=True, + ) + + self.model = model + self.tokenizer = tokenizer + + def generate(self, input_dict): + pred_dict = self.model.predict_forward(**input_dict, tokenizer=self.tokenizer) + if 'prediction_masks' in pred_dict.keys() and pred_dict['prediction_masks'] and len( + pred_dict['prediction_masks']) != 0: + masks = pred_dict['prediction_masks'][0] # (f, h, w) + else: + masks = None + text_response = pred_dict["prediction"] + return text_response, masks + +@dataclass +class Instance: + input: Dict[str, Any] + output: Dict[str, Any] + id: str + + +class BaseTask(ABC): + def __init__(self, task_data: Dict[str, Any], model): + self.task_data = task_data + self.model = model + self.data = self._parse_data(task_data) + + @abstractmethod + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + pass + + @abstractmethod + def evaluate(self) -> Dict[str, float]: + pass + + @abstractmethod + def run_inference(self): + pass + +def get_bbox_from_mask(mask): + if len(mask.shape) != 2: + raise NotImplementedError + + y_indices, x_indices = np.nonzero(mask) + + if len(x_indices) == 0 or len(y_indices) == 0: + return None + + x_min = np.min(x_indices) + x_max = np.max(x_indices) + y_min = np.min(y_indices) + y_max = np.max(y_indices) + + return (x_min, y_min, x_max-x_min, y_max-y_min) + +def mask2bbox(masks, video_length): + if masks is None: + bboxes = [[0, 0, 0, 0]] * video_length + else: + bboxes = [] + for mask in masks: + bbox = get_bbox_from_mask(mask) + if bbox is None: + bbox = [0, 0, 0, 0] + bboxes.append(bbox) + return bboxes + +class MatchTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def run_inference(self): + self.predictions = [] + self.references = [] + for inst in tqdm.tqdm(self.data): + prompt = "\n" + inst.input["prompt"] + video_folder 
= inst.input["video_folder"] + frame_files = [os.path.join(video_folder, _name) for _name in os.listdir(video_folder)] + video = [] + for image_path in frame_files: + video.append(Image.open(image_path).convert('RGB')) + + input_dict = { + "video": video, + "text": prompt, + } + + response, _ = self.model.generate(input_dict, max_new_tokens=256) + response = response.split("<")[0].strip() + + self.predictions.append(response) + self.references.append(inst.output["answer"]) + + def evaluate(self) -> Dict[str, float]: + acc = exact_match_accuracy(self.predictions, self.references) + return {"accuracy": acc} + +class TrackingTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def run_inference(self): + self.predictions = [] + self.references = [] + for inst in tqdm.tqdm(self.data): + prompt = "\n" + inst.input["prompt"] + video_folder = inst.input["video_folder"] + frame_files = [os.path.join(video_folder, _name) for _name in os.listdir(video_folder)] + video = [] + for image_path in frame_files: + video.append(Image.open(image_path).convert('RGB')) + + input_dict = { + "video": video, + "text": prompt, + } + + response, masks = self.model.generate(input_dict, max_new_tokens=256) + + bboxes = mask2bbox(masks, len(video)) + + self.predictions.append(bboxes) + self.references.append(inst.output["answer"]) + + def evaluate(self) -> Dict[str, float]: + j_f, n = 0, 1e-4 + for pred_bboxes, gt_bboxes in zip(self.predictions, self.references): + metrics = calculate_j_and_f_metrics(pred_bboxes, gt_bboxes) + j_f += (metrics['J_metric'] + metrics['F_metric']) / 2.0 + n += 1 + j_f = j_f / n + return {"J&F": j_f} + +class FlowTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def run_inference(self): + self.predictions = [] + self.references = [] + for inst in tqdm.tqdm(self.data): + prompt = "\n" + inst.input["prompt"] + video_folder = inst.input["video_folder"] + frame_files = [os.path.join(video_folder, _name) for _name in os.listdir(video_folder)] + video = [] + for image_path in frame_files: + video.append(Image.open(image_path).convert('RGB')) + + input_dict = { + "video": video, + "text": prompt, + } + + response, masks = self.model.generate(input_dict, max_new_tokens=256) + + pred_flows = np.zeros(masks.shape[1], masks.shape[2], 2) + + self.predictions.append(pred_flows) + self.references.append(read_flow(inst.output["flow"])) + + def evaluate(self) -> Dict[str, float]: + EPE, n = 0, 1e-4 + for pred_flow, gt_flow in zip(self.predictions, self.references): + mean_epe, _ = calculate_epe(pred_flow, gt_flow) + EPE += mean_epe + n += 1 + EPE = EPE / n + return {"EPE": EPE} + + +def log_performance(model_name, task_name, metrics, root_path, output_file='performance_log.csv'): + import csv + file_exists = os.path.isfile(os.path.join(root_path, output_file)) + + row_data = { + 'model': model_name, + 'task': task_name, + 'metrics': str(metrics) + } + + with open(os.path.join(root_path, output_file), mode='a', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=row_data.keys()) + if not file_exists: + writer.writeheader() + + writer.writerow(row_data) + + +def log_performance_detail(model_name, task_name, metrics, root_path, output_file='performance_log.csv'): + import csv + file_path = os.path.join(root_path, 
output_file) + file_exists = os.path.isfile(file_path) + + # 从metrics字典中获取主要指标值 + metric_value = None + if isinstance(metrics, dict): + # 按照优先级选择指标 + for key in ['accuracy', 'f1', 'micro_f1', 'bleu4', 'rougeL', 'code_bleu', 'MAE']: + if key in metrics: + metric_value = metrics[key] + break + if metric_value is None and len(metrics) > 0: + # 如果没有找到优先指标,使用第一个指标 + metric_value = list(metrics.values())[0] + else: + metric_value = metrics + + # 简化文件名,只保留最后一部分 + model_name = model_name.split('/')[-1] + + if file_exists: + # 读取现有数据 + rows = [] + tasks = set() + with open(file_path, 'r', newline='', encoding='utf-8') as f: + reader = csv.reader(f) + header = next(reader, ['task', model_name]) # 如果文件为空,使用默认表头 + if len(header) == 1: # 如果只有task列,添加model列 + header.append(model_name) + rows.append(header) + + # 读取现有数据并更新 + for row in reader: + if row[0] == task_name: # 如果找到相同任务,更新值 + row = [task_name, str(metric_value)] + tasks.add(row[0]) + rows.append(row) + + # 如果是新任务,添加新行 + if task_name not in tasks: + rows.append([task_name, str(metric_value)]) + else: + # 创建新文件 + rows = [ + ['task', model_name], + [task_name, str(metric_value)] + ] + + # 写入所有数据 + with open(file_path, 'w', newline='', encoding='utf-8') as f: + writer = csv.writer(f) + writer.writerows(rows) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--root_path", type=str, default="General-Bench-Openset/video/comprehension") + parser.add_argument("--model_name", type=str, default="ByteDance/Sa2VA-4B") + args = parser.parse_args() + root_path = args.root_path + model_name = args.model_name + + model = Sa2VAModel(model_name=model_name) + + task_files = [ + "AnimalTrack", + "GreenWaterTrack", + "LongVideoHumanTrack", + "RelationMatch", + "UAVUAVTrack", + "BallTrack", + "HumanPartTrack", + "LongVideoVehicleTrack", + "ShapeMatch", + "UAVVehicleTrack", + "BlueWaterTrack", + "HumanTrack", + "MotionMatch", + "SizeMatch", + "VehicleTrack", + "ColorMatch", + "LOGOMarkerMatch", + "ObjectMarkerMatch", + "SyntheticSceneFlowEstimate", + "WhiteWaterTrack", + "ComplexSceneFlowEstimate", + "LongVideoAnimalTrack", + "OtherPartTrack", + "UAVBuildingTrack", + "YellowWaterTrack", + "CrowdTrack", + "LongVideoCrowdTrack", + "PanoramicFlowEstimate", + "UAVGeneralObjectTrack", + "GeneralObjectTrack", + "LongVideoGeneralObjectTrack", + "PositionMatch", + "UAVHumanTrack"] + + task_files = [w + '.json' if not w.endswith('json') else w for w in task_files] + + if isinstance(task_files, str): + task_files = [task_files] + + for idx, filename in enumerate(task_files): + file_path = os.path.join(root_path, f"{filename.replace('.json', '')}/", filename) + if not os.path.exists(file_path): + continue + + with open(file_path, 'r', encoding='utf-8') as f: + task_data = json.load(f) + + task_type = task_data["type"] + task_name = task_data["task"] + print(f"Running evaluation for task {idx + 1}: {task_name}") + + # 定义任务类型与任务类的映射字典 + TASK_MAPPING = { + "AnimalTrack": TrackingTask, + "GreenWaterTrack": TrackingTask, + "LongVideoHumanTrack": TrackingTask, + "RelationMatch": MatchTask, + "UAVUAVTrack": TrackingTask, + "BallTrack": TrackingTask, + "HumanPartTrack": TrackingTask, + "LongVideoVehicleTrack": TrackingTask, + "ShapeMatch": MatchTask, + "UAVVehicleTrack": TrackingTask, + "BlueWaterTrack": TrackingTask, + "HumanTrack": TrackingTask, + "MotionMatch": MatchTask, + "SizeMatch": MatchTask, + "VehicleTrack": TrackingTask, + "ColorMatch": MatchTask, + "LOGOMarkerMatch": MatchTask, + "ObjectMarkerMatch": MatchTask, + 
"SyntheticSceneFlowEstimate": FlowTask, + "WhiteWaterTrack": TrackingTask, + "ComplexSceneFlowEstimate": FlowTask, + "LongVideoAnimalTrack": TrackingTask, + "OtherPartTrack": TrackingTask, + "UAVBuildingTrack": TrackingTask, + "YellowWaterTrack": TrackingTask, + "CrowdTrack": TrackingTask, + "LongVideoCrowdTrack": TrackingTask, + "PanoramicFlowEstimate": FlowTask, + "UAVGeneralObjectTrack": TrackingTask, + "GeneralObjectTrack": TrackingTask, + "LongVideoGeneralObjectTrack": TrackingTask, + "PositionMatch": MatchTask, + "UAVHumanTrack": TrackingTask, + } + + # 根据 task_type 获取对应的任务类 + task_class = TASK_MAPPING.get(task_type) # 使用精确匹配 + if task_class is None: + raise NotImplementedError + else: + task = task_class(task_data, model) + + task.run_inference() + metrics = task.evaluate() + print("Task name: ", task_name, "Task type: ", task_type, "Evaluation results:", metrics) + log_performance(model_name, task_name, metrics, root_path) \ No newline at end of file diff --git a/predictors/video_comprehension_qa_caption.py b/predictors/video_comprehension_qa_caption.py new file mode 100644 index 0000000000000000000000000000000000000000..5c71243c704e77971329cfb9406e6abd705808d1 --- /dev/null +++ b/predictors/video_comprehension_qa_caption.py @@ -0,0 +1,443 @@ +import tqdm +from typing import List, Dict, Any +from dataclasses import dataclass +from abc import ABC, abstractmethod +from PIL import Image +import numpy as np +import os +import json +import argparse + +import torch +from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer, + LlavaOnevisionForConditionalGeneration, AutoProcessor) + +# An example of the model +class LLavaOneVisionModel: + def __init__(self, model_name="llava-hf/llava-onevision-qwen2-7b-ov-hf"): + self.model_name = model_name + + model = LlavaOnevisionForConditionalGeneration.from_pretrained( + model_name, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).eval().cuda() + + tokenizer = AutoTokenizer.from_pretrained( + model_name, + trust_remote_code=True + ) + + self.processor = AutoProcessor.from_pretrained(model_name) + + self.model = model + self.tokenizer = tokenizer + + def generate(self, conversation, video): + prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True) + inputs = self.processor(images=video, text=prompt, return_tensors="pt").to(self.model.device, torch.float16) + outputs = self.model.generate(**inputs, max_new_tokens=256, do_sample=False) + text_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True) + text_response = text_response.split('assistant\n')[1] + + return text_response + +@dataclass +class Instance: + input: Dict[str, Any] + output: Dict[str, Any] + id: str + + +class BaseTask(ABC): + def __init__(self, task_data: Dict[str, Any], model): + self.task_data = task_data + self.model = model + self.data = self._parse_data(task_data) + + @abstractmethod + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + pass + + @abstractmethod + def evaluate(self) -> Dict[str, float]: + pass + + @abstractmethod + def run_inference(self): + pass + + +def cal_accuracy(predictions: List[str], references: List[str]) -> float: + correct = 0 + for pred, ref in zip(predictions, references): + if isinstance(ref, str): + ref = [ref] + is_match_this_turn = False + for r in ref: + if "yes" in r.lower() or "no" in r.lower(): + # for yes or no question + r = r.lower() + pred = pred.lower() + + if r.strip() in pred.strip(): + is_match_this_turn = True + + if is_match_this_turn: + correct += 
1 + return correct / len(predictions) if predictions else 0.0 + + +class Bleu1_Scorer(): + def __init__(self, predictions, references): + from pycocoevalcap.bleu.bleu import Bleu + self.pred = predictions + self.gt = references + self.scorers = [ + (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']), + ] + + def compute_scores(self): + total_scores = {} + for scorer, method in self.scorers: + print('Computing %s score...' % (scorer.method())) + score, scores = scorer.compute_score(self.gt, self.pred) + if isinstance(method, list): + for sc, scs, m in zip(score, scores, method): + print('%s: %0.3f' % (m, sc * 100)) + total_scores['Bleu'] = [x * 100 for x in score] + else: + total_scores[method] = score * 100 + + return {"Bleu_1": total_scores['Bleu'][0]} + + +class AccTask(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + self.task_name = task_data["task"] + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def read_video_frames(self, data_path_list, root_path, max_frames_num=64): + frames = [] + if len(data_path_list) > max_frames_num: + frame_idx = np.linspace(0, len(data_path_list) - 1, max_frames_num, dtype=int) + data_path_list = [data_path_list[i] for i in frame_idx] + + for frame_path in data_path_list: + path = os.path.join(root_path, frame_path) + if os.path.exists(path): + try: + frame = Image.open(path) + frames.append(frame) + except Exception as e: + print(f"Warning: Failed to read frame {path}. Error: {e}") + else: + print(f"Warning: Frame path {path} does not exist.") + return frames + + + def run_inference(self, root_path): + + if os.path.exists(f'./predictions_{self.task_name}.json'): + self.predictions = json.load(open(f'./predictions_{self.task_name}.json', 'r')) + self.references = json.load(open(f'./references_{self.task_name}.json', 'r')) + return + + self.predictions = [] + self.references = [] + for inst in tqdm.tqdm(self.data): + video_path = inst.input['video_file_list'] + video = self.read_video_frames(video_path, os.path.join(root_path, self.task_name, 'videos'), max_frames_num=64) + + question = 'Please answer the following question related to the video. ' + inst.input['prompt'] + + other_requirements = '' + if 'VideoActionCounting' in self.task_name: + other_requirements = 'The output must consist only of Arabic numerals.' + if 'VideoActionOrdering' in self.task_name: + other_requirements = 'The output format must be: [num]->[num]->[num]->[num]. The number represents the index marked in the question. For example: 2->1->3->4, 1->2->3->4, 3->2->1->4...' + if 'SignLanguageVideoRecognition' in self.task_name: + other_requirements = 'The output format must be a word.' 
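+            # The task-specific hints above pin the output format (a bare number for action counting,
+            # an index chain like 2->1->3->4 for ordering, a single word for sign language recognition)
+            # so the free-form generation stays short and easy to compare against the reference answer.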
+ question += other_requirements + + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": question}, + {"type": "video"}, + ], + }, + ] + + text_response = self.model.generate(conversation, video) + + self.predictions.append(text_response) + self.references.append(inst.output["text"]) + + json.dump(self.predictions, open(f'./predictions_{self.task_name}.json', 'w')) + json.dump(self.references, open(f'./references_{self.task_name}.json', 'w')) + + def evaluate(self) -> Dict[str, float]: + + acc = cal_accuracy(self.predictions, self.references) + return {"accuracy": acc*100} + + +class BLEUTASK(BaseTask): + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + self.task_name = task_data["task"] + return [Instance(input=d["input"], output=d["output"], id=d["id"]) + for d in task_data["data"]] + + def read_video_frames(self, data_path_list, root_path, max_frames_num=64): + frames = [] + if len(data_path_list) > max_frames_num: + frame_idx = np.linspace(0, len(data_path_list) - 1, max_frames_num, dtype=int) + data_path_list = [data_path_list[i] for i in frame_idx] + + for frame_path in data_path_list: + path = os.path.join(root_path, frame_path) + if os.path.exists(path): + try: + frame = Image.open(path) + frames.append(frame) + except Exception as e: + print(f"Warning: Failed to read frame {path}. Error: {e}") + else: + print(f"Warning: Frame path {path} does not exist.") + return frames + + + def run_inference(self, root_path): + + if os.path.exists(f'./predictions_{self.task_name}.json'): + self.predictions = json.load(open(f'./predictions_{self.task_name}.json', 'r')) + self.references = json.load(open(f'./references_{self.task_name}.json', 'r')) + return + + self.predictions = [] + self.references = [] + for inst in tqdm.tqdm(self.data): + video_path = inst.input['video_file_list'] + video = self.read_video_frames(video_path, os.path.join(root_path, self.task_name, 'videos'), max_frames_num=64) + + question = 'Please answer the following question related to the video. ' + inst.input['prompt'] + other_requirements = ' The output should be concise. 
' + question += other_requirements + + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": question}, + {"type": "video"}, + ], + }, + ] + + text_response = self.model.generate(conversation, video) + + self.predictions.append(text_response) + self.references.append(inst.output["text"]) + + json.dump(self.predictions, open(f'./predictions_{self.task_name}.json', 'w')) + json.dump(self.references, open(f'./references_{self.task_name}.json', 'w')) + + def evaluate(self) -> Dict[str, float]: + + predictions = {} + references = {} + + num = 1 + for pred, ref in zip(self.predictions, self.references): + predictions[str(num)] = [pred.lower()] + references[str(num)] = [ref.lower()] + num += 1 + + bleu1_scorer = Bleu1_Scorer(predictions, references) + bleu1_scores = bleu1_scorer.compute_scores() + return bleu1_scores + + + +def log_performance(model_name, task_name, metrics, root_path, output_file='performance_log.csv'): + import csv + file_exists = os.path.isfile(os.path.join(root_path, output_file)) + + row_data = { + 'model': model_name, + 'task': task_name, + 'metrics': str(metrics) + } + + with open(os.path.join(root_path, output_file), mode='a', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=row_data.keys()) + if not file_exists: + writer.writeheader() + + writer.writerow(row_data) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--root_path", type=str, default="General-Bench-Openset/video/comprehension") + parser.add_argument("--model_name", type=str, default="llava-hf/llava-onevision-qwen2-7b-ov-hf") + args = parser.parse_args() + root_path = args.root_path + model_name = args.model_name + + model = LLavaOneVisionModel(model_name=model_name) # An example of the model + + # 56 tasks + task_files = [ + "AgricultureVideoQuestionAnswering", + "ArtRecognition", + "ArtsAndCraftsVideoCaptioning", + "AutosAndVehiclesVideoCaptioning", + "BallGameVideoQuestionAnswering", + "BallSportsVideoCaptioning", + "BodyMotionVideoCaptioning", + "BusinessVideoCaptioning", + "ComedyVideoQuestionAnswering", + "DailyLifeAndSkillsVideoCaptioning", + "EducationVideoQuestionAnswering", + "EntertainmentRelatedVideoCaptioning", + "FacialActionVideoCaptioning", + "FacialObjectOperationsVideoCaptioning", + "FinanceVideoCaptioning", + "FoodVideoCaptioning", + "GameVideoQuestionAnswering", + "GeographyVideoQuestionAnswering", + "GymnasticsVideoQuestionAnswering", + "HistoryAndLiteratureVideoCaptioning", + "HumanHumanInteractionVideoCaptioning", + "HumanObjectInteractionVideoCaptioning", + "HumanObjectInteractionVideoQuestionAnswering", + "HumanSurvivalVideoQuestionAnswering", + "HumorVideoCaptioning", + "MilitaryVideoQuestionAnswering", + "MovieAndShowVideoCaptioning", + "MovieVideoQuestionAnswering", + "MusicalInstrumentsVideoCaptioning", + "MusicVideoQuestionAnswering", + "NaturalDisasterVideoRecognition", + "NewsAndDocumentaryVideoCaptioning", + "ObjectColorVideoQuestionAnswering", + "ObjectDirectionVideoQuestionAnswering", + "ObjectLocationVideoQuestionAnswering", + "ObjectMotionVideoQuestionAnswering", + "PersonalCareVideoCaptioning", + "PetsVideoQuestionAnswering", + "PetsVideoRecognition", + "ScienceAndTechnologyVideoCaptioning", + "ScienceVideoQuestionAnswering", + "ScienceVideoRecognition", + "SignLanguageVideoRecognition", + "SportsAndExcerciseVideoCaptioning", + "SportsVideoQuestionAnswering", + "TVShowRecognition", + "VideoActionCounting", + "VideoActionOrdering", + "VideoActionSequencePrediction", + 
"VideoActionSequenceUnderstanding", + "VideoAnimalRecognition", + "VideoFoodRecognition", + "VideoObjectCounting", + "VideoObjectExistenceRecognition", + "VideoObjectInteractionRecognition", + "VideoSportsRecognition", + ] + + task_files = [w + '.json' if not w.endswith('json') else w for w in task_files] + + if isinstance(task_files, str): + task_files = [task_files] + + for idx, filename in enumerate(task_files): + file_path = os.path.join(root_path, f"{filename.replace('.json', '')}/", "annotation.json") + + if not os.path.exists(file_path): + continue + + with open(file_path, 'r', encoding='utf-8') as f: + task_data = json.load(f) + + task_type = task_data["type"] + task_name = task_data["task"] + print(f"Running evaluation for task {idx + 1}: {task_name}") + + TASK_MAPPING = { + "AgricultureVideoQuestionAnswering": BLEUTASK, + "ArtRecognition": AccTask, + "ArtsAndCraftsVideoCaptioning": BLEUTASK, + "AutosAndVehiclesVideoCaptioning": BLEUTASK, + "BallGameVideoQuestionAnswering": AccTask, + "BallSportsVideoCaptioning": BLEUTASK, + "BodyMotionVideoCaptioning": BLEUTASK, + "BusinessVideoCaptioning": BLEUTASK, + "ComedyVideoQuestionAnswering": BLEUTASK, + "DailyLifeAndSkillsVideoCaptioning": BLEUTASK, + "EducationVideoQuestionAnswering": AccTask, + "EntertainmentRelatedVideoCaptioning": BLEUTASK, + "FacialActionVideoCaptioning": BLEUTASK, + "FacialObjectOperationsVideoCaptioning": BLEUTASK, + "FinanceVideoCaptioning": BLEUTASK, + "FoodVideoCaptioning": BLEUTASK, + "GameVideoQuestionAnswering": BLEUTASK, + "GeographyVideoQuestionAnswering": BLEUTASK, + "GymnasticsVideoQuestionAnswering": AccTask, + "HistoryAndLiteratureVideoCaptioning": BLEUTASK, + "HumanHumanInteractionVideoCaptioning": BLEUTASK, + "HumanObjectInteractionVideoCaptioning": BLEUTASK, + "HumanObjectInteractionVideoQuestionAnswering": BLEUTASK, + "HumanSurvivalVideoQuestionAnswering": BLEUTASK, + "HumorVideoCaptioning": BLEUTASK, + "MilitaryVideoQuestionAnswering": BLEUTASK, + "MovieAndShowVideoCaptioning": BLEUTASK, + "MovieVideoQuestionAnswering": BLEUTASK, + "MusicalInstrumentsVideoCaptioning": BLEUTASK, + "MusicVideoQuestionAnswering": BLEUTASK, + "NaturalDisasterVideoRecognition": BLEUTASK, + "NewsAndDocumentaryVideoCaptioning": BLEUTASK, + "ObjectColorVideoQuestionAnswering": AccTask, + "ObjectDirectionVideoQuestionAnswering": BLEUTASK, + "ObjectLocationVideoQuestionAnswering": AccTask, + "ObjectMotionVideoQuestionAnswering": AccTask, + "PersonalCareVideoCaptioning": BLEUTASK, + "PetsVideoQuestionAnswering": BLEUTASK, + "PetsVideoRecognition": BLEUTASK, + "ScienceAndTechnologyVideoCaptioning": BLEUTASK, + "ScienceVideoQuestionAnswering": BLEUTASK, + "ScienceVideoRecognition": BLEUTASK, + "SignLanguageVideoRecognition": AccTask, + "SportsAndExcerciseVideoCaptioning": BLEUTASK, + "SportsVideoQuestionAnswering": BLEUTASK, + "TVShowRecognition": AccTask, + "VideoActionCounting": AccTask, + "VideoActionOrdering": AccTask, + "VideoActionSequencePrediction": BLEUTASK, + "VideoActionSequenceUnderstanding": BLEUTASK, + "VideoAnimalRecognition": AccTask, + "VideoFoodRecognition": AccTask, + "VideoObjectCounting": BLEUTASK, + "VideoObjectExistenceRecognition": BLEUTASK, + "VideoObjectInteractionRecognition": BLEUTASK, + "VideoSportsRecognition": AccTask, + } + + task_class = TASK_MAPPING.get(task_name) + if task_class is None: + raise NotImplementedError + else: + task = task_class(task_data, model) + + task.run_inference(root_path=root_path) + metrics = task.evaluate() + + print("Task name: ", task_name, "Task type: ", task_type, 
"Evaluation results:", metrics) + log_performance(model_name, task_name, metrics, '../outcome/', output_file='video_comprehension_qa_caption_performance_log.csv') + + + diff --git a/predictors/video_comprehension_tasks.py b/predictors/video_comprehension_tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..f778f0f4c288502a352bf5c4e4ed077306341806 --- /dev/null +++ b/predictors/video_comprehension_tasks.py @@ -0,0 +1,550 @@ + + +from abc import ABC, abstractmethod +from dataclasses import dataclass +import os +from typing import Dict, Any, List +import json +import torch +import tqdm +import argparse + + +from transformers import AutoModelForCausalLM, AutoTokenizer +from PIL import Image +import pycocotools.mask as mask_util +import numpy as np + + +PREFIX = 'data' + +PROMPT = { + 'VOS': '\nPlease segment the major object in the video.', + 'RVOS': '\nPlease segment {}.', + 'ActionDet': '\nPlease detect {}.', + 'VDE': '\nPlease generate the depth map of the video.', +} + + +@dataclass +class Instance: + input: Dict[str, Any] + output: Dict[str, Any] + id: str + + +class BaseTask(ABC): + def __init__(self, task_data: str, model): + self.task_data = task_data + self.model = model + self.task_name = os.path.basename(task_data) + + + self.data = self._parse_data(task_data) + + @abstractmethod + def _parse_data(self, task_data: str) -> List[Instance]: + pass + + @abstractmethod + def evaluate(self, results:List[Instance]) -> Dict[str, float]: + pass + + @abstractmethod + def run_inference(self) -> List[Instance]: + pass + + +class TaskVOS(BaseTask): + + def _load_video(self, video_path: str) -> List[Image.Image]: + video_frames = [] + for frame_file in sorted(os.listdir(video_path)): + if frame_file.endswith('.jpg') or frame_file.endswith('.png'): + frame_path = os.path.join(video_path, frame_file) + video_frames.append(Image.open(frame_path).convert('RGB')) + return video_frames + + + def _parse_data(self, task_data: str) -> List[Instance]: + json_path = os.path.join(task_data, 'annotation.json') + json_data = json.load(open(json_path, 'r')) + + results = [] + json_data_data = json_data['data'] + for json_item in json_data_data: + input_dict = {} + input_dict['video_folder'] = json_item['input']['video_folder'] + input_dict['video'] = self._load_video(os.path.join(task_data, input_dict['video_folder'])) + + output_dict = {} + output_dict['serilized_masks'] = json_item['output'] + output_dict['masks'] = [] + for mask_id, mask_data in output_dict['serilized_masks'].items(): + mask = mask_util.decode(mask_data['mask']) + output_dict['masks'].append(mask) + instance_id = json_item['id'] + results.append(Instance(input=input_dict, output=output_dict, id=instance_id)) + return results + + + + def evaluate(self, results:List[Instance]) -> Dict[str, float]: + iou_list = [] + for instance in results: + masks = instance.output['masks'] + prediction_masks = instance.output['prediction_masks'] + + assert len(masks) == len(prediction_masks), "Number of masks and prediction masks do not match." + + intersection = 0. + union = 0. 
+ for gt_mask, pred_mask in zip(masks, prediction_masks): + intersection += (gt_mask.astype(bool) & pred_mask.astype(bool)).sum() + union += (gt_mask | pred_mask).sum() + iou = intersection / union if union > 0 else 0.0 + iou_list.append(iou) + iou_mean = np.mean(iou_list).item() * 100 + return {"IoU": iou_mean} + + def run_inference(self) -> List[Instance]: + results = [] + for instance in tqdm.tqdm(self.data, desc=f"Running inference on {self.task_name}"): + input_data = instance.input + + result = self.model.predict_forward( + video=input_data['video'], + text=PROMPT['VOS'], + ) + + # output postprocessing + output_masks = result['prediction_masks'] + + instance.output['prediction_masks'] = output_masks[0] + results.append(instance) + return results + + +class TaskRVOS(BaseTask): + def _load_video(self, video_path: str) -> List[Image.Image]: + video_frames = [] + for frame_file in sorted(os.listdir(video_path)): + if frame_file.endswith('.jpg') or frame_file.endswith('.png'): + frame_path = os.path.join(video_path, frame_file) + video_frames.append(Image.open(frame_path).convert('RGB')) + return video_frames + + + def _parse_data(self, task_data: str) -> List[Instance]: + json_path = os.path.join(task_data, 'annotation.json') + json_data = json.load(open(json_path, 'r')) + + results = [] + json_data_data = json_data['data'] + for json_item in json_data_data: + input_dict = {} + input_dict['video_folder'] = json_item['input']['video_folder'] + input_dict['video'] = self._load_video(os.path.join(task_data, input_dict['video_folder'])) + input_dict['prompt'] = json_item['input']['prompt'] + + output_dict = {} + output_dict['serilized_masks'] = json_item['output'] + output_dict['masks'] = [] + for mask_id, mask_data in output_dict['serilized_masks'].items(): + mask = mask_util.decode(mask_data['mask']) + output_dict['masks'].append(mask) + instance_id = json_item['id'] + results.append(Instance(input=input_dict, output=output_dict, id=instance_id)) + return results + + + + def evaluate(self, results:List[Instance]) -> Dict[str, float]: + iou_list = [] + for instance in results: + masks = instance.output['masks'] + prediction_masks = instance.output['prediction_masks'] + + assert len(masks) == len(prediction_masks), "Number of masks and prediction masks do not match." + + intersection = 0. + union = 0. 
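+            # Same frame-accumulated IoU as TaskVOS.evaluate; the only upstream difference is that
+            # run_inference formats PROMPT['RVOS'] with the per-instance referring expression.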
+ for gt_mask, pred_mask in zip(masks, prediction_masks): + intersection += (gt_mask.astype(bool) & pred_mask.astype(bool)).sum() + union += (gt_mask | pred_mask).sum() + iou = intersection / union if union > 0 else 0.0 + iou_list.append(iou) + iou_mean = np.mean(iou_list).item() * 100 + return {"IoU": iou_mean} + + def run_inference(self) -> List[Instance]: + results = [] + for instance in tqdm.tqdm(self.data, desc=f"Running inference on {self.task_name}"): + input_data = instance.input + + result = self.model.predict_forward( + video=input_data['video'], + text=PROMPT['RVOS'].format(input_data['prompt']), + ) + + # output postprocessing + output_masks = result['prediction_masks'] + + instance.output['prediction_masks'] = output_masks[0] + results.append(instance) + return results + + + +class TaskActionDet(BaseTask): + def _load_video(self, video_path: str) -> List[Image.Image]: + import cv2 + cap = cv2.VideoCapture(video_path) + img_list = [] + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + img_list.append(Image.fromarray(frame).convert('RGB')) + + return img_list + + + def _parse_data(self, task_data: str) -> List[Instance]: + if self.task_name in ['AnimalVG', 'AutoVG', 'HumanVG']: + self.is_vg = True + else: + self.is_vg = False + + json_path = os.path.join(task_data, 'annotation.json') + json_data = json.load(open(json_path, 'r')) + + results = [] + json_data_data = json_data['data'] + for json_item in json_data_data: + video_path = os.path.join(self.task_data, 'videos', json_item['video_path']) + image_list = self._load_video(video_path) + assert len(image_list) > 0, f"Video {video_path} has no frames." + if len(image_list) != json_item['frame_count']: + print(f"Warning: Frame count mismatch for video {video_path}. Expected {json_item['frame_count']}, got {len(image_list)}.") + while len(image_list) < json_item['frame_count']: + image_list.append(image_list[-1]) + input_dict = {} + input_dict['video'] = image_list + input_dict['prompt'] = json_item['caption'] + + output_dict = {} + if self.is_vg: + output_dict['tube_start_frame'] = json_item['tube_start_frame'] + output_dict['tube_end_frame'] = json_item['tube_end_frame'] + else: + output_dict['tube_start_frame'] = json_item['tube_start_frame'] - 1 + output_dict['tube_end_frame'] = json_item['tube_end_frame'] - 1 + + trajectory = json_item['trajectory'] + + if self.is_vg: + trajectory = [trajectory[frame_id_str]['bbox'] for frame_id_str in trajectory if output_dict['tube_start_frame'] <= int(frame_id_str) < output_dict['tube_end_frame']] + + assert len(trajectory) == output_dict['tube_end_frame'] - output_dict['tube_start_frame'] + bboxes = [] + for _ in range(output_dict['tube_start_frame']): + bboxes.append([0, 0, 0, 0]) + + # trajectory is a list of [x, y, w, h] for each frame + for item in trajectory: + x, y, w, h = item + bbox = [x, y, x + w, y + h] + bboxes.append(bbox) + + for _ in range(output_dict['tube_end_frame'], len(image_list)): + bboxes.append([0, 0, 0, 0]) + output_dict['bboxes'] = bboxes + + instance_id = json_item['original_video_id'] + results.append(Instance(input=input_dict, output=output_dict, id=instance_id)) + return results + + def evaluate(self, results:List[Instance]) -> Dict[str, float]: + iou_list = [] + for instance in results: + boxes = instance.output['bboxes'] + prediction_boxes = instance.output['prediction_boxes'] + assert len(boxes) == len(prediction_boxes), "Number of boxes and prediction boxes do not match." + iou = 0. 
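+            # vIoU over the spatio-temporal tube: boxes are [x1, y1, x2, y2]; frames where both the
+            # ground-truth and predicted boxes are all-zero (outside the tube) are skipped, frames
+            # where only one side has a box count toward the denominator with zero overlap, and the
+            # per-frame IoU is averaged over that denominator (frame_union).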
+ frame_union = 0 + for gt_box, pred_box in zip(boxes, prediction_boxes): + gt_box = np.array(gt_box) + pred_box = np.array(pred_box) + + if np.all(gt_box == 0) and np.all(pred_box == 0): + continue + frame_union += 1 + if np.all(gt_box == 0) or np.all(pred_box == 0): + continue + + intersection = np.maximum(0, np.minimum(gt_box[2:], pred_box[2:]) - np.maximum(gt_box[:2], pred_box[:2])) + intersection_area = intersection[0] * intersection[1] + gt_area = (gt_box[2] - gt_box[0]) * (gt_box[3] - gt_box[1]) + pred_area = (pred_box[2] - pred_box[0]) * (pred_box[3] - pred_box[1]) + union_area = gt_area + pred_area - intersection_area + iou += intersection_area / union_area + if frame_union > 0: + iou /= frame_union + iou_list.append(iou) + iou_mean = np.mean(iou_list).item() * 100 + return {"vIoU": iou_mean} + + def run_inference(self) -> List[Instance]: + results = [] + for instance in tqdm.tqdm(self.data, desc=f"Running inference on {self.task_name}"): + input_data = instance.input + + result = self.model.predict_boxes( + video=input_data['video'], + text=PROMPT['ActionDet'].format(input_data['prompt']), + ) + + # output postprocessing + output_masks = result['prediction_boxes'] + instance.output['prediction_boxes'] = output_masks[0] + results.append(instance) + return results + + + +class TaskVDE(BaseTask): + def _load_video(self, video_path: str) -> List[Image.Image]: + import cv2 + cap = cv2.VideoCapture(video_path) + img_list = [] + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + img_list.append(Image.fromarray(frame).convert('RGB')) + + return img_list + + def _parse_data(self, task_data: str) -> List[Instance]: + json_path = os.path.join(task_data, 'annotation.json') + json_data = json.load(open(json_path, 'r')) + + results = [] + json_data_data = json_data['data'] + for json_item in json_data_data: + video_path = os.path.join(self.task_data, 'video', json_item['input']) + annotation_path = os.path.join(self.task_data, 'depth', json_item['output']) + instance_id = json_item['id'] + + assert os.path.exists(video_path), f"Video path {video_path} does not exist." + assert os.path.exists(annotation_path), f"Annotation path {annotation_path} does not exist" + + + input_dict = {} + input_dict['video'] = self._load_video(video_path) + + output_dict = {} + output_dict['depth_map'] = np.load(annotation_path)['disparity'] # nf, 1, h, w + assert len(input_dict['video']) == output_dict['depth_map'].shape[0], "Number of video frames and depth map frames do not match." + assert output_dict['depth_map'].ndim == 4, "Depth map should be 4-dimensional (nf, 1, h, w)." + assert input_dict['video'][0].size == (output_dict['depth_map'].shape[3], output_dict['depth_map'].shape[2]), "Video frame size does not match depth map size." + results.append(Instance(input=input_dict, output=output_dict, id=instance_id)) + return results + + + def _abs_relative_difference(self, output, target, valid_mask=None): + actual_output = output[valid_mask] + actual_target = target[valid_mask] + abs_relative_diff = np.abs(actual_output - actual_target) / actual_target + return abs_relative_diff.mean() + + def evaluate(self, results:List[Instance]) -> Dict[str, float]: + abs_rel_list = [] + dataset_max_depth = 80 + for instance in results: + depth_map = instance.output['depth_map'] + prediction_depth = instance.output['prediction_depth'] + + assert depth_map.shape == prediction_depth.shape, "Depth map and prediction depth shape do not match." 
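+            # Scale-and-shift-invariant evaluation, as is common for monocular depth/disparity:
+            # fit gt = scale * pred + shift by least squares over the valid pixels, apply that affine
+            # map to the prediction, clip to the dataset range, and compute AbsRel = mean(|pred - gt| / gt)
+            # on the valid mask. The reported "score" squashes AbsRel through a sigmoid so that a
+            # lower AbsRel maps to a higher value in (0, 100].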
+ + # Calculate absolute relative error + gt_disp = depth_map[:, 0] + pred_disp = prediction_depth[:, 0] + # valid mask + valid_mask = np.logical_and( + (gt_disp > 1e-3), + (gt_disp < dataset_max_depth) + ) + pred_disp = np.clip(pred_disp, a_min=1e-3, a_max=None) + pred_disp_masked = pred_disp[valid_mask].reshape((-1, 1)) + + + gt_disp_maksed = gt_disp[valid_mask].reshape((-1, 1)).astype(np.float64) + # calc scale and shift + _ones = np.ones_like(pred_disp_masked) + A = np.concatenate([pred_disp_masked, _ones], axis=-1) + X = np.linalg.lstsq(A, gt_disp_maksed, rcond=None)[0] + scale, shift = X # gt = scale * pred + shift + + # align + aligned_pred = scale * pred_disp + shift + aligned_pred = np.clip(aligned_pred, a_min=1e-3, a_max=None) + + + pred_depth = aligned_pred + gt_depth = gt_disp + + # metric evaluation, clip to dataset min max + pred_depth = np.clip( + pred_depth, a_min=1e-3, a_max=dataset_max_depth + ) + abs_rel = self._abs_relative_difference( + pred_depth, + gt_depth, + valid_mask=valid_mask + ) + abs_rel_list.append(abs_rel) + + abs_rel_mean = np.mean(abs_rel_list).item() + + + def sigmoid(x): + return 1 / (1 + np.exp(-x)) + score = (sigmoid(0.1 / (abs_rel_mean + 1e-6)) * 2 - 1) * 100 + return {"absRel": abs_rel_mean, "score": score} + + + def run_inference(self) -> List[Instance]: + results = [] + for instance in tqdm.tqdm(self.data, desc=f"Running inference on {self.task_name}"): + input_data = instance.input + + result = self.model.predict_depth( + video=input_data['video'], + text=PROMPT['VDE'], + ) + + # output postprocessing + depth_map = result['prediction_depth'] + instance.output['prediction_depth'] = depth_map + results.append(instance) + return results + + +tasks = { + 'AnimalVOS': TaskVOS, + 'AutoVOS':TaskVOS, + 'HumanVOS':TaskVOS, + 'SportsVOS':TaskVOS, + + ## IW + 'IWAnimalVOS':TaskVOS, + 'IWAutoVOS':TaskVOS, + 'IWFurnitureVOS':TaskVOS, + 'IWHumanVOS':TaskVOS, + + ## Street + 'AutoStreetVOS':TaskVOS, + 'BicycleStreetVOS':TaskVOS, + 'HumanStreetVOS':TaskVOS, + + # RVOS + 'AnimalRVOS':TaskRVOS, + 'HumanRVOS':TaskRVOS, + + ## ReVOS, + 'AnimalReVOS':TaskRVOS, + 'AutoReVOS': TaskRVOS, + 'HumanReVOS': TaskRVOS, + + ## CReVOS + 'AnimalCReVOS': TaskRVOS, + 'AutoCReVOS' : TaskRVOS, + 'HumanCReVOS': TaskRVOS, + 'HumanPartCReVOS': TaskRVOS, + 'EquipmentCReVOS': TaskRVOS, + + + ## Action Det + # V-C-10 HCSTVG2 + 'StaticActionDet': TaskActionDet, + 'DynamicActionDet': TaskActionDet, + # V-C-12 VidSTG + 'AnimalVG': TaskActionDet, + 'AutoVG': TaskActionDet, + 'HumanVG': TaskActionDet, + + ## VDE + 'StaticVDE': TaskVDE, + 'StreetVDE': TaskVDE, + 'SynVDE': TaskVDE, + 'DynamicVDE': TaskVDE, +} + + + +def predict_dummy_boxes(video, text): + # Dummy function to simulate box prediction + # In practice, this should call the model's prediction method + num_frames = len(video) + return { + 'prediction_boxes': [ + [[0,0, 100, 100]] * num_frames, # Example boxes, [0, 0, 0, 0] is empty box + ] + } + + +def predict_dummy_depth(video, text): + # Dummy function to simulate depth prediction + # In practice, this should call the model's prediction method + num_frames = len(video) + width, height = video[0].size + return { + 'prediction_depth': np.random.rand(num_frames, 1, height, width).astype(np.float32) * 80 # Random depth values + } + + +def main(root:str, model_path:str): + metrics = {} + + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + use_flash_attn=True, + trust_remote_code=True, + ).eval().cuda() + tokenizer = 
AutoTokenizer.from_pretrained( + model_path, + trust_remote_code=True + ) + model.preparing_for_generation(tokenizer=tokenizer) + + model.predict_boxes = predict_dummy_boxes + model.predict_depth = predict_dummy_depth + + for task_name in tasks: + task_class = tasks[task_name] + task_data_path = os.path.join(root, task_name) + task_instance = task_class(task_data=task_data_path, model=model) + + results = task_instance.run_inference() + evaluation_results = task_instance.evaluate(results) + metrics[task_instance.task_name] = evaluation_results + + print(metrics) + + +if __name__ == "__main__": + # root = os.path.join(PREFIX, "General-Bench-Openset/video/comprehension") + import argparse + parser = argparse.ArgumentParser(description="Run video tasks evaluation.") + parser.add_argument("--model_path", type=str, default='ByteDance/Sa2VA-4B', required=False, help="Model to use for evaluation") + parser.add_argument("--root_path", type=str, default="General-Bench-Openset/video/comprehension", required=False, help="Root path to the dataset") + args = parser.parse_args() + main(args.root_path, args.model_path) \ No newline at end of file diff --git a/predictors/video_generation_evaluate_kit.py b/predictors/video_generation_evaluate_kit.py new file mode 100644 index 0000000000000000000000000000000000000000..84e28e804f29a08d874d4827a12611c6bf7b776d --- /dev/null +++ b/predictors/video_generation_evaluate_kit.py @@ -0,0 +1,327 @@ +import subprocess +from typing import List, Dict, Any +from dataclasses import dataclass +from abc import ABC, abstractmethod +from PIL import Image +from pathlib import Path +import numpy as np +import cv2 +import clip +import torch +from torch import nn +import torch.nn.functional as F + +from typing import Tuple +import os +import json +from diffusers import CogVideoXPipeline +from diffusers.utils import export_to_video +from video_generation_evaluation.toolkit.fvd import get_dataset_features, I3DFeatureExtractor +from numpy import cov +from numpy import mean +from scipy.linalg import sqrtm +from video_generation_evaluation.evaluate import task2dimension + + +class BaseTask(ABC): + def __init__(self, task_data: str, model): + self.task_data = task_data + self.model = model + self.data = self._parse_data(task_data) + + @abstractmethod + def _parse_data(self, task_data: Dict[str, Any]): + pass + + @abstractmethod + def evaluate(self) -> Dict[str, float]: + pass + + @abstractmethod + def run_inference(self): + pass + +class T2VTask(BaseTask): + def _parse_result_file(self, output_dir: Path) -> float | None: + for jsonfile in output_dir.iterdir(): + if "eval" in jsonfile.name: + with open(jsonfile.as_posix(), "r") as file: + data = json.load(file) + + return float(data[self.taskname][0]) + + def _parse_data(self, task_data): + with open(task_data, "r") as file: + annos = json.load(file) + taskname = annos["task"].replace(" ", "") + self.taskname = taskname + self.save_root = os.path.join("General-Bench", "Video-Generation", taskname) + return annos["data"] + + def run_inference(self): + for d in self.data: + prompt = d["input"]["prompt"] + for i in range(5): + video = self.model(prompt, generator=torch.Generator(self.model.device).manual_seed(i)).frames[0] + save_name = prompt + "-" + str(i) + ".mp4" + save_path = os.path.join(self.save_root, save_name) + export_to_video(video, save_path, fps=8) + +class FVDEval(T2VTask): + def evaluate(self, real_video_root): + model = I3DFeatureExtractor().cuda().eval() + + real_features = get_dataset_features(real_video_root, model) + 
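+        # With the I3D features of the real and generated sets, the Fréchet Video Distance is
+        # computed below as FVD = ||mu_r - mu_g||^2 + Tr(Sigma_r + Sigma_g - 2 * sqrtm(Sigma_r @ Sigma_g)),
+        # taking the real part when the matrix square root comes back complex.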
generated_features = get_dataset_features(self.save_root, model) + + mu_real = mean(real_features, axis=0) + mu_generated = mean(generated_features, axis=0) + + sigma_real = cov(real_features, rowvar=False) + sigma_generated = cov(generated_features, rowvar=False) + + diff = mu_real - mu_generated + covmean, _ = sqrtm(sigma_real.dot(sigma_generated), disp=False) + if np.iscomplexobj(covmean): + covmean = covmean.real + fvd = diff.dot(diff) + np.trace(sigma_real + sigma_generated - 2 * covmean) + print(f"{self.taskname} score: {fvd}") + return fvd + +class ThirdPartyEval(T2VTask): + def evaluate(self): + videos_path = Path(self.save_root).resolve() + dimension = task2dimension[self.taskname] + full_info = Path("./full_info_t2v.json").resolve() + output_dir = Path("./evaluation_results").resolve() + output_dir = output_dir.joinpath(self.taskname) + output_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + "python", "-W", "ignore", "evaluate.py", + "--full_json_dir", str(full_info), + "--videos_path", str(videos_path), + "--dimension", dimension, + "--output_path", str(output_dir) + ] + + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError as exc: + raise RuntimeError(f"Evaluation failed: {exc}") from exc + + score = self._parse_result_file(Path(output_dir)) + print(f"{self.taskname} score: {score}") + return score + +class I2VTask(BaseTask): + def _parse_result_file(self, output_dir: Path) -> float | None: + score = 0 + for jsonfile in output_dir.iterdir(): + if "eval" in jsonfile.name: + with open(jsonfile.as_posix(), "r") as file: + data: dict = json.load(file) + score += list(data.values())[0][0] + return score + + def _parse_data(self, task_data): + self.dirpath = os.path.dirname(task_data) + with open(task_data, "r") as file: + annos = json.load(file) + taskname = annos["task"].replace(" ", "") + self.taskname = taskname + self.dimensions = ("subject_consistency", "overall_consistency", "motion_smoothness", "dynamic_degree") + self.save_root = os.path.join("General-Bench", "Video-Generation", taskname) + return annos["data"] + + def run_inference(self): + for d in self.data: + prompt = d["input"]["prompt"] + image = d["input"]["image"] + image = os.path.join(self.dirpath, image) + for i in range(5): + video = self.model( + prompt=prompt, + image=image, + generator=torch.Generator(self.model.device).manual_seed(i) + ).frames[0] + save_name = prompt + "-" + str(i) + ".mp4" + save_path = os.path.join(self.save_root, save_name) + export_to_video(video, save_path, fps=8) + + def evaluate(self): + taskname = self.taskname + full_info = Path("./full_info_i2v.json").resolve() + output_dir = Path("./evaluation_results").resolve() + output_dir = output_dir.joinpath(taskname) + output_dir.mkdir(parents=True, exist_ok=True) + + for dimension in self.dimensions: + cmd = [ + "python", "-W", "ignore", "evaluate.py", + "--full_json_dir", str(full_info), + "--videos_path", str(self.save_root), + "--dimension", dimension, + "--output_path", str(output_dir) + ] + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError as exc: + raise RuntimeError(f"Evaluation failed: {exc}") from exc + + score = self._parse_result_file(Path(output_dir)) + print(f"{self.taskname} score: {score}") + return score + +class AthleticsT2V(FVDEval): pass + +class HumanT2V(FVDEval): pass + +class ConcertT2V(FVDEval): pass + +class TerrestrialAnimalT2V(FVDEval): pass + +class WaterSportsT2V(FVDEval): pass + +class ActionT2V(ThirdPartyEval): pass + +class ArtisticT2V(ThirdPartyEval): pass + +class 
BackgroundConsistency(ThirdPartyEval): pass + +class CameraMotionT2V(ThirdPartyEval): pass + +class ClassConditionedT2V(ThirdPartyEval): pass + +class ColorT2V(ThirdPartyEval): pass + +class DynamicT2V(ThirdPartyEval): pass + +class MaterialT2V(ThirdPartyEval): pass + +class MultiClassConditionedT2V(ThirdPartyEval): pass + +class SceneT2V(ThirdPartyEval): pass + +class SpatialRelationT2V(ThirdPartyEval): pass + +class StaticT2V(ThirdPartyEval): pass + +class StyleT2V(ThirdPartyEval): pass + +class ArchitectureI2V(I2VTask): pass + +class ClothI2V(I2VTask): pass + +class FoodI2V(I2VTask): pass + +class FurnitureI2V(I2VTask): pass + +class HumanI2V(I2VTask): pass + +class PetI2V(I2VTask): pass + +class PlantI2V(I2VTask): pass + +class SceneI2V(I2VTask): pass + +class VehicleI2V(I2VTask): pass + +class WeatherI2V(I2VTask): pass + +class WildAnimalI2V(I2VTask): pass + + +if __name__ == "__main__": + root = Path("General-Bench-Openset/video/generation") + + task_type = "T2V" + model = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.bfloat16).to("cuda") + + task_files = [ + "AthleticsT2V", + "HumanT2V", + "ConcertT2V", + "TerrestrialAnimalT2V", + "WaterSportsT2V", + "ActionT2V", + "ArtisticT2V", + "BackgroundConsistency", + "CameraMotionT2V", + "ClassConditionedT2V", + "ColorT2V", + "DynamicT2V", + "MaterialT2V", + "MultiClassConditionedT2V", + "SceneT2V", + "SpatialRelationT2V", + "StaticT2V", + "StyleT2V", + "ArchitectureI2V", + "ClothI2V", + "FoodI2V", + "FurnitureI2V", + "HumanI2V", + "PetI2V", + "PlantI2V", + "SceneI2V", + "VehicleI2V", + "WeatherI2V", + "WildAnimalI2V", + ] + + task_files = [root.joinpath(task, "annotation.json") for task in task_files] + + for idx, file in enumerate(task_files): + if not file.exists(): + continue + + with open(file.as_posix(), 'r', encoding='utf-8') as f: + task_data = json.load(f) + + task_name = task_data["task"] + print(f"Running evaluation for task {idx + 1}: {task_name}") + + TASK_MAPPING = { + "AthleticsT2V": AthleticsT2V, + "HumanT2V": HumanT2V, + "ConcertT2V": ConcertT2V, + "TerrestrialAnimalT2V": TerrestrialAnimalT2V, + "WaterSportsT2V": WaterSportsT2V, + "ActionT2V": ActionT2V, + "ArtisticT2V": ArtisticT2V, + "BackgroundConsistency": BackgroundConsistency, + "CameraMotionT2V": CameraMotionT2V, + "ClassConditionedT2V": ClassConditionedT2V, + "ColorT2V": ColorT2V, + "DynamicT2V": DynamicT2V, + "MaterialT2V": MaterialT2V, + "MultiClassConditionedT2V": MultiClassConditionedT2V, + "SceneT2V": SceneT2V, + "SpatialRelationT2V": SpatialRelationT2V, + "StaticT2V": StaticT2V, + "StyleT2V": StyleT2V, + "ArchitectureI2V": ArchitectureI2V, + "ClothI2V": ClothI2V, + "FoodI2V": FoodI2V, + "FurnitureI2V": FurnitureI2V, + "HumanI2V": HumanI2V, + "PetI2V": PetI2V, + "PlantI2V": PlantI2V, + "SceneI2V": SceneI2V, + "VehicleI2V": VehicleI2V, + "WeatherI2V": WeatherI2V, + "WildAnimalI2V": WildAnimalI2V, + } + + clean_task_name = task_name.replace(" ", "") + task_class = TASK_MAPPING.get(clean_task_name) + if task_class is None: + raise NotImplementedError + elif task_type not in clean_task_name: + continue + else: + task = task_class(file.as_posix(), model) + + task.run_inference() + metrics = task.evaluate() + print("Task name: ", task_name, "Task type: ", task_type, "Evaluation results:", metrics) \ No newline at end of file diff --git a/predictors/video_translation_restoration_superresolution_objectdetection.py b/predictors/video_translation_restoration_superresolution_objectdetection.py new file mode 100644 index 
0000000000000000000000000000000000000000..c9255d366acfbf2c8a3ffbb95b7e5f38f7935c6d --- /dev/null +++ b/predictors/video_translation_restoration_superresolution_objectdetection.py @@ -0,0 +1,340 @@ +""" +Unified evaluator for four video–vision tasks and their metrics + + • Video Translation → Frame-Acc (CLIP-based) + • Video Restoration (去噪/去模糊/…) → PSNR + • Video Super-Resolution → MUSIQ (no-reference IQA) + • Video (Salient / Camouflaged) Object Detection → Structure-measure + +""" + +from __future__ import annotations + +import os +import sys +import json +import math +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Dict, Any, List, Tuple + +# ───────────────────────── third-party imports ──────────────────────────── +import numpy as np +from PIL import Image +from tqdm import tqdm +import torch +import torchvision.transforms as T + +import open_clip # Frame-Acc + +import pyiqa # MUSIQ + + +# Accepted image extensions (case-insensitive) +IMG_EXTS = ('.png', '.jpg', '.jpeg', '.bmp') + +# ───────────────────────────── dataclass ──────────────────────────────── +@dataclass +class Instance: + """Single sample inside the JSON""" + input: Dict[str, Any] + output: Dict[str, Any] + id: str + +# ────────────────────────────── abstract ──────────────────────────────── +class BaseTask(ABC): + def __init__(self, task_data: Dict[str, Any]): + self.task_data = task_data + self.data: List[Instance] = self._parse_data(task_data) + + # --- implement in subclass ------------------------------------------------ + @abstractmethod + def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: + ... + + @abstractmethod + def run_inference(self) -> None: + """collect paths & meta ⇒ self.records (does *not* run a model)""" + ... + + @abstractmethod + def evaluate(self) -> Dict[str, float]: + ... + +# ════════════════════════════════════════════════════════════════════════════ +# 1. Video Translation – Frame-Acc +# ════════════════════════════════════════════════════════════════════════════ +class VideoTranslationTask(BaseTask): + def _parse_data(self, task_data): + return [Instance(**d) for d in task_data["data"]] + + def run_inference(self): + """gather [(frame_paths, src_prompt, tgt_prompt), …]""" + self.records: List[Tuple[List[str], str, str]] = [] + for inst in tqdm(self.data, desc="collect-frames"): + frame_dir = inst.output["frame_dir"] + frames = sorted( + os.path.join(frame_dir, f) + for f in os.listdir(frame_dir) + if f.lower().endswith(IMG_EXTS) + ) + self.records.append((frames, + inst.input["source_prompt"], + inst.input["target_prompt"])) + + @torch.no_grad() + def evaluate(self, batch_size: int = 32): + if open_clip is None: + raise ImportError("open_clip_torch not installed. 
pip install open_clip_torch") + + device = 'cuda' if torch.cuda.is_available() else 'cpu' + model, _, preprocess = open_clip.create_model_and_transforms( + "ViT-B-32", pretrained="openai", device=device + ) + model.eval() + tokenizer = open_clip.tokenize + + total, correct = 0, 0 + for frame_paths, src_prompt, tgt_prompt in tqdm(self.records, desc="Frame-Acc eval"): + text_feat = model.encode_text( + tokenizer([src_prompt, tgt_prompt]).to(device) + ).float() + text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True) # (2,D) + + for i in range(0, len(frame_paths), batch_size): + batch_files = frame_paths[i:i + batch_size] + imgs = torch.stack([ + preprocess(Image.open(p).convert("RGB")) for p in batch_files + ]).to(device) + img_feat = model.encode_image(imgs).float() + img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True) # (B,D) + sim = img_feat @ text_feat.T # (B,2) + correct += (sim[:, 1] > sim[:, 0]).sum().item() + total += sim.size(0) + + return {"Frame-Acc": 100.0 * correct / total if total else 0.0} + + +# ════════════════════════════════════════════════════════════════════════════ +# 2. Video Restoration suite – PSNR +# ════════════════════════════════════════════════════════════════════════════ +def compute_psnr(img1: np.ndarray, img2: np.ndarray, max_val: float = 255.0) -> float: + mse = np.mean((img1 - img2) ** 2, dtype=np.float64) + if mse == 0: + return math.inf + return 10.0 * math.log10((max_val ** 2) / mse) + + +class VideoRestorationTask(BaseTask): + def _parse_data(self, task_data): + return [Instance(**d) for d in task_data["data"]] + + def run_inference(self): + """gather [(pred_paths, gt_paths), …]""" + self.records: List[Tuple[List[str], List[str]]] = [] + for inst in tqdm(self.data, desc="collect-frames"): + pred_dir = inst.input["pred_dir"] + gt_dir = inst.input["gt_dir"] + + frame_names = sorted( + f for f in os.listdir(gt_dir) if f.lower().endswith(IMG_EXTS) + ) + pred_paths, gt_paths = [], [] + for fname in frame_names: + p_path = os.path.join(pred_dir, fname) + g_path = os.path.join(gt_dir, fname) + if not os.path.exists(p_path): + raise FileNotFoundError(f"Missing prediction frame: {p_path}") + pred_paths.append(p_path) + gt_paths.append(g_path) + self.records.append((pred_paths, gt_paths)) + + def evaluate(self): + psnr_sum, valid_frames = 0.0, 0 + + for preds, gts in tqdm(self.records, desc="PSNR eval"): + for p, g in zip(preds, gts): + img1 = np.array(Image.open(p).convert("RGB"), dtype=np.float32) + img2 = np.array(Image.open(g).convert("RGB"), dtype=np.float32) + + if img1.shape != img2.shape: + raise ValueError(f"Shape mismatch: {p} vs {g}") + + val = compute_psnr(img1, img2) + if math.isfinite(val): + psnr_sum += val + valid_frames += 1 + + return {"PSNR": psnr_sum / valid_frames if valid_frames else 0.0} + +# ════════════════════════════════════════════════════════════════════════════ +# 3. 
Video Super-Resolution – MUSIQ +# ════════════════════════════════════════════════════════════════════════════ +class VideoSuperResolutionTask(BaseTask): + def _parse_data(self, task_data): + return [Instance(**d) for d in task_data["data"]] + + def run_inference(self): + self.records: List[List[str]] = [] + for inst in tqdm(self.data, desc="collect-frames"): + pred_dir = inst.input["pred_dir"] + frames = sorted( + os.path.join(pred_dir, f) + for f in os.listdir(pred_dir) + if f.lower().endswith(IMG_EXTS) + ) + if not frames: + raise RuntimeError(f"No prediction frames found in {pred_dir}") + self.records.append(frames) + + @torch.no_grad() + def evaluate(self, batch_size: int = 8): + if pyiqa is None: + raise ImportError("pyiqa not installed. pip install pyiqa") + + device = 'cuda' if torch.cuda.is_available() else 'cpu' + model = pyiqa.create_metric('musiq', device=device, as_loss=False) + model.eval() + transform = T.ToTensor() + + total_sum, total_frames = 0.0, 0 + for frames in tqdm(self.records, desc="MUSIQ eval"): + for i in range(0, len(frames), batch_size): + batch = frames[i:i + batch_size] + imgs = torch.stack([ + transform(Image.open(p).convert("RGB")) for p in batch + ]).to(device) + scores = model(imgs) # (B,) + total_sum += scores.sum().item() + total_frames += scores.numel() + + return {"MUSIQ": total_sum / total_frames if total_frames else 0.0} + + +# ════════════════════════════════════════════════════════════════════════════ +# 4. Video (Salient / Camouflaged) Object Detection – Structure-measure +# ════════════════════════════════════════════════════════════════════════════ +def _ssim(pred: np.ndarray, gt: np.ndarray) -> float: + C1, C2 = 0.01 ** 2, 0.03 ** 2 + mp, mg = pred.mean(), gt.mean() + var_p, var_g = pred.var(), gt.var() + cov = ((pred - mp) * (gt - mg)).mean() + return ((2 * mp * mg + C1) * (2 * cov + C2)) / ( + (mp ** 2 + mg ** 2 + C1) * (var_p + var_g + C2) + 1e-8) + + +def _object_score(x: np.ndarray) -> float: + if x.size == 0: + return 0.0 + mu, sigma = x.mean(), x.std() + return 2 * mu / (mu * mu + 1 + sigma + 1e-8) + + +def structure_measure(pred: np.ndarray, gt: np.ndarray, alpha: float = 0.5) -> float: + """pred in [0,1] float32, gt binary uint8 (0/1)""" + y = gt.mean() + if y == 0: # GT 全黑 + return 1.0 - pred.mean() + if y == 1: # GT 全白 + return pred.mean() + + # ─── object-aware term ───────────────────────────────────────────────── + S_fg = _object_score(pred[gt > 0.5]) + S_bg = _object_score(1 - pred[gt <= 0.5]) + s_object = y * S_fg + (1 - y) * S_bg + + # ─── region-aware term ──────────────────────────────────────────────── + h, w = gt.shape + rows, cols = np.where(gt > 0.5) + cx = int(np.round(cols.mean())) if cols.size else w // 2 + cy = int(np.round(rows.mean())) if rows.size else h // 2 + + def split(img): + return [img[:cy, :cx], img[:cy, cx:], img[cy:, :cx], img[cy:, cx:]] + + regions_p = split(pred) + regions_g = split(gt.astype(np.float32)) + + weights = [r.size / (h * w) for r in regions_g] + ssim_scores = [_ssim(p_r, g_r) for p_r, g_r in zip(regions_p, regions_g)] + s_region = sum(w * s for w, s in zip(weights, ssim_scores)) + + score = alpha * s_object + (1 - alpha) * s_region + return max(score, 0.0) + + +class VideoObjectDetectionTask(BaseTask): + def _parse_data(self, task_data): + return [Instance(**d) for d in task_data["data"]] + + def run_inference(self): + self.records: List[Tuple[List[str], List[str]]] = [] + for inst in tqdm(self.data, desc="collect-frames"): + pred_dir = inst.input["pred_dir"] + gt_dir = 
inst.input["gt_dir"] + + frame_names = sorted( + f for f in os.listdir(gt_dir) if f.lower().endswith(IMG_EXTS) + ) + preds, gts = [], [] + for fname in frame_names: + p_path = os.path.join(pred_dir, fname) + g_path = os.path.join(gt_dir, fname) + if not os.path.exists(p_path): + raise FileNotFoundError(f"Missing prediction frame: {p_path}") + preds.append(p_path) + gts.append(g_path) + self.records.append((preds, gts)) + + def evaluate(self): + total_sum, total_frames = 0.0, 0 + + for preds, gts in tqdm(self.records, desc="S-measure eval"): + for p, g in zip(preds, gts): + pred = np.array(Image.open(p).convert('L'), dtype=np.float32) + if pred.max() > 1.0: + pred /= 255.0 + gt = (np.array(Image.open(g).convert('L')) > 128).astype(np.uint8) + + if pred.shape != gt.shape: + raise ValueError(f"Shape mismatch: {p} vs {g}") + + total_sum += structure_measure(pred, gt) + total_frames += 1 + + return {"S-measure": total_sum / total_frames if total_frames else 0.0} + + +# ════════════════════════════════════════════════════════════════════════════ +# unified runner +# ═════════════════ +TASK_MAPPING = { + "VideoTranslation": VideoTranslationTask, + "VideoRestoration": VideoRestorationTask, + "VideoSuperResolution": VideoSuperResolutionTask, + "VideoObjectDetection": VideoObjectDetectionTask, +} + + +def main(): + if len(sys.argv) != 2: + print("Usage: python integrated_eval.py ") + sys.exit(1) + + task_json_path = sys.argv[1] + with open(task_json_path, 'r', encoding='utf-8') as f: + task_data = json.load(f) + + task_type = task_data.get("type") + TaskCls = TASK_MAPPING.get(task_type) + if TaskCls is None: + raise NotImplementedError(f"Unsupported task type: {task_type}") + + task = TaskCls(task_data) + task.run_inference() + metrics = task.evaluate() + print(f"[{task_type}] Evaluation Results → {metrics}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/processors/._audio_processor.py b/processors/._audio_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..84cbf2804976ecc5e2ae916bae13e0d5d4e5fe66 Binary files /dev/null and b/processors/._audio_processor.py differ diff --git a/processors/._image_processor.py b/processors/._image_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..25e2b121710b8b30a02e2f078c60a06352836871 Binary files /dev/null and b/processors/._image_processor.py differ diff --git a/processors/__init__.py b/processors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..96288a99be64dd9c5a20c55a055e72ac49d69270 --- /dev/null +++ b/processors/__init__.py @@ -0,0 +1 @@ +"""Processors package for different modalities.""" \ No newline at end of file diff --git a/processors/__pycache__/.___init__.cpython-38.pyc b/processors/__pycache__/.___init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84cbf2804976ecc5e2ae916bae13e0d5d4e5fe66 Binary files /dev/null and b/processors/__pycache__/.___init__.cpython-38.pyc differ diff --git a/processors/__pycache__/.___init__.cpython-39.pyc b/processors/__pycache__/.___init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84cbf2804976ecc5e2ae916bae13e0d5d4e5fe66 Binary files /dev/null and b/processors/__pycache__/.___init__.cpython-39.pyc differ diff --git a/processors/__pycache__/._video_processor.cpython-39.pyc b/processors/__pycache__/._video_processor.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..84cbf2804976ecc5e2ae916bae13e0d5d4e5fe66 Binary files /dev/null and b/processors/__pycache__/._video_processor.cpython-39.pyc differ diff --git a/processors/__pycache__/__init__.cpython-311.pyc b/processors/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b333e33aaa641960b49dc012017c622236232301 Binary files /dev/null and b/processors/__pycache__/__init__.cpython-311.pyc differ diff --git a/processors/__pycache__/__init__.cpython-312.pyc b/processors/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8803c15b8cdc511849df11138bbcc62e2f816750 Binary files /dev/null and b/processors/__pycache__/__init__.cpython-312.pyc differ diff --git a/processors/__pycache__/__init__.cpython-38.pyc b/processors/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7c021e9e03834483e7ea598a05dc2dee6dfe88d Binary files /dev/null and b/processors/__pycache__/__init__.cpython-38.pyc differ diff --git a/processors/__pycache__/__init__.cpython-39.pyc b/processors/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..890baa908db9501ccffb23cb9e72008a482a09f6 Binary files /dev/null and b/processors/__pycache__/__init__.cpython-39.pyc differ diff --git a/processors/__pycache__/audio_processor.cpython-311.pyc b/processors/__pycache__/audio_processor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09149988ac5eb1ac7808dfa21f15de91e7f8c815 Binary files /dev/null and b/processors/__pycache__/audio_processor.cpython-311.pyc differ diff --git a/processors/__pycache__/audio_processor.cpython-312.pyc b/processors/__pycache__/audio_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd942e21bbd0ab305709294d9532cb9fad5ae78a Binary files /dev/null and b/processors/__pycache__/audio_processor.cpython-312.pyc differ diff --git a/processors/__pycache__/audio_processor.cpython-38.pyc b/processors/__pycache__/audio_processor.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc58ff64e794ef1362b4d90b1066bf0907a8dc26 Binary files /dev/null and b/processors/__pycache__/audio_processor.cpython-38.pyc differ diff --git a/processors/__pycache__/audio_processor.cpython-39.pyc b/processors/__pycache__/audio_processor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f96f236bc8f897a5b72f9b6dae7b88e86fef097 Binary files /dev/null and b/processors/__pycache__/audio_processor.cpython-39.pyc differ diff --git a/processors/__pycache__/image_processor.cpython-311.pyc b/processors/__pycache__/image_processor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebd8ec13bf2c9d86ba531b9c6a2e247eb52f1211 Binary files /dev/null and b/processors/__pycache__/image_processor.cpython-311.pyc differ diff --git a/processors/__pycache__/image_processor.cpython-312.pyc b/processors/__pycache__/image_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce3fcf13616e8bf6c92c0284b0a2f9ae1c9dd815 Binary files /dev/null and b/processors/__pycache__/image_processor.cpython-312.pyc differ diff --git a/processors/__pycache__/image_processor.cpython-38.pyc b/processors/__pycache__/image_processor.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..71cf05417ffcf1e17b497f5cf962fa4b6fd0883c Binary files /dev/null and b/processors/__pycache__/image_processor.cpython-38.pyc differ diff --git a/processors/__pycache__/image_processor.cpython-39.pyc b/processors/__pycache__/image_processor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea8106a8e53d4a292c319d1a5401ef46d63560e8 Binary files /dev/null and b/processors/__pycache__/image_processor.cpython-39.pyc differ diff --git a/processors/__pycache__/nlp_processor.cpython-311.pyc b/processors/__pycache__/nlp_processor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f94ecaf8811dcb60cdaa6e688c83d19950ac48b2 Binary files /dev/null and b/processors/__pycache__/nlp_processor.cpython-311.pyc differ diff --git a/processors/__pycache__/nlp_processor.cpython-312.pyc b/processors/__pycache__/nlp_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d50717488dd2ea522c988f8aafda4d101ece16a6 Binary files /dev/null and b/processors/__pycache__/nlp_processor.cpython-312.pyc differ diff --git a/processors/__pycache__/nlp_processor.cpython-38.pyc b/processors/__pycache__/nlp_processor.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..924a8fd8e6de5eabb62f89b119fddffd8d3695c1 Binary files /dev/null and b/processors/__pycache__/nlp_processor.cpython-38.pyc differ diff --git a/processors/__pycache__/nlp_processor.cpython-39.pyc b/processors/__pycache__/nlp_processor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19009301a97a1faca65c9cf983fa8e54c88aaba2 Binary files /dev/null and b/processors/__pycache__/nlp_processor.cpython-39.pyc differ diff --git a/processors/__pycache__/pseudo_audio_processor.cpython-39.pyc b/processors/__pycache__/pseudo_audio_processor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e95f6f26a495268895de6929ba4b09de297b16e9 Binary files /dev/null and b/processors/__pycache__/pseudo_audio_processor.cpython-39.pyc differ diff --git a/processors/__pycache__/three_d_processor.cpython-311.pyc b/processors/__pycache__/three_d_processor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0467f58f43aee06ce5a086775d603e94f3fcd77 Binary files /dev/null and b/processors/__pycache__/three_d_processor.cpython-311.pyc differ diff --git a/processors/__pycache__/three_d_processor.cpython-312.pyc b/processors/__pycache__/three_d_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4aa0614ebb3337b258fbc6dae30ba5e3e1a9bcd Binary files /dev/null and b/processors/__pycache__/three_d_processor.cpython-312.pyc differ diff --git a/processors/__pycache__/three_d_processor.cpython-38.pyc b/processors/__pycache__/three_d_processor.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3bd4303169bcc139a2549d152f6ea1908b880b75 Binary files /dev/null and b/processors/__pycache__/three_d_processor.cpython-38.pyc differ diff --git a/processors/__pycache__/three_d_processor.cpython-39.pyc b/processors/__pycache__/three_d_processor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88afcf7062ee89beb39075f753668e6e295aae18 Binary files /dev/null and b/processors/__pycache__/three_d_processor.cpython-39.pyc differ diff --git a/processors/__pycache__/video_processor.cpython-311.pyc 
b/processors/__pycache__/video_processor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee984ba2be2898d01b5a1fff5ad97e2a208a4588 Binary files /dev/null and b/processors/__pycache__/video_processor.cpython-311.pyc differ diff --git a/processors/__pycache__/video_processor.cpython-312.pyc b/processors/__pycache__/video_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac8570776284733eae2ba7d4f2ac7479a85153ce Binary files /dev/null and b/processors/__pycache__/video_processor.cpython-312.pyc differ diff --git a/processors/__pycache__/video_processor.cpython-38.pyc b/processors/__pycache__/video_processor.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47845001e4e61863e76ee878e34b0808c8d2d1a3 Binary files /dev/null and b/processors/__pycache__/video_processor.cpython-38.pyc differ diff --git a/processors/__pycache__/video_processor.cpython-39.pyc b/processors/__pycache__/video_processor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82f080282b722d8c13e58bf43e250eddf2710306 Binary files /dev/null and b/processors/__pycache__/video_processor.cpython-39.pyc differ diff --git a/processors/audio_processor.py b/processors/audio_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..635679410a8ae5d2a31f17824ea8a7be14315336 --- /dev/null +++ b/processors/audio_processor.py @@ -0,0 +1,80 @@ +from typing import List +from utils.data_types import ModalityType, TaskType, TaskResult +from utils.base_processor import BaseModalityProcessor + +class AudioProcessor(BaseModalityProcessor): + """音频模态处理器""" + def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): + super().__init__(modality, dataset_dir, pred_json_file) + + def process_comprehension(self) -> List[TaskResult]: + """处理音频理解类任务 + + 需要返回一个TaskResult列表,每个TaskResult包含: + - task_name: 任务名称,例如 "speech_recognition", "audio_classification" 等 + - metric: 评估指标,例如 "WER", "accuracy" 等 + - score: 评估分数 + - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 + + 示例格式: + return [ + TaskResult( + task_name="speech_recognition", + metric="WER", + score=0.15 + ), + TaskResult( + task_name="audio_classification", + metric="accuracy", + score=0.92 + ) + ] + """ + return [] + + def process_generation(self) -> List[TaskResult]: + """处理音频生成类任务 + + 需要返回一个TaskResult列表,每个TaskResult包含: + - task_name: 任务名称,例如 "speech_synthesis", "audio_generation" 等 + - metric: 评估指标,例如 "MOS", "FAD" 等 + - score: 评估分数 + - task_type: 需要指定为 TaskType.GENERATION + + 示例格式: + return [ + TaskResult( + task_name="speech_synthesis", + metric="MOS", + score=4.2, + task_type=TaskType.GENERATION + ), + TaskResult( + task_name="audio_generation", + metric="FAD", + score=12.5, + task_type=TaskType.GENERATION + ) + ] + """ + return [] + +# 使用示例 +if __name__ == "__main__": + processor = AudioProcessor(ModalityType.AUDIO, "") + + # 测试理解任务 + print("\n理解类任务结果:") + for task in processor.process_comprehension(): + print(f"任务: {task.task_name}") + print(f"指标: {task.metric}") + print(f"分数: {task.score}") + print("-" * 20) + + # 测试生成任务 + print("\n生成类任务结果:") + for task in processor.process_generation(): + print(f"任务: {task.task_name}") + print(f"指标: {task.metric}") + print(f"分数: {task.score}") + print("-" * 20) \ No newline at end of file diff --git a/processors/image_processor.py b/processors/image_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..0e80964d861f51f074c8a1bd234c613db1f70086 
--- /dev/null +++ b/processors/image_processor.py @@ -0,0 +1,83 @@ +from typing import List +from utils.data_types import ModalityType, TaskType, TaskResult +from utils.base_processor import BaseModalityProcessor + +class ImageProcessor(BaseModalityProcessor): + """图像模态处理器""" + def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): + super().__init__(modality, dataset_dir, pred_json_file) + + def process_1(self): + return [] + + def process_comprehension(self) -> List[TaskResult]: + """处理图像理解类任务 + + 需要返回一个TaskResult列表,每个TaskResult包含: + - task_name: 任务名称,例如 "image_classification", "object_detection" 等 + - metric: 评估指标,例如 "accuracy", "mAP" 等 + - score: 评估分数 + - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 + + 示例格式: + return [ + TaskResult( + task_name="image_classification", + metric="accuracy", + score=0.95 + ), + TaskResult( + task_name="object_detection", + metric="mAP", + score=0.82 + ) + ] + """ + return [] + + def process_generation(self) -> List[TaskResult]: + """处理图像生成类任务 + + 需要返回一个TaskResult列表,每个TaskResult包含: + - task_name: 任务名称,例如 "image_generation", "image_editing" 等 + - metric: 评估指标,例如 "FID", "IS" 等 + - score: 评估分数 + - task_type: 需要指定为 TaskType.GENERATION + + 示例格式: + return [ + TaskResult( + task_name="image_generation", + metric="FID", + score=15.2, + task_type=TaskType.GENERATION + ), + TaskResult( + task_name="image_editing", + metric="PSNR", + score=28.5, + task_type=TaskType.GENERATION + ) + ] + """ + return [] + +# 使用示例 +if __name__ == "__main__": + processor = ImageProcessor(ModalityType.IMAGE, "") + + # 测试理解任务 + print("\n理解类任务结果:") + for task in processor.process_comprehension(): + print(f"任务: {task.task_name}") + print(f"指标: {task.metric}") + print(f"分数: {task.score}") + print("-" * 20) + + # 测试生成任务 + print("\n生成类任务结果:") + for task in processor.process_generation(): + print(f"任务: {task.task_name}") + print(f"指标: {task.metric}") + print(f"分数: {task.score}") + print("-" * 20) \ No newline at end of file diff --git a/processors/nlp_processor.py b/processors/nlp_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..bb1a8e0392a4c9fbc5b53ed140cc0ed5a11778b2 --- /dev/null +++ b/processors/nlp_processor.py @@ -0,0 +1,381 @@ +import json +import os +import re +import math +import numpy as np +import pandas as pd +from typing import List, Dict, Any, Optional +import nltk +from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction +from rouge_score import rouge_scorer +from codebleu import calc_codebleu +from utils.data_types import TaskResult, TaskType + + +class NLPProcessor: + def __init__(self, modality, dataset_dir: str, pred_json_file: str = "prediction.json"): + self.modality = modality + self.dataset_dir = dataset_dir + '/nlp' + self.pred_json_file = pred_json_file + + def process(self) -> List[TaskResult]: + results = [] + + task_dirs = [d for d in os.listdir(self.dataset_dir) if os.path.isdir(os.path.join(self.dataset_dir, d))] + total_tasks = len(task_dirs) + processed_tasks = 0 + + for task_folder in task_dirs: + folder_path = os.path.join(self.dataset_dir, task_folder) + annotation_path = os.path.join(folder_path, "annotation.json") + prediction_path = os.path.join(folder_path, self.pred_json_file) + + if not os.path.exists(annotation_path): + print(f"Skip {task_folder}: annotation.json no exists") + continue + + if not os.path.exists(prediction_path): + print(f"Skip {task_folder}: {self.pred_json_file} no exists.") + continue + + try: + with open(annotation_path, "r", encoding="utf-8") as f: + 
task_data = json.load(f) + + with open(prediction_path, "r", encoding="utf-8") as f: + predictions_data = json.load(f) + + task_result = self._evaluate_task(task_data, predictions_data) + if task_result: + results.append(task_result) + processed_tasks += 1 + print(f"Task: {task_folder} (Socre: {task_result.score:.4f})") + else: + print(f"Skip {task_folder}.") + + except Exception as e: + print(f"Skip {task_folder}: Error - {e}") + continue + + return results + + def _evaluate_task(self, task_data: Dict[str, Any], predictions_data: List[Dict]) -> Optional[TaskResult]: + task_type = task_data.get("type", "") + task_name = task_data.get("task", "") + + pred_map = {pred["id"]: pred for pred in predictions_data} + + predictions = [] + references = [] + + for data_item in task_data["data"]: + item_id = data_item["id"] + if item_id not in pred_map: + continue + + pred_item = pred_map[item_id] + + if "prediction" in pred_item: + pred = pred_item["prediction"] + elif "prediction_final" in pred_item: + pred = pred_item["prediction_final"] + else: + continue + + ref = self._extract_reference(data_item, task_type) + if ref is None: + continue + + predictions.append(pred) + references.append(ref) + + if not predictions: + return None + + score, metric = self._calculate_metrics(predictions, references, task_type) + metric = self._convert_metric(metric) + + return TaskResult( + task_name=task_name, + metric=metric, + score=score, + task_type=TaskType.COMPREHENSION + ) + + def _extract_reference(self, data_item: Dict[str, Any], task_type: str) -> Any: + output = data_item.get("output", {}) + + if task_type == "MultipleChoiceQA": + return output.get("answer") + elif task_type == "OpenQA": + return output.get("answer") + elif task_type == "Summarization": + return output.get("summary") or output.get("highlights") + elif task_type == "Translation": + if isinstance(output, str): + return output + else: + return output.get("translation") + elif task_type == "Story Generation": + return output.get("story") + elif task_type == "Dialogue": + return output.get("reference") + elif task_type == "Code Generation": + return output.get("response", {}).get("content") + elif task_type == "Code Repair": + return output.get("repairCode") + elif task_type == "Code Defect Detection": + return str(output.get("target")) + elif task_type == "Text to SQL": + return output.get("sql") + elif task_type == "Code Explanation": + return output.get("nl") + elif task_type == "Proof": + proof_data = output.get("proof", {}) + steps = proof_data.get("steps", []) + conclusion = proof_data.get("conclusion", "") + return "\n".join(steps) + f"\nConclusion: {conclusion}" + elif task_type == "Mathematical Word Problem Solving": + return output.get("solution", {}).get("final_answer") + elif task_type == "Paraphrase Generation": + return output.get("paraphraseSentence") + elif task_type == "Grammar Correction": + return output.get("Standard English") + elif task_type == "Text Style Transfer": + return output.get("answer") + elif task_type == "Table-to-Text Generation": + return output.get("response", {}).get("text") + elif task_type == "Time Series": + return output.get("target") + elif task_type in ["classification", "multiple choice"]: + return list(output.values())[0].lower() if output else "" + elif task_type in ["multi label classification", "ner", "extraction", "relation extraction", "event detection", "parsing"]: + value = list(output.values())[0] if output else "" + return '

'.join(value.lower().split(', ')) if isinstance(value, str) else "" + else: + # 默认取第一个值 + return list(output.values())[0] if output else "" + + def _calculate_metrics(self, predictions: List, references: List, task_type: str) -> tuple: + if task_type == "MultipleChoiceQA": + score = self._exact_match_accuracy(predictions, references) + return score, "accuracy" + + elif task_type == "OpenQA": + f1_score = self._calculate_f1(predictions, references) + return f1_score, "f1" + + elif task_type == "Summarization": + rouge_scores = self._rouge_evaluation(predictions, references) + return rouge_scores["rouge1"], "rouge1" + + elif task_type == "Translation": + rouge_scores = self._rouge_evaluation(predictions, references) + return rouge_scores["rouge1"], "rouge1" + + elif task_type in ["Story Generation", "Dialogue", "Paraphrase Generation", "Grammar Correction", "Text Style Transfer", "Table-to-Text Generation"]: + bleu_scores = self._bleu_evaluation(predictions, references) + return bleu_scores["bleu1"], "bleu1" + + elif task_type in ["Code Generation", "Code Repair"]: + try: + result = calc_codebleu(references, predictions, lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None) + return result["codebleu"], "code_bleu" + except: + return 0.0, "code_bleu" + + elif task_type == "Code Defect Detection": + score = self._exact_match_accuracy(predictions, references) + return score, "accuracy" + + elif task_type == "Text to SQL": + score = self._exact_match_accuracy(predictions, references) + return score, "accuracy" + + elif task_type in ["Code Explanation", "Proof"]: + bleu_scores = self._bleu_evaluation(predictions, references) + return bleu_scores["bleu1"], "bleu1" + + elif task_type == "Mathematical Word Problem Solving": + score = self._exact_match_accuracy(predictions, references) + return score, "accuracy" + + elif task_type == "Time Series": + mae = self._mean_absolute_error(predictions, references) + return mae, "MAE" + + elif task_type in ["classification", "multiple choice"]: + f1_score = self._calculate_micro_f1(predictions, references) + return f1_score, "micro_f1" + + elif task_type in ["multi label classification", "ner", "extraction", "relation extraction", "event detection", "parsing"]: + f1_score = self._calculate_micro_f1(predictions, references) + return f1_score, "micro_f1" + + else: + f1_score = self._calculate_f1(predictions, references) + return f1_score, "f1" + + def _exact_match_accuracy(self, predictions: List[str], references: List[str]) -> float: + correct = 0 + for pred, ref in zip(predictions, references): + if isinstance(ref, str): + ref = [ref] + is_match = False + for r in ref: + if str(pred).strip() == str(r).strip(): + is_match = True + break + if is_match: + correct += 1 + return correct / len(predictions) if predictions else 0.0 + + def _calculate_f1(self, predictions: List[str], references: List[str]) -> float: + def compute_f1(pred: str, ref: str) -> float: + pred_tokens = str(pred).strip().split() + ref_tokens = str(ref).strip().split() + + common_tokens = set(pred_tokens) & set(ref_tokens) + num_common = len(common_tokens) + + if num_common == 0: + return 0.0 + + precision = num_common / len(pred_tokens) if pred_tokens else 0.0 + recall = num_common / len(ref_tokens) if ref_tokens else 0.0 + + return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 + + total_f1 = 0.0 + for pred, ref in zip(predictions, references): + if isinstance(ref, str): + ref = [ref] + max_f1 = 0.0 + for r in ref: + max_f1 = 
max(compute_f1(pred, r), max_f1) + total_f1 += max_f1 + + return total_f1 / len(predictions) if predictions else 0.0 + + def _calculate_micro_f1(self, predictions: List[str], references: List[str]) -> float: + total_tp = 0 + total_fp = 0 + total_fn = 0 + + for pred, ref in zip(predictions, references): + pred_tokens = set(str(pred).strip().split('

')) + ref_tokens = set(str(ref).strip().split("

")) + + tp = len(pred_tokens & ref_tokens) + fp = len(pred_tokens - ref_tokens) + fn = len(ref_tokens - pred_tokens) + + total_tp += tp + total_fp += fp + total_fn += fn + + if total_tp == 0: + return 0.0 + + precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0 + recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0 + return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 + + def _rouge_evaluation(self, predictions: List[str], references: List[str]) -> Dict[str, float]: + scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True) + rouge1_scores, rouge2_scores, rougel_scores = [], [], [] + + for pred, ref in zip(predictions, references): + if isinstance(ref, str): + ref = [ref] + rouge1, rouge2, rougeL = 0, 0, 0 + for r in ref: + scores = scorer.score(str(r), str(pred)) + rouge1 = max(scores['rouge1'].fmeasure, rouge1) + rouge2 = max(scores['rouge2'].fmeasure, rouge2) + rougeL = max(scores['rougeL'].fmeasure, rougeL) + rouge1_scores.append(rouge1) + rouge2_scores.append(rouge2) + rougel_scores.append(rougeL) + + return { + 'rouge1': sum(rouge1_scores) / len(rouge1_scores) if rouge1_scores else 0.0, + 'rouge2': sum(rouge2_scores) / len(rouge2_scores) if rouge2_scores else 0.0, + 'rougeL': sum(rougel_scores) / len(rougel_scores) if rougel_scores else 0.0, + } + + def _bleu_evaluation(self, predictions: List[str], references: List[str]) -> Dict[str, float]: + smoothie = SmoothingFunction().method4 + bleu1_scores, bleu2_scores, bleu3_scores, bleu4_scores = [], [], [], [] + + for pred, ref in zip(predictions, references): + try: + hypothesis = nltk.word_tokenize(str(pred)) + except: + hypothesis = str(pred).split() + + if isinstance(ref, str): + ref = [ref] + + bleu1, bleu2, bleu3, bleu4 = 0, 0, 0, 0 + for r in ref: + try: + reference = [nltk.word_tokenize(str(r))] + except: + reference = [str(r).split()] + + try: + bleu1 = max(sentence_bleu(reference, hypothesis, weights=(1, 0, 0, 0), smoothing_function=smoothie), bleu1) + bleu2 = max(sentence_bleu(reference, hypothesis, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie), bleu2) + bleu3 = max(sentence_bleu(reference, hypothesis, weights=(1/3, 1/3, 1/3, 0), smoothing_function=smoothie), bleu3) + bleu4 = max(sentence_bleu(reference, hypothesis, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie), bleu4) + except: + continue + + bleu1_scores.append(bleu1) + bleu2_scores.append(bleu2) + bleu3_scores.append(bleu3) + bleu4_scores.append(bleu4) + + return { + 'bleu1': sum(bleu1_scores) / len(bleu1_scores) if bleu1_scores else 0.0, + 'bleu2': sum(bleu2_scores) / len(bleu2_scores) if bleu2_scores else 0.0, + 'bleu3': sum(bleu3_scores) / len(bleu3_scores) if bleu3_scores else 0.0, + 'bleu4': sum(bleu4_scores) / len(bleu4_scores) if bleu4_scores else 0.0, + } + + def _mean_absolute_error(self, predictions: List[float], references: List[float]) -> float: + if not predictions: + return 0.0 + + error_sum = 0.0 + valid_count = 0 + + for p, r in zip(predictions, references): + try: + error_sum += abs(float(p) - float(r)) + valid_count += 1 + except: + continue + + return error_sum / valid_count if valid_count > 0 else 0.0 + + def _convert_metric(self, metric: str) -> str: + m = metric.lower() + if m == "accuracy": + return "ACC" + if m == "f1": + return "F1" + if m == "micro_f1": + return "Micro-F1" + if m.startswith("rouge"): + if "l" in m: + return "ROUGE-L" + else: + return "ROUGE-1" + if m.startswith("bleu"): + return "BLEU-1" 
+ if m == "code_bleu": + return "CodeBLEU" + return metric.upper() + diff --git a/processors/three_d_processor.py b/processors/three_d_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..91d9afe931064acd85d918e1dd802f936ef72f06 --- /dev/null +++ b/processors/three_d_processor.py @@ -0,0 +1,79 @@ +from typing import List +from utils.data_types import ModalityType, TaskType, TaskResult +from utils.base_processor import BaseModalityProcessor + +class ThreeDProcessor(BaseModalityProcessor): + """3D模态处理器""" + def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): + super().__init__(modality, dataset_dir, pred_json_file) + + def process_comprehension(self) -> List[TaskResult]: + """处理3D理解类任务 + + 需要返回一个TaskResult列表,每个TaskResult包含: + - task_name: 任务名称,例如 "3d_object_detection", "point_cloud_segmentation" 等 + - metric: 评估指标,例如 "mAP", "IoU" 等 + - score: 评估分数 + - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 + 示例格式: + return [ + TaskResult( + task_name="3d_object_detection", + metric="mAP", + score=0.76 + ), + TaskResult( + task_name="point_cloud_segmentation", + metric="IoU", + score=0.82 + ) + ] + """ + return [] + + def process_generation(self) -> List[TaskResult]: + """处理3D生成类任务 + + 需要返回一个TaskResult列表,每个TaskResult包含: + - task_name: 任务名称,例如 "3d_reconstruction", "mesh_generation" 等 + - metric: 评估指标,例如 "CD", "F1" 等 + - score: 评估分数 + - task_type: 这里需要指定为 TaskType.GENERATION + + 示例格式: + return [ + TaskResult( + task_name="3d_reconstruction", + metric="CD", + score=0.15, + task_type=TaskType.GENERATION + ), + TaskResult( + task_name="mesh_generation", + metric="F1", + score=0.88, + task_type=TaskType.GENERATION + ) + ] + """ + return [] + +# 使用示例 +if __name__ == "__main__": + processor = ThreeDProcessor(ModalityType.THREE_D, "") + + # 测试理解任务 + print("\n理解类任务结果:") + for task in processor.process_comprehension(): + print(f"任务: {task.task_name}") + print(f"指标: {task.metric}") + print(f"分数: {task.score}") + print("-" * 20) + + # 测试生成任务 + print("\n生成类任务结果:") + for task in processor.process_generation(): + print(f"任务: {task.task_name}") + print(f"指标: {task.metric}") + print(f"分数: {task.score}") + print("-" * 20) \ No newline at end of file diff --git a/processors/video_processor.py b/processors/video_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..c27a47376d0a5ab0b3869b1efd1ee7b3efbe829d --- /dev/null +++ b/processors/video_processor.py @@ -0,0 +1,80 @@ +from typing import List +from utils.data_types import ModalityType, TaskType, TaskResult +from utils.base_processor import BaseModalityProcessor + +class VideoProcessor(BaseModalityProcessor): + """视频模态处理器""" + def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): + super().__init__(modality, dataset_dir, pred_json_file) + + def process_comprehension(self) -> List[TaskResult]: + """处理视频理解类任务 + + 需要返回一个TaskResult列表,每个TaskResult包含: + - task_name: 任务名称,例如 "action_recognition", "video_classification" 等 + - metric: 评估指标,例如 "accuracy", "mAP" 等 + - score: 评估分数 + - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 + + 示例格式: + return [ + TaskResult( + task_name="action_recognition", + metric="accuracy", + score=0.88 + ), + TaskResult( + task_name="video_classification", + metric="accuracy", + score=0.92 + ) + ] + """ + return [] + + def process_generation(self) -> List[TaskResult]: + """处理视频生成类任务 + + 需要返回一个TaskResult列表,每个TaskResult包含: + - task_name: 任务名称,例如 "video_generation", "video_prediction" 等 + - metric: 评估指标,例如 "FVD", "PSNR" 等 + - score: 评估分数 + - 
task_type: 需要指定为 TaskType.GENERATION + + 示例格式: + return [ + TaskResult( + task_name="video_generation", + metric="FVD", + score=45.2, + task_type=TaskType.GENERATION + ), + TaskResult( + task_name="video_prediction", + metric="PSNR", + score=25.8, + task_type=TaskType.GENERATION + ) + ] + """ + return [] + +# 使用示例 +if __name__ == "__main__": + processor = VideoProcessor(ModalityType.VIDEO, "") + + # 测试理解任务 + print("\n理解类任务结果:") + for task in processor.process_comprehension(): + print(f"任务: {task.task_name}") + print(f"指标: {task.metric}") + print(f"分数: {task.score}") + print("-" * 20) + + # 测试生成任务 + print("\n生成类任务结果:") + for task in processor.process_generation(): + print(f"任务: {task.task_name}") + print(f"指标: {task.metric}") + print(f"分数: {task.score}") + print("-" * 20) \ No newline at end of file diff --git a/ranker.py b/ranker.py new file mode 100644 index 0000000000000000000000000000000000000000..8735f1fc14721a2a06b78a9e35cffe2967ae38e9 --- /dev/null +++ b/ranker.py @@ -0,0 +1,420 @@ +""" +Calculate level scores based on Excel files. +""" + +import pandas as pd +import numpy as np +from utils import special_metrix +import logging +import sys +import argparse +import math + +def setup_logging(model_name): + """Configure logging with model name in filename""" + log_filename = f'outcome/score_calculation_{model_name.lower()}.log' + + # 创建一个handler,用UTF-8编码写入文件 + handler = logging.FileHandler(log_filename, encoding='utf-8') + handler.setFormatter(logging.Formatter( + fmt='%(asctime)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + )) + + # 配置根logger + root_logger = logging.getLogger() + root_logger.setLevel(logging.INFO) + + # 移除所有已存在的handler + for hdlr in root_logger.handlers[:]: + root_logger.removeHandler(hdlr) + + # 添加新的handler + root_logger.addHandler(handler) + + return log_filename + +def normalize_special_metrics(metrics_list, scores_list): + """Normalize special metrics""" + special_metrics = set([k.upper() for k in special_metrix.special_metric_dict.keys()]) + logging.info(f'Special metrics: {special_metrics}') + + normalized_scores = [] + for metric, score in zip(metrics_list, scores_list): + metric_upper = metric.upper() if isinstance(metric, str) else metric + if metric_upper in special_metrics: + logging.info('-'*25) + logging.info(f'>>> Metric: {metric} | Original: {score}') + if pd.isna(score) or score == float('inf') or score == 0.0: + normalized_score = 0.0 + else: + normalized_score = special_metrix.map_function_for_special(metric_upper, score) + logging.info(f'>>> Metric: {metric} | Normalized: {normalized_score}') + normalized_scores.append(normalized_score) + else: + normalized_scores.append(score) + + return normalized_scores + +def get_level_2_mono(scores): + """Calculate level-2 score for a single modality""" + valid_scores = [s for s in scores if not pd.isna(s) and s != float('inf')] + if not valid_scores: + return 0.0 + avg = sum(valid_scores) / len(scores) + logging.info(f"Valid scores: {valid_scores}") + logging.info(f"Average: {avg}") + logging.info(f"Total scores: {len(scores)}") + logging.info(f"Valid scores count: {len(valid_scores)}") + logging.info(f"Invalid scores count: {len(scores) - len(valid_scores)}") + return avg + +def get_level_2(comprehension_scores, generation_scores): + """Calculate level-2 score for a single modality""" + score_c = get_level_2_mono(comprehension_scores) + score_g = get_level_2_mono(generation_scores) + return (score_c + score_g) / 2 + +def get_level_mono(sota_scores, model_scores, level, task_type="Comprehension"): + 
"""Calculate level score for a single modality (Level-3 and Level-4 use the same logic)""" + valid_pairs = [(s, m) for s, m in zip(sota_scores, model_scores) + if not pd.isna(s) and not pd.isna(m) and s != float('inf') and m != float('inf')] + if not valid_pairs: + return 0.0 + + logging.info(f"\nLevel-{level} scoring details ({task_type}):") + logging.info(f"Valid score pairs: {len(valid_pairs)}") + + scores = [m if m >= s else 0.0 for s, m in valid_pairs] + avg_score = sum(scores) / len(sota_scores) + logging.info(f"Final Level-{level} score: {avg_score:.2f}") + return avg_score + +def get_level_3(sota_c, score_c, sota_g, score_g): + """ + 计算单个模态的level-3分数 + """ + score_c = get_level_mono(sota_c, score_c, 3, "Comprehension") + score_g = get_level_mono(sota_g, score_g, 3, "Generation") + return (score_c + score_g) / 2 + +def get_level_4(sota_c, score_c, sota_g, score_g, epsilon=1e-6): + """ + 计算单个模态的level-4分数 + """ + score_c = get_level_mono(sota_c, score_c, 4, "Comprehension") + score_g = get_level_mono(sota_g, score_g, 4, "Generation") + + if score_c == 0 or score_g == 0: + return 0.0 + + return 2 * (score_c * score_g) / (score_c + score_g + epsilon) + +def process_sheet(sota_df, model_df, model_name): + """ + 处理单个sheet的数据 + """ + # 提取需要的列 + metrics = sota_df['Metrics'].tolist() + sota = sota_df['SoTA Performance'].tolist() + + # 查找模型名称(大小写不敏感) + model_columns = model_df.columns + model_col = next((col for col in model_columns if col.lower() == model_name.lower()), None) + if model_col is None: + raise ValueError(f"在Excel文件中找不到模型列: {model_name}") + + model_scores = model_df[model_col].tolist() + + def to_float_inf(x): + if pd.isna(x): + return float('inf') + if isinstance(x, str) and (x.strip() == '∞' or x.strip().lower() == 'inf'): + return float('inf') + try: + return float(x) + except Exception: + return float('inf') + + # 转换为float类型 + sota = [to_float_inf(x) for x in sota] + model_scores = [to_float_inf(x) for x in model_scores] + + # 归一化特殊指标 + sota = normalize_special_metrics(metrics, sota) + model_scores = normalize_special_metrics(metrics, model_scores) + + return metrics, sota, model_scores + +def get_modality_scores(comprehension_metrics, comprehension_sota, comprehension_scores, + generation_metrics, generation_sota, generation_scores): + """ + 计算单个模态的各个level分数 + """ + # Level-2: 理解和生成的平均分 + score_level_2 = get_level_2(comprehension_scores, generation_scores) + + # Level-3: 相对于SoTA的表现 + score_level_3 = get_level_3(comprehension_sota, comprehension_scores, + generation_sota, generation_scores) + + # Level-4: 理解和生成的综合表现 + score_level_4 = get_level_4(comprehension_sota, comprehension_scores, + generation_sota, generation_scores) + + return score_level_2, score_level_3, score_level_4 + +def sigmoid_adjust(x): + """ + 对RMSE指标进行sigmoid调整 + """ + T = 5 + return 2 / (1 + math.exp(-T / x)) - 1 + +def get_level_5(l4_score, sota_df, model_df, model_name): + """ + 计算Level-5分数 + """ + # 从Excel中读取NLP分数 + metrics = sota_df['Metrics'].tolist() + sota_scores = sota_df['SoTA Performance'].tolist() + + # 查找模型名称(大小写不敏感) + model_columns = model_df.columns + model_col = next((col for col in model_columns if col.lower() == model_name.lower()), None) + if model_col is None: + raise ValueError(f"在Excel文件中找不到模型列: {model_name}") + + model_scores = model_df[model_col].tolist() + + def to_float_inf(x): + if pd.isna(x): + return float('inf') + if isinstance(x, str) and (x.strip() == '∞' or x.strip().lower() == 'inf'): + return float('inf') + try: + return float(x) + except Exception: + return float('inf') 
+
+    # Convert to float
+    sota_scores = [to_float_inf(x) for x in sota_scores]
+    model_scores = [to_float_inf(x) for x in model_scores]
+
+    # Special handling for the RMSE metric
+    rmse_index = next((i for i, m in enumerate(metrics) if m.upper() == 'RMSE'), None)
+    if rmse_index is not None:
+        model_scores[rmse_index] = sigmoid_adjust(model_scores[rmse_index]) * 100
+
+    # Compute the average score over winning tasks
+    valid_pairs = [(s, m) for s, m in zip(sota_scores, model_scores)
+                   if not pd.isna(s) and not pd.isna(m) and s != float('inf') and m != float('inf')]
+    if not valid_pairs:
+        return 0.0
+
+    T = len(valid_pairs)
+    # Count the number of winning tasks
+    wins = sum(1 for s, m in valid_pairs if m >= s)
+
+    s_l = [m if m >= s else 0 for s, m in valid_pairs]
+    s_l = sum(s_l) / len(sota_scores)
+
+    # Compute the weight
+    w_l = s_l / 100
+    # Compute the Level-5 score
+    l5_score = l4_score * w_l
+
+    # Log detailed information
+    logging.info(f"\nLevel-5 scoring details:")
+    logging.info(f"NLP task statistics: Supporting {T}/{len(metrics)} tasks, Wins {wins}")
+    logging.info(f"Task comparison:")
+    for i, (metric, sota, model) in enumerate(zip(metrics, sota_scores, model_scores)):
+        if not pd.isna(sota) and not pd.isna(model) and sota != float('inf') and model != float('inf'):
+            status = "✓" if model >= sota else "✗"
+            logging.info(f"Task {i+1:2d}: {metric:10s} | SoTA: {sota:6.2f} | Model: {model:6.2f} | {status}")
+    logging.info(f"\nWinning task average score: {s_l:.4f}")
+    logging.info(f"Weight (w_l): {w_l:.4f}")
+    logging.info(f"Level-4 score: {l4_score:.4f}")
+    logging.info(f"Final Level-5 score: {l5_score:.4f}")
+
+    return l5_score
+
+def main(model_name, sota_file, pred_result_file):
+    # Set up logging
+    log_filename = setup_logging(model_name)
+    print(f"Results will be saved to log file: {log_filename}")
+
+    logging.info(f'Reading files: {sota_file} and {pred_result_file}')
+
+    # Get all sheet names
+    sota_sheets = pd.ExcelFile(sota_file).sheet_names
+    model_sheets = pd.ExcelFile(pred_result_file).sheet_names
+
+    logging.info(f'SoTA file sheets: {sota_sheets}')
+    logging.info(f'Model file sheets: {model_sheets}')
+
+    # Skip level-scores sheet
+    sota_sheets = [s for s in sota_sheets if s.lower() != 'level-scores']
+    model_sheets = [s for s in model_sheets if s.lower() != 'level-scores']
+
+    # Ensure both files have matching sheets
+    assert set(sota_sheets) == set(model_sheets), "Sheets in both Excel files must match"
+
+    # Organize data by modality
+    modality_data = {}
+
+    # Save NLP data for Level-5 calculation
+    nlp_sota_df = None
+    nlp_model_df = None
+
+    for sheet in sota_sheets:
+        # Save NLP data
+        if sheet == 'NLP':
+            nlp_sota_df = pd.read_excel(sota_file, sheet_name=sheet)
+            nlp_model_df = pd.read_excel(pred_result_file, sheet_name=sheet)
+            logging.info(f'NLP data loaded for Level-5 calculation')
+            continue
+
+        # Parse sheet name
+        try:
+            modality, task = sheet.split('-')
+        except ValueError:
+            raise ValueError(f'Invalid sheet name format: {sheet}')
+
+        # Verify modality
+        if modality not in ['Image', 'Audio', 'Video', '3D']:
+            logging.info(f'Unknown modality: {modality}')
+            continue
+
+        logging.info(f'Processing {modality} modality {task} task: {sheet}')
+
+        # Initialize modality data
+        if modality not in modality_data:
+            modality_data[modality] = {
+                'comprehension': {'metrics': [], 'sota': [], 'scores': []},
+                'generation': {'metrics': [], 'sota': [], 'scores': []}
+            }
+
+        # Read data
+        sota_df = pd.read_excel(sota_file, sheet_name=sheet)
+        model_df = pd.read_excel(pred_result_file, sheet_name=sheet)
+
+        # Process data
+        metrics, sota, scores = process_sheet(sota_df, model_df, model_name)
+
+        # Categorize by
task type + if task == 'Comprehension': + modality_data[modality]['comprehension']['metrics'].extend(metrics) + modality_data[modality]['comprehension']['sota'].extend(sota) + modality_data[modality]['comprehension']['scores'].extend(scores) + elif task == 'Generation': + modality_data[modality]['generation']['metrics'].extend(metrics) + modality_data[modality]['generation']['sota'].extend(sota) + modality_data[modality]['generation']['scores'].extend(scores) + + if not modality_data: + raise ValueError("No valid modality data found") + + # Calculate scores for each modality + modality_scores = {} + for modality, data in modality_data.items(): + logging.info(f'\nCalculating scores for {modality} modality...') + scores = get_modality_scores( + data['comprehension']['metrics'], + data['comprehension']['sota'], + data['comprehension']['scores'], + data['generation']['metrics'], + data['generation']['sota'], + data['generation']['scores'] + ) + modality_scores[modality] = scores + + # Calculate final scores (average across modalities) + final_scores = { + 'Level-2': sum(s[0] for s in modality_scores.values()) / len(modality_scores), + 'Level-3': sum(s[1] for s in modality_scores.values()) / len(modality_scores), + 'Level-4': sum(s[2] for s in modality_scores.values()) / len(modality_scores) + } + + # Calculate Level-5 score + if nlp_sota_df is not None and nlp_model_df is not None: + final_scores['Level-5'] = get_level_5(final_scores['Level-4'], nlp_sota_df, nlp_model_df, model_name) + else: + raise ValueError("NLP data not found, cannot calculate Level-5 score") + + # Prepare result string + result_str = '\n' + '='*50 + '\n' + result_str += f'Evaluation Results for Model {model_name}:\n\n' + result_str += 'Results by Modality:\n' + for modality, data in modality_data.items(): + # Calculate total and valid tasks + comp_tasks = len(data['comprehension']['metrics']) + gen_tasks = len(data['generation']['metrics']) + total_tasks = comp_tasks + gen_tasks + + def count_valid_wins(sota_list, score_list): + valid_count = sum(1 for s, m in zip(sota_list, score_list) + if not pd.isna(s) and not pd.isna(m) and + s != float('inf') and m != float('inf') and + s != 0.0 and m != 0.0) + wins = sum(1 for s, m in zip(sota_list, score_list) + if not pd.isna(s) and not pd.isna(m) and + s != float('inf') and m != float('inf') and + m >= s) + return valid_count, wins + + comp_valid, comp_wins = count_valid_wins(data['comprehension']['sota'], + data['comprehension']['scores']) + gen_valid, gen_wins = count_valid_wins(data['generation']['sota'], + data['generation']['scores']) + total_valid = comp_valid + gen_valid + total_wins = comp_wins + gen_wins + + result_str += f'\n{modality} Modality (Supporting {total_valid}/{total_tasks} tasks, Wins: {total_wins}):\n' + scores = modality_scores[modality] + result_str += f'>>> Level-2 Score: {scores[0]:.2f}\n' + result_str += f'>>> Level-3 Score: {scores[1]:.2f}\n' + result_str += f'>>> Level-4 Score: {scores[2]:.2f}\n' + + # Add NLP results if available + if nlp_sota_df is not None and nlp_model_df is not None: + metrics = nlp_sota_df['Metrics'].tolist() + sota_scores = nlp_sota_df['SoTA Performance'].tolist() + model_col = next((col for col in nlp_model_df.columns if col.lower() == model_name.lower()), None) + if model_col: + model_scores = nlp_model_df[model_col].tolist() + valid_pairs = [(s, m) for s, m in zip(sota_scores, model_scores) + if not pd.isna(s) and not pd.isna(m) and + s != float('inf') and m != float('inf')] + wins = sum(1 for s, m in valid_pairs if m >= s) 
+ result_str += f'\nNLP Modality (Supporting {len(valid_pairs)}/{len(metrics)} tasks, Wins: {wins})\n' + + result_str += '\n' + '='*50 + '\n' + result_str += 'Final Scores:\n' + result_str += f'>>> Level-2 Score: {final_scores["Level-2"]:.2f}\n' + result_str += f'>>> Level-3 Score: {final_scores["Level-3"]:.2f}\n' + result_str += f'>>> Level-4 Score: {final_scores["Level-4"]:.2f}\n' + result_str += f'>>> Level-5 Score: {final_scores["Level-5"]:.2f}\n' + result_str += '='*50 + '\n' + result_str += 'Notes:\n' + result_str += '1. NLP modality is not included in Level-2 to Level-4 scoring\n' + result_str += '2. Each modality calculates both comprehension and generation scores\n' + result_str += '3. Final scores are averaged across all participating modalities\n' + result_str += '4. All scores are converted to percentages\n' + result_str += '5. Level-5 score is based on Level-4 score and NLP task weights\n' + + # Write to log + logging.info("\nFinal Evaluation Results:") + logging.info(result_str) + + # Print to console + print(result_str) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Calculate model scores') + parser.add_argument('-s', '--sota_file', type=str, default='references/sota_result.xlsx', help='SoTA score file') + parser.add_argument('-p', '--pred_result_file', type=str, default='outcome/emu2_result.xlsx', help='Model prediction Excel file') + parser.add_argument('-m', '--model_name', type=str, default='Emu2-32B', help='Model name (matching Excel column name)') + args = parser.parse_args() + + main(args.model_name, args.sota_file, args.pred_result_file) \ No newline at end of file diff --git a/references/sota_result.xlsx b/references/sota_result.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..82f60e57c3acceec0c490a075d5a3e7d91792409 Binary files /dev/null and b/references/sota_result.xlsx differ diff --git a/references/template_result.xlsx b/references/template_result.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..2ee4cfeaf2d756ca4434c16123c9dc8d90df272d Binary files /dev/null and b/references/template_result.xlsx differ diff --git a/register.py b/register.py new file mode 100644 index 0000000000000000000000000000000000000000..955defcf5f28c4e1cb4981bdd79eb25f220498c7 --- /dev/null +++ b/register.py @@ -0,0 +1,119 @@ +import os +from typing import List, Dict, Optional +import pandas as pd +from utils.data_types import ModalityType, TaskType, TaskResult, ModalityResults + +# Import modality processors +from processors.image_processor import ImageProcessor +from processors.video_processor import VideoProcessor +from processors.audio_processor import AudioProcessor +from processors.nlp_processor import NLPProcessor +from processors.three_d_processor import ThreeDProcessor +# from processors.pseudo_audio_processor import PseudoAudioProcessor + +def process_all_modalities(dataset_dir: str, pred_json_file: str) -> ModalityResults: + """Process data for all modalities + + Args: + dataset_dir: Dataset directory path + pred_json_file: Prediction JSON filename + """ + if not os.path.exists(dataset_dir): + raise ValueError(f"Dataset directory not found: {dataset_dir}") + print(f"Using dataset directory: {dataset_dir}") + + # Available processors + processors = [ + ImageProcessor(ModalityType.IMAGE, dataset_dir, pred_json_file), + VideoProcessor(ModalityType.VIDEO, dataset_dir, pred_json_file), + AudioProcessor(ModalityType.AUDIO, dataset_dir, pred_json_file), + NLPProcessor(ModalityType.NLP, dataset_dir, 
pred_json_file), + ThreeDProcessor(ModalityType.THREE_D, dataset_dir, pred_json_file) + ] + + # Collect results + results: ModalityResults = {} + for processor in processors: + if processor.modality == ModalityType.NLP: + # NLP doesn't distinguish between comprehension and generation + nlp_results = processor.process() + if nlp_results: + results[processor.modality] = { + TaskType.COMPREHENSION: nlp_results + } + else: + # Other modalities have both comprehension and generation + comp_results = processor.process_comprehension() + gen_results = processor.process_generation() + + if comp_results or gen_results: + results[processor.modality] = {} + if comp_results: + results[processor.modality][TaskType.COMPREHENSION] = comp_results + if gen_results: + results[processor.modality][TaskType.GENERATION] = gen_results + + print(f"Implemented modalities: {[m.value for m in results.keys()]}") + + return results + +def save_to_excel(results: ModalityResults, template_excel: str, output_dir: str, model_name: str): + """Save results to Excel file, keeping all sheets and empty columns for unimplemented modalities""" + # Read template Excel sheets + template_dfs = pd.read_excel(template_excel, sheet_name=None) + + # Create new Excel writer + output_file = os.path.join(output_dir, f"{model_name}_result.xlsx") + with pd.ExcelWriter(output_file, engine='openpyxl') as writer: + for sheet_name, template_df in template_dfs.items(): + new_df = template_df.copy() + new_df[model_name] = None + found = False + for modality, task_results in results.items(): + for task_type, results_list in task_results.items(): + expected_sheet = f"{modality.value}-{task_type.value.capitalize()}" if modality != ModalityType.NLP else modality.value + if expected_sheet == sheet_name: + found = True + for task_result in results_list: + mask = (new_df['Task Name'] == task_result.task_name) & \ + (new_df['Metrics'] == task_result.metric) + if mask.any(): + new_df.loc[mask, model_name] = task_result.score + new_df.to_excel(writer, sheet_name=sheet_name, index=False) + if found: + print(f"Updated {sheet_name} sheet") + else: + print(f"{sheet_name} sheet empty, column preserved") + print(f"Results saved to {output_file}") + +def main(): + """Main function to process command line args and execute workflow""" + import argparse + + parser = argparse.ArgumentParser(description='Process multimodal evaluation data and generate Excel report') + parser.add_argument('-d', '--dataset_dir', type=str, default='General-Bench-Openset', + help='Dataset directory path (default: General-Bench-Openset)') + parser.add_argument('-t', '--template', type=str, default='references/template_result.xlsx', + help='Template Excel file path (default: references/template_result.xlsx)') + parser.add_argument('-p', '--pred_json_file', type=str, default='prediction.json', + help='Prediction JSON file name(default: prediction.json)') + parser.add_argument('-o', '--output_dir', type=str, default='outcome', + help='Output directory path (default: outcome)') + parser.add_argument('-m', '--model_name', type=str, default='test', help='Model name') + + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + print(f"Processing evaluation data for {args.model_name}...") + print(f"Dataset directory: {args.dataset_dir}") + print(f"Template file: {args.template}") + print(f"Output directory: {args.output_dir}") + + results = process_all_modalities(args.dataset_dir, args.pred_json_file) + save_to_excel(results, args.template, args.output_dir, args.model_name) + 
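+    # For reference (illustrative note matching save_to_excel above): the per-task scores
+    # end up in <output_dir>/<model_name>_result.xlsx, one sheet per template sheet such as
+    # "Image-Comprehension" or "NLP", with the model's scores in a column named after the model.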
+ print("Processing complete!") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..352b382d2e44790a27338d5b797d5d4a9964d56b --- /dev/null +++ b/run.sh @@ -0,0 +1,58 @@ +export CUDA_VISIBLE_DEVICES=0 + +DATASET_DIR=General-Bench-Openset +NLP_MODEL_NAME=Qwen/Qwen2.5-7B-Instruct +AUDIO_MODEL_NAME=Qwen/Qwen2-Audio-7B-Instruct +VIDEO_MODEL_NAME=Qwen/Qwen2.5-VL-3B-Instruct +IMAGE_MODEL_NAME=Qwen/Qwen2.5-VL-7B-Instruct +3D_MODEL_NAME=Qwen/Qwen2.5-3B-Instruct + +# 解析 step 参数 +STEP="123" +for arg in "$@"; do +case $arg in + --step=*) + STEP="${arg#*=}" + ;; + --step) + shift + STEP="$1" + ;; +esac +done + +contains_step() { +case "$STEP" in + *$1*) return 0 ;; + *) return 1 ;; +esac +} + +# Step1: Generate predictions for NLP, Image, Audio, Video, 3D tasks +if contains_step 1; then + # NLP + python predictors/nlp_predictor.py --dataset_dir ${DATASET_DIR}/nlp --model_name ${NLP_MODEL_NAME} + + # Audio + python predictors/audio_predict_comprehension.py -m Qwen/Qwen2-Audio-7B-Instruct -d ${DATASET_DIR}/audio/comprehension/ -o ${DATASET_DIR}/audio/predictions/comprehension/ -t AccentClassification AccentSexClassification + python predictors/audio_predict_generation.py -m SpeechGPT -d ${DATASET_DIR}/audio/generation/ -o ${DATASET_DIR}/audio/predictions/generation/ -t SingleCaptionToAudio VideoToAudio ImageToSpeech + + # Video + python predictors/video_comprehension_tasks.py + python predictors/video_comprehension_flow_matching_tracking.py + python predictors/video_comprehension_qa_caption.py + python predictors/video_translation_restoration_superresolution_objectdetection.py + python predictors/video_generation_evaluate_kit.py +fi + +MODEL_NAME=Qwen2.5-7B-Instruct +# Step2: Obtain the score for each task +if contains_step 2; then + python register.py -d ${DATASET_DIR} -t references/template_result.xlsx -o outcome -m ${MODEL_NAME} -p prediction.json +fi + +MODEL_NAME=Qwen2.5-7B-Instruct +# Step3: Obtain the Level score +if contains_step 3; then + python ranker.py -p outcome/${MODEL_NAME}_result.xlsx -m ${MODEL_NAME} +fi \ No newline at end of file diff --git a/utils/._special_metrix.py b/utils/._special_metrix.py new file mode 100644 index 0000000000000000000000000000000000000000..84cbf2804976ecc5e2ae916bae13e0d5d4e5fe66 Binary files /dev/null and b/utils/._special_metrix.py differ diff --git a/utils/__pycache__/._special_metrix.cpython-39.pyc b/utils/__pycache__/._special_metrix.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84cbf2804976ecc5e2ae916bae13e0d5d4e5fe66 Binary files /dev/null and b/utils/__pycache__/._special_metrix.cpython-39.pyc differ diff --git a/utils/__pycache__/base_processor.cpython-311.pyc b/utils/__pycache__/base_processor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18aec4f293a8d925b40a5b57e9a3d31bcf053f8d Binary files /dev/null and b/utils/__pycache__/base_processor.cpython-311.pyc differ diff --git a/utils/__pycache__/base_processor.cpython-312.pyc b/utils/__pycache__/base_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f1894f0a334f3622b928d9c890e87a35184f183 Binary files /dev/null and b/utils/__pycache__/base_processor.cpython-312.pyc differ diff --git a/utils/__pycache__/base_processor.cpython-38.pyc b/utils/__pycache__/base_processor.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..8222f8f0a75c928e69745d26d8d52c3f66cc40c8 Binary files /dev/null and b/utils/__pycache__/base_processor.cpython-38.pyc differ diff --git a/utils/__pycache__/base_processor.cpython-39.pyc b/utils/__pycache__/base_processor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ba650c863ead8e3e56bed49ef97b9fab82419de Binary files /dev/null and b/utils/__pycache__/base_processor.cpython-39.pyc differ diff --git a/utils/__pycache__/data_types.cpython-311.pyc b/utils/__pycache__/data_types.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b62accdb2e73ada7334e6bb7feb27b8afc901fa Binary files /dev/null and b/utils/__pycache__/data_types.cpython-311.pyc differ diff --git a/utils/__pycache__/data_types.cpython-312.pyc b/utils/__pycache__/data_types.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1520d18c2112389a99e32a9ea61f10eaa0e0ea9 Binary files /dev/null and b/utils/__pycache__/data_types.cpython-312.pyc differ diff --git a/utils/__pycache__/data_types.cpython-38.pyc b/utils/__pycache__/data_types.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45e96d9b2248f60a4a56800834d0caef4af43dac Binary files /dev/null and b/utils/__pycache__/data_types.cpython-38.pyc differ diff --git a/utils/__pycache__/data_types.cpython-39.pyc b/utils/__pycache__/data_types.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5e8eae13ea00ad7cc80072f22c51ed4aedf330d Binary files /dev/null and b/utils/__pycache__/data_types.cpython-39.pyc differ diff --git a/utils/__pycache__/special_metrix.cpython-310.pyc b/utils/__pycache__/special_metrix.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bd9dd001419bcb65ebd9c6a337ab2c5f7432ea9 Binary files /dev/null and b/utils/__pycache__/special_metrix.cpython-310.pyc differ diff --git a/utils/__pycache__/special_metrix.cpython-311.pyc b/utils/__pycache__/special_metrix.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c444c04bf751359e63b4cf55b17013d55718e59b Binary files /dev/null and b/utils/__pycache__/special_metrix.cpython-311.pyc differ diff --git a/utils/__pycache__/special_metrix.cpython-312.pyc b/utils/__pycache__/special_metrix.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59d896444773b44e1c935935bec1a5a412f589c6 Binary files /dev/null and b/utils/__pycache__/special_metrix.cpython-312.pyc differ diff --git a/utils/__pycache__/special_metrix.cpython-38.pyc b/utils/__pycache__/special_metrix.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36173db31c350c86514454188f61fec8af5f744a Binary files /dev/null and b/utils/__pycache__/special_metrix.cpython-38.pyc differ diff --git a/utils/__pycache__/special_metrix.cpython-39.pyc b/utils/__pycache__/special_metrix.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c628bca27a7239545443a9234506b62a0ba1d25a Binary files /dev/null and b/utils/__pycache__/special_metrix.cpython-39.pyc differ diff --git a/utils/base_processor.py b/utils/base_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..2d0879c8950c002b10858819ffa790a9322e7fce --- /dev/null +++ b/utils/base_processor.py @@ -0,0 +1,24 @@ +from typing import List +from .data_types import ModalityType, TaskType, TaskResult + +"""Base modality processor""" + +class 
BaseModalityProcessor: + def __init__(self, modality: ModalityType, + dataset_dir: str, + pred_json_file: str): + self.modality = modality + self.dataset_dir = dataset_dir + self.pred_json_file = pred_json_file + + def process_comprehension(self) -> List[TaskResult]: + """Process comprehension tasks, optional implementation""" + return [] + + def process_generation(self) -> List[TaskResult]: + """Process generation tasks, optional implementation""" + return [] + + def process(self) -> List[TaskResult]: + """Process tasks without type distinction (e.g., NLP tasks)""" + return [] \ No newline at end of file diff --git a/utils/data_types.py b/utils/data_types.py new file mode 100644 index 0000000000000000000000000000000000000000..d741a629f155988f5878e312145c879febdb3d6f --- /dev/null +++ b/utils/data_types.py @@ -0,0 +1,24 @@ +from typing import List, Dict, Union, Literal +from dataclasses import dataclass +from enum import Enum + +class TaskType(Enum): + COMPREHENSION = "comprehension" + GENERATION = "generation" + +class ModalityType(Enum): + IMAGE = "Image" + VIDEO = "Video" + AUDIO = "Audio" + NLP = "NLP" + THREE_D = "3D" + +@dataclass +class TaskResult: + task_name: str + metric: str + score: float + task_type: TaskType = TaskType.COMPREHENSION # Default to comprehension task + +# Store results for all modalities +ModalityResults = Dict[ModalityType, Dict[TaskType, List[TaskResult]]] \ No newline at end of file diff --git a/utils/special_metrix.py b/utils/special_metrix.py new file mode 100644 index 0000000000000000000000000000000000000000..bad0c0f4e07cccea4ca570a0bda582f1eaba14d6 --- /dev/null +++ b/utils/special_metrix.py @@ -0,0 +1,80 @@ +import math +import random + +def _sigmoid(x): + return 1 / (1 + math.exp(-x)) + + +def _2_sigmoid_minus_1(x): + return 2 * _sigmoid(x) - 1 + +def _tanh(x): + return math.tanh(x) + + +# mapping param for special metrix +special_metric_dict = { + # with T + 'MAE': 50, + 'RMS': 50, + 'MSE': 5, + 'RMSE': 5, + 'ABSREL': 0.1, + 'EPE': 1, + 'FID': 25, + 'FVD': 100, + 'FAD': 10, + 'PSNR': 1 / 20, # higher is better + 'SAD': 10, + 'RTE': 0.5, + 'CD': 1, + 'MCD': 5, + # without T + 'WER': None, + 'MS-SSIM': None, + 'MOS': None, +} + +HIGHER_IS_BETTER = [ + 'PSNR', +] + +def map_function_for_special(metrix: str, score: float) -> float: + """ + Score mapping function for special metrics. + >>> metrix: metrix name, str, e.g., 'MAE'. + >>> score: task score, float, e.g., 5.3. + return: mapped scores, float. + """ + metrix = metrix.upper() + T = special_metric_dict[metrix] + + assert score > 0, f'score should be > 0, but found: {score}' + + if metrix in HIGHER_IS_BETTER: + y = _tanh(T * score) + elif metrix == 'WER': + y = 1 - score + elif metrix == 'MS-SSIM': + y = (score + 1) / 2 + elif metrix == 'MOS': + y = (score - 1) / 4 + else: # lower is better + y = _2_sigmoid_minus_1(T / score) + + return y * 100 # Convert to percentage scale + +# • Normalizing WER: +# y = 1 − x, where x ∈ [0, 1], y ∈ [0, 1]. +# • Normalizing MS-SSIM: +# y = (x + 1) / 2 , where x ∈ [−1, 1], y ∈ [0, 1]. +# • Normalizing MOS: +# y = x − 1 / 4 , where x ∈ [1, 5], y ∈ [0, 1]. 
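+
+# Worked example (illustrative, using the T values in special_metric_dict above):
+# FID is lower-is-better with T = 25, so it maps through y = 2*sigmoid(T/score) - 1.
+# On the percentage scale this gives roughly:
+#   map_function_for_special('FID', 10.0) ≈ 84.8
+#   map_function_for_special('FID', 50.0) ≈ 24.5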
+ +if __name__ == '__main__': + r = random.random() + print(f"{r = }") + print(f"{_sigmoid(r) = }") + print(f"{_2_sigmoid_minus_1(r) = }") + print(f"{_tanh(r) = }") + print(f"{_tanh(r / 2) = }") \ No newline at end of file diff --git a/video_generation_evaluation b/video_generation_evaluation new file mode 160000 index 0000000000000000000000000000000000000000..102dea8ed7b69f3567aa5cd6fcc3e562ab738c2d --- /dev/null +++ b/video_generation_evaluation @@ -0,0 +1 @@ +Subproject commit 102dea8ed7b69f3567aa5cd6fcc3e562ab738c2d
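As a quick orientation for readers of the patch, the sketch below condenses how `ranker.py` turns per-task scores into level scores, using made-up toy numbers. The helpers `level_mono`, `level_3` and `level_4` mirror `get_level_mono`, `get_level_3` and `get_level_4` above; `level_2` is a simplified stand-in for `get_level_2` (a plain average of task scores), and Excel parsing, special-metric normalization, the RMSE adjustment, and the final cross-modality averaging are all omitted.

```python
def level_mono(sota, model):
    # A task contributes its score only when it ties or beats SoTA;
    # the average is taken over all tasks, as in get_level_mono.
    kept = [m if m >= s else 0.0 for s, m in zip(sota, model)]
    return sum(kept) / len(sota) if sota else 0.0

def level_2(comp, gen):
    # Simplified stand-in for get_level_2: plain average of all task scores.
    all_scores = comp + gen
    return sum(all_scores) / len(all_scores) if all_scores else 0.0

def level_3(sota_c, comp, sota_g, gen):
    # Mean of the SoTA-gated comprehension and generation averages.
    return (level_mono(sota_c, comp) + level_mono(sota_g, gen)) / 2

def level_4(sota_c, comp, sota_g, gen, eps=1e-6):
    # Harmonic mean: both tracks must beat SoTA somewhere to score at all.
    c, g = level_mono(sota_c, comp), level_mono(sota_g, gen)
    return 0.0 if c == 0 or g == 0 else 2 * c * g / (c + g + eps)

# Two comprehension and two generation tasks for one modality (toy values).
sota_c, comp = [80.0, 70.0], [85.0, 60.0]   # wins the first task only
sota_g, gen  = [75.0, 65.0], [78.0, 70.0]   # wins both tasks

print(f"Level-2: {level_2(comp, gen):.2f}")                  # 73.25
print(f"Level-3: {level_3(sota_c, comp, sota_g, gen):.2f}")  # 58.25
print(f"Level-4: {level_4(sota_c, comp, sota_g, gen):.2f}")  # 53.99

# Level-5 scales the Level-4 score by an NLP-based weight
# w = (average NLP score over SoTA-beating tasks) / 100, as in get_level_5.
nlp_sota, nlp_model = [60.0, 90.0], [70.0, 80.0]   # wins the first NLP task only
w = level_mono(nlp_sota, nlp_model) / 100           # 0.35
print(f"Level-5: {level_4(sota_c, comp, sota_g, gen) * w:.2f}")  # 18.90
```

In the real pipeline these numbers come from `outcome/{model_name}_result.xlsx` and `references/sota_result.xlsx`, and the Level-2 to Level-4 values reported by `ranker.py` are averaged over all participating modalities before the Level-5 weighting is applied.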