import argparse
import os
import pickle
import random
from collections import Counter

import numpy as np
import torch

from src.index import Indexer
from src.text_embedding import TextEmbeddingModel

# Class-type value stored in is_mixed_dict for purely AI-generated text.
# NOTE: in Python `10 ^ 3` is bitwise XOR (== 9), not exponentiation
# (10 ** 3 == 1000). The expression is kept verbatim as a named constant so
# the dictionary keys below still match the values stored in is_mixed_dict.
AI_TYPE = 10 ^ 3


def softmax_weights(scores, temperature=1.0):
    """Convert similarity scores into weights via a temperature-scaled softmax."""
    scores = np.array(scores) / temperature
    e_scores = np.exp(scores - np.max(scores))  # subtract max for numerical stability
    return e_scores / np.sum(e_scores)


def normalize_fuzzy_cnt(fuzzy_cnt):
    """Normalize a dict of class scores in place so its values sum to 1."""
    total = sum(fuzzy_cnt.values())
    if total == 0:
        return fuzzy_cnt
    for key in fuzzy_cnt:
        fuzzy_cnt[key] /= total
    return fuzzy_cnt


def class_type_boost(query_type, candidate_type):
    """Boost or penalize a neighbor's weight by how close its class type is
    to the majority-vote prediction."""
    if query_type == candidate_type:
        return 1.3
    elif abs(query_type - candidate_type) == 1:
        return 1.1
    elif abs(query_type - candidate_type) == 2:
        return 0.9
    else:
        return 0.8


def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU
    np.random.seed(seed)  # NumPy module
    random.seed(seed)  # Python random module


def load_pkl(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text_list, K):
    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    # Tensors stay on CPU here; move them to GPU if the model expects it.
    encoded_text = {k: v for k, v in encoded_text.items()}
    embeddings = model(encoded_text).cpu().detach().numpy()

    top_ids_and_scores = index.search_knn(embeddings, K)
    pred = []
    for ids, scores in top_ids_and_scores:
        # Rank the K neighbors by descending similarity score
        sorted_scores = np.argsort(scores)[::-1]
        topk_ids = [ids[j] for j in sorted_scores]
        topk_scores = [scores[j] for j in sorted_scores]
        weights = softmax_weights(topk_scores, temperature=0.1)

        # Majority vote over the neighbors' class types seeds the boost
        candidate_models = [is_mixed_dict[int(_id)] for _id in topk_ids]
        initial_pred = Counter(candidate_models).most_common(1)[0][0]

        # Accumulate boosted softmax weights per (label, is_mixed) class
        fuzzy_cnt = {(1, 0): 0.0, (0, AI_TYPE): 0.0, (1, 1): 0.0}
        for _id, weight in zip(topk_ids, weights):
            label = (label_dict[int(_id)], is_mixed_dict[int(_id)])
            boost = class_type_boost(is_mixed_dict[int(_id)], initial_pred)
            fuzzy_cnt[label] += weight * boost

        # Convert accumulated scores to percentages: 0 = Human, 1 = AI, 2 = Human+AI
        total_score = sum(fuzzy_cnt.values())
        final = {
            0: round(fuzzy_cnt[(1, 0)] / total_score * 100, 2),
            1: round(fuzzy_cnt[(0, AI_TYPE)] / total_score * 100, 2),
            2: round(fuzzy_cnt[(1, 1)] / total_score * 100, 2),
        }
        pred.append(final)
    return pred


def infer_model_specific(model, tokenizer, index, label_dict, is_mixed_dict,
                         write_model_dict, text_list, K, K_model):
    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    # Tensors stay on CPU here; move them to GPU if the model expects it.
    encoded_text = {k: v for k, v in encoded_text.items()}
    embeddings = model(encoded_text).cpu().detach().numpy()

    # Retrieve K nearest neighbors (e.g. K=21); the top K_model of them
    # (e.g. 9) are reused for the model-specific prediction
    top_ids_and_scores = index.search_knn(embeddings, K)
    pred = []
    for ids, scores in top_ids_and_scores:
        sorted_scores = np.argsort(scores)[::-1]

        # All K results, used for the 3-class prediction
        topk_ids = [ids[j] for j in sorted_scores]
        topk_scores = [scores[j] for j in sorted_scores]

        # Top K_model results, used for the model-specific prediction
        topk_ids_model = topk_ids[:K_model]
        topk_scores_model = topk_scores[:K_model]

        # Process 3-class prediction (using all K neighbors)
        weights_3class = softmax_weights(topk_scores, temperature=0.1)
        candidate_models = [is_mixed_dict[int(_id)] for _id in topk_ids]
        initial_pred = Counter(candidate_models).most_common(1)[0][0]

        fuzzy_cnt_3class = {(1, 0): 0.0, (0, AI_TYPE): 0.0, (1, 1): 0.0}
        for _id, weight in zip(topk_ids, weights_3class):
            label_3class = (label_dict[int(_id)], is_mixed_dict[int(_id)])
            boost_3class = class_type_boost(is_mixed_dict[int(_id)], initial_pred)
            fuzzy_cnt_3class[label_3class] += weight * boost_3class

        # Process model-specific prediction (using the top K_model neighbors)
        weights_model = softmax_weights(topk_scores_model, temperature=0.4)
        candidate_models_model = [is_mixed_dict[int(_id)] for _id in topk_ids_model]
        initial_pred_model = Counter(candidate_models_model).most_common(1)[0][0]

        fuzzy_cnt_model = {
            (1, 0, 0): 0.0,  # Human
            (0, AI_TYPE, 1): 0.0, (0, AI_TYPE, 2): 0.0,
            (0, AI_TYPE, 3): 0.0, (0, AI_TYPE, 4): 0.0,  # AI
            (1, 1, 1): 0.0, (1, 1, 2): 0.0,
            (1, 1, 3): 0.0, (1, 1, 4): 0.0,  # Human+AI
        }
        for _id, weight in zip(topk_ids_model, weights_model):
            label_model = (label_dict[int(_id)], is_mixed_dict[int(_id)],
                           write_model_dict[int(_id)])
            boost_model = class_type_boost(is_mixed_dict[int(_id)], initial_pred_model)
            fuzzy_cnt_model[label_model] += weight * boost_model

        # Calculate 3-class probabilities (percentages): 0 = Human, 1 = AI, 2 = Human+AI
        total_score_3class = sum(fuzzy_cnt_3class.values())
        final_3class = {
            0: round(fuzzy_cnt_3class[(1, 0)] / total_score_3class * 100, 2),
            1: round(fuzzy_cnt_3class[(0, AI_TYPE)] / total_score_3class * 100, 2),
            2: round(fuzzy_cnt_3class[(1, 1)] / total_score_3class * 100, 2),
        }

        # Model-specific prediction: the highest-scoring fine-grained label
        final_model = max(fuzzy_cnt_model, key=fuzzy_cnt_model.get)

        # Combine both predictions
        final = {
            "score": final_3class,
            "model": final_model,
        }
        pred.append(final)
    return pred
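

# ---------------------------------------------------------------------------
# Hypothetical usage sketch, included only to show how the pieces above fit
# together. The TextEmbeddingModel constructor, the `tokenizer` attribute,
# the embedding dimension, the Indexer `deserialize_from` call, and the file
# names are all assumptions, not confirmed APIs of src.index or
# src.text_embedding; adapt them to however the index and label dictionaries
# are actually built and stored in this repo.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--index_dir", type=str, default="index")  # assumed layout
    parser.add_argument("--label_dict", type=str, default="label_dict.pkl")
    parser.add_argument("--is_mixed_dict", type=str, default="is_mixed_dict.pkl")
    parser.add_argument("--K", type=int, default=21)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    set_seed(args.seed)

    embedder = TextEmbeddingModel()         # assumed default constructor
    tokenizer = embedder.tokenizer          # assumed attribute
    index = Indexer(768)                    # assumed embedding dimension
    index.deserialize_from(args.index_dir)  # assumed Indexer API

    label_dict = load_pkl(os.path.join(args.index_dir, args.label_dict))
    is_mixed_dict = load_pkl(os.path.join(args.index_dir, args.is_mixed_dict))

    texts = ["Example document to classify."]
    print(infer_3_class(embedder, tokenizer, index, label_dict,
                        is_mixed_dict, texts, args.K))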