import os
import pickle
import random
import argparse
from collections import Counter

import numpy as np
import torch

from src.index import Indexer
from src.text_embedding import TextEmbeddingModel
def softmax_weights(scores, temperature=1.0):
    """Turn raw similarity scores into softmax weights; a lower temperature sharpens the distribution."""
    scores = np.array(scores) / temperature
    e_scores = np.exp(scores - np.max(scores))  # subtract the max for numerical stability
    return e_scores / np.sum(e_scores)
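
# Example of the temperature effect: softmax_weights([2.0, 1.0, 0.0]) ≈ [0.665, 0.245, 0.090],
# while softmax_weights([2.0, 1.0, 0.0], temperature=0.1) ≈ [1.0, 0.0, 0.0].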
def normalize_fuzzy_cnt(fuzzy_cnt):
    """Normalize a dict of fuzzy vote counts in place so its values sum to 1."""
    total = sum(fuzzy_cnt.values())
    if total == 0:
        return fuzzy_cnt
    for key in fuzzy_cnt:
        fuzzy_cnt[key] /= total
    return fuzzy_cnt
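
# Example: normalize_fuzzy_cnt({"a": 3.0, "b": 1.0}) -> {"a": 0.75, "b": 0.25}.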
def class_type_boost(query_type, candidate_type):
    """Boost (or dampen) a neighbor's vote according to how close its class type is to the majority type."""
    if query_type == candidate_type:
        return 1.3
    elif abs(query_type - candidate_type) == 1:
        return 1.1
    elif abs(query_type - candidate_type) == 2:
        return 0.9
    else:
        return 0.8
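
# Example: class_type_boost(1, 1) -> 1.3 (same type), class_type_boost(0, 1) -> 1.1,
# and any gap larger than 2 (e.g. class_type_boost(0, 9)) -> 0.8.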
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU
    np.random.seed(seed)  # NumPy module
    random.seed(seed)  # Python random module
def load_pkl(path):
    with open(path, 'rb') as f:
        return pickle.load(f)
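
# Typical use (the path below is illustrative): label_dict = load_pkl("data/label_dict.pkl")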
def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text_list, K):
    """Classify each text as Human / AI / Human+AI via boosted, weighted KNN voting over the index."""
    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    encoded_text = {k: v for k, v in encoded_text.items()}  # plain dict of tensors for the model
    embeddings = model(encoded_text).cpu().detach().numpy()
    top_ids_and_scores = index.search_knn(embeddings, K)
    pred = []
    for ids, scores in top_ids_and_scores:
        order = np.argsort(scores)[::-1]  # neighbors sorted by descending similarity
        topk_ids = [ids[j] for j in order]
        topk_scores = [scores[j] for j in order]
        weights = softmax_weights(topk_scores, temperature=0.1)
        # Majority class type among the neighbors, used as the reference point for boosting.
        candidate_types = [is_mixed_dict[int(_id)] for _id in topk_ids]
        initial_pred = Counter(candidate_types).most_common(1)[0][0]
        # NOTE: ^ is bitwise XOR in Python, so 10^3 evaluates to 9 (not 1000). It is kept
        # verbatim because it is used consistently as the sentinel value for the AI class.
        fuzzy_cnt = {(1, 0): 0.0, (0, 10^3): 0.0, (1, 1): 0.0}  # Human / AI / Human+AI
        for _id, weight in zip(topk_ids, weights):
            label = (label_dict[int(_id)], is_mixed_dict[int(_id)])
            boost = class_type_boost(is_mixed_dict[int(_id)], initial_pred)
            fuzzy_cnt[label] += weight * boost
        total_score = sum(fuzzy_cnt.values())
        final = {
            0: round(fuzzy_cnt[(1, 0)] / total_score * 100, 2),     # Human
            1: round(fuzzy_cnt[(0, 10^3)] / total_score * 100, 2),  # AI
            2: round(fuzzy_cnt[(1, 1)] / total_score * 100, 2),     # Human+AI
        }
        pred.append(final)
    return pred
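
# Each per-text result is a dict of percentages keyed 0 / 1 / 2 for Human / AI / Human+AI,
# e.g. {0: 6.25, 1: 88.5, 2: 5.25} (the values here are illustrative).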
def infer_model_specific(model, tokenizer, index, label_dict, is_mixed_dict, write_model_dict, text_list, K, K_model):
    """Joint inference: 3-class scores from the top-K neighbors plus a model-specific label from the top-K_model."""
    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    encoded_text = {k: v for k, v in encoded_text.items()}  # plain dict of tensors for the model
    embeddings = model(encoded_text).cpu().detach().numpy()
    top_ids_and_scores = index.search_knn(embeddings, K)
    pred = []
    for ids, scores in top_ids_and_scores:
        order = np.argsort(scores)[::-1]
        # All K results, used for the 3-class prediction.
        topk_ids = [ids[j] for j in order]
        topk_scores = [scores[j] for j in order]
        # Top K_model results, used for the model-specific prediction.
        topk_ids_model = topk_ids[:K_model]
        topk_scores_model = topk_scores[:K_model]
        # 3-class prediction over all K neighbors, as in infer_3_class.
        weights_3class = softmax_weights(topk_scores, temperature=0.1)
        candidate_types = [is_mixed_dict[int(_id)] for _id in topk_ids]
        initial_pred = Counter(candidate_types).most_common(1)[0][0]
        fuzzy_cnt_3class = {(1, 0): 0.0, (0, 10^3): 0.0, (1, 1): 0.0}  # Human / AI / Human+AI (10^3 == 9, see above)
        for _id, weight in zip(topk_ids, weights_3class):
            label_3class = (label_dict[int(_id)], is_mixed_dict[int(_id)])
            boost_3class = class_type_boost(is_mixed_dict[int(_id)], initial_pred)
            fuzzy_cnt_3class[label_3class] += weight * boost_3class
        # Model-specific prediction over the top K_model neighbors, with a softer temperature.
        weights_model = softmax_weights(topk_scores_model, temperature=0.4)
        candidate_types_model = [is_mixed_dict[int(_id)] for _id in topk_ids_model]
        initial_pred_model = Counter(candidate_types_model).most_common(1)[0][0]
        fuzzy_cnt_model = {
            (1, 0, 0): 0.0,  # Human
            (0, 10^3, 1): 0.0, (0, 10^3, 2): 0.0, (0, 10^3, 3): 0.0, (0, 10^3, 4): 0.0,  # AI, by writing model
            (1, 1, 1): 0.0, (1, 1, 2): 0.0, (1, 1, 3): 0.0, (1, 1, 4): 0.0,  # Human+AI, by writing model
        }
        for _id, weight in zip(topk_ids_model, weights_model):
            label_model = (label_dict[int(_id)], is_mixed_dict[int(_id)], write_model_dict[int(_id)])
            boost_model = class_type_boost(is_mixed_dict[int(_id)], initial_pred_model)
            fuzzy_cnt_model[label_model] += weight * boost_model
        # 3-class probabilities as percentages.
        total_score_3class = sum(fuzzy_cnt_3class.values())
        final_3class = {
            0: round(fuzzy_cnt_3class[(1, 0)] / total_score_3class * 100, 2),
            1: round(fuzzy_cnt_3class[(0, 10^3)] / total_score_3class * 100, 2),
            2: round(fuzzy_cnt_3class[(1, 1)] / total_score_3class * 100, 2),
        }
        # Model-specific prediction: the label tuple with the highest fuzzy vote.
        final_model = max(fuzzy_cnt_model, key=fuzzy_cnt_model.get)
        # Combine both predictions.
        pred.append({"score": final_3class, "model": final_model})
    return pred