# falcon-api/infer.py
import argparse
import os
import pickle
import random
from collections import Counter

import numpy as np
import torch

from src.index import Indexer
from src.text_embedding import TextEmbeddingModel

def softmax_weights(scores, temperature=1.0):
    """Convert raw similarity scores into softmax weights; lower temperatures
    concentrate the weight on the highest-scoring neighbors."""
    scores = np.array(scores) / temperature
    e_scores = np.exp(scores - np.max(scores))  # subtract the max for numerical stability
    return e_scores / np.sum(e_scores)

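# Illustrative behaviour (hypothetical scores, not repo data): at the sharp
# temperature 0.1 used for the 3-class vote below, softmax_weights([0.9, 0.8, 0.1])
# returns roughly [0.73, 0.27, 0.00], so the closest neighbors dominate; at
# temperature 1.0 the same scores give a much flatter [0.42, 0.38, 0.19].
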
def normalize_fuzzy_cnt(fuzzy_cnt):
    """Normalize the fuzzy counts in place so they sum to 1 (no-op if all zero)."""
    total = sum(fuzzy_cnt.values())
    if total == 0:
        return fuzzy_cnt
    for key in fuzzy_cnt:
        fuzzy_cnt[key] /= total
    return fuzzy_cnt

def class_type_boost(query_type, candidate_type):
    """Weight a neighbor by how closely its mixed-type code matches the
    majority-vote type of the retrieved neighborhood."""
    if query_type == candidate_type:
        return 1.3
    elif abs(query_type - candidate_type) == 1:
        return 1.1
    elif abs(query_type - candidate_type) == 2:
        return 0.9
    else:
        return 0.8

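# Worked examples (the type codes follow the fuzzy_cnt keys used below, where
# is_mixed appears to be 0 for Human, 1 for Human+AI, and 10**3 for AI -- an
# assumption read off those keys, not stated elsewhere in this file):
#   class_type_boost(1, 1)     -> 1.3  (same type as the majority vote)
#   class_type_boost(0, 1)     -> 1.1  (adjacent type, mild boost)
#   class_type_boost(10**3, 1) -> 0.8  (distant type, penalized)
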
def set_seed(seed):
    """Seed every RNG in play so inference is reproducible."""
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU
    np.random.seed(seed)  # NumPy module
    random.seed(seed)  # Python random module

def load_pkl(path):
    """Load a pickled object from disk."""
    with open(path, "rb") as f:
        return pickle.load(f)

def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text_list, K):
    """Return Human / AI / Human+AI percentages for each text via weighted KNN voting."""
    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    encoded_text = {k: v for k, v in encoded_text.items()}  # BatchEncoding -> plain dict
    embeddings = model(encoded_text).cpu().detach().numpy()
    top_ids_and_scores = index.search_knn(embeddings, K)
    pred = []
    for ids, scores in top_ids_and_scores:
        order = np.argsort(scores)[::-1]  # neighbor indices, highest score first
        topk_ids = [ids[j] for j in order]
        topk_scores = [scores[j] for j in order]
        weights = softmax_weights(topk_scores, temperature=0.1)
        # Majority vote over the neighbors' mixed-type codes gives an initial guess.
        candidate_types = [is_mixed_dict[int(_id)] for _id in topk_ids]
        initial_pred = Counter(candidate_types).most_common(1)[0][0]
        # Keys are (label, is_mixed): (1, 0) = Human, (0, 10**3) = AI, (1, 1) = Human+AI.
        # 10**3 (== 1000) must match the is_mixed sentinel stored when the index was built.
        fuzzy_cnt = {(1, 0): 0.0, (0, 10**3): 0.0, (1, 1): 0.0}
        for _id, weight in zip(topk_ids, weights):
            label = (label_dict[int(_id)], is_mixed_dict[int(_id)])
            boost = class_type_boost(is_mixed_dict[int(_id)], initial_pred)
            fuzzy_cnt[label] += weight * boost
        total_score = sum(fuzzy_cnt.values())
        final = {
            0: round(fuzzy_cnt[(1, 0)] / total_score * 100, 2),      # Human
            1: round(fuzzy_cnt[(0, 10**3)] / total_score * 100, 2),  # AI
            2: round(fuzzy_cnt[(1, 1)] / total_score * 100, 2),      # Human+AI
        }
        pred.append(final)
    return pred

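# Sketch of infer_3_class output for two texts (scores are hypothetical):
#   [{0: 71.42, 1: 6.13, 2: 22.45},   # mostly Human
#    {0: 2.08, 1: 91.77, 2: 6.15}]    # mostly AI
# Keys: 0 = Human, 1 = AI, 2 = Human+AI; values are percentages summing to ~100.
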
def infer_model_specific(model, tokenizer, index, label_dict, is_mixed_dict, write_model_dict, text_list, K, K_model):
    """Return 3-class percentages plus the most likely writing model per text."""
    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    encoded_text = {k: v for k, v in encoded_text.items()}  # BatchEncoding -> plain dict
    embeddings = model(encoded_text).cpu().detach().numpy()
    # Retrieve K neighbors (K = 21); all of them drive the 3-class prediction,
    # while only the top K_model (= 9) drive the model-specific prediction.
    top_ids_and_scores = index.search_knn(embeddings, K)
    pred = []
    for ids, scores in top_ids_and_scores:
        order = np.argsort(scores)[::-1]  # neighbor indices, highest score first
        topk_ids = [ids[j] for j in order]
        topk_scores = [scores[j] for j in order]
        topk_ids_model = topk_ids[:K_model]
        topk_scores_model = topk_scores[:K_model]
        # 3-class prediction over all K neighbors (sharper temperature).
        weights_3class = softmax_weights(topk_scores, temperature=0.1)
        candidate_types = [is_mixed_dict[int(_id)] for _id in topk_ids]
        initial_pred = Counter(candidate_types).most_common(1)[0][0]
        fuzzy_cnt_3class = {(1, 0): 0.0, (0, 10**3): 0.0, (1, 1): 0.0}
        for _id, weight in zip(topk_ids, weights_3class):
            label_3class = (label_dict[int(_id)], is_mixed_dict[int(_id)])
            boost_3class = class_type_boost(is_mixed_dict[int(_id)], initial_pred)
            fuzzy_cnt_3class[label_3class] += weight * boost_3class
        # Model-specific prediction over the top K_model neighbors (flatter temperature).
        weights_model = softmax_weights(topk_scores_model, temperature=0.4)
        candidate_types_model = [is_mixed_dict[int(_id)] for _id in topk_ids_model]
        initial_pred_model = Counter(candidate_types_model).most_common(1)[0][0]
        # Keys are (label, is_mixed, write_model); 10**3 (== 1000) is the AI sentinel.
        fuzzy_cnt_model = {
            (1, 0, 0): 0.0,  # Human
            (0, 10**3, 1): 0.0, (0, 10**3, 2): 0.0, (0, 10**3, 3): 0.0, (0, 10**3, 4): 0.0,  # AI
            (1, 1, 1): 0.0, (1, 1, 2): 0.0, (1, 1, 3): 0.0, (1, 1, 4): 0.0,  # Human+AI
        }
        for _id, weight in zip(topk_ids_model, weights_model):
            label_model = (label_dict[int(_id)], is_mixed_dict[int(_id)], write_model_dict[int(_id)])
            boost_model = class_type_boost(is_mixed_dict[int(_id)], initial_pred_model)
            fuzzy_cnt_model[label_model] += weight * boost_model
        # 3-class probabilities, expressed as percentages.
        total_score_3class = sum(fuzzy_cnt_3class.values())
        final_3class = {
            0: round(fuzzy_cnt_3class[(1, 0)] / total_score_3class * 100, 2),      # Human
            1: round(fuzzy_cnt_3class[(0, 10**3)] / total_score_3class * 100, 2),  # AI
            2: round(fuzzy_cnt_3class[(1, 1)] / total_score_3class * 100, 2),      # Human+AI
        }
        # Model-specific prediction: the highest-scoring (label, is_mixed, model) key.
        final_model = max(fuzzy_cnt_model, key=fuzzy_cnt_model.get)
        pred.append({"score": final_3class, "model": final_model})
    return pred

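# ---------------------------------------------------------------------------
# Minimal CLI sketch showing how the pieces above fit together. The argparse
# flags, the TextEmbeddingModel() construction, the `model.tokenizer`
# attribute, the Indexer(768) embedding dimension, and `deserialize_from` are
# all assumptions, not defined in this file -- adapt them to the actual
# implementations in src/.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="KNN-based AI-text inference")
    parser.add_argument("--index_dir", required=True, help="directory holding the serialized index")
    parser.add_argument("--label_dict", required=True, help="pickled id -> label mapping")
    parser.add_argument("--is_mixed_dict", required=True, help="pickled id -> is_mixed mapping")
    parser.add_argument("--text", required=True, help="text to classify")
    parser.add_argument("--k", type=int, default=21, help="number of neighbors to retrieve")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    set_seed(args.seed)
    model = TextEmbeddingModel()            # assumed default construction
    tokenizer = model.tokenizer             # assumed attribute on the embedding model
    index = Indexer(768)                    # assumed embedding dimension
    index.deserialize_from(args.index_dir)  # assumed load method on Indexer

    label_dict = load_pkl(args.label_dict)
    is_mixed_dict = load_pkl(args.is_mixed_dict)
    print(infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, [args.text], args.k))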