import ast
import json
from glob import glob

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
)


def evaluate_predictions(ground_truth, model_scores, mode="multiclass"):
    """
    Compute evaluation metrics (accuracy, precision, recall, F1-score) for the
    given ground_truth and model_scores lists.

    :param ground_truth: list of true labels
    :param model_scores: list of predicted labels
    :param mode: "multiclass" or "binary"; kept for the caller's bookkeeping,
                 currently unused inside this function
    :return: dict containing accuracy, classification report, confusion matrix,
             and macro-averaged precision, recall, and F1-score
    """
    accuracy = accuracy_score(ground_truth, model_scores)

    # Macro averaging, to match the metric names used in the returned dict.
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        ground_truth,
        model_scores,
        average='macro'
    )

    class_report = classification_report(ground_truth, model_scores)
    cm = confusion_matrix(ground_truth, model_scores)

    return {
        'accuracy': accuracy,
        'precision (macro avg)': precision,
        'recall (macro avg)': recall,
        'f1_score (macro avg)': f1_score,
        'classification_report': class_report,
        'confusion_matrix': cm
    }
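
# Illustrative usage of evaluate_predictions (made-up labels, not taken from
# the benchmark logs):
#
#     metrics = evaluate_predictions(["yes", "no", "yes"], ["yes", "yes", "yes"])
#     metrics["accuracy"]                  # 0.666...
#     metrics["f1_score (macro avg)"]      # macro average over both classes
#     print(metrics["classification_report"])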


def extract_json_output(data):
    """Extract the JSON object embedded in the ```json ... ``` fenced block of
    the model's API response; return None if it cannot be parsed."""
    api_response = data["api_response"]

    start = api_response.find("```json") + len("```json")
    end = api_response.rfind("```")
    json_str = api_response[start:end].strip()

    try:
        # literal_eval parses the dict without executing code; fall back to
        # json.loads for strict JSON (e.g. true/false/null literals).
        return ast.literal_eval(json_str)
    except (ValueError, SyntaxError):
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return None
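
# For reference, each benchmark log is assumed to be a JSON record roughly of
# the (illustrative) shape below: the model reply embeds a fenced ```json
# block with the assessments, and "ground_truth" is a dict serialized as a
# string. Records whose fenced block cannot be parsed yield None above and are
# skipped by the loop in __main__.
#
#     {
#         "api_response": "...model reasoning...\n```json\n{\"assessments\": [{\"is_met\": \"yes\"}]}\n```",
#         "ground_truth": "{'is_met': 'yes'}"
#     }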


if __name__ == "__main__":

    all_files = glob("benchmark_logs/DeepSeek-R1-Distill-Qwen-1.5B/*.json")
    print(f"Found {len(all_files)} benchmark files")

    # "multiclass" keeps yes/no/undetermined as separate labels,
    # "binary" collapses "undetermined" into "no".
    mode = "multiclass"

    failed_files = 0
    ground_truths = []
    inference_scores = []

    for file_path in all_files:
        with open(file_path) as f:
            record = json.load(f)

        parsed = extract_json_output(record)
        if parsed is None:
            continue

        try:
            prediction = parsed['assessments'][0]['is_met'].lower()

            # The ground truth is assumed to be a dict serialized as a string,
            # e.g. "{'is_met': 'yes'}".
            ground_truth = ast.literal_eval(record['ground_truth'])['is_met']

            if mode == "binary":
                prediction = "no" if prediction == "undetermined" else prediction
                ground_truth = "no" if ground_truth == "undetermined" else ground_truth

            # Append both in one place so the two lists stay the same length
            # even when a record fails partway through.
            inference_scores.append(prediction)
            ground_truths.append(ground_truth)

        except Exception as e:
            print(e)
            failed_files += 1

    print(len(ground_truths), len(inference_scores))
    results = evaluate_predictions(ground_truths, inference_scores, mode=mode)

    print(f"Number of failed files: {failed_files}")
    print("Accuracy:", results['accuracy'])
    print("Precision (macro avg):", results['precision (macro avg)'])
    print("Recall (macro avg):", results['recall (macro avg)'])
    print("F1-score (macro avg):", results['f1_score (macro avg)'])
    print("Classification Report:\n", results['classification_report'])
    print("Confusion Matrix:\n", results['confusion_matrix'])