# out/kista_benchmarking/deepseek_benchmarking.py
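"""
Evaluate benchmark logs produced by DeepSeek-R1-Distill-Qwen-1.5B.

For each JSON log file, the script extracts the model's ```json ... ``` output
block, compares the predicted 'is_met' label against the stored ground truth,
and reports accuracy, macro-averaged precision/recall/F1, a per-class
classification report, and a confusion matrix.
"""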
from glob import glob
import ast
import json
# We'll use scikit-learn for evaluation metrics.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
def evaluate_predictions(ground_truth, model_scores, mode="multiclass"):
    """
    Compute evaluation metrics (accuracy, precision, recall, F1-score)
    for the given ground_truth and model_scores lists.

    :param ground_truth: list of true labels
    :param model_scores: list of predicted labels
    :param mode: label mode ("multiclass" or "binary"); currently informational only
    :return: dict containing accuracy, macro-averaged precision/recall/F1,
             the classification report, and the confusion matrix
    """
    # Calculate accuracy
    accuracy = accuracy_score(ground_truth, model_scores)
    # Calculate macro-averaged precision, recall, and F1-score
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        ground_truth,
        model_scores,
        average='macro'
    )
    # Generate a per-class classification report
    class_report = classification_report(ground_truth, model_scores)
    # Generate a confusion matrix
    cm = confusion_matrix(ground_truth, model_scores)
    # Return the metrics in a dictionary
    return {
        'accuracy': accuracy,
        'precision (macro avg)': precision,
        'recall (macro avg)': recall,
        'f1_score (macro avg)': f1_score,
        'classification_report': class_report,
        'confusion_matrix': cm
    }
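# Illustrative usage with hypothetical labels (not taken from the benchmark logs):
#
#     metrics = evaluate_predictions(
#         ["yes", "no", "undetermined", "yes"],   # ground-truth labels
#         ["yes", "no", "no", "yes"],             # model predictions
#     )
#     metrics["accuracy"]          # -> 0.75 for these toy lists
#     metrics["confusion_matrix"]  # rows = true labels, columns = predicted labels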
def extract_json_output(data):
    """Extract and parse the ```json ... ``` block from the model's API response."""
    api_response = data["api_response"]
    # Find the JSON block between ```json and the closing ```
    start = api_response.find("```json")
    end = api_response.rfind("```")
    if start == -1 or end <= start:
        return None
    json_str = api_response[start + 7:end].strip()
    try:
        return json.loads(json_str)
    except Exception:
        return None
if __name__ == "__main__":
    # Collect the benchmark log files for this model
    all_files = glob("benchmark_logs/DeepSeek-R1-Distill-Qwen-1.5B/*.json")
    print(len(all_files))
    # "multiclass" keeps yes/no/undetermined as-is; "binary" collapses "undetermined" into "no"
    mode = "multiclass"
    failed_count = 0
    ground_truths = []
    inference_scores = []
    # Read each file and extract ground truth + model predictions
    for file_path in all_files:
        with open(file_path) as f:
            sample = json.load(f)
        parsed = extract_json_output(sample)
        if parsed is None:
            continue
        try:
            # Extract the predicted 'is_met' label from the parsed model output
            prediction = parsed['assessments'][0]['is_met'].lower()
            # Parse the string in sample['ground_truth'] and extract 'is_met'
            ground_truth = ast.literal_eval(sample['ground_truth'])['is_met']
            if mode == "binary":
                # Collapse "undetermined" into "no" for binary evaluation
                if prediction == "undetermined":
                    prediction = "no"
                if ground_truth == "undetermined":
                    ground_truth = "no"
            # Append both only after both parses succeed, so the lists stay aligned
            inference_scores.append(prediction)
            ground_truths.append(ground_truth)
        except Exception as e:
            # If something goes wrong, report it and skip this file
            print(e)
            failed_count += 1
    # Evaluate predictions
    print(len(ground_truths), len(inference_scores))
    results = evaluate_predictions(ground_truths, inference_scores, mode=mode)
    # Print results
    print(f"Number of failed files: {failed_count}")
    print("Accuracy:", results['accuracy'])
    print("Precision (macro avg):", results['precision (macro avg)'])
    print("Recall (macro avg):", results['recall (macro avg)'])
    print("F1-score (macro avg):", results['f1_score (macro avg)'])
    print("Classification Report:\n", results['classification_report'])
    print("Confusion Matrix:\n", results['confusion_matrix'])