# Spaces: Running on CPU Upgrade
# %% | |
import os | |
import json | |
from huggingface_hub import Repository | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import matplotlib.figure | |
from datetime import datetime | |
from sklearn.preprocessing import MinMaxScaler | |
# import dotenv | |
# dotenv.load_dotenv() | |
min_max_scaler = MinMaxScaler() | |
# %% | |
def pull_results(results_dir: str):
    """Clone or refresh the ``vectara/results`` dataset repo in *results_dir*.

    A ``Repository`` handle is created for the local directory (with
    ``clone_from="vectara/results"`` and ``repo_type="dataset"``) and then
    ``git_pull()`` is called on it. Nothing is returned; any exception raised
    by ``Repository`` propagates to the caller.
    """
    # NOTE(review): newer huggingface_hub releases deprecate `Repository` --
    # confirm the pinned version still provides it.
    results_repo = Repository(
        local_dir=results_dir,
        clone_from="vectara/results",
        repo_type="dataset",
    )
    results_repo.git_pull()
def extract_info_from_result_file(result_file):
    """Read one result JSON file and flatten it into a leaderboard row.

    The file is expected to look like::

        {
            "config": {
                "model_dtype": "float16",
                "model_name": "databricks/dbrx-instruct",
                "model_sha": "main"
            },
            "results": {
                "hallucination_rate": {"hallucination_rate": 8.34990059642147},
                "factual_consistency_rate": {"factual_consistency_rate": 91.65009940357854},
                "answer_rate": {"answer_rate": 100.0},
                "average_summary_length": {"average_summary_length": 85.9}
            }
        }

    Args:
        result_file: Path of the JSON file to read.

    Returns:
        A dict with the keys ``"LLM"``, ``"Hallucination %"``, ``"Answer %"``
        and ``"Avg Summary Words"``.

    Raises:
        KeyError: If any of the expected keys is missing from the file.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(result_file, "r") as f:
        info = json.load(f)

    metrics = info["results"]
    # "factual_consistency_rate" is present in the file but deliberately not
    # exported into the leaderboard row.
    return {
        "LLM": info["config"]["model_name"],
        "Hallucination %": metrics["hallucination_rate"]["hallucination_rate"],
        "Answer %": metrics["answer_rate"]["answer_rate"],
        "Avg Summary Words": metrics["average_summary_length"]["average_summary_length"],
    }
def get_latest_result_file(dir: str):
    """Return the path of the most recently modified ``.json`` file in *dir*.

    Args:
        dir: Directory to look in (not searched recursively).

    Returns:
        ``os.path.join(dir, <file name>)`` for the ``.json`` file with the
        greatest modification time, or ``None`` when *dir* is not a directory
        or contains no ``.json`` files.
    """
    if not os.path.isdir(dir):
        return None

    json_paths = [
        os.path.join(dir, name)
        for name in os.listdir(dir)
        if name.endswith(".json")
    ]
    if not json_paths:
        return None

    # The previous implementation sorted ascending by mtime and returned the
    # first entry, i.e. the OLDEST file. Pick the newest instead. The path is
    # a tie-breaker so equal mtimes resolve deterministically (presumably file
    # names embed a sortable timestamp -- TODO confirm).
    return max(json_paths, key=lambda path: (os.path.getmtime(path), path))
def scan_and_extract(dir: str):
    """Walk *dir* and collect one leaderboard row per sub-directory.

    For every sub-directory found at any depth below *dir*, the file chosen by
    `get_latest_result_file` (if any) is passed to
    `extract_info_from_result_file`. Files that sit directly in *dir* itself
    are not considered, because only sub-directories are inspected.

    Returns:
        A list of the extracted row dicts, in `os.walk` order.
    """
    rows = []
    for parent, subdir_names, _files in os.walk(dir):
        for subdir_name in subdir_names:
            latest = get_latest_result_file(os.path.join(parent, subdir_name))
            if latest is None:
                # Sub-directory holds no JSON result file; nothing to extract.
                continue
            rows.append(extract_info_from_result_file(latest))
    return rows
def load_results(
    results_dir: str = "./results",
    results_json: str = "./results.json"
):
    """Build the leaderboard DataFrame, refreshing the cached JSON when possible.

    Steps:
      1. Try to pull the results repo into *results_dir* (best effort; a
         failure is only printed).
      2. Try to scan *results_dir*; when rows are found they are written to
         *results_json*.
      3. If scanning failed or found nothing, load the rows previously dumped
         in *results_json* instead.
      4. Replace ``"TBD"`` placeholders by 100, sort ascending by
         ``"Hallucination %"``, round the numeric columns to 3 decimals and
         add a lower-cased ``"LLM_lower_case"`` column.

    Args:
        results_dir: Local directory holding (or receiving) the results repo.
        results_json: Path of the cached JSON dump of extracted rows.

    Returns:
        A ``pandas.DataFrame`` sorted by ``"Hallucination %"`` ascending.

    Raises:
        OSError / json.JSONDecodeError: If the fallback *results_json* has to
            be read and cannot be opened or parsed.
    """
    try:
        pull_results(results_dir)
        print(f"Successfully pulled results from {results_dir}")
    except Exception as e:
        # Best effort: a stale local checkout is still usable.
        print(f"Failed to pull and/or extract latest results: {e}")

    results = []
    try:
        results = scan_and_extract(results_dir)
        if results:
            with open(results_json, "w") as f:
                json.dump(results, f, indent=2)
            print(f"Successfully scanned and extracted results from {results_dir} and saved to {results_json}")
        else:
            print(f"No results found in {results_dir}")
    except Exception as e:
        print(f"Failed to scan and extract results from {results_dir}: {e}")
        results = []

    if not results:
        # Covers both a failed scan and an empty one; previously an empty scan
        # fell through to sorting an empty frame and raised KeyError.
        print(f"Using pre-dumped results from {results_json}")
        with open(results_json, "r") as f:
            results = json.load(f)

    results_df = pd.DataFrame(results)
    # Replace "TBD" placeholders with 100 BEFORE sorting: sorting a column
    # that mixes strings and numbers raises TypeError, and 100 pushes the
    # undetermined entries to the end of the ascending order.
    results_df = results_df.replace("TBD", 100)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()

    return results_df
# %% | |
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Pick the label font size for one leaderboard bar.

    Args:
        LLM: Model name shown as the label.
        hallucination_percent: Hallucination value associated with the model.

    Returns:
        ``8.5`` when the value is below 0.25 and the name is longer than 10
        characters (short bar, long label); ``9`` in every other case. The
        return annotation is ``float`` because 8.5 is not an ``int``.
    """
    if hallucination_percent < 0.25 and len(LLM) > 10:
        return 8.5
    return 9
def determine_font_color(hallucination_percent: float) -> str:
    """Return the label color for a bar with the given hallucination value.

    Values strictly between 0.25 and 0.65 get ``'black'``; everything else
    (including both boundary values) gets ``'white'``.
    """
    # Presumably the middle of the colormap is light enough to need dark
    # text -- TODO confirm against the colormap actually used.
    in_middle_band = hallucination_percent > 0.25 and hallucination_percent < 0.65
    return 'black' if in_middle_band else 'white'
def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> "tuple[float, str]":
    """Decide where to draw a model label and in which color.

    The bar length is estimated as ``5 * hallucination_percent`` and compared
    with the number of characters in the name.

    Args:
        LLM: Model name to be drawn.
        hallucination_percent: Hallucination value, i.e. the bar's extent.

    Returns:
        An ``(x_position, font_color)`` pair: ``(0.01, <color from
        determine_font_color>)`` when the name fits inside the bar, otherwise
        ``(hallucination_percent, 'black')`` so the label starts at the bar's
        end. (The previous ``-> float`` annotation did not match this tuple.)
    """
    name_length = len(LLM)
    print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

    # Heuristic conversion from data units to "characters of label width".
    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent

    if name_length < bar_length:
        # Label fits inside the bar: draw it near the left edge.
        return 0.01, determine_font_color(hallucination_percent)
    # Label goes to the right of the bar, where the background is plain.
    return hallucination_percent, 'black'
def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
    """Draw a horizontal bar chart of the first ten rows of *df*.

    Args:
        df: Leaderboard frame with at least the ``"LLM"`` and
            ``"Hallucination %"`` columns. Only ``df.head(10)`` is plotted, so
            the frame is assumed to be sorted already (``load_results`` sorts
            ascending by ``"Hallucination %"``).

    Returns:
        The matplotlib ``Figure`` holding the chart. *df* itself is not
        modified.
    """
    fig = plt.figure(figsize=(8, 4))

    # Work on an explicit copy: adding a column to the result of `head()`
    # triggers pandas' SettingWithCopyWarning and may write through to the
    # caller's frame.
    plot_df = df.head(10).copy()
    # Min-max scale the plotted values to [0, 1] so they can index the colormap.
    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])

    # Horizontal bars: one per LLM, colored by the normalized rate.
    plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))

    # Print each bar's value just past its right end.
    for _, row in plot_df.iterrows():
        plt.text(
            row["Hallucination %"] + 0.025,
            row["LLM"],
            row["Hallucination %"],
            ha='left',
            va='center',
            fontsize=9,
        )

    plt.tight_layout()

    # add margin to the right of the plot
    plt.subplots_adjust(right=0.95)

    plt.xticks(fontsize=9)
    plt.xlabel(f"Copyright (2025) Vectara, Inc. Plot generated on: {datetime.now().strftime('%B %d, %Y')}", fontsize=9)
    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)

    axes = plt.gca()
    for side in ('top', 'right', 'left'):
        axes.spines[side].set_visible(False)
    axes.invert_yaxis()  # Invert the y-axis to display bars top-down

    return fig
# %% | |
if __name__ == "__main__":
    # Script entry point: rebuild the cached JSON dump from the local
    # ./results checkout without pulling or plotting anything.
    extracted_rows = scan_and_extract("./results")

    with open("./results.json", "w") as dump_file:
        json.dump(extracted_rows, dump_file, indent=2)
# %% | |