# %%
import os
import json
from typing import List, Optional, Tuple

from huggingface_hub import Repository
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.figure
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

# import dotenv
# dotenv.load_dotenv()

# Module-level scaler; visualize_leaderboard() re-fits it on every call to map
# the plotted hallucination rates onto [0, 1] for the colour map.
min_max_scaler = MinMaxScaler()


# %%
def pull_results(results_dir: str) -> None:
    """Create a `Repository` for the `vectara/results` dataset and pull it.

    Args:
        results_dir: Local directory used as the repository working tree.
    """
    # NOTE(review): `Repository` is reported as deprecated in recent
    # huggingface_hub releases -- confirm against the pinned version.
    repo = Repository(
        local_dir=results_dir,
        clone_from="vectara/results",
        repo_type="dataset",
    )
    repo.git_pull()


def extract_info_from_result_file(result_file):
    """Read one result JSON file and flatten it into a leaderboard row.

    The file is expected to look like::

        {
            "config": {
                "model_dtype": "float16",
                "model_name": "databricks/dbrx-instruct",
                "model_sha": "main"
            },
            "results": {
                "hallucination_rate": {
                    "hallucination_rate": 8.34990059642147
                },
                "factual_consistency_rate": {
                    "factual_consistency_rate": 91.65009940357854
                },
                "answer_rate": {
                    "answer_rate": 100.0
                },
                "average_summary_length": {
                    "average_summary_length": 85.9
                }
            }
        }

    Args:
        result_file: Path of the JSON file to read.

    Returns:
        A dict with the keys "LLM", "Hallucination %", "Answer %" and
        "Avg Summary Words".

    Raises:
        KeyError: If any of the expected keys is missing from the file.
    """
    # Context manager so the handle is closed even if parsing fails.
    with open(result_file, "r", encoding="utf-8") as f:
        info = json.load(f)

    metrics = info["results"]
    return {
        "LLM": info["config"]["model_name"],
        "Hallucination %": metrics["hallucination_rate"]["hallucination_rate"],
        # The factual consistency rate is present in the file but deliberately
        # not exported as a column.
        "Answer %": metrics["answer_rate"]["answer_rate"],
        "Avg Summary Words": metrics["average_summary_length"]["average_summary_length"],
    }


def get_latest_result_file(dir: str) -> Optional[str]:
    """Return the most recently modified ``.json`` file in ``dir``.

    "Latest" is decided by the file modification time (``os.path.getmtime``),
    not by any timestamp embedded in the file name. Files with identical
    modification times are ordered by path so the choice is deterministic.

    Args:
        dir: Directory to inspect (not searched recursively).

    Returns:
        The path (``dir`` joined with the file name) of the newest JSON file,
        or None if ``dir`` is not a directory or holds no ``.json`` files.
    """
    if not os.path.isdir(dir):
        return None

    candidates = [
        os.path.join(dir, name)
        for name in os.listdir(dir)
        if name.endswith(".json")
    ]
    if not candidates:
        return None

    # max() picks the newest file; the previous ascending sort followed by
    # `files[0]` returned the *oldest* one.
    return max(candidates, key=lambda path: (os.path.getmtime(path), path))


def scan_and_extract(dir: str) -> List[dict]:
    """Walk ``dir`` recursively and extract one result per sub-directory.

    Every directory found below ``dir`` is checked with
    `get_latest_result_file`; when it yields a file, that file is parsed with
    `extract_info_from_result_file`. JSON files located directly in ``dir``
    itself are not considered.

    Args:
        dir: Root directory of the results tree.

    Returns:
        A list of result dicts, in `os.walk` traversal order.
    """
    results = []
    for root, subdirs, _files in os.walk(dir):
        # A distinct loop name keeps the `dir` parameter from being shadowed.
        for subdir in subdirs:
            result_file = get_latest_result_file(os.path.join(root, subdir))
            if result_file is not None:
                results.append(extract_info_from_result_file(result_file))
    return results


def load_results(
    results_dir: str = "./results",
    results_json: str = "./results.json",
) -> pd.DataFrame:
    """Refresh the results, cache them as JSON and return them as a DataFrame.

    Steps:
        1. Best-effort pull of the results repository into ``results_dir``.
        2. Scan ``results_dir``; if anything is found, dump it to
           ``results_json``.
        3. If scanning fails or finds nothing, fall back to the results
           previously dumped in ``results_json``.

    Args:
        results_dir: Directory holding the per-model result folders.
        results_json: Path of the JSON cache of extracted results.

    Returns:
        A DataFrame sorted by ascending "Hallucination %", with the numeric
        columns rounded to 3 decimals and an extra "LLM_lower_case" column.

    Raises:
        OSError: If the fallback is needed and ``results_json`` cannot be read.
    """
    # Pulling is best effort: a stale local copy is still usable.
    try:
        pull_results(results_dir)
        print (f"Successfully pulled results from {results_dir}")
    except Exception as e:
        print(f"Failed to pull and/or extract latest results: {e}")

    results = []
    try:
        scanned = scan_and_extract(results_dir)
        if scanned:
            with open(results_json, "w", encoding="utf-8") as f:
                json.dump(scanned, f, indent=2)
            print(f"Successfully scanned and extracted results from {results_dir} and saved to {results_json}")
            results = scanned
        else:
            print(f"No results found in {results_dir}")
    except Exception as e:
        print(f"Failed to scan and extract results from {results_dir}: {e}")

    if not results:
        # Without this fallback an empty scan produced an empty DataFrame and
        # the sort below failed on the missing "Hallucination %" column.
        print(f"Using pre-dumped results from {results_json}")
        with open(results_json, "r", encoding="utf-8") as f:
            results = json.load(f)

    results_df = pd.DataFrame(results)

    # Replace the "TBD" placeholder with 100 *before* sorting, so the sort
    # never has to compare strings with numbers.
    results_df = results_df.replace("TBD", 100)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()

    return results_df


# %%
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Choose the label font size from the model name and hallucination rate.

    A smaller size (8.5) is used when the rate is below 0.25 and the name is
    longer than 10 characters; otherwise the size is 9.

    Args:
        LLM: Model name.
        hallucination_percent: Hallucination rate of the model.

    Returns:
        The font size in points.
    """
    if hallucination_percent < 0.25 and len(LLM) > 10:
        return 8.5
    return 9


def determine_font_color(hallucination_percent: float) -> str:
    """Return 'black' for rates strictly between 0.25 and 0.65, else 'white'."""
    if 0.25 < hallucination_percent < 0.65:
        return 'black'
    return 'white'


def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> Tuple[float, str]:
    """Decide where a model label is placed relative to its bar, and its colour.

    The bar length is estimated as 5 x ``hallucination_percent``. When the name
    has fewer characters than that estimate, the label is placed at x = 0.01
    with the colour from `determine_font_color`; otherwise it is placed at
    x = ``hallucination_percent`` in black.

    Args:
        LLM: Model name.
        hallucination_percent: Hallucination rate of the model.

    Returns:
        A ``(x_position, font_color)`` tuple.
    """
    name_length = len(LLM)
    print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent

    if name_length < bar_length:
        return 0.01, determine_font_color(hallucination_percent)
    # to the right of the bar, black anyway
    return hallucination_percent, 'black'


def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
    """Draw a horizontal bar chart of the first 10 rows of ``df``.

    Each bar shows a model's "Hallucination %", coloured with the jet colour
    map over the min-max normalised rates, and is annotated with its value just
    right of the bar end. The label-placement helper
    `determine_llm_x_position_and_font_color` is currently not used here.

    Args:
        df: Leaderboard with at least the "LLM" and "Hallucination %" columns.
            It is not modified.

    Returns:
        The matplotlib figure containing the chart.
    """
    fig = plt.figure(figsize=(8, 4))

    # Copy the slice: adding a column to the bare `head()` result triggers
    # pandas' SettingWithCopy warning and may not write through.
    plot_df = df.head(10).copy()
    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])

    # Horizontal bars: one per LLM, length = hallucination rate.
    plt.barh(
        plot_df["LLM"],
        plot_df["Hallucination %"],
        color=plt.cm.jet(plot_df["normalized_hallucination_rate"]),
    )

    for _, row in plot_df.iterrows():
        plt.text(
            row["Hallucination %"] + 0.025,  # x: slightly past the bar end
            row["LLM"],                      # y: the bar's category
            row["Hallucination %"],          # text: the rate itself
            ha='left',
            va='center',
            fontsize=9,
        )

    plt.tight_layout()

    # add margin to the right of the plot
    plt.subplots_adjust(right=0.95)

    plt.xticks(fontsize=9)
    plt.xlabel(f"Copyright (2025) Vectara, Inc. Plot generated on: {datetime.now().strftime('%B %d, %Y')}", fontsize=9)
    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)

    axes = plt.gca()
    for side in ('top', 'right', 'left'):
        axes.spines[side].set_visible(False)
    axes.invert_yaxis()  # Invert the y-axis to display bars top-down

    return fig


# %%
if __name__ == "__main__":
    # Re-scan the local results tree and refresh the JSON cache.
    results = scan_and_extract("./results")
    with open("./results.json", "w") as f:
        json.dump(results, f, indent=2)

# %%