#!/usr/bin/env python
"""Format metric results from a (multi)run as a printable table.

Reads either a single ``job_return_value.json`` file or a ``.txt`` index file
listing result directories (optionally prefixed with an id, ``id:path``),
flattens the nested metric dictionary into a pandas DataFrame, applies
optional reshaping / filtering / formatting steps, and prints the result as
markdown, csv, tsv, or json.
"""
import argparse
import json
import os
import re
from pathlib import Path

import pandas as pd

from pie_modules.utils import flatten_dict


def str2record(s: str | None, sep_parts: str = "-", sep_k_v: str = "=") -> pd.Series:
    """Parse a string like ``"k1=v1-k2=v2"`` into a Series ``{k1: v1, k2: v2}``.

    Args:
        s: The encoded string. ``None``, empty/whitespace-only strings, and the
            literal string ``"None"`` yield an empty Series.
        sep_parts: Separator between key-value pairs.
        sep_k_v: Separator between a key and its value (split at most once, so
            values may themselves contain the separator).

    Returns:
        A Series mapping keys to (string) values.
    """
    if s is None or s.strip() == "" or s == "None":
        # explicit dtype avoids pandas' FutureWarning for an empty Series
        return pd.Series(dtype="object")
    return pd.Series(dict(k_v.split(sep_k_v, 1) for k_v in s.split(sep_parts)))


def separate_path_and_id(path_and_maybe_id: str, separator: str = ":") -> tuple[str | None, str]:
    """Split an index-file line into an optional id prefix and the remainder.

    The caller treats the first element as an (optional) id and the second as
    the path, i.e. lines are expected in the form ``"<id>:<path>"`` or just
    ``"<path>"``.

    Returns:
        ``(None, line)`` if the separator does not occur, otherwise the pair
        of the text before and after the first separator.
    """
    parts = path_and_maybe_id.split(separator, 1)
    if len(parts) == 1:
        return None, parts[0]
    return parts[0], parts[1]


def load_data_from_json(path: str | Path) -> pd.DataFrame:
    """Load a (maybe nested) dict of equal-length value lists as a DataFrame.

    The nested dictionary is flattened with ``flatten_dict`` so that each leaf
    list becomes one column.
    """
    with open(path, "r") as f:
        data_json = json.load(f)
    data_flat = flatten_dict(data_json)
    return pd.DataFrame(data_flat)


def main(
    path: str | Path,
    remove_col_prefix: str | None = None,
    sparse_col_prefix: str | None = None,
    tail_cols: list[str] | None = None,
    sort_cols: list[str] | None = None,
    split_col: str | None = None,
    replace_in_col_names: list[tuple[str, str]] | None = None,
    round_precision: int | None = None,
    in_percent: bool = False,
    common_prefix_separator: str | None = None,
    column_regex_blacklist: list[str] | None = None,
    column_regex_whitelist: list[str] | None = None,
    format: str = "markdown",
) -> None:
    """Load, transform, and print metric results.

    Args:
        path: A ``.json`` result file, or a ``.txt`` index file whose lines are
            ``[id:]path`` entries pointing to directories that contain a
            ``job_return_value.json`` file.
        remove_col_prefix: Literal prefix to strip from column names.
        sparse_col_prefix: Prefix identifying sparse columns that get melted
            into a name/value column pair.
        tail_cols: Columns to move to the end.
        sort_cols: Columns to sort rows by (also moved to the front).
        split_col: Column whose entries ("k1=v1-k2=v2-...") are split into
            separate columns.
        replace_in_col_names: (old, new) literal replacements for column names.
        round_precision: Number of decimal places to round to.
        in_percent: Multiply all float columns by 100.
        common_prefix_separator: For string columns, remove the common prefix
            up to (and including) the last occurrence of this separator.
        column_regex_blacklist: Drop columns matching any of these patterns.
        column_regex_whitelist: Keep only columns matching any of these patterns.
        format: Output format: markdown, csv, tsv, or json.

    Raises:
        ValueError: If the input file extension or the format is unsupported.
    """
    if str(path).lower().endswith(".json"):
        result = load_data_from_json(path)
    elif str(path).lower().endswith(".txt"):
        with open(path, "r") as f:
            index_data = [separate_path_and_id(line.strip()) for line in f.readlines()]
        data_list = []
        for meta_id, meta_path in index_data:
            data = load_data_from_json(os.path.join(meta_path, "job_return_value.json"))
            if meta_id is not None:
                # prefix job ids so entries from different runs stay distinct
                job_id_prefix = meta_id.replace(",", "-")
                data["job_id"] = job_id_prefix + "-" + data["job_id"].astype(str)
            data = data.set_index("job_id")
            data_list.append(data)
        result = pd.concat(data_list, axis=1).reset_index()
    else:
        raise ValueError("Unsupported file format. Please provide a .json or .txt file.")

    if remove_col_prefix is not None:
        # re.escape: treat the prefix as a literal, not as a regex fragment
        result.columns = result.columns.str.replace(
            r"^" + re.escape(remove_col_prefix), "", regex=True
        )

    if sparse_col_prefix is not None:
        # get all columns that contain just one not-nan value
        # number_of_non_nan_values = len(df) - df.isna().sum()
        # df_sparse = df.loc[:, number_of_non_nan_values == 1]
        sparse_cols = [col for col in result.columns if col.startswith(sparse_col_prefix)]
        other_cols = [col for col in result.columns if col not in sparse_cols]
        value_col = f"{sparse_col_prefix}value"
        name_col = f"{sparse_col_prefix}name"
        result = result.melt(
            id_vars=other_cols, value_vars=sparse_cols, var_name=name_col, value_name=value_col
        ).dropna(
            subset=[value_col]
        )  # keep rows with a value
        # strip the prefix (e.g. "f1-"), leaving just the numeric threshold
        result[name_col] = result[name_col].str.replace(
            r"^" + re.escape(sparse_col_prefix), "", regex=True
        )
        # convert the column to numeric (if possible)
        try:
            result[name_col] = pd.to_numeric(result[name_col])
        except ValueError:
            # if it fails, just keep it as a string
            pass

    if split_col is not None:
        new_frame = result[split_col].apply(str2record)
        result = pd.concat([result.drop(columns=[split_col]), new_frame], axis=1)

    if in_percent:
        float_columns = result.select_dtypes(include=["float64", "float32"]).columns
        result[float_columns] = result[float_columns] * 100

    if round_precision is not None:
        # round all columns to the given precision
        result = result.round(round_precision)

    if common_prefix_separator is not None:
        # remove common prefix from values in all string columns
        obj_columns = result.select_dtypes(include=["object"]).columns
        for obj_col in obj_columns:
            # get the common prefix
            common_prefix = os.path.commonprefix(result[obj_col].dropna().astype(str).tolist())
            # find last occurrence of the common_prefix_separator
            last_occurrence = common_prefix.rfind(common_prefix_separator)
            if last_occurrence != -1:
                # truncate the common prefix after the last occurrence of the separator
                common_prefix = common_prefix[: last_occurrence + len(common_prefix_separator)]
                # remove the common prefix (including the separator) from the column;
                # re.escape: paths often contain regex metacharacters like "."
                result[obj_col] = result[obj_col].str.replace(
                    r"^" + re.escape(common_prefix), "", regex=True
                )

    # sort columns to get a deterministic order
    result = result.sort_index(axis=1)

    if tail_cols is not None:
        front_cols = [c for c in result.columns if c not in tail_cols]
        result = result[front_cols + tail_cols]

    if sort_cols is not None:
        result = result.sort_values(sort_cols)
        # also move the sort columns to the front
        result = result[sort_cols + [c for c in result.columns if c not in sort_cols]]

    if column_regex_blacklist is not None:
        # remove columns that match any of the regex patterns in the blacklist
        for pattern in column_regex_blacklist:
            result = result.loc[:, ~result.columns.str.contains(pattern, regex=True)]

    if column_regex_whitelist is not None:
        # keep only columns that match any of the regex patterns in the whitelist
        result = result.loc[
            :, result.columns.str.contains("|".join(column_regex_whitelist), regex=True)
        ]

    if replace_in_col_names is not None:
        for old_value, new_value in replace_in_col_names:
            result.columns = result.columns.str.replace(old_value, new_value, regex=False)

    if format == "markdown":
        result_str = result.to_markdown(index=False)
    elif format == "csv":
        result_str = result.to_csv(index=False)
    elif format == "tsv":
        result_str = result.to_csv(index=False, sep="\t")
    elif format == "json":
        result_str = result.to_json(orient="records", lines=True)
    else:
        raise ValueError(
            f"Unsupported format: {format}. Supported formats are: markdown, csv, tsv, json."
        )

    print(result_str)


if __name__ == "__main__":
    """
    Example usage:
    python src/analysis/format_metric_results.py \
        logs/document_evaluation/multiruns/default/2025-05-21_11-59-19/job_return_value.json \
        --remove-col-prefix train/ \
        --sparse-col-prefix f1- \
        --split-col job_id \
        --tail-cols num_positives num_total \
        --sort-cols experiment model \
        --round-precision 4
    """

    parser = argparse.ArgumentParser(
        description="Process a JSON file containing metric results (from multirun) and print as Markdown table."
    )
    parser.add_argument(
        "path",
        type=str,
        help="Path to the JSON file to process. The JSON file is expected to contain "
        "a (maybe nested) dictionary where each leaf entry is a list of values with "
        "the same length.",
    )
    parser.add_argument(
        "--remove-col-prefix",
        type=str,
        default=None,
        help="Prefix to remove from column names.",
    )
    parser.add_argument(
        "--sparse-col-prefix",
        type=str,
        default=None,
        help="Prefix of sparse columns. All sparse columns will be melted into "
        "two columns: name and value. The name column will "
        "be converted to numeric if possible.",
    )
    parser.add_argument(
        "--split-col",
        type=str,
        default=None,
        help="Column to split into multiple columns. The format of the "
        "column entries is expected to be: key1=value1-key2=value2-...",
    )
    parser.add_argument(
        "--tail-cols",
        type=str,
        nargs="+",
        default=None,
        help="Columns to move to the end.",
    )
    parser.add_argument(
        "--sort-cols",
        type=str,
        nargs="+",
        default=None,
        help="Columns to sort by (they will be moved to the front).",
    )
    parser.add_argument(
        "--replace-in-col-names",
        type=lambda s: s.split(":", 1),
        nargs="+",
        default=None,
        help='List of strings in the format "old:new" to replace substrings in column names.',
    )
    parser.add_argument(
        "--round-precision",
        type=int,
        default=None,
        help="Number of decimal places to round to.",
    )
    parser.add_argument(
        "--in-percent",
        action="store_true",
        default=False,
        help="If set, all float columns will be multiplied by 100 to convert them to percentages.",
    )
    parser.add_argument(
        "--common-prefix-separator",
        type=str,
        default=None,
        help="For all string columns, remove the common prefix up to the last occurrence of this separator.",
    )
    parser.add_argument(
        "--column-regex-blacklist",
        type=str,
        nargs="+",
        default=None,
        help="List of regex patterns to match column names. "
        "Columns that match any of the patterns will be removed.",
    )
    parser.add_argument(
        "--column-regex-whitelist",
        type=str,
        nargs="+",
        default=None,
        help="List of regex patterns to match column names. "
        "Only columns that match any of the patterns will be kept.",
    )
    parser.add_argument(
        "--format",
        type=str,
        default="markdown",
        choices=["markdown", "csv", "tsv", "json"],
        help="Format to print the result in. Supported formats are: markdown, csv, tsv, json.",
    )

    kwargs = vars(parser.parse_args())
    main(**kwargs)