import pyrootutils root = pyrootutils.setup_root( search_from=__file__, indicator=[".project-root"], pythonpath=True, dotenv=False, ) import argparse from typing import List, Optional import pandas as pd import plotly.figure_factory as ff from pie_datasets import DatasetDict pd.options.plotting.backend = "plotly" if __name__ == "__main__": parser = argparse.ArgumentParser( description="Show score distribution of annotations per layer" ) # --data-dir predictions/default/2025-02-26_14-28-17 parser.add_argument( "--data-dir", type=str, required=True, help="Path to the dataset directory" ) parser.add_argument("--split", type=str, default="test", help="Dataset split to use") parser.add_argument( "--layers", nargs="+", default=["labeled_spans", "binary_relations"], help="Annotation layers to use", ) # --layer-captions ADUs "Argumentative Relations" parser.add_argument( "--layer-captions", nargs="+", help="Captions for the figure traces per layer" ) # --layer-colors "rgb(31,119,180)" "rgb(255,127,14)" parser.add_argument("--layer-colors", nargs="+", help="Colors for the figure traces per layer") args = parser.parse_args() # Load the dataset ds = DatasetDict.from_json(data_dir=args.data_dir)[args.split] # get scores per annotation layer and label layers = args.layers all_scores = [] all_scores_idx = [] for doc in ds: for layer in layers: for ann in doc[layer].predictions: all_scores.append(ann.score) all_scores_idx.append((doc.id, layer, getattr(ann, "label", None))) scores = pd.Series( all_scores, index=pd.MultiIndex.from_tuples(all_scores_idx, names=["doc_id", "layer", "label"]), name="score", ) if args.layer_captions is not None: if len(args.layer_captions) < len(layers): raise ValueError("Not enough captions provided for all layers") name_mapping = dict(zip(layers, args.layer_captions)) else: name_mapping = dict(zip(layers, layers)) colors: Optional[List[str]] = None if args.layer_colors is not None: if len(args.layer_colors) < len(layers): raise ValueError("Not enough colors provided for all layers") color_mapping = dict(zip(layers, args.layer_colors)) colors = [color_mapping[layer] for layer in layers] else: colors = None score_groups = {layer: scores.xs(layer, level="layer").to_numpy() for layer in layers} group_labels, hist_data = zip(*score_groups.items()) group_labels_renamed = [name_mapping[label] for label in group_labels] fig = ff.create_distplot( hist_data, group_labels=group_labels_renamed, show_hist=True, colors=colors, bin_size=0.025, ) fig.update_layout( height=600, width=800, title_text="Score Distribution per Annotation Layer", title_x=0.5, barmode="group", ) fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)) fig.show() print("done")