ScientificArgumentRecommender / src /analysis /show_score_distribution.py
ArneBinder's picture
upload https://github.com/ArneBinder/pie-document-level/pull/452
e7eaeed verified
import pyrootutils
root = pyrootutils.setup_root(
search_from=__file__,
indicator=[".project-root"],
pythonpath=True,
dotenv=False,
)
import argparse
from typing import List, Optional
import pandas as pd
import plotly.figure_factory as ff
from pie_datasets import DatasetDict
pd.options.plotting.backend = "plotly"
def build_parser() -> argparse.ArgumentParser:
    """Create the command line parser of this script."""
    parser = argparse.ArgumentParser(
        description="Show score distribution of annotations per layer"
    )
    # --data-dir predictions/default/2025-02-26_14-28-17
    parser.add_argument(
        "--data-dir", type=str, required=True, help="Path to the dataset directory"
    )
    parser.add_argument("--split", type=str, default="test", help="Dataset split to use")
    parser.add_argument(
        "--layers",
        nargs="+",
        default=["labeled_spans", "binary_relations"],
        help="Annotation layers to use",
    )
    # --layer-captions ADUs "Argumentative Relations"
    parser.add_argument(
        "--layer-captions", nargs="+", help="Captions for the figure traces per layer"
    )
    # --layer-colors "rgb(31,119,180)" "rgb(255,127,14)"
    parser.add_argument("--layer-colors", nargs="+", help="Colors for the figure traces per layer")
    return parser


def map_layers_to(layers: List[str], values: Optional[List[str]], what: str) -> Optional[dict]:
    """Pair each layer name with the per-layer option value at the same position.

    Args:
        layers: Names of the annotation layers.
        values: Option values given on the command line, or ``None`` if the
            option was omitted.
        what: Plural noun naming the option; only used in the error message.

    Returns:
        ``None`` if ``values`` is ``None``, otherwise a dict from layer name to
        value. Surplus values beyond ``len(layers)`` are ignored.

    Raises:
        ValueError: If fewer values than layers are given.
    """
    if values is None:
        return None
    if len(values) < len(layers):
        raise ValueError(f"Not enough {what} provided for all layers")
    return dict(zip(layers, values))


def collect_scores(documents, layers: List[str]) -> pd.Series:
    """Gather the scores of all predicted annotations of the given layers.

    Args:
        documents: Iterable of documents. Each document is accessed via
            ``doc.id`` and ``doc[layer].predictions``; every predicted
            annotation must have a ``score`` attribute.
        layers: Names of the annotation layers to read.

    Returns:
        A Series named ``"score"`` indexed by ``(doc_id, layer, label)``. The
        label is ``None`` for annotations that have no ``label`` attribute.
        The Series is empty (but keeps its three index levels) if there is no
        prediction at all.
    """
    records = [
        (doc.id, layer, getattr(ann, "label", None), ann.score)
        for doc in documents
        for layer in layers
        for ann in doc[layer].predictions
    ]
    # Building the index from named columns keeps all three levels available
    # even when `records` is empty.
    frame = pd.DataFrame(records, columns=["doc_id", "layer", "label", "score"])
    return frame.set_index(["doc_id", "layer", "label"])["score"]


def group_scores_by_layer(scores: pd.Series, layers: List[str]) -> dict:
    """Split the scores into one array per layer.

    Args:
        scores: Series with an index level named ``"layer"``.
        layers: Layer names to extract, in the desired output order.

    Returns:
        A dict from layer name to a numpy array with the scores of that layer.
        Layers without any score are left out instead of raising, so callers
        can decide how to deal with them.
    """
    layer_of_score = scores.index.get_level_values("layer")
    groups = {}
    for layer in layers:
        values = scores.loc[layer_of_score == layer].to_numpy()
        if len(values) > 0:
            groups[layer] = values
    return groups


def main() -> None:
    """Plot the score distribution of predicted annotations per layer."""
    args = build_parser().parse_args()
    layers = args.layers

    # Validate the per-layer options first so that a wrong invocation fails
    # before the (potentially slow) dataset loading.
    name_mapping = map_layers_to(layers, args.layer_captions, "captions")
    if name_mapping is None:
        name_mapping = {layer: layer for layer in layers}
    color_mapping = map_layers_to(layers, args.layer_colors, "colors")

    # Load the dataset
    ds = DatasetDict.from_json(data_dir=args.data_dir)[args.split]

    # get scores per annotation layer and label
    scores = collect_scores(ds, layers)
    score_groups = group_scores_by_layer(scores, layers)
    if not score_groups:
        raise ValueError(
            f"No scored predictions found in layers {layers} of split '{args.split}'"
        )
    skipped = [layer for layer in layers if layer not in score_groups]
    if skipped:
        print(f"Skipping layers without predictions: {skipped}")

    # Captions and colors are looked up per plotted layer so they stay aligned
    # with the histogram data even if some layers were skipped.
    plotted_layers = list(score_groups)
    colors: Optional[List[str]] = None
    if color_mapping is not None:
        colors = [color_mapping[layer] for layer in plotted_layers]

    fig = ff.create_distplot(
        [score_groups[layer] for layer in plotted_layers],
        group_labels=[name_mapping[layer] for layer in plotted_layers],
        show_hist=True,
        colors=colors,
        bin_size=0.025,
    )
    fig.update_layout(
        height=600,
        width=800,
        title_text="Score Distribution per Annotation Layer",
        title_x=0.5,
        barmode="group",
    )
    fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))
    fig.show()
    print("done")


if __name__ == "__main__":
    main()