|
import pyrootutils |
|
|
|
# Resolve the project root by searching upwards from this file for a
# ".project-root" marker file. Per the options passed here, the root is also
# put on the Python path (pythonpath=True) and no .env file is loaded
# (dotenv=False). This has to run before the remaining imports below.
root = pyrootutils.setup_root(
    search_from=__file__,
    indicator=[".project-root"],
    dotenv=False,
    pythonpath=True,
)
|
|
|
import argparse |
|
from typing import List, Optional |
|
|
|
import pandas as pd |
|
import plotly.figure_factory as ff |
|
from pie_datasets import DatasetDict |
|
|
|
# Make plotly the backend for pandas' own ``.plot()`` API.
# NOTE(review): no ``.plot()`` call is visible in this script (the figure is
# built with plotly's figure_factory directly) -- confirm this is still needed.
pd.set_option("plotting.backend", "plotly")
|
|
|
# ``create_distplot`` fits a kernel density estimate per group, which cannot be
# computed from fewer than two values, so smaller groups are left out of the plot.
MIN_SCORES_PER_LAYER = 2


def resolve_layer_options(
    layers: List[str],
    captions: Optional[List[str]] = None,
    colors: Optional[List[str]] = None,
):
    """Map every layer name to its figure caption and (optionally) its color.

    Args:
        layers: Names of the annotation layers, in plotting order.
        captions: One caption per layer; when ``None`` the layer name itself
            is used as caption. Surplus captions are ignored.
        colors: One color per layer; when ``None`` no colors are assigned.
            Surplus colors are ignored.

    Returns:
        A tuple ``(name_mapping, color_mapping)``. ``name_mapping`` is a dict
        from layer name to caption; ``color_mapping`` is a dict from layer
        name to color, or ``None`` if no colors were given.

    Raises:
        ValueError: If fewer captions or fewer colors than layers are given.
    """
    if captions is None:
        name_mapping = {layer: layer for layer in layers}
    else:
        if len(captions) < len(layers):
            raise ValueError("Not enough captions provided for all layers")
        name_mapping = dict(zip(layers, captions))

    color_mapping = None
    if colors is not None:
        if len(colors) < len(layers):
            raise ValueError("Not enough colors provided for all layers")
        color_mapping = dict(zip(layers, colors))

    return name_mapping, color_mapping


def collect_layer_scores(documents, layers: List[str]) -> dict:
    """Gather the scores of all predicted annotations, grouped by layer.

    Args:
        documents: Iterable of documents. Each document is indexed by layer
            name and the result must expose ``.predictions``, an iterable of
            annotations that each carry a ``.score``.
        layers: Names of the annotation layers to read.

    Returns:
        A dict from layer name to the list of scores found for that layer.
        Every requested layer has an entry, possibly an empty list.
    """
    scores_by_layer: dict = {layer: [] for layer in layers}
    for doc in documents:
        for layer in layers:
            scores_by_layer[layer].extend(ann.score for ann in doc[layer].predictions)
    return scores_by_layer


def select_plottable_layers(
    scores_by_layer: dict, min_scores: int = MIN_SCORES_PER_LAYER
) -> List[str]:
    """Return the layers (in dict order) that have at least ``min_scores`` scores."""
    return [layer for layer, values in scores_by_layer.items() if len(values) >= min_scores]


def main() -> None:
    """Plot the score distribution of predicted annotations per layer.

    Parses the command line, loads the requested dataset split, collects the
    prediction scores per annotation layer and shows them as a plotly distplot.

    Raises:
        ValueError: If fewer captions/colors than layers are given, or if no
            layer has enough predictions to be plotted.
    """
    parser = argparse.ArgumentParser(
        description="Show score distribution of annotations per layer"
    )
    parser.add_argument(
        "--data-dir", type=str, required=True, help="Path to the dataset directory"
    )
    parser.add_argument("--split", type=str, default="test", help="Dataset split to use")
    parser.add_argument(
        "--layers",
        nargs="+",
        default=["labeled_spans", "binary_relations"],
        help="Annotation layers to use",
    )
    parser.add_argument(
        "--layer-captions", nargs="+", help="Captions for the figure traces per layer"
    )
    parser.add_argument("--layer-colors", nargs="+", help="Colors for the figure traces per layer")
    args = parser.parse_args()

    layers = args.layers
    # Validate the per-layer options first so a bad command line fails before
    # the (potentially slow) dataset loading.
    name_mapping, color_mapping = resolve_layer_options(
        layers, captions=args.layer_captions, colors=args.layer_colors
    )

    ds = DatasetDict.from_json(data_dir=args.data_dir)[args.split]
    scores_by_layer = collect_layer_scores(ds, layers)

    plottable = select_plottable_layers(scores_by_layer)
    skipped = [layer for layer in scores_by_layer if layer not in plottable]
    if skipped:
        print(
            f"WARNING: skipping layer(s) with fewer than {MIN_SCORES_PER_LAYER} "
            f"predicted annotations: {', '.join(skipped)}"
        )
    if not plottable:
        raise ValueError(
            f"No layer has at least {MIN_SCORES_PER_LAYER} predicted annotations "
            f"in split '{args.split}'; nothing to plot"
        )

    # Keep data, captions and colors aligned with the layers that are actually plotted.
    colors: Optional[List[str]] = None
    if color_mapping is not None:
        colors = [color_mapping[layer] for layer in plottable]

    fig = ff.create_distplot(
        [scores_by_layer[layer] for layer in plottable],
        group_labels=[name_mapping[layer] for layer in plottable],
        show_hist=True,
        colors=colors,
        bin_size=0.025,
    )
    fig.update_layout(
        height=600,
        width=800,
        title_text="Score Distribution per Annotation Layer",
        title_x=0.5,
        barmode="group",
        # Place the legend inside the plot area, in the top-left corner.
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    fig.show()
    print("done")


if __name__ == "__main__":
    main()
|
|