Spaces:

fair-forward
/

evals-for-every-language

Runtime error

File size: 5,166 Bytes

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.special import logit

df = pd.read_json("../results.json")

df = df[df["metric"] != "chrf"]
df = df.groupby(["task", "metric", "bcp_47"]).agg({"score": "mean"}).reset_index()


# Apply logit transformation to classification scores to reduce skewness
def transform_classification_scores(row):
    if row["task"] == "classification":
        # Avoid division by zero and infinite values by clipping
        score = np.clip(row["score"], 0.001, 0.999)
        # Apply logit transformation (log(p/(1-p)))
        return logit(score)
    else:
        return row["score"]


df["score"] = df.apply(transform_classification_scores, axis=1)

# Create a pivot table with tasks as columns and languages as rows
pivot_df = df.pivot_table(
    values="score", index="bcp_47", columns="task", aggfunc="mean"
)

# Sort and filter tasks
ordered_tasks = [
    "translation_from",
    "translation_to",
    "classification",
    "mmlu",
    "arc",
    "mgsm",
]
# Drop 'truthfulqa' if present and reindex columns
pivot_df = pivot_df[[task for task in ordered_tasks if task in pivot_df.columns]]

# Calculate correlation matrix
correlation_matrix = pivot_df.corr()

# Create the correlation plot
plt.figure(figsize=(8, 6))
# Create mask for upper triangle including diagonal to show only lower triangle
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Create a heatmap
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="Blues",
    center=0,
    square=True,
    mask=mask,
    cbar_kws={"shrink": 0.8},
    fmt=".3f",
)

plt.xlabel("Tasks", fontsize=12)
plt.ylabel("Tasks", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()

# Save the plot
plt.savefig("task_correlation_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

# Print correlation values for reference
print("Correlation Matrix:")
print("Note: Classification scores have been logit-transformed to reduce skewness")
print(correlation_matrix.round(3))

# Also create a scatter plot matrix for pairwise relationships with highlighted languages
highlighted_languages = ["en", "zh", "hi", "es", "ar"]


# Create color mapping
def get_color_and_label(lang_code):
    if lang_code in highlighted_languages:
        color_map = {
            "en": "red",
            "zh": "blue",
            "hi": "green",
            "es": "orange",
            "ar": "purple",
        }
        return color_map[lang_code], lang_code
    else:
        return "lightgray", "Other"


# Create custom scatter plot matrix
tasks = pivot_df.columns.tolist()
n_tasks = len(tasks)

fig, axes = plt.subplots(n_tasks, n_tasks, figsize=(15, 12))
fig.suptitle("Pairwise Task Performance", fontsize=16, fontweight="bold")

# Create legend elements
legend_elements = []
for lang in highlighted_languages:
    color, _ = get_color_and_label(lang)
    legend_elements.append(
        plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=color,
            markersize=8,
            label=lang,
        )
    )
legend_elements.append(
    plt.Line2D(
        [0],
        [0],
        marker="o",
        color="w",
        markerfacecolor="lightgray",
        markersize=8,
        label="Other",
    )
)

for i, task_y in enumerate(tasks):
    for j, task_x in enumerate(tasks):
        ax = axes[i, j]

        if i == j:
            # Diagonal: histogram
            task_data = pivot_df[task_y].dropna()
            colors = [get_color_and_label(lang)[0] for lang in task_data.index]
            ax.hist(task_data, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
            ax.set_title(f"{task_y}", fontsize=10)
        else:
            # Off-diagonal: scatter plot
            for lang_code in pivot_df.index:
                if pd.notna(pivot_df.loc[lang_code, task_x]) and pd.notna(
                    pivot_df.loc[lang_code, task_y]
                ):
                    color, _ = get_color_and_label(lang_code)
                    alpha = 0.8 if lang_code in highlighted_languages else 0.3
                    size = 50 if lang_code in highlighted_languages else 20
                    ax.scatter(
                        pivot_df.loc[lang_code, task_x],
                        pivot_df.loc[lang_code, task_y],
                        c=color,
                        alpha=alpha,
                        s=size,
                    )

        # Set labels
        if i == n_tasks - 1:
            ax.set_xlabel(task_x, fontsize=10)
        if j == 0:
            ax.set_ylabel(task_y, fontsize=10)

        # Remove tick labels except for edges
        if i != n_tasks - 1:
            ax.set_xticklabels([])
        if j != 0:
            ax.set_yticklabels([])

# Add legend
fig.legend(
    handles=legend_elements,
    loc="lower center",
    bbox_to_anchor=(0.5, -0.05),
    ncol=len(legend_elements),
    frameon=False,
    fontsize=10,
    handletextpad=0.5,
    columnspacing=1.0,
)

plt.tight_layout()
plt.savefig("task_scatter_matrix.png", dpi=300, bbox_inches="tight")
plt.show()