RadEval / factual /RadCliQv1 /semb_score.py
X-iZhang's picture
initial
bad8293 verified
from typing import Sequence, Union
import numpy as np
import torch
from factual.f1chexbert import F1CheXbert
def semantic_embedding_scores(
refs: Sequence[str],
hyps: Sequence[str],
*,
device: Union[str, torch.device] = "cpu",
) -> np.ndarray:
"""Return per‑pair cosine similarities between `refs` and `hyps`.
All heavy math is vectorised; no Python loops.
Args:
refs: Iterable of ground‑truth report strings.
hyps: Iterable of predicted report strings (must match `refs` length).
device: Computation device (e.g. "cpu", "cuda", "cuda:0").
Returns
-------
np.ndarray
Shape ``(N,)`` – cosine similarity for each pair, where
``N == len(refs) == len(hyps)``.
Raises
------
ValueError
If `refs` and `hyps` are of different lengths.
"""
if len(refs) != len(hyps):
raise ValueError(f"refs ({len(refs)}) and hyps ({len(hyps)}) differ in length")
labeler = F1CheXbert(device=device)
# Stack embeddings into (N, dim) matrices
gt_embeds = np.vstack(labeler.get_embeddings(refs)) # (N, dim)
pred_embeds = np.vstack(labeler.get_embeddings(hyps)) # (N, dim)
# Cosine similarity – fully vectorised
dot = np.einsum("nd,nd->n", gt_embeds, pred_embeds)
norms = np.linalg.norm(gt_embeds, axis=1) * np.linalg.norm(pred_embeds, axis=1)
with np.errstate(divide="ignore", invalid="ignore"):
sims = np.where(norms > 0, dot / norms, 0.0)
return sims
def mean_semantic_score(scores: np.ndarray) -> float:
"""Convenience helper: mean of an array of scores."""
return float(scores.mean())
if __name__ == "__main__":
_refs = [
"No evidence of pneumothorax following chest tube removal.",
"There is a left pleural effusion.",
"No evidence of pneumothorax following chest tube removal.",
]
_hyps = [
"No pneumothorax detected.",
"Left pleural effusion is present.",
"Left pleural effusion is present.",
]
_scores = semantic_embedding_scores(_refs, _hyps, device="cpu")
print("Per‑pair cosine:", _scores)
print("Mean:", mean_semantic_score(_scores))