from typing import Sequence, Union import numpy as np import torch from factual.f1chexbert import F1CheXbert def semantic_embedding_scores( refs: Sequence[str], hyps: Sequence[str], *, device: Union[str, torch.device] = "cpu", ) -> np.ndarray: """Return per‑pair cosine similarities between `refs` and `hyps`. All heavy math is vectorised; no Python loops. Args: refs: Iterable of ground‑truth report strings. hyps: Iterable of predicted report strings (must match `refs` length). device: Computation device (e.g. "cpu", "cuda", "cuda:0"). Returns ------- np.ndarray Shape ``(N,)`` – cosine similarity for each pair, where ``N == len(refs) == len(hyps)``. Raises ------ ValueError If `refs` and `hyps` are of different lengths. """ if len(refs) != len(hyps): raise ValueError(f"refs ({len(refs)}) and hyps ({len(hyps)}) differ in length") labeler = F1CheXbert(device=device) # Stack embeddings into (N, dim) matrices gt_embeds = np.vstack(labeler.get_embeddings(refs)) # (N, dim) pred_embeds = np.vstack(labeler.get_embeddings(hyps)) # (N, dim) # Cosine similarity – fully vectorised dot = np.einsum("nd,nd->n", gt_embeds, pred_embeds) norms = np.linalg.norm(gt_embeds, axis=1) * np.linalg.norm(pred_embeds, axis=1) with np.errstate(divide="ignore", invalid="ignore"): sims = np.where(norms > 0, dot / norms, 0.0) return sims def mean_semantic_score(scores: np.ndarray) -> float: """Convenience helper: mean of an array of scores.""" return float(scores.mean()) if __name__ == "__main__": _refs = [ "No evidence of pneumothorax following chest tube removal.", "There is a left pleural effusion.", "No evidence of pneumothorax following chest tube removal.", ] _hyps = [ "No pneumothorax detected.", "Left pleural effusion is present.", "Left pleural effusion is present.", ] _scores = semantic_embedding_scores(_refs, _hyps, device="cpu") print("Per‑pair cosine:", _scores) print("Mean:", mean_semantic_score(_scores))