|
import torch |
|
from bert_score import score |
|
|
|
def _get_default_device():
    """Return the torch device to use by default: CUDA when available, else CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
|
|
|
class RadEvalBERTScorer:
    """Wrapper around bert_score for radiology reports using a custom BERT model.

    The constructor only stores the configuration; each call to ``score``
    forwards that configuration to ``bert_score.score``.
    """

    def __init__(self,
                 model_type: str = "IAMJB/RadEvalModernBERT",
                 num_layers: int = None,
                 use_fast_tokenizer: bool = True,
                 rescale_with_baseline: bool = False,
                 device: torch.device = None):
        """Store the scoring configuration.

        Args:
            model_type: Model identifier forwarded to ``bert_score.score``.
            num_layers: ``num_layers`` value forwarded to ``bert_score.score``
                (``None`` is forwarded unchanged).
            use_fast_tokenizer: Forwarded to ``bert_score.score``.
            rescale_with_baseline: Forwarded to ``bert_score.score``.
            device: Device forwarded to ``bert_score.score``. When ``None``,
                CUDA is selected if available, otherwise the CPU.
        """
        self.model_type = model_type
        self.num_layers = num_layers
        self.use_fast_tokenizer = use_fast_tokenizer
        self.rescale_with_baseline = rescale_with_baseline
        # Compare against None explicitly: a falsy-but-valid device such as
        # the CUDA index 0 must not be replaced by the default.
        self.device = device if device is not None else _get_default_device()

    def score(self, refs: list[str], hyps: list[str]) -> tuple[float, torch.Tensor]:
        """Compute BERTScore F1 between reference and hypothesis texts.

        Args:
            refs: list of reference sentences.
            hyps: list of hypothesis sentences (predictions).

        Returns:
            A ``(mean_f1, f1)`` tuple: the mean F1 as a Python float, followed
            by the F1 tensor returned by ``bert_score.score`` (per the
            bert_score documentation, one value per hypothesis/reference pair).
        """
        # bert_score also returns precision and recall; only F1 is reported.
        _, _, f1 = score(
            cands=hyps,
            refs=refs,
            model_type=self.model_type,
            num_layers=self.num_layers,
            use_fast_tokenizer=self.use_fast_tokenizer,
            rescale_with_baseline=self.rescale_with_baseline,
            device=self.device,
        )

        return f1.mean().item(), f1
|
|
|
if __name__ == "__main__":
    # Smoke test: score one hypothesis against one reference and print the mean F1.
    refs = ["Chronic mild to moderate cardiomegaly and pulmonary venous hypertension."]
    hyps = ["Mild left basal atelectasis; no pneumonia."]

    # The scorer class defined in this file is RadEvalBERTScorer.
    scorer = RadEvalBERTScorer(num_layers=23)

    # score() returns (mean F1, F1 tensor); only the mean is printed here.
    mean_f1, _ = scorer.score(refs, hyps)
    print(f"Mean F1 score: {mean_f1:.4f}")
|
|