import torch
import torch.nn.functional as F
import medspacy
nlp = medspacy.load(medspacy_enable=["medspacy_pyrush", "medspacy_context"])

def sentence_split(text_list):
    """
    split sentences by medspacy
    """
    clean_text_list = [] 
    is_start_list = []

    for text in text_list:
        doc = nlp(text)
        is_start = 1
        for sent in doc.sents:
            sent = str(sent).strip()
            # skip sentences with no words
            if len(sent.split()) == 0:
                continue
            # skip very short fragments
            if len(sent) < 3:
                continue
            is_start_list.append(is_start)
            clean_text_list.append(sent)
            is_start = 0

    return clean_text_list, is_start_list
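
# Illustrative behavior of sentence_split (the exact boundaries depend on
# medspacy's pyrush splitter, so this is an assumed example, not a guaranteed output):
#   sentence_split(["No acute disease. Heart size is normal."])
#   -> (["No acute disease.", "Heart size is normal."], [1, 0])
# The second list marks, with a 1, the first kept sentence of each input text.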

def post_process(tokenized_text, predicted_entities, tokenizer):
    entity_spans = []
    start = end = None
    entity_type = None

    for i, (token, label) in enumerate(zip(tokenized_text, predicted_entities[:len(tokenized_text)])):
        if token in ["[CLS]", "[SEP]"]:
            continue
        if label != "O" and i < len(predicted_entities) - 1:
            if label.startswith("B-") and predicted_entities[i+1].startswith("I-"):
                start = i
                entity_type = label[2:]
            elif label.startswith("B-") and predicted_entities[i+1].startswith("B-"):
                start = i
                end = i
                entity_spans.append((start, end, label[2:]))
                start = i
                entity_type = label[2:]
            elif label.startswith("B-") and predicted_entities[i+1].startswith("O"):
                start = i
                end = i
                entity_spans.append((start, end, label[2:]))
                start = end = None
                entity_type = None
            elif label.startswith("I-") and predicted_entities[i+1].startswith("B-"):
                end = i
                if start is not None:
                    entity_spans.append((start, end, entity_type))
                start = i
                entity_type = label[2:]
            elif label.startswith("I-") and predicted_entities[i+1].startswith("O"):
                end = i
                if start is not None:
                    entity_spans.append((start, end, entity_type))
                start = end = None
                entity_type = None

    # handle the last entity if it was left open
    if start is not None and end is None:
        end = len(tokenized_text) - 2
        entity_spans.append((start, end, entity_type))

    # collect the decoded (entity string, type) pairs
    save_pair = []
    for start, end, entity_type in entity_spans:
        entity_str = tokenizer.convert_tokens_to_string(tokenized_text[start:end+1])
        # print(f"entity: {entity_str}, type: {entity_type}")
        save_pair.append((entity_str, entity_type))

    return save_pair
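
# Assumed walk-through of the BIO decoding above (tokens and labels are made up,
# and `tokenizer` is any BERT-style tokenizer providing convert_tokens_to_string):
#   tokens = ["[CLS]", "pleural", "effusion", "[SEP]"]
#   labels = ["O", "B-ABNORMALITY", "I-ABNORMALITY", "O"]
#   post_process(tokens, labels, tokenizer) -> [("pleural effusion", "ABNORMALITY")]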


def topk_similarity(embeddings1, embeddings2, k=1):
    """
    Compute the top-k similarity between two sets of embeddings using PyTorch.
    """

    ### Normalize the embeddings to use cosine similarity
    embeddings1 = F.normalize(embeddings1, p=2, dim=1)
    embeddings2 = F.normalize(embeddings2, p=2, dim=1)
    
    topk_values = []
    topk_indices = []

    ### Iterate over each embedding in the first set
    for emb1 in embeddings1:
        
        ### Calculate cosine similarity between this embedding and all embeddings in the second set
        similarities = torch.matmul(embeddings2, emb1)

        ### Find the top-k highest similarity values
        values, indices = torch.topk(similarities, k, largest=True)

        ### Keep only the single best match (top-1) for each embedding
        topk_values.append(values[0])
        topk_indices.append(indices[0])

    return topk_indices, topk_values
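
# Note: because both sets are L2-normalized above, the per-row loop is equivalent to
# a single batched call; an illustrative vectorized form (an assumption, not the code
# path used here) would be:
#   values, indices = torch.topk(embeddings1 @ embeddings2.T, k, dim=1)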

def compute(gt_embeds_word, pred_embeds_word, gt_types, pred_types, weight_matrix):
    neg_class = [('NON-DISEASE', 'DISEASE'),
                 ('NON-ABNORMALITY', 'ABNORMALITY'),
                 ('DISEASE', 'NON-DISEASE'),
                 ('ABNORMALITY', 'NON-ABNORMALITY'),
                 ('NON-DISEASE', 'ABNORMALITY'),
                 ('NON-ABNORMALITY', 'DISEASE'),
                 ('DISEASE', 'NON-ABNORMALITY'),
                 ('ABNORMALITY', 'NON-DISEASE')]
    neg_weight = weight_matrix[("NEG", "WEIGHT")]
    topk_indices, topk_values = topk_similarity(gt_embeds_word, pred_embeds_word, k=1)   

    
    # move the top-1 indices and similarity values off the GPU to Python scalars
    for i in range(len(topk_indices)):
        topk_indices[i] = topk_indices[i].cpu().numpy().tolist()
        topk_values[i] = topk_values[i].cpu().numpy().tolist()
        
    # map each ground-truth entity to the type of its best-matching predicted entity
    topk_map = [pred_types[i] for i in topk_indices]
    
    # per-pair importance weight, and a penalty factor for contradictory type pairs
    weight_score = [weight_matrix[(gt_type, pred_type)] for gt_type, pred_type in zip(gt_types, topk_map)]
    type_score = [neg_weight if (gt_type, pred_type) in neg_class else 1 for gt_type, pred_type in zip(gt_types, topk_map)]
    
    # weighted average of similarity scores, with contradictory pairs penalized
    weighted_avg_score = 0
    weighted_sum = 0
    for score, weight, neg_factor in zip(topk_values, weight_score, type_score):
        weighted_avg_score += score * weight * neg_factor
        weighted_sum += weight
    if weighted_sum != 0:
        RaTE = weighted_avg_score / weighted_sum
    else:
        RaTE = 0
    
    return RaTE
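

if __name__ == "__main__":
    # Minimal sketch of how `compute` can be called. The embeddings, entity types,
    # and weight matrix below are synthetic placeholders (assumptions for the demo);
    # in the real pipeline they would come from an NER model and a text encoder.
    torch.manual_seed(0)
    gt_embeds_word = torch.randn(3, 8)    # 3 ground-truth entity embeddings
    pred_embeds_word = torch.randn(4, 8)  # 4 predicted entity embeddings
    gt_types = ["DISEASE", "ABNORMALITY", "NON-DISEASE"]
    pred_types = ["DISEASE", "NON-ABNORMALITY", "ABNORMALITY", "DISEASE"]

    # Assumed weight matrix: weight 1.0 for every (gt, pred) type pair, plus the
    # ("NEG", "WEIGHT") entry used as the penalty factor for contradictory pairs.
    all_types = ["DISEASE", "NON-DISEASE", "ABNORMALITY", "NON-ABNORMALITY"]
    weight_matrix = {(g, p): 1.0 for g in all_types for p in all_types}
    weight_matrix[("NEG", "WEIGHT")] = -1.0

    print("RaTE:", compute(gt_embeds_word, pred_embeds_word, gt_types, pred_types, weight_matrix))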