import torch
import torch.nn.functional as F
import medspacy
nlp = medspacy.load(medspacy_enable=["medspacy_pyrush", "medspacy_context"])

def sentence_split(text_list):
    """
    split sentences by medspacy
    """
    clean_text_list = [] 
    is_start_list = []

    for text in text_list:
        doc = nlp(text)
        is_start = 1
        for sent in doc.sents:
            sent = str(sent).strip()
            # skip sentences with no words
            if len(sent.split()) == 0:
                continue
            # skip very short fragments
            if len(sent) < 3:
                continue
            is_start_list.append(is_start)
            clean_text_list.append(sent)
            is_start = 0

    return clean_text_list, is_start_list
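
# Illustrative behavior of sentence_split (the exact boundaries depend on
# medspacy's pyrush splitter, so this is an assumed example, not a guaranteed output):
#   sentence_split(["No acute disease. Heart size is normal."])
#   -> (["No acute disease.", "Heart size is normal."], [1, 0])
# The second list marks, with a 1, the first kept sentence of each input text.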

def post_process(tokenized_text, predicted_entities, tokenizer):
    entity_spans = []
    start = end = None
    entity_type = None

    for i, (token, label) in enumerate(zip(tokenized_text, predicted_entities[:len(tokenized_text)])):
        if token in ["[CLS]", "[SEP]"]:
            continue
        if label != "O" and i < len(predicted_entities) - 1:
            if label.startswith("B-") and predicted_entities[i+1].startswith("I-"):
                start = i
                entity_type = label[2:]
            elif label.startswith("B-") and predicted_entities[i+1].startswith("B-"):
                start = i
                end = i
                entity_spans.append((start, end, label[2:]))
                start = i
                entity_type = label[2:]
            elif label.startswith("B-") and predicted_entities[i+1].startswith("O"):
                start = i
                end = i
                entity_spans.append((start, end, label[2:]))
                start = end = None
                entity_type = None
            elif label.startswith("I-") and predicted_entities[i+1].startswith("B-"):
                end = i
                if start is not None:
                    entity_spans.append((start, end, entity_type))
                start = i
                entity_type = label[2:]
            elif label.startswith("I-") and predicted_entities[i+1].startswith("O"):
                end = i
                if start is not None:
                    entity_spans.append((start, end, entity_type))
                start = end = None
                entity_type = None

    # handle the last entity if it was left open
    if start is not None and end is None:
        end = len(tokenized_text) - 2
        entity_spans.append((start, end, entity_type))

    # collect the decoded (entity string, type) pairs
    save_pair = []
    for start, end, entity_type in entity_spans:
        entity_str = tokenizer.convert_tokens_to_string(tokenized_text[start:end+1])
        # print(f"entity: {entity_str}, type: {entity_type}")
        save_pair.append((entity_str, entity_type))

    return save_pair
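
# Assumed walk-through of the BIO decoding above (tokens and labels are made up,
# and `tokenizer` is any BERT-style tokenizer providing convert_tokens_to_string):
#   tokens = ["[CLS]", "pleural", "effusion", "[SEP]"]
#   labels = ["O", "B-ABNORMALITY", "I-ABNORMALITY", "O"]
#   post_process(tokens, labels, tokenizer) -> [("pleural effusion", "ABNORMALITY")]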


def topk_similarity(embeddings1, embeddings2, k=1):
    """
    Compute the top-k similarity between two sets of embeddings using PyTorch.
    """

    ### Normalize the embeddings to use cosine similarity
    embeddings1 = F.normalize(embeddings1, p=2, dim=1)
    embeddings2 = F.normalize(embeddings2, p=2, dim=1)
    
    topk_values = []
    topk_indices = []

    ### Iterate over each embedding in the first set
    for emb1 in embeddings1:
        
        ### Calculate cosine similarity between this embedding and all embeddings in the second set
        similarities = torch.matmul(embeddings2, emb1)

        ### Find the top-k highest similarity values
        values, indices = torch.topk(similarities, k, largest=True)

        ### Keep only the single best match (top-1) for each embedding
        topk_values.append(values[0])
        topk_indices.append(indices[0])

    return topk_indices, topk_values
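
# Note: because both sets are L2-normalized above, the per-row loop is equivalent to
# a single batched call; an illustrative vectorized form (an assumption, not the code
# path used here) would be:
#   values, indices = torch.topk(embeddings1 @ embeddings2.T, k, dim=1)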

def compute(gt_embeds_word, pred_embeds_word, gt_types, pred_types, weight_matrix):
    neg_class = [('NON-DISEASE', 'DISEASE'),
                 ('NON-ABNORMALITY', 'ABNORMALITY'),
                 ('DISEASE', 'NON-DISEASE'),
                 ('ABNORMALITY', 'NON-ABNORMALITY'),
                 ('NON-DISEASE', 'ABNORMALITY'),
                 ('NON-ABNORMALITY', 'DISEASE'),
                 ('DISEASE', 'NON-ABNORMALITY'),
                 ('ABNORMALITY', 'NON-DISEASE')]
    neg_weight = weight_matrix[("NEG", "WEIGHT")]
    topk_indices, topk_values = topk_similarity(gt_embeds_word, pred_embeds_word, k=1)   

    
    # move the top-1 indices and similarity values off the GPU to Python scalars
    for i in range(len(topk_indices)):
        topk_indices[i] = topk_indices[i].cpu().numpy().tolist()
        topk_values[i] = topk_values[i].cpu().numpy().tolist()
        
    # map each ground-truth entity to the type of its best-matching predicted entity
    topk_map = [pred_types[i] for i in topk_indices]
    
    # per-pair importance weight, and a penalty factor for contradictory type pairs
    weight_score = [weight_matrix[(gt_type, pred_type)] for gt_type, pred_type in zip(gt_types, topk_map)]
    type_score = [neg_weight if (gt_type, pred_type) in neg_class else 1 for gt_type, pred_type in zip(gt_types, topk_map)]
    
    # weighted average of similarity scores, with contradictory pairs penalized
    weighted_avg_score = 0
    weighted_sum = 0
    for score, weight, neg_factor in zip(topk_values, weight_score, type_score):
        weighted_avg_score += score * weight * neg_factor
        weighted_sum += weight
    if weighted_sum != 0:
        RaTE = weighted_avg_score / weighted_sum
    else:
        RaTE = 0
    
    return RaTE
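

if __name__ == "__main__":
    # Minimal sketch of how `compute` can be called. The embeddings, entity types,
    # and weight matrix below are synthetic placeholders (assumptions for the demo);
    # in the real pipeline they would come from an NER model and a text encoder.
    torch.manual_seed(0)
    gt_embeds_word = torch.randn(3, 8)    # 3 ground-truth entity embeddings
    pred_embeds_word = torch.randn(4, 8)  # 4 predicted entity embeddings
    gt_types = ["DISEASE", "ABNORMALITY", "NON-DISEASE"]
    pred_types = ["DISEASE", "NON-ABNORMALITY", "ABNORMALITY", "DISEASE"]

    # Assumed weight matrix: weight 1.0 for every (gt, pred) type pair, plus the
    # ("NEG", "WEIGHT") entry used as the penalty factor for contradictory pairs.
    all_types = ["DISEASE", "NON-DISEASE", "ABNORMALITY", "NON-ABNORMALITY"]
    weight_matrix = {(g, p): 1.0 for g in all_types for p in all_types}
    weight_matrix[("NEG", "WEIGHT")] = -1.0

    print("RaTE:", compute(gt_embeds_word, pred_embeds_word, gt_types, pred_types, weight_matrix))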