import torch
import torch.nn.functional as F
import medspacy

# Load medspacy with the PyRuSH sentence splitter and the ConText component
nlp = medspacy.load(medspacy_enable=["medspacy_pyrush", "medspacy_context"])

def sentence_split(text_list):
    """
    Split each report in text_list into sentences with medspacy.
    Returns the cleaned sentences plus a parallel list of flags marking
    whether each sentence starts a new report.
    """
    clean_text_list = []
    is_start_list = []
    for text in text_list:
        doc = nlp(text)
        is_start = 1
        for sent in doc.sents:
            sent = str(sent).strip()
            # Skip sentences with no words or fewer than 3 characters
            if len(sent.split()) == 0:
                continue
            if len(sent) < 3:
                continue
            is_start_list.append(is_start)
            clean_text_list.append(sent)
            is_start = 0
    return clean_text_list, is_start_list
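
# Usage sketch (illustrative; the exact split depends on medspacy's PyRuSH
# rules, and the sample report text below is made up):
#   >>> sents, starts = sentence_split(["No pleural effusion. Heart size is normal."])
#   >>> sents   # -> ['No pleural effusion.', 'Heart size is normal.']
#   >>> starts  # -> [1, 0]  (1 marks the first sentence of each report)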

def post_process(tokenized_text, predicted_entities, tokenizer):
    """
    Merge BIO labels into (entity string, entity type) pairs.
    """
    entity_spans = []
    start = end = None
    entity_type = None
    for i, (token, label) in enumerate(zip(tokenized_text, predicted_entities[:len(tokenized_text)])):
        if token in ["[CLS]", "[SEP]"]:
            continue
        if label != "O" and i < len(predicted_entities) - 1:
            if label.startswith("B-") and predicted_entities[i+1].startswith("I-"):
                # Entity starts here and continues on the next token
                start = i
                entity_type = label[2:]
            elif label.startswith("B-") and predicted_entities[i+1].startswith("B-"):
                # Single-token entity, immediately followed by a new entity
                start = i
                end = i
                entity_spans.append((start, end, label[2:]))
                start = i
                entity_type = label[2:]
            elif label.startswith("B-") and predicted_entities[i+1].startswith("O"):
                # Single-token entity
                start = i
                end = i
                entity_spans.append((start, end, label[2:]))
                start = end = None
                entity_type = None
            elif label.startswith("I-") and predicted_entities[i+1].startswith("B-"):
                # Entity ends here; a new entity begins on the next token
                end = i
                if start is not None:
                    entity_spans.append((start, end, entity_type))
                start = i
                entity_type = label[2:]
            elif label.startswith("I-") and predicted_entities[i+1].startswith("O"):
                # Entity ends here
                end = i
                if start is not None:
                    entity_spans.append((start, end, entity_type))
                start = end = None
                entity_type = None
    # Handle an entity that is still open at the end of the sequence
    if start is not None and end is None:
        end = len(tokenized_text) - 2  # last token before [SEP]
        entity_spans.append((start, end, entity_type))
    # Collect the results as (entity string, entity type) pairs
    save_pair = []
    for start, end, entity_type in entity_spans:
        entity_str = tokenizer.convert_tokens_to_string(tokenized_text[start:end+1])
        # print(f"Entity: {entity_str}, type: {entity_type}")
        save_pair.append((entity_str, entity_type))
    return save_pair
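
# Usage sketch, assuming a HuggingFace-style tokenizer that provides
# convert_tokens_to_string; the tokens and BIO labels below are made up:
#   tokens = ["[CLS]", "pleural", "effusion", "noted", "[SEP]"]
#   labels = ["O", "B-ABNORMALITY", "I-ABNORMALITY", "O", "O"]
#   post_process(tokens, labels, tokenizer)
#   # -> [('pleural effusion', 'ABNORMALITY')]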

def topk_similarity(embeddings1, embeddings2, k=1):
    """
    Compute the top-k similarity between two sets of embeddings using PyTorch.
    """
    # Normalize the embeddings to use cosine similarity
    embeddings1 = F.normalize(embeddings1, p=2, dim=1)
    embeddings2 = F.normalize(embeddings2, p=2, dim=1)
    topk_values = []
    topk_indices = []
    # Iterate over each embedding in the first set
    for emb1 in embeddings1:
        # Cosine similarity between this embedding and all embeddings in the second set
        similarities = torch.matmul(embeddings2, emb1)
        # Find the top-k highest similarity values, keeping only the best match
        values, indices = torch.topk(similarities, k, largest=True)
        topk_values.append(values[0])
        topk_indices.append(indices[0])
    return topk_indices, topk_values
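
# Usage sketch with one-hot rows, so the best matches are easy to verify:
#   >>> a = torch.tensor([[1., 0., 0.], [0., 1., 0.]])
#   >>> b = torch.tensor([[0., 1., 0.], [1., 0., 0.], [0., 0., 1.]])
#   >>> idx, vals = topk_similarity(a, b, k=1)
#   >>> [i.item() for i in idx]   # -> [1, 0]
#   >>> [v.item() for v in vals]  # -> [1.0, 1.0]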

def compute(gt_embeds_word, pred_embeds_word, gt_types, pred_types, weight_matrix):
    # Type pairs in which the ground truth and the prediction contradict each other
    neg_class = [('NON-DISEASE', 'DISEASE'),
                 ('NON-ABNORMALITY', 'ABNORMALITY'),
                 ('DISEASE', 'NON-DISEASE'),
                 ('ABNORMALITY', 'NON-ABNORMALITY'),
                 ('NON-DISEASE', 'ABNORMALITY'),
                 ('NON-ABNORMALITY', 'DISEASE'),
                 ('DISEASE', 'NON-ABNORMALITY'),
                 ('ABNORMALITY', 'NON-DISEASE')]
    neg_weight = weight_matrix[("NEG", "WEIGHT")]
    # Match every ground-truth entity to its most similar predicted entity
    topk_indices, topk_values = topk_similarity(gt_embeds_word, pred_embeds_word, k=1)
    for i in range(len(topk_indices)):
        topk_indices[i] = topk_indices[i].cpu().numpy().tolist()
        topk_values[i] = topk_values[i].cpu().numpy().tolist()
    # Map the matched indices to predicted entity types
    topk_map = [pred_types[i] for i in topk_indices]
    weight_score = [weight_matrix[(gt_type, pred_type)] for gt_type, pred_type in zip(gt_types, topk_map)]
    # Penalize matches whose types contradict each other
    type_score = [neg_weight if (gt_type, pred_type) in neg_class else 1 for gt_type, pred_type in zip(gt_types, topk_map)]
    weighted_avg_score = 0
    weighted_sum = 0
    for score, weight, penalty in zip(topk_values, weight_score, type_score):
        weighted_avg_score += score * weight * penalty
        weighted_sum += weight
    if weighted_sum != 0:
        RaTE = weighted_avg_score / weighted_sum
    else:
        RaTE = 0
    return RaTE
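
if __name__ == "__main__":
    # Minimal end-to-end sketch of the scoring step on synthetic data. The
    # embeddings are random and the weight_matrix below is a made-up
    # placeholder (every type pair weighted 1.0, contradictions penalized
    # by -1.0), not the calibrated weights used by the actual metric.
    torch.manual_seed(0)
    gt_embeds = torch.randn(2, 8)    # 2 ground-truth entities, 8-dim embeddings
    pred_embeds = torch.randn(3, 8)  # 3 predicted entities
    gt_types = ["DISEASE", "ABNORMALITY"]
    pred_types = ["DISEASE", "NON-DISEASE", "ABNORMALITY"]
    all_types = ["DISEASE", "NON-DISEASE", "ABNORMALITY", "NON-ABNORMALITY"]
    weight_matrix = {(g, p): 1.0 for g in all_types for p in all_types}
    weight_matrix[("NEG", "WEIGHT")] = -1.0
    print("RaTE:", compute(gt_embeds, pred_embeds, gt_types, pred_types, weight_matrix))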