import torch
import torch.nn.functional as F
import medspacy

# Load medspaCy with the PyRuSH sentence splitter and ConText components.
nlp = medspacy.load(medspacy_enable=["medspacy_pyrush", "medspacy_context"])
def sentence_split(text_list):
    """
    Split each text into sentences with medspaCy.

    Returns a flat list of cleaned sentences and a parallel list of flags
    marking the first sentence of each input text.
    """
    clean_text_list = []
    is_start_list = []
    for text in text_list:
        doc = nlp(text)
        is_start = 1
        for sent in doc.sents:
            sent = str(sent).strip()
            # Skip sentences with no words or fewer than 3 characters.
            if len(sent.split()) == 0:
                continue
            if len(sent) < 3:
                continue
            is_start_list.append(is_start)
            clean_text_list.append(sent)
            is_start = 0
    return clean_text_list, is_start_list
def post_process(tokenized_text, predicted_entities, tokenizer):
    """Convert a BIO label sequence into (entity string, entity type) pairs."""
    entity_spans = []
    start = end = None
    entity_type = None
    for i, (token, label) in enumerate(zip(tokenized_text, predicted_entities[:len(tokenized_text)])):
        if token in ["[CLS]", "[SEP]"]:
            continue
        if label != "O" and i < len(predicted_entities) - 1:
            if label.startswith("B-") and predicted_entities[i + 1].startswith("I-"):
                # Entity begins here and continues on the next token.
                start = i
                entity_type = label[2:]
            elif label.startswith("B-") and predicted_entities[i + 1].startswith("B-"):
                # Single-token entity immediately followed by another entity.
                start = end = i
                entity_spans.append((start, end, label[2:]))
                start = i
                entity_type = label[2:]
            elif label.startswith("B-") and predicted_entities[i + 1].startswith("O"):
                # Single-token entity followed by a non-entity token.
                start = end = i
                entity_spans.append((start, end, label[2:]))
                start = end = None
                entity_type = None
            elif label.startswith("I-") and predicted_entities[i + 1].startswith("B-"):
                # Entity ends here; a new entity begins on the next token.
                end = i
                if start is not None:
                    entity_spans.append((start, end, entity_type))
                start = i
                entity_type = label[2:]
            elif label.startswith("I-") and predicted_entities[i + 1].startswith("O"):
                # Entity ends here.
                end = i
                if start is not None:
                    entity_spans.append((start, end, entity_type))
                start = end = None
                entity_type = None
    # Handle an entity that runs to the end of the sequence.
    if start is not None and end is None:
        end = len(tokenized_text) - 2  # exclude the trailing [SEP]
        entity_spans.append((start, end, entity_type))
    # Collect the results as (entity string, entity type) pairs.
    save_pair = []
    for start, end, entity_type in entity_spans:
        entity_str = tokenizer.convert_tokens_to_string(tokenized_text[start:end + 1])
        # print(f"entity: {entity_str}, type: {entity_type}")
        save_pair.append((entity_str, entity_type))
    return save_pair
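
# Hedged usage sketch for post_process(). The tokens, BIO labels, and the
# stub tokenizer below are illustrative assumptions; a real run would use a
# HuggingFace tokenizer and the label output of an NER model instead.
class _StubTokenizer:
    """Minimal stand-in exposing only convert_tokens_to_string."""
    def convert_tokens_to_string(self, tokens):
        return " ".join(tokens)

def _demo_post_process():
    tokens = ["[CLS]", "pleural", "effusion", "noted", "[SEP]"]
    labels = ["O", "B-ABNORMALITY", "I-ABNORMALITY", "O", "O"]
    # Expected result: [("pleural effusion", "ABNORMALITY")]
    return post_process(tokens, labels, _StubTokenizer())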
def topk_similarity(embeddings1, embeddings2, k=1):
    """
    Compute the top-k cosine similarity between two sets of embeddings using
    PyTorch. Only the single best match per embedding is kept, even for k > 1.
    """
    # Normalize the embeddings so the dot product equals cosine similarity.
    embeddings1 = F.normalize(embeddings1, p=2, dim=1)
    embeddings2 = F.normalize(embeddings2, p=2, dim=1)
    topk_values = []
    topk_indices = []
    # Iterate over each embedding in the first set.
    for emb1 in embeddings1:
        # Cosine similarity between this embedding and all embeddings
        # in the second set.
        similarities = torch.matmul(embeddings2, emb1)
        # Find the top-k highest similarity values and keep the best one.
        values, indices = torch.topk(similarities, k, largest=True)
        topk_values.append(values[0])
        topk_indices.append(indices[0])
    return topk_indices, topk_values
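
# Hedged sketch of topk_similarity() on toy data: each ground-truth embedding
# is matched to its most similar prediction embedding (top-1 only).
def _demo_topk_similarity():
    a = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
    b = torch.tensor([[0.9, 0.1], [0.1, 0.9], [1.0, 1.0]])
    idx, val = topk_similarity(a, b, k=1)
    return idx, val  # idx == [tensor(0), tensor(1)]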
def compute(gt_embeds_word, pred_embeds_word, gt_types, pred_types, weight_matrix):
    # Type pairs to penalise: a negated mention matched against an affirmed
    # one, or vice versa.
    neg_class = [('NON-DISEASE', 'DISEASE'),
                 ('NON-ABNORMALITY', 'ABNORMALITY'),
                 ('DISEASE', 'NON-DISEASE'),
                 ('ABNORMALITY', 'NON-ABNORMALITY'),
                 ('NON-DISEASE', 'ABNORMALITY'),
                 ('NON-ABNORMALITY', 'DISEASE'),
                 ('DISEASE', 'NON-ABNORMALITY'),
                 ('ABNORMALITY', 'NON-DISEASE')]
    neg_weight = weight_matrix[("NEG", "WEIGHT")]
    topk_indices, topk_values = topk_similarity(gt_embeds_word, pred_embeds_word, k=1)
    for i in range(len(topk_indices)):
        topk_indices[i] = topk_indices[i].cpu().numpy().tolist()
        topk_values[i] = topk_values[i].cpu().numpy().tolist()
    # Map each matched index to its predicted entity type.
    topk_map = [pred_types[i] for i in topk_indices]
    weight_score = [weight_matrix[(gt_type, pred_type)] for gt_type, pred_type in zip(gt_types, topk_map)]
    type_score = [neg_weight if (gt_type, pred_type) in neg_class else 1 for gt_type, pred_type in zip(gt_types, topk_map)]
    weighted_avg_score = 0
    weighted_sum = 0
    for score, weight, neg_factor in zip(topk_values, weight_score, type_score):
        weighted_avg_score += score * weight * neg_factor
        weighted_sum += weight
    if weighted_sum != 0:
        RaTE = weighted_avg_score / weighted_sum
    else:
        RaTE = 0
    return RaTE
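
if __name__ == "__main__":
    # Minimal smoke test; a hedged sketch, not part of the original pipeline.
    # The sample report, dummy embeddings, and weight_matrix keys below are
    # assumptions chosen only so every lookup in compute() resolves.
    sents, starts = sentence_split(["No acute findings. Heart size is normal."])
    print(sents, starts)

    print(_demo_post_process())
    print(_demo_topk_similarity())

    gt = torch.randn(3, 8)
    pred = torch.randn(4, 8)
    weight_matrix = {("NEG", "WEIGHT"): 0.5,
                     ("DISEASE", "DISEASE"): 1.0,
                     ("DISEASE", "NON-DISEASE"): 1.0}
    score = compute(gt, pred,
                    ["DISEASE"] * 3,
                    ["DISEASE", "DISEASE", "NON-DISEASE", "DISEASE"],
                    weight_matrix)
    print("RaTE:", score)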