import torch
import torch.nn.functional as F
import medspacy

# Load medspaCy with the PyRuSH sentence splitter and ConText components.
nlp = medspacy.load(medspacy_enable=["medspacy_pyrush", "medspacy_context"])
def sentence_split(text_list):
    """
    Split each text into sentences with medspaCy.

    Returns a flat list of cleaned sentences and a parallel list of flags
    marking the first sentence of each input text.
    """
    clean_text_list = []
    is_start_list = []
    for text in text_list:
        doc = nlp(text)
        is_start = 1
        for sent in doc.sents:
            sent = str(sent).strip()
            # Skip sentences with no words or fewer than 3 characters.
            if len(sent.split()) == 0:
                continue
            if len(sent) < 3:
                continue
            is_start_list.append(is_start)
            clean_text_list.append(sent)
            is_start = 0
    return clean_text_list, is_start_list
def post_process(tokenized_text, predicted_entities, tokenizer):
    """Convert a BIO label sequence into (entity string, entity type) pairs."""
    entity_spans = []
    start = end = None
    entity_type = None
    for i, (token, label) in enumerate(zip(tokenized_text, predicted_entities[:len(tokenized_text)])):
        if token in ["[CLS]", "[SEP]"]:
            continue
        if label != "O" and i < len(predicted_entities) - 1:
            if label.startswith("B-") and predicted_entities[i + 1].startswith("I-"):
                # Entity begins here and continues on the next token.
                start = i
                entity_type = label[2:]
            elif label.startswith("B-") and predicted_entities[i + 1].startswith("B-"):
                # Single-token entity immediately followed by another entity.
                start = end = i
                entity_spans.append((start, end, label[2:]))
                start = i
                entity_type = label[2:]
            elif label.startswith("B-") and predicted_entities[i + 1].startswith("O"):
                # Single-token entity followed by a non-entity token.
                start = end = i
                entity_spans.append((start, end, label[2:]))
                start = end = None
                entity_type = None
            elif label.startswith("I-") and predicted_entities[i + 1].startswith("B-"):
                # Entity ends here; a new entity begins on the next token.
                end = i
                if start is not None:
                    entity_spans.append((start, end, entity_type))
                start = i
                entity_type = label[2:]
            elif label.startswith("I-") and predicted_entities[i + 1].startswith("O"):
                # Entity ends here.
                end = i
                if start is not None:
                    entity_spans.append((start, end, entity_type))
                start = end = None
                entity_type = None
    # Handle an entity that runs to the end of the sequence.
    if start is not None and end is None:
        end = len(tokenized_text) - 2  # exclude the trailing [SEP]
        entity_spans.append((start, end, entity_type))
    # Collect the results as (entity string, entity type) pairs.
    save_pair = []
    for start, end, entity_type in entity_spans:
        entity_str = tokenizer.convert_tokens_to_string(tokenized_text[start:end + 1])
        # print(f"entity: {entity_str}, type: {entity_type}")
        save_pair.append((entity_str, entity_type))
    return save_pair
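
# Hedged usage sketch for post_process(). The tokens, BIO labels, and the
# stub tokenizer below are illustrative assumptions; a real run would use a
# HuggingFace tokenizer and the label output of an NER model instead.
class _StubTokenizer:
    """Minimal stand-in exposing only convert_tokens_to_string."""
    def convert_tokens_to_string(self, tokens):
        return " ".join(tokens)

def _demo_post_process():
    tokens = ["[CLS]", "pleural", "effusion", "noted", "[SEP]"]
    labels = ["O", "B-ABNORMALITY", "I-ABNORMALITY", "O", "O"]
    # Expected result: [("pleural effusion", "ABNORMALITY")]
    return post_process(tokens, labels, _StubTokenizer())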
def topk_similarity(embeddings1, embeddings2, k=1):
    """
    Compute the top-k cosine similarity between two sets of embeddings using
    PyTorch. Only the single best match per embedding is kept, even for k > 1.
    """
    # Normalize the embeddings so the dot product equals cosine similarity.
    embeddings1 = F.normalize(embeddings1, p=2, dim=1)
    embeddings2 = F.normalize(embeddings2, p=2, dim=1)
    topk_values = []
    topk_indices = []
    # Iterate over each embedding in the first set.
    for emb1 in embeddings1:
        # Cosine similarity between this embedding and all embeddings
        # in the second set.
        similarities = torch.matmul(embeddings2, emb1)
        # Find the top-k highest similarity values and keep the best one.
        values, indices = torch.topk(similarities, k, largest=True)
        topk_values.append(values[0])
        topk_indices.append(indices[0])
    return topk_indices, topk_values
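
# Hedged sketch of topk_similarity() on toy data: each ground-truth embedding
# is matched to its most similar prediction embedding (top-1 only).
def _demo_topk_similarity():
    a = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
    b = torch.tensor([[0.9, 0.1], [0.1, 0.9], [1.0, 1.0]])
    idx, val = topk_similarity(a, b, k=1)
    return idx, val  # idx == [tensor(0), tensor(1)]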
def compute(gt_embeds_word, pred_embeds_word, gt_types, pred_types, weight_matrix):
    # Type pairs to penalise: a negated mention matched against an affirmed
    # one, or vice versa.
    neg_class = [('NON-DISEASE', 'DISEASE'),
                 ('NON-ABNORMALITY', 'ABNORMALITY'),
                 ('DISEASE', 'NON-DISEASE'),
                 ('ABNORMALITY', 'NON-ABNORMALITY'),
                 ('NON-DISEASE', 'ABNORMALITY'),
                 ('NON-ABNORMALITY', 'DISEASE'),
                 ('DISEASE', 'NON-ABNORMALITY'),
                 ('ABNORMALITY', 'NON-DISEASE')]
    neg_weight = weight_matrix[("NEG", "WEIGHT")]
    topk_indices, topk_values = topk_similarity(gt_embeds_word, pred_embeds_word, k=1)
    for i in range(len(topk_indices)):
        topk_indices[i] = topk_indices[i].cpu().numpy().tolist()
        topk_values[i] = topk_values[i].cpu().numpy().tolist()
    # Map each matched index to its predicted entity type.
    topk_map = [pred_types[i] for i in topk_indices]
    weight_score = [weight_matrix[(gt_type, pred_type)] for gt_type, pred_type in zip(gt_types, topk_map)]
    type_score = [neg_weight if (gt_type, pred_type) in neg_class else 1 for gt_type, pred_type in zip(gt_types, topk_map)]
    weighted_avg_score = 0
    weighted_sum = 0
    for score, weight, neg_factor in zip(topk_values, weight_score, type_score):
        weighted_avg_score += score * weight * neg_factor
        weighted_sum += weight
    if weighted_sum != 0:
        RaTE = weighted_avg_score / weighted_sum
    else:
        RaTE = 0
    return RaTE
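
if __name__ == "__main__":
    # Minimal smoke test; a hedged sketch, not part of the original pipeline.
    # The sample report, dummy embeddings, and weight_matrix keys below are
    # assumptions chosen only so every lookup in compute() resolves.
    sents, starts = sentence_split(["No acute findings. Heart size is normal."])
    print(sents, starts)

    print(_demo_post_process())
    print(_demo_topk_similarity())

    gt = torch.randn(3, 8)
    pred = torch.randn(4, 8)
    weight_matrix = {("NEG", "WEIGHT"): 0.5,
                     ("DISEASE", "DISEASE"): 1.0,
                     ("DISEASE", "NON-DISEASE"): 1.0}
    score = compute(gt, pred,
                    ["DISEASE"] * 3,
                    ["DISEASE", "DISEASE", "NON-DISEASE", "DISEASE"],
                    weight_matrix)
    print("RaTE:", score)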