nbroad
/

hfie-minilm-l6

Feature Extraction

text-embeddings-inference

Model card Files Files and versions Community

hfie-minilm-l6 / handler.py

nbroad's picture

Update handler.py

2e00bb1 about 2 years ago

history blame contribute delete

2.61 kB

	from transformers import AutoTokenizer, AutoModel
	import torch
	import torch.nn.functional as F
	from typing import Any, Dict, List


	# copied from the model card
	def mean_pooling(model_output, attention_mask):
	    """Average token embeddings over the positions selected by the attention mask.

	    Args:
	        model_output: Indexable model output whose first element holds the
	            per-token embeddings.
	        attention_mask: Mask tensor; it is unsqueezed on the last axis and
	            expanded to the embeddings' size, so positions with value 0 do
	            not contribute to the average.

	    Returns:
	        The mask-weighted sum of the token embeddings along dimension 1,
	        divided by the mask sum along dimension 1.
	    """
	    token_embeddings = model_output[0]  # first element contains all token embeddings
	    # Broadcast the mask across the embedding axis so it can weight every feature.
	    input_mask_expanded = (
	        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
	    )
	    summed = torch.sum(token_embeddings * input_mask_expanded, 1)
	    # Clamp the denominator so a fully masked row yields zeros instead of NaN.
	    counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
	    return summed / counts

	class EndpointHandler():
	    """Endpoint handler that turns text into L2-normalized sentence embeddings.

	    The model is loaded with ``torchscript=True`` and traced once at start-up.
	    Each request is tokenized, run through the traced model, mean-pooled with
	    ``mean_pooling`` and normalized with ``F.normalize``.
	    """

	    def __init__(self, path="./"):
	        """Load the tokenizer and build the traced model.

	        Args:
	            path: Location passed to ``AutoModel.from_pretrained`` and
	                ``AutoTokenizer.from_pretrained``.
	        """
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	        self.tokenizer = AutoTokenizer.from_pretrained(path)

	        base_model = AutoModel.from_pretrained(
	            path,
	            torchscript=True,
	        )
	        base_model.eval()
	        base_model.to(self.device)

	        # Trace on the device that will serve requests, with example inputs on
	        # that same device, so any device placement recorded in the traced
	        # graph matches the tensors passed in __call__.
	        example_input_ids = torch.randint(0, 100, (2, 128), device=self.device)
	        # A real attention mask only holds 0/1, so use ones for the example.
	        example_attention_mask = torch.ones(
	            (2, 128), dtype=torch.long, device=self.device
	        )
	        # NOTE(review): the graph is traced with a (2, 128) example; confirm it
	        # generalizes to other batch sizes and sequence lengths for this model.
	        self.model = torch.jit.trace(
	            base_model,
	            (example_input_ids, example_attention_mask),
	        )
	        self.model.eval()

	    def __call__(self, data: Any) -> Dict[str, List[List[float]]]:
	        """Compute embeddings for the texts in a request payload.

	        Args:
	            data: Request payload. When it is a dict, the ``"inputs"`` entry is
	                passed to the tokenizer (the payload itself is used if that
	                key is missing) and the optional ``"parameters"`` dict may
	                carry ``"max_length"`` (default 512). Any other object is
	                passed to the tokenizer as-is.

	        Returns:
	            A dict ``{"embeddings": [...]}`` holding one L2-normalized vector
	            (a list of floats) per input sequence.
	        """
	        # Read with .get so the caller's payload is not mutated.
	        if isinstance(data, dict):
	            inputs = data.get("inputs", data)
	            parameters = data.get("parameters")
	        else:
	            inputs, parameters = data, None

	        max_length = 512 if not parameters else parameters.get("max_length", 512)

	        with torch.inference_mode():
	            encoded = self.tokenizer(
	                inputs,
	                padding=True,
	                truncation=True,
	                return_tensors='pt',
	                max_length=max_length,
	            ).to(self.device)

	            model_output = self.model(encoded.input_ids, encoded.attention_mask)

	            sentence_embeddings = mean_pooling(model_output, encoded.attention_mask)
	            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

	        # postprocess the prediction
	        return {
	            "embeddings": sentence_embeddings.cpu().tolist()
	        }