| from typing import Dict, List, Optional, Union | |
| import numpy as np | |
| import requests | |
| from mteb import DRESModel | |
| from tqdm import tqdm | |
class SionicEmbeddingModel(DRESModel):
    """Retrieval model wrapper that obtains embeddings from an HTTP endpoint.

    Texts are sent in batches as JSON (``{'inputs': [...]}``) to ``url`` and
    the vectors are read from the ``'embedding'`` field of the JSON response.
    """

    def __init__(
        self,
        url: str,
        instruction: Optional[str] = None,
        batch_size: int = 128,
        dimension: int = 2048,
        **kwargs,
    ) -> None:
        """Store the endpoint configuration.

        Args:
            url: Address the embedding requests are POSTed to.
            instruction: Optional text prepended to every query in
                ``encode_queries``; ``None`` means no prefix.
            batch_size: Maximum number of texts sent per request.
            dimension: Width each returned embedding is reshaped to.
            **kwargs: Accepted for interface compatibility and ignored.
        """
        # NOTE(review): the base-class initializer is not invoked here;
        # confirm that DRESModel does not need it.
        self.url = url
        self.instruction = instruction
        self.batch_size = batch_size
        self.dimension = dimension

    def get_embeddings(self, queries: List[str]) -> np.ndarray:
        """Embed one batch of texts with a single POST request.

        Args:
            queries: Texts to embed.

        Returns:
            A ``float32`` array of shape ``(len(queries), self.dimension)``.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
        """
        # Nothing to embed: skip the network round trip entirely.
        if len(queries) == 0:
            return np.empty((0, self.dimension), dtype=np.float32)

        response = requests.post(self.url, json={'inputs': queries})
        # Surface HTTP failures directly instead of as a confusing
        # JSON-decoding error or KeyError further down.
        response.raise_for_status()
        vectors = np.asarray(response.json()['embedding'], dtype=np.float32)
        return vectors.reshape(len(queries), self.dimension)

    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
        """Embed queries, prefixing each with the configured instruction.

        Args:
            queries: Query strings.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            A ``float32`` array with one row per query.
        """
        # A missing instruction means "no prefix"; formatting None directly
        # would prepend the literal text 'None' to every query.
        prefix = '' if self.instruction is None else self.instruction
        return self.encode([f'{prefix}{query}' for query in queries])

    def encode_corpus(self, corpus: List[Union[Dict[str, str], str]], **kwargs) -> np.ndarray:
        """Embed corpus documents.

        Args:
            corpus: Either plain strings, or dicts with a ``'text'`` entry and
                an optional ``'title'`` entry. The type of the first element
                decides how the whole list is interpreted.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            A ``float32`` array with one row per document.
        """
        # Guard before peeking at corpus[0] so an empty corpus does not raise.
        if len(corpus) == 0:
            return self.encode([])

        sentences: List[str]
        if isinstance(corpus[0], dict):
            # Join title and text; strip() removes the leading space left
            # behind when a document has no title.
            sentences = [f"{doc.get('title', '')} {doc['text']}".strip() for doc in corpus]
        else:
            sentences = corpus
        return self.encode(sentences)

    def encode(self, sentences: List[str], **kwargs) -> np.ndarray:
        """Embed texts in chunks of at most ``batch_size`` and stack the results.

        Args:
            sentences: Texts to embed.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            A ``float32`` array of shape ``(len(sentences), self.dimension)``;
            an empty input yields an array with zero rows.
        """
        # np.concatenate raises ValueError on an empty sequence of arrays.
        if len(sentences) == 0:
            return np.empty((0, self.dimension), dtype=np.float32)

        batches = [
            self.get_embeddings(sentences[start:start + self.batch_size])
            for start in tqdm(range(0, len(sentences), self.batch_size), desc='encode')
        ]
        return np.concatenate(batches, axis=0)