# app/models.py
# SentenceTransformer -> model for embeddings, CrossEncoder -> re-ranker
from sentence_transformers import SentenceTransformer, CrossEncoder
from ctransformers import AutoModelForCausalLM
from google import genai
from google.genai import types
from app.chunks import Chunk
import os
import numpy as np # used only for type hints
from app.settings import device, local_llm_config, local_generation_config, gemini_generation_config
class Embedder:
def __init__(self, model: str = "BAAI/bge-m3"):
self.device: str = device
self.model_name: str = model
self.model: SentenceTransformer = SentenceTransformer(model, device=self.device)
    def encode(self, text: str | list[str]) -> np.ndarray:
        '''
        Encodes a string (or list of strings) into dense vectors
        '''
        return self.model.encode(sentences=text, show_progress_bar=False, batch_size=32)
    def get_vector_dimensionality(self) -> int | None:
        '''
        Returns the dimensionality of the dense vectors
        '''
        return self.model.get_sentence_embedding_dimension()
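
# Usage sketch (illustration only, not executed anywhere in this module; assumes the
# BAAI/bge-m3 weights are cached locally or can be downloaded):
#
#     embedder = Embedder()
#     vectors = embedder.encode(["what is a vector database?", "how do embeddings work?"])
#     dim = embedder.get_vector_dimensionality()  # 1024 for BAAI/bge-m3
#     assert vectors.shape == (2, dim)
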
class Reranker:
def __init__(self, model: str = "cross-encoder/ms-marco-MiniLM-L6-v2"):
self.device: str = device
self.model_name: str = model
self.model: CrossEncoder = CrossEncoder(model, device=self.device)
    def rank(self, query: str, chunks: list[Chunk]) -> list[dict]:
        '''
        Returns a list of dicts re-sorted by relevance; only the 'corpus_id' key is needed,
        since it is the position of the chunk in the original list
        '''
        return self.model.rank(query, [chunk.get_raw_text() for chunk in chunks])
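
# Illustration only (not used by the app): the rank() output is typically consumed by
# indexing back into the original chunk list via 'corpus_id', e.g.
#
#     reranker = Reranker()
#     hits = reranker.rank(query, chunks)
#     top_chunks = [chunks[hit["corpus_id"]] for hit in hits[:5]]
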
# TODO: add model parameters to the global config file
# TODO: add exception handling for when the response has more tokens than the configured limit
# TODO: find a way to keep the model from producing overly long answers
class LocalLLM:
def __init__(self):
self.model = AutoModelForCausalLM.from_pretrained(**local_llm_config)
    def get_response(self, prompt: str, stream: bool = True, logging: bool = True,
                     use_default_config: bool = True) -> str:
        '''
        Produces the response to the user's prompt
        stream -> flag that determines whether we wait until the whole response is ready or emit it token by token
        TODO: invent a way to really stream the answer (as a return value)
        '''
with open("prompt.txt", "w") as f:
f.write(prompt)
generated_text = ""
tokenized_text: list[int] = self.model.tokenize(text=prompt)
        response = self.model.generate(tokens=tokenized_text, **local_generation_config)  # lazy generator of token ids
if logging:
print(response)
if not stream:
return self.model.detokenize(response)
for token in response:
chunk = self.model.detokenize([token])
generated_text += chunk
if logging:
                print(chunk, end="", flush=True)  # flush -> write the output buffer immediately
return generated_text
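
# The real values of local_llm_config and local_generation_config live in app.settings.
# A hypothetical shape of the two dicts for the ctransformers API (the keys match
# ctransformers' from_pretrained()/generate() parameters, but the concrete values below
# are assumptions, not the project's actual settings):
#
#     local_llm_config = {
#         "model_path_or_repo_id": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
#         "model_file": "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
#         "model_type": "mistral",
#         "context_length": 4096,
#         "gpu_layers": 0,
#     }
#     local_generation_config = {"temperature": 0.7, "top_p": 0.95, "repetition_penalty": 1.1}
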
class Gemini:
def __init__(self, model="gemini-2.0-flash"):
self.client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])
self.model = model
def get_response(self, prompt: str, stream: bool = True, logging: bool = True,
use_default_config: bool = False) -> str:
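        '''
        Produces the response to the user's prompt via the Gemini API.
        stream and logging are accepted for interface parity with LocalLLM but are not used here.
        '''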
with open("prompt.txt", "w", encoding="utf-8", errors="replace") as f:
f.write(prompt)
response = self.client.models.generate_content(
model=self.model,
contents=prompt,
config=types.GenerateContentConfig(**gemini_generation_config) if use_default_config else None
)
return response.text
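

if __name__ == "__main__":
    # Minimal smoke test, an illustration only: it downloads model weights on first run,
    # and the Gemini call needs GEMINI_API_KEY to be set. The sample texts below are
    # hypothetical; real chunks come from app.chunks.
    question = "What does the Embedder class do?"
    texts = [
        "The Embedder class wraps a SentenceTransformer model and encodes text into dense vectors.",
        "The Reranker class wraps a CrossEncoder and re-sorts retrieved chunks by relevance.",
    ]

    embedder = Embedder()
    print("embedding dimensionality:", embedder.get_vector_dimensionality())
    print("encoded shape:", embedder.encode(texts).shape)

    reranker = Reranker()
    # Reranker.rank() expects Chunk objects; CrossEncoder.rank() itself accepts raw strings,
    # so the underlying model is called directly here to keep the example self-contained.
    hits = reranker.model.rank(question, texts)
    print("best match:", texts[hits[0]["corpus_id"]])

    if os.environ.get("GEMINI_API_KEY"):
        llm = Gemini()
        print(llm.get_response(f"Answer briefly: {question}\nContext: {texts[hits[0]['corpus_id']]}"))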