from sentence_transformers import SentenceTransformer, CrossEncoder  # SentenceTransformer -> model for embeddings, CrossEncoder -> re-ranker
from ctransformers import AutoModelForCausalLM
from google import genai
from google.genai import types
from app.chunks import Chunk
import os
import numpy as np  # used only for type hints
from app.settings import device, local_llm_config, local_generation_config, gemini_generation_config


class Embedder:
    def __init__(self, model: str = "BAAI/bge-m3"):
        self.device: str = device
        self.model_name: str = model
        self.model: SentenceTransformer = SentenceTransformer(model, device=self.device)

    def encode(self, text: str | list[str]) -> np.ndarray:
        '''
        Encodes a string (or a list of strings) into dense vectors
        '''
        return self.model.encode(sentences=text, show_progress_bar=False, batch_size=32)

    def get_vector_dimensionality(self) -> int | None:
        '''
        Returns the dimensionality of the dense vectors
        '''
        return self.model.get_sentence_embedding_dimension()


class Reranker:
    def __init__(self, model: str = "cross-encoder/ms-marco-MiniLM-L6-v2"):
        self.device: str = device
        self.model_name: str = model
        self.model: CrossEncoder = CrossEncoder(model, device=self.device)

    def rank(self, query: str, chunks: list[Chunk]) -> list[dict[str, int | float]]:
        '''
        Returns a list of dicts re-sorted by relevance; only the 'corpus_id' key
        is needed, since it is the position of the chunk in the original list
        '''
        return self.model.rank(query, [chunk.get_raw_text() for chunk in chunks])


# TODO: add model parameters to the global config file
# TODO: add exception handling for when the response exceeds the configured token limit
# TODO: find a way to keep the model from producing overly long answers
class LocalLLM:
    def __init__(self):
        self.model = AutoModelForCausalLM.from_pretrained(**local_llm_config)

    def get_response(self, prompt: str, stream: bool = True, logging: bool = True,
                     use_default_config: bool = True) -> str:
        '''
        Produces the response to the user's prompt
        stream -> flag that determines whether we wait until the whole response
                  is ready or show it token by token
        TODO: find a way to truly stream the answer (as the return value)
        '''
        with open("prompt.txt", "w", encoding="utf-8", errors="replace") as f:
            f.write(prompt)

        generated_text = ""
        tokenized_text: list[int] = self.model.tokenize(text=prompt)
        # generate() returns a generator that yields token ids one at a time
        response = self.model.generate(tokens=tokenized_text, **local_generation_config)

        if not stream:
            generated_text = self.model.detokenize(list(response))
            if logging:
                print(generated_text)
            return generated_text

        for token in response:
            chunk = self.model.detokenize([token])
            generated_text += chunk
            if logging:
                print(chunk, end="", flush=True)  # flush -> push each chunk to stdout immediately
        return generated_text


class Gemini:
    def __init__(self, model="gemini-2.0-flash"):
        self.client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])
        self.model = model

    def get_response(self, prompt: str, stream: bool = True, logging: bool = True,
                     use_default_config: bool = False) -> str:
        # NOTE: stream and logging are accepted for interface parity with LocalLLM
        # but are not used by this backend
        with open("prompt.txt", "w", encoding="utf-8", errors="replace") as f:
            f.write(prompt)
        response = self.client.models.generate_content(
            model=self.model,
            contents=prompt,
            config=types.GenerateContentConfig(**gemini_generation_config) if use_default_config else None
        )
        return response.text
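

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): wires the classes above into a minimal
# embed -> rerank -> generate flow. Assumptions: the constructor of
# app.chunks.Chunk is not shown in this module, so a hypothetical _DemoChunk
# stand-in with the same get_raw_text() interface is used here; GEMINI_API_KEY
# must be set in the environment and app.settings must be importable. The
# __main__ guard keeps the sketch from running on import.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class _DemoChunk:  # hypothetical stand-in for app.chunks.Chunk
        def __init__(self, text: str):
            self._text = text

        def get_raw_text(self) -> str:
            return self._text

    documents = [
        _DemoChunk("BGE-M3 produces 1024-dimensional dense embeddings."),
        _DemoChunk("A cross-encoder scores a (query, document) pair jointly."),
    ]
    query = "What does a cross-encoder do?"

    # Dense-encode the chunks (in the full pipeline these vectors would be
    # stored in a vector index and searched before re-ranking)
    embedder = Embedder()
    vectors = embedder.encode([doc.get_raw_text() for doc in documents])
    print("Embedding matrix shape:", vectors.shape,
          "| dimensionality:", embedder.get_vector_dimensionality())

    # Re-rank the candidate chunks and pick the most relevant one
    reranker = Reranker()
    ranked = reranker.rank(query, documents)  # type: ignore[arg-type]
    best = documents[ranked[0]["corpus_id"]]

    # Ask the hosted model to answer from the selected context
    llm = Gemini()
    answer = llm.get_response(
        f"Answer using only this context:\n{best.get_raw_text()}\n\nQuestion: {query}"
    )
    print(answer)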