""" ranker.py --------- This module implements functionality for ranking candidate sentences by their relevance to a given claim. The ranking is performed by embedding both the claim and the candidate sentences into a semantic vector space using a pre-trained sentence-transformer model and then computing cosine similarity between the claim vector and each candidate. The candidates are returned in descending order of similarity. When the sentence-transformers library is not available or the specified model cannot be loaded, the module falls back to a simpler TF-IDF based cosine similarity using scikit-learn. The fallback approach still yields reasonable relevance orderings without requiring deep learning dependencies. Example: >>> from ranker import rank_sentences >>> ranked = rank_sentences("Cats are adorable pets", ["Cats purr when happy", "Airplanes fly"], top_k=1) >>> print(ranked[0][0]) ... # prints the sentence most similar to the claim """ from __future__ import annotations import logging from typing import Iterable, List, Tuple import numpy as np logger = logging.getLogger(__name__) _st_model = None # type: ignore _use_transformers = False def _load_sentence_transformer(model_name: str = "all-MiniLM-L6-v2"): """Load the sentence transformer model lazily. Attempts to import and instantiate the specified sentence transformer model. If the import fails, sets a flag to indicate fallback use of scikit-learn. """ global _st_model, _use_transformers if _st_model is not None or _use_transformers: return try: from sentence_transformers import SentenceTransformer # type: ignore _st_model = SentenceTransformer(model_name) _use_transformers = True except Exception as exc: logger.warning( "Could not load sentence-transformer model '%s'. Falling back to TF-IDF: %s", model_name, exc, ) _st_model = None _use_transformers = False def _embed_with_st(texts: Iterable[str]) -> np.ndarray: """Embed a list of texts using a sentence transformer model.""" assert _st_model is not None return np.asarray(_st_model.encode(list(texts), convert_to_numpy=True)) def _rank_with_tfidf(claim: str, candidates: List[str], top_k: int) -> List[Tuple[str, float]]: """Rank candidates using TF-IDF cosine similarity. This fallback method uses scikit-learn's TfidfVectorizer to construct vectors for the claim and candidates and then computes pairwise cosine similarity. It does not require any heavy dependencies beyond scikit-learn, which is typically installed. """ from sklearn.feature_extraction.text import TfidfVectorizer # type: ignore from sklearn.metrics.pairwise import cosine_similarity # type: ignore vectorizer = TfidfVectorizer().fit([claim] + candidates) vectors = vectorizer.transform([claim] + candidates) claim_vec = vectors[0] cand_vecs = vectors[1:] sims = cosine_similarity(claim_vec, cand_vecs).flatten() idx_sorted = sims.argsort()[::-1] ranked = [(candidates[i], float(sims[i])) for i in idx_sorted[:top_k]] return ranked def rank_sentences(claim: str, sentences: Iterable[str], top_k: int = 10) -> List[Tuple[str, float]]: """Rank ``sentences`` by semantic similarity to ``claim``. Parameters ---------- claim: The short textual claim against which candidates are compared. sentences: An iterable of candidate sentences to score. top_k: The maximum number of top-ranked sentences to return. If the number of candidates is less than ``top_k``, all candidates are returned in descending order of similarity. Returns ------- List[Tuple[str, float]] A list of ``(sentence, score)`` pairs sorted in descending order of similarity. ``score`` is a cosine similarity between the claim embedding and the candidate embedding (1.0 means identical, 0.0 means orthogonal). When falling back to TF-IDF ranking, the scores may be lower but are still comparable within the same run. """ # IMPORTANT: declare globals before any usage in this function global _use_transformers, _st_model # Convert the iterable to a list so we can index and iterate candidates = list(sentences) if not candidates: return [] # Try to load the transformer model only once if _st_model is None and not _use_transformers: _load_sentence_transformer() if _use_transformers and _st_model is not None: try: # Embed claim and candidates embeddings = _embed_with_st([claim] + candidates) claim_vec = embeddings[0] cand_vecs = embeddings[1:] # Normalize vectors to unit length to compute cosine similarity via dot product claim_norm = claim_vec / (np.linalg.norm(claim_vec) + 1e-8) cand_norms = cand_vecs / (np.linalg.norm(cand_vecs, axis=1, keepdims=True) + 1e-8) sims = cand_norms.dot(claim_norm) idx_sorted = sims.argsort()[::-1] ranked = [(candidates[i], float(sims[i])) for i in idx_sorted[:top_k]] return ranked except Exception as exc: logger.warning( "Sentence-transformer ranking failed. Falling back to TF-IDF. Error: %s", exc, ) # Mark the transformer as unusable for subsequent calls _use_transformers = False _st_model = None # Fallback to TF-IDF ranking return _rank_with_tfidf(claim, candidates, top_k)