"""
ranker.py
---------
This module implements functionality for ranking candidate sentences by
their relevance to a given claim.  The ranking is performed by
embedding both the claim and the candidate sentences into a semantic
vector space using a pre-trained sentence-transformer model and then
computing cosine similarity between the claim vector and each
candidate.  The candidates are returned in descending order of
similarity.

When the sentence-transformers library is not available or the
specified model cannot be loaded, the module falls back to a simpler
TF-IDF based cosine similarity using scikit-learn.  The fallback
approach still yields reasonable relevance orderings without requiring
deep learning dependencies.

Example:
>>> from ranker import rank_sentences
>>> ranked = rank_sentences("Cats are adorable pets", ["Cats purr when happy", "Airplanes fly"], top_k=1)
>>> print(ranked[0][0])
...  # prints the sentence most similar to the claim
"""

from __future__ import annotations

import logging
from typing import Iterable, List, Tuple

import numpy as np

logger = logging.getLogger(__name__)

_st_model = None  # type: ignore
_use_transformers = False


def _load_sentence_transformer(model_name: str = "all-MiniLM-L6-v2"):
    """Load the sentence transformer model lazily.
    Attempts to import and instantiate the specified sentence
    transformer model.  If the import fails, sets a flag to indicate
    fallback use of scikit-learn.
    """
    global _st_model, _use_transformers
    if _st_model is not None or _use_transformers:
        return
    try:
        from sentence_transformers import SentenceTransformer  # type: ignore

        _st_model = SentenceTransformer(model_name)
        _use_transformers = True
    except Exception as exc:
        logger.warning(
            "Could not load sentence-transformer model '%s'. Falling back to TF-IDF: %s",
            model_name,
            exc,
        )
        _st_model = None
        _use_transformers = False


def _embed_with_st(texts: Iterable[str]) -> np.ndarray:
    """Embed a list of texts using a sentence transformer model."""
    assert _st_model is not None
    return np.asarray(_st_model.encode(list(texts), convert_to_numpy=True))


def _rank_with_tfidf(claim: str, candidates: List[str], top_k: int) -> List[Tuple[str, float]]:
    """Rank candidates using TF-IDF cosine similarity.
    This fallback method uses scikit-learn's TfidfVectorizer to
    construct vectors for the claim and candidates and then computes
    pairwise cosine similarity.  It does not require any heavy
    dependencies beyond scikit-learn, which is typically installed.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer  # type: ignore
    from sklearn.metrics.pairwise import cosine_similarity  # type: ignore

    vectorizer = TfidfVectorizer().fit([claim] + candidates)
    vectors = vectorizer.transform([claim] + candidates)
    claim_vec = vectors[0]
    cand_vecs = vectors[1:]
    sims = cosine_similarity(claim_vec, cand_vecs).flatten()
    idx_sorted = sims.argsort()[::-1]
    ranked = [(candidates[i], float(sims[i])) for i in idx_sorted[:top_k]]
    return ranked


def rank_sentences(claim: str, sentences: Iterable[str], top_k: int = 10) -> List[Tuple[str, float]]:
    """Rank ``sentences`` by semantic similarity to ``claim``.

    Parameters
    ----------
    claim:
        The short textual claim against which candidates are compared.
    sentences:
        An iterable of candidate sentences to score.
    top_k:
        The maximum number of top-ranked sentences to return.  If the
        number of candidates is less than ``top_k``, all candidates are
        returned in descending order of similarity.

    Returns
    -------
    List[Tuple[str, float]]
        A list of ``(sentence, score)`` pairs sorted in descending order
        of similarity.  ``score`` is a cosine similarity between the
        claim embedding and the candidate embedding (1.0 means
        identical, 0.0 means orthogonal).  When falling back to TF-IDF
        ranking, the scores may be lower but are still comparable within
        the same run.
    """
    # IMPORTANT: declare globals before any usage in this function
    global _use_transformers, _st_model

    # Convert the iterable to a list so we can index and iterate
    candidates = list(sentences)
    if not candidates:
        return []

    # Try to load the transformer model only once
    if _st_model is None and not _use_transformers:
        _load_sentence_transformer()

    if _use_transformers and _st_model is not None:
        try:
            # Embed claim and candidates
            embeddings = _embed_with_st([claim] + candidates)
            claim_vec = embeddings[0]
            cand_vecs = embeddings[1:]
            # Normalize vectors to unit length to compute cosine similarity via dot product
            claim_norm = claim_vec / (np.linalg.norm(claim_vec) + 1e-8)
            cand_norms = cand_vecs / (np.linalg.norm(cand_vecs, axis=1, keepdims=True) + 1e-8)
            sims = cand_norms.dot(claim_norm)
            idx_sorted = sims.argsort()[::-1]
            ranked = [(candidates[i], float(sims[i])) for i in idx_sorted[:top_k]]
            return ranked
        except Exception as exc:
            logger.warning(
                "Sentence-transformer ranking failed. Falling back to TF-IDF. Error: %s",
                exc,
            )
            # Mark the transformer as unusable for subsequent calls
            _use_transformers = False
            _st_model = None

    # Fallback to TF-IDF ranking
    return _rank_with_tfidf(claim, candidates, top_k)