"""
retriever.py
----------------

This module provides utilities for retrieving candidate evidence sentences
from Wikipedia given a short textual claim.  It uses the ``wikipedia``
package to perform the search and fetch page content, then splits the
content into individual sentences.  Each returned entry includes the
sentence text and the URL of the page from which it originated so that
callers may build proper citations.

The retriever is intentionally conservative: it limits the number of
articles to inspect and truncates overly long pages to avoid excessive
runtime.  When used in combination with the ranking and classification
modules, this component forms the first stage of the TruthLens pipeline.

Example:

>>> from retriever import retrieve_wikipedia_sentences
>>> sentences = retrieve_wikipedia_sentences("Electric vehicles reduce emissions", max_pages=2)
>>> print(len(sentences))
...  # prints the number of candidate sentences found

Note: The ``wikipedia`` package performs network requests and will
therefore only succeed in an environment with outbound internet
connectivity.  During unit tests or in restricted environments the
returned list may be empty or an exception may be raised.
"""

from __future__ import annotations

import logging
from typing import List, Tuple

try:
    import wikipedia  # type: ignore
except ImportError as exc:
    raise ImportError(
        "The 'wikipedia' package is required for retrieval. "
        "Please add it to your requirements.txt file and install via pip."
    ) from exc

try:
    # Try to import NLTK for sentence segmentation.  If it's not
    # available or the punkt models are missing, we fall back to a
    # naive sentence splitter.
    import nltk  # type: ignore
    from nltk.tokenize import sent_tokenize  # type: ignore

    # Ensure the punkt model is available without prompting the user.  If
    # the download fails (for example because there is no network), the
    # exception will be caught and we will revert to the naive splitter.
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        try:
            nltk.download("punkt", quiet=True)
        except Exception:
            # We'll fall back to the naive splitter below
            pass
    _use_nltk = True
except ImportError:
    _use_nltk = False

logger = logging.getLogger(__name__)


def _split_sentences(text: str) -> List[str]:
    """Split ``text`` into sentences.

    Uses NLTK's sentence tokenizer when available; otherwise falls back
    to a simple rule-based approach splitting on common sentence
    terminators (period, exclamation mark, question mark).
    """
    if _use_nltk:
        try:
            return [s.strip() for s in sent_tokenize(text) if s.strip()]
        except Exception:
            # If NLTK fails for any reason, fall back to naive splitting
            pass
    sentences: List[str] = []
    current = []
    for ch in text:
        current.append(ch)
        if ch in {'.', '!', '?'}:
            sentence = ''.join(current).strip()
            if sentence:
                sentences.append(sentence)
            current = []
    # Add any trailing text as a final sentence
    remainder = ''.join(current).strip()
    if remainder:
        sentences.append(remainder)
    return sentences


def retrieve_wikipedia_sentences(claim: str, *, max_pages: int = 3, max_sentences_per_page: int = 200) -> List[Tuple[str, str]]:
    """Search Wikipedia for the given claim and return candidate sentences.

    Parameters
    ----------
    claim:
        A short description or assertion about which evidence should be
        retrieved.  For example, ``"Electric vehicles reduce CO₂ emissions"``.

    max_pages:
        The maximum number of pages to fetch from Wikipedia's search
        results.  Increasing this value may yield more candidate
        sentences but will also increase runtime.

    max_sentences_per_page:
        Limit on the number of sentences extracted from each page.  This
        prevents extremely long articles from producing thousands of
        candidate sentences.

    Returns
    -------
    List[Tuple[str, str]]
        A list of tuples ``(sentence, source_url)``.  Each sentence is
        stripped of leading/trailing whitespace.  ``source_url`` is the
        canonical URL of the Wikipedia page where the sentence was found.
    """
    if not claim or not claim.strip():
        return []

    candidates: List[Tuple[str, str]] = []
    try:
        search_results = wikipedia.search(claim, results=max_pages)
    except Exception as exc:
        logger.warning("Wikipedia search failed: %s", exc)
        return candidates

    for title in search_results[:max_pages]:
        try:
            page = wikipedia.page(title, auto_suggest=False)
            content = page.content
            sentences = _split_sentences(content)
            if max_sentences_per_page > 0:
                sentences = sentences[:max_sentences_per_page]
            url = page.url
            for sentence in sentences:
                candidates.append((sentence, url))
        except Exception as exc:
            logger.debug("Skipping page '%s' due to error: %s", title, exc)
            continue
    return candidates