""" retriever.py ---------------- This module provides utilities for retrieving candidate evidence sentences from Wikipedia given a short textual claim. It uses the ``wikipedia`` package to perform the search and fetch page content, then splits the content into individual sentences. Each returned entry includes the sentence text and the URL of the page from which it originated so that callers may build proper citations. The retriever is intentionally conservative: it limits the number of articles to inspect and truncates overly long pages to avoid excessive runtime. When used in combination with the ranking and classification modules, this component forms the first stage of the TruthLens pipeline. Example: >>> from retriever import retrieve_wikipedia_sentences >>> sentences = retrieve_wikipedia_sentences("Electric vehicles reduce emissions", max_pages=2) >>> print(len(sentences)) ... # prints the number of candidate sentences found Note: The ``wikipedia`` package performs network requests and will therefore only succeed in an environment with outbound internet connectivity. During unit tests or in restricted environments the returned list may be empty or an exception may be raised. """ from __future__ import annotations import logging from typing import List, Tuple try: import wikipedia # type: ignore except ImportError as exc: raise ImportError( "The 'wikipedia' package is required for retrieval. " "Please add it to your requirements.txt file and install via pip." ) from exc try: # Try to import NLTK for sentence segmentation. If it's not # available or the punkt models are missing, we fall back to a # naive sentence splitter. import nltk # type: ignore from nltk.tokenize import sent_tokenize # type: ignore # Ensure the punkt model is available without prompting the user. If # the download fails (for example because there is no network), the # exception will be caught and we will revert to the naive splitter. try: nltk.data.find("tokenizers/punkt") except LookupError: try: nltk.download("punkt", quiet=True) except Exception: # We'll fall back to the naive splitter below pass _use_nltk = True except ImportError: _use_nltk = False logger = logging.getLogger(__name__) def _split_sentences(text: str) -> List[str]: """Split ``text`` into sentences. Uses NLTK's sentence tokenizer when available; otherwise falls back to a simple rule-based approach splitting on common sentence terminators (period, exclamation mark, question mark). """ if _use_nltk: try: return [s.strip() for s in sent_tokenize(text) if s.strip()] except Exception: # If NLTK fails for any reason, fall back to naive splitting pass sentences: List[str] = [] current = [] for ch in text: current.append(ch) if ch in {'.', '!', '?'}: sentence = ''.join(current).strip() if sentence: sentences.append(sentence) current = [] # Add any trailing text as a final sentence remainder = ''.join(current).strip() if remainder: sentences.append(remainder) return sentences def retrieve_wikipedia_sentences(claim: str, *, max_pages: int = 3, max_sentences_per_page: int = 200) -> List[Tuple[str, str]]: """Search Wikipedia for the given claim and return candidate sentences. Parameters ---------- claim: A short description or assertion about which evidence should be retrieved. For example, ``"Electric vehicles reduce CO₂ emissions"``. max_pages: The maximum number of pages to fetch from Wikipedia's search results. Increasing this value may yield more candidate sentences but will also increase runtime. 
def retrieve_wikipedia_sentences(
    claim: str,
    *,
    max_pages: int = 3,
    max_sentences_per_page: int = 200,
) -> List[Tuple[str, str]]:
    """Search Wikipedia for the given claim and return candidate sentences.

    Parameters
    ----------
    claim:
        A short description or assertion about which evidence should be
        retrieved. For example, ``"Electric vehicles reduce CO₂ emissions"``.
    max_pages:
        The maximum number of pages to fetch from Wikipedia's search results.
        Increasing this value may yield more candidate sentences but will
        also increase runtime.
    max_sentences_per_page:
        Limit on the number of sentences extracted from each page. This
        prevents extremely long articles from producing thousands of
        candidate sentences.

    Returns
    -------
    List[Tuple[str, str]]
        A list of tuples ``(sentence, source_url)``. Each sentence is
        stripped of leading/trailing whitespace. ``source_url`` is the
        canonical URL of the Wikipedia page where the sentence was found.
    """
    if not claim or not claim.strip():
        return []

    candidates: List[Tuple[str, str]] = []
    try:
        search_results = wikipedia.search(claim, results=max_pages)
    except Exception as exc:
        logger.warning("Wikipedia search failed: %s", exc)
        return candidates

    for title in search_results[:max_pages]:
        try:
            page = wikipedia.page(title, auto_suggest=False)
            content = page.content
            sentences = _split_sentences(content)
            if max_sentences_per_page > 0:
                sentences = sentences[:max_sentences_per_page]
            url = page.url
            for sentence in sentences:
                candidates.append((sentence, url))
        except Exception as exc:
            logger.debug("Skipping page '%s' due to error: %s", title, exc)
            continue
    return candidates
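

if __name__ == "__main__":
    # Minimal manual smoke test, not part of the TruthLens pipeline itself.
    # The claim below is only an illustration, and the call performs live
    # network requests, so in restricted environments it may print zero
    # results (see the module docstring).
    logging.basicConfig(level=logging.INFO)
    demo_claim = "Electric vehicles reduce emissions"
    results = retrieve_wikipedia_sentences(demo_claim, max_pages=2)
    print(f"Retrieved {len(results)} candidate sentences for {demo_claim!r}")
    for sentence, url in results[:5]:
        # Show a short preview of the first few candidates with their source.
        print(f"- {sentence[:100]} ({url})")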