"""
pipeline.py
-----------

This module orchestrates the entire TruthLens pipeline: retrieval,
ranking and classification of candidate sentences in response to a
claim. It produces a structured context card separating supporting,
contradictory and neutral evidence along with citation metadata.

The primary entry point is :func:`process_claim`, which accepts a
string claim and returns a dictionary with the following keys:

``support``
    List of dictionaries, each with ``sentence`` and ``source`` keys.

``contradict``
    List of dictionaries with ``sentence`` and ``source`` keys.

``neutral``
    List of dictionaries with ``sentence`` and ``source`` keys.

``sources``
    A mapping from source URL to a dict of counts of each label
    category. This is useful for generating aggregated statistics.

Example:

>>> from pipeline import process_claim
>>> result = process_claim("The Eiffel Tower is in London", top_k=5)
>>> print(result["contradict"][0]["sentence"])  # prints a sentence contradicting the claim
"""

from __future__ import annotations

import logging
from collections import defaultdict
from typing import Dict, List, Mapping

from .retriever import retrieve_wikipedia_sentences
from .ranker import rank_sentences
from .classifier import classify

logger = logging.getLogger(__name__)


def process_claim(
    claim: str,
    *,
    max_pages: int = 3,
    max_sentences_per_page: int = 200,
    top_k: int = 10,
) -> Dict[str, List[Dict[str, str]] | Mapping[str, Dict[str, int]]]:
    """Process a claim and return structured evidence grouped by stance.

    Parameters
    ----------
    claim:
        The claim text for which context should be gathered.
    max_pages:
        Passed to the retriever; limits the number of Wikipedia pages
        queried.
    max_sentences_per_page:
        Passed to the retriever; limits the number of sentences
        considered from each page.
    top_k:
        Passed to the ranker; limits the number of top candidate
        sentences returned for classification.

    Returns
    -------
    Dict[str, List[Dict[str, str]] | Mapping[str, Dict[str, int]]]
        A dictionary containing three lists (``support``, ``contradict``,
        ``neutral``) of evidence entries and a ``sources`` mapping
        aggregating counts per source URL.
    """
    if not claim or not claim.strip():
        return {"support": [], "contradict": [], "neutral": [], "sources": {}}

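    # Retrieve candidate (sentence, source URL) pairs from Wikipedia.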
    candidates_with_sources = retrieve_wikipedia_sentences(
        claim,
        max_pages=max_pages,
        max_sentences_per_page=max_sentences_per_page,
    )
    if not candidates_with_sources:
        return {"support": [], "contradict": [], "neutral": [], "sources": {}}
    sentences = [s for s, _ in candidates_with_sources]

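    # Rank the candidates by relevance to the claim and keep the top_k sentences.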
    ranked = rank_sentences(claim, sentences, top_k=top_k)
    ranked_sentences = [sent for sent, _score in ranked]

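    # Map each ranked sentence back to a source URL; if the same sentence
    # appears on several pages, the first source encountered is kept.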
    ranked_sources = []
    sent_to_sources = {}
    for sentence, source in candidates_with_sources:
        if sentence not in sent_to_sources:
            sent_to_sources[sentence] = source
    for sent in ranked_sentences:
        ranked_sources.append(sent_to_sources.get(sent, ""))

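    # Classify the stance of each ranked sentence with respect to the claim.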
    labels = classify(claim, ranked_sentences)

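    # Group the evidence by label and aggregate per-source label counts.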
result = {"support": [], "contradict": [], "neutral": []} |
|
source_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: {"support": 0, "contradict": 0, "neutral": 0}) |
|
for sentence, source, label in zip(ranked_sentences, ranked_sources, labels): |
|
result[label].append({"sentence": sentence, "source": source}) |
|
if source: |
|
source_counts[source][label] += 1 |
|
|
|
result["sources"] = source_counts |
|
return result |
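

if __name__ == "__main__":
    # Minimal manual smoke test (a sketch, not part of the public API).  It
    # assumes this module is executed inside its package, e.g.
    # ``python -m <package>.pipeline "some claim"``, so that the relative
    # imports above resolve, and that the retriever has network access to
    # Wikipedia.  The default claim below is purely illustrative.
    import json
    import sys

    logging.basicConfig(level=logging.INFO)
    demo_claim = " ".join(sys.argv[1:]) or "The Eiffel Tower is in London"
    print(json.dumps(process_claim(demo_claim, top_k=5), indent=2, ensure_ascii=False))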