"""
pipeline.py
-----------

This module orchestrates the entire TruthLens pipeline: retrieval,
ranking and classification of candidate sentences in response to a
claim. It produces a structured context card separating supporting,
contradictory and neutral evidence along with citation metadata.

The primary entry point is :func:`process_claim`, which accepts a
string claim and returns a dictionary with the following keys:

``support``
    List of dictionaries, each with ``sentence`` and ``source`` keys.

``contradict``
    List of dictionaries with ``sentence`` and ``source`` keys.

``neutral``
    List of dictionaries with ``sentence`` and ``source`` keys.

``sources``
    A mapping from source URL to a dict of counts of each label
    category. This is useful for generating aggregated statistics.

Example:

>>> from pipeline import process_claim
>>> result = process_claim("The Eiffel Tower is in London", top_k=5)
>>> print(result["contradict"][0]["sentence"])  # prints a sentence contradicting the claim
"""

from __future__ import annotations

import logging
from collections import defaultdict
from typing import Dict, List, Mapping

from .retriever import retrieve_wikipedia_sentences
from .ranker import rank_sentences
from .classifier import classify

logger = logging.getLogger(__name__)


def process_claim(
    claim: str,
    *,
    max_pages: int = 3,
    max_sentences_per_page: int = 200,
    top_k: int = 10,
) -> Dict[str, List[Dict[str, str]] | Mapping[str, Dict[str, int]]]:
    """Process a claim and return structured evidence grouped by stance.

    Parameters
    ----------
    claim:
        The claim text for which context should be gathered.
    max_pages:
        Passed to the retriever; limits the number of Wikipedia pages
        queried.
    max_sentences_per_page:
        Passed to the retriever; limits the number of sentences
        considered from each page.
    top_k:
        Passed to the ranker; limits the number of top candidate
        sentences returned for classification.

    Returns
    -------
    Dict[str, List[Dict[str, str]] | Mapping[str, Dict[str, int]]]
        A dictionary containing three lists (``support``, ``contradict``,
        ``neutral``) of evidence entries and a ``sources`` mapping
        aggregating counts per source URL.
    """
    if not claim or not claim.strip():
        return {"support": [], "contradict": [], "neutral": [], "sources": {}}

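    # Retrieve candidate (sentence, source URL) pairs from Wikipedia.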
    candidates_with_sources = retrieve_wikipedia_sentences(
        claim,
        max_pages=max_pages,
        max_sentences_per_page=max_sentences_per_page,
    )
    if not candidates_with_sources:
        return {"support": [], "contradict": [], "neutral": [], "sources": {}}
    sentences = [s for s, _ in candidates_with_sources]

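    # Rank the candidates by relevance to the claim and keep the top_k sentences.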
    ranked = rank_sentences(claim, sentences, top_k=top_k)
    ranked_sentences = [sent for sent, _score in ranked]

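    # Map each ranked sentence back to a source URL; if the same sentence
    # appears on several pages, the first source encountered is kept.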
    ranked_sources = []
    sent_to_sources = {}
    for sentence, source in candidates_with_sources:
        if sentence not in sent_to_sources:
            sent_to_sources[sentence] = source
    for sent in ranked_sentences:
        ranked_sources.append(sent_to_sources.get(sent, ""))

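    # Classify the stance of each ranked sentence with respect to the claim.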
    labels = classify(claim, ranked_sentences)

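    # Group the evidence by label and aggregate per-source label counts.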
result = {"support": [], "contradict": [], "neutral": []} |
|
source_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: {"support": 0, "contradict": 0, "neutral": 0}) |
|
for sentence, source, label in zip(ranked_sentences, ranked_sources, labels): |
|
result[label].append({"sentence": sentence, "source": source}) |
|
if source: |
|
source_counts[source][label] += 1 |
|
|
|
result["sources"] = source_counts |
|
return result |
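

if __name__ == "__main__":
    # Minimal manual smoke test (a sketch, not part of the public API).  It
    # assumes this module is executed inside its package, e.g.
    # ``python -m <package>.pipeline "some claim"``, so that the relative
    # imports above resolve, and that the retriever has network access to
    # Wikipedia.  The default claim below is purely illustrative.
    import json
    import sys

    logging.basicConfig(level=logging.INFO)
    demo_claim = " ".join(sys.argv[1:]) or "The Eiffel Tower is in London"
    print(json.dumps(process_claim(demo_claim, top_k=5), indent=2, ensure_ascii=False))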