brainsqueeze's picture
Smarter document context retrieval
f86d7f2 verified
raw
history blame contribute delete
658 Bytes
from langchain_core.documents import Document
from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
# Elasticsearch source settings for the Candid news index.
# NOTE(review): `text_fields` presumably names the document fields that hold
# searchable text -- confirm against ElasticSourceConfig.
CandidNewsConfig = ElasticSourceConfig(
    index_name="news_1",
    text_fields=(
        "title",
        "content",
    ),
)
def process_news_hit(hit: ElasticHitsResult) -> Document:
    """Convert a news search hit into a ``Document``.

    The page content is the hit's title and body text joined by a blank
    line. The metadata carries the title, the originating site name
    (falling back to ``"Candid News"``), the record id and the link.

    Args:
        hit: Search hit whose ``source`` mapping holds the stored fields
            ``title``, ``content``, ``site_name``, ``id`` and ``link``.

    Returns:
        A ``Document`` populated from the hit's source fields.

    Raises:
        KeyError: If ``hit.source`` has no ``"id"`` entry (assuming
            ``source`` behaves like a plain mapping).
    """
    source = hit.source

    # ``.get(key, "")`` only covers an absent key; a field stored with an
    # explicit null comes back as None and would make ``str.join`` raise
    # TypeError. ``or ""`` covers both cases, mirroring the ``site_name``
    # fallback used below.
    title = source.get("title") or ""
    content = source.get("content") or ""

    return Document(
        page_content="\n\n".join([title, content]),
        metadata={
            "title": title,
            "source": source.get("site_name") or "Candid News",
            # Deliberately a hard lookup: the id is treated as required.
            "source_id": source["id"],
            "url": source.get("link") or "",
        },
    )