# brainsqueeze's picture
# Smarter document context retrieval
# f86d7f2 verified
from typing import Dict, Any
from langchain_core.documents import Document
from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
from ask_candid.retrieval.sources.utils import get_context
# Source configuration for the Candid blog: names the Elasticsearch index and
# the document fields supplied as ``text_fields``.
CandidBlogConfig = ElasticSourceConfig(
    index_name="search-semantic-candid-blog",
    text_fields=("content", "authors_text", "title_summary_tags"),
)
def process_blog_hit(hit: ElasticHitsResult) -> Document:
    """Build a ``Document`` from a single Candid blog search hit.

    The page content is the title, excerpt, body text, authors and tags of the
    hit joined by blank lines, in that order. Body text and authors are
    obtained through ``get_context`` (called with ``context_length=12`` and
    ``add_context=False``); the remaining pieces are read directly from
    ``hit.source`` and default to an empty string when absent.

    Args:
        hit: Search hit whose ``source`` mapping holds the blog record.

    Returns:
        A ``Document`` whose metadata carries the title, the fixed source
        label ``"Candid Blog"``, the record id and the post URL.

    Raises:
        KeyError: If ``hit.source`` has no ``"id"`` or ``"link"`` entry.
    """
    record = hit.source
    excerpt = record.get("excerpt", "")
    title = record.get("title", "")

    # we only need to process long text
    body_text = get_context("content", hit, context_length=12, add_context=False)
    author_text = get_context("authors_text", hit, context_length=12, add_context=False)
    tag_text = record.get("title_summary_tags", "")

    sections = [title, excerpt, body_text, author_text, tag_text]
    page_content = '\n\n'.join(sections)

    metadata = {
        "title": title,
        "source": "Candid Blog",
        "source_id": record["id"],
        "url": record["link"],
    }
    return Document(page_content=page_content, metadata=metadata)
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
    """Render a Candid blog record as an HTML card.

    The card shows the post title as a link followed by the excerpt. Title and
    excerpt share a combined allowance of 999 characters: once the running
    length would exceed it, the excerpt markup is cut off and suffixed with
    ``[...]`` (or replaced by a bare ``[...]`` marker when nothing is left).
    The title shown in the link is never truncated.

    Args:
        doc: Blog record. ``"link"`` is required; ``"title"`` and
            ``"excerpt"`` are optional and rendered as empty when missing or
            ``None``.
        height_px: Height of the card in pixels.
        show_chunks: Not used by this function; retained so existing callers
            that pass it keep working.

    Returns:
        The HTML markup of the card.

    Raises:
        KeyError: If ``doc`` has no ``"link"`` entry.
    """
    char_budget = 999  # combined character allowance for title + excerpt
    url = f"{doc['link']}"

    rendered = {}
    used_len = 0
    for field in ("title", "excerpt"):
        value = doc.get(field)
        if value is None:
            rendered[field] = ""
            rendered[field + "_txt"] = ""
            continue

        rendered[field] = value
        remaining = char_budget - used_len
        if len(value) <= remaining:
            rendered[field + "_txt"] = f"<div>{value}</div>"
        elif remaining > 0:
            # Only part of the text fits: keep the head and mark the cut.
            rendered[field + "_txt"] = f"<div>{value[:remaining] + '[...]'}</div>"
        else:
            # Allowance already used up by earlier fields.
            rendered[field + "_txt"] = "<span>[...]</span>"
        # Count the full length, not the truncated one, against the allowance.
        used_len += len(value)

    # Use the normalised title so a missing or None title renders as empty
    # instead of raising KeyError or printing "None".
    title = rendered["title"]
    excerpt_html = rendered["excerpt_txt"]

    return f"""
<div style='height: {height_px}px; padding: 5px;'>
<div style='height: {height_px}px; border: 1px solid #febe10;'>
<span style='padding-left: 10px; display: inline-block; width: 100%;'>
<div>
<span>
<b>Candid blog post:</b>
<a href='{url}' target='_blank' style='text-decoration: none;'>
{title}
</a>
</span>
<br>
<br>
{excerpt_html}
</div>
</span>
</div>
</div>
"""