from typing import Dict, Any

from langchain_core.documents import Document

from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
from ask_candid.retrieval.sources.utils import get_context

# Source configuration for the YouTube index: the index to query, the fields
# listed as text fields, and the fields listed as excluded.
YoutubeConfig = ElasticSourceConfig(
    index_name="search-semantic-youtube-elser_ve1",
    text_fields=("captions_cleaned", "description_cleaned", "title"),
    excluded_fields=("captions", "description", "text_cleaned")
)


def process_youtube_hit(hit: ElasticHitsResult) -> Document:
    """Convert a YouTube search hit into a ``Document``.

    The page content is the title, the description context and the captions
    context joined by blank lines. The metadata carries the title, a fixed
    source label, the video id and the watch URL built from that id.

    Args:
        hit: Search hit whose ``source`` mapping holds the video fields.

    Returns:
        A ``Document`` built from the hit.

    Raises:
        KeyError: If ``hit.source`` has no ``"video_id"`` entry.
    """
    title = hit.source.get("title", "")

    # we only need to process long texts
    # NOTE(review): `get_context` is defined elsewhere; presumably it returns
    # the relevant passages of the field as a string -- confirm in utils.
    description_cleaned_with_context_txt = get_context("description_cleaned", hit, context_length=12)
    captions_cleaned_with_context_txt = get_context("captions_cleaned", hit, context_length=12)

    video_id = hit.source['video_id']
    return Document(
        page_content='\n\n'.join([
            title,
            description_cleaned_with_context_txt,
            captions_cleaned_with_context_txt,
        ]),
        metadata={
            "title": title,
            "source": "Candid YouTube",
            "source_id": video_id,
            "url": f"https://www.youtube.com/watch?v={video_id}",
        }
    )


def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
    """Render the card text for a YouTube video document.

    The title and cleaned description share a budget of 999 characters,
    consumed in that order. A field that overflows the budget is cut at the
    remaining allowance and suffixed with ``[...]``; a field that starts with
    no allowance left is rendered as ``[...]`` alone. Only the description
    text is subject to truncation in the output: the heading line always
    shows the full title.

    Args:
        doc: Mapping with ``"video_id"`` and, optionally, ``"title"`` and
            ``"description_cleaned"``. Missing or ``None`` values render as
            empty strings.
        height_px: Not referenced by this function as it stands.
        show_chunks: Not referenced by this function as it stands.

    Returns:
        The rendered card string.

    Raises:
        KeyError: If ``doc`` has no ``"video_id"`` entry.
    """
    # NOTE(review): `url` is not referenced by the template below; it is kept
    # so that a missing "video_id" still raises KeyError as before. The
    # templates look like they may have lost their markup -- confirm.
    url = f"https://www.youtube.com/watch?v={doc['video_id']}"

    max_chars = 999
    fields_dict = {}
    fields_len = 0
    for field in ("title", "description_cleaned"):
        value = doc.get(field)
        if value is None:
            fields_dict[field] = ""
            fields_dict[field + "_txt"] = ""
            continue

        fields_dict[field] = value
        remaining = max_chars - fields_len
        if len(value) <= remaining:
            rendered = f"\n{value}\n"
        elif remaining > 0:
            rendered = f"\n{value[:remaining] + '[...]'}\n"
        else:
            # Budget already spent by earlier fields: show only the marker.
            rendered = "[...]"
        fields_dict[field + "_txt"] = rendered
        # The full length counts against the budget, even when truncated.
        fields_len += len(value)

    # Use the normalized title so a missing or None title renders as an empty
    # string instead of raising KeyError or printing "None".
    title = fields_dict["title"]
    description_txt = fields_dict["description_cleaned_txt"]
    html = f"\nCandid Youtube video: {title}\n\n{description_txt}\n"
    return html