from typing import Dict, Any

from langchain_core.documents import Document

from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
from ask_candid.retrieval.sources.utils import get_context

# Source configuration for the YouTube index. The meaning of `text_fields`
# and `excluded_fields` is defined by ElasticSourceConfig (not shown here).
YoutubeConfig = ElasticSourceConfig(
    index_name="search-semantic-youtube-elser_ve1",
    text_fields=("captions_cleaned", "description_cleaned", "title"),
    excluded_fields=("captions", "description", "text_cleaned")
)


def process_youtube_hit(hit: ElasticHitsResult) -> Document:
    """Turn a YouTube search hit into a ``Document``.

    The page content is the title, the description context and the captions
    context, joined by blank lines (in that order). The metadata carries the
    title, a fixed source label, the video id and the watch URL built from it.

    Args:
        hit: Search hit whose ``source`` mapping holds the video fields.

    Returns:
        A ``Document`` with the combined text and the video metadata.

    Raises:
        KeyError: If ``hit.source`` has no ``"video_id"`` entry.
    """
    video_title = hit.source.get("title", "")

    # Only the long text fields go through get_context; the title is used
    # verbatim. NOTE(review): context_length=12 is passed straight to
    # get_context; its unit is defined by that helper -- confirm there.
    description_context = get_context("description_cleaned", hit, context_length=12)
    captions_context = get_context("captions_cleaned", hit, context_length=12)

    sections = (video_title, description_context, captions_context)

    video_id = hit.source['video_id']
    watch_url = f"https://www.youtube.com/watch?v={video_id}"

    return Document(
        page_content='\n\n'.join(sections),
        metadata={
            "title": video_title,
            "source": "Candid YouTube",
            "source_id": video_id,
            "url": watch_url,
        },
    )


# NOTE(review): `build_card_html` follows at this point in the original file.
# Its definition is cut off in the reviewed excerpt, so it is intentionally
# not reproduced here and must be kept from the original source.