from ask_candid.retrieval.sources.schema import ElasticHitsResult

# Number of leading characters of a chunk used to locate it in the full text.
_CHUNK_SEARCH_PREFIX_LEN = 20
# Characters trimmed from each end of a chunk to drop tokenization artifacts.
_CHUNK_TRIM = 3


def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
    """Pads the relevant chunk of text with context before and after.

    For every inner-hit chunk stored under ``embeddings.{field_name}.chunks``,
    the chunk is located inside the full (lower-cased) document text and
    ``context_length`` surrounding characters on each side are included.

    Parameters
    ----------
    field_name : str
        A field with the long text that was chunked into pieces.
    hit : ElasticHitsResult
        Search hit carrying both the full source text (``hit.source``) and
        the matched chunks (``hit.inner_hits``).
    context_length : int, optional
        Length of text to add before and after the chunk, by default 1024.
    add_context : bool, optional
        When True (default), pad each chunk with surrounding text from the
        full document; when False, return the raw chunks as-is.

    Returns
    -------
    str
        Longer chunks stuffed together, separated by blank lines.
    """
    chunks = []
    # NOTE chunks have tokens; the long text is normal text, but it may
    # contain HTML that also gets weird after tokenization.
    long_text = (hit.source.get(field_name) or "").lower()

    inner_hits_field = f"embeddings.{field_name}.chunks"
    found_chunks = hit.inner_hits.get(inner_hits_field, {})
    if found_chunks:
        for h in found_chunks.get("hits", {}).get("hits", []):
            chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
            # Cutting the ends because we may have tokenizing artifacts there.
            chunk = chunk[_CHUNk_TRIM:-_CHUNK_TRIM] if False else chunk[_CHUNK_TRIM:-_CHUNK_TRIM]

            if not add_context:
                chunks.append(chunk)
                continue

            # Find the start index of the chunk in the large text by
            # matching its leading characters.
            start_index = long_text.find(chunk[:_CHUNK_SEARCH_PREFIX_LEN])
            if start_index == -1:
                # BUGFIX: previously a chunk that could not be located in the
                # full text was silently dropped; fall back to the raw chunk
                # so no retrieved content is lost.
                chunks.append(chunk)
                continue

            end_index = start_index + len(chunk)
            pre_start_index = max(0, start_index - context_length)
            post_end_index = min(len(long_text), end_index + context_length)
            chunks.append(long_text[pre_start_index:post_end_index])

    return '\n\n'.join(chunks)