brainsqueeze committed
Commit ce98665 · verified · Parent: 2744d22

handle missing/null fields

Files changed (1): ask_candid/utils.py +37 -84
ask_candid/utils.py CHANGED
@@ -1,95 +1,48 @@
-from typing import List, Dict, Union, Any
-from uuid import uuid4
-
-from langchain_core.documents import Document
-
-from ask_candid.retrieval.sources import (
-    candid_blog,
-    candid_help,
-    candid_learning,
-    issuelab,
-    youtube
-)
-
-
-def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
-    height_px = 200
-    html = ""
-
-    if source == "news":
-        # html = news.article_card_html(doc, height_px, show_chunks)
-        pass
-    elif source == "transactions":
-        pass
-    elif source == "organizations":
-        pass
-    elif source == "issuelab":
-        html = issuelab.issuelab_card_html(doc, height_px, show_chunks)
-    elif source == "youtube":
-        html = youtube.build_card_html(doc, 400, show_chunks)
-    elif source == "candid_blog":
-        html = candid_blog.build_card_html(doc, height_px, show_chunks)
-    elif source == "candid_learning":
-        html = candid_learning.build_card_html(doc, height_px, show_chunks)
-    elif source == "candid_help":
-        html = candid_help.build_card_html(doc, height_px, show_chunks)
-    return html
-
-
-def html_format_docs_chat(docs: List[Document]) -> str:
-    """Formats Candid sources
+from ask_candid.retrieval.sources.schema import ElasticHitsResult
+
+
+def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
+    """Pads the relevant chunk of text with context before and after

     Parameters
     ----------
-    docs : List[Document]
-        Retrieved documents for context
+    field_name : str
+        a field with the long text that was chunked into pieces
+    hit : ElasticHitsResult
+    context_length : int, optional
+        length of text to add before and after the chunk, by default 1024

     Returns
     -------
     str
-        Formatted HTML
+        longer chunks stuffed together
     """

-    html = ""
-    if docs:
-        docs_html = []
-        for doc in docs:
-            s_name = doc.metadata.get("source", "Source")
-            s_url = doc.metadata.get("url", "URL")
-
-            s_html = (
-                "<span class='source-item'>"
-                f"<a href='{s_url}' target='_blank' rel='noreferrer' class='ssearch-source'>"
-                f"{doc.metadata['title']} &vert; {s_name}</a></span>"
-            )
-
-            docs_html.append(s_html)
-
-        html = f"<h2>Candid Resources</h2><div id='ssearch-sources'>{'<br>'.join(docs_html)}</div>"
-    return html
-
-
-def format_chat_ag_response(chatbot: List[Any]) -> List[Any]:
-    """If we called retriever, we appended sources as as one more message. Here we concatinate HTML of sources
-    with the AI response
-    Returns:
-        _type_: updated chatbot message as HTML
-    """
-    sources = ""
-    if chatbot:
-        title = (chatbot[-1].get("metadata") or {}).get("title", None)
-        if title == "Sources HTML":
-            sources = chatbot[-1]["content"]
-            chatbot.pop(-1)
-        chatbot[-1]["content"] = chatbot[-1]["content"] + sources
-    return chatbot
-
-
-def valid_inputs(*args) -> bool:
-    return any(a is not None or (isinstance(a, str) and a.strip() != '') for a in args)
-
-
-def get_session_id(thread_id: Union[str, None]) -> str:
-    if not thread_id:
-        thread_id = uuid4().hex
-    return thread_id
+    chunks = []
+    # NOTE chunks have tokens, long text is a normal text, but may contain html that also gets weird after tokenization
+    long_text = hit.source.get(field_name) or ""
+    long_text = long_text.lower()
+
+    inner_hits_field = f"embeddings.{field_name}.chunks"
+    found_chunks = hit.inner_hits.get(inner_hits_field, {})
+    if found_chunks:
+        hits = found_chunks.get("hits", {}).get("hits", [])
+        for h in hits:
+            chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
+
+            # cutting the middle because we may have tokenizing artifacts there
+            chunk = chunk[3: -3]
+
+            if add_context:
+                # Find the start and end indices of the chunk in the large text
+                start_index = long_text.find(chunk[:20])
+
+                # Chunk is found
+                if start_index != -1:
+                    end_index = start_index + len(chunk)
+                    pre_start_index = max(0, start_index - context_length)
+                    post_end_index = min(len(long_text), end_index + context_length)
+                    chunks.append(long_text[pre_start_index:post_end_index])
+            else:
+                chunks.append(chunk)
+    return '\n\n'.join(chunks)
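For reference, a minimal sketch of the null-field behavior this commit guards against, and of calling the new get_context end to end. FakeHit is a hypothetical stand-in for ElasticHitsResult (the real schema lives in ask_candid.retrieval.sources.schema); the inner_hits shape below, and the assumption that get_context only touches the .source and .inner_hits attributes, are both inferred from the lookups in the diff above, and the snippet assumes the ask_candid package is importable.

from dataclasses import dataclass, field
from typing import Any, Dict

from ask_candid.utils import get_context


@dataclass
class FakeHit:
    """Hypothetical stand-in for ElasticHitsResult; only the attributes get_context reads."""
    source: Dict[str, Any] = field(default_factory=dict)
    inner_hits: Dict[str, Any] = field(default_factory=dict)


# The guard itself: Elasticsearch can omit a field entirely or return it as
# an explicit null. `.get(...) or ""` normalizes both to "", so the
# subsequent .lower() no longer raises AttributeError on None.
for h in (FakeHit(source={}), FakeHit(source={"description": None})):
    assert (h.source.get("description") or "").lower() == ""

# End-to-end call with a fabricated hit. The chunk carries 3 throwaway
# characters on each side because get_context trims them via chunk[3:-3].
text = ("intro " * 40) + "the important middle passage" + (" outro" * 40)
hit = FakeHit(
    source={"description": text},
    inner_hits={
        "embeddings.description.chunks": {
            "hits": {"hits": [{
                "fields": {
                    "embeddings.description.chunks": [
                        {"chunk": ["xxxthe important middle passagexxx"]}
                    ]
                }
            }]}
        }
    },
)
print(get_context("description", hit, context_length=64))
# -> the matched chunk plus up to 64 characters of surrounding description

Note that `hit.source.get(field_name) or ""` also covers a field that is present but null, which a plain `hit.source.get(field_name, "")` would miss.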