brainsqueeze committed
Commit ce98665 · verified · Parent: 2744d22

handle missing/null fields

Files changed (1): ask_candid/utils.py +37 -84
ask_candid/utils.py CHANGED
@@ -1,95 +1,48 @@
-from typing import List, Dict, Union, Any
-from uuid import uuid4
-
-from langchain_core.documents import Document
-
-from ask_candid.retrieval.sources import (
-    candid_blog,
-    candid_help,
-    candid_learning,
-    issuelab,
-    youtube
-)
-
-
-def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
-    height_px = 200
-    html = ""
-
-    if source == "news":
-        # html = news.article_card_html(doc, height_px, show_chunks)
-        pass
-    elif source == "transactions":
-        pass
-    elif source == "organizations":
-        pass
-    elif source == "issuelab":
-        html = issuelab.issuelab_card_html(doc, height_px, show_chunks)
-    elif source == "youtube":
-        html = youtube.build_card_html(doc, 400, show_chunks)
-    elif source == "candid_blog":
-        html = candid_blog.build_card_html(doc, height_px, show_chunks)
-    elif source == "candid_learning":
-        html = candid_learning.build_card_html(doc, height_px, show_chunks)
-    elif source == "candid_help":
-        html = candid_help.build_card_html(doc, height_px, show_chunks)
-    return html
-
-
-def html_format_docs_chat(docs: List[Document]) -> str:
-    """Formats Candid sources
+from ask_candid.retrieval.sources.schema import ElasticHitsResult
+
+
+def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
+    """Pads the relevant chunk of text with context before and after

     Parameters
     ----------
-    docs : List[Document]
-        Retrieved documents for context
+    field_name : str
+        a field with the long text that was chunked into pieces
+    hit : ElasticHitsResult
+    context_length : int, optional
+        length of text to add before and after the chunk, by default 1024

     Returns
     -------
     str
-        Formatted HTML
+        longer chunks stuffed together
     """

-    html = ""
-    if docs:
-        docs_html = []
-        for doc in docs:
-            s_name = doc.metadata.get("source", "Source")
-            s_url = doc.metadata.get("url", "URL")
-
-            s_html = (
-                "<span class='source-item'>"
-                f"<a href='{s_url}' target='_blank' rel='noreferrer' class='ssearch-source'>"
-                f"{doc.metadata['title']} &vert; {s_name}</a></span>"
-            )
-
-            docs_html.append(s_html)
-
-        html = f"<h2>Candid Resources</h2><div id='ssearch-sources'>{'<br>'.join(docs_html)}</div>"
-    return html
-
-
-def format_chat_ag_response(chatbot: List[Any]) -> List[Any]:
-    """If we called retriever, we appended sources as as one more message. Here we concatinate HTML of sources
-    with the AI response
-    Returns:
-        _type_: updated chatbot message as HTML
-    """
-    sources = ""
-    if chatbot:
-        title = (chatbot[-1].get("metadata") or {}).get("title", None)
-        if title == "Sources HTML":
-            sources = chatbot[-1]["content"]
-            chatbot.pop(-1)
-        chatbot[-1]["content"] = chatbot[-1]["content"] + sources
-    return chatbot
-
-
-def valid_inputs(*args) -> bool:
-    return any(a is not None or (isinstance(a, str) and a.strip() != '') for a in args)
-
-
-def get_session_id(thread_id: Union[str, None]) -> str:
-    if not thread_id:
-        thread_id = uuid4().hex
-    return thread_id
+    chunks = []
+    # NOTE chunks have tokens, long text is a normal text, but may contain html that also gets weird after tokenization
+    long_text = hit.source.get(field_name) or ""
+    long_text = long_text.lower()
+
+    inner_hits_field = f"embeddings.{field_name}.chunks"
+    found_chunks = hit.inner_hits.get(inner_hits_field, {})
+    if found_chunks:
+        hits = found_chunks.get("hits", {}).get("hits", [])
+        for h in hits:
+            chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
+
+            # cutting the middle because we may have tokenizing artifacts there
+            chunk = chunk[3: -3]
+
+            if add_context:
+                # Find the start and end indices of the chunk in the large text
+                start_index = long_text.find(chunk[:20])
+
+                # Chunk is found
+                if start_index != -1:
+                    end_index = start_index + len(chunk)
+                    pre_start_index = max(0, start_index - context_length)
+                    post_end_index = min(len(long_text), end_index + context_length)
+                    chunks.append(long_text[pre_start_index:post_end_index])
+            else:
+                chunks.append(chunk)
+    return '\n\n'.join(chunks)
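For reference, a minimal sketch of the null-field behavior this commit guards against, and of calling the new get_context end to end. FakeHit is a hypothetical stand-in for ElasticHitsResult (the real schema lives in ask_candid.retrieval.sources.schema); the inner_hits shape below, and the assumption that get_context only touches the .source and .inner_hits attributes, are both inferred from the lookups in the diff above, and the snippet assumes the ask_candid package is importable.

from dataclasses import dataclass, field
from typing import Any, Dict

from ask_candid.utils import get_context


@dataclass
class FakeHit:
    """Hypothetical stand-in for ElasticHitsResult; only the attributes get_context reads."""
    source: Dict[str, Any] = field(default_factory=dict)
    inner_hits: Dict[str, Any] = field(default_factory=dict)


# The guard itself: Elasticsearch can omit a field entirely or return it as
# an explicit null. `.get(...) or ""` normalizes both to "", so the
# subsequent .lower() no longer raises AttributeError on None.
for h in (FakeHit(source={}), FakeHit(source={"description": None})):
    assert (h.source.get("description") or "").lower() == ""

# End-to-end call with a fabricated hit. The chunk carries 3 throwaway
# characters on each side because get_context trims them via chunk[3:-3].
text = ("intro " * 40) + "the important middle passage" + (" outro" * 40)
hit = FakeHit(
    source={"description": text},
    inner_hits={
        "embeddings.description.chunks": {
            "hits": {"hits": [{
                "fields": {
                    "embeddings.description.chunks": [
                        {"chunk": ["xxxthe important middle passagexxx"]}
                    ]
                }
            }]}
        }
    },
)
print(get_context("description", hit, context_length=64))
# -> the matched chunk plus up to 64 characters of surrounding description

Note that `hit.source.get(field_name) or ""` also covers a field that is present but null, which a plain `hit.source.get(field_name, "")` would miss.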