brainsqueeze committed on
Commit
ce9dc52
·
verified ·
1 Parent(s): ce98665

updated wrong utils.py

Browse files
Files changed (1) hide show
  1. ask_candid/utils.py +84 -37
ask_candid/utils.py CHANGED
@@ -1,48 +1,95 @@
1
- from ask_candid.retrieval.sources.schema import ElasticHitsResult
 
2
 
 
3
 
4
def _first_chunk_text(inner_hit, inner_hits_field):
    """Return the first chunk string stored on an inner hit, or None if the hit is malformed."""
    try:
        return inner_hit["fields"][inner_hits_field][0]["chunk"][0]
    except (KeyError, IndexError, TypeError):
        return None


def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
    """Pads the relevant chunks of text with context before and after

    Parameters
    ----------
    field_name : str
        a field with the long text that was chunked into pieces
    hit : ElasticHitsResult
        search hit; its ``source`` holds the long text and its ``inner_hits`` hold the matched chunks
    context_length : int, optional
        length of text to add before and after the chunk, by default 1024
    add_context : bool, optional
        when False the matched chunks are returned without any surrounding text, by default True

    Returns
    -------
    str
        longer chunks stuffed together, separated by blank lines
    """

    # NOTE chunks have tokens, long text is a normal text, but may contain html that also gets weird after
    # tokenization; the long text is lower-cased before chunks are searched for in it
    long_text = (hit.source.get(field_name) or "").lower()

    inner_hits_field = f"embeddings.{field_name}.chunks"
    found_chunks = hit.inner_hits.get(inner_hits_field) or {}
    inner_hits = found_chunks.get("hits", {}).get("hits", [])

    pieces = []
    for inner_hit in inner_hits:
        chunk = _first_chunk_text(inner_hit, inner_hits_field)
        if chunk is None:
            # Malformed inner hit: nothing usable to return for it
            continue

        # cutting the edges because we may have tokenizing artifacts there
        chunk = chunk[3:-3]
        if not chunk:
            # An empty needle would "match" at index 0 and pull in unrelated leading text
            continue

        # Locate the chunk in the long text using only its first characters
        start_index = long_text.find(chunk[:20]) if add_context else -1
        if start_index == -1:
            # No context requested, or the chunk cannot be located: keep the bare chunk
            pieces.append(chunk)
            continue

        end_index = start_index + len(chunk)
        pre_start_index = max(0, start_index - context_length)
        post_end_index = min(len(long_text), end_index + context_length)
        pieces.append(long_text[pre_start_index:post_end_index])
    return '\n\n'.join(pieces)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Union, Any
2
+ from uuid import uuid4
3
 
4
+ from langchain_core.documents import Document
5
 
6
+ from ask_candid.retrieval.sources import (
7
+ candid_blog,
8
+ candid_help,
9
+ candid_learning,
10
+ issuelab,
11
+ youtube
12
+ )
13
+
14
+
15
def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
    """Builds the HTML card for a single retrieved document

    Parameters
    ----------
    doc : Dict[str, Any]
        document payload handed to the source-specific card builder
    source : str
        name of the source the document came from; selects the card builder
    show_chunks : bool, optional
        forwarded unchanged to the card builder, by default False

    Returns
    -------
    str
        whatever the selected card builder returns; an empty string for "news", "transactions",
        "organizations" and any unrecognized source, none of which have a builder here
    """
    default_height_px = 200

    if source == "issuelab":
        return issuelab.issuelab_card_html(doc, default_height_px, show_chunks)
    if source == "youtube":
        # YouTube cards are the only ones built with a 400 height instead of the default
        return youtube.build_card_html(doc, 400, show_chunks)
    if source == "candid_blog":
        return candid_blog.build_card_html(doc, default_height_px, show_chunks)
    if source == "candid_learning":
        return candid_learning.build_card_html(doc, default_height_px, show_chunks)
    if source == "candid_help":
        return candid_help.build_card_html(doc, default_height_px, show_chunks)

    # "news", "transactions", "organizations" and unknown sources produce no card
    return ""
37
+
38
+
39
def html_format_docs_chat(docs: List[Document]) -> str:
    """Formats Candid sources as an HTML list of links

    Parameters
    ----------
    docs : List[Document]
        Retrieved documents for context; each document's ``metadata`` is read for the
        "source", "url" and "title" keys

    Returns
    -------
    str
        Formatted HTML, or an empty string when there are no documents
    """
    # Imported here so this function does not rely on a new module-level import
    from html import escape

    if not docs:
        return ""

    links = []
    for doc in docs:
        metadata = doc.metadata
        # Metadata values are interpolated into markup, so escape them; the href is single-quoted,
        # which makes quote escaping necessary to keep the attribute intact
        source_name = escape(str(metadata.get("source", "Source")))
        source_url = escape(str(metadata.get("url", "URL")), quote=True)
        # A missing title falls back to a placeholder, like the other two fields, instead of raising
        title = escape(str(metadata.get("title", "Untitled")))

        links.append(
            "<span class='source-item'>"
            f"<a href='{source_url}' target='_blank' rel='noreferrer' class='ssearch-source'>"
            f"{title} &vert; {source_name}</a></span>"
        )

    return f"<h2>Candid Resources</h2><div id='ssearch-sources'>{'<br>'.join(links)}</div>"
70
+
71
+
72
def format_chat_ag_response(chatbot: List[Any]) -> List[Any]:
    """Merges a trailing sources message into the AI response that precedes it

    If the retriever was called, the sources were appended as one more message whose metadata title
    is "Sources HTML". Here that message is removed and its HTML is concatenated onto the content of
    the message before it.

    Parameters
    ----------
    chatbot : List[Any]
        chat history as a list of dict-like messages with "content" and optional "metadata" keys;
        modified in place

    Returns
    -------
    List[Any]
        the same list object, with the sources message folded into the previous message when present
    """
    # Merging needs both the sources message and an earlier message to receive it; with fewer than
    # two messages the history is returned untouched rather than popping and indexing an empty list
    if not chatbot or len(chatbot) < 2:
        return chatbot

    title = (chatbot[-1].get("metadata") or {}).get("title")
    if title == "Sources HTML":
        sources = chatbot.pop()["content"]
        chatbot[-1]["content"] = chatbot[-1]["content"] + sources
    return chatbot
86
+
87
+
88
def valid_inputs(*args) -> bool:
    """Checks whether at least one of the given inputs carries a value

    An input counts as valid when it is not None and, if it is a string, contains something other
    than whitespace. Non-string values only need to be non-None.

    Parameters
    ----------
    *args
        candidate input values

    Returns
    -------
    bool
        True if any argument is valid, False otherwise (including when no arguments are given)
    """
    # The two conditions must both hold; joining them with `or` made the blank-string check dead code
    return any(
        a is not None and (not isinstance(a, str) or a.strip() != '')
        for a in args
    )
90
+
91
+
92
def get_session_id(thread_id: Union[str, None]) -> str:
    """Returns the given thread identifier, generating a new one when none is supplied

    Parameters
    ----------
    thread_id : Union[str, None]
        existing identifier; None or an empty string triggers generation

    Returns
    -------
    str
        the supplied identifier, or the hex form of a freshly generated UUID4
    """
    return thread_id or uuid4().hex