brainsqueeze committed
Commit 92feab2 · verified · 1 Parent(s): 11b563b

Initial commit

LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Candid
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,14 +1,13 @@
1
  ---
2
  title: Ask Candid
3
- emoji: 🚀
4
- colorFrom: gray
5
- colorTo: yellow
 
6
  sdk: gradio
7
- sdk_version: 5.6.0
8
  app_file: app.py
9
- pinned: false
10
  license: mit
11
- short_description: AI assistant for philanthropy and the social sector
12
  ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Ask Candid
3
+ short_description: AI assistant for philanthropy and the social sector
4
+ emoji: 💬
5
+ colorFrom: blue
6
+ colorTo: purple
7
  sdk: gradio
8
+ sdk_version: 5.5.0
9
  app_file: app.py
10
+ pinned: true
11
  license: mit
 
12
  ---
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py ADDED
@@ -0,0 +1,109 @@
1
+ from typing import List, Tuple, Dict, TypedDict, Optional, Any
2
+ import os
3
+
4
+ import gradio as gr
5
+
6
+ from langchain_openai.chat_models import ChatOpenAI
7
+
8
+ try:
9
+ from utils import format_chat_ag_response
10
+ from retrieval.config import ALL_INDICES
11
+ from chat import run_chat
12
+ except ImportError:
13
+ from .utils import format_chat_ag_response
14
+ from .retrieval.config import ALL_INDICES
15
+ from .chat import run_chat
16
+
17
+ ROOT = os.path.dirname(os.path.abspath(__file__))
18
+
19
+
20
+ class LoggedComponents(TypedDict):
21
+ context: List[gr.components.Component]
22
+ found_helpful: gr.components.Component
23
+ will_recommend: gr.components.Component
24
+ comments: gr.components.Component
25
+ email: gr.components.Component
26
+
27
+
28
+ def execute(
29
+ thread_id: str,
30
+ user_input: Dict[str, Any],
31
+ chatbot: List[Dict],
32
+ max_new_tokens: int,
33
+ indices: Optional[List[str]] = None,
34
+ ):
35
+ llm = ChatOpenAI(
36
+ model_name="gpt-4o",
37
+ max_tokens=max_new_tokens,
38
+ api_key=os.getenv("OPENAI_API_KEY"),
39
+ temperature=0.0,
40
+ streaming=True
41
+ )
42
+
43
+ return run_chat(
44
+ thread_id=thread_id,
45
+ user_input=user_input,
46
+ chatbot=chatbot,
47
+ llm=llm,
48
+ indices=indices
49
+ )
50
+
51
+
52
+ def build_chat() -> Tuple[LoggedComponents, gr.Blocks]:
53
+ with gr.Blocks(theme=gr.themes.Soft(), title="Ask Candid") as demo:
54
+ with gr.Accordion(label="Advanced settings", open=False):
55
+ es_indices = gr.CheckboxGroup(
56
+ choices=list(ALL_INDICES),
57
+ value=list(ALL_INDICES),
58
+ label="Sources to include",
59
+ interactive=True
60
+ )
61
+ max_new_tokens = gr.Slider(
62
+ value=256 * 3, minimum=128, maximum=2048, step=128,
63
+ label="Max new tokens", interactive=True
64
+ )
65
+
66
+ with gr.Column():
67
+ chatbot = gr.Chatbot(
68
+ label="Candid Assistant",
69
+ elem_id="chatbot",
70
+ bubble_full_width=False,
71
+ avatar_images=(
72
+ None,
73
+ os.path.join(ROOT, "static", "candid_logo_yellow.png")
74
+ ),
75
+ height="45vh",
76
+ type="messages",
77
+ show_label=False,
78
+ show_copy_button=True,
79
+ show_share_button=True,
80
+ show_copy_all_button=True
81
+ )
82
+ msg = gr.MultimodalTextbox(label="Your message", interactive=True)
83
+ thread_id = gr.Text(visible=False, value="", label="thread_id")
84
+ gr.ClearButton(components=[msg, chatbot, thread_id], size="sm")
85
+
86
+ # pylint: disable=no-member
87
+ chat_msg = msg.submit(
88
+ fn=execute,
89
+ inputs=[thread_id, msg, chatbot, max_new_tokens, es_indices],
90
+ outputs=[msg, chatbot, thread_id]
91
+ )
92
+ chat_msg.then(format_chat_ag_response, chatbot, chatbot, api_name="bot_response")
93
+ logged = LoggedComponents(
94
+ context=[thread_id, chatbot]
95
+ )
96
+ return logged, demo
97
+
98
+
99
+ if __name__ == '__main__':
100
+ _, app = build_chat()
101
+ app.queue(max_size=5).launch(
102
+ show_api=False,
103
+ auth=[
104
+ (os.getenv("APP_USERNAME"), os.getenv("APP_PASSWORD")),
105
+ (os.getenv("APP_PUBLIC_USERNAME"), os.getenv("APP_PUBLIC_PASSWORD")),
106
+ ],
107
+ auth_message="Login to Candid's AI assistant",
108
+ ssr_mode=False
109
+ )
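
A minimal sketch of exercising the Gradio wiring above from a local checkout, assuming the sibling modules app.py imports (utils, retrieval, chat) are importable and the packages in requirements.txt are installed; build_chat only constructs the UI, so no OpenAI or Elasticsearch calls happen until a message is actually submitted.

# smoke_test_app.py -- illustrative sketch, not part of this commit
from app import build_chat

logged, demo = build_chat()
print(type(demo))            # <class 'gradio.blocks.Blocks'>
print(list(logged.keys()))   # ['context']

# For local testing, launch without the production auth tuples, e.g.:
# demo.queue(max_size=5).launch(show_api=False)
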
chat.py ADDED
@@ -0,0 +1,252 @@
1
+ from typing import List, Optional, Dict, Any, TypedDict, Annotated, Sequence
2
+ from functools import partial
3
+ import os
4
+
5
+ import gradio as gr
6
+
7
+ from langchain_core.messages import AIMessage, BaseMessage
8
+ from langchain_core.output_parsers import StrOutputParser
9
+ from langchain_core.prompts import ChatPromptTemplate
10
+ from langchain_core.language_models.llms import LLM
11
+
12
+ from langgraph.prebuilt import tools_condition, ToolNode
13
+ from langgraph.checkpoint.memory import MemorySaver
14
+ from langgraph.graph.state import StateGraph
15
+ from langgraph.graph.message import add_messages
16
+ from langgraph.constants import START, END
17
+
18
+ try:
19
+ from utils import html_format_docs_chat, get_session_id
20
+ from tools.question_reformulation import reformulate_question_using_history
21
+ from tools.org_seach import (
22
+ extract_org_links_from_chatbot,
23
+ embed_org_links_in_text,
24
+ generate_org_link_dict,
25
+ )
26
+ from retrieval.elastic import retriever_tool
27
+ except ImportError:
28
+ from .utils import html_format_docs_chat, get_session_id
29
+ from .tools.question_reformulation import reformulate_question_using_history
30
+ from .tools.org_seach import (
31
+ extract_org_links_from_chatbot,
32
+ embed_org_links_in_text,
33
+ generate_org_link_dict,
34
+ )
35
+ from .retrieval.elastic import retriever_tool
36
+
37
+ ROOT = os.path.dirname(os.path.abspath(__file__))
38
+
39
+ # TODO https://www.metadocs.co/2024/08/29/simple-domain-specific-corrective-rag-with-langchain-and-langgraph/
40
+
41
+
42
+ class AgentState(TypedDict):
43
+ # The add_messages function defines how an update should be processed
44
+ # Default is to replace. add_messages says "append"
45
+ messages: Annotated[Sequence[BaseMessage], add_messages]
46
+ user_input: str
47
+ org_dict: Dict
48
+
49
+
50
+ def search_agent(state, llm: LLM, tools) -> AgentState:
51
+ """Invokes the agent model to generate a response based on the current state. Given
52
+ the question, it will decide to retrieve using the retriever tool, or simply end.
53
+
54
+ Parameters
55
+ ----------
56
+ state : AgentState
57
+ The current state
58
+ llm : LLM
59
+ tools : list
60
+ Tools the agent may call (here, the Candid Elasticsearch retriever tool)
61
+
62
+ Returns
63
+ -------
64
+ AgentState
65
+ The updated state with the agent response appended to messages
66
+ """
67
+
68
+ print("---SEARCH AGENT---")
69
+ messages = state["messages"]
70
+ question = messages[-1].content
71
+
72
+ model = llm.bind_tools(tools)
73
+ response = model.invoke(messages)
74
+ # return a list, because this will get added to the existing list
75
+ return {"messages": [response], "user_input": question}
76
+
77
+
78
+ def generate_with_context(state, llm: LLM) -> AgentState:
79
+ """Generate answer.
80
+
81
+ Parameters
82
+ ----------
83
+ state : AgentState
84
+ The current state
85
+ llm : LLM
86
88
+
89
+ Returns
90
+ -------
91
+ AgentState
92
+ The updated state with the agent response appended to messages
93
+ """
94
+
95
+ print("---GENERATE ANSWER---")
96
+ messages = state["messages"]
97
+ question = state["user_input"]
98
+ last_message = messages[-1]
99
+
100
+ sources_str = last_message.content
101
+ sources_list = last_message.artifact # cannot use directly as list of Documents
102
+ # converting to html string
103
+ sources_html = html_format_docs_chat(sources_list)
104
+ if sources_list:
105
+ print("---ADD SOURCES---")
106
+ state["messages"].append(BaseMessage(content=sources_html, type="HTML"))
107
+
108
+ # Prompt
109
+ qa_system_prompt = """
110
+ You are an assistant for question-answering tasks in the social and philanthropic sector. \n
111
+ Use the following pieces of retrieved context to answer the question at the end. \n
112
+ If you don't know the answer, just say that you don't know. \n
113
+ Keep the response professional, friendly, and as concise as possible. \n
114
+ Question: {question}
115
+ Context: {context}
116
+ Answer:
117
+ """
118
+
119
+ qa_prompt = ChatPromptTemplate(
120
+ [
121
+ ("system", qa_system_prompt),
122
+ ("human", question),
123
+ ]
124
+ )
125
+
126
+ rag_chain = qa_prompt | llm | StrOutputParser()
127
+ response = rag_chain.invoke({"context": sources_str, "question": question})
128
+ # couldn't figure out why returning usual "response" was seen as HumanMessage
129
+ return {"messages": [AIMessage(content=response)], "user_input": question}
130
+
131
+
132
+ def has_org_name(state: AgentState) -> AgentState:
133
+ """
134
+ Processes the latest message to extract organization links and determine the next step.
135
+
136
+ Args:
137
+ state (AgentState): The current state of the agent, including a list of messages.
138
+
139
+ Returns:
140
+ dict: A dictionary with the next agent action and, if available, a dictionary of organization links.
141
+ """
142
+ print("---HAS ORG NAMES?---")
143
+ messages = state["messages"]
144
+ last_message = messages[-1].content
145
+ output_list = extract_org_links_from_chatbot(last_message)
146
+ link_dict = generate_org_link_dict(output_list) if output_list else {}
147
+ if link_dict:
148
+ print("---FOUND ORG NAMES---")
149
+ return {"next": "insert_org_link", "org_dict": link_dict}
150
+ print("---NO ORG NAMES FOUND---")
151
+ return {"next": END, "messages": messages}
152
+
153
+
154
+ def insert_org_link(state: AgentState) -> AgentState:
155
+ """
156
+ Embeds organization links in the latest message content and returns it as an AI message.
157
+
158
+ Args:
159
+ state (dict): The current state, including the organization links and latest message.
160
+
161
+ Returns:
162
+ dict: A dictionary with the updated message content as an AIMessage.
163
+ """
164
+ print("---INSERT ORG LINKS---")
165
+ messages = state["messages"]
166
+ last_message = messages[-1].content
167
+ messages.pop(-1) # Deleting the original message because we will append the same one but with links
168
+ link_dict = state["org_dict"]
169
+ last_message = embed_org_links_in_text(last_message, link_dict)
170
+ return {"messages": [AIMessage(content=last_message)]}
171
+
172
+
173
+ def build_compute_graph(llm: LLM, indices: List[str]) -> StateGraph:
174
+ candid_retriever_tool = retriever_tool(indices=indices)
175
+ retrieve = ToolNode([candid_retriever_tool])
176
+ tools = [candid_retriever_tool]
177
+
178
+ G = StateGraph(AgentState)
179
+ # Add nodes
180
+ G.add_node("reformulate", partial(reformulate_question_using_history, llm=llm))
181
+ G.add_node("search_agent", partial(search_agent, llm=llm, tools=tools))
182
+ G.add_node("retrieve", retrieve)
183
+ G.add_node("generate_with_context", partial(generate_with_context, llm=llm))
184
+ G.add_node("has_org_name", has_org_name)
185
+ G.add_node("insert_org_link", insert_org_link)
186
+
187
+ # Add edges
188
+ G.add_edge(START, "reformulate")
189
+ G.add_edge("reformulate", "search_agent")
190
+ # Conditional edges from search_agent
191
+ G.add_conditional_edges(
192
+ source="search_agent",
193
+ path=tools_condition, # TODO just a conditional edge here?
194
+ path_map={
195
+ "tools": "retrieve",
196
+ "__end__": "has_org_name",
197
+ },
198
+ )
199
+ G.add_edge("retrieve", "generate_with_context")
200
+
201
+ # Add edges
202
+ G.add_edge("generate_with_context", "has_org_name")
203
+ # Use add_conditional_edges for has_org_name
204
+ G.add_conditional_edges(
205
+ "has_org_name",
206
+ lambda x: x["next"], # Now we're accessing the 'next' key from the dict
207
+ {"insert_org_link": "insert_org_link", END: END},
208
+ )
209
+ G.add_edge("insert_org_link", END)
210
+
211
+ return G
212
+
213
+
214
+ def run_chat(
215
+ thread_id: str,
216
+ user_input: Dict[str, Any],
217
+ chatbot: List[Dict],
218
+ llm: LLM,
219
+ indices: Optional[List[str]] = None,
220
+ ):
221
+ # https://langchain-ai.github.io/langgraph/tutorials/rag/langgraph_agentic_rag/#graph
222
+
223
+ chatbot.append({"role": "user", "content": user_input["text"]})
224
+ inputs = {"messages": chatbot}
225
+ # thread_id can be an email https://github.com/yurisasc/memory-enhanced-ai-assistant/blob/main/assistant.py
226
+ thread_id = get_session_id(thread_id)
227
+ config = {"configurable": {"thread_id": thread_id}}
228
+
229
+ workflow = build_compute_graph(llm=llm, indices=indices)
230
+
231
+ memory = MemorySaver() # TODO: don't use for Prod
232
+ graph = workflow.compile(checkpointer=memory)
233
+ response = graph.invoke(inputs, config=config)
234
+ messages = response["messages"]
235
+ last_message = messages[-1]
236
+ ai_answer = last_message.content
237
+ sources_html = ""
238
+ for message in messages[-2:]:
239
+ if message.type == "HTML":
240
+ sources_html = message.content
241
+
242
+ chatbot.append({"role": "assistant", "content": ai_answer})
243
+ if sources_html:
244
+ chatbot.append(
245
+ {
246
+ "role": "assistant",
247
+ "content": sources_html,
248
+ "metadata": {"title": "Sources HTML"},
249
+ }
250
+ )
251
+
252
+ return gr.MultimodalTextbox(value=None, interactive=True), chatbot, thread_id
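
A sketch of driving run_chat directly, outside the Gradio callback, assuming OPENAI_API_KEY and the Elasticsearch credentials read in retrieval/config.py are set and the repository modules are importable; the user_input payload mirrors the MultimodalTextbox value that app.py passes in.

import os
from langchain_openai.chat_models import ChatOpenAI
from chat import run_chat

llm = ChatOpenAI(
    model_name="gpt-4o",
    max_tokens=768,
    temperature=0.0,
    api_key=os.getenv("OPENAI_API_KEY"),
)

history = []  # Gradio "messages" format: [{"role": ..., "content": ...}, ...]
_, history, thread_id = run_chat(
    thread_id="",                                # resolved by get_session_id() in utils.py (not shown in this diff)
    user_input={"text": "What is a Form 990?"},  # shape of the MultimodalTextbox payload
    chatbot=history,
    llm=llm,
    indices=["candid_help", "candid_learning"],
)

for message in history:
    print(message["role"], ":", str(message["content"])[:120])
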
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ elasticsearch==7.17.6
2
+ gradio
3
+ langchain
4
+ langchain-openai
5
+ langgraph
6
+ pydantic
7
+ fuzzywuzzy
retrieval/__init__.py ADDED
File without changes
retrieval/candid_blog.py ADDED
@@ -0,0 +1,70 @@
1
+ from typing import Dict, Tuple, Any
2
+
3
+
4
+ def build_wp_candid_blog_knn_model_query(
5
+ query: str,
6
+ fields: Tuple[str] = (
7
+ "content",
8
+ "title"
9
+ ),
10
+ k: int = 10,
11
+ model_id: str = "sentence-transformers__all-mpnet-base-v2"
12
+ ):
13
+ output = []
14
+
15
+ for f in fields:
16
+ output.append({
17
+ "field": f"embeddings.{f}.chunks.vector.predicted_value",
18
+ "k": k,
19
+ "num_candidates": 100,
20
+ "query_vector_builder": {
21
+ "text_embedding": {
22
+ "model_id": model_id,
23
+ "model_text": query
24
+ }
25
+ },
26
+ "boost": 1 / len(fields)
27
+ })
28
+ return {"knn": output}
29
+
30
+
31
+ def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
32
+ url = f"{doc['link']}"
33
+ fields = ["title", "excerpt"]
34
+
35
+ fields_dict = {}
36
+ fields_len = 0
37
+ for field in fields:
38
+ if doc.get(field, None) is not None:
39
+ fields_dict[field] = doc[field]
40
+ fields_dict[field + "_txt"] = f"<div>{doc[field]}</div>"
41
+
42
+ if (fields_len + len(doc[field])) > 999:
43
+ rest_text_len = 999 - fields_len
44
+ if rest_text_len > 0:
45
+ fields_dict[field + "_txt"] = f"<div>{doc[field][:rest_text_len] + '[...]'}</div>"
46
+ else: fields_dict[field + "_txt"] = f"<span>{'[...]'}</span>"
47
+ fields_len = fields_len + len(doc[field])
48
+ else:
49
+ fields_dict[field] = ""
50
+ fields_dict[field + "_txt"] = ""
51
+ html = f"""
52
+ <div style='height: {height_px}px; padding: 5px;'>
53
+ <div style='height: {height_px}px; border: 1px solid #febe10;'>
54
+ <span style='padding-left: 10px; display: inline-block; width: 100%;'>
55
+ <div>
56
+ <span>
57
+ <b>Candid blog post:</b>
58
+ <a href='{url}' target='_blank' style='text-decoration: none;'>
59
+ {doc['title']}
60
+ </a>
61
+ </span>
62
+ <br>
63
+ <br>
64
+ {fields_dict["excerpt_txt"]}
65
+ </div>
66
+ </span>
67
+ </div>
68
+ </div>
69
+ """
70
+ return html
retrieval/candid_help.py ADDED
@@ -0,0 +1,41 @@
1
+ from typing import Dict, Any
2
+
3
+
4
+ def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
5
+ url = f"{doc['link']}"
6
+ fields = ["title", "summary"]
7
+
8
+ fields_dict = {}
9
+ fields_len = 0
10
+ for field in fields:
11
+ if doc.get(field, None) is not None:
12
+ fields_dict[field] = doc[field]
13
+ fields_dict[field + "_txt"] = f"<div>{doc[field]}</div>"
14
+
15
+ if (fields_len + len(doc[field])) > 999:
16
+ rest_text_len = 999 - fields_len
17
+ if rest_text_len > 0:
18
+ fields_dict[field + "_txt"] = f"<div>{doc[field][:rest_text_len] + '[...]'}</div>"
19
+ else: fields_dict[field + "_txt"] = f"<span>{'[...]'}</span>"
20
+ fields_len = fields_len + len(doc[field])
21
+ else:
22
+ fields_dict[field] = ""
23
+ fields_dict[field + "_txt"] = ""
24
+ html = f"""
25
+ <div style='height: {height_px}px; padding: 5px;'>
26
+ <div style='height: {height_px}px; border: 1px solid #febe10;'>
27
+ <span style='padding-left: 10px; display: inline-block; width: 100%;'>
28
+ <div>
29
+ <span>
30
+ <b>Candid help article:</b>
31
+ <a href='{url}' target='_blank' style='text-decoration: none;'>
32
+ {doc['title']}
33
+ </a>
34
+ </span>
35
+ <br>
36
+ </div>
37
+ </span>
38
+ </div>
39
+ </div>
40
+ """
41
+ return html
retrieval/candid_learning.py ADDED
@@ -0,0 +1,41 @@
1
+ from typing import Dict, Any
2
+
3
+
4
+ def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
5
+ url = f"{doc['url']}"
6
+ fields = ["title", "excerpt"]
7
+
8
+ fields_dict = {}
9
+ fields_len = 0
10
+ for field in fields:
11
+ if doc.get(field, None) is not None:
12
+ fields_dict[field] = doc[field]
13
+ fields_dict[field + "_txt"] = f"<div>{doc[field]}</div>"
14
+
15
+ if (fields_len + len(doc[field])) > 999:
16
+ rest_text_len = 999 - fields_len
17
+ if rest_text_len > 0:
18
+ fields_dict[field + "_txt"] = f"<div>{doc[field][:rest_text_len] + '[...]'}</div>"
19
+ else: fields_dict[field + "_txt"] = f"<span>{'[...]'}</span>"
20
+ fields_len = fields_len + len(doc[field])
21
+ else:
22
+ fields_dict[field] = ""
23
+ fields_dict[field + "_txt"] = ""
24
+ html = f"""
25
+ <div style='height: {height_px}px; padding: 5px;'>
26
+ <div style='height: {height_px}px; border: 1px solid #febe10;'>
27
+ <span style='padding-left: 10px; display: inline-block; width: 100%;'>
28
+ <div>
29
+ <span>
30
+ <b>Candid Learning resource:</b>
31
+ <a href='{url}' target='_blank' style='text-decoration: none;'>
32
+ {doc['title']}
33
+ </a>
34
+ </span>
35
+ <br>
36
+ </div>
37
+ </span>
38
+ </div>
39
+ </div>
40
+ """
41
+ return html
retrieval/config.py ADDED
@@ -0,0 +1,31 @@
1
+ import os
2
+
3
+ ONECANDID_QA = {
4
+ 'ES_URL': os.getenv('ONECANDID_URL_QA'),
5
+ 'ES_PWD': os.getenv('ONECANDID_PWD_QA'),
6
+ 'ES_UID': os.getenv('ONECANDID_UID_QA'),
7
+ "ES_CLOUD_ID": os.getenv("ONECANDID_CLOUD_ID"),
8
+ "ES_API_KEY": os.getenv("ONECANDID_API_KEY")
9
+ }
10
+
11
+ class Indices:
12
+ # NEWS_INDEX = "news-semantic-search-v1"
13
+ # TRANSACTION_INDEX = "search-semantic-cds-transactions_v1"
14
+ # ORGANIZATION_INDEX = "search-semantic-up-organizations_v2"
15
+ ISSUELAB_INDEX = "search-semantic-issuelab_v1"
16
+ ISSUELAB_INDEX_ELSER = "search-semantic-issuelab-elser_ve2"
17
+ YOUTUBE_INDEX = "search-semantic-youtube_v1"
18
+ YOUTUBE_INDEX_ELSER = "search-semantic-youtube-elser_ve1"
19
+ CANDID_BLOG_INDEX = "search-semantic-candid-blog_v1"
20
+ CANDID_BLOG_INDEX_ELSER = "search-semantic-candid-blog-elser_ve2"
21
+ CANDID_LEARNING_INDEX_ELSER = "search-semantic-candid-learning_ve1"
22
+ CANDID_HELP_INDEX_ELSER = "search-semantic-candid-help-elser_ve1"
23
+
24
+
25
+ ALL_INDICES = (
26
+ "issuelab",
27
+ "youtube",
28
+ "candid_blog",
29
+ "candid_learning",
30
+ "candid_help"
31
+ )
retrieval/elastic.py ADDED
@@ -0,0 +1,344 @@
1
+ from typing import List, Tuple, Dict, Iterable, Iterator, Optional, Any
2
+ from dataclasses import dataclass
3
+ from functools import partial
4
+ from itertools import groupby
5
+
6
+ from pydantic import BaseModel, Field
7
+ from langchain_core.documents import Document
8
+ from langchain_core.tools import Tool
9
+
10
+ from elasticsearch import Elasticsearch
11
+
12
+ try:
13
+ # from news import build_knn_query as news_query
14
+ # from up_orgs import build_organizations_knn_model_query as org_query
15
+ # from cds import build_transactions_knn_query as transactions_query
16
+ from config import ONECANDID_QA, ALL_INDICES, Indices
17
+ except ImportError:
18
+ # from .news import build_knn_query as news_query
19
+ # from .up_orgs import build_organizations_knn_model_query as org_query
20
+ # from .cds import build_transactions_knn_query as transactions_query
21
+ from .config import ONECANDID_QA, ALL_INDICES, Indices
22
+
23
+
24
+ @dataclass
25
+ class ElasticHitsResult:
26
+ """Dataclass for Elasticsearch hits results
27
+ """
28
+ index: str
29
+ id: Any
30
+ score: float
31
+ source: Dict[str, Any]
32
+ inner_hits: Dict[str, Any]
33
+
34
+
35
+ class RetrieverInput(BaseModel):
36
+ """Input to the Elasticsearch retriever."""
37
+ user_input: str = Field(description="query to look up in retriever")
38
+
39
+
40
+ def build_text_expansion_query(
41
+ query: str,
42
+ fields: Tuple[str],
43
+ model_id: str = ".elser_model_2_linux-x86_64"
44
+ ) -> Dict[str, Any]:
45
+
46
+ output = []
47
+
48
+ for f in fields:
49
+ output.append({
50
+ "nested": {
51
+ "path": f"embeddings.{f}.chunks",
52
+ "query": {
53
+ "text_expansion": {
54
+ f"embeddings.{f}.chunks.vector": {
55
+ "model_id": model_id,
56
+ "model_text": query,
57
+ "boost": 1 / len(fields)
58
+ }
59
+ }
60
+ },
61
+ "inner_hits": {
62
+ "_source": False,
63
+ "size": 2,
64
+ "fields": [f"embeddings.{f}.chunks.chunk"]
65
+ }
66
+ }
67
+ })
68
+ return {"query": {"bool": {"should": output}}}
69
+
70
+
71
+ def query_builder(query: str, indices: List[str], **kwargs):
72
+ queries = []
73
+ if indices is None:
74
+ indices = list(ALL_INDICES)
75
+
76
+ for index in indices:
77
+ if index == "news":
78
+ # q = news_query(query)
79
+ # q["_source"] = {"excludes": ["embeddings"]}
80
+ # q["size"] = 5
81
+ # queries.extend([{"index": Indices.NEWS_INDEX}, q])
82
+ pass
83
+ elif index == "organizations":
84
+ # q = org_query(query)
85
+ # q["_source"] = {"excludes": ["embeddings"]}
86
+ # q["size"] = 10
87
+ # queries.extend([{"index": Indices.ORGANIZATION_INDEX}, q])
88
+ pass
89
+ elif index == "grants":
90
+ # q = transactions_query(query)
91
+ # q["_source"] = {"excludes": ["embeddings"]}
92
+ # q["size"] = 10
93
+ # queries.extend([{"index": Indices.TRANSACTION_INDEX}, q])
94
+ pass
95
+ elif index == "issuelab":
96
+ q = build_text_expansion_query(
97
+ query=query,
98
+ fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
99
+ )
100
+ q["_source"] = {"excludes": ["embeddings"]}
101
+ q["size"] = 1
102
+ queries.extend([{"index": Indices.ISSUELAB_INDEX_ELSER}, q])
103
+ elif index == "youtube":
104
+ q = build_text_expansion_query(
105
+ query=query,
106
+ fields=("captions_cleaned", "description_cleaned", "title")
107
+ )
108
+ # text_cleaned duplicates captions_cleaned
109
+ q["_source"] = {"excludes": ["embeddings", "captions", "description", "text_cleaned"]}
110
+ q["size"] = 2
111
+ queries.extend([{"index": Indices.YOUTUBE_INDEX_ELSER}, q])
112
+ elif index == "candid_blog":
113
+ q = build_text_expansion_query(
114
+ query=query,
115
+ fields=("content", "title")
116
+ )
117
+ q["_source"] = {"excludes": ["embeddings"]}
118
+ q["size"] = 2
119
+ queries.extend([{"index": Indices.CANDID_BLOG_INDEX_ELSER}, q])
120
+ elif index == "candid_learning":
121
+ q = build_text_expansion_query(
122
+ query=query,
123
+ fields=("content", "title", "training_topics", "staff_recommendations")
124
+ )
125
+ q["_source"] = {"excludes": ["embeddings"]}
126
+ q["size"] = 2
127
+ queries.extend([{"index": Indices.CANDID_LEARNING_INDEX_ELSER}, q])
128
+ elif index == "candid_help":
129
+ q = build_text_expansion_query(
130
+ query=query,
131
+ fields=("content", "combined_article_description")
132
+ )
133
+ q["_source"] = {"excludes": ["embeddings"]}
134
+ q["size"] = 2
135
+ queries.extend([{"index": Indices.CANDID_HELP_INDEX_ELSER}, q])
136
+
137
+ return queries
138
+
139
+
140
+ def multi_search(queries: List[Dict[str, Any]]) -> List[ElasticHitsResult]:
141
+ results = []
142
+ with Elasticsearch(
143
+ cloud_id=ONECANDID_QA["ES_CLOUD_ID"],
144
+ api_key=ONECANDID_QA["ES_API_KEY"],
145
+ verify_certs=False,
146
+ request_timeout=60 * 3
147
+ ) as es:
148
+ for query_group in es.msearch(body=queries).get("responses", []):
149
+ for hit in query_group.get("hits", {}).get("hits", []):
150
+ hit = ElasticHitsResult(
151
+ index=hit["_index"],
152
+ id=hit["_id"],
153
+ score=hit["_score"],
154
+ source=hit["_source"],
155
+ inner_hits=hit.get("inner_hits", {})
156
+ )
157
+ results.append(hit)
158
+ return results
159
+
160
+
161
+ def get_query_results(search_text: str, indices: Optional[List[str]] = None):
162
+ queries = query_builder(query=search_text, indices=indices)
163
+ return multi_search(queries)
164
+
165
+
166
+ def reranker(query_results: Iterable[ElasticHitsResult]) -> Iterator[ElasticHitsResult]:
167
+ """Reranks Elasticsearch hits coming from multiple indicies/queries which may have scores on different scales.
168
+ This interleaves results from different indices by their normalized scores
169
+
170
+ Parameters
171
+ ----------
172
+ query_results : Iterable[ElasticHitsResult]
173
+
174
+ Yields
175
+ ------
176
+ Iterator[ElasticHitsResult]
177
+ """
178
+
179
+ results: List[ElasticHitsResult] = []
180
+ for _, data in groupby(query_results, key=lambda x: x.index):
181
+ data = list(data)
182
+ max_score = max(data, key=lambda x: x.score).score
183
+ min_score = min(data, key=lambda x: x.score).score
184
+
185
+ for d in data:
186
+ d.score = (d.score - min_score) / (max_score - min_score + 1e-9)
187
+ results.append(d)
188
+
189
+ yield from sorted(results, key=lambda x: x.score, reverse=True)
190
+
191
+
192
+ def get_results(user_input: str, indices: List[str]) -> Tuple[str, List[Document]]:
193
+ output = ["Search didn't return any Candid sources"]
194
+ page_content = []
195
+ content = "Search didn't return any Candid sources"
196
+ results = get_query_results(search_text=user_input, indices=indices)
197
+ if results:
198
+ output = get_reranked_results(results)
199
+ for doc in output:
200
+ page_content.append(doc.page_content)
201
+ content = "/n/n".join(page_content)
202
+ # for the tool we need to return a tuple for content_and_artifact type
203
+ return content, output
204
+
205
+
206
+ def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024) -> str:
207
+ """Pads the relevant chunk of text with context before and after
208
+
209
+ Parameters
210
+ ----------
211
+ field_name : str
212
+ a field with the long text that was chunked into pieces
213
+ hit : ElasticHitsResult
214
+ context_length : int, optional
215
+ length of text to add before and after the chunk, by default 1024
216
+
217
+ Returns
218
+ -------
219
+ str
220
+ longer chunks stuffed together
221
+ """
222
+
223
+ chunks_with_context = []
224
+ long_text = hit.source.get(f"{field_name}", "")
225
+ inner_hits_field = f"embeddings.{field_name}.chunks"
226
+ inner_hits = hit.inner_hits
227
+ found_chunks = inner_hits.get(inner_hits_field, {})
228
+ if found_chunks:
229
+ hits = found_chunks.get("hits", {}).get("hits", [])
230
+ for h in hits:
231
+ chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
232
+ chunk = chunk[3:-3] # cutting the middle because we may have tokenizing artefacts there
233
+ # Find the start and end indices of the chunk in the large text
234
+ start_index = long_text.find(chunk)
235
+ if start_index != -1: # Chunk is found
236
+ end_index = start_index + len(chunk)
237
+ pre_start_index = max(0, start_index - context_length)
238
+ post_end_index = min(len(long_text), end_index + context_length)
239
+ context = long_text[pre_start_index:post_end_index]
240
+ chunks_with_context.append(context)
241
+ chunks_with_context_txt = '\n\n'.join(chunks_with_context)
242
+
243
+ return chunks_with_context_txt
244
+
245
+
246
+ def process_hit(hit: ElasticHitsResult) -> Document | None:
247
+ if "issuelab-elser" in hit.index:
248
+ combined_item_description = hit.source.get("combined_item_description", "") # title inside
249
+ description = hit.source.get("description", "")
250
+ combined_issuelab_findings = hit.source.get("combined_issuelab_findings", "")
251
+ # we only need to process long texts
252
+ chunks_with_context_txt = get_context("content", hit, context_length=12)
253
+ doc = Document(
254
+ page_content='\n\n'.join([
255
+ combined_item_description,
256
+ combined_issuelab_findings,
257
+ description,
258
+ chunks_with_context_txt
259
+ ]),
260
+ metadata={
261
+ "source": "IssueLab",
262
+ "source_id": hit.source["resource_id"],
263
+ "url": hit.source.get("permalink", "")
264
+ }
265
+ )
266
+ elif "youtube" in hit.index:
267
+ title = hit.source.get("title", "")
268
+ # we only need to process long texts
269
+ description_cleaned_with_context_txt = get_context("description_cleaned", hit, context_length=12)
270
+ captions_cleaned_with_context_txt = get_context("captions_cleaned", hit, context_length=12)
271
+ doc = Document(
272
+ page_content='\n\n'.join([title, description_cleaned_with_context_txt, captions_cleaned_with_context_txt]),
273
+ metadata={
274
+ "source": "Candid Youtube",
275
+ "source_id": hit.source['video_id'],
276
+ "url": f"https://www.youtube.com/watch?v&#61;{hit.source['video_id']}"
277
+ }
278
+ )
279
+ elif "candid-blog" in hit.index:
280
+ excerpt = hit.source.get("excerpt", "")
281
+ title = hit.source.get("title", "")
282
+ # we only need to process long texts
283
+ content_with_context_txt = get_context("content", hit, context_length=12)
284
+ doc = Document(
285
+ page_content='\n\n'.join([title, excerpt, content_with_context_txt]),
286
+ metadata={
287
+ "source": "Candid Blog",
288
+ "source_id": hit.source["id"],
289
+ "url": hit.source["link"]
290
+ }
291
+ )
292
+ elif "candid-learning" in hit.index:
293
+ title = hit.source.get("title", "")
294
+ content_with_context_txt = get_context("content", hit, context_length=12)
295
+ training_topics = hit.source.get("training_topics", "")
296
+ staff_recommendations = hit.source.get("staff_recommendations", "")
297
+
298
+ doc = Document(
299
+ page_content='\n\n'.join([title, staff_recommendations, training_topics, content_with_context_txt]),
300
+ metadata={
301
+ "source": "Candid Learning",
302
+ "source_id": hit.source["post_id"],
303
+ "url": hit.source.get("url", "")
304
+ }
305
+ )
306
+ elif "candid-help" in hit.index:
307
+ title = hit.source.get("title", "")
308
+ content_with_context_txt = get_context("content", hit, context_length=12)
309
+ combined_article_description = hit.source.get("combined_article_description", "")
310
+
311
+ doc = Document(
312
+ page_content='\n\n'.join([combined_article_description, content_with_context_txt]),
313
+ metadata={
314
+ "source": "Candid Help",
315
+ "source_id": hit.source["id"],
316
+ "url": hit.source.get("link", "")
317
+ }
318
+ )
319
+ else:
320
+ doc = None
321
+ return doc
322
+
323
+
324
+ def get_reranked_results(results: List[ElasticHitsResult]) -> List[Document]:
325
+ output = []
326
+ for r in reranker(results):
327
+ hit = process_hit(r)
328
+ output.append(hit)
329
+ return output
330
+
331
+
332
+ def retriever_tool(indices: List[str]) -> Tool:
333
+ # cannot use create_retriever_tool because it only provides content losing all metadata on the way
334
+ # https://python.langchain.com/docs/how_to/custom_tools/#returning-artifacts-of-tool-execution
335
+ return Tool(
336
+ name="retrieve_social_sector_information",
337
+ func=partial(get_results, indices=indices),
338
+ description=(
339
+ "Return additional information about social and philanthropic sector, "
340
+ "including nonprofits (NGO), grants, foundations, funding, RFP, LOI, Candid."
341
+ ),
342
+ args_schema=RetrieverInput,
343
+ response_format="content_and_artifact"
344
+ )
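
The reranker above makes scores from different indices comparable by min-max rescaling each index's hits to [0, 1] before interleaving them. A toy illustration with fabricated scores (no Elasticsearch connection is made; it only assumes the packages in requirements.txt are installed so retrieval.elastic imports cleanly):

from retrieval.elastic import ElasticHitsResult, reranker

hits = [
    ElasticHitsResult(index="issuelab", id=1, score=12.0, source={}, inner_hits={}),
    ElasticHitsResult(index="issuelab", id=2, score=3.0, source={}, inner_hits={}),
    ElasticHitsResult(index="youtube", id=3, score=0.9, source={}, inner_hits={}),
    ElasticHitsResult(index="youtube", id=4, score=0.1, source={}, inner_hits={}),
]

# Each index's scores are rescaled to [0, 1], so the raw scales (12 vs 0.9)
# no longer decide the ordering; each index contributes its best hit near 1.0.
for hit in reranker(hits):
    print(hit.index, hit.id, round(hit.score, 3))
# issuelab 1 1.0
# youtube 3 1.0
# issuelab 2 0.0
# youtube 4 0.0
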
retrieval/elastic_qa.py ADDED
@@ -0,0 +1,194 @@
1
+ from typing import List, Dict, Any, Optional
2
+ import re
3
+ import os
4
+
5
+ import numpy as np
6
+
7
+ from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings
8
+ from langchain_core.documents import Document
9
+
10
+ from ...socialspark.retrieval_qa.elastic import ElasticHitsResult, ElasticsearchQABase
11
+
12
+ from .elastic import query_builder
13
+ from .config import ALL_INDICES
14
+
15
+ timestamp_pattern = re.compile(r"^(\d:\d{2}:\d{2}\.\d{3}),(\d:\d{2}:\d{2}\.\d{3})\n(.*)$")
16
+
17
+
18
+ def parse_youtube_captions(caption_text: str) -> List[str]:
19
+ timestamp_blocks = re.split(r'\n{2}', caption_text)
20
+ parsed_blocks = [m.groups() for block in timestamp_blocks if (m := timestamp_pattern.match(block)) is not None]
21
+ video_length_mins = int(parsed_blocks[-1][0].split(':')[1])
22
+
23
+ # what is the block timing in minutes which gives 10 chunks? keep minimum 1 minute breaks to maintain context
24
+ minute_difference = max(video_length_mins // 10, 1)
25
+
26
+ prev_minute_start = 0
27
+ texts = []
28
+ text_block = []
29
+ for start, _, text in parsed_blocks:
30
+ text_block.append(text)
31
+
32
+ current_minute = int(start.split(':')[1])
33
+ if (current_minute - prev_minute_start) >= minute_difference:
34
+ texts.append(' '.join(text_block))
35
+ text_block.clear()
36
+ prev_minute_start = current_minute
37
+
38
+ return texts
39
+
40
+
41
+ def parse_candid_learning(text: str) -> List[str]:
42
+ texts = []
43
+ for block in map(str.strip, re.split(r'\n{1,}', text, flags=re.I | re.M)):
44
+ if (
45
+ 'back to top' in block.lower()
46
+ or 'table of contents' in block.lower()
47
+ ):
48
+ continue
49
+
50
+ texts.append(block)
51
+ return texts
52
+
53
+
54
+ class ElasticsearchQA(ElasticsearchQABase):
55
+
56
+ indices: Optional[List[str]] = ALL_INDICES
57
+ embedding: Any = HuggingFaceEndpointEmbeddings(
58
+ huggingfacehub_api_token=os.getenv("HF_API_KEY"),
59
+ # model="sentence-transformers/all-mpnet-base-v2"
60
+ model="mixedbread-ai/mxbai-embed-large-v1"
61
+ )
62
+
63
+ def build_query(self, query: str, **kwargs) -> List[Dict[str, Any]]:
64
+ queries = query_builder(query=query, indices=self.indices)
65
+ return queries
66
+
67
+ def sub_section_alignment(self, query: str, document: List[str]) -> str:
68
+ question_vector = np.array(self.embedding.embed_query(query), dtype='float32')
69
+ vectors = np.array(self.embedding.embed_documents(document), dtype='float32')
70
+ vectors = np.array(vectors, dtype='float32')
71
+
72
+ size = max(5, int(0.1 * len(document)))
73
+
74
+ doc_vector = vectors.sum(axis=0, keepdims=True)
75
+ vectors /= np.linalg.norm(vectors, ord=2.0, axis=-1, keepdims=True)
76
+ doc_vector /= np.linalg.norm(doc_vector, ord=2.0, axis=-1, keepdims=True)
77
+ question_vector /= np.linalg.norm(question_vector, ord=2.0, axis=-1, keepdims=True)
78
+ # similarity = (doc_vector * vectors).sum(axis=-1)
79
+ similarity = (question_vector * vectors).sum(axis=-1)
80
+
81
+ return '\n'.join(text for text, _ in sorted(zip(document, similarity), key=lambda x: 1 - x[-1])[:size])
82
+
83
+ def process_hit(self, hit: ElasticHitsResult, q: str) -> Document | None:
84
+ if "news" in hit.index:
85
+ doc = Document(
86
+ page_content='\n\n'.join(v for k, v in hit.source["texts"].items() if v),
87
+ metadata={
88
+ "source": "news",
89
+ "source_id": hit.source['metadata']['link']
90
+ }
91
+ )
92
+ elif "transactions" in hit.index:
93
+ doc = Document(
94
+ page_content='\n\n'.join(v for k, v in hit.source["semantic_texts"].items() if v),
95
+ metadata={
96
+ "source": "cds-transactions",
97
+ "source_id": hit.source["id"]
98
+ }
99
+ )
100
+ elif "organizations" in hit.index:
101
+ source = hit.source
102
+ org_gen = source.get("combined_organization_description_general", "")
103
+ org_fin = source.get("combined_organization_description_financial", "")
104
+ org_contact = source.get("combined_organization_description_contacts", "")
105
+ mission = ""
106
+ if source.get("mission_statement", None) is not None:
107
+ mission = source.get("mission_statement", "")
108
+ keyword = ""
109
+ if source.get("keyword", None) is not None:
110
+ keyword = source.get("keyword", "")
111
+ programs = source.get("programs", "")
112
+ doc = Document(
113
+ page_content='\n\n'.join([org_gen, mission, keyword, org_fin, org_contact, programs]),
114
+ metadata={
115
+ "source": "UP-organizations-QA",
116
+ "source_id": hit.source["candid_entity_id"]
117
+ }
118
+ )
119
+ # elif "issuelab" in hit.index:
120
+ # doc = Document(
121
+ # page_content='\n\n'.join(v for k, v in hit.source["semantic_texts"].items() if v),
122
+ # metadata={
123
+ # "source": "IssueLab",
124
+ # "source_id": hit.source["resource_id"]
125
+ # }
126
+ # )
127
+ elif "issuelab-elser" in hit.index:
128
+ title = hit.source.get("title", "")
129
+ description = hit.source.get("description", "")
130
+ doc = Document(
131
+ page_content='\n\n'.join([title, description]),
132
+ metadata={
133
+ "source": "IssueLab",
134
+ "source_id": hit.source["resource_id"],
135
+ "url": hit.source.get("permalink", "")
136
+ }
137
+ )
138
+ elif "youtube" in hit.index:
139
+ title = hit.source.get("title", "")
140
+ summary = self.sub_section_alignment(
141
+ query=q,
142
+ document=parse_youtube_captions(hit.source.get("text"))
143
+ )
144
+ doc = Document(
145
+ # page_content='\n\n'.join([title]),
146
+ page_content=summary,
147
+ metadata={
148
+ "source": "Candid's Youtube channel",
149
+ "source_id": hit.source['video_id'],
150
+ "url": f"https://www.youtube.com/watch?v={hit.source['video_id']}"
151
+ }
152
+ )
153
+ elif "candid-blog" in hit.index:
154
+ excerpt = hit.source.get("excerpt", "")
155
+ title = hit.source.get("title", "")
156
+ doc = Document(
157
+ page_content='\n\n'.join([title, excerpt]),
158
+ metadata={
159
+ "source": "Candid Blog",
160
+ "source_id": hit.source["id"],
161
+ "url": hit.source["link"]
162
+ }
163
+ )
164
+ elif "candid-learning" in hit.index:
165
+ # content = hit.source.get("content", "")
166
+ title = hit.source.get("title", "")
167
+ summary = self.sub_section_alignment(
168
+ query=q,
169
+ document=parse_candid_learning(hit.source.get("content", ""))
170
+ )
171
+ doc = Document(
172
+ # page_content='\n\n'.join([title]),
173
+ page_content=summary,
174
+ metadata={
175
+ "source": "Candid Learning",
176
+ "source_id": hit.source["post_id"],
177
+ "url": hit.source.get("url", "")
178
+ }
179
+ )
180
+ elif "candid-help" in hit.index:
181
+ title = hit.source.get("title", "")
182
+ content = hit.source.get("content", "")
183
+
184
+ doc = Document(
185
+ page_content='\n\n'.join([title, content]),
186
+ metadata={
187
+ "source": "Candid Help",
188
+ "source_id": hit.source["id"],
189
+ "url": hit.source.get("link", "")
190
+ }
191
+ )
192
+ else:
193
+ doc = None
194
+ return doc
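
sub_section_alignment above keeps only the caption/content blocks most relevant to the query: embed the query and every block, L2-normalize, take cosine similarities, and keep the top slice. A self-contained sketch of that ranking step using made-up 3-dimensional vectors in place of the HuggingFace endpoint embeddings (numpy is already a dependency of this module):

import numpy as np

blocks = ["grant basics", "office parking info", "how to find funders"]
block_vecs = np.array([[0.9, 0.1, 0.0],
                       [0.0, 0.1, 0.9],
                       [0.8, 0.2, 0.1]], dtype="float32")
query_vec = np.array([1.0, 0.0, 0.0], dtype="float32")

# L2-normalize so the dot product is cosine similarity
block_vecs /= np.linalg.norm(block_vecs, axis=-1, keepdims=True)
query_vec /= np.linalg.norm(query_vec)
similarity = block_vecs @ query_vec

top_k = 2
ranked = sorted(zip(blocks, similarity), key=lambda x: 1 - x[-1])[:top_k]
print([text for text, _ in ranked])   # ['grant basics', 'how to find funders']
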
retrieval/issuelab.py ADDED
@@ -0,0 +1,108 @@
1
+ from typing import List, Dict, Tuple, Union, Any
2
+
3
+ import numpy as np
4
+
5
+
6
+ def build_knn_query(
7
+ query: Union[str, np.ndarray, List[float]],
8
+ fields: Tuple[str] = (
9
+ "item_description_vector",
10
+ "description_vector",
11
+ "title_vector",
12
+ "content_vector",
13
+ "issuelab_key_findings_vector",
14
+ ),
15
+ subjects_vector: List[float] | None = None,
16
+ population_vector: List[float] | None = None,
17
+ k: int = 10,
18
+ model_id: str = "sentence-transformers__all-mpnet-base-v2"
19
+ ):
20
+ output = []
21
+
22
+ for f in fields:
23
+ if isinstance(query, str):
24
+ output.append({
25
+ "field": f"embeddings.{f}.predicted_value",
26
+ "k": k,
27
+ "num_candidates": 100,
28
+ "query_vector_builder": {
29
+ "text_embedding": {
30
+ "model_id": model_id,
31
+ "model_text": query
32
+ }
33
+ },
34
+ "boost": 1 / len(fields)
35
+ })
36
+ elif isinstance(query, (np.ndarray, list)):
37
+ output.append({
38
+ "field": f"embeddings.{f}.predicted_value",
39
+ "query_vector": list(query),
40
+ "k": k,
41
+ "num_candidates": 100,
42
+ })
43
+
44
+ if subjects_vector:
45
+ output.append({
46
+ "field": "embeddings.subjects_vector",
47
+ "query_vector": subjects_vector,
48
+ "k": k,
49
+ "num_candidates": 100,
50
+ })
51
+
52
+ if population_vector:
53
+ output.append({
54
+ "field": "embeddings.populations_vector",
55
+ "query_vector": population_vector,
56
+ "k": k,
57
+ "num_candidates": 100,
58
+ })
59
+ return {"knn": output}
60
+
61
+
62
+ def issuelab_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
63
+ chunks_html = ""
64
+ if show_chunks:
65
+ cleaned_text = []
66
+ for k, v in doc["inner_hits"].items():
67
+ hits = v["hits"]["hits"]
68
+ for h in hits:
69
+ for k1, v1 in h["fields"].items():
70
+ # we don't want other chunks
71
+ if "content" in k1:
72
+ cleaned_text.append(f"<div><p>{v1[0]['chunk'][0]}</p></div>")
73
+
74
+ chunks_html ="<span><b>Relevant parts of the content:</b></span>" + "<br>".join(cleaned_text)
75
+
76
+ html = f"""
77
+ <div style='height: auto; padding: 5px;'>
78
+ <div style='border: 1px solid #febe10;'>
79
+ <span style='display: inline-block; height: {height_px - 10}px; padding: 5px; vertical-align: top;'>
80
+ <img
81
+ src='{doc['cover_graphic_small']}'
82
+ style='max-height: 100%; overflow: hidden; border-radius: 3%;'
83
+ >
84
+ </span>
85
+
86
+ <span style='padding: 10px; display: inline-block; width: 70%;'>
87
+ <div>
88
+ <span><b>Issuelab ID:</b> {doc['resource_id']}</span>
89
+ <br>
90
+ <span>
91
+ <a href='{doc['issuelab_url']}' target='_blank' style='text-decoration: none;'>
92
+ {doc['title']}
93
+ </a>
94
+ </span>
95
+ <br>
96
+
97
+ <span><b>Description:</b> {doc['description']}</span>
98
+ <br>
99
+ <div>{doc['combined_item_description']}</div>
100
+ <br>
101
+ <div>{chunks_html}</div>
102
+
103
+ </div>
104
+ </span>
105
+ </div>
106
+ </div>
107
+ """
108
+ return html
retrieval/youtube.py ADDED
@@ -0,0 +1,82 @@
1
+ from typing import Dict, Tuple, Any
2
+
3
+
4
+ def build_knn_model_query(
5
+ query: str,
6
+ fields: Tuple[str] = (
7
+ "captions_cleaned",
8
+ "description_cleaned",
9
+ "title"
10
+ ),
11
+ k: int = 10,
12
+ model_id: str = "sentence-transformers__all-mpnet-base-v2"
13
+ ):
14
+ output = []
15
+
16
+ for f in fields:
17
+ output.append({
18
+ "field": f"embeddings.{f}.chunks.vector.predicted_value",
19
+ "k": k,
20
+ "num_candidates": 100,
21
+ "query_vector_builder": {
22
+ "text_embedding": {
23
+ "model_id": model_id,
24
+ "model_text": query
25
+ }
26
+ },
27
+ "boost": 1 / len(fields)
28
+ })
29
+ return {"knn": output}
30
+
31
+
32
+ def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
33
+ url = f"https://www.youtube.com/watch?v={doc['video_id']}"
34
+ fields = ["title", "description_cleaned"]
35
+
36
+ fields_dict = {}
37
+ fields_len = 0
38
+ for field in fields:
39
+ if doc.get(field, None) is not None:
40
+ fields_dict[field] = doc[field]
41
+ fields_dict[field + "_txt"] = f"<div>{doc[field]}</div>"
42
+
43
+ if (fields_len + len(doc[field])) > 999:
44
+ rest_text_len = 999 - fields_len
45
+ if rest_text_len > 0:
46
+ fields_dict[field + "_txt"] = f"<div>{doc[field][:rest_text_len] + '[...]'}</div>"
47
+ else: fields_dict[field + "_txt"] = f"<span>{'[...]'}</span>"
48
+ fields_len = fields_len + len(doc[field])
49
+ else:
50
+ fields_dict[field] = ""
51
+ fields_dict[field + "_txt"] = ""
52
+ html = f"""
53
+ <div style='height: {height_px}px; padding: 5px;'>
54
+ <div style='height: {height_px}px; border: 1px solid #febe10;'>
55
+ <span style='padding-left: 10px; display: inline-block; width: 100%;'>
56
+ <div>
57
+ <span>
58
+ <b>Candid Youtube video:</b>
59
+ <a href='{url}' target='_blank' style='text-decoration: none;'>
60
+ {doc['title']}
61
+ </a>
62
+ </span>
63
+ <iframe
64
+ width="426"
65
+ height="240"
66
+ src="https://www.youtube.com/embed/{doc['video_id']}?si=0-y6eRrOzXTUSBDY"
67
+ title="YouTube video player"
68
+ frameborder="0"
69
+ allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
70
+ referrerpolicy="strict-origin-when-cross-origin"
71
+ allowfullscreen
72
+ style="display: inline-block; float: left;padding-right: 10px;padding-top: 5px;">
73
+ </iframe>
74
+ <br>
75
+ <br>
76
+ {fields_dict["description_cleaned_txt"]}
77
+ </div>
78
+ </span>
79
+ </div>
80
+ </div>
81
+ """
82
+ return html
search.py ADDED
@@ -0,0 +1,146 @@
1
+ from typing import List, Optional
2
+ import json
3
+
4
+ import gradio as gr
5
+ import requests
6
+
7
+ from .utils import html_format_doc
8
+ from .retrieval.up_orgs_keyword import organization_card_html
9
+ from .retrieval.elastic import reranker, get_query_results
10
+ from .retrieval.config import ALL_INDICES
11
+ from . import UP_QA_SEARCH_API
12
+
13
+
14
+ def run_search(search_text: str, indices: Optional[List[str]] = None):
15
+ results = get_query_results(search_text, indices=indices)
16
+
17
+ output = []
18
+ for result in reranker(results):
19
+ source_name = None
20
+ if "news" in result.index:
21
+ source_name = "news"
22
+ elif "transactions" in result.index:
23
+ source_name = "transactions"
24
+ elif "organizations" in result.index:
25
+ source_name = "organizations"
26
+ elif "issuelab-elser" in result.index:
27
+ source_name = "issuelab"
28
+ # elif "issuelab" in result.index:
29
+ # source_name = "issuelab"
30
+ elif "youtube-elser" in result.index:
31
+ source_name = "youtube"
32
+ # elif "youtube" in result.index:
33
+ # source_name = "youtube"
34
+ elif "candid-blog-elser" in result.index:
35
+ source_name = "candid_blog"
36
+ # elif "candid-blog" in result.index:
37
+ # source_name = "candid_blog"
38
+ elif "candid-learning" in result.index: # TODO fix that
39
+ source_name = "candid_learning"
40
+ elif "candid-help-elser" in result.index:
41
+ source_name = "candid_help"
42
+
43
+ doc = html_format_doc(doc=result.source, source=source_name)
44
+ output.append(doc)
45
+ return f"<div>{''.join(output)}</div>"
46
+
47
+
48
+ def run_ks(search_text: str):
49
+ json_body = {"keyword": search_text, "rowCount": 10}
50
+
51
+ response = requests.post(
52
+ url=UP_QA_SEARCH_API["API_URL"],
53
+ json=json_body,
54
+ headers={
55
+ "accept": "application/json",
56
+ "content-type": "application/json",
57
+ "x-api-key": UP_QA_SEARCH_API["API_KEY"]
58
+ },
59
+ timeout=(5 * 60)
60
+ )
61
+
62
+ r_json = json.loads(response.text)
63
+ output_k = []
64
+ if r_json.get("returnedOrgs", None) is not None:
65
+ for doc in r_json["returnedOrgs"]:
66
+ org = {}
67
+ org["candid_entity_id"] = doc.get("candidEntityID", "")
68
+ org["main_name"] = doc.get("orgName", "")
69
+ org["logo"] = doc.get("logo", "")
70
+ org["seal"] = doc.get("seal", {})
71
+ org["city"] = doc.get("city", "")
72
+ org["admin1"] = doc.get("admin1", "")
73
+ org["country_name"] = doc.get("countryName", "")
74
+ org["taxonomy"] = doc.get("taxonomy", {})
75
+ highlights = doc.get("highlights", [])
76
+ if highlights:
77
+ for h in highlights:
78
+ if h["field"] == "mission_statement":
79
+ org["mission_statement"] = "; ".join(h["highlights"])
80
+
81
+ html = organization_card_html(org, 250)
82
+ output_k.append(html)
83
+
84
+ # Getting semantic results
85
+ output_s = run_search(search_text=search_text)
86
+
87
+ return f"<div>{''.join(output_k)}</div>", output_s
88
+
89
+
90
+ def build_search_tab() -> gr.Blocks:
91
+ with gr.Blocks(theme=gr.themes.Soft(), title="Semantic search") as demo:
92
+ gr.Markdown(
93
+ "<h1>Alpha demo: Semantic search</h1>"
94
+ "Search and ask questions of Candid's data together with casual language"
95
+ )
96
+
97
+ query = gr.Text(placeholder="Search", show_label=False)
98
+
99
+ with gr.Accordion(label="Advanced settings", open=False):
100
+ es_indices = gr.CheckboxGroup(
101
+ choices=list(ALL_INDICES),
102
+ value=list(ALL_INDICES),
103
+ label="Sources to include",
104
+ interactive=True
105
+ )
106
+ search = gr.Button("Search")
107
+
108
+ feed = gr.HTML()
109
+
110
+ # pylint: disable=no-member
111
+ search.click(
112
+ fn=run_search,
113
+ inputs=[query, es_indices],
114
+ outputs=[feed],
115
+ api_name=False,
116
+ queue=True
117
+ )
118
+ return demo
119
+
120
+
121
+ def build_ks_tab() -> gr.Blocks:
122
+ with gr.Blocks(theme=gr.themes.Soft(), title="Semantic search") as demo:
123
+ gr.Markdown(
124
+ "<h1>Alpha demo: Keyword versus Semantic search</h1>"
125
+ "Compare current search results versus semantic search results"
126
+ )
127
+ query = gr.TextArea(placeholder="Search", show_label=False, lines=1)
128
+ ask = gr.Button("Search Unified Platform organizations")
129
+ with gr.Row():
130
+ with gr.Column():
131
+ gr.Markdown("<h2>Keyword results</h2>")
132
+ feed_k = gr.HTML()
133
+ with gr.Column():
134
+ gr.Markdown("<h2>Semantic results</h2>")
135
+ feed_s = gr.HTML()
136
+
137
+ # pylint: disable=no-member
138
+ ask.click(
139
+ fn=run_ks,
140
+ inputs=[query],
141
+ outputs=[feed_k, feed_s],
142
+ api_name=False,
143
+ queue=True
144
+ )
145
+
146
+ return demo
static/candid_logo_yellow.png ADDED
static/css.py ADDED
@@ -0,0 +1,48 @@
1
+ css_chat = """
2
+ .message-row img {
3
+ margin: 0px !important;
4
+ }
5
+ .avatar-container img {
6
+ padding: 0px !important;
7
+ }
8
+
9
+ #ssearch-sources {
10
+ display: flex;
11
+ gap: 10px;
12
+ min-width: 75vw;
13
+ padding-bottom: 5px;
14
+ }
15
+
16
+ .ssearch-source-btn {
17
+ background-color: #febe10;
18
+ color: black;
19
+ padding: 5px;
20
+ text-align: center;
21
+ border-radius: 12px;
22
+ min-width: 70px;
23
+ max-width: 75px;
24
+ box-shadow: 0 2px 5px 0 rgba(0, 0, 0,0.2);
25
+ height: 45px;
26
+ font-size:small;
27
+ }
28
+
29
+ .ssearch-source {
30
+ text-decoration: none;
31
+ display: block;
32
+ box-sizing: border-box;
33
+ }
34
+
35
+ button.upload-button.svelte-1d7elt4 {
36
+ visibility: hidden !important;
37
+ }
38
+
39
+ .candid-org-link {
40
+ font-weight: bold;
41
+ text-decoration: none;
42
+ }
43
+
44
+ .candid-app-link {
45
+ font-size: small;
46
+ }
47
+
48
+ """
tools/__init__.py ADDED
File without changes
tools/config.py ADDED
@@ -0,0 +1,5 @@
1
+ import os
2
+ CDS_API = {
3
+ 'CDS_API_URL': os.getenv('CDS_API_URL'),
4
+ 'CDS_API_KEY': os.getenv('CDS_API_KEY')
5
+ }
tools/org_seach.py ADDED
@@ -0,0 +1,190 @@
1
+ from typing import List
2
+ import re
3
+
4
+ from fuzzywuzzy import fuzz
5
+
6
+ from langchain.output_parsers.openai_tools import JsonOutputToolsParser
7
+ from langchain_openai.chat_models import ChatOpenAI
8
+ from langchain_core.runnables import RunnableSequence
9
+ from langchain_core.prompts import ChatPromptTemplate
10
+ from pydantic import BaseModel
11
+
12
+ from dotenv import load_dotenv
13
+ try:
14
+ from common.org_search_component import OrgSearch
15
+ except ImportError:
16
+ from ...common.org_search_component import OrgSearch
17
+
18
+ load_dotenv()
19
+ search = OrgSearch()
20
+
21
+
22
+ class OrganizationNames(BaseModel):
23
+ orgnames: List[str]
24
+
25
+
26
+ def extract_org_links_from_chatbot(chatbot_output: str):
27
+ """
28
+ Extracts a list of organization names from the provided text.
29
+
30
+ Args:
31
+ chatbot_output (str):The chatbot output containing organization names and other content.
32
+
33
+ Returns:
34
+ list: A list of organization names extracted from the text.
35
+
36
+ Raises:
37
+ ValueError: If parsing fails or if an unexpected output format is received.
38
+ """
39
+ prompt = """Extract only the names of officially recognized organizations, foundations, and government entities from the text below. Do not include any entries that contain descriptions, regional identifiers, or explanations within parentheses or following the name. Strictly exclude databases, resources, crowdfunding platforms, and general terms. Provide the output only in the specified JSON format.
40
+
41
+ input text below:
42
+
43
+ ```{chatbot_output}```
44
+
45
+ output format:
46
+ {{
47
+ 'orgnames' : [list of organization names without any additional descriptions or identifiers]
48
+ }}
49
+
50
+ """
+
+     try:
+         parser = JsonOutputToolsParser()
+         llm = ChatOpenAI(model="gpt-4o").bind_tools([OrganizationNames])
+         prompt = ChatPromptTemplate.from_template(prompt)
+         chain = RunnableSequence(prompt, llm, parser)
+
+         # Run the chain with the input data
+         result = chain.invoke({"chatbot_output": chatbot_output})
+
+         # Extract the organization names from the output
+         output_list = result[0]["args"].get("orgnames", [])
+
+         # Validate output format
+         if not isinstance(output_list, list):
+             raise ValueError("Unexpected output format: 'orgnames' should be a list")
+
+         return output_list
+
+     except Exception as e:
+         # Log or print the error as needed for debugging
+         print(f"text does not have any organization: {e}")
+         return []
+
+
+ def is_similar(name: str, list_of_dict: list, threshold: int = 80):
+     """
+     Returns True if `name` is similar to any names in `list_of_dict` based on a similarity threshold.
+     """
+     try:
+         for item in list_of_dict:
+             try:
+                 # Attempt to calculate similarity score
+                 similarity = fuzz.ratio(name.lower(), item["name"].lower())
+                 if similarity >= threshold:
+                     return True
+             except KeyError:
+                 # Handle cases where 'name' key might be missing in dictionary
+                 print(f"KeyError: Missing 'name' key in dictionary item {item}")
+                 continue
+             except AttributeError:
+                 # Handle non-string name values in dictionary items
+                 print(f"AttributeError: Non-string 'name' in dictionary item {item}")
+                 continue
+     except TypeError as e:
+         # Handle cases where input types are incorrect
+         print(f"TypeError: {e}")
+         return False
+
+     return False
+
+
+ def generate_org_link_dict(org_names_list: list):
+     """
+     Maps organization names to their Candid profile URLs if available.
+
+     For each organization in `org_names_list`, this function attempts to retrieve a matching profile
+     using the module-level `search` (OrgSearch) client. If a similar name is found and a Candid entity
+     ID is available, it constructs a profile URL. If no ID or similar match is found, or if an error
+     occurs, it assigns an empty string.
+
+     Args:
+         org_names_list (list): List of organization names (str) to retrieve Candid profile links for.
+
+     Returns:
+         dict: Dictionary with organization names as keys and Candid profile URLs or empty strings as values.
+
+     Example:
+         generate_org_link_dict(['New York-Presbyterian Hospital'])
+         # {'New York-Presbyterian Hospital': 'https://app.candid.org/profile/6915255'}
+     """
+     link_dict = {}
+
+     for org in org_names_list:
+         try:
+             # Attempt to retrieve organization data
+             response = search(org)
+
+             # Check if there is a valid response and if names are similar
+             if response and is_similar(org, response[0].get("names", "")):
+                 # Try to get the Candid entity ID and construct the URL
+                 candid_entity_id = response[0].get("candid_entity_id")
+                 if candid_entity_id:
+                     link_dict[org] = (
+                         f"https://app.candid.org/profile/{candid_entity_id}"
+                     )
+                 else:
+                     link_dict[org] = ""  # No ID found, set empty string
+             else:
+                 link_dict[org] = ""  # No similar match found
+
+         except KeyError as e:
+             # Handle missing keys in the response dictionary
+             print(f"KeyError encountered for organization '{org}': {e}")
+             link_dict[org] = ""
+
+         except Exception as e:
+             # Catch any other unexpected errors
+             print(f"An error occurred for organization '{org}': {e}")
+             link_dict[org] = ""
+
+     return link_dict
+
+
+ def embed_org_links_in_text(input_text: str, org_link_dict: dict):
+     """
+     Replaces organization names in `input_text` with links from `org_link_dict` and appends a Candid info message.
+
+     Args:
+         input_text (str): The text containing organization names.
+         org_link_dict (dict): Mapping of organization names to URLs.
+
+     Returns:
+         str: Updated text with linked organization names and an appended Candid message.
+     """
+     try:
+         for org_name, url in org_link_dict.items():
+             if url:  # Only proceed if the URL is not empty
+                 regex_pattern = re.compile(re.escape(org_name))
+                 input_text = regex_pattern.sub(
+                     repl=f"<a href={url} target='_blank' rel='noreferrer' class='candid-org-link'>{org_name}</a>",
+                     string=input_text
+                 )
+
+         # Append Candid information message at the end
+         input_text += "<p class='candid-app-link'> Visit <a href=https://app.candid.org/ target='_blank' rel='noreferrer' class='candid-org-link'>Candid</a> to get the nonprofit information you need.</p>"
+
+     except TypeError as e:
+         print(f"TypeError encountered: {e}")
+         return input_text
+
+     except re.error as e:
+         print(f"Regex error encountered for '{org_name}': {e}")
+         return input_text
+
+     except Exception as e:
+         print(f"Unexpected error: {e}")
+         return input_text
+
+     return input_text
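
Taken together, these helpers form a small post-processing pipeline: extract organization names from an answer, resolve them to Candid profile URLs, then embed the links back into the response HTML. A minimal usage sketch, assuming `OPENAI_API_KEY` and the Candid org-search credentials are set in the environment; the sample answer text and the `tools.org_seach` import path are illustrative only:

```python
from tools.org_seach import (
    extract_org_links_from_chatbot,
    generate_org_link_dict,
    embed_org_links_in_text,
)

# Illustrative chatbot answer; any text mentioning organizations works.
answer = "Consider approaching the Ford Foundation for general operating support."

org_names = extract_org_links_from_chatbot(answer)        # LLM pulls out organization names
org_links = generate_org_link_dict(org_names)             # name -> Candid profile URL (or "")
html_answer = embed_org_links_in_text(answer, org_links)  # inline <a> tags + Candid footer
```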
tools/question_reformulation.py ADDED
@@ -0,0 +1,44 @@
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+
+
+ def reformulate_question_using_history(state, llm):
+     """
+     Transform the query to produce a better query with details from previous messages.
+
+     Args:
+         state (messages): The current state
+         llm: LLM to use
+     Returns:
+         dict: The updated state with the re-phrased question and the original user_input for the UI
+     """
+     print("---REFORMULATE THE USER INPUT---")
+     messages = state["messages"]
+     question = messages[-1].content
+
+     if len(messages) > 1:
+         contextualize_q_system_prompt = """Given a chat history and the latest user input \
+ which might reference context in the chat history, formulate a standalone input \
+ which can be understood without the chat history.
+ Chat history:
+ \n ------- \n
+ {chat_history}
+ \n ------- \n
+ User input:
+ \n ------- \n
+ {question}
+ \n ------- \n
+ Do NOT answer the question, \
+ just reformulate it if needed and otherwise return it as is.
+ """
+
+         contextualize_q_prompt = ChatPromptTemplate([
+             ("system", contextualize_q_system_prompt),
+             ("human", question),
+         ])
+
+         rag_chain = contextualize_q_prompt | llm | StrOutputParser()
+         new_question = rag_chain.invoke({"chat_history": messages, "question": question})
+         print(f"user asked: '{question}', agent reformulated the question based on the chat history: {new_question}")
+         return {"messages": [new_question], "user_input": question}
+     return {"messages": [question], "user_input": question}
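
A minimal sketch of invoking `reformulate_question_using_history` directly, assuming an OpenAI key is configured and a LangGraph-style state dict with a `messages` list; the example messages and the import path are illustrative:

```python
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage

from tools.question_reformulation import reformulate_question_using_history

llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
state = {
    "messages": [
        HumanMessage("What does the Ford Foundation fund?"),
        AIMessage("The Ford Foundation funds work on social justice ..."),
        HumanMessage("How do I apply to them?"),  # "them" only resolves with history
    ]
}

result = reformulate_question_using_history(state, llm)
# result["messages"][0] holds the standalone question, result["user_input"] the raw turn.
```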
utils.py ADDED
@@ -0,0 +1,114 @@
+ from typing import List, Dict, Any
+ from uuid import uuid4
+
+ import gradio as gr
+
+ try:
+     from retrieval import (
+         candid_blog,
+         candid_help,
+         candid_learning,
+         # cds,
+         issuelab,
+         # news,
+         # up_orgs,
+         youtube
+     )
+ except ImportError:
+     from .retrieval import (
+         candid_blog,
+         candid_help,
+         candid_learning,
+         # cds,
+         issuelab,
+         # news,
+         # up_orgs,
+         youtube
+     )
+
+
+ # TODO summarize messages instead
+ def filter_messages(messages, k=10):
+     return messages[-k:]
+
+
+ def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
+     height_px = 200
+     html = ""
+
+     if source == "news":
+         # html = news.article_card_html(doc, height_px, show_chunks)
+         pass
+     elif source == "transactions":
+         # html = cds.transaction_card_html(doc, height_px, show_chunks)
+         pass
+     elif source == "organizations":
+         # html = up_orgs.organization_card_html(doc, 400, show_chunks)
+         pass
+     elif source == "issuelab":
+         html = issuelab.issuelab_card_html(doc, height_px, show_chunks)
+     elif source == "youtube":
+         html = youtube.build_card_html(doc, 400, show_chunks)
+     elif source == "candid_blog":
+         html = candid_blog.build_card_html(doc, height_px, show_chunks)
+     elif source == "candid_learning":
+         html = candid_learning.build_card_html(doc, height_px, show_chunks)
+     elif source == "candid_help":
+         html = candid_help.build_card_html(doc, height_px, show_chunks)
+     return html
+
+
+ def html_format_docs_chat(docs):
+     """
+     Formats Candid sources into a line of buttons
+     """
+     html = ""
+     if docs:
+         docs_html = []
+         for doc in docs:
+             s_name = doc.metadata.get("source", "Source")
+             s_url = doc.metadata.get("url", "URL")
+             s_html = f"""<a href={s_url} target='_blank' rel='noreferrer' class='ssearch-source'> \
+ <button class='ssearch-source-btn'>{s_name}</button></a>"""
+             docs_html.append(s_html)
+         docs_html_insert = "".join(docs_html)
+         html = f"""<div id='ssearch-sources'>{docs_html_insert}</div>"""
+     return html
+
+
+ def format_chat_response(chatbot: List[Any]) -> List[Any]:
+     """We have sources appended as one more tuple. Here we concatenate the HTML of sources
+     with the AI response.
+     Returns:
+         _type_: updated chatbot message as HTML
+     """
+     if chatbot:
+         sources = chatbot[-1][1]
+         chatbot.pop(-1)
+         chatbot[-1][1] = chatbot[-1][1] + sources
+     return gr.HTML(chatbot)
+
+
+ def format_chat_ag_response(chatbot: List[Any]) -> List[Any]:
+     """If we called the retriever, we appended sources as one more message. Here we concatenate
+     the HTML of sources with the AI response.
+     Returns:
+         _type_: updated chatbot message as HTML
+     """
+     sources = ""
+     if chatbot:
+         title = chatbot[-1]["metadata"].get("title", None)
+         if title == "Sources HTML":
+             sources = chatbot[-1]["content"]
+             chatbot.pop(-1)
+         chatbot[-1]["content"] = chatbot[-1]["content"] + sources
+     return gr.HTML(chatbot)
+
+
+ def valid_inputs(*args) -> bool:
+     # Treat None and blank/whitespace-only strings as missing input.
+     return any(a is not None and (not isinstance(a, str) or a.strip() != '') for a in args)
+
+ def get_session_id(thread_id: gr.components.Component) -> str:
+     if not thread_id:
+         thread_id = uuid4().hex
+     return thread_id
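
A small sketch of how the last two helpers might guard a Gradio submit callback; the callback name and error message are assumptions for illustration, not taken from the app code:

```python
import gradio as gr

def on_submit(user_text: str, thread_id: str) -> str:
    # Ignore empty submissions, then reuse (or mint) the per-session thread id.
    if not valid_inputs(user_text):
        raise gr.Error("Please enter a question first.")
    return get_session_id(thread_id)
```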