Spaces:
Running
Running
Initial commit
Browse files- LICENSE +21 -0
- README.md +7 -8
- app.py +109 -0
- chat.py +252 -0
- requirements.txt +7 -0
- retrieval/__init__.py +0 -0
- retrieval/candid_blog.py +70 -0
- retrieval/candid_help.py +41 -0
- retrieval/candid_learning.py +41 -0
- retrieval/config.py +31 -0
- retrieval/elastic.py +344 -0
- retrieval/elastic_qa.py +194 -0
- retrieval/issuelab.py +108 -0
- retrieval/youtube.py +82 -0
- search.py +146 -0
- static/candid_logo_yellow.png +0 -0
- static/css.py +48 -0
- tools/__init__.py +0 -0
- tools/config.py +5 -0
- tools/org_seach.py +190 -0
- tools/question_reformulation.py +44 -0
- utils.py +114 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2024 Candid
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,14 +1,13 @@
|
|
1 |
---
|
2 |
title: Ask Candid
|
3 |
-
|
4 |
-
|
5 |
-
|
|
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.
|
8 |
app_file: app.py
|
9 |
-
pinned:
|
10 |
license: mit
|
11 |
-
short_description: AI assistant for philanthropy and the social sector
|
12 |
---
|
13 |
-
|
14 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
title: Ask Candid
|
3 |
+
short_description: AI assistant for philanthropy and the social sector
|
4 |
+
emoji: 💬
|
5 |
+
colorFrom: blue
|
6 |
+
colorTo: purple
|
7 |
sdk: gradio
|
8 |
+
sdk_version: 5.5.0
|
9 |
app_file: app.py
|
10 |
+
pinned: true
|
11 |
license: mit
|
|
|
12 |
---
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
app.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Tuple, Dict, TypedDict, Optional, Any
|
2 |
+
import os
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
|
6 |
+
from langchain_openai.chat_models import ChatOpenAI
|
7 |
+
|
8 |
+
try:
|
9 |
+
from utils import format_chat_ag_response
|
10 |
+
from retrieval.config import ALL_INDICES
|
11 |
+
from chat import run_chat
|
12 |
+
except ImportError:
|
13 |
+
from .utils import format_chat_ag_response
|
14 |
+
from .retrieval.config import ALL_INDICES
|
15 |
+
from .chat import run_chat
|
16 |
+
|
17 |
+
# Absolute path of the directory containing this module; build_chat joins it with
# "static/candid_logo_yellow.png" to locate the chatbot avatar.
ROOT = os.path.dirname(os.path.abspath(__file__))
|
18 |
+
|
19 |
+
|
20 |
+
class LoggedComponents(TypedDict):
    """Typed mapping of Gradio components that ``build_chat`` returns alongside the app."""

    # Components describing the conversation; build_chat supplies [thread_id, chatbot].
    context: List[gr.components.Component]
    # NOTE(review): the four keys below are declared here, but build_chat only ever
    # populates ``context`` - presumably reserved for feedback widgets; confirm.
    found_helpful: gr.components.Component
    will_recommend: gr.components.Component
    comments: gr.components.Component
    email: gr.components.Component
|
26 |
+
|
27 |
+
|
28 |
+
def execute(
    thread_id: str,
    user_input: Dict[str, Any],
    chatbot: List[Dict],
    max_new_tokens: int,
    indices: Optional[List[str]] = None,
):
    """Gradio submit handler: create the chat model and delegate the turn to ``run_chat``.

    Parameters
    ----------
    thread_id : str
        Conversation identifier held in the hidden text box.
    user_input : Dict[str, Any]
        Value of the multimodal textbox.
    chatbot : List[Dict]
        Current chat history in "messages" format.
    max_new_tokens : int
        Upper bound on generated tokens, taken from the slider.
    indices : Optional[List[str]], optional
        Names of the sources to search, by default None.

    Returns
    -------
    Whatever ``run_chat`` returns (wired to the msg, chatbot and thread_id outputs).
    """
    model_settings = {
        "model_name": "gpt-4o",
        "max_tokens": max_new_tokens,
        "api_key": os.getenv("OPENAI_API_KEY"),
        "temperature": 0.0,
        "streaming": True,
    }
    chat_model = ChatOpenAI(**model_settings)

    return run_chat(
        thread_id=thread_id,
        user_input=user_input,
        chatbot=chatbot,
        llm=chat_model,
        indices=indices,
    )
|
50 |
+
|
51 |
+
|
52 |
+
def build_chat() -> Tuple[LoggedComponents, gr.Blocks]:
    """Assemble the "Ask Candid" Gradio interface.

    Returns
    -------
    Tuple[LoggedComponents, gr.Blocks]
        The components exposed for logging (only ``context`` is filled in) and the
        Blocks application itself.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Ask Candid") as demo:
        # Collapsed panel with the per-request knobs passed to ``execute``.
        with gr.Accordion(label="Advanced settings", open=False):
            # All known sources are selected by default.
            es_indices = gr.CheckboxGroup(
                choices=list(ALL_INDICES),
                value=list(ALL_INDICES),
                label="Sources to include",
                interactive=True
            )
            max_new_tokens = gr.Slider(
                value=256 * 3, minimum=128, maximum=2048, step=128,
                label="Max new tokens", interactive=True
            )

        with gr.Column():
            chatbot = gr.Chatbot(
                label="Candid Assistant",
                elem_id="chatbot",
                bubble_full_width=False,
                # (user avatar, assistant avatar): no user image, Candid logo for the bot.
                avatar_images=(
                    None,
                    os.path.join(ROOT, "static", "candid_logo_yellow.png")
                ),
                height="45vh",
                type="messages",
                show_label=False,
                show_copy_button=True,
                show_share_button=True,
                show_copy_all_button=True
            )
            msg = gr.MultimodalTextbox(label="Your message", interactive=True)
            # Hidden field that carries the conversation id between submissions.
            thread_id = gr.Text(visible=False, value="", label="thread_id")
            gr.ClearButton(components=[msg, chatbot, thread_id], size="sm")

        # pylint: disable=no-member
        # On submit: run the chat turn, then post-process the chatbot contents.
        chat_msg = msg.submit(
            fn=execute,
            inputs=[thread_id, msg, chatbot, max_new_tokens, es_indices],
            outputs=[msg, chatbot, thread_id]
        )
        chat_msg.then(format_chat_ag_response, chatbot, chatbot, api_name="bot_response")
        logged = LoggedComponents(
            context=[thread_id, chatbot]
        )
    return logged, demo
|
97 |
+
|
98 |
+
|
99 |
+
# Script entry point: build the UI and serve it behind a login form.
if __name__ == '__main__':
    _, app = build_chat()
    app.queue(max_size=5).launch(
        show_api=False,
        # Two accepted (username, password) pairs, all read from environment variables.
        # NOTE(review): os.getenv returns None for an unset variable, so a missing
        # variable yields a (None, None) pair here - confirm the deployment defines all four.
        auth=[
            (os.getenv("APP_USERNAME"), os.getenv("APP_PASSWORD")),
            (os.getenv("APP_PUBLIC_USERNAME"), os.getenv("APP_PUBLIC_PASSWORD")),
        ],
        auth_message="Login to Candid's AI assistant",
        ssr_mode=False
    )
|
chat.py
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Optional, Dict, Any, TypedDict, Annotated, Sequence
|
2 |
+
from functools import partial
|
3 |
+
import os
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
|
7 |
+
from langchain_core.messages import AIMessage, BaseMessage
|
8 |
+
from langchain_core.output_parsers import StrOutputParser
|
9 |
+
from langchain_core.prompts import ChatPromptTemplate
|
10 |
+
from langchain_core.language_models.llms import LLM
|
11 |
+
|
12 |
+
from langgraph.prebuilt import tools_condition, ToolNode
|
13 |
+
from langgraph.checkpoint.memory import MemorySaver
|
14 |
+
from langgraph.graph.state import StateGraph
|
15 |
+
from langgraph.graph.message import add_messages
|
16 |
+
from langgraph.constants import START, END
|
17 |
+
|
18 |
+
try:
|
19 |
+
from utils import html_format_docs_chat, get_session_id
|
20 |
+
from tools.question_reformulation import reformulate_question_using_history
|
21 |
+
from tools.org_seach import (
|
22 |
+
extract_org_links_from_chatbot,
|
23 |
+
embed_org_links_in_text,
|
24 |
+
generate_org_link_dict,
|
25 |
+
)
|
26 |
+
from retrieval.elastic import retriever_tool
|
27 |
+
except ImportError:
|
28 |
+
from .utils import html_format_docs_chat, get_session_id
|
29 |
+
from .tools.question_reformulation import reformulate_question_using_history
|
30 |
+
from .tools.org_seach import (
|
31 |
+
extract_org_links_from_chatbot,
|
32 |
+
embed_org_links_in_text,
|
33 |
+
generate_org_link_dict,
|
34 |
+
)
|
35 |
+
from .retrieval.elastic import retriever_tool
|
36 |
+
|
37 |
+
# Absolute path of the directory containing this module.
ROOT = os.path.dirname(os.path.abspath(__file__))
|
38 |
+
|
39 |
+
# TODO https://www.metadocs.co/2024/08/29/simple-domain-specific-corrective-rag-with-langchain-and-langgraph/
|
40 |
+
|
41 |
+
|
42 |
+
class AgentState(TypedDict):
    """State carried between the nodes of the chat graph."""

    # The add_messages function defines how an update should be processed
    # Default is to replace. add_messages says "append"
    messages: Annotated[Sequence[BaseMessage], add_messages]
    # Content of the latest message, recorded by search_agent and reused as the
    # question by generate_with_context.
    user_input: str
    # Organization link mapping written by has_org_name and read by insert_org_link.
    org_dict: Dict
|
48 |
+
|
49 |
+
|
50 |
+
def search_agent(state, llm: LLM, tools) -> AgentState:
    """Ask the tool-enabled model how to respond to the conversation so far.

    The model is bound to ``tools`` and invoked on the full message history; its reply
    either carries a tool call or is a direct answer.

    Parameters
    ----------
    state : AgentState
        The current state; only ``state["messages"]`` is read.
    llm : LLM
        Chat model to bind the tools to.
    tools : list
        Tools made available to the model.

    Returns
    -------
    AgentState
        Update holding the model reply (as a one-element list, so that it is appended to
        the existing messages) and the content of the latest message as ``user_input``.
    """
    print("---SEARCH AGENT---")
    history = state["messages"]
    latest_text = history[-1].content

    tool_aware_llm = llm.bind_tools(tools)
    reply = tool_aware_llm.invoke(history)
    return {"messages": [reply], "user_input": latest_text}
|
76 |
+
|
77 |
+
|
78 |
+
def generate_with_context(state, llm: LLM) -> AgentState:
    """Generate an answer to the user's question from the retrieved context.

    Parameters
    ----------
    state : AgentState
        The current state. The last entry of ``state["messages"]`` supplies the context
        text (``content``) and the retrieved documents (``artifact``);
        ``state["user_input"]`` supplies the question.
    llm : LLM
        Chat model used to produce the answer.

    Returns
    -------
    AgentState
        Update with the answer wrapped in an ``AIMessage`` and the unchanged question.
    """

    print("---GENERATE ANSWER---")
    messages = state["messages"]
    question = state["user_input"]
    last_message = messages[-1]

    sources_str = last_message.content
    sources_list = last_message.artifact  # cannot use directly as list of Documents
    if sources_list:
        print("---ADD SOURCES---")
        # Only format when there is something to show; the HTML message is appended to
        # the state's list in place, and run_chat later looks for a message of type "HTML".
        sources_html = html_format_docs_chat(sources_list)
        state["messages"].append(BaseMessage(content=sources_html, type="HTML"))

    # Prompt
    qa_system_prompt = """
    You are an assistant for question-answering tasks in the social and philanthropic sector. \n
    Use the following pieces of retrieved context to answer the question at the end. \n
    If you don't know the answer, just say that you don't know. \n
    Keep the response professional, friendly, and as concise as possible. \n
    Question: {question}
    Context: {context}
    Answer:
    """

    qa_prompt = ChatPromptTemplate(
        [
            ("system", qa_system_prompt),
            # Use a placeholder rather than the raw question: message strings are parsed
            # as templates, so user text containing "{" or "}" would otherwise be read as
            # template variables and break formatting.
            ("human", "{question}"),
        ]
    )

    rag_chain = qa_prompt | llm | StrOutputParser()
    response = rag_chain.invoke({"context": sources_str, "question": question})
    # Wrap explicitly: the original author observed a plain string being treated as a
    # HumanMessage when returned directly.
    return {"messages": [AIMessage(content=response)], "user_input": question}
|
130 |
+
|
131 |
+
|
132 |
+
def has_org_name(state: AgentState) -> AgentState:
    """
    Inspect the newest message for organization names and pick the next graph step.

    Args:
        state (AgentState): The current state of the agent, including a list of messages.

    Returns:
        dict: ``{"next": "insert_org_link", "org_dict": ...}`` when organization links were
        produced, otherwise ``{"next": END, "messages": ...}`` with the messages unchanged.
    """
    print("---HAS ORG NAMES?---")
    history = state["messages"]
    candidates = extract_org_links_from_chatbot(history[-1].content)

    org_links = {}
    if candidates:
        org_links = generate_org_link_dict(candidates)

    if not org_links:
        print("---NO ORG NAMES FOUND---")
        return {"next": END, "messages": history}

    print("---FOUND ORG NAMES---")
    return {"next": "insert_org_link", "org_dict": org_links}
|
152 |
+
|
153 |
+
|
154 |
+
def insert_org_link(state: AgentState) -> AgentState:
    """
    Replace the newest message with a copy whose organization names carry links.

    Args:
        state (dict): The current state, including the organization links and latest message.

    Returns:
        dict: A dictionary with the updated message content as an AIMessage.
    """
    print("---INSERT ORG LINKS---")
    history = state["messages"]
    plain_text = history[-1].content
    # Drop the un-linked original in place; its linked replacement is returned below.
    history.pop()
    linked_text = embed_org_links_in_text(plain_text, state["org_dict"])
    return {"messages": [AIMessage(content=linked_text)]}
|
171 |
+
|
172 |
+
|
173 |
+
def build_compute_graph(llm: LLM, indices: List[str]) -> StateGraph:
    """Wire up the (uncompiled) chat graph.

    Flow: START -> reformulate -> search_agent -> (retrieve -> generate_with_context)?
    -> has_org_name -> (insert_org_link)? -> END.

    Parameters
    ----------
    llm : LLM
        Chat model shared by the reformulation, agent and generation nodes.
    indices : List[str]
        Source names handed to the retriever tool.

    Returns
    -------
    StateGraph
    """
    candid_retriever_tool = retriever_tool(indices=indices)
    workflow = StateGraph(AgentState)

    # Nodes, registered in the same order as the flow above.
    node_actions = {
        "reformulate": partial(reformulate_question_using_history, llm=llm),
        "search_agent": partial(search_agent, llm=llm, tools=[candid_retriever_tool]),
        "retrieve": ToolNode([candid_retriever_tool]),
        "generate_with_context": partial(generate_with_context, llm=llm),
        "has_org_name": has_org_name,
        "insert_org_link": insert_org_link,
    }
    for node_name, action in node_actions.items():
        workflow.add_node(node_name, action)

    workflow.add_edge(START, "reformulate")
    workflow.add_edge("reformulate", "search_agent")
    # A tool call routes to retrieval; otherwise skip straight to the org-name check.
    workflow.add_conditional_edges(
        source="search_agent",
        path=tools_condition,  # TODO just a conditional edge here?
        path_map={
            "tools": "retrieve",
            "__end__": "has_org_name",
        },
    )
    workflow.add_edge("retrieve", "generate_with_context")
    workflow.add_edge("generate_with_context", "has_org_name")
    # has_org_name chooses the branch through the "next" key of its return value.
    workflow.add_conditional_edges(
        "has_org_name",
        lambda x: x["next"],
        {"insert_org_link": "insert_org_link", END: END},
    )
    workflow.add_edge("insert_org_link", END)

    return workflow
|
212 |
+
|
213 |
+
|
214 |
+
def run_chat(
    thread_id: str,
    user_input: Dict[str, Any],
    chatbot: List[Dict],
    llm: LLM,
    indices: Optional[List[str]] = None,
):
    """Run one chat turn through the graph and append the outcome to ``chatbot``.

    Parameters
    ----------
    thread_id : str
        Conversation id; passed through ``get_session_id`` before use.
    user_input : Dict[str, Any]
        Textbox value; its ``"text"`` entry is the user's message.
    chatbot : List[Dict]
        Chat history in "messages" format; extended in place.
    llm : LLM
        Chat model used by the graph nodes.
    indices : Optional[List[str]], optional
        Source names to search, by default None.

    Returns
    -------
    tuple
        A cleared multimodal textbox, the updated history and the thread id.
    """
    # https://langchain-ai.github.io/langgraph/tutorials/rag/langgraph_agentic_rag/#graph

    chatbot.append({"role": "user", "content": user_input["text"]})
    graph_input = {"messages": chatbot}
    # thread_id can be an email https://github.com/yurisasc/memory-enhanced-ai-assistant/blob/main/assistant.py
    thread_id = get_session_id(thread_id)
    run_config = {"configurable": {"thread_id": thread_id}}

    # TODO: don't use MemorySaver for Prod
    compiled = build_compute_graph(llm=llm, indices=indices).compile(checkpointer=MemorySaver())
    result_messages = compiled.invoke(graph_input, config=run_config)["messages"]

    ai_answer = result_messages[-1].content
    # The sources card, if any, is an "HTML"-typed message among the last two entries.
    html_contents = [m.content for m in result_messages[-2:] if m.type == "HTML"]
    sources_html = html_contents[-1] if html_contents else ""

    chatbot.append({"role": "assistant", "content": ai_answer})
    if sources_html:
        sources_entry = {
            "role": "assistant",
            "content": sources_html,
            "metadata": {"title": "Sources HTML"},
        }
        chatbot.append(sources_entry)

    return gr.MultimodalTextbox(value=None, interactive=True), chatbot, thread_id
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
elasticsearch==7.17.6
|
2 |
+
gradio
|
3 |
+
langchain
|
4 |
+
langchain-openai
|
5 |
+
langgraph
|
6 |
+
pydantic
|
7 |
+
fuzzywuzzy
|
retrieval/__init__.py
ADDED
File without changes
|
retrieval/candid_blog.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, Tuple, Any
|
2 |
+
|
3 |
+
|
4 |
+
def build_wp_candid_blog_knn_model_query(
    query: str,
    fields: Tuple[str, ...] = (
        "content",
        "title"
    ),
    k: int = 10,
    model_id: str = "sentence-transformers__all-mpnet-base-v2"
) -> Dict[str, Any]:
    """Build a kNN search body with one clause per embedded field.

    Parameters
    ----------
    query : str
        Text to embed with ``model_id`` at query time.
    fields : Tuple[str, ...], optional
        Document fields whose chunk vectors are searched, by default ("content", "title").
    k : int, optional
        Number of neighbours requested per field, by default 10.
    model_id : str, optional
        Identifier of the text-embedding model.

    Returns
    -------
    Dict[str, Any]
        ``{"knn": [...]}`` with one clause per field; every clause gets an equal share
        of the boost (``1 / len(fields)``).
    """
    # Loop-invariant; guarded so an empty ``fields`` still yields an empty clause list.
    boost = 1 / len(fields) if fields else 0.0

    clauses = [
        {
            "field": f"embeddings.{f}.chunks.vector.predicted_value",
            "k": k,
            "num_candidates": 100,
            "query_vector_builder": {
                "text_embedding": {
                    "model_id": model_id,
                    "model_text": query
                }
            },
            "boost": boost
        }
        for f in fields
    ]
    return {"knn": clauses}
|
29 |
+
|
30 |
+
|
31 |
+
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
    """Render a Candid blog post as an HTML card.

    Parameters
    ----------
    doc : Dict[str, Any]
        Blog document. ``link`` is required; ``title`` and ``excerpt`` are optional and
        render as empty when missing or ``None``.
    height_px : int, optional
        Card height in pixels, by default 200.
    show_chunks : bool, optional
        Accepted for signature compatibility; not used by this renderer.

    Returns
    -------
    str
        HTML snippet. Title and excerpt share a 999-character budget; text beyond it is
        cut and marked with ``[...]``.
    """
    url = f"{doc['link']}"
    char_budget = 999

    fields_dict = {}
    fields_len = 0
    for field in ("title", "excerpt"):
        value = doc.get(field)
        if value is None:
            fields_dict[field] = ""
            fields_dict[field + "_txt"] = ""
            continue

        fields_dict[field] = value
        rendered = f"<div>{value}</div>"
        if fields_len + len(value) > char_budget:
            rest_text_len = char_budget - fields_len
            if rest_text_len > 0:
                rendered = f"<div>{value[:rest_text_len] + '[...]'}</div>"
            else:
                # Budget already spent by earlier fields: show only the marker.
                rendered = "<span>[...]</span>"
        fields_dict[field + "_txt"] = rendered
        fields_len += len(value)

    # Read the title from the normalised mapping so a missing title renders as empty
    # instead of raising KeyError.
    title = fields_dict["title"]
    excerpt_html = fields_dict["excerpt_txt"]

    html = f"""
    <div style='height: {height_px}px; padding: 5px;'>
        <div style='height: {height_px}px; border: 1px solid #febe10;'>
            <span style='padding-left: 10px; display: inline-block; width: 100%;'>
                <div>
                    <span>
                        <b>Candid blog post:</b>
                        <a href='{url}' target='_blank' style='text-decoration: none;'>
                            {title}
                        </a>
                    </span>
                    <br>
                    <br>
                    {excerpt_html}
                </div>
            </span>
        </div>
    </div>
    """
    return html
|
retrieval/candid_help.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, Any
|
2 |
+
|
3 |
+
|
4 |
+
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
    """Render a Candid help article as an HTML card (label plus linked title).

    Parameters
    ----------
    doc : Dict[str, Any]
        Help-article document. ``link`` is required; ``title`` is optional and renders
        as empty when missing or ``None``.
    height_px : int, optional
        Card height in pixels, by default 200.
    show_chunks : bool, optional
        Accepted for signature compatibility; not used by this renderer.

    Returns
    -------
    str
        HTML snippet for the card.
    """
    url = f"{doc['link']}"

    # Only the title is rendered. The previous per-field truncation pass (title/summary)
    # computed values that were never placed in the markup, so it has been removed.
    title = doc.get("title")
    if title is None:
        title = ""

    html = f"""
    <div style='height: {height_px}px; padding: 5px;'>
        <div style='height: {height_px}px; border: 1px solid #febe10;'>
            <span style='padding-left: 10px; display: inline-block; width: 100%;'>
                <div>
                    <span>
                        <b>Candid help article:</b>
                        <a href='{url}' target='_blank' style='text-decoration: none;'>
                            {title}
                        </a>
                    </span>
                    <br>
                </div>
            </span>
        </div>
    </div>
    """
    return html
|
retrieval/candid_learning.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, Any
|
2 |
+
|
3 |
+
|
4 |
+
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
    """Render a Candid Learning resource as an HTML card (label plus linked title).

    Parameters
    ----------
    doc : Dict[str, Any]
        Learning-resource document. ``url`` is required; ``title`` is optional and
        renders as empty when missing or ``None``.
    height_px : int, optional
        Card height in pixels, by default 200.
    show_chunks : bool, optional
        Accepted for signature compatibility; not used by this renderer.

    Returns
    -------
    str
        HTML snippet for the card.
    """
    url = f"{doc['url']}"

    # Only the title is rendered. The previous per-field truncation pass (title/excerpt)
    # computed values that were never placed in the markup, so it has been removed.
    title = doc.get("title")
    if title is None:
        title = ""

    html = f"""
    <div style='height: {height_px}px; padding: 5px;'>
        <div style='height: {height_px}px; border: 1px solid #febe10;'>
            <span style='padding-left: 10px; display: inline-block; width: 100%;'>
                <div>
                    <span>
                        <b>Candid Learning resource:</b>
                        <a href='{url}' target='_blank' style='text-decoration: none;'>
                            {title}
                        </a>
                    </span>
                    <br>
                </div>
            </span>
        </div>
    </div>
    """
    return html
|
retrieval/config.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
# Elasticsearch connection settings, read from environment variables at import time.
# os.getenv returns None for any variable that is not set.
ONECANDID_QA = {
    'ES_URL': os.getenv('ONECANDID_URL_QA'),
    'ES_PWD': os.getenv('ONECANDID_PWD_QA'),
    'ES_UID': os.getenv('ONECANDID_UID_QA'),
    "ES_CLOUD_ID": os.getenv("ONECANDID_CLOUD_ID"),
    "ES_API_KEY": os.getenv("ONECANDID_API_KEY")
}
|
10 |
+
|
11 |
+
class Indices:
    """Namespace of Elasticsearch index names used by the retrieval code."""

    # NEWS_INDEX = "news-semantic-search-v1"
    # TRANSACTION_INDEX = "search-semantic-cds-transactions_v1"
    # ORGANIZATION_INDEX = "search-semantic-up-organizations_v2"
    ISSUELAB_INDEX = "search-semantic-issuelab_v1"
    ISSUELAB_INDEX_ELSER = "search-semantic-issuelab-elser_ve2"
    YOUTUBE_INDEX = "search-semantic-youtube_v1"
    YOUTUBE_INDEX_ELSER = "search-semantic-youtube-elser_ve1"
    CANDID_BLOG_INDEX = "search-semantic-candid-blog_v1"
    CANDID_BLOG_INDEX_ELSER = "search-semantic-candid-blog-elser_ve2"
    # NOTE(review): unlike the other *_ELSER constants, this value has no "-elser"
    # segment - verify the index name against the cluster.
    CANDID_LEARNING_INDEX_ELSER = "search-semantic-candid-learning_ve1"
    CANDID_HELP_INDEX_ELSER = "search-semantic-candid-help-elser_ve1"
|
23 |
+
|
24 |
+
|
25 |
+
# Logical source names. app.py offers these as the "Sources to include" choices, and
# retrieval/elastic.py's query_builder falls back to them when no indices are given.
ALL_INDICES = (
    "issuelab",
    "youtube",
    "candid_blog",
    "candid_learning",
    "candid_help"
)
|
retrieval/elastic.py
ADDED
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Tuple, Dict, Iterable, Iterator, Optional, Any
|
2 |
+
from dataclasses import dataclass
|
3 |
+
from functools import partial
|
4 |
+
from itertools import groupby
|
5 |
+
|
6 |
+
from pydantic import BaseModel, Field
|
7 |
+
from langchain_core.documents import Document
|
8 |
+
from langchain_core.tools import Tool
|
9 |
+
|
10 |
+
from elasticsearch import Elasticsearch
|
11 |
+
|
12 |
+
try:
|
13 |
+
# from news import build_knn_query as news_query
|
14 |
+
# from up_orgs import build_organizations_knn_model_query as org_query
|
15 |
+
# from cds import build_transactions_knn_query as transactions_query
|
16 |
+
from config import ONECANDID_QA, ALL_INDICES, Indices
|
17 |
+
except ImportError:
|
18 |
+
# from .news import build_knn_query as news_query
|
19 |
+
# from .up_orgs import build_organizations_knn_model_query as org_query
|
20 |
+
# from .cds import build_transactions_knn_query as transactions_query
|
21 |
+
from .config import ONECANDID_QA, ALL_INDICES, Indices
|
22 |
+
|
23 |
+
|
24 |
+
@dataclass
class ElasticHitsResult:
    """One Elasticsearch hit, as collected by ``multi_search``."""

    # Taken from the hit's "_index".
    index: str
    # Taken from the hit's "_id".
    id: Any
    # Taken from the hit's "_score"; overwritten with a normalised value by ``reranker``.
    score: float
    # Taken from the hit's "_source".
    source: Dict[str, Any]
    # Taken from the hit's "inner_hits"; an empty dict when the hit has none.
    inner_hits: Dict[str, Any]
|
33 |
+
|
34 |
+
|
35 |
+
class RetrieverInput(BaseModel):
    """Input to the Elasticsearch retriever."""

    # Single free-text field; the Field description is kept on the model.
    user_input: str = Field(description="query to look up in retriever")
|
38 |
+
|
39 |
+
|
40 |
+
def build_text_expansion_query(
    query: str,
    fields: Tuple[str, ...],
    model_id: str = ".elser_model_2_linux-x86_64"
) -> Dict[str, Any]:
    """Build a ``bool``/``should`` query with one nested text-expansion clause per field.

    Parameters
    ----------
    query : str
        Text passed to the model as ``model_text``.
    fields : Tuple[str, ...]
        Document fields; each one is searched under ``embeddings.<field>.chunks``.
    model_id : str, optional
        Identifier of the text-expansion model.

    Returns
    -------
    Dict[str, Any]
        ``{"query": {"bool": {"should": [...]}}}``. Every clause gets an equal share of
        the boost (``1 / len(fields)``) and asks for up to two matching chunks as inner hits.
    """
    # Loop-invariant; guarded so an empty ``fields`` still yields an empty clause list.
    boost = 1 / len(fields) if fields else 0.0

    clauses = [
        {
            "nested": {
                "path": f"embeddings.{f}.chunks",
                "query": {
                    "text_expansion": {
                        f"embeddings.{f}.chunks.vector": {
                            "model_id": model_id,
                            "model_text": query,
                            "boost": boost
                        }
                    }
                },
                "inner_hits": {
                    "_source": False,
                    "size": 2,
                    "fields": [f"embeddings.{f}.chunks.chunk"]
                }
            }
        }
        for f in fields
    ]
    return {"query": {"bool": {"should": clauses}}}
|
69 |
+
|
70 |
+
|
71 |
+
def query_builder(query: str, indices: List[str], **kwargs):
    """Build the flat header/body list for a multi-search over the requested sources.

    Parameters
    ----------
    query : str
        Search text.
    indices : List[str]
        Source names to query; ``None`` selects every name in ``ALL_INDICES``.
    **kwargs
        Accepted but not used.

    Returns
    -------
    list
        Alternating ``{"index": <index name>}`` headers and query bodies. Names without
        an entry in the table below (including "news", "organizations" and "grants",
        which are currently disabled) contribute nothing.
    """
    if indices is None:
        indices = list(ALL_INDICES)

    # source name -> (fields to search, _source excludes, result size, target index)
    source_specs = {
        "issuelab": (
            ("description", "content", "combined_issuelab_findings", "combined_item_description"),
            ("embeddings",),
            1,
            Indices.ISSUELAB_INDEX_ELSER,
        ),
        "youtube": (
            ("captions_cleaned", "description_cleaned", "title"),
            # text_cleaned duplicates captions_cleaned
            ("embeddings", "captions", "description", "text_cleaned"),
            2,
            Indices.YOUTUBE_INDEX_ELSER,
        ),
        "candid_blog": (
            ("content", "title"),
            ("embeddings",),
            2,
            Indices.CANDID_BLOG_INDEX_ELSER,
        ),
        "candid_learning": (
            ("content", "title", "training_topics", "staff_recommendations"),
            ("embeddings",),
            2,
            Indices.CANDID_LEARNING_INDEX_ELSER,
        ),
        "candid_help": (
            ("content", "combined_article_description"),
            ("embeddings",),
            2,
            Indices.CANDID_HELP_INDEX_ELSER,
        ),
    }

    request_lines = []
    for source_name in indices:
        spec = source_specs.get(source_name)
        if spec is None:
            continue
        search_fields, excluded, size, target_index = spec
        body = build_text_expansion_query(query=query, fields=search_fields)
        body["_source"] = {"excludes": list(excluded)}
        body["size"] = size
        request_lines.extend([{"index": target_index}, body])

    return request_lines
|
138 |
+
|
139 |
+
|
140 |
+
def multi_search(queries: List[dict]) -> List[ElasticHitsResult]:
    """Run a batch of queries through the Elasticsearch multi-search API.

    Parameters
    ----------
    queries : List[dict]
        Flat msearch body: alternating header dictionaries (``{"index": ...}``)
        and query dictionaries.

    Returns
    -------
    List[ElasticHitsResult]
        One entry per returned hit, in the order the responses came back.
    """

    # An empty msearch body is rejected by the server; nothing to search means
    # nothing found, so skip the round trip entirely.
    if not queries:
        return []

    results = []
    # NOTE(review): verify_certs=False turns off TLS certificate validation;
    # confirm this is intentional for the target cluster.
    with Elasticsearch(
        cloud_id=ONECANDID_QA["ES_CLOUD_ID"],
        api_key=ONECANDID_QA["ES_API_KEY"],
        verify_certs=False,
        request_timeout=60 * 3
    ) as es:
        for query_group in es.msearch(body=queries).get("responses", []):
            # A response without a "hits" section simply contributes no hits.
            for raw_hit in query_group.get("hits", {}).get("hits", []):
                results.append(
                    ElasticHitsResult(
                        index=raw_hit["_index"],
                        id=raw_hit["_id"],
                        score=raw_hit["_score"],
                        source=raw_hit["_source"],
                        inner_hits=raw_hit.get("inner_hits", {})
                    )
                )
    return results
|
159 |
+
|
160 |
+
|
161 |
+
def get_query_results(search_text: str, indices: Optional[List[str]] = None):
    """Build the per-index queries for ``search_text`` and run them in one batch.

    Parameters
    ----------
    search_text : str
        Free-text query.
    indices : Optional[List[str]], optional
        Names of the sources to query; forwarded unchanged to ``query_builder``.

    Returns
    -------
    Whatever ``multi_search`` returns for the built queries.
    """
    return multi_search(query_builder(query=search_text, indices=indices))
|
164 |
+
|
165 |
+
|
166 |
+
def reranker(query_results: Iterable[ElasticHitsResult]) -> Iterator[ElasticHitsResult]:
    """Reranks Elasticsearch hits coming from multiple indices/queries which may have scores on different scales.

    Scores are min-max normalised within each index and the hits are then
    yielded in descending order of the normalised score, so hits from
    different indices end up interleaved. The ``score`` attribute of every hit
    is overwritten in place with its normalised value.

    Parameters
    ----------
    query_results : Iterable[ElasticHitsResult]

    Yields
    ------
    Iterator[ElasticHitsResult]
    """

    # Group by index with a dict instead of itertools.groupby: groupby only
    # merges *adjacent* items, so hits of one index that are not contiguous
    # would be normalised as several independent groups.
    grouped: dict = {}
    for hit in query_results:
        grouped.setdefault(hit.index, []).append(hit)

    results: List[ElasticHitsResult] = []
    for hits in grouped.values():
        scores = [h.score for h in hits]
        max_score = max(scores)
        min_score = min(scores)

        for h in hits:
            # The epsilon avoids a division by zero when all scores are equal.
            # NOTE(review): an index that returned a single hit therefore
            # always gets a normalised score of 0 and sorts last.
            h.score = (h.score - min_score) / (max_score - min_score + 1e-9)
            results.append(h)

    yield from sorted(results, key=lambda x: x.score, reverse=True)
|
190 |
+
|
191 |
+
|
192 |
+
def get_results(user_input: str, indices: List[str]) -> tuple[str, list]:
    """Search the given indices and return the result as ``(content, artifact)``.

    Parameters
    ----------
    user_input : str
        Free-text query.
    indices : List[str]
        Names of the sources to search.

    Returns
    -------
    tuple[str, list]
        ``content`` is the page content of all retrieved documents joined by
        blank lines; ``artifact`` is the list of documents themselves. When
        nothing usable is found, ``content`` is a fixed "no sources" message
        and ``artifact`` is a one-element list holding that same message.
    """
    output = ["Search didn't return any Candid sources"]
    content = "Search didn't return any Candid sources"
    results = get_query_results(search_text=user_input, indices=indices)
    if results:
        # process_hit yields None for hits from an unrecognised index; those
        # carry no page content and must not reach the join below.
        documents = [doc for doc in get_reranked_results(results) if doc is not None]
        if documents:
            output = documents
            # Real blank-line separator (the original "/n/n" was emitted literally).
            content = "\n\n".join(doc.page_content for doc in documents)
    # for the tool we need to return a tuple for content_and_artifact type
    return content, output
|
204 |
+
|
205 |
+
|
206 |
+
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024) -> str:
    """Pads the relevant chunk of text with context before and after

    Parameters
    ----------
    field_name : str
        a field with the long text that was chunked into pieces
    hit : ElasticHitsResult
    context_length : int, optional
        length of text to add before and after the chunk, by default 1024

    Returns
    -------
    str
        longer chunks stuffed together, separated by blank lines; an empty
        string when the hit has no matching inner hits for ``field_name`` or
        none of the chunks can be located in the full text
    """

    # A field that is present but null is treated like a missing one.
    long_text = hit.source.get(f"{field_name}", "") or ""
    inner_hits_field = f"embeddings.{field_name}.chunks"
    found_chunks = hit.inner_hits.get(inner_hits_field, {})
    if not found_chunks:
        return ""

    chunks_with_context = []
    for h in found_chunks.get("hits", {}).get("hits", []):
        chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
        # trim both edges because we may have tokenizing artefacts there
        chunk = chunk[3:-3]
        if not chunk:
            # Very short chunks vanish after trimming; str.find("") would then
            # report a match at offset 0 and add unrelated leading text.
            continue

        # Find the start and end indices of the chunk in the large text
        start_index = long_text.find(chunk)
        if start_index == -1:
            continue

        end_index = start_index + len(chunk)
        pre_start_index = max(0, start_index - context_length)
        post_end_index = min(len(long_text), end_index + context_length)
        chunks_with_context.append(long_text[pre_start_index:post_end_index])

    return '\n\n'.join(chunks_with_context)
|
244 |
+
|
245 |
+
|
246 |
+
def process_hit(hit: ElasticHitsResult) -> Document | None:
    """Turn one Elasticsearch hit into a ``Document``.

    The index name of the hit decides which source fields make up the page
    content and which identifier/URL go into the metadata. Long text fields
    are reduced to the matched chunks (plus a little context) via
    ``get_context``. Hits from an unrecognised index yield ``None``.
    """
    index_name = hit.index
    src = hit.source

    if "issuelab-elser" in index_name:
        parts = [
            src.get("combined_item_description", ""),  # title inside
            src.get("combined_issuelab_findings", ""),
            src.get("description", ""),
            # we only need to process long texts
            get_context("content", hit, context_length=12),
        ]
        return Document(
            page_content='\n\n'.join(parts),
            metadata={
                "source": "IssueLab",
                "source_id": src["resource_id"],
                "url": src.get("permalink", "")
            }
        )

    if "youtube" in index_name:
        parts = [
            src.get("title", ""),
            # we only need to process long texts
            get_context("description_cleaned", hit, context_length=12),
            get_context("captions_cleaned", hit, context_length=12),
        ]
        return Document(
            page_content='\n\n'.join(parts),
            metadata={
                "source": "Candid Youtube",
                "source_id": src['video_id'],
                "url": f"https://www.youtube.com/watch?v={src['video_id']}"
            }
        )

    if "candid-blog" in index_name:
        blog_excerpt = src.get("excerpt", "")
        blog_title = src.get("title", "")
        # we only need to process long texts
        blog_body = get_context("content", hit, context_length=12)
        return Document(
            page_content='\n\n'.join([blog_title, blog_excerpt, blog_body]),
            metadata={
                "source": "Candid Blog",
                "source_id": src["id"],
                "url": src["link"]
            }
        )

    if "candid-learning" in index_name:
        learning_title = src.get("title", "")
        learning_body = get_context("content", hit, context_length=12)
        topics = src.get("training_topics", "")
        recommendations = src.get("staff_recommendations", "")
        return Document(
            page_content='\n\n'.join([learning_title, recommendations, topics, learning_body]),
            metadata={
                "source": "Candid Learning",
                "source_id": src["post_id"],
                "url": src.get("url", "")
            }
        )

    if "candid-help" in index_name:
        help_body = get_context("content", hit, context_length=12)
        article_description = src.get("combined_article_description", "")
        return Document(
            page_content='\n\n'.join([article_description, help_body]),
            metadata={
                "source": "Candid Help",
                "source_id": src["id"],
                "url": src.get("link", "")
            }
        )

    # Unknown index: nothing to build.
    return None
|
322 |
+
|
323 |
+
|
324 |
+
def get_reranked_results(results: List[ElasticHitsResult]) -> List[Document]:
    """Rerank raw hits and convert them into documents.

    Parameters
    ----------
    results : List[ElasticHitsResult]
        Hits as returned by the search, possibly from several indices.

    Returns
    -------
    List[Document]
        Documents in reranked order. Hits for which ``process_hit`` returns
        ``None`` are left out so the list only ever contains documents.
    """
    output = []
    for r in reranker(results):
        doc = process_hit(r)
        if doc is not None:
            output.append(doc)
    return output
|
330 |
+
|
331 |
+
|
332 |
+
def retriever_tool(indices: List[str]) -> Tool:
    """Create the retrieval tool bound to a fixed set of indices.

    The tool calls ``get_results`` with ``indices`` pre-filled and is declared
    with the ``content_and_artifact`` response format.
    """
    # cannot use create_retriever_tool because it only provides content losing all metadata on the way
    # https://python.langchain.com/docs/how_to/custom_tools/#returning-artifacts-of-tool-execution
    search_fn = partial(get_results, indices=indices)
    tool_description = (
        "Return additional information about social and philanthropic sector, "
        "including nonprofits (NGO), grants, foundations, funding, RFP, LOI, Candid."
    )
    return Tool(
        name="retrieve_social_sector_information",
        func=search_fn,
        description=tool_description,
        args_schema=RetrieverInput,
        response_format="content_and_artifact"
    )
|
retrieval/elastic_qa.py
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Dict, Any, Optional
|
2 |
+
import re
|
3 |
+
import os
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings
|
8 |
+
from langchain_core.documents import Document
|
9 |
+
|
10 |
+
from ...socialspark.retrieval_qa.elastic import ElasticHitsResult, ElasticsearchQABase
|
11 |
+
|
12 |
+
from .elastic import query_builder
|
13 |
+
from .config import ALL_INDICES
|
14 |
+
|
15 |
+
# One caption block: "H:MM:SS.mmm,H:MM:SS.mmm" on the first line, text after it.
# NOTE(review): "." does not match newlines here, so a block whose text spans
# several lines does not match and is skipped by the parser - confirm intended.
timestamp_pattern = re.compile(r"^(\d:\d{2}:\d{2}\.\d{3}),(\d:\d{2}:\d{2}\.\d{3})\n(.*)$")


def _timestamp_to_minutes(timestamp: str) -> int:
    """Return the whole minutes (hours included) of an ``H:MM:SS.mmm`` timestamp."""
    hours, minutes, _ = timestamp.split(':')
    return int(hours) * 60 + int(minutes)


def parse_youtube_captions(caption_text: str) -> List[str]:
    """Split timestamped captions into roughly ten time-based text chunks.

    Parameters
    ----------
    caption_text : str
        Caption blocks separated by blank lines, each matching
        ``timestamp_pattern``. Blocks that do not match are ignored.

    Returns
    -------
    List[str]
        Caption text grouped into chunks that each cover at least one minute
        (about a tenth of the video for longer videos). Empty when there is
        no caption text or no block matches.
    """
    if not caption_text:
        return []

    timestamp_blocks = re.split(r'\n{2}', caption_text)
    parsed_blocks = [m.groups() for block in timestamp_blocks if (m := timestamp_pattern.match(block))]
    if not parsed_blocks:
        return []

    video_length_mins = _timestamp_to_minutes(parsed_blocks[-1][0])

    # what is the block timing in minutes which gives 10 chunks? keep minimum 1 minute breaks to maintain context
    minute_difference = max(video_length_mins // 10, 1)

    prev_minute_start = 0
    texts = []
    text_block = []
    for start, _, text in parsed_blocks:
        text_block.append(text)

        current_minute = _timestamp_to_minutes(start)
        if (current_minute - prev_minute_start) >= minute_difference:
            texts.append(' '.join(text_block))
            text_block = []
            prev_minute_start = current_minute

    # Captions after the last chunk boundary would otherwise be lost.
    if text_block:
        texts.append(' '.join(text_block))

    return texts
|
39 |
+
|
40 |
+
|
41 |
+
def parse_candid_learning(text: str) -> List[str]:
    """Split an article into stripped lines, dropping navigation boilerplate.

    The text is split on runs of newlines; every piece is stripped of
    surrounding whitespace, and pieces mentioning "back to top" or
    "table of contents" (case-insensitively) are discarded.
    """
    boilerplate_markers = ('back to top', 'table of contents')
    stripped_blocks = (piece.strip() for piece in re.split(r'\n{1,}', text, flags=re.I | re.M))
    return [
        block
        for block in stripped_blocks
        if not any(marker in block.lower() for marker in boilerplate_markers)
    ]
|
52 |
+
|
53 |
+
|
54 |
+
class ElasticsearchQA(ElasticsearchQABase):
    """Question-answering retriever over the configured Elasticsearch indices.

    Builds one query per configured index, and converts each hit into a
    ``Document`` whose content depends on the index the hit came from.
    """

    # Sources searched by default.
    indices: Optional[List[str]] = ALL_INDICES
    # Embedding client used to align document sections with the question.
    # It is created once, when the class body is executed.
    embedding: Any = HuggingFaceEndpointEmbeddings(
        huggingfacehub_api_token=os.getenv("HF_API_KEY"),
        # model="sentence-transformers/all-mpnet-base-v2"
        model="mixedbread-ai/mxbai-embed-large-v1"
    )

    def build_query(self, query: str, **kwargs) -> List[Dict[str, Any]]:
        """Return the multi-search body for ``query`` over ``self.indices``."""
        return query_builder(query=query, indices=self.indices)

    def sub_section_alignment(self, query: str, document: List[str]) -> str:
        """Return the sections of ``document`` most similar to ``query``.

        The query and every section are embedded and L2-normalised, so their
        dot product is the cosine similarity. The best ``max(5, 10%)``
        sections are returned, most similar first, joined by newlines.
        An empty ``document`` yields an empty string.
        """
        if not document:
            return ""

        question_vector = np.array(self.embedding.embed_query(query), dtype='float32')
        vectors = np.array(self.embedding.embed_documents(document), dtype='float32')

        size = max(5, int(0.1 * len(document)))

        vectors /= np.linalg.norm(vectors, ord=2.0, axis=-1, keepdims=True)
        question_vector /= np.linalg.norm(question_vector, ord=2.0, axis=-1, keepdims=True)
        similarity = (question_vector * vectors).sum(axis=-1)

        # Sorting on (1 - similarity) ascending puts the most similar first.
        ranked = sorted(zip(document, similarity), key=lambda x: 1 - x[-1])
        return '\n'.join(text for text, _ in ranked[:size])

    def process_hit(self, hit: ElasticHitsResult, q: str) -> Document | None:
        """Convert ``hit`` into a ``Document``; ``None`` for an unknown index.

        ``q`` is the user question; it is only used for sources whose long
        text is condensed with ``sub_section_alignment``.
        """
        if "news" in hit.index:
            doc = Document(
                page_content='\n\n'.join(v for k, v in hit.source["texts"].items() if v),
                metadata={
                    "source": "news",
                    "source_id": hit.source['metadata']['link']
                }
            )
        elif "transactions" in hit.index:
            doc = Document(
                page_content='\n\n'.join(v for k, v in hit.source["semantic_texts"].items() if v),
                metadata={
                    "source": "cds-transactions",
                    "source_id": hit.source["id"]
                }
            )
        elif "organizations" in hit.index:
            source = hit.source
            org_gen = source.get("combined_organization_description_general", "")
            org_fin = source.get("combined_organization_description_financial", "")
            org_contact = source.get("combined_organization_description_contacts", "")
            # These two fields may be present but null; join() needs strings.
            mission = source.get("mission_statement", None)
            if mission is None:
                mission = ""
            keyword = source.get("keyword", None)
            if keyword is None:
                keyword = ""
            programs = source.get("programs", "")
            doc = Document(
                page_content='\n\n'.join([org_gen, mission, keyword, org_fin, org_contact, programs]),
                metadata={
                    "source": "UP-organizations-QA",
                    "source_id": hit.source["candid_entity_id"]
                }
            )
        elif "issuelab-elser" in hit.index:
            title = hit.source.get("title", "")
            description = hit.source.get("description", "")
            doc = Document(
                page_content='\n\n'.join([title, description]),
                metadata={
                    "source": "IssueLab",
                    "source_id": hit.source["resource_id"],
                    "url": hit.source.get("permalink", "")
                }
            )
        elif "youtube" in hit.index:
            summary = self.sub_section_alignment(
                query=q,
                document=parse_youtube_captions(hit.source.get("text"))
            )
            doc = Document(
                page_content=summary,
                metadata={
                    "source": "Candid's Youtube channel",
                    "source_id": hit.source['video_id'],
                    "url": f"https://www.youtube.com/watch?v={hit.source['video_id']}"
                }
            )
        elif "candid-blog" in hit.index:
            excerpt = hit.source.get("excerpt", "")
            title = hit.source.get("title", "")
            doc = Document(
                page_content='\n\n'.join([title, excerpt]),
                metadata={
                    "source": "Candid Blog",
                    "source_id": hit.source["id"],
                    "url": hit.source["link"]
                }
            )
        elif "candid-learning" in hit.index:
            summary = self.sub_section_alignment(
                query=q,
                document=parse_candid_learning(hit.source.get("content", ""))
            )
            doc = Document(
                page_content=summary,
                metadata={
                    "source": "Candid Learning",
                    "source_id": hit.source["post_id"],
                    "url": hit.source.get("url", "")
                }
            )
        elif "candid-help" in hit.index:
            title = hit.source.get("title", "")
            content = hit.source.get("content", "")

            doc = Document(
                page_content='\n\n'.join([title, content]),
                metadata={
                    "source": "Candid Help",
                    "source_id": hit.source["id"],
                    "url": hit.source.get("link", "")
                }
            )
        else:
            doc = None
        return doc
|
retrieval/issuelab.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Dict, Tuple, Union, Any
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
|
6 |
+
def build_knn_query(
|
7 |
+
query: Union[str, np.ndarray, List[float]],
|
8 |
+
fields: Tuple[str] = (
|
9 |
+
"item_description_vector",
|
10 |
+
"description_vector",
|
11 |
+
"title_vector",
|
12 |
+
"content_vector",
|
13 |
+
"issuelab_key_findings_vector",
|
14 |
+
),
|
15 |
+
subjects_vector: List[float] | None = None,
|
16 |
+
population_vector: List[float] | None = None,
|
17 |
+
k: int = 10,
|
18 |
+
model_id: str = "sentence-transformers__all-mpnet-base-v2"
|
19 |
+
):
|
20 |
+
output = []
|
21 |
+
|
22 |
+
for f in fields:
|
23 |
+
if isinstance(query, str):
|
24 |
+
output.append({
|
25 |
+
"field": f"embeddings.{f}.predicted_value",
|
26 |
+
"k": k,
|
27 |
+
"num_candidates": 100,
|
28 |
+
"query_vector_builder": {
|
29 |
+
"text_embedding": {
|
30 |
+
"model_id": model_id,
|
31 |
+
"model_text": query
|
32 |
+
}
|
33 |
+
},
|
34 |
+
"boost": 1 / len(fields)
|
35 |
+
})
|
36 |
+
elif isinstance(query, (np.ndarray, list)):
|
37 |
+
output.append({
|
38 |
+
"field": f"embeddings.{f}.predicted_value",
|
39 |
+
"query_vector": list(query),
|
40 |
+
"k": k,
|
41 |
+
"num_candidates": 100,
|
42 |
+
})
|
43 |
+
|
44 |
+
if subjects_vector:
|
45 |
+
output.append({
|
46 |
+
"field": "embeddings.subjects_vector",
|
47 |
+
"query_vector": subjects_vector,
|
48 |
+
"k": k,
|
49 |
+
"num_candidates": 100,
|
50 |
+
})
|
51 |
+
|
52 |
+
if population_vector:
|
53 |
+
output.append({
|
54 |
+
"field": "embeddings.populations_vector",
|
55 |
+
"query_vector": population_vector,
|
56 |
+
"k": k,
|
57 |
+
"num_candidates": 100,
|
58 |
+
})
|
59 |
+
return {"knn": output}
|
60 |
+
|
61 |
+
|
62 |
+
def issuelab_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
    """Render one IssueLab document as an HTML card.

    Parameters
    ----------
    doc : Dict[str, Any]
        Document fields; the card reads ``cover_graphic_small``,
        ``resource_id``, ``issuelab_url``, ``title``, ``description`` and
        ``combined_item_description``, plus ``inner_hits`` when
        ``show_chunks`` is set.
    height_px : int, optional
        Card height; the cover image container is 10px shorter.
    show_chunks : bool, optional
        When true, append the matched chunks of content fields to the card.

    Returns
    -------
    str
        The HTML markup of the card.
    """
    chunks_html = ""
    if show_chunks:
        # we don't want other chunks: only inner-hit fields whose name mentions "content"
        content_chunks = [
            f"<div><p>{values[0]['chunk'][0]}</p></div>"
            for inner in doc["inner_hits"].values()
            for inner_hit in inner["hits"]["hits"]
            for field_key, values in inner_hit["fields"].items()
            if "content" in field_key
        ]
        chunks_html = "<span><b>Relevant parts of the content:</b></span>" + "<br>".join(content_chunks)

    return f"""
    <div style='height: auto; padding: 5px;'>
        <div style='border: 1px solid #febe10;'>
            <span style='display: inline-block; height: {height_px - 10}px; padding: 5px; vertical-align: top;'>
                <img
                    src='{doc['cover_graphic_small']}'
                    style='max-height: 100%; overflow: hidden; border-radius: 3%;'
                >
            </span>

            <span style='padding: 10px; display: inline-block; width: 70%;'>
                <div>
                    <span><b>Issuelab ID:</b> {doc['resource_id']}</span>
                    <br>
                    <span>
                        <a href='{doc['issuelab_url']}' target='_blank' style='text-decoration: none;'>
                            {doc['title']}
                        </a>
                    </span>
                    <br>

                    <span><b>Description:</b> {doc['description']}</span>
                    <br>
                    <div>{doc['combined_item_description']}</div>
                    <br>
                    <div>{chunks_html}</div>

                </div>
            </span>
        </div>
    </div>
    """
|
retrieval/youtube.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, Tuple, Any
|
2 |
+
|
3 |
+
|
4 |
+
def build_knn_model_query(
    query: str,
    fields: Tuple[str] = (
        "captions_cleaned",
        "description_cleaned",
        "title"
    ),
    k: int = 10,
    model_id: str = "sentence-transformers__all-mpnet-base-v2"
):
    """Build a ``knn`` search section that embeds ``query`` server-side.

    One clause is produced per entry of ``fields``, each targeting that
    field's chunk vectors, asking for ``k`` neighbours out of 100 candidates,
    and weighted equally (``1 / len(fields)``).

    Returns
    -------
    dict
        ``{"knn": [...]}`` with one clause per field.
    """
    knn_clauses = [
        {
            "field": f"embeddings.{field_name}.chunks.vector.predicted_value",
            "k": k,
            "num_candidates": 100,
            "query_vector_builder": {
                "text_embedding": {
                    "model_id": model_id,
                    "model_text": query
                }
            },
            "boost": 1 / len(fields)
        }
        for field_name in fields
    ]
    return {"knn": knn_clauses}
|
30 |
+
|
31 |
+
|
32 |
+
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
    """Render one YouTube video document as an HTML card with an embedded player.

    Parameters
    ----------
    doc : Dict[str, Any]
        Document fields. ``video_id`` is required; ``title`` and
        ``description_cleaned`` are optional and render as empty when missing
        or null.
    height_px : int, optional
        Height of the card in pixels, by default 200.
    show_chunks : bool, optional
        Accepted for signature parity with the other card builders; unused.

    Returns
    -------
    str
        The HTML markup of the card. Title and description share a budget of
        999 characters; a description exceeding what is left of the budget is
        cut and suffixed with ``[...]``.
    """
    video_id = doc['video_id']
    url = f"https://www.youtube.com/watch?v={video_id}"
    fields = ["title", "description_cleaned"]

    fields_dict = {}
    fields_len = 0
    for field in fields:
        value = doc.get(field, None)
        if value is None:
            fields_dict[field] = ""
            fields_dict[field + "_txt"] = ""
            continue

        fields_dict[field] = value
        fields_dict[field + "_txt"] = f"<div>{value}</div>"

        if (fields_len + len(value)) > 999:
            rest_text_len = 999 - fields_len
            if rest_text_len > 0:
                fields_dict[field + "_txt"] = f"<div>{value[:rest_text_len] + '[...]'}</div>"
            else:
                # Budget already used up by the preceding fields.
                fields_dict[field + "_txt"] = f"<span>{'[...]'}</span>"
        fields_len = fields_len + len(value)

    # Use the sanitised title so a missing/null title renders as empty text
    # instead of raising KeyError or printing "None".
    title = fields_dict["title"]
    description_html = fields_dict["description_cleaned_txt"]

    html = f"""
    <div style='height: {height_px}px; padding: 5px;'>
        <div style='height: {height_px}px; border: 1px solid #febe10;'>
            <span style='padding-left: 10px; display: inline-block; width: 100%;'>
                <div>
                    <span>
                        <b>Candid Youtube video:</b>
                        <a href='{url}' target='_blank' style='text-decoration: none;'>
                            {title}
                        </a>
                    </span>
                    <iframe
                        width="426"
                        height="240"
                        src="https://www.youtube.com/embed/{video_id}?si=0-y6eRrOzXTUSBDY"
                        title="YouTube video player"
                        frameborder="0"
                        allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
                        referrerpolicy="strict-origin-when-cross-origin"
                        allowfullscreen
                        style="display: inline-block; float: left;padding-right: 10px;padding-top: 5px;">
                    </iframe>
                    <br>
                    <br>
                    {description_html}
                </div>
            </span>
        </div>
    </div>
    """
    return html
|
search.py
ADDED
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Optional
|
2 |
+
import json
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
import requests
|
6 |
+
|
7 |
+
from .utils import html_format_doc
|
8 |
+
from .retrieval.up_orgs_keyword import organization_card_html
|
9 |
+
from .retrieval.elastic import reranker, get_query_results
|
10 |
+
from .retrieval.config import ALL_INDICES
|
11 |
+
from . import UP_QA_SEARCH_API
|
12 |
+
|
13 |
+
|
14 |
+
def run_search(search_text: str, indices: Optional[List[str]] = None):
    """Run the semantic search and render the reranked hits as one HTML block.

    Each hit's index name is mapped to a source name by substring match (the
    first matching marker wins); hits from an index matching no marker are
    formatted with ``source=None``.
    """
    # (substring of the index name, source name), checked in this order.
    index_markers = (
        ("news", "news"),
        ("transactions", "transactions"),
        ("organizations", "organizations"),
        ("issuelab-elser", "issuelab"),
        ("youtube-elser", "youtube"),
        ("candid-blog-elser", "candid_blog"),
        ("candid-learning", "candid_learning"),  # TODO fix that
        ("candid-help-elser", "candid_help"),
    )

    hits = get_query_results(search_text, indices=indices)

    cards = []
    for result in reranker(hits):
        source_name = next(
            (name for marker, name in index_markers if marker in result.index),
            None,
        )
        cards.append(html_format_doc(doc=result.source, source=source_name))
    return f"<div>{''.join(cards)}</div>"
|
46 |
+
|
47 |
+
|
48 |
+
def run_ks(search_text: str):
    """Run the keyword search and the semantic search for the same text.

    Returns
    -------
    tuple
        ``(keyword_html, semantic_html)``: the organization cards built from
        the keyword search API response, and the output of ``run_search``.
    """
    response = requests.post(
        url=UP_QA_SEARCH_API["API_URL"],
        json={"keyword": search_text, "rowCount": 10},
        headers={
            "accept": "application/json",
            "content-type": "application/json",
            "x-api-key": UP_QA_SEARCH_API["API_KEY"]
        },
        timeout=(5 * 60)
    )

    payload = json.loads(response.text)
    keyword_cards = []
    if payload.get("returnedOrgs", None) is not None:
        for returned_org in payload["returnedOrgs"]:
            # Translate the API's camelCase fields to the names the card expects.
            org = {
                "candid_entity_id": returned_org.get("candidEntityID", ""),
                "main_name": returned_org.get("orgName", ""),
                "logo": returned_org.get("logo", ""),
                "seal": returned_org.get("seal", {}),
                "city": returned_org.get("city", ""),
                "admin1": returned_org.get("admin1", ""),
                "country_name": returned_org.get("countryName", ""),
                "taxonomy": returned_org.get("taxonomy", {}),
            }
            # Only the mission-statement highlight is shown on the card.
            for highlight in returned_org.get("highlights", []) or []:
                if highlight["field"] == "mission_statement":
                    org["mission_statement"] = "; ".join(highlight["highlights"])

            keyword_cards.append(organization_card_html(org, 250))

    # Getting semantic results
    semantic_html = run_search(search_text=search_text)

    return f"<div>{''.join(keyword_cards)}</div>", semantic_html
|
88 |
+
|
89 |
+
|
90 |
+
def build_search_tab() -> gr.Blocks:
    """Build the "Semantic search" Gradio UI.

    The layout has a heading, a search text box, a collapsed
    "Advanced settings" accordion with one checkbox per entry of
    ``ALL_INDICES`` (all ticked initially), a search button and an HTML
    output area. Clicking the button calls ``run_search`` with the text and
    the ticked sources and shows its return value in the output area.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Semantic search") as demo:
        gr.Markdown(
            "<h1>Alpha demo: Semantic search</h1>"
            "Search and ask questions of Candid's data together with casual language"
        )

        query = gr.Text(placeholder="Search", show_label=False)

        with gr.Accordion(label="Advanced settings", open=False):
            es_indices = gr.CheckboxGroup(
                choices=list(ALL_INDICES),
                value=list(ALL_INDICES),
                label="Sources to include",
                interactive=True
            )
        search = gr.Button("Search")

        # Receives the HTML string returned by run_search.
        feed = gr.HTML()

        # pylint: disable=no-member
        search.click(
            fn=run_search,
            inputs=[query, es_indices],
            outputs=[feed],
            api_name=False,
            queue=True
        )
    return demo
|
119 |
+
|
120 |
+
|
121 |
+
def build_ks_tab() -> gr.Blocks:
    """Build the "Keyword versus Semantic search" comparison Gradio UI.

    The layout has a heading, a one-line text area, a search button and two
    columns with an HTML output each. Clicking the button calls ``run_ks``
    with the text; its two return values fill the keyword column and the
    semantic column, in that order.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Semantic search") as demo:
        gr.Markdown(
            "<h1>Alpha demo: Keyword versus Semantic search</h1>"
            "Compare current search results versus semantic search results"
        )
        query = gr.TextArea(placeholder="Search", show_label=False, lines=1)
        ask = gr.Button("Search Unified Platform organizations")
        with gr.Row():
            with gr.Column():
                gr.Markdown("<h2>Keyword results</h2>")
                feed_k = gr.HTML()
            with gr.Column():
                gr.Markdown("<h2>Semantic results</h2>")
                feed_s = gr.HTML()

        # pylint: disable=no-member
        ask.click(
            fn=run_ks,
            inputs=[query],
            # Order matches the (keyword, semantic) tuple returned by run_ks.
            outputs=[feed_k, feed_s],
            api_name=False,
            queue=True
        )

    return demo
|
static/candid_logo_yellow.png
ADDED
![]() |
static/css.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
css_chat = """
|
2 |
+
.message-row img {
|
3 |
+
margin: 0px !important;
|
4 |
+
}
|
5 |
+
.avatar-container img {
|
6 |
+
padding: 0px !important;
|
7 |
+
}
|
8 |
+
|
9 |
+
#ssearch-sources {
|
10 |
+
display: flex;
|
11 |
+
gap: 10px;
|
12 |
+
min-width: 75vw;
|
13 |
+
padding-bottom: 5px;
|
14 |
+
}
|
15 |
+
|
16 |
+
.ssearch-source-btn {
|
17 |
+
background-color: #febe10;
|
18 |
+
color: black;
|
19 |
+
padding: 5px;
|
20 |
+
text-align: center;
|
21 |
+
border-radius: 12px;
|
22 |
+
min-width: 70px;
|
23 |
+
max-width: 75px;
|
24 |
+
box-shadow: 0 2px 5px 0 rgba(0, 0, 0,0.2);
|
25 |
+
height: 45px;
|
26 |
+
font-size:small;
|
27 |
+
}
|
28 |
+
|
29 |
+
.ssearch-source {
|
30 |
+
text-decoration: none;
|
31 |
+
display: block;
|
32 |
+
box-sizing: border-box;
|
33 |
+
}
|
34 |
+
|
35 |
+
button.upload-button.svelte-1d7elt4 {
|
36 |
+
visibility: hidden !important;
|
37 |
+
}
|
38 |
+
|
39 |
+
.candid-org-link {
|
40 |
+
font-weight: bold;
|
41 |
+
text-decoration: none;
|
42 |
+
}
|
43 |
+
|
44 |
+
.candid-app-link {
|
45 |
+
font-size: small;
|
46 |
+
}
|
47 |
+
|
48 |
+
"""
|
tools/__init__.py
ADDED
File without changes
|
tools/config.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
# Settings for the CDS API, read from the environment once at import time.
# A variable that is not set yields None for its key.
CDS_API = {
    env_name: os.getenv(env_name)
    for env_name in ('CDS_API_URL', 'CDS_API_KEY')
}
|
tools/org_seach.py
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
import re
|
3 |
+
|
4 |
+
from fuzzywuzzy import fuzz
|
5 |
+
|
6 |
+
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
|
7 |
+
from langchain_openai.chat_models import ChatOpenAI
|
8 |
+
from langchain_core.runnables import RunnableSequence
|
9 |
+
from langchain_core.prompts import ChatPromptTemplate
|
10 |
+
from pydantic import BaseModel
|
11 |
+
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
try:
|
14 |
+
from common.org_search_component import OrgSearch
|
15 |
+
except ImportError:
|
16 |
+
from ...common.org_search_component import OrgSearch
|
17 |
+
|
18 |
+
load_dotenv()
|
19 |
+
search = OrgSearch()
|
20 |
+
|
21 |
+
|
22 |
+
# Schema for the organization-name extraction result. It is bound to the chat
# model as a tool in `extract_org_links_from_chatbot`.
# NOTE(review): documented with comments rather than a class docstring because
# tool-binding presumably forwards the docstring to the model as the tool
# description — confirm before adding one.
class OrganizationNames(BaseModel):
    # Names of the organizations found in the analysed text.
    orgnames: List[str]
|
24 |
+
|
25 |
+
|
26 |
+
def extract_org_links_from_chatbot(chatbot_output: str):
    """
    Extracts a list of organization names from the provided text.

    The text is sent to a `gpt-4o` chat model that has the `OrganizationNames`
    schema bound as a tool; the names are read from the first parsed tool call.

    Args:
        chatbot_output (str): The chatbot output containing organization names and other content.

    Returns:
        list: A list of organization names extracted from the text. An empty list is
        returned when the model produced no tool call, when the output has an
        unexpected shape, or when any error occurs while running the chain.
        This function is best-effort and does not raise.
    """
    prompt_template = """Extract only the names of officially recognized organizations, foundations, and government entities from the text below. Do not include any entries that contain descriptions, regional identifiers, or explanations within parentheses or following the name. Strictly exclude databases, resources, crowdfunding platforms, and general terms. Provide the output only in the specified JSON format.

input text below:

```{chatbot_output}```

output format:
{{
'orgnames' : [list of organization names without any additional descriptions or identifiers]
}}

"""

    try:
        parser = JsonOutputToolsParser()
        llm = ChatOpenAI(model="gpt-4o").bind_tools([OrganizationNames])
        prompt = ChatPromptTemplate.from_template(prompt_template)
        chain = RunnableSequence(prompt, llm, parser)

        # Run the chain with the input data
        result = chain.invoke({"chatbot_output": chatbot_output})

        # No tool call at all means the model found nothing to report.
        if not result:
            return []

        # Extract the organization names from the first tool call
        output_list = result[0]["args"].get("orgnames", [])

        # Validate output format
        if not isinstance(output_list, list):
            print("Unexpected output format: 'orgnames' should be a list")
            return []

        return output_list

    except Exception as e:
        # Best-effort: link enrichment must never break the chat response.
        print(f"Could not extract organization names: {e}")
        return []
|
74 |
+
|
75 |
+
|
76 |
+
def is_similar(name: str, list_of_dict: list, threshold: int = 80):
    """
    Tell whether `name` fuzzily matches the "name" entry of any item in `list_of_dict`.

    Both strings are lower-cased and compared with `fuzz.ratio`; the first item
    whose score reaches `threshold` makes the function return True. Items without
    a "name" key, or whose comparison raises AttributeError, are reported and skipped.
    A TypeError (for example a non-iterable `list_of_dict`) is reported and
    results in False.
    """
    try:
        for candidate in list_of_dict:
            try:
                score = fuzz.ratio(name.lower(), candidate["name"].lower())
            except KeyError:
                # Candidate has no 'name' entry; move on to the next one
                print(f"KeyError: Missing 'name' key in dictionary item {candidate}")
            except AttributeError:
                # One of the compared values does not support .lower()
                print(f"AttributeError: Non-string 'name' in dictionary item {candidate}")
            else:
                if score >= threshold:
                    return True
    except TypeError as e:
        # Inputs of the wrong type (e.g. list_of_dict is not iterable)
        print(f"TypeError: {e}")

    return False
|
101 |
+
|
102 |
+
|
103 |
+
def generate_org_link_dict(org_names_list: list):
    """
    Build a mapping from organization names to Candid profile URLs.

    Each name in `org_names_list` is looked up with `search`. When the lookup
    returns results, the first result's "names" entry is checked against the
    requested name with `is_similar`; if it matches and the result carries a
    "candid_entity_id", the profile URL is built from that ID. In every other
    case, including any error during the lookup, the name maps to an empty string.

    Args:
        org_names_list (list): Organization names (str) to find profile links for.

    Returns:
        dict: Organization names as keys, Candid profile URLs or empty strings as values.

    Example:
        generate_org_link_dict(['New York-Presbyterian Hospital'])
        # {'New York-Presbyterian Hospital': 'https://app.candid.org/profile/6915255'}
    """
    link_dict = {}

    for org in org_names_list:
        try:
            matches = search(org)

            # Only the top hit is considered, and only if its names resemble the query
            entity_id = None
            if matches and is_similar(org, matches[0].get("names", "")):
                entity_id = matches[0].get("candid_entity_id")

            link_dict[org] = (
                f"https://app.candid.org/profile/{entity_id}" if entity_id else ""
            )

        except KeyError as e:
            # A key was missing somewhere in the response
            print(f"KeyError encountered for organization '{org}': {e}")
            link_dict[org] = ""

        except Exception as e:
            # Anything else: report it and fall back to "no link"
            print(f"An error occurred for organization '{org}': {e}")
            link_dict[org] = ""

    return link_dict
|
153 |
+
|
154 |
+
|
155 |
+
def embed_org_links_in_text(input_text: str, org_link_dict: dict):
    """
    Replaces organization names in `input_text` with links from `org_link_dict` and appends a Candid info message.

    All names are substituted in a single pass, with longer names tried first, so
    that a name contained in another one (e.g. "Ford" and "Ford Foundation") cannot
    split the longer name or be linked again inside an anchor that was just inserted.
    The replacement is produced by a function, so backslashes in an organization
    name are emitted literally instead of being read as regex replacement escapes.

    Args:
        input_text (str): The text containing organization names.
        org_link_dict (dict): Mapping of organization names to URLs. Entries with an
            empty name or an empty URL are ignored.

    Returns:
        str: Updated text with linked organization names and an appended Candid message.
        If an error occurs, it is printed and the text is returned as it was at
        that point (without the Candid message).
    """
    try:
        # An empty name would match at every position; an empty URL means "no link".
        links = {name: url for name, url in org_link_dict.items() if name and url}

        if links:
            # Longest alternative first so the regex prefers the most specific name.
            longest_first = sorted(links, key=len, reverse=True)
            pattern = re.compile("|".join(re.escape(name) for name in longest_first))

            def _to_anchor(match):
                org_name = match.group(0)
                return (
                    f"<a href={links[org_name]} target='_blank' rel='noreferrer' "
                    f"class='candid-org-link'>{org_name}</a>"
                )

            input_text = pattern.sub(_to_anchor, input_text)

        # Append Candid information message at the end
        input_text += "<p class='candid-app-link'> Visit <a href=https://app.candid.org/ target='_blank' rel='noreferrer' class='candid-org-link'>Candid</a> to get nonprofit information you need.</p>"

    except TypeError as e:
        print(f"TypeError encountered: {e}")
        return input_text

    except re.error as e:
        print(f"Regex error encountered: {e}")
        return input_text

    except Exception as e:
        print(f"Unexpected error: {e}")
        return input_text

    return input_text
|
tools/question_reformulation.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_core.prompts import ChatPromptTemplate
|
2 |
+
from langchain_core.output_parsers import StrOutputParser
|
3 |
+
|
4 |
+
|
5 |
+
def reformulate_question_using_history(state, llm):
    """
    Transform the query to produce a better query with details from previous messages.

    When the state holds only one message, the question is returned unchanged and
    the LLM is not called. Otherwise the LLM is asked to rewrite the latest user
    input into a standalone question using the chat history.

    Args:
        state (messages): The current state; `state["messages"]` is a sequence whose
            last element exposes the user input as `.content`
        llm: LLM to use
    Returns:
        dict: The updated state with re-phrased question and original user_input for UI
    """
    print("---REFORMULATE THE USER INPUT---")
    messages = state["messages"]
    question = messages[-1].content

    # Nothing to contextualize against: pass the question through untouched.
    if len(messages) <= 1:
        return {"messages": [question], "user_input": question}

    contextualize_q_system_prompt = (
        "Given a chat history and the latest user input "
        "which might reference context in the chat history, formulate a standalone input "
        "which can be understood without the chat history.\n"
        "Chat history:\n"
        "\n ------- \n\n"
        "{chat_history}\n"
        "\n ------- \n\n"
        "User input:\n"
        "\n ------- \n\n"
        "{question}\n"
        "\n ------- \n\n"
        "Do NOT answer the question, "
        "just reformulate it if needed and otherwise return it as is.\n"
    )

    # The human turn is a template placeholder, not the raw text: putting the
    # user's words directly into the template would make any '{' or '}' they
    # typed be parsed as a template variable.
    contextualize_q_prompt = ChatPromptTemplate([
        ("system", contextualize_q_system_prompt),
        ("human", "{question}"),
    ])

    rag_chain = contextualize_q_prompt | llm | StrOutputParser()
    new_question = rag_chain.invoke({"chat_history": messages, "question": question})
    print(f"user asked: '{question}', agent reformulated the question basing on the chat history: {new_question}")
    return {"messages": [new_question], "user_input": question}
|
utils.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Dict, Any
|
2 |
+
from uuid import uuid4
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
|
6 |
+
try:
|
7 |
+
from retrieval import (
|
8 |
+
candid_blog,
|
9 |
+
candid_help,
|
10 |
+
candid_learning,
|
11 |
+
# cds,
|
12 |
+
issuelab,
|
13 |
+
# news,
|
14 |
+
# up_orgs,
|
15 |
+
youtube
|
16 |
+
)
|
17 |
+
except ImportError:
|
18 |
+
from .retrieval import (
|
19 |
+
candid_blog,
|
20 |
+
candid_help,
|
21 |
+
candid_learning,
|
22 |
+
# cds,
|
23 |
+
issuelab,
|
24 |
+
# news,
|
25 |
+
# up_orgs,
|
26 |
+
youtube
|
27 |
+
)
|
28 |
+
|
29 |
+
|
30 |
+
# TODO summarize messages instead
|
31 |
+
def filter_messages(messages, k=10):
    """Return the last `k` items of `messages`.

    Args:
        messages: A sliceable sequence of messages.
        k: Maximum number of trailing messages to keep. Values of zero or less
            yield an empty slice.

    Returns:
        A slice of `messages` holding at most `k` trailing items.
    """
    # `messages[-0:]` would be the whole sequence, so handle k <= 0 explicitly.
    if k <= 0:
        return messages[:0]
    return messages[-k:]
|
33 |
+
|
34 |
+
|
35 |
+
def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
    """Render one retrieved document as an HTML card for the given source.

    Args:
        doc: The document to render; passed through to the source's card builder.
        source: Name of the source the document came from.
        show_chunks: Passed through to the card builder.

    Returns:
        The card HTML, or an empty string when the source is unknown or its
        renderer is currently disabled ("news", "transactions", "organizations").
    """
    standard_height_px = 200
    tall_height_px = 400

    # These sources are recognized but have no active renderer.
    if source in ("news", "transactions", "organizations"):
        return ""

    if source == "issuelab":
        return issuelab.issuelab_card_html(doc, standard_height_px, show_chunks)
    if source == "youtube":
        return youtube.build_card_html(doc, tall_height_px, show_chunks)
    if source == "candid_blog":
        return candid_blog.build_card_html(doc, standard_height_px, show_chunks)
    if source == "candid_learning":
        return candid_learning.build_card_html(doc, standard_height_px, show_chunks)
    if source == "candid_help":
        return candid_help.build_card_html(doc, standard_height_px, show_chunks)

    return ""
|
59 |
+
|
60 |
+
|
61 |
+
def html_format_docs_chat(docs):
    """
    Render Candid sources as a single row of link buttons.

    Each document contributes one button labelled with its `metadata["source"]`
    (default "Source") that links to its `metadata["url"]` (default "URL").
    Returns an empty string when `docs` is empty or None.
    """
    if not docs:
        return ""

    def _source_button(doc):
        label = doc.metadata.get("source", "Source")
        link = doc.metadata.get("url", "URL")
        return (
            f"<a href={link} target='_blank' rel='noreferrer' class='ssearch-source'> "
            f"<button class='ssearch-source-btn'>{label}</button></a>"
        )

    buttons = "".join(_source_button(doc) for doc in docs)
    return f"<div id='ssearch-sources'>{buttons}</div>"
|
77 |
+
|
78 |
+
|
79 |
+
def format_chat_response(chatbot: List[Any]) -> List[Any]:
    """Merge the trailing sources entry into the AI response that precedes it.

    The sources HTML is expected as the second element of the last history entry.
    That entry is removed and its HTML is concatenated onto the second element of
    the entry before it. The history is modified in place.

    A history with fewer than two entries is left untouched: there is no earlier
    response to attach sources to, and popping the only entry would empty the chat.

    Args:
        chatbot: Chat history as a list of mutable two-element entries.

    Returns:
        The (possibly updated) history wrapped in `gr.HTML`.
    """
    if chatbot and len(chatbot) > 1:
        sources = chatbot.pop(-1)[1]
        chatbot[-1][1] = chatbot[-1][1] + sources
    return gr.HTML(chatbot)
|
90 |
+
|
91 |
+
|
92 |
+
def format_chat_ag_response(chatbot: List[Any]) -> List[Any]:
    """Merge a trailing "Sources HTML" message into the AI response before it.

    If the last message's metadata title is "Sources HTML", that message is removed
    and its content is concatenated onto the content of the preceding message.
    The history is modified in place. Nothing is changed when the last message is
    not a sources message, when it has no metadata, or when it is the only message.

    Args:
        chatbot: Chat history as a list of dict messages with "content" and,
            optionally, "metadata" entries.

    Returns:
        The (possibly updated) history wrapped in `gr.HTML`.
    """
    if chatbot:
        # "metadata" may be absent or None on ordinary messages.
        metadata = chatbot[-1].get("metadata") or {}
        is_sources_message = metadata.get("title", None) == "Sources HTML"

        # Only merge when there is an earlier message to receive the sources.
        if is_sources_message and len(chatbot) > 1:
            sources = chatbot.pop(-1)["content"]
            chatbot[-1]["content"] = chatbot[-1]["content"] + sources
    return gr.HTML(chatbot)
|
106 |
+
|
107 |
+
|
108 |
+
def valid_inputs(*args) -> bool:
    """Return True if at least one argument carries a usable value.

    An argument is usable when it is not None and, if it is a string, it contains
    something other than whitespace. With no arguments the result is False.
    """
    return any(
        a is not None and not (isinstance(a, str) and a.strip() == '')
        for a in args
    )
|
110 |
+
|
111 |
+
def get_session_id(thread_id: str) -> str:
    """Return `thread_id`, or a newly generated ID when it is empty or None.

    Args:
        thread_id: The current session/thread identifier; may be falsy when no
            session has been started yet.

    Returns:
        The given identifier if it is truthy, otherwise a fresh 32-character
        hexadecimal UUID4 string.
    """
    return thread_id or uuid4().hex
|