Adding optional news data source
Optional Candid news data source with sparse encoding for quasi-semantic searching
- app.py +1 -1
- ask_candid/agents/elastic.py +380 -23
- ask_candid/base/config/connections.py +11 -0
- ask_candid/base/config/constants.py +1 -1
- ask_candid/base/config/data.py +3 -2
- ask_candid/chat.py +4 -1
- ask_candid/graph.py +4 -3
- ask_candid/retrieval/elastic.py +566 -500
- ask_candid/retrieval/sparse_lexical.py +29 -0
- ask_candid/tools/elastic/index_details_tool.py +4 -2
- ask_candid/tools/elastic/index_search_tool.py +24 -2
- ask_candid/tools/org_seach.py +1 -1
- ask_candid/tools/question_reformulation.py +39 -19
- ask_candid/tools/recommendation.py +189 -140
- ask_candid/utils.py +17 -25
- requirements.txt +3 -1
app.py
CHANGED
@@ -113,7 +113,7 @@ def build_rag_chat() -> Tuple[LoggedComponents, gr.Blocks]:
     with gr.Accordion(label="Advanced settings", open=False):
         es_indices = gr.CheckboxGroup(
             choices=list(ALL_INDICES),
-            value=
+            value=[idx for idx in ALL_INDICES if "news" not in idx],
             label="Sources to include",
             interactive=True,
         )
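With the news index added to `ALL_INDICES`, the accordion now pre-selects every source except news, so the new data source stays opt-in. A minimal sketch of the same filtering, using the tuple from `ask_candid/base/config/data.py`:

```python
ALL_INDICES = ("issuelab", "youtube", "candid_blog", "candid_learning", "candid_help", "news")

# Pre-select everything except the news index; users opt in explicitly.
default_sources = [idx for idx in ALL_INDICES if "news" not in idx]
assert "news" not in default_sources and len(default_sources) == 5
```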
ask_candid/agents/elastic.py
CHANGED
@@ -1,32 +1,27 @@
-from typing import TypedDict
+from typing import TypedDict, List
 from functools import partial
 import json
 import ast
-
 from pydantic import BaseModel, Field
 
-from langchain_openai import ChatOpenAI
-
 from langchain_core.runnables import RunnableSequence
 from langchain_core.language_models.llms import LLM
-
 from langchain.agents.openai_functions_agent.base import create_openai_functions_agent
 from langchain.agents.agent import AgentExecutor
 from langchain.agents.agent_types import AgentType
-from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.prompts import ChatPromptTemplate, PromptTemplate, MessagesPlaceholder
 from langchain.output_parsers import PydanticOutputParser
 from langchain.schema import BaseMessage
+from langchain.agents import create_tool_calling_agent, AgentExecutor
+from langchain_core.tools import Tool
 
 from langgraph.graph import StateGraph, END
 
-from ask_candid.tools.elastic.list_indices_tool import ListIndicesTool
 from ask_candid.tools.elastic.index_data_tool import IndexShowDataTool
 from ask_candid.tools.elastic.index_details_tool import IndexDetailsTool
 from ask_candid.tools.elastic.index_search_tool import create_search_tool
-from ask_candid.base.config.rest import OPENAI
 
 tools = [
-    ListIndicesTool(),
     IndexShowDataTool(),
     IndexDetailsTool(),
     create_search_tool(),
@@ -58,7 +53,7 @@ class AnalysisResult(BaseModel):
     category: str = Field(..., description="Either 'general' or 'Database'")
 
 
-def agent_factory() -> AgentExecutor:
+def agent_factory(llm: LLM) -> AgentExecutor:
     """
     Creates and configures an AgentExecutor instance for interacting with Elasticsearch.
 
@@ -72,9 +67,9 @@ def agent_factory(llm: LLM) -> AgentExecutor:
     providing detailed intermediate steps for transparency.
     """
 
-    llm = ChatOpenAI(
-        model="gpt-4o", temperature=0, api_key=OPENAI["key"], streaming=False
-    )
+    # llm = ChatOpenAI(
+    #     model="gpt-4o", temperature=0, api_key=OPENAI["key"], streaming=False
+    # )
 
     tags_ = []
     agent = AgentType.OPENAI_FUNCTIONS
@@ -101,6 +96,45 @@ def agent_factory(llm: LLM) -> AgentExecutor:
     )
 
 
+def agent_factory_claude(llm: LLM) -> AgentExecutor:
+    """
+    Creates and configures an AgentExecutor instance for interacting with Elasticsearch.
+
+    This function initializes an OpenAI GPT-4-based LLM with specific parameters,
+    constructs a prompt tailored for Elasticsearch assistance, and integrates the
+    agent with a set of tools to handle user queries. The agent is designed to work
+    with OpenAI functions for enhanced capabilities.
+
+    Returns:
+        AgentExecutor: Configured agent ready to execute tasks with specified tools,
+        providing detailed intermediate steps for transparency.
+    """
+
+    # llm = ChatOpenAI(
+    #     model="gpt-4o", temperature=0, api_key=OPENAI["key"], streaming=False
+    # )
+
+    # tags_ = []
+    # agent = AgentType.OPENAI_FUNCTIONS
+    # tags_.append(agent.value if isinstance(agent, AgentType) else agent)
+    # Create the prompt
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            ("system", "You are a helpful elasticsearch assistant"),
+            MessagesPlaceholder(variable_name="chat_history", optional=True),
+            ("human", "{input}"),
+            MessagesPlaceholder(variable_name="agent_scratchpad"),
+        ]
+    )
+
+    agent = create_tool_calling_agent(llm, tools, prompt)
+    agent_executor = AgentExecutor.from_agent_and_tools(
+        agent=agent, tools=tools, verbose=True, return_intermediate_steps=True
+    )
+    # Create the agent
+    return agent_executor
+
+
 # define graph node functions
 def general_query(state: GraphState, llm: LLM) -> GraphState:
     """
@@ -126,7 +160,7 @@ def general_query(state: GraphState, llm: LLM) -> GraphState:
     return state
 
 
-def database_agent(state: GraphState) -> GraphState:
+def database_agent(state: GraphState, llm: LLM) -> GraphState:
     """
     Executes a database query using an Elasticsearch agent and updates the graph state.
 
@@ -144,22 +178,28 @@ def database_agent(state: GraphState, llm: LLM) -> GraphState:
     print("> database agent")
     input_data = {
         "input": f"""
-        Make sure that after querying the indices you query the field names.
-        To answer the question choose ```organization_dev_2``` index
+        You are an Elasticsearch database agent designed to accurately understand and respond to user queries. Follow these steps:
 
+        1. Understand the user query to determine the required information.
+        2. Query the indices in the Elasticsearch database.
+        3. Retrieve the mappings and field names relevant to the query.
+        4. Use the organization_dev_2 index to extract the necessary data.
+        5. Present the response in a clear and natural language format, addressing the user's question directly.
+
+        User's query:
+        ```{state["query"]}```
         """
     }
-    agent_exec = agent_factory()
+    agent_exec = agent_factory_claude(llm)
     res = agent_exec.invoke(input_data)
     state["agent_out"] = res["output"]
 
     es_queries, es_results = {}, {}
     for i, action in enumerate(res.get("intermediate_steps", []), start=1):
         if action[0].tool == "elastic_index_search_tool":
-            es_queries[f"query_{i}"] = json.loads(action[0].tool_input.get("query") or "{}")
+            es_queries[f"query_{i}"] = json.loads(
+                action[0].tool_input.get("query") or "{}"
+            )
             es_results[f"query_{i}"] = ast.literal_eval(action[-1] or "{}")
 
     # if len(res["intermediate_steps"]) > 1:
@@ -239,7 +279,7 @@ def final_answer(state: GraphState, llm: LLM) -> GraphState:
 
     print("> Final Answer")
     prompt_template = """
-
+    You are a chat agent that takes outputs generated by Elasticsearch and presents them in a conversational, natural language format, as if responding to a user's query.
 
     Query: ```{query}```
 
@@ -272,7 +312,7 @@ def build_compute_graph(llm: LLM) -> StateGraph:
     # Add nodes
     workflow.add_node("analyse", partial(analyse_query, llm=llm))
     workflow.add_node("general_query", partial(general_query, llm=llm))
-    workflow.add_node("es_database_agent", database_agent)
+    workflow.add_node("es_database_agent", partial(database_agent, llm=llm))
     workflow.add_node("final_answer", partial(final_answer, llm=llm))
 
     # Set entry point
@@ -291,3 +331,320 @@ def build_compute_graph(llm: LLM) -> StateGraph:
     workflow.add_edge("final_answer", END)
 
     return workflow
+
+
+class ElasticGraph(StateGraph):
+    """Elastic Search Agent State Graph"""
+
+    llm: LLM
+    tools: List[Tool]
+
+    def __init__(self, llm: LLM, tools: List[Tool]):
+        super().__init__(GraphState)
+        self.llm = llm
+        self.tools = tools
+        self.construct_graph()
+
+    def agent_factory(self) -> AgentExecutor:
+        """
+        Creates and configures an AgentExecutor instance for interacting with Elasticsearch.
+
+        This function initializes an OpenAI GPT-4-based LLM with specific parameters,
+        constructs a prompt tailored for Elasticsearch assistance, and integrates the
+        agent with a set of tools to handle user queries. The agent is designed to work
+        with OpenAI functions for enhanced capabilities.
+
+        Returns:
+            AgentExecutor: Configured agent ready to execute tasks with specified tools,
+            providing detailed intermediate steps for transparency.
+        """
+
+        # llm = ChatOpenAI(
+        #     model="gpt-4o", temperature=0, api_key=OPENAI["key"], streaming=False
+        # )
+
+        tags_ = []
+        agent = AgentType.OPENAI_FUNCTIONS
+        tags_.append(agent.value if isinstance(agent, AgentType) else agent)
+        # Create the prompt
+        prompt = ChatPromptTemplate.from_messages(
+            [
+                ("system", "You are a helpful elasticsearch assistant"),
+                MessagesPlaceholder(variable_name="chat_history", optional=True),
+                ("human", "{input}"),
+                MessagesPlaceholder(variable_name="agent_scratchpad"),
+            ]
+        )
+
+        # Create the agent
+        agent_obj = create_openai_functions_agent(self.llm, tools, prompt)
+
+        return AgentExecutor.from_agent_and_tools(
+            agent=agent_obj,
+            tools=tools,
+            tags=tags_,
+            verbose=True,
+            return_intermediate_steps=True,
+        )
+
+    def agent_factory_claude(self) -> AgentExecutor:
+        """
+        Creates and configures an AgentExecutor instance for interacting with Elasticsearch.
+
+        This function constructs a prompt tailored for Elasticsearch DSL generation and
+        integrates the agent with a set of tools to handle user queries. The agent is
+        built with the generic tool-calling interface, so any tool-calling chat model works.
+
+        Returns:
+            AgentExecutor: Configured agent ready to execute tasks with specified tools,
+            providing detailed intermediate steps for transparency.
+        """
+        prefix = """
+        You are an intelligent agent tasked with generating accurate Elasticsearch DSL queries.
+        Analyze the intent behind the query and determine the appropriate Elasticsearch operations required.
+        Guidelines for generating the right Elasticsearch query:
+        1. Automatically determine whether to return document hits or aggregation results based on the query structure.
+        2. Use keyword fields instead of text fields for aggregations and sorting to avoid fielddata errors.
+        3. Avoid using field.keyword if a keyword field is already present to prevent redundant queries.
+        4. Ensure efficient query execution by selecting appropriate query types for filtering, searching, and aggregating.
+        """
+        prompt = ChatPromptTemplate.from_messages(
+            [
+                ("system", f"You are a helpful elasticsearch assistant. {prefix}"),
+                MessagesPlaceholder(variable_name="chat_history", optional=True),
+                ("human", "{input}"),
+                MessagesPlaceholder(variable_name="agent_scratchpad"),
+            ]
+        )
+
+        agent = create_tool_calling_agent(self.llm, self.tools, prompt)
+        agent_executor = AgentExecutor.from_agent_and_tools(
+            agent=agent, tools=self.tools, verbose=True, return_intermediate_steps=True
+        )
+        # Create the agent
+        return agent_executor
+
+    def analyse_query(self, state: GraphState) -> GraphState:
+        """
+        Analyzes the user's query to classify it as either grant- or organization-specific
+        and determines the next processing step.
+
+        Args:
+            state (GraphState): Current graph state containing the user's query.
+
+        Returns:
+            GraphState: Updated state with the classification result and the
+            next processing step in "next_step".
+        """
+
+        print("> analyse query")
+        prompt_template = """Your task is to analyze the query ```{query}``` and classify it in:
+        grant: Grant Index - A query where users seek information about grants, funding opportunities, and grantmakers. This includes inquiries about the purpose of funding, eligibility criteria, application processes, grant recipients, funding amounts, deadlines, and how grants can be used for specific projects or initiatives. Users may also request grants tailored to their unique needs, industries, or social impact goals
+
+        org: Org Index - Query which asks specific details about the organizations, their mission statement, where they are located
+        Output format:
+        {{"category": "<your_classification>"}}
+        """
+        parser = PydanticOutputParser(pydantic_object=AnalysisResult)
+
+        # Create the prompt
+        prompt = PromptTemplate(
+            template=prompt_template,
+            input_variables=["query"],
+            partial_variables={"format_instructions": parser.get_format_instructions()},
+        )
+        # Create the chain
+        chain = RunnableSequence(prompt, self.llm, parser)
+        # Invoke the chain with the query
+        response = chain.invoke({"query": state["query"]})
+        if response.category == "grant":
+            state["next_step"] = "grant-index"
+        else:
+            state["next_step"] = "org-index"
+        return state
+
+    def grant_index_agent(self, state: GraphState) -> GraphState:
+        print("> Grant Index Agent")
+        input_data = {
+            "input": f"""
+            You are an Elasticsearch database agent designed to accurately understand and respond to user queries. Follow these steps:
+
+            1. Understand the user query to determine the required information.
+            2. Query the indices in the Elasticsearch database.
+            3. Retrieve the mappings and field names relevant to the query.
+            4. Use the ``grants_qa_1`` index to extract the necessary data.
+            5. Ensure that you correctly identify the grantmaker (funder) or recipient (funded entity) if mentioned in the query.
+            Users may not always provide the exact name, so the Elasticsearch query should accommodate partial or incomplete names
+            by searching for relevant keywords.
+            6. Present the response in a clear and natural language format, addressing the user's question directly.
+
+
+            Description of some of the fields in the index (the remaining fields should be self-explanatory):
+            fiscal_year: Year when grantmaker allocates budget for funding and grants. format YYYY
+            text: Objectives, mission, program and funding related information
+            Program_area: program area where organization is working on
+            Title: the title of the funding
+            pcs_v3: PCS is taxonomy, describing the work of grantmakers, recipient organizations and the philanthropic transactions between those entities.
+            The facets of the PCS illuminate the work and answer the following questions about philanthropy:
+            Who? = Population Served
+            What? = Subject and Organization Type
+            How? = Support Strategy and Transaction Type
+            the Facets:
+            Subjects: Describes WHAT is being supported. Example: Elementary education or Clean water supply.
+            Populations: Describes WHO is being supported. Example: Girls or People with disabilities.
+            Organization Type: Describes WHAT type of organization is providing or receiving support.
+            Transaction Type: Describes HOW support is being provided.
+            Support Strategies: Describes HOW activities are being implemented.
+
+            pcs_v3 itself is in a json format:
+            key - subject
+            value: it is a list of dictionaries, so you might need to loop over it to find the particular aspect
+            hierarchy: (it is a list having subject name)
+            [
+                {{
+                    'name':
+                }},
+                {{
+                    'name':
+                }}
+            ]
+            Before writing the Elasticsearch query, think through which field to use
+
+            Note: first you should focus on querying `text`, then look into pcs_v3. Make sure you pick the right size for the query
+
+            User's query:
+            ```{state["query"]}```
+            """
+        }
+        agent_exec = self.agent_factory_claude()
+        res = agent_exec.invoke(input_data)
+        state["agent_out"] = res["output"]
+
+        es_queries, es_results = {}, {}
+        for i, action in enumerate(res.get("intermediate_steps", []), start=1):
+            if action[0].tool == "elastic_index_search_tool":
+                es_queries[f"query_{i}"] = json.loads(
+                    action[0].tool_input.get("query") or "{}"
+                )
+                es_results[f"query_{i}"] = ast.literal_eval(action[-1] or "{}")
+
+        state["es_query"] = es_queries
+        state["es_result"] = es_results
+        return state
+
+    def org_index_agent(self, state: GraphState) -> GraphState:
+        """
+        Executes a database query using an Elasticsearch agent and updates the graph state.
+
+        The agent queries indices and field names in the Elasticsearch database,
+        selects the appropriate index (`organization_qa_2`), and answers the user's question.
+
+        Args:
+            state (GraphState): Current graph state containing the user's query.
+
+        Returns:
+            GraphState: Updated state with the agent's output in "agent_out" and
+            the Elasticsearch query in "es_query".
+        """
+
+        print("> Org Index Agent")
+        input_data = {
+            "input": f"""
+            You are an Elasticsearch database agent designed to accurately understand and respond to user queries. Follow these steps:
+
+            1. Understand the user query to determine the required information.
+            2. Query the indices in the Elasticsearch database.
+            3. Retrieve the mappings and field names relevant to the query.
+            4. Use the `organization_qa_2` index to extract the necessary data.
+            5. Present the response in a clear and natural language format, addressing the user's question directly.
+
+            User's query:
+            ```{state["query"]}```
+            """
+        }
+        agent_exec = self.agent_factory_claude()
+        res = agent_exec.invoke(input_data)
+        state["agent_out"] = res["output"]
+
+        es_queries, es_results = {}, {}
+        for i, action in enumerate(res.get("intermediate_steps", []), start=1):
+            if action[0].tool == "elastic_index_search_tool":
+                es_queries[f"query_{i}"] = json.loads(
+                    action[0].tool_input.get("query") or "{}"
+                )
+                es_results[f"query_{i}"] = ast.literal_eval(action[-1] or "{}")
+
+        state["es_query"] = es_queries
+        state["es_result"] = es_results
+        return state
+
+    def final_answer(self, state: GraphState) -> GraphState:
+        """
+        Generates and presents the final response based on the user's query and the AI's output.
+
+        Args:
+            state (GraphState): Current graph state containing the query and AI output.
+
+        Returns:
+            GraphState: Updated state with the formatted final answer in "agent_out".
+        """
+
+        print("> Final Answer")
+        prompt_template = """
+        You are a chat agent that takes outputs generated by Elasticsearch and presents them in a conversational, natural language format, as if responding to a user's query.
+
+        Query: ```{query}```
+
+        AI Output:
+        ```{output}```
+        """
+        prompt = ChatPromptTemplate.from_template(prompt_template)
+        chain = RunnableSequence(prompt, self.llm)
+        response = chain.invoke({"query": state["query"], "output": state["agent_out"]})
+
+        return {"agent_out": response.content}
+
+    def construct_graph(self) -> StateGraph:
+        """
+        Constructs a compute graph for processing user queries using a defined workflow.
+
+        The workflow includes nodes for query analysis, handling grant- or organization-specific
+        queries, and generating the final response. Conditional logic determines the path based
+        on query type.
+
+        Returns:
+            StateGraph: Configured compute graph ready for execution.
+        """
+
+        # Add nodes
+        self.add_node("analyse", self.analyse_query)
+        self.add_node("grant-index", self.grant_index_agent)
+        self.add_node("org-index", self.org_index_agent)
+        self.add_node("final_answer", self.final_answer)
+
+        # Set entry point
+        self.set_entry_point("analyse")
+
+        # Add conditional edges
+        self.add_conditional_edges(
+            "analyse",
+            lambda x: x["next_step"],  # Use the return value of analyse_query directly
+            {"org-index": "org-index", "grant-index": "grant-index"},
+        )
+
+        # Add edges to end the workflow
+        self.add_edge("org-index", "final_answer")
+        self.add_edge("grant-index", "final_answer")
+        self.add_edge("final_answer", END)
+
+
+def build_elastic_graph(llm: LLM, tools: List[Tool]):
+    """Compile Elastic Agent Graph"""
+    elastic_graph = ElasticGraph(llm=llm, tools=tools)
+    graph = elastic_graph.compile()
+    return graph
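A usage sketch for the new `ElasticGraph` wrapper. The Anthropic model and import are assumptions (the commit only requires a LangChain chat model that supports tool calling); `tools` is the module-level list defined at the top of the file:

```python
from langchain_anthropic import ChatAnthropic  # assumed; any tool-calling chat model works

from ask_candid.agents.elastic import build_elastic_graph, tools

llm = ChatAnthropic(model="claude-3-5-sonnet-20241022", temperature=0)
graph = build_elastic_graph(llm=llm, tools=tools)

# analyse_query routes grant questions to grants_qa_1 and organization questions
# to organization_qa_2, then final_answer rephrases the raw Elasticsearch output.
result = graph.invoke({"query": "Which funders support clean water projects?"})
print(result["agent_out"])
```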
ask_candid/base/config/connections.py
CHANGED
@@ -32,3 +32,14 @@ SEMANTIC_ELASTIC_QA = BaseElasticAPIKeyCredential(
     cloud_id=_load_value("SEMANTIC_ELASTIC_CLOUD_ID"),
     api_key=_load_value("SEMANTIC_ELASTIC_API_KEY"),
 )
+
+SEMANTIC_ELASTIC_QA_WRITER = BaseElasticAPIKeyCredential(
+    cloud_id=_load_value("SEMANTIC_ELASTIC_WRITER_CLOUD_ID"),
+    api_key=_load_value("SEMANTIC_ELASTIC_WRITER_API_KEY"),
+)
+
+NEWS_ELASTIC = BaseElasticSearchConnection(
+    url=_load_value("NEWS_URL"),
+    username=_load_value("NEWS_UID"),
+    password=_load_value("NEWS_PWD")
+)
ask_candid/base/config/constants.py
CHANGED
@@ -1,4 +1,4 @@
 START_SYSTEM_PROMPT = (
     "You are a Candid subject matter expert on the social sector and philanthropy. "
     "You should address the user's queries and stay on topic."
-)
+)
ask_candid/base/config/data.py
CHANGED
@@ -16,5 +16,6 @@ ALL_INDICES = (
     "youtube",
     "candid_blog",
     "candid_learning",
-    "candid_help"
-)
+    "candid_help",
+    "news"
+)
ask_candid/chat.py
CHANGED
@@ -15,8 +15,10 @@ def run_chat(
     history: List[Dict],
     llm: LLM,
     indices: Optional[List[str]] = None,
-
+    premium_features: Optional[List[str]] = None,
 ) -> Tuple[gr.MultimodalTextbox, List[Dict[str, Any]], str]:
+    if premium_features is None:
+        premium_features = []
     if len(history) == 0:
         history.append({"role": "system", "content": START_SYSTEM_PROMPT})
 
@@ -26,6 +28,7 @@ def run_chat(
     thread_id = get_session_id(thread_id)
     config = {"configurable": {"thread_id": thread_id}}
 
+    enable_recommendations = "Recommendation" in premium_features
     workflow = build_compute_graph(llm=llm, indices=indices, enable_recommendations=enable_recommendations)
 
     memory = MemorySaver()  # TODO: don't use for Prod
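`run_chat` now takes a list of premium feature flags instead of a boolean; recommendations are on only when the literal flag "Recommendation" is present. The gating logic in isolation:

```python
from typing import List, Optional

def recommendations_enabled(premium_features: Optional[List[str]] = None) -> bool:
    # Mirrors run_chat: None is normalized to an empty list, and the
    # feature is keyed by the exact string "Recommendation".
    if premium_features is None:
        premium_features = []
    return "Recommendation" in premium_features

assert recommendations_enabled(["Recommendation"])
assert not recommendations_enabled()
```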
ask_candid/graph.py
CHANGED
@@ -80,6 +80,7 @@ def generate_with_context(state: AgentState, llm: LLM) -> AgentState:
 
 def add_recommendations_pipeline_(
     G: StateGraph,
+    llm: LLM,
     reformulation_node_name: str = "reformulate",
     search_node_name: str = "search_agent"
 ) -> None:
@@ -96,7 +97,7 @@ def add_recommendations_pipeline_(
     """
 
     # Nodes for recommendation functionalities
-    G.add_node("detect_intent_with_llm", detect_intent_with_llm)
+    G.add_node("detect_intent_with_llm", partial(detect_intent_with_llm, llm=llm))
     G.add_node("determine_context", determine_context)
     G.add_node("make_recommendation", make_recommendation)
 
@@ -142,7 +143,7 @@ def build_compute_graph(
 
     G = StateGraph(AgentState)
 
-    G.add_node("reformulate", partial(reformulate_question_using_history, llm=llm))
+    G.add_node("reformulate", partial(reformulate_question_using_history, llm=llm, focus_on_recommendations=enable_recommendations))
     G.add_node("search_agent", partial(search_agent, llm=llm, tools=tools))
     G.add_node("retrieve", retrieve)
     G.add_node("generate_with_context", partial(generate_with_context, llm=llm))
@@ -150,7 +151,7 @@ def build_compute_graph(
     G.add_node("insert_org_link", insert_org_link)
 
     if enable_recommendations:
-        add_recommendations_pipeline_(G, reformulation_node_name="reformulate", search_node_name="search_agent")
+        add_recommendations_pipeline_(G, llm=llm, reformulation_node_name="reformulate", search_node_name="search_agent")
     else:
         G.add_edge("reformulate", "search_agent")
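The recommendation nodes need the LLM at call time, but LangGraph invokes a node with the state only, so the change binds `llm` up front with `functools.partial`. A reduced sketch of the pattern (the node body is hypothetical):

```python
from functools import partial

def detect_intent_with_llm(state, llm):
    # Node body elided; what matters is the two-argument signature.
    return state

# The graph calls node_fn(state); llm is already bound.
node_fn = partial(detect_intent_with_llm, llm="<any LangChain LLM>")
print(node_fn({"user_input": "recommend funders"}))
```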
ask_candid/retrieval/elastic.py
CHANGED
@@ -1,500 +1,566 @@
|
|
1 |
-
from typing import List, Tuple, Dict, Iterable, Iterator, Optional, Union, Any
|
2 |
-
from dataclasses import dataclass
|
3 |
-
from functools import partial
|
4 |
-
from itertools import groupby
|
5 |
-
|
6 |
-
from torch.nn import functional as F
|
7 |
-
|
8 |
-
from pydantic import BaseModel, Field
|
9 |
-
from langchain_core.documents import Document
|
10 |
-
from langchain_core.tools import Tool
|
11 |
-
|
12 |
-
from elasticsearch import Elasticsearch
|
13 |
-
|
14 |
-
from ask_candid.
|
15 |
-
from ask_candid.
|
16 |
-
from ask_candid.base.config.
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
"
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
"""
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
q = build_sparse_vector_query(
|
135 |
-
query=query,
|
136 |
-
fields=("content", "
|
137 |
-
)
|
138 |
-
q["_source"] = {"excludes": ["embeddings"]}
|
139 |
-
q["size"] =
|
140 |
-
queries.extend([{"index": ElasticIndexMapping.
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
return
|
176 |
-
|
177 |
-
|
178 |
-
def
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
for
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
#
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
"
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Tuple, Dict, Iterable, Iterator, Optional, Union, Any
|
2 |
+
from dataclasses import dataclass
|
3 |
+
from functools import partial
|
4 |
+
from itertools import groupby
|
5 |
+
|
6 |
+
from torch.nn import functional as F
|
7 |
+
|
8 |
+
from pydantic import BaseModel, Field
|
9 |
+
from langchain_core.documents import Document
|
10 |
+
from langchain_core.tools import Tool
|
11 |
+
|
12 |
+
from elasticsearch import Elasticsearch
|
13 |
+
|
14 |
+
from ask_candid.retrieval.sparse_lexical import SpladeEncoder
|
15 |
+
from ask_candid.services.small_lm import CandidSLM
|
16 |
+
from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
|
17 |
+
from ask_candid.base.config.data import ElasticIndexMapping, ALL_INDICES
|
18 |
+
|
19 |
+
encoder = SpladeEncoder()
|
20 |
+
|
21 |
+
|
22 |
+
@dataclass
|
23 |
+
class ElasticHitsResult:
|
24 |
+
"""Dataclass for Elasticsearch hits results
|
25 |
+
"""
|
26 |
+
index: str
|
27 |
+
id: Any
|
28 |
+
score: float
|
29 |
+
source: Dict[str, Any]
|
30 |
+
inner_hits: Dict[str, Any]
|
31 |
+
|
32 |
+
|
33 |
+
class RetrieverInput(BaseModel):
|
34 |
+
"""Input to the Elasticsearch retriever."""
|
35 |
+
user_input: str = Field(description="query to look up in retriever")
|
36 |
+
|
37 |
+
|
38 |
+
def build_sparse_vector_query(
|
39 |
+
query: str,
|
40 |
+
fields: Tuple[str],
|
41 |
+
inference_id: str = ".elser-2-elasticsearch"
|
42 |
+
) -> Dict[str, Any]:
|
43 |
+
"""Builds a valid Elasticsearch text expansion query payload
|
44 |
+
|
45 |
+
Parameters
|
46 |
+
----------
|
47 |
+
query : str
|
48 |
+
Search context string
|
49 |
+
fields : Tuple[str]
|
50 |
+
Semantic text field names
|
51 |
+
inference_id : str, optional
|
52 |
+
ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"
|
53 |
+
|
54 |
+
Returns
|
55 |
+
-------
|
56 |
+
Dict[str, Any]
|
57 |
+
"""
|
58 |
+
|
59 |
+
output = []
|
60 |
+
|
61 |
+
for f in fields:
|
62 |
+
output.append({
|
63 |
+
"nested": {
|
64 |
+
"path": f"embeddings.{f}.chunks",
|
65 |
+
"query": {
|
66 |
+
"sparse_vector": {
|
67 |
+
"field": f"embeddings.{f}.chunks.vector",
|
68 |
+
"inference_id": inference_id,
|
69 |
+
"prune": True,
|
70 |
+
"query": query,
|
71 |
+
"boost": 1 / len(fields)
|
72 |
+
}
|
73 |
+
},
|
74 |
+
"inner_hits": {
|
75 |
+
"_source": False,
|
76 |
+
"size": 2,
|
77 |
+
"fields": [f"embeddings.{f}.chunks.chunk"]
|
78 |
+
}
|
79 |
+
}
|
80 |
+
})
|
81 |
+
return {"query": {"bool": {"should": output}}}
|
82 |
+
|
83 |
+
|
84 |
+
def news_query_builder(query: str) -> Dict[str, Any]:
|
85 |
+
tokens = encoder.token_expand(query)
|
86 |
+
|
87 |
+
query = {
|
88 |
+
"_source": ["id", "link", "title", "content"],
|
89 |
+
"query": {
|
90 |
+
"bool": {
|
91 |
+
"filter": [
|
92 |
+
{"range": {"event_date": {"gte": "now-60d/d"}}},
|
93 |
+
{"range": {"insert_date": {"gte": "now-60d/d"}}},
|
94 |
+
{"range": {"article_trust_worthiness": {"gt": 0.8}}}
|
95 |
+
],
|
96 |
+
"should": []
|
97 |
+
}
|
98 |
+
}
|
99 |
+
}
|
100 |
+
|
101 |
+
for token, score in tokens.items():
|
102 |
+
if score > 0.4:
|
103 |
+
query["query"]["bool"]["should"].append({
|
104 |
+
"multi_match": {
|
105 |
+
"query": token,
|
106 |
+
"fields": ["title", "content"],
|
107 |
+
"boost": score
|
108 |
+
}
|
109 |
+
})
|
110 |
+
return query
|
111 |
+
|
112 |
+
|
113 |
+
def query_builder(query: str, indices: List[str]) -> List[Dict[str, Any]]:
|
114 |
+
"""Builds Elasticsearch multi-search query payload
|
115 |
+
|
116 |
+
Parameters
|
117 |
+
----------
|
118 |
+
query : str
|
119 |
+
Search context string
|
120 |
+
indices : List[str]
|
121 |
+
Semantic index names to search over
|
122 |
+
|
123 |
+
Returns
|
124 |
+
-------
|
125 |
+
List[Dict[str, Any]]
|
126 |
+
"""
|
127 |
+
|
128 |
+
queries = []
|
129 |
+
if indices is None:
|
130 |
+
indices = list(ALL_INDICES)
|
131 |
+
|
132 |
+
for index in indices:
|
133 |
+
if index == "issuelab":
|
134 |
+
q = build_sparse_vector_query(
|
135 |
+
query=query,
|
136 |
+
fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
|
137 |
+
)
|
138 |
+
q["_source"] = {"excludes": ["embeddings"]}
|
139 |
+
q["size"] = 1
|
140 |
+
queries.extend([{"index": ElasticIndexMapping.ISSUELAB_INDEX_ELSER}, q])
|
141 |
+
elif index == "youtube":
|
142 |
+
q = build_sparse_vector_query(
|
143 |
+
query=query,
|
144 |
+
fields=("captions_cleaned", "description_cleaned", "title")
|
145 |
+
)
|
146 |
+
# text_cleaned duplicates captions_cleaned
|
147 |
+
q["_source"] = {"excludes": ["embeddings", "captions", "description", "text_cleaned"]}
|
148 |
+
q["size"] = 2
|
149 |
+
queries.extend([{"index": ElasticIndexMapping.YOUTUBE_INDEX_ELSER}, q])
|
150 |
+
elif index == "candid_blog":
|
151 |
+
q = build_sparse_vector_query(
|
152 |
+
query=query,
|
153 |
+
fields=("content", "authors_text", "title_summary_tags")
|
154 |
+
)
|
155 |
+
q["_source"] = {"excludes": ["embeddings"]}
|
156 |
+
q["size"] = 2
|
157 |
+
queries.extend([{"index": ElasticIndexMapping.CANDID_BLOG_INDEX_ELSER}, q])
|
158 |
+
elif index == "candid_learning":
|
159 |
+
q = build_sparse_vector_query(
|
160 |
+
query=query,
|
161 |
+
fields=("content", "title", "training_topics", "staff_recommendations")
|
162 |
+
)
|
163 |
+
q["_source"] = {"excludes": ["embeddings"]}
|
164 |
+
q["size"] = 2
|
165 |
+
queries.extend([{"index": ElasticIndexMapping.CANDID_LEARNING_INDEX_ELSER}, q])
|
166 |
+
elif index == "candid_help":
|
167 |
+
q = build_sparse_vector_query(
|
168 |
+
query=query,
|
169 |
+
fields=("content", "combined_article_description")
|
170 |
+
)
|
171 |
+
q["_source"] = {"excludes": ["embeddings"]}
|
172 |
+
q["size"] = 2
|
173 |
+
queries.extend([{"index": ElasticIndexMapping.CANDID_HELP_INDEX_ELSER}, q])
|
174 |
+
|
175 |
+
return queries
|
176 |
+
|
177 |
+
|
178 |
+
def multi_search(
|
179 |
+
queries: List[Dict[str, Any]],
|
180 |
+
news_query: Optional[Dict[str, Any]] = None
|
181 |
+
) -> List[ElasticHitsResult]:
|
182 |
+
"""Runs multi-search query
|
183 |
+
|
184 |
+
Parameters
|
185 |
+
----------
|
186 |
+
queries : List[Dict[str, Any]]
|
187 |
+
Pre-built multi-search query payload
|
188 |
+
|
189 |
+
Returns
|
190 |
+
-------
|
191 |
+
List[ElasticHitsResult]
|
192 |
+
"""
|
193 |
+
|
194 |
+
results = []
|
195 |
+
|
196 |
+
if len(queries) > 0:
|
197 |
+
with Elasticsearch(
|
198 |
+
cloud_id=SEMANTIC_ELASTIC_QA.cloud_id,
|
199 |
+
api_key=SEMANTIC_ELASTIC_QA.api_key,
|
200 |
+
verify_certs=False,
|
201 |
+
request_timeout=60 * 3
|
202 |
+
) as es:
|
203 |
+
for query_group in es.msearch(body=queries).get("responses", []):
|
204 |
+
for hit in query_group.get("hits", {}).get("hits", []):
|
205 |
+
hit = ElasticHitsResult(
|
206 |
+
index=hit["_index"],
|
207 |
+
id=hit["_id"],
|
208 |
+
score=hit["_score"],
|
209 |
+
source=hit["_source"],
|
210 |
+
inner_hits=hit.get("inner_hits", {})
|
211 |
+
)
|
212 |
+
results.append(hit)
|
213 |
+
|
214 |
+
if news_query is not None:
|
215 |
+
with Elasticsearch(
|
216 |
+
NEWS_ELASTIC.url,
|
217 |
+
http_auth=(NEWS_ELASTIC.username, NEWS_ELASTIC.password),
|
218 |
+
timeout=60
|
219 |
+
) as es:
|
220 |
+
for hit in es.search(body=news_query, index="news_1").get("hits", {}).get("hits") or []:
|
221 |
+
hit = ElasticHitsResult(
|
222 |
+
index=hit["_index"],
|
223 |
+
id=hit["_id"],
|
224 |
+
score=hit["_score"],
|
225 |
+
source=hit["_source"],
|
226 |
+
inner_hits=hit.get("inner_hits", {})
|
227 |
+
)
|
228 |
+
results.append(hit)
|
229 |
+
return results
|
230 |
+
|
231 |
+
|
232 |
+
def get_query_results(search_text: str, indices: Optional[List[str]] = None) -> List[ElasticHitsResult]:
|
233 |
+
"""Builds and executes Elasticsearch data queries from a search string.
|
234 |
+
|
235 |
+
Parameters
|
236 |
+
----------
|
237 |
+
search_text : str
|
238 |
+
Search context string
|
239 |
+
indices : Optional[List[str]], optional
|
240 |
+
Semantic index names to search over, by default None
|
241 |
+
|
242 |
+
Returns
|
243 |
+
-------
|
244 |
+
List[ElasticHitsResult]
|
245 |
+
"""
|
246 |
+
|
247 |
+
queries = query_builder(query=search_text, indices=indices)
|
248 |
+
news_q = news_query_builder(query=search_text)
|
249 |
+
return multi_search(queries, news_query=news_q)
|
250 |
+
|
251 |
+
|
252 |
+
def retrieved_text(hits: Dict[str, Any]) -> str:
|
253 |
+
"""Extracts retrieved sub-texts from documents which are strong hits from semantic queries for the purpose of
|
254 |
+
re-scoring by a secondary language model.
|
255 |
+
|
256 |
+
Parameters
|
257 |
+
----------
|
258 |
+
hits : Dict[str, Any]
|
259 |
+
|
260 |
+
Returns
|
261 |
+
-------
|
262 |
+
str
|
263 |
+
"""
|
264 |
+
|
265 |
+
text = []
|
266 |
+
for _, v in hits.items():
|
267 |
+
for h in (v.get("hits", {}).get("hits") or []):
|
268 |
+
for _, field in h.get("fields", {}).items():
|
269 |
+
for chunk in field:
|
270 |
+
if chunk.get("chunk"):
|
271 |
+
text.extend(chunk["chunk"])
|
272 |
+
return '\n'.join(text)
|
273 |
+
|
274 |
+
|
275 |
+
def cosine_rescore(query: str, contexts: List[str]) -> List[float]:
|
276 |
+
"""Computes cosine scores between retrieved contexts and the original query to re-score results based on overall
|
277 |
+
relevance to the original query.
|
278 |
+
|
279 |
+
Parameters
|
280 |
+
----------
|
281 |
+
query : str
|
282 |
+
Search context string
|
283 |
+
contexts : List[str]
|
284 |
+
Semantic field sub-texts, order is by document retrieved from the original multi-search query.
|
285 |
+
|
286 |
+
Returns
|
287 |
+
-------
|
288 |
+
List[float]
|
289 |
+
Scores in the same order as the input document contexts
|
290 |
+
"""
|
291 |
+
|
292 |
+
nlp = CandidSLM()
|
293 |
+
X = nlp.encode([query, *contexts]).vectors
|
294 |
+
X = F.normalize(X, dim=-1, p=2.)
|
295 |
+
cosine = X[1:] @ X[:1].T
|
296 |
+
return cosine.flatten().cpu().numpy().tolist()
|
297 |
+
|
298 |
+
|
299 |
+
def reranker(
|
300 |
+
query_results: Iterable[ElasticHitsResult],
|
301 |
+
search_text: Optional[str] = None
|
302 |
+
) -> Iterator[ElasticHitsResult]:
|
303 |
+
"""Reranks Elasticsearch hits coming from multiple indices/queries which may have scores on different scales.
|
304 |
+
This will shuffle results
|
305 |
+
|
306 |
+
Parameters
|
307 |
+
----------
|
308 |
+
query_results : Iterable[ElasticHitsResult]
|
309 |
+
|
310 |
+
Yields
|
311 |
+
------
|
312 |
+
Iterator[ElasticHitsResult]
|
313 |
+
"""
|
314 |
+
|
315 |
+
results: List[ElasticHitsResult] = []
|
316 |
+
texts: List[str] = []
|
317 |
+
for _, data in groupby(query_results, key=lambda x: x.index):
|
318 |
+
data = list(data)
|
319 |
+
max_score = max(data, key=lambda x: x.score).score
|
320 |
+
min_score = min(data, key=lambda x: x.score).score
|
321 |
+
|
322 |
+
for d in data:
|
323 |
+
d.score = (d.score - min_score) / (max_score - min_score + 1e-9)
|
324 |
+
results.append(d)
|
325 |
+
|
326 |
+
if search_text:
|
327 |
+
text = retrieved_text(d.inner_hits)
|
328 |
+
texts.append(text)
|
329 |
+
|
330 |
+
# if search_text and len(texts) == len(results):
|
331 |
+
# scores = cosine_rescore(search_text, texts)
|
332 |
+
# for r, s in zip(results, scores):
|
333 |
+
# r.score = s
|
334 |
+
|
335 |
+
yield from sorted(results, key=lambda x: x.score, reverse=True)
|
336 |
+
|
337 |
+
|
338 |
+
def get_results(user_input: str, indices: List[str]) -> Tuple[str, List[Document]]:
|
339 |
+
"""End-to-end search and re-rank function.
|
340 |
+
|
341 |
+
Parameters
|
342 |
+
----------
|
343 |
+
user_input : str
|
344 |
+
Search context string
|
345 |
+
indices : List[str]
|
346 |
+
Semantic index names to search over
|
347 |
+
|
348 |
+
Returns
|
349 |
+
-------
|
350 |
+
Tuple[str, List[Document]]
|
351 |
+
(concatenated text from search results, documents list)
|
352 |
+
"""
|
353 |
+
|
354 |
+
output = ["Search didn't return any Candid sources"]
|
355 |
+
page_content = []
|
356 |
+
content = "Search didn't return any Candid sources"
|
357 |
+
results = get_query_results(search_text=user_input, indices=indices)
|
358 |
+
if results:
|
359 |
+
output = get_reranked_results(results, search_text=user_input)
|
360 |
+
for doc in output:
|
361 |
+
page_content.append(doc.page_content)
|
362 |
+
content = "\n\n".join(page_content)
|
363 |
+
|
364 |
+
# for the tool we need to return a tuple for content_and_artifact type
|
365 |
+
return content, output
|
366 |
+
|
367 |
+
|
368 |
+
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
|
369 |
+
"""Pads the relevant chunk of text with context before and after
|
370 |
+
|
371 |
+
Parameters
|
372 |
+
----------
|
373 |
+
field_name : str
|
374 |
+
a field with the long text that was chunked into pieces
|
375 |
+
hit : ElasticHitsResult
|
376 |
+
context_length : int, optional
|
377 |
+
length of text to add before and after the chunk, by default 1024
|
378 |
+
|
379 |
+
Returns
|
380 |
+
-------
|
381 |
+
str
|
382 |
+
longer chunks stuffed together
|
383 |
+
"""
|
384 |
+
|
385 |
+
chunks = []
|
386 |
+
# NOTE chunks have tokens, long text is a normal text, but may contain html that also gets weird after tokenization
|
387 |
+
long_text = hit.source.get(f"{field_name}", "")
|
388 |
+
long_text = long_text.lower()
|
389 |
+
inner_hits_field = f"embeddings.{field_name}.chunks"
|
390 |
+
found_chunks = hit.inner_hits.get(inner_hits_field, {})
|
391 |
+
if found_chunks:
|
392 |
+
hits = found_chunks.get("hits", {}).get("hits", [])
|
393 |
+
for h in hits:
|
394 |
+
chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
|
395 |
+
|
396 |
+
# cutting the middle because we may have tokenizing artifacts there
|
397 |
+
chunk = chunk[3: -3]
|
398 |
+
|
399 |
+
if add_context:
|
400 |
+
# Find the start and end indices of the chunk in the large text
|
401 |
+
start_index = long_text.find(chunk[:20])
|
402 |
+
|
403 |
+
# Chunk is found
|
404 |
+
if start_index != -1:
|
405 |
+
end_index = start_index + len(chunk)
|
406 |
+
pre_start_index = max(0, start_index - context_length)
|
407 |
+
post_end_index = min(len(long_text), end_index + context_length)
|
408 |
+
chunks.append(long_text[pre_start_index:post_end_index])
|
409 |
+
else:
|
410 |
+
chunks.append(chunk)
|
411 |
+
return '\n\n'.join(chunks)
|
412 |
+
|
413 |
+
|
414 |
+
def process_hit(hit: ElasticHitsResult) -> Union[Document, None]:
|
415 |
+
"""Parse Elasticsearch hit results into data structures handled by the RAG pipeline.
|
416 |
+
|
+    Parameters
+    ----------
+    hit : ElasticHitsResult
+
+    Returns
+    -------
+    Union[Document, None]
+    """
+
+    if "issuelab-elser" in hit.index:
+        combined_item_description = hit.source.get("combined_item_description", "")  # title inside
+        description = hit.source.get("description", "")
+        combined_issuelab_findings = hit.source.get("combined_issuelab_findings", "")
+        # we only need to process long texts
+        chunks_with_context_txt = get_context("content", hit, context_length=12)
+        doc = Document(
+            page_content='\n\n'.join([
+                combined_item_description,
+                combined_issuelab_findings,
+                description,
+                chunks_with_context_txt
+            ]),
+            metadata={
+                "title": hit.source["title"],
+                "source": "IssueLab",
+                "source_id": hit.source["resource_id"],
+                "url": hit.source.get("permalink", "")
+            }
+        )
+    elif "youtube" in hit.index:
+        title = hit.source.get("title", "")
+        # we only need to process long texts
+        description_cleaned_with_context_txt = get_context("description_cleaned", hit, context_length=12)
+        captions_cleaned_with_context_txt = get_context("captions_cleaned", hit, context_length=12)
+        doc = Document(
+            page_content='\n\n'.join([title, description_cleaned_with_context_txt, captions_cleaned_with_context_txt]),
+            metadata={
+                "title": title,
+                "source": "Candid YouTube",
+                "source_id": hit.source['video_id'],
+                "url": f"https://www.youtube.com/watch?v={hit.source['video_id']}"
+            }
+        )
+    elif "candid-blog" in hit.index:
+        excerpt = hit.source.get("excerpt", "")
+        title = hit.source.get("title", "")
+        # we only need to process long text
+        content_with_context_txt = get_context("content", hit, context_length=12, add_context=False)
+        authors = get_context("authors_text", hit, context_length=12, add_context=False)
+        tags = hit.source.get("title_summary_tags", "")
+        doc = Document(
+            page_content='\n\n'.join([title, excerpt, content_with_context_txt, authors, tags]),
+            metadata={
+                "title": title,
+                "source": "Candid Blog",
+                "source_id": hit.source["id"],
+                "url": hit.source["link"]
+            }
+        )
+    elif "candid-learning" in hit.index:
+        title = hit.source.get("title", "")
+        content_with_context_txt = get_context("content", hit, context_length=12)
+        training_topics = hit.source.get("training_topics", "")
+        staff_recommendations = hit.source.get("staff_recommendations", "")
+
+        doc = Document(
+            page_content='\n\n'.join([title, staff_recommendations, training_topics, content_with_context_txt]),
+            metadata={
+                "title": hit.source["title"],
+                "source": "Candid Learning",
+                "source_id": hit.source["post_id"],
+                "url": hit.source.get("url", "")
+            }
+        )
+    elif "candid-help" in hit.index:
+        title = hit.source.get("title", "")
+        content_with_context_txt = get_context("content", hit, context_length=12)
+        combined_article_description = hit.source.get("combined_article_description", "")
+
+        doc = Document(
+            page_content='\n\n'.join([combined_article_description, content_with_context_txt]),
+            metadata={
+                "title": title,
+                "source": "Candid Help",
+                "source_id": hit.source["id"],
+                "url": hit.source.get("link", "")
+            }
+        )
+    elif "news" in hit.index:
+        doc = Document(
+            page_content='\n\n'.join([hit.source.get("title", ""), hit.source.get("content", "")]),
+            metadata={
+                "title": hit.source.get("title", ""),
+                "source": "Candid News",
+                "source_id": hit.source["id"],
+                "url": hit.source.get("link", "")
+            }
+        )
+    else:
+        doc = None
+    return doc
+
+
+def get_reranked_results(results: List[ElasticHitsResult], search_text: Optional[str] = None) -> List[Document]:
+    """Run data re-ranking and document building for tool usage.
+
+    Parameters
+    ----------
+    results : List[ElasticHitsResult]
+    search_text : Optional[str], optional
+        Search context string, by default None
+
+    Returns
+    -------
+    List[Document]
+    """
+
+    output = []
+    for r in reranker(results, search_text=search_text):
+        hit = process_hit(r)
+        if hit is not None:
+            output.append(hit)
+    return output
+
+
+def retriever_tool(indices: List[str]) -> Tool:
+    """Tool component for use in conditional edge building for RAG execution graph.
+    Cannot use `create_retriever_tool` because it only provides content, losing all metadata on the way
+    https://python.langchain.com/docs/how_to/custom_tools/#returning-artifacts-of-tool-execution
+
+    Parameters
+    ----------
+    indices : List[str]
+        Semantic index names to search over
+
+    Returns
+    -------
+    Tool
+    """
+
+    return Tool(
+        name="retrieve_social_sector_information",
+        func=partial(get_results, indices=indices),
+        description=(
+            "Return additional information about social and philanthropic sector, "
+            "including nonprofits (NGO), grants, foundations, funding, RFP, LOI, Candid."
+        ),
+        args_schema=RetrieverInput,
+        response_format="content_and_artifact"
+    )
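Because the tool above is declared with response_format="content_and_artifact", LangChain expects the wrapped function to return a (content, artifact) pair, which is what lets downstream nodes keep the full Document metadata. A minimal usage sketch, assuming get_results follows that contract; the query text and index names are illustrative, not part of the commit:

    # Hypothetical usage of retriever_tool; a sketch, not part of this diff.
    tool = retriever_tool(indices=["issuelab-elser", "candid-blog"])
    content, documents = tool.func("capacity building grants for rural nonprofits")
    for doc in documents:
        # Metadata survives because the artifact carries the Document objects.
        print(doc.metadata["source"], "->", doc.metadata["url"])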
ask_candid/retrieval/sparse_lexical.py
ADDED
@@ -0,0 +1,29 @@
+from typing import Dict
+
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+import torch
+
+
+class SpladeEncoder:
+
+    def __init__(self):
+        model_id = "naver/splade-v3"
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+        self.model = AutoModelForMaskedLM.from_pretrained(model_id)
+        self.idx2token = {idx: token for token, idx in self.tokenizer.get_vocab().items()}
+
+    @torch.no_grad()
+    def token_expand(self, query: str) -> Dict[str, float]:
+        tokens = self.tokenizer(query, return_tensors='pt')
+        output = self.model(**tokens)
+
+        vec = torch.max(
+            torch.log(1 + torch.relu(output.logits)) * tokens.attention_mask.unsqueeze(-1),
+            dim=1
+        )[0].squeeze()
+        cols = vec.nonzero().squeeze().cpu().tolist()
+        weights = vec[cols].cpu().tolist()
+
+        sparse_dict_tokens = {self.idx2token[idx]: round(weight, 3) for idx, weight in zip(cols, weights) if weight > 0}
+        return dict(sorted(sparse_dict_tokens.items(), key=lambda item: item[1], reverse=True))
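token_expand maps a query onto weighted vocabulary terms (the SPLADE expansion), which is what enables the "quasi-semantic" lexical search named in the commit message. A sketch of one way the expansion could be consumed; the index and field names are assumptions, not part of this diff:

    # Hypothetical: turn the sparse expansion into a boosted lexical query.
    encoder = SpladeEncoder()
    expansion = encoder.token_expand("funding for rural libraries")

    # Each expanded term becomes a match clause weighted by its SPLADE score.
    body = {
        "query": {
            "bool": {
                "should": [
                    {"match": {"content": {"query": term, "boost": weight}}}
                    for term, weight in expansion.items()
                ]
            }
        }
    }
    # es.search(index="news", body=body)  # assumed news index and "content" field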
ask_candid/tools/elastic/index_details_tool.py
CHANGED
@@ -19,7 +19,7 @@ es = Elasticsearch(
     cloud_id=SEMANTIC_ELASTIC_QA.cloud_id,
     api_key=SEMANTIC_ELASTIC_QA.api_key,
     verify_certs=True,
-    request_timeout=60 * 3
+    request_timeout=60 * 3,
 )
 
 
@@ -62,7 +62,9 @@ class IndexDetailsTool(BaseTool):
                 }
             )
         except Exception as e:
-            logger.exception(
+            logger.exception(
+                "Could not fetch index information for %s: %s", index_name, e
+            )
             return ""
 
     async def _arun(
ask_candid/tools/elastic/index_search_tool.py
CHANGED
@@ -3,6 +3,7 @@ import json
 
 import tiktoken
 from elasticsearch import Elasticsearch
+
 # from pydantic.v1 import BaseModel, Field  # <-- Uses v1 namespace
 from pydantic import BaseModel, Field
 from langchain.tools import StructuredTool
@@ -15,7 +16,7 @@ es = Elasticsearch(
     cloud_id=SEMANTIC_ELASTIC_QA.cloud_id,
     api_key=SEMANTIC_ELASTIC_QA.api_key,
     verify_certs=True,
-    request_timeout=60 * 3
+    request_timeout=60 * 3,
 )
 
 
@@ -81,8 +82,18 @@ def elastic_search(
     if query_dict is None and aggs_dict is not None:
         # When a result has aggregations, just return that and ignore the rest
        final_res = str(res["aggregations"])
+    elif query_dict is not None and aggs_dict is not None:
+        # Return both hits and aggregations
+        final_res = str(
+            {
+                "hits": res.get("hits", {}),
+                "aggregations": res.get("aggregations", {}),
+            }
+        )
+
     else:
         final_res = str(res["hits"])
+
     tokens = encoding.encode(final_res)
     retries += 1
     if len(tokens) > 6000:
@@ -98,5 +109,16 @@ def elastic_search(
 
 def create_search_tool():
     return StructuredTool.from_function(
-        elastic_search,
+        elastic_search,
+        name="elastic_index_search_tool",
+        description=(
+            """This tool allows executing queries on an Elasticsearch index efficiently. Provide:
+            1. index_name (string): The target Elasticsearch index.
+            2. query (dictionary): Defines the query structure, supporting:
+                a. Filters: For precise data retrieval (e.g., match, term, range).
+                b. Aggregations: For statistical summaries and grouping (e.g., sum, average, histogram).
+                c. Full-text search: For analyzing and ranking text-based results (e.g., match, multi-match, query_string).
+            """
+        ),
+        args_schema=SearchToolInput,
     )
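With the new elif branch, a single call can now return hits and aggregations together instead of discarding one of them. A sketch of a request that exercises it, assuming the tool splits the query dictionary into its query and aggs parts; the index and field names are illustrative:

    # Hypothetical call combining a filter with an aggregation.
    result = elastic_search(
        index_name="organizations",
        query={
            "query": {"match": {"state": "NY"}},
            "aggs": {"by_city": {"terms": {"field": "city.keyword"}}}
        }
    )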
ask_candid/tools/org_seach.py
CHANGED
@@ -2,7 +2,7 @@ from typing import List
 import logging
 import re
 
-from
+from thefuzz import fuzz
 
 from langchain.output_parsers.openai_tools import JsonOutputToolsParser
 # from langchain_openai.chat_models import ChatOpenAI
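For context, thefuzz (the maintained continuation of fuzzywuzzy) provides Levenshtein-based similarity scores in the 0 to 100 range, which suits matching user-typed organization names against canonical ones. A minimal illustration of the API this import brings in:

    from thefuzz import fuzz

    # Token-order-insensitive comparison; returns an integer score, 0-100.
    score = fuzz.token_sort_ratio("Gates Foundation", "foundation gates")
    print(score)  # 100, since the sorted token sets are identical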
ask_candid/tools/question_reformulation.py
CHANGED
@@ -2,13 +2,17 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 
 
-def reformulate_question_using_history(state, llm):
+def reformulate_question_using_history(state, llm, focus_on_recommendations=False):
     """
-    Transform the query to produce a better query with details from previous messages
+    Transform the query to produce a better query with details from previous messages and emphasize
+    aspects important for recommendations if needed.
 
     Args:
-        state (
-        llm: LLM to use
+        state (dict): The current state containing messages.
+        llm: LLM to use for generating the reformulation.
+        focus_on_recommendations (bool): Flag to determine if the reformulation should emphasize
+                                         recommendation-relevant aspects such as geographies,
+                                         cause areas, etc.
     Returns:
         dict: The updated state with re-phrased question and original user_input for UI
     """
@@ -17,23 +21,39 @@ def reformulate_question_using_history(state, llm):
     question = messages[-1].content
 
     if len(messages) > 1:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if focus_on_recommendations:
+            prompt_text = """Given a chat history and the latest user input \
+which might reference context in the chat history, \
+especially geographic locations, cause areas and/or population groups, \
+formulate a standalone input which can be understood without the chat history.
+Chat history:
+\n ------- \n
+{chat_history}
+\n ------- \n
+User input:
+\n ------- \n
+{question}
+\n ------- \n
+Reformulate the question without adding implications or assumptions about the user's needs or intentions.
+Focus solely on clarifying any contextual details present in the original input."""
+        else:
+            prompt_text = """Given a chat history and the latest user input \
+which might reference context in the chat history, formulate a standalone input \
+which can be understood without the chat history.
+Chat history:
+\n ------- \n
+{chat_history}
+\n ------- \n
+User input:
+\n ------- \n
+{question}
+\n ------- \n
+Do NOT answer the question, \
+just reformulate it if needed and otherwise return it as is.
+"""
 
     contextualize_q_prompt = ChatPromptTemplate([
-        ("system",
+        ("system", prompt_text),
         ("human", question),
     ])
 
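A usage sketch for the new flag; the messages and the model choice are illustrative, and any LangChain chat model can serve as llm:

    # Hypothetical second turn whose "there" should resolve to New Mexico.
    from langchain_core.messages import HumanMessage, AIMessage
    from langchain_openai import ChatOpenAI

    state = {"messages": [
        HumanMessage("We run food banks in New Mexico."),
        AIMessage("Great - what would you like to find?"),
        HumanMessage("Find funders there."),
    ]}
    # With focus_on_recommendations=True the prompt preserves geography,
    # cause-area, and population details during reformulation.
    state = reformulate_question_using_history(state, ChatOpenAI(), focus_on_recommendations=True)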
ask_candid/tools/recommendation.py
CHANGED
@@ -1,87 +1,155 @@
-import logging
 import os
 
 from openai import OpenAI
+from langchain_core.prompts import ChatPromptTemplate
 import requests
 
 from ask_candid.agents.schema import AgentState, Context
-from ask_candid.base.
+from ask_candid.base.api_base import BaseAPI
+
+
+class AutocodingAPI(BaseAPI):
+    def __init__(self):
+        super().__init__(
+            url=os.getenv("AUTOCODING_API_URL"),
+            headers={
+                'x-api-key': os.getenv("AUTOCODING_API_KEY"),
+                'Content-Type': 'application/json'
+            }
+        )
+
+    def __call__(self, text: str, taxonomy: str = 'pcs-v3'):
+        params = {
+            'text': text,
+            'taxonomy': taxonomy
+        }
+        return self.get(**params)
+
+
+class GeoAPI(BaseAPI):
+    def __init__(self):
+        super().__init__(
+            url=os.getenv("GEO_API_URL"),
+            headers={
+                'x-api-key': os.getenv("GEO_API_KEY"),
+                'Content-Type': 'application/json'
+            }
+        )
+
+    def __call__(self, text: str):
+        payload = {
+            'text': text
+        }
+        return self.post(payload=payload)
+
+
+class EntitiesAPI(BaseAPI):
+    def __init__(self):
+        super().__init__(
+            url=f'{os.getenv("DOCUMENT_API_URL")}/entities',
+            headers={
+                'x-api-key': os.getenv("DOCUMENT_API_KEY"),
+                'Content-Type': 'application/json'
+            }
+        )
+
+    def __call__(self, text: str):
+        payload = {
+            'text': text
+        }
+        return self.post(payload=payload)
+
+
+
+class FunderRecommendationAPI(BaseAPI):
+    def __init__(self):
+        super().__init__(
+            url=os.getenv("FUNDER_REC_API_URL"),
+            headers={"x-api-key": os.getenv("FUNDER_REC_API_KEY")}
+        )
+
+    def __call__(self, subjects, populations, geos):
+        params = {
+            "subjects": subjects,
+            "populations": populations,
+            "geos": geos
+        }
+        return self.get(**params)
+
+
+class RFPRecommendationAPI(BaseAPI):
+    def __init__(self):
+        super().__init__(
+            url=f'{os.getenv("FUNDER_REC_API_URL")}/rfp',
+            headers={"x-api-key": os.getenv("FUNDER_REC_API_KEY")}
+        )
+
+    def __call__(self, org_id, subjects, populations, geos):
+        params = {
+            "candid_entity_id": org_id,
+            "subjects": subjects,
+            "populations": populations,
+            "geos": geos
+        }
+        return self.get(**params)
 
-logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
 
 
-
-
-
-
-
+
+def detect_intent_with_llm(state: AgentState, llm) -> AgentState:
+    """Detect query intent (which type of recommendation) and update the state using the specified LLM."""
+    print("running detect intent")
+
     query = state["messages"][-1].content
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    prompt_template = ChatPromptTemplate.from_messages(
+        [
+            ("system", """
+            Please classify the following query by stating ONLY the category name: 'none', 'funder', or 'rfp'.
+            Please answer WITHOUT any reasoning.
+            - 'none': The query does not ask for any recommendations.
+            - 'funder': The query asks for recommendations about funders, such as foundations or donors.
+            - 'rfp': The query asks for recommendations about specific Requests for Proposals (RFPs).
+
+            Consider:
+            - If the query seeks broad, long-term funding sources or organizations, classify as 'funder'.
+            - If the query seeks specific, time-bound funding opportunities with a deadline, classify as 'rfp'.
+            - If the query does not seek any recommendations, classify as 'none'.
+
+            Query: """),
+            ("human", f"{query}")
+        ]
     )
 
-
-
+    chain = prompt_template | llm
+    response = chain.invoke({"query": query})
+
+    intent = response.content.strip().lower()
+    state["intent"] = intent.strip("'").strip('"')  # Remove extra quotes if necessary
+    print(state["intent"])
     return state
 
 
 def determine_context(state: AgentState) -> AgentState:
-    "
-    logger.info("---GETTING RECOMMENDATION CONTEXT---")
+    print("running context")
     query = state["messages"][-1].content
 
+    autocoding_api = AutocodingAPI()
+    entities_api = EntitiesAPI()
+
     subject_codes, population_codes, geo_ids = [], [], []
 
-
-
-
-        'Content-Type': 'application/json'
-    }
-    autocoding_params = {
-        'text': query,
-        'taxonomy': 'pcs-v3'
-    }
-    autocoding_response = requests.get(
-        os.getenv("AUTOCODING_API_URL"),
-        headers=autocoding_headers,
-        params=autocoding_params,
-        timeout=30
-    )
-    if autocoding_response.status_code == 200:
-        returned_pcs = autocoding_response.json()["data"]
+    try:
+        autocoding_response = autocoding_api(text=query)
+        returned_pcs = autocoding_response.get("data", {})
         population_codes = [item['full_code'] for item in returned_pcs.get("population", [])]
         subject_codes = [item['full_code'] for item in returned_pcs.get("subject", [])]
+    except Exception as e:
+        print(f"Failed to retrieve autocoding data: {e}")
 
-
-
-    '
-    '
-
-
-
-    }
-    geo_response = requests.post(os.getenv("GEO_API_URL"), headers=geo_headers, json=geo_data, timeout=30)
-    if geo_response.status_code == 200:
-        entities = geo_response.json()['data']['entities']
-        geo_ids = [entity['geo']['id'] for entity in entities if 'id' in entity['geo']]
+    try:
+        geo_response = entities_api(text=query)
+        entities = geo_response.get('entities', [])
+        geo_ids = [match['geonames_id'] for entity in entities if entity['type'] == 'geo' and 'match' in entity
+                   for match in entity['match'] if 'geonames_id' in match]
+    except Exception as e:
+        print(f"Failed to retrieve geographic data: {e}")
 
     state["context"] = Context(
         subject=subject_codes,
@@ -91,99 +159,80 @@ def determine_context(state: AgentState) -> AgentState:
     return state
 
 
+def format_recommendations(intent, data):
+    if 'recommendations' not in data:
+        return "No recommendations available."
+
+    recommendations = data['recommendations']
+    if not recommendations:
+        return "No recommendations found."
+
+    recommendation_texts = []
+    if intent == "funder":
+        for rec in recommendations:
+            main_sort_name = rec['funder_data']['main_sort_name']
+            profile_url = f"https://app.candid.org/profile/{rec['funder_id']}"
+            recommendation_texts.append(f"{main_sort_name} - Profile: {profile_url}")
+    elif intent == "rfp":
+        for rec in recommendations:
+            title = rec.get('title', 'N/A')
+            funder_name = rec.get('funder_name', 'N/A')
+            amount = rec.get('amount', 'Not specified')
+            description = rec.get('description', 'No description available')
+            deadline = rec.get('deadline', 'No deadline provided')
+            application_url = rec.get('application_url', 'No URL available')
+            text = (f"Title: {title}\n"
+                    f"Funder: {funder_name}\n"
+                    f"Amount: {amount}\n"
+                    f"Description: {description}\n"
+                    f"Deadline: {deadline}\n"
+                    f"Application URL: {application_url}\n")
+            recommendation_texts.append(text)
+    else:
+        return "Only funder recommendation or RFP recommendation are supported."
+
+    return "\n".join(recommendation_texts)
+
+
 def make_recommendation(state: AgentState) -> AgentState:
-    "
-
-    logger.info("---RECOMMENDING---")
-    org_id = "6908122"
+    print("running recommendation")
+    org_id = "6908122"  # Example organization ID (Candid)
     funder_or_rfp = state["intent"]
 
-    # Extract context
     contexts = state["context"]
-    subject_codes = contexts.get("subject", [])
-    population_codes = contexts.get("population", [])
-    geo_ids = contexts.get("geography", [])
-
-    # Prepare parameters
-    params = {
-        "subjects": ",".join(subject_codes),
-        "geos": ",".join([str(geo) for geo in geo_ids]),
-        "populations": ",".join(population_codes)
-    }
-    headers = {"x-api-key": os.getenv("FUNDER_REC_API_KEY")}
-    base_url = os.getenv("FUNDER_REC_API_URL")
-
-    # Initialize response
-    response = None
+    subject_codes = ",".join(contexts.get("subject", []))
+    population_codes = ",".join(contexts.get("population", []))
+    geo_ids = ",".join([str(geo) for geo in contexts.get("geography", [])])
 
     recommendation_display_text = ""
 
     try:
-        # Make the API call based on intent
         if funder_or_rfp == "funder":
-
+            funder_api = FunderRecommendationAPI()
+            recommendations = funder_api(subject_codes, population_codes, geo_ids)
         elif funder_or_rfp == "rfp":
-
-
+            rfp_api = RFPRecommendationAPI()
+            recommendations = rfp_api(org_id, subject_codes, population_codes, geo_ids)
        else:
-
-            state["recommendation"] =
+            recommendation_display_text = "Unknown intent. Intent 'funder' or 'rfp' expected."
+            state["recommendation"] = recommendation_display_text
            return state
 
-
-
-        recommendations = response.json().get("recommendations", [])
-        if recommendations:
-            if funder_or_rfp == "funder":
-                # Format recommendations
-                recommendation_display_text = "Here are the top 10 recommendations. Click their profiles to learn more:\n" + "\n".join([
-                    f"{recommendation['funder_data']['main_sort_name']} - Profile: https://app.candid.org/profile/{recommendation['funder_id']}"
-                    for recommendation in recommendations
-                ])
-            elif funder_or_rfp == "rfp":
-                recommendation_display_text = "Here are the top recommendations:\n" + "\n".join([
-                    f"Title: {rec['title']}\n"
-                    f"Funder: {rec['funder_name']}\n"
-                    f"Amount: {rec.get('amount', 'Not specified')}\n"
-                    f"Description: {rec.get('description', 'No description available')}\n"
-                    f"Deadline: {rec.get('deadline', 'No deadline provided')}\n"
-                    f"Application URL: {rec.get('application_url', 'No URL available')}\n"
                    for rec in recommendations
-                ])
-            else:
-                # No recommendations found
-                recommendation_display_text = "No recommendations were found for your query. Please try refining your search criteria."
-        elif response and response.status_code == 400:
-            # Handle bad request
-            error_details = response.json()
-            recommendation_display_text = (
-                "An error occurred while processing your request. "
-                f"Details: {error_details.get('message', 'Unknown error.')}"
-            )
-        elif response:
-            # Handle other unexpected status codes
-            recommendation_display_text = (
-                f"An unexpected error occurred (Status Code: {response.status_code}). "
-                "Please try again later or contact support if the problem persists."
-            )
+        if recommendations:
+            recommendation_display_text = format_recommendations(funder_or_rfp, recommendations)
         else:
-
-
-
-
-
-            recommendation_display_text =
-                "A network error occurred while trying to connect to the recommendation service. "
-                f"Details: {str(e)}"
-            )
+            recommendation_display_text = "No recommendations were found for your query. Please try refining your search criteria."
+
+    except requests.exceptions.HTTPError as e:
+        # Handle HTTP errors raised by raise_for_status()
+        print(f"HTTP error occurred: {e.response.status_code} - {e.response.reason}")
+        recommendation_display_text = "HTTP error occurred, please report this to datascience@candid.org"
     except Exception as e:
-        #
-        print(
-        recommendation_display_text =
-            "An unexpected error occurred while processing your request. "
-            f"Details: {str(e)}"
-        )
+        # Catch-all for any other exceptions that are not HTTP errors
+        print(f"An unexpected error occurred: {str(e)}")
+        recommendation_display_text = "Unexpected error occurred, please report this to datascience@candid.org"
 
-    # Update state with recommendations or error messages
     state["recommendation"] = recommendation_display_text
-    return state
+    return state
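The wrapper classes above lean on a thin BaseAPI contract: it holds a URL and headers, and its get/post helpers issue the request, raise for non-2xx statuses (hence the requests.exceptions.HTTPError handler in make_recommendation), and return parsed JSON. A sketch of that assumed contract, since api_base.py itself is not part of this diff:

    # Assumed shape of ask_candid.base.api_base.BaseAPI; illustrative only.
    from typing import Dict, Optional
    import requests

    class BaseAPI:
        def __init__(self, url: str, headers: Optional[Dict[str, str]] = None):
            self.url = url
            self.headers = headers or {}

        def get(self, **params):
            response = requests.get(self.url, headers=self.headers, params=params, timeout=30)
            response.raise_for_status()  # surfaces HTTPError to callers
            return response.json()

        def post(self, payload: dict):
            response = requests.post(self.url, headers=self.headers, json=payload, timeout=30)
            response.raise_for_status()
            return response.json()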
ask_candid/utils.py
CHANGED
@@ -1,6 +1,8 @@
 from typing import List, Dict, Union, Any
 from uuid import uuid4
 
+from langchain_core.documents import Document
+
 from ask_candid.retrieval.sources import (
     candid_blog,
     candid_help,
@@ -10,11 +12,6 @@ from ask_candid.retrieval.sources import (
 )
 
 
-def filter_messages(messages, k=10):
-    # TODO summarize messages instead
-    return messages[-k:]
-
-
 def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
     height_px = 200
     html = ""
@@ -23,10 +20,8 @@ def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
         # html = news.article_card_html(doc, height_px, show_chunks)
         pass
     elif source == "transactions":
-        # html = cds.transaction_card_html(doc, height_px, show_chunks)
         pass
     elif source == "organizations":
-        # html = up_orgs.organization_card_html(doc, 400, show_chunks)
         pass
     elif source == "issuelab":
         html = issuelab.issuelab_card_html(doc, height_px, show_chunks)
@@ -41,10 +36,20 @@ def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
     return html
 
 
-def html_format_docs_chat(docs):
-    """
-
+def html_format_docs_chat(docs: List[Document]) -> str:
+    """Formats Candid sources
+
+    Parameters
+    ----------
+    docs : List[Document]
+        Retrieved documents for context
+
+    Returns
+    -------
+    str
+        Formatted HTML
     """
+
    html = ""
    if docs:
        docs_html = []
@@ -54,8 +59,8 @@ def html_format_docs_chat(docs):
 
         s_html = (
             "<span class='source-item'>"
-            f"<a href={s_url} target='_blank' rel='noreferrer' class='ssearch-source'>"
-            f"{doc.metadata['title']}
+            f"<a href='{s_url}' target='_blank' rel='noreferrer' class='ssearch-source'>"
+            f"{doc.metadata['title']} | {s_name}</a></span>"
         )
 
         docs_html.append(s_html)
@@ -64,19 +69,6 @@ def html_format_docs_chat(docs):
     return html
 
 
-def format_chat_response(chatbot: List[Any]) -> List[Any]:
-    """We have sources appended as one more tuple. Here we concatinate HTML of sources
-    with the AI response
-    Returns:
-        _type_: updated chatbot message as HTML
-    """
-    if chatbot:
-        sources = chatbot[-1][1]
-        chatbot.pop(-1)
-        chatbot[-1][1] = chatbot[-1][1] + sources
-    return chatbot
-
-
 def format_chat_ag_response(chatbot: List[Any]) -> List[Any]:
     """If we called retriever, we appended sources as as one more message. Here we concatinate HTML of sources
     with the AI response
requirements.txt
CHANGED
@@ -1,13 +1,15 @@
 boto3
 elasticsearch==7.17.6
-
+thefuzz
 gradio
 langchain
 langchain-aws
 langchain-openai
 langgraph
 pydantic
+pyopenssl>22.0.0
 python-dotenv
+transformers
 
 --find-links https://download.pytorch.org/whl/cpu
 torch