Spaces:

CandidAI
/

ask-candid

Running

App Files Files Community

brainsqueeze committed on Dec 12, 2024

Commit

a0e37e2

verified ·

1 Parent(s): 34b2d9c

Upload 36 files

Browse files

Project re-structuring

Files changed (36) hide show

app.py +78 -45
ask_candid/__init__.py +0 -0
ask_candid/agents/__init__.py +0 -0
ask_candid/agents/elastic.py +293 -0
ask_candid/base/__init__.py +0 -0
ask_candid/base/api_base.py +42 -0
ask_candid/base/api_base_async.py +48 -0
ask_candid/base/config/__init__.py +0 -0
ask_candid/base/config/connections.py +36 -0
ask_candid/base/config/data.py +20 -0
ask_candid/base/config/models.py +9 -0
ask_candid/base/config/rest.py +21 -0
ask_candid/base/lambda_base.py +58 -0
ask_candid/base/utils.py +14 -0
ask_candid/chat.py +251 -0
ask_candid/indexing/__init__.py +0 -0
ask_candid/retrieval/__init__.py +0 -0
ask_candid/retrieval/elastic.py +323 -0
ask_candid/retrieval/sources/__init__.py +0 -0
ask_candid/retrieval/sources/candid_blog.py +43 -0
ask_candid/retrieval/sources/candid_help.py +41 -0
ask_candid/retrieval/sources/candid_learning.py +41 -0
ask_candid/retrieval/sources/issuelab.py +50 -0
ask_candid/retrieval/sources/youtube.py +54 -0
ask_candid/services/__init__.py +0 -0
ask_candid/services/org_search.py +50 -0
ask_candid/services/small_lm.py +53 -0
ask_candid/tools/__init__.py +0 -0
ask_candid/tools/elastic/__init__.py +0 -0
ask_candid/tools/elastic/index_data_tool.py +59 -0
ask_candid/tools/elastic/index_details_tool.py +73 -0
ask_candid/tools/elastic/index_search_tool.py +102 -0
ask_candid/tools/elastic/list_indices_tool.py +58 -0
ask_candid/tools/org_seach.py +194 -0
ask_candid/tools/question_reformulation.py +44 -0
ask_candid/utils.py +103 -0

app.py CHANGED Viewed

@@ -3,21 +3,21 @@ import os
 import gradio as gr
 from langchain_openai.chat_models import ChatOpenAI
-try:
-    from utils import format_chat_ag_response
-    from retrieval.config import ALL_INDICES
-    from static.css import css_chat
-    from chat import run_chat
-except ImportError:
-    from .utils import format_chat_ag_response
-    from .retrieval.config import ALL_INDICES
-    from .static.css import css_chat
-    from .chat import run_chat
 ROOT = os.path.dirname(os.path.abspath(__file__))
 class LoggedComponents(TypedDict):
     context: List[gr.components.Component]
@@ -27,32 +27,46 @@ class LoggedComponents(TypedDict):
     email: gr.components.Component
 def execute(
     thread_id: str,
     user_input: Dict[str, Any],
-    chatbot: List[Dict],
     max_new_tokens: int,
     indices: Optional[List[str]] = None,
 ):
-    llm = ChatOpenAI(
-        model_name="gpt-4o",
-        max_tokens=max_new_tokens,
-        api_key=os.getenv("OPENAI_API_KEY"),
-        temperature=0.0,
-        streaming=True
-    )
     return run_chat(
         thread_id=thread_id,
         user_input=user_input,
-        chatbot=chatbot,
-        llm=llm,
         indices=indices
     )
-def build_chat() -> Tuple[LoggedComponents, gr.Blocks]:
-    with gr.Blocks(theme=gr.themes.Soft(), title="Ask Candid", css=css_chat) as demo:
         gr.Markdown(
             """
@@ -74,28 +88,38 @@ def build_chat() -> Tuple[LoggedComponents, gr.Blocks]:
                 choices=list(ALL_INDICES),
                 value=list(ALL_INDICES),
                 label="Sources to include",
-                interactive=True
             )
             max_new_tokens = gr.Slider(
-                value=256 * 3, minimum=128, maximum=2048, step=128,
-                label="Max new tokens", interactive=True
             )
         with gr.Column():
             chatbot = gr.Chatbot(
-                label="Candid Assistant",
                 elem_id="chatbot",
-                bubble_full_width=False,
                 avatar_images=(
                     None,
-                    os.path.join(ROOT, "static", "candid_logo_yellow.png")
                 ),
                 height="45vh",
                 type="messages",
                 show_label=False,
                 show_copy_button=True,
                 show_share_button=True,
-                show_copy_all_button=True
             )
             msg = gr.MultimodalTextbox(label="Your message", interactive=True)
             thread_id = gr.Text(visible=False, value="", label="thread_id")
@@ -104,24 +128,33 @@ def build_chat() -> Tuple[LoggedComponents, gr.Blocks]:
         # pylint: disable=no-member
         chat_msg = msg.submit(
             fn=execute,
-            inputs=[thread_id, msg, chatbot, max_new_tokens, es_indices],
-            outputs=[msg, chatbot, thread_id]
         )
         chat_msg.then(format_chat_ag_response, chatbot, chatbot, api_name="bot_response")
-        logged = LoggedComponents(
-            context=[thread_id, chatbot]
-        )
     return logged, demo
-if __name__ == '__main__':
-    _, app = build_chat()
-    app.queue(max_size=5).launch(
-        show_api=False,
-        auth=[
-            (os.getenv("APP_USERNAME"), os.getenv("APP_PASSWORD")),
-            (os.getenv("APP_PUBLIC_USERNAME"), os.getenv("APP_PUBLIC_PASSWORD")),
         ],
-        auth_message="Login to Candid's AI assistant",
-        ssr_mode=False
     )

 import gradio as gr
+from langchain_core.language_models.llms import LLM
 from langchain_openai.chat_models import ChatOpenAI
+from langchain_aws import ChatBedrock
+import boto3
+from ask_candid.base.config.rest import OPENAI
+from ask_candid.base.config.models import Name2Endpoint
+from ask_candid.base.config.data import ALL_INDICES
+from ask_candid.utils import format_chat_ag_response
+from ask_candid.chat import run_chat
 ROOT = os.path.dirname(os.path.abspath(__file__))
+BUCKET = "candid-data-science-reporting"
+PREFIX = "Assistant"
 class LoggedComponents(TypedDict):
     context: List[gr.components.Component]
     email: gr.components.Component
+def select_foundation_model(model_name: str, max_new_tokens: int) -> LLM:
+    """Instantiate the chat model selected in the UI.
+    Parameters
+    ----------
+    model_name : str
+        Display name of the model; must be a key of `Name2Endpoint`
+    max_new_tokens : int
+        Upper bound on the number of tokens the model may generate
+    Returns
+    -------
+    LLM
+        A `ChatOpenAI` client for "gpt-4o", otherwise a `ChatBedrock` client
+    Raises
+    ------
+    gr.Error
+        If the model name is unknown or has no configured endpoint
+    """
+    endpoint = Name2Endpoint.get(model_name)
+    if endpoint is None:
+        # Some Bedrock names accepted below may have no endpoint configured;
+        # report that to the UI as a regular error instead of a raw KeyError
+        raise gr.Error(f"Base model `{model_name}` is not supported")
+    if model_name == "gpt-4o":
+        return ChatOpenAI(
+            model_name=endpoint,
+            max_tokens=max_new_tokens,
+            api_key=OPENAI["key"],
+            temperature=0.0,
+            streaming=True,
+        )
+    if model_name in {"claude-3.5-haiku", "llama-3.1-70b-instruct", "mistral-large", "mixtral-8x7B"}:
+        return ChatBedrock(
+            client=boto3.client("bedrock-runtime"),
+            model=endpoint,
+            max_tokens=max_new_tokens,
+            temperature=0.0
+        )
+    raise gr.Error(f"Base model `{model_name}` is not supported")
 def execute(
     thread_id: str,
     user_input: Dict[str, Any],
+    history: List[Dict],
+    model_name: str,
     max_new_tokens: int,
     indices: Optional[List[str]] = None,
 ):
+    """Gradio submit callback: build the selected model and delegate to `run_chat`.
+    Parameters
+    ----------
+    thread_id : str
+        Conversation thread identifier, forwarded unchanged
+    user_input : Dict[str, Any]
+        Payload from the message textbox, forwarded unchanged
+    history : List[Dict]
+        Current chatbot messages, forwarded unchanged
+    model_name : str
+        Name passed to `select_foundation_model`
+    max_new_tokens : int
+        Generation limit passed to `select_foundation_model`
+    indices : Optional[List[str]], optional
+        Sources to search, forwarded unchanged, by default None
+    Returns
+    -------
+    Whatever `run_chat` returns
+    """
     return run_chat(
         thread_id=thread_id,
         user_input=user_input,
+        history=history,
+        llm=select_foundation_model(model_name=model_name, max_new_tokens=max_new_tokens),
         indices=indices
     )
+def build_rag_chat() -> Tuple[LoggedComponents, gr.Blocks]:
+    with gr.Blocks(theme=gr.themes.Soft(), title="Chat") as demo:
         gr.Markdown(
             """
                 choices=list(ALL_INDICES),
                 value=list(ALL_INDICES),
                 label="Sources to include",
+                interactive=True,
+            )
+            llmname = gr.Radio(
+                label="Language model",
+                value="gpt-4o",
+                choices=list(Name2Endpoint.keys()),
+                interactive=True,
             )
             max_new_tokens = gr.Slider(
+                value=256 * 3,
+                minimum=128,
+                maximum=2048,
+                step=128,
+                label="Max new tokens",
+                interactive=True,
             )
         with gr.Column():
             chatbot = gr.Chatbot(
+                label="AskCandid",
                 elem_id="chatbot",
+                bubble_full_width=True,
                 avatar_images=(
                     None,
+                    os.path.join(ROOT, "static", "candid_logo_yellow.png"),
                 ),
                 height="45vh",
                 type="messages",
                 show_label=False,
                 show_copy_button=True,
                 show_share_button=True,
+                show_copy_all_button=True,
             )
             msg = gr.MultimodalTextbox(label="Your message", interactive=True)
             thread_id = gr.Text(visible=False, value="", label="thread_id")
         # pylint: disable=no-member
         chat_msg = msg.submit(
             fn=execute,
+            inputs=[thread_id, msg, chatbot, llmname, max_new_tokens, es_indices],
+            outputs=[msg, chatbot, thread_id],
         )
         chat_msg.then(format_chat_ag_response, chatbot, chatbot, api_name="bot_response")
+        logged = LoggedComponents(context=[thread_id, chatbot])
     return logged, demo
+def build_app():
+    """Assemble the top-level Gradio app: a tabbed interface with the chat as its only tab.
+    Returns
+    -------
+    The `gr.TabbedInterface` wrapping the RAG chat, styled with `static/chatStyle.css`
+    """
+    # The logged-components mapping returned alongside the chat is not needed here
+    _, candid_chat = build_rag_chat()
+    # The stylesheet is read relative to this file so the app works from any working directory
+    with open(os.path.join(ROOT, "static", "chatStyle.css"), "r", encoding="utf8") as f:
+        css_chat = f.read()
+    demo = gr.TabbedInterface(
+        interface_list=[
+            candid_chat,
         ],
+        tab_names=[
+            "AskCandid",
+        ],
+        theme=gr.themes.Soft(),
+        css=css_chat,
     )
+    return demo
+# Script entry point: build the app and serve it with a request queue capped at 5 and the API page hidden
+if __name__ == "__main__":
+    app = build_app()
+    app.queue(max_size=5).launch(show_api=False)

ask_candid/__init__.py ADDED Viewed

File without changes

ask_candid/agents/__init__.py ADDED Viewed

File without changes

ask_candid/agents/elastic.py ADDED Viewed

	@@ -0,0 +1,293 @@

+from typing import TypedDict
+from functools import partial
+import json
+import ast
+from pydantic import BaseModel, Field
+from langchain_openai import ChatOpenAI
+from langchain_core.runnables import RunnableSequence
+from langchain_core.language_models.llms import LLM
+from langchain.agents.openai_functions_agent.base import create_openai_functions_agent
+from langchain.agents.agent import AgentExecutor
+from langchain.agents.agent_types import AgentType
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.output_parsers import PydanticOutputParser
+from langchain.schema import BaseMessage
+from langgraph.graph import StateGraph, END
+from ask_candid.tools.elastic.list_indices_tool import ListIndicesTool
+from ask_candid.tools.elastic.index_data_tool import IndexShowDataTool
+from ask_candid.tools.elastic.index_details_tool import IndexDetailsTool
+from ask_candid.tools.elastic.index_search_tool import create_search_tool
+from ask_candid.base.config.rest import OPENAI
+# Tools handed to the Elasticsearch agent; instantiated once at import time and shared by every agent
+tools = [
+    ListIndicesTool(),
+    IndexShowDataTool(),
+    IndexDetailsTool(),
+    create_search_tool(),
+]
+class GraphState(TypedDict):
+    # State passed between the nodes of the compute graph.
+    # NOTE(review): `Field(...)` is a pydantic construct; on a TypedDict it presumably serves only as
+    # inline documentation and performs no validation — confirm this is intended.
+    query: str = Field(
+        ..., description="The user's query to be processed by the system."
+    )
+    agent_out: str = Field(
+        ...,
+        description="The output generated by the AI agent after processing the query.",
+    )
+    next_step: str = Field(
+        ..., description="The next step in the workflow, determined by query analysis."
+    )
+    es_query: dict = Field(
+        ..., description="The Elasticsearch query generated or used by the agent."
+    )
+    es_result: dict = Field(
+        ...,
+        description="The Elasticsearch query result generated or used by the agent.",
+    )
+class AnalysisResult(BaseModel):
+    """Schema for the query-classification result: a single `category` label."""
+    category: str = Field(..., description="Either 'general' or 'Database'")
+def agent_factory() -> AgentExecutor:
+    """
+    Build the OpenAI-functions agent used to interrogate Elasticsearch.
+    A non-streaming, zero-temperature `gpt-4o` chat model is combined with an
+    Elasticsearch-assistant prompt and the module-level `tools`.
+    Returns:
+        AgentExecutor: Verbose executor, tagged with the OpenAI-functions agent type,
+                       that also returns its intermediate steps.
+    """
+    # Prompt layout: system role, optional chat history, user input, agent scratchpad
+    chat_prompt = ChatPromptTemplate.from_messages(
+        [
+            ("system", "You are a helpful elasticsearch assistant"),
+            MessagesPlaceholder(variable_name="chat_history", optional=True),
+            ("human", "{input}"),
+            MessagesPlaceholder(variable_name="agent_scratchpad"),
+        ]
+    )
+    model = ChatOpenAI(
+        model="gpt-4o", temperature=0, api_key=OPENAI["key"], streaming=False
+    )
+    return AgentExecutor.from_agent_and_tools(
+        agent=create_openai_functions_agent(model, tools, chat_prompt),
+        tools=tools,
+        tags=[AgentType.OPENAI_FUNCTIONS.value],
+        verbose=True,
+        return_intermediate_steps=True,
+    )
+# define graph node functions
+def general_query(state: GraphState, llm: LLM) -> GraphState:
+    """
+    Answer the user's query directly with the language model.
+    Args:
+        state (GraphState): Current graph state; only "query" is read.
+        llm (LLM): Language model that answers the query.
+    Returns:
+        GraphState: The same state object with the model's reply stored in "agent_out".
+    """
+    print("> General query")
+    template = ChatPromptTemplate.from_template(
+        "Answer based on the user's query: {query}"
+    )
+    answer = (template | llm).invoke({"query": state["query"]})
+    # Message objects carry their text in `.content`; anything else is stringified
+    state["agent_out"] = answer.content if isinstance(answer, BaseMessage) else str(answer)
+    return state
+def database_agent(state: GraphState) -> GraphState:
+    """
+    Executes a database query using an Elasticsearch agent and updates the graph state.
+    The agent is instructed to list the indices, inspect the field names,
+    use the `organization_dev_2` index, and then answer the user's question.
+    Args:
+        state (GraphState): Current graph state containing the user's query.
+    Returns:
+        GraphState: Updated state with the agent's output in "agent_out", the search
+                    queries in "es_query" and their results in "es_result". Both
+                    mappings are keyed "query_<i>", where i is the 1-based position
+                    of the step among all intermediate steps.
+    """
+    print("> database agent")
+    input_data = {
+        "input": f"""
+        Make sure that you query first the indices in the ElasticSearch database.
+        Make sure that after querying the indices you query the field names.
+        To answer the question choose ```organization_dev_2``` index
+        Then answer this question:
+        {state["query"]}
+        """
+    }
+    # A fresh agent executor is built for every call
+    agent_exec = agent_factory()
+    res = agent_exec.invoke(input_data)
+    state["agent_out"] = res["output"]
+    es_queries, es_results = {}, {}
+    # Each step is indexed as action[0] (the tool call) and action[-1] (its observation);
+    # only calls to the search tool are recorded
+    for i, action in enumerate(res.get("intermediate_steps", []), start=1):
+        if action[0].tool == "elastic_index_search_tool":
+            # assumes the tool input's "query" is a JSON string — TODO confirm
+            es_queries[f"query_{i}"] = json.loads(action[0].tool_input.get("query") or "{}")
+            # assumes the observation is the string form of a Python literal — TODO confirm
+            es_results[f"query_{i}"] = ast.literal_eval(action[-1] or "{}")
+    state["es_query"] = es_queries
+    state["es_result"] = es_results
+    return state
+def analyse_query(state: GraphState, llm: LLM) -> GraphState:
+    """
+    Classifies the user's query as general or database-specific and records
+    which node should process it next.
+    Args:
+        state (GraphState): Current graph state containing the user's query.
+        llm (LLM): Language model used for query analysis.
+    Returns:
+        GraphState: The same state object with "next_step" set to
+                    "es_database_agent" when the model's reply contains
+                    "Database", otherwise to "general_query".
+    """
+    print("> analyse query")
+    prompt_template = """Your task is to analyze the query  ```{query}``` and classify it in:
+    general: it's a basic general enquiry
+    Database: query which is complicated and would require to go into the database and extract specific information
+    Output format:
+    {{"category": "<your_classification>"}}
+    """
+    prompt = ChatPromptTemplate.from_template(prompt_template)
+    chain = RunnableSequence(prompt, llm)
+    response = chain.invoke({"query": state["query"]})
+    # Chat models return a message object while plain LLMs return a string; accept both
+    reply_text = response.content if isinstance(response, BaseMessage) else str(response)
+    # Substring match rather than strict parsing, so replies with extra text around the label still route
+    if "Database" in reply_text:
+        state["next_step"] = "es_database_agent"
+    else:
+        state["next_step"] = "general_query"
+    return state
+def final_answer(state: GraphState, llm: LLM) -> GraphState:
+    """
+    Asks the model to present the agent's output as the final reply to the user's query.
+    Args:
+        state (GraphState): Current graph state; "query" and "agent_out" are read.
+        llm (LLM): Language model used to format the final response.
+    Returns:
+        GraphState: Partial state update holding only the formatted answer in "agent_out".
+    """
+    print("> Final Answer")
+    prompt_template = """
+    Your task is to present the result based on the user's query:
+    Query: ```{query}```
+    AI Output:
+    ```{output}```
+    """
+    prompt = ChatPromptTemplate.from_template(prompt_template)
+    chain = RunnableSequence(prompt, llm)
+    response = chain.invoke({"query": state["query"], "output": state["agent_out"]})
+    # Chat models return a message object while plain LLMs return a string; accept both
+    answer = response.content if isinstance(response, BaseMessage) else str(response)
+    return {"agent_out": answer}
+def build_compute_graph(llm: LLM) -> StateGraph:
+    """
+    Wire up the query-processing workflow.
+    The graph starts at "analyse", branches to either "general_query" or
+    "es_database_agent" depending on the state's "next_step", and always finishes
+    through "final_answer".
+    Args:
+        llm (LLM): Language model bound into every LLM-backed node.
+    Returns:
+        StateGraph: The configured (not yet compiled) graph.
+    """
+    graph = StateGraph(GraphState)
+    # Register nodes; the database agent builds its own model, the others get `llm` bound up front
+    node_table = (
+        ("analyse", partial(analyse_query, llm=llm)),
+        ("general_query", partial(general_query, llm=llm)),
+        ("es_database_agent", database_agent),
+        ("final_answer", partial(final_answer, llm=llm)),
+    )
+    for node_name, node_fn in node_table:
+        graph.add_node(node_name, node_fn)
+    graph.set_entry_point("analyse")
+    # Route on the "next_step" value written by the analysis node
+    routes = {"es_database_agent": "es_database_agent", "general_query": "general_query"}
+    graph.add_conditional_edges("analyse", lambda x: x["next_step"], routes)
+    # Both branches converge on the final answer, which ends the workflow
+    for upstream in ("es_database_agent", "general_query"):
+        graph.add_edge(upstream, "final_answer")
+    graph.add_edge("final_answer", END)
+    return graph

ask_candid/base/__init__.py ADDED Viewed

File without changes

ask_candid/base/api_base.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from typing import Dict, Optional, Any
+from urllib3.util.retry import Retry
+from requests.adapters import HTTPAdapter
+import requests
+class BaseAPI:
+    """Synchronous REST client bound to a single URL, with automatic retries.
+    Requests are retried on HTTP 429/500/502/503/504 for HEAD, GET, POST and OPTIONS.
+    Parameters
+    ----------
+    url : str
+        Endpoint every request is sent to
+    headers : Optional[Dict[str, Any]], optional
+        Headers sent with every request, by default None
+    total_retries : int, optional
+        Number of retries, clamped to the range 0-10, by default 3
+    backoff_factor : int, optional
+        Backoff factor handed to the retry policy, by default 2
+    """
+    def __init__(
+        self,
+        url: str,
+        headers: Optional[Dict[str, Any]] = None,
+        total_retries: int = 3,
+        backoff_factor: int = 2
+    ) -> None:
+        # 10 is an upper bound, not a floor: a floor would silently turn the default of 3 into 10
+        total_retries = min(max(total_retries, 0), 10)
+        adapter = HTTPAdapter(
+            max_retries=Retry(
+                total=total_retries,
+                status_forcelist=[429, 500, 502, 503, 504],
+                allowed_methods=frozenset({"HEAD", "GET", "POST", "OPTIONS"}),
+                backoff_factor=backoff_factor,
+            )
+        )
+        self.session = requests.Session()
+        # The same retry policy applies to both schemes
+        self.session.mount("https://", adapter)
+        self.session.mount("http://", adapter)
+        self.__url = url
+        self.__headers = headers
+    def get(self, **request_kwargs):
+        """Send a GET with `request_kwargs` as query parameters and return the decoded JSON body.
+        Raises `requests.HTTPError` on a 4xx/5xx response; times out after 30 seconds.
+        """
+        r = self.session.get(url=self.__url, headers=self.__headers, params=request_kwargs, timeout=30)
+        r.raise_for_status()
+        return r.json()
+    def post(self, payload: Dict[str, Any]):
+        """Send `payload` as a JSON body via POST and return the decoded JSON response.
+        Raises `requests.HTTPError` on a 4xx/5xx response; times out after 30 seconds.
+        """
+        r = self.session.post(url=self.__url, headers=self.__headers, json=payload, timeout=30)
+        r.raise_for_status()
+        return r.json()

ask_candid/base/api_base_async.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from typing import Dict, Optional, Any
+import json
+import aiohttp
+class BaseAsyncAPI:
+    """Asynchronous REST client bound to a single URL, with simple retries.
+    A request is re-sent while the server answers 429/500/502/503/504. Unlike a
+    raising client, failures are reported as an empty dict.
+    Parameters
+    ----------
+    url : str
+        Endpoint every request is sent to
+    headers : Optional[Dict[str, Any]], optional
+        Headers sent with every request, by default None
+    retries : int, optional
+        Maximum number of attempts per call, clamped to the range 1-5, by default 3
+    """
+    _RETRY_STATUSES = frozenset({429, 500, 502, 503, 504})
+    def __init__(self, url: str, headers: Optional[Dict[str, Any]] = None, retries: int = 3) -> None:
+        self.__url = url
+        self.__headers = headers
+        # 5 is an upper bound, not a floor; at least one attempt is always made
+        self.__retries = min(max(retries, 1), 5)
+    async def _request(self, method: str, **kwargs):
+        """Send one logical request, retrying on transient statuses; return the JSON body or {}."""
+        session_timeout = aiohttp.ClientTimeout(total=30)
+        async with aiohttp.ClientSession(headers=self.__headers, timeout=session_timeout) as session:
+            for _ in range(self.__retries):
+                async with session.request(method, self.__url, **kwargs) as r:
+                    if r.status == 200:
+                        return await r.json()
+                    if r.status not in self._RETRY_STATUSES:
+                        # Any other status is treated as final; retrying would not help
+                        break
+        return {}
+    async def get(self, **request_kwargs):
+        """GET with `request_kwargs` as query parameters; returns the JSON body, or {} on failure."""
+        return await self._request("GET", params=request_kwargs)
+    async def post(self, payload: Dict[str, Any]):
+        """POST `payload` serialized as UTF-8 JSON bytes; returns the JSON body, or {} on failure."""
+        return await self._request("POST", data=json.dumps(payload).encode('utf8'))

ask_candid/base/config/__init__.py ADDED Viewed

File without changes

ask_candid/base/config/connections.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from dataclasses import dataclass, field
+from dotenv import dotenv_values, find_dotenv
+@dataclass
+class BaseElasticSearchConnection:
+    """Connection settings for an Elasticsearch host using basic authentication.
+    Every field defaults to the empty string.
+    """
+    url: str = ""
+    username: str = ""
+    password: str = ""
+@dataclass
+class BaseElasticAPIKeyCredential:
+    """Credentials for an Elasticsearch deployment addressed by cloud ID and API key.
+    Both fields default to the empty string.
+    """
+    cloud_id: str = ""
+    api_key: str = ""
+# Credentials are read from a `.env` file at import time; `raise_error_if_not_found=True`
+# makes the import fail when no such file can be located.
+__env_values__ = dotenv_values(
+    dotenv_path=find_dotenv(".env", raise_error_if_not_found=True)
+)
+# NOTE(review): `.get` yields None for a missing key although the dataclass fields are typed `str` — confirm
+SEMANTIC_ELASTIC_QA = BaseElasticAPIKeyCredential(
+    cloud_id=__env_values__.get("SEMANTIC_ELASTIC_CLOUD_ID"),
+    api_key=__env_values__.get("SEMANTIC_ELASTIC_API_KEY"),
+)
+# The host URL is hard-coded; only the username and password come from the environment file
+CDS_ELASTIC = BaseElasticSearchConnection(
+    url="https://cdses.candid.org:9200",
+    username=__env_values__.get("CDS_UID"),
+    password=__env_values__.get("CDS_PWD")
+)

ask_candid/base/config/data.py ADDED Viewed

	@@ -0,0 +1,20 @@

+class ElasticIndexMapping:
+    "Mapping from plain name to Elasticsearch index name"
+    # Names ending in `_ELSER` presumably refer to ELSER-based variants of an index — TODO confirm
+    ISSUELAB_INDEX = "search-semantic-issuelab_v1"
+    ISSUELAB_INDEX_ELSER = "search-semantic-issuelab-elser_ve2"
+    YOUTUBE_INDEX = "search-semantic-youtube_v1"
+    YOUTUBE_INDEX_ELSER = "search-semantic-youtube-elser_ve1"
+    CANDID_BLOG_INDEX = "search-semantic-candid-blog_v1"
+    CANDID_BLOG_INDEX_ELSER = "search-semantic-candid-blog-elser_ve2"
+    # NOTE(review): unlike its siblings, this `_ELSER` index name has no "-elser" segment — confirm
+    CANDID_LEARNING_INDEX_ELSER = "search-semantic-candid-learning_ve1"
+    CANDID_HELP_INDEX_ELSER = "search-semantic-candid-help-elser_ve1"
+# Plain names of every selectable source, kept as an immutable tuple
+ALL_INDICES = (
+    "issuelab",
+    "youtube",
+    "candid_blog",
+    "candid_learning",
+    "candid_help"
+)

ask_candid/base/config/models.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from types import MappingProxyType
+# Read-only mapping from a model's display name to the identifier sent to its provider.
+# Commented-out entries are currently disabled.
+Name2Endpoint = MappingProxyType({
+    "gpt-4o": "gpt-4o",
+    "claude-3.5-haiku": "us.anthropic.claude-3-5-haiku-20241022-v1:0",
+    # "llama-3.1-70b-instruct": "us.meta.llama3-1-70b-instruct-v1:0",
+    # "mistral-large": "mistral.mistral-large-2402-v1:0",
+    # "mixtral-8x7B": "mistral.mixtral-8x7b-instruct-v0:1",
+})

ask_candid/base/config/rest.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from typing import TypedDict
+from dotenv import dotenv_values, find_dotenv
+class Api(TypedDict):
+    """REST API configuration template
+    """
+    # Base URL of the service
+    url: str
+    # Secret key used to authenticate against the service
+    key: str
+# API settings are read from a `.env` file at import time; `raise_error_if_not_found=True`
+# makes the import fail when no such file can be located.
+__env_values__ = dotenv_values(
+    dotenv_path=find_dotenv(".env", raise_error_if_not_found=True)
+)
+CDS_API = Api(
+    url=__env_values__.get("CDS_API_URL"),
+    key=__env_values__.get("CDS_API_KEY")
+)
+# Only the key is configured for OpenAI; the URL is deliberately left unset (None despite the `str` annotation)
+OPENAI = Api(url=None, key=__env_values__.get("OPENAI_API_KEY"))

ask_candid/base/lambda_base.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from typing import List, Dict, Union, Optional, Any
+from time import sleep
+import json
+import boto3
+class LambdaInvokeBase:
+    """Base for classes that call one AWS Lambda function by direct invocation. Every subclass is bound to a
+    single function.
+    Parameters
+    ----------
+    function_name : str
+        Name of the Lambda function to invoke
+    access_key : Optional[str], optional
+        AWS access key, by default None
+    secret_key : Optional[str], optional
+        AWS secret key, by default None
+    """
+    # `FunctionError` values that trigger one retry
+    errors = frozenset({"Unhandled"})
+    def __init__(
+        self, function_name: str,
+        access_key: Optional[str] = None, secret_key: Optional[str] = None,
+    ) -> None:
+        client_options = {"region_name": "us-east-1"}
+        # Explicit credentials are used only when both halves are supplied
+        if access_key is not None and secret_key is not None:
+            client_options["aws_access_key_id"] = access_key
+            client_options["aws_secret_access_key"] = secret_key
+        self._client = boto3.client("lambda", **client_options)
+        self.function_name = function_name
+    def _invoke(self, payload: Dict[str, Any]) -> Dict[str, Any]:
+        """Perform one synchronous invocation with `payload` serialized as JSON."""
+        return self._client.invoke(
+            FunctionName=self.function_name,
+            InvocationType="RequestResponse",
+            Payload=json.dumps(payload),
+        )
+    def _submit_request(self, payload: Dict[str, Any]) -> Union[Dict[str, Any], List[Any]]:
+        """Invoke the function, retry once after one second on a listed error, and return the decoded payload."""
+        result = self._invoke(payload)
+        if result.get("FunctionError") in self.errors:
+            # A single bounded retry keeps the number of invocations predictable
+            sleep(1)
+            result = self._invoke(payload)
+        return json.loads(result["Payload"].read())

ask_candid/base/utils.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import asyncio
+def async_tasks(*tasks):
+    """Run awaitables to completion on a private event loop and return their results.
+    Each task is awaited in order; the objects they produce are then awaited together
+    with `asyncio.gather`, so every task must itself resolve to an awaitable.
+    Parameters
+    ----------
+    *tasks
+        Awaitables whose results are themselves awaitable
+    Returns
+    -------
+    list
+        Results in the same order as `tasks`
+    """
+    async def gather(*t):
+        t = [await _ for _ in t]
+        return await asyncio.gather(*t)
+    loop = asyncio.new_event_loop()
+    try:
+        return loop.run_until_complete(gather(*tasks))
+    finally:
+        # Always release the loop, even when a task raises
+        loop.close()

ask_candid/chat.py ADDED Viewed

	@@ -0,0 +1,251 @@

+from typing import List, Optional, Dict, Any, TypedDict, Annotated, Sequence
+from functools import partial
+import logging
+import os
+import gradio as gr
+from langchain_core.messages import AIMessage, BaseMessage
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.language_models.llms import LLM
+from langgraph.prebuilt import tools_condition, ToolNode
+from langgraph.checkpoint.memory import MemorySaver
+from langgraph.graph.state import StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.constants import START, END
+from ask_candid.tools.org_seach import extract_org_links_from_chatbot, embed_org_links_in_text, generate_org_link_dict
+from ask_candid.tools.question_reformulation import reformulate_question_using_history
+from ask_candid.utils import html_format_docs_chat, get_session_id
+from ask_candid.retrieval.elastic import retriever_tool
+ROOT = os.path.dirname(os.path.abspath(__file__))
+logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+# TODO https://www.metadocs.co/2024/08/29/simple-domain-specific-corrective-rag-with-langchain-and-langgraph/
+class AgentState(TypedDict):
+    # State carried between the nodes of the chat graph.
+    # The add_messages function defines how an update should be processed
+    # Default is to replace. add_messages says "append"
+    messages: Annotated[Sequence[BaseMessage], add_messages]
+    # Text of the question being answered
+    user_input: str
+    # Organization links found in the latest answer
+    org_dict: Dict
+def search_agent(state, llm: LLM, tools) -> AgentState:
+    """Let the tool-enabled model respond to the conversation so far. The model may
+    answer directly or request a tool call.
+    Parameters
+    ----------
+    state : AgentState
+        The current state; only "messages" is read
+    llm : LLM
+        Model that the tools are bound to
+    tools
+        Tools made available to the model through `bind_tools`
+    Returns
+    -------
+    AgentState
+        Update holding the model response (to be appended to the messages) and the
+        content of the latest message as "user_input"
+    """
+    logger.info("---SEARCH AGENT---")
+    history = state["messages"]
+    latest_question = history[-1].content
+    reply = llm.bind_tools(tools).invoke(history)
+    # Wrapped in a list because the update is added to the existing messages
+    return {"messages": [reply], "user_input": latest_question}
+def generate_with_context(state, llm: LLM) -> AgentState:
+    """Generate the answer from the retrieved context.
+    The latest message is expected to be the retriever output: its content is used as
+    the context string and its `artifact` is rendered to HTML and appended to the
+    state's messages.
+    Parameters
+    ----------
+    state : AgentState
+        The current state; "messages" and "user_input" are read
+    llm : LLM
+        Model that writes the answer
+    Returns
+    -------
+    AgentState
+        Update holding the answer as an `AIMessage` and the unchanged "user_input"
+    """
+    logger.info("---GENERATE ANSWER---")
+    messages = state["messages"]
+    question = state["user_input"]
+    last_message = messages[-1]
+    sources_str = last_message.content
+    sources_list = last_message.artifact  # cannot use directly as list of Documents
+    # converting to html string
+    sources_html = html_format_docs_chat(sources_list)
+    if sources_list:
+        logger.info("---ADD SOURCES---")
+    state["messages"].append(BaseMessage(content=sources_html, type="HTML"))
+    # Prompt
+    qa_system_prompt = """
+        You are an assistant for question-answering tasks in the social and philanthropic sector. \n
+        Use the following pieces of retrieved context to answer the question at the end. \n
+        If you don't know the answer, just say that you don't know. \n
+        Keep the response professional, friendly, and as concise as possible. \n
+        Question: {question}
+        Context: {context}
+        Answer:
+        """
+    qa_prompt = ChatPromptTemplate(
+        [
+            ("system", qa_system_prompt),
+            # Pass the question through a placeholder: using the raw text as the template
+            # would make any "{" or "}" typed by the user break prompt formatting
+            ("human", "{question}"),
+        ]
+    )
+    rag_chain = qa_prompt | llm | StrOutputParser()
+    response = rag_chain.invoke({"context": sources_str, "question": question})
+    # Wrapped explicitly: returning the bare string was treated as a HumanMessage
+    return {"messages": [AIMessage(content=response)], "user_input": question}
+def has_org_name(state: AgentState) -> AgentState:
+    """
+    Processes the latest message to extract organization links and determine the next step.
+    Args:
+        state (AgentState): The current state of the agent, including a list of messages.
+    Returns:
+        dict: A dictionary with the next agent action and, if available, a dictionary of organization links.
+    """
+    logger.info("---HAS ORG NAMES?---")
+    messages = state["messages"]
+    last_message = messages[-1].content
+    output_list = extract_org_links_from_chatbot(last_message)
+    link_dict = generate_org_link_dict(output_list) if output_list else {}
+    if link_dict:
+        logger.info("---FOUND ORG NAMES---")
+        return {"next": "insert_org_link", "org_dict": link_dict}
+    logger.info("---NO ORG NAMES FOUND---")
+    return {"next": END, "messages": messages}
+def insert_org_link(state: AgentState) -> AgentState:
+    """
+    Embeds organization links in the latest message content and returns it as an AI message.
+    Args:
+        state (dict): The current state, including the organization links and latest message.
+    Returns:
+        dict: A dictionary with the updated message content as an AIMessage.
+    """
+    logger.info("---INSERT ORG LINKS---")
+    messages = state["messages"]
+    last_message = messages[-1].content
+    messages.pop(-1)  # Deleting the original message because we will append the same one but with links
+    link_dict = state["org_dict"]
+    last_message = embed_org_links_in_text(last_message, link_dict)
+    return {"messages": [AIMessage(content=last_message)]}
+def build_compute_graph(llm: LLM, indices: List[str]) -> StateGraph:
+    candid_retriever_tool = retriever_tool(indices=indices)
+    retrieve = ToolNode([candid_retriever_tool])
+    tools = [candid_retriever_tool]
+    G = StateGraph(AgentState)
+    # Add nodes
+    G.add_node("reformulate", partial(reformulate_question_using_history, llm=llm))
+    G.add_node("search_agent", partial(search_agent, llm=llm, tools=tools))
+    G.add_node("retrieve", retrieve)
+    G.add_node("generate_with_context", partial(generate_with_context, llm=llm))
+    G.add_node("has_org_name", has_org_name)
+    G.add_node("insert_org_link", insert_org_link)
+    # Add edges
+    G.add_edge(START, "reformulate")
+    G.add_edge("reformulate", "search_agent")
+    # Conditional edges from search_agent
+    G.add_conditional_edges(
+        source="search_agent",
+        path=tools_condition,
+        path_map={
+            "tools": "retrieve",
+            END: "has_org_name",
+        },
+    )
+    G.add_edge("retrieve", "generate_with_context")
+    # Add edges
+    G.add_edge("generate_with_context", "has_org_name")
+    # Use add_conditional_edges for has_org_name
+    G.add_conditional_edges(
+        "has_org_name",
+        lambda x: x["next"],  # Now we're accessing the 'next' key from the dict
+        {"insert_org_link": "insert_org_link", END: END},
+    )
+    G.add_edge("insert_org_link", END)
+    return G
+def run_chat(
+    thread_id: str,
+    user_input: Dict[str, Any],
+    history: List[Dict],
+    llm: LLM,
+    indices: Optional[List[str]] = None,
+):
+    # https://langchain-ai.github.io/langgraph/tutorials/rag/langgraph_agentic_rag/#graph
+    if len(history) == 0:
+        history.append({
+            "role": "system",
+            "content": (
+                "You are a Candid subject matter expert on the social sector and philanthropy. "
+                "You should address the user's queries and stay on topic."
+            )
+        })
+    history.append({"role": "user", "content": user_input["text"]})
+    inputs = {"messages": history}
+    # thread_id can be an email https://github.com/yurisasc/memory-enhanced-ai-assistant/blob/main/assistant.py
+    thread_id = get_session_id(thread_id)
+    config = {"configurable": {"thread_id": thread_id}}
+    workflow = build_compute_graph(llm=llm, indices=indices)
+    memory = MemorySaver()  # TODO: don't use for Prod
+    graph = workflow.compile(checkpointer=memory)
+    response = graph.invoke(inputs, config=config)
+    messages = response["messages"]
+    last_message = messages[-1]
+    ai_answer = last_message.content
+    sources_html = ""
+    for message in messages[-2:]:
+        if message.type == "HTML":
+            sources_html = message.content
+    history.append({"role": "assistant", "content": ai_answer})
+    if sources_html:
+        history.append(
+            {
+                "role": "assistant",
+                "content": sources_html,
+                "metadata": {"title": "Sources HTML"},
+            }
+        )
+    return gr.MultimodalTextbox(value=None, interactive=True), history, thread_id

ask_candid/indexing/__init__.py ADDED Viewed

File without changes

ask_candid/retrieval/__init__.py ADDED Viewed

File without changes

ask_candid/retrieval/elastic.py ADDED Viewed

	@@ -0,0 +1,323 @@

+from typing import List, Tuple, Dict, Iterable, Iterator, Optional, Any
+from dataclasses import dataclass
+from functools import partial
+from itertools import groupby
+from pydantic import BaseModel, Field
+from langchain_core.documents import Document
+from langchain_core.tools import Tool
+from elasticsearch import Elasticsearch
+from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA
+from ask_candid.base.config.data import ElasticIndexMapping, ALL_INDICES
+@dataclass
+class ElasticHitsResult:
+    """Dataclass for Elasticsearch hits results
+    """
+    # name of the index the hit came from (the hit's `_index`)
+    index: str
+    # document identifier (the hit's `_id`)
+    id: Any
+    # relevance score (the hit's `_score`); `reranker` overwrites it with a per-index normalised value
+    score: float
+    # document body (the hit's `_source`)
+    source: Dict[str, Any]
+    # nested matches keyed by inner-hits name; empty dict when the hit has none
+    inner_hits: Dict[str, Any]
+class RetrieverInput(BaseModel):
+    """Input to the Elasticsearch retriever."""
+    # free-text query; this model is registered as the `args_schema` of the tool built by `retriever_tool`
+    user_input: str = Field(description="query to look up in retriever")
+def build_text_expansion_query(
+    query: str,
+    fields: Tuple[str],
+    model_id: str = ".elser_model_2_linux-x86_64"
+) -> Dict[str, Any]:
+    output = []
+    for f in fields:
+        output.append({
+            "nested": {
+                "path": f"embeddings.{f}.chunks",
+                "query": {
+                    "text_expansion": {
+                        f"embeddings.{f}.chunks.vector": {
+                            "model_id": model_id,
+                            "model_text": query,
+                            "boost": 1 / len(fields)
+                        }
+                    }
+                },
+                 "inner_hits": {
+                    "_source": False,
+                    "size": 2,
+                    "fields": [f"embeddings.{f}.chunks.chunk"]
+                }
+            }
+        })
+    return {"query": {"bool": {"should": output}}}
+def query_builder(query: str, indices: List[str]):
+    queries = []
+    if indices is None:
+        indices = list(ALL_INDICES)
+    for index in indices:
+        if index == "issuelab":
+            q = build_text_expansion_query(
+                query=query,
+                fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
+            )
+            q["_source"] = {"excludes": ["embeddings"]}
+            q["size"] = 1
+            queries.extend([{"index": ElasticIndexMapping.ISSUELAB_INDEX_ELSER}, q])
+        elif index == "youtube":
+            q = build_text_expansion_query(
+                query=query,
+                fields=("captions_cleaned", "description_cleaned", "title")
+            )
+            # text_cleaned duplicates captions_cleaned
+            q["_source"] = {"excludes": ["embeddings", "captions", "description", "text_cleaned"]}
+            q["size"] = 2
+            queries.extend([{"index": ElasticIndexMapping.YOUTUBE_INDEX_ELSER}, q])
+        elif index == "candid_blog":
+            q = build_text_expansion_query(
+                query=query,
+                fields=("content", "title")
+            )
+            q["_source"] = {"excludes": ["embeddings"]}
+            q["size"] = 2
+            queries.extend([{"index": ElasticIndexMapping.CANDID_BLOG_INDEX_ELSER}, q])
+        elif index == "candid_learning":
+            q = build_text_expansion_query(
+                query=query,
+                fields=("content", "title", "training_topics", "staff_recommendations")
+            )
+            q["_source"] = {"excludes": ["embeddings"]}
+            q["size"] = 2
+            queries.extend([{"index": ElasticIndexMapping.CANDID_LEARNING_INDEX_ELSER}, q])
+        elif index == "candid_help":
+            q = build_text_expansion_query(
+                query=query,
+                fields=("content", "combined_article_description")
+            )
+            q["_source"] = {"excludes": ["embeddings"]}
+            q["size"] = 2
+            queries.extend([{"index": ElasticIndexMapping.CANDID_HELP_INDEX_ELSER}, q])
+    return queries
+def multi_search(queries: List[ElasticHitsResult]):
+    results = []
+    with Elasticsearch(
+        cloud_id=SEMANTIC_ELASTIC_QA.cloud_id,
+        api_key=SEMANTIC_ELASTIC_QA.api_key,
+        verify_certs=False,
+        request_timeout=60 * 3
+    ) as es:
+        for query_group in es.msearch(body=queries).get("responses", []):
+            for hit in query_group.get("hits", {}).get("hits", []):
+                hit = ElasticHitsResult(
+                    index=hit["_index"],
+                    id=hit["_id"],
+                    score=hit["_score"],
+                    source=hit["_source"],
+                    inner_hits=hit.get("inner_hits", {})
+                )
+                results.append(hit)
+    return results
+def get_query_results(search_text: str, indices: Optional[List[str]] = None):
+    queries = query_builder(query=search_text, indices=indices)
+    return multi_search(queries)
+def reranker(query_results: Iterable[ElasticHitsResult]) -> Iterator[ElasticHitsResult]:
+    """Reranks Elasticsearch hits coming from multiple indicies/queries which may have scores on different scales.
+    This will shuffle results
+    Parameters
+    ----------
+    query_results : Iterable[ElasticHitsResult]
+    Yields
+    ------
+    Iterator[ElasticHitsResult]
+    """
+    results: List[ElasticHitsResult] = []
+    for _, data in groupby(query_results, key=lambda x: x.index):
+        data = list(data)
+        max_score = max(data, key=lambda x: x.score).score
+        min_score = min(data, key=lambda x: x.score).score
+        for d in data:
+            d.score = (d.score - min_score) / (max_score - min_score + 1e-9)
+            results.append(d)
+    yield from sorted(results, key=lambda x: x.score, reverse=True)
+def get_results(user_input: str, indices: List[str]) -> List[ElasticHitsResult]:
+    output = ["Search didn't return any Candid sources"]
+    page_content=[]
+    content = "Search didn't return any Candid sources"
+    results = get_query_results(search_text=user_input, indices=indices)
+    if results:
+        output = get_reranked_results(results)
+        for doc in output:
+            page_content.append(doc.page_content)
+        content = "/n/n".join(page_content)
+    # for the tool we need to return a tuple for content_and_artifact type
+    return content, output
+def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024) -> str:
+    """Pads the relevant chunk of text with context before and after
+    Parameters
+    ----------
+    field_name : str
+        a field with the long text that was chunked into pieces
+    hit : ElasticHitsResult
+    context_length : int, optional
+        length of text to add before and after the chunk, by default 1024
+    Returns
+    -------
+    str
+        longer chunks stuffed together
+    """
+    chunks_with_context = []
+    long_text = hit.source.get(f"{field_name}", "")
+    inner_hits_field = f"embeddings.{field_name}.chunks"
+    inner_hits = hit.inner_hits
+    found_chunks = inner_hits.get(inner_hits_field, {})
+    if found_chunks:
+        hits = found_chunks.get("hits", {}).get("hits", [])
+        for h in hits:
+            chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
+            chunk = chunk[3:-3] # cutting the middle because we may have tokenizing artefacts there
+            # Find the start and end indices of the chunk in the large text
+            start_index = long_text.find(chunk)
+            if start_index != -1: # Chunk is found
+                end_index = start_index + len(chunk)
+                pre_start_index = max(0, start_index - context_length)
+                post_end_index = min(len(long_text), end_index + context_length)
+                context = long_text[pre_start_index:post_end_index]
+                chunks_with_context.append(context)
+    chunks_with_context_txt = '\n\n'.join(chunks_with_context)
+    return chunks_with_context_txt
+def process_hit(hit: ElasticHitsResult) -> Document | None:
+    if "issuelab-elser" in hit.index:
+        combined_item_description = hit.source.get("combined_item_description", "") # title inside
+        description = hit.source.get("description", "")
+        combined_issuelab_findings = hit.source.get("combined_issuelab_findings", "")
+        # we only need to process long texts
+        chunks_with_context_txt = get_context("content", hit, context_length=12)
+        doc = Document(
+            page_content='\n\n'.join([
+                combined_item_description,
+                combined_issuelab_findings,
+                description,
+                chunks_with_context_txt
+            ]),
+            metadata={
+                "title": hit.source["title"],
+                "source": "IssueLab",
+                "source_id": hit.source["resource_id"],
+                "url": hit.source.get("permalink", "")
+            }
+        )
+    elif "youtube" in hit.index:
+        title = hit.source.get("title", "")
+        # we only need to process long texts
+        description_cleaned_with_context_txt = get_context("description_cleaned", hit, context_length=12)
+        captions_cleaned_with_context_txt = get_context("captions_cleaned", hit, context_length=12)
+        doc = Document(
+            page_content='\n\n'.join([title, description_cleaned_with_context_txt, captions_cleaned_with_context_txt]),
+            metadata={
+                "title": title,
+                "source": "Candid YouTube",
+                "source_id": hit.source['video_id'],
+                "url": f"https://www.youtube.com/watch?v&#61;{hit.source['video_id']}"
+            }
+        )
+    elif "candid-blog" in hit.index:
+        excerpt = hit.source.get("excerpt", "")
+        title = hit.source.get("title", "")
+        # we only need to process long texts
+        content_with_context_txt = get_context("content", hit, context_length=12)
+        doc = Document(
+            page_content='\n\n'.join([title, excerpt, content_with_context_txt]),
+            metadata={
+                "title": title,
+                "source": "Candid Blog",
+                "source_id": hit.source["id"],
+                "url": hit.source["link"]
+            }
+        )
+    elif "candid-learning" in hit.index:
+        title = hit.source.get("title", "")
+        content_with_context_txt = get_context("content", hit, context_length=12)
+        training_topics = hit.source.get("training_topics", "")
+        staff_recommendations = hit.source.get("staff_recommendations", "")
+        doc = Document(
+            page_content='\n\n'.join([title, staff_recommendations, training_topics, content_with_context_txt]),
+            metadata={
+                "title": hit.source["title"],
+                "source": "Candid Learning",
+                "source_id": hit.source["post_id"],
+                "url": hit.source.get("url", "")
+            }
+        )
+    elif "candid-help" in hit.index:
+        title = hit.source.get("title", "")
+        content_with_context_txt = get_context("content", hit, context_length=12)
+        combined_article_description = hit.source.get("combined_article_description", "")
+        doc = Document(
+            page_content='\n\n'.join([combined_article_description, content_with_context_txt]),
+            metadata={
+                "title": title,
+                "source": "Candid Help",
+                "source_id": hit.source["id"],
+                "url": hit.source.get("link", "")
+            }
+        )
+    else:
+        doc = None
+    return doc
+def get_reranked_results(results: List[ElasticHitsResult]) -> List[Document]:
+    output = []
+    for r in reranker(results):
+        hit = process_hit(r)
+        output.append(hit)
+    return output
+def retriever_tool(indices: List[str]) -> Tool:
+    # cannot use create_retriever_tool because it only provides content losing all metadata on the way
+    # https://python.langchain.com/docs/how_to/custom_tools/#returning-artifacts-of-tool-execution
+    return Tool(
+        name="retrieve_social_sector_information",
+        func=partial(get_results, indices=indices),
+        description=(
+            "Return additional information about social and philanthropic sector, "
+            "including nonprofits (NGO), grants, foundations, funding, RFP, LOI, Candid."
+        ),
+        args_schema=RetrieverInput,
+        response_format="content_and_artifact"
+    )

ask_candid/retrieval/sources/__init__.py ADDED Viewed

File without changes

ask_candid/retrieval/sources/candid_blog.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from typing import Dict, Any
+def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
+    url = f"{doc['link']}"
+    fields = ["title", "excerpt"]
+    fields_dict = {}
+    fields_len = 0
+    for field in fields:
+        if doc.get(field, None) is not None:
+            fields_dict[field] = doc[field]
+            fields_dict[field + "_txt"] = f"<div>{doc[field]}</div>"
+            if (fields_len + len(doc[field])) > 999:
+                rest_text_len = 999 - fields_len
+                if rest_text_len > 0:
+                    fields_dict[field + "_txt"] = f"<div>{doc[field][:rest_text_len] + '[...]'}</div>"
+                else: fields_dict[field + "_txt"] = f"<span>{'[...]'}</span>"
+            fields_len = fields_len + len(doc[field])
+        else:
+            fields_dict[field] = ""
+            fields_dict[field + "_txt"] = ""
+    html = f"""
+    <div style='height: {height_px}px; padding: 5px;'>
+        <div style='height: {height_px}px; border: 1px solid #febe10;'>
+            <span style='padding-left: 10px; display: inline-block; width: 100%;'>
+                <div>
+                    <span>
+                        <b>Candid blog post:</b>
+                        <a href='{url}' target='_blank' style='text-decoration: none;'>
+                            {doc['title']}
+                        </a>
+                    </span>
+                    <br>
+                    <br>
+                    {fields_dict["excerpt_txt"]}
+                </div>
+            </span>
+        </div>
+    </div>
+    """
+    return html

ask_candid/retrieval/sources/candid_help.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from typing import Dict, Any
+def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
+    url = f"{doc['link']}"
+    fields = ["title", "summary"]
+    fields_dict = {}
+    fields_len = 0
+    for field in fields:
+        if doc.get(field, None) is not None:
+            fields_dict[field] = doc[field]
+            fields_dict[field + "_txt"] = f"<div>{doc[field]}</div>"
+            if (fields_len + len(doc[field])) > 999:
+                rest_text_len = 999 - fields_len
+                if rest_text_len > 0:
+                    fields_dict[field + "_txt"] = f"<div>{doc[field][:rest_text_len] + '[...]'}</div>"
+                else: fields_dict[field + "_txt"] = f"<span>{'[...]'}</span>"
+            fields_len = fields_len + len(doc[field])
+        else:
+            fields_dict[field] = ""
+            fields_dict[field + "_txt"] = ""
+    html = f"""
+    <div style='height: {height_px}px; padding: 5px;'>
+        <div style='height: {height_px}px; border: 1px solid #febe10;'>
+            <span style='padding-left: 10px; display: inline-block; width: 100%;'>
+                <div>
+                    <span>
+                        <b>Candid help article:</b>
+                        <a href='{url}' target='_blank' style='text-decoration: none;'>
+                            {doc['title']}
+                        </a>
+                    </span>
+                    <br>
+                </div>
+            </span>
+        </div>
+    </div>
+    """
+    return html

ask_candid/retrieval/sources/candid_learning.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from typing import Dict, Any
+def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
+    url = f"{doc['url']}"
+    fields = ["title", "excerpt"]
+    fields_dict = {}
+    fields_len = 0
+    for field in fields:
+        if doc.get(field, None) is not None:
+            fields_dict[field] = doc[field]
+            fields_dict[field + "_txt"] = f"<div>{doc[field]}</div>"
+            if (fields_len + len(doc[field])) > 999:
+                rest_text_len = 999 - fields_len
+                if rest_text_len > 0:
+                    fields_dict[field + "_txt"] = f"<div>{doc[field][:rest_text_len] + '[...]'}</div>"
+                else: fields_dict[field + "_txt"] = f"<span>{'[...]'}</span>"
+            fields_len = fields_len + len(doc[field])
+        else:
+            fields_dict[field] = ""
+            fields_dict[field + "_txt"] = ""
+    html = f"""
+    <div style='height: {height_px}px; padding: 5px;'>
+        <div style='height: {height_px}px; border: 1px solid #febe10;'>
+            <span style='padding-left: 10px; display: inline-block; width: 100%;'>
+                <div>
+                    <span>
+                        <b>Candid Learning resource:</b>
+                        <a href='{url}' target='_blank' style='text-decoration: none;'>
+                            {doc['title']}
+                        </a>
+                    </span>
+                    <br>
+                </div>
+            </span>
+        </div>
+    </div>
+    """
+    return html

ask_candid/retrieval/sources/issuelab.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from typing import Dict, Any
+def issuelab_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
+    chunks_html = ""
+    if show_chunks:
+        cleaned_text = []
+        for k, v in doc["inner_hits"].items():
+            hits = v["hits"]["hits"]
+            for h in hits:
+                for k1, v1 in h["fields"].items():
+                    # we don't want other chunks
+                    if "content" in k1:
+                        cleaned_text.append(f"<div><p>{v1[0]['chunk'][0]}</p></div>")
+        chunks_html ="<span><b>Relevant parts of the content:</b></span>" + "<br>".join(cleaned_text)
+    html = f"""
+    <div style='height: auto; padding: 5px;'>
+        <div style='border: 1px solid #febe10;'>
+            <span style='display: inline-block; height: {height_px - 10}px; padding: 5px; vertical-align: top;'>
+                <img
+                    src='{doc['cover_graphic_small']}'
+                    style='max-height: 100%; overflow: hidden; border-radius: 3%;'
+                >
+            </span>
+            <span style='padding: 10px; display: inline-block; width: 70%;'>
+                <div>
+                    <span><b>Issuelab ID:</b> {doc['resource_id']}</span>
+                    <br>
+                    <span>
+                        <a href='{doc['issuelab_url']}' target='_blank' style='text-decoration: none;'>
+                            {doc['title']}
+                        </a>
+                    </span>
+                    <br>
+                    <span><b>Description:</b> {doc['description']}</span>
+                    <br>
+                    <div>{doc['combined_item_description']}</div>
+                    <br>
+                    <div>{chunks_html}</div>
+                </div>
+            </span>
+        </div>
+    </div>
+    """
+    return html

ask_candid/retrieval/sources/youtube.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from typing import Dict, Any
+def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
+    url = f"https://www.youtube.com/watch?v={doc['video_id']}"
+    fields = ["title", "description_cleaned"]
+    fields_dict = {}
+    fields_len = 0
+    for field in fields:
+        if doc.get(field, None) is not None:
+            fields_dict[field] = doc[field]
+            fields_dict[field + "_txt"] = f"<div>{doc[field]}</div>"
+            if (fields_len + len(doc[field])) > 999:
+                rest_text_len = 999 - fields_len
+                if rest_text_len > 0:
+                    fields_dict[field + "_txt"] = f"<div>{doc[field][:rest_text_len] + '[...]'}</div>"
+                else: fields_dict[field + "_txt"] = f"<span>{'[...]'}</span>"
+            fields_len = fields_len + len(doc[field])
+        else:
+            fields_dict[field] = ""
+            fields_dict[field + "_txt"] = ""
+    html = f"""
+    <div style='height: {height_px}px; padding: 5px;'>
+        <div style='height: {height_px}px; border: 1px solid #febe10;'>
+            <span style='padding-left: 10px; display: inline-block; width: 100%;'>
+                <div>
+                    <span>
+                        <b>Candid Youtube video:</b>
+                        <a href='{url}' target='_blank' style='text-decoration: none;'>
+                            {doc['title']}
+                        </a>
+                    </span>
+                    <iframe
+                        width="426"
+                        height="240"
+                        src="https://www.youtube.com/embed/{doc['video_id']}?si=0-y6eRrOzXTUSBDY"
+                        title="YouTube video player"
+                        frameborder="0"
+                        allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
+                        referrerpolicy="strict-origin-when-cross-origin"
+                        allowfullscreen
+                        style="display: inline-block; float: left;padding-right: 10px;padding-top: 5px;">
+                    </iframe>
+                    <br>
+                    <br>
+                    {fields_dict["description_cleaned_txt"]}
+                </div>
+            </span>
+        </div>
+    </div>
+    """
+    return html

ask_candid/services/__init__.py ADDED Viewed

File without changes

ask_candid/services/org_search.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from ask_candid.base.api_base import BaseAPI
+from ask_candid.base.config.rest import CDS_API
+class OrgSearch(BaseAPI):
+    def __init__(self):
+        super().__init__(
+            url=f"{CDS_API['url']}/v1/organization/search",
+            headers={"x-api-key": CDS_API["key"]}
+        )
+    def __call__(self, name: str, name_only: bool = False, **kwargs):
+        is_valid = False
+        payload = {
+            "names": [{
+                "value": name,
+                "type": "main"
+            }],
+            "status": "authorized"
+        }
+        if name_only:
+            is_valid = True
+        else:
+            if kwargs.get("ein"):
+                ein = kwargs.get("ein")
+                if "-" not in ein:
+                    ein = f"{ein[:2]}-{ein[2:]}"
+                payload["ids"] = [{
+                    "value": ein,
+                    "type": "ein"
+                }]
+                is_valid = True
+            if kwargs.get("street") or kwargs.get("city") or kwargs.get("state") or kwargs.get("postal_code"):
+                payload["addresses"] = [{
+                    "street1": kwargs.get("street") or "",
+                    "city": kwargs.get("city") or "",
+                    "state": kwargs.get("state") or "",
+                    "postal_code": kwargs.get("postal_code") or ""
+                }]
+                is_valid = True
+        if not is_valid:
+            return None
+        result = self.post(payload=payload)
+        return result.get("payload", [])

ask_candid/services/small_lm.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from typing import List, Optional
+from dataclasses import dataclass
+from enum import Enum
+import torch
+from ask_candid.base.lambda_base import LambdaInvokeBase
+@dataclass(slots=True)
+class Encoding:
+    """Result of an encode request: the texts and their vectors."""
+    # taken from the response's "inputs" entry (empty list when absent)
+    inputs: List[str]
+    # float32 tensor built from the response's "vectors" entry
+    vectors: torch.Tensor
+class CandidSLM(LambdaInvokeBase):
+    """Wrapper around Candid's custom small language model.
+    For more details see https://dev.azure.com/guidestar/DataScience/_git/graph-ai?path=/releases/language.
+    This services includes:
+        * text encoding
+        * document summarization
+        * entity salience estimation
+    Parameters
+    ----------
+    access_key : Optional[str], optional
+        AWS access key, by default None
+    secret_key : Optional[str], optional
+        AWS secret key, by default None
+    """
+    class Tasks(Enum):  # pylint: disable=missing-class-docstring
+        ENCODE = "/encode"
+        DOCUMENT_SUMMARIZE = "/document/summarize"
+        DOCUMENT_NER_SALIENCE = "/document/entitySalience"
+    def __init__(
+        self, access_key: Optional[str] = None, secret_key: Optional[str] = None
+    ) -> None:
+        super().__init__(
+            function_name="small-lm",
+            access_key=access_key,
+            secret_key=secret_key
+        )
+    def encode(self, text: List[str]) -> Encoding:
+        response = self._submit_request({"text": text})
+        output = Encoding(
+            inputs=(response.get("inputs") or []),
+            vectors=torch.tensor((response.get("vectors") or []), dtype=torch.float32)
+        )
+        return output

ask_candid/tools/__init__.py ADDED Viewed

File without changes

ask_candid/tools/elastic/__init__.py ADDED Viewed

File without changes

ask_candid/tools/elastic/index_data_tool.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from typing import Type, Optional
+import logging
+from pydantic import BaseModel, Field
+from elasticsearch import Elasticsearch
+from langchain.callbacks.manager import CallbackManagerForToolRun
+from langchain.tools.base import BaseTool
+from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA
+# NOTE: importing this module configures the root logger at INFO level
+logging.basicConfig(level="INFO")
+logger = logging.getLogger("elasticsearch_playground")
+# Module-level Elasticsearch client, created at import time and used by IndexShowDataTool._run
+es = Elasticsearch(
+    cloud_id=SEMANTIC_ELASTIC_QA.cloud_id,
+    api_key=SEMANTIC_ELASTIC_QA.api_key,
+    verify_certs=True,
+    request_timeout=60 * 3  # three minutes
+)
+class IndexShowDataInput(BaseModel):
+    """Input for the index show data tool."""
+    # required: `...` as the first Field argument means there is no default
+    index_name: str = Field(
+        ..., description="The name of the index for which the data is to be retrieved"
+    )
+class IndexShowDataTool(BaseTool):
+    """Tool for getting a list of entries from an ElasticSearch index, helpful to figure out what data is available."""
+    name: str = "elastic_index_show_data"  # Added type annotation
+    description: str = (
+        "Input is an index name, output is a JSON based string with an extract of the data of the index"
+    )
+    args_schema: Optional[Type[BaseModel]] = (
+        IndexShowDataInput  # This should be placed before methods
+    )
+    def _run(
+        self,
+        index_name: str,
+        run_manager: Optional[CallbackManagerForToolRun] = None,
+    ) -> str:
+        """Get all indices in the Elasticsearch server, usually separated by a line break."""
+        try:
+            # Ensure `es` is properly initialized before this method is called
+            res = es.search(
+                index=index_name,
+                from_=0,
+                size=20,
+                query={"match_all": {}},
+            )
+            return str(res["hits"])
+        except Exception as e:
+            print(e)
+            logger.exception("Could not fetch index data for %s", index_name)
+            return ""

ask_candid/tools/elastic/index_details_tool.py ADDED Viewed

	@@ -0,0 +1,73 @@

+from typing import Type, Optional
+import logging
+from pydantic import BaseModel, Field
+from elasticsearch import Elasticsearch
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForToolRun,
+    CallbackManagerForToolRun,
+)
+from langchain.tools.base import BaseTool
+from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA
+# Import-time side effect: configures the root logger at INFO level.
+logging.basicConfig(level="INFO")
+logger = logging.getLogger("elasticsearch_playground")
+# Module-level Elasticsearch client used by the tool below. Credentials come from
+# SEMANTIC_ELASTIC_QA; each request is allowed up to 180 seconds.
+es = Elasticsearch(
+    cloud_id=SEMANTIC_ELASTIC_QA.cloud_id,
+    api_key=SEMANTIC_ELASTIC_QA.api_key,
+    verify_certs=True,
+    request_timeout=60 * 3
+)
+class IndexDetailsInput(BaseModel):
+    """Argument schema for the index details tool: a single required index name."""
+    # Required (no default); the description is the text exposed alongside the field.
+    index_name: str = Field(
+        ...,
+        description="The name of the index for which the details are to be retrieved",
+    )
+class IndexDetailsTool(BaseTool):
+    """Tool for getting information about a single ElasticSearch index."""
+    name: str = "elastic_index_show_details"  # Added type annotation
+    description: str = (
+        "Input is an index name, output is a JSON-based string with the aliases, mappings containing the field names, and settings of an index."
+    )
+    args_schema: Optional[Type[BaseModel]] = (
+        IndexDetailsInput  # Ensure this is above the methods
+    )
+    def _run(
+        self,
+        index_name: str,
+        run_manager: Optional[CallbackManagerForToolRun] = None,
+    ) -> str:
+        """Get information about a single Elasticsearch index."""
+        try:
+            # Ensure that `es` is correctly initialized before calling this method
+            alias = es.indices.get_alias(index=index_name)
+            field_mappings = es.indices.get_field_mapping(index=index_name, fields="*")
+            field_settings = es.indices.get_settings(index=index_name)
+            return str(
+                {
+                    "alias": alias[index_name],
+                    "field_mappings": field_mappings[index_name],
+                    "settings": field_settings[index_name],
+                }
+            )
+        except Exception as e:
+            logger.exception("Could not fetch index information for %s: %s", index_name, e)
+            return ""
+    async def _arun(
+        self,
+        index_name: str = "",
+        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
+    ) -> str:
+        raise NotImplementedError("IndexDetailsTool does not support async operations")

ask_candid/tools/elastic/index_search_tool.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import logging
+import json
+import tiktoken
+from elasticsearch import Elasticsearch
+# from pydantic.v1 import BaseModel, Field  # <-- Uses v1 namespace
+from pydantic import BaseModel, Field
+from langchain.tools import StructuredTool
+from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA
+# Import-time side effect: configures the root logger at INFO level.
+logging.basicConfig(level="INFO")
+logger = logging.getLogger("elasticsearch_playground")
+# Module-level Elasticsearch client used by the search function below. Credentials come from
+# SEMANTIC_ELASTIC_QA; each request is allowed up to 180 seconds.
+es = Elasticsearch(
+    cloud_id=SEMANTIC_ELASTIC_QA.cloud_id,
+    api_key=SEMANTIC_ELASTIC_QA.api_key,
+    verify_certs=True,
+    request_timeout=60 * 3
+)
+class SearchToolInput(BaseModel):
+    """Argument schema for the index search tool: target index, JSON query and paging window."""
+    index_name: str = Field(
+        ..., description="The name of the index for which the data is to be retrieved"
+    )
+    # The query is passed as a JSON string, not as a parsed object.
+    query: str = Field(
+        ...,
+        description="The ElasticSearch JSON query used to filter all hits. Should use the _source field if possible to specify required fields.",
+    )
+    # Trailing underscore avoids the reserved word `from`.
+    from_: int = Field(
+        ..., description="The record index from which the query will start"
+    )
+    size: int = Field(
+        ...,
+        description="How many records will be retrieved from the ElasticSearch query",
+    )
+def elastic_search(
+    index_name: str,
+    query: str,
+    from_: int = 0,
+    size: int = 20,
+):
+    """Executes a specific query on an ElasticSearch index and returns all hits or aggregation results"""
+    size = min(50, size)
+    encoding = tiktoken.encoding_for_model("gpt-4")
+    try:
+        full_dict: dict = json.loads(query)
+        query_dict = None
+        aggs_dict = None
+        sort_dict = None
+        if "query" in full_dict:
+            query_dict = full_dict["query"]
+        if "aggs" in full_dict:
+            aggs_dict = full_dict["aggs"]
+        if "sort" in full_dict:
+            sort_dict = full_dict["sort"]
+        if query_dict is None and aggs_dict is None and sort_dict is None:
+            # Assume that there is a query but that the query part was ommitted.
+            query_dict = full_dict
+        if query_dict is None and aggs_dict is not None:
+            # This is an aggregation query, therefore we suppress the hits here
+            size = 200
+        logger.info(query)
+        # Print the query
+        # print(f"Executing Elasticsearch Query: {query}")
+        final_res = ""
+        retries = 0
+        while retries < 100:
+            res = es.search(
+                index=index_name,
+                from_=from_,
+                size=size,
+                query=query_dict,
+                aggs=aggs_dict,
+                sort=sort_dict,
+            )
+            if query_dict is None and aggs_dict is not None:
+                # When a result has aggregations, just return that and ignore the rest
+                final_res = str(res["aggregations"])
+            else:
+                final_res = str(res["hits"])
+            tokens = encoding.encode(final_res)
+            retries += 1
+            if len(tokens) > 6000:
+                size -= 1
+            else:
+                return final_res
+    except Exception as e:
+        logger.exception("Could not execute query %s", query)
+        msg = str(e)
+        return msg
+def create_search_tool():
+    return StructuredTool.from_function(
+        elastic_search, name="elastic_index_search_tool", args_schema=SearchToolInput
+    )

ask_candid/tools/elastic/list_indices_tool.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from typing import Type, Optional, List
+import logging
+from pydantic import BaseModel, Field
+from elasticsearch import Elasticsearch
+from langchain.callbacks.manager import AsyncCallbackManagerForToolRun
+from langchain.tools.base import BaseTool
+from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA
+# Import-time side effect: configures the root logger at INFO level.
+logging.basicConfig(level="INFO")
+logger = logging.getLogger("elasticsearch_playground")
+# Module-level Elasticsearch client used by the tool below. Credentials come from
+# SEMANTIC_ELASTIC_QA; each request is allowed up to 180 seconds.
+es = Elasticsearch(
+    cloud_id=SEMANTIC_ELASTIC_QA.cloud_id,
+    api_key=SEMANTIC_ELASTIC_QA.api_key,
+    verify_certs=True,
+    request_timeout=60 * 3
+)
+class ListIndicesInput(BaseModel):
+    """Argument schema for the list indices tool: the separator used to join index names."""
+    # Required (no default).
+    separator: str = Field(..., description="Separator for the list of indices")
+class ListIndicesTool(BaseTool):
+    """Tool for getting all ElasticSearch indices."""
+    name: str = "elastic_list_indices"  # Added type annotation
+    description: str = (
+        "Input is a delimiter like comma or new line. Output is a separated list of indices in the database. Always use this tool to get to know the indices in the ElasticSearch cluster."
+    )
+    args_schema: Optional[Type[BaseModel]] = (
+        ListIndicesInput  # Define this before methods
+    )
+    def _run(self, separator: str) -> str:
+        """Get all indices in the Elasticsearch server, usually separated by a line break."""
+        try:
+            # Ensure that `es` is correctly initialized before calling this method
+            indices: List[str] = es.cat.indices(h="index", s="index").split()
+            # Filter out hidden indices starting with a dot
+            return separator.join(
+                [index for index in indices if not index.startswith(".")]
+            )
+        except Exception as e:
+            logger.exception("Could not list indices: %s", e)
+            return ""
+    async def _arun(
+        self,
+        separator: str = "",
+        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
+    ) -> str:
+        raise NotImplementedError("ListIndicesTool does not support async operations")

ask_candid/tools/org_seach.py ADDED Viewed

	@@ -0,0 +1,194 @@

+from typing import List
+import re
+from fuzzywuzzy import fuzz
+from langchain.output_parsers.openai_tools import JsonOutputToolsParser
+from langchain_openai.chat_models import ChatOpenAI
+from langchain_core.runnables import RunnableSequence
+from langchain_core.prompts import ChatPromptTemplate
+from pydantic import BaseModel, Field
+from ask_candid.services.org_search import OrgSearch
+from ask_candid.base.config.rest import OPENAI
+# Module-level organization search client, created at import time and called by
+# `generate_org_link_dict` below.
+search = OrgSearch()
+class OrganizationNames(BaseModel):
+    """List of names of social-sector organizations, such as nonprofits and foundations."""
+    # NOTE(review): this model is bound to the LLM as a tool in `extract_org_links_from_chatbot`,
+    # so the class docstring and field description are presumably sent to the model as part of
+    # the tool schema — confirm before rewording them.
+    orgnames: List[str] = Field(description="List of organization names")
+def extract_org_links_from_chatbot(chatbot_output: str):
+    """
+    Extracts a list of organization names from the provided text.
+    Args:
+        chatbot_output (str):The chatbot output containing organization names and other content.
+    Returns:
+        list: A list of organization names extracted from the text.
+    Raises:
+        ValueError: If parsing fails or if an unexpected output format is received.
+    """
+    prompt = """Extract only the names of officially recognized organizations, foundations, and government entities
+    from the text below. Do not include any entries that contain descriptions, regional identifiers, or explanations
+    within parentheses or following the name. Strictly exclude databases, resources, crowdfunding platforms, and general
+    terms. Provide the output only in the specified JSON format.
+    input text below:
+        ```{chatbot_output}``
+    output format:
+    {{
+    'orgnames' : [list of organization names without any additional descriptions or identifiers]
+    }}
+    """
+    try:
+        parser = JsonOutputToolsParser()
+        llm = ChatOpenAI(model="gpt-4o", api_key=OPENAI["key"]).bind_tools([OrganizationNames])
+        prompt = ChatPromptTemplate.from_template(prompt)
+        chain = RunnableSequence(prompt, llm, parser)
+        # Run the chain with the input data
+        result = chain.invoke({"chatbot_output": chatbot_output})
+        # Extract the organization names from the output
+        output_list = result[0]["args"].get("orgnames", [])
+        # Validate output format
+        if not isinstance(output_list, list):
+            raise ValueError("Unexpected output format: 'orgnames' should be a list")
+        return output_list
+    except Exception as e:
+        # Log or print the error as needed for debugging
+        print(f"text does not have any organization: {e}")
+        return []
+def is_similar(name: str, list_of_dict: list, threshold: int = 80):
+    """
+    Returns True if `name` is similar to any names in `list_of_dict` based on a similarity threshold.
+    """
+    try:
+        for item in list_of_dict:
+            try:
+                # Attempt to calculate similarity score
+                similarity = fuzz.ratio(name.lower(), item["name"].lower())
+                if similarity >= threshold:
+                    return True
+            except KeyError:
+                # Handle cases where 'name' key might be missing in dictionary
+                print(f"KeyError: Missing 'name' key in dictionary item {item}")
+                continue
+            except AttributeError:
+                # Handle non-string name values in dictionary items
+                print(f"AttributeError: Non-string 'name' in dictionary item {item}")
+                continue
+    except TypeError as e:
+        # Handle cases where input types are incorrect
+        print(f"TypeError: {e}")
+        return False
+    return False
+def generate_org_link_dict(org_names_list: list):
+    """
+    Maps organization names to their Candid profile URLs if available.
+    For each organization in `output_list`, this function attempts to retrieve a matching profile
+    using `search_org`. If a similar name is found and a Candid entity ID is available, it constructs
+    a profile URL. If no ID or similar match is found, or if an error occurs, it assigns an empty string.
+    Args:
+        output_list (list): List of organization names (str) to retrieve Candid profile links for.
+    Returns:
+        dict: Dictionary with organization names as keys and Candid profile URLs or empty strings as values.
+    Example:
+        get_org_link(['New York-Presbyterian Hospital'])
+        # {'New York-Presbyterian Hospital': 'https://app.candid.org/profile/6915255'}
+    """
+    link_dict = {}
+    for org in org_names_list:
+        try:
+            # Attempt to retrieve organization data
+            response = search(org, name_only=True)
+            # Check if there is a valid response and if names are similar
+            if response and is_similar(org, response[0].get("names", "")):
+                # Try to get the Candid entity ID and construct the URL
+                candid_entity_id = response[0].get("candid_entity_id")
+                if candid_entity_id:
+                    link_dict[org] = (
+                        f"https://app.candid.org/profile/{candid_entity_id}"
+                    )
+                else:
+                    link_dict[org] = ""  # No ID found, set empty string
+            else:
+                link_dict[org] = ""  # No similar match found
+        except KeyError as e:
+            # Handle missing keys in the response dictionary
+            print(f"KeyError encountered for organization '{org}': {e}")
+            link_dict[org] = ""
+        except Exception as e:
+            # Catch any other unexpected errors
+            print(f"An error occurred for organization '{org}': {e}")
+            link_dict[org] = ""
+    return link_dict
+def embed_org_links_in_text(input_text: str, org_link_dict: dict):
+    """
+    Replaces organization names in `text` with links from `link_dict` and appends a Candid info message.
+    Args:
+        text (str): The text containing organization names.
+        link_dict (dict): Mapping of organization names to URLs.
+    Returns:
+        str: Updated text with linked organization names and an appended Candid message.
+    """
+    try:
+        for org_name, url in org_link_dict.items():
+            if url:  # Only proceed if the URL is not empty
+                regex_pattern = re.compile(re.escape(org_name))
+                input_text = regex_pattern.sub(
+                    repl=f"<a href={url} target='_blank' rel='noreferrer' class='candid-org-link'>{org_name}</a>",
+                    string=input_text
+                )
+        # Append Candid information message at the end
+        input_text += (
+            "<p class='candid-app-link'> "
+            "Visit <a href=https://app.candid.org/ target='_blank' rel='noreferrer' class='candid-org-link'>Candid</a> "
+            "to get nonprofit information you need.</p>"
+        )
+    except TypeError as e:
+        print(f"TypeError encountered: {e}")
+        return input_text
+    except re.error as e:
+        print(f"Regex error encountered for '{org_name}': {e}")
+        return input_text
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        return input_text
+    return input_text

ask_candid/tools/question_reformulation.py ADDED Viewed

	@@ -0,0 +1,44 @@

+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+def reformulate_question_using_history(state, llm):
+    """
+    Transform the query to produce a better query with details from previous messages.
+    Args:
+        state (messages): The current state
+        llm: LLM to use
+    Returns:
+        dict: The updated state with re-phrased question and original user_input for UI
+    """
+    print("---REFORMULATE THE USER INPUT---")
+    messages = state["messages"]
+    question = messages[-1].content
+    if len(messages) > 1:
+        contextualize_q_system_prompt = """Given a chat history and the latest user input \
+        which might reference context in the chat history, formulate a standalone input \
+        which can be understood without the chat history.
+        Chat history:
+        \n ------- \n
+        {chat_history}
+        \n ------- \n
+        User input:
+        \n ------- \n
+        {question}
+        \n ------- \n
+        Do NOT answer the question, \
+        just reformulate it if needed and otherwise return it as is.
+        """
+        contextualize_q_prompt = ChatPromptTemplate([
+            ("system", contextualize_q_system_prompt),
+            ("human", question),
+        ])
+        rag_chain = contextualize_q_prompt | llm | StrOutputParser()
+        new_question = rag_chain.invoke({"chat_history": messages, "question": question})
+        print(f"user asked: '{question}', agent reformulated the question basing on the chat history: {new_question}")
+        return {"messages": [new_question], "user_input" : question}
+    return {"messages": [question], "user_input" : question}

ask_candid/utils.py ADDED Viewed

	@@ -0,0 +1,103 @@

+from typing import List, Dict, Union, Any
+from uuid import uuid4
+from ask_candid.retrieval.sources import (
+    candid_blog,
+    candid_help,
+    candid_learning,
+    issuelab,
+    youtube
+)
+def filter_messages(messages, k=10):
+    # TODO summarize messages instead
+    return messages[-k:]
+def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
+    height_px = 200
+    html = ""
+    if source == "news":
+        # html = news.article_card_html(doc, height_px, show_chunks)
+        pass
+    elif source == "transactions":
+        # html = cds.transaction_card_html(doc, height_px, show_chunks)
+        pass
+    elif source == "organizations":
+        # html = up_orgs.organization_card_html(doc, 400, show_chunks)
+        pass
+    elif source == "issuelab":
+        html = issuelab.issuelab_card_html(doc, height_px, show_chunks)
+    elif source == "youtube":
+        html = youtube.build_card_html(doc, 400, show_chunks)
+    elif source == "candid_blog":
+        html = candid_blog.build_card_html(doc, height_px, show_chunks)
+    elif source == "candid_learning":
+        html = candid_learning.build_card_html(doc, height_px, show_chunks)
+    elif source == "candid_help":
+        html = candid_help.build_card_html(doc, height_px, show_chunks)
+    return html
+def html_format_docs_chat(docs):
+    """
+    Formats Candid sources into a line of buttons
+    """
+    html = ""
+    if docs:
+        docs_html = []
+        for doc in docs:
+            s_name = doc.metadata.get("source", "Source")
+            s_url = doc.metadata.get("url", "URL")
+            s_html = (
+                "<span class='source-item'>"
+                f"<a href={s_url} target='_blank' rel='noreferrer' class='ssearch-source'>"
+                f"{doc.metadata['title']} ({s_name})</a></span>"
+            )
+            docs_html.append(s_html)
+        html = f"<h2>Candid Resources</h2><div id='ssearch-sources'>{'<br>'.join(docs_html)}</div>"
+    return html
+def format_chat_response(chatbot: List[Any]) -> List[Any]:
+    """We have sources appended as one more tuple. Here we concatinate HTML of sources
+        with the AI response
+    Returns:
+        _type_: updated chatbot message as HTML
+    """
+    if chatbot:
+        sources = chatbot[-1][1]
+        chatbot.pop(-1)
+        chatbot[-1][1] = chatbot[-1][1] + sources
+    return chatbot
+def format_chat_ag_response(chatbot: List[Any]) -> List[Any]:
+    """If we called retriever, we appended sources as as one more message. Here we concatinate HTML of sources
+        with the AI response
+    Returns:
+        _type_: updated chatbot message as HTML
+    """
+    sources = ""
+    if chatbot:
+        title = chatbot[-1]["metadata"].get("title", None)
+        if title == "Sources HTML":
+            sources = chatbot[-1]["content"]
+            chatbot.pop(-1)
+            chatbot[-1]["content"] = chatbot[-1]["content"] + sources
+    return chatbot
+def valid_inputs(*args) -> bool:
+    return any(a is not None or (isinstance(a, str) and a.strip() != '') for a in args)
+def get_session_id(thread_id: Union[str, None]) -> str:
+    if not thread_id:
+        thread_id = uuid4().hex
+    return thread_id