Spaces:
Running
Running
v2 of public chat
#1
by
brainsqueeze
- opened
- README.md +2 -2
- ask_candid/base/api_base.py +3 -3
- ask_candid/base/api_base_async.py +3 -3
- ask_candid/base/config/base.py +10 -0
- ask_candid/base/config/connections.py +6 -14
- ask_candid/base/config/models.py +1 -0
- ask_candid/base/config/rest.py +49 -10
- ask_candid/base/lambda_base.py +3 -3
- ask_candid/base/retrieval/__init__.py +0 -0
- ask_candid/base/retrieval/elastic.py +205 -0
- ask_candid/base/retrieval/knowledge_base.py +362 -0
- ask_candid/base/retrieval/schemas.py +23 -0
- ask_candid/base/retrieval/sources.py +40 -0
- ask_candid/base/retrieval/sparse_lexical.py +98 -0
- ask_candid/base/utils.py +52 -0
- ask_candid/chat.py +68 -55
- ask_candid/services/small_lm.py +32 -6
- ask_candid/tools/general.py +17 -0
- ask_candid/tools/org_search.py +182 -0
- ask_candid/tools/search.py +56 -111
- ask_candid/tools/utils.py +14 -0
- chat_v2.py +265 -0
- requirements.txt +5 -5
README.md
CHANGED
@@ -6,8 +6,8 @@ colorFrom: blue
|
|
6 |
colorTo: purple
|
7 |
python_version: 3.12
|
8 |
sdk: gradio
|
9 |
-
sdk_version: 5.
|
10 |
-
app_file:
|
11 |
pinned: true
|
12 |
license: mit
|
13 |
---
|
|
|
6 |
colorTo: purple
|
7 |
python_version: 3.12
|
8 |
sdk: gradio
|
9 |
+
sdk_version: 5.42.0
|
10 |
+
app_file: chat_v2.py
|
11 |
pinned: true
|
12 |
license: mit
|
13 |
---
|
ask_candid/base/api_base.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from typing import
|
2 |
|
3 |
from urllib3.util.retry import Retry
|
4 |
from requests.adapters import HTTPAdapter
|
@@ -10,7 +10,7 @@ class BaseAPI:
|
|
10 |
def __init__(
|
11 |
self,
|
12 |
url: str,
|
13 |
-
headers:
|
14 |
total_retries: int = 3,
|
15 |
backoff_factor: int = 2
|
16 |
) -> None:
|
@@ -36,7 +36,7 @@ class BaseAPI:
|
|
36 |
r.raise_for_status()
|
37 |
return r.json()
|
38 |
|
39 |
-
def post(self, payload:
|
40 |
r = self.session.post(url=self.__url, headers=self.__headers, json=payload, timeout=30)
|
41 |
r.raise_for_status()
|
42 |
return r.json()
|
|
|
1 |
+
from typing import Any
|
2 |
|
3 |
from urllib3.util.retry import Retry
|
4 |
from requests.adapters import HTTPAdapter
|
|
|
10 |
def __init__(
|
11 |
self,
|
12 |
url: str,
|
13 |
+
headers: dict[str, Any] | None = None,
|
14 |
total_retries: int = 3,
|
15 |
backoff_factor: int = 2
|
16 |
) -> None:
|
|
|
36 |
r.raise_for_status()
|
37 |
return r.json()
|
38 |
|
39 |
+
def post(self, payload: dict[str, Any]):
|
40 |
r = self.session.post(url=self.__url, headers=self.__headers, json=payload, timeout=30)
|
41 |
r.raise_for_status()
|
42 |
return r.json()
|
ask_candid/base/api_base_async.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from typing import
|
2 |
import json
|
3 |
|
4 |
import aiohttp
|
@@ -6,7 +6,7 @@ import aiohttp
|
|
6 |
|
7 |
class BaseAsyncAPI:
|
8 |
|
9 |
-
def __init__(self, url: str, headers:
|
10 |
self.__url = url
|
11 |
self.__headers = headers
|
12 |
self.__retries = max(retries, 5)
|
@@ -29,7 +29,7 @@ class BaseAsyncAPI:
|
|
29 |
break
|
30 |
return output
|
31 |
|
32 |
-
async def post(self, payload:
|
33 |
session_timeout = aiohttp.ClientTimeout(total=30)
|
34 |
async with aiohttp.ClientSession(headers=self.__headers, timeout=session_timeout) as session:
|
35 |
output = {}
|
|
|
1 |
+
from typing import Any
|
2 |
import json
|
3 |
|
4 |
import aiohttp
|
|
|
6 |
|
7 |
class BaseAsyncAPI:
|
8 |
|
9 |
+
def __init__(self, url: str, headers: dict[str, Any] | None = None, retries: int = 3) -> None:
|
10 |
self.__url = url
|
11 |
self.__headers = headers
|
12 |
self.__retries = max(retries, 5)
|
|
|
29 |
break
|
30 |
return output
|
31 |
|
32 |
+
async def post(self, payload: dict[str, Any]):
|
33 |
session_timeout = aiohttp.ClientTimeout(total=30)
|
34 |
async with aiohttp.ClientSession(headers=self.__headers, timeout=session_timeout) as session:
|
35 |
output = {}
|
ask_candid/base/config/base.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from dotenv import dotenv_values, find_dotenv
|
4 |
+
|
5 |
+
__env_values__ = dotenv_values(
|
6 |
+
dotenv_path=find_dotenv(".env", raise_error_if_not_found=False)
|
7 |
+
)
|
8 |
+
|
9 |
+
def _load_value(key: str):
|
10 |
+
return __env_values__.get(key) or os.getenv(key)
|
ask_candid/base/config/connections.py
CHANGED
@@ -1,33 +1,25 @@
|
|
1 |
from dataclasses import dataclass, field
|
2 |
-
import os
|
3 |
|
4 |
-
from
|
5 |
|
6 |
|
7 |
@dataclass
|
8 |
class BaseElasticSearchConnection:
|
9 |
"""Elasticsearch connection dataclass
|
10 |
"""
|
11 |
-
url: str = field(default_factory=str)
|
12 |
-
username: str = field(default_factory=str)
|
13 |
-
password: str = field(default_factory=str)
|
14 |
|
15 |
|
16 |
@dataclass
|
17 |
class BaseElasticAPIKeyCredential:
|
18 |
"""Cloud ID/API key data class
|
19 |
"""
|
20 |
-
cloud_id: str = field(default_factory=str)
|
21 |
-
api_key: str = field(default_factory=str)
|
22 |
|
23 |
|
24 |
-
__env_values__ = dotenv_values(
|
25 |
-
dotenv_path=find_dotenv(".env", raise_error_if_not_found=False)
|
26 |
-
)
|
27 |
-
|
28 |
-
def _load_value(key: str):
|
29 |
-
return __env_values__.get(key) or os.getenv(key)
|
30 |
-
|
31 |
SEMANTIC_ELASTIC_QA = BaseElasticAPIKeyCredential(
|
32 |
cloud_id=_load_value("SEMANTIC_ELASTIC_CLOUD_ID"),
|
33 |
api_key=_load_value("SEMANTIC_ELASTIC_API_KEY"),
|
|
|
1 |
from dataclasses import dataclass, field
|
|
|
2 |
|
3 |
+
from ask_candid.base.config.base import _load_value
|
4 |
|
5 |
|
6 |
@dataclass
|
7 |
class BaseElasticSearchConnection:
|
8 |
"""Elasticsearch connection dataclass
|
9 |
"""
|
10 |
+
url: str | None = field(default_factory=str)
|
11 |
+
username: str | None = field(default_factory=str)
|
12 |
+
password: str | None = field(default_factory=str)
|
13 |
|
14 |
|
15 |
@dataclass
|
16 |
class BaseElasticAPIKeyCredential:
|
17 |
"""Cloud ID/API key data class
|
18 |
"""
|
19 |
+
cloud_id: str | None = field(default_factory=str)
|
20 |
+
api_key: str | None = field(default_factory=str)
|
21 |
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
SEMANTIC_ELASTIC_QA = BaseElasticAPIKeyCredential(
|
24 |
cloud_id=_load_value("SEMANTIC_ELASTIC_CLOUD_ID"),
|
25 |
api_key=_load_value("SEMANTIC_ELASTIC_API_KEY"),
|
ask_candid/base/config/models.py
CHANGED
@@ -3,6 +3,7 @@ from types import MappingProxyType
|
|
3 |
Name2Endpoint = MappingProxyType({
|
4 |
"gpt-4o": "gpt-4o",
|
5 |
"claude-3.5-haiku": "us.anthropic.claude-3-5-haiku-20241022-v1:0",
|
|
|
6 |
# "llama-3.1-70b-instruct": "us.meta.llama3-1-70b-instruct-v1:0",
|
7 |
# "mistral-large": "mistral.mistral-large-2402-v1:0",
|
8 |
# "mixtral-8x7B": "mistral.mixtral-8x7b-instruct-v0:1",
|
|
|
3 |
Name2Endpoint = MappingProxyType({
|
4 |
"gpt-4o": "gpt-4o",
|
5 |
"claude-3.5-haiku": "us.anthropic.claude-3-5-haiku-20241022-v1:0",
|
6 |
+
"claude-4-sonnet": "us.anthropic.claude-sonnet-4-20250514-v1:0",
|
7 |
# "llama-3.1-70b-instruct": "us.meta.llama3-1-70b-instruct-v1:0",
|
8 |
# "mistral-large": "mistral.mistral-large-2402-v1:0",
|
9 |
# "mixtral-8x7B": "mistral.mixtral-8x7b-instruct-v0:1",
|
ask_candid/base/config/rest.py
CHANGED
@@ -1,25 +1,64 @@
|
|
1 |
-
from typing import TypedDict
|
2 |
-
import os
|
3 |
|
4 |
-
from
|
5 |
|
6 |
|
7 |
class Api(TypedDict):
|
8 |
"""REST API configuration template
|
9 |
"""
|
10 |
-
url: str
|
11 |
-
key: str
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
-
def _load_value(key: str):
|
18 |
-
return __env_values__.get(key) or os.getenv(key)
|
19 |
|
20 |
CDS_API = Api(
|
21 |
url=_load_value("CDS_API_URL"),
|
22 |
key=_load_value("CDS_API_KEY")
|
23 |
)
|
24 |
|
|
|
|
|
|
|
|
|
|
|
25 |
OPENAI = Api(url=None, key=_load_value("OPENAI_API_KEY"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import TypedDict, NamedTuple
|
|
|
2 |
|
3 |
+
from ask_candid.base.config.base import _load_value
|
4 |
|
5 |
|
6 |
class Api(TypedDict):
|
7 |
"""REST API configuration template
|
8 |
"""
|
9 |
+
url: str | None
|
10 |
+
key: str | None
|
11 |
|
12 |
+
class ApiConfig(NamedTuple):
|
13 |
+
url: str | None
|
14 |
+
key: str | None
|
15 |
+
|
16 |
+
@property
|
17 |
+
def header(self) -> dict[str, str | None]:
|
18 |
+
return {"x-api-key": self.key}
|
19 |
+
|
20 |
+
def endpoint(self, route: str):
|
21 |
+
return f"{self.url}/{route}"
|
22 |
|
|
|
|
|
23 |
|
24 |
CDS_API = Api(
|
25 |
url=_load_value("CDS_API_URL"),
|
26 |
key=_load_value("CDS_API_KEY")
|
27 |
)
|
28 |
|
29 |
+
CANDID_SEARCH_API = Api(
|
30 |
+
url=_load_value("CANDID_SEARCH_API_URL"),
|
31 |
+
key=_load_value("CANDID_SEARCH_API_KEY")
|
32 |
+
)
|
33 |
+
|
34 |
OPENAI = Api(url=None, key=_load_value("OPENAI_API_KEY"))
|
35 |
+
|
36 |
+
SEARCH = ApiConfig(
|
37 |
+
url="https://ajr9jccwf0.execute-api.us-east-1.amazonaws.com/Prod",
|
38 |
+
key=_load_value("SEARCH_API_KEY")
|
39 |
+
)
|
40 |
+
|
41 |
+
AUTOCODING = ApiConfig(
|
42 |
+
url="https://auto-coding-api.candid.org",
|
43 |
+
key=_load_value("AUTOCODING_API_KEY")
|
44 |
+
)
|
45 |
+
|
46 |
+
DOCUMENT = ApiConfig(
|
47 |
+
url="https://dtntz2p635.execute-api.us-east-1.amazonaws.com/Prod",
|
48 |
+
key=_load_value("GEOCODING_API_KEY")
|
49 |
+
)
|
50 |
+
|
51 |
+
FUNDER_RECOMMENDATION = ApiConfig(
|
52 |
+
url="https://r6g59fxbie.execute-api.us-east-1.amazonaws.com/Prod",
|
53 |
+
key=_load_value("FUNDER_RECS_API_KEY")
|
54 |
+
)
|
55 |
+
|
56 |
+
LOI_WRITER = ApiConfig(
|
57 |
+
url="https://tc2ir1o7ne.execute-api.us-east-1.amazonaws.com/Prod",
|
58 |
+
key=_load_value("LOI_WRITER_API_KEY")
|
59 |
+
)
|
60 |
+
|
61 |
+
GOLDEN_ORG = ApiConfig(
|
62 |
+
url="https://qfdur742ih.execute-api.us-east-1.amazonaws.com/Prod",
|
63 |
+
key=_load_value("GOLDEN_RECORD_API_KEY")
|
64 |
+
)
|
ask_candid/base/lambda_base.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from typing import
|
2 |
from time import sleep
|
3 |
import json
|
4 |
|
@@ -25,7 +25,7 @@ class LambdaInvokeBase:
|
|
25 |
|
26 |
def __init__(
|
27 |
self, function_name: str,
|
28 |
-
access_key:
|
29 |
) -> None:
|
30 |
if access_key is not None and secret_key is not None:
|
31 |
self._client = boto3.client(
|
@@ -39,7 +39,7 @@ class LambdaInvokeBase:
|
|
39 |
|
40 |
self.function_name = function_name
|
41 |
|
42 |
-
def _submit_request(self, payload:
|
43 |
response = self._client.invoke(
|
44 |
FunctionName=self.function_name,
|
45 |
InvocationType="RequestResponse",
|
|
|
1 |
+
from typing import Any
|
2 |
from time import sleep
|
3 |
import json
|
4 |
|
|
|
25 |
|
26 |
def __init__(
|
27 |
self, function_name: str,
|
28 |
+
access_key: str | None = None, secret_key: str | None = None,
|
29 |
) -> None:
|
30 |
if access_key is not None and secret_key is not None:
|
31 |
self._client = boto3.client(
|
|
|
39 |
|
40 |
self.function_name = function_name
|
41 |
|
42 |
+
def _submit_request(self, payload: dict[str, Any]) -> dict[str, Any] | list[Any]:
|
43 |
response = self._client.invoke(
|
44 |
FunctionName=self.function_name,
|
45 |
InvocationType="RequestResponse",
|
ask_candid/base/retrieval/__init__.py
ADDED
File without changes
|
ask_candid/base/retrieval/elastic.py
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any
|
2 |
+
from collections.abc import Iterator
|
3 |
+
|
4 |
+
from elasticsearch import Elasticsearch
|
5 |
+
|
6 |
+
from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
|
7 |
+
from ask_candid.base.config.connections import BaseElasticAPIKeyCredential, BaseElasticSearchConnection
|
8 |
+
|
9 |
+
NEWS_TRUST_SCORE_THRESHOLD = 0.8
|
10 |
+
SPARSE_ENCODING_SCORE_THRESHOLD = 0.4
|
11 |
+
|
12 |
+
|
13 |
+
def build_sparse_vector_query(
|
14 |
+
query: str,
|
15 |
+
fields: tuple[str, ...],
|
16 |
+
inference_id: str = ".elser-2-elasticsearch"
|
17 |
+
) -> dict[str, Any]:
|
18 |
+
"""Builds a valid Elasticsearch text expansion query payload
|
19 |
+
|
20 |
+
Parameters
|
21 |
+
----------
|
22 |
+
query : str
|
23 |
+
Search context string
|
24 |
+
fields : Tuple[str, ...]
|
25 |
+
Semantic text field names
|
26 |
+
inference_id : str, optional
|
27 |
+
ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"
|
28 |
+
|
29 |
+
Returns
|
30 |
+
-------
|
31 |
+
Dict[str, Any]
|
32 |
+
"""
|
33 |
+
|
34 |
+
output = []
|
35 |
+
|
36 |
+
for f in fields:
|
37 |
+
output.append({
|
38 |
+
"nested": {
|
39 |
+
"path": f"embeddings.{f}.chunks",
|
40 |
+
"query": {
|
41 |
+
"sparse_vector": {
|
42 |
+
"field": f"embeddings.{f}.chunks.vector",
|
43 |
+
"inference_id": inference_id,
|
44 |
+
"prune": True,
|
45 |
+
"query": query,
|
46 |
+
# "boost": 1 / len(fields)
|
47 |
+
}
|
48 |
+
},
|
49 |
+
"inner_hits": {
|
50 |
+
"_source": False,
|
51 |
+
"size": 2,
|
52 |
+
"fields": [f"embeddings.{f}.chunks.chunk"]
|
53 |
+
}
|
54 |
+
}
|
55 |
+
})
|
56 |
+
return {"query": {"bool": {"should": output}}}
|
57 |
+
|
58 |
+
|
59 |
+
def build_sparse_vector_and_text_query(
|
60 |
+
query: str,
|
61 |
+
semantic_fields: tuple[str, ...],
|
62 |
+
text_fields: tuple[str, ...] | None,
|
63 |
+
highlight_fields: tuple[str, ...] | None,
|
64 |
+
excluded_fields: tuple[str, ...] | None,
|
65 |
+
inference_id: str = ".elser-2-elasticsearch"
|
66 |
+
) -> dict[str, Any]:
|
67 |
+
"""Builds Elasticsearch sparse vector and text query payload
|
68 |
+
|
69 |
+
Parameters
|
70 |
+
----------
|
71 |
+
query : str
|
72 |
+
Search context string
|
73 |
+
semantic_fields : Tuple[str]
|
74 |
+
Semantic text field names
|
75 |
+
highlight_fields: Tuple[str]
|
76 |
+
Fields which relevant chunks will be helpful for the agent to read
|
77 |
+
text_fields : Tuple[str]
|
78 |
+
Regular text fields
|
79 |
+
excluded_fields : Tuple[str]
|
80 |
+
Fields to exclude from the source
|
81 |
+
inference_id : str, optional
|
82 |
+
ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"
|
83 |
+
|
84 |
+
Returns
|
85 |
+
-------
|
86 |
+
Dict[str, Any]
|
87 |
+
"""
|
88 |
+
|
89 |
+
output = []
|
90 |
+
final_query = {}
|
91 |
+
|
92 |
+
for f in semantic_fields:
|
93 |
+
output.append({
|
94 |
+
"sparse_vector": {
|
95 |
+
"field": f"{f}",
|
96 |
+
"inference_id": inference_id,
|
97 |
+
"query": query,
|
98 |
+
"boost": 1,
|
99 |
+
"prune": True # doesn't seem it changes anything if we use text queries additionally
|
100 |
+
}
|
101 |
+
})
|
102 |
+
|
103 |
+
if text_fields:
|
104 |
+
output.append({
|
105 |
+
"multi_match": {
|
106 |
+
"fields": text_fields,
|
107 |
+
"query": query,
|
108 |
+
"boost": 3
|
109 |
+
}
|
110 |
+
})
|
111 |
+
|
112 |
+
|
113 |
+
final_query = {
|
114 |
+
"track_total_hits": False,
|
115 |
+
"query": {
|
116 |
+
"bool": {"should": output}
|
117 |
+
}
|
118 |
+
}
|
119 |
+
|
120 |
+
if highlight_fields:
|
121 |
+
final_query["highlight"] = {
|
122 |
+
"fields": {
|
123 |
+
f"{f}": {
|
124 |
+
"type": "semantic", # ensures that highlighting is applied exclusively to semantic_text fields.
|
125 |
+
"number_of_fragments": 2, # number of chunks
|
126 |
+
"order": "none" # can be "score", but we have only two and hope for context
|
127 |
+
}
|
128 |
+
for f in highlight_fields
|
129 |
+
}
|
130 |
+
}
|
131 |
+
|
132 |
+
if excluded_fields:
|
133 |
+
final_query["_source"] = {"excludes": list(excluded_fields)}
|
134 |
+
return final_query
|
135 |
+
|
136 |
+
|
137 |
+
def news_query_builder(
|
138 |
+
query: str,
|
139 |
+
fields: tuple[str, ...],
|
140 |
+
encoder: SpladeEncoder,
|
141 |
+
days_ago: int = 60,
|
142 |
+
) -> dict[str, Any]:
|
143 |
+
"""Builds a valid Elasticsearch query against Candid news, simulating a token expansion.
|
144 |
+
|
145 |
+
Parameters
|
146 |
+
----------
|
147 |
+
query : str
|
148 |
+
Search context string
|
149 |
+
|
150 |
+
Returns
|
151 |
+
-------
|
152 |
+
Dict[str, Any]
|
153 |
+
"""
|
154 |
+
|
155 |
+
tokens = encoder.token_expand(query)
|
156 |
+
|
157 |
+
elastic_query = {
|
158 |
+
"_source": ["id", "link", "title", "content", "site_name"],
|
159 |
+
"query": {
|
160 |
+
"bool": {
|
161 |
+
"filter": [
|
162 |
+
{"range": {"event_date": {"gte": f"now-{days_ago}d/d"}}},
|
163 |
+
{"range": {"insert_date": {"gte": f"now-{days_ago}d/d"}}},
|
164 |
+
{"range": {"article_trust_worthiness": {"gt": NEWS_TRUST_SCORE_THRESHOLD}}}
|
165 |
+
],
|
166 |
+
"should": []
|
167 |
+
}
|
168 |
+
}
|
169 |
+
}
|
170 |
+
|
171 |
+
for token, score in tokens.items():
|
172 |
+
if score > SPARSE_ENCODING_SCORE_THRESHOLD:
|
173 |
+
elastic_query["query"]["bool"]["should"].append({
|
174 |
+
"multi_match": {
|
175 |
+
"query": token,
|
176 |
+
"fields": fields,
|
177 |
+
"boost": score
|
178 |
+
}
|
179 |
+
})
|
180 |
+
return elastic_query
|
181 |
+
|
182 |
+
|
183 |
+
def multi_search_base(
|
184 |
+
queries: list[dict[str, Any]],
|
185 |
+
credentials: BaseElasticSearchConnection | BaseElasticAPIKeyCredential,
|
186 |
+
timeout: int = 180
|
187 |
+
) -> Iterator[dict[str, Any]]:
|
188 |
+
if isinstance(credentials, BaseElasticAPIKeyCredential):
|
189 |
+
es = Elasticsearch(
|
190 |
+
cloud_id=credentials.cloud_id,
|
191 |
+
api_key=credentials.api_key,
|
192 |
+
verify_certs=False,
|
193 |
+
request_timeout=timeout
|
194 |
+
)
|
195 |
+
elif isinstance(credentials, BaseElasticSearchConnection):
|
196 |
+
es = Elasticsearch(
|
197 |
+
credentials.url,
|
198 |
+
http_auth=(credentials.username, credentials.password),
|
199 |
+
timeout=timeout
|
200 |
+
)
|
201 |
+
else:
|
202 |
+
raise TypeError(f"Invalid credentials of type `{type(credentials)}")
|
203 |
+
|
204 |
+
yield from es.msearch(body=queries).get("responses", [])
|
205 |
+
es.close()
|
ask_candid/base/retrieval/knowledge_base.py
ADDED
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Literal, Any
|
2 |
+
from collections.abc import Iterator, Iterable
|
3 |
+
from itertools import groupby
|
4 |
+
import logging
|
5 |
+
|
6 |
+
from langchain_core.documents import Document
|
7 |
+
|
8 |
+
from ask_candid.base.retrieval.elastic import (
|
9 |
+
build_sparse_vector_query,
|
10 |
+
build_sparse_vector_and_text_query,
|
11 |
+
news_query_builder,
|
12 |
+
multi_search_base
|
13 |
+
)
|
14 |
+
from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
|
15 |
+
from ask_candid.base.retrieval.schemas import ElasticHitsResult
|
16 |
+
import ask_candid.base.retrieval.sources as S
|
17 |
+
from ask_candid.services.small_lm import CandidSLM
|
18 |
+
|
19 |
+
from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
|
20 |
+
|
21 |
+
SourceNames = Literal[
|
22 |
+
"Candid Blog",
|
23 |
+
"Candid Help",
|
24 |
+
"Candid Learning",
|
25 |
+
"Candid News",
|
26 |
+
"IssueLab Research Reports",
|
27 |
+
"YouTube Training"
|
28 |
+
]
|
29 |
+
sparse_encoder = SpladeEncoder()
|
30 |
+
logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
|
31 |
+
logger = logging.getLogger(__name__)
|
32 |
+
logger.setLevel(logging.INFO)
|
33 |
+
|
34 |
+
|
35 |
+
# TODO remove
|
36 |
+
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
|
37 |
+
"""Pads the relevant chunk of text with context before and after
|
38 |
+
|
39 |
+
Parameters
|
40 |
+
----------
|
41 |
+
field_name : str
|
42 |
+
a field with the long text that was chunked into pieces
|
43 |
+
hit : ElasticHitsResult
|
44 |
+
context_length : int, optional
|
45 |
+
length of text to add before and after the chunk, by default 1024
|
46 |
+
add_context : bool, optional
|
47 |
+
Set to `False` to expand the text context by searching for the Elastic inner hit inside the larger document
|
48 |
+
, by default True
|
49 |
+
|
50 |
+
Returns
|
51 |
+
-------
|
52 |
+
str
|
53 |
+
longer chunks stuffed together
|
54 |
+
"""
|
55 |
+
|
56 |
+
chunks = []
|
57 |
+
# NOTE chunks have tokens, long text is a string, but may contain html which affects tokenization
|
58 |
+
long_text = hit.source.get(field_name) or ""
|
59 |
+
long_text = long_text.lower()
|
60 |
+
|
61 |
+
inner_hits_field = f"embeddings.{field_name}.chunks"
|
62 |
+
found_chunks = hit.inner_hits.get(inner_hits_field, {}) if hit.inner_hits else None
|
63 |
+
if found_chunks:
|
64 |
+
for h in found_chunks.get("hits", {}).get("hits") or []:
|
65 |
+
chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
|
66 |
+
|
67 |
+
# cutting the middle because we may have tokenizing artifacts there
|
68 |
+
chunk = chunk[3: -3]
|
69 |
+
|
70 |
+
if add_context:
|
71 |
+
# Find the start and end indices of the chunk in the large text
|
72 |
+
start_index = long_text.find(chunk[:20])
|
73 |
+
|
74 |
+
# Chunk is found
|
75 |
+
if start_index != -1:
|
76 |
+
end_index = start_index + len(chunk)
|
77 |
+
pre_start_index = max(0, start_index - context_length)
|
78 |
+
post_end_index = min(len(long_text), end_index + context_length)
|
79 |
+
chunks.append(long_text[pre_start_index:post_end_index])
|
80 |
+
else:
|
81 |
+
chunks.append(chunk)
|
82 |
+
return '\n\n'.join(chunks)
|
83 |
+
|
84 |
+
|
85 |
+
def generate_queries(
|
86 |
+
query: str,
|
87 |
+
sources: list[SourceNames],
|
88 |
+
news_days_ago: int = 60
|
89 |
+
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
|
90 |
+
"""Builds Elastic queries against indices which do or do not support sparse vector queries.
|
91 |
+
|
92 |
+
Parameters
|
93 |
+
----------
|
94 |
+
query : str
|
95 |
+
Text describing a user's question or a description of investigative work which requires support from Candid's
|
96 |
+
knowledge base
|
97 |
+
sources : list[SourceNames]
|
98 |
+
One or more sources of knowledge from different areas at Candid.
|
99 |
+
* Candid Blog: Blog posts from Candid staff and trusted partners intended to help those in the sector or
|
100 |
+
illuminate ongoing work
|
101 |
+
* Candid Help: Candid FAQs to help user's get started with Candid's product platform and learning resources
|
102 |
+
* Candid Learning: Training documents from Candid's subject matter experts
|
103 |
+
* Candid News: News articles and press releases about real-time activity in the philanthropic sector
|
104 |
+
* IssueLab Research Reports: Academic research reports about the social/philanthropic sector
|
105 |
+
* YouTube Training: Transcripts from video-based training seminars from Candid's subject matter experts
|
106 |
+
news_days_ago : int, optional
|
107 |
+
How many days in the past to search for news articles, if a user is asking for recent trends then this value
|
108 |
+
should be set lower >~ 10, by default 60
|
109 |
+
|
110 |
+
Returns
|
111 |
+
-------
|
112 |
+
tuple[list[dict[str, Any]], list[dict[str, Any]]]
|
113 |
+
(sparse vector queries, queries for indices which do not support sparse vectors)
|
114 |
+
"""
|
115 |
+
|
116 |
+
vector_queries = []
|
117 |
+
quasi_vector_queries = []
|
118 |
+
|
119 |
+
for source_name in sources:
|
120 |
+
if source_name == "Candid Blog":
|
121 |
+
q = build_sparse_vector_query(query=query, fields=S.CandidBlogConfig.semantic_fields)
|
122 |
+
q["_source"] = {"excludes": ["embeddings"]}
|
123 |
+
q["size"] = 5
|
124 |
+
vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
|
125 |
+
elif source_name == "Candid Help":
|
126 |
+
q = build_sparse_vector_query(query=query, fields=S.CandidHelpConfig.semantic_fields)
|
127 |
+
q["_source"] = {"excludes": ["embeddings"]}
|
128 |
+
q["size"] = 5
|
129 |
+
vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
|
130 |
+
elif source_name == "Candid Learning":
|
131 |
+
q = build_sparse_vector_query(query=query, fields=S.CandidLearningConfig.semantic_fields)
|
132 |
+
q["_source"] = {"excludes": ["embeddings"]}
|
133 |
+
q["size"] = 5
|
134 |
+
vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
|
135 |
+
elif source_name == "Candid News":
|
136 |
+
q = news_query_builder(
|
137 |
+
query=query,
|
138 |
+
fields=S.CandidNewsConfig.semantic_fields,
|
139 |
+
encoder=sparse_encoder,
|
140 |
+
days_ago=news_days_ago
|
141 |
+
)
|
142 |
+
q["size"] = 5
|
143 |
+
quasi_vector_queries.extend([{"index": S.CandidNewsConfig.index_name}, q])
|
144 |
+
elif source_name == "IssueLab Research Reports":
|
145 |
+
q = build_sparse_vector_query(query=query, fields=S.IssueLabConfig.semantic_fields)
|
146 |
+
q["_source"] = {"excludes": ["embeddings"]}
|
147 |
+
q["size"] = 1
|
148 |
+
vector_queries.extend([{"index": S.IssueLabConfig.index_name}, q])
|
149 |
+
elif source_name == "YouTube Training":
|
150 |
+
q = build_sparse_vector_and_text_query(
|
151 |
+
query=query,
|
152 |
+
semantic_fields=S.YoutubeConfig.semantic_fields,
|
153 |
+
text_fields=S.YoutubeConfig.text_fields,
|
154 |
+
highlight_fields=S.YoutubeConfig.highlight_fields,
|
155 |
+
excluded_fields=S.YoutubeConfig.excluded_fields
|
156 |
+
)
|
157 |
+
q["size"] = 5
|
158 |
+
vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])
|
159 |
+
|
160 |
+
return vector_queries, quasi_vector_queries
|
161 |
+
|
162 |
+
|
163 |
+
def run_search(
|
164 |
+
vector_searches: list[dict[str, Any]] | None = None,
|
165 |
+
non_vector_searches: list[dict[str, Any]] | None = None,
|
166 |
+
) -> list[ElasticHitsResult]:
|
167 |
+
def _msearch_response_generator(responses: Iterable[dict[str, Any]]) -> Iterator[ElasticHitsResult]:
|
168 |
+
for query_group in responses:
|
169 |
+
for h in query_group.get("hits", {}).get("hits", []):
|
170 |
+
inner_hits = h.get("inner_hits", {})
|
171 |
+
|
172 |
+
if not inner_hits and "news" in h.get("_index"):
|
173 |
+
inner_hits = {"text": h.get("_source", {}).get("content")}
|
174 |
+
|
175 |
+
yield ElasticHitsResult(
|
176 |
+
index=h["_index"],
|
177 |
+
id=h["_id"],
|
178 |
+
score=h["_score"],
|
179 |
+
source=h["_source"],
|
180 |
+
inner_hits=inner_hits,
|
181 |
+
highlight=h.get("highlight", {})
|
182 |
+
)
|
183 |
+
|
184 |
+
results = []
|
185 |
+
if vector_searches is not None and len(vector_searches) > 0:
|
186 |
+
hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC_QA)
|
187 |
+
for hit in _msearch_response_generator(responses=hits):
|
188 |
+
results.append(hit)
|
189 |
+
if non_vector_searches is not None and len(non_vector_searches) > 0:
|
190 |
+
hits = multi_search_base(queries=non_vector_searches, credentials=NEWS_ELASTIC)
|
191 |
+
for hit in _msearch_response_generator(responses=hits):
|
192 |
+
results.append(hit)
|
193 |
+
return results
|
194 |
+
|
195 |
+
|
196 |
+
def retrieved_text(hits: dict[str, Any]) -> str:
|
197 |
+
"""Extracts retrieved sub-texts from documents which are strong hits from semantic queries for the purpose of
|
198 |
+
re-scoring by a secondary language model.
|
199 |
+
|
200 |
+
Parameters
|
201 |
+
----------
|
202 |
+
hits : Dict[str, Any]
|
203 |
+
|
204 |
+
Returns
|
205 |
+
-------
|
206 |
+
str
|
207 |
+
"""
|
208 |
+
|
209 |
+
nlp = CandidSLM()
|
210 |
+
|
211 |
+
text = []
|
212 |
+
for _, v in hits.items():
|
213 |
+
if _ == "text":
|
214 |
+
s = nlp.summarize(v, top_k=3)
|
215 |
+
text.append(s.summary)
|
216 |
+
# text.append(v)
|
217 |
+
continue
|
218 |
+
|
219 |
+
for h in (v.get("hits", {}).get("hits") or []):
|
220 |
+
for _, field in h.get("fields", {}).items():
|
221 |
+
for chunk in field:
|
222 |
+
if chunk.get("chunk"):
|
223 |
+
text.extend(chunk["chunk"])
|
224 |
+
return '\n'.join(text)
|
225 |
+
|
226 |
+
|
227 |
+
def reranker(
|
228 |
+
query_results: Iterable[ElasticHitsResult],
|
229 |
+
search_text: str | None = None,
|
230 |
+
max_num_results: int = 5
|
231 |
+
) -> Iterator[ElasticHitsResult]:
|
232 |
+
"""Reranks Elasticsearch hits coming from multiple indices/queries which may have scores on different scales.
|
233 |
+
This will shuffle results
|
234 |
+
|
235 |
+
Parameters
|
236 |
+
----------
|
237 |
+
query_results : Iterable[ElasticHitsResult]
|
238 |
+
|
239 |
+
Yields
|
240 |
+
------
|
241 |
+
Iterator[ElasticHitsResult]
|
242 |
+
"""
|
243 |
+
|
244 |
+
results: list[ElasticHitsResult] = []
|
245 |
+
texts: list[str] = []
|
246 |
+
for _, data in groupby(query_results, key=lambda x: x.index):
|
247 |
+
data = list(data) # noqa: PLW2901
|
248 |
+
max_score = max(data, key=lambda x: x.score).score
|
249 |
+
min_score = min(data, key=lambda x: x.score).score
|
250 |
+
|
251 |
+
for d in data:
|
252 |
+
d.score = (d.score - min_score) / (max_score - min_score + 1e-9)
|
253 |
+
results.append(d)
|
254 |
+
|
255 |
+
if search_text:
|
256 |
+
if d.inner_hits:
|
257 |
+
text = retrieved_text(d.inner_hits)
|
258 |
+
if d.highlight:
|
259 |
+
highlight_texts = []
|
260 |
+
for k,v in d.highlight.items():
|
261 |
+
v_text = '\n'.join(v)
|
262 |
+
highlight_texts.append(v_text)
|
263 |
+
text = '\n'.join(highlight_texts)
|
264 |
+
texts.append(text)
|
265 |
+
|
266 |
+
if search_text and len(texts) == len(results) and len(texts) > 1:
|
267 |
+
logger.info("Re-ranking %d retrieval results", len(results))
|
268 |
+
scores = sparse_encoder.query_reranking(query=search_text, documents=texts)
|
269 |
+
for r, s in zip(results, scores):
|
270 |
+
r.score = s
|
271 |
+
|
272 |
+
yield from sorted(results, key=lambda x: x.score, reverse=True)[:max_num_results]
|
273 |
+
|
274 |
+
|
275 |
+
def process_hit(hit: ElasticHitsResult) -> Document:
    """Convert a raw Elasticsearch hit into a LangChain ``Document``.

    Dispatches on the hit's index name; each branch assembles ``page_content`` from
    index-specific fields and attaches citation metadata (``title`` / ``source`` label /
    ``source_id`` / ``url``) used downstream when rendering sources.

    Raises
    ------
    ValueError
        If the hit's index does not match any known knowledge source.
    """
    # IssueLab research reports
    if "issuelab-elser" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("combined_item_description", ""),
                hit.source.get("description", ""),
                hit.source.get("combined_issuelab_findings", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source["title"],
                "source": "IssueLab",
                "source_id": hit.source["resource_id"],
                "url": hit.source.get("permalink", "")
            }
        )
    # Candid YouTube videos: caption highlights are joined into the content
    elif "youtube" in hit.index:
        highlight = hit.highlight or {}
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("semantic_description", ""),
                ' '.join(highlight.get("semantic_cc_text", []))
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid YouTube",
                "source_id": hit.source['video_id'],
                # build a watch URL directly from the video ID
                "url": f"https://www.youtube.com/watch?v={hit.source['video_id']}"
            }
        )
    # Candid Blog posts: context windows are pulled without surrounding context
    elif "candid-blog" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("excerpt", ""),
                get_context("content", hit, context_length=12, add_context=False),
                get_context("authors_text", hit, context_length=12, add_context=False),
                hit.source.get("title_summary_tags", "")
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid Blog",
                "source_id": hit.source["id"],
                "url": hit.source["link"]
            }
        )
    # Candid Learning training materials
    elif "candid-learning" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("staff_recommendations", ""),
                hit.source.get("training_topics", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source["title"],
                "source": "Candid Learning",
                "source_id": hit.source["post_id"],
                "url": hit.source.get("url", "")
            }
        )
    # Candid Help knowledge-base articles
    elif "candid-help" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("combined_article_description", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid Help",
                "source_id": hit.source["id"],
                "url": hit.source.get("link", "")
            }
        )
    # Philanthropy news articles; fall back to a generic label when the site name is absent
    elif "news" in hit.index:
        doc = Document(
            page_content='\n\n'.join([hit.source.get("title", ""), hit.source.get("content", "")]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": hit.source.get("site_name") or "Candid News",
                "source_id": hit.source["id"],
                "url": hit.source.get("link", "")
            }
        )
    else:
        raise ValueError(f"Unknown source result from index {hit.index}")
    return doc
|
ask_candid/base/retrieval/schemas.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any
|
2 |
+
from dataclasses import dataclass, field
|
3 |
+
|
4 |
+
|
5 |
+
@dataclass
class ElasticSourceConfig:
    """Per-index retrieval configuration for one Elasticsearch knowledge source."""
    # name of the Elasticsearch index to query
    index_name: str
    # fields carrying semantic (e.g. ELSER) embeddings used for semantic retrieval
    semantic_fields: tuple[str,...] = field(default_factory=tuple)
    # plain-text fields used for lexical matching (optional)
    text_fields: tuple[str,...] | None = field(default_factory=tuple)
    # fields to request highlight fragments for (optional)
    highlight_fields: tuple[str,...] | None = field(default_factory=tuple)
    # fields excluded from the returned _source payload, e.g. large raw text (optional)
    excluded_fields: tuple[str,...] | None = field(default_factory=tuple)
|
12 |
+
|
13 |
+
|
14 |
+
@dataclass
class ElasticHitsResult:
    """Dataclass for Elasticsearch hits results
    """
    # index the hit came from; used to dispatch per-source processing
    index: str
    # the document's _id
    id: Any
    # relevance score; may be re-scaled/overwritten by downstream re-ranking
    score: float
    # the hit's _source payload
    source: dict[str, Any]
    # nested/inner-hit chunks, when the query requested them
    inner_hits: dict[str, Any] | None
    # highlighted fragments keyed by field name
    highlight: dict[str, list[str]] | None
|
ask_candid/base/retrieval/sources.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ask_candid.base.retrieval.schemas import ElasticSourceConfig
|
2 |
+
|
3 |
+
|
4 |
+
# Static retrieval configurations for each Elasticsearch-backed knowledge source.
# Only `semantic_fields` is set for most sources; YouTube additionally opts into
# plain-text fields, highlights, and source-payload exclusions.

# Candid Blog posts
CandidBlogConfig = ElasticSourceConfig(
    index_name="search-semantic-candid-blog",
    semantic_fields=("content", "authors_text", "title_summary_tags")
)


# Candid Help knowledge-base articles
CandidHelpConfig = ElasticSourceConfig(
    index_name="search-semantic-candid-help-elser_ve1",
    semantic_fields=("content", "combined_article_description")
)


# Candid Learning training materials
CandidLearningConfig = ElasticSourceConfig(
    index_name="search-semantic-candid-learning_ve1",
    semantic_fields=("content", "title", "training_topics", "staff_recommendations")
)


# Philanthropy news articles
CandidNewsConfig = ElasticSourceConfig(
    index_name="news_1",
    semantic_fields=("title", "content")
)


# IssueLab research reports
IssueLabConfig = ElasticSourceConfig(
    index_name="search-semantic-issuelab-elser_ve2",
    semantic_fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
)


# Candid YouTube videos: raw/semantic caption text is excluded from _source to keep
# payloads small; caption highlights are requested instead.
YoutubeConfig = ElasticSourceConfig(
    index_name="search-semantic-youtube",
    semantic_fields=("semantic_title", "semantic_description","semantic_cc_text"),
    text_fields=("title", "description", "cc_text"),
    highlight_fields=("semantic_cc_text",),
    excluded_fields=("cc_text", "semantic_cc_text", "semantic_title")
)
|
ask_candid/base/retrieval/sparse_lexical.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tqdm.auto import tqdm
|
2 |
+
|
3 |
+
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
4 |
+
from transformers.tokenization_utils_base import BatchEncoding
|
5 |
+
from torch.utils.data import DataLoader
|
6 |
+
import torch.nn.functional as F
|
7 |
+
from torch import Tensor
|
8 |
+
import torch
|
9 |
+
|
10 |
+
|
11 |
+
class SpladeEncoder:
    """SPLADE sparse lexical encoder (``naver/splade-v3``) for re-ranking and query term expansion.

    The model and tokenizer are loaded eagerly at construction time and moved to the best
    available device (CUDA > MPS > CPU).
    """
    # mini-batch size for encoding; kept small since inputs may be long documents
    batch_size = 8
    model_id = "naver/splade-v3"

    def __init__(self):

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        # from_pretrained returns the model in eval mode, so no explicit .eval() is needed here
        self.model = AutoModelForMaskedLM.from_pretrained(self.model_id)
        # reverse vocabulary lookup (vocab index -> token string), used by `token_expand`
        self.idx2token = {idx: token for token, idx in self.tokenizer.get_vocab().items()}

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        elif torch.mps.is_available():
            self.device = torch.device("mps")
        else:
            self.device = torch.device("cpu")
        self.model.to(self.device)

    @torch.no_grad()
    def forward(self, inputs: BatchEncoding) -> Tensor:
        # SPLADE pooling: log(1 + ReLU(logits)) masked by attention, then max over the
        # sequence dimension -> one |vocab|-sized sparse vector per input.
        output = self.model(**inputs.to(self.device))

        logits: Tensor = output.logits
        mask: Tensor = inputs.attention_mask

        vec = (logits.relu() + 1).log() * mask.unsqueeze(dim=-1)
        # NOTE(review): `.squeeze()` drops the batch dim for single-item batches; `torch.vstack`
        # in `encode` re-promotes 1-D rows, so downstream shapes still line up.
        return vec.max(dim=1)[0].squeeze()

    def encode(self, texts: list[str]) -> Tensor:
        """Forward pass to get dense vectors

        Parameters
        ----------
        texts : list[str]

        Returns
        -------
        torch.Tensor
            Dense vectors
        """

        vectors = []
        # DataLoader is used purely for convenient, order-preserving mini-batching of strings
        for batch in tqdm(DataLoader(dataset=texts, shuffle=False, batch_size=self.batch_size), desc="Encoding"): # type: ignore
            tokens = self.tokenizer(batch, return_tensors='pt', truncation=True, padding=True)
            vec = self.forward(inputs=tokens)
            vectors.append(vec)
        return torch.vstack(vectors)

    def query_reranking(self, query: str, documents: list[str]) -> list[float]:
        """Cosine similarity re-ranking.

        Parameters
        ----------
        query : str
            Retrieval query
        documents : list[str]
            Retrieved documents

        Returns
        -------
        list[float]
            Cosine values
        """

        # encode the query together with the documents in one pass, then split: row 0 is the
        # query, the rest are the documents
        vec = self.encode([query, *documents])
        xQ = F.normalize(vec[:1], dim=-1, p=2.)
        xD = F.normalize(vec[1:], dim=-1, p=2.)
        return (xQ * xD).sum(dim=-1).cpu().tolist()

    def token_expand(self, query: str) -> dict[str, float]:
        """Sparse lexical token expansion.

        Parameters
        ----------
        query : str
            Retrieval query

        Returns
        -------
        dict[str, float]
            Token -> weight, sorted by descending weight
        """

        vec = self.encode([query]).squeeze()
        # NOTE(review): if the vector has exactly one nonzero entry, `.squeeze()` yields a
        # scalar and `vec[cols]` would fail — in practice SPLADE vectors have many nonzeros.
        cols = vec.nonzero().squeeze().cpu().tolist()
        weights = vec[cols].cpu().tolist()

        sparse_dict_tokens = {self.idx2token[idx]: round(weight, 3) for idx, weight in zip(cols, weights) if weight > 0}
        return dict(sorted(sparse_dict_tokens.items(), key=lambda item: item[1], reverse=True))
|
ask_candid/base/utils.py
CHANGED
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
1 |
import asyncio
|
2 |
|
3 |
|
@@ -12,3 +15,52 @@ def async_tasks(*tasks):
|
|
12 |
loop.stop()
|
13 |
loop.close()
|
14 |
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections.abc import Callable
|
2 |
+
from functools import wraps
|
3 |
+
from time import sleep
|
4 |
import asyncio
|
5 |
|
6 |
|
|
|
15 |
loop.stop()
|
16 |
loop.close()
|
17 |
return results
|
18 |
+
|
19 |
+
|
20 |
+
def retry_on_status(
|
21 |
+
num_retries: int = 3,
|
22 |
+
backoff_factor: float = 0.5,
|
23 |
+
max_backoff: float | None = None,
|
24 |
+
retry_statuses: tuple[int, ...] = (501, 503)
|
25 |
+
):
|
26 |
+
"""
|
27 |
+
Retry decorator for functions making httpx requests.
|
28 |
+
Retries on specific HTTP status codes with exponential backoff.
|
29 |
+
|
30 |
+
Args:
|
31 |
+
num_retries (int): Max number of retries.
|
32 |
+
backoff_factor (float): Multiplier for delay (e.g., 0.5, 1, etc.).
|
33 |
+
max_backoff (float, optional): Cap on the backoff delay in seconds.
|
34 |
+
retry_statuses (tuple): HTTP status codes to retry on.
|
35 |
+
"""
|
36 |
+
|
37 |
+
def decorator(func: Callable):
|
38 |
+
|
39 |
+
if asyncio.iscoroutinefunction(func):
|
40 |
+
# Async version
|
41 |
+
@wraps(func)
|
42 |
+
async def async_wrapper(*args, **kwargs):
|
43 |
+
for attempt in range(num_retries + 1):
|
44 |
+
response = await func(*args, **kwargs)
|
45 |
+
if response.status_code not in retry_statuses:
|
46 |
+
return response
|
47 |
+
if attempt < num_retries:
|
48 |
+
delay = min(backoff_factor * (2 ** attempt), max_backoff or float('inf'))
|
49 |
+
await asyncio.sleep(delay)
|
50 |
+
return response
|
51 |
+
return async_wrapper
|
52 |
+
|
53 |
+
# Sync version
|
54 |
+
@wraps(func)
|
55 |
+
def sync_wrapper(*args, **kwargs):
|
56 |
+
for attempt in range(num_retries + 1):
|
57 |
+
response = func(*args, **kwargs)
|
58 |
+
if response.status_code not in retry_statuses:
|
59 |
+
return response
|
60 |
+
if attempt < num_retries:
|
61 |
+
delay = min(backoff_factor * (2 ** attempt), max_backoff or float('inf'))
|
62 |
+
sleep(delay)
|
63 |
+
return response
|
64 |
+
return sync_wrapper
|
65 |
+
|
66 |
+
return decorator
|
ask_candid/chat.py
CHANGED
@@ -1,66 +1,79 @@
|
|
1 |
-
from typing import
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
from
|
5 |
-
from langgraph.checkpoint.memory import MemorySaver
|
6 |
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
indices: Optional[List[str]] = None,
|
18 |
-
premium_features: Optional[List[str]] = None,
|
19 |
-
) -> Tuple[gr.MultimodalTextbox, List[Dict[str, Any]], str]:
|
20 |
-
if premium_features is None:
|
21 |
-
premium_features = []
|
22 |
-
if len(history) == 0:
|
23 |
-
history.append({"role": "system", "content": START_SYSTEM_PROMPT})
|
24 |
|
25 |
-
history.append({"role": "user", "content": user_input["text"]})
|
26 |
-
inputs = {"messages": history}
|
27 |
-
# thread_id can be an email https://github.com/yurisasc/memory-enhanced-ai-assistant/blob/main/assistant.py
|
28 |
-
thread_id = get_session_id(thread_id)
|
29 |
-
config = {"configurable": {"thread_id": thread_id}}
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
enable_recommendations=enable_recommendations
|
37 |
-
)
|
38 |
|
39 |
-
memory = MemorySaver() # TODO: don't use for Prod
|
40 |
-
graph = workflow.compile(checkpointer=memory)
|
41 |
-
response = graph.invoke(inputs, config=config)
|
42 |
-
messages = response["messages"]
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
last_message = messages[-1]
|
51 |
-
ai_answer = last_message.content
|
52 |
|
53 |
-
sources_html = ""
|
54 |
-
for message in messages[-2:]:
|
55 |
-
if message.type == "HTML":
|
56 |
-
sources_html = message.content
|
57 |
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
"metadata": {"title": "Sources HTML"},
|
64 |
-
})
|
65 |
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import TypedDict, Literal, Any
|
2 |
+
from collections.abc import Iterator
|
3 |
+
from dataclasses import asdict
|
4 |
+
import logging
|
5 |
+
import json
|
6 |
|
7 |
+
from langchain_core.messages.tool import ToolMessage
|
8 |
+
from gradio import ChatMessage
|
|
|
9 |
|
10 |
+
logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
|
11 |
+
logger = logging.getLogger(__name__)
|
12 |
+
logger.setLevel(logging.INFO)
|
13 |
|
14 |
|
15 |
+
class ToolInput(TypedDict):
    """Shape of a single tool call emitted by the LangGraph agent."""
    # name of the tool being invoked
    name: str
    # keyword arguments passed to the tool
    args: dict[str, Any]
    # unique tool-call ID, used to pair the call with its result
    id: str
    type: Literal["tool_call"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
+
class CalledTool(TypedDict):
    """Graph stream chunk describing tool invocations requested by the agent."""
    # chunk ID; also used as the parent ID when rendering the call in the chat UI
    id: str
    name: Literal["tools"]
    # one entry per requested tool call
    input: list[ToolInput]
    # graph-event trigger names associated with this chunk
    triggers: tuple[str, ...]
|
|
|
|
|
27 |
|
|
|
|
|
|
|
|
|
28 |
|
29 |
+
class ToolResult(TypedDict):
    """Graph stream chunk carrying the outputs of executed tools."""
    # chunk ID; used as the parent ID when rendering results in the chat UI
    id: str
    name: Literal["tools"]
    # whether the tool execution errored (None when unknown)
    error: bool | None
    # (graph node name, tool messages) pairs produced by the tool node
    result: list[tuple[str, list[ToolMessage]]]
    # pending graph interrupts, if any
    interrupts: list
|
|
|
|
|
35 |
|
|
|
|
|
|
|
|
|
36 |
|
37 |
+
def convert_history_for_graph_agent(history: list[dict | ChatMessage]) -> list[dict]:
    """Normalize a Gradio chat history for the graph agent.

    Dataclass ``ChatMessage`` entries are converted to plain dicts, and messages with
    empty or missing content are dropped.
    """
    def _as_dict(message: dict | ChatMessage) -> dict:
        return asdict(message) if isinstance(message, ChatMessage) else message

    return [m for m in map(_as_dict, history) if m.get("content")]
|
49 |
+
|
50 |
+
|
51 |
+
def format_tool_call(input_chunk: CalledTool) -> Iterator[ChatMessage]:
    """Render each tool invocation in a graph chunk as an assistant message.

    The message content is the JSON-serialized tool arguments; the tool name goes in the
    metadata title so the UI can show it as a collapsed step.
    """
    chunk_id = input_chunk["id"]
    for call in input_chunk["input"]:
        metadata = {
            "title": f"Using tool `{call.get('name')}`",
            "status": "done",
            "id": chunk_id,
            "parent_id": chunk_id
        }
        yield ChatMessage(role="assistant", content=json.dumps(call["args"]), metadata=metadata)
|
63 |
+
|
64 |
+
|
65 |
+
def format_tool_response(result_chunk: ToolResult) -> Iterator[ChatMessage]:
    """Render executed tool outputs as assistant messages.

    Each tool message's content becomes the chat content; the tool name and any retrieved
    documents (artifacts) travel in the message metadata for downstream rendering.
    """
    parent = result_chunk["id"]
    for _, tool_messages in result_chunk["result"]:
        for message in tool_messages:
            logger.info("Called tool `%s`", message.name)
            meta = {
                "title": f"Results from tool `{message.name}`",
                "tool_name": message.name,
                "documents": message.artifact,
                "status": "done",
                "parent_id": parent
            }
            yield ChatMessage(
                role="assistant",
                content=message.content,
                metadata=meta  # pyright: ignore[reportArgumentType]
            )
|
ask_candid/services/small_lm.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
from typing import List, Optional
|
2 |
from dataclasses import dataclass
|
3 |
from enum import Enum
|
4 |
|
@@ -9,10 +8,26 @@ from ask_candid.base.lambda_base import LambdaInvokeBase
|
|
9 |
|
10 |
@dataclass(slots=True)
|
11 |
class Encoding:
|
12 |
-
inputs:
|
13 |
vectors: torch.Tensor
|
14 |
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
class CandidSLM(LambdaInvokeBase):
|
17 |
"""Wrapper around Candid's custom small language model.
|
18 |
For more details see https://dev.azure.com/guidestar/DataScience/_git/graph-ai?path=/releases/language.
|
@@ -35,7 +50,7 @@ class CandidSLM(LambdaInvokeBase):
|
|
35 |
DOCUMENT_NER_SALIENCE = "/document/entitySalience"
|
36 |
|
37 |
def __init__(
|
38 |
-
self, access_key:
|
39 |
) -> None:
|
40 |
super().__init__(
|
41 |
function_name="small-lm",
|
@@ -43,11 +58,22 @@ class CandidSLM(LambdaInvokeBase):
|
|
43 |
secret_key=secret_key
|
44 |
)
|
45 |
|
46 |
-
def encode(self, text:
|
47 |
response = self._submit_request({"text": text, "path": self.Tasks.ENCODE.value})
|
|
|
48 |
|
49 |
-
|
50 |
inputs=(response.get("inputs") or []),
|
51 |
vectors=torch.tensor((response.get("vectors") or []), dtype=torch.float32)
|
52 |
)
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from dataclasses import dataclass
|
2 |
from enum import Enum
|
3 |
|
|
|
8 |
|
9 |
@dataclass(slots=True)
|
10 |
class Encoding:
|
11 |
+
inputs: list[str]
|
12 |
vectors: torch.Tensor
|
13 |
|
14 |
|
15 |
+
@dataclass(slots=True)
|
16 |
+
class SummaryItem:
|
17 |
+
rank: int
|
18 |
+
score: float
|
19 |
+
text: str
|
20 |
+
|
21 |
+
|
22 |
+
@dataclass(slots=True)
|
23 |
+
class TextSummary:
|
24 |
+
snippets: list[SummaryItem]
|
25 |
+
|
26 |
+
@property
|
27 |
+
def summary(self) -> str:
|
28 |
+
return ' '.join([_.text for _ in self.snippets])
|
29 |
+
|
30 |
+
|
31 |
class CandidSLM(LambdaInvokeBase):
|
32 |
"""Wrapper around Candid's custom small language model.
|
33 |
For more details see https://dev.azure.com/guidestar/DataScience/_git/graph-ai?path=/releases/language.
|
|
|
50 |
DOCUMENT_NER_SALIENCE = "/document/entitySalience"
|
51 |
|
52 |
def __init__(
|
53 |
+
self, access_key: str | None = None, secret_key: str | None = None
|
54 |
) -> None:
|
55 |
super().__init__(
|
56 |
function_name="small-lm",
|
|
|
58 |
secret_key=secret_key
|
59 |
)
|
60 |
|
61 |
+
def encode(self, text: list[str]) -> Encoding:
|
62 |
response = self._submit_request({"text": text, "path": self.Tasks.ENCODE.value})
|
63 |
+
assert isinstance(response, dict)
|
64 |
|
65 |
+
return Encoding(
|
66 |
inputs=(response.get("inputs") or []),
|
67 |
vectors=torch.tensor((response.get("vectors") or []), dtype=torch.float32)
|
68 |
)
|
69 |
+
|
70 |
+
    def summarize(self, text: list[str], top_k: int) -> TextSummary:
        """Run extractive summarization over `text` via the small-LM Lambda.

        NOTE(review): `top_k` only truncates the returned snippets client-side; the service
        computes the full ranked summary regardless — confirm this is the intended contract.
        """
        response = self._submit_request({"text": text, "path": self.Tasks.DOCUMENT_SUMMARIZE.value})
        # the Lambda wrapper may return other payload shapes; summarization must yield a JSON object
        assert isinstance(response, dict)

        return TextSummary(
            snippets=[
                SummaryItem(rank=item["rank"], score=item["score"], text=item["value"])
                for item in (response.get("summary") or [])[:top_k]
            ]
        )
|
ask_candid/tools/general.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import date
|
2 |
+
|
3 |
+
from langchain_core.tools import tool
|
4 |
+
|
5 |
+
|
6 |
+
@tool
def get_current_day() -> date:
    """Get the current day to reference for any time-sensitive data requests. This might be useful for information
    searches through news data, where more current articles may be more relevant.

    Returns
    -------
    date
        Today's date
    """

    # naive local (server-timezone) date; day-level granularity is all the agent needs
    return date.today()
|
ask_candid/tools/org_search.py
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any
|
2 |
+
|
3 |
+
from pydantic import BaseModel, Field
|
4 |
+
from langchain_core.output_parsers.pydantic import PydanticOutputParser
|
5 |
+
from langchain_core.language_models.chat_models import BaseChatModel
|
6 |
+
from langchain_core.runnables import RunnableSequence
|
7 |
+
from langchain_core.prompts import PromptTemplate
|
8 |
+
from langchain_core.tools import tool, BaseTool
|
9 |
+
|
10 |
+
from thefuzz import fuzz
|
11 |
+
|
12 |
+
from ask_candid.tools.utils import format_candid_profile_link
|
13 |
+
from ask_candid.base.api_base import BaseAPI
|
14 |
+
from ask_candid.base.config.rest import CANDID_SEARCH_API
|
15 |
+
|
16 |
+
|
17 |
+
class OrganizationNames(BaseModel):
    """List of names of social-sector organizations, such as nonprofits and foundations."""
    # structured output target for the extraction chain in `OrganizationIdentifier`
    orgnames: list[str] = Field(..., description="List of organization names.")
|
20 |
+
|
21 |
+
|
22 |
+
class OrganizationIdentifierArgs(BaseModel):
    """Argument schema for the `organization-identifier` tool."""
    text: str = Field(..., description="Chat model response text which contains named organizations.")
|
24 |
+
|
25 |
+
|
26 |
+
class OrganizationIdentifier(BaseTool):
    """LangChain tool that extracts organization names from chat-model output via an LLM chain."""
    # chat model used to run the extraction prompt
    llm: BaseChatModel
    # annotation fix: this is a parser *instance*, not the class (was `type[PydanticOutputParser]`)
    parser: PydanticOutputParser = PydanticOutputParser(pydantic_object=OrganizationNames)
    template: str = """Extract only the names of officially recognized organizations, foundations, and government
    entities from the text below. Do not include any entries that contain descriptions, regional identifiers, or
    explanations within parentheses or following the name. Strictly exclude databases, resources, crowdfunding
    platforms, and general terms. Provide the output only in the specified JSON format.

    input text: ```{chatbot_output}```
    output format: ```{format_instructions}```
    """

    name: str = "organization-identifier"
    description: str = """
    Identify the names of nonprofits and foundations from chat model responses. If it is likely that a response contains
    proper names then it should be processed through this tool.

    Examples
    --------
    >>> `organization_identifier('My Favorite Foundation awarded a grant to My Favorite Nonprofit.')`
    >>> `organization_identifier('The LoremIpsum Nonprofit will be running a community event this Thursday')`
    """
    args_schema: type[OrganizationIdentifierArgs] = OrganizationIdentifierArgs

    def _build_pipeline(self):
        # prompt -> LLM -> structured parser; format instructions are baked into the prompt
        prompt = PromptTemplate(
            template=self.template,
            input_variables=["chatbot_output"],
            partial_variables={"format_instructions": self.parser.get_format_instructions()}
        )
        return RunnableSequence(prompt, self.llm, self.parser)

    # annotation fix: returns the extracted name list, not a string (was `-> str`)
    def _run(self, text: str) -> list[str]:
        chain = self._build_pipeline()
        result: OrganizationNames = chain.invoke({"chatbot_output": text})
        return result.orgnames

    # async variant of `_run`; annotation fix as above
    async def _arun(self, text: str) -> list[str]:
        chain = self._build_pipeline()
        result: OrganizationNames = await chain.ainvoke({"chatbot_output": text})
        return result.orgnames
|
67 |
+
|
68 |
+
|
69 |
+
def name_search(name: str) -> list[dict[str, Any]]:
    """Look up organizations by name through the Candid Search API (organization-only mode).

    Returns up to 5 candidate organization records; an empty list when nothing matches.
    """
    client = BaseAPI(
        url=f'{CANDID_SEARCH_API["url"]}/v1/search',
        headers={"x-api-key": CANDID_SEARCH_API["key"]}
    )
    payload = client.get(
        query=f"'{name}'",
        searchMode="organization_only",
        rowCount=5
    )
    return payload.get("returnedOrgs") or []
|
80 |
+
|
81 |
+
|
82 |
+
def find_similar(name: str, potential_matches: list[dict[str, Any]], threshold: int = 80):
    """Yield ``(organization, similarity)`` pairs whose best fuzzy name match meets `threshold`.

    Compares the target name against each record's legal, AKA, and DBA names and keeps the
    highest fuzz ratio.

    Parameters
    ----------
    name : str
        Organization name to match
    potential_matches : list[dict[str, Any]]
        Candidate organization records from the Candid Search API
    threshold : int, optional
        Minimum fuzz ratio (0-100) for a record to be yielded, by default 80

    Yields
    ------
    tuple[dict[str, Any], int]
    """

    # hoist: the target only needs lower-casing once
    target = name.lower()
    for org in potential_matches:
        # BUG FIX: use .get() so records missing a name variant no longer raise KeyError;
        # `or ""` additionally guards explicit nulls as before
        similarity = max(
            fuzz.ratio(target, (org.get("orgName") or "").lower()),
            fuzz.ratio(target, (org.get("akaName") or "").lower()),
            fuzz.ratio(target, (org.get("dbaName") or "").lower()),
        )
        if similarity >= threshold:
            yield org, similarity
|
91 |
+
|
92 |
+
|
93 |
+
@tool(response_format="content_and_artifact")
|
94 |
+
def find_mentioned_organizations(organizations: list[str]) -> tuple[str, dict[str, str]]:
|
95 |
+
"""Match organization names found in a chat response to official organizations tracked by Candid. This involves
|
96 |
+
using the Candid Search API in a lookup mode, and then finding the best result(s) using a heuristic string
|
97 |
+
similarity search.
|
98 |
+
|
99 |
+
This tool is focused on getting links to the organization's Candid profile for the user to click and explore in
|
100 |
+
more detail.
|
101 |
+
|
102 |
+
Use the URLs here to replace organization names in the chat response with links to the organization's profile. Links
|
103 |
+
to Candid profiles **MUST** be used to do the following:
|
104 |
+
1. Generate direct links to Candid organization profiles
|
105 |
+
2. Provide a mechanism for users to easily access detailed organizational information
|
106 |
+
3. Enhance responses with authoritative source links
|
107 |
+
|
108 |
+
Key Usage Requirements:
|
109 |
+
- Always incorporate returned profile URLs directly into the response text
|
110 |
+
- Replace organization name mentions with hyperlinked Candid profile URLs
|
111 |
+
- Prioritize creating a seamless user experience by making URLs contextually relevant
|
112 |
+
|
113 |
+
Example Desired Output:
|
114 |
+
Instead of: 'The Gates Foundation does impressive work.'
|
115 |
+
Use: 'The [Gates Foundation](https://app.candid.org/profile/XXXXX) does impressive work.'
|
116 |
+
|
117 |
+
The function returns a tuple with:
|
118 |
+
- A link information text (optional)
|
119 |
+
- A dictionary mapping input names to their best Candid Search profile URL
|
120 |
+
|
121 |
+
Failure to integrate the URLs into the response is considered an incomplete implementation.",
|
122 |
+
|
123 |
+
Examples
|
124 |
+
--------
|
125 |
+
>>> find_mentioned_organizations(organizations=['Gates Foundation', 'Candid'])
|
126 |
+
|
127 |
+
Parameters
|
128 |
+
----------
|
129 |
+
organizations : list[str]
|
130 |
+
A list of organization name strings found in a chat response message which need to be matches
|
131 |
+
|
132 |
+
Returns
|
133 |
+
-------
|
134 |
+
tuple[str, dict[str, str]]
|
135 |
+
(Link information text, mapping input name --> Candid Search profile URL of the best potential match)
|
136 |
+
"""
|
137 |
+
|
138 |
+
output = {}
|
139 |
+
for name in organizations:
|
140 |
+
search_results = name_search(name)
|
141 |
+
try:
|
142 |
+
best_result, _ = max(find_similar(name=name, potential_matches=search_results), key=lambda x: x[-1])
|
143 |
+
except ValueError:
|
144 |
+
# no similar organizations could be found for this one, keep going
|
145 |
+
continue
|
146 |
+
output[name] = format_candid_profile_link(best_result["candidEntityID"])
|
147 |
+
|
148 |
+
response = [f"The Candid profile link for {name} is {url}" for name, url in output.items()]
|
149 |
+
return '. '.join(response), output
|
150 |
+
|
151 |
+
|
152 |
+
@tool
def find_mentioned_organizations_detailed(organizations: list[str]) -> dict[str, dict[str, Any]]:
    """Match organization names found in a chat response to official organizations tracked by Candid. This involves
    using the Candid Search API in a lookup mode, and then finding the best result(s) using a heuristic string
    similarity search.

    Examples
    --------
    >>> find_mentioned_organizations(organizations=['Gates Foundation', 'Candid'])

    Parameters
    ----------
    organizations : list[str]
        A list of organization name strings found in a chat response message which need to be matches

    Returns
    -------
    dict[str, dict[str, Any]]
        Mapping from the input name(s) to the best potential match.
    """

    output = {}
    for name in organizations:
        search_results = name_search(name)
        try:
            # max() raises ValueError on an empty generator, i.e. when no candidate
            # cleared the similarity threshold
            best_result, _ = max(find_similar(name=name, potential_matches=search_results), key=lambda x: x[-1])
        except ValueError:
            # no similar organizations could be found for this one, keep going
            continue
        # unlike `find_mentioned_organizations`, return the full record rather than a profile link
        output[name] = best_result
    return output
|
ask_candid/tools/search.py
CHANGED
@@ -1,122 +1,67 @@
|
|
1 |
-
from typing import List, Tuple, Callable, Optional, Any
|
2 |
-
from functools import partial
|
3 |
-
import logging
|
4 |
-
|
5 |
-
from pydantic import BaseModel, Field
|
6 |
-
from langchain_core.language_models.llms import LLM
|
7 |
from langchain_core.documents import Document
|
8 |
-
from langchain_core.tools import
|
9 |
-
|
10 |
-
from ask_candid.retrieval.
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
Parameters
|
32 |
----------
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
Returns
|
41 |
-------
|
42 |
-
|
43 |
-
|
44 |
"""
|
45 |
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
logger.warning("User callback was passed in but failed: %s", ex)
|
51 |
-
|
52 |
-
output = ["Search didn't return any Candid sources"]
|
53 |
-
page_content = []
|
54 |
-
content = "Search didn't return any Candid sources"
|
55 |
-
results = get_query_results(search_text=user_input, indices=indices)
|
56 |
-
if results:
|
57 |
-
output = get_reranked_results(results, search_text=user_input)
|
58 |
-
for doc in output:
|
59 |
-
page_content.append(doc.page_content)
|
60 |
-
content = "\n\n".join(page_content)
|
61 |
-
|
62 |
-
# for the tool we need to return a tuple for content_and_artifact type
|
63 |
-
return content, output
|
64 |
-
|
65 |
-
|
66 |
-
def retriever_tool(
|
67 |
-
indices: List[DataIndices],
|
68 |
-
user_callback: Optional[Callable[[str], Any]] = None
|
69 |
-
) -> Tool:
|
70 |
-
"""Tool component for use in conditional edge building for RAG execution graph.
|
71 |
-
Cannot use `create_retriever_tool` because it only provides content losing all metadata on the way
|
72 |
-
https://python.langchain.com/docs/how_to/custom_tools/#returning-artifacts-of-tool-execution
|
73 |
-
|
74 |
-
Parameters
|
75 |
-
----------
|
76 |
-
indices : List[DataIndices]
|
77 |
-
Semantic index names to search over
|
78 |
-
user_callback : Optional[Callable[[str], Any]], optional
|
79 |
-
Optional UI callback to inform the user of apps states, by default None
|
80 |
-
|
81 |
-
Returns
|
82 |
-
-------
|
83 |
-
Tool
|
84 |
-
"""
|
85 |
-
|
86 |
-
return Tool(
|
87 |
-
name="retrieve_social_sector_information",
|
88 |
-
func=partial(get_search_results, indices=indices, user_callback=user_callback),
|
89 |
-
description=(
|
90 |
-
"Return additional information about social and philanthropic sector, "
|
91 |
-
"including nonprofits (NGO), grants, foundations, funding, RFP, LOI, Candid."
|
92 |
-
),
|
93 |
-
args_schema=RetrieverInput,
|
94 |
-
response_format="content_and_artifact"
|
95 |
)
|
96 |
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
state : _type_
|
105 |
-
The current state
|
106 |
-
llm : LLM
|
107 |
-
tools : List[Tool]
|
108 |
-
|
109 |
-
Returns
|
110 |
-
-------
|
111 |
-
AgentState
|
112 |
-
The updated state with the agent response appended to messages
|
113 |
-
"""
|
114 |
-
|
115 |
-
logger.info("---SEARCH AGENT---")
|
116 |
-
messages = state["messages"]
|
117 |
-
question = messages[-1].content
|
118 |
-
|
119 |
-
model = llm.bind_tools(tools)
|
120 |
-
response = model.invoke(messages)
|
121 |
-
# return a list, because this will get added to the existing list
|
122 |
-
return {"messages": [response], "user_input": question}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from langchain_core.documents import Document
|
2 |
+
from langchain_core.tools import tool
|
3 |
+
|
4 |
+
from ask_candid.base.retrieval.knowledge_base import (
|
5 |
+
SourceNames,
|
6 |
+
generate_queries,
|
7 |
+
run_search,
|
8 |
+
reranker,
|
9 |
+
process_hit
|
10 |
+
)
|
11 |
+
|
12 |
+
|
13 |
+
@tool(response_format="content_and_artifact")
|
14 |
+
def search_candid_knowledge_base(
|
15 |
+
query: str,
|
16 |
+
sources: list[SourceNames],
|
17 |
+
news_days_ago: int = 60
|
18 |
+
) -> tuple[str, list[Document]]:
|
19 |
+
"""Search Candid's subject matter expert knowledge base to find answers about the social and philanthropic sector.
|
20 |
+
This knowledge includes help articles and video training sessions from Candid's subject matter experts, blog posts
|
21 |
+
about the sector from Candid staff and trusted partner authors, research documents about the sector and news
|
22 |
+
articles curated about activity happening in the sector around the world.
|
23 |
+
|
24 |
+
Searches are performed through a combination of vector and keyword searching. Results are then re-ranked against
|
25 |
+
the original query to get the best results.
|
26 |
+
|
27 |
+
Search results often come back with specific organizations named, especially if referencing the news. In these cases
|
28 |
+
the organizations should be identified in Candid's data and links to their profiles **MUST** be included in final
|
29 |
+
chat response to the user.
|
30 |
|
31 |
Parameters
|
32 |
----------
|
33 |
+
query : str
|
34 |
+
Text describing a user's question or a description of investigative work which requires support from Candid's
|
35 |
+
knowledge base
|
36 |
+
sources : list[SourceNames]
|
37 |
+
One or more sources of knowledge from different areas at Candid.
|
38 |
+
* Candid Blog: Blog posts from Candid staff and trusted partners intended to help those in the sector or
|
39 |
+
illuminate ongoing work
|
40 |
+
* Candid Help: Candid FAQs to help user's get started with Candid's product platform and learning resources
|
41 |
+
* Candid Learning: Training documents from Candid's subject matter experts
|
42 |
+
* Candid News: News articles and press releases about real-time activity in the philanthropic sector
|
43 |
+
* IssueLab Research Reports: Academic research reports about the social/philanthropic sector
|
44 |
+
* YouTube Training: Transcripts from video-based training seminars from Candid's subject matter experts
|
45 |
+
news_days_ago : int, optional
|
46 |
+
How many days in the past to search for news articles, if a user is asking for recent trends then this value
|
47 |
+
should be set lower >~ 10, by default 60
|
48 |
|
49 |
Returns
|
50 |
-------
|
51 |
+
str
|
52 |
+
Re-ranked document text
|
53 |
"""
|
54 |
|
55 |
+
vector_queries, quasi_vector_queries = generate_queries(
|
56 |
+
query=query,
|
57 |
+
sources=sources,
|
58 |
+
news_days_ago=news_days_ago
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
)
|
60 |
|
61 |
+
results = run_search(vector_searches=vector_queries, non_vector_searches=quasi_vector_queries)
|
62 |
+
text_response = []
|
63 |
+
response_sources = []
|
64 |
+
for hit in map(process_hit, reranker(results, search_text=query)):
|
65 |
+
text_response.append(hit.page_content)
|
66 |
+
response_sources.append(hit)
|
67 |
+
return '\n\n'.join(text_response), response_sources
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ask_candid/tools/utils.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def format_candid_profile_link(candid_entity_id: int | str) -> str:
|
2 |
+
"""Format the Candid Search organization profile link.
|
3 |
+
|
4 |
+
Parameters
|
5 |
+
----------
|
6 |
+
candid_entity_id : int | str
|
7 |
+
|
8 |
+
Returns
|
9 |
+
-------
|
10 |
+
str
|
11 |
+
URL
|
12 |
+
"""
|
13 |
+
|
14 |
+
return f"https://app.candid.org/profile/{candid_entity_id}"
|
chat_v2.py
ADDED
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import TypedDict, Any
|
2 |
+
from collections.abc import Iterator, AsyncIterator
|
3 |
+
import os
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
|
7 |
+
from langgraph.graph.state import CompiledStateGraph
|
8 |
+
from langgraph.prebuilt import create_react_agent
|
9 |
+
from langchain_aws import ChatBedrock
|
10 |
+
import boto3
|
11 |
+
|
12 |
+
from ask_candid.tools.org_search import OrganizationIdentifier, find_mentioned_organizations
|
13 |
+
from ask_candid.tools.search import search_candid_knowledge_base
|
14 |
+
from ask_candid.tools.general import get_current_day
|
15 |
+
from ask_candid.utils import html_format_docs_chat
|
16 |
+
from ask_candid.base.config.constants import START_SYSTEM_PROMPT
|
17 |
+
from ask_candid.base.config.models import Name2Endpoint
|
18 |
+
from ask_candid.chat import convert_history_for_graph_agent, format_tool_call, format_tool_response
|
19 |
+
|
20 |
+
try:
|
21 |
+
from feedback import FeedbackApi
|
22 |
+
ROOT = "."
|
23 |
+
except ImportError:
|
24 |
+
from demos.feedback import FeedbackApi
|
25 |
+
ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..")
|
26 |
+
|
27 |
+
BOT_LOGO = os.path.join(ROOT, "static", "candid_logo_yellow.png")
|
28 |
+
if not os.path.isfile(BOT_LOGO):
|
29 |
+
BOT_LOGO = os.path.join(ROOT, "..", "..", "static", "candid_logo_yellow.png")
|
30 |
+
|
31 |
+
|
32 |
+
class LoggedComponents(TypedDict):
|
33 |
+
context: list[gr.Component]
|
34 |
+
found_helpful: gr.Component
|
35 |
+
will_recommend: gr.Component
|
36 |
+
comments: gr.Component
|
37 |
+
email: gr.Component
|
38 |
+
|
39 |
+
|
40 |
+
def build_execution_graph() -> CompiledStateGraph:
|
41 |
+
llm = ChatBedrock(
|
42 |
+
client=boto3.client("bedrock-runtime", region_name="us-east-1"),
|
43 |
+
model=Name2Endpoint["claude-3.5-haiku"]
|
44 |
+
)
|
45 |
+
org_name_recognition = OrganizationIdentifier(llm=llm) # bind the main chat model to the tool
|
46 |
+
return create_react_agent(
|
47 |
+
model=llm,
|
48 |
+
tools=[
|
49 |
+
get_current_day,
|
50 |
+
org_name_recognition,
|
51 |
+
find_mentioned_organizations,
|
52 |
+
search_candid_knowledge_base
|
53 |
+
],
|
54 |
+
)
|
55 |
+
|
56 |
+
|
57 |
+
def generate_postscript_messages(history: list[gr.ChatMessage]) -> Iterator[gr.ChatMessage]:
|
58 |
+
for record in history:
|
59 |
+
title = record.metadata.get("tool_name")
|
60 |
+
if title == search_candid_knowledge_base.name:
|
61 |
+
yield gr.ChatMessage(
|
62 |
+
role="assistant",
|
63 |
+
content=html_format_docs_chat(record.metadata.get("documents")),
|
64 |
+
metadata={
|
65 |
+
"title": "Source citations",
|
66 |
+
}
|
67 |
+
)
|
68 |
+
elif title == find_mentioned_organizations.name:
|
69 |
+
pass
|
70 |
+
|
71 |
+
|
72 |
+
async def execute(
|
73 |
+
user_input: dict[str, Any],
|
74 |
+
history: list[gr.ChatMessage]
|
75 |
+
) -> AsyncIterator[tuple[gr.Component, list[gr.ChatMessage]]]:
|
76 |
+
if len(history) == 0:
|
77 |
+
history.append(gr.ChatMessage(role="system", content=START_SYSTEM_PROMPT))
|
78 |
+
|
79 |
+
history.append(gr.ChatMessage(role="user", content=user_input["text"]))
|
80 |
+
for fname in user_input.get("files") or []:
|
81 |
+
fname: str
|
82 |
+
if fname.endswith('.txt'):
|
83 |
+
with open(fname, 'r', encoding='utf8') as f:
|
84 |
+
history.append(gr.ChatMessage(role="user", content=f.read()))
|
85 |
+
yield gr.MultimodalTextbox(value=None, interactive=True), history
|
86 |
+
|
87 |
+
horizon = len(history)
|
88 |
+
inputs = {"messages": convert_history_for_graph_agent(history)}
|
89 |
+
|
90 |
+
graph = build_execution_graph()
|
91 |
+
|
92 |
+
history.append(gr.ChatMessage(role="assistant", content=""))
|
93 |
+
async for stream_mode, chunk in graph.astream(inputs, stream_mode=["messages", "tasks"]):
|
94 |
+
if stream_mode == "messages" and chunk[0].content:
|
95 |
+
for msg in chunk[0].content:
|
96 |
+
if 'text' in msg:
|
97 |
+
history[-1].content += msg["text"]
|
98 |
+
yield gr.MultimodalTextbox(value=None, interactive=True), history
|
99 |
+
|
100 |
+
elif stream_mode == "tasks" and chunk.get("name") == "tools" and chunk.get("error") is None:
|
101 |
+
if "input" in chunk:
|
102 |
+
for msg in format_tool_call(chunk):
|
103 |
+
history.append(msg)
|
104 |
+
yield gr.MultimodalTextbox(value=None, interactive=True), history
|
105 |
+
elif "result" in chunk:
|
106 |
+
for msg in format_tool_response(chunk):
|
107 |
+
history.append(msg)
|
108 |
+
yield gr.MultimodalTextbox(value=None, interactive=True), history
|
109 |
+
history.append(gr.ChatMessage(role="assistant", content=""))
|
110 |
+
|
111 |
+
for post_msg in generate_postscript_messages(history=history[horizon:]):
|
112 |
+
history.append(post_msg)
|
113 |
+
yield gr.MultimodalTextbox(value=None, interactive=True), history
|
114 |
+
|
115 |
+
|
116 |
+
def send_feedback(
|
117 |
+
chat_context,
|
118 |
+
found_helpful,
|
119 |
+
will_recommend,
|
120 |
+
comments,
|
121 |
+
email
|
122 |
+
):
|
123 |
+
api = FeedbackApi()
|
124 |
+
total_submissions = 0
|
125 |
+
|
126 |
+
try:
|
127 |
+
response = api(
|
128 |
+
context=chat_context,
|
129 |
+
found_helpful=found_helpful,
|
130 |
+
will_recommend=will_recommend,
|
131 |
+
comments=comments,
|
132 |
+
email=email
|
133 |
+
)
|
134 |
+
total_submissions = response.get("response", 0)
|
135 |
+
gr.Info("Thank you for submitting feedback")
|
136 |
+
except Exception as ex:
|
137 |
+
raise gr.Error(f"Error submitting feedback: {ex}")
|
138 |
+
return total_submissions
|
139 |
+
|
140 |
+
|
141 |
+
def build_chat_app():
|
142 |
+
with gr.Blocks(theme=gr.themes.Soft(), title="Chat") as demo:
|
143 |
+
|
144 |
+
gr.Markdown(
|
145 |
+
"""
|
146 |
+
<h1>Candid's AI assistant</h1>
|
147 |
+
|
148 |
+
<p>
|
149 |
+
Please read the <a
|
150 |
+
href='https://info.candid.org/chatbot-reference-guide'
|
151 |
+
target="_blank"
|
152 |
+
rel="noopener noreferrer"
|
153 |
+
>guide</a> to get started.
|
154 |
+
</p>
|
155 |
+
<hr>
|
156 |
+
"""
|
157 |
+
)
|
158 |
+
|
159 |
+
with gr.Column():
|
160 |
+
chatbot = gr.Chatbot(
|
161 |
+
label="AskCandid",
|
162 |
+
elem_id="chatbot",
|
163 |
+
editable="user",
|
164 |
+
avatar_images=(
|
165 |
+
None, # user
|
166 |
+
BOT_LOGO, # bot
|
167 |
+
),
|
168 |
+
height="60vh",
|
169 |
+
type="messages",
|
170 |
+
show_label=False,
|
171 |
+
show_copy_button=True,
|
172 |
+
autoscroll=True,
|
173 |
+
layout="panel",
|
174 |
+
)
|
175 |
+
msg = gr.MultimodalTextbox(label="Your message", interactive=True)
|
176 |
+
gr.ClearButton(components=[msg, chatbot], size="sm")
|
177 |
+
|
178 |
+
# pylint: disable=no-member
|
179 |
+
# chatbot.like(fn=like_callback, inputs=chatbot, outputs=None)
|
180 |
+
msg.submit(
|
181 |
+
fn=execute,
|
182 |
+
inputs=[msg, chatbot],
|
183 |
+
outputs=[msg, chatbot],
|
184 |
+
show_api=False
|
185 |
+
)
|
186 |
+
logged = LoggedComponents(context=chatbot)
|
187 |
+
|
188 |
+
return demo, logged
|
189 |
+
|
190 |
+
|
191 |
+
def build_feedback(components: LoggedComponents) -> gr.Blocks:
|
192 |
+
with gr.Blocks(theme=gr.themes.Soft(), title="Candid AI demo") as demo:
|
193 |
+
gr.Markdown("<h1>Help us improve this tool with your valuable feedback</h1>")
|
194 |
+
|
195 |
+
with gr.Row():
|
196 |
+
with gr.Column():
|
197 |
+
found_helpful = gr.Radio(
|
198 |
+
[True, False], label="Did you find what you were looking for?"
|
199 |
+
)
|
200 |
+
will_recommend = gr.Radio(
|
201 |
+
[True, False],
|
202 |
+
label="Will you recommend this Chatbot to others?",
|
203 |
+
)
|
204 |
+
comment = gr.Textbox(label="Additional comments (optional)", lines=4)
|
205 |
+
email = gr.Textbox(label="Your email (optional)", lines=1)
|
206 |
+
submit = gr.Button("Submit Feedback")
|
207 |
+
|
208 |
+
components["found_helpful"] = found_helpful
|
209 |
+
components["will_recommend"] = will_recommend
|
210 |
+
components["comments"] = comment
|
211 |
+
components["email"] = email
|
212 |
+
|
213 |
+
# pylint: disable=no-member
|
214 |
+
submit.click(
|
215 |
+
fn=send_feedback,
|
216 |
+
inputs=[
|
217 |
+
components["context"],
|
218 |
+
components["found_helpful"],
|
219 |
+
components["will_recommend"],
|
220 |
+
components["comments"],
|
221 |
+
components["email"]
|
222 |
+
],
|
223 |
+
outputs=None,
|
224 |
+
show_api=False,
|
225 |
+
api_name=False,
|
226 |
+
preprocess=False,
|
227 |
+
)
|
228 |
+
|
229 |
+
return demo
|
230 |
+
|
231 |
+
|
232 |
+
def build_app():
|
233 |
+
candid_chat, logger = build_chat_app()
|
234 |
+
feedback = build_feedback(logger)
|
235 |
+
|
236 |
+
with open(os.path.join(ROOT, "static", "chatStyle.css"), "r", encoding="utf8") as f:
|
237 |
+
css_chat = f.read()
|
238 |
+
|
239 |
+
demo = gr.TabbedInterface(
|
240 |
+
interface_list=[
|
241 |
+
candid_chat,
|
242 |
+
feedback
|
243 |
+
],
|
244 |
+
tab_names=[
|
245 |
+
"Candid's AI assistant",
|
246 |
+
"Feedback"
|
247 |
+
],
|
248 |
+
title="Candid's AI assistant",
|
249 |
+
theme=gr.themes.Soft(),
|
250 |
+
css=css_chat,
|
251 |
+
)
|
252 |
+
return demo
|
253 |
+
|
254 |
+
|
255 |
+
if __name__ == "__main__":
|
256 |
+
app = build_app()
|
257 |
+
app.queue(max_size=5).launch(
|
258 |
+
show_api=False,
|
259 |
+
mcp_server=False,
|
260 |
+
auth=[
|
261 |
+
(os.getenv("APP_USERNAME"), os.getenv("APP_PASSWORD")),
|
262 |
+
(os.getenv("APP_PUBLIC_USERNAME"), os.getenv("APP_PUBLIC_PASSWORD")),
|
263 |
+
],
|
264 |
+
auth_message="Login to Candid's AI assistant",
|
265 |
+
)
|
requirements.txt
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
boto3
|
2 |
elasticsearch==7.17.6
|
3 |
thefuzz
|
4 |
-
gradio==5.
|
5 |
-
langchain
|
6 |
-
langchain-aws
|
7 |
-
|
8 |
-
langgraph
|
9 |
pydantic==2.10.6
|
10 |
pyopenssl>22.0.0
|
11 |
python-dotenv
|
|
|
1 |
boto3
|
2 |
elasticsearch==7.17.6
|
3 |
thefuzz
|
4 |
+
gradio==5.42.0
|
5 |
+
langchain==0.3.27
|
6 |
+
langchain-aws==0.2.30
|
7 |
+
langgraph==0.6.5
|
8 |
+
langgraph-prebuilt==0.6.4
|
9 |
pydantic==2.10.6
|
10 |
pyopenssl>22.0.0
|
11 |
python-dotenv
|