"""Tools for spotting organization names in chat responses and linking them to Candid profiles."""
from collections.abc import Iterator
from typing import Any

from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.output_parsers.pydantic import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableSequence
from langchain_core.tools import tool, BaseTool
from pydantic import BaseModel, Field
from thefuzz import fuzz

from ask_candid.base.api_base import BaseAPI
from ask_candid.base.config.rest import CANDID_SEARCH_API
from ask_candid.tools.utils import format_candid_profile_link
# Structured-output schema: the shape the organization-extraction chain parses
# the chat model's answer into (see the `parser` field of OrganizationIdentifier).
# NOTE(review): pydantic presumably surfaces the class docstring and the field
# description in the generated JSON schema, and so in the prompt's format
# instructions -- treat both as prompt text and confirm before rewording them.
class OrganizationNames(BaseModel):
    """List of names of social-sector organizations, such as nonprofits and foundations."""

    orgnames: list[str] = Field(..., description="List of organization names.")
# Argument schema for the `organization-identifier` tool: a single free-text
# input. Documented with comments rather than a docstring so the schema that is
# generated from this model stays exactly as it was.
class OrganizationIdentifierArgs(BaseModel):
    text: str = Field(..., description="Chat model response text which contains named organizations.")
class OrganizationIdentifier(BaseTool):
    """Tool that asks a chat model to pull organization names out of a piece of text.

    The tool renders `template` with the input text and the parser's format
    instructions, sends the prompt to `llm`, and parses the reply into an
    `OrganizationNames` object whose `orgnames` list is returned.
    """

    # Chat model used to perform the extraction.
    llm: BaseChatModel
    # Holds a parser *instance* (its `get_format_instructions()` method is called
    # below), so the field is annotated with the instance type, not `type[...]`.
    parser: PydanticOutputParser = PydanticOutputParser(pydantic_object=OrganizationNames)
    # Prompt text; `{chatbot_output}` and `{format_instructions}` are filled in by
    # `_build_pipeline`. The continuation lines are flush-left on purpose: any
    # leading whitespace would become part of the prompt.
    template: str = """Extract only the names of officially recognized organizations, foundations, and government
entities from the text below. Do not include any entries that contain descriptions, regional identifiers, or
explanations within parentheses or following the name. Strictly exclude databases, resources, crowdfunding
platforms, and general terms. Provide the output only in the specified JSON format.
input text: ```{chatbot_output}```
output format: ```{format_instructions}```
"""

    name: str = "organization-identifier"
    description: str = """
Identify the names of nonprofits and foundations from chat model responses. If it is likely that a response contains
proper names then it should be processed through this tool.
Examples
--------
>>> `organization_identifier('My Favorite Foundation awarded a grant to My Favorite Nonprofit.')`
>>> `organization_identifier('The LoremIpsum Nonprofit will be running a community event this Thursday')`
"""
    args_schema: type[OrganizationIdentifierArgs] = OrganizationIdentifierArgs

    def _build_pipeline(self) -> RunnableSequence:
        """Assemble the prompt -> chat model -> output parser chain.

        Returns
        -------
        RunnableSequence
            Chain that accepts ``{"chatbot_output": <text>}``.
        """
        prompt = PromptTemplate(
            template=self.template,
            input_variables=["chatbot_output"],
            partial_variables={"format_instructions": self.parser.get_format_instructions()},
        )
        return RunnableSequence(prompt, self.llm, self.parser)

    def _run(self, text: str) -> list[str]:
        """Extract organization names from `text` synchronously.

        Parameters
        ----------
        text : str
            Text to scan for organization names.

        Returns
        -------
        list[str]
            The `orgnames` list of the parsed `OrganizationNames` result.
        """
        chain = self._build_pipeline()
        result: OrganizationNames = chain.invoke({"chatbot_output": text})
        return result.orgnames

    async def _arun(self, text: str) -> list[str]:
        """Extract organization names from `text` asynchronously.

        Parameters
        ----------
        text : str
            Text to scan for organization names.

        Returns
        -------
        list[str]
            The `orgnames` list of the parsed `OrganizationNames` result.
        """
        chain = self._build_pipeline()
        result: OrganizationNames = await chain.ainvoke({"chatbot_output": text})
        return result.orgnames
def name_search(name: str) -> list[dict[str, Any]]:
    """Look up an organization name through the Candid Search API.

    Parameters
    ----------
    name : str
        Organization name to search for.

    Returns
    -------
    list[dict[str, Any]]
        The ``returnedOrgs`` entry of the API response, or an empty list when
        that entry is missing or empty.
    """
    search_client = BaseAPI(
        url=f'{CANDID_SEARCH_API["url"]}/v1/search',
        headers={"x-api-key": CANDID_SEARCH_API["key"]},
    )
    payload = search_client.get(
        # The name is wrapped in single quotes before being sent.
        # NOTE(review): presumably this requests a phrase match -- confirm
        # against the Candid Search API documentation.
        query=f"'{name}'",
        searchMode="organization_only",
        rowCount=5,
    )
    return payload.get("returnedOrgs") or []
def find_similar(
    name: str,
    potential_matches: list[dict[str, Any]],
    threshold: int = 80,
) -> Iterator[tuple[dict[str, Any], int]]:
    """Yield the candidate organizations whose names resemble `name`.

    Each candidate is scored with ``fuzz.ratio`` against its ``orgName``,
    ``akaName`` and ``dbaName`` values (compared case-insensitively); the
    highest of the three scores is the candidate's similarity.

    Parameters
    ----------
    name : str
        Organization name to compare against.
    potential_matches : list[dict[str, Any]]
        Candidate organization records.
    threshold : int, optional
        Minimum similarity a candidate needs to be yielded, by default 80.

    Yields
    ------
    tuple[dict[str, Any], int]
        ``(organization record, similarity)`` for every candidate that meets
        the threshold, in input order.
    """
    # Lower-case the query once instead of once per comparison.
    needle = name.lower()
    for org in potential_matches:
        # `.get(...) or ""` treats a missing key the same as a null value, so a
        # record without one of the name fields no longer raises KeyError.
        similarity = max(
            fuzz.ratio(needle, (org.get(field) or "").lower())
            for field in ("orgName", "akaName", "dbaName")
        )
        if similarity >= threshold:
            yield org, similarity
def find_mentioned_organizations(organizations: list[str]) -> tuple[str, dict[str, str]]:
    """Match organization names found in a chat response to official organizations tracked by Candid. This involves
    using the Candid Search API in a lookup mode, and then finding the best result(s) using a heuristic string
    similarity search.

    This tool is focused on getting links to the organization's Candid profile for the user to click and explore in
    more detail.

    Use the URLs here to replace organization names in the chat response with links to the organization's profile. Links
    to Candid profiles **MUST** be used to do the following:
    1. Generate direct links to Candid organization profiles
    2. Provide a mechanism for users to easily access detailed organizational information
    3. Enhance responses with authoritative source links

    Key Usage Requirements:
    - Always incorporate returned profile URLs directly into the response text
    - Replace organization name mentions with hyperlinked Candid profile URLs
    - Prioritize creating a seamless user experience by making URLs contextually relevant

    Example Desired Output:
    Instead of: 'The Gates Foundation does impressive work.'
    Use: 'The [Gates Foundation](https://app.candid.org/profile/XXXXX) does impressive work.'

    The function returns a tuple with:
    - A link information text (optional)
    - A dictionary mapping input names to their best Candid Search profile URL

    Failure to integrate the URLs into the response is considered an incomplete implementation.

    Examples
    --------
    >>> find_mentioned_organizations(organizations=['Gates Foundation', 'Candid'])

    Parameters
    ----------
    organizations : list[str]
        A list of organization name strings found in a chat response message which need to be matched

    Returns
    -------
    tuple[str, dict[str, str]]
        (Link information text, mapping input name --> Candid Search profile URL of the best potential match)
    """
    output: dict[str, str] = {}
    for name in organizations:
        # `default=None` covers the "nothing cleared the similarity threshold"
        # case without a try/except that could also hide unrelated ValueErrors.
        best = max(
            find_similar(name=name, potential_matches=name_search(name)),
            key=lambda scored: scored[-1],
            default=None,
        )
        if best is None:
            # no similar organizations could be found for this one, keep going
            continue
        best_result, _ = best
        output[name] = format_candid_profile_link(best_result["candidEntityID"])

    response = [f"The Candid profile link for {name} is {url}" for name, url in output.items()]
    return '. '.join(response), output
def find_mentioned_organizations_detailed(organizations: list[str]) -> dict[str, dict[str, Any]]:
    """Match organization names found in a chat response to official organizations tracked by Candid. This involves
    using the Candid Search API in a lookup mode, and then finding the best result(s) using a heuristic string
    similarity search.

    Examples
    --------
    >>> find_mentioned_organizations_detailed(organizations=['Gates Foundation', 'Candid'])

    Parameters
    ----------
    organizations : list[str]
        A list of organization name strings found in a chat response message which need to be matched

    Returns
    -------
    dict[str, dict[str, Any]]
        Mapping from the input name(s) to the best potential match. Names for which no candidate met the
        similarity threshold are omitted.
    """
    output: dict[str, dict[str, Any]] = {}
    for name in organizations:
        # `default=None` covers the "nothing cleared the similarity threshold"
        # case without a try/except that could also hide unrelated ValueErrors.
        best = max(
            find_similar(name=name, potential_matches=name_search(name)),
            key=lambda scored: scored[-1],
            default=None,
        )
        if best is None:
            # no similar organizations could be found for this one, keep going
            continue
        best_result, _ = best
        output[name] = best_result
    return output