ask-candid / search.py
brainsqueeze's picture
Initial commit
92feab2 verified
raw
history blame
4.92 kB
from typing import List, Optional
import json
import gradio as gr
import requests
from .utils import html_format_doc
from .retrieval.up_orgs_keyword import organization_card_html
from .retrieval.elastic import reranker, get_query_results
from .retrieval.config import ALL_INDICES
from . import UP_QA_SEARCH_API
def run_search(search_text: str, indices: Optional[List[str]] = None):
    """Run a semantic search and render the reranked hits as one HTML string.

    Args:
        search_text: Query text handed to ``get_query_results``.
        indices: Optional list of index names forwarded to ``get_query_results``
            as its ``indices`` keyword; ``None`` is forwarded unchanged.

    Returns:
        A ``<div>...</div>`` string wrapping the concatenated output of
        ``html_format_doc`` for every hit produced by ``reranker``.
    """
    # Ordered (substring, source label) pairs. The first substring found in a
    # hit's index name decides the label, so the order here is significant.
    # Most sources are only recognised through their "-elser" index names.
    index_markers = (
        ("news", "news"),
        ("transactions", "transactions"),
        ("organizations", "organizations"),
        ("issuelab-elser", "issuelab"),
        ("youtube-elser", "youtube"),
        ("candid-blog-elser", "candid_blog"),
        ("candid-learning", "candid_learning"),  # TODO fix that
        ("candid-help-elser", "candid_help"),
    )

    hits = get_query_results(search_text, indices=indices)
    fragments = []
    for hit in reranker(hits):
        index_name = hit.index
        # Falls back to None when no marker matches the index name.
        label = next(
            (name for marker, name in index_markers if marker in index_name),
            None,
        )
        fragments.append(html_format_doc(doc=hit.source, source=label))
    return "<div>" + "".join(fragments) + "</div>"
def run_ks(search_text: str):
    """Fetch keyword organization results and semantic results for one query.

    Posts ``search_text`` to the endpoint configured in ``UP_QA_SEARCH_API``,
    renders each entry of the response's ``returnedOrgs`` list with
    ``organization_card_html``, then runs ``run_search`` on the same text.

    Args:
        search_text: Query text used for both the keyword and semantic search.

    Returns:
        A 2-tuple ``(keyword_html, semantic_html)``: the organization cards
        wrapped in a ``<div>``, and the value returned by ``run_search``.
    """
    endpoint = UP_QA_SEARCH_API["API_URL"]
    request_headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "x-api-key": UP_QA_SEARCH_API["API_KEY"],
    }
    payload = {"keyword": search_text, "rowCount": 10}
    timeout_seconds = 5 * 60

    response = requests.post(
        url=endpoint,
        json=payload,
        headers=request_headers,
        timeout=timeout_seconds,
    )
    body = json.loads(response.text)

    # A missing or null "returnedOrgs" simply yields no keyword cards.
    returned_orgs = body.get("returnedOrgs")
    if returned_orgs is None:
        returned_orgs = []

    cards = []
    for record in returned_orgs:
        org = {
            "candid_entity_id": record.get("candidEntityID", ""),
            "main_name": record.get("orgName", ""),
            "logo": record.get("logo", ""),
            "seal": record.get("seal", {}),
            "city": record.get("city", ""),
            "admin1": record.get("admin1", ""),
            "country_name": record.get("countryName", ""),
            "taxonomy": record.get("taxonomy", {}),
        }

        # Only highlight entries for the mission statement are used; if several
        # are present, the last one wins.
        for entry in record.get("highlights", []) or []:
            if entry["field"] == "mission_statement":
                org["mission_statement"] = "; ".join(entry["highlights"])

        # NOTE(review): the meaning of the second argument (250) is defined by
        # organization_card_html and is not visible here - confirm.
        cards.append(organization_card_html(org, 250))

    # Getting semantic results
    semantic_html = run_search(search_text=search_text)
    keyword_html = "<div>" + "".join(cards) + "</div>"
    return keyword_html, semantic_html
def build_search_tab() -> gr.Blocks:
    """Assemble the semantic-search tab.

    The layout holds a heading, a query text box, a collapsed "Advanced
    settings" accordion with a checkbox group of ``ALL_INDICES``, a search
    button and an HTML output area. Clicking the button calls ``run_search``
    with the query text and the selected indices and writes the result to the
    HTML area.

    Returns:
        The ``gr.Blocks`` container holding the tab.
    """
    heading = (
        "<h1>Alpha demo: Semantic search</h1>"
        "Search and ask questions of Candid's data together with casual language"
    )

    with gr.Blocks(theme=gr.themes.Soft(), title="Semantic search") as demo:
        gr.Markdown(heading)

        query_box = gr.Text(placeholder="Search", show_label=False)
        with gr.Accordion(label="Advanced settings", open=False):
            # Every known index is offered and selected by default.
            index_picker = gr.CheckboxGroup(
                choices=list(ALL_INDICES),
                value=list(ALL_INDICES),
                label="Sources to include",
                interactive=True,
            )
        search_button = gr.Button("Search")
        results_feed = gr.HTML()

        # pylint: disable=no-member
        search_button.click(
            fn=run_search,
            inputs=[query_box, index_picker],
            outputs=[results_feed],
            api_name=False,
            queue=True,
        )

    return demo
def build_ks_tab() -> gr.Blocks:
    """Assemble the keyword-versus-semantic comparison tab.

    The layout holds a heading, a single-line query text area, a search button
    and a row of two columns, one HTML area for keyword results and one for
    semantic results. Clicking the button calls ``run_ks`` with the query text
    and writes its two return values to the keyword and semantic areas, in
    that order.

    Returns:
        The ``gr.Blocks`` container holding the tab.
    """
    heading = (
        "<h1>Alpha demo: Keyword versus Semantic search</h1>"
        "Compare current search results versus semantic search results"
    )

    with gr.Blocks(theme=gr.themes.Soft(), title="Semantic search") as demo:
        gr.Markdown(heading)

        query_box = gr.TextArea(placeholder="Search", show_label=False, lines=1)
        ask_button = gr.Button("Search Unified Platform organizations")

        # Side-by-side result panes.
        with gr.Row():
            with gr.Column():
                gr.Markdown("<h2>Keyword results</h2>")
                keyword_feed = gr.HTML()
            with gr.Column():
                gr.Markdown("<h2>Semantic results</h2>")
                semantic_feed = gr.HTML()

        # pylint: disable=no-member
        ask_button.click(
            fn=run_ks,
            inputs=[query_box],
            outputs=[keyword_feed, semantic_feed],
            api_name=False,
            queue=True,
        )

    return demo