Migrate to sparse_vector query
ask_candid/retrieval/elastic.py  CHANGED  +37 -31
@@ -32,10 +32,10 @@ class RetrieverInput(BaseModel):
     user_input: str = Field(description="query to look up in retriever")
 
 
-def build_text_expansion_query(
+def build_sparse_vector_query(
     query: str,
     fields: Tuple[str],
-    …
+    inference_id: str = ".elser-2-elasticsearch"
 ) -> Dict[str, Any]:
     """Builds a valid Elasticsearch text expansion query payload
 
@@ -45,8 +45,8 @@ def build_text_expansion_query(
         Search context string
     fields : Tuple[str]
         Semantic text field names
-    …
-        ID of model deployed in Elasticsearch, by default "…"
+    inference_id : str, optional
+        ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"
 
     Returns
     -------
@@ -60,15 +60,14 @@ def build_text_expansion_query(
             "nested": {
                 "path": f"embeddings.{f}.chunks",
                 "query": {
-                    "text_expansion": {
-                        f"embeddings.{f}.chunks.vector": {
-                            …
-                            …
-                            …
-                        }
+                    "sparse_vector": {
+                        "field": f"embeddings.{f}.chunks.vector",
+                        "inference_id": inference_id,
+                        "query": query,
+                        "boost": 1 / len(fields)
                     }
                 },
-                "inner_hits": {
+                "inner_hits": {
                     "_source": False,
                     "size": 2,
                     "fields": [f"embeddings.{f}.chunks.chunk"]
@@ -99,7 +98,7 @@ def query_builder(query: str, indices: List[str]) -> List[Dict[str, Any]]:
 
     for index in indices:
         if index == "issuelab":
-            q = build_text_expansion_query(
+            q = build_sparse_vector_query(
                 query=query,
                 fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
             )
@@ -107,7 +106,7 @@ def query_builder(query: str, indices: List[str]) -> List[Dict[str, Any]]:
             q["size"] = 1
             queries.extend([{"index": ElasticIndexMapping.ISSUELAB_INDEX_ELSER}, q])
         elif index == "youtube":
-            q = build_text_expansion_query(
+            q = build_sparse_vector_query(
                 query=query,
                 fields=("captions_cleaned", "description_cleaned", "title")
             )
@@ -116,15 +115,15 @@ def query_builder(query: str, indices: List[str]) -> List[Dict[str, Any]]:
             q["size"] = 2
             queries.extend([{"index": ElasticIndexMapping.YOUTUBE_INDEX_ELSER}, q])
         elif index == "candid_blog":
-            q = build_text_expansion_query(
+            q = build_sparse_vector_query(
                 query=query,
-                fields=("content", …)
+                fields=("content", "authors_text", "title_summary_tags")
             )
             q["_source"] = {"excludes": ["embeddings"]}
             q["size"] = 2
             queries.extend([{"index": ElasticIndexMapping.CANDID_BLOG_INDEX_ELSER}, q])
         elif index == "candid_learning":
-            q = build_text_expansion_query(
+            q = build_sparse_vector_query(
                 query=query,
                 fields=("content", "title", "training_topics", "staff_recommendations")
             )
@@ -132,7 +131,7 @@ def query_builder(query: str, indices: List[str]) -> List[Dict[str, Any]]:
             q["size"] = 2
             queries.extend([{"index": ElasticIndexMapping.CANDID_LEARNING_INDEX_ELSER}, q])
         elif index == "candid_help":
-            q = build_text_expansion_query(
+            q = build_sparse_vector_query(
                 query=query,
                 fields=("content", "combined_article_description")
             )
@@ -311,7 +310,8 @@ def get_results(user_input: str, indices: List[str]) -> Tuple[str, List[Document
     return content, output
 
 
-def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024) -> str:
+# TODO make it better!
+def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
     """Pads the relevant chunk of text with context before and after
 
     Parameters
@@ -328,8 +328,10 @@ def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1
        longer chunks stuffed together
     """
 
-    …
+    chunks = []
+    # TODO chunks have tokens, but long text is a normal text, but may contain html that also gets weird after tokenization
     long_text = hit.source.get(f"{field_name}", "")
+    long_text = long_text.lower()
     inner_hits_field = f"embeddings.{field_name}.chunks"
     found_chunks = hit.inner_hits.get(inner_hits_field, {})
     if found_chunks:
@@ -340,15 +342,17 @@ def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1
             # cutting the middle because we may have tokenizing artifacts there
             chunk = chunk[3: -3]
 
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
+            if add_context:
+                # Find the start and end indices of the chunk in the large text
+                start_index = long_text.find(chunk[:20])
+                if start_index != -1:  # Chunk is found
+                    end_index = start_index + len(chunk)
+                    pre_start_index = max(0, start_index - context_length)
+                    post_end_index = min(len(long_text), end_index + context_length)
+                    chunks.append(long_text[pre_start_index:post_end_index])
+                else:
+                    chunks.append(chunk)
+    return '\n\n'.join(chunks)
 
 
 def process_hit(hit: ElasticHitsResult) -> Union[Document, None]:
@@ -400,10 +404,12 @@ def process_hit(hit: ElasticHitsResult) -> Union[Document, None]:
     elif "candid-blog" in hit.index:
         excerpt = hit.source.get("excerpt", "")
         title = hit.source.get("title", "")
-        # we only need to process long text
-        content_with_context_txt = get_context("content", hit, context_length=12)
+        # we only need to process long text
+        content_with_context_txt = get_context("content", hit, context_length=12, add_context=False)
+        authors = get_context("authors_text", hit, context_length=12, add_context=False)
+        tags = hit.source.get("title_summary_tags", "")
         doc = Document(
-            page_content='\n\n'.join([title, excerpt, content_with_context_txt]),
+            page_content='\n\n'.join([title, excerpt, content_with_context_txt, authors, tags]),
             metadata={
                 "title": title,
                 "source": "Candid Blog",
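For reference, a minimal sketch of the per-field payload the new build_sparse_vector_query assembles, pieced together from the hunks above. The top-level bool/should wrapper and the helper name used here are assumptions (the diff does not show how the per-field clauses are combined); the sparse_vector and inner_hits blocks are taken directly from the new code.

from typing import Any, Dict, Tuple

def sketch_sparse_vector_query(
    query: str,
    fields: Tuple[str, ...],
    inference_id: str = ".elser-2-elasticsearch"
) -> Dict[str, Any]:
    # One nested sparse_vector clause per semantic-text field; the boost splits
    # the weight evenly across fields (1 / len(fields)), as in the diff.
    should = []
    for f in fields:
        should.append({
            "nested": {
                "path": f"embeddings.{f}.chunks",
                "query": {
                    "sparse_vector": {
                        "field": f"embeddings.{f}.chunks.vector",
                        "inference_id": inference_id,
                        "query": query,
                        "boost": 1 / len(fields)
                    }
                },
                "inner_hits": {
                    "_source": False,
                    "size": 2,
                    "fields": [f"embeddings.{f}.chunks.chunk"]
                }
            }
        })
    # The bool/should wrapper is an assumption; only the nested clause is shown in the diff.
    return {"query": {"bool": {"should": should}}}

# e.g. roughly what the "candid_help" branch of query_builder asks for:
# sketch_sparse_vector_query("how do I claim my profile", fields=("content", "combined_article_description"))

Compared with the old text_expansion clause, sparse_vector takes the target field and the inference endpoint ID as explicit parameters, which is what the signature change from a model ID to inference_id reflects.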
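The context-padding logic added to get_context can be read on its own roughly as follows. This is a standalone sketch under assumptions: the helper name and the toy strings are invented here, and the committed function operates on inner-hit chunks from an ElasticHitsResult and joins the padded pieces with blank lines.

def pad_chunk_with_context(long_text: str, chunk: str, context_length: int = 1024) -> str:
    # Mirrors the new logic: lowercase the long text, locate the chunk by its
    # first 20 characters, then widen the window by context_length characters
    # on each side, clamped to the bounds of the text.
    long_text = long_text.lower()
    start_index = long_text.find(chunk[:20])
    if start_index == -1:  # chunk not found: fall back to the bare chunk
        return chunk
    end_index = start_index + len(chunk)
    pre_start_index = max(0, start_index - context_length)
    post_end_index = min(len(long_text), end_index + context_length)
    return long_text[pre_start_index:post_end_index]

# Toy check with a 10-character window:
doc = "Candid connects people who want to change the world to the resources they need."
print(pad_chunk_with_context(doc, "change the world", context_length=10))
# prints the chunk plus up to 10 characters of surrounding context on each side

Matching on only the first 20 characters keeps the lookup cheap; the lowercasing of long_text before find() is presumably there because the stored chunks come back from tokenization (see the TODO in the diff), so a case-sensitive match against the original text could fail.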