brainsqueeze committed
Commit a0802f4 · verified · 1 Parent(s): d738003

Migrate to sparse_vector query

Files changed (1)
  1. ask_candid/retrieval/elastic.py +37 -31
ask_candid/retrieval/elastic.py CHANGED
@@ -32,10 +32,10 @@ class RetrieverInput(BaseModel):
     user_input: str = Field(description="query to look up in retriever")


-def build_text_expansion_query(
+def build_sparse_vector_query(
     query: str,
     fields: Tuple[str],
-    model_id: str = ".elser_model_2_linux-x86_64"
+    inference_id: str = ".elser-2-elasticsearch"
 ) -> Dict[str, Any]:
     """Builds a valid Elasticsearch text expansion query payload

@@ -45,8 +45,8 @@ def build_text_expansion_query(
         Search context string
     fields : Tuple[str]
         Semantic text field names
-    model_id : str, optional
-        ID of model deployed in Elasticsearch, by default ".elser_model_2_linux-x86_64"
+    inference_id : str, optional
+        ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"

     Returns
     -------
@@ -60,15 +60,14 @@ def build_text_expansion_query(
             "nested": {
                 "path": f"embeddings.{f}.chunks",
                 "query": {
-                    "text_expansion": {
-                        f"embeddings.{f}.chunks.vector": {
-                            "model_id": model_id,
-                            "model_text": query,
-                            "boost": 1 / len(fields)
-                        }
+                    "sparse_vector": {
+                        "field": f"embeddings.{f}.chunks.vector",
+                        "inference_id": inference_id,
+                        "query": query,
+                        "boost": 1 / len(fields)
                     }
                 },
-                "inner_hits": {
+                "inner_hits": {
                     "_source": False,
                     "size": 2,
                     "fields": [f"embeddings.{f}.chunks.chunk"]
@@ -99,7 +98,7 @@ def query_builder(query: str, indices: List[str]) -> List[Dict[str, Any]]:

     for index in indices:
         if index == "issuelab":
-            q = build_text_expansion_query(
+            q = build_sparse_vector_query(
                 query=query,
                 fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
             )
@@ -107,7 +106,7 @@ def query_builder(query: str, indices: List[str]) -> List[Dict[str, Any]]:
             q["size"] = 1
             queries.extend([{"index": ElasticIndexMapping.ISSUELAB_INDEX_ELSER}, q])
         elif index == "youtube":
-            q = build_text_expansion_query(
+            q = build_sparse_vector_query(
                 query=query,
                 fields=("captions_cleaned", "description_cleaned", "title")
             )
@@ -116,15 +115,15 @@ def query_builder(query: str, indices: List[str]) -> List[Dict[str, Any]]:
             q["size"] = 2
             queries.extend([{"index": ElasticIndexMapping.YOUTUBE_INDEX_ELSER}, q])
         elif index == "candid_blog":
-            q = build_text_expansion_query(
+            q = build_sparse_vector_query(
                 query=query,
-                fields=("content", "title")
+                fields=("content", "authors_text", "title_summary_tags")
             )
             q["_source"] = {"excludes": ["embeddings"]}
             q["size"] = 2
             queries.extend([{"index": ElasticIndexMapping.CANDID_BLOG_INDEX_ELSER}, q])
         elif index == "candid_learning":
-            q = build_text_expansion_query(
+            q = build_sparse_vector_query(
                 query=query,
                 fields=("content", "title", "training_topics", "staff_recommendations")
             )
@@ -132,7 +131,7 @@ def query_builder(query: str, indices: List[str]) -> List[Dict[str, Any]]:
             q["size"] = 2
             queries.extend([{"index": ElasticIndexMapping.CANDID_LEARNING_INDEX_ELSER}, q])
         elif index == "candid_help":
-            q = build_text_expansion_query(
+            q = build_sparse_vector_query(
                 query=query,
                 fields=("content", "combined_article_description")
             )
@@ -311,7 +310,8 @@ def get_results(user_input: str, indices: List[str]) -> Tuple[str, List[Document
     return content, output


-def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024) -> str:
+# TODO make it better!
+def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
     """Pads the relevant chunk of text with context before and after

     Parameters
@@ -328,8 +328,10 @@ def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1
         longer chunks stuffed together
     """

-    chunks_with_context = []
+    chunks = []
+    # TODO chunks have tokens, but long text is a normal text, but may contain html that also gets weird after tokenization
     long_text = hit.source.get(f"{field_name}", "")
+    long_text = long_text.lower()
     inner_hits_field = f"embeddings.{field_name}.chunks"
     found_chunks = hit.inner_hits.get(inner_hits_field, {})
     if found_chunks:
@@ -340,15 +342,17 @@ def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1
             # cutting the middle because we may have tokenizing artifacts there
             chunk = chunk[3: -3]

-            # Find the start and end indices of the chunk in the large text
-            start_index = long_text.find(chunk)
-            if start_index != -1:  # Chunk is found
-                end_index = start_index + len(chunk)
-                pre_start_index = max(0, start_index - context_length)
-                post_end_index = min(len(long_text), end_index + context_length)
-                chunks_with_context.append(long_text[pre_start_index:post_end_index])
-
-    return '\n\n'.join(chunks_with_context)
+            if add_context:
+                # Find the start and end indices of the chunk in the large text
+                start_index = long_text.find(chunk[:20])
+                if start_index != -1:  # Chunk is found
+                    end_index = start_index + len(chunk)
+                    pre_start_index = max(0, start_index - context_length)
+                    post_end_index = min(len(long_text), end_index + context_length)
+                    chunks.append(long_text[pre_start_index:post_end_index])
+            else:
+                chunks.append(chunk)
+    return '\n\n'.join(chunks)


 def process_hit(hit: ElasticHitsResult) -> Union[Document, None]:
@@ -400,10 +404,12 @@ def process_hit(hit: ElasticHitsResult) -> Union[Document, None]:
     elif "candid-blog" in hit.index:
         excerpt = hit.source.get("excerpt", "")
         title = hit.source.get("title", "")
-        # we only need to process long texts
-        content_with_context_txt = get_context("content", hit, context_length=12)
+        # we only need to process long text
+        content_with_context_txt = get_context("content", hit, context_length=12, add_context=False)
+        authors = get_context("authors_text", hit, context_length=12, add_context=False)
+        tags = hit.source.get("title_summary_tags", "")
         doc = Document(
-            page_content='\n\n'.join([title, excerpt, content_with_context_txt]),
+            page_content='\n\n'.join([title, excerpt, content_with_context_txt, authors, tags]),
             metadata={
                 "title": title,
                 "source": "Candid Blog",
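
The query_builder hunks keep the existing multi-search layout: each branch appends an {"index": ...} header followed by the query body. A rough sketch of that alternating list, with placeholder index names and match_all bodies standing in for the real ElasticIndexMapping entries and sparse_vector queries, serialized the way Elasticsearch's _msearch endpoint expects:

import json

# Placeholder index names and match_all bodies stand in for ElasticIndexMapping
# entries and the sparse_vector queries built above.
issuelab_q = {"_source": {"excludes": ["embeddings"]}, "size": 1, "query": {"match_all": {}}}
youtube_q = {"_source": {"excludes": ["embeddings"]}, "size": 2, "query": {"match_all": {}}}

queries = []
queries.extend([{"index": "issuelab-elser"}, issuelab_q])
queries.extend([{"index": "youtube-elser"}, youtube_q])

# _msearch consumes newline-delimited JSON: header line, then body line, repeated
ndjson = "\n".join(json.dumps(item) for item in queries) + "\n"
print(ndjson)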