Nielo47 committed
Commit c3c6fa1 · 1 Parent(s): fbbd0f1

Update space

Files changed (1)
  1. utils/rag_retriever.py +69 -34
utils/rag_retriever.py CHANGED
@@ -9,21 +9,24 @@ from nltk import sent_tokenize
 import nltk
 
 # Baixar o tokenizador de frases do NLTK (necessário apenas uma vez)
-try:
-    print("tentanto encontrar o tokenizador de frases do NLTK...")
-    nltk.data.find('tokenizers/punkt') or nltk.download('tokenizers/punkt_tab')
-except nltk.downloader.DownloadError:
-    print("Tokenizador de frases do NLTK não encontrado. Baixando...")
-    nltk.download('punkt_tab')
+# try:
+#     print("tentanto encontrar o tokenizador de frases do NLTK...")
+#     nltk.data.find('tokenizers/punkt') or nltk.download('tokenizers/punkt_tab')
+# except nltk.downloader.DownloadError:
+#     print("Tokenizador de frases do NLTK não encontrado. Baixando...")
+#     nltk.download('punkt_tab')
+
+nltk.download("punkt")
 
 # Configurações
 # Configurações
-RAG_DIR = r'.\RAG'
-DATA_DIR = os.path.join(RAG_DIR, 'data')
-FAISS_INDEX_DIR = os.path.join(RAG_DIR, 'FAISS') # Renamed from FAISS_DIR for clarity
-CONTEXT_FAISS_INDEX_PATH = os.path.join(FAISS_INDEX_DIR, 'context_index.faiss') # Renamed variable
-CONTEXT_JSON_TEXT_PATH = os.path.join(FAISS_INDEX_DIR, 'context_texts.json') # Renamed variable
-EMBEDDING_MODEL_NAME = 'nomic-ai/nomic-embed-text-v2-moe' # Renamed variable
+RAG_DIR = r".\RAG"
+DATA_DIR = os.path.join(RAG_DIR, "data")
+FAISS_INDEX_DIR = os.path.join(RAG_DIR, "FAISS")  # Renamed from FAISS_DIR for clarity
+CONTEXT_FAISS_INDEX_PATH = os.path.join(FAISS_INDEX_DIR, "context_index.faiss")  # Renamed variable
+CONTEXT_JSON_TEXT_PATH = os.path.join(FAISS_INDEX_DIR, "context_texts.json")  # Renamed variable
+EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v2-moe"  # Renamed variable
+
 
 def _load_embedding_model() -> SentenceTransformer:
     """
@@ -38,6 +41,7 @@ def _load_embedding_model() -> SentenceTransformer:
     print(f"Carregando modelo de embeddings {EMBEDDING_MODEL_NAME}...")
     return SentenceTransformer(EMBEDDING_MODEL_NAME, trust_remote_code=True)
 
+
 def _load_existing_index_and_documents() -> tuple[list | None, faiss.Index | None]:
     """
     Attempts to load an existing FAISS index and its associated text documents
@@ -56,7 +60,7 @@ def _load_existing_index_and_documents() -> tuple[list | None, faiss.Index | Non
     print("Carregando índice e documentos existentes...")
     try:
         faiss_index = faiss.read_index(CONTEXT_FAISS_INDEX_PATH)
-        with open(CONTEXT_JSON_TEXT_PATH, 'r', encoding='utf-8') as f:
+        with open(CONTEXT_JSON_TEXT_PATH, "r", encoding="utf-8") as f:
             loaded_documents = json.load(f)
         print(f"Carregados {len(loaded_documents)} documentos do índice existente.")
         return loaded_documents, faiss_index
@@ -65,6 +69,7 @@ def _load_existing_index_and_documents() -> tuple[list | None, faiss.Index | Non
         return None, None
     return None, None
 
+
 def _load_source_documents() -> list[str]:
     """
     Loads and preprocesses text documents from the data folder (DATA_DIR).
@@ -81,16 +86,16 @@ def _load_source_documents() -> list[str]:
         ValueError: If no '.txt' files are found in the data directory
             or if no valid documents are loaded after processing.
     """
-    file_paths = glob.glob(os.path.join(DATA_DIR, '*.txt'))
+    file_paths = glob.glob(os.path.join(DATA_DIR, "*.txt"))
     if not file_paths:
         raise ValueError(f"Nenhum arquivo .txt encontrado em {DATA_DIR}. Por favor, adicione documentos.")
 
     context_chunks = []
     for file_path in file_paths:
         try:
-            with open(file_path, 'r', encoding='utf-8') as f:
+            with open(file_path, "r", encoding="utf-8") as f:
                 # Splits by double newline, strips whitespace, and filters out empty strings
-                context_chunks.extend(list(filter(None, map(str.strip, f.read().split('\n\n')))))
+                context_chunks.extend(list(filter(None, map(str.strip, f.read().split("\n\n")))))
         except Exception as e:
             print(f"Erro ao ler o arquivo {file_path}: {e}")
             continue
@@ -101,6 +106,7 @@ def _load_source_documents() -> list[str]:
     print(f"Carregados {len(context_chunks)} documentos.")
     return context_chunks
 
+
 def _generate_text_embeddings(embedder_model: SentenceTransformer, text_documents: list[str]) -> np.ndarray:
     """
     Generates numerical embeddings for a list of text documents using the provided embedder.
@@ -123,9 +129,9 @@ def _generate_text_embeddings(embedder_model: SentenceTransformer, text_document
     batch_size = 32
     generated_embeddings_list = []
     for i in range(0, len(text_documents), batch_size):
-        batch = text_documents[i:i + batch_size]
+        batch = text_documents[i : i + batch_size]
         try:
-            if batch: # Ensure the batch is not empty
+            if batch:  # Ensure the batch is not empty
                 generated_embeddings_list.extend(embedder_model.encode(batch, show_progress_bar=False))
         except Exception as e:
             print(f"Erro ao gerar embeddings para lote {i//batch_size if batch_size > 0 else i}: {e}")
@@ -138,6 +144,7 @@ def _generate_text_embeddings(embedder_model: SentenceTransformer, text_document
 
     return np.array(generated_embeddings_list, dtype=np.float32)
 
+
 def _create_faiss_index(document_embeddings: np.ndarray) -> faiss.Index:
     """
     Creates and populates a FAISS (Facebook AI Similarity Search) index from a set of embeddings.
@@ -159,6 +166,7 @@ def _create_faiss_index(document_embeddings: np.ndarray) -> faiss.Index:
     faiss_index.add(document_embeddings)
     return faiss_index
 
+
 def initialize_rag_system() -> tuple[list[str], faiss.Index, SentenceTransformer]:
     """
     Initializes the complete RAG (Retrieval Augmented Generation) system.
@@ -181,20 +189,27 @@ def initialize_rag_system() -> tuple[list[str], faiss.Index, SentenceTransformer
     text_embedder = _load_embedding_model()
     context_documents, faiss_index = _load_existing_index_and_documents()
 
-    if faiss_index is None: # If the index doesn't exist or an error occurred loading it, rebuild
+    if faiss_index is None:  # If the index doesn't exist or an error occurred loading it, rebuild
         print("Índice FAISS não encontrado ou corrompido. Reconstruindo...")
         context_documents = _load_source_documents()
        document_embeddings = _generate_text_embeddings(text_embedder, context_documents)
         faiss_index = _create_faiss_index(document_embeddings)
 
         faiss.write_index(faiss_index, CONTEXT_FAISS_INDEX_PATH)
-        with open(CONTEXT_JSON_TEXT_PATH, 'w', encoding='utf-8') as f:
-            json.dump(context_documents, f, ensure_ascii=False, indent=4) # Added indent for readability
+        with open(CONTEXT_JSON_TEXT_PATH, "w", encoding="utf-8") as f:
+            json.dump(context_documents, f, ensure_ascii=False, indent=4)  # Added indent for readability
         print("Novo índice e documentos salvos com sucesso.")
 
     return context_documents, faiss_index, text_embedder
 
-def search_with_full_query(full_question_text: str, context_documents: list[str], faiss_index: faiss.Index, embedder_model: SentenceTransformer, k_results: int = 3) -> list[tuple[int, str, float]]:
+
+def search_with_full_query(
+    full_question_text: str,
+    context_documents: list[str],
+    faiss_index: faiss.Index,
+    embedder_model: SentenceTransformer,
+    k_results: int = 3,
+) -> list[tuple[int, str, float]]:
     """
     Searches for the 'k_results' most relevant documents for the **entire question**,
     treating it as a single search unit. This function does not segment the question into sentences.
@@ -238,7 +253,14 @@ def search_with_full_query(full_question_text: str, context_documents: list[str]
         print(f"Erro ao buscar contexto completo: {e}")
         return []
 
-def search_with_multiple_sentences(question_text: str, context_documents: list[str], faiss_index: faiss.Index, embedder_model: SentenceTransformer, k_per_sentence: int = 2) -> list[tuple[int, str, float]]:
+
+def search_with_multiple_sentences(
+    question_text: str,
+    context_documents: list[str],
+    faiss_index: faiss.Index,
+    embedder_model: SentenceTransformer,
+    k_per_sentence: int = 2,
+) -> list[tuple[int, str, float]]:
     """
     Segments the question into sentences and searches for the 'k_per_sentence' most relevant
     documents for **EACH sentence**, then consolidates and returns only unique contexts.
@@ -264,7 +286,7 @@ def search_with_multiple_sentences(question_text: str, context_documents: list[s
 
     print(f"Buscando múltiplos contextos para: '{question_text}'")
 
-    sentences = sent_tokenize(question_text, language='portuguese')
+    sentences = sent_tokenize(question_text, language="portuguese")
     if not sentences:
         print("Nenhuma frase detectada na pergunta para busca de múltiplos contextos.")
         return []
@@ -277,7 +299,7 @@ def search_with_multiple_sentences(question_text: str, context_documents: list[s
     try:
         for sentence in sentences:
             print(f"Processando frase para múltiplos contextos: '{sentence}'")
-            if not sentence.strip(): # Skip empty sentences that might be produced by sent_tokenize
+            if not sentence.strip():  # Skip empty sentences that might be produced by sent_tokenize
                 continue
             query_embedding = np.array(embedder_model.encode([sentence]), dtype=np.float32)
             distances, indices = faiss_index.search(query_embedding, k_per_sentence)
@@ -288,8 +310,15 @@ def search_with_multiple_sentences(question_text: str, context_documents: list[s
 
                 if 0 <= document_index < len(context_documents):
                     # If the document has already been found, update if the new distance is smaller (more relevant)
-                    if document_index not in consolidated_contexts_map or distance_score < consolidated_contexts_map[document_index][2]:
-                        consolidated_contexts_map[document_index] = (document_index, context_documents[document_index], distance_score)
+                    if (
+                        document_index not in consolidated_contexts_map
+                        or distance_score < consolidated_contexts_map[document_index][2]
+                    ):
+                        consolidated_contexts_map[document_index] = (
+                            document_index,
+                            context_documents[document_index],
+                            distance_score,
+                        )
 
         # Convert the dictionary of consolidated contexts back to a list
         results_list = list(consolidated_contexts_map.values())
@@ -302,6 +331,7 @@ def search_with_multiple_sentences(question_text: str, context_documents: list[s
         print(f"Erro ao buscar múltiplos contextos: {e}")
         return []
 
+
 # --- Funções de Teste ---
 def test_context_search_interactive():
     """
@@ -319,7 +349,7 @@ def test_context_search_interactive():
 
     while True:
         user_question = input("\nDigite uma pergunta (ou 'sair' para encerrar): ")
-        if user_question.lower() == 'sair':
+        if user_question.lower() == "sair":
             break
 
         print("\nEscolha o tipo de busca:")
@@ -328,12 +358,16 @@ def test_context_search_interactive():
         search_choice = input("Opção (1 ou 2): ")
 
         retrieved_contexts = []
-        if search_choice == '1':
+        if search_choice == "1":
            print(f"\nRealizando busca de contexto completo para: '{user_question}'")
-            retrieved_contexts = search_with_full_query(user_question, context_documents, faiss_index, text_embedder, k_results=5)
-        elif search_choice == '2':
+            retrieved_contexts = search_with_full_query(
+                user_question, context_documents, faiss_index, text_embedder, k_results=5
+            )
+        elif search_choice == "2":
             print(f"\nRealizando busca de múltiplos contextos para: '{user_question}'")
-            retrieved_contexts = search_with_multiple_sentences(user_question, context_documents, faiss_index, text_embedder, k_per_sentence=3)
+            retrieved_contexts = search_with_multiple_sentences(
+                user_question, context_documents, faiss_index, text_embedder, k_per_sentence=3
+            )
         else:
             print("Opção inválida. Tente novamente.")
             continue
@@ -345,8 +379,9 @@ def test_context_search_interactive():
         print("\nContextos mais relevantes:")
         for doc_idx, text_content, distance_score in retrieved_contexts:
             print(f"\nÍndice Original do Documento: {doc_idx}, Distância: {distance_score:.4f}")
-            print(f"Texto: {text_content[:500]}...") # Limita o texto para melhor visualização
+            print(f"Texto: {text_content[:500]}...")  # Limita o texto para melhor visualização
             print("-" * 50)
 
+
 if __name__ == "__main__":
-    test_context_search_interactive()
+    test_context_search_interactive()
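Note on the NLTK change: the commit drops the old try/except lookup and calls nltk.download("punkt") unconditionally at import time, so the tokenizer data is fetched on every cold start. The old guard likely never worked as intended, because nltk.data.find() raises LookupError when a resource is missing, while the except clause only caught nltk.downloader.DownloadError. If repeated downloads ever become a concern, a guarded variant along the following lines is a common pattern; this is a sketch for reference, not part of the commit.

import nltk

# Download the sentence tokenizer only when it is not already installed.
# nltk.data.find() raises LookupError for missing resources.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")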
 
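For context, here is a minimal usage sketch of the refactored module, assuming it can be imported as utils.rag_retriever and that RAG/data contains at least one .txt file. The function names, signatures, and the (index, text, distance) result tuples come from the diff above; the example question is made up.

from utils.rag_retriever import (
    initialize_rag_system,
    search_with_full_query,
    search_with_multiple_sentences,
)

# Load or rebuild the FAISS index, the chunked documents, and the embedding model.
context_documents, faiss_index, embedder = initialize_rag_system()

question = "O que é RAG? Como o índice FAISS é construído?"  # hypothetical example question

# Whole-question search: the entire question is embedded as one unit.
full_hits = search_with_full_query(question, context_documents, faiss_index, embedder, k_results=3)

# Per-sentence search: each sentence is searched separately and duplicate hits are consolidated.
sentence_hits = search_with_multiple_sentences(question, context_documents, faiss_index, embedder, k_per_sentence=2)

for doc_idx, text, distance in full_hits + sentence_hits:
    print(f"[{doc_idx}] dist={distance:.4f} :: {text[:80]}")

Both search functions return plain tuples rather than custom objects, so the two result sets can be merged or re-ranked however the caller needs.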