Update space
Browse files

utils/rag_retriever.py (CHANGED: +69 -34)
@@ -9,21 +9,24 @@ from nltk import sent_tokenize
 import nltk
 
 # Download the NLTK sentence tokenizer (needed only once)
+# try:
+#     print("tentanto encontrar o tokenizador de frases do NLTK...")
+#     nltk.data.find('tokenizers/punkt') or nltk.download('tokenizers/punkt_tab')
+# except nltk.downloader.DownloadError:
+#     print("Tokenizador de frases do NLTK não encontrado. Baixando...")
+#     nltk.download('punkt_tab')
+
+nltk.download("punkt")
 
 # Configuration
+RAG_DIR = r".\RAG"
+DATA_DIR = os.path.join(RAG_DIR, "data")
+FAISS_INDEX_DIR = os.path.join(RAG_DIR, "FAISS")  # Renamed from FAISS_DIR for clarity
+CONTEXT_FAISS_INDEX_PATH = os.path.join(FAISS_INDEX_DIR, "context_index.faiss")  # Renamed variable
+CONTEXT_JSON_TEXT_PATH = os.path.join(FAISS_INDEX_DIR, "context_texts.json")  # Renamed variable
+EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v2-moe"  # Renamed variable
+
 
 def _load_embedding_model() -> SentenceTransformer:
     """
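In practice `nltk.data.find` raises `LookupError` when a resource is missing, so the `except nltk.downloader.DownloadError` clause in the commented-out guard would not have caught the failure; the unconditional `nltk.download("punkt")` sidesteps the issue at the cost of a check on every import. A minimal sketch of a guard that downloads only when needed, assuming the standard NLTK data API:

import nltk

try:
    nltk.data.find("tokenizers/punkt")  # raises LookupError if the tokenizer is absent
except LookupError:
    nltk.download("punkt")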
@@ -38,6 +41,7 @@ def _load_embedding_model() -> SentenceTransformer:
     print(f"Carregando modelo de embeddings {EMBEDDING_MODEL_NAME}...")
     return SentenceTransformer(EMBEDDING_MODEL_NAME, trust_remote_code=True)
 
+
 def _load_existing_index_and_documents() -> tuple[list | None, faiss.Index | None]:
     """
     Attempts to load an existing FAISS index and its associated text documents
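A brief note on the loader above: `trust_remote_code=True` is needed because the nomic-ai embedding models ship custom modeling code in their repository, and the flag lets that repository's Python code execute locally, which is worth being aware of when pinning model versions.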
@@ -56,7 +60,7 @@ def _load_existing_index_and_documents() -> tuple[list | None, faiss.Index | Non
     print("Carregando índice e documentos existentes...")
     try:
         faiss_index = faiss.read_index(CONTEXT_FAISS_INDEX_PATH)
+        with open(CONTEXT_JSON_TEXT_PATH, "r", encoding="utf-8") as f:
             loaded_documents = json.load(f)
         print(f"Carregados {len(loaded_documents)} documentos do índice existente.")
         return loaded_documents, faiss_index
@@ -65,6 +69,7 @@ def _load_existing_index_and_documents() -> tuple[list | None, faiss.Index | Non
         return None, None
     return None, None
 
+
 def _load_source_documents() -> list[str]:
     """
     Loads and preprocesses text documents from the data folder (DATA_DIR).
@@ -81,16 +86,16 @@ def _load_source_documents() -> list[str]:
         ValueError: If no '.txt' files are found in the data directory
                     or if no valid documents are loaded after processing.
     """
+    file_paths = glob.glob(os.path.join(DATA_DIR, "*.txt"))
     if not file_paths:
         raise ValueError(f"Nenhum arquivo .txt encontrado em {DATA_DIR}. Por favor, adicione documentos.")
 
     context_chunks = []
     for file_path in file_paths:
         try:
+            with open(file_path, "r", encoding="utf-8") as f:
                 # Splits by double newline, strips whitespace, and filters out empty strings
+                context_chunks.extend(list(filter(None, map(str.strip, f.read().split("\n\n")))))
         except Exception as e:
             print(f"Erro ao ler o arquivo {file_path}: {e}")
             continue
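As a quick illustration of the chunking rule above (split on blank lines, strip whitespace, drop empties), with made-up text:

text = "Primeiro parágrafo.\n\n  Segundo parágrafo.  \n\n\n"
chunks = list(filter(None, map(str.strip, text.split("\n\n"))))
print(chunks)  # ['Primeiro parágrafo.', 'Segundo parágrafo.']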
@@ -101,6 +106,7 @@ def _load_source_documents() -> list[str]:
     print(f"Carregados {len(context_chunks)} documentos.")
     return context_chunks
 
+
 def _generate_text_embeddings(embedder_model: SentenceTransformer, text_documents: list[str]) -> np.ndarray:
     """
     Generates numerical embeddings for a list of text documents using the provided embedder.
@@ -123,9 +129,9 @@ def _generate_text_embeddings(embedder_model: SentenceTransformer, text_document
     batch_size = 32
     generated_embeddings_list = []
     for i in range(0, len(text_documents), batch_size):
+        batch = text_documents[i : i + batch_size]
         try:
+            if batch:  # Ensure the batch is not empty
                 generated_embeddings_list.extend(embedder_model.encode(batch, show_progress_bar=False))
         except Exception as e:
             print(f"Erro ao gerar embeddings para lote {i//batch_size if batch_size > 0 else i}: {e}")
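The `extend` call is what keeps one vector per document: each batch encode returns an array of shape (len(batch), d), and extending flattens it into a running list that the later `np.array(...)` stacks back into a 2-D matrix. A self-contained sketch of the same pattern, with a hypothetical stand-in for `embedder_model.encode`:

import numpy as np

def encode(batch):  # stand-in encoder, not the real model
    return np.ones((len(batch), 4), dtype=np.float32)

docs, batch_size, out = ["a", "b", "c"], 2, []
for i in range(0, len(docs), batch_size):
    out.extend(encode(docs[i : i + batch_size]))  # one 1-D vector per document
print(np.array(out, dtype=np.float32).shape)  # (3, 4)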
@@ -138,6 +144,7 @@ def _generate_text_embeddings(embedder_model: SentenceTransformer, text_document
 
     return np.array(generated_embeddings_list, dtype=np.float32)
 
+
 def _create_faiss_index(document_embeddings: np.ndarray) -> faiss.Index:
     """
     Creates and populates a FAISS (Facebook AI Similarity Search) index from a set of embeddings.
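Only the tail of `_create_faiss_index` is visible in this hunk; the construction of the index itself is elided. A sketch of the conventional flat L2 setup such a helper usually performs (an assumption, since the actual index type is not shown in the diff):

dimension = document_embeddings.shape[1]    # width of each embedding vector
faiss_index = faiss.IndexFlatL2(dimension)  # exact L2 search, no training step required
faiss_index.add(document_embeddings)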
@@ -159,6 +166,7 @@ def _create_faiss_index(document_embeddings: np.ndarray) -> faiss.Index:
     faiss_index.add(document_embeddings)
     return faiss_index
 
+
 def initialize_rag_system() -> tuple[list[str], faiss.Index, SentenceTransformer]:
     """
     Initializes the complete RAG (Retrieval Augmented Generation) system.
@@ -181,20 +189,27 @@ def initialize_rag_system() -> tuple[list[str], faiss.Index, SentenceTransformer
     text_embedder = _load_embedding_model()
     context_documents, faiss_index = _load_existing_index_and_documents()
 
+    if faiss_index is None:  # If the index doesn't exist or an error occurred loading it, rebuild
         print("Índice FAISS não encontrado ou corrompido. Reconstruindo...")
         context_documents = _load_source_documents()
         document_embeddings = _generate_text_embeddings(text_embedder, context_documents)
         faiss_index = _create_faiss_index(document_embeddings)
 
         faiss.write_index(faiss_index, CONTEXT_FAISS_INDEX_PATH)
+        with open(CONTEXT_JSON_TEXT_PATH, "w", encoding="utf-8") as f:
+            json.dump(context_documents, f, ensure_ascii=False, indent=4)  # Added indent for readability
         print("Novo índice e documentos salvos com sucesso.")
 
     return context_documents, faiss_index, text_embedder
 
+
+def search_with_full_query(
+    full_question_text: str,
+    context_documents: list[str],
+    faiss_index: faiss.Index,
+    embedder_model: SentenceTransformer,
+    k_results: int = 3,
+) -> list[tuple[int, str, float]]:
     """
     Searches for the 'k_results' most relevant documents for the **entire question**,
     treating it as a single search unit. This function does not segment the question into sentences.
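One caveat on the save path: both `faiss.write_index` and `open(..., "w")` assume `FAISS_INDEX_DIR` already exists. If the folder may be missing on a first run, a guard like the following (an addition, not part of the diff) avoids the failure:

os.makedirs(FAISS_INDEX_DIR, exist_ok=True)  # ensure the FAISS folder exists before writing

With that in place, startup reduces to a single call: `context_documents, faiss_index, text_embedder = initialize_rag_system()`.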
@@ -238,7 +253,14 @@ def search_with_full_query(full_question_text: str, context_documents: list[str]
         print(f"Erro ao buscar contexto completo: {e}")
         return []
 
+
+def search_with_multiple_sentences(
+    question_text: str,
+    context_documents: list[str],
+    faiss_index: faiss.Index,
+    embedder_model: SentenceTransformer,
+    k_per_sentence: int = 2,
+) -> list[tuple[int, str, float]]:
     """
     Segments the question into sentences and searches for the 'k_per_sentence' most relevant
     documents for **EACH sentence**, then consolidates and returns only unique contexts.
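The body of `search_with_full_query` between its docstring and the error handler above is unchanged and therefore elided from the diff. Judging from the signature, the return type, and the sibling function below, it presumably follows this shape (a sketch, not the file's actual code):

query_embedding = np.array(embedder_model.encode([full_question_text]), dtype=np.float32)
distances, indices = faiss_index.search(query_embedding, k_results)
return [
    (int(idx), context_documents[int(idx)], float(dist))
    for dist, idx in zip(distances[0], indices[0])
    if 0 <= int(idx) < len(context_documents)
]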
@@ -264,7 +286,7 @@ def search_with_multiple_sentences(question_text: str, context_documents: list[s
 
     print(f"Buscando múltiplos contextos para: '{question_text}'")
 
+    sentences = sent_tokenize(question_text, language="portuguese")
     if not sentences:
         print("Nenhuma frase detectada na pergunta para busca de múltiplos contextos.")
         return []
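`sent_tokenize(..., language="portuguese")` relies on the Portuguese Punkt model bundled with the "punkt" package downloaded at import time; a quick sanity check:

from nltk import sent_tokenize
print(sent_tokenize("Qual é o horário? Onde fica a unidade?", language="portuguese"))
# ['Qual é o horário?', 'Onde fica a unidade?']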
@@ -277,7 +299,7 @@ def search_with_multiple_sentences(question_text: str, context_documents: list[s
     try:
         for sentence in sentences:
             print(f"Processando frase para múltiplos contextos: '{sentence}'")
+            if not sentence.strip():  # Skip empty sentences that might be produced by sent_tokenize
                 continue
             query_embedding = np.array(embedder_model.encode([sentence]), dtype=np.float32)
             distances, indices = faiss_index.search(query_embedding, k_per_sentence)
@@ -288,8 +310,15 @@ def search_with_multiple_sentences(question_text: str, context_documents: list[s
 
                 if 0 <= document_index < len(context_documents):
                     # If the document has already been found, update if the new distance is smaller (more relevant)
+                    if (
+                        document_index not in consolidated_contexts_map
+                        or distance_score < consolidated_contexts_map[document_index][2]
+                    ):
+                        consolidated_contexts_map[document_index] = (
+                            document_index,
+                            context_documents[document_index],
+                            distance_score,
+                        )
 
         # Convert the dictionary of consolidated contexts back to a list
         results_list = list(consolidated_contexts_map.values())
print(f"Erro ao buscar múltiplos contextos: {e}")
|
332 |
return []
|
333 |
|
334 |
+
|
335 |
# --- Funções de Teste ---
|
336 |
def test_context_search_interactive():
|
337 |
"""
|
|
|
349 |
|
350 |
while True:
|
351 |
user_question = input("\nDigite uma pergunta (ou 'sair' para encerrar): ")
|
352 |
+
if user_question.lower() == "sair":
|
353 |
break
|
354 |
|
355 |
print("\nEscolha o tipo de busca:")
|
|
|
358 |
search_choice = input("Opção (1 ou 2): ")
|
359 |
|
360 |
retrieved_contexts = []
|
361 |
+
if search_choice == "1":
|
362 |
print(f"\nRealizando busca de contexto completo para: '{user_question}'")
|
363 |
+
retrieved_contexts = search_with_full_query(
|
364 |
+
user_question, context_documents, faiss_index, text_embedder, k_results=5
|
365 |
+
)
|
366 |
+
elif search_choice == "2":
|
367 |
print(f"\nRealizando busca de múltiplos contextos para: '{user_question}'")
|
368 |
+
retrieved_contexts = search_with_multiple_sentences(
|
369 |
+
user_question, context_documents, faiss_index, text_embedder, k_per_sentence=3
|
370 |
+
)
|
371 |
else:
|
372 |
print("Opção inválida. Tente novamente.")
|
373 |
continue
|
|
|
379 |
print("\nContextos mais relevantes:")
|
380 |
for doc_idx, text_content, distance_score in retrieved_contexts:
|
381 |
print(f"\nÍndice Original do Documento: {doc_idx}, Distância: {distance_score:.4f}")
|
382 |
+
print(f"Texto: {text_content[:500]}...") # Limita o texto para melhor visualização
|
383 |
print("-" * 50)
|
384 |
|
385 |
+
|
386 |
if __name__ == "__main__":
|
387 |
+
test_context_search_interactive()
|