"""Build a FAISS similarity index from the PDF and TXT files under a folder.

Pipeline: walk a directory, extract the text of every ``.pdf`` / ``.txt``
file, dump the resulting records to JSON, embed their contents with a
SentenceTransformer model and store the vectors in a flat L2 FAISS index.
"""

import os
import json

import faiss
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer


def _ensure_parent_dir(file_path):
    """Create the directory that will contain *file_path* if it is missing."""
    parent = os.path.dirname(file_path)
    # A bare filename has no parent component; os.makedirs("") would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)


def _ocr_page(pdf_path, page_number):
    """Return the OCR text of one page (1-based) of the PDF at *pdf_path*.

    Raises:
        ImportError: if ``pdf2image`` or ``pytesseract`` is not installed.
    """
    # Imported lazily so that PDFs with a regular text layer can be processed
    # without the optional OCR stack being installed.
    from pdf2image import convert_from_path
    import pytesseract

    images = convert_from_path(
        pdf_path, first_page=page_number, last_page=page_number
    )
    return "".join(
        pytesseract.image_to_string(image, lang='por') for image in images
    )


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Pages for which ``extract_text()`` yields nothing are rendered to images
    and passed through OCR (Portuguese language data).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages that could be processed. An empty
        string is returned when the file cannot be opened or has no pages.
        Errors are reported with ``print`` and never propagated.
    """
    try:
        reader = PdfReader(pdf_path)
        if not reader.pages:
            print(f"O arquivo {pdf_path} não contém páginas.")
            return ""

        parts = []
        for i, page in enumerate(reader.pages):
            # A failure on one page must not discard the other pages.
            try:
                page_text = page.extract_text()
                if not page_text:
                    # No text layer on this page: fall back to OCR.
                    page_text = _ocr_page(pdf_path, i + 1)
                parts.append(page_text)
            except Exception as e:
                print(f"Erro ao processar a página {i+1} do arquivo {pdf_path}: {e}")
        return "".join(parts)
    except Exception as e:
        print(f"Erro ao abrir o arquivo {pdf_path}: {e}")
        return ""


def extract_text_from_txt(txt_path: str) -> str:
    """Read a UTF-8 text file and return its contents.

    Mirrors ``extract_text_from_pdf``: a file that cannot be opened or decoded
    is reported with ``print`` and yields an empty string, so that one bad
    file does not abort a whole directory walk.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The file contents, or ``""`` if the file could not be read.
    """
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            return f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Erro ao abrir o arquivo {txt_path}: {e}")
        return ""


def process_documents(directory: str) -> list:
    """Collect the text of every PDF and TXT file below *directory*.

    The extension check is case-insensitive, so ``.PDF`` and ``.TXT`` files
    are picked up as well. Files with any other extension are ignored.

    Args:
        directory: Root folder to walk recursively.

    Returns:
        A list of dicts with the keys ``filename``, ``content``, ``path`` and
        ``directory``, one per processed file.
    """
    documents = []
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            extension = os.path.splitext(file)[1].lower()
            if extension == ".pdf":
                text = extract_text_from_pdf(file_path)
            elif extension == ".txt":
                text = extract_text_from_txt(file_path)
            else:
                continue

            documents.append(
                {
                    "filename": file,
                    "content": text,
                    "path": file_path,
                    "directory": root,
                }
            )
    return documents


def save_documents_to_json(documents, output_file: str) -> None:
    """Write *documents* to *output_file* as indented, non-ASCII-escaped JSON.

    The parent directory of *output_file* is created when it does not exist.
    """
    _ensure_parent_dir(output_file)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(documents, f, ensure_ascii=False, indent=4)


def load_documents(json_file: str):
    """Load and return the JSON content of *json_file* (read as UTF-8)."""
    with open(json_file, "r", encoding="utf-8") as f:
        return json.load(f)


def create_faiss_index(documents, model_name='neuralmind/bert-base-portuguese-cased'):
    """Embed the documents' contents and index them in a flat L2 FAISS index.

    Args:
        documents: Sequence of dicts, each with a ``"content"`` text entry.
        model_name: Name or path handed to ``SentenceTransformer``.

    Returns:
        A ``(index, model)`` tuple. Vectors are added in the order of
        *documents*, so position ``i`` in the index is ``documents[i]``.

    Raises:
        ValueError: if *documents* is empty (there is nothing to index and
            the embedding dimension cannot be determined).
    """
    if not documents:
        raise ValueError("Nenhum documento para indexar.")

    model = SentenceTransformer(model_name)
    # Ask for NumPy output directly: converting a tensor with np.array()
    # fails when the model runs on a GPU.
    embeddings = model.encode(
        [doc["content"] for doc in documents], convert_to_numpy=True
    )
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, model


def save_index(index, file_path: str) -> None:
    """Write a FAISS index to *file_path*, creating its parent directory."""
    _ensure_parent_dir(file_path)
    faiss.write_index(index, file_path)


def load_index(file_path: str):
    """Read and return the FAISS index stored at *file_path*."""
    return faiss.read_index(file_path)


def main() -> None:
    """Index every document under ``data`` and persist the JSON and the index."""
    input_directory = "data"  # Root folder containing the documents
    output_json = "data/documents.json"  # Final JSON file
    # FAISS index; written by faiss.write_index, not pickle, despite ".pkl".
    output_index = "models/faiss_index.pkl"

    documents = process_documents(input_directory)
    save_documents_to_json(documents, output_json)

    index, _ = create_faiss_index(documents)
    save_index(index, output_index)


if __name__ == "__main__":
    main()