import os
import json
import faiss
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

def extract_text_from_pdf(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        if not reader.pages:
            print(f"The file {pdf_path} contains no pages.")
            return ""
        text = ""
        for i, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text()
                if page_text:
                    text += page_text
                else:
                    # Fall back to OCR for pages with no extractable text layer
                    images = convert_from_path(pdf_path, first_page=i + 1, last_page=i + 1)
                    for image in images:
                        text += pytesseract.image_to_string(image, lang='por')
            except Exception as e:
                print(f"Error processing page {i+1} of file {pdf_path}: {e}")
        return text
    except Exception as e:
        print(f"Error opening file {pdf_path}: {e}")
        return ""

def extract_text_from_txt(txt_path):
    with open(txt_path, "r", encoding="utf-8") as f:
        return f.read()

def process_documents(directory):
    documents = []
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith(".pdf"):
                text = extract_text_from_pdf(file_path)
            elif file.endswith(".txt"):
                text = extract_text_from_txt(file_path)
            else:
                continue
            document = {
                "filename": file,
                "content": text,
                "path": file_path,
                "directory": root
            }
            documents.append(document)
    return documents

def save_documents_to_json(documents, output_file):
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(documents, f, ensure_ascii=False, indent=4)

def load_documents(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        return json.load(f)

def create_faiss_index(documents):
    model = SentenceTransformer('neuralmind/bert-base-portuguese-cased')
    # Encode directly to a NumPy array: FAISS expects a float32 matrix,
    # and the previous torch-tensor output could not be passed to index.add() as-is.
    embeddings = model.encode([doc["content"] for doc in documents], convert_to_numpy=True)
    embeddings = np.asarray(embeddings, dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, model

def save_index(index, file_path):
    faiss.write_index(index, file_path)

def load_index(file_path):
    return faiss.read_index(file_path)

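# A minimal retrieval sketch showing how the index built above could be
# queried. This helper is not part of the original script: the name
# `search_documents` and the default k=5 are illustrative assumptions.
# IndexFlatL2 returns squared L2 distances, so smaller scores mean closer matches.
def search_documents(query, index, model, documents, k=5):
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(np.asarray(query_embedding, dtype="float32"), k)
    # FAISS pads with -1 indices when fewer than k vectors exist
    return [(documents[i]["filename"], float(d))
            for i, d in zip(indices[0], distances[0]) if i >= 0]
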
if __name__ == "__main__":
    input_directory = "data"  # Root folder with the documents
    output_json = "data/documents.json"  # Final JSON file
    output_index = "models/faiss_index.pkl"  # FAISS index

    # Ensure the output directory exists; faiss.write_index fails otherwise
    os.makedirs("models", exist_ok=True)

    documents = process_documents(input_directory)
    save_documents_to_json(documents, output_json)
    index, _ = create_faiss_index(documents)
    save_index(index, output_index)
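
    # Sketch of how the saved artifacts could be reloaded and queried in a
    # later session (hypothetical usage; assumes the search_documents helper
    # above, and the query string is only an example):
    # docs = load_documents(output_json)
    # idx = load_index(output_index)
    # model = SentenceTransformer('neuralmind/bert-base-portuguese-cased')
    # print(search_documents("exemplo de consulta", idx, model, docs))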