import os
import json
import faiss
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

def extract_text_from_pdf(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        if not reader.pages:
            print(f"The file {pdf_path} contains no pages.")
            return ""
        text = ""
        for i, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text()
                if page_text:
                    text += page_text
                else:
                    # Fall back to OCR for pages with no extractable text layer
                    images = convert_from_path(pdf_path, first_page=i + 1, last_page=i + 1)
                    for image in images:
                        text += pytesseract.image_to_string(image, lang='por')
            except Exception as e:
                print(f"Error processing page {i+1} of file {pdf_path}: {e}")
        return text
    except Exception as e:
        print(f"Error opening file {pdf_path}: {e}")
        return ""

def extract_text_from_txt(txt_path):
    with open(txt_path, "r", encoding="utf-8") as f:
        return f.read()

def process_documents(directory):
    documents = []
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith(".pdf"):
                text = extract_text_from_pdf(file_path)
            elif file.endswith(".txt"):
                text = extract_text_from_txt(file_path)
            else:
                continue
            document = {
                "filename": file,
                "content": text,
                "path": file_path,
                "directory": root
            }
            documents.append(document)
    return documents

def save_documents_to_json(documents, output_file):
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(documents, f, ensure_ascii=False, indent=4)

def load_documents(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        return json.load(f)

def create_faiss_index(documents):
    model = SentenceTransformer('neuralmind/bert-base-portuguese-cased')
    # Encode directly to a NumPy array: FAISS expects a float32 matrix,
    # and the previous torch-tensor output could not be passed to index.add() as-is.
    embeddings = model.encode([doc["content"] for doc in documents], convert_to_numpy=True)
    embeddings = np.asarray(embeddings, dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, model

def save_index(index, file_path):
    faiss.write_index(index, file_path)

def load_index(file_path):
    return faiss.read_index(file_path)

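# A minimal retrieval sketch showing how the index built above could be
# queried. This helper is not part of the original script: the name
# `search_documents` and the default k=5 are illustrative assumptions.
# IndexFlatL2 returns squared L2 distances, so smaller scores mean closer matches.
def search_documents(query, index, model, documents, k=5):
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(np.asarray(query_embedding, dtype="float32"), k)
    # FAISS pads with -1 indices when fewer than k vectors exist
    return [(documents[i]["filename"], float(d))
            for i, d in zip(indices[0], distances[0]) if i >= 0]
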
if __name__ == "__main__":
    input_directory = "data"  # Root folder with the documents
    output_json = "data/documents.json"  # Final JSON file
    output_index = "models/faiss_index.pkl"  # FAISS index

    # Ensure the output directory exists; faiss.write_index fails otherwise
    os.makedirs("models", exist_ok=True)

    documents = process_documents(input_directory)
    save_documents_to_json(documents, output_json)
    index, _ = create_faiss_index(documents)
    save_index(index, output_index)
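
    # Sketch of how the saved artifacts could be reloaded and queried in a
    # later session (hypothetical usage; assumes the search_documents helper
    # above, and the query string is only an example):
    # docs = load_documents(output_json)
    # idx = load_index(output_index)
    # model = SentenceTransformer('neuralmind/bert-base-portuguese-cased')
    # print(search_documents("exemplo de consulta", idx, model, docs))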