Spaces:
Running
Running
from pathlib import Path | |
import argparse | |
import sys | |
import os | |
from langchain_community.document_loaders import TextLoader, PyPDFLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.vectorstores import FAISS | |
from langchain_community.embeddings import HuggingFaceEmbeddings | |
import os | |
from dotenv import load_dotenv | |
# Read a local .env file if one exists, so local runs keep working.
load_dotenv()

# Credentials and model names taken from the environment (None when unset).
HF_API_TOKEN = os.getenv("HUGGING_FACE_API_TOKEN")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
EMBED_MODEL_NAME = os.getenv("HUGGING_FACE_EMBEDDING_MODEL")
LLM_MODEL_NAME = os.getenv("LLM_MODEL")

# Filesystem layout, relative to the directory that contains this file.
ROOT_DIR = Path(__file__).parent
INDEX_DIR = ROOT_DIR / "data_index"
DATA_DIR = ROOT_DIR / "data"
def load_documents(data_dir: Path) -> list:
    """Load every supported document found under *data_dir*.

    The directory is walked recursively. Files ending in ``.txt`` or ``.md``
    (case-insensitive) are read with ``TextLoader`` as UTF-8 and ``.pdf``
    files with ``PyPDFLoader``; any other file is ignored. A file whose
    loader raises is reported on stderr and skipped, so a single unreadable
    file does not abort the whole run.

    Paths are visited in sorted order so the result does not depend on the
    order in which the filesystem happens to enumerate directory entries.

    Args:
        data_dir: Root directory to search.

    Returns:
        The documents produced by the loaders, in sorted path order.

    Raises:
        RuntimeError: If no document could be loaded from *data_dir*.
    """
    docs = []
    for path in sorted(data_dir.rglob("*")):
        if path.is_dir():
            continue
        suffix = path.suffix.lower()
        try:
            if suffix in (".txt", ".md"):
                docs.extend(TextLoader(str(path), encoding="utf-8").load())
            elif suffix == ".pdf":
                docs.extend(PyPDFLoader(str(path)).load())
        except Exception as e:
            # Deliberate best-effort: report the failing file and carry on.
            print(f"[skip] {path.name}: {e}", file=sys.stderr)
    if not docs:
        raise RuntimeError(f"No documents found in {data_dir}. Put .txt/.md/.pdf files there.")
    return docs
def build_vectorstore(docs, *, chunk_size: int = 800, chunk_overlap: int = 120):
    """Split *docs* into overlapping chunks and index them with FAISS.

    Args:
        docs: Documents to index, as returned by ``load_documents``.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as its
            ``chunk_size``. Defaults to the previously hard-coded 800.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as its
            ``chunk_overlap``. Defaults to the previously hard-coded 120.

    Returns:
        The vector store built by ``FAISS.from_documents`` from the chunks,
        embedded with the model named by ``EMBED_MODEL_NAME``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(docs)
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
    return FAISS.from_documents(chunks, embeddings)
def main():
    """Build the FAISS index from ``DATA_DIR`` and save it under ``INDEX_DIR``.

    Loads the documents, builds the vector store, writes it to ``INDEX_DIR``
    (created if missing) and records the embedding model name next to it in
    ``embeddings_model.txt``.

    Raises:
        RuntimeError: If the embedding model name is not configured, or if
            no documents could be loaded.
    """
    parser = argparse.ArgumentParser(description="Ingest documents and build FAISS index.")
    # No options are defined; parsing still provides --help and rejects
    # unexpected command-line arguments.
    parser.parse_args()

    # Fail before any expensive work: without a model name the final
    # write_text() call below cannot succeed anyway.
    if not EMBED_MODEL_NAME:
        raise RuntimeError(
            "HUGGING_FACE_EMBEDDING_MODEL is not set or is empty; "
            "set it to the embedding model name before building the index."
        )

    print(f"Loading documents from {DATA_DIR}")
    docs = load_documents(DATA_DIR)
    print(f"Loaded {len(docs)} documents. Building index…")
    vs = build_vectorstore(docs)

    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    vs.save_local(str(INDEX_DIR))
    # Persist embedding model name for safety
    (INDEX_DIR / "embeddings_model.txt").write_text(EMBED_MODEL_NAME, encoding="utf-8")
    print(f"Index saved to {INDEX_DIR.resolve()}")


if __name__ == "__main__":
    main()