# PersonalChatbot / src / ingest.py
# Maheen Saleh
# updated proj structure
# 4a3a2c0
from pathlib import Path
import argparse
import sys
import os
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
from dotenv import load_dotenv
# Read a local .env file (if one exists) so the os.getenv lookups below also
# work during local development.
load_dotenv()

# Settings read from environment variables; each is None when the variable
# is not set.
HF_API_TOKEN = os.getenv("HUGGING_FACE_API_TOKEN")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
EMBED_MODEL_NAME = os.getenv("HUGGING_FACE_EMBEDDING_MODEL")
LLM_MODEL_NAME = os.getenv("LLM_MODEL")

# Filesystem layout, anchored to the directory that contains this file.
ROOT_DIR = Path(__file__).parent
INDEX_DIR = ROOT_DIR / "data_index"  # destination of the saved FAISS index
DATA_DIR = ROOT_DIR / "data"  # source documents to ingest
def load_documents(data_dir: Path):
    """Recursively load every .txt, .md and .pdf file under *data_dir*.

    Text and Markdown files are read with ``TextLoader`` (UTF-8); PDFs are
    read with ``PyPDFLoader``. Files with any other suffix are ignored. A
    file whose loader raises is reported on stderr and skipped, so one bad
    file does not abort the whole ingest.

    Args:
        data_dir: Directory to scan, including all subdirectories.

    Returns:
        A list with the documents produced by the loaders.

    Raises:
        RuntimeError: If no document could be loaded at all.
    """
    loaded = []
    for entry in data_dir.rglob("*"):
        if path_is_directory := entry.is_dir():
            continue
        extension = entry.suffix.lower()
        try:
            if extension == ".pdf":
                loader = PyPDFLoader(str(entry))
            elif extension in (".txt", ".md"):
                loader = TextLoader(str(entry), encoding="utf-8")
            else:
                # Unsupported file type: nothing to load.
                continue
            loaded.extend(loader.load())
        except Exception as err:
            # Best-effort ingest: report the failure and move on.
            print(f"[skip] {entry.name}: {err}", file=sys.stderr)

    if loaded:
        return loaded
    raise RuntimeError(f"No documents found in {data_dir}. Put .txt/.md/.pdf files there.")
def build_vectorstore(docs, *, chunk_size: int = 800, chunk_overlap: int = 120):
    """Split *docs* into chunks, embed them, and return a FAISS vector store.

    The embedding model is the one named by the module-level
    ``EMBED_MODEL_NAME``.

    Args:
        docs: Documents to index (as returned by ``load_documents``).
        chunk_size: Maximum chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The FAISS vector store built from the chunks.

    Raises:
        ValueError: If *docs* is empty or splitting yields no chunks.
    """
    if not docs:
        raise ValueError("Cannot build a vector store from an empty document list.")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(docs)
    if not chunks:
        # Documents without extractable text produce no chunks; report that
        # clearly rather than handing an empty list to FAISS.
        raise ValueError("Documents produced no text chunks; nothing to index.")

    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
    return FAISS.from_documents(chunks, embeddings)
def main():
    """Command-line entry point: load documents, build the index, save it.

    Options:
        --data-dir   Directory scanned for documents (default: ``DATA_DIR``).
        --index-dir  Directory the index is written to (default: ``INDEX_DIR``).

    Besides the FAISS index, the embedding model name is written to
    ``embeddings_model.txt`` inside the index directory.

    Raises:
        RuntimeError: If the embedding model name is not configured, or if
            ``load_documents`` finds nothing to ingest.
    """
    parser = argparse.ArgumentParser(description="Ingest documents and build FAISS index.")
    parser.add_argument(
        "--data-dir",
        type=Path,
        default=DATA_DIR,
        help="directory containing the documents to ingest",
    )
    parser.add_argument(
        "--index-dir",
        type=Path,
        default=INDEX_DIR,
        help="directory where the FAISS index is saved",
    )
    args = parser.parse_args()

    # EMBED_MODEL_NAME is None when its environment variable is unset. Check
    # up front so the problem is reported before any loading or embedding.
    if not EMBED_MODEL_NAME:
        raise RuntimeError(
            "HUGGING_FACE_EMBEDDING_MODEL is not set; cannot build the index."
        )

    data_dir = args.data_dir
    index_dir = args.index_dir

    print(f"Loading documents from {data_dir}")
    docs = load_documents(data_dir)
    print(f"Loaded {len(docs)} documents. Building index…")
    vs = build_vectorstore(docs)

    index_dir.mkdir(parents=True, exist_ok=True)
    vs.save_local(str(index_dir))
    # Persist embedding model name for safety
    (index_dir / "embeddings_model.txt").write_text(EMBED_MODEL_NAME, encoding="utf-8")
    print(f"Index saved to {index_dir.resolve()}")


if __name__ == "__main__":
    main()