Spaces:
Running
Running
import os | |
import logging | |
from fastapi import APIRouter, HTTPException | |
from pydantic import BaseModel, EmailStr | |
from langchain_community.document_loaders import PyPDFDirectoryLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_openai import OpenAIEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from dotenv import load_dotenv | |
# Load environment configuration first, before any client below is built.
# NOTE(review): presumably this is where the OpenAI credentials used by
# OpenAIEmbeddings come from — confirm against the deployment's .env file.
load_dotenv()

# Router object for this module. No route registration is visible in this
# section of the file, so the mount point is defined elsewhere (if at all).
router = APIRouter()

# Dedicated logger for the PDF ingestion flow.
logger = logging.getLogger("pdf_ingestion")
# Request payload for ingest_pdfs.
# Documented with comments rather than a docstring so the schema generated
# from this model is left exactly as it was.
class IngestRequest(BaseModel):
    # Numeric id of the user; used to derive that user's PDF and
    # vector-store directory names.
    user_id: int
async def ingest_pdfs(request: IngestRequest) -> dict:
    """Rebuild and persist the FAISS vector store for one user's PDFs.

    Loads every PDF under ``./pdfs/user_<id>``, splits the text into
    overlapping chunks, embeds them with ``OpenAIEmbeddings`` and saves the
    resulting index to ``./vector_store/user_<id>``.

    Args:
        request: Payload carrying the ``user_id`` whose PDFs are ingested.

    Returns:
        A dict with a human-readable ``message`` and ``documents_ingested``,
        the number of chunks written to the index.

    Raises:
        HTTPException: 404 if the user's PDF directory does not exist or
            yields no content to index; 500 if loading, embedding or saving
            fails for any other reason.
    """
    user_id = request.user_id
    user_pdf_path = f"./pdfs/user_{user_id}"
    user_vector_path = f"./vector_store/user_{user_id}"

    # isdir (not exists): a regular file at this path is not a usable source.
    if not os.path.isdir(user_pdf_path):
        raise HTTPException(status_code=404, detail=f"No PDF directory found for user {user_id}")

    try:
        logger.info("📥 Loading PDFs for user %s from %s", user_id, user_pdf_path)

        # NOTE(review): the loader, embedding and FAISS calls below are
        # synchronous and run directly inside this coroutine, so the event
        # loop is blocked for the duration of an ingestion.
        documents = PyPDFDirectoryLoader(user_pdf_path).load()

        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        split_docs = splitter.split_documents(documents)

        # An empty directory (or PDFs with no extractable text) produces no
        # chunks; building an index from nothing fails inside FAISS, so report
        # it as a client-visible "nothing to ingest" instead of a generic 500.
        if not split_docs:
            raise HTTPException(status_code=404, detail=f"No PDF content found for user {user_id}")

        embeddings = OpenAIEmbeddings()
        vector_store = FAISS.from_documents(split_docs, embeddings)

        os.makedirs(user_vector_path, exist_ok=True)
        vector_store.save_local(user_vector_path)

        logger.info("✅ Re-ingested and saved vector store for user %s", user_id)
        return {"message": f"Vector store updated for user {user_id}", "documents_ingested": len(split_docs)}
    except HTTPException:
        # Deliberate HTTP errors raised above must not be rewritten as 500s.
        raise
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("❌ PDF ingestion failed: %s", e)
        raise HTTPException(status_code=500, detail="Failed to ingest PDFs and build vector store.") from e