# Source: dubswayAgenticV2 / app / pdf_ingestion.py
# Commit ed6b1d2 by peace2024: "pydantic fix"
import logging
import os

from dotenv import load_dotenv
from fastapi import APIRouter, HTTPException
from fastapi.concurrency import run_in_threadpool
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from pydantic import BaseModel, EmailStr
# Read a .env file (if one is found) into the process environment at import
# time. NOTE(review): presumably this is how the OpenAI credentials used by
# OpenAIEmbeddings are supplied -- confirm against deployment config.
load_dotenv()
# Router on which this module's endpoints are registered.
router = APIRouter()
# Module logger, registered under the fixed name "pdf_ingestion".
logger = logging.getLogger("pdf_ingestion")
class IngestRequest(BaseModel):
    """Request body accepted by the ``/ingest-pdfs`` endpoint."""

    # Identifies the user whose ./pdfs/user_<id> folder is ingested and whose
    # ./vector_store/user_<id> folder receives the resulting index.
    user_id: int
def _build_user_vector_store(pdf_dir: str, vector_dir: str) -> int:
    """Load, chunk, embed and persist every PDF found under *pdf_dir*.

    This is deliberately a plain synchronous function: PDF parsing, the
    embedding calls and the index write are all blocking, so the endpoint
    runs it in a worker thread instead of on the event loop.

    Args:
        pdf_dir: Directory containing the user's PDF files.
        vector_dir: Directory the FAISS index is written to (created if
            missing).

    Returns:
        The number of chunks stored in the index, or 0 when no text could be
        loaded from *pdf_dir* (in which case *vector_dir* is left untouched).
    """
    documents = PyPDFDirectoryLoader(pdf_dir).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = splitter.split_documents(documents)

    # Nothing to index: bail out before calling the embedding API, and keep
    # any previously saved index instead of failing halfway through.
    if not split_docs:
        return 0

    vector_store = FAISS.from_documents(split_docs, OpenAIEmbeddings())
    os.makedirs(vector_dir, exist_ok=True)
    vector_store.save_local(vector_dir)
    return len(split_docs)


@router.post("/ingest-pdfs")
async def ingest_pdfs(request: IngestRequest):
    """Rebuild and save the FAISS vector store for one user's PDFs.

    Reads PDFs from ``./pdfs/user_<user_id>`` and writes the index to
    ``./vector_store/user_<user_id>``.

    Args:
        request: Request body carrying the ``user_id`` to ingest.

    Returns:
        A dict with a human-readable ``message`` and ``documents_ingested``,
        the number of text chunks written to the index.

    Raises:
        HTTPException: 404 when the user's PDF directory does not exist or
            yields no ingestible content; 500 when loading, embedding or
            saving fails.
    """
    user_id = request.user_id
    user_pdf_path = f"./pdfs/user_{user_id}"
    user_vector_path = f"./vector_store/user_{user_id}"

    # isdir (not exists): a regular file at this path is not a PDF directory.
    if not os.path.isdir(user_pdf_path):
        raise HTTPException(status_code=404, detail=f"No PDF directory found for user {user_id}")

    logger.info("📥 Loading PDFs for user %s from %s", user_id, user_pdf_path)
    try:
        # The whole pipeline is blocking I/O and CPU work; running it in the
        # thread pool keeps the event loop free for other requests.
        chunk_count = await run_in_threadpool(
            _build_user_vector_store, user_pdf_path, user_vector_path
        )
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("❌ PDF ingestion failed: %s", e)
        raise HTTPException(
            status_code=500,
            detail="Failed to ingest PDFs and build vector store.",
        ) from e

    # Raised outside the try block so it is not rewritten into a 500.
    if chunk_count == 0:
        raise HTTPException(status_code=404, detail=f"No PDF content found for user {user_id}")

    logger.info("✅ Re-ingested and saved vector store for user %s", user_id)
    return {
        "message": f"Vector store updated for user {user_id}",
        "documents_ingested": chunk_count,
    }