File size: 1,769 Bytes
e27e999
 
 
ed6b1d2
e27e999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os
import logging
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, EmailStr
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv

# Load OPENAI_API_KEY (and any other settings) from a local .env file
# before OpenAIEmbeddings() is instantiated in the handler below.
load_dotenv()

# Router to be mounted by the main FastAPI app via include_router.
router = APIRouter()
# Named logger for this module; configuration is left to the application.
logger = logging.getLogger("pdf_ingestion")

class IngestRequest(BaseModel):
    """Request body for POST /ingest-pdfs."""

    # Numeric user identifier; selects ./pdfs/user_<id> as the source
    # directory and ./vector_store/user_<id> as the output location.
    user_id: int

@router.post("/ingest-pdfs")
async def ingest_pdfs(request: IngestRequest):
    """Rebuild the FAISS vector store for one user from their PDF directory.

    Loads every PDF under ./pdfs/user_<id>, splits the text into
    overlapping chunks, embeds them with OpenAI embeddings, and persists
    the resulting FAISS index under ./vector_store/user_<id>.

    Args:
        request: body containing the target ``user_id``.

    Returns:
        dict with a confirmation message and the number of chunks indexed.

    Raises:
        HTTPException: 404 if the user has no PDF directory or no readable
            documents; 500 on any failure while loading, embedding, or saving.
    """
    user_id = request.user_id
    user_pdf_path = f"./pdfs/user_{user_id}"
    user_vector_path = f"./vector_store/user_{user_id}"

    if not os.path.exists(user_pdf_path):
        raise HTTPException(status_code=404, detail=f"No PDF directory found for user {user_id}")

    try:
        # Lazy %-style args so the message is only formatted if emitted.
        logger.info("📥 Loading PDFs for user %s from %s", user_id, user_pdf_path)
        loader = PyPDFDirectoryLoader(user_pdf_path)
        documents = loader.load()

        # Guard the empty case explicitly: FAISS.from_documents on an empty
        # list fails with an opaque error that would surface as a 500.
        if not documents:
            raise HTTPException(status_code=404, detail=f"No readable PDF documents found for user {user_id}")

        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        split_docs = splitter.split_documents(documents)

        embeddings = OpenAIEmbeddings()
        vector_store = FAISS.from_documents(split_docs, embeddings)

        os.makedirs(user_vector_path, exist_ok=True)
        vector_store.save_local(user_vector_path)

        logger.info("✅ Re-ingested and saved vector store for user %s", user_id)
        return {"message": f"Vector store updated for user {user_id}", "documents_ingested": len(split_docs)}

    except HTTPException:
        # Re-raise deliberate 4xx responses; don't let the broad handler
        # below convert them into a generic 500.
        raise
    except Exception as e:
        # logger.exception records the full traceback, not just str(e).
        logger.exception("❌ PDF ingestion failed for user %s", user_id)
        raise HTTPException(status_code=500, detail="Failed to ingest PDFs and build vector store.") from e