# ingest/ingest_forms.py
"""Ingest PDF form instructions into a local Qdrant collection.

Every ``*.pdf`` under ``DATA_DIR`` is loaded, split into overlapping text
chunks, embedded with OpenAI, and uploaded to the ``COLL`` collection.
``load_raw_docs`` reads the stored chunk texts back out of Qdrant.
"""
from pathlib import Path

from qdrant_client import QdrantClient, models
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm, trange
from dotenv import load_dotenv

load_dotenv()  # picks up OPENAI_API_KEY from .env

DATA_DIR = Path("data")
QDRANT_PATH = "qdrant_data"
COLL = "formpilot_docs"

client = QdrantClient(
    path=QDRANT_PATH,
    force_disable_check_same_thread=True,
)

# Check if collection exists - updated API method.
# Any failure of get_collection is treated as "collection is missing".
try:
    client.get_collection(collection_name=COLL)
    print(f"Collection '{COLL}' already exists")
except Exception:
    # we assume 1536-dim OpenAI vectors
    client.create_collection(
        collection_name=COLL,
        vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
    )
    print(f"Created collection '{COLL}'")

embedder = OpenAIEmbeddings(model="text-embedding-3-small")
splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=50)

# NOTE(review): the ingestion below runs at import time, so importing this
# module only for load_raw_docs() also re-embeds and re-uploads every PDF.
# Confirm whether that is intended before moving it behind a __main__ guard.
vecs, payloads, ids = [], [], []
next_id = client.count(COLL).count  # continue where we left off

for pdf in DATA_DIR.glob("*.pdf"):
    form_code = pdf.stem.split("instr")[0].upper()  # crude → "I-485"
    docs = PyPDFLoader(str(pdf)).load()
    chunks = splitter.split_documents(docs)
    for doc in tqdm(chunks, desc=pdf.name):
        vecs.append(embedder.embed_query(doc.page_content))
        payloads.append(
            dict(
                text=doc.page_content,
                source=f"{pdf.name}:page-{doc.metadata.get('page',0)}",
                form=form_code,
            )
        )
        ids.append(next_id)
        next_id += 1

if vecs:
    client.upload_collection(COLL, vecs, payloads, ids, batch_size=64)
    print(f"✅ Upserted {len(vecs)} vectors across {len(list(DATA_DIR.glob('*.pdf')))} forms")
else:
    print("ℹ️ Nothing new to ingest.")


# ---------------------------------------------------------------------
# Helper for Stage-5 synthetic-data generation
# ---------------------------------------------------------------------
def load_raw_docs() -> list[str]:
    """Return the ``text`` payload of every point stored in ``COLL``.

    The payloads are streamed back out of Qdrant, which keeps
    make_synthetic.py independent of PyPDFLoader etc.

    Returns:
        The chunk texts in whatever order Qdrant scrolls them; order is
        not significant for synthetic-data generation. Points that carry
        no ``text`` payload are skipped.
    """
    texts: list[str] = []
    offset = None
    while True:
        # scroll() returns one page of records plus the offset of the next
        # page; the offset is None once the collection is exhausted.
        points, offset = client.scroll(
            collection_name=COLL,
            limit=5000,
            offset=offset,
            scroll_filter=None,
            with_payload=["text"],  # pull only the text payloads
            with_vectors=False,
        )
        texts.extend(
            pt.payload["text"]
            for pt in points
            if pt.payload and "text" in pt.payload
        )
        if offset is None:
            break
    return texts