# formpilot-demo / ingest / ingest_forms_old.py
# (Hub page header preserved as a comment: uploaded by afulara,
#  "Auto-deploy from GitHub", commit c3967db verified)
# ingest/ingest_forms.py
from pathlib import Path
from qdrant_client import QdrantClient, models
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm, trange
from dotenv import load_dotenv
load_dotenv() # picks up OPENAI_API_KEY from .env
DATA_DIR = Path("data")
QDRANT_PATH = "qdrant_data"
COLL = "formpilot_docs"
client = QdrantClient(
path=QDRANT_PATH,
force_disable_check_same_thread=True
)
# Check if collection exists - updated API method
try:
client.get_collection(collection_name=COLL)
print(f"Collection '{COLL}' already exists")
except Exception:
# we assume 1536‑dim OpenAI vectors
client.create_collection(
collection_name=COLL,
vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
)
print(f"Created collection '{COLL}'")
embedder = OpenAIEmbeddings(model="text-embedding-3-small")
splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=50)
vecs, payloads, ids = [], [], []
next_id = client.count(COLL).count # continue where we left off
for pdf in DATA_DIR.glob("*.pdf"):
form_code = pdf.stem.split("instr")[0].upper() # crude → "I-485"
docs = PyPDFLoader(str(pdf)).load()
chunks = splitter.split_documents(docs)
for doc in tqdm(chunks, desc=pdf.name):
vecs.append(embedder.embed_query(doc.page_content))
payloads.append(
dict(text=doc.page_content,
source=f"{pdf.name}:page-{doc.metadata.get('page',0)}",
form=form_code)
)
ids.append(next_id)
next_id += 1
if vecs:
client.upload_collection(COLL, vecs, payloads, ids, batch_size=64)
print(f"✅ Upserted {len(vecs)} vectors across {len(list(DATA_DIR.glob('*.pdf')))} forms")
else:
print("ℹ️ Nothing new to ingest.")
# ---------------------------------------------------------------------
# Helper for Stage‑5 synthetic‑data generation
# ---------------------------------------------------------------------
def load_raw_docs() -> list[str]:
    """
    Return every PDF chunk (page_content) stored in the Qdrant collection.

    We simply stream the ``text`` payloads back out of Qdrant; that keeps
    make_synthetic.py independent of PyPDFLoader etc.

    Returns:
        list[str]: the ``text`` payload of each stored point, in scroll
        order (unspecified but deterministic for a given store).
    """
    docs: list[str] = []
    offset = None  # None => start scrolling from the beginning
    while True:
        # client.scroll returns a (points, next_page_offset) tuple — NOT an
        # iterable of batches. next_page_offset is None once exhausted, so
        # looping on it also fixes the old 5000-point cap.
        points, offset = client.scroll(
            collection_name=COLL,
            limit=5000,
            offset=offset,
            with_payload=["text"],
            with_vectors=False,
        )
        # Each point is a Record whose payload dict holds the chunk text.
        docs.extend(pt.payload["text"] for pt in points)
        if offset is None:
            break
    return docs