Spaces:
Running
Running
# ingest/ingest_forms.py
#
# One-shot ingestion script: reads every PDF in ./data, splits each into
# overlapping text chunks, embeds the chunks with OpenAI, and upserts the
# vectors into a local (on-disk) Qdrant collection.

from pathlib import Path

from qdrant_client import QdrantClient, models
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm, trange
from dotenv import load_dotenv

load_dotenv()  # picks up OPENAI_API_KEY from .env

DATA_DIR = Path("data")      # folder containing the source PDFs
QDRANT_PATH = "qdrant_data"  # on-disk Qdrant storage path
COLL = "formpilot_docs"      # target collection name

client = QdrantClient(
    path=QDRANT_PATH,
    force_disable_check_same_thread=True,
)

# Create the collection on first run; get_collection raises when it is missing.
try:
    client.get_collection(collection_name=COLL)
    print(f"Collection '{COLL}' already exists")
except Exception:
    # text-embedding-3-small produces 1536-dim vectors
    client.create_collection(
        collection_name=COLL,
        vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
    )
    print(f"Created collection '{COLL}'")

embedder = OpenAIEmbeddings(model="text-embedding-3-small")
splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=50)

vecs, payloads, ids = [], [], []
next_id = client.count(COLL).count  # continue point numbering where we left off

for pdf in DATA_DIR.glob("*.pdf"):
    # crude form-code guess from the filename, e.g. "i-485instr.pdf" -> "I-485"
    form_code = pdf.stem.split("instr")[0].upper()
    docs = PyPDFLoader(str(pdf)).load()
    chunks = splitter.split_documents(docs)
    # Batch-embed the whole document at once: embed_documents makes one API
    # request per batch instead of one per chunk (embed_query per chunk).
    texts = [doc.page_content for doc in chunks]
    embeddings = embedder.embed_documents(texts) if texts else []
    for doc, vec in tqdm(zip(chunks, embeddings), total=len(chunks), desc=pdf.name):
        vecs.append(vec)
        payloads.append(
            dict(
                text=doc.page_content,
                source=f"{pdf.name}:page-{doc.metadata.get('page', 0)}",
                form=form_code,
            )
        )
        ids.append(next_id)
        next_id += 1

if vecs:
    client.upload_collection(COLL, vecs, payloads, ids, batch_size=64)
    print(f"✅ Upserted {len(vecs)} vectors across {len(list(DATA_DIR.glob('*.pdf')))} forms")
else:
    print("ℹ️ Nothing new to ingest.")
# --------------------------------------------------------------------- | |
# Helper for Stage‑5 synthetic‑data generation | |
# --------------------------------------------------------------------- | |
def load_raw_docs() -> list[str]:
    """
    Return the text of every chunk stored in the Qdrant collection.

    Streams the payloads back out of Qdrant page by page; that keeps
    make_synthetic.py independent of PyPDFLoader etc.
    """
    docs: list[str] = []
    offset = None  # scroll cursor; None means "start from the beginning"
    while True:
        # scroll() returns a (points, next_offset) pair — it is NOT an
        # iterator of batches, so it must be unpacked, not looped over.
        points, offset = client.scroll(
            collection_name=COLL,
            limit=5000,
            offset=offset,
            scroll_filter=None,
            with_payload=["text"],
            with_vectors=False,
        )
        # Each point is a Record whose .payload dict holds the requested keys.
        docs.extend(pt.payload["text"] for pt in points)
        if offset is None:  # no more pages
            break
    return docs