# ingest/ingest_forms.py

from pathlib import Path
from qdrant_client import QdrantClient, models
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()            # picks up OPENAI_API_KEY from .env

DATA_DIR = Path("data")
QDRANT_PATH = "qdrant_data"
COLL = "formpilot_docs"
client = QdrantClient(
    path=QDRANT_PATH,
    force_disable_check_same_thread=True
    )

# Create the collection on the first run; reuse it if it already exists
try:
    client.get_collection(collection_name=COLL)
    print(f"Collection '{COLL}' already exists")
except Exception:
    # we assume 1536‑dim OpenAI vectors
    client.create_collection(
        collection_name=COLL,
        vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
    )
    print(f"Created collection '{COLL}'")

embedder = OpenAIEmbeddings(model="text-embedding-3-small")
splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=50)
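# NOTE: chunk_size / chunk_overlap are counted in characters (the splitter's default length function), not tokens.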

vecs, payloads, ids = [], [], []
next_id = client.count(COLL).count  # continue where we left off

for pdf in DATA_DIR.glob("*.pdf"):
    form_code = pdf.stem.split("instr")[0].upper()  # crude: e.g. "i-485instr.pdf" → "I-485"
    docs = PyPDFLoader(str(pdf)).load()
    chunks = splitter.split_documents(docs)
    for doc in tqdm(chunks, desc=pdf.name):
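        # embed_query() embeds one chunk per API call; embedder.embed_documents() could batch these,
        # but per-chunk embedding keeps the progress bar simple.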
        vecs.append(embedder.embed_query(doc.page_content))
        payloads.append(
            dict(text=doc.page_content,
                 source=f"{pdf.name}:page-{doc.metadata.get('page',0)}",
                 form=form_code)
        )
        ids.append(next_id)
        next_id += 1

if vecs:
    client.upload_collection(collection_name=COLL, vectors=vecs, payload=payloads, ids=ids, batch_size=64)
    print(f"✅  Upserted {len(vecs)} vectors across {len(list(DATA_DIR.glob('*.pdf')))} forms")
else:
    print("ℹ️  Nothing new to ingest.")


# ---------------------------------------------------------------------
# Helper for Stage‑5 synthetic‑data generation
# ---------------------------------------------------------------------
def load_raw_docs() -> list[str]:
    """
    Return every PDF chunk (page_content) that was just ingested.
    We simply stream the payloads back out of Qdrant; that keeps
    make_synthetic.py independent of PyPDFLoader etc.
    """
    # pull only the text payloads; unsorted order is fine for synthetic data
    docs: list[str] = []
    offset = None
    while True:
        # scroll() returns (points, next_page_offset); keep paging until the offset is exhausted
        points, offset = client.scroll(
            collection_name=COLL,
            limit=5000,
            offset=offset,
            with_payload=["text"],
            with_vectors=False,
        )
        docs.extend(pt.payload["text"] for pt in points)
        if offset is None:
            break

    return docs
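

# ---------------------------------------------------------------------
# Example usage (sketch, assuming make_synthetic.py imports this module):
#
#     from ingest.ingest_forms import load_raw_docs
#     chunks = load_raw_docs()
#     print(f"{len(chunks)} chunks available for synthetic-data generation")
#
# Note: everything above load_raw_docs() is module-level code, so importing
# this module also re-runs the PDF ingest against data/.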