# ingest/ingest_forms.py
"""
Ingest every PDF in data/ ➜ local Qdrant store (run once)
Also exposes load_raw_docs() for Stage‑5 synthetic generation.
"""
from pathlib import Path
from typing import Optional

from dotenv import load_dotenv
from qdrant_client import QdrantClient, models
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

load_dotenv() # OPENAI_API_KEY
DATA_DIR = Path("data")
QDRANT_PATH = "qdrant_data"
COLL = "formpilot_docs"
EMB_SIZE = 1536 # text‑embedding‑3‑small
# ------------------------------------------------------------------
# helpers
# ------------------------------------------------------------------
def _get_client(readonly: bool = False) -> QdrantClient:
"""
• readonly=False – opens once for ingest (caller is __main__)
    • readonly=True  – opens for load_raw_docs(). Uses the same
      folder but *disables* the write lock so it can coexist.
"""
return QdrantClient(
path=QDRANT_PATH,
force_disable_check_same_thread=True,
prefer_grpc=False, # local, pure‑python
read_only=readonly,
)
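
# Example (a sketch, not called anywhere in this module): a downstream script
# can open the same local store without contending for the ingest write lock.
#
#   cli = _get_client(readonly=True)
#   print(cli.get_collections())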
# ------------------------------------------------------------------
# raw-chunk access for downstream scripts
# ------------------------------------------------------------------
def load_raw_docs(limit: Optional[int] = None) -> list[str]:
"""
Fetch text chunks back from the local Qdrant collection so that
downstream scripts (e.g. make_synthetic.py) can work without
re‑ingesting PDF files.
Parameters
----------
limit : int | None
Max number of chunks to return (handy for quick tests).
Returns
-------
list[str]
The raw text strings you originally inserted.
"""
    # Re-open the local store for reading; the collection was populated by a
    # previous run of this script.
    cli = QdrantClient(path=QDRANT_PATH, force_disable_check_same_thread=True)
    texts: list[str] = []
    offset = None
    while True:
        # scroll() returns (points, next_offset); keep paging until exhausted –
        # a single call only returns one (small) page of results.
        points, offset = cli.scroll(collection_name=COLL, limit=256, offset=offset)
        texts.extend(p.payload["text"] for p in points)
        if offset is None or (limit is not None and len(texts) >= limit):
            break
    return texts[:limit] if limit is not None else texts
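
# Example usage (a sketch; assumes the ingest/ folder is importable as a package
# and that the collection has already been populated by running this script):
#
#   from ingest.ingest_forms import load_raw_docs
#   for chunk in load_raw_docs(limit=5):
#       print(chunk[:80], "…")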
# ------------------------------------------------------------------
# main ingest routine – only runs if script executed directly
# ------------------------------------------------------------------
def _ingest_all() -> None:
cli = _get_client()
# create collection once
if COLL not in {c.name for c in cli.get_collections().collections}:
cli.create_collection(
collection_name=COLL,
vectors_config=models.VectorParams(
size=EMB_SIZE, distance=models.Distance.COSINE
),
)
print(f"✅ Created collection {COLL}")
embedder = OpenAIEmbeddings(model="text-embedding-3-small")
splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=50)
vecs, payloads, ids = [], [], []
    next_id = cli.count(COLL).count  # continue IDs after any points already stored
for pdf in DATA_DIR.glob("*.pdf"):
        # derive the form code from the filename (assumes names like
        # "i-765instr.pdf" -> form code "I-765")
        form_code = pdf.stem.split("instr")[0].upper()
for doc in PyPDFLoader(str(pdf)).load_and_split(splitter):
vecs.append(embedder.embed_query(doc.page_content))
payloads.append(
dict(
text=doc.page_content,
source=f"{pdf.name}:page-{doc.metadata.get('page',0)}",
form=form_code,
)
)
ids.append(next_id)
next_id += 1
if vecs:
cli.upload_collection(COLL, vecs, payloads, ids, batch_size=64)
print(f"✅ Upserted {len(vecs):,} vectors into {COLL}")
else:
print("ℹ️ Nothing new to ingest.")
if __name__ == "__main__":
_ingest_all()