# scripts/make_synthetic.py
import json
import pathlib
import random
import sys

from dotenv import load_dotenv

load_dotenv()

# ------------------------------------------------------------------
# make local packages importable ▼ (unchanged if you already added)
ROOT = pathlib.Path(__file__).parent.parent
sys.path.append(str(ROOT))
# ------------------------------------------------------------------

from ingest.ingest_forms import load_raw_docs  # ✅ now safe
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

DATA_DIR = pathlib.Path("data")
OUT = DATA_DIR / "synth_I485.jsonl"
N_PAIRS = 75

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)

prompt = PromptTemplate.from_template("""
You are a helpful immigration‑law paralegal.
Read the I‑485 instructions below (delimited by <>).
Generate one user **question** and the precise **answer** quoted from the text.
Return JSON with keys "question" and "answer".

<{chunk}>
""")

raw_docs = load_raw_docs()  # <-- works
if not raw_docs:
    sys.exit("❌ No documents in Qdrant. Run `python -m ingest.ingest_forms` first.")

samples = random.sample(raw_docs, k=min(len(raw_docs), 80))

DATA_DIR.mkdir(exist_ok=True)  # make sure data/ exists before writing
written = 0
with OUT.open("w") as f:
    for chunk in samples[:N_PAIRS]:
        qa_json = llm.invoke(prompt.format(chunk=chunk)).content
        obj = json.loads(qa_json)
        obj["contexts"] = [chunk]
        obj["ground_truths"] = [obj["answer"]]
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
        written += 1

print(f"✅ Wrote {written} synthetic pairs → {OUT}")
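
# ------------------------------------------------------------------
# Optional sanity check — a minimal sketch, not part of the original
# script. It re-reads the JSONL just written and confirms each line
# carries the keys the script emits ("question", "answer",
# "contexts", "ground_truths"), which is the shape a downstream
# evaluation loader would expect. Everything below is illustrative
# and can be dropped without affecting the generation step above.
# ------------------------------------------------------------------
REQUIRED_KEYS = {"question", "answer", "contexts", "ground_truths"}
with OUT.open() as f:
    for i, line in enumerate(f, start=1):
        row = json.loads(line)
        missing = REQUIRED_KEYS - row.keys()
        if missing:
            print(f"⚠️  line {i} is missing keys: {sorted(missing)}")
print(f"✅ Sanity-checked {OUT}")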