# NOTE(review): removed non-code paste residue ("Spaces: / Running / Running" —
# Hugging Face Spaces UI chrome captured when this file was copied from a browser).
# scripts/make_synthetic.py
"""Generate synthetic Q/A pairs from ingested I-485 instruction chunks.

Draws a random sample of chunks from the document store, asks the LLM to
produce one question/answer pair per chunk, and writes the pairs (with
``contexts`` / ``ground_truths`` fields for RAGAS-style evaluation) as
JSONL to ``data/synth_I485.jsonl``.
"""
import json
import pathlib
import random
import sys

from dotenv import load_dotenv

load_dotenv()

# ------------------------------------------------------------------
# make local packages importable ▼ (unchanged if you already added)
# ------------------------------------------------------------------
ROOT = pathlib.Path(__file__).parent.parent
sys.path.append(str(ROOT))

from ingest.ingest_forms import load_raw_docs  # noqa: E402 -- needs sys.path tweak above
from langchain_openai import ChatOpenAI  # noqa: E402
from langchain.prompts import PromptTemplate  # noqa: E402

DATA_DIR = pathlib.Path("data")
OUT = DATA_DIR / "synth_I485.jsonl"  # output JSONL file
N_PAIRS = 75                         # target number of Q/A pairs to emit
SAMPLE_POOL = 80                     # chunks sampled before truncating to N_PAIRS

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)

prompt = PromptTemplate.from_template("""
You are a helpful immigration‑law paralegal. Read the I‑485 instructions below
(delimited by <>). Generate one user **question** and the precise **answer**
quoted from the text. Return JSON with keys "question" and "answer".
<Instructions>
{chunk}
</Instructions>
""")


def main() -> None:
    """Sample chunks, generate Q/A pairs via the LLM, and write them as JSONL."""
    raw_docs = load_raw_docs()
    if not raw_docs:
        sys.exit("❌ No documents in Qdrant. Run `python -m ingest.ingest_forms` first.")

    samples = random.sample(raw_docs, k=min(len(raw_docs), SAMPLE_POOL))

    # FIX: create the output directory instead of crashing when data/ is missing,
    # and write UTF-8 explicitly since ensure_ascii=False emits non-ASCII text.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    written = 0
    with OUT.open("w", encoding="utf-8") as f:
        for chunk in samples[:N_PAIRS]:
            qa_json = llm.invoke(prompt.format(chunk=chunk)).content
            try:
                obj = json.loads(qa_json)
            except json.JSONDecodeError:
                # FIX: the LLM occasionally returns non-JSON; skip the item
                # instead of aborting the run and losing prior pairs.
                continue
            if not isinstance(obj, dict) or "answer" not in obj:
                # FIX: guard against well-formed JSON that lacks the expected shape.
                continue
            obj["contexts"] = [chunk]
            obj["ground_truths"] = [obj["answer"]]
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
            written += 1

    # FIX: report the count actually written, not the hard-coded target.
    print(f"✅ Wrote {written} synthetic pairs → {OUT}")


if __name__ == "__main__":
    main()