File size: 1,620 Bytes
c3967db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# scripts/make_synthetic.py
"""Generate synthetic Q/A pairs from I-485 instruction chunks.

Pulls raw document chunks from the local ingest pipeline, asks GPT-3.5 to
produce one question/answer pair per chunk, and writes the results as JSONL
(with `contexts` and `ground_truths` per record, RAGAS-style) to
data/synth_I485.jsonl.
"""
import json
import pathlib
import random
import re
import sys

from dotenv import load_dotenv

load_dotenv()

# ------------------------------------------------------------------
# make local packages importable  ▼ (unchanged if you already added)
ROOT = pathlib.Path(__file__).parent.parent
sys.path.append(str(ROOT))
# ------------------------------------------------------------------

from ingest.ingest_forms import load_raw_docs          # ✅  now safe
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

DATA_DIR = pathlib.Path("data")
OUT      = DATA_DIR / "synth_I485.jsonl"
N_PAIRS  = 75  # maximum number of Q/A pairs to generate

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)

prompt = PromptTemplate.from_template("""
You are a helpful immigration‑law paralegal. Read the I‑485 instructions below
(delimited by <>). Generate one user **question** and the precise **answer**
quoted from the text. Return JSON with keys "question" and "answer".

<Instructions>
{chunk}
</Instructions>
""")

raw_docs = load_raw_docs()                            # <-- works
if not raw_docs:
    sys.exit("❌  No documents in Qdrant.  Run `python -m ingest.ingest_forms` first.")

# Oversample slightly so parse failures below don't starve us of pairs.
samples = random.sample(raw_docs, k=min(len(raw_docs), 80))

# data/ may not exist on a fresh checkout — create it before opening OUT.
DATA_DIR.mkdir(parents=True, exist_ok=True)

written = 0
# Explicit UTF-8: we write ensure_ascii=False output, which would break on
# platforms whose default encoding is not UTF-8.
with OUT.open("w", encoding="utf-8") as f:
    for chunk in samples[:N_PAIRS]:
        qa_json = llm.invoke(prompt.format(chunk=chunk)).content
        # LLMs frequently wrap JSON in ```json fences — strip before parsing.
        qa_json = re.sub(r"^```(?:json)?\s*|\s*```$", "", qa_json.strip())
        try:
            obj = json.loads(qa_json)
        except json.JSONDecodeError as err:
            # One bad response shouldn't abort the whole run; skip and log.
            print(f"⚠️  Skipping unparsable LLM response: {err}", file=sys.stderr)
            continue
        if "question" not in obj or "answer" not in obj:
            # The model ignored the schema; skip rather than KeyError later.
            print("⚠️  Skipping response missing question/answer keys", file=sys.stderr)
            continue

        obj["contexts"]      = [chunk]
        obj["ground_truths"] = [obj["answer"]]

        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
        written += 1

# Report the number actually written, not the requested N_PAIRS — they can
# differ when fewer chunks were sampled or some responses failed to parse.
print(f"✅  Wrote {written} synthetic pairs → {OUT}")