Spaces:
Running
Running
File size: 1,620 Bytes
c3967db |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
# scripts/make_synthetic.py
# Generates synthetic question/answer pairs from ingested I-485 form chunks
# using an OpenAI chat model, writing them as JSONL for evaluation.
import json, random, sys, pathlib
from dotenv import load_dotenv
load_dotenv()  # pull OPENAI_API_KEY (and any other secrets) from a local .env
# ------------------------------------------------------------------
# make local packages importable ▼ (unchanged if you already added)
# Prepends the repo root so `ingest.*` resolves when run as a script.
ROOT = pathlib.Path(__file__).parent.parent
sys.path.append(str(ROOT))
# ------------------------------------------------------------------
from ingest.ingest_forms import load_raw_docs # ✅ now safe
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
DATA_DIR = pathlib.Path("data")
OUT = DATA_DIR / "synth_I485.jsonl"
N_PAIRS = 75  # target number of synthetic QA pairs

# Low temperature keeps answers close to verbatim quotes from the instructions.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)

prompt = PromptTemplate.from_template("""
You are a helpful immigration‑law paralegal. Read the I‑485 instructions below
(delimited by <>). Generate one user **question** and the precise **answer**
quoted from the text. Return JSON with keys "question" and "answer".
<Instructions>
{chunk}
</Instructions>
""")

raw_docs = load_raw_docs()
if not raw_docs:
    sys.exit("❌ No documents in Qdrant. Run `python -m ingest.ingest_forms` first.")

# Sample only as many chunks as we need (previously sampled 80, then sliced
# to N_PAIRS — the extra 5 were discarded). sample() also dedupes the picks.
samples = random.sample(raw_docs, k=min(len(raw_docs), N_PAIRS))

# Ensure the output directory exists before opening the file.
DATA_DIR.mkdir(parents=True, exist_ok=True)

written = 0
# Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters, which
# would crash under a locale-dependent default encoding (e.g. cp1252).
with OUT.open("w", encoding="utf-8") as f:
    for chunk in samples:
        qa_json = llm.invoke(prompt.format(chunk=chunk)).content
        try:
            obj = json.loads(qa_json)
        except json.JSONDecodeError:
            # LLMs occasionally return malformed JSON; skip the pair rather
            # than crash mid-run and lose everything already generated.
            print(f"⚠️ Skipping malformed JSON response: {qa_json[:80]!r}")
            continue
        if "answer" not in obj or "question" not in obj:
            # Response parsed but violated the requested schema — skip it.
            print(f"⚠️ Skipping response missing question/answer keys: {obj!r}")
            continue
        obj["contexts"] = [chunk]
        obj["ground_truths"] = [obj["answer"]]
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
        written += 1

# Report the actual count — the old message claimed N_PAIRS unconditionally,
# even when fewer docs were available or some responses were skipped.
print(f"✅ Wrote {written} synthetic pairs → {OUT}")
|