# formpilot-demo / scripts / make_synthetic_new.py
# (auto-deployed from GitHub — commit c3967db, verified)
# scripts/make_synthetic.py
import json, random, sys, pathlib
from dotenv import load_dotenv
load_dotenv()
# ------------------------------------------------------------------
# make local packages importable ▼ (unchanged if you already added)
ROOT = pathlib.Path(__file__).parent.parent
sys.path.append(str(ROOT))
# ------------------------------------------------------------------
from ingest.ingest_forms import load_raw_docs # ✅ now safe
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
DATA_DIR = pathlib.Path("data")
OUT = DATA_DIR / "synth_I485.jsonl"   # output: one JSON object per line (JSONL)
N_PAIRS = 75                          # target number of Q/A pairs to persist
SAMPLE_POOL = 80                      # oversample so malformed generations can be skipped

# Low temperature keeps the generated answer close to the quoted source text.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)

prompt = PromptTemplate.from_template("""
You are a helpful immigration‑law paralegal. Read the I‑485 instructions below
(delimited by <>). Generate one user **question** and the precise **answer**
quoted from the text. Return JSON with keys "question" and "answer".
<Instructions>
{chunk}
</Instructions>
""")

raw_docs = load_raw_docs()
if not raw_docs:
    sys.exit("❌ No documents in Qdrant. Run `python -m ingest.ingest_forms` first.")

samples = random.sample(raw_docs, k=min(len(raw_docs), SAMPLE_POOL))

# Ensure the output directory exists on a fresh checkout.
DATA_DIR.mkdir(parents=True, exist_ok=True)

written = 0  # pairs actually persisted — LLM output can be malformed
with OUT.open("w", encoding="utf-8") as f:
    for chunk in samples:
        if written >= N_PAIRS:
            break
        qa_json = llm.invoke(prompt.format(chunk=chunk)).content.strip()
        # Models sometimes wrap the JSON in a ```json ... ``` markdown fence.
        if qa_json.startswith("```"):
            qa_json = qa_json.strip("`")
            if qa_json.lower().startswith("json"):
                qa_json = qa_json[4:]
            qa_json = qa_json.strip()
        try:
            obj = json.loads(qa_json)
        except json.JSONDecodeError:
            continue  # skip a malformed generation instead of crashing mid-run
        if "question" not in obj or "answer" not in obj:
            continue  # skip incomplete generations missing required keys
        obj["contexts"] = [chunk]
        obj["ground_truths"] = [obj["answer"]]
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
        written += 1

# Report the real count: fewer than N_PAIRS may survive filtering.
print(f"✅ Wrote {written} synthetic pairs → {OUT}")