Spaces:

afulara
/

formpilot-demo

Running

App Files Files Community

formpilot-demo / scripts /make_synthetic.py

afulara

Auto‑deploy from GitHub

c3967db verified 3 months ago

raw

history blame contribute delete

2.21 kB

	# print(sys.path) # Added for debugging module resolution

	"""
	Generate synthetic Q‑A pairs about the I‑485 form / instructions.
	Outputs: data/synth_I485.jsonl – 1 JSON per line:
	{"question": "...", "answer": "...", "contexts": ["..."], "ground_truths": ["..."]}
	"""
	import json, random, pathlib

	# ------------------------------------------------------------------
	# Ensure the project root (parent of scripts) is on sys.path
	# so `rag` and `ingest` become import‑able when the script is run
	# as “python scripts/make_synthetic.py”
	# ------------------------------------------------------------------
	import pathlib as _pl
	ROOT = (_pl.Path(__file__).parent).parent


	import sys
	sys.path.append(str(ROOT))
	#import os.path
	#sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))

	from dotenv import load_dotenv
	load_dotenv()

	import rag.qa_chain as get_answer
	from ingest.ingest_forms import load_raw_docs # ← helper you already created

	from langchain.prompts import PromptTemplate
	from langchain_openai import ChatOpenAI

	# Output location and target size of the synthetic dataset.
	DATA_DIR = pathlib.Path("data")
	OUT = DATA_DIR / "synth_I485.jsonl"
	N_PAIRS = 75

	# Number of chunks drawn before generation. It is larger than N_PAIRS so that
	# chunks whose model reply is unusable can be replaced by spare ones.
	_N_CANDIDATES = 80

	llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)

	template = PromptTemplate.from_template("""
	You are an helpful and empathetic immigration‑law paralegal. Read the I‑485 instructions below (delimited
	by <>). Generate a user question someone might ask, and the concise answer
	verbatim from the text.

	<Instructions>
	{chunk}
	</Instructions>

	Return JSON with keys "question" and "answer".
	""")


	def _parse_qa(raw):
	    """Parse one model reply into a question/answer dict.

	    Args:
	        raw: The reply content returned by the model.

	    Returns:
	        The decoded dict when *raw* is a JSON object holding both the
	        "question" and "answer" keys, otherwise ``None``.
	    """
	    try:
	        obj = json.loads(raw)
	    except (json.JSONDecodeError, TypeError):
	        # Not JSON at all (or not a string): the caller skips this chunk.
	        return None
	    if not isinstance(obj, dict) or "question" not in obj or "answer" not in obj:
	        return None
	    return obj


	# The sampling below needs `raw_docs`, which was previously never assigned
	# (its only assignment was commented out), so the script stopped with a
	# NameError. NOTE(review): assumes load_raw_docs() takes no arguments and
	# returns a list of text chunks (the disabled line noted `list[str]`) —
	# TODO confirm against ingest/ingest_forms.py.
	raw_docs = load_raw_docs()
	samples = random.sample(raw_docs, k=min(len(raw_docs), _N_CANDIDATES))

	# Create the output directory up front so opening OUT cannot fail on a
	# fresh checkout.
	DATA_DIR.mkdir(parents=True, exist_ok=True)

	written = 0
	# Explicit UTF-8: records are dumped with ensure_ascii=False, so the file
	# must not depend on the platform's default encoding.
	with OUT.open("w", encoding="utf-8") as f:
	    for chunk in samples:
	        if written >= N_PAIRS:
	            break

	        text = template.format(chunk=chunk)
	        qa = llm.invoke(text).content  # expected: {"question": "...", "answer": "..."}
	        obj = _parse_qa(qa)
	        if obj is None:
	            # One malformed reply should not abort the whole run; report it
	            # and move on to the next candidate chunk.
	            print(f"Skipping chunk, unusable model reply: {qa!r}", file=sys.stderr)
	            continue

	        # For ragas we also need contexts + ground_truths
	        obj["contexts"] = [chunk]
	        obj["ground_truths"] = [obj["answer"]]
	        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
	        written += 1

	# Report the number of records actually written, which can be below N_PAIRS.
	print(f"Wrote {written} synthetic pairs → {OUT}")