"""
Generate synthetic Q-A pairs about the I-485 form / instructions.

Outputs:
    data/synth_I485.jsonl – one JSON object per line:
        {"question": "...", "answer": "...",
         "contexts": ["..."], "ground_truths": ["..."]}
"""
import json
import pathlib
import random
import sys

# ------------------------------------------------------------------
# Ensure the project root (parent of *scripts*) is on sys.path so
# `rag` and `ingest` are importable when the script is run as
# "python scripts/make_synthetic.py".
# ------------------------------------------------------------------
ROOT = pathlib.Path(__file__).resolve().parent.parent
sys.path.append(str(ROOT))

from dotenv import load_dotenv

load_dotenv()

from ingest.ingest_forms import load_raw_docs  # helper created in the ingest step
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

DATA_DIR = pathlib.Path("data")
OUT = DATA_DIR / "synth_I485.jsonl"
N_PAIRS = 75

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)

template = PromptTemplate.from_template("""
You are a helpful and empathetic immigration-law paralegal.
Read the I-485 instructions below (delimited by <>).
Generate a user question someone might ask, and the *concise* answer
verbatim from the text.

<{chunk}>

Return JSON with keys "question" and "answer".
""")

raw_docs = load_raw_docs()  # list[str] – the ingested PDF chunks
samples = random.sample(raw_docs, k=min(len(raw_docs), N_PAIRS))

DATA_DIR.mkdir(exist_ok=True)
n_written = 0
with OUT.open("w") as f:
    for chunk in samples:
        text = template.format(chunk=chunk)
        raw = llm.invoke(text).content  # expected: {"question": "...", "answer": "..."}
        try:
            obj = json.loads(raw)
        except json.JSONDecodeError:
            continue  # skip responses that are not valid JSON
        # ragas also needs contexts + ground_truths
        obj["contexts"] = [chunk]
        obj["ground_truths"] = [obj["answer"]]
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
        n_written += 1

print(f"Wrote {n_written} synthetic pairs → {OUT}")
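
# ------------------------------------------------------------------
# Optional sanity check: score the generated pairs with ragas.
# A minimal sketch, assuming a pre-0.2 ragas release whose evaluate()
# accepts the question/answer/contexts/ground_truths columns written
# above (newer releases renamed ground_truths, so adjust the keys),
# and that OPENAI_API_KEY is set via the .env loaded earlier.
# Call _ragas_sanity_check() manually after generation.
# ------------------------------------------------------------------
def _ragas_sanity_check() -> None:
    from datasets import Dataset
    from ragas import evaluate
    from ragas.metrics import answer_relevancy, faithfulness

    rows = [json.loads(line) for line in OUT.open()]
    scores = evaluate(Dataset.from_list(rows),
                      metrics=[faithfulness, answer_relevancy])
    print(scores)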