# Spaces: Running  (appears to be page-status text captured with the file; not part of the script)
"""
Generate synthetic Q‑A pairs about the I‑485 form / instructions.

Outputs: data/synth_I485.jsonl – 1 JSON per line:
{"question": "...", "answer": "...", "contexts": ["..."], "ground_truths": ["..."]}

Each sampled chunk is sent to the LLM once. Replies that are not a JSON
object with "question" and "answer" keys are skipped (with a message on
stderr) instead of aborting the whole run.
"""
import json
import pathlib
import random
import sys
from typing import Optional

# ------------------------------------------------------------------
# Ensure the project root (parent of *scripts*) is on sys.path
# so `rag` and `ingest` become import-able when the script is run
# as "python scripts/make_synthetic.py"
# ------------------------------------------------------------------
import pathlib as _pl

ROOT = (_pl.Path(__file__).parent).parent
sys.path.append(str(ROOT))

from dotenv import load_dotenv

# Load variables from a .env file before any client is constructed.
# NOTE(review): presumably this supplies the OpenAI API key -- confirm.
load_dotenv()

# NOTE(review): `get_answer` is bound to the *module* rag.qa_chain and is not
# used below; kept because importing it may have side effects -- confirm.
import rag.qa_chain as get_answer
from ingest.ingest_forms import load_raw_docs  # helper you already created
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

DATA_DIR = pathlib.Path("data")
OUT = DATA_DIR / "synth_I485.jsonl"
N_PAIRS = 75
# Size of the random candidate pool. Slightly larger than N_PAIRS so that
# chunks whose LLM reply cannot be parsed can be replaced by spare ones.
N_CANDIDATES = 80

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)

# The prompt text is sent to the model verbatim; "<>" in it refers to the
# <Instructions> ... </Instructions> tags wrapped around the chunk.
template = PromptTemplate.from_template("""
You are an helpful and empathetic immigration‑law paralegal. Read the I‑485 instructions below (delimited
by <>). Generate a user question someone might ask, and the *concise* answer
verbatim from the text.
<Instructions>
{chunk}
</Instructions>
Return JSON with keys "question" and "answer".
""")


def _parse_qa(raw) -> Optional[dict]:
    """Parse one LLM reply into a Q-A dict.

    Accepts plain JSON or JSON wrapped in a Markdown code fence
    (``` or ```json). Returns the decoded dict when it is a JSON object
    containing both "question" and "answer"; returns None otherwise.
    """
    if not isinstance(raw, str):
        return None
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (which may carry a language tag) ...
        _, _, text = text.partition("\n")
        # ... and everything from the closing fence onwards.
        text = text.rsplit("```", 1)[0]
    try:
        obj = json.loads(text)
    except json.JSONDecodeError:
        return None
    if not isinstance(obj, dict) or "question" not in obj or "answer" not in obj:
        return None
    return obj


# assumes load_raw_docs() returns a list[str] of PDF chunks -- TODO confirm
raw_docs = load_raw_docs()
samples = random.sample(raw_docs, k=min(len(raw_docs), N_CANDIDATES))

# Make sure the output directory exists before opening the file.
DATA_DIR.mkdir(parents=True, exist_ok=True)

written = 0
# ensure_ascii=False below emits non-ASCII characters, so the file must be
# opened as UTF-8 explicitly rather than with the platform default encoding.
with OUT.open("w", encoding="utf-8") as f:
    for chunk in samples:
        if written >= N_PAIRS:
            break
        text = template.format(chunk=chunk)
        qa = llm.invoke(text).content  # expected: {"question": "...", "answer": "..."}
        obj = _parse_qa(qa)
        if obj is None:
            print(
                f"Skipping chunk: could not parse LLM reply {repr(qa)[:200]}",
                file=sys.stderr,
            )
            continue
        # For ragas we also need contexts + ground_truths
        obj["contexts"] = [chunk]
        obj["ground_truths"] = [obj["answer"]]
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
        written += 1

# Report the number of records actually written, not the target.
print(f"Wrote {written} synthetic pairs → {OUT}")