# print(sys.path) # Added for debugging module resolution
"""
Generate synthetic Q‑A pairs about the I‑485 form / instructions.
Outputs: data/synth_I485.jsonl – 1 JSON per line:
{"question": "...", "answer": "...", "contexts": ["..."], "ground_truths": ["..."]}
"""
import json, random, pathlib
# ------------------------------------------------------------------
# Ensure the project root (parent of *scripts*) is on sys.path
# so `rag` and `ingest` become import‑able when the script is run
# as “python scripts/make_synthetic.py”
# ------------------------------------------------------------------
import pathlib as _pl
ROOT = (_pl.Path(__file__).parent).parent
import sys
sys.path.append(str(ROOT))
#import os.path
#sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
from dotenv import load_dotenv
load_dotenv()
import rag.qa_chain as get_answer
from ingest.ingest_forms import load_raw_docs # ← helper you already created
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
DATA_DIR = pathlib.Path("data")
OUT = DATA_DIR / "synth_I485.jsonl"
N_PAIRS = 75

# Low temperature keeps generated Q-A pairs close to the source text.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)

template = PromptTemplate.from_template("""
You are an helpful and empathetic immigration‑law paralegal. Read the I‑485 instructions below (delimited
by <>). Generate a user question someone might ask, and the *concise* answer
verbatim from the text.
<Instructions>
{chunk}
</Instructions>
Return JSON with keys "question" and "answer".
""")

# BUG FIX: `raw_docs` was referenced but never defined -- the assignment had
# been left commented out, so the script died with NameError. Load the PDF
# chunks (list[str]) via the ingest helper.
raw_docs = load_raw_docs()

samples = random.sample(raw_docs, k=min(len(raw_docs), 80))

DATA_DIR.mkdir(parents=True, exist_ok=True)  # ensure output dir exists

written = 0
with OUT.open("w", encoding="utf-8") as f:
    for chunk in samples[:N_PAIRS]:
        text = template.format(chunk=chunk)
        # Expected reply: a JSON object {"question": "...", "answer": "..."}
        qa = llm.invoke(text).content
        try:
            obj = json.loads(qa)
        except json.JSONDecodeError:
            # The model occasionally returns malformed JSON; skip the chunk
            # rather than aborting the whole run.
            continue
        # ragas evaluation also needs contexts + ground_truths
        obj["contexts"] = [chunk]
        obj["ground_truths"] = [obj["answer"]]
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
        written += 1

# Report the number actually written (may be < N_PAIRS if replies were skipped).
print(f"Wrote {written} synthetic pairs → {OUT}")