"""
Generate synthetic Q‑A pairs about the I‑485 form / instructions.
Outputs: data/synth_I485.jsonl – 1 JSON per line:
{"question": "...", "answer": "...", "contexts": ["..."], "ground_truths": ["..."]}
"""
import json
import pathlib
import random
import sys

# ------------------------------------------------------------------
# Ensure the project root (parent of *scripts*) is on sys.path so
# `rag` and `ingest` are importable when the script is run as
# "python scripts/make_synthetic.py".
# ------------------------------------------------------------------
ROOT = pathlib.Path(__file__).resolve().parent.parent
sys.path.append(str(ROOT))

from dotenv import load_dotenv

load_dotenv()  # pick up OPENAI_API_KEY etc. from a local .env file

from ingest.ingest_forms import load_raw_docs   # helper you already created
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

DATA_DIR = pathlib.Path("data")
OUT      = DATA_DIR / "synth_I485.jsonl"
N_PAIRS  = 75

DATA_DIR.mkdir(parents=True, exist_ok=True)  # make sure data/ exists before writing

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)

template = PromptTemplate.from_template("""
You are a helpful and empathetic immigration-law paralegal. Read the I-485
instructions below (delimited by <Instructions> tags). Generate a user question
someone might ask, and the *concise* answer taken verbatim from the text.

<Instructions>
{chunk}
</Instructions>

Return only a JSON object with keys "question" and "answer".
""")

raw_docs = load_raw_docs()                  # list[str] -> your PDF chunks
samples  = random.sample(raw_docs, k=min(len(raw_docs), 80))

n_written = 0
with OUT.open("w") as f:
    for chunk in samples[:N_PAIRS]:
        text = template.format(chunk=chunk)
        qa   = llm.invoke(text).content     # expected: {"question": "...", "answer": "..."}
        try:
            obj = parse_qa_json(qa)         # JSONDecodeError is a ValueError subclass
        except ValueError:
            continue                        # skip malformed replies

        # For ragas we also need contexts + ground_truths
        obj["contexts"]      = [chunk]
        obj["ground_truths"] = [obj["answer"]]
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
        n_written += 1

print(f"Wrote {N_PAIRS} synthetic pairs → {OUT}")