# formpilot-demo/scripts/eval_ragas.py
"""
Run RAGAS evaluation on synthetic set.
Saves Markdown table to reports/ragas_baseline.md
"""
import json
import pathlib

import pandas as pd
import rich
from datasets import Dataset
from ragas import evaluate
# NOTE: ragas names this metric answer_relevancy, not answer_relevance
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness

from rag.qa_chain import get_answer
DATA = pathlib.Path("data/synth_I485.jsonl")
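# Each JSONL line is assumed to carry the fields read in load_pairs below,
# e.g. (hypothetical sample, values elided):
# {"question": "...", "answer": "...", "contexts": ["..."], "ground_truths": ["..."]}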
REPORT_DIR = pathlib.Path("reports")
REPORT_DIR.mkdir(exist_ok=True)
TABLE = REPORT_DIR / "ragas_baseline.md"
def load_pairs():
    """Load the synthetic QA pairs. The gold answer/contexts are stored under
    names ragas does not score, because ragas reads the `answer` and `contexts`
    columns as the pipeline's own output (filled in produce_rag_answers below).
    Note: newer ragas releases expect `ground_truth` (a string) rather than
    `ground_truths` (a list); adjust to match your installed version."""
    rows = []
    for line in DATA.open():
        obj = json.loads(line)
        rows.append({
            "question": obj["question"],
            "gold_answer": obj["answer"],
            "gold_contexts": obj["contexts"],
            "ground_truths": obj["ground_truths"],
        })
    return Dataset.from_list(rows)
def produce_rag_answers(dataset):
    """Run the RAG pipeline and store its outputs under the column names
    ragas actually evaluates: `answer` (generated) and `contexts` (retrieved)."""
    pred_answers, pred_contexts = [], []
    for q in dataset["question"]:
        ans, cites = get_answer(q)  # your LangChain QA chain
        pred_answers.append(ans)
        pred_contexts.append(cites)
    dataset = dataset.add_column("answer", pred_answers)
    dataset = dataset.add_column("contexts", pred_contexts)
    return dataset
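# For reference, get_answer is assumed to return a (generated answer, retrieved
# context strings) pair; a sketch of the expected shape, not the real
# rag.qa_chain implementation:
#
#   def get_answer(question: str) -> tuple[str, list[str]]:
#       ...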
ds = load_pairs()
ds = produce_rag_answers(ds)

result = evaluate(
    ds,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    raise_exceptions=False,  # record failed rows instead of aborting the whole run
)
# evaluate() returns a ragas Result object; pd.DataFrame(result) on its scalar
# aggregate scores raises, so use to_pandas() for the per-sample score table.
df = result.to_pandas()
rich.print(df)
# save the per-sample table (DataFrame.to_markdown requires the tabulate package)
table_md = df.to_markdown(index=False)
TABLE.write_text(table_md)
# auto-append a short boilerplate conclusion (static text, not derived from the scores)
with TABLE.open("a") as f:
    f.write("\n\n### Conclusions\n")
    f.write("- Faithfulness below 0.7 means the model sometimes hallucinates.\n")
    f.write("- Context precision > 0.8 shows the retriever brings mostly relevant chunks.\n")
    f.write("- We will fine-tune embeddings in Stage 6 to raise recall and relevance.\n")
print(f"\nMarkdown table saved → {TABLE}")