""" Run RAGAS evaluation on synthetic set. Saves Markdown table to reports/ragas_baseline.md """ import json, pathlib, pandas as pd, rich from datasets import Dataset from rag.qa_chain import get_answer from ragas.metrics import ( faithfulness, answer_relevance, context_precision, context_recall ) from ragas import evaluate DATA = pathlib.Path("data/synth_I485.jsonl") REPORT_DIR = pathlib.Path("reports"); REPORT_DIR.mkdir(exist_ok=True) TABLE = REPORT_DIR / "ragas_baseline.md" def load_pairs(): rows = [] for line in DATA.open(): obj = json.loads(line) rows.append({ "question": obj["question"], "answer": obj["answer"], "contexts": obj["contexts"], "ground_truths": obj["ground_truths"], }) return Dataset.from_list(rows) def produce_rag_answers(dataset): pred_answers, pred_contexts = [], [] for q in dataset["question"]: ans, cites = get_answer(q) # your langchain QA pred_answers.append(ans) pred_contexts.append(cites) dataset = dataset.add_column("generated_answer", pred_answers) dataset = dataset.add_column("retrieved_contexts", pred_contexts) return dataset ds = load_pairs() ds = produce_rag_answers(ds) result = evaluate( ds, metrics=[faithfulness, answer_relevance, context_precision, context_recall], raise_exceptions=False ) df = pd.DataFrame(result) rich.print(df) # save table table_md = df.to_markdown(index=False) TABLE.write_text(table_md) # auto‑append a short conclusion boiler‑plate with TABLE.open("a") as f: f.write("\n\n### Conclusions\n") f.write("- Faithfulness below 0.7 means the model sometimes hallucinates.\n") f.write("- Context precision > 0.8 shows retriever brings mostly relevant chunks.\n") f.write("- We will fine‑tune embeddings in Stage 6 to raise recall and relevance.\n") print(f"\nMarkdown table saved → {TABLE}")