"""
Run RAGAS evaluation on synthetic set.
Saves Markdown table to reports/ragas_baseline.md
"""
import json
import pathlib

import rich
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness, answer_relevancy,  # the ragas metric is answer_relevancy, not answer_relevance
    context_precision, context_recall,
)

from rag.qa_chain import get_answer

DATA = pathlib.Path("data/synth_I485.jsonl")
REPORT_DIR = pathlib.Path("reports"); REPORT_DIR.mkdir(exist_ok=True)
TABLE = REPORT_DIR / "ragas_baseline.md"
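
# Each line of data/synth_I485.jsonl is assumed to be a JSON object with the
# fields read in load_pairs() below; an illustrative (made-up) record:
#   {"question": "...", "answer": "...",
#    "contexts": ["...chunk 1...", "...chunk 2..."],
#    "ground_truths": ["..."]}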
def load_pairs():
    rows = []
    for line in DATA.open():
        obj = json.loads(line)
        rows.append({
            "question": obj["question"],
            "answer": obj["answer"],
            "contexts": obj["contexts"],
            # Depending on your ragas version, this column may need to be a
            # single "ground_truth" string instead of a list.
            "ground_truths": obj["ground_truths"],
        })
    return Dataset.from_list(rows)


def produce_rag_answers(dataset):
    pred_answers, pred_contexts = [], []
    for q in dataset["question"]:
        ans, cites = get_answer(q)  # your LangChain QA chain: (answer text, retrieved chunks)
        pred_answers.append(ans)
        pred_contexts.append(cites)
    # RAGAS scores the "answer" and "contexts" columns, so overwrite the
    # synthetic ones with what the RAG chain actually produced.
    dataset = dataset.remove_columns(["answer", "contexts"])
    dataset = dataset.add_column("answer", pred_answers)
    dataset = dataset.add_column("contexts", pred_contexts)
    return dataset


ds = load_pairs()
ds = produce_rag_answers(ds)

result = evaluate(
    ds,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    raise_exceptions=False,
)
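# In the ragas versions this script assumes, evaluate() returns a Result that
# behaves like a dict of aggregate scores (e.g. result["faithfulness"]), while
# to_pandas() below gives the per-question breakdown.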
df = result.to_pandas()  # per-question metric scores as a DataFrame
rich.print(df)

# save table
table_md = df.to_markdown(index=False)
TABLE.write_text(table_md)

# auto-append a short boilerplate conclusion
with TABLE.open("a") as f:
    f.write("\n\n### Conclusions\n")
    f.write("- Faithfulness below 0.7 means the model sometimes hallucinates.\n")
    f.write("- Context precision > 0.8 shows the retriever brings mostly relevant chunks.\n")
    f.write("- We will fine-tune embeddings in Stage 6 to raise recall and relevance.\n")

print(f"\nMarkdown table saved → {TABLE}")