""" | |
Run RAGAS evaluation on synthetic set. | |
Saves Markdown table to reports/ragas_baseline.md | |
""" | |
import json
import pathlib

import pandas as pd
import rich
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,  # ragas exports "answer_relevancy", not "answer_relevance"
    context_precision,
    context_recall,
    faithfulness,
)
from rag.qa_chain import get_answer
DATA = pathlib.Path("data/synth_I485.jsonl")
REPORT_DIR = pathlib.Path("reports")
REPORT_DIR.mkdir(exist_ok=True)
TABLE = REPORT_DIR / "ragas_baseline.md"
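# Illustrative only: each line of the JSONL file is assumed to hold one record
# with the keys read in load_pairs(), e.g.
#   {"question": "...", "answer": "...",
#    "contexts": ["chunk 1", "chunk 2"], "ground_truths": ["reference answer"]}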
def load_pairs():
    rows = []
    for line in DATA.open():
        obj = json.loads(line)
        rows.append({
            "question": obj["question"],
            "answer": obj["answer"],
            "contexts": obj["contexts"],
            "ground_truths": obj["ground_truths"],
        })
    return Dataset.from_list(rows)
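# Assumption about the project-local helper: rag.qa_chain.get_answer(question)
# is taken to return a tuple of (answer string, list of retrieved context
# strings); adjust the unpacking below if your chain returns a different shape.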
def produce_rag_answers(dataset):
    pred_answers, pred_contexts = [], []
    for q in dataset["question"]:
        ans, cites = get_answer(q)  # your LangChain QA chain
        pred_answers.append(ans)
        pred_contexts.append(cites)
    # RAGAS scores the "answer" and "contexts" columns, so the generated answer
    # and retrieved contexts must replace the reference ones; columns under
    # other names would simply be ignored by evaluate().
    dataset = dataset.remove_columns(["answer", "contexts"])
    dataset = dataset.add_column("answer", pred_answers)
    dataset = dataset.add_column("contexts", pred_contexts)
    return dataset
ds = load_pairs()
ds = produce_rag_answers(ds)

result = evaluate(
    ds,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    raise_exceptions=False,  # failed rows are logged and scored NaN instead of aborting the run
)
# evaluate() returns a ragas Result object; to_pandas() expands it into one
# row per sample with a column per metric.
df = result.to_pandas()
rich.print(df)
# save table
table_md = df.to_markdown(index=False)
TABLE.write_text(table_md)

# auto-append short boilerplate conclusions
with TABLE.open("a") as f:
    f.write("\n\n### Conclusions\n")
    f.write("- Faithfulness below 0.7 means the model sometimes hallucinates.\n")
    f.write("- Context precision > 0.8 shows the retriever returns mostly relevant chunks.\n")
    f.write("- We will fine-tune embeddings in Stage 6 to raise recall and relevance.\n")

print(f"\nMarkdown table saved → {TABLE}")
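# Usage sketch (the filename is hypothetical, not from the original): assuming
# this script is saved as eval_ragas.py, run it from the repo root once the
# synthetic set exists:
#   python eval_ragas.py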