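"""Evaluate RAG retrieval and generation with the Ragas NVIDIA metrics.

Example invocation (assuming this script is saved as eval.py):

    python eval.py --compute_mode remote --workflow chain --search_type hybrid

Questions and reference answers are read from eval.csv.
"""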
import csv
from main import RunChain, RunGraph
from retriever import BuildRetriever
from ragas import EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
AnswerAccuracy,
ContextRelevance,
ResponseGroundedness,
)
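# The three "NVIDIA" metrics used below: ContextRelevance scores how relevant
# the retrieved contexts are to the question, ResponseGroundedness scores how
# well the response is supported by those contexts, and AnswerAccuracy scores
# the response against the reference answer.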
from langchain_openai import ChatOpenAI
import argparse
import logging
import traceback

# Suppress these messages:
# INFO:openai._base_client:Retrying request to /chat/completions in ___ seconds
# https://community.openai.com/t/suppress-http-request-post-message/583334/8
openai_logger = logging.getLogger("openai")
openai_logger.setLevel(logging.WARNING)


def load_questions_and_references(csv_path):
"""Read questions and references from CSV"""
questions = []
references = []
with open(csv_path, newline="") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
questions.append(row["question"].strip('"'))
references.append(row["reference"].strip('"'))
return questions, references


def build_eval_dataset(questions, references, compute_mode, workflow, search_type):
"""Build dataset for evaluation"""
dataset = []
for question, reference in zip(questions, references):
try:
if workflow == "chain":
print("\n\n--- Question ---")
print(question)
response = RunChain(question, compute_mode, search_type)
print("--- Response ---")
print(response)
# Retrieve context documents for a question
retriever = BuildRetriever(compute_mode, search_type)
docs = retriever.invoke(question)
retrieved_contexts = [doc.page_content for doc in docs]
if workflow == "graph":
result = RunGraph(question, compute_mode, search_type)
retrieved_contexts = []
if "retrieved_emails" in result:
                    # Remove the source file names (e.g. R-help/2022-September.txt) as they confuse the evaluator
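                    # Each retrieved email appears to start with its source file
                    # name followed by "\n\n\nFrom ..."; keeping the text from the
                    # first "\n\n\nFrom" onward drops that prefix.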
retrieved_contexts = [
"\n\n\nFrom" + email.split("\n\n\nFrom")[1]
for email in result["retrieved_emails"]
]
response = result["answer"]
dataset.append(
{
"user_input": question,
"retrieved_contexts": retrieved_contexts,
"response": response,
"reference": reference,
}
)
        except Exception:
print(
f"--- Question omitted from evals due to failed generation: {question} ---"
)
print(traceback.format_exc())
return dataset


def run_evals_with_csv(csv_path):
"""Run evals using saved responses in a CSV file"""
# Load an evaluation dataset from saved responses in a CSV file
csv_questions = []
retrieved_emails = []
answers = []
with open(csv_path, newline="") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
csv_questions.append(row["question"].strip('"'))
retrieved_emails.append(row["retrieved_emails"].strip('"'))
answers.append(row["answer"].strip('"'))
questions, references = load_questions_and_references("eval.csv")
    # Make sure the saved questions match those in eval.csv
    assert csv_questions == questions, "questions in CSV do not match eval.csv"
# Build dataset for evaluation
dataset = []
for question, reference, retrieved_email, answer in zip(
questions, references, retrieved_emails, answers
):
        # Remove the source file names (e.g. R-help/2022-September.txt) as they confuse the evaluator
retrieved_contexts = (
[
"\n\n\nFrom" + email.split("\n\n\nFrom")[1]
for email in retrieved_email.split(
"\n\n--- --- --- --- Next Email --- --- --- ---\n\n"
)
]
if retrieved_email != ""
else []
)
dataset.append(
{
"user_input": question,
"retrieved_contexts": retrieved_contexts,
"response": answer,
"reference": reference,
}
)
evaluation_dataset = EvaluationDataset.from_list(dataset)
# Set up LLM for evaluation
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
evaluator_llm = LangchainLLMWrapper(llm)
# Evaluate
result = evaluate(
dataset=evaluation_dataset,
# NVIDIA metrics
metrics=[ContextRelevance(), ResponseGroundedness(), AnswerAccuracy()],
llm=evaluator_llm,
)
print("Evaluation Results:")
print(result)


def main():
parser = argparse.ArgumentParser(
description="Evaluate RAG retrieval and generation."
)
parser.add_argument(
"--compute_mode",
choices=["remote", "local"],
required=True,
help="Compute mode: remote or local.",
)
parser.add_argument(
"--workflow",
choices=["chain", "graph"],
required=True,
help="Workflow: chain or graph.",
)
parser.add_argument(
"--search_type",
choices=["dense", "sparse", "hybrid"],
required=True,
help="Search type: dense, sparse, or hybrid.",
)
args = parser.parse_args()
compute_mode = args.compute_mode
workflow = args.workflow
search_type = args.search_type
questions, references = load_questions_and_references("eval.csv")
dataset = build_eval_dataset(
questions, references, compute_mode, workflow, search_type
)
evaluation_dataset = EvaluationDataset.from_list(dataset)
# Set up LLM for evaluation
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
evaluator_llm = LangchainLLMWrapper(llm)
# Evaluate
result = evaluate(
dataset=evaluation_dataset,
# NVIDIA metrics
metrics=[ContextRelevance(), ResponseGroundedness(), AnswerAccuracy()],
llm=evaluator_llm,
)
print("Evaluation Results:")
print(result)
if __name__ == "__main__":
main()