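"""Evaluate RAG retrieval and generation over the questions in eval.csv.

Each question is answered with either the chain or the graph workflow, and the
responses are scored with the Ragas NVIDIA metrics (context relevance,
response groundedness, answer accuracy) using gpt-4o-mini as the evaluator.

Example invocation (the filename evals.py is an assumption):

    python evals.py --compute_mode remote --workflow graph --search_type hybrid
"""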

import argparse
import csv
import logging
import traceback

from langchain_openai import ChatOpenAI
from ragas import EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    AnswerAccuracy,
    ContextRelevance,
    ResponseGroundedness,
)

from main import RunChain, RunGraph
from retriever import BuildRetriever

# Suppress these messages:
# INFO:openai._base_client:Retrying request to /chat/completions in ___ seconds
# https://community.openai.com/t/suppress-http-request-post-message/583334/8
openai_logger = logging.getLogger("openai")
openai_logger.setLevel(logging.WARNING)


def load_questions_and_references(csv_path):
    """Read questions and references from CSV"""
    questions = []
    references = []
    with open(csv_path, newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            questions.append(row["question"].strip('"'))
            references.append(row["reference"].strip('"'))
    return questions, references
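
# eval.csv is expected to provide "question" and "reference" columns.
# Illustrative layout (these rows are examples, not taken from the repository):
#
#   question,reference
#   "How do I reverse a vector in R?","Use rev(x)."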


def build_eval_dataset(questions, references, compute_mode, workflow, search_type):
    """Build dataset for evaluation"""
    dataset = []
    for question, reference in zip(questions, references):
        try:
            if workflow == "chain":
                print("\n\n--- Question ---")
                print(question)
                response = RunChain(question, compute_mode, search_type)
                print("--- Response ---")
                print(response)
                # Retrieve context documents for a question
                retriever = BuildRetriever(compute_mode, search_type)
                docs = retriever.invoke(question)
                retrieved_contexts = [doc.page_content for doc in docs]
            elif workflow == "graph":
                result = RunGraph(question, compute_mode, search_type)
                retrieved_contexts = []
                if "retrieved_emails" in result:
                    # Remove the source file names (e.g. R-help/2022-September.txt), which confuse the evaluator
                    retrieved_contexts = [
                        "\n\n\nFrom" + email.split("\n\n\nFrom")[1]
                        for email in result["retrieved_emails"]
                    ]
                response = result["answer"]
            dataset.append(
                {
                    "user_input": question,
                    "retrieved_contexts": retrieved_contexts,
                    "response": response,
                    "reference": reference,
                }
            )
        except Exception:
            print(
                f"--- Question omitted from evals due to failed generation: {question} ---"
            )
            print(traceback.format_exc())

    return dataset


def run_evals_with_csv(csv_path):
    """Run evals using saved responses in a CSV file"""

    # Load an evaluation dataset from saved responses in a CSV file
    csv_questions = []
    retrieved_emails = []
    answers = []

    with open(csv_path, newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            csv_questions.append(row["question"].strip('"'))
            retrieved_emails.append(row["retrieved_emails"].strip('"'))
            answers.append(row["answer"].strip('"'))

    questions, references = load_questions_and_references("eval.csv")

    # Make sure the questions are the same
    assert csv_questions == questions

    # Build dataset for evaluation
    dataset = []
    for question, reference, retrieved_email, answer in zip(
        questions, references, retrieved_emails, answers
    ):
        # Remove the source file names (e.g. R-help/2022-September.txt), which confuse the evaluator
        retrieved_contexts = (
            [
                "\n\n\nFrom" + email.split("\n\n\nFrom")[1]
                for email in retrieved_email.split(
                    "\n\n--- --- --- --- Next Email --- --- --- ---\n\n"
                )
            ]
            if retrieved_email != ""
            else []
        )
        dataset.append(
            {
                "user_input": question,
                "retrieved_contexts": retrieved_contexts,
                "response": answer,
                "reference": reference,
            }
        )

    evaluation_dataset = EvaluationDataset.from_list(dataset)

    # Set up LLM for evaluation
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    evaluator_llm = LangchainLLMWrapper(llm)

    # Evaluate
    result = evaluate(
        dataset=evaluation_dataset,
        # NVIDIA metrics
        metrics=[ContextRelevance(), ResponseGroundedness(), AnswerAccuracy()],
        llm=evaluator_llm,
    )
    print("Evaluation Results:")
    print(result)
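
# Note: run_evals_with_csv() is not called from main(); it is meant for
# re-scoring previously saved responses. A minimal usage sketch (the module
# name "evals" and the CSV filename are assumptions):
#
#   >>> from evals import run_evals_with_csv
#   >>> run_evals_with_csv("saved_responses.csv")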


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate RAG retrieval and generation."
    )
    parser.add_argument(
        "--compute_mode",
        choices=["remote", "local"],
        required=True,
        help="Compute mode: remote or local.",
    )
    parser.add_argument(
        "--workflow",
        choices=["chain", "graph"],
        required=True,
        help="Workflow: chain or graph.",
    )
    parser.add_argument(
        "--search_type",
        choices=["dense", "sparse", "hybrid"],
        required=True,
        help="Search type: dense, sparse, or hybrid.",
    )
    args = parser.parse_args()
    compute_mode = args.compute_mode
    workflow = args.workflow
    search_type = args.search_type

    questions, references = load_questions_and_references("eval.csv")
    dataset = build_eval_dataset(
        questions, references, compute_mode, workflow, search_type
    )
    evaluation_dataset = EvaluationDataset.from_list(dataset)

    # Set up LLM for evaluation
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    evaluator_llm = LangchainLLMWrapper(llm)

    # Evaluate
    result = evaluate(
        dataset=evaluation_dataset,
        # NVIDIA metrics
        metrics=[ContextRelevance(), ResponseGroundedness(), AnswerAccuracy()],
        llm=evaluator_llm,
    )
    print("Evaluation Results:")
    print(result)


if __name__ == "__main__":
    main()