# Source: Hugging Face Space "Loren/GAIA_Agents_Evaluations" (status: Running) - file size: 15,308 bytes

import streamlit as st
import pandas as pd
import re
import ast
import io
import os
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from pathlib import Path
import uuid
import warnings
warnings.filterwarnings("ignore")

####################################################################
###   FUNCTIONS                                                  ###
####################################################################

def initializations():
    """Seed ``st.session_state`` with the application's default settings.

    Only keys that are missing are written, so the function can be called at
    the top of every script run without overwriting values the user changed
    (uploaded files, selected question, ...).

    The function is deliberately NOT wrapped in ``st.cache_data``: it returns
    nothing and works purely by side effect on the session state, so a cached
    call would skip the body and leave the session uninitialised.
    """
    defaults = {
        "question": "",
        "file_dataset": "./data/gaia_subset.csv",
        "file_evaluations": "./data/gaia_evals.csv",
        "gaia": True,
        "file_lib": "./data/lib.md",
        "file_sidebar": "./data/gaia_sidebar.txt",
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

    # Key of the selectable dataframe widget; generated once per session.
    if "dfk" not in st.session_state:
        st.session_state.dfk = str(uuid.uuid4())
#

@st.cache_data(show_spinner=True)
def get_dataset(dataset_file):
    """Load the test dataset from a 'µ'-separated CSV source.

    Args:
        dataset_file: Anything ``pandas.read_csv`` accepts as its first
            argument (the script passes either a path string or an uploaded
            file object).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    # The python engine is requested explicitly together with the 'µ' separator.
    dataset = pd.read_csv(dataset_file, sep='µ', engine='python')
    return dataset
#

@st.cache_data(show_spinner=True)
def get_evaluations(eval_file):
    """Load the agents' evaluations and build the tables shown by the app.

    The evaluation CSV ('µ'-separated) is left-joined on ``task_id`` with the
    ``task_id``, ``question``, ``file_url`` and ``answer`` columns of
    ``st.session_state.df_dataset``; each submitted answer is then compared
    with the expected one.

    NOTE(review): the dataset is read from the session state rather than
    passed as an argument, so it presumably does not take part in the cache
    key of this function - confirm that a dataset change is picked up.

    Args:
        eval_file: Source of the evaluation CSV, handed to ``pandas.read_csv``.

    Returns:
        A tuple ``(df_eval, df_synth, df_perf, list_labels)``:
        * ``df_eval``: one row per (task_id, question) with, per label, the
          ``eval``, ``submitted_answer`` and ``messages`` values;
        * ``df_synth``: questions x labels table of the ``eval`` values;
        * ``df_perf``: two rows, "Nb correct" and "% correct", per label;
        * ``list_labels``: labels in order of first appearance.
    """
    def normalize(text):
        # Lower-case, drop one trailing period, tighten ", " into ",".
        without_final_dot = re.sub(r'\.$', '', text.lower())
        return without_final_dot.replace(', ', ',')

    def set_eval(answer1, answer2):
        return normalize(answer1) == normalize(answer2)

    def eval_row(row):
        return set_eval(str(row['submitted_answer']), str(row['answer']))

    raw_evals = pd.read_csv(eval_file, sep='µ', engine='python')
    reference = st.session_state.df_dataset[['task_id', 'question', 'file_url', 'answer']]
    df = raw_evals.merge(reference, on='task_id', how='left')

    # Orders of first appearance, reused below to arrange columns and rows.
    list_labels = pd.unique(df['label'])
    list_questions = pd.unique(df['question'])

    df['eval'] = df.apply(eval_row, axis=1)

    # Wide table: one row per task, one column per (value, label) pair.
    wide = df.pivot(index=['task_id', 'question'], columns='label',
                    values=['eval', 'submitted_answer', 'messages'])
    wide = wide.reindex(columns=list_labels, level=1)
    flat = wide.reset_index()
    # Sorting on an ordered categorical restores the questions' original order.
    flat['question'] = pd.Categorical(flat['question'],
                                      categories=list_questions, ordered=True)
    df_eval = flat.sort_values('question')

    # Synthesis table, with rows aligned on the questions of df_eval.
    per_question = df.pivot(index='question', columns='label', values='eval')
    per_question = per_question.reindex(columns=list_labels)
    df_synth = per_question.reindex(pd.unique(df_eval['question']))

    totaux = df_synth.sum(axis=0)

    # Build a frame whose columns are the labels, append the two summary
    # rows, then drop the leftover row produced by the transposition.
    df_perf = totaux.reset_index().T
    df_perf.columns = df_perf.iloc[0]
    df_perf = df_perf.iloc[1:]
    df_perf.loc["Nb correct"] = totaux
    df_perf.loc["% correct"] = totaux * 100 / len(df_eval)
    df_perf = df_perf.iloc[1:]

    return df_eval, df_synth, df_perf, list_labels
#

@st.cache_data(show_spinner=True)
def get_lib(lib_file):
    """Return the dataset description as text.

    Args:
        lib_file: Either a path given as ``str`` or an object whose
            ``read()`` returns UTF-8 encoded bytes.

    Returns:
        The decoded text content.
    """
    if not isinstance(lib_file, str):
        raw_bytes = lib_file.read()
        return raw_bytes.decode("utf-8")
    return Path(lib_file).read_text(encoding="utf-8")
#

@st.cache_data(show_spinner=True)
def get_sidebar(sidebar_file):
    """Return the lines of the sidebar description.

    Args:
        sidebar_file: Either a path given as ``str`` or an object whose
            ``read()`` returns UTF-8 encoded bytes.

    Returns:
        A list of lines, each keeping its line ending.
    """
    if not isinstance(sidebar_file, str):
        decoded = sidebar_file.read().decode("utf-8")
        return list(io.StringIO(decoded))

    with open(sidebar_file, "r", encoding="utf-8") as handle:
        return list(handle)
#

def _parse_message_call(msg_str):
    """Split one ``XMessage(...)`` repr into its type name and arguments.

    The text is parsed as a Python call expression and every argument is
    evaluated with ``ast.literal_eval``, so '=' characters that appear inside
    string values (URLs, code snippets, ...) are left untouched.

    Returns:
        ``(type_name, positional_args, keyword_args)``.

    Raises:
        SyntaxError: if the text is not a valid Python expression.
        ValueError: if it is not a plain call or an argument is not a literal.
    """
    node = ast.parse(msg_str, mode='eval').body
    if not (isinstance(node, ast.Call) and isinstance(node.func, ast.Name)):
        raise ValueError("not a message constructor call")
    positional = [ast.literal_eval(arg) for arg in node.args]
    keywords = {}
    for keyword in node.keywords:
        if keyword.arg is None:
            raise ValueError("'**' unpacking is not supported")
        keywords[keyword.arg] = ast.literal_eval(keyword.value)
    return node.func.id, positional, keywords


def parse_messages_from_string(messages_str):
    """Rebuild LangChain message objects from the repr of a message history.

    Args:
        messages_str: Text containing ``'messages': [HumanMessage(...),
            AIMessage(...), ToolMessage(...), ...]``.

    Returns:
        ``(messages, status)``. ``messages`` is the list of rebuilt messages;
        a message that cannot be rebuilt is replaced by two ``HumanMessage``
        markers describing the error. ``status`` is ``False`` as soon as
        anything could not be parsed (including when no message list is found
        at all), which tells the caller to also display the raw text.
    """
    messages = []
    status = True
    try:
        messages_match = re.search(r"'messages': \[(.*)\]", messages_str, re.DOTALL)
        if messages_match is None:
            raise ValueError("no 'messages' list found")
        messages_content = messages_match.group(1)
        message_splits = re.findall(r'(HumanMessage\(.*?\)|AIMessage\(.*?\)|ToolMessage\(.*?\))(?=, HumanMessage\(|, AIMessage\(|, ToolMessage\(|$)', messages_content, re.DOTALL)

        for msg_str in message_splits:
            try:
                msg_type, positional, keywords = _parse_message_call(msg_str)
                # Create the matching message object.
                if msg_type == 'HumanMessage':
                    message = HumanMessage(*positional, **keywords)
                elif msg_type == 'AIMessage':
                    message = AIMessage(*positional, **keywords)
                elif msg_type == 'ToolMessage':
                    message = ToolMessage(*positional, **keywords)
                else:
                    continue  # Unknown type, skip it
                messages.append(message)
            except Exception as e:
                # Keep going: flag the broken message and let the caller show
                # the original text alongside what could be rebuilt.
                messages.append(HumanMessage(f"*** Error parsing message: {e}"))
                messages.append(HumanMessage("*** See the original list of messages below"))
                status = False
                print(f"Error parsing message: {e}")
    except Exception as e:
        # Nothing usable was found: report failure instead of an empty success.
        status = False
        print(f"Erreur lors de l'analyse du messageparse_message_from_string: {e}")
    # Returning here (not from a ``finally`` clause) lets KeyboardInterrupt and
    # SystemExit propagate normally.
    return messages, status
#

def get_details():
    """Show, in each agent tab, the details of the row selected in the synthesis table.

    Used as the ``on_select`` callback of the synthesis dataframe. It reads
    the widget state stored under ``st.session_state.dfk``, stores the
    selected question in ``st.session_state.question`` and writes, for every
    label, the submitted answer, a correct / needs-review badge and the
    message history into the corresponding entry of the module-level
    ``list_tabs``.
    """
    dfkey = st.session_state.dfk
    widget_state = st.session_state[dfkey]
    if len(widget_state) == 0:
        return
    selected_rows = widget_state["selection"]["rows"]
    if len(selected_rows) == 0:
        return

    num_raw = selected_rows[0]
    # Fetch the selected row once instead of re-slicing the frame per field.
    row = st.session_state.df_eval.iloc[num_raw]
    st.session_state.question = row['question'].squeeze()

    verdicts = row['eval']
    answers = row['submitted_answer']
    histories = row['messages']

    for i in range(len(st.session_state.list_labels)):
        with list_tabs[i].chat_message("ai"):
            # Explicit positional access: plain ``series[i]`` on a
            # label-indexed Series relies on a deprecated fallback and would
            # be label-based if the labels were integers.
            verdict = verdicts.iloc[i]
            # A missing verdict (NaN) is truthy; do not report it as correct.
            is_correct = bool(pd.notna(verdict) and verdict)
            if is_correct:
                badge = ":green-badge[:material/check: Correct]"
            else:
                badge = ":orange-badge[⚠️ Needs review]"
            st.markdown(str(answers.iloc[i]) + "     " + badge)

            raw_history = histories.iloc[i]
            messages, status = parse_messages_from_string(raw_history)
            c = st.container(border=True)
            c.markdown("### Message history:")
            c.text("\n".join(m.pretty_repr() for m in messages))
            if not status:
                # Parsing was incomplete: also show the original text.
                c.text(raw_history)
#

def save_uploaded_file(uploaded_file, folder="data"):
    """Write an uploaded file into ``folder`` and return the path it was saved to.

    Args:
        uploaded_file: Object exposing ``name`` (the file name) and
            ``getbuffer()`` (the content as a bytes-like object).
        folder: Destination directory, created when missing.

    Returns:
        The path of the written file, ``os.path.join(folder, <base name>)``.

    Raises:
        ValueError: if ``uploaded_file.name`` has no usable base name.
    """
    # The name comes from the client: keep only its last path component
    # (for both '/' and '\\' separators) so the file cannot be written
    # outside of ``folder``.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload file name: {uploaded_file.name!r}")

    os.makedirs(folder, exist_ok=True)
    save_path = os.path.join(folder, safe_name)
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return save_path
#
####################################################################
###   MAIN                                                       ###
####################################################################

#--- Initializations
st.set_page_config(
    page_title='Agents evaluation',
    layout="wide",
    initial_sidebar_state="auto",
)
initializations()

# Make sure every setting exists for this session, whatever the call above did.
_session_defaults = (
    ('question', ""),
    ('file_dataset', "./data/gaia_subset.csv"),
    ('file_evaluations', "./data/gaia_evals.csv"),
    ('gaia', True),
    ('file_lib', "./data/lib.md"),
    ('file_sidebar', "./data/gaia_sidebar.txt"),
)
for _key, _default in _session_defaults:
    if _key not in st.session_state:
        st.session_state[_key] = _default
# The widget key is random, so it is only generated when actually needed.
if 'dfk' not in st.session_state:
    st.session_state.dfk = str(uuid.uuid4())
    
#--- Set title
# `host` is where the help popover and the upload expander are created:
# the right-hand column in GAIA mode, the page itself otherwise.
if st.session_state.gaia:
    col1, col2 = st.columns([0.4, 0.6], vertical_alignment="center")
    col1.image("thumbnail.jpg")
    col2.markdown(
        "<h1 style='text-align: center; color: orange;'>GAIA subset evaluation</h1>",
        unsafe_allow_html=True,
    )
    col1.link_button(
        ":blue[More information]",
        "https://huggingface.co/learn/agents-course/unit4/introduction",
    )
    host = col2
else:
    st.markdown(
        "<h1 style='text-align: center; color: orange;'>Agents evaluation</h1>",
        unsafe_allow_html=True,
    )
    host = st

pop = host.container()
upd = host.expander(":red[**Upload files to update app**]")


#--- Popover
# Help popover, created inside the `pop` container: it documents the file
# formats accepted by the upload form (test dataset, evaluation dataset,
# title/description file and sidebar description file).
with pop.popover("### 💡 :red[**How to configure the app to use it with a different evaluation?**]",
                 use_container_width=True):
    # Test dataset: expected fields, followed by a sample file.
    st.markdown("""You can modify the data the application is based on by **uploading** your own files, respecting the expected **formats**:  \n
The **test dataset** must be a csv file with the **µ** separator character. The header line must contain the expected **fields**:  \n
>***task id,  question,  file name,  file url  ,answer.***   \n
>*task_id, question, file_name, file_url, answer*   \n
*Example of test dataset:*""")
    st.code("""task_idµquestionµfile_nameµfile_urlµanswer   \n
2d83110e-a098-4ebb-9987-066c06fa42d0µ".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"µµµright   \n
""", language=None)
    st.markdown("___")
    # Evaluation dataset: expected fields, followed by a sample file.
    st.markdown("""The **evaluation dataset** must also be a csv file with the **µ** separator character. The header line must contain the expected **fields**:  \n
>***label of the agent,  task id,  agent's response,  message history (a string formatted as a list of HumanMessage, AIMessage, ToolMessage from Langchain).***   \n
>*label,  task_id,  submitted_answer,  messages*   \n
*Example of evaluation dataset:*""")
    st.code("""labelµtask_idµsubmitted_answerµmessages
Qwen2.5-72B-Instructµ2d83110e-a098-4ebb-9987-066c06fa42d0µrightµ"{'messages': [HumanMessage(content='.rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI', additional_kwargs={}, response_metadata={}, id='98460ac1-f0c0-41dc-8f32-ddf50b123a71'), AIMessage(content='The user wrote a sentence in reverse. ... There\'s no need for any tools here because this is a basic vocabulary question. ... Therefore, the final answer is ""right.""\n</think>\n\nFINAL ANSWER: right', additional_kwargs={}, response_metadata={...}, 'model_name': 'Qwen/Qwen3-235B-A22B', ...}, ..."
""", language=None)
    st.markdown("___")
    # Title/description file: sample content.
    st.markdown("""You can also set your **title** and your **sidebar** by **uploading** appropriate files:   \n
* a md or txt file for the title.   \n
*Example:*""")
    st.code("""*GAIA is a benchmark which aims at ...*
***Data***
*GAIA is made of more than 450 non-trivial question with an unambiguous answer, ...*
""", language=None)
    # Sidebar file: one "title;<text>" or "tool;<text>" entry per line.
    st.markdown("""* a text file describing, in markdown, the section titles and tool descriptions.   \n
*Example:*""")
    st.code("""title;:orange[Langchain tools]
tool;:material/language: TavilySearch
tool;:material/newsstand: WikipediaQueryRun
title;:orange[Custom tools]
tool;:material/slideshow: Ask Youtube video
tool;:material/chess: Chessboard description
tool;:material/speech_to_text: Audio transcription
tool;:material/text_snippet: Get file content
tool;:material/add: Sum numbers
""", language=None)

#--- Update app configuration
# Upload form: each uploaded file replaces the matching source stored in the
# session state; the data itself is (re)loaded further down in the script.
with upd.form(":red[**Update app**]"):
    uploaded_dataset = st.file_uploader("Choose the **dataset** file:", type='csv')
    uploaded_evaluations = st.file_uploader("Choose the **evaluation**s file:", type='csv')
    uploaded_lib = st.file_uploader("Choose the file with the dataset **description**:", type=['md', 'txt'])
    uploaded_sidebar = st.file_uploader("Choose the file with the **sidebar** description:", type=['md', 'txt'])
    valid = st.form_submit_button("🚀 :red[**Update app**]")
if valid:
    if uploaded_lib is not None:
        # A custom description switches the page to the generic title.
        st.session_state.gaia = False
        st.session_state.file_lib = uploaded_lib
    if uploaded_dataset is not None:
        st.session_state.file_dataset = uploaded_dataset
        st.session_state.question = ""
        # get_evaluations() merges with the dataset held in the session state
        # but is cached on the evaluation file only: drop its cache so the
        # evaluations are recomputed against the new dataset.
        get_evaluations.clear()
    if uploaded_evaluations is not None:
        st.session_state.file_evaluations = save_uploaded_file(uploaded_evaluations)
        print('fichier sauvegardé : ', st.session_state.file_evaluations)
        # The evaluations are cached on the saved path: a new upload with the
        # same file name would otherwise return the previous content.
        get_evaluations.clear()
        # A fresh widget key discards the row selection of the previous table.
        st.session_state.dfk = str(uuid.uuid4())
        st.session_state.question = ""
        # NOTE(review): `list_tabs` is only assigned at the end of the script,
        # so this presumably never triggers on a fresh script run - confirm.
        if 'list_tabs' in locals():
            del list_tabs
    if uploaded_sidebar is not None:
        st.session_state.file_sidebar = uploaded_sidebar


def _report_error(context, exc):
    """Show ``context`` as an error message, then the exception with its traceback.

    ``st.exception`` is given the exception object itself (not a formatted
    string) so that its type and stack trace are displayed.
    """
    st.error(f'{context}: {exc}')
    st.exception(exc)


# Each loading step is isolated so that one failing source is reported on the
# page without preventing the following steps from running.

#--- Get dataset information
try:
    st.session_state.lib = get_lib(st.session_state.file_lib)
except Exception as e:
    _report_error('Error during get_lib', e)

#--- Get sidebar description
try:
    st.session_state.lignes = get_sidebar(st.session_state.file_sidebar)
except Exception as e:
    _report_error('Error during get_sidebar', e)

#--- Set sidebar
try:
    with st.sidebar:
        st.markdown("# :material/construction: Tools used")
        # Each line reads "<kind>;<markdown text>" with kind 'title' or 'tool';
        # lines of any other kind are ignored.
        for ligne in st.session_state.lignes:
            lig = ligne.split(";")
            if lig[0] == 'title':
                st.markdown("## "+lig[1])
            if lig[0] == 'tool':
                with st.container(border=True):
                    st.markdown("### "+lig[1])
except Exception as e:
    _report_error('Error during set sidebar', e)

#--- Get dataset
try:
    st.session_state.df_dataset = get_dataset(st.session_state.file_dataset)
except Exception as e:
    _report_error('Error during get_dataset', e)

#--- Get evaluations
try:
    (st.session_state.df_eval,
     st.session_state.df_synth,
     st.session_state.df_perf,
     st.session_state.list_labels) = get_evaluations(st.session_state.file_evaluations)
except Exception as e:
    _report_error('Error during get_evaluations', e)


#--- Show dataset expander
with st.expander("## **:orange[Dataset informations]**", expanded=False):
    try:
        st.markdown(">"+st.session_state.lib)
        st.markdown("#### Test dataset:")
        st.dataframe(st.session_state.df_dataset[['question', 'file_url']],
                    column_config={"file_url": st.column_config.LinkColumn("Attached file",
                                                            display_text="Download attached file"),
                                    "question": st.column_config.TextColumn(max_chars=None)})
    except Exception as e:
        # Pass the exception object itself so its type and traceback are shown.
        st.error(f'Error in dataset informations: {e}')
        st.exception(e)

# Without evaluation data there is nothing more to render: stop with a clear
# message instead of failing on a missing session-state entry.
if any(name not in st.session_state for name in ('df_perf', 'df_synth', 'list_labels')):
    st.warning("No evaluation data could be loaded; upload valid dataset and evaluation files.")
    st.stop()

#--- Show perf dataframe
st.dataframe(st.session_state.df_perf)

#--- Show evaluations synthesys
st.markdown("👇 Click to the left of the question to obtain details of the different model evaluations")
st.dataframe(st.session_state.df_synth, on_select=get_details, key=st.session_state.dfk,
             selection_mode="single-row")

#--- Details container
cont = st.container()

with cont.chat_message('user'):
    st.markdown(f'###### :blue[{st.session_state.question}]')

# One tab per agent label. String labels are used as they are; sequences of
# strings are concatenated (as before) and anything else falls back to str().
cols = []
for col in st.session_state.list_labels:
    if isinstance(col, str):
        cols.append(col.strip())
        continue
    try:
        cols.append(''.join(col).strip())
    except TypeError:
        cols.append(str(col).strip())

# Module-level on purpose: get_details() writes into these tabs.
list_tabs = cont.tabs(cols)