Essi committed on
Commit 2e0b688 · 1 Parent(s): e04c929

feat: refactor prompt handling and improve routing logic for task classification

Files changed (3)
  1. app.py +17 -52
  2. prompts.yaml +44 -1
  3. tools.py +10 -15
app.py CHANGED
@@ -9,7 +9,7 @@ from langchain_core.messages import HumanMessage, SystemMessage
 from langchain_openai import ChatOpenAI
 from langgraph.graph import END, StateGraph
 
-from helpers import fetch_task_file, sniff_excel_type
+from helpers import fetch_task_file, get_prompt, sniff_excel_type
 from tools import (
     analyze_excel_file,
     calculator,
@@ -28,11 +28,6 @@ DEFAULT_API_URL: str = "https://agents-course-unit4-scoring.hf.space"
 MODEL_NAME: str = "o4-mini"  # "gpt-4.1-mini"
 TEMPERATURE: float = 0.1
 
-_SYSTEM_PROMPT = """You are a precise research assistant. Return ONLY the literal answer - no preamble.
-If the question asks for a *first name*, output the first given name only.
-If the answer is numeric, output digits only (no commas, units, or words).
-"""
-
 # --------------------------------------------------------------------------- #
 # QUESTION CLASSIFIER #
 # --------------------------------------------------------------------------- #
@@ -40,45 +35,13 @@ If the answer is numeric, output digits only (no commas, units, or words).
 _LABELS = Literal[
     "math",
     "youtube",
-    "image_generic",
-    "image_puzzle",
+    "image",
     "code",
     "excel",
     "audio",
     "general",
 ]
 
-_CLASSIFY_PROMPT = """You are a *routing* assistant.
-Your ONLY job is to print **one** of the allowed labels - nothing else.
-
-Allowed labels
-==============
-{labels}
-
-Guidelines
-----------
-• **math**: the question is a pure arithmetic/numeric expression.
-• **youtube**: the question contains a YouTube URL and asks about its content.
-• **code**: the task references attached Python code; caller wants its output.
-• **excel**: the task references an attached .xlsx/.xls/.csv and asks for a sum, average, etc.
-• **audio**: the task references an attached audio file and asks for its transcript or facts in it.
-• **image_generic**: the question asks only *what* is in the picture (e.g. “Which animal is shown?”).
-• **image_puzzle**: the question asks for a *move, count, coordinate,* or other board-game tactic that needs an exact piece layout (e.g. "What is Black's winning move?").
-• **general**: anything else (fallback).
-
-Example for the two image labels
---------------------------------
-1. "Identify the landmark in this photo." --> **image_generic**
-2. "It's Black to move in the attached chess position; give the winning line." --> **image_puzzle**
-
-~~~
-User question:
-{question}
-~~~
-
-IMPORTANT: Respond with **one label exactly**, no punctuation, no explanation.
-"""
-
 
 # --------------------------------------------------------------------------- #
 # ------------------------------- AGENT STATE ----------------------------- #
@@ -104,14 +67,12 @@ def classify(state: AgentState) -> AgentState:  # noqa: D401
     question = state["question"]
 
     label_values = set(get_args(_LABELS))  # -> ("math", "youtube", ...)
-    parsed_labels = ", ".join(repr(v) for v in label_values)
-    resp = (
-        _llm_router.invoke(
-            _CLASSIFY_PROMPT.format(question=question, labels=parsed_labels)
-        )
-        .content.strip()
-        .lower()
+    prompt = get_prompt(
+        prompt_key="router",
+        question=question,
+        labels=", ".join(repr(v) for v in label_values),
     )
+    resp = _llm_router.invoke(prompt).content.strip().lower()
     state["label"] = resp if resp in label_values else "general"
     return state
 
@@ -146,7 +107,7 @@ def gather_context(state: AgentState) -> AgentState:
         print(f"[DEBUG] octet-stream sniffed as {sniff_excel_type(blob)}")
 
         print("[DEBUG] Working with a Excel/CSV attachment file")
-        state["context"] = analyze_excel_file.invoke(
+        state["answer"] = analyze_excel_file.invoke(
            {"xls_bytes": blob, "question": question}
        )
        state["label"] = "excel"
@@ -162,7 +123,7 @@ def gather_context(state: AgentState) -> AgentState:
    # ── Image --------------------------------------------------------
    if "image" in ctype:
        print("[DEBUG] Working with an image attachment file")
-        state["context"] = vision_task.invoke(
+        state["answer"] = vision_task.invoke(
            {"img_bytes": blob, "question": question}
        )
        state["label"] = "image"
@@ -187,14 +148,18 @@ def gather_context(state: AgentState) -> AgentState:
 
 
 def generate_answer(state: AgentState) -> AgentState:
-    # Skip LLM for deterministic labels
-    if state["label"] in {"math", "code", "excel"}:
+    # Skip LLM for deterministic labels or tasks that already used LLMs
+    if state["label"] in {"code", "excel", "image", "math"}:
        return state
 
    prompt = [
-        SystemMessage(content=_SYSTEM_PROMPT),
+        SystemMessage(content=get_prompt("final_llm_system")),
        HumanMessage(
-            content=f"Question: {state['question']}\n\nContext:\n{state['context']}\n\nAnswer:"
+            content=get_prompt(
+                prompt_key="final_llm_user",
+                question=state["question"],
+                context=state["context"],
+            )
        ),
    ]
    raw = _llm_answer.invoke(prompt).content.strip()
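
Note: this commit collapses the old image_generic/image_puzzle labels into a single image label and moves every prompt string into prompts.yaml behind a shared get_prompt helper. The graph wiring itself is not touched by the diff; for orientation only, here is a minimal sketch of how a classify -> gather_context -> generate_answer pipeline is typically assembled with langgraph's StateGraph (node bodies are stubs and the actual wiring in app.py may differ):

# Hypothetical wiring sketch - the real graph construction is not part of this diff.
from typing import TypedDict
from langgraph.graph import END, StateGraph


class AgentState(TypedDict, total=False):
    question: str
    label: str
    context: str
    answer: str


def classify(state: AgentState) -> AgentState:        # stub for illustration
    state["label"] = "general"
    return state


def gather_context(state: AgentState) -> AgentState:  # stub for illustration
    state["context"] = ""
    return state


def generate_answer(state: AgentState) -> AgentState:  # stub for illustration
    state["answer"] = "stub answer"
    return state


graph = StateGraph(AgentState)
graph.add_node("classify", classify)
graph.add_node("gather_context", gather_context)
graph.add_node("generate_answer", generate_answer)
graph.set_entry_point("classify")
graph.add_edge("classify", "gather_context")
graph.add_edge("gather_context", "generate_answer")
graph.add_edge("generate_answer", END)

app = graph.compile()
print(app.invoke({"question": "What is 2 + 2?"}))

Because gather_context now writes state["answer"] directly for excel and image attachments, generate_answer only calls the LLM for labels that still need synthesis, which is what the expanded skip set {"code", "excel", "image", "math"} encodes.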
prompts.yaml CHANGED
@@ -1,4 +1,47 @@
-excel_analysis_one_liner: |
+router: |
+  You are a *routing* assistant.
+  Your ONLY job is to print **one** of the allowed labels - nothing else.
+
+  Allowed labels
+  ==============
+  {labels}
+
+  Guidelines
+  ----------
+  • **math**: the question is a pure arithmetic/numeric expression.
+  • **youtube**: the question contains a YouTube URL and asks about its content.
+  • **code**: the task references attached Python code; caller wants its output.
+  • **excel**: the task references an attached .xlsx/.xls/.csv and asks for a sum, average, etc.
+  • **image**: the task is either generic ("What is in the picture?", e.g. "Which animal is shown?") or a puzzle asking for a *move, count, coordinate,* or other board-game tactic that needs an exact piece layout (e.g. "What is Black's winning move?").
+  • **audio**: the task references an attached audio file and asks for its transcript or facts in it.
+  • **general**: anything else (fallback).
+
+  ~~~
+  User question:
+  {question}
+  ~~~
+
+  IMPORTANT: Respond with **one label exactly**, no punctuation, no explanation.
+
+final_llm_system: |
+  You are a precise research assistant.
+  Return ONLY the literal answer - no preamble.
+  If the question asks for a *first name*, output the first given name only.
+  If the answer is numeric, output digits only (no commas, units, or words).
+
+final_llm_user: |
+  Question: {question}
+
+  Context: {context}
+
+  Answer:
+
+vision_system: |
+  You are a terse assistant. Respond with ONLY the answer to the user's question—no explanations, no punctuation except what the answer itself requires.
+  If the answer is a chess move, output it in algebraic notation.
+  IMPORTANT: Only respond with the final answer with no extra text.
+
+excel_system: |
   You are a **pandas one-liner generator**.
 
   Context
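
Note: these keys are consumed through helpers.get_prompt, which app.py and tools.py now import but which is not part of this commit. A minimal sketch of what such a helper could look like, assuming prompts.yaml sits next to helpers.py and that templates use str.format-style placeholders such as {question} and {labels} (the file location, caching, and names below are illustrative assumptions, not the repository's actual code):

# Hypothetical helpers.get_prompt - the real implementation is not shown in this diff.
from functools import lru_cache
from pathlib import Path

import yaml

_PROMPTS_PATH = Path(__file__).with_name("prompts.yaml")  # assumed location


@lru_cache(maxsize=1)
def _load_prompts() -> dict:
    # Parse prompts.yaml once and reuse the dict on later calls.
    return yaml.safe_load(_PROMPTS_PATH.read_text(encoding="utf-8"))


def get_prompt(prompt_key: str, **kwargs) -> str:
    # Look up a template by key and fill its {placeholders}, if any were passed.
    template = _load_prompts()[prompt_key]
    return template.format(**kwargs) if kwargs else template

With a helper shaped like this, get_prompt(prompt_key="router", question=q, labels=parsed_labels) returns the filled router template, while get_prompt("final_llm_system") returns its template verbatim, which matches both call styles seen in the diff.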
tools.py CHANGED
@@ -165,11 +165,6 @@ def vision_task(img_bytes: bytes, question: str) -> str:
     Pass the user's question AND the referenced image to a multimodal LLM and
     return its first line of text as the answer. No domain assumptions made.
     """
-    sys_prompt = (
-        "You are a terse assistant. Respond with ONLY the answer to the user's "
-        "question—no explanations, no punctuation except what the answer itself "
-        "requires. If the answer is a chess move, output it in algebraic notation."
-    )
     vision_llm = ChatOpenAI(
         model="gpt-4o-mini",  # set OPENAI_API_KEY in env
         temperature=0,
@@ -178,7 +173,7 @@ def vision_task(img_bytes: bytes, question: str) -> str:
     try:
         b64 = b64encode(img_bytes).decode()
         messages = [
-            SystemMessage(content=sys_prompt),
+            SystemMessage(content=get_prompt(prompt_key="vision_system")),
             HumanMessage(
                 content=[
                     {"type": "text", "text": question.strip()},
@@ -215,10 +210,10 @@ def run_py(code: str) -> str:
 
 
 @tool
-def transcribe_via_whisper(mp3_bytes: bytes) -> str:
-    """Transcribe MP3 bytes with Whisper (CPU)."""
+def transcribe_via_whisper(audio_bytes: bytes) -> str:
+    """Transcribe audio with Whisper (CPU)."""
     with NamedTemporaryFile(suffix=".mp3", delete=False) as f:
-        f.write(mp3_bytes)
+        f.write(audio_bytes)
         path = f.name
     try:
         import whisper  # openai-whisper
@@ -236,7 +231,6 @@ def analyze_excel_file(xls_bytes: bytes, question: str) -> str:
     "Analyze Excel or CSV file by passing the data preview to LLM and getting the Python Pandas operation to run"
     llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=64)
 
-    # 1. full dataframe
     try:
         df = pd.read_excel(BytesIO(xls_bytes))
     except Exception:
@@ -245,18 +239,19 @@ def analyze_excel_file(xls_bytes: bytes, question: str) -> str:
     for col in df.select_dtypes(include="number").columns:
         df[col] = df[col].astype(float)
 
-    # 2. ask the LLM for a single expression
+    # Ask the LLM for a single expression
     prompt = get_prompt(
-        prompt_key="excel_analysis_one_liner", preview=df.head(5).to_dict(orient="list")
+        prompt_key="excel_system",
+        question=question,
+        preview=df.head(5).to_dict(orient="list"),
     )
     expr = llm.invoke(prompt).content.strip()
 
-    # 3. run it on the FULL df
+    # Run the generated pandas one-line expression
     try:
         result = eval(expr, {"df": df, "pd": pd, "__builtins__": {}})
-        # ── normalize scalars to string -------------------------------------------
+        # Normalize scalars to string
         if isinstance(result, np.generic):
-            # keep existing LLM formatting (e.g. {:.2f}) if it's already a str
             result = float(result)  # → plain Python float
         return f"{result:.2f}"  # or str(result) if no decimals needed
 
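
Note: analyze_excel_file executes the model's reply with eval, exposing only df, pd, and an emptied __builtins__. A toy, self-contained illustration of that evaluation and normalization step (the sample frame and expression are invented for the example):

# Toy illustration of the restricted-eval step used by analyze_excel_file.
import numpy as np
import pandas as pd

df = pd.DataFrame({"category": ["food", "drink", "food"], "sales": [10.0, 4.5, 2.5]})

# Imagine the LLM returned this one-liner for "What are total food sales?":
expr = 'df.loc[df["category"] == "food", "sales"].sum()'

result = eval(expr, {"df": df, "pd": pd, "__builtins__": {}})
if isinstance(result, np.generic):
    result = float(result)  # numpy scalar -> plain Python float
print(f"{result:.2f}")  # -> 12.50

Emptying __builtins__ blocks casual misuse but is not a real sandbox, since attribute access on df or pd can still reach arbitrary objects; constraining the model to a single short expression (max_tokens=64) is what keeps this approach workable in practice.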