GAIA-Agent

Sleeping

App Files Files Community

Essi committed on Jun 1

Commit

e879014

1 Parent(s): 5b89395

feat: enhance file handling capabilities with support for code execution, Excel analysis, and audio transcription

Browse files

Files changed (3) hide show

app.py +38 -8
requirements.txt +1 -0
tools.py +64 -15

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 import re
-from typing import Literal, TypedDict
 import gradio as gr
 import pandas as pd
@@ -10,8 +10,11 @@ from langchain_openai import ChatOpenAI
 from langgraph.graph import END, StateGraph
 from tools import (
     calculator,
     image_describe,
     web_multi_search,
     wiki_search,
     youtube_transcript,
@@ -33,7 +36,7 @@ If the answer is numeric, output digits only (no commas, units, or words).
 #                           QUESTION  CLASSIFIER                               #
 # --------------------------------------------------------------------------- #
-_LABELS = {"math", "youtube", "image", "general"}
 _CLASSIFY_PROMPT = """You are a router that labels the user question with exactly one of the following categories:
 {labels}.
@@ -50,7 +53,7 @@ Label:
 # --------------------------------------------------------------------------- #
 class AgentState(TypedDict):
     question: str
-    label: Literal["math", "youtube", "image", "general"]
     context: str
     answer: str
     confidence: float
@@ -68,9 +71,12 @@ _llm_answer = ChatOpenAI(model=MODEL_NAME)
 def classify(state: AgentState) -> AgentState:  # noqa: D401
     """Label the task so we know which toolchain to invoke."""
     question = state["question"]
     resp = (
         _llm_router.invoke(
-            _CLASSIFY_PROMPT.format(question=question, labels=", ".join(_LABELS))
         )
         .content.strip()
         .lower()
@@ -80,11 +86,36 @@ def classify(state: AgentState) -> AgentState:  # noqa: D401
 def gather_context(state: AgentState) -> AgentState:
-    question, label = state["question"], state["label"]
     matched_pattern = r"https?://\S+"
     matched_obj = re.search(matched_pattern, question)
     if label == "math":
         print("[TOOL] calculator")
         expr = re.sub(r"\s+", "", question)
@@ -109,9 +140,8 @@ def gather_context(state: AgentState) -> AgentState:
 def generate_answer(state: AgentState) -> AgentState:
-    # Deterministic calculator path
-    if state["label"] == "math":
-        state["answer"] = state["context"].strip()
         state["confidence"] = 0.9
         return state

 import os
 import re
+from typing import Literal, TypedDict, get_args
 import gradio as gr
 import pandas as pd
 from langgraph.graph import END, StateGraph
 from tools import (
+    analyze_excel_file,
     calculator,
     image_describe,
+    run_py,
+    transcribe_via_whisper,
     web_multi_search,
     wiki_search,
     youtube_transcript,
 #                           QUESTION  CLASSIFIER                               #
 # --------------------------------------------------------------------------- #
+_LABELS = Literal["math", "youtube", "image", "code", "excel", "audio", "general"]
 _CLASSIFY_PROMPT = """You are a router that labels the user question with exactly one of the following categories:
 {labels}.
 # --------------------------------------------------------------------------- #
 class AgentState(TypedDict):
     question: str
+    label: _LABELS
     context: str
     answer: str
     confidence: float
 def classify(state: AgentState) -> AgentState:  # noqa: D401
     """Label the task so we know which toolchain to invoke."""
     question = state["question"]
+    values = get_args(_LABELS)  # -> ("math", "youtube", ...)
+    parsed_labels = ", ".join(repr(v) for v in values)
     resp = (
         _llm_router.invoke(
+            _CLASSIFY_PROMPT.format(question=question, labels=parsed_labels)
         )
         .content.strip()
         .lower()
 def gather_context(state: AgentState) -> AgentState:
+    question, label, task_id = state["question"], state["label"], state["task_id"]
     matched_pattern = r"https?://\S+"
     matched_obj = re.search(matched_pattern, question)
+    # ---- attachment detection ------------------------------------------------
+    if task_id:
+        file_url = f"{DEFAULT_API_URL}/files/{task_id}"
+        head = requests.head(file_url, timeout=10)
+        ctype = head.headers.get("content-type", "")
+        print(f"[DEBUG] attachment type={ctype} | url={file_url}")
+        if "python" in ctype or file_url.endswith(".py"):
+            code = requests.get(file_url, timeout=10).text
+            state["answer"] = run_py.invoke({"code": code})
+            state["label"] = "code"
+            return state
+        if "excel" in ctype or file_url.endswith((".xlsx", ".csv")):
+            blob = requests.get(file_url, timeout=10).content
+            state["context"] = analyze_excel_file.invoke(
+                {"xls_bytes": blob, "question": question}
+            )
+            state["label"] = "excel"
+            return state
+        if "audio" in ctype or file_url.endswith(".mp3"):
+            blob = requests.get(file_url, timeout=10).content
+            state["context"] = transcribe_via_whisper.invoke({"mp3_bytes": blob})
+            state["label"] = "audio"
+            return state
     if label == "math":
         print("[TOOL] calculator")
         expr = re.sub(r"\s+", "", question)
 def generate_answer(state: AgentState) -> AgentState:
+    # Skip LLM for deterministic labels
+    if state["label"] in {"math", "code", "excel"}:
         state["confidence"] = 0.9
         return state

requirements.txt CHANGED Viewed

@@ -20,6 +20,7 @@ wikipedia==1.4.0                  # WikipediaLoader
 youtube-transcript-api==1.0.3  # YouTube transcripts
 openpyxl==3.1.5                # Excel parsing when GAIA attaches .xlsx
 Pillow>=10.2.0                 # image handling for transformers
 # ── Lightweight vision model
 transformers>=4.41.2

 youtube-transcript-api==1.0.3  # YouTube transcripts
 openpyxl==3.1.5                # Excel parsing when GAIA attaches .xlsx
 Pillow>=10.2.0                 # image handling for transformers
+openai-whisper==20240930
 # ── Lightweight vision model
 transformers>=4.41.2

tools.py CHANGED Viewed

@@ -2,8 +2,10 @@ import ast
 import json
 import operator
 import re
 from functools import lru_cache
 from io import BytesIO
 import requests
 from langchain_community.document_loaders import WikipediaLoader
@@ -46,7 +48,7 @@ def calculator(expression: str) -> str:
         tree = ast.parse(expression, mode="eval")
         value = _safe_eval(tree.body)
         return str(value)
-    except Exception as exc:  # pragma: no cover – we surface errors to the agent
         return f"calc_error:{exc}"
@@ -62,9 +64,9 @@ def _ddg_search(query: str, k: int = 6) -> list[dict[str, str]]:
     hits = wrapper.results(query)
     return [
         {
-            "title": hit.get("title", "")[:120],
-            "snippet": hit.get("snippet", "")[:300],
-            "link": hit.get("link", "")[:200],
         }
         for hit in hits[:k]
     ]
@@ -87,9 +89,9 @@ def web_multi_search(query: str, k: int = 6) -> str:
         )
         formatted = [
             {
-                "title": d.metadata.get("title", "")[:120],
-                "snippet": d.page_content[:300],
-                "link": d.metadata.get("source", "")[:200],
             }
             for d in tavily_hits
         ]
@@ -156,16 +158,61 @@ def image_describe(image_url: str, top_k: int = 3) -> str:
 @tool
-def csv_sum(url: str, column: str) -> str:
-    """Download a CSV and return the sum of the specified numeric column."""
     try:
-        import pandas as pd  # local import to avoid mandatory pandas if unused
-        df = pd.read_csv(url)
-        total = df[column].sum()
-        return str(total)
     except Exception as exc:
-        return f"csv_error:{exc}"
 __all__ = [
@@ -174,5 +221,7 @@ __all__ = [
     "wiki_search",
     "youtube_transcript",
     "image_describe",
-    "csv_sum",
 ]

 import json
 import operator
 import re
+import subprocess
 from functools import lru_cache
 from io import BytesIO
+from tempfile import NamedTemporaryFile
 import requests
 from langchain_community.document_loaders import WikipediaLoader
         tree = ast.parse(expression, mode="eval")
         value = _safe_eval(tree.body)
         return str(value)
+    except Exception as exc:
         return f"calc_error:{exc}"
     hits = wrapper.results(query)
     return [
         {
+            "title": hit.get("title", "")[:500],
+            "snippet": hit.get("snippet", "")[:750],
+            "link": hit.get("link", "")[:300],
         }
         for hit in hits[:k]
     ]
         )
         formatted = [
             {
+                "title": d.metadata.get("title", "")[:500],
+                "snippet": d.page_content[:750],
+                "link": d.metadata.get("source", "")[:300],
             }
             for d in tavily_hits
         ]
 @tool
def run_py(code: str) -> str:
    """Execute Python code in a subprocess and return the last stdout line.

    NOTE: the child process is NOT sandboxed -- it runs with the same
    privileges as the caller. Only the 4-second timeout bounds it, so do not
    feed it code from an untrusted source without external isolation.

    Args:
        code: Python source text to run as a standalone script.

    Returns:
        The last non-empty line the script printed to stdout; ``""`` if the
        script succeeded without printing anything; or a ``"py_error:..."``
        string if the script failed without output, timed out, or could not
        be launched.
    """
    import os
    import sys

    path = None
    try:
        # Python reads source files as UTF-8 by default, so write it that way
        # regardless of the platform's locale encoding.
        with NamedTemporaryFile(
            delete=False, suffix=".py", mode="w", encoding="utf-8"
        ) as f:
            f.write(code)
            path = f.name
        # Use the running interpreter so the script sees the same environment;
        # a bare "python" may be missing from PATH or be a different version.
        proc = subprocess.run(
            [sys.executable or "python", path],
            capture_output=True,
            text=True,
            timeout=4,
        )
        out = proc.stdout.strip().splitlines()
        if out:
            return out[-1]
        if proc.returncode != 0:
            # Surface the failure instead of returning an empty "answer".
            err = proc.stderr.strip().splitlines()
            detail = err[-1] if err else f"exit code {proc.returncode}"
            return f"py_error:{detail}"
        return ""
    except Exception as exc:
        return f"py_error:{exc}"
    finally:
        # delete=False means nobody else cleans up the script file.
        if path is not None:
            try:
                os.unlink(path)
            except OSError:
                pass
+@tool
def transcribe_via_whisper(mp3_bytes: bytes) -> str:
    """Transcribe MP3 bytes with the Whisper "base" model.

    The bytes are spooled to a temporary ``.mp3`` file because Whisper's
    ``transcribe`` is given a file path here; the file is removed again
    before returning.

    Args:
        mp3_bytes: Raw contents of the audio file.

    Returns:
        The stripped transcript text, or an ``"asr_error:..."`` string if
        writing the file, importing/loading Whisper, or transcription fails.
    """
    import os

    path = None
    try:
        with NamedTemporaryFile(suffix=".mp3", delete=False) as f:
            f.write(mp3_bytes)
            path = f.name
        import whisper  # openai-whisper

        model = whisper.load_model("base")
        output = model.transcribe(path)["text"].strip()
        print(f"[DEBUG] Whisper transcript (first 200 chars): {output[:200]}")
        return output
    except Exception as exc:
        return f"asr_error:{exc}"
    finally:
        # delete=False means the temp file would otherwise leak on every call.
        if path is not None:
            try:
                os.unlink(path)
            except OSError:
                pass
+@tool
def analyze_excel_file(xls_bytes: bytes, question: str) -> str:
    """Generic Excel/CSV aggregation handler.

    Loads the payload as an Excel workbook, falling back to CSV, then picks
    an aggregation based on keywords found in the question.

    Args:
        xls_bytes: Raw contents of the spreadsheet (Excel or CSV).
        question: The user question; scanned case-insensitively for
            aggregation keywords.

    Returns:
        - ``"excel_error:..."`` if the payload parses as neither Excel nor CSV;
        - ``"No numeric data"`` if the table has no numeric columns;
        - the grand total of all numeric cells, 2 decimals, for
          total/sum/aggregate questions;
        - the mean of the per-column means, 2 decimals, for average/mean
          questions;
        - otherwise the first 10 rows as CSV text for an LLM to reason on.
    """
    import re

    import pandas as pd

    # Try both Excel and CSV loaders
    try:
        df = pd.read_excel(BytesIO(xls_bytes))
    except Exception:
        try:
            df = pd.read_csv(BytesIO(xls_bytes))
        except Exception as exc:
            # Report failure as a string rather than raising.
            return f"excel_error:{exc}"

    numeric = df.select_dtypes("number")
    if numeric.empty:
        return "No numeric data"

    q = question.lower()
    # Match whole words only: plain substring tests misfire on words such as
    # "consumer"/"summary" (contain "sum") or "meaning" (contains "mean").
    if re.search(r"\b(?:total(?:s|ed)?|sum(?:s|med)?|aggregate[sd]?)\b", q):
        return f"{numeric.sum().sum():.2f}"
    if re.search(r"\b(?:average[sd]?|mean)\b", q):
        # NOTE(review): this is the mean of column means, not the mean over
        # all cells -- confirm which one callers expect.
        return f"{numeric.mean().mean():.2f}"

    # Fallback: return first 10 rows as csv for LLM to reason on
    return df.head(10).to_csv(index=False)
 __all__ = [
     "wiki_search",
     "youtube_transcript",
     "image_describe",
+    "run_py",
+    "transcribe_via_whisper",
+    "analyze_excel_file",
 ]