GAIA-Agent

Sleeping

Essi committed on Jun 1

Commit

edd4fd8

1 Parent(s): 27d27df

feat: add `sniff_excel_type` function to determine file type from raw bytes

Files changed (1) hide show

helpers.py CHANGED Viewed

@@ -1,3 +1,7 @@
 import requests
@@ -16,3 +20,35 @@ def fetch_task_file(api_url: str, task_id: str) -> tuple[bytes, str]:
         print(f"[DEBUG] GET {url} → {r.status_code}")
         return b"", ""
     return r.content, r.headers.get("content-type", "").lower()

+import csv
+import io
+import zipfile
 import requests
         print(f"[DEBUG] GET {url} → {r.status_code}")
         return b"", ""
     return r.content, r.headers.get("content-type", "").lower()
def sniff_excel_type(blob: bytes) -> str:
    """Guess the spreadsheet format of *blob* from its raw bytes.

    The checks run in order: a ZIP container holding an OOXML workbook,
    the OLE compound-file signature, then delimiter-separated text.

    Args:
        blob: Raw file content. May be empty.

    Returns:
        One of 'xlsx', 'xls', 'csv', or '' when the format is not recognised.
    """
    # 1. XLSX / XLSM (OOXML workbook inside a ZIP container)
    if blob[:4] == b"PK\x03\x04":
        try:
            with zipfile.ZipFile(io.BytesIO(blob)) as zf:
                names = zf.namelist()
        except zipfile.BadZipFile:
            pass  # not a readable archive: fall through to the other checks
        else:
            # "[Content_Types].xml" is present in every OOXML package
            # (docx, pptx, ...), so only the workbook part identifies a
            # spreadsheet. A readable ZIP archive is never XLS or CSV, so
            # answer here instead of running the text heuristic on it.
            return "xlsx" if "xl/workbook.xml" in names else ""

    # 2. Legacy XLS (OLE Compound File signature)
    if blob[:8] == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1":
        return "xls"

    # 3. Text-like -> CSV/TSV
    # errors="ignore" drops undecodable bytes, so decoding cannot raise.
    sample = blob[:1024].decode("utf-8", "ignore")
    lines = sample.splitlines()
    if not lines:
        # Empty input (or nothing decodable): there is no first line to test.
        return ""
    if not any(sep in lines[0] for sep in (",", ";", "\t")):
        return ""
    try:
        # Confirm via csv.Sniffer so random text with a stray comma is
        # not reported as CSV.
        csv.Sniffer().sniff(sample)
    except csv.Error:
        return ""
    return "csv"