Essi committed on
Commit
edd4fd8
·
1 Parent(s): 27d27df

feat: add `sniff_excel_type` function to determine file type from raw bytes

Browse files
Files changed (1) hide show
  1. helpers.py +36 -0
helpers.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import requests
2
 
3
 
@@ -16,3 +20,35 @@ def fetch_task_file(api_url: str, task_id: str) -> tuple[bytes, str]:
16
  print(f"[DEBUG] GET {url} → {r.status_code}")
17
  return b"", ""
18
  return r.content, r.headers.get("content-type", "").lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import io
3
+ import zipfile
4
+
5
  import requests
6
 
7
 
 
20
  print(f"[DEBUG] GET {url} → {r.status_code}")
21
  return b"", ""
22
  return r.content, r.headers.get("content-type", "").lower()
23
+
24
+
25
def sniff_excel_type(blob: bytes) -> str:
    """Guess the spreadsheet format of *blob* from its raw bytes.

    Args:
        blob: Raw file content. May be empty.

    Returns:
        'xlsx' for an OOXML workbook (a ZIP archive containing
        ``xl/workbook.xml``), 'xls' for a legacy OLE compound file,
        'csv' for delimited text, or '' when the type is unknown.
    """
    # 1. XLSX / XLSM: ZIP container that holds the workbook part.
    if blob[:4] == b"PK\x03\x04":
        try:
            with zipfile.ZipFile(io.BytesIO(blob)) as zf:
                # Require the workbook part itself; "[Content_Types].xml"
                # is present in every OOXML package (.docx, .pptx, ...)
                # and therefore does not identify a spreadsheet.
                if "xl/workbook.xml" in zf.namelist():
                    return "xlsx"
            # A readable ZIP archive that is not a workbook is binary
            # data; do not let it be mis-sniffed as delimited text.
            return ""
        except zipfile.BadZipFile:
            pass  # only looked like a ZIP; fall through to other checks

    # 2. Legacy XLS (OLE Compound File signature).
    if blob[:8] == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1":
        return "xls"

    # 3. Text-like -> CSV/TSV. errors="ignore" never raises, so a
    # multi-byte character cut at the 1024-byte boundary is harmless.
    sample = blob[:1024].decode("utf-8", "ignore")
    lines = sample.splitlines()
    # An empty sample has no first line to inspect.
    if lines and any(sep in lines[0] for sep in (",", ";", "\t")):
        try:
            # Confirm via csv.Sniffer to avoid matching arbitrary text.
            csv.Sniffer().sniff(sample)
        except csv.Error:
            return ""
        return "csv"

    return ""