Spaces:
Sleeping
Sleeping
Essi
committed on
Commit
·
edd4fd8
1
Parent(s):
27d27df
feat: add `sniff_excel_type` function to determine file type from raw bytes
Browse files- helpers.py +36 -0
helpers.py
CHANGED
@@ -1,3 +1,7 @@
|
|
|
|
|
|
|
|
|
|
1 |
import requests
|
2 |
|
3 |
|
@@ -16,3 +20,35 @@ def fetch_task_file(api_url: str, task_id: str) -> tuple[bytes, str]:
|
|
16 |
print(f"[DEBUG] GET {url} → {r.status_code}")
|
17 |
return b"", ""
|
18 |
return r.content, r.headers.get("content-type", "").lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
import io
|
3 |
+
import zipfile
|
4 |
+
|
5 |
import requests
|
6 |
|
7 |
|
|
|
20 |
print(f"[DEBUG] GET {url} → {r.status_code}")
|
21 |
return b"", ""
|
22 |
return r.content, r.headers.get("content-type", "").lower()
|
23 |
+
|
24 |
+
|
25 |
+
def sniff_excel_type(blob: bytes) -> str:
    """Guess the spreadsheet format of *blob* from its raw bytes.

    Args:
        blob: Raw file contents. May be empty.

    Returns:
        ``'xlsx'`` for a ZIP container holding ``xl/workbook.xml``,
        ``'xls'`` for an OLE compound file, ``'csv'`` for delimited text
        accepted by ``csv.Sniffer``, or ``''`` when the type is unknown.
    """
    # 1) XLSX / XLSM (ZIP container with a SpreadsheetML workbook part)
    if blob[:4] == b"PK\x03\x04":
        try:
            with zipfile.ZipFile(io.BytesIO(blob)) as zf:
                names = set(zf.namelist())
        except zipfile.BadZipFile:
            names = set()  # not a readable archive: fall through
        # "[Content_Types].xml" exists in every OOXML package (.docx,
        # .pptx, ...), so only the workbook part identifies a spreadsheet.
        if "xl/workbook.xml" in names:
            return "xlsx"

    # 2) Legacy XLS (OLE Compound File signature)
    if blob[:8] == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1":
        return "xls"

    # 3) Text-like -> CSV/TSV
    # errors="ignore" means decoding never raises, even when the 1024-byte
    # cut lands in the middle of a multi-byte character.
    sample = blob[:1024].decode("utf-8", "ignore")
    lines = sample.splitlines()
    if not lines:
        # Empty input has no first line to inspect.
        return ""

    first_line = lines[0]
    if not any(sep in first_line for sep in (",", ";", "\t")):
        return ""

    # Confirm via csv.Sniffer so arbitrary text containing a comma is not
    # reported as CSV.
    try:
        csv.Sniffer().sniff(sample)
    except csv.Error:
        return ""
    return "csv"
|