Spaces:
Sleeping
Sleeping
Essi
committed on
Commit
·
ce76bed
1
Parent(s):
21394c0
feat: add prompt retrieval functionality and new Excel analysis prompt template
Browse files- .pre-commit-config.yaml +3 -2
- helpers.py +15 -4
- prompts.yaml +22 -0
- tools.py +34 -24
.pre-commit-config.yaml
CHANGED
@@ -1,19 +1,20 @@
|
|
1 |
repos:
|
2 |
# ── FORMATTERS ─────────────────────────────────────────────────────────────
|
3 |
- repo: https://github.com/astral-sh/ruff-pre-commit
|
4 |
-
rev: v0.11.
|
5 |
hooks:
|
6 |
- id: ruff-check
|
7 |
- id: ruff-format
|
8 |
# ── STATIC ANALYSIS ────────────────────────────────────────────────────────
|
9 |
- repo: https://github.com/pre-commit/mirrors-mypy
|
10 |
-
rev: v1.
|
11 |
hooks:
|
12 |
- id: mypy
|
13 |
additional_dependencies:
|
14 |
- "pydantic==1.10.*"
|
15 |
- "types-requests"
|
16 |
- "mypy-extensions"
|
|
|
17 |
# ── SECRET / KEY DETECTORS ─────────────────────────────────────────────────
|
18 |
- repo: https://github.com/pre-commit/pre-commit-hooks
|
19 |
rev: v5.0.0
|
|
|
1 |
repos:
|
2 |
# ── FORMATTERS ─────────────────────────────────────────────────────────────
|
3 |
- repo: https://github.com/astral-sh/ruff-pre-commit
|
4 |
+
rev: v0.11.12
|
5 |
hooks:
|
6 |
- id: ruff-check
|
7 |
- id: ruff-format
|
8 |
# ── STATIC ANALYSIS ────────────────────────────────────────────────────────
|
9 |
- repo: https://github.com/pre-commit/mirrors-mypy
|
10 |
+
rev: v1.16.0
|
11 |
hooks:
|
12 |
- id: mypy
|
13 |
additional_dependencies:
|
14 |
- "pydantic==1.10.*"
|
15 |
- "types-requests"
|
16 |
- "mypy-extensions"
|
17 |
+
- "types-PyYAML"
|
18 |
# ── SECRET / KEY DETECTORS ─────────────────────────────────────────────────
|
19 |
- repo: https://github.com/pre-commit/pre-commit-hooks
|
20 |
rev: v5.0.0
|
helpers.py
CHANGED
@@ -1,8 +1,14 @@
|
|
1 |
import csv
|
2 |
-
import
|
3 |
-
import
|
|
|
4 |
|
5 |
import requests
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
|
8 |
def fetch_task_file(api_url: str, task_id: str) -> tuple[bytes, str]:
|
@@ -29,11 +35,11 @@ def sniff_excel_type(blob: bytes) -> str:
|
|
29 |
# 1️⃣ XLSX / XLSM / ODS (ZIP container)
|
30 |
if blob[:4] == b"PK\x03\x04":
|
31 |
try:
|
32 |
-
with
|
33 |
names = set(zf.namelist())
|
34 |
if {"xl/workbook.xml", "[Content_Types].xml"} & names:
|
35 |
return "xlsx"
|
36 |
-
except
|
37 |
pass # fall through
|
38 |
|
39 |
# 2️⃣ Legacy XLS (OLE Compound File)
|
@@ -52,3 +58,8 @@ def sniff_excel_type(blob: bytes) -> str:
|
|
52 |
pass
|
53 |
|
54 |
return ""
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import csv
|
2 |
+
from io import BytesIO
|
3 |
+
from pathlib import Path
|
4 |
+
from zipfile import BadZipFile, ZipFile
|
5 |
|
6 |
import requests
|
7 |
+
from yaml import safe_load
|
8 |
+
|
9 |
+
CURRENT_DIR = Path(__file__).parent
|
10 |
+
|
11 |
+
_PROMPTS = safe_load(CURRENT_DIR.joinpath("prompts.yaml").read_text())
|
12 |
|
13 |
|
14 |
def fetch_task_file(api_url: str, task_id: str) -> tuple[bytes, str]:
|
|
|
35 |
# 1️⃣ XLSX / XLSM / ODS (ZIP container)
|
36 |
if blob[:4] == b"PK\x03\x04":
|
37 |
try:
|
38 |
+
with ZipFile(BytesIO(blob)) as zf:
|
39 |
names = set(zf.namelist())
|
40 |
if {"xl/workbook.xml", "[Content_Types].xml"} & names:
|
41 |
return "xlsx"
|
42 |
+
except BadZipFile:
|
43 |
pass # fall through
|
44 |
|
45 |
# 2️⃣ Legacy XLS (OLE Compound File)
|
|
|
58 |
pass
|
59 |
|
60 |
return ""
|
61 |
+
|
62 |
+
|
63 |
+
def get_prompt(prompt_key: str, **kwargs: str) -> str:
|
64 |
+
"""Get a prompt by key and fill in placeholders via `.format(**kwargs)`"""
|
65 |
+
return _PROMPTS[prompt_key].format(**kwargs)
|
prompts.yaml
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
excel_analysis_one_liner: |
|
2 |
+
You are a **pandas one-liner generator**.
|
3 |
+
|
4 |
+
Context
|
5 |
+
-------
|
6 |
+
• A full DataFrame named `df` is already loaded.
|
7 |
+
• Only the preview below is shown for reference; use column names from it.
|
8 |
+
|
9 |
+
Preview
|
10 |
+
-------
|
11 |
+
{preview}
|
12 |
+
|
13 |
+
Formatting rules
|
14 |
+
----------------
|
15 |
+
1. Result must be a plain Python scalar (use .item(), float(), int() …).
|
16 |
+
2. If the question asks for currency / 2 decimals → wrap in an f-string.
|
17 |
+
3. If the question asks for a count → wrap in int().
|
18 |
+
4. Return **one** expression, nothing else.
|
19 |
+
|
20 |
+
Question
|
21 |
+
--------
|
22 |
+
{question}
|
tools.py
CHANGED
@@ -8,15 +8,18 @@ from functools import lru_cache
|
|
8 |
from io import BytesIO
|
9 |
from tempfile import NamedTemporaryFile
|
10 |
|
|
|
|
|
11 |
from langchain_community.document_loaders import WikipediaLoader
|
12 |
from langchain_community.tools.tavily_search import TavilySearchResults
|
13 |
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
|
14 |
from langchain_core.messages import HumanMessage, SystemMessage
|
15 |
from langchain_core.tools import tool
|
16 |
from langchain_openai import ChatOpenAI
|
17 |
-
from transformers import pipeline
|
18 |
from youtube_transcript_api import YouTubeTranscriptApi
|
19 |
|
|
|
|
|
20 |
# --------------------------------------------------------------------------- #
|
21 |
# ARITHMETIC (SAFE CALCULATOR) #
|
22 |
# --------------------------------------------------------------------------- #
|
@@ -57,8 +60,6 @@ def calculator(expression: str) -> str:
|
|
57 |
# --------------------------------------------------------------------------- #
|
58 |
# WEB & WIKI SEARCH #
|
59 |
# --------------------------------------------------------------------------- #
|
60 |
-
|
61 |
-
|
62 |
@lru_cache(maxsize=256)
|
63 |
def _ddg_search(query: str, k: int = 6) -> list[dict[str, str]]:
|
64 |
"""Cached DuckDuckGo JSON search."""
|
@@ -114,8 +115,6 @@ def wiki_search(query: str, max_pages: int = 2) -> str:
|
|
114 |
# --------------------------------------------------------------------------- #
|
115 |
# YOUTUBE TRANSCRIPT #
|
116 |
# --------------------------------------------------------------------------- #
|
117 |
-
|
118 |
-
|
119 |
@tool
|
120 |
def youtube_transcript(url: str, chars: int = 10_000) -> str:
|
121 |
"""Fetch full YouTube transcript (first *chars* characters)."""
|
@@ -137,10 +136,9 @@ def youtube_transcript(url: str, chars: int = 10_000) -> str:
|
|
137 |
# Instantiate a lightweight CLIP-based zero-shot image classifier (runs on CPU)
|
138 |
### The model 'openai/clip-vit-base-patch32' is a vision transformer (ViT) model trained as part of OpenAI’s CLIP project.
|
139 |
### It performs zero-shot image classification by mapping images and labels into the same embedding space.
|
140 |
-
_image_pipe = pipeline(
|
141 |
-
|
142 |
-
)
|
143 |
-
|
144 |
|
145 |
# @tool
|
146 |
# def image_describe(img_bytes: bytes, top_k: int = 3) -> str:
|
@@ -200,8 +198,6 @@ def vision_task(img_bytes: bytes, question: str) -> str:
|
|
200 |
# --------------------------------------------------------------------------- #
|
201 |
# FILE UTILS #
|
202 |
# --------------------------------------------------------------------------- #
|
203 |
-
|
204 |
-
|
205 |
@tool
|
206 |
def run_py(code: str) -> str:
|
207 |
"""Execute Python code in a sandboxed subprocess and return last stdout line."""
|
@@ -237,27 +233,41 @@ def transcribe_via_whisper(mp3_bytes: bytes) -> str:
|
|
237 |
|
238 |
@tool
|
239 |
def analyze_excel_file(xls_bytes: bytes, question: str) -> str:
|
240 |
-
"
|
241 |
-
|
242 |
|
243 |
-
#
|
244 |
try:
|
245 |
df = pd.read_excel(BytesIO(xls_bytes))
|
246 |
except Exception:
|
247 |
df = pd.read_csv(BytesIO(xls_bytes))
|
248 |
|
249 |
-
|
250 |
-
|
251 |
-
return "No numeric data"
|
252 |
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
|
259 |
-
#
|
260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
|
262 |
|
263 |
__all__ = [
|
|
|
8 |
from io import BytesIO
|
9 |
from tempfile import NamedTemporaryFile
|
10 |
|
11 |
+
import numpy as np
|
12 |
+
import pandas as pd
|
13 |
from langchain_community.document_loaders import WikipediaLoader
|
14 |
from langchain_community.tools.tavily_search import TavilySearchResults
|
15 |
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
|
16 |
from langchain_core.messages import HumanMessage, SystemMessage
|
17 |
from langchain_core.tools import tool
|
18 |
from langchain_openai import ChatOpenAI
|
|
|
19 |
from youtube_transcript_api import YouTubeTranscriptApi
|
20 |
|
21 |
+
from helpers import get_prompt
|
22 |
+
|
23 |
# --------------------------------------------------------------------------- #
|
24 |
# ARITHMETIC (SAFE CALCULATOR) #
|
25 |
# --------------------------------------------------------------------------- #
|
|
|
60 |
# --------------------------------------------------------------------------- #
|
61 |
# WEB & WIKI SEARCH #
|
62 |
# --------------------------------------------------------------------------- #
|
|
|
|
|
63 |
@lru_cache(maxsize=256)
|
64 |
def _ddg_search(query: str, k: int = 6) -> list[dict[str, str]]:
|
65 |
"""Cached DuckDuckGo JSON search."""
|
|
|
115 |
# --------------------------------------------------------------------------- #
|
116 |
# YOUTUBE TRANSCRIPT #
|
117 |
# --------------------------------------------------------------------------- #
|
|
|
|
|
118 |
@tool
|
119 |
def youtube_transcript(url: str, chars: int = 10_000) -> str:
|
120 |
"""Fetch full YouTube transcript (first *chars* characters)."""
|
|
|
136 |
# Instantiate a lightweight CLIP-based zero-shot image classifier (runs on CPU)
|
137 |
### The model 'openai/clip-vit-base-patch32' is a vision transformer (ViT) model trained as part of OpenAI’s CLIP project.
|
138 |
### It performs zero-shot image classification by mapping images and labels into the same embedding space.
|
139 |
+
# _image_pipe = pipeline(
|
140 |
+
# "image-classification", model="openai/clip-vit-base-patch32", device="cpu"
|
141 |
+
# )
|
|
|
142 |
|
143 |
# @tool
|
144 |
# def image_describe(img_bytes: bytes, top_k: int = 3) -> str:
|
|
|
198 |
# --------------------------------------------------------------------------- #
|
199 |
# FILE UTILS #
|
200 |
# --------------------------------------------------------------------------- #
|
|
|
|
|
201 |
@tool
|
202 |
def run_py(code: str) -> str:
|
203 |
"""Execute Python code in a sandboxed subprocess and return last stdout line."""
|
|
|
233 |
|
234 |
@tool
|
235 |
def analyze_excel_file(xls_bytes: bytes, question: str) -> str:
|
236 |
+
"Analyze Excel or CSV file by passing the data preview to LLM and getting the Python Pandas operation to run"
|
237 |
+
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=64)
|
238 |
|
239 |
+
# 1. full dataframe
|
240 |
try:
|
241 |
df = pd.read_excel(BytesIO(xls_bytes))
|
242 |
except Exception:
|
243 |
df = pd.read_csv(BytesIO(xls_bytes))
|
244 |
|
245 |
+
for col in df.select_dtypes(include="number").columns:
|
246 |
+
df[col] = df[col].astype(float)
|
|
|
247 |
|
248 |
+
# 2. ask the LLM for a single expression
|
249 |
+
prompt = get_prompt(
|
250 |
+
prompt_key="excel_analysis_one_liner", preview=df.head(5).to_dict(orient="list")
|
251 |
+
)
|
252 |
+
expr = llm.invoke(prompt).content.strip()
|
253 |
|
254 |
+
# 3. run it on the FULL df
|
255 |
+
try:
|
256 |
+
result = eval(expr, {"df": df, "pd": pd, "__builtins__": {}})
|
257 |
+
# ── normalize scalars to string -------------------------------------------
|
258 |
+
if isinstance(result, np.generic):
|
259 |
+
# keep existing LLM formatting (e.g. {:.2f}) if it's already a str
|
260 |
+
result = float(result) # → plain Python float
|
261 |
+
return f"{result:.2f}" # or str(result) if no decimals needed
|
262 |
+
|
263 |
+
# DataFrame / Series → single-line string
|
264 |
+
return (
|
265 |
+
result.to_string(index=False)
|
266 |
+
if hasattr(result, "to_string")
|
267 |
+
else str(result)
|
268 |
+
)
|
269 |
+
except Exception as e:
|
270 |
+
return f"eval_error:{e}"
|
271 |
|
272 |
|
273 |
__all__ = [
|