Essi committed
Commit ce76bed · 1 Parent(s): 21394c0

feat: add prompt retrieval functionality and new Excel analysis prompt template

Files changed (4)
  1. .pre-commit-config.yaml +3 -2
  2. helpers.py +15 -4
  3. prompts.yaml +22 -0
  4. tools.py +34 -24
.pre-commit-config.yaml CHANGED
@@ -1,19 +1,20 @@
 repos:
   # ── FORMATTERS ─────────────────────────────────────────────────────────────
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.11
+    rev: v0.11.12
     hooks:
       - id: ruff-check
       - id: ruff-format
   # ── STATIC ANALYSIS ────────────────────────────────────────────────────────
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.15.0
+    rev: v1.16.0
     hooks:
       - id: mypy
         additional_dependencies:
           - "pydantic==1.10.*"
           - "types-requests"
           - "mypy-extensions"
+          - "types-PyYAML"
   # ── SECRET / KEY DETECTORS ─────────────────────────────────────────────────
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v5.0.0
helpers.py CHANGED
@@ -1,8 +1,14 @@
 import csv
-import io
-import zipfile
+from io import BytesIO
+from pathlib import Path
+from zipfile import BadZipFile, ZipFile
 
 import requests
+from yaml import safe_load
+
+CURRENT_DIR = Path(__file__).parent
+
+_PROMPTS = safe_load(CURRENT_DIR.joinpath("prompts.yaml").read_text())
 
 
 def fetch_task_file(api_url: str, task_id: str) -> tuple[bytes, str]:
@@ -29,11 +35,11 @@ def sniff_excel_type(blob: bytes) -> str:
     # 1️⃣ XLSX / XLSM / ODS (ZIP container)
     if blob[:4] == b"PK\x03\x04":
         try:
-            with zipfile.ZipFile(io.BytesIO(blob)) as zf:
+            with ZipFile(BytesIO(blob)) as zf:
                 names = set(zf.namelist())
                 if {"xl/workbook.xml", "[Content_Types].xml"} & names:
                     return "xlsx"
-        except zipfile.BadZipFile:
+        except BadZipFile:
             pass  # fall through
 
     # 2️⃣ Legacy XLS (OLE Compound File)
@@ -52,3 +58,8 @@ def sniff_excel_type(blob: bytes) -> str:
         pass
 
     return ""
+
+
+def get_prompt(prompt_key: str, **kwargs: str) -> str:
+    """Get a prompt by key and fill in placeholders via `.format(**kwargs)`."""
+    return _PROMPTS[prompt_key].format(**kwargs)
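With this commit, prompt templates live in prompts.yaml next to helpers.py and are looked up by key: get_prompt parses the YAML once at import time and fills the {placeholders} with str.format. A minimal usage sketch, assuming the module layout above; the preview value and question text are made up for illustration:

from helpers import get_prompt

# Both placeholders of the template must be supplied, otherwise str.format raises KeyError.
prompt = get_prompt(
    "excel_analysis_one_liner",
    preview={"Item": ["Burger", "Fries"], "Sales": [12.5, 4.0]},  # hypothetical preview
    question="What is the total of the Sales column?",            # hypothetical question
)
print(prompt)  # rendered template, ready to send to the LLM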
prompts.yaml ADDED
@@ -0,0 +1,22 @@
+excel_analysis_one_liner: |
+  You are a **pandas one-liner generator**.
+
+  Context
+  -------
+  • A full DataFrame named `df` is already loaded.
+  • Only the preview below is shown for reference; use column names from it.
+
+  Preview
+  -------
+  {preview}
+
+  Formatting rules
+  ----------------
+  1. Result must be a plain Python scalar (use .item(), float(), int() …).
+  2. If the question asks for currency / 2 decimals → wrap in an f-string.
+  3. If the question asks for a count → wrap in int().
+  4. Return **one** expression, nothing else.
+
+  Question
+  --------
+  {question}
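The formatting rules steer the model toward a single expression that evaluates to a plain Python scalar. A sketch of the kind of one-liner the template is meant to elicit, against a hypothetical DataFrame with Item and Sales columns:

import pandas as pd

df = pd.DataFrame({"Item": ["Burger", "Fries", "Soda"], "Sales": [12.5, 4.0, 2.0]})  # hypothetical data

# "What were total Sales in dollars, to 2 decimals?" → rule 2: wrap in an f-string
f"${df['Sales'].sum():.2f}"   # → "$18.50"

# "How many items sold for more than 3?" → rule 3: wrap in int()
int((df["Sales"] > 3).sum())  # → 2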
tools.py CHANGED
@@ -8,15 +8,18 @@ from functools import lru_cache
 from io import BytesIO
 from tempfile import NamedTemporaryFile
 
+import numpy as np
+import pandas as pd
 from langchain_community.document_loaders import WikipediaLoader
 from langchain_community.tools.tavily_search import TavilySearchResults
 from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
 from langchain_core.messages import HumanMessage, SystemMessage
 from langchain_core.tools import tool
 from langchain_openai import ChatOpenAI
-from transformers import pipeline
 from youtube_transcript_api import YouTubeTranscriptApi
 
+from helpers import get_prompt
+
 # --------------------------------------------------------------------------- #
 #                         ARITHMETIC (SAFE CALCULATOR)                         #
 # --------------------------------------------------------------------------- #
@@ -57,8 +60,6 @@ def calculator(expression: str) -> str:
 # --------------------------------------------------------------------------- #
 #                              WEB & WIKI SEARCH                               #
 # --------------------------------------------------------------------------- #
-
-
 @lru_cache(maxsize=256)
 def _ddg_search(query: str, k: int = 6) -> list[dict[str, str]]:
     """Cached DuckDuckGo JSON search."""
@@ -114,8 +115,6 @@ def wiki_search(query: str, max_pages: int = 2) -> str:
 # --------------------------------------------------------------------------- #
 #                              YOUTUBE TRANSCRIPT                              #
 # --------------------------------------------------------------------------- #
-
-
 @tool
 def youtube_transcript(url: str, chars: int = 10_000) -> str:
     """Fetch full YouTube transcript (first *chars* characters)."""
@@ -137,10 +136,9 @@ def youtube_transcript(url: str, chars: int = 10_000) -> str:
 # Instantiate a lightweight CLIP-based zero-shot image classifier (runs on CPU)
 ### The model 'openai/clip-vit-base-patch32' is a vision transformer (ViT) model trained as part of OpenAI's CLIP project.
 ### It performs zero-shot image classification by mapping images and labels into the same embedding space.
-_image_pipe = pipeline(
-    "image-classification", model="openai/clip-vit-base-patch32", device="cpu"
-)
-
+# _image_pipe = pipeline(
+#     "image-classification", model="openai/clip-vit-base-patch32", device="cpu"
+# )
 
 # @tool
 # def image_describe(img_bytes: bytes, top_k: int = 3) -> str:
@@ -200,8 +198,6 @@ def vision_task(img_bytes: bytes, question: str) -> str:
 # --------------------------------------------------------------------------- #
 #                                  FILE UTILS                                  #
 # --------------------------------------------------------------------------- #
-
-
 @tool
 def run_py(code: str) -> str:
     """Execute Python code in a sandboxed subprocess and return last stdout line."""
@@ -237,27 +233,41 @@ def transcribe_via_whisper(mp3_bytes: bytes) -> str:
 
 @tool
 def analyze_excel_file(xls_bytes: bytes, question: str) -> str:
-    """Generic Excel/CSV aggregation handler."""
-    import pandas as pd
+    "Analyze an Excel or CSV file by passing a data preview to the LLM and running the pandas one-liner it returns"
+    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=64)
 
-    # Try both Excel and CSV loaders
+    # 1. full dataframe
     try:
         df = pd.read_excel(BytesIO(xls_bytes))
     except Exception:
         df = pd.read_csv(BytesIO(xls_bytes))
 
-    numeric = df.select_dtypes("number")
-    if numeric.empty:
-        return "No numeric data"
+    for col in df.select_dtypes(include="number").columns:
+        df[col] = df[col].astype(float)
 
-    q = question.lower()
-    if any(term in q for term in ["total", "sum", "aggregate"]):
-        return f"{numeric.sum().sum():.2f}"
-    if any(term in q for term in ["average", "mean"]):
-        return f"{numeric.mean().mean():.2f}"
+    # 2. ask the LLM for a single expression
+    prompt = get_prompt(
+        prompt_key="excel_analysis_one_liner", preview=df.head(5).to_dict(orient="list"), question=question
+    )
+    expr = llm.invoke(prompt).content.strip()
 
-    # Fallback: return first 10 rows as csv for LLM to reason on
-    return df.head(10).to_csv(index=False)
+    # 3. run it on the FULL df
+    try:
+        result = eval(expr, {"df": df, "pd": pd, "__builtins__": {}})
+        # ── normalize scalars to string ─────────────────────────────────────
+        if isinstance(result, np.generic):
+            # keep existing LLM formatting (e.g. {:.2f}) if it's already a str
+            result = float(result)  # → plain Python float
+            return f"{result:.2f}"  # or str(result) if no decimals needed
+
+        # DataFrame / Series → single-line string
+        return (
+            result.to_string(index=False)
+            if hasattr(result, "to_string")
+            else str(result)
+        )
+    except Exception as e:
+        return f"eval_error:{e}"
 
 
 __all__ = [
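The eval in step 3 runs the model's expression against the full DataFrame with an emptied __builtins__, so only the injected df and pd names are resolvable inside the expression (builtins such as float and int are not). A self-contained sketch of that evaluation and scalar normalization, with a made-up DataFrame and a hard-coded expression standing in for the LLM output:

import numpy as np
import pandas as pd

df = pd.DataFrame({"Item": ["Burger", "Fries"], "Sales": [12.5, 4.0]})  # hypothetical data
expr = 'df["Sales"].sum()'  # stand-in for llm.invoke(prompt).content.strip()

result = eval(expr, {"df": df, "pd": pd, "__builtins__": {}})  # only df and pd are visible
if isinstance(result, np.generic):   # numpy scalar → plain float, formatted to 2 decimals
    print(f"{float(result):.2f}")    # → 16.50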