Essi committed on
Commit
545e8ad
·
1 Parent(s): 2e2232b

feat: refactor image processing to use multimodal LLM queries with `vision_task` instead of CLIP model function

Browse files
Files changed (1) hide show
  1. tools.py +53 -8
tools.py CHANGED
@@ -3,15 +3,17 @@ import json
3
  import operator
4
  import re
5
  import subprocess
 
6
  from functools import lru_cache
7
  from io import BytesIO
8
  from tempfile import NamedTemporaryFile
9
 
10
- import requests
11
  from langchain_community.document_loaders import WikipediaLoader
12
  from langchain_community.tools.tavily_search import TavilySearchResults
13
  from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
 
14
  from langchain_core.tools import tool
 
15
  from transformers import pipeline
16
  from youtube_transcript_api import YouTubeTranscriptApi
17
 
@@ -140,14 +142,57 @@ _image_pipe = pipeline(
140
  )
141
 
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  @tool
144
- def image_describe(image_url: str, top_k: int = 3) -> str:
145
- """Download an image and return top-k labels using CLIP zero-shot classification."""
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  try:
147
- resp = requests.get(image_url, timeout=10)
148
- resp.raise_for_status()
149
- labels = _image_pipe(BytesIO(resp.content))[:top_k]
150
- return ", ".join(f"{d['label']} ({d['score']:.2f})" for d in labels)
 
 
 
 
 
 
 
 
 
 
 
151
  except Exception as exc:
152
  return f"img_error:{exc}"
153
 
@@ -220,7 +265,7 @@ __all__ = [
220
  "web_multi_search",
221
  "wiki_search",
222
  "youtube_transcript",
223
- "image_describe",
224
  "run_py",
225
  "transcribe_via_whisper",
226
  "analyze_excel_file",
 
3
  import operator
4
  import re
5
  import subprocess
6
+ from base64 import b64encode
7
  from functools import lru_cache
8
  from io import BytesIO
9
  from tempfile import NamedTemporaryFile
10
 
 
11
  from langchain_community.document_loaders import WikipediaLoader
12
  from langchain_community.tools.tavily_search import TavilySearchResults
13
  from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
14
+ from langchain_core.messages import HumanMessage, SystemMessage
15
  from langchain_core.tools import tool
16
+ from langchain_openai import ChatOpenAI
17
  from transformers import pipeline
18
  from youtube_transcript_api import YouTubeTranscriptApi
19
 
 
142
  )
143
 
144
 
145
+ # @tool
146
+ # def image_describe(img_bytes: bytes, top_k: int = 3) -> str:
147
+ # """Return the top-k CLIP labels for an image supplied as raw bytes.
148
+
149
+ # typical result for a random cat photo can be:
150
+ # [
151
+ # {'label': 'tabby, tabby cat', 'score': 0.41},
152
+ # {'label': 'tiger cat', 'score': 0.24},
153
+ # {'label': 'Egyptian cat', 'score': 0.22}
154
+ # ]
155
+ # """
156
+
157
+ # try:
158
+ # labels = _image_pipe(BytesIO(img_bytes))[:top_k]
159
+ # return ", ".join(f"{d['label']} (score={d['score']:.2f})" for d in labels)
160
+ # except Exception as exc:
161
+ # return f"img_error:{exc}"
162
+
163
+
164
  @tool
165
+ def vision_task(img_bytes: bytes, question: str) -> str:
166
+ """
167
+ Pass the user's question AND the referenced image to a multimodal LLM and
168
+ return its first line of text as the answer. No domain assumptions made.
169
+ """
170
+ sys_prompt = (
171
+ "You are a terse assistant. Respond with ONLY the answer to the user's "
172
+ "question—no explanations, no punctuation except what the answer itself "
173
+ "requires. If the answer is a chess move, output it in algebraic notation."
174
+ )
175
+ vision_llm = ChatOpenAI(
176
+ model="gpt-4o-mini", # set OPENAI_API_KEY in env
177
+ temperature=0,
178
+ max_tokens=64,
179
+ )
180
  try:
181
+ b64 = b64encode(img_bytes).decode()
182
+ messages = [
183
+ SystemMessage(content=sys_prompt),
184
+ HumanMessage(
185
+ content=[
186
+ {"type": "text", "text": question.strip()},
187
+ {
188
+ "type": "image_url",
189
+ "image_url": {"url": f"data:image/png;base64,{b64}"},
190
+ },
191
+ ]
192
+ ),
193
+ ]
194
+ reply = vision_llm.invoke(messages).content.strip()
195
+ return reply
196
  except Exception as exc:
197
  return f"img_error:{exc}"
198
 
 
265
  "web_multi_search",
266
  "wiki_search",
267
  "youtube_transcript",
268
+ "vision_task",
269
  "run_py",
270
  "transcribe_via_whisper",
271
  "analyze_excel_file",