Spaces:
Sleeping
Sleeping
Essi
committed on
Commit
·
545e8ad
1
Parent(s):
2e2232b
feat: refactor image processing to use multimodal LLM queries with `vision_task` instead of CLIP model function
Browse files
tools.py
CHANGED
@@ -3,15 +3,17 @@ import json
|
|
3 |
import operator
|
4 |
import re
|
5 |
import subprocess
|
|
|
6 |
from functools import lru_cache
|
7 |
from io import BytesIO
|
8 |
from tempfile import NamedTemporaryFile
|
9 |
|
10 |
-
import requests
|
11 |
from langchain_community.document_loaders import WikipediaLoader
|
12 |
from langchain_community.tools.tavily_search import TavilySearchResults
|
13 |
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
|
|
|
14 |
from langchain_core.tools import tool
|
|
|
15 |
from transformers import pipeline
|
16 |
from youtube_transcript_api import YouTubeTranscriptApi
|
17 |
|
@@ -140,14 +142,57 @@ _image_pipe = pipeline(
|
|
140 |
)
|
141 |
|
142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
@tool
|
144 |
-
def
|
145 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
try:
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
except Exception as exc:
|
152 |
return f"img_error:{exc}"
|
153 |
|
@@ -220,7 +265,7 @@ __all__ = [
|
|
220 |
"web_multi_search",
|
221 |
"wiki_search",
|
222 |
"youtube_transcript",
|
223 |
-
"
|
224 |
"run_py",
|
225 |
"transcribe_via_whisper",
|
226 |
"analyze_excel_file",
|
|
|
3 |
import operator
|
4 |
import re
|
5 |
import subprocess
|
6 |
+
from base64 import b64encode
|
7 |
from functools import lru_cache
|
8 |
from io import BytesIO
|
9 |
from tempfile import NamedTemporaryFile
|
10 |
|
|
|
11 |
from langchain_community.document_loaders import WikipediaLoader
|
12 |
from langchain_community.tools.tavily_search import TavilySearchResults
|
13 |
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
|
14 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
15 |
from langchain_core.tools import tool
|
16 |
+
from langchain_openai import ChatOpenAI
|
17 |
from transformers import pipeline
|
18 |
from youtube_transcript_api import YouTubeTranscriptApi
|
19 |
|
|
|
142 |
)
|
143 |
|
144 |
|
145 |
+
# @tool
|
146 |
+
# def image_describe(img_bytes: bytes, top_k: int = 3) -> str:
|
147 |
+
# """Return the top-k CLIP labels for an image supplied as raw bytes.
|
148 |
+
|
149 |
+
# typical result for a random cat photo can be:
|
150 |
+
# [
|
151 |
+
# {'label': 'tabby, tabby cat', 'score': 0.41},
|
152 |
+
# {'label': 'tiger cat', 'score': 0.24},
|
153 |
+
# {'label': 'Egyptian cat', 'score': 0.22}
|
154 |
+
# ]
|
155 |
+
# """
|
156 |
+
|
157 |
+
# try:
|
158 |
+
# labels = _image_pipe(BytesIO(img_bytes))[:top_k]
|
159 |
+
# return ", ".join(f"{d['label']} (score={d['score']:.2f})" for d in labels)
|
160 |
+
# except Exception as exc:
|
161 |
+
# return f"img_error:{exc}"
|
162 |
+
|
163 |
+
|
164 |
def _guess_image_mime(img_bytes: bytes) -> str:
    """Return the MIME type implied by the image's magic bytes (PNG if unknown)."""
    if img_bytes.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if img_bytes.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if img_bytes[:4] == b"RIFF" and img_bytes[8:12] == b"WEBP":
        return "image/webp"
    # PNG, or anything unrecognised: keep the label the tool has always sent.
    return "image/png"


@tool
def vision_task(img_bytes: bytes, question: str) -> str:
    """
    Pass the user's question AND the referenced image to a multimodal LLM and
    return its stripped text reply as the answer. No domain assumptions made.

    Never raises: on any failure the string "img_error:<reason>" is returned.
    """
    sys_prompt = (
        "You are a terse assistant. Respond with ONLY the answer to the user's "
        "question—no explanations, no punctuation except what the answer itself "
        "requires. If the answer is a chess move, output it in algebraic notation."
    )
    # Fail fast instead of spending an API call on an empty payload.
    if not img_bytes:
        return "img_error:empty image"
    try:
        # Built inside the try so that configuration problems (e.g. a missing
        # OPENAI_API_KEY) are reported through the same "img_error:" channel
        # as request failures instead of escaping the tool as an exception.
        vision_llm = ChatOpenAI(
            model="gpt-4o-mini",  # set OPENAI_API_KEY in env
            temperature=0,
            max_tokens=64,
        )
        b64 = b64encode(img_bytes).decode()
        # Label the data URL with the real image format rather than always PNG.
        data_url = f"data:{_guess_image_mime(img_bytes)};base64,{b64}"
        messages = [
            SystemMessage(content=sys_prompt),
            HumanMessage(
                content=[
                    {"type": "text", "text": question.strip()},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ]
            ),
        ]
        return vision_llm.invoke(messages).content.strip()
    except Exception as exc:
        return f"img_error:{exc}"
|
198 |
|
|
|
265 |
"web_multi_search",
|
266 |
"wiki_search",
|
267 |
"youtube_transcript",
|
268 |
+
"vision_task",
|
269 |
"run_py",
|
270 |
"transcribe_via_whisper",
|
271 |
"analyze_excel_file",
|