philincloud committed on
Commit 5798d39 · verified · 1 Parent(s): 3ea579f

Update langgraph_agent.py

Files changed (1)
  1. langgraph_agent.py +52 -57
langgraph_agent.py CHANGED
@@ -5,8 +5,8 @@ import pandas as pd
 from typing import Dict, List, Union
 import re
 
-from PIL import Image as PILImage
-from huggingface_hub import InferenceClient
+from PIL import Image as PILImage  # Keep PIL for potential future use or if other parts depend on it; describe_image is removed.
+from huggingface_hub import InferenceClient  # Keep InferenceClient for other potential HF uses; describe_image is removed.
 
 from langgraph.graph import START, StateGraph, MessagesState
 from langgraph.prebuilt import tools_condition, ToolNode
@@ -82,12 +82,14 @@ def arvix_search(query: str) -> dict:
     )
     return {"arvix_results": formatted}
 
+# HF_API_TOKEN is no longer needed for describe_image, as that tool is removed,
+# but the InferenceClient initialization is kept in case other HF tools are added later.
 HF_API_TOKEN = os.getenv("HF_API_TOKEN")
 HF_INFERENCE_CLIENT = None
 if HF_API_TOKEN:
     HF_INFERENCE_CLIENT = InferenceClient(token=HF_API_TOKEN)
 else:
-    print("WARNING: HF_API_TOKEN not set. Image and Audio tools will not function.")
+    print("WARNING: HF_API_TOKEN not set. Any HF tools that require it will not function.")
 
 @tool
 def read_file_content(file_path: str) -> Dict[str, str]:
@@ -105,12 +107,10 @@ def read_file_content(file_path: str) -> Dict[str, str]:
         content = df.to_string()
         return {"file_type": "excel", "file_name": file_path, "file_content": content}
     elif file_extension in (".jpeg", ".jpg", ".png"):
-        return {"file_type": "image", "file_name": file_path, "file_content": f"Image file '{file_path}' detected. Use 'describe_image' tool to get a textual description."}
+        # For images, we indicate it's an image file and expect the LLM to handle the blob directly.
+        return {"file_type": "image", "file_name": file_path, "file_content": f"Image file '{file_path}' detected. The LLM (Gemini 2.5 Pro) can process this image content directly."}
     elif file_extension == ".mp3":
         # For MP3, we indicate it's an audio file and expect the LLM to handle the blob directly.
-        # In a real Langchain setup, you might actually read the bytes here and pass them
-        # as a part of the message content to the LLM if it supports direct binary upload.
-        # For now, this tool simply confirms its type for the agent.
         return {"file_type": "audio", "file_name": file_path, "file_content": f"Audio file '{file_path}' detected. The LLM (Gemini 2.5 Pro) can process this audio content directly."}
     else:
         return {"file_type": "unsupported", "file_name": file_path, "file_content": f"Unsupported file type: {file_extension}. Only .txt, .py, .xlsx, .jpeg, .jpg, .png, .mp3 files are recognized."}
@@ -133,21 +133,6 @@ def python_interpreter(code: str) -> Dict[str, str]:
     except Exception as e:
         return {"execution_error": str(e)}
 
-@tool
-def describe_image(image_path: str) -> Dict[str, str]:
-    """Generates a textual description for an image file (JPEG, JPG, PNG) using an image-to-text model from the Hugging Face Inference API. Requires HF_API_TOKEN environment variable to be set."""
-    if not HF_INFERENCE_CLIENT:
-        return {"error": "Hugging Face API token not configured for image description. Cannot use this tool."}
-    try:
-        with open(image_path, "rb") as f:
-            image_bytes = f.read()
-        description = HF_INFERENCE_CLIENT.image_to_text(image_bytes)
-        return {"image_description": description, "image_path": image_path}
-    except FileNotFoundError:
-        return {"error": f"Image file not found: {image_path}. Please ensure the file exists."}
-    except Exception as e:
-        return {"error": f"Error describing image {image_path}: {str(e)}"}
-
 # --- Youtube Tool (Remains the same) ---
 @tool
 def Youtube(url: str, question: str) -> Dict[str, str]:
@@ -181,10 +166,10 @@ def Youtube(url: str, question: str) -> Dict[str, str]:
 # --- END YOUTUBE TOOL ---
 
 API_KEY = os.getenv("GEMINI_API_KEY")
-HF_API_TOKEN = os.getenv("HF_SPACE_TOKEN")
+HF_API_TOKEN = os.getenv("HF_SPACE_TOKEN")  # Kept for potential future HF uses, but not for describe_image.
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 
-# Update the tools list (removed transcribe_audio)
+# Update the tools list (removed describe_image)
 tools = [
     multiply, add, subtract, divide, modulus,
     wiki_search,
@@ -192,8 +177,7 @@ tools = [
     arvix_search,
     read_file_content,
     python_interpreter,
-    describe_image,
-    Youtube, # <-- transcribe_audio has been removed
+    Youtube,
 ]
 
 with open("prompt.txt", "r", encoding="utf-8") as f:
@@ -224,38 +208,49 @@ def build_graph(provider: str = "gemini"):
     def assistant(state: MessagesState):
         messages_to_send = [sys_msg] + state["messages"]
 
-        # When sending messages to Gemini, if read_file_content identified an audio file,
-        # you'll need to ensure the actual binary content of the audio file is included
-        # in the message parts for the LLM to process it natively.
-        # This part requires a bit more advanced handling than just text.
-        # Langchain often handles this when you use `tool_code.File(...)` or similar constructs.
-        # For simplicity in this prompt and code example, we're assuming the framework
-        # will correctly pass the file content if `read_file_content` returns an audio type.
-
-        # A more robust implementation would involve modifying the `assistant` node
-        # to explicitly read the file bytes and add them to the message parts
-        # if a file is detected in the input state.
-
-        # Example of how you might include binary content (conceptual, depends on LangChain/API):
+        # --- IMPORTANT NOTE ON HANDLING BINARY BLOB DATA FOR MULTIMODAL LLMs ---
+        # When read_file_content returns a file_type of "image" or "audio",
+        # the agent should be able to send the actual binary data of that file
+        # as part of the message to the LLM. LangChain's ChatGoogleGenerativeAI
+        # supports this via content parts in HumanMessage.
+        #
+        # For this setup, we're assuming the framework (LangGraph/LangChain)
+        # will correctly pass the actual file content when read_file_content
+        # is called and its output indicates a media type.
+        #
+        # A more explicit implementation in the assistant node might look like this
+        # for real binary file handling if the framework doesn't do it implicitly:
+        #
         # new_messages_to_send = []
-        # for msg in messages_to_send:
-        #     if isinstance(msg, HumanMessage) and "audio file" in msg.content:  # Simplified check
-        #         # Assume you can get the actual file path from the context
-        #         file_path_from_context = "Strawberry pie.mp3"  # Or extract from msg.content
-        #         if os.path.exists(file_path_from_context):
-        #             with open(file_path_from_context, "rb") as f:
-        #                 audio_bytes = f.read()
-        #             new_messages_to_send.append(
-        #                 HumanMessage(
-        #                     content=[
-        #                         {"type": "text", "text": "Here is the audio file:"},
-        #                         {"type": "media", "media_type": "audio/mp3", "data": audio_bytes}
-        #                     ]
-        #                 )
-        #             )
-        #     else:
-        #         new_messages_to_send.append(msg)
-        # llm_response = llm_with_tools.invoke(new_messages_to_send)
+        # for msg in state["messages"]:
+        #     if isinstance(msg, HumanMessage) and msg.tool_calls:
+        #         # If a tool call to read_file_content happened in the previous turn
+        #         # and it returned a media type, we might need to get the file data
+        #         # and append it to the message parts. This logic is complex and
+        #         # depends heavily on how tool outputs are structured and passed.
+        #         # For simplicity in this template, we assume direct handling by the LLM
+        #         # if the tool output indicates media, and the file itself is accessible
+        #         # via the environment.
+        #         pass  # Keep original message; tool output will follow.
+        #     elif isinstance(msg, HumanMessage) and any(part.get("file_type") in ["image", "audio"] for part in msg.content if isinstance(part, dict)):
+        #         # This is a conceptual example for when the HumanMessage itself contains
+        #         # file data or a reference that needs to be resolved into data.
+        #         # You'd need to load the actual file bytes here,
+        #         # e.g. if msg.content were: [{"type": "file_reference", "file_path": "image.png"}]
+        #         # with open(msg.content[0]["file_path"], "rb") as f:
+        #         #     file_bytes = f.read()
+        #         # new_messages_to_send.append(
+        #         #     HumanMessage(
+        #         #         content=[
+        #         #             {"type": "text", "text": "Here is the media content:"},
+        #         #             {"type": "image_data" if "image" in msg.content[0]["file_type"] else "audio_data", "data": base64.b64encode(file_bytes).decode('utf-8'), "media_type": "image/png" if "image" in msg.content[0]["file_type"] else "audio/mp3"}
+        #         #         ]
+        #         #     )
+        #         # )
+        #     else:
+        #         new_messages_to_send.append(msg)
+        # llm_response = llm_with_tools.invoke([sys_msg] + new_messages_to_send)
+        # --- END IMPORTANT NOTE ---
 
         llm_response = llm_with_tools.invoke(messages_to_send)  # For now, keep as is; rely on the framework.
         print(f"LLM Raw Response: {llm_response}")
 