Update agent.py
agent.py
CHANGED
@@ -4,6 +4,7 @@ import os
 import re
 from typing import Dict, Any, List
 from urllib.parse import urlparse
+import torch
 
 # Third-party imports
 import requests
@@ -34,6 +35,7 @@ from llama_index.readers.web import TrafilaturaWebReader
 from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
 from llama_index.tools.arxiv import ArxivToolSpec
 from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
+from llama_index.core.agent.workflow import AgentWorkflow
 
 # --- Import all required official LlamaIndex Readers ---
 from llama_index.readers.file import (
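The new AgentWorkflow import signals a move toward the workflow-based agent API. A minimal sketch of how it is typically wired up; proj_llm and enhanced_web_search_tool are names assumed from elsewhere in this file, and the actual wiring is outside this diff:

# Hypothetical wiring, not part of this commit:
agent = AgentWorkflow.from_tools_or_functions(
    [enhanced_web_search_tool],  # assumed tool list
    llm=proj_llm,
    system_prompt="You are a research agent that answers GAIA questions.",
)
# AgentWorkflow.run is async:
# response = await agent.run(user_msg="...")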
@@ -53,8 +55,6 @@ from llama_index.core.query_pipeline import QueryPipeline
 import importlib.util
 import sys
 
-wandb_callback = WandbCallbackHandler(run_args={"project": "gaia-llamaindex-agents"})
-llama_debug = LlamaDebugHandler(print_trace_on_end=True)
 
 # Comprehensive callback manager
 callback_manager = CallbackManager([
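With the module-level handler globals removed, the handlers can instead be constructed inline where the manager is built. A minimal sketch, assuming WandbCallbackHandler and LlamaDebugHandler remain imported:

callback_manager = CallbackManager([
    LlamaDebugHandler(print_trace_on_end=True),
    WandbCallbackHandler(run_args={"project": "gaia-llamaindex-agents"}),
])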
@@ -100,7 +100,6 @@ code_llm = HuggingFaceLLM(
     generate_kwargs={"temperature": 0.0, "do_sample": False}
 )
 
-
 embed_model = HuggingFaceEmbedding("BAAI/bge-small-en-v1.5")
 
 wandb.init(project="gaia-llamaindex-agents")  # Choose your project name
@@ -112,7 +111,6 @@ Settings.llm = proj_llm
 Settings.embed_model = embed_model
 Settings.callback_manager = callback_manager
 
-
 def read_and_parse_content(input_path: str) -> List[Document]:
     """
     Reads and parses content from a local file path into Document objects.
|
|
177 |
|
178 |
return documents
|
179 |
|
180 |
-
# --- Create the final LlamaIndex Tool from the completed function ---
|
181 |
-
extract_url_tool = FunctionTool.from_defaults(
|
182 |
-
fn=search_and_extract_top_url,
|
183 |
-
name="extract_url_tool",
|
184 |
-
description="Searches web and returns a relevant URL based on a query"
|
185 |
-
)
|
186 |
-
|
187 |
class DynamicQueryEngineManager:
|
188 |
"""Single unified manager for all RAG operations - replaces the entire static approach."""
|
189 |
|
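The deleted extract_url_tool returned only a bare URL; this commit folds search and extraction into one step (see search_and_extract_content_from_url and enhanced_web_search_and_update below). A sketch of the replacement tool wiring, with the name and description assumed rather than taken from this diff:

enhanced_web_search_tool = FunctionTool.from_defaults(
    fn=enhanced_web_search_and_update,
    name="enhanced_web_search",  # hypothetical name
    description="Search the web, extract the top result's content, and add it to the knowledge base.",
)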
@@ -205,27 +196,67 @@ class DynamicQueryEngineManager:
         print(f"Loaded {len(self.documents)} initial documents")
 
     def _create_rag_tool(self):
-        """Create RAG tool using
+        """Create RAG tool using multimodal-aware parsing."""
         documents = self.documents if self.documents else [
            Document(text="No documents loaded yet. Use web search to add content.")
         ]
 
-            window_size=3,
-            window_metadata_key="window",
-            original_text_metadata_key="original_text",
-        )
-
+        # Separate text and image documents for proper processing
+        text_documents = []
+        image_documents = []
+
+        for doc in documents:
+            doc_type = doc.metadata.get("type", "")
+            source = doc.metadata.get("source", "").lower()
+            file_type = doc.metadata.get("file_type", "")
+
+            # Identify image documents
+            if (doc_type in ["image", "web_image"] or
+                file_type in ['jpg', 'png', 'jpeg', 'gif', 'bmp', 'webp'] or
+                any(ext in source for ext in ['.jpg', '.png', '.jpeg', '.gif', '.bmp', '.webp'])):
+                image_documents.append(doc)
+            else:
+                text_documents.append(doc)
 
+        # Use UnstructuredElementNodeParser for text content with multimodal awareness
+        element_parser = UnstructuredElementNodeParser()
+
+        nodes = []
+
+        # Process text documents with UnstructuredElementNodeParser
+        if text_documents:
+            try:
+                text_nodes = element_parser.get_nodes_from_documents(text_documents)
+                nodes.extend(text_nodes)
+            except Exception as e:
+                print(f"Error parsing text documents with UnstructuredElementNodeParser: {e}")
+                # Fallback to simple parsing if UnstructuredElementNodeParser fails
+                from llama_index.core.node_parser import SimpleNodeParser
+                simple_parser = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=200)
+                text_nodes = simple_parser.get_nodes_from_documents(text_documents)
+                nodes.extend(text_nodes)
 
+        # Process image documents as ImageNodes
+        if image_documents:
+            for img_doc in image_documents:
+                try:
+                    image_node = ImageNode(
+                        text=img_doc.text or f"Image content from {img_doc.metadata.get('source', 'unknown')}",
+                        metadata=img_doc.metadata,
+                        image_path=img_doc.metadata.get("path"),
+                        image=img_doc.metadata.get("image_data")
+                    )
+                    nodes.append(image_node)
+                except Exception as e:
+                    print(f"Error creating ImageNode: {e}")
+                    # Fallback to regular TextNode for images
+                    text_node = TextNode(
+                        text=img_doc.text or f"Image content from {img_doc.metadata.get('source', 'unknown')}",
+                        metadata=img_doc.metadata
+                    )
+                    nodes.append(text_node)
+
+        index = VectorStoreIndex(nodes)
         class HybridReranker:
             def __init__(self):
                 self.text_reranker = SentenceTransformerRerank(
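The hunk cuts off before _create_rag_tool finishes. A sketch of how the index built above is typically turned into a query-engine tool; the names, the top-k value, and the use of HybridReranker as a node postprocessor are assumptions, not part of this diff:

query_engine = index.as_query_engine(
    similarity_top_k=10,  # assumed value
    node_postprocessors=[HybridReranker()],
)
self.rag_tool = QueryEngineTool.from_defaults(  # hypothetical attribute name
    query_engine,
    name="dynamic_knowledge_base",
    description="Query all documents loaded so far, text and images alike.",
)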
@@ -304,7 +335,6 @@ dynamic_qe_manager = DynamicQueryEngineManager()
 # This tool returns text summaries of search results, not just URLs.
 base_duckduckgo_tool = DuckDuckGoSearchToolSpec().to_tool_list()[1]
 
-
 def search_and_extract_content_from_url(query: str) -> List[Document]:
     """
     Searches web, gets top URL, and extracts both text content and images.
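Indexing to_tool_list()[1] is positional and brittle. A defensive alternative (a sketch; the tool name duckduckgo_full_search matches the current DuckDuckGo tool spec as far as I know, but verify it against the installed version):

ddg_tools = DuckDuckGoSearchToolSpec().to_tool_list()
base_duckduckgo_tool = next(
    t for t in ddg_tools if t.metadata.name == "duckduckgo_full_search"
)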
@@ -327,7 +357,7 @@ def search_and_extract_content_from_url(query: str) -> List[Document]:
         documents = loader.load_data(youtubelinks=[url])
     else:
         loader = TrafilaturaWebReader(include_images=True)
-        documents = loader.load_data(
+        documents = loader.load_data(urls=[url])
 
 
 def enhanced_web_search_and_update(query: str) -> str:
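One caveat on the YouTube branch above: recent releases of llama-index-readers-youtube-transcript name the keyword ytlinks rather than youtubelinks, so that call is worth checking against the installed version (an assumption based on the published reader API, not on this diff):

# documents = YoutubeTranscriptReader().load_data(ytlinks=[url])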