Coool2 committed
Commit bd6923a · verified · 1 Parent(s): afecb73

Update agent.py

Files changed (1):
  1. agent.py +262 -450
agent.py CHANGED
@@ -63,379 +63,209 @@ Settings.llm = proj_llm
 Settings.embed_model = embed_model
 Settings.callback_manager = callback_manager


-
- class EnhancedRAGQueryEngine:
-     def __init__(self, task_context: str = ""):
-         self.task_context = task_context
-         self.embed_model = embed_model
-         self.reranker = SentenceTransformerRerank(model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=5)
-
-         self.readers = {
-             '.pdf': PDFReader(),
-             '.docx': DocxReader(),
-             '.doc': DocxReader(),
-             '.csv': CSVReader(),
-             '.txt': lambda file_path: [Document(text=open(file_path, 'r', encoding='utf-8').read())],
-             '.jpg': ImageReader(),
-             '.jpeg': ImageReader(),
-             '.png': ImageReader(),
-             'web': TrafilaturaWebReader(),
-             'youtube': YoutubeTranscriptReader()
-         }
-
-         self.sentence_window_parser = SentenceWindowNodeParser.from_defaults(
-             window_size=3,
-             window_metadata_key="window",
-             original_text_metadata_key="original_text"
-         )
-
-         self.hierarchical_parser = HierarchicalNodeParser.from_defaults(
-             chunk_sizes=[2048, 512, 128]
-         )
-
-     def load_and_process_documents(self, file_paths: List[str]) -> List[Document]:
-         documents = []
-
-         for file_path in file_paths:
-             file_ext = os.path.splitext(file_path)[1].lower()
-
-             try:
-                 if file_ext in self.readers:
-                     reader = self.readers[file_ext]
-                     if callable(reader):
-                         docs = reader(file_path)
-                     else:
-                         docs = reader.load_data(file=file_path)
-
-                     # Ensure docs is a list
-                     if not isinstance(docs, list):
-                         docs = [docs]
-
-                     # Add metadata to all documents
-                     for doc in docs:
-                         if hasattr(doc, 'metadata'):
-                             doc.metadata.update({
-                                 "file_path": file_path,
-                                 "file_type": file_ext[1:],
-                                 "task_context": self.task_context
-                             })
-                     documents.extend(docs)
-
-             except Exception as e:
-                 # Fallback to text reading
-                 try:
-                     with open(file_path, 'r', encoding='utf-8') as f:
-                         content = f.read()
-                     documents.append(Document(
-                         text=content,
-                         metadata={"file_path": file_path, "file_type": "text", "error": str(e)}
-                     ))
-                 except Exception as fallback_error:
-                     print(f"Failed to process {file_path}: {e}, Fallback error: {fallback_error}")
-
-         return documents
-
-     def create_advanced_index(self, documents: List[Document], use_hierarchical: bool = False) -> VectorStoreIndex:
-         if use_hierarchical or len(documents) > 10:
-             nodes = self.hierarchical_parser.get_nodes_from_documents(documents)
          else:
-             nodes = self.sentence_window_parser.get_nodes_from_documents(documents)
-
-         index = VectorStoreIndex(
-             nodes,
-             embed_model=self.embed_model
-         )
-
-         return index

-     def create_context_aware_query_engine(self, index: VectorStoreIndex):
-         retriever = VectorIndexRetriever(
-             index=index,
-             similarity_top_k=10
-         )
-
-         query_engine = RetrieverQueryEngine(
-             retriever=retriever,
-             node_postprocessors=[self.reranker],
-             llm=proj_llm
-         )
-
-         return query_engine
-
- class HybridWebRAGTool:
-     def __init__(self, rag_engine: EnhancedRAGQueryEngine):
-         self.duckduckgo_tool = DuckDuckGoSearchToolSpec().to_tool_list()[0]
-         self.rag_engine = rag_engine
-
-     def is_youtube_url(self, url: str) -> bool:
-         """Check if URL is a YouTube video"""
-         return 'youtube.com/watch' in url or 'youtu.be/' in url
-
-     def search_and_analyze(self, query: str, max_results: int = 3) -> str:
-         """Search web and analyze content with RAG, including YouTube support"""
-
-         try:
-             # Step 1: Get URLs from DuckDuckGo
-             search_results = self.duckduckgo_tool.call(query=query, max_results=max_results)
-
-             if isinstance(search_results, list):
-                 urls = [r.get('href', '') for r in search_results if r.get('href')]
-             else:
-                 return f"Search failed: {search_results}"
-
-             if not urls:
-                 return "No URLs found in search results"
-
-             # Step 2: Process URLs based on type
-             web_documents = []
-             youtube_urls = []
-             regular_urls = []
-
-             # Separate YouTube URLs from regular web URLs
-             for url in urls:
-                 if self.is_youtube_url(url):
-                     youtube_urls.append(url)
-                 else:
-                     regular_urls.append(url)
-
-             # Process YouTube videos
-             if youtube_urls:
-                 try:
-                     youtube_docs = self.rag_engine.readers['youtube'].load_data(youtube_urls)
-                     if isinstance(youtube_docs, list):
-                         web_documents.extend(youtube_docs)
-                     else:
-                         web_documents.append(youtube_docs)
-                 except Exception as e:
-                     print(f"Failed to load YouTube videos: {e}")
-
-             # Process regular web pages
-             for url in regular_urls:
-                 try:
-                     docs = self.rag_engine.readers['web'].load_data([url])
-                     if isinstance(docs, list):
-                         web_documents.extend(docs)
-                     else:
-                         web_documents.append(docs)
-                 except Exception as e:
-                     print(f"Failed to load {url}: {e}")
-                     continue
-
-             if not web_documents:
-                 return "No content could be extracted from URLs"
-
-             # Step 3: Create temporary index and query
-             temp_index = self.rag_engine.create_advanced_index(web_documents)
-
-             # Step 4: Query the indexed content
-             query_engine = self.rag_engine.create_context_aware_query_engine(temp_index)
-
-             response = query_engine.query(query)
-
-             # Add source information
-             source_info = []
-             if youtube_urls:
-                 source_info.append(f"YouTube videos: {len(youtube_urls)}")
-             if regular_urls:
-                 source_info.append(f"Web pages: {len(regular_urls)}")
-
-             return f"{str(response)}\n\nSources analyzed: {', '.join(source_info)}"
-
-         except Exception as e:
-             return f"Error in hybrid search: {str(e)}"
-
- # Create the research tool function
- def research_tool_function(query: str) -> str:
-     """Combines DuckDuckGo search with RAG analysis of web content and YouTube videos"""
-     try:
-         rag_engine = EnhancedRAGQueryEngine()
-         hybrid_tool = HybridWebRAGTool(rag_engine)
-         return hybrid_tool.search_and_analyze(query)
-     except Exception as e:
-         return f"Research tool error: {str(e)}"
-
- # Create the research tool for your agent
- research_tool = FunctionTool.from_defaults(
-     fn=research_tool_function,
-     name="research_tool",
-     description="""Advanced research tool that combines web search with RAG analysis, supporting both web pages and YouTube videos, with context-aware processing.
-
-     **When to Use:**
-     - Questions requiring external knowledge beyond training data
-     - Current or recent information (post-training cutoff)
-     - Scientific research requiring academic sources
-     - Factual verification of specific claims
-     - Any question where search results could provide the exact answer
-     - Research involving video content and tutorials
-     - Complex queries needing synthesis of multiple sources
-
-     **Advantages:**
-     - Full content analysis from both web and video sources
-     - Automatic content type detection and processing
-     - Semantic search within retrieved content
-     - Reranking for relevance across all source types
-     - Comprehensive synthesis of multimedia information"""
- )
-
- def comprehensive_rag_analysis(file_paths: List[str], query: str, task_context: str = "") -> str:
-     try:
-         rag_engine = EnhancedRAGQueryEngine(task_context)
-         documents = rag_engine.load_and_process_documents(file_paths)
-
-         if not documents:
-             return "No documents could be processed successfully."
-
-         total_text_length = sum(len(doc.text) for doc in documents)
-         use_hierarchical = total_text_length > 50000 or len(documents) > 5
-
-         index = rag_engine.create_advanced_index(documents, use_hierarchical)
-         query_engine = rag_engine.create_context_aware_query_engine(index)
-
-         enhanced_query = f"""
-         Task Context: {task_context}
-         Original Query: {query}
-
-         Please analyze the provided documents and answer the query with precise, factual information.
-         """

-         response = query_engine.query(enhanced_query)
-
-         result = f"**RAG Analysis Results:**\n\n"
-         result += f"**Documents Processed:** {len(documents)}\n"
-         result += f"**Answer:**\n{response.response}\n\n"
-
-         return result
-
-     except Exception as e:
-         return f"RAG analysis failed: {str(e)}"
-
- def cross_document_analysis(file_paths: List[str], query: str, task_context: str = "") -> str:
-     try:
-         rag_engine = EnhancedRAGQueryEngine(task_context)
-         all_documents = []
-         document_groups = {}
-
-         for file_path in file_paths:
-             docs = rag_engine.load_and_process_documents([file_path])
-             doc_key = os.path.basename(file_path)
-             document_groups[doc_key] = docs
-
-             for doc in docs:
-                 doc.metadata.update({
-                     "document_group": doc_key,
-                     "total_documents": len(file_paths)
-                 })
-             all_documents.extend(docs)
-
-         index = rag_engine.create_advanced_index(all_documents, use_hierarchical=True)
-         query_engine = rag_engine.create_context_aware_query_engine(index)
-
-         response = query_engine.query(f"Task: {task_context}\nQuery: {query}")
-
-         result = f"**Cross-Document Analysis:**\n"
-         result += f"**Documents:** {list(document_groups.keys())}\n"
-         result += f"**Answer:**\n{response.response}\n"
-
-         return result
-
-     except Exception as e:
-         return f"Cross-document analysis failed: {str(e)}"
 
- # Create tools
- enhanced_rag_tool = FunctionTool.from_defaults(
-     fn=comprehensive_rag_analysis,
-     name="Enhanced RAG Analysis",
-     description="Comprehensive document analysis using advanced RAG with context-aware processing"
  )

- cross_document_tool = FunctionTool.from_defaults(
-     fn=cross_document_analysis,
-     name="Cross-Document Analysis",
-     description="Advanced analysis across multiple documents with cross-referencing capabilities"
- )

- # Analysis Agent
- analysis_agent = FunctionAgent(
-     name="AnalysisAgent",
-     description="Advanced multimodal analysis using enhanced RAG and cross-document capabilities",
-     system_prompt="""
-     You are an advanced analysis specialist with access to:
-     - Enhanced RAG with hybrid search and reranking
-     - Multi-format document processing (PDF, Word, CSV, images, text)
-     - Cross-document analysis and synthesis
-     - Context-aware query processing
-
-     Your capabilities:
-     1. Process multiple file types simultaneously
-     2. Perform semantic search across document collections
-     3. Cross-reference information between documents
-     4. Extract precise information with source attribution
-     5. Handle both text and visual content analysis
-
-     Always consider the task context and provide precise, well-sourced answers.
-     """,
-     llm=proj_llm,
-     tools=[enhanced_rag_tool, cross_document_tool],
-     max_steps=5,
-     verbose=True
- )

- class IntelligentSourceRouter:
-     def __init__(self):
-         self.arxiv_tool = ArxivToolSpec().to_tool_list()[0]
-         self.duckduckgo_tool = DuckDuckGoSearchToolSpec().to_tool_list()[0]

-     def route_and_search(self, query: str) -> str:
-         """Simple routing between academic and general search - returns URLs only"""
-
-         # Quick intent detection
-         intent_prompt = f"""
-         Is this question about scientific research or general information?
-         Question: "{query}"
-
-         Answer "arxiv" for scientific/academic topics, "web" for everything else.
-         """
-
-         response = proj_llm.complete(intent_prompt)
-         source = "arxiv" if "arxiv" in response.text.lower() else "web"
-
-         try:
-             if source == "arxiv":
-                 # ArXiv results typically contain URLs in the response text
-                 arxiv_result = self.arxiv_tool.call(query=query)
-                 # Extract URLs from ArXiv response (you may need to parse this based on actual format)
-                 return str(arxiv_result)  # ArXiv tool should return URLs
-             else:
-                 result = self.duckduckgo_tool.call(query=query)
-                 if isinstance(result, list):
-                     # Extract only URLs from search results
-                     urls = [r.get('href', '') for r in result if r.get('href')]
-                     return "\n".join(urls)
-                 return str(result)
-         except Exception as e:
-             return f"Search failed: {str(e)}"
-
- # Simple research function
- def research_tool_function(query: str) -> str:
-     """Returns URLs for queries using intelligent source routing"""
-     router = IntelligentSourceRouter()
-     return router.route_and_search(query)
-
- # Clean tool definition
- research_tool = FunctionTool.from_defaults(
-     fn=research_tool_function,
-     name="research_tool",
-     description="""Intelligent URL finder that routes between academic (ArXiv) and general (web) search sources to return relevant URLs.
-
-     **When to Use:**
-     - Questions requiring external knowledge beyond training data
-     - Current or recent information (post-training cutoff)
-     - Scientific research requiring academic sources
-     - Factual verification of specific claims
-     - Any question where you need URLs to relevant sources
-
-     Simply provide your question and get URLs to visit for further reading."""
  )

  def execute_python_code(code: str) -> str:
@@ -539,97 +369,79 @@ name="Python Code Execution",
 description="Execute Python code safely for calculations and data processing"
 )

- # Code Agent as ReActAgent with explicit code generation
- code_agent = ReActAgent(
-     name="CodeAgent",
-     description="Advanced calculations, data processing using code generation and execution",
-     system_prompt="""
-     You are a coding specialist. For EVERY computational task:
-
-     1. THINK: Analyze what calculation/processing is needed
-     2. GENERATE CODE: Write Python code to solve the problem
-     3. EXECUTE: Use the Python Code Execution tool to run your code
-     4. OBSERVE: Check the results
-     5. REPEAT if needed
-
-     ALWAYS write code for:
-     - Mathematical calculations
-     - Data processing
-     - Numerical analysis
-     - Text processing
-     - Any computational task
-
-     Example workflow:
-     Question: "What is 15 * 23 + 7?"
-
-     Thought: I need to calculate 15 * 23 + 7
-     Action: Python Code Execution
-     Action Input: {"code": "result = 15 * 23 + 7\nprint(f'The answer is: {result}')"}
-
-     Store your final answer in a variable called 'result'.
-     """,
-     llm=proj_llm,
-     tools=[code_execution_tool],
-     max_steps=5,
-     verbose=True,
-     callback_manager=callback_manager,
- )
-
- def analysis_function(query: str, files=None):
-     ctx = Context(analysis_agent)
-     return analysis_agent.run(query, ctx=ctx)
-
- def code_function(query: str):
-     ctx = Context(code_agent)
-     return code_agent.run(query, ctx=ctx)

- analysis_tool = FunctionTool.from_defaults(
-     fn=analysis_function,
-     name="AnalysisAgent",
-     description="""Advanced multimodal document analysis specialist. Use this tool at least when you need to:
-
-     **Document Processing:**
-     - Analyze PDF, Word, CSV, or image files provided with the question
-     - Extract specific information from tables, charts, or structured documents
-     - Cross-reference information across multiple documents
-     - Perform semantic search within document collections
-
-     **Content Analysis:**
-     - Summarize long documents or extract key facts
-     - Find specific data points, numbers, or text within files
-     - Analyze visual content in images (charts, graphs, diagrams)
-     - Compare information between different document sources
-
-     **When to use:** Questions involving file attachments, document analysis, data extraction from PDFs/images, or when you need to process structured/unstructured content.
-
-     **Input format:** Provide the query and mention any relevant files or context."""
  )

- code_tool = FunctionTool.from_defaults(
-     fn=code_function,
-     name="CodeAgent",
-     description="""Advanced computational specialist using ReAct reasoning. Use this tool at least when you need:
-
-     **Core Capabilities:**
-     - **Autonomous Code Generation**: Writes Python code from scratch to solve computational problems
-     - **Multi-step Problem Solving**: Breaks complex tasks into manageable coding steps
-     - **Self-debugging**: Identifies and fixes errors through iterative refinement
-     - **Library Integration**: Leverages numpy, pandas, matplotlib, scipy, sklearn, and other scientific libraries
-     - **Result Verification**: Validates outputs and adjusts approach as needed
-
-     **When to Use:**
-     - Mathematical calculations requiring step-by-step computation
-     - Data analysis and statistical processing
-     - Algorithm implementation, optimization and execution
-     - Numerical simulations and modeling
-     - Text processing and pattern analysis
-     - Complex logical operations requiring code verification
-
-     **Unique Advantage**: Unlike simple calculation tools, this agent can autonomously write, execute, debug, and refine code until achieving the correct solution, making it ideal for complex computational tasks that require adaptive problem-solving.
-
-     **Input Format**: Describe the computational task clearly, including any data, constraints, or specific requirements."""
- )

  class EnhancedGAIAAgent:
     def __init__(self):

 Settings.embed_model = embed_model
 Settings.callback_manager = callback_manager

+ import os
+ from typing import List
+ from urllib.parse import urlparse
+
+ from llama_index.core.tools import FunctionTool
+ from llama_index.core import Document
+
+ # --- Import all required official LlamaIndex Readers ---
+ from llama_index.readers.file import (
+     PDFReader,
+     DocxReader,
+     CSVReader,
+     PandasExcelReader,
+     ImageReader,
+ )
+ from llama_index.readers.json import JSONReader
+ from llama_index.readers.web import TrafilaturaWebReader
+ from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
+ from llama_index.readers.audiotranscribe.openai import OpenAIAudioTranscriptReader
+
+ def read_and_parse_content(input_path: str) -> List[Document]:
+     """
+     Reads and parses content from a file path or URL into Document objects.
+     It automatically detects the input type and uses the appropriate LlamaIndex reader.
+
+     Args:
+         input_path: A local file path or a web URL.
+
+     Returns:
+         A list of LlamaIndex Document objects with the extracted text.
+     """
+     # --- Completed readers map for various local file types ---
+     readers_map = {
+         # Documents
+         '.pdf': PDFReader(),
+         '.docx': DocxReader(),
+         '.doc': DocxReader(),
+         # Data files
+         '.csv': CSVReader(),
+         '.json': JSONReader(),
+         '.xlsx': PandasExcelReader(),
+         # Media files
+         '.jpg': ImageReader(),
+         '.jpeg': ImageReader(),
+         '.png': ImageReader(),
+         '.mp3': OpenAIAudioTranscriptReader(),
+     }
+
+     # --- URL Handling ---
+     if input_path.startswith("http"):
+         if "youtube.com" in urlparse(input_path).netloc or "youtu.be" in urlparse(input_path).netloc:
+             loader = YoutubeTranscriptReader()
+             documents = loader.load_data(ytlinks=[input_path])
          else:
+             loader = TrafilaturaWebReader()
+             documents = loader.load_data(urls=[input_path])

+     # --- Local File Handling ---
+     else:
+         if not os.path.exists(input_path):
+             return [Document(text=f"Error: File not found at {input_path}")]
+
+         file_extension = os.path.splitext(input_path)[1].lower()
+
+         if file_extension in readers_map:
+             loader = readers_map[file_extension]
+             documents = loader.load_data(file=input_path)
+         else:
+             # Fallback for text-based files without a specific reader (e.g., .py, .txt, .md)
+             try:
+                 with open(input_path, 'r', encoding='utf-8') as f:
+                     content = f.read()
+                 documents = [Document(text=content, metadata={"source": input_path})]
+             except Exception as e:
+                 return [Document(text=f"Error reading file as plain text: {e}")]
+
+     # Add the source path to metadata for traceability
+     for doc in documents:
+         doc.metadata["source"] = input_path
+
+     return documents
+
+ # --- Create the final LlamaIndex Tool from the completed function ---
+ read_and_parse_tool = FunctionTool.from_defaults(
+     fn=read_and_parse_content,
+     name="read_and_parse_tool",
+     description=(
+         "Use this tool to read and extract content from any given file or URL. "
+         "It handles PDF, DOCX, CSV, JSON, XLSX, and image files, as well as web pages, "
+         "YouTube videos (transcripts), and MP3 audio (transcripts). It also reads plain text "
+         "from files like .py or .txt. The input MUST be a single valid file path or a URL."
+     )
  )

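A minimal sketch of how the new reader could be exercised on its own (the file name and URL below are placeholders, not part of the commit):

    docs = read_and_parse_content("example_report.pdf")               # dispatched to PDFReader
    page_docs = read_and_parse_content("https://example.com/post")    # dispatched to TrafilaturaWebReader
    for d in docs + page_docs:
        print(d.metadata.get("source"), len(d.text))
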
+ from typing import List
+ from llama_index.core import VectorStoreIndex, Document, Settings
+ from llama_index.core.tools import QueryEngineTool
+ from llama_index.core.node_parser import SentenceWindowNodeParser, HierarchicalNodeParser
+ from llama_index.core.postprocessor import SentenceTransformerRerank
+ from llama_index.core.query_engine import RetrieverQueryEngine

+ def create_rag_tool(documents: List[Document]) -> QueryEngineTool:
+     """
+     Creates a RAG query engine tool from a list of documents using advanced components.
+     Inspired by the 'create_advanced_index' and 'create_context_aware_query_engine' methods.
+
+     Args:
+         documents: A list of LlamaIndex Document objects from the read_and_parse_tool.
+
+     Returns:
+         A QueryEngineTool configured for the agent to use in the current task.
+     """
+     if not documents:
+         return None
+
+     # --- 1. Node Parsing (from the 'create_advanced_index' logic) ---
+     # Using the exact parsers and logic defined previously.
+     hierarchical_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[2048, 512, 128])
+     sentence_window_parser = SentenceWindowNodeParser.from_defaults(
+         window_size=3,
+         window_metadata_key="window",
+         original_text_metadata_key="original_text",
+     )
+
+     # Choose parser based on document count
+     if len(documents) > 5:  # Heuristic for using hierarchical parser
+         nodes = hierarchical_parser.get_nodes_from_documents(documents)
+     else:
+         nodes = sentence_window_parser.get_nodes_from_documents(documents)
+
+     # --- 2. Index Creation ---
+     # Assumes Settings.embed_model is configured globally as in the snippet above
+     index = VectorStoreIndex(nodes)
+
+     # --- 3. Query Engine Creation (from the 'create_context_aware_query_engine' logic) ---
+     # Using the same reranker as before
+     reranker = SentenceTransformerRerank(
+         model="cross-encoder/ms-marco-MiniLM-L-2-v2",
+         top_n=5
+     )
+
+     query_engine = index.as_query_engine(
+         similarity_top_k=10,
+         node_postprocessors=[reranker],
+         # Assumes Settings.llm is configured globally
+     )
+
+     # --- 4. Wrap the Query Engine in a Tool ---
+     rag_engine_tool = QueryEngineTool.from_defaults(
+         query_engine=query_engine,
+         name="rag_engine_tool",
+         description=(
+             "Use this tool to ask questions and query the content of documents that have already "
+             "been loaded. This is your primary way to find answers from the provided context. "
+             "The input is a natural language question about the documents' content."
+         )
+     )
+
+     return rag_engine_tool

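The two new helpers are meant to chain: the reader produces Documents and create_rag_tool wraps them for querying. A rough sketch, assuming QueryEngineTool exposes the wrapped engine via its query_engine attribute and that the placeholder file exists:

    docs = read_and_parse_content("example_report.pdf")   # placeholder input
    rag_tool = create_rag_tool(docs)
    if rag_tool is not None:
        answer = rag_tool.query_engine.query("What is the main conclusion of the report?")
        print(answer)
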
+ import re
+ from llama_index.core.tools import FunctionTool
+ from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
+
+ # 1. Create the base DuckDuckGo search tool from the official spec.
+ # This tool returns text summaries of search results, not just URLs.
+ base_duckduckgo_tool = DuckDuckGoSearchToolSpec().to_tool_list()[0]
+
+ # 2. Define a wrapper function to post-process the output.
+ def search_and_extract_top_url(query: str) -> str:
+     """
+     Takes a search query, uses the base DuckDuckGo search tool to get results,
+     and then parses the output to extract and return only the first URL.
+
+     Args:
+         query: The natural language search query.
+
+     Returns:
+         A string containing the first URL found, or an error message if none is found.
+     """
+     # Call the base tool to get the search results as text
+     search_results = base_duckduckgo_tool(query)
+
+     # Use a regular expression to find the first URL in the text output
+     # The \S+ pattern matches any sequence of non-whitespace characters
+     url_match = re.search(r"https?://\S+", str(search_results))
+
+     if url_match:
+         return url_match.group(0)
+     else:
+         return "No URL could be extracted from the search results."
+
+ # 3. Create the final, customized FunctionTool for the agent.
+ # This is the tool you will actually give to your agent.
+ extract_url_tool = FunctionTool.from_defaults(
+     fn=search_and_extract_top_url,
+     name="extract_url_tool",
+     description=(
+         "Use this tool ONLY when you need to find a relevant URL to answer a question but no "
+         "specific file, document, or URL has been provided. It takes a search query as input "
+         "and returns a single, relevant URL."
+     )
  )

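extract_url_tool is intended as the fallback when no source is supplied. A sketch of the expected flow, assuming the search backend returns at least one hit, feeds the extracted URL straight back into read_and_parse_content:

    url = search_and_extract_top_url("LlamaIndex SentenceWindowNodeParser documentation")
    if url.startswith("http"):
        web_docs = read_and_parse_content(url)
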
  def execute_python_code(code: str) -> str:
 
 description="Execute Python code safely for calculations and data processing"
 )

+ import re
+ from llama_index.core.tools import FunctionTool
+ from llama_index.llms.huggingface import HuggingFaceLLM

+ # --- 1. Initialize a dedicated LLM for Code Generation ---
+ # It's good practice to use a model specifically fine-tuned for coding.
+ # This model is loaded only once for efficiency.
+ try:
+     code_llm = HuggingFaceLLM(
+         model_name="Qwen/Qwen2.5-Coder-7B",
+         tokenizer_name="Qwen/Qwen2.5-Coder-7B",
+         device_map="auto",
+         model_kwargs={"torch_dtype": "auto"},
+         # Set generation parameters for precise, non-creative code output
+         generate_kwargs={"temperature": 0.0, "do_sample": False}
+     )
+ except Exception as e:
+     print(f"Error initializing code generation model: {e}")
+     print("Code generation tool will not be available.")
+     code_llm = None
+
+
+ def generate_python_code(query: str) -> str:
+     """
+     Generates executable Python code based on a natural language query.
+
+     Args:
+         query: A detailed description of the desired functionality for the Python code.
+
+     Returns:
+         A string containing only the generated Python code, ready for execution.
+     """
+     if not code_llm:
+         return "Error: Code generation model is not available."
+
+     # --- 2. Create a precise prompt for the code model ---
+     # This prompt explicitly asks for only code, no explanations.
+     prompt = f"""
+     Your task is to generate ONLY the Python code for the following request.
+     Do not include any explanations, introductory text, or markdown formatting like '```python'.
+     The output must be a single, clean block of Python code.
+
+     Request: "{query}"
+
+     Python Code:
+     """
+
+     # --- 3. Generate the response and post-process it ---
+     response = code_llm.complete(prompt)
+     raw_code = str(response)
+
+     # --- 4. Clean the output to ensure it's pure code ---
+     # Models often wrap code in markdown fences; this removes them.
+     code_match = re.search(r"```(?:python)?\n(.*)```", raw_code, re.DOTALL)
+     if code_match:
+         # Extract the code from within the markdown block
+         return code_match.group(1).strip()
+     else:
+         # If no markdown, assume the model followed instructions and return the text directly
+         return raw_code.strip()
+
+
+ # --- 5. Create the LlamaIndex Tool from the function ---
+ generate_code_tool = FunctionTool.from_defaults(
+     fn=generate_python_code,
+     name="generate_python_code_tool",
+     description=(
+         "Use this tool to generate executable Python code based on a natural language description of a task. "
+         "The input should be a clear and specific request for what the code should do (e.g., 'a function to "
+         "calculate the nth Fibonacci number'). The tool returns a string containing only the Python code."
+     )
  )

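A sketch of how the generator pairs with the existing execution tool, assuming execute_python_code (defined elsewhere in agent.py) accepts the generated string:

    code = generate_python_code("compute the 10th Fibonacci number and print it")
    result = execute_python_code(code)
    print(result)
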
 class EnhancedGAIAAgent:
     def __init__(self):