from typing import Any, Optional

from smolagents.tools import Tool
from langchain_community.document_loaders import ArxivLoader

# import logging
# # Configure the logger
# logger = logging.getLogger("smolagent")
# logger.setLevel(logging.INFO)
# if not logger.handlers:
#     # Create a file handler
#     file_handler = logging.FileHandler("agent_tools_logs.txt")
#     file_handler.setLevel(logging.INFO)
#     # Log format
#     formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#     file_handler.setFormatter(formatter)
#     # Attach the handler to the logger
#     logger.addHandler(file_handler)

class ArxivSearchTool(Tool):
    name = "arxiv_search"
    description = (
        "Search arXiv papers based on a query and return relevant papers with their abstracts. "
        "Useful for finding scientific papers, research articles, and academic content."
    )
    inputs = {'query': {'type': 'string', 'description': 'The search query to look up papers on arXiv.'}}
    output_type = "string"

    def __init__(self, load_max_docs=3, **kwargs):
        super().__init__()
        self.load_max_docs = load_max_docs
        try:
            import arxiv  # imported only to verify the dependency is available
        except ImportError as e:
            raise ImportError(
                "You must install package `arxiv` to run this tool: run `pip install arxiv`."
            ) from e
        self.is_initialized = True
    def forward(self, query: str) -> str:
        # logger.info(f"ArxivSearchTool invoked with query: {query}")
        try:
            # logger.info("Checking whether pymupdf and fitz are installed...")
            import pymupdf  # imported only to verify the dependency is available
            import fitz
            # Monkey patch to work around the `fitz.fitz` runtime error
            if not hasattr(fitz, 'fitz'):
                fitz.fitz = fitz
            # logger.info(f"fitz (PyMuPDF) version: {fitz.__doc__}")
            # logger.info(f"fitz module location: {fitz.__file__}")
        except ImportError as e:
            raise ImportError(
                "You must install package `pymupdf` to run this tool: run `pip install pymupdf`."
            ) from e
        try:
            # Use ArxivLoader from langchain_community to load papers
            # logger.info("Creating ArxivLoader object...")
            loader = ArxivLoader(
                query=query,
                load_max_docs=self.load_max_docs,
                load_all_available_meta=True
            )
            # Get the documents (papers)
            # logger.info("Invoking ArxivLoader.load()...")
            docs = loader.load()
            if not docs:
                return f"No arXiv papers found for the query: {query}"

            # Format the results nicely
            results = []
            # logger.info("Papers found, formatting results...")
            for doc in docs:
                # Extract metadata
                metadata = doc.metadata
                title = metadata.get('Title', 'Untitled')
                authors = metadata.get('Authors', 'Unknown Authors')
                published = metadata.get('Published', 'Unknown Date')
                # With load_all_available_meta=True, ArxivLoader exposes the paper URL as 'entry_id'
                paper_url = metadata.get('entry_id', '#')
                # Get the abstract (usually in the page_content)
                abstract = doc.page_content[:800] + "..." if len(doc.page_content) > 800 else doc.page_content
                # Format each paper
                paper = f"## {title}\n\n**Authors:** {authors}\n**Published:** {published}\n**URL:** {paper_url}\n\n**Abstract:**\n{abstract}\n\n---\n\n"
                results.append(paper)

            # logger.info("Formatting results ended... SUCCESS!")
            return "\n".join(results)
        except Exception as e:
            return f"Error searching arXiv: {str(e)}"