# HF_GAIA_AGENT / tools / document_processing.py
# Euclides H.
# Improve tools
# 16a27fa
from typing import Any, Optional, Dict
from smolagents.tools import Tool
import os
import tempfile
import requests
from urllib.parse import urlparse
import uuid
import pandas as pd
class DocumentProcessingTool(Tool):
    """Tool that saves text, downloads files and summarizes CSV/Excel files.

    ``forward`` dispatches on an ``action`` string and always returns a
    human-readable string; errors raised while performing an action are
    reported inside that string instead of propagating to the caller.
    """

    name = "document_processing"
    description = "Process various document types including saving files, downloading files, and analyzing CSV/Excel files."
    inputs = {
        'action': {
            'type': 'string',
            'description': 'The action to perform: "save", "download", "analyze_csv", "analyze_excel"',
        },
        'content': {
            'type': 'string',
            'description': 'The content to process: text content for save, URL for download, filepath for analysis',
        },
    }
    output_type = "string"

    def __init__(self):
        """Initialize the document processing tool.

        Raises:
            ImportError: if pandas cannot be imported.
        """
        super().__init__()
        try:
            # Only pandas is probed here; openpyxl is mentioned in the message
            # because pandas needs it to read .xlsx files.
            import pandas as pd  # noqa: F401
        except ImportError as e:
            raise ImportError("You must install required packages: pip install pandas openpyxl") from e
        self.is_initialized = True

    def _save_and_read_file(self, content: str) -> str:
        """Write ``content`` to a new text file in the system temp directory.

        Args:
            content (str): the text to save.

        Returns:
            str: a message containing the path of the file that was written.
        """
        temp_dir = tempfile.gettempdir()
        random_filename = f"file_{uuid.uuid4().hex[:8]}.txt"
        filepath = os.path.join(temp_dir, random_filename)
        # Explicit UTF-8 so non-ASCII content does not depend on the
        # platform's default encoding.
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)
        return f"File saved to {filepath}. You can read this file to process its contents."

    def _download_file_from_url(self, url: str) -> str:
        """Download ``url`` into a new file in the system temp directory.

        The file keeps the extension found in the URL path, or ``.tmp`` when
        the path has none.

        Args:
            url (str): the URL of the file to download.

        Returns:
            str: a message containing the downloaded file's path, or an
            error message if anything went wrong.
        """
        # Seconds to wait for the server; without a timeout requests can
        # block indefinitely on an unresponsive host.
        timeout_seconds = 30
        filepath: Optional[str] = None
        try:
            # Random filename that preserves the original extension if any.
            url_path = urlparse(url).path
            ext = os.path.splitext(url_path)[1] or '.tmp'
            random_filename = f"downloaded_{uuid.uuid4().hex[:8]}{ext}"
            filepath = os.path.join(tempfile.gettempdir(), random_filename)

            # The context manager releases the streamed connection even if
            # writing to disk fails midway.
            with requests.get(url, stream=True, timeout=timeout_seconds) as response:
                response.raise_for_status()
                with open(filepath, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            return f"File downloaded to {filepath}. You can read this file to process its contents."
        except Exception as e:
            # Best-effort removal of a partially written file so a failed
            # download does not leave a truncated artifact behind.
            if filepath is not None and os.path.exists(filepath):
                try:
                    os.remove(filepath)
                except OSError:
                    pass
            return f"Error downloading file: {str(e)}"

    def _summarize_dataframe(self, df: pd.DataFrame, label: str) -> str:
        """Build the textual report shared by the CSV and Excel analyses.

        Args:
            df (pd.DataFrame): the loaded table.
            label (str): file kind used in the first line ("CSV" or "Excel").

        Returns:
            str: row/column counts, column names, ``describe()`` statistics
            and the first 50 rows.
        """
        # Column labels are not necessarily strings (integer or tuple
        # headers), so convert each one before joining.
        column_names = ', '.join(str(col) for col in df.columns)
        # describe() raises on a frame that has no columns at all.
        if len(df.columns):
            statistics = str(df.describe())
        else:
            statistics = "(no columns to summarize)"
        return (
            f"{label} file loaded with {len(df)} rows and {len(df.columns)} columns.\n"
            f"Columns: {column_names}\n\n"
            "Summary statistics:\n"
            + statistics
            + "\n\nTop 50 rows content:\n"
            + str(df.head(50))
        )

    def _analyze_csv_file(self, file_path: str) -> str:
        """Load a CSV file with pandas and return a textual summary.

        Args:
            file_path (str): the path to the CSV file.

        Returns:
            str: the summary report, or an error message on failure.
        """
        try:
            df = pd.read_csv(file_path)
            return self._summarize_dataframe(df, "CSV")
        except Exception as e:
            return f"Error analyzing CSV file: {str(e)}"

    def _analyze_excel_file(self, file_path: str) -> str:
        """Load an Excel file with pandas and return a textual summary.

        ``pd.read_excel`` is called with its defaults, so only the sheet
        pandas selects by default is analyzed.

        Args:
            file_path (str): the path to the Excel file.

        Returns:
            str: the summary report, or an error message on failure.
        """
        try:
            df = pd.read_excel(file_path)
            return self._summarize_dataframe(df, "Excel")
        except Exception as e:
            return f"Error analyzing Excel file: {str(e)}"

    def forward(self, action: str, content: str) -> str:
        """Process documents based on the specified action.

        The action name is matched case-insensitively and ignoring
        surrounding whitespace. When it is not recognized, the tool falls
        back to guessing from ``content``: anything mentioning ``.xls`` is
        analyzed as Excel, anything mentioning ``csv`` as CSV.

        Args:
            action (str): The action to perform (save, download, analyze_csv, analyze_excel)
            content (str): The content to process

        Returns:
            str: Result of the operation, or an error message.
        """
        try:
            handlers = {
                "save": self._save_and_read_file,
                "download": self._download_file_from_url,
                "analyze_csv": self._analyze_csv_file,
                "analyze_excel": self._analyze_excel_file,
            }
            handler = handlers.get(str(action).strip().lower())
            if handler is not None:
                return handler(content)

            # Unknown action: infer the intent from the content itself.
            # ".xls" also covers ".xlsx", so one check is enough.
            lowered = content.lower()
            if ".xls" in lowered:
                return self._analyze_excel_file(content)
            if "csv" in lowered:
                return self._analyze_csv_file(content)
            return f"Error: Invalid action '{action}'. Valid actions are: save, download, analyze_csv, analyze_excel"
        except Exception as e:
            return f"Error performing {action} operation: {str(e)}"