import json

from langchain_community.document_loaders import ArxivLoader, WikipediaLoader
from markitdown import MarkItDown
from smolagents import (
    tool,
)

# Shared converter used by every file/URL-to-markdown tool below.
md = MarkItDown(enable_plugins=True)  # Set to True to enable plugins

# Separator placed between individual search results in tool output.
_RESULT_SEPARATOR = "\n\n---\n\n"


def _convert_to_markdown(source: str) -> str:
    """Run the shared MarkItDown converter on *source* and return its text.

    *source* is whatever the calling tool received: a local file path or a URL.
    """
    return md.convert(source).text_content


@tool
def arvix_search(query: str) -> str:
    """Search Arxiv for a query and return at most 3 results.

    Args:
        query: The search query.

    Returns:
        The first 1000 characters of each result, joined by "---" separators.
    """
    # NOTE: the misspelled name ("arvix") is kept on purpose; the function
    # name is the tool name, so renaming it would break existing callers.
    search_docs = ArxivLoader(query=query, load_max_docs=3).load()
    return _RESULT_SEPARATOR.join(
        f'\n{doc.page_content[:1000]}\n' for doc in search_docs
    )


@tool
def read_excel_content_to_markdown_content(file_location: str) -> str:
    """Read the content of an Excel file and convert it to markdown content.

    Args:
        file_location: The path to the Excel file.

    Returns:
        The converted markdown text.
    """
    return _convert_to_markdown(file_location)


@tool
def read_pdf_content_to_markdown(file_location: str) -> str:
    """Read the content of a PDF file and convert it to markdown.

    Args:
        file_location: The path to the PDF file.

    Returns:
        The converted markdown text.
    """
    return _convert_to_markdown(file_location)


@tool
def get_audio_transcription(file_path: str) -> str:
    """Get the transcription of the audio file using the file path.

    Args:
        file_path: The path of the audio file.

    Returns:
        The text produced by the converter for the audio file.
    """
    # NOTE(review): transcription is delegated entirely to MarkItDown's
    # handling of the file; confirm the required audio plugin/extra is installed.
    return _convert_to_markdown(file_path)


@tool
def get_python_file_content(file_name: str) -> str:
    """Get the content of a mentioned Python file.

    Args:
        file_name: The name of the file.

    Returns:
        The full text of the file.
    """
    # Python source is UTF-8 by default, so decode explicitly instead of
    # relying on the platform's locale encoding.
    with open(file_name, "r", encoding="utf-8") as f:
        return f.read()


@tool
def visit_webpage_to_markdown(url: str) -> str:
    """Visit a web page and return its content in markdown format.

    Args:
        url: The URL of the web page.

    Returns:
        The page content converted to markdown text.
    """
    return _convert_to_markdown(url)


@tool
def extract_markdown_tables_from_markdown_content(markdown_content: str) -> str:
    """Extract and return the markdown tables from a given markdown content string in a structured json format.

    Args:
        markdown_content: The markdown string containing the table.

    Returns:
        A JSON string with the tables identified in the content.
    """
    # Kept as a function-local import, as in the original, so the dependency
    # is only needed when this tool actually runs.
    from mrkdwn_analysis import MarkdownAnalyzer

    analyzer = MarkdownAnalyzer.from_string(markdown_content)
    analyzer.analyse()
    return json.dumps(analyzer.identify_tables())


@tool
def wiki_search(query: str) -> str:
    """Search Wikipedia for a query and return at most 2 results.

    Args:
        query: The search query.

    Returns:
        The page content of each result, joined by "---" separators.
    """
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
    return _RESULT_SEPARATOR.join(
        f'\n{doc.page_content}\n' for doc in search_docs
    )