""" ArXiv Fetcher Component Fetches and processes research papers from ArXiv """ import re import time import requests from typing import List, Dict, Optional, Any from datetime import datetime, timedelta import arxiv class ArxivFetcher: """ Fetches research papers from ArXiv Provides search, download, and metadata extraction capabilities """ def __init__(self, config = None): # Import Config only when needed to avoid dependency issues if config is None: try: from .config import Config self.config = Config() except ImportError: # Fallback to None if Config cannot be imported self.config = None else: self.config = config self.client = arxiv.Client() def search_papers(self, query: str, max_results: int = 10, sort_by: str = "relevance", category: str = None, date_range: int = None) -> List[Dict[str, Any]]: """ Search for papers on ArXiv Args: query: Search query max_results: Maximum number of results sort_by: Sort criteria ('relevance', 'lastUpdatedDate', 'submittedDate') category: ArXiv category filter (e.g., 'cs.AI', 'cs.LG') date_range: Days back to search (e.g., 7, 30, 365) Returns: List of paper dictionaries """ try: print(f"Searching ArXiv for: '{query}'") # Build search query search_query = query if category: search_query = f"cat:{category} AND {query}" # Set sort criteria sort_criteria = { "relevance": arxiv.SortCriterion.Relevance, "lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate, "submittedDate": arxiv.SortCriterion.SubmittedDate }.get(sort_by, arxiv.SortCriterion.Relevance) # Create search search = arxiv.Search( query=search_query, max_results=max_results, sort_by=sort_criteria, sort_order=arxiv.SortOrder.Descending ) papers = [] for result in self.client.results(search): # Date filtering if date_range: cutoff_date = datetime.now() - timedelta(days=date_range) if result.published.replace(tzinfo=None) < cutoff_date: continue # Extract paper information paper = self._extract_paper_info(result) papers.append(paper) print(f"Found {len(papers)} papers") return papers except Exception as e: print(f"Error searching ArXiv: {e}") return [] def get_paper_by_id(self, arxiv_id: str) -> Optional[Dict[str, Any]]: """ Get a specific paper by ArXiv ID Args: arxiv_id: ArXiv paper ID (e.g., '2301.12345') Returns: Paper dictionary or None """ try: print(f"Fetching paper: {arxiv_id}") search = arxiv.Search(id_list=[arxiv_id]) results = list(self.client.results(search)) if results: paper = self._extract_paper_info(results[0]) print(f"Retrieved paper: {paper['title']}") return paper else: print(f"Paper not found: {arxiv_id}") return None except Exception as e: print(f"Error fetching paper {arxiv_id}: {e}") return None def search_by_author(self, author: str, max_results: int = 20) -> List[Dict[str, Any]]: """ Search for papers by author Args: author: Author name max_results: Maximum number of results Returns: List of paper dictionaries """ query = f"au:{author}" return self.search_papers(query, max_results=max_results, sort_by="lastUpdatedDate") def search_by_category(self, category: str, max_results: int = 20) -> List[Dict[str, Any]]: """ Search for papers by category Args: category: ArXiv category (e.g., 'cs.AI', 'cs.LG', 'stat.ML') max_results: Maximum number of results Returns: List of paper dictionaries """ query = f"cat:{category}" return self.search_papers(query, max_results=max_results, sort_by="lastUpdatedDate") def get_trending_papers(self, category: str = "cs.AI", days: int = 7, max_results: int = 10) -> List[Dict[str, Any]]: """ Get trending papers in a category Args: category: ArXiv category days: Days back to look for papers max_results: Maximum number of results Returns: List of paper dictionaries """ return self.search_by_category(category, max_results=max_results) def _extract_paper_info(self, result) -> Dict[str, Any]: """ Extract paper information from ArXiv result Args: result: ArXiv search result Returns: Paper dictionary """ try: # Extract ArXiv ID arxiv_id = result.entry_id.split('/')[-1] # Clean and format data paper = { 'arxiv_id': arxiv_id, 'title': result.title.strip(), 'authors': [author.name for author in result.authors], 'summary': result.summary.strip(), 'published': result.published.isoformat(), 'updated': result.updated.isoformat(), 'categories': result.categories, 'primary_category': result.primary_category, 'pdf_url': result.pdf_url, 'entry_id': result.entry_id, 'journal_ref': result.journal_ref, 'doi': result.doi, 'comment': result.comment, 'links': [{'title': link.title, 'href': link.href} for link in result.links], 'fetched_at': datetime.now().isoformat() } # Add formatted metadata paper['authors_str'] = ', '.join(paper['authors'][:3]) + ('...' if len(paper['authors']) > 3 else '') paper['categories_str'] = ', '.join(paper['categories'][:3]) + ('...' if len(paper['categories']) > 3 else '') paper['year'] = result.published.year paper['month'] = result.published.month return paper except Exception as e: print(f"Error extracting paper info: {e}") return { 'arxiv_id': 'unknown', 'title': 'Error extracting title', 'authors': [], 'summary': 'Error extracting summary', 'error': str(e) } def download_pdf(self, paper: Dict[str, Any], download_dir: str = "downloads") -> Optional[str]: """ Download PDF for a paper Args: paper: Paper dictionary download_dir: Directory to save PDF Returns: Path to downloaded PDF or None """ try: import os os.makedirs(download_dir, exist_ok=True) pdf_url = paper.get('pdf_url') if not pdf_url: print(f"No PDF URL for paper: {paper.get('title', 'Unknown')}") return None arxiv_id = paper.get('arxiv_id', 'unknown') filename = f"{arxiv_id}.pdf" filepath = os.path.join(download_dir, filename) if os.path.exists(filepath): print(f"PDF already exists: {filepath}") return filepath print(f"Downloading PDF: {paper.get('title', 'Unknown')}") response = requests.get(pdf_url, timeout=30) response.raise_for_status() with open(filepath, 'wb') as f: f.write(response.content) print(f"PDF downloaded: {filepath}") return filepath except Exception as e: print(f"Error downloading PDF: {e}") return None def get_paper_recommendations(self, paper_id: str, max_results: int = 5) -> List[Dict[str, Any]]: """ Get paper recommendations based on a paper's content Args: paper_id: ArXiv ID of the base paper max_results: Number of recommendations Returns: List of recommended papers """ try: # Get the base paper base_paper = self.get_paper_by_id(paper_id) if not base_paper: return [] # Extract key terms from title and summary title = base_paper.get('title', '') summary = base_paper.get('summary', '') categories = base_paper.get('categories', []) # Simple keyword extraction (can be improved with NLP) keywords = self._extract_keywords(title + ' ' + summary) # Search for related papers query = ' '.join(keywords[:5]) # Use top 5 keywords related_papers = self.search_papers( query=query, max_results=max_results + 5, # Get more to filter out the original sort_by="relevance" ) # Filter out the original paper recommendations = [p for p in related_papers if p.get('arxiv_id') != paper_id] return recommendations[:max_results] except Exception as e: print(f"Error getting recommendations: {e}") return [] def _extract_keywords(self, text: str) -> List[str]: """ Simple keyword extraction from text Args: text: Input text Returns: List of keywords """ # Simple implementation - can be improved with NLP libraries import re from collections import Counter # Remove common stop words stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'a', 'an', 'as', 'is', 'was', 'are', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'we', 'us', 'our', 'you', 'your', 'he', 'him', 'his', 'she', 'her', 'it', 'its', 'they', 'them', 'their'} # Extract words words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower()) # Filter and count filtered_words = [word for word in words if word not in stop_words] word_counts = Counter(filtered_words) # Return most common words return [word for word, count in word_counts.most_common(20)] def get_categories(self) -> Dict[str, str]: """ Get available ArXiv categories Returns: Dictionary of category codes and descriptions """ return { 'cs.AI': 'Artificial Intelligence', 'cs.LG': 'Machine Learning', 'cs.CV': 'Computer Vision', 'cs.CL': 'Computation and Language', 'cs.NE': 'Neural and Evolutionary Computing', 'cs.RO': 'Robotics', 'cs.CR': 'Cryptography and Security', 'cs.DC': 'Distributed, Parallel, and Cluster Computing', 'cs.DB': 'Databases', 'cs.DS': 'Data Structures and Algorithms', 'cs.HC': 'Human-Computer Interaction', 'cs.IR': 'Information Retrieval', 'cs.IT': 'Information Theory', 'cs.MM': 'Multimedia', 'cs.NI': 'Networking and Internet Architecture', 'cs.OS': 'Operating Systems', 'cs.PL': 'Programming Languages', 'cs.SE': 'Software Engineering', 'cs.SY': 'Systems and Control', 'stat.ML': 'Machine Learning (Statistics)', 'stat.AP': 'Applications (Statistics)', 'stat.CO': 'Computation (Statistics)', 'stat.ME': 'Methodology (Statistics)', 'stat.TH': 'Statistics Theory', 'math.ST': 'Statistics Theory (Mathematics)', 'math.PR': 'Probability (Mathematics)', 'math.OC': 'Optimization and Control', 'math.NA': 'Numerical Analysis', 'eess.AS': 'Audio and Speech Processing', 'eess.IV': 'Image and Video Processing', 'eess.SP': 'Signal Processing', 'eess.SY': 'Systems and Control', 'q-bio.QM': 'Quantitative Methods', 'q-bio.NC': 'Neurons and Cognition', 'physics.data-an': 'Data Analysis, Statistics and Probability' }