""" | |
Unified Research Paper Fetcher | |
Fetches papers from multiple sources: ArXiv, Semantic Scholar, Crossref, and PubMed | |
Replaces all previous fetcher components for maximum minimalism | |
""" | |
import re | |
import time | |
import requests | |
import xml.etree.ElementTree as ET | |
from typing import List, Dict, Optional, Any, Union | |
from datetime import datetime, timedelta | |
import arxiv | |
import json | |
from collections import Counter | |


class UnifiedPaperFetcher:
    """
    Unified fetcher for research papers from multiple academic databases.
    Supports: ArXiv, Semantic Scholar, Crossref, PubMed.
    """

    def __init__(self, config=None):
        # Import Config only when needed to avoid dependency issues
        if config is None:
            try:
                from .config import Config
                self.config = Config()
            except ImportError:
                self.config = None
        else:
            self.config = config

        # Initialize clients
        self.arxiv_client = arxiv.Client()

        # API endpoints
        self.semantic_scholar_base = "https://api.semanticscholar.org/graph/v1"
        self.crossref_base = "https://api.crossref.org/works"
        self.pubmed_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

        # Rate limiting
        self.last_request_time = {}
        self.min_request_interval = {
            'semantic_scholar': 5.0,  # 5 seconds between requests
            'crossref': 0.1,          # 100 ms between requests (polite)
            'pubmed': 0.34,           # ~3 requests per second
            'arxiv': 3.0              # 3 seconds between requests
        }

    def search_papers(self,
                      query: str,
                      max_results: int = 10,
                      sources: Optional[List[str]] = None,
                      sort_by: str = "relevance") -> List[Dict[str, Any]]:
        """
        Search for papers across multiple sources.

        Args:
            query: Search query
            max_results: Maximum number of results to return overall (split across sources)
            sources: List of sources ['arxiv', 'semantic_scholar', 'crossref', 'pubmed']
            sort_by: Sort criteria ('relevance' or 'date')

        Returns:
            List of paper dictionaries in a unified format
        """
        if sources is None:
            sources = ['arxiv', 'semantic_scholar', 'crossref', 'pubmed']

        all_papers = []
        results_per_source = max(1, max_results // len(sources))
        print(f"Searching for: '{query}' across sources: {sources}")

        for source in sources:
            try:
                print(f"Searching {source}...")
                if source == 'arxiv':
                    papers = self._search_arxiv(query, results_per_source)
                elif source == 'semantic_scholar':
                    papers = self._search_semantic_scholar(query, results_per_source)
                elif source == 'crossref':
                    papers = self._search_crossref(query, results_per_source)
                elif source == 'pubmed':
                    papers = self._search_pubmed(query, results_per_source)
                else:
                    print(f"Unknown source: {source}")
                    continue
                print(f"Found {len(papers)} papers from {source}")
                all_papers.extend(papers)
            except Exception as e:
                print(f"Error searching {source}: {e}")
                continue

        # Remove duplicates
        unique_papers = self._deduplicate_papers(all_papers)

        # Sort by date if requested (default ordering is per-source relevance)
        if sort_by == "date":
            unique_papers.sort(key=lambda x: x.get('published_date', ''), reverse=True)

        print(f"Total unique papers found: {len(unique_papers)}")
        return unique_papers[:max_results]
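
    # Illustrative example (not part of the original module): a merged,
    # de-duplicated multi-source search might be called like
    #   UnifiedPaperFetcher().search_papers("graph neural networks",
    #                                       max_results=6,
    #                                       sources=["arxiv", "crossref"])
    # The query and source list above are placeholder choices.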

    def _search_arxiv(self, query: str, max_results: int) -> List[Dict[str, Any]]:
        """Search ArXiv"""
        self._rate_limit('arxiv')
        try:
            search = arxiv.Search(
                query=query,
                max_results=max_results,
                sort_by=arxiv.SortCriterion.Relevance,
                sort_order=arxiv.SortOrder.Descending
            )
            papers = []
            for result in self.arxiv_client.results(search):
                paper = {
                    'title': result.title,
                    'authors': [author.name for author in result.authors],
                    'abstract': result.summary,
                    'published_date': result.published.strftime('%Y-%m-%d'),
                    'year': result.published.year,
                    'url': result.entry_id,
                    'pdf_url': result.pdf_url,
                    'source': 'ArXiv',
                    'arxiv_id': result.entry_id.split('/')[-1],
                    'categories': list(result.categories),
                    'doi': result.doi
                }
                papers.append(paper)
            return papers
        except Exception as e:
            print(f"ArXiv search error: {e}")
            return []

    def _search_semantic_scholar(self, query: str, max_results: int) -> List[Dict[str, Any]]:
        """Search Semantic Scholar"""
        self._rate_limit('semantic_scholar')
        try:
            url = f"{self.semantic_scholar_base}/paper/search"
            params = {
                'query': query,
                'limit': min(max_results, 100),
                'fields': 'title,authors,abstract,year,url,venue,citationCount,referenceCount,publicationDate,externalIds'
            }

            # Retry logic for rate limiting
            max_retries = 3
            data = None
            for attempt in range(max_retries):
                data = self.safe_get(url, params)
                if data and 'data' in data:
                    break
                elif attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 5
                    print(f"Semantic Scholar rate limited, waiting {wait_time} seconds...")
                    time.sleep(wait_time)  # Linear backoff between retries
                else:
                    print("Semantic Scholar API unavailable after retries")
                    return []

            if not data or 'data' not in data:
                return []

            papers = []
            for paper_data in data.get('data', []):
                # Handle authors
                authors = []
                if paper_data.get('authors'):
                    authors = [author.get('name', 'Unknown') for author in paper_data['authors']]

                # Handle external IDs (may be missing or null)
                external_ids = paper_data.get('externalIds') or {}
                doi = external_ids.get('DOI')
                arxiv_id = external_ids.get('ArXiv')

                paper = {
                    'title': paper_data.get('title', 'No title'),
                    'authors': authors,
                    'abstract': paper_data.get('abstract', ''),
                    'published_date': paper_data.get('publicationDate', ''),
                    'year': paper_data.get('year'),
                    'url': paper_data.get('url', ''),
                    'source': 'Semantic Scholar',
                    'venue': paper_data.get('venue', ''),
                    'citation_count': paper_data.get('citationCount', 0),
                    'reference_count': paper_data.get('referenceCount', 0),
                    'doi': doi,
                    'arxiv_id': arxiv_id
                }
                papers.append(paper)
            return papers
        except Exception as e:
            print(f"Semantic Scholar search error: {e}")
            return []

    def _search_crossref(self, query: str, max_results: int) -> List[Dict[str, Any]]:
        """Search Crossref"""
        self._rate_limit('crossref')
        try:
            url = self.crossref_base
            params = {
                'query': query,
                'rows': min(max_results, 20),
                'sort': 'relevance',
                'select': 'title,author,abstract,published-print,published-online,URL,DOI,container-title,type'
            }
            headers = {
                'User-Agent': 'ResearchMate/2.0 (mailto:research@example.com)'
            }
            response = requests.get(url, params=params, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()

            papers = []
            for item in data.get('message', {}).get('items', []):
                # Handle authors
                authors = []
                if item.get('author'):
                    for author in item['author']:
                        given = author.get('given', '')
                        family = author.get('family', '')
                        name = f"{given} {family}".strip()
                        if name:
                            authors.append(name)

                # Handle publication date
                published_date = ''
                year = None
                if item.get('published-print'):
                    date_parts = item['published-print'].get('date-parts', [[]])[0]
                    if date_parts:
                        year = date_parts[0]
                        if len(date_parts) >= 3:
                            published_date = f"{date_parts[0]:04d}-{date_parts[1]:02d}-{date_parts[2]:02d}"
                        elif len(date_parts) >= 2:
                            published_date = f"{date_parts[0]:04d}-{date_parts[1]:02d}-01"
                        else:
                            published_date = f"{date_parts[0]:04d}-01-01"

                paper = {
                    'title': item.get('title', ['No title'])[0] if item.get('title') else 'No title',
                    'authors': authors,
                    'abstract': item.get('abstract', ''),
                    'published_date': published_date,
                    'year': year,
                    'url': item.get('URL', ''),
                    'source': 'Crossref',
                    'doi': item.get('DOI', ''),
                    'journal': item.get('container-title', [''])[0] if item.get('container-title') else '',
                    'type': item.get('type', '')
                }
                papers.append(paper)
            return papers
        except Exception as e:
            print(f"Crossref search error: {e}")
            return []

    def _search_pubmed(self, query: str, max_results: int) -> List[Dict[str, Any]]:
        """Search PubMed"""
        self._rate_limit('pubmed')
        try:
            # Step 1: Search for PMIDs
            search_url = f"{self.pubmed_base}/esearch.fcgi"
            search_params = {
                'db': 'pubmed',
                'term': query,
                'retmax': min(max_results, 20),
                'retmode': 'json',
                'sort': 'relevance'
            }
            response = requests.get(search_url, params=search_params, timeout=30)
            response.raise_for_status()
            search_data = response.json()
            pmids = search_data.get('esearchresult', {}).get('idlist', [])
            if not pmids:
                return []

            # Step 2: Fetch details for the PMIDs
            self._rate_limit('pubmed')
            fetch_url = f"{self.pubmed_base}/efetch.fcgi"
            fetch_params = {
                'db': 'pubmed',
                'id': ','.join(pmids),
                'retmode': 'xml'
            }
            response = requests.get(fetch_url, params=fetch_params, timeout=30)
            response.raise_for_status()

            # Parse XML
            root = ET.fromstring(response.content)
            papers = []
            for article in root.findall('.//PubmedArticle'):
                try:
                    # Extract basic info
                    medline = article.find('.//MedlineCitation')
                    if medline is None:
                        continue
                    article_elem = medline.find('.//Article')
                    if article_elem is None:
                        continue

                    # Title
                    title_elem = article_elem.find('.//ArticleTitle')
                    title = title_elem.text if title_elem is not None else 'No title'

                    # Authors
                    authors = []
                    author_list = article_elem.find('.//AuthorList')
                    if author_list is not None:
                        for author in author_list.findall('.//Author'):
                            last_name = author.find('.//LastName')
                            first_name = author.find('.//ForeName')
                            if last_name is not None and first_name is not None:
                                authors.append(f"{first_name.text} {last_name.text}")
                            elif last_name is not None:
                                authors.append(last_name.text)

                    # Abstract
                    abstract = ''
                    abstract_elem = article_elem.find('.//AbstractText')
                    if abstract_elem is not None:
                        abstract = abstract_elem.text or ''

                    # Publication date
                    pub_date = article_elem.find('.//PubDate')
                    published_date = ''
                    year = None
                    if pub_date is not None:
                        year_elem = pub_date.find('.//Year')
                        month_elem = pub_date.find('.//Month')
                        day_elem = pub_date.find('.//Day')
                        if year_elem is not None:
                            year = int(year_elem.text)
                            month = month_elem.text if month_elem is not None else '01'
                            day = day_elem.text if day_elem is not None else '01'
                            # Convert month name to number if needed
                            month_map = {
                                'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
                                'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
                                'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
                            }
                            if month in month_map:
                                month = month_map[month]
                            elif not month.isdigit():
                                month = '01'
                            published_date = f"{year}-{month.zfill(2)}-{day.zfill(2)}"

                    # PMID
                    pmid_elem = medline.find('.//PMID')
                    pmid = pmid_elem.text if pmid_elem is not None else ''

                    # Journal
                    journal_elem = article_elem.find('.//Journal/Title')
                    journal = journal_elem.text if journal_elem is not None else ''

                    # DOI
                    doi = ''
                    article_ids = article.findall('.//ArticleId')
                    for article_id in article_ids:
                        if article_id.get('IdType') == 'doi':
                            doi = article_id.text
                            break

                    paper = {
                        'title': title,
                        'authors': authors,
                        'abstract': abstract,
                        'published_date': published_date,
                        'year': year,
                        'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                        'source': 'PubMed',
                        'pmid': pmid,
                        'journal': journal,
                        'doi': doi
                    }
                    papers.append(paper)
                except Exception as e:
                    print(f"Error parsing PubMed article: {e}")
                    continue
            return papers
        except Exception as e:
            print(f"PubMed search error: {e}")
            return []

    def _rate_limit(self, source: str):
        """Implement rate limiting for API calls"""
        now = time.time()
        last_request = self.last_request_time.get(source, 0)
        interval = self.min_request_interval.get(source, 1.0)
        time_since_last = now - last_request
        if time_since_last < interval:
            sleep_time = interval - time_since_last
            time.sleep(sleep_time)
        self.last_request_time[source] = time.time()

    def safe_get(self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None, timeout: int = 30) -> Optional[Dict[str, Any]]:
        """Safe HTTP GET with error handling"""
        try:
            response = requests.get(url, params=params, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"HTTP request failed: {e}")
            return None
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            return None

    def _deduplicate_papers(self, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Remove duplicate papers based on DOI, ArXiv ID, PMID, or title"""
        seen = set()
        unique_papers = []
        for paper in papers:
            # Build identifiers from whichever fields are available
            identifiers = []

            # Use DOI if available
            doi = str(paper.get('doi') or '').strip()
            if doi:
                identifiers.append(f"doi:{doi.lower()}")

            # Use ArXiv ID if available
            arxiv_id = str(paper.get('arxiv_id') or '').strip()
            if arxiv_id:
                identifiers.append(f"arxiv:{arxiv_id.lower()}")

            # Use PMID if available
            pmid = str(paper.get('pmid') or '').strip()
            if pmid:
                identifiers.append(f"pmid:{pmid}")

            # Use title as a fallback
            title = str(paper.get('title') or '').strip().lower()
            if title and title != 'no title':
                # Normalize the title for comparison
                clean_title = re.sub(r'[^\w\s]', '', title)
                clean_title = ' '.join(clean_title.split())
                identifiers.append(f"title:{clean_title}")

            # Skip the paper if any identifier has been seen before
            if any(identifier in seen for identifier in identifiers):
                continue
            seen.update(identifiers)
            unique_papers.append(paper)
        return unique_papers

    def get_paper_by_doi(self, doi: str) -> Optional[Dict[str, Any]]:
        """Get paper details by DOI from Crossref"""
        try:
            url = f"{self.crossref_base}/{doi}"
            headers = {
                'User-Agent': 'ResearchMate/2.0 (mailto:research@example.com)'
            }
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()
            item = data.get('message', {})
            if not item:
                return None

            # Parse the item (similar to _search_crossref)
            authors = []
            if item.get('author'):
                for author in item['author']:
                    given = author.get('given', '')
                    family = author.get('family', '')
                    name = f"{given} {family}".strip()
                    if name:
                        authors.append(name)

            # Handle publication date
            published_date = ''
            year = None
            if item.get('published-print'):
                date_parts = item['published-print'].get('date-parts', [[]])[0]
                if date_parts:
                    year = date_parts[0]
                    if len(date_parts) >= 3:
                        published_date = f"{date_parts[0]:04d}-{date_parts[1]:02d}-{date_parts[2]:02d}"

            paper = {
                'title': item.get('title', ['No title'])[0] if item.get('title') else 'No title',
                'authors': authors,
                'abstract': item.get('abstract', ''),
                'published_date': published_date,
                'year': year,
                'url': item.get('URL', ''),
                'source': 'Crossref',
                'doi': item.get('DOI', ''),
                'journal': item.get('container-title', [''])[0] if item.get('container-title') else ''
            }
            return paper
        except Exception as e:
            print(f"Error fetching DOI {doi}: {e}")
            return None
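
    # Illustrative DOI lookup (the DOI below is only a placeholder, not a real record):
    #   UnifiedPaperFetcher().get_paper_by_doi("10.1000/xyz123")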


class PaperFetcher(UnifiedPaperFetcher):
    """
    Consolidated paper fetcher combining all sources.
    This is the single fetcher class that replaces all previous fetcher components.
    """

    def __init__(self, config=None):
        super().__init__(config)

    def search_papers(self,
                      query: str,
                      max_results: int = 10,
                      sources: Optional[List[str]] = None,
                      sort_by: str = "relevance",
                      category: Optional[str] = None,
                      date_range: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Enhanced search with additional parameters from the original ArxivFetcher.

        Args:
            query: Search query
            max_results: Maximum number of results
            sources: List of sources ['arxiv', 'semantic_scholar', 'crossref', 'pubmed']
            sort_by: Sort criteria ('relevance', 'date', 'lastUpdatedDate', 'submittedDate')
            category: ArXiv category filter (e.g., 'cs.AI', 'cs.LG')
            date_range: Days back to search (e.g., 7, 30, 365)

        Returns:
            List of paper dictionaries in a unified format
        """
        # Use all sources by default
        if sources is None:
            sources = ['arxiv', 'semantic_scholar', 'crossref', 'pubmed']

        # Apply the category filter to the ArXiv query if specified
        if category and 'arxiv' in sources:
            # Avoid a dangling "AND" when the query string is empty
            enhanced_query = f"cat:{category} AND {query}" if query else f"cat:{category}"
            return self._search_with_enhanced_query(enhanced_query, max_results, sources, sort_by, date_range)

        return super().search_papers(query, max_results, sources, sort_by)
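
    # Illustrative example (query, category, and window below are placeholder choices):
    #   PaperFetcher().search_papers("diffusion models", max_results=10,
    #                                sources=["arxiv"], category="cs.LG",
    #                                date_range=30)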

    def _search_with_enhanced_query(self, query: str, max_results: int, sources: List[str], sort_by: str, date_range: int) -> List[Dict[str, Any]]:
        """Internal method for enhanced search with date filtering"""
        papers = super().search_papers(query, max_results, sources, sort_by)

        # Apply date filtering if specified
        if date_range:
            cutoff_date = datetime.now() - timedelta(days=date_range)
            filtered_papers = []
            for paper in papers:
                pub_date_str = paper.get('published_date', '')
                if pub_date_str:
                    try:
                        pub_date = datetime.strptime(pub_date_str, '%Y-%m-%d')
                        if pub_date >= cutoff_date:
                            filtered_papers.append(paper)
                    except ValueError:
                        # If date parsing fails, include the paper
                        filtered_papers.append(paper)
                else:
                    # If no date is available, include the paper
                    filtered_papers.append(paper)
            return filtered_papers
        return papers

    def get_paper_by_id(self, paper_id: str) -> Optional[Dict[str, Any]]:
        """
        Get a specific paper by ID (supports ArXiv ID, DOI, PMID).

        Args:
            paper_id: Paper ID (ArXiv ID, DOI, or PMID)

        Returns:
            Paper dictionary or None
        """
        # Check if it's a (new-style) ArXiv ID
        if re.match(r'^\d{4}\.\d{4,5}(v\d+)?$', paper_id):
            return self._get_arxiv_paper_by_id(paper_id)

        # Check if it's a DOI
        if '/' in paper_id and ('10.' in paper_id or paper_id.startswith('doi:')):
            doi = paper_id.replace('doi:', '')
            return self.get_paper_by_doi(doi)

        # Check if it's a PMID
        if paper_id.isdigit():
            return self._get_pubmed_paper_by_id(paper_id)

        # Fallback: search for it
        results = self.search_papers(paper_id, max_results=1)
        return results[0] if results else None
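
    # Accepted identifier shapes (illustrative placeholder values, not real records):
    #   "2101.00001"      -> treated as an ArXiv ID
    #   "10.1000/xyz123"  -> treated as a DOI
    #   "12345678"        -> treated as a PMID
    #   anything else     -> falls back to a regular search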

    def _get_arxiv_paper_by_id(self, arxiv_id: str) -> Optional[Dict[str, Any]]:
        """Get paper by ArXiv ID"""
        try:
            search = arxiv.Search(id_list=[arxiv_id])
            results = list(self.arxiv_client.results(search))
            if results:
                result = results[0]
                return {
                    'title': result.title,
                    'authors': [author.name for author in result.authors],
                    'abstract': result.summary,
                    'published_date': result.published.strftime('%Y-%m-%d'),
                    'year': result.published.year,
                    'url': result.entry_id,
                    'pdf_url': result.pdf_url,
                    'source': 'ArXiv',
                    'arxiv_id': result.entry_id.split('/')[-1],
                    'categories': list(result.categories),
                    'doi': result.doi
                }
            return None
        except Exception as e:
            print(f"Error fetching ArXiv paper {arxiv_id}: {e}")
            return None

    def _get_pubmed_paper_by_id(self, pmid: str) -> Optional[Dict[str, Any]]:
        """Get paper by PubMed ID"""
        try:
            fetch_url = f"{self.pubmed_base}/efetch.fcgi"
            fetch_params = {
                'db': 'pubmed',
                'id': pmid,
                'retmode': 'xml'
            }
            response = requests.get(fetch_url, params=fetch_params, timeout=30)
            response.raise_for_status()
            root = ET.fromstring(response.content)
            article = root.find('.//PubmedArticle')
            if article is not None:
                # Parse similarly to _search_pubmed
                medline = article.find('.//MedlineCitation')
                article_elem = medline.find('.//Article')

                title_elem = article_elem.find('.//ArticleTitle')
                title = title_elem.text if title_elem is not None else 'No title'

                authors = []
                author_list = article_elem.find('.//AuthorList')
                if author_list is not None:
                    for author in author_list.findall('.//Author'):
                        last_name = author.find('.//LastName')
                        first_name = author.find('.//ForeName')
                        if last_name is not None and first_name is not None:
                            authors.append(f"{first_name.text} {last_name.text}")

                abstract = ''
                abstract_elem = article_elem.find('.//AbstractText')
                if abstract_elem is not None:
                    abstract = abstract_elem.text or ''

                return {
                    'title': title,
                    'authors': authors,
                    'abstract': abstract,
                    'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                    'source': 'PubMed',
                    'pmid': pmid
                }
            return None
        except Exception as e:
            print(f"Error fetching PubMed paper {pmid}: {e}")
            return None

    def search_by_author(self, author: str, max_results: int = 20) -> List[Dict[str, Any]]:
        """
        Search for papers by author across all sources.

        Args:
            author: Author name
            max_results: Maximum number of results

        Returns:
            List of paper dictionaries
        """
        return self.search_papers(f"author:{author}", max_results=max_results, sort_by="date")

    def search_by_category(self, category: str, max_results: int = 20) -> List[Dict[str, Any]]:
        """
        Search for papers by category (primarily ArXiv).

        Args:
            category: Category (e.g., 'cs.AI', 'cs.LG', 'stat.ML')
            max_results: Maximum number of results

        Returns:
            List of paper dictionaries
        """
        return self.search_papers("", max_results=max_results, category=category, sort_by="date")

    def get_trending_papers(self, category: str = "cs.AI", days: int = 7, max_results: int = 10) -> List[Dict[str, Any]]:
        """
        Get trending papers in a category.

        Args:
            category: Category to search
            days: Days back to look for papers
            max_results: Maximum number of results

        Returns:
            List of paper dictionaries
        """
        return self.search_papers(
            query="recent",
            max_results=max_results,
            category=category,
            date_range=days,
            sort_by="date"
        )

    def download_pdf(self, paper: Dict[str, Any], download_dir: str = "downloads") -> Optional[str]:
        """
        Download the PDF for a paper.

        Args:
            paper: Paper dictionary
            download_dir: Directory to save the PDF in

        Returns:
            Path to the downloaded PDF or None
        """
        try:
            os.makedirs(download_dir, exist_ok=True)
            pdf_url = paper.get('pdf_url')
            if not pdf_url:
                print(f"No PDF URL for paper: {paper.get('title', 'Unknown')}")
                return None

            # Generate a filename from the first available identifier
            paper_id = paper.get('arxiv_id') or paper.get('pmid') or paper.get('doi') or 'unknown'
            filename = f"{str(paper_id).replace('/', '_')}.pdf"
            filepath = os.path.join(download_dir, filename)
            if os.path.exists(filepath):
                print(f"PDF already exists: {filepath}")
                return filepath

            print(f"Downloading PDF: {paper.get('title', 'Unknown')}")
            response = requests.get(pdf_url, timeout=30)
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f"PDF downloaded: {filepath}")
            return filepath
        except Exception as e:
            print(f"Error downloading PDF: {e}")
            return None

    def get_paper_recommendations(self, paper_id: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """
        Get paper recommendations based on a paper's content.

        Args:
            paper_id: Paper ID
            max_results: Number of recommendations

        Returns:
            List of recommended papers
        """
        try:
            # Get the base paper
            base_paper = self.get_paper_by_id(paper_id)
            if not base_paper:
                return []

            # Extract key terms from the title and abstract
            title = base_paper.get('title', '')
            abstract = base_paper.get('abstract', '')

            # Simple keyword extraction
            keywords = self._extract_keywords(title + ' ' + abstract)

            # Search for related papers
            query = ' '.join(keywords[:5])  # Use the top 5 keywords
            related_papers = self.search_papers(
                query=query,
                max_results=max_results + 5,  # Fetch extra so the original can be filtered out
                sort_by="relevance"
            )

            # Filter out the original paper
            recommendations = [p for p in related_papers if p.get('arxiv_id') != paper_id and p.get('pmid') != paper_id]
            return recommendations[:max_results]
        except Exception as e:
            print(f"Error getting recommendations: {e}")
            return []

    def _extract_keywords(self, text: str) -> List[str]:
        """
        Simple keyword extraction from text.

        Args:
            text: Input text

        Returns:
            List of keywords
        """
        # Simple implementation - could be improved with NLP libraries
        stop_words = {
            'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
            'a', 'an', 'as', 'is', 'was', 'are', 'were', 'be', 'been', 'have', 'has', 'had',
            'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must',
            'can', 'this', 'that', 'these', 'those', 'we', 'us', 'our', 'you', 'your',
            'he', 'him', 'his', 'she', 'her', 'it', 'its', 'they', 'them', 'their'
        }

        # Extract words of at least three letters
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

        # Filter out stop words and count occurrences
        filtered_words = [word for word in words if word not in stop_words]
        word_counts = Counter(filtered_words)

        # Return the most common words
        return [word for word, count in word_counts.most_common(20)]

    def get_categories(self) -> Dict[str, str]:
        """
        Get available categories (primarily ArXiv).

        Returns:
            Dictionary of category codes and descriptions
        """
        return {
            'cs.AI': 'Artificial Intelligence',
            'cs.LG': 'Machine Learning',
            'cs.CV': 'Computer Vision',
            'cs.CL': 'Computation and Language',
            'cs.NE': 'Neural and Evolutionary Computing',
            'cs.RO': 'Robotics',
            'cs.CR': 'Cryptography and Security',
            'cs.DC': 'Distributed, Parallel, and Cluster Computing',
            'cs.DB': 'Databases',
            'cs.DS': 'Data Structures and Algorithms',
            'cs.HC': 'Human-Computer Interaction',
            'cs.IR': 'Information Retrieval',
            'cs.IT': 'Information Theory',
            'cs.MM': 'Multimedia',
            'cs.NI': 'Networking and Internet Architecture',
            'cs.OS': 'Operating Systems',
            'cs.PL': 'Programming Languages',
            'cs.SE': 'Software Engineering',
            'cs.SY': 'Systems and Control',
            'stat.ML': 'Machine Learning (Statistics)',
            'stat.AP': 'Applications (Statistics)',
            'stat.CO': 'Computation (Statistics)',
            'stat.ME': 'Methodology (Statistics)',
            'stat.TH': 'Statistics Theory',
            'math.ST': 'Statistics Theory (Mathematics)',
            'math.PR': 'Probability (Mathematics)',
            'math.OC': 'Optimization and Control',
            'math.NA': 'Numerical Analysis',
            'eess.AS': 'Audio and Speech Processing',
            'eess.IV': 'Image and Video Processing',
            'eess.SP': 'Signal Processing',
            'eess.SY': 'Systems and Control',
            'q-bio.QM': 'Quantitative Methods',
            'q-bio.NC': 'Neurons and Cognition',
            'physics.data-an': 'Data Analysis, Statistics and Probability'
        }


# Backward compatibility aliases
class ArxivFetcher(PaperFetcher):
    """Backward compatibility class for ArxivFetcher"""

    def __init__(self, config=None):
        super().__init__(config)

    def search_papers(self, query: str, max_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """Search only ArXiv, for backward compatibility"""
        return super().search_papers(query, max_results, sources=['arxiv'], **kwargs)


# Main class alias for the unified fetcher
UnifiedFetcher = PaperFetcher
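

# --- Minimal usage sketch (illustrative, not part of the original API surface) ---
# Assumes the `arxiv` and `requests` packages are installed and network access is
# available; the query string, source list, and printed fields are example choices.
if __name__ == "__main__":
    fetcher = PaperFetcher()
    demo_papers = fetcher.search_papers(
        "transformer interpretability",  # example query
        max_results=5,
        sources=["arxiv"],               # restrict to one source to keep the demo quick
        sort_by="date",
    )
    for demo_paper in demo_papers:
        print(f"[{demo_paper['source']}] {demo_paper.get('year')} - {demo_paper['title']}")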