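# NOTE(review): the original first lines read "Spaces: / Runtime error / Runtime error".
# They look like page-header residue from the listing this file was copied from,
# not program text; kept here as a comment so the module parses.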
import datetime
import hashlib
import json
import os

import requests

from parser import parse_article
# Endpoint of the Hugging Face daily-papers listing API.
API_URL = "https://huggingface.co/api/daily_papers"

# In-memory response cache used by make_request: MD5 hex digest of the request
# URL -> decoded JSON payload. It is never evicted, so it lives as long as the
# process does.
cache = {}
def make_request(url: str, *, max_attempts: int = 3, timeout: float = 10.0):
    """GET *url*, decode the JSON body, and memoize it in the module cache.

    Args:
        url: Fully-formed URL to fetch.
        max_attempts: How many times to try the request before giving up.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds), so
            a stalled connection fails the attempt instead of hanging forever.

    Returns:
        The decoded JSON payload. On a cache hit the cached object itself is
        returned (not a copy). If every attempt fails, an empty list is
        returned and nothing is cached, so a later call will retry.
    """
    # The cache is keyed by the MD5 hex digest of the URL.
    url_hash = hashlib.md5(url.encode()).hexdigest()
    if url_hash in cache:
        print(f"Cache hit for URL: {url}")
        return cache[url_hash]

    # Optional proxy configuration, taken from the environment.
    http_proxy = os.getenv("HF_HTTP_PROXY")
    https_proxy = os.getenv("HF_HTTPS_PROXY")
    proxies = (
        {"http": http_proxy, "https": https_proxy}
        if http_proxy or https_proxy
        else None
    )

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, proxies=proxies, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decode failures on requests versions whose
            # decode error does not derive from RequestException.
            print(f"Attempt {attempt} failed: {e}")
            continue
        # Only successful responses are cached.
        cache[url_hash] = data
        return data

    return []
def fetch_papers():
    """Fetch the default daily-papers listing and parse every entry.

    Returns:
        A list with one ``parse_article`` result per item returned by
        ``make_request(API_URL)``; empty when the request yields no items.
    """
    data = make_request(API_URL)
    return [parse_article(item) for item in data]
def fetch_papers_with_date(date: datetime.date):
    """Fetch and parse the daily-papers listing for a single day.

    Args:
        date: Day to query; a ``datetime.datetime`` works as well. It is sent
            to the API as a ``?date=YYYY-MM-DD`` query parameter.

    Returns:
        A list with one ``parse_article`` result per item returned for that
        day; empty when the request yields no items.
    """
    formatted_date = date.strftime("%Y-%m-%d")
    data = make_request(f"{API_URL}?date={formatted_date}")
    return [parse_article(item) for item in data]
def fetch_papers_with_daterange(start_date: datetime.date, end_date: datetime.date):
    """Fetch papers for every day in ``[start_date, end_date]`` and de-duplicate.

    Args:
        start_date: First day to fetch (inclusive).
        end_date: Last day to fetch (inclusive). Must be comparable with
            ``start_date`` (both ``date`` or both ``datetime``).

    Returns:
        The parsed articles from all days, with duplicates removed by
        ``article.paper.id`` (the first occurrence is kept). An empty list is
        returned when ``start_date`` is after ``end_date``.
    """
    # Each day's listing is independent, so simply walk the range day by day.
    one_day = datetime.timedelta(days=1)
    articles = []
    current_date = start_date
    while current_date <= end_date:
        print(current_date)
        articles.extend(fetch_papers_with_date(current_date))
        print(f"Total articles: {len(articles)}")
        current_date += one_day

    # De-duplicate on each article's .paper.id; setdefault keeps the first one seen.
    unique_articles = {}
    for article in articles:
        unique_articles.setdefault(article.paper.id, article)
    return list(unique_articles.values())
if __name__ == "__main__":
    # Ad-hoc smoke run: fetch a three-day window and report the de-duplicated count.
    # Importing rich's print here rebinds the module-level name, so the fetch
    # helpers' progress output goes through rich as well.
    from rich import print

    start_date = datetime.datetime(2024, 1, 28)
    end_date = datetime.datetime(2024, 1, 30)
    articles = fetch_papers_with_daterange(start_date=start_date, end_date=end_date)
    print(f"Total articles: {len(articles)}")