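"""Collect raw Bengali text from Wikipedia and Prothom Alo and save it under data/raw/."""
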
import requests
from bs4 import BeautifulSoup
import time
import random
import json
from pathlib import Path
import logging
from urllib.parse import urljoin
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class BengaliDataCollector:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.output_dir = Path('data/raw')
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def make_request(self, url, retries=3, delay=1):
        """Make an HTTP GET request with retry logic and rate limiting."""
        for attempt in range(retries):
            try:
                time.sleep(delay + random.random())  # Rate limiting with jitter
                response = requests.get(url, headers=self.headers, timeout=30)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt == retries - 1:
                    logger.error(f"Failed to fetch {url} after {retries} attempts")
                    raise
                time.sleep(delay * (attempt + 1))  # Linear backoff before the next attempt

    def scrape_wikipedia(self):
        """Scrape Bengali text from Wikipedia."""
        url = "https://bn.wikipedia.org/wiki/প্রধান_পাতা"
        logger.info(f"Scraping Wikipedia: {url}")
        try:
            response = self.make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            # Get the main content area of the main page
            content_div = soup.find('div', {'id': 'mw-content-text'})
            articles = []
            if content_div:
                # Extract article links (skip namespace pages, whose paths contain ':')
                article_links = content_div.find_all('a', href=True)
                for link in article_links[:50]:  # Consider only the first 50 links
                    if link['href'].startswith('/wiki/') and ':' not in link['href']:
                        article_url = urljoin('https://bn.wikipedia.org', link['href'])
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')
                            # Extract article content
                            article_content = article_soup.find('div', {'id': 'mw-content-text'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            logger.error(f"Failed to scrape article {article_url}: {str(e)}")
            # Save Wikipedia data
            with open(self.output_dir / 'wikipedia_data.json', 'w', encoding='utf-8') as f:
                json.dump(articles, f, ensure_ascii=False, indent=2)
            return len(articles)
        except Exception as e:
            logger.error(f"Failed to scrape Wikipedia: {str(e)}")
            return 0

    def scrape_prothom_alo(self):
        """Scrape Bengali text from Prothom Alo."""
        base_url = "https://www.prothomalo.com"
        categories = ['bangladesh', 'international', 'opinion', 'science-technology']
        articles = []
        for category in categories:
            url = f"{base_url}/{category}"
            logger.info(f"Scraping Prothom Alo category: {category}")
            try:
                response = self.make_request(url)
                soup = BeautifulSoup(response.content, 'html.parser')
                # Find article links on the category page
                article_links = soup.find_all('a', href=True)
                for link in article_links[:10]:  # Consider only the first 10 links per category
                    article_url = urljoin(base_url, link['href'])
                    if category in article_url:
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')
                            # Extract article content
                            article_content = article_soup.find('div', {'class': 'story-content'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'category': category,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            logger.error(f"Failed to scrape article {article_url}: {str(e)}")
            except Exception as e:
                logger.error(f"Failed to scrape category {category}: {str(e)}")
        # Save Prothom Alo data
        with open(self.output_dir / 'prothomalo_data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)
        return len(articles)

    def collect(self):
        """Main method to collect data from all sources."""
        logger.info("Starting data collection")
        wiki_count = self.scrape_wikipedia()
        logger.info(f"Collected {wiki_count} articles from Wikipedia")
        prothomalo_count = self.scrape_prothom_alo()
        logger.info(f"Collected {prothomalo_count} articles from Prothom Alo")
        # Combine and process the collected data
        self.process_collected_data()
        logger.info("Data collection completed")

    def process_collected_data(self):
        """Process and combine collected data."""
        try:
            # Read collected data
            with open(self.output_dir / 'wikipedia_data.json', 'r', encoding='utf-8') as f:
                wiki_data = json.load(f)
            with open(self.output_dir / 'prothomalo_data.json', 'r', encoding='utf-8') as f:
                news_data = json.load(f)
            # Combine and format data
            processed_data = []
            # Process Wikipedia articles
            for article in wiki_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'wikipedia',
                    'url': article['url']
                })
            # Process news articles
            for article in news_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'prothomalo',
                    'category': article.get('category', ''),
                    'url': article['url']
                })
            # Save processed data
            with open(self.output_dir / 'processed_data.json', 'w', encoding='utf-8') as f:
                json.dump(processed_data, f, ensure_ascii=False, indent=2)
            logger.info(f"Successfully processed {len(processed_data)} articles")
        except Exception as e:
            logger.error(f"Failed to process collected data: {str(e)}")
            raise


if __name__ == "__main__":
    collector = BengaliDataCollector()
    collector.collect()