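"""Collect Bengali text for corpus building.

Scrapes article text from Bengali Wikipedia and the Prothom Alo news site,
saves the raw dumps as JSON, and combines them into a single processed file.
"""
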
import json
import logging
import random
import time
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class BengaliDataCollector:
    def __init__(self):
        # Browser-like User-Agent so requests are not rejected outright
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.output_dir = Path('data/raw')
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def make_request(self, url, retries=3, delay=1):
        """Make an HTTP request with retry logic and rate limiting."""
        for attempt in range(retries):
            try:
                # Jittered delay before every request to rate-limit politely
                time.sleep(delay + random.random())
                # Timeout guards against connections that hang indefinitely
                response = requests.get(url, headers=self.headers, timeout=30)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt == retries - 1:
                    logger.error(f"Failed to fetch {url} after {retries} attempts")
                    raise
                # Linearly increasing backoff between retries
                time.sleep(delay * (attempt + 1))

    def scrape_wikipedia(self):
        """Scrape Bengali text from Wikipedia."""
        url = "https://bn.wikipedia.org/wiki/প্রধান_পাতা"
        logger.info(f"Scraping Wikipedia: {url}")

        try:
            response = self.make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            content_div = soup.find('div', {'id': 'mw-content-text'})
            articles = []

            if content_div:
                # Follow article links from the main page; hrefs containing a
                # colon are namespaced pages (e.g. Template:, Category:) and skipped
                article_links = content_div.find_all('a', href=True)
                for link in article_links[:50]:
                    if link['href'].startswith('/wiki/') and ':' not in link['href']:
                        article_url = urljoin('https://bn.wikipedia.org', link['href'])
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')

                            article_content = article_soup.find('div', {'id': 'mw-content-text'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            logger.error(f"Failed to scrape article {article_url}: {e}")

            with open(self.output_dir / 'wikipedia_data.json', 'w', encoding='utf-8') as f:
                json.dump(articles, f, ensure_ascii=False, indent=2)

            return len(articles)
        except Exception as e:
            logger.error(f"Failed to scrape Wikipedia: {e}")
            return 0

    def scrape_prothom_alo(self):
        """Scrape Bengali text from Prothom Alo."""
        base_url = "https://www.prothomalo.com"
        categories = ['bangladesh', 'international', 'opinion', 'science-technology']
        articles = []

        for category in categories:
            url = f"{base_url}/{category}"
            logger.info(f"Scraping Prothom Alo category: {category}")

            try:
                response = self.make_request(url)
                soup = BeautifulSoup(response.content, 'html.parser')

                article_links = soup.find_all('a', href=True)
                for link in article_links[:10]:
                    article_url = urljoin(base_url, link['href'])
                    # Keep only links that belong to the current category
                    if category in article_url:
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')

                            # Site-specific selector; may need updating if the markup changes
                            article_content = article_soup.find('div', {'class': 'story-content'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'category': category,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            logger.error(f"Failed to scrape article {article_url}: {e}")

            except Exception as e:
                logger.error(f"Failed to scrape category {category}: {e}")

        with open(self.output_dir / 'prothomalo_data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

        return len(articles)

    def collect(self):
        """Main method to collect data from all sources."""
        logger.info("Starting data collection")

        wiki_count = self.scrape_wikipedia()
        logger.info(f"Collected {wiki_count} articles from Wikipedia")

        prothomalo_count = self.scrape_prothom_alo()
        logger.info(f"Collected {prothomalo_count} articles from Prothom Alo")

        self.process_collected_data()

        logger.info("Data collection completed")

    def process_collected_data(self):
        """Process and combine collected data."""
        try:
            with open(self.output_dir / 'wikipedia_data.json', 'r', encoding='utf-8') as f:
                wiki_data = json.load(f)

            with open(self.output_dir / 'prothomalo_data.json', 'r', encoding='utf-8') as f:
                news_data = json.load(f)

            processed_data = []

            for article in wiki_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'wikipedia',
                    'url': article['url']
                })

            for article in news_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'prothomalo',
                    'category': article.get('category', ''),
                    'url': article['url']
                })

            with open(self.output_dir / 'processed_data.json', 'w', encoding='utf-8') as f:
                json.dump(processed_data, f, ensure_ascii=False, indent=2)

            logger.info(f"Successfully processed {len(processed_data)} articles")

        except Exception as e:
            logger.error(f"Failed to process collected data: {e}")
            raise

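# Outputs written under data/raw/:
#   wikipedia_data.json  - raw Wikipedia articles
#   prothomalo_data.json - raw Prothom Alo articles
#   processed_data.json  - combined records with text, source, and url fields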
if __name__ == "__main__":
|
|
|
collector = BengaliDataCollector()
|
|
|
collector.collect()
|
|
|
|