import requests
from bs4 import BeautifulSoup
import logging
import json
import time
from typing import List, Dict
from random import choice
from datetime import datetime

# Logging setup
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.FileHandler("scraper.log", encoding='utf-8'), logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

# Random User-Agent pool to reduce the chance of being blocked
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
]

# Headers that mimic a real browser
def get_headers() -> dict:
    return {
        "User-Agent": choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Connection": "keep-alive"
    }
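
# Optional sketch, not used by scrape_website below: a requests.Session would
# actually reuse connections (matching the "keep-alive" header above) and keep
# the rotated headers in one place. This is an alternative wiring, not part of
# the current flow; the function name is ours, not from the original script.
def make_session() -> requests.Session:
    session = requests.Session()
    session.headers.update(get_headers())
    return session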

def scrape_website(url: str, retries: int = 3) -> List[Dict]:
    """
    Collect car listings from a given website, retrying on failure.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=get_headers(), timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            cars_data = []

            # Generic selectors (adjust per site structure; see the per-site mapping sketch after this function)
            car_listings = soup.select(".car-item, .listing-item, .product-item, .car-listing, .ad-item, .vehicle-item")
            if not car_listings:
                logger.warning(f"No car listings found at {url}. Check the selectors.")
                return cars_data
            for car in car_listings:
                try:
                    car_data = {}

                    # Check that each element exists before extracting its text
                    name_elem = car.select_one(".car-title, .product-title, .listing-title, .ad-title, .vehicle-title, h2, h3")
                    car_data["name"] = name_elem.text.strip() if name_elem else "Not specified"

                    price_elem = car.select_one(".car-price, .price, .product-price, .listing-price, .ad-price")
                    car_data["price"] = price_elem.text.strip() if price_elem else "Not specified"

                    year_elem = car.select_one(".car-year, .year, .model-year")
                    car_data["model_year"] = year_elem.text.strip() if year_elem else "Not specified"

                    location_elem = car.select_one(".car-location, .location, .city")
                    car_data["location"] = location_elem.text.strip() if location_elem else "Not specified"

                    desc_elem = car.select_one(".car-description, .description, .details")
                    car_data["description"] = desc_elem.text.strip() if desc_elem else "Not specified"

                    # Maintenance and spare-parts information
                    maintenance_elem = car.select_one(".maintenance, .service-info")
                    car_data["maintenance"] = maintenance_elem.text.strip() if maintenance_elem else "Not specified"

                    spare_parts_elem = car.select_one(".spare-parts, .parts-info")
                    car_data["spare_parts"] = spare_parts_elem.text.strip() if spare_parts_elem else "Not specified"

                    car_data["source"] = url.split("//")[1].split("/")[0]
                    car_data["scraped_at"] = datetime.now().isoformat()

                    cars_data.append(car_data)
                    logger.info(f"Collected: {car_data['name']} - {car_data['price']} from {url}")
                except Exception as e:
                    logger.error(f"Error extracting a car entry from {url}: {str(e)}")
                    continue
            return cars_data
        except requests.exceptions.RequestException as e:
            logger.error(f"Connection error for {url} (attempt {attempt + 1}/{retries}): {str(e)}")
            if attempt < retries - 1:
                time.sleep(5 * (attempt + 1))  # increasing back-off delay between retries
            else:
                return []
        except Exception as e:
            logger.error(f"Unexpected error in scrape_website for {url}: {str(e)}")
            return []
    return []  # defensive fallback, reached only if retries <= 0
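
# A minimal sketch, not wired into scrape_website above, of how selectors could
# be organized per site once each site's real markup has been inspected.
# Every host and selector below is a hypothetical placeholder (our assumption),
# not verified against the sites in the `websites` list.
SITE_SELECTORS: Dict[str, Dict[str, str]] = {
    "example-cars-site.com": {           # hypothetical host, for illustration only
        "listing": ".listing-card",      # placeholder selectors to replace after inspecting the page
        "name": ".listing-card h2",
        "price": ".listing-card .price",
    },
}

def get_site_selectors(url: str) -> Dict[str, str]:
    """Return the selector set registered for a URL's host (empty dict if unknown)."""
    host = url.split("//")[1].split("/")[0]
    return SITE_SELECTORS.get(host, {})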

def save_data(data: List[Dict], filename: str = "cars_data.json"):
    """
    Save the collected data to a JSON file.
    """
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        logger.info(f"Saved {len(data)} items to {filename}")
    except Exception as e:
        logger.error(f"Error saving data to {filename}: {str(e)}")

if __name__ == "__main__":
    websites = [
        "https://iq.labeb.com/ct/cars-for-sale-558",
        "https://www.akosayara.com/",
        "https://www.iqcars.net/",
        "https://alrashad.com.iq/",
        "https://website.tao.iq/index.php/ar/",
        "https://www.quattro-iq.com/home-ar",
        "https://iq.opensooq.com/ar/%D9%85%D8%B1%D8%A7%D9%83%D8%B2-%D8%AE%D8%AF%D9%85%D8%A7%D8%AA-%D8%A7%D9%84%D8%B3%D9%8A%D8%A7%D8%B1%D8%A7%D8%AA/%D8%A7%D9%84%D8%AE%D8%AF%D9%85%D8%A7%D8%AA/%D9%82%D8%B7%D8%B9-%D8%BA%D9%8A%D8%A7%D8%B1",
        "https://ghiarati.com/",
        "https://www.motors.iq/",
        "https://www.alsayyara.com/"
    ]

    while True:
        all_cars_data = []
        for url in websites:
            logger.info(f"Collecting data from {url}")
            cars_data = scrape_website(url)
            all_cars_data.extend(cars_data)
            time.sleep(5)  # 5-second delay between sites to avoid being blocked
        save_data(all_cars_data)
        logger.info("Finished collecting data. Sleeping for one hour...")
        time.sleep(3600)  # refresh every hour