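"""Mirror the /etka/ section of superetka.com into a local folder.

Starting from a seed page, the crawler follows only links whose href
contains '/etka/', saving the start page as MINETKA.html and every
other page under a short hash-derived filename.
"""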
import hashlib
import os
import re
import urllib.parse
from collections import deque

import requests
from bs4 import BeautifulSoup

# Identify as a desktop browser; some servers reject requests' default User-Agent.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}


def get_links(url):
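    """Return absolute URLs for all relative links on `url` whose href contains '/etka/'."""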
    response = requests.get(url, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # A missing href must be tested first: re.match(None, ...) raises TypeError.
        # Absolute http(s) URLs are skipped; only site-relative links are followed.
        if not href or re.match(r'^https?://', href):
            continue
        if '/etka/' in href:
            links.append(urllib.parse.urljoin(url, href))
    return links


def download(url, base_directory, is_start_url=False):
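    """Fetch `url` and save the response body under `base_directory`.

    The start page is saved as MINETKA.html; every other page becomes
    etka/<hash>.html, where the hash is taken over the part of the URL
    after '/etka/', so pages differing only in query string still get
    distinct files.
    """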
    response = requests.get(url, headers=HEADERS, timeout=30)

    if is_start_url:
        filename = os.path.join(base_directory, 'MINETKA.html')
    else:
        path_after_etka = url.split('/etka/')[-1]
        hashed_name = hashlib.sha1(path_after_etka.encode('utf-8')).hexdigest()[:10]
        filename = os.path.join(base_directory, 'etka', f'{hashed_name}.html')

    os.makedirs(os.path.dirname(filename), exist_ok=True)

    with open(filename, mode='wb') as f:
        f.write(response.content)


if __name__ == '__main__':
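    # Breadth-first crawl: pop a URL from the queue, save the page,
    # then enqueue any new /etka/ links found on it.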
    start_url = 'https://superetka.com/etka/wap.php'
    base_directory = r'C:\etka'  # raw string so '\e' is not treated as an escape sequence
    os.makedirs(base_directory, exist_ok=True)

    print("Starting crawl...")

    visited = set()
    queue = deque([start_url])

    while queue:
        current_url = queue.popleft()
        if current_url in visited:
            continue

        is_start = (current_url == start_url)
        try:
            download(current_url, base_directory, is_start_url=is_start)
            visited.add(current_url)
            new_links = get_links(current_url)
            queue.extend(new_links)

            print(f"Downloaded: {current_url}")
            print(f"New links: {len(new_links)}")
        except Exception as e:
            print(f"Error processing {current_url}: {e}")

    print("Crawl finished")