File size: 1,153 Bytes
71f9aed
 
 
 
 
 
 
 
 
 
f11978e
71f9aed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By 
from selenium.common.exceptions import StaleElementReferenceException
import json 

# Install a chromedriver build compatible with the local browser before any
# driver is created.
chromedriver_autoinstaller.install()

# Shared Chrome configuration, used as the default for scraper().
options = Options()
options.binary_location = "/usr/bin/chromium-browser"
for _flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(_flag)


def scraper(link: str, options=options):
    """Open *link* in Chrome and collect its body text and asset URLs.

    Args:
        link: URL passed to ``driver.get``.
        options: Chrome options used to start the driver; defaults to the
            module-level ``options`` object.

    Returns:
        A dict with three keys:
        ``"page_text"`` - the ``.text`` of the ``<body>`` element;
        ``"script_sources"`` - non-empty ``src`` values of ``<script>`` tags;
        ``"link_sources"`` - non-empty ``href`` values of ``<link>`` tags.

    Exceptions raised by the driver are not caught here; they propagate to
    the caller after the browser has been shut down.
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)
        page_text = driver.find_element(By.TAG_NAME, "body").text

        # Read each attribute once: every get_attribute call is a separate
        # request to the browser, so fetching it twice (filter + value)
        # doubles the work for no benefit.
        script_tags = driver.find_elements(By.TAG_NAME, "script")
        script_sources = [
            src
            for src in (tag.get_attribute("src") for tag in script_tags)
            if src
        ]

        link_tags = driver.find_elements(By.TAG_NAME, "link")
        link_sources = [
            href
            for href in (tag.get_attribute("href") for tag in link_tags)
            if href
        ]
    finally:
        # Always shut the browser down, even when navigation or an element
        # lookup fails; otherwise the Chrome process is left running.
        driver.quit()

    return {
        "page_text": page_text,
        "script_sources": script_sources,
        "link_sources": link_sources,
    }