apexherbert200's picture
Changes made
f11978e
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
import json
# Automatically install a chromedriver build compatible with the local browser.
# This runs at import time, before any driver is created.
chromedriver_autoinstaller.install()

# Shared Chrome configuration, used as the default for scraper().
options = Options()
options.binary_location = "/usr/bin/chromium-browser"
# Command-line switches handed to the browser, added in this order.
for _chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(_chrome_flag)
def scraper(link: str, options=options) -> dict:
    """Open ``link`` in Chrome and collect its body text and asset URLs.

    Args:
        link: URL of the page to load.
        options: Chrome options passed to ``webdriver.Chrome``. Defaults to
            the module-level ``options`` object; it is not modified here.

    Returns:
        A dict with three keys:
            ``"page_text"``: the ``.text`` of the page's ``<body>`` element.
            ``"script_sources"``: the non-empty ``src`` values of all
                ``<script>`` elements.
            ``"link_sources"``: the non-empty ``href`` values of all
                ``<link>`` elements.

    Raises:
        No exception is caught here: anything raised while starting the
        browser, loading the page or querying elements propagates to the
        caller. Once the browser has started, it is shut down before the
        exception leaves this function.
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)
        page_text = driver.find_element(By.TAG_NAME, "body").text

        # Read each attribute once per element; every get_attribute() call
        # is a separate request to the browser.
        script_tags = driver.find_elements(By.TAG_NAME, "script")
        script_sources = [
            src
            for src in (tag.get_attribute("src") for tag in script_tags)
            if src
        ]

        link_tags = driver.find_elements(By.TAG_NAME, "link")
        link_sources = [
            href
            for href in (tag.get_attribute("href") for tag in link_tags)
            if href
        ]
    finally:
        # Always release the browser process, even when loading or querying
        # the page fails; otherwise each failed call leaks a Chrome instance.
        driver.quit()

    return {
        "page_text": page_text,
        "script_sources": script_sources,
        "link_sources": link_sources,
    }