Spaces:
No application file
No application file
import chromedriver_autoinstaller | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
from selenium.webdriver.common.by import By | |
from selenium.common.exceptions import StaleElementReferenceException | |
import json | |
# Install a chromedriver build compatible with the local browser.
chromedriver_autoinstaller.install()

# Module-level Chrome configuration; scraper() uses it as its default options.
options = Options()
options.binary_location = "/usr/bin/chromium-browser"
# NOTE(review): these flags look chosen for a display-less, containerized
# host (no UI, no sandbox, no reliance on /dev/shm) -- confirm against the
# deployment environment.
for _flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(_flag)
def _collect_attribute(driver, tag_name, attribute):
    """Return the non-empty *attribute* values of every ``tag_name`` element."""
    # Read each attribute exactly once: a second WebDriver round-trip per
    # element is wasted work and gives the element more time to go stale.
    values = (
        element.get_attribute(attribute)
        for element in driver.find_elements(By.TAG_NAME, tag_name)
    )
    return [value for value in values if value]


def scraper(link: str, options=options) -> dict:
    """Load *link* in Chrome and return its visible text and resource URLs.

    Args:
        link: URL of the page to open.
        options: Chrome options for the browser session; defaults to the
            module-level ``options`` object.

    Returns:
        A dict with three keys:
            ``"page_text"``: the ``text`` of the page's ``<body>`` element.
            ``"script_sources"``: non-empty ``src`` values of ``<script>``
                elements.
            ``"link_sources"``: non-empty ``href`` values of ``<link>``
                elements.

    Raises:
        Nothing is caught here: any exception raised by Selenium while
        starting the browser, loading the page, or querying elements
        propagates to the caller (after the browser has been shut down).
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)
        page_text = driver.find_element(By.TAG_NAME, "body").text
        script_sources = _collect_attribute(driver, "script", "src")
        link_sources = _collect_attribute(driver, "link", "href")
    finally:
        # Always close the browser; without this a failed page load or
        # element lookup would leave the Chrome session running.
        driver.quit()

    return {
        "page_text": page_text,
        "script_sources": script_sources,
        "link_sources": link_sources,
    }