# Redfin house-listings scraper — Streamlit app.
# (Removed "Spaces: Sleeping" text accidentally pasted from the Hugging Face
# Spaces status UI; it was not valid Python.)
import os
import subprocess
import time

import chromedriver_autoinstaller
import pandas as pd
import streamlit as st
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def install_chrome():
    """Ensure a Chromium binary is available for Selenium.

    Installs the ``chromium-browser`` apt package when the expected binary
    is missing, then appends ``/usr/bin/`` to PATH so the driver can find it.
    Requires root (or an environment such as a container) for apt-get.
    """
    binary = "/usr/bin/chromium-browser"
    if not os.path.exists(binary):
        # Two-step apt flow: refresh package lists, then install.
        for cmd in (["apt-get", "update"],
                    ["apt-get", "install", "-y", "chromium-browser"]):
            subprocess.run(cmd, check=True)
    os.environ["PATH"] = os.environ["PATH"] + os.pathsep + "/usr/bin/"
def scrape_redfin(zipcode):
    """Scrape Redfin listings for *zipcode* using headless Chromium.

    Returns a DataFrame with columns Price, Address, Size, Link; returns an
    empty DataFrame when the listings container never loads.
    """
    install_chrome()  # make sure a Chromium binary exists before starting
    # FIX: the original called chromedriver_autoinstaller.install() twice
    # (once bare, once inside Service) — install once and reuse the path.
    driver_path = chromedriver_autoinstaller.install()

    options = Options()
    options.add_argument("--headless")  # no display on servers/containers
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--incognito")
    # Reduce the chance Redfin flags the session as automated.
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    options.binary_location = "/usr/bin/chromium-browser"  # use Chromium

    driver = webdriver.Chrome(service=Service(driver_path), options=options)
    try:
        driver.get(f"https://www.redfin.com/zipcode/{zipcode}")
        try:
            # NOTE(review): absolute XPath is brittle against Redfin layout
            # changes — kept byte-identical to preserve behavior.
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div"))
            )
        except TimeoutException:
            st.error("Error: Listings did not load properly")
            return pd.DataFrame()

        _scroll_to_bottom(driver)
        listings = driver.find_elements(By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div/div")
        houses = [_extract_listing(listing) for listing in listings]
    finally:
        # FIX: the original leaked the browser process if anything raised
        # between driver creation and the final quit().
        driver.quit()
    return pd.DataFrame(houses)


def _scroll_to_bottom(driver, pause=5):
    """Scroll half a viewport at a time until the page height stops growing,
    so Redfin's lazily loaded listings all render."""
    screen_height = driver.execute_script("return window.innerHeight;")
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollBy(0, arguments[0]);", screen_height // 2)
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _extract_listing(listing):
    """Extract Price/Address/Size/Link from one listing card.

    Missing fields become "N/A" instead of aborting the whole scrape.
    FIX: the original used bare ``except:``, which also swallowed
    KeyboardInterrupt/SystemExit; narrowed to ``Exception``.
    """
    def _field(getter):
        try:
            return getter()
        except Exception:
            return "N/A"

    return {
        "Price": _field(lambda: listing.find_element(By.XPATH, ".//div/div/div[2]/div[1]/div[1]/span").text),
        "Address": _field(lambda: listing.find_element(By.XPATH, ".//div/div/div[2]/div[3]").text),
        "Size": _field(lambda: listing.find_element(By.XPATH, ".//div/div/div[2]/div[4]/div").text),
        "Link": _field(lambda: listing.find_element(By.TAG_NAME, "a").get_attribute("href")),
    }
# --- Streamlit UI: prompt for a ZIP code and render the scraped listings ---
st.title("Redfin House Listings Scraper")
zipcode = st.text_input("Enter ZIP code:")

if st.button("Scrape Data"):
    if not zipcode:
        # Nothing typed — ask for input instead of scraping.
        st.error("Please enter a valid ZIP code.")
    else:
        with st.spinner("Scraping data, please wait..."):
            results = scrape_redfin(zipcode)
            if results.empty:
                st.warning("No houses found for the given ZIP code.")
            else:
                st.success("Scraping complete! Here are the available houses:")
                st.dataframe(results)
# NOTE(review): removed ~70 lines of fully commented-out duplicate of this app
# (a prior variant that used webdriver_manager's ChromeDriverManager instead of
# chromedriver_autoinstaller). Commented-out code should not be kept in the
# file — version control history preserves the old implementation.