"""Streamlit app that scrapes Redfin house listings for a given ZIP code using Selenium."""

import os
import subprocess
import time

import chromedriver_autoinstaller
import pandas as pd
import streamlit as st
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def install_chrome():
    """Install Chromium via apt-get if it is not already present (requires root)."""
    if not os.path.exists("/usr/bin/chromium-browser"):
        subprocess.run(["apt-get", "update"], check=True)
        subprocess.run(["apt-get", "install", "-y", "chromium-browser"], check=True)
    os.environ["PATH"] += os.pathsep + "/usr/bin/"


def scrape_redfin(zipcode):
    """Scrape price, address, size, and link for listings in the given ZIP code."""
    install_chrome()  # Ensure Chrome/Chromium is installed
    driver_path = chromedriver_autoinstaller.install()  # Install a matching chromedriver and return its path

    options = Options()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--incognito")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    options.binary_location = "/usr/bin/chromium-browser"  # Use the Chromium binary installed above

    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    url = f"https://www.redfin.com/zipcode/{zipcode}"
    driver.get(url)

    try:
        # Wait up to 60 seconds for the listings container to appear.
        # Note: this absolute XPath is brittle and will break if Redfin changes its page layout.
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located(
                (By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div")
            )
        )
    except Exception:
        st.error("Error: Listings did not load properly")
        driver.quit()
        return pd.DataFrame()

    # Scroll in half-screen increments until the page height stops growing,
    # so that lazily loaded listings are rendered before parsing.
    scroll_pause_time = 5
    screen_height = driver.execute_script("return window.innerHeight;")
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollBy(0, arguments[0]);", screen_height // 2)
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    houses = []
    listings = driver.find_elements(
        By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div/div"
    )
    for listing in listings:
        try:
            price = listing.find_element(By.XPATH, ".//div/div/div[2]/div[1]/div[1]/span").text
        except NoSuchElementException:
            price = "N/A"
        try:
            address = listing.find_element(By.XPATH, ".//div/div/div[2]/div[3]").text
        except NoSuchElementException:
            address = "N/A"
        try:
            size = listing.find_element(By.XPATH, ".//div/div/div[2]/div[4]/div").text
        except NoSuchElementException:
            size = "N/A"
        try:
            link = listing.find_element(By.TAG_NAME, "a").get_attribute("href")
        except NoSuchElementException:
            link = "N/A"
        houses.append({"Price": price, "Address": address, "Size": size, "Link": link})

    driver.quit()
    return pd.DataFrame(houses)


st.title("Redfin House Listings Scraper")
zipcode = st.text_input("Enter ZIP code:")

if st.button("Scrape Data"):
    if zipcode:
        with st.spinner("Scraping data, please wait..."):
            df = scrape_redfin(zipcode)
            if not df.empty:
Here are the available houses:") st.dataframe(df) else: st.warning("No houses found for the given ZIP code.") else: st.error("Please enter a valid ZIP code.") ## working best code ever # import streamlit as st # import pandas as pd # import time # from selenium import webdriver # from selenium.webdriver.common.by import By # from selenium.webdriver.chrome.service import Service # from selenium.webdriver.chrome.options import Options # from selenium.webdriver.support.ui import WebDriverWait # from selenium.webdriver.support import expected_conditions as EC # from webdriver_manager.chrome import ChromeDriverManager # def scrape_redfin(zipcode): # options = Options() # options.add_argument("--headless") # options.add_argument("--incognito") # options.add_argument("--disable-blink-features=AutomationControlled") # options.add_argument("start-maximized") # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") # driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) # url = f"https://www.redfin.com/zipcode/{zipcode}" # driver.get(url) # try: # listings_container = WebDriverWait(driver, 60).until( # EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div")) # ) # except Exception as e: # st.error("Error: Listings did not load properly") # driver.quit() # return pd.DataFrame() # scroll_pause_time = 5 # screen_height = driver.execute_script("return window.innerHeight;") # last_height = driver.execute_script("return document.body.scrollHeight") # while True: # driver.execute_script("window.scrollBy(0, arguments[0]);", screen_height // 2) # time.sleep(scroll_pause_time) # new_height = driver.execute_script("return document.body.scrollHeight") # if new_height == last_height: # break # last_height = new_height # houses = [] # listings = driver.find_elements(By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div/div") # for listing in listings: # try: # price = listing.find_element(By.XPATH, ".//div/div/div[2]/div[1]/div[1]/span").text # except: # price = "N/A" # try: # address = listing.find_element(By.XPATH, ".//div/div/div[2]/div[3]").text # except: # address = "N/A" # try: # size = listing.find_element(By.XPATH, ".//div/div/div[2]/div[4]/div").text # except: # size = "N/A" # try: # link = listing.find_element(By.TAG_NAME, "a").get_attribute("href") # except: # link = "N/A" # houses.append({"Price": price, "Address": address, "Size": size, "Link": link}) # driver.quit() # return pd.DataFrame(houses) # st.title("Redfin House Listings Scraper") # zipcode = st.text_input("Enter ZIP code:") # if st.button("Scrape Data"): # if zipcode: # with st.spinner("Scraping data, please wait..."): # df = scrape_redfin(zipcode) # if not df.empty: # st.success("Scraping complete! Here are the available houses:") # st.dataframe(df) # else: # st.warning("No houses found for the given ZIP code.") # else: # st.error("Please enter a valid ZIP code.")