Spaces:

Krish-Upgrix
/

Redfin-app

Sleeping

File size: 7,421 Bytes

import streamlit as st
import pandas as pd
import time
import os
import subprocess
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def install_chrome():
    """Ensure a Chromium binary exists and that ``/usr/bin/`` is on ``PATH``.

    When ``/usr/bin/chromium-browser`` is missing, runs ``apt-get update``
    followed by ``apt-get install -y chromium-browser``.

    Raises:
        subprocess.CalledProcessError: If either ``apt-get`` command exits
            with a non-zero status (``check=True``).
    """
    if not os.path.exists("/usr/bin/chromium-browser"):
        subprocess.run(["apt-get", "update"], check=True)
        subprocess.run(["apt-get", "install", "-y", "chromium-browser"], check=True)

    # This function is invoked on every scrape, so only extend PATH when the
    # directory is not listed yet; a blind ``+=`` grows PATH on each call and
    # raises KeyError when PATH is not set at all.
    bin_dir = "/usr/bin/"
    current_path = os.environ.get("PATH", "")
    already_listed = any(
        entry.rstrip("/") == bin_dir.rstrip("/")
        for entry in current_path.split(os.pathsep)
    )
    if not already_listed:
        os.environ["PATH"] = (
            f"{current_path}{os.pathsep}{bin_dir}" if current_path else bin_dir
        )

# Absolute XPath of the element that wraps all listing cards on the results
# page; each direct ``div`` child of it is treated as one listing.
_LISTINGS_CONTAINER_XPATH = (
    "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div"
)


def _scroll_until_stable(driver, pause_seconds, max_scrolls):
    """Scroll down in half-viewport steps until the page height stops changing.

    Stops as soon as ``document.body.scrollHeight`` is unchanged after a
    scroll, or after ``max_scrolls`` steps, whichever comes first.
    """
    step = driver.execute_script("return window.innerHeight;") // 2
    last_height = driver.execute_script("return document.body.scrollHeight")

    for _ in range(max_scrolls):
        driver.execute_script("window.scrollBy(0, arguments[0]);", step)
        time.sleep(pause_seconds)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _extract_listing(listing):
    """Build one result row from a listing element.

    Each field is looked up independently; a field whose lookup fails with a
    Selenium error is reported as ``"N/A"`` so one missing element does not
    discard the whole row.
    """
    from selenium.common.exceptions import WebDriverException

    def text_at(xpath):
        try:
            return listing.find_element(By.XPATH, xpath).text
        except WebDriverException:
            return "N/A"

    price = text_at(".//div/div/div[2]/div[1]/div[1]/span")
    address = text_at(".//div/div/div[2]/div[3]")
    size = text_at(".//div/div/div[2]/div[4]/div")

    try:
        link = listing.find_element(By.TAG_NAME, "a").get_attribute("href")
    except WebDriverException:
        link = "N/A"

    return {"Price": price, "Address": address, "Size": size, "Link": link}


def scrape_redfin(zipcode, *, max_scrolls=50):
    """Scrape the Redfin results page for a ZIP code into a DataFrame.

    Starts headless Chromium, opens ``https://www.redfin.com/zipcode/<zipcode>``,
    waits up to 60 seconds for the listings container, scrolls the page, and
    then reads price, address, size and link from every listing element.

    Args:
        zipcode: ZIP code to look up. Surrounding whitespace is stripped and
            the value is percent-encoded before being placed in the URL.
        max_scrolls: Upper bound on scroll steps (5 seconds each), so a page
            whose height keeps changing cannot loop forever.

    Returns:
        A DataFrame with columns ``Price``, ``Address``, ``Size`` and ``Link``.
        An empty DataFrame is returned when the listings container does not
        appear in time (a Streamlit error is shown) or no listings are found.
    """
    from urllib.parse import quote

    install_chrome()  # Ensure Chrome/Chromium is installed
    # install() returns the chromedriver path, so a single call covers both
    # "make sure it is installed" and "tell Selenium where it is".
    driver_path = chromedriver_autoinstaller.install()

    options = Options()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--incognito")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    options.binary_location = "/usr/bin/chromium-browser"  # Use Chromium

    driver = webdriver.Chrome(service=Service(driver_path), options=options)
    try:
        # The ZIP code comes from user input; encode it so it cannot alter
        # the path or add a query string to the requested URL.
        safe_zip = quote(str(zipcode).strip(), safe="")
        driver.get(f"https://www.redfin.com/zipcode/{safe_zip}")

        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, _LISTINGS_CONTAINER_XPATH))
            )
        except Exception:
            st.error("Error: Listings did not load properly")
            return pd.DataFrame()

        _scroll_until_stable(driver, pause_seconds=5, max_scrolls=max_scrolls)

        listings = driver.find_elements(By.XPATH, _LISTINGS_CONTAINER_XPATH + "/div")
        return pd.DataFrame([_extract_listing(listing) for listing in listings])
    finally:
        # Always release the browser, including when navigation, scrolling
        # or element lookup raises.
        driver.quit()

# Streamlit entry point: ask for a ZIP code and show the scraped listings.
st.title("Redfin House Listings Scraper")
zipcode = st.text_input("Enter ZIP code:")

if st.button("Scrape Data"):
    # Validate before starting a browser session: whitespace-only or
    # non-numeric input would otherwise cost a full page-load timeout.
    zipcode = zipcode.strip()
    if len(zipcode) == 5 and zipcode.isascii() and zipcode.isdigit():
        with st.spinner("Scraping data, please wait..."):
            df = scrape_redfin(zipcode)
            if df.empty:
                st.warning("No houses found for the given ZIP code.")
            else:
                st.success("Scraping complete! Here are the available houses:")
                st.dataframe(df)
    else:
        st.error("Please enter a valid ZIP code.")














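## Earlier working version, kept for reference: same scraper, but it obtains chromedriver via webdriver_manager and does not install Chromium itself.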

# import streamlit as st
# import pandas as pd
# import time
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from webdriver_manager.chrome import ChromeDriverManager

# def scrape_redfin(zipcode):
#     options = Options()
#     options.add_argument("--headless")
#     options.add_argument("--incognito")
#     options.add_argument("--disable-blink-features=AutomationControlled")
#     options.add_argument("start-maximized")
#     options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    
#     driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
#     url = f"https://www.redfin.com/zipcode/{zipcode}"
#     driver.get(url)
    
#     try:
#         listings_container = WebDriverWait(driver, 60).until(
#             EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div"))
#         )
#     except Exception as e:
#         st.error("Error: Listings did not load properly")
#         driver.quit()
#         return pd.DataFrame()
    
#     scroll_pause_time = 5  
#     screen_height = driver.execute_script("return window.innerHeight;")
#     last_height = driver.execute_script("return document.body.scrollHeight")
    
#     while True:
#         driver.execute_script("window.scrollBy(0, arguments[0]);", screen_height // 2)
#         time.sleep(scroll_pause_time)
#         new_height = driver.execute_script("return document.body.scrollHeight")
#         if new_height == last_height:
#             break
#         last_height = new_height
    
#     houses = []
#     listings = driver.find_elements(By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div/div")
    
#     for listing in listings:
#         try:
#             price = listing.find_element(By.XPATH, ".//div/div/div[2]/div[1]/div[1]/span").text
#         except:
#             price = "N/A"
        
#         try:
#             address = listing.find_element(By.XPATH, ".//div/div/div[2]/div[3]").text
#         except:
#             address = "N/A"
        
#         try:
#             size = listing.find_element(By.XPATH, ".//div/div/div[2]/div[4]/div").text
#         except:
#             size = "N/A"
        
#         try:
#             link = listing.find_element(By.TAG_NAME, "a").get_attribute("href")
#         except:
#             link = "N/A"
        
#         houses.append({"Price": price, "Address": address, "Size": size, "Link": link})
    
#     driver.quit()
#     return pd.DataFrame(houses)

# st.title("Redfin House Listings Scraper")
# zipcode = st.text_input("Enter ZIP code:")

# if st.button("Scrape Data"):
#     if zipcode:
#         with st.spinner("Scraping data, please wait..."):
#             df = scrape_redfin(zipcode)
#             if not df.empty:
#                 st.success("Scraping complete! Here are the available houses:")
#                 st.dataframe(df)
#             else:
#                 st.warning("No houses found for the given ZIP code.")
#     else:
#         st.error("Please enter a valid ZIP code.")