"""Flask web app that searches Craigslist listings in one city or across many."""

from flask import Flask, request, render_template
import requests
from bs4 import BeautifulSoup
import urllib.parse
import logging
import re
from typing import List, Dict
import time
from datetime import datetime  # Import datetime for current year

# Configure logging to both a rotating-style file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('craigslist_search.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# List of Craigslist city subdomains (partial list for brevity; expand as needed).
# NOTE: the "all cities" option is a sentinel string "all" handled separately in
# search_craigslist()/index() and is deliberately NOT an entry in this list.
CRAIGSLIST_CITIES = [
    "newyork", "losangeles", "chicago", "houston", "phoenix", "philadelphia",
    "sanantonio", "sandiego", "dallas", "sanjose", "austin", "jacksonville",
    "sanfrancisco", "columbus", "seattle", "denver", "boston", "miami", "atlanta"
]


def search_craigslist(query: str, city: str) -> List[Dict]:
    """
    Search Craigslist for a query in a specific city or all cities.

    Args:
        query: Raw user search text; it is stripped and URL-encoded here.
        city: A Craigslist subdomain from CRAIGSLIST_CITIES, or the sentinel "all".

    Returns:
        A list of dicts with title, link, price, location, and city.
        Returns an empty list on any unexpected error (errors are logged).
    """
    start_time = time.time()
    logger.info(f"Starting search for query: '{query}' in city: '{city}'")
    posts = []
    query = urllib.parse.quote(query.strip())  # URL-encode the query
    try:
        if city == "all":
            # Search across every known city.
            # BUG FIX: the original iterated CRAIGSLIST_CITIES[1:] to "skip 'all'",
            # but "all" is not an element of the list, so the first real city
            # ("newyork") was silently skipped on every all-city search.
            for city_name in CRAIGSLIST_CITIES:
                url = f"https://{city_name}.craigslist.org/search/sss?query={query}"
                logger.debug(f"Fetching URL: {url}")
                html_content = fetch_html_with_retry(url)
                posts.extend(parse_html(html_content, city_name))
                time.sleep(1)  # Add delay to avoid rate limiting
        else:
            # Search in a single specific city.
            url = f"https://{city}.craigslist.org/search/sss?query={query}"
            logger.debug(f"Fetching URL: {url}")
            html_content = fetch_html_with_retry(url)
            posts.extend(parse_html(html_content, city))
        logger.info(
            f"Search completed in {time.time() - start_time:.2f} seconds. "
            f"Found {len(posts)} posts"
        )
        return posts
    except Exception as e:
        logger.error(f"Error during search: {str(e)}")
        return []


def fetch_html_with_retry(url: str, retries: int = 3, delay: int = 2) -> str:
    """
    Fetch HTML content with a retry mechanism to handle transient network issues.

    Args:
        url: Fully-qualified URL to fetch.
        retries: Maximum number of attempts before giving up.
        delay: Seconds to sleep between attempts.

    Returns:
        The response body as text, or "" after all attempts fail.
    """
    # A desktop browser User-Agent reduces the chance of being blocked.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    for attempt in range(retries):
        try:
            logger.debug(f"Fetching URL (attempt {attempt + 1}): {url}")
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logger.error(f"Network error on attempt {attempt + 1} for {url}: {str(e)}")
        except Exception as e:
            logger.error(f"Unexpected error on attempt {attempt + 1} for {url}: {str(e)}")
        # Only sleep if another attempt remains.
        if attempt < retries - 1:
            logger.info(f"Retrying after {delay} seconds...")
            time.sleep(delay)
    logger.error(f"Failed to fetch HTML after {retries} attempts: {url}")
    return ""  # Return empty string on failure


def parse_html(html_content: str, city: str) -> List[Dict]:
    """
    Parse Craigslist search-result HTML and extract post information.

    Args:
        html_content: Raw HTML of a search results page (may be "").
        city: City subdomain the page came from; echoed into each post dict.

    Returns:
        A list of dicts (title, link, price, location, city). Malformed result
        entries are skipped with a warning; a top-level parse error returns [].
    """
    posts = []
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        # Find all search result items (static, non-JS markup).
        results = soup.find_all("li", class_="cl-static-search-result")
        for index, result in enumerate(results):
            try:
                # Extract title
                title_elem = result.find("div", class_="title")
                title = title_elem.get_text(strip=True) if title_elem else "No title"
                # Extract link
                link_elem = result.find("a")
                link = link_elem["href"] if link_elem and "href" in link_elem.attrs else "#"
                # Extract price
                price_elem = result.find("div", class_="price")
                price = price_elem.get_text(strip=True) if price_elem else "No price"
                # Extract location
                location_elem = result.find("div", class_="location")
                location = location_elem.get_text(strip=True) if location_elem else "No location"
                post = {
                    "title": title,
                    "link": link,
                    "price": price,
                    "location": location,
                    "city": city.capitalize()
                }
                posts.append(post)
            except Exception as e:
                # One bad <li> should not abort the whole page.
                logger.warning(f"Skipping invalid entry {index} in HTML for city {city}: {str(e)}")
                continue
        logger.debug(f"Parsed {len(posts)} valid posts for city: {city}")
        return posts
    except Exception as e:
        logger.error(f"Error parsing HTML for city {city}: {str(e)}")
        return []


@app.route("/", methods=["GET", "POST"])
def index():
    """Render the search form (GET) or run a search and show results (POST)."""
    posts = []
    query = ""
    selected_city = "all"
    current_year = datetime.now().year  # Get current year (for page footer)
    try:
        if request.method == "POST":
            query = request.form.get("query", "").strip()
            selected_city = request.form.get("city", "all")
            # Validate inputs: an empty query gets an error message, not a search.
            if not query:
                logger.warning("Empty query received")
                return render_template(
                    "index.html",
                    posts=[],
                    query="",
                    cities=CRAIGSLIST_CITIES,
                    selected_city=selected_city,
                    error="Please enter a search query",
                    current_year=current_year
                )
            # BUG FIX: "all" is a valid sentinel but is not in CRAIGSLIST_CITIES,
            # so the original logged a spurious "Invalid city" warning for every
            # default (all-cities) search. Exempt it explicitly.
            if selected_city != "all" and selected_city not in CRAIGSLIST_CITIES:
                logger.warning(f"Invalid city selected: {selected_city}")
                selected_city = "all"
            logger.info(f"Processing POST request: query='{query}', city='{selected_city}'")
            posts = search_craigslist(query, selected_city)
        return render_template(
            "index.html",
            posts=posts,
            query=query,
            cities=CRAIGSLIST_CITIES,
            selected_city=selected_city,
            current_year=current_year
        )
    except Exception as e:
        logger.error(f"Error in index route: {str(e)}")
        return render_template(
            "index.html",
            posts=[],
            query=query,
            cities=CRAIGSLIST_CITIES,
            selected_city=selected_city,
            error="An error occurred while processing your request",
            current_year=current_year
        )


if __name__ == "__main__":
    logger.info("Starting Flask application")
    # SECURITY NOTE: debug=True enables the Werkzeug interactive debugger and
    # must never be used when binding to 0.0.0.0 in production — it allows
    # arbitrary code execution. Keep only for local development.
    app.run(host="0.0.0.0", port=7860, debug=True)