"""Flask web-scraping API.

Endpoints:
  GET  /        -- self-describing index of the API
  POST /scrape  -- fetch a URL and return text extracted from matching
                   HTML elements as JSON
"""

from flask import Flask, request, jsonify
import requests
from bs4 import BeautifulSoup
import re  # noqa: F401  (kept: present in original module interface)
from urllib.parse import urljoin, urlparse  # noqa: F401 (urljoin kept as-is)

app = Flask(__name__)

# Default configuration
DEFAULT_TIMEOUT = 10        # seconds allowed for the outbound HTTP request
MAX_CONTENT_LENGTH = 5000   # default per-result character cap
MAX_RESULTS = 10            # maximum number of results returned per call


@app.route('/')
def home():
    """Describe the API so callers can discover the /scrape contract."""
    return jsonify({
        "message": "Welcome to the Web Scraping API",
        "endpoints": {
            "/scrape": {
                "method": "POST",
                "description": "Scrape web content based on provided parameters",
                "parameters": {
                    "url": "Target URL to scrape (required)",
                    "query": "Search query to filter content (optional)",
                    "element": "HTML element to target (optional, e.g., 'p', 'h1')",
                    "class_name": "CSS class to filter elements (optional)",
                    "max_length": "Maximum characters to return (optional)"
                }
            }
        }
    })


@app.route('/scrape', methods=['POST'])
def scrape_website():
    """Fetch a caller-supplied URL and extract text from matching elements.

    Expects a JSON body with at least ``url``; optional keys are ``query``
    (case-insensitive substring filter), ``element`` (tag name, default
    'p'), ``class_name`` (CSS class filter) and ``max_length`` (per-result
    character cap). Returns JSON with up to MAX_RESULTS results, or a JSON
    error payload with status 400/500.
    """
    try:
        data = request.get_json()

        # Validate required parameters.
        if not data or 'url' not in data:
            return jsonify({
                "error": "Missing required parameter: url"
            }), 400

        url = data.get('url')
        query = data.get('query', '')
        element = data.get('element', 'p')  # default to paragraphs
        class_name = data.get('class_name')

        # Coerce max_length to a sane positive int: it comes straight from
        # client JSON, and slicing with a non-int (or capping with a
        # non-positive value) would break the response.
        try:
            max_length = int(data.get('max_length', MAX_CONTENT_LENGTH))
        except (TypeError, ValueError):
            max_length = MAX_CONTENT_LENGTH
        if max_length <= 0:
            max_length = MAX_CONTENT_LENGTH

        # NOTE(security): the URL is attacker-controlled, so this endpoint
        # is an SSRF vector (it can be pointed at internal services).
        # Restricting to http/https at least blocks file:// and other
        # schemes; deployments on trusted networks should additionally
        # deny private address ranges.
        scheme = urlparse(url).scheme.lower() if isinstance(url, str) else ''
        if scheme not in ('http', 'https'):
            return jsonify({
                "error": "URL must use http or https"
            }), 400

        # Make HTTP request with a browser-like User-Agent, since many
        # sites reject the default requests UA.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=DEFAULT_TIMEOUT)
        response.raise_for_status()

        # Parse HTML content.
        soup = BeautifulSoup(response.text, 'html.parser')

        # Only pass class_ when supplied: find_all(element, class_=None)
        # does not behave the same as omitting the argument.
        if class_name:
            elements = soup.find_all(element, class_=class_name)
        else:
            elements = soup.find_all(element)

        # Extract non-empty text matching the (case-insensitive) query.
        content = []
        for elem in elements:
            text = elem.get_text(strip=True)
            if text and (not query or query.lower() in text.lower()):
                content.append({
                    "text": text[:max_length],
                    "tag": elem.name,
                    "classes": elem.get('class', []),
                    "url": url
                })

        # If nothing matched, fall back to the page title so the caller
        # always receives at least one result.
        if not content:
            # BUGFIX: soup.title.string is None for an empty <title> (or one
            # with nested markup), which previously made title[:max_length]
            # raise TypeError; guard both the tag and its string.
            title = (soup.title.string
                     if soup.title and soup.title.string
                     else "No title")
            content.append({
                "text": title[:max_length],
                "tag": "title",
                "classes": [],
                "url": url
            })

        return jsonify({
            "status": "success",
            "results": content[:MAX_RESULTS],  # limit payload size
            "count": len(content),
            "url": url,
            "query": query
        })

    except requests.exceptions.RequestException as e:
        # Network/HTTP failures: DNS errors, timeouts, non-2xx responses.
        return jsonify({
            "error": f"Failed to fetch URL: {str(e)}"
        }), 500
    except Exception as e:
        # Catch-all boundary so the API always answers with JSON.
        return jsonify({
            "error": f"An error occurred: {str(e)}"
        }), 500


# For Hugging Face Spaces
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)