from flask import Flask, request, jsonify
import requests
from bs4 import BeautifulSoup

app = Flask(__name__)

# Default configuration
DEFAULT_TIMEOUT = 10
MAX_CONTENT_LENGTH = 5000

@app.route('/')
def home():
    return jsonify({
        "message": "Welcome to the Web Scraping API",
        "endpoints": {
            "/scrape": {
                "method": "POST",
                "description": "Scrape web content based on provided parameters",
                "parameters": {
                    "url": "Target URL to scrape (required)",
                    "query": "Search query to filter content (optional)",
                    "element": "HTML element to target (optional, e.g., 'p', 'h1')",
                    "class_name": "CSS class to filter elements (optional)",
                    "max_length": "Maximum characters to return (optional)"
                }
            }
        }
    })
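
# Example /scrape request body built from the parameters documented above
# (a sketch only -- the target URL, query, and class name are placeholders):
#
#   {
#       "url": "https://example.com/articles",
#       "query": "python",
#       "element": "p",
#       "class_name": "article-body",
#       "max_length": 500
#   }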

@app.route('/scrape', methods=['POST'])
def scrape_website():
    try:
        # Get JSON data from request
        data = request.get_json()
        
        # Validate required parameters
        if not data or 'url' not in data:
            return jsonify({
                "error": "Missing required parameter: url"
            }), 400

        url = data.get('url')
        query = data.get('query', '')
        element = data.get('element', 'p')  # Default to paragraphs
        class_name = data.get('class_name')
        max_length = int(data.get('max_length', MAX_CONTENT_LENGTH))  # Coerce to int so slicing below cannot fail on string input

        # Make HTTP request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=DEFAULT_TIMEOUT)
        response.raise_for_status()

        # Parse HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find elements based on parameters
        if class_name:
            elements = soup.find_all(element, class_=class_name)
        else:
            elements = soup.find_all(element)

        # Extract and process content
        content = []
        for elem in elements:
            text = elem.get_text(strip=True)
            if text and (not query or query.lower() in text.lower()):
                content.append({
                    "text": text[:max_length],
                    "tag": elem.name,
                    "classes": elem.get('class', []),
                    "url": url
                })

        # If no content found, try to get some basic page info
        if not content:
            title = soup.title.string if soup.title and soup.title.string else "No title"
            content.append({
                "text": title[:max_length],
                "tag": "title",
                "classes": [],
                "url": url
            })

        return jsonify({
            "status": "success",
            "results": content[:10],  # Limit to 10 results
            "count": len(content),
            "url": url,
            "query": query
        })

    except requests.exceptions.RequestException as e:
        return jsonify({
            "error": f"Failed to fetch URL: {str(e)}"
        }), 500
    except Exception as e:
        return jsonify({
            "error": f"An error occurred: {str(e)}"
        }), 500

# For Hugging Face Spaces
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
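
# Example client call (a sketch; assumes the app is reachable on localhost:7860
# after starting it -- substitute your Space's URL once deployed):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/scrape",
#       json={"url": "https://example.com", "element": "h2", "query": "news"},
#       timeout=15,
#   )
#   print(resp.json())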