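"""Minimal web-scraping API intended to run as a Hugging Face Space.

Exposes two endpoints: GET / returns a JSON description of the API, and
POST /scrape fetches a URL with requests, parses it with BeautifulSoup,
and returns the matching elements as JSON.
"""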
from flask import Flask, request, jsonify
import requests
from bs4 import BeautifulSoup

app = Flask(__name__)

# Default configuration
DEFAULT_TIMEOUT = 10        # seconds for outbound HTTP requests
MAX_CONTENT_LENGTH = 5000   # default cap on returned text length


@app.route('/')
def home():
    return jsonify({
        "message": "Welcome to the Web Scraping API",
        "endpoints": {
            "/scrape": {
                "method": "POST",
                "description": "Scrape web content based on provided parameters",
                "parameters": {
                    "url": "Target URL to scrape (required)",
                    "query": "Search query to filter content (optional)",
                    "element": "HTML element to target (optional, e.g., 'p', 'h1')",
                    "class_name": "CSS class to filter elements (optional)",
                    "max_length": "Maximum characters to return (optional)"
                }
            }
        }
    })


@app.route('/scrape', methods=['POST'])
def scrape_website():
    try:
        # Get JSON data from request
        data = request.get_json()

        # Validate required parameters
        if not data or 'url' not in data:
            return jsonify({
                "error": "Missing required parameter: url"
            }), 400

        url = data.get('url')
        query = data.get('query', '')
        element = data.get('element', 'p')  # Default to paragraphs
        class_name = data.get('class_name')
        max_length = data.get('max_length', MAX_CONTENT_LENGTH)

        # Make HTTP request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=DEFAULT_TIMEOUT)
        response.raise_for_status()

        # Parse HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find elements based on parameters
        if class_name:
            elements = soup.find_all(element, class_=class_name)
        else:
            elements = soup.find_all(element)

        # Extract and process content
        content = []
        for elem in elements:
            text = elem.get_text(strip=True)
            if text and (not query or query.lower() in text.lower()):
                content.append({
                    "text": text[:max_length],
                    "tag": elem.name,
                    "classes": elem.get('class', []),
                    "url": url
                })

        # If no content found, try to get some basic page info.
        # soup.title.string can be None even when a <title> tag exists,
        # so guard both cases before slicing.
        if not content:
            title = soup.title.string if soup.title and soup.title.string else "No title"
            content.append({
                "text": title[:max_length],
                "tag": "title",
                "classes": [],
                "url": url
            })

        return jsonify({
            "status": "success",
            "results": content[:10],  # Limit to 10 results
            "count": len(content),
            "url": url,
            "query": query
        })

    except requests.exceptions.RequestException as e:
        return jsonify({
            "error": f"Failed to fetch URL: {str(e)}"
        }), 500
    except Exception as e:
        return jsonify({
            "error": f"An error occurred: {str(e)}"
        }), 500


# For Hugging Face Spaces
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
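

# Example usage (a sketch, not part of the app): with the server running
# locally on port 7860, a request like the one below exercises /scrape.
# The target URL and parameter values are illustrative assumptions.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/scrape",
#       json={
#           "url": "https://example.com",   # required
#           "element": "p",                 # optional, defaults to 'p'
#           "query": "",                    # optional substring filter
#           "max_length": 500,              # optional per-item text cap
#       },
#       timeout=30,
#   )
#   # Response shape: {"status": ..., "results": [...], "count": ..., "url": ..., "query": ...}
#   print(resp.json())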