# Hugging Face Spaces app (Space status banner residue: "Spaces: Sleeping")
from flask import Flask, request, jsonify
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

app = Flask(__name__)

# Default configuration
DEFAULT_TIMEOUT = 10         # seconds to wait for the upstream HTTP request
MAX_CONTENT_LENGTH = 5000    # default cap on characters returned per result
@app.route('/')
def home():
    """Root endpoint: return a JSON self-description of the API.

    Lists the available endpoints and the parameters accepted by /scrape
    so that clients can discover the API without external documentation.
    """
    return jsonify({
        "message": "Welcome to the Web Scraping API",
        "endpoints": {
            "/scrape": {
                "method": "POST",
                "description": "Scrape web content based on provided parameters",
                "parameters": {
                    "url": "Target URL to scrape (required)",
                    "query": "Search query to filter content (optional)",
                    "element": "HTML element to target (optional, e.g., 'p', 'h1')",
                    "class_name": "CSS class to filter elements (optional)",
                    "max_length": "Maximum characters to return (optional)"
                }
            }
        }
    })
@app.route('/scrape', methods=['POST'])
def scrape_website():
    """POST endpoint: fetch a URL, extract matching elements, return them as JSON.

    Expected JSON body:
        url        -- target URL to scrape (required)
        query      -- case-insensitive substring filter on element text (optional)
        element    -- HTML tag to collect, defaults to 'p' (optional)
        class_name -- restrict matches to this CSS class (optional)
        max_length -- per-result character cap, defaults to MAX_CONTENT_LENGTH (optional)

    Returns 400 when 'url' is missing, 500 on fetch or processing failure,
    otherwise 200 with up to 10 extracted results.
    """
    try:
        # Get JSON data from request
        data = request.get_json()

        # Validate required parameters
        if not data or 'url' not in data:
            return jsonify({
                "error": "Missing required parameter: url"
            }), 400

        url = data.get('url')
        query = data.get('query', '')
        element = data.get('element', 'p')  # Default to paragraphs
        class_name = data.get('class_name')
        max_length = data.get('max_length', MAX_CONTENT_LENGTH)

        # Make HTTP request; a browser-like User-Agent avoids trivial bot blocks.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=DEFAULT_TIMEOUT)
        response.raise_for_status()

        # Parse HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find elements based on parameters
        if class_name:
            elements = soup.find_all(element, class_=class_name)
        else:
            elements = soup.find_all(element)

        # Extract and process content
        content = []
        for elem in elements:
            text = elem.get_text(strip=True)
            if text and (not query or query.lower() in text.lower()):
                content.append({
                    "text": text[:max_length],
                    "tag": elem.name,
                    "classes": elem.get('class', []),
                    "url": url
                })

        # If no content found, try to get some basic page info.
        # Guard both soup.title and .string: an empty <title></title> has
        # .string == None, and slicing None would raise TypeError.
        if not content:
            title = soup.title.string if soup.title and soup.title.string else "No title"
            content.append({
                "text": title[:max_length],
                "tag": "title",
                "classes": [],
                "url": url
            })

        return jsonify({
            "status": "success",
            "results": content[:10],  # Limit to 10 results
            "count": len(content),
            "url": url,
            "query": query
        })

    except requests.exceptions.RequestException as e:
        # Network-level failures (DNS, timeout, non-2xx via raise_for_status)
        return jsonify({
            "error": f"Failed to fetch URL: {str(e)}"
        }), 500
    except Exception as e:
        # Top-level boundary: report any other processing failure as JSON
        return jsonify({
            "error": f"An error occurred: {str(e)}"
        }), 500
# For Hugging Face Spaces
if __name__ == '__main__':
    # Bind to all interfaces on 7860, the port Hugging Face Spaces exposes.
    app.run(host='0.0.0.0', port=7860)