NihalGazi commited on
Commit
841ef85
·
verified ·
1 Parent(s): 0c4b272

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -0
app.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup # For pretty-printing HTML
4
+
5
# --- Function to extract HTML ---
def _resolve_scheme(bare_url: str) -> str:
    """Pick ``https://`` or ``http://`` for a scheme-less URL.

    Sends lightweight HEAD probes (HTTPS first, then HTTP) with a short
    timeout. If neither probe succeeds, defaults to HTTPS so the main GET
    produces a more useful error for the user.

    Args:
        bare_url: URL text without a scheme prefix.

    Returns:
        The input URL with an ``http://`` or ``https://`` scheme prepended.
    """
    url_https = "https://" + bare_url
    url_http = "http://" + bare_url
    # HEAD is lighter than GET for a reachability probe.
    probe_headers = {'User-Agent': 'HuggingFaceSpaceHTMLSchemeChecker/1.0'}
    try:
        print(f"No scheme provided for '{bare_url}', trying to determine scheme (HTTPS first)...")
        head_https = requests.head(url_https, timeout=5, allow_redirects=True, headers=probe_headers)
        if head_https.status_code < 400:
            print(f"HTTPS seems responsive for '{bare_url}'. Proceeding with {url_https}")
            return url_https
        # HTTPS gave an error/non-success status; fall back to plain HTTP.
        print(f"HTTPS check for '{bare_url}' returned {head_https.status_code}. Trying HTTP.")
        head_http = requests.head(url_http, timeout=5, allow_redirects=True, headers=probe_headers)
        if head_http.status_code < 400:
            print(f"HTTP seems responsive for '{bare_url}'. Proceeding with {url_http}")
            return url_http
        print(f"HTTP check for '{bare_url}' also returned {head_http.status_code}. Defaulting to HTTPS for the main fetch attempt.")
    except requests.RequestException as e:
        print(f"Error during scheme probing for '{bare_url}': {e}. Defaulting to HTTPS for the main fetch attempt: {url_https}")
    return url_https


def get_html_content(url: str):
    """Fetch a web page and return its prettified HTML source.

    Does NOT execute JavaScript; returns the HTML exactly as served,
    pretty-printed by BeautifulSoup.

    Args:
        url: Website address, with or without an ``http(s)://`` scheme.

    Returns:
        A 2-tuple ``(html_or_error_text, status_message)`` of strings,
        matching the two Gradio output textboxes.
    """
    # FIX: strip surrounding whitespace so a pasted URL with a trailing
    # space/newline does not fail the request.
    url = (url or "").strip()
    if not url:
        return "Please enter a URL.", "Status: No URL provided."

    original_url_for_error = url  # Keep original input for error messages

    # requests refuses scheme-less URLs, so determine one by probing.
    if not url.startswith(("http://", "https://")):
        url = _resolve_scheme(url)

    status_message = f"Attempting to fetch HTML from: {url}"
    print(status_message)

    try:
        # Browser-like User-Agent: some sites block default Python/requests UAs.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 HuggingFaceSpaceHTMLScraper/1.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br'  # Requests handles decompression
        }

        response = requests.get(url, headers=headers, timeout=20)  # 20 seconds timeout

        # Raises HTTPError for 4xx/5xx responses.
        response.raise_for_status()

        # response.content (bytes) lets BeautifulSoup handle encoding
        # detection better than response.text.
        soup = BeautifulSoup(response.content, 'html.parser')
        pretty_html = soup.prettify()

        status_message = f"Successfully fetched and parsed HTML from {url} (Status: {response.status_code})."
        print(status_message)
        return pretty_html, status_message

    except requests.exceptions.HTTPError as e:
        error_detail = f"HTTP Error: {e.response.status_code} for URL: {url}."
        if e.response.text:  # Include a preview of the body when available
            error_detail += f" Response preview: {e.response.text[:200]}"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    # FIX: Timeout is checked BEFORE ConnectionError. ConnectTimeout derives
    # from both, so with the old order a connect timeout was misreported as
    # a generic connection error.
    except requests.exceptions.Timeout as e:
        error_detail = f"Timeout Error: The request to {url} timed out. The server might be too slow or unreachable. (Details: {e})"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    except requests.exceptions.ConnectionError as e:
        error_detail = f"Connection Error: Could not connect to {url}. The server may be down or the domain name incorrect. (Details: {e})"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    except requests.exceptions.RequestException as e:  # Any other requests-related error
        error_detail = f"Request Error: An error occurred while trying to fetch {url}. (Details: {e})"
        print(error_detail)
        return f"Error fetching HTML: {error_detail}", error_detail
    except Exception as e:
        # Unexpected errors, including potential BeautifulSoup parsing issues.
        error_detail = f"An unexpected error occurred during processing: {str(e)}"
        print(error_detail)
        return f"Error processing HTML: {error_detail}", error_detail
92
+
93
# --- Gradio Interface ---
# Wires get_html_content to a simple two-output UI: the prettified HTML and
# a human-readable status line.
iface = gr.Interface(
    fn=get_html_content,
    inputs=gr.Textbox(
        label="Website URL",
        placeholder="e.g., https://www.example.com or example.com"
    ),
    outputs=[
        # Large read-only box for the fetched HTML, with a copy button.
        gr.Textbox(label="Extracted HTML Code", lines=25, show_copy_button=True, interactive=False),
        gr.Textbox(label="Status", interactive=False)
    ],
    title="HTML Content Extractor 🌐",
    description=(
        "Enter a website URL to extract its raw HTML content. "
        "The tool fetches the HTML as served by the server and uses BeautifulSoup to prettify it. "
        "It will **not** execute JavaScript. For websites that build their content dynamically with JavaScript, "
        "the extracted HTML will be the initial source before JavaScript execution. "
        "Please be respectful of website terms of service and robots.txt."
    ),
    # Clickable examples; scheme-less entries exercise the scheme-probing path.
    examples=[
        ["https://gradio.app"],
        ["httpbin.org/html"],
        ["example.com"]
    ],
    # NOTE(review): allow_flagging was renamed to flagging_mode in Gradio 4.x
    # and emits a deprecation warning there — confirm the installed version.
    allow_flagging="never",
    css=".gradio-container {max-width: 1024px !important; margin: auto !important;}"  # Optional: for better layout
)
120
+
121
# --- Main launch ---
# Script entry point: start the Gradio server with default settings
# (blocks until the server is stopped).
if __name__ == "__main__":
    print("Starting Gradio HTML Extractor application...")
    iface.launch()
125
+