Spaces:
Running
Running
| import gradio as gr | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import html2text | |
| from readability import Document | |
| import re | |
| from urllib.parse import urljoin, urlparse | |
| import time | |
class URLToMarkdownConverter:
    """Fetch a web page over HTTP(S) and convert its HTML to Markdown.

    One ``requests.Session`` carrying a browser-like User-Agent is created per
    instance and reused for every fetch made through that instance.
    """

    # Schemes the HTTP session is able to fetch.
    _ALLOWED_SCHEMES = ('http', 'https')

    def __init__(self):
        """Create the HTTP session and install a desktop-browser User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def is_valid_url(self, url):
        """Return True if *url* is an absolute http(s) URL with a host part.

        Anything that cannot be parsed, has no network location, or uses a
        scheme other than http/https is rejected here so the caller gets a
        clear "invalid URL" message instead of a low-level fetch error.
        """
        try:
            result = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            # ValueError: malformed URL (e.g. unbalanced IPv6 bracket);
            # TypeError/AttributeError: *url* is not string-like.
            return False
        if not isinstance(result.scheme, str):
            # bytes input parses to bytes components; only text URLs are supported.
            return False
        return result.scheme.lower() in self._ALLOWED_SCHEMES and bool(result.netloc)

    def fetch_webpage(self, url, timeout=10):
        """Download *url* and return ``(html_text, status_code)``.

        Args:
            url: Absolute URL to request.
            timeout: Seconds passed to ``requests`` as the request timeout.

        Raises:
            Exception: Wraps any ``requests`` failure (connection error,
                timeout, non-2xx status) with an "Error fetching URL" message;
                the original error is kept as ``__cause__``.
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            # When the server does not declare a charset, requests falls back
            # to a header-based default that is frequently wrong for UTF-8
            # pages; let it sniff the encoding from the body instead.
            content_type = response.headers.get('Content-Type', '')
            if 'charset' not in content_type.lower():
                response.encoding = response.apparent_encoding
            return response.text, response.status_code
        except requests.exceptions.RequestException as e:
            raise Exception(f"Error fetching URL: {str(e)}") from e

    def extract_title(self, html_content):
        """Return the stripped text of the first ``<title>`` tag, or ``""``."""
        soup = BeautifulSoup(html_content, 'html.parser')
        title_tag = soup.find('title')
        if title_tag is None:
            return ""
        return title_tag.get_text().strip()

    def improve_readability(self, html_content):
        """Return the main-article HTML extracted by readability.

        Falls back to the unmodified *html_content* if extraction fails, so
        conversion can still proceed on pages readability cannot handle.
        """
        try:
            # summary() is the article-extraction entry point; content() only
            # returns the document body without removing page clutter.
            return Document(html_content).summary()
        except Exception:
            # Deliberate best-effort: any extraction failure means "use the
            # page as-is" rather than aborting the whole conversion.
            return html_content

    def convert_to_markdown(self, html_content, ignore_links=False):
        """Convert *html_content* to Markdown text.

        Args:
            html_content: HTML source to convert.
            ignore_links: Forwarded to html2text's ``ignore_links`` option.

        Returns:
            The Markdown string with runs of blank lines collapsed and
            leading/trailing whitespace removed.
        """
        h = html2text.HTML2Text()
        h.ignore_images = False
        h.ignore_links = ignore_links
        h.body_width = 0  # Don't wrap lines
        h.unicode_snob = True
        h.bypass_tables = False

        markdown = h.handle(html_content)

        # Collapse three or more consecutive (possibly whitespace-only) line
        # breaks into a single blank line.
        markdown = re.sub(r'\n\s*\n\s*\n', '\n\n', markdown)
        return markdown.strip()

    def process_url(self, url, include_title=True, ignore_links=False, improve_readability=True):
        """Fetch *url* and return ``(markdown, title)``.

        Args:
            url: Address of the page to convert; surrounding whitespace is
                ignored.
            include_title: Prepend the page title as an H1 heading when a
                title was found.
            ignore_links: Forwarded to :meth:`convert_to_markdown`.
            improve_readability: Run readability extraction before converting.

        Returns:
            ``(markdown, title)`` on success. On any failure the first element
            is a human-readable message and the second is ``""``; this method
            does not raise.
        """
        # Pasted URLs often carry stray spaces or a trailing newline.
        if isinstance(url, str):
            url = url.strip()
        if not url:
            return "Please enter a URL", ""
        if not self.is_valid_url(url):
            return "Please enter a valid URL", ""
        try:
            html_content, _ = self.fetch_webpage(url)
            # The title is read from the full page, before readability
            # replaces the HTML with the extracted article.
            title = self.extract_title(html_content)
            if improve_readability:
                html_content = self.improve_readability(html_content)
            markdown = self.convert_to_markdown(html_content, ignore_links)
            if include_title and title:
                markdown = f"# {title}\n\n{markdown}"
            return markdown, title
        except Exception as e:
            # UI boundary: surface the failure as text instead of raising.
            return f"Error processing URL: {str(e)}", ""
# Single converter instance reused by every Gradio request
converter = URLToMarkdownConverter()


def convert_url_to_markdown(url, include_title, ignore_links, improve_readability):
    """Gradio callback: convert *url* and return ``(markdown, title)``.

    The three flags come from the UI checkboxes and are handed to
    ``converter.process_url`` unchanged.
    """
    markdown_text, page_title = converter.process_url(
        url,
        include_title,
        ignore_links,
        improve_readability,
    )
    return markdown_text, page_title
# Build the Gradio interface. Layout nesting below was reconstructed from the
# order of the calls; NOTE(review): confirm it matches the intended layout.
with gr.Blocks(title="URL to Markdown Converter", theme=gr.themes.Soft()) as app:
    # NOTE(review): the heading glyph was mis-encoded in the source; the link
    # emoji is a best guess at the original - confirm or replace.
    gr.Markdown("""
    # 🔗 URL to Markdown Converter

    Convert any webpage to clean, readable Markdown format. Perfect for documentation, note-taking, and content archival.

    ## How to use:

    1. Enter a URL in the text box below
    2. Configure your options
    3. Click "Convert to Markdown"
    4. Copy the generated Markdown from the output box
    """)

    with gr.Row():
        # Left column: URL, options and the convert button.
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="URL",
                placeholder="https://example.com",
                lines=1,
                info="Enter the URL of the webpage you want to convert"
            )
            with gr.Row():
                include_title = gr.Checkbox(
                    label="Include Title",
                    value=True,
                    info="Add the page title as a heading"
                )
                ignore_links = gr.Checkbox(
                    label="Ignore Links",
                    value=False,
                    info="Remove all hyperlinks from output"
                )
                improve_readability = gr.Checkbox(
                    label="Improve Readability",
                    value=True,
                    info="Extract main content and remove clutter"
                )
            convert_btn = gr.Button("Convert to Markdown", variant="primary", size="lg")

        # Right column: static help text. Blank lines keep each option in its
        # own paragraph instead of one run-on block.
        with gr.Column(scale=1):
            gr.Markdown("""
            ### Options Explained:

            **Include Title**: Adds the webpage's title as an H1 heading at the top of the markdown.

            **Ignore Links**: Removes all hyperlinks, keeping only the link text.

            **Improve Readability**: Uses Mozilla's Readability algorithm to extract the main content and remove navigation, ads, and other clutter.
            """)

    with gr.Row():
        extracted_title = gr.Textbox(
            label="Extracted Title",
            interactive=False,
            lines=1,
            info="The title extracted from the webpage"
        )

    markdown_output = gr.Textbox(
        label="Markdown Output",
        lines=20,
        max_lines=50,
        show_copy_button=True,
        info="The converted Markdown content"
    )

    # The button, the Enter key and the examples all run the same conversion;
    # defining the component lists once keeps the three triggers in sync.
    conversion_inputs = [url_input, include_title, ignore_links, improve_readability]
    conversion_outputs = [markdown_output, extracted_title]

    # Event handlers
    convert_btn.click(
        fn=convert_url_to_markdown,
        inputs=conversion_inputs,
        outputs=conversion_outputs
    )

    # Allow Enter key to trigger conversion
    url_input.submit(
        fn=convert_url_to_markdown,
        inputs=conversion_inputs,
        outputs=conversion_outputs
    )

    # Examples: each row is [url, include_title, ignore_links, improve_readability]
    gr.Examples(
        examples=[
            ["https://www.mozilla.org/en-US/firefox/", True, False, True],
            ["https://github.com/python/cpython", True, False, True],
            ["https://docs.python.org/3/tutorial/", False, True, True],
        ],
        inputs=conversion_inputs,
        outputs=conversion_outputs,
        fn=convert_url_to_markdown,
        cache_examples=False
    )

    # Blank lines around each `---` keep it a horizontal rule; directly under
    # a text line it would turn that line into a heading.
    gr.Markdown("""
    ---

    ### Tips:

    - The converter works best with article-style content
    - Some websites may block automated requests
    - Large pages may take a few seconds to process
    - For best results, keep "Improve Readability" enabled

    ### Supported Sites:

    Most standard websites work well. Some sites with heavy JavaScript or anti-bot measures may not work properly.

    ---

    ### Credits:

    This Gradio app was inspired by and is a rewrite of [macsplit/urltomarkdown](https://github.com/macsplit/urltomarkdown).
    """)
# Start the web server only when this file is executed as a script
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    app.launch(**launch_options)