"""Gradio app that fetches a web page and converts it to Markdown."""

import re
import time
from urllib.parse import urljoin, urlparse

import gradio as gr
import html2text
import requests
from bs4 import BeautifulSoup
from readability import Document


class URLToMarkdownConverter:
    """Fetch a web page over HTTP(S) and convert its HTML to Markdown."""

    # Schemes the fetcher is meant to handle; anything else is rejected early.
    _ALLOWED_SCHEMES = ("http", "https")

    # Three or more line breaks separated only by whitespace -> one blank line.
    _EXCESS_BLANK_LINES = re.compile(r'\n\s*\n\s*\n')

    def __init__(self):
        # One shared session so the browser-like User-Agent is sent on every
        # request.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def is_valid_url(self, url):
        """Return True if *url* is an absolute http(s) URL with a host.

        Non-string input and strings that ``urlparse`` rejects yield False
        rather than raising.
        """
        if not isinstance(url, str):
            return False
        try:
            parsed = urlparse(url.strip())
        except ValueError:
            # urlparse raises ValueError for malformed input such as an
            # unbalanced IPv6 bracket in the netloc.
            return False
        return parsed.scheme in self._ALLOWED_SCHEMES and bool(parsed.netloc)

    def fetch_webpage(self, url, timeout=10):
        """Fetch *url* and return ``(html_text, status_code)``.

        Args:
            url: Absolute URL to request.
            timeout: Timeout in seconds passed to ``requests``.

        Raises:
            RuntimeError: If the request fails or the server answers with an
                HTTP error status. The original ``requests`` exception is
                chained as ``__cause__``.
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise RuntimeError(f"Error fetching URL: {str(e)}") from e

        # When a text/* response declares no charset, requests falls back to
        # ISO-8859-1, which garbles UTF-8 pages. Use the detected encoding
        # in that case instead.
        content_type = response.headers.get('Content-Type', '')
        if 'charset' not in content_type.lower():
            response.encoding = response.apparent_encoding
        return response.text, response.status_code

    def extract_title(self, html_content):
        """Return the stripped text of the ``<title>`` tag, or "" if absent."""
        soup = BeautifulSoup(html_content, 'html.parser')
        title_tag = soup.find('title')
        if title_tag is None:
            return ""
        return title_tag.get_text().strip()

    def improve_readability(self, html_content):
        """Return the main-content HTML extracted by readability.

        Falls back to the unmodified *html_content* when extraction fails, so
        conversion can still proceed on pages readability cannot parse.
        """
        try:
            # summary() is the cleaned-up article extraction; content() would
            # return the full document body with all clutter intact.
            return Document(html_content).summary()
        except Exception:
            # Deliberate best-effort: any parser failure degrades to
            # converting the raw page instead of aborting the request.
            return html_content

    def convert_to_markdown(self, html_content, ignore_links=False):
        """Convert *html_content* to Markdown.

        Args:
            html_content: HTML source to convert.
            ignore_links: If True, hyperlinks are dropped by html2text.

        Returns:
            The Markdown text with runs of blank lines collapsed and
            leading/trailing whitespace removed.
        """
        h = html2text.HTML2Text()
        h.ignore_images = False
        h.ignore_links = ignore_links
        h.body_width = 0  # Don't wrap lines
        h.unicode_snob = True
        h.bypass_tables = False

        markdown = h.handle(html_content)

        # Clean up excessive whitespace
        markdown = self._EXCESS_BLANK_LINES.sub('\n\n', markdown)
        return markdown.strip()

    def process_url(self, url, include_title=True, ignore_links=False, improve_readability=True):
        """Fetch *url* and return ``(markdown, title)``.

        Never raises: validation problems and processing errors are reported
        as a human-readable message in the first element, with an empty
        title in the second.

        Args:
            url: Page URL; surrounding whitespace is ignored.
            include_title: Prepend the page title as an H1 heading.
            ignore_links: Drop hyperlinks from the Markdown output.
            improve_readability: Run main-content extraction before
                converting.
        """
        # Tolerate pasted URLs with stray whitespace, and None from the UI.
        url = (url or "").strip() if isinstance(url, str) or url is None else url

        if not url:
            return "Please enter a URL", ""

        if not self.is_valid_url(url):
            return "Please enter a valid URL", ""

        try:
            html_content, _status_code = self.fetch_webpage(url)

            # The title must be read before readability rewrites the HTML.
            title = self.extract_title(html_content)

            if improve_readability:
                html_content = self.improve_readability(html_content)

            markdown = self.convert_to_markdown(html_content, ignore_links)

            if include_title and title:
                markdown = f"# {title}\n\n{markdown}"

            return markdown, title

        except Exception as e:
            # Top-level boundary for the UI: surface the failure as text.
            return f"Error processing URL: {str(e)}", ""


# Initialize converter
converter = URLToMarkdownConverter()


def convert_url_to_markdown(url, include_title, ignore_links, improve_readability):
    """Gradio callback: return ``(markdown, title)`` for the given options."""
    markdown, title = converter.process_url(
        url=url,
        include_title=include_title,
        ignore_links=ignore_links,
        improve_readability=improve_readability
    )
    return markdown, title


# Create Gradio interface
with gr.Blocks(title="URL to Markdown Converter", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🔗 URL to Markdown Converter

    Convert any webpage to clean, readable Markdown format. Perfect for documentation, note-taking, and content archival.

    ## How to use:
    1. Enter a URL in the text box below
    2. Configure your options
    3. Click "Convert to Markdown"
    4. Copy the generated Markdown from the output box
    """)

    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="URL",
                placeholder="https://example.com",
                lines=1,
                info="Enter the URL of the webpage you want to convert"
            )

            with gr.Row():
                include_title = gr.Checkbox(
                    label="Include Title",
                    value=True,
                    info="Add the page title as a heading"
                )
                ignore_links = gr.Checkbox(
                    label="Ignore Links",
                    value=False,
                    info="Remove all hyperlinks from output"
                )
                improve_readability = gr.Checkbox(
                    label="Improve Readability",
                    value=True,
                    info="Extract main content and remove clutter"
                )

            convert_btn = gr.Button("Convert to Markdown", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("""
            ### Options Explained:

            **Include Title**: Adds the webpage's title as an H1 heading at the top of the markdown.

            **Ignore Links**: Removes all hyperlinks, keeping only the link text.

            **Improve Readability**: Uses Mozilla's Readability algorithm to extract the main content and remove navigation, ads, and other clutter.
            """)

    with gr.Row():
        extracted_title = gr.Textbox(
            label="Extracted Title",
            interactive=False,
            lines=1,
            info="The title extracted from the webpage"
        )

    markdown_output = gr.Textbox(
        label="Markdown Output",
        lines=20,
        max_lines=50,
        show_copy_button=True,
        info="The converted Markdown content"
    )

    # The button, the Enter key and the examples all drive the same callback
    # with the same components, so the wiring is declared once.
    conversion_inputs = [url_input, include_title, ignore_links, improve_readability]
    conversion_outputs = [markdown_output, extracted_title]

    # Event handlers
    convert_btn.click(
        fn=convert_url_to_markdown,
        inputs=conversion_inputs,
        outputs=conversion_outputs
    )

    # Allow Enter key to trigger conversion
    url_input.submit(
        fn=convert_url_to_markdown,
        inputs=conversion_inputs,
        outputs=conversion_outputs
    )

    # Examples
    gr.Examples(
        examples=[
            ["https://www.mozilla.org/en-US/firefox/", True, False, True],
            ["https://github.com/python/cpython", True, False, True],
            ["https://docs.python.org/3/tutorial/", False, True, True],
        ],
        inputs=conversion_inputs,
        outputs=conversion_outputs,
        fn=convert_url_to_markdown,
        cache_examples=False
    )

    gr.Markdown("""
    ---
    ### Tips:
    - The converter works best with article-style content
    - Some websites may block automated requests
    - Large pages may take a few seconds to process
    - For best results, keep "Improve Readability" enabled

    ### Supported Sites:
    Most standard websites work well. Some sites with heavy JavaScript or anti-bot measures may not work properly.

    ---
    ### Credits:
    This Gradio app was inspired by and is a rewrite of [macsplit/urltomarkdown](https://github.com/macsplit/urltomarkdown).
    """)

# Launch the app
if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )