# Spaces: nyuuzyou / urltomarkdown (Running)
# File size: 7,509 Bytes
# c3ac509

import gradio as gr
import requests
from bs4 import BeautifulSoup
import html2text
from readability import Document
import re
from urllib.parse import urljoin, urlparse
import time

class URLToMarkdownConverter:
    """Fetch a web page over HTTP(S) and convert its HTML to Markdown.

    One ``requests.Session`` is created per instance and reused for every
    fetch; a desktop-browser User-Agent header is set on that session.
    """

    # Schemes the HTTP session is able to fetch; anything else is rejected
    # up front instead of failing later inside ``requests``.
    _ALLOWED_SCHEMES = ('http', 'https')

    # Three or more line breaks (optionally separated by whitespace) are
    # collapsed to a single blank line in the generated Markdown.
    _EXCESS_BLANK_LINES = re.compile(r'\n\s*\n\s*\n')

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def is_valid_url(self, url) -> bool:
        """Return True if *url* is an absolute http/https URL with a host.

        Args:
            url: The candidate URL string.

        Returns:
            True when the URL parses, uses an allowed scheme and has a
            network location; False otherwise (including unparseable input).
        """
        try:
            result = urlparse(url)
        except (ValueError, AttributeError, TypeError):
            # ValueError: malformed URL (e.g. broken IPv6 literal);
            # AttributeError/TypeError: input that is not string-like.
            return False
        return result.scheme in self._ALLOWED_SCHEMES and bool(result.netloc)

    def fetch_webpage(self, url, timeout=10):
        """Download *url* and return its decoded body and status code.

        NOTE(review): the URL comes straight from the user and is fetched
        from the server side; if this is deployed publicly, consider
        rejecting private/internal addresses.

        Args:
            url: Absolute URL to fetch.
            timeout: Timeout in seconds passed to ``requests``.

        Returns:
            A ``(text, status_code)`` tuple.

        Raises:
            Exception: If the request fails or returns an error status; the
                original ``requests`` exception is chained as the cause.
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise Exception(f"Error fetching URL: {e}") from e

        # When the Content-Type header carries no charset, requests falls
        # back to ISO-8859-1 for text responses, which garbles pages that
        # declare their encoding only inside the HTML. Detect it from the
        # body instead.
        content_type = response.headers.get('Content-Type', '')
        if 'charset' not in content_type.lower():
            response.encoding = response.apparent_encoding

        return response.text, response.status_code

    def extract_title(self, html_content) -> str:
        """Return the text of the first ``<title>`` tag, or "" if absent.

        Runs of whitespace (including line breaks) inside the title are
        collapsed to single spaces so the result fits on one heading line.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        title_tag = soup.find('title')
        if title_tag is None:
            return ""
        return " ".join(title_tag.get_text().split())

    def improve_readability(self, html_content):
        """Return the main-article HTML extracted by readability.

        Falls back to the unmodified *html_content* when extraction fails,
        so conversion can still proceed on the full page.
        """
        try:
            # summary() runs the article-extraction algorithm; content()
            # would only return the cleaned full body.
            return Document(html_content).summary()
        except Exception:
            # Deliberate best-effort: any extraction failure means
            # "convert the whole page instead".
            return html_content

    def convert_to_markdown(self, html_content, ignore_links=False) -> str:
        """Convert an HTML string to Markdown.

        Args:
            html_content: HTML source to convert.
            ignore_links: When True, links are dropped and only their text
                is kept.

        Returns:
            The Markdown text with excess blank lines collapsed and
            leading/trailing whitespace removed.
        """
        h = html2text.HTML2Text()
        h.ignore_images = False
        h.ignore_links = ignore_links
        h.body_width = 0  # Don't wrap lines
        h.unicode_snob = True
        h.bypass_tables = False

        markdown = h.handle(html_content)

        # Clean up excessive whitespace
        markdown = self._EXCESS_BLANK_LINES.sub('\n\n', markdown)
        return markdown.strip()

    def process_url(self, url, include_title=True, ignore_links=False, improve_readability=True):
        """Fetch *url* and return ``(markdown, title)``.

        Never raises: validation problems and processing errors are
        reported as a message in the first tuple element with an empty
        title.

        Args:
            url: URL entered by the user; surrounding whitespace is ignored.
            include_title: Prepend the page title as an H1 heading.
            ignore_links: Drop hyperlinks, keeping only their text.
            improve_readability: Extract the main content before converting.

        Returns:
            ``(markdown, title)`` on success, or ``(message, "")`` when the
            URL is missing, invalid, or processing fails.
        """
        # Pasted URLs frequently carry stray spaces or newlines.
        if isinstance(url, str):
            url = url.strip()

        if not url:
            return "Please enter a URL", ""

        if not self.is_valid_url(url):
            return "Please enter a valid URL", ""

        try:
            html_content, _status_code = self.fetch_webpage(url)

            # The title is taken from the full page, before readability
            # trims the document down to the article body.
            title = self.extract_title(html_content)

            if improve_readability:
                html_content = self.improve_readability(html_content)

            markdown = self.convert_to_markdown(html_content, ignore_links)

            if include_title and title:
                markdown = f"# {title}\n\n{markdown}"

            return markdown, title

        except Exception as e:
            # UI boundary: surface the failure as text in the output box.
            return f"Error processing URL: {e}", ""

# Module-level converter instance, created once at import time; the Gradio
# callback below routes every conversion through this same object.
converter = URLToMarkdownConverter()

def convert_url_to_markdown(url, include_title, ignore_links, improve_readability):
    """Gradio callback: run the shared converter on *url*.

    Forwards the three checkbox values unchanged and returns the
    ``(markdown, title)`` pair in the order the output components expect.
    """
    options = {
        "include_title": include_title,
        "ignore_links": ignore_links,
        "improve_readability": improve_readability,
    }
    result_markdown, page_title = converter.process_url(url=url, **options)
    return result_markdown, page_title

# Build the Gradio UI. Components are laid out in the order they are created
# inside the nested Row/Column contexts.
with gr.Blocks(title="URL to Markdown Converter", theme=gr.themes.Soft()) as app:
    # Page header and usage instructions
    gr.Markdown("""
    # 🔗 URL to Markdown Converter

    Convert any webpage to clean, readable Markdown format. Perfect for documentation, note-taking, and content archival.

    ## How to use:
    1. Enter a URL in the text box below
    2. Configure your options
    3. Click "Convert to Markdown"
    4. Copy the generated Markdown from the output box
    """)

    with gr.Row():
        # Wider column: URL entry, option checkboxes and the convert button
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="URL",
                info="Enter the URL of the webpage you want to convert",
                placeholder="https://example.com",
                lines=1,
            )

            with gr.Row():
                include_title = gr.Checkbox(
                    value=True,
                    label="Include Title",
                    info="Add the page title as a heading",
                )
                ignore_links = gr.Checkbox(
                    value=False,
                    label="Ignore Links",
                    info="Remove all hyperlinks from output",
                )
                improve_readability = gr.Checkbox(
                    value=True,
                    label="Improve Readability",
                    info="Extract main content and remove clutter",
                )

            convert_btn = gr.Button("Convert to Markdown", variant="primary", size="lg")

        # Narrower column: static help text describing each option
        with gr.Column(scale=1):
            gr.Markdown("""
            ### Options Explained:

            **Include Title**: Adds the webpage's title as an H1 heading at the top of the markdown.

            **Ignore Links**: Removes all hyperlinks, keeping only the link text.

            **Improve Readability**: Uses Mozilla's Readability algorithm to extract the main content and remove navigation, ads, and other clutter.
            """)

    # Read-only field showing the page title
    with gr.Row():
        extracted_title = gr.Textbox(
            label="Extracted Title",
            info="The title extracted from the webpage",
            interactive=False,
            lines=1,
        )

    # Main result area
    markdown_output = gr.Textbox(
        label="Markdown Output",
        info="The converted Markdown content",
        lines=20,
        max_lines=50,
        show_copy_button=True,
    )

    # The button click, the Enter key in the URL box and the example rows
    # all run the same callback over the same components, so the wiring is
    # described once and reused.
    conversion_wiring = dict(
        fn=convert_url_to_markdown,
        inputs=[url_input, include_title, ignore_links, improve_readability],
        outputs=[markdown_output, extracted_title],
    )

    convert_btn.click(**conversion_wiring)
    url_input.submit(**conversion_wiring)

    # Clickable sample inputs: (url, include_title, ignore_links, improve_readability)
    gr.Examples(
        examples=[
            ["https://www.mozilla.org/en-US/firefox/", True, False, True],
            ["https://github.com/python/cpython", True, False, True],
            ["https://docs.python.org/3/tutorial/", False, True, True],
        ],
        cache_examples=False,
        **conversion_wiring,
    )

    # Footer: tips, limitations and credits
    gr.Markdown("""
    ---

    ### Tips:
    - The converter works best with article-style content
    - Some websites may block automated requests
    - Large pages may take a few seconds to process
    - For best results, keep "Improve Readability" enabled

    ### Supported Sites:
    Most standard websites work well. Some sites with heavy JavaScript or anti-bot measures may not work properly.

    ---

    ### Credits:
    This Gradio app was inspired by and is a rewrite of [macsplit/urltomarkdown](https://github.com/macsplit/urltomarkdown).
    """)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    app.launch(**launch_options)