# Spaces: dejanseo/url-keyword-extraction
# Status: Sleeping
# File size: 2,932 Bytes

import os
from urllib.parse import unquote, urlparse

import gradio as gr
import pandas as pd

# File that process_urls() writes its CSV export to and returns for download.
# The path is relative and fixed, so every call overwrites the same file.
CSV_OUTPUT_PATH = "url_keywords_full.csv"


def extract_keywords_from_url(url: str) -> list[str]:
    """Return the lowercase keyword tokens found in the path of *url*.

    Only the path component is used; scheme, host, query string and fragment
    are discarded by ``urlparse``.  Processing steps:

    1. Decode percent-escapes, so ``foo%20bar`` yields ``foo`` and ``bar``
       instead of the single token ``foo%20bar``.
    2. Drop leading/trailing slashes, then a trailing ``.html``, ``.htm`` or
       ``.php`` (matched case-insensitively, so ``Page.HTML`` works too).
    3. Treat ``_``, ``-``, ``.`` and ``/`` as word separators, lowercase the
       text and split it on whitespace.

    A URL written without a scheme (``example.com/foo``) has no recognisable
    host, so its domain labels are returned as keywords as well.

    Args:
        url: The URL (or bare path) to tokenize.

    Returns:
        Keyword tokens in path order; an empty list when the path holds none.

    Raises:
        ValueError: Propagated from ``urlparse`` for malformed URLs, e.g. an
            unmatched ``[`` in the host part.
    """
    # Decode before anything else so escaped delimiters ("%2D", "%20", ...)
    # behave exactly like the literal characters they stand for.
    path = unquote(urlparse(url).path).strip("/")

    for ext in (".html", ".htm", ".php"):
        # Compare only the tail, lowercased, so upper/mixed-case extensions
        # are removed instead of leaking into the keywords.
        if path[-len(ext):].lower() == ext:
            path = path[: -len(ext)]

    # "/" is just another separator: splitting on it first and on whitespace
    # second gives the same tokens as one whitespace split.
    for separator in "_-./":
        path = path.replace(separator, " ")

    return path.lower().split()


def process_urls(input_text: str):
    """Extract keywords for every URL in *input_text* and export them as CSV.

    Args:
        input_text: URLs separated by newlines.  Blank lines and surrounding
            whitespace are ignored; ``None`` is treated like empty input.

    Returns:
        A ``(DataFrame, path)`` tuple.  The frame has one row per URL with the
        columns ``url`` and ``keywords`` (tokens joined by ``", "``).  *path*
        is ``CSV_OUTPUT_PATH``, where the same table has just been written
        (UTF-8, no index column), replacing any earlier export.
    """
    rows = []
    # `or ""` guards against a caller passing None instead of an empty string.
    for line in (input_text or "").splitlines():
        url = line.strip()
        if not url:
            continue
        try:
            keywords = extract_keywords_from_url(url)
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. an unmatched "[" in
            # the host).  Keep the row with no keywords rather than letting
            # one bad line abort the whole batch.
            keywords = []
        rows.append({"url": url, "keywords": ", ".join(keywords)})

    # Passing `columns` keeps the header present even when there are no rows.
    df = pd.DataFrame(rows, columns=["url", "keywords"])
    df.to_csv(CSV_OUTPUT_PATH, index=False, encoding="utf-8")
    return df, CSV_OUTPUT_PATH


# Stylesheet handed to gr.Blocks(css=...) below.
# Rule 1 forces a white background, black text and grey borders onto every
# element inside `.gradio-container`; rule 2 then re-colours buttons with the
# blue accent and white text.  Both rules use `!important`, and rule 2 wins on
# buttons because its selector is more specific.
# NOTE(review): rule 2 only applies if buttons carry the `gr-button` class —
# confirm against the Gradio version in use.
custom_css = """
/* 1) Force every element inside the Gradio container to white bg + black text */
.gradio-container * {
    background-color: #ffffff !important;
    color: #000000 !important;
    border-color: #cccccc !important;
}

/* 2) Restore button accent + white text */
.gradio-container .gr-button,
.gradio-container .gr-button:hover,
.gradio-container .gr-button:focus {
    background-color: #1f6feb !important;
    color: #ffffff !important;
    border-color: #1f6feb !important;
}
"""

# Build the app as the module-level `demo` object.  Components are declared
# in this order: heading and instructions, URL textbox, Generate button,
# results table, CSV download.
with gr.Blocks(theme=gr.themes.Default(), css=custom_css) as demo:
    gr.Markdown("## URL Keywords Extractor by DEJAN")
    gr.Markdown(
        "Enter one or more URLs (one per line) below, then click **Generate** to see extracted keywords."
    )
    # Multi-line input; process_urls() splits its value into one URL per line.
    url_input = gr.Textbox(
        lines=5,
        placeholder="https://dejan.ai/labs/interactive-demo\nhttps://example.com/foo-bar",
        label="Enter URLs (one per line)",
    )
    generate_btn = gr.Button("Generate", variant="primary")

    # Output table, declared non-interactive; headers match the DataFrame
    # columns produced by process_urls().
    df_output = gr.Dataframe(
        headers=["url", "keywords"],
        label="Extracted URL Keywords",
        interactive=False,
    )
    # Receives the CSV path returned by process_urls().
    download_csv = gr.File(label="EXPORT CSV", file_types=[".csv"])

    # On click, process_urls() gets the textbox value; its two return values
    # (DataFrame, CSV path) go to the table and the file component, in order.
    generate_btn.click(fn=process_urls, inputs=url_input, outputs=[df_output, download_csv])

# Start the Gradio server only when this file is run as a script; importing
# the module just builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()