File size: 2,932 Bytes
4b4e019
4d6b94a
4b4e019
 
 
 
 
 
561915c
6e98ac3
ce6ff12
 
 
 
4d6b94a
3623de9
4d6b94a
 
 
 
 
6e98ac3
4d6b94a
4b4e019
 
6e98ac3
 
4d6b94a
561915c
4b4e019
 
ce6ff12
 
 
 
4b4e019
 
 
 
 
 
 
 
 
 
3623de9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1373a35
 
 
4b4e019
 
 
 
 
 
 
 
 
 
 
 
3623de9
4b4e019
3623de9
0645bef
6e98ac3
4b4e019
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
from urllib.parse import urlparse

import pandas as pd
import gradio as gr

# Relative path of the CSV export; process_urls() writes it on every call
# (overwriting the previous file) and returns it for the download widget.
CSV_OUTPUT_PATH = "url_keywords_full.csv"


def extract_keywords_from_url(url: str) -> list[str]:
    """
    Return the lowercase keyword tokens found in the path of *url*.

    The host, query string and fragment are discarded. Trailing
    ``.html``/``.htm``/``.php`` extensions are removed (matched
    case-insensitively), then the path is split on ``/``, ``_``, ``-``,
    ``.`` and whitespace.

    Scheme-less input such as ``example.com/foo-bar`` is treated as
    host + path, so the domain is stripped there as well. This is a
    heuristic: the first segment counts as a host only when it contains a
    dot and does not itself end in one of the page extensions.

    Args:
        url: An absolute URL, a scheme-less ``host/path`` string, or a
            bare path.

    Returns:
        The tokens in path order, lowercased; an empty list when the path
        holds no keywords.
    """
    extensions = (".html", ".htm", ".php")
    url = url.strip()
    parsed = urlparse(url)

    # Without a scheme, urlparse() leaves "example.com/foo" entirely in
    # .path. When the first segment looks like a host name, re-parse the
    # input as "//host/path" so the host lands in .netloc instead.
    if not parsed.scheme and not parsed.netloc and not url.startswith("/"):
        first_segment = parsed.path.split("/", 1)[0]
        looks_like_page = first_segment.lower().endswith(extensions)
        if "." in first_segment and not looks_like_page:
            parsed = urlparse("//" + url)

    path = parsed.path.strip("/")
    for ext in extensions:
        # Compare only the tail, lowercased, so ".HTML" is stripped too.
        if path[-len(ext):].lower() == ext:
            path = path[: -len(ext)]

    # Every delimiter becomes a space, so a single split() yields all tokens.
    separators = str.maketrans({"/": " ", "_": " ", "-": " ", ".": " "})
    return [token.lower() for token in path.translate(separators).split()]


def process_urls(input_text: str) -> tuple[pd.DataFrame, str]:
    """
    Extract keywords for every URL in *input_text* and export them as CSV.

    Args:
        input_text: URLs separated by newlines. Surrounding whitespace is
            trimmed and blank lines are skipped. ``None`` is treated as
            empty input.

    Returns:
        A ``(df, path)`` pair: a DataFrame with columns ``url`` and
        ``keywords`` (tokens joined with ", "), and the path of the CSV
        file the same table was written to (``CSV_OUTPUT_PATH``).
    """
    # An empty textbox may arrive as None rather than ""; treat both alike
    # instead of failing on None.splitlines().
    stripped_lines = (line.strip() for line in (input_text or "").splitlines())
    rows = [
        {"url": url, "keywords": ", ".join(extract_keywords_from_url(url))}
        for url in stripped_lines
        if url
    ]
    # Explicit columns keep the header present even when there are no rows.
    table = pd.DataFrame(rows, columns=["url", "keywords"])
    table.to_csv(CSV_OUTPUT_PATH, index=False, encoding="utf-8")
    return table, CSV_OUTPUT_PATH


# Aggressive CSS override passed to gr.Blocks(css=...): every element gets a
# white background and black text, then buttons are re-styled to keep an
# accent color with white text.
# Both rules use !important; rule 2 wins on buttons because its selector
# (two classes) is more specific than rule 1's (one class + universal selector).
# NOTE(review): rule 2 only takes effect if the installed Gradio version puts
# the `.gr-button` class on buttons — confirm, otherwise rule 1 also applies
# to buttons and they render white with black text.
custom_css = """
/* 1) Force every element inside the Gradio container to white bg + black text */
.gradio-container * {
    background-color: #ffffff !important;
    color: #000000 !important;
    border-color: #cccccc !important;
}

/* 2) Restore button accent + white text */
.gradio-container .gr-button,
.gradio-container .gr-button:hover,
.gradio-container .gr-button:focus {
    background-color: #1f6feb !important;
    color: #ffffff !important;
    border-color: #1f6feb !important;
}
"""

# Assemble the single-page UI: heading, URL textbox, trigger button,
# results table and CSV download. Components appear in creation order.
with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
    gr.Markdown(value="## URL Keywords Extractor by DEJAN")
    gr.Markdown(
        value="Enter one or more URLs (one per line) below, then click **Generate** to see extracted keywords."
    )

    # Input area: multi-line box, one URL per row.
    url_input = gr.Textbox(
        label="Enter URLs (one per line)",
        placeholder="https://dejan.ai/labs/interactive-demo\nhttps://example.com/foo-bar",
        lines=5,
    )
    generate_btn = gr.Button(value="Generate", variant="primary")

    # Output area: read-only table plus the exported CSV file.
    df_output = gr.Dataframe(
        label="Extracted URL Keywords",
        headers=["url", "keywords"],
        interactive=False,
    )
    download_csv = gr.File(file_types=[".csv"], label="EXPORT CSV")

    # process_urls returns (DataFrame, csv_path), matching the two outputs in order.
    generate_btn.click(
        fn=process_urls,
        inputs=url_input,
        outputs=[df_output, download_csv],
    )

# Launch the app only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()