# app.py — uploaded by dejanseo; commit 3623de9 (verified): "Update app.py"
import os
from urllib.parse import unquote, urlparse

import gradio as gr
import pandas as pd
# File that process_urls() writes the results table to and hands back for
# download. The path is relative, and the same name is reused on every call.
CSV_OUTPUT_PATH = "url_keywords_full.csv"
def extract_keywords_from_url(url: str) -> list[str]:
    """Return the lowercase keyword tokens found in the path of *url*.

    Only ``urlparse(url).path`` is used, so the scheme, domain, query string
    and fragment never contribute keywords. The path is percent-decoded, a
    trailing ``.html``/``.htm``/``.php`` extension is removed (matched
    case-insensitively), and ``_``, ``-`` and ``.`` are treated as word
    separators alongside ``/`` and whitespace.

    Args:
        url: The URL to analyse.

    Returns:
        The keywords in path order, lowercased; an empty list when the path
        holds no tokens.

    Examples:
        >>> extract_keywords_from_url("https://example.com/Foo_Bar/Page.HTML")
        ['foo', 'bar', 'page']
    """
    # Decode %XX escapes so "hello%20world" yields two real words instead of
    # a single token with the escape sequence embedded in it.
    path = unquote(urlparse(url).path).strip("/")

    for ext in (".html", ".htm", ".php"):
        # Compare only the lowercased tail so ".HTML" / ".Php" are removed
        # too, rather than surviving as a bogus "html" / "php" keyword.
        if path[-len(ext):].lower() == ext:
            path = path[: -len(ext)]

    # Every delimiter (including the segment separator "/") becomes a space,
    # so one whitespace split produces the final token list.
    separators = str.maketrans({"_": " ", "-": " ", ".": " ", "/": " "})
    return path.translate(separators).lower().split()
def process_urls(input_text: str):
    """Extract keywords for every URL in *input_text* and export them as CSV.

    *input_text* is read line by line; each line is stripped and blank lines
    are skipped. Every remaining line is passed to
    ``extract_keywords_from_url`` and its keywords are joined with ", ".

    Returns:
        A ``(DataFrame, path)`` tuple: the table with columns
        ``url`` and ``keywords``, and ``CSV_OUTPUT_PATH``, where that table
        was just written (UTF-8, no index column).
    """
    stripped_lines = (raw.strip() for raw in input_text.splitlines())
    rows = [
        {"url": address, "keywords": ", ".join(extract_keywords_from_url(address))}
        for address in stripped_lines
        if address
    ]

    # Explicit columns keep the header present even when there are no rows.
    table = pd.DataFrame(rows, columns=["url", "keywords"])
    table.to_csv(CSV_OUTPUT_PATH, index=False, encoding="utf-8")
    return table, CSV_OUTPUT_PATH
# Global CSS override handed to gr.Blocks(css=...) below.
# Rule 1 forces a white background, black text and grey borders on every
# element inside `.gradio-container`; rule 2 then re-applies the blue accent
# with white text to elements carrying the `.gr-button` class (normal, hover
# and focus states). Every declaration is marked `!important`.
# NOTE(review): rule 2 only takes effect if the installed Gradio version
# renders buttons with a `gr-button` class — confirm against the running app.
custom_css = """
/* 1) Force every element inside the Gradio container to white bg + black text */
.gradio-container * {
background-color: #ffffff !important;
color: #000000 !important;
border-color: #cccccc !important;
}
/* 2) Restore button accent + white text */
.gradio-container .gr-button,
.gradio-container .gr-button:hover,
.gradio-container .gr-button:focus {
background-color: #1f6feb !important;
color: #ffffff !important;
border-color: #1f6feb !important;
}
"""
# Assemble the interface: heading, instructions, URL input, trigger button,
# results table and CSV download, created in that order inside the Blocks
# context. The default theme is used together with the custom_css override.
with gr.Blocks(theme=gr.themes.Default(), css=custom_css) as demo:
    gr.Markdown("## URL Keywords Extractor by DEJAN")
    gr.Markdown(
        "Enter one or more URLs (one per line) below, then click **Generate** to see extracted keywords."
    )
    # Multi-line box; process_urls() treats each non-blank line as one URL.
    url_input = gr.Textbox(
        lines=5,
        placeholder="https://dejan.ai/labs/interactive-demo\nhttps://example.com/foo-bar",
        label="Enter URLs (one per line)",
    )
    generate_btn = gr.Button("Generate", variant="primary")
    # Read-only table; headers match the DataFrame columns process_urls() builds.
    df_output = gr.Dataframe(
        headers=["url", "keywords"],
        label="Extracted URL Keywords",
        interactive=False,
    )
    download_csv = gr.File(label="EXPORT CSV", file_types=[".csv"])
    # process_urls returns (DataFrame, csv_path); the outputs list is given in
    # the same order: table first, file component second.
    generate_btn.click(fn=process_urls, inputs=url_input, outputs=[df_output, download_csv])

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()