# NOTE: hosting-page residue ("Spaces: Sleeping") captured with the source; not part of the program.
import os
from urllib.parse import unquote, urlparse

import gradio as gr
import pandas as pd
# Path of the CSV export that process_urls rewrites on every call. It is a
# relative path, so it resolves against the process's working directory.
CSV_OUTPUT_PATH = "url_keywords_full.csv"
def extract_keywords_from_url(url: str) -> list[str]:
    """Return the lowercase keyword tokens found in the path of *url*.

    Only the path component is used; scheme, host, query string and fragment
    are discarded. Percent-escapes are decoded first (so ``%20`` separates
    words and ``caf%C3%A9`` becomes ``café``), a trailing ``.html``, ``.htm``
    or ``.php`` extension is removed regardless of letter case, and the
    remainder is split on ``/``, ``_``, ``-``, ``.`` and whitespace.

    Args:
        url: The URL to analyse. Input without a scheme (``example.com/a``)
            is parsed by ``urlparse`` as a bare path, so its host part is
            tokenised as well.

    Returns:
        The tokens in path order, lowercased; an empty list when the path
        is empty.
    """
    # Decode before tokenising so escaped separators and non-ASCII letters
    # become real characters instead of "%xx" fragments inside keywords.
    path = unquote(urlparse(url).path).strip("/")

    # No break: later entries are still tested after a strip, which keeps the
    # original behaviour for stacked extensions (e.g. "a.php.html" -> "a").
    for ext in (".html", ".htm", ".php"):
        if path.lower().endswith(ext):
            path = path[: -len(ext)]

    # One pass turns every delimiter into a space; split() then drops empty
    # pieces, so no per-token strip or emptiness check is needed.
    separators = str.maketrans("/_-.", "    ")
    return [token.lower() for token in path.translate(separators).split()]
def process_urls(input_text: str):
    """Turn a block of newline-separated URLs into a keyword table and a CSV file.

    Each non-blank line of *input_text* is stripped and treated as one URL.
    Its keywords are joined with ", " into a single cell. The resulting
    two-column table ('url', 'keywords') is written to CSV_OUTPUT_PATH as
    UTF-8 without an index column.

    Returns:
        A tuple ``(DataFrame, CSV_OUTPUT_PATH)``.
    """
    stripped_lines = (raw.strip() for raw in input_text.splitlines())
    rows = [
        {"url": address, "keywords": ", ".join(extract_keywords_from_url(address))}
        for address in stripped_lines
        if address  # blank and whitespace-only lines are skipped
    ]

    # Explicit columns keep the header present even when no URL was supplied.
    table = pd.DataFrame(rows, columns=["url", "keywords"])
    table.to_csv(CSV_OUTPUT_PATH, index=False, encoding="utf-8")
    return table, CSV_OUTPUT_PATH
# Stylesheet passed to gr.Blocks(css=...) below.
# Rule 1 forces a white background, black text and grey borders onto every
# descendant of .gradio-container.
# Rule 2 then re-colours elements with class .gr-button (normal, hover and
# focus states) to a blue accent with white text.
# Every declaration carries !important.
# NOTE(review): rule 2 only takes effect if the installed Gradio version puts
# the .gr-button class on its buttons — confirm against the rendered DOM.
custom_css = """
/* 1) Force every element inside the Gradio container to white bg + black text */
.gradio-container * {
background-color: #ffffff !important;
color: #000000 !important;
border-color: #cccccc !important;
}
/* 2) Restore button accent + white text */
.gradio-container .gr-button,
.gradio-container .gr-button:hover,
.gradio-container .gr-button:focus {
background-color: #1f6feb !important;
color: #ffffff !important;
border-color: #1f6feb !important;
}
"""
# Declare the page: heading, instructions, URL input, trigger button, result
# table and CSV download slot, all created inside one Blocks context bound to
# the module-level name `demo`.
with gr.Blocks(theme=gr.themes.Default(), css=custom_css) as demo:
    gr.Markdown("## URL Keywords Extractor by DEJAN")
    gr.Markdown(
        "Enter one or more URLs (one per line) below, then click **Generate** to see extracted keywords."
    )
    # Multi-line input; process_urls treats each non-blank line as one URL.
    url_input = gr.Textbox(
        lines=5,
        placeholder="https://dejan.ai/labs/interactive-demo\nhttps://example.com/foo-bar",
        label="Enter URLs (one per line)",
    )
    generate_btn = gr.Button("Generate", variant="primary")
    # Read-only table; the headers match the DataFrame columns built by process_urls.
    df_output = gr.Dataframe(
        headers=["url", "keywords"],
        label="Extracted URL Keywords",
        interactive=False,
    )
    # Receives the CSV path that process_urls returns as its second value.
    download_csv = gr.File(label="EXPORT CSV", file_types=[".csv"])
    # process_urls returns (DataFrame, csv_path); the two values are listed
    # here in the same order as the two output components.
    generate_btn.click(fn=process_urls, inputs=url_input, outputs=[df_output, download_csv])

# Launch only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    demo.launch()