import os from urllib.parse import urlparse import pandas as pd import gradio as gr CSV_OUTPUT_PATH = "url_keywords_full.csv" def extract_keywords_from_url(url: str) -> list[str]: """ Strip the domain, remove .html/.htm/.php, replace delimiters with spaces, split on '/', then on whitespace, lowercase each token, and return the list. """ parsed = urlparse(url) path = parsed.path.strip("/") for ext in (".html", ".htm", ".php"): if path.endswith(ext): path = path[: -len(ext)] cleaned = path.replace("_", " ").replace("-", " ").replace(".", " ") segments = cleaned.split("/") if cleaned else [] keywords: list[str] = [] for seg in segments: for token in seg.split(): tok = token.strip().lower() if tok: keywords.append(tok) return keywords def process_urls(input_text: str): """ Given a multiline string of URLs (one per line), extract keywords for each URL, build a DataFrame with columns ['url', 'keywords'], save it as CSV, and return both. """ urls = [line.strip() for line in input_text.splitlines() if line.strip()] results = [] for url in urls: kws = extract_keywords_from_url(url) results.append({"url": url, "keywords": ", ".join(kws)}) df = pd.DataFrame(results, columns=["url", "keywords"]) df.to_csv(CSV_OUTPUT_PATH, index=False, encoding="utf-8") return df, CSV_OUTPUT_PATH # Aggressive CSS override: everything white bg + black text, # then re-style buttons to keep accent color + white text. 
# Stylesheet handed to gr.Blocks below. It is assembled from adjacent string
# literals so that each CSS declaration sits on its own source line.
# Rule 2 uses a more specific selector than rule 1, so it wins for buttons
# even though both rules are marked !important.
# NOTE(review): assumes the installed Gradio version still puts the
# `gr-button` class on button elements - confirm against the rendered DOM.
custom_css = (
    " /* 1) Force every element inside the Gradio container to white bg + black text */ "
    ".gradio-container * { "
    "background-color: #ffffff !important; "
    "color: #000000 !important; "
    "border-color: #cccccc !important; "
    "} "
    "/* 2) Restore button accent + white text */ "
    ".gradio-container .gr-button, "
    ".gradio-container .gr-button:hover, "
    ".gradio-container .gr-button:focus { "
    "background-color: #1f6feb !important; "
    "color: #ffffff !important; "
    "border-color: #1f6feb !important; "
    "} "
)

# Page layout: heading and instructions, a URL text box, the trigger button,
# then the results table and the CSV download slot.
with gr.Blocks(theme=gr.themes.Default(), css=custom_css) as demo:
    gr.Markdown("## URL Keywords Extractor by DEJAN")
    gr.Markdown(
        "Enter one or more URLs (one per line) below, "
        "then click **Generate** to see extracted keywords."
    )

    url_input = gr.Textbox(
        label="Enter URLs (one per line)",
        placeholder="https://dejan.ai/labs/interactive-demo\nhttps://example.com/foo-bar",
        lines=5,
    )
    generate_btn = gr.Button("Generate", variant="primary")

    df_output = gr.Dataframe(
        label="Extracted URL Keywords",
        headers=["url", "keywords"],
        interactive=False,
    )
    download_csv = gr.File(file_types=[".csv"], label="EXPORT CSV")

    # process_urls returns (DataFrame, csv_path); the two values are routed
    # to the table and the file component in that order.
    generate_btn.click(
        fn=process_urls,
        inputs=url_input,
        outputs=[df_output, download_csv],
    )


if __name__ == "__main__":
    demo.launch()