Spaces:

dejanseo
/

url-keyword-extraction

Running

App Files Community

dejanseo committed on Jun 2

Commit

ce6ff12

verified ·

1 Parent(s): 4ea5c8b

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -6

app.py CHANGED Viewed

@@ -8,12 +8,22 @@ CSV_OUTPUT_PATH = "url_keywords_full.csv"
 def extract_keywords_from_url(url: str) -> list[str]:
     parsed = urlparse(url)
-    path = parsed.path.strip("/")
     for ext in (".html", ".htm", ".php"):
         if path.endswith(ext):
             path = path[: -len(ext)]
     cleaned = path.replace("_", " ").replace("-", " ").replace(".", " ")
     segments = cleaned.split("/") if cleaned else []
     keywords: list[str] = []
     for seg in segments:
@@ -25,20 +35,50 @@ def extract_keywords_from_url(url: str) -> list[str]:
 def process_urls(input_text: str):
     urls = [line.strip() for line in input_text.splitlines() if line.strip()]
     results = []
     for url in urls:
         kws = extract_keywords_from_url(url)
         results.append({"url": url, "keywords": ", ".join(kws)})
     df = pd.DataFrame(results, columns=["url", "keywords"])
     df.to_csv(CSV_OUTPUT_PATH, index=False, encoding="utf-8")
     return df, CSV_OUTPUT_PATH
-# Use a light theme explicitly
-with gr.Blocks(theme=gr.themes.Default()) as demo:
-    gr.Markdown("## URL Keywords Extractor Demo (Gradio)")
-    gr.Markdown("Enter one or more URLs (one per line) below, then click **Generate** to see extracted keywords.")
     url_input = gr.Textbox(
         lines=5,
         placeholder="https://dejan.ai/labs/interactive-demo\nhttps://example.com/foo-bar",
@@ -51,7 +91,10 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
         label="Extracted URL Keywords",
         interactive=False,
     )
-    download_csv = gr.File(label="EXPORT CSV", file_types=[".csv"])
     generate_btn.click(
         fn=process_urls,

 def extract_keywords_from_url(url: str) -> list[str]:
+    """
+    Strip the domain, remove .html/.htm/.php, replace delimiters with spaces,
+    split on '/', then on whitespace, lowercase each token, and return the list.
+    """
     parsed = urlparse(url)
+    path = parsed.path.strip("/")  # e.g. "labs/interactive-demo"
+    # Remove common file extensions if they appear at the end:
     for ext in (".html", ".htm", ".php"):
         if path.endswith(ext):
             path = path[: -len(ext)]
+    # Replace underscores, hyphens, and dots with spaces:
     cleaned = path.replace("_", " ").replace("-", " ").replace(".", " ")
+    # Split on "/" to get segments, then split each segment on whitespace:
     segments = cleaned.split("/") if cleaned else []
     keywords: list[str] = []
     for seg in segments:
 def process_urls(input_text: str):
+    """
+    Given a multiline string of URLs (one per line), extract keywords for each URL,
+    build a DataFrame with columns ['url', 'keywords'], save it as CSV, and return both.
+    """
     urls = [line.strip() for line in input_text.splitlines() if line.strip()]
     results = []
     for url in urls:
         kws = extract_keywords_from_url(url)
         results.append({"url": url, "keywords": ", ".join(kws)})
     df = pd.DataFrame(results, columns=["url", "keywords"])
     df.to_csv(CSV_OUTPUT_PATH, index=False, encoding="utf-8")
     return df, CSV_OUTPUT_PATH
+# Custom CSS to enforce a white/light theme
+light_css = """
+body {
+    background-color: #ffffff !important;
+    color: #000000 !important;
+}
+.gradio-container {
+    background-color: #ffffff !important;
+    color: #000000 !important;
+}
+.gr-button {
+    background-color: #007bff !important;
+    color: #ffffff !important;
+}
+.gr-textbox textarea {
+    background-color: #ffffff !important;
+    color: #000000 !important;
+}
+.gr-dataframe {
+    background-color: #ffffff !important;
+    color: #000000 !important;
+}
+"""
+with gr.Blocks(css=light_css) as demo:
+    gr.Markdown("# URL Keywords Extractor by DEJAN")
+    gr.Markdown(
+        "Enter one or more URLs (one per line) below, then click **Generate** to see extracted keywords."
+    )
     url_input = gr.Textbox(
         lines=5,
         placeholder="https://dejan.ai/labs/interactive-demo\nhttps://example.com/foo-bar",
         label="Extracted URL Keywords",
         interactive=False,
     )
+    download_csv = gr.File(
+        label="EXPORT CSV",
+        file_types=[".csv"],
+    )
     generate_btn.click(
         fn=process_urls,