dejanseo committed on
Commit
ce6ff12
·
verified ·
1 Parent(s): 4ea5c8b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -6
app.py CHANGED
@@ -8,12 +8,22 @@ CSV_OUTPUT_PATH = "url_keywords_full.csv"
8
 
9
 
10
  def extract_keywords_from_url(url: str) -> list[str]:
 
 
 
 
11
  parsed = urlparse(url)
12
- path = parsed.path.strip("/")
 
 
13
  for ext in (".html", ".htm", ".php"):
14
  if path.endswith(ext):
15
  path = path[: -len(ext)]
 
 
16
  cleaned = path.replace("_", " ").replace("-", " ").replace(".", " ")
 
 
17
  segments = cleaned.split("/") if cleaned else []
18
  keywords: list[str] = []
19
  for seg in segments:
@@ -25,20 +35,50 @@ def extract_keywords_from_url(url: str) -> list[str]:
25
 
26
 
27
  def process_urls(input_text: str):
 
 
 
 
28
  urls = [line.strip() for line in input_text.splitlines() if line.strip()]
29
  results = []
30
  for url in urls:
31
  kws = extract_keywords_from_url(url)
32
  results.append({"url": url, "keywords": ", ".join(kws)})
 
33
  df = pd.DataFrame(results, columns=["url", "keywords"])
34
  df.to_csv(CSV_OUTPUT_PATH, index=False, encoding="utf-8")
35
  return df, CSV_OUTPUT_PATH
36
 
37
 
38
- # Use a light theme explicitly
39
- with gr.Blocks(theme=gr.themes.Default()) as demo:
40
- gr.Markdown("## URL Keywords Extractor Demo (Gradio)")
41
- gr.Markdown("Enter one or more URLs (one per line) below, then click **Generate** to see extracted keywords.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  url_input = gr.Textbox(
43
  lines=5,
44
  placeholder="https://dejan.ai/labs/interactive-demo\nhttps://example.com/foo-bar",
@@ -51,7 +91,10 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
51
  label="Extracted URL Keywords",
52
  interactive=False,
53
  )
54
- download_csv = gr.File(label="EXPORT CSV", file_types=[".csv"])
 
 
 
55
 
56
  generate_btn.click(
57
  fn=process_urls,
 
8
 
9
 
10
  def extract_keywords_from_url(url: str) -> list[str]:
11
+ """
12
+ Strip the domain, remove .html/.htm/.php, replace delimiters with spaces,
13
+ split on '/', then on whitespace, lowercase each token, and return the list.
14
+ """
15
  parsed = urlparse(url)
16
+ path = parsed.path.strip("/") # e.g. "labs/interactive-demo"
17
+
18
+ # Remove common file extensions if they appear at the end:
19
  for ext in (".html", ".htm", ".php"):
20
  if path.endswith(ext):
21
  path = path[: -len(ext)]
22
+
23
+ # Replace underscores, hyphens, and dots with spaces:
24
  cleaned = path.replace("_", " ").replace("-", " ").replace(".", " ")
25
+
26
+ # Split on "/" to get segments, then split each segment on whitespace:
27
  segments = cleaned.split("/") if cleaned else []
28
  keywords: list[str] = []
29
  for seg in segments:
 
35
 
36
 
37
  def process_urls(input_text: str):
38
+ """
39
+ Given a multiline string of URLs (one per line), extract keywords for each URL,
40
+ build a DataFrame with columns ['url', 'keywords'], save it as CSV, and return both.
41
+ """
42
  urls = [line.strip() for line in input_text.splitlines() if line.strip()]
43
  results = []
44
  for url in urls:
45
  kws = extract_keywords_from_url(url)
46
  results.append({"url": url, "keywords": ", ".join(kws)})
47
+
48
  df = pd.DataFrame(results, columns=["url", "keywords"])
49
  df.to_csv(CSV_OUTPUT_PATH, index=False, encoding="utf-8")
50
  return df, CSV_OUTPUT_PATH
51
 
52
 
53
+ # Custom CSS to enforce a white/light theme
54
+ light_css = """
55
+ body {
56
+ background-color: #ffffff !important;
57
+ color: #000000 !important;
58
+ }
59
+ .gradio-container {
60
+ background-color: #ffffff !important;
61
+ color: #000000 !important;
62
+ }
63
+ .gr-button {
64
+ background-color: #007bff !important;
65
+ color: #ffffff !important;
66
+ }
67
+ .gr-textbox textarea {
68
+ background-color: #ffffff !important;
69
+ color: #000000 !important;
70
+ }
71
+ .gr-dataframe {
72
+ background-color: #ffffff !important;
73
+ color: #000000 !important;
74
+ }
75
+ """
76
+
77
+ with gr.Blocks(css=light_css) as demo:
78
+ gr.Markdown("# URL Keywords Extractor by DEJAN")
79
+ gr.Markdown(
80
+ "Enter one or more URLs (one per line) below, then click **Generate** to see extracted keywords."
81
+ )
82
  url_input = gr.Textbox(
83
  lines=5,
84
  placeholder="https://dejan.ai/labs/interactive-demo\nhttps://example.com/foo-bar",
 
91
  label="Extracted URL Keywords",
92
  interactive=False,
93
  )
94
+ download_csv = gr.File(
95
+ label="EXPORT CSV",
96
+ file_types=[".csv"],
97
+ )
98
 
99
  generate_btn.click(
100
  fn=process_urls,