import re

import requests
from bs4 import BeautifulSoup

WIKIPEDIA_SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{}"
WIKIDATA_SEARCH_URL = "https://www.wikidata.org/w/api.php"

_clean_re = re.compile(r'^[\W_]+|[\W_]+$')  # strip leading/trailing punctuation
_token_re = re.compile(r"[A-Za-z0-9\-\']{2,}")  # tokens of length >= 2, allow hyphen/apostrophe


def _normalize_token(tok: str) -> str:
    if not tok:
        return ""
    t = tok.strip()
    # remove trailing punctuation and brackets
    t = re.sub(r'[:;,\.\)\(\[\]\{\}]+$', '', t)
    t = _clean_re.sub("", t)
    return t


def fetch_wikipedia_summary(topic):
    try:
        resp = requests.get(
            WIKIPEDIA_SUMMARY_URL.format(requests.utils.quote(topic)), timeout=6
        )
        if resp.status_code == 200:
            j = resp.json()
            text = ""
            if "extract" in j and j["extract"]:
                text += j["extract"] + "\n"
            if "description" in j and j["description"]:
                text += f"{j['description']}\n"
            return text.strip()
    except Exception:
        pass
    return ""


def fetch_wikidata_description(topic):
    try:
        params = {
            "action": "wbsearchentities",
            "search": topic,
            "language": "en",
            "format": "json",
            "limit": 5,
        }
        resp = requests.get(WIKIDATA_SEARCH_URL, params=params, timeout=6)
        if resp.status_code == 200:
            j = resp.json()
            descs = []
            for item in j.get("search", []):
                label = item.get("label", "")
                desc = item.get("description", "")
                if label or desc:
                    descs.append(f"{label}: {desc}")
            return "\n".join(descs)
    except Exception:
        pass
    return ""


def fetch_web_snippets(topic, limit=3):
    try:
        q = requests.utils.quote(topic)
        url = f"https://en.wikipedia.org/w/index.php?search={q}"
        resp = requests.get(url, timeout=6)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            paragraphs = soup.select("p")
            text = ""
            for p in paragraphs[:limit]:
                txt = p.get_text().strip()
                if txt:
                    text += txt + "\n"
            return text.strip()
    except Exception:
        pass
    return ""


def fetch_context(topic):
    parts = []
    wiki = fetch_wikipedia_summary(topic)
    if wiki:
        parts.append(wiki)
    wd = fetch_wikidata_description(topic)
    if wd:
        parts.append(wd)
    web = fetch_web_snippets(topic)
    if web:
        parts.append(web)
    combined = "\n\n".join([p for p in parts if p])
    if not combined:
        combined = f"No reliable content found for {topic}."
    return combined


def extract_keywords(context, top_k=6):
    # try spaCy NER first, then fall back to frequency-based token extraction
    try:
        import spacy

        nlp = spacy.load("en_core_web_sm")
        doc = nlp(context)
        ents = []
        seen_norm = set()
        for ent in doc.ents:
            norm = _normalize_token(ent.text)
            if not norm:
                continue
            key = norm.lower()
            if key in seen_norm:
                continue
            seen_norm.add(key)
            ents.append(ent.text.strip())
            if len(ents) >= top_k:
                break
        if ents:
            return ents[:top_k]
    except Exception:
        pass

    # fallback: frequency-based extraction with token cleaning
    tokens = _token_re.findall(context)
    freq = {}
    for t in tokens:
        cleaned = _normalize_token(t)
        if len(cleaned) < 2:
            continue
        key = cleaned.lower()
        freq[key] = freq.get(key, 0) + 1
    sorted_tokens = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    results = []
    seen = set()
    for tok, _ in sorted_tokens:
        if tok in seen:
            continue
        # skip common stopwords (small set)
        if tok in {"the", "and", "for", "with", "that", "from", "this", "have", "their", "which"}:
            continue
        seen.add(tok)
        results.append(tok)
        if len(results) >= top_k:
            break
    return results
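

if __name__ == "__main__":
    # Minimal usage sketch when the module is run directly. The topic string is
    # illustrative; network access (and, optionally, spaCy with the
    # en_core_web_sm model for the NER path) is assumed to be available.
    sample_topic = "Alan Turing"
    context = fetch_context(sample_topic)
    keywords = extract_keywords(context, top_k=6)
    print(f"Context length: {len(context)} characters")
    print("Keywords:", ", ".join(keywords))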