import re

import requests
from bs4 import BeautifulSoup

WIKIPEDIA_SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{}"
WIKIDATA_SEARCH_URL = "https://www.wikidata.org/w/api.php"

_clean_re = re.compile(r'^[\W_]+|[\W_]+$')      # strip leading/trailing punctuation
_token_re = re.compile(r"[A-Za-z0-9\-\']{2,}")  # tokens of length >= 2, allowing hyphen/apostrophe


def _normalize_token(tok: str) -> str:
    """Strip surrounding punctuation and underscores from a single token."""
    if not tok:
        return ""
    t = tok.strip()
    # drop trailing colons, commas, periods, and brackets first
    t = re.sub(r'[:;,\.\)\(\[\]\{\}]+$', '', t)
    # then strip any remaining leading/trailing non-word characters
    t = _clean_re.sub("", t)
    return t
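
# Illustrative behavior of the helpers above (a sketch of expected output, not executed tests):
#   _normalize_token("(Python),")                     -> "Python"
#   _token_re.findall("state-of-the-art NLP, 2024")   -> ["state-of-the-art", "NLP", "2024"]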


def fetch_wikipedia_summary(topic):
    """Return the Wikipedia REST summary (extract plus short description) for a topic, or ""."""
    try:
        resp = requests.get(WIKIPEDIA_SUMMARY_URL.format(requests.utils.quote(topic)), timeout=6)
        if resp.status_code == 200:
            j = resp.json()
            text = ""
            if "extract" in j and j["extract"]:
                text += j["extract"] + "\n"
            if "description" in j and j["description"]:
                text += f"{j['description']}\n"
            return text.strip()
    except Exception:
        pass
    return ""


def fetch_wikidata_description(topic):
    """Search Wikidata entities for the topic and return "label: description" lines, or ""."""
    try:
        params = {
            "action": "wbsearchentities",
            "search": topic,
            "language": "en",
            "format": "json",
            "limit": 5,
        }
        resp = requests.get(WIKIDATA_SEARCH_URL, params=params, timeout=6)
        if resp.status_code == 200:
            j = resp.json()
            descs = []
            for item in j.get("search", []):
                label = item.get("label", "")
                desc = item.get("description", "")
                if label or desc:
                    descs.append(f"{label}: {desc}")
            return "\n".join(descs)
    except Exception:
        pass
    return ""


def fetch_web_snippets(topic, limit=3):
    """Scrape the first few paragraphs returned by a Wikipedia search for the topic, or return ""."""
    try:
        q = requests.utils.quote(topic)
        # An exact title match redirects straight to the article, so the <p> tags are
        # usually the article lead; otherwise they come from the search results page.
        url = f"https://en.wikipedia.org/w/index.php?search={q}"
        resp = requests.get(url, timeout=6)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            paragraphs = soup.select("p")
            text = ""
            for p in paragraphs[:limit]:
                txt = p.get_text().strip()
                if txt:
                    text += txt + "\n"
            return text.strip()
    except Exception:
        pass
    return ""


def fetch_context(topic):
    """Combine Wikipedia, Wikidata, and search-page text into a single context string."""
    parts = []
    wiki = fetch_wikipedia_summary(topic)
    if wiki:
        parts.append(wiki)
    wd = fetch_wikidata_description(topic)
    if wd:
        parts.append(wd)
    web = fetch_web_snippets(topic)
    if web:
        parts.append(web)
    combined = "\n\n".join([p for p in parts if p])
    if not combined:
        combined = f"No reliable content found for {topic}."
    return combined
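
# Layout of the combined context (a sketch, sources joined by blank lines, empty sources skipped):
#   <Wikipedia extract + short description>
#
#   <"label: description" lines from Wikidata>
#
#   <first paragraphs from the Wikipedia search page>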


def extract_keywords(context, top_k=6):
    """Extract up to top_k keywords: spaCy NER first, then a frequency-based fallback."""
    try:
        import spacy

        nlp = spacy.load("en_core_web_sm")
        doc = nlp(context)
        ents = []
        seen_norm = set()
        for ent in doc.ents:
            norm = _normalize_token(ent.text)
            if not norm:
                continue
            key = norm.lower()
            if key in seen_norm:
                continue
            seen_norm.add(key)
            ents.append(ent.text.strip())
            if len(ents) >= top_k:
                break
        if ents:
            return ents[:top_k]
    except Exception:
        # spaCy or its model may be unavailable; fall through to token counting
        pass
    # fallback: frequency-based extraction with token cleaning
    tokens = _token_re.findall(context)
    freq = {}
    for t in tokens:
        cleaned = _normalize_token(t)
        if len(cleaned) < 2:
            continue
        key = cleaned.lower()
        freq[key] = freq.get(key, 0) + 1
    sorted_tokens = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    results = []
    seen = set()
    for tok, _ in sorted_tokens:
        if tok in seen:
            continue
        # skip common stopwords (small set)
        if tok in {"the", "and", "for", "with", "that", "from", "this", "have", "their", "which"}:
            continue
        seen.add(tok)
        results.append(tok)
        if len(results) >= top_k:
            break
    return results
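

# Minimal usage sketch (not part of the original entry point; the topic string is an
# arbitrary illustration and the fetch_* calls require network access):
if __name__ == "__main__":
    sample_topic = "Alan Turing"            # hypothetical example topic
    context = fetch_context(sample_topic)   # combine Wikipedia/Wikidata/search text
    print(context[:500])                    # preview the gathered context
    print(extract_keywords(context, top_k=6))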