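"""Context-gathering helpers: fetch topic summaries from Wikipedia and Wikidata and extract keywords."""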
import requests
from bs4 import BeautifulSoup
import re
WIKIPEDIA_SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{}"
WIKIDATA_SEARCH_URL = "https://www.wikidata.org/w/api.php"
_clean_re = re.compile(r'^[\W_]+|[\W_]+$') # strip leading/trailing punctuation
_token_re = re.compile(r"[A-Za-z0-9\-\']{2,}") # tokens of length >=2, allow hyphen/apostrophe
def _normalize_token(tok: str) -> str:
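    """Strip surrounding punctuation/underscores from a single token.

    Returns an empty string if nothing usable remains.
    """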
    if not tok:
        return ""
    t = tok.strip()
    # remove colon or trailing punctuation
    t = re.sub(r'[:;,\.\)\(\[\]\{\}]+$', '', t)
    t = _clean_re.sub("", t)
    return t

def fetch_wikipedia_summary(topic):
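    """Fetch the REST summary (extract + description) for a topic from English Wikipedia.

    Returns an empty string on any network or parsing error.
    """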
    try:
        resp = requests.get(WIKIPEDIA_SUMMARY_URL.format(requests.utils.quote(topic)), timeout=6)
        if resp.status_code == 200:
            j = resp.json()
            text = ""
            if "extract" in j and j["extract"]:
                text += j["extract"] + "\n"
            if "description" in j and j["description"]:
                text += f"{j['description']}\n"
            return text.strip()
    except Exception:
        pass
    return ""

def fetch_wikidata_description(topic):
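    """Search Wikidata entities matching the topic and return "label: description" lines.

    Returns an empty string on any network or parsing error.
    """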
    try:
        params = {
            "action": "wbsearchentities",
            "search": topic,
            "language": "en",
            "format": "json",
            "limit": 5
        }
        resp = requests.get(WIKIDATA_SEARCH_URL, params=params, timeout=6)
        if resp.status_code == 200:
            j = resp.json()
            descs = []
            for item in j.get("search", []):
                label = item.get("label", "")
                desc = item.get("description", "")
                if label or desc:
                    descs.append(f"{label}: {desc}")
            return "\n".join(descs)
    except Exception:
        pass
    return ""

def fetch_web_snippets(topic, limit=3):
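    """Scrape the first few paragraphs from a Wikipedia search-results page for the topic.

    `limit` caps the number of paragraphs used; returns an empty string on error.
    """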
    try:
        q = requests.utils.quote(topic)
        url = f"https://en.wikipedia.org/w/index.php?search={q}"
        resp = requests.get(url, timeout=6)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            paragraphs = soup.select("p")
            text = ""
            for p in paragraphs[:limit]:
                txt = p.get_text().strip()
                if txt:
                    text += txt + "\n"
            return text.strip()
    except Exception:
        pass
    return ""

def fetch_context(topic):
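    """Combine the Wikipedia summary, Wikidata descriptions, and scraped snippets into one context string."""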
    parts = []
    wiki = fetch_wikipedia_summary(topic)
    if wiki:
        parts.append(wiki)
    wd = fetch_wikidata_description(topic)
    if wd:
        parts.append(wd)
    web = fetch_web_snippets(topic)
    if web:
        parts.append(web)
    combined = "\n\n".join([p for p in parts if p])
    if not combined:
        combined = f"No reliable content found for {topic}."
    return combined

def extract_keywords(context, top_k=6):
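    """Return up to `top_k` keywords extracted from `context`.

    Uses spaCy NER when the `en_core_web_sm` model is available, otherwise a
    simple frequency count over cleaned tokens with a small stopword list.
    """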
    # try spaCy NER first, then fall back to frequency-based token extraction
    try:
        import spacy
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(context)
        ents = []
        seen_norm = set()
        for ent in doc.ents:
            norm = _normalize_token(ent.text)
            if not norm:
                continue
            key = norm.lower()
            if key in seen_norm:
                continue
            seen_norm.add(key)
            ents.append(ent.text.strip())
            if len(ents) >= top_k:
                break
        if ents:
            return ents[:top_k]
    except Exception:
        pass
    # fallback: frequency-based extraction with token cleaning
    tokens = _token_re.findall(context)
    freq = {}
    for t in tokens:
        cleaned = _normalize_token(t)
        if len(cleaned) < 2:
            continue
        key = cleaned.lower()
        freq[key] = freq.get(key, 0) + 1
    sorted_tokens = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    results = []
    seen = set()
    for tok, _ in sorted_tokens:
        if tok in seen:
            continue
        # skip common stopwords (small set)
        if tok in {"the", "and", "for", "with", "that", "from", "this", "have", "their", "which"}:
            continue
        seen.add(tok)
        results.append(tok)
        if len(results) >= top_k:
            break
    return results
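

# Minimal usage sketch (an illustrative addition, not part of the original module):
# it assumes network access to Wikipedia/Wikidata, and the spaCy path in
# extract_keywords is only taken if the "en_core_web_sm" model happens to be installed.
if __name__ == "__main__":
    topic = "Alan Turing"  # example topic, chosen arbitrarily for illustration
    context = fetch_context(topic)
    keywords = extract_keywords(context, top_k=6)
    print(context[:500])
    print("Keywords:", keywords)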