# quiz_logic/wikipedia_utils.py
import re

import requests
from bs4 import BeautifulSoup

WIKIPEDIA_SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{}"
WIKIDATA_SEARCH_URL = "https://www.wikidata.org/w/api.php"

_clean_re = re.compile(r"^[\W_]+|[\W_]+$")  # strip leading/trailing punctuation
_token_re = re.compile(r"[A-Za-z0-9\-\']{2,}")  # tokens of length >= 2; hyphen/apostrophe allowed
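
# Illustrative examples (not in the original file) of what _token_re matches:
#   _token_re.findall("state-of-the-art AI, 2024!")
#   -> ['state-of-the-art', 'AI', '2024']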


def _normalize_token(tok: str) -> str:
    """Strip surrounding punctuation from a token; return "" for empty input."""
    if not tok:
        return ""
    t = tok.strip()
    # drop trailing colons, brackets, and other punctuation first
    t = re.sub(r"[:;,\.\)\(\[\]\{\}]+$", "", t)
    # then strip any remaining leading/trailing non-word characters
    t = _clean_re.sub("", t)
    return t
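
# Behavior sketch: surrounding punctuation is stripped, interior characters kept.
#   _normalize_token("(hello):")          -> "hello"
#   _normalize_token("state-of-the-art,") -> "state-of-the-art"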


def fetch_wikipedia_summary(topic):
    """Fetch the plain-text summary for a topic from the Wikipedia REST API."""
    try:
        resp = requests.get(
            WIKIPEDIA_SUMMARY_URL.format(requests.utils.quote(topic)), timeout=6
        )
        if resp.status_code == 200:
            j = resp.json()
            text = ""
            if j.get("extract"):
                text += j["extract"] + "\n"
            if j.get("description"):
                text += f"{j['description']}\n"
            return text.strip()
    except Exception:
        pass
    return ""


def fetch_wikidata_description(topic):
    """Search Wikidata for a topic and return "label: description" lines."""
    try:
        params = {
            "action": "wbsearchentities",
            "search": topic,
            "language": "en",
            "format": "json",
            "limit": 5,
        }
        resp = requests.get(WIKIDATA_SEARCH_URL, params=params, timeout=6)
        if resp.status_code == 200:
            j = resp.json()
            descs = []
            for item in j.get("search", []):
                label = item.get("label", "")
                desc = item.get("description", "")
                if label or desc:
                    descs.append(f"{label}: {desc}")
            return "\n".join(descs)
    except Exception:
        pass
    return ""


def fetch_web_snippets(topic, limit=3):
    """Scrape up to `limit` paragraphs from a Wikipedia search for the topic."""
    try:
        q = requests.utils.quote(topic)
        # an exact-title match redirects straight to the article page
        url = f"https://en.wikipedia.org/w/index.php?search={q}"
        resp = requests.get(url, timeout=6)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            paragraphs = soup.select("p")
            text = ""
            for p in paragraphs[:limit]:
                txt = p.get_text().strip()
                if txt:
                    text += txt + "\n"
            return text.strip()
    except Exception:
        pass
    return ""


def fetch_context(topic):
    """Combine Wikipedia summary, Wikidata descriptions, and scraped snippets."""
    parts = []
    wiki = fetch_wikipedia_summary(topic)
    if wiki:
        parts.append(wiki)
    wd = fetch_wikidata_description(topic)
    if wd:
        parts.append(wd)
    web = fetch_web_snippets(topic)
    if web:
        parts.append(web)
    combined = "\n\n".join([p for p in parts if p])
    if not combined:
        combined = f"No reliable content found for {topic}."
    return combined
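
# Usage sketch (network-dependent):
#   context = fetch_context("Photosynthesis")
#   print(context[:200])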


def extract_keywords(context, top_k=6):
    """Extract up to `top_k` keywords via spaCy NER, with a frequency fallback."""
    # try spaCy NER first, then fall back to frequency-based token extraction
    try:
        # lazy import: spaCy and its model are optional dependencies
        import spacy

        nlp = spacy.load("en_core_web_sm")
        doc = nlp(context)
        ents = []
        seen_norm = set()
        for ent in doc.ents:
            norm = _normalize_token(ent.text)
            if not norm:
                continue
            key = norm.lower()
            if key in seen_norm:
                continue  # skip case-insensitive duplicates
            seen_norm.add(key)
            ents.append(ent.text.strip())
            if len(ents) >= top_k:
                break
        if ents:
            return ents[:top_k]
    except Exception:
        pass
    # fallback: frequency-based extraction with token cleaning
    tokens = _token_re.findall(context)
    freq = {}
    for t in tokens:
        cleaned = _normalize_token(t)
        if len(cleaned) < 2:
            continue
        key = cleaned.lower()
        freq[key] = freq.get(key, 0) + 1
    sorted_tokens = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    results = []
    seen = set()
    for tok, _ in sorted_tokens:
        if tok in seen:
            continue
        # skip common stopwords (small set)
        if tok in {"the", "and", "for", "with", "that", "from", "this", "have", "their", "which"}:
            continue
        seen.add(tok)
        results.append(tok)
        if len(results) >= top_k:
            break
    return results
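

# Minimal end-to-end sketch (assumes network access; spaCy is optional and the
# frequency fallback runs when en_core_web_sm is unavailable):
if __name__ == "__main__":
    topic = "Photosynthesis"
    ctx = fetch_context(topic)
    print(f"--- context for {topic} ---")
    print(ctx[:300])
    print("--- keywords ---")
    print(extract_keywords(ctx, top_k=6))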