# quiz_logic/wikipedia_utils.py
import re

import requests
from bs4 import BeautifulSoup

WIKIPEDIA_SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{}"
WIKIDATA_SEARCH_URL = "https://www.wikidata.org/w/api.php"

_clean_re = re.compile(r"^[\W_]+|[\W_]+$")  # strip leading/trailing punctuation
_token_re = re.compile(r"[A-Za-z0-9\-\']{2,}")  # tokens of length >= 2; hyphen/apostrophe allowed
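
# Illustrative examples (not in the original file) of what _token_re matches:
#   _token_re.findall("state-of-the-art AI, 2024!")
#   -> ['state-of-the-art', 'AI', '2024']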


def _normalize_token(tok: str) -> str:
    """Strip surrounding punctuation from a token; return "" for empty input."""
    if not tok:
        return ""
    t = tok.strip()
    # drop trailing colons, brackets, and other punctuation first
    t = re.sub(r"[:;,\.\)\(\[\]\{\}]+$", "", t)
    # then strip any remaining leading/trailing non-word characters
    t = _clean_re.sub("", t)
    return t
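
# Behavior sketch: surrounding punctuation is stripped, interior characters kept.
#   _normalize_token("(hello):")          -> "hello"
#   _normalize_token("state-of-the-art,") -> "state-of-the-art"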


def fetch_wikipedia_summary(topic):
    """Fetch the plain-text summary for a topic from the Wikipedia REST API."""
    try:
        resp = requests.get(
            WIKIPEDIA_SUMMARY_URL.format(requests.utils.quote(topic)), timeout=6
        )
        if resp.status_code == 200:
            j = resp.json()
            text = ""
            if j.get("extract"):
                text += j["extract"] + "\n"
            if j.get("description"):
                text += f"{j['description']}\n"
            return text.strip()
    except Exception:
        pass
    return ""


def fetch_wikidata_description(topic):
    """Search Wikidata for a topic and return "label: description" lines."""
    try:
        params = {
            "action": "wbsearchentities",
            "search": topic,
            "language": "en",
            "format": "json",
            "limit": 5,
        }
        resp = requests.get(WIKIDATA_SEARCH_URL, params=params, timeout=6)
        if resp.status_code == 200:
            j = resp.json()
            descs = []
            for item in j.get("search", []):
                label = item.get("label", "")
                desc = item.get("description", "")
                if label or desc:
                    descs.append(f"{label}: {desc}")
            return "\n".join(descs)
    except Exception:
        pass
    return ""


def fetch_web_snippets(topic, limit=3):
    """Scrape up to `limit` paragraphs from a Wikipedia search for the topic."""
    try:
        q = requests.utils.quote(topic)
        # an exact-title match redirects straight to the article page
        url = f"https://en.wikipedia.org/w/index.php?search={q}"
        resp = requests.get(url, timeout=6)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            paragraphs = soup.select("p")
            text = ""
            for p in paragraphs[:limit]:
                txt = p.get_text().strip()
                if txt:
                    text += txt + "\n"
            return text.strip()
    except Exception:
        pass
    return ""


def fetch_context(topic):
    """Combine Wikipedia summary, Wikidata descriptions, and scraped snippets."""
    parts = []
    wiki = fetch_wikipedia_summary(topic)
    if wiki:
        parts.append(wiki)
    wd = fetch_wikidata_description(topic)
    if wd:
        parts.append(wd)
    web = fetch_web_snippets(topic)
    if web:
        parts.append(web)
    combined = "\n\n".join([p for p in parts if p])
    if not combined:
        combined = f"No reliable content found for {topic}."
    return combined
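
# Usage sketch (network-dependent):
#   context = fetch_context("Photosynthesis")
#   print(context[:200])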


def extract_keywords(context, top_k=6):
    """Extract up to `top_k` keywords via spaCy NER, with a frequency fallback."""
    # try spaCy NER first, then fall back to frequency-based token extraction
    try:
        # lazy import: spaCy and its model are optional dependencies
        import spacy

        nlp = spacy.load("en_core_web_sm")
        doc = nlp(context)
        ents = []
        seen_norm = set()
        for ent in doc.ents:
            norm = _normalize_token(ent.text)
            if not norm:
                continue
            key = norm.lower()
            if key in seen_norm:
                continue  # skip case-insensitive duplicates
            seen_norm.add(key)
            ents.append(ent.text.strip())
            if len(ents) >= top_k:
                break
        if ents:
            return ents[:top_k]
    except Exception:
        pass
    # fallback: frequency-based extraction with token cleaning
    tokens = _token_re.findall(context)
    freq = {}
    for t in tokens:
        cleaned = _normalize_token(t)
        if len(cleaned) < 2:
            continue
        key = cleaned.lower()
        freq[key] = freq.get(key, 0) + 1
    sorted_tokens = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    results = []
    seen = set()
    for tok, _ in sorted_tokens:
        if tok in seen:
            continue
        # skip common stopwords (small set)
        if tok in {"the", "and", "for", "with", "that", "from", "this", "have", "their", "which"}:
            continue
        seen.add(tok)
        results.append(tok)
        if len(results) >= top_k:
            break
    return results
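

# Minimal end-to-end sketch (assumes network access; spaCy is optional and the
# frequency fallback runs when en_core_web_sm is unavailable):
if __name__ == "__main__":
    topic = "Photosynthesis"
    ctx = fetch_context(topic)
    print(f"--- context for {topic} ---")
    print(ctx[:300])
    print("--- keywords ---")
    print(extract_keywords(ctx, top_k=6))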