import re

import requests
from bs4 import BeautifulSoup

WIKIPEDIA_SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{}"
WIKIDATA_SEARCH_URL = "https://www.wikidata.org/w/api.php"

_clean_re = re.compile(r'^[\W_]+|[\W_]+$')  # strip leading/trailing punctuation
_token_re = re.compile(r"[A-Za-z0-9\-\']{2,}")  # tokens of length >= 2, allow hyphen/apostrophe


def _normalize_token(tok: str) -> str:
    if not tok:
        return ""
    t = tok.strip()
    # remove trailing punctuation and brackets
    t = re.sub(r'[:;,\.\)\(\[\]\{\}]+$', '', t)
    t = _clean_re.sub("", t)
    return t


def fetch_wikipedia_summary(topic):
    try:
        resp = requests.get(
            WIKIPEDIA_SUMMARY_URL.format(requests.utils.quote(topic)), timeout=6
        )
        if resp.status_code == 200:
            j = resp.json()
            text = ""
            if "extract" in j and j["extract"]:
                text += j["extract"] + "\n"
            if "description" in j and j["description"]:
                text += f"{j['description']}\n"
            return text.strip()
    except Exception:
        pass
    return ""


def fetch_wikidata_description(topic):
    try:
        params = {
            "action": "wbsearchentities",
            "search": topic,
            "language": "en",
            "format": "json",
            "limit": 5,
        }
        resp = requests.get(WIKIDATA_SEARCH_URL, params=params, timeout=6)
        if resp.status_code == 200:
            j = resp.json()
            descs = []
            for item in j.get("search", []):
                label = item.get("label", "")
                desc = item.get("description", "")
                if label or desc:
                    descs.append(f"{label}: {desc}")
            return "\n".join(descs)
    except Exception:
        pass
    return ""


def fetch_web_snippets(topic, limit=3):
    try:
        q = requests.utils.quote(topic)
        url = f"https://en.wikipedia.org/w/index.php?search={q}"
        resp = requests.get(url, timeout=6)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            paragraphs = soup.select("p")
            text = ""
            for p in paragraphs[:limit]:
                txt = p.get_text().strip()
                if txt:
                    text += txt + "\n"
            return text.strip()
    except Exception:
        pass
    return ""


def fetch_context(topic):
    parts = []
    wiki = fetch_wikipedia_summary(topic)
    if wiki:
        parts.append(wiki)
    wd = fetch_wikidata_description(topic)
    if wd:
        parts.append(wd)
    web = fetch_web_snippets(topic)
    if web:
        parts.append(web)
    combined = "\n\n".join([p for p in parts if p])
    if not combined:
        combined = f"No reliable content found for {topic}."
    return combined


def extract_keywords(context, top_k=6):
    # try spaCy NER first, then fall back to frequency-based token extraction
    try:
        import spacy

        nlp = spacy.load("en_core_web_sm")
        doc = nlp(context)
        ents = []
        seen_norm = set()
        for ent in doc.ents:
            norm = _normalize_token(ent.text)
            if not norm:
                continue
            key = norm.lower()
            if key in seen_norm:
                continue
            seen_norm.add(key)
            ents.append(ent.text.strip())
            if len(ents) >= top_k:
                break
        if ents:
            return ents[:top_k]
    except Exception:
        pass

    # fallback: frequency-based extraction with token cleaning
    tokens = _token_re.findall(context)
    freq = {}
    for t in tokens:
        cleaned = _normalize_token(t)
        if len(cleaned) < 2:
            continue
        key = cleaned.lower()
        freq[key] = freq.get(key, 0) + 1
    sorted_tokens = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    results = []
    seen = set()
    for tok, _ in sorted_tokens:
        if tok in seen:
            continue
        # skip common stopwords (small set)
        if tok in {"the", "and", "for", "with", "that", "from", "this", "have", "their", "which"}:
            continue
        seen.add(tok)
        results.append(tok)
        if len(results) >= top_k:
            break
    return results
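

if __name__ == "__main__":
    # Minimal usage sketch when the module is run directly. The topic string is
    # illustrative; network access (and, optionally, spaCy with the
    # en_core_web_sm model for the NER path) is assumed to be available.
    sample_topic = "Alan Turing"
    context = fetch_context(sample_topic)
    keywords = extract_keywords(context, top_k=6)
    print(f"Context length: {len(context)} characters")
    print("Keywords:", ", ".join(keywords))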