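"""Context fetching and keyword extraction helpers.

Pulls topic context from the Wikipedia REST summary endpoint, the Wikidata
search API, and a scrape of Wikipedia search results, then extracts keywords
with spaCy NER when available, falling back to frequency-based token counts.
"""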
import requests
from bs4 import BeautifulSoup
import re

WIKIPEDIA_SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{}"
WIKIDATA_SEARCH_URL = "https://www.wikidata.org/w/api.php"

_clean_re = re.compile(r'^[\W_]+|[\W_]+$')  # strip leading/trailing punctuation
_token_re = re.compile(r"[A-Za-z0-9\-\']{2,}")  # tokens of length >=2, allow hyphen/apostrophe

def _normalize_token(tok: str) -> str:
    if not tok:
        return ""
    t = tok.strip()
    # strip trailing punctuation (colons, commas, periods, brackets, etc.)
    t = re.sub(r'[:;,\.\)\(\[\]\{\}]+$', '', t)
    t = _clean_re.sub("", t)
    return t

def fetch_wikipedia_summary(topic):
    try:
        resp = requests.get(WIKIPEDIA_SUMMARY_URL.format(requests.utils.quote(topic)), timeout=6)
        if resp.status_code == 200:
            j = resp.json()
            text = ""
            if "extract" in j and j["extract"]:
                text += j["extract"] + "\n"
            if "description" in j and j["description"]:
                text += f"{j['description']}\n"
            return text.strip()
    except Exception:
        pass
    return ""

def fetch_wikidata_description(topic):
    try:
        params = {
            "action": "wbsearchentities",
            "search": topic,
            "language": "en",
            "format": "json",
            "limit": 5
        }
        resp = requests.get(WIKIDATA_SEARCH_URL, params=params, timeout=6)
        if resp.status_code == 200:
            j = resp.json()
            descs = []
            for item in j.get("search", []):
                label = item.get("label", "")
                desc = item.get("description", "")
                if label or desc:
                    descs.append(f"{label}: {desc}")
            return "\n".join(descs)
    except Exception:
        pass
    return ""

def fetch_web_snippets(topic, limit=3):
    try:
        q = requests.utils.quote(topic)
        url = f"https://en.wikipedia.org/w/index.php?search={q}"
        resp = requests.get(url, timeout=6)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            # Collect the first `limit` non-empty paragraphs; slicing the list
            # before filtering would waste the budget on empty <p> elements
            # that often lead Wikipedia pages.
            snippets = []
            for p in soup.select("p"):
                txt = p.get_text().strip()
                if txt:
                    snippets.append(txt)
                if len(snippets) >= limit:
                    break
            return "\n".join(snippets)
    except Exception:
        pass
    return ""

def fetch_context(topic):
    parts = []
    wiki = fetch_wikipedia_summary(topic)
    if wiki:
        parts.append(wiki)
    wd = fetch_wikidata_description(topic)
    if wd:
        parts.append(wd)
    web = fetch_web_snippets(topic)
    if web:
        parts.append(web)
    combined = "\n\n".join([p for p in parts if p])
    if not combined:
        combined = f"No reliable content found for {topic}."
    return combined

def extract_keywords(context, top_k=6):
    # try spaCy NER first, then fallback to frequency-based token extraction
    try:
        import spacy
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(context)
        ents = []
        seen_norm = set()
        for ent in doc.ents:
            norm = _normalize_token(ent.text)
            if not norm:
                continue
            key = norm.lower()
            if key in seen_norm:
                continue
            seen_norm.add(key)
            ents.append(ent.text.strip())
            if len(ents) >= top_k:
                break
        if ents:
            return ents[:top_k]
    except Exception:
        pass

    # fallback frequency-based extraction with token cleaning
    tokens = _token_re.findall(context)
    freq = {}
    for t in tokens:
        cleaned = _normalize_token(t)
        if len(cleaned) < 2:
            continue
        key = cleaned.lower()
        freq[key] = freq.get(key, 0) + 1
    sorted_tokens = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    results = []
    seen = set()
    for tok, _ in sorted_tokens:
        if tok in seen:
            continue
        # skip common stopwords (small set)
        if tok in {"the", "and", "for", "with", "that", "from", "this", "have", "their", "which"}:
            continue
        seen.add(tok)
        results.append(tok)
        if len(results) >= top_k:
            break
    return results
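

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the original module: fetches
    # context for a hypothetical example topic and prints the keywords the
    # module extracts from it. Assumes network access to Wikipedia/Wikidata;
    # spaCy is optional, and the frequency-based fallback is used without it.
    sample_topic = "Alan Turing"  # hypothetical example topic
    context = fetch_context(sample_topic)
    print("--- context preview ---")
    print(context[:500])
    print("--- keywords ---")
    print(extract_keywords(context, top_k=6))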