"""
Text analysis utilities for Reddit content insights.
Provides keyword extraction and similarity matching functions.
"""
import contextlib
from functools import lru_cache

import pandas as pd

# NOTE:
# Heavy NLP/ML libraries (spaCy, sentence-transformers, KeyBERT, torch, etc.) can take a
# long time to import or may not be available in constrained environments (e.g. the
# default HuggingFace Spaces CPU image). Importing them at module import time can cause
# the module to fail to initialise, which in turn leads to cryptic errors such as
# "cannot import name 'keywords_for_df'". To avoid this we lazily import the heavy
# dependencies the first time they are actually needed. The helper is cached so that
# subsequent calls are fast.


# -----------------------------------------------------------------------------
# Internal helpers
# -----------------------------------------------------------------------------
@lru_cache(maxsize=1)
def _load_models():
"""Lazily load and cache NLP models.
Returns
-------
tuple
(nlp, kw_model) where ``nlp`` is a spaCy language model and ``kw_model`` is a
KeyBERT instance. If the required libraries are not available the function
raises ImportError *inside* the helper so the caller can decide how to handle
the failure gracefully.
"""
import importlib
# ------------------------------------------------------------------
# Inform the user via Streamlit (if available) that heavy models are
# loading. We use a spinner that is shown only on the first call; the
# function is cached so subsequent calls skip the spinner entirely.
# ------------------------------------------------------------------
try:
import streamlit as st # noqa: WPS433 (late import)
spinner_cm = st.spinner(
"Initializing keyword-extraction models (first run may take ~1 min)…",
)
except ModuleNotFoundError:
# If Streamlit isn't present (e.g. unit tests) simply do nothing.
spinner_cm = contextlib.nullcontext()
with spinner_cm:
# Import spaCy and ensure the small English model is available
spacy = importlib.import_module("spacy")
try:
nlp = spacy.load("en_core_web_sm")
except OSError as exc:
# The model is missing. Do NOT attempt to install it at run-time
# because the app may run under a non-privileged user (e.g. Streamlit
# Cloud) and lack write permissions to the virtual-env. Instead we
# instruct the developer to add the model wheel to build-time
# dependencies so it gets installed by pip when the image is built.
raise RuntimeError(
"spaCy model 'en_core_web_sm' is not installed. "
"Add 'en-core-web-sm==3.8.0' (hyphen, not underscore) to "
"your requirements.txt so it is installed during deployment."
) from exc
# Sentence-Transformers and KeyBERT (which depends on it)
sent_trans = importlib.import_module("sentence_transformers")
SentenceTransformer = sent_trans.SentenceTransformer
KeyBERT = importlib.import_module("keybert").KeyBERT
embedder = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(embedder)
# Notify user that models are ready (only on first load)
try:
st.success("Keyword-extraction models ready!", icon="✅") # type: ignore[name-defined]
except Exception: # noqa: BLE001 (streamlit not available or other minor issue)
pass
return nlp, kw_model
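
# NOTE: because ``_load_models`` is wrapped in ``lru_cache``, an app can
# optionally pre-warm the models once at start-up so the first user request
# stays responsive. A minimal sketch (not used elsewhere in this module):
#
#     _load_models()  # first call shows the spinner and caches (nlp, kw_model)
#
# Every later call returns the cached tuple immediately.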


def keywords_for_df(df: pd.DataFrame, top_n: int = 5):
    """
    Extract keywords from a DataFrame containing Reddit posts.

    Args:
        df: DataFrame with a 'text' column containing post content
        top_n: Number of top keywords to return

    Returns:
        List of (keyword, score) tuples
    """
    if df.empty:
        return []

    # Attempt to load the heavy models. If this fails we degrade gracefully by
    # returning an empty list rather than crashing the whole application.
    try:
        nlp, kw_model = _load_models()
    except Exception as exc:  # noqa: BLE001 (broad, but we degrade gracefully)
        # Log the failure inside Streamlit if available; otherwise swallow silently.
        try:
            import streamlit as st  # noqa: WPS433

            st.warning(
                f"Keyword extraction disabled due to model loading error: {exc}",
                icon="⚠️",
            )
        except ModuleNotFoundError:
            pass
        return []
    # Join all text from the dataframe.
    raw = " ".join(df["text"].astype(str))

    # Process with spaCy to extract noun chunks and named entities.
    doc = nlp(raw.lower())

    # Combine noun chunks and relevant named entities.
    cand = " ".join(
        [c.text for c in doc.noun_chunks]
        + [e.text for e in doc.ents if e.label_ in {"PRODUCT", "EVENT", "ORG", "GPE"}]
    )

    # Quick stopword list to filter common terms.
    for ex in [
        "blog",
        "topic",
        "locked",
        "author",
        "moderator",
        "error",
        "bot",
        "comments",
        "archive",
        "support",
        "discord",
    ]:
        cand = cand.replace(ex, " ")

    # Use KeyBERT to extract keywords with diversity (MMR).
    return kw_model.extract_keywords(
        cand,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        use_mmr=True,
        diversity=0.8,
        top_n=top_n,
    )
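
# ---------------------------------------------------------------------------
# Usage sketch: a minimal, hedged example of calling ``keywords_for_df``.
# The sample posts below are hypothetical, and running this block requires the
# heavy dependencies (spaCy + en_core_web_sm, sentence-transformers, KeyBERT)
# to be installed; the first call loads the models and can take a while.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = pd.DataFrame(
        {
            "text": [
                "Looking for a budget mechanical keyboard for programming.",
                "Which mechanical keyboard switches are quietest for an office?",
            ]
        }
    )
    # Prints a list of (keyword, score) tuples, or [] if model loading failed.
    print(keywords_for_df(sample, top_n=3))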