Spaces:
Running
Running
""" | |
Text analysis utilities for Reddit content insights. | |
Provides keyword extraction and similarity matching functions. | |
""" | |
import pandas as pd | |
import contextlib | |
# NOTE: | |
# Heavy NLP/ML libraries (spaCy, sentence-transformers, KeyBERT, torch, etc.) can take a | |
# long time to import or may not be available in constrained environments (e.g. the | |
# default HuggingFace Spaces CPU image). Importing them at module import time can cause | |
# the module to fail to initialise which, in turn, leads to cryptic errors such as | |
# "cannot import name 'keywords_for_df'". To avoid this we lazily import the heavy | |
# dependencies the first time they are actually needed. The helper is cached so that | |
# subsequent calls are fast. | |
from functools import lru_cache | |
# ----------------------------------------------------------------------------- | |
# Internal helpers | |
# ----------------------------------------------------------------------------- | |
def _load_models(): | |
"""Lazily load and cache NLP models. | |
Returns | |
------- | |
tuple | |
(nlp, kw_model) where ``nlp`` is a spaCy language model and ``kw_model`` is a | |
KeyBERT instance. If the required libraries are not available the function | |
raises ImportError *inside* the helper so the caller can decide how to handle | |
the failure gracefully. | |
""" | |
import importlib | |
# ------------------------------------------------------------------ | |
# Inform the user via Streamlit (if available) that heavy models are | |
# loading. We use a spinner that is shown only on the first call; the | |
# function is cached so subsequent calls skip the spinner entirely. | |
# ------------------------------------------------------------------ | |
try: | |
import streamlit as st # noqa: WPS433 (late import) | |
spinner_cm = st.spinner( | |
"Initializing keyword-extraction models (first run may take ~1 min)…", | |
) | |
except ModuleNotFoundError: | |
# If Streamlit isn't present (e.g. unit tests) simply do nothing. | |
spinner_cm = contextlib.nullcontext() | |
with spinner_cm: | |
# Import spaCy and ensure the small English model is available | |
spacy = importlib.import_module("spacy") | |
try: | |
nlp = spacy.load("en_core_web_sm") | |
except OSError as exc: | |
# The model is missing. Do NOT attempt to install it at run-time | |
# because the app may run under a non-privileged user (e.g. Streamlit | |
# Cloud) and lack write permissions to the virtual-env. Instead we | |
# instruct the developer to add the model wheel to build-time | |
# dependencies so it gets installed by pip when the image is built. | |
raise RuntimeError( | |
"spaCy model 'en_core_web_sm' is not installed. " | |
"Add 'en-core-web-sm==3.8.0' (hyphen, not underscore) to " | |
"your requirements.txt so it is installed during deployment." | |
) from exc | |
# Sentence-Transformers and KeyBERT (which depends on it) | |
sent_trans = importlib.import_module("sentence_transformers") | |
SentenceTransformer = sent_trans.SentenceTransformer | |
KeyBERT = importlib.import_module("keybert").KeyBERT | |
embedder = SentenceTransformer("all-MiniLM-L6-v2") | |
kw_model = KeyBERT(embedder) | |
# Notify user that models are ready (only on first load) | |
try: | |
st.success("Keyword-extraction models ready!", icon="✅") # type: ignore[name-defined] | |
except Exception: # noqa: BLE001 (streamlit not available or other minor issue) | |
pass | |
return nlp, kw_model | |
def keywords_for_df(df: pd.DataFrame, top_n=5): | |
""" | |
Extract keywords from a DataFrame containing Reddit posts. | |
Args: | |
df: DataFrame with a 'text' column containing post content | |
top_n: Number of top keywords to return | |
Returns: | |
List of (keyword, score) tuples | |
""" | |
if df.empty: | |
return [] | |
# Attempt to load heavy models. If this fails we degrade gracefully by returning | |
# an empty list rather than crashing the whole application. | |
try: | |
nlp, kw_model = _load_models() | |
except Exception as exc: # noqa: BLE001 (broad, but we degrade gracefully) | |
# Log the failure inside Streamlit if available; otherwise swallow silently. | |
try: | |
import streamlit as st # noqa: WPS433 | |
st.warning( | |
f"Keyword extraction disabled due to model loading error: {exc}", | |
icon="⚠️", | |
) | |
except ModuleNotFoundError: | |
pass | |
return [] | |
# Join all text from the dataframe | |
raw = " ".join(df["text"].astype(str)) | |
# Process with spaCy to extract noun chunks and named entities | |
doc = nlp(raw.lower()) | |
# Combine noun chunks and relevant named entities | |
cand = " ".join( | |
[c.text for c in doc.noun_chunks] | |
+ [e.text for e in doc.ents if e.label_ in {"PRODUCT", "EVENT", "ORG", "GPE"}] | |
) | |
# Quick stopword list to filter common terms | |
for ex in [ | |
"blog", | |
"topic", | |
"locked", | |
"author", | |
"moderator", | |
"error", | |
"bot", | |
"comments", | |
"archive", | |
"support", | |
"discord", | |
]: | |
cand = cand.replace(ex, " ") | |
# Use KeyBERT to extract keywords with diversity | |
return kw_model.extract_keywords( | |
cand, | |
keyphrase_ngram_range=(1, 3), | |
stop_words="english", | |
use_mmr=True, | |
diversity=0.8, | |
top_n=top_n, | |
) | |