"""
Text analysis utilities for Reddit content insights.
Provides keyword extraction and similarity matching functions.
"""
import contextlib
from functools import lru_cache

import pandas as pd

# NOTE:
# Heavy NLP/ML libraries (spaCy, sentence-transformers, KeyBERT, torch, etc.) can take a
# long time to import or may not be available in constrained environments (e.g. the
# default HuggingFace Spaces CPU image).  Importing them at module import time can cause
# the module to fail to initialise which, in turn, leads to cryptic errors such as
# "cannot import name 'keywords_for_df'".  To avoid this we lazily import the heavy
# dependencies the first time they are actually needed.  The helper is cached so that
# subsequent calls are fast.


# -----------------------------------------------------------------------------
# Internal helpers
# -----------------------------------------------------------------------------


@lru_cache(maxsize=1)
def _load_models():
    """Lazily load and cache NLP models.

    Returns
    -------
    tuple
        (nlp, kw_model) where ``nlp`` is a spaCy language model and ``kw_model`` is a
        KeyBERT instance.  If the required libraries are not available the function
        raises ImportError *inside* the helper so the caller can decide how to handle
        the failure gracefully.
    """

    import importlib

    # ------------------------------------------------------------------
    # Inform the user via Streamlit (if available) that heavy models are
    # loading.  We use a spinner that is shown only on the first call; the
    # function is cached so subsequent calls skip the spinner entirely.
    # ------------------------------------------------------------------

    try:
        import streamlit as st  # noqa: WPS433 (late import)

        spinner_cm = st.spinner(
            "Initializing keyword-extraction models (first run may take ~1 min)…",
        )
    except ModuleNotFoundError:
        # If Streamlit isn't present (e.g. unit tests) simply do nothing.
        spinner_cm = contextlib.nullcontext()

    with spinner_cm:
        # Import spaCy and ensure the small English model is available
        spacy = importlib.import_module("spacy")

        try:
            nlp = spacy.load("en_core_web_sm")
        except OSError as exc:
            # The model is missing.  Do NOT attempt to install it at run-time
            # because the app may run under a non-privileged user (e.g. Streamlit
            # Cloud) and lack write permissions to the virtual-env.  Instead we
            # instruct the developer to add the model wheel to build-time
            # dependencies so it gets installed by pip when the image is built.
            raise RuntimeError(
                "spaCy model 'en_core_web_sm' is not installed. "
                "Add 'en-core-web-sm==3.8.0' (hyphen, not underscore) to "
                "your requirements.txt so it is installed during deployment."
            ) from exc

        # Sentence-Transformers and KeyBERT (which depends on it)
        sent_trans = importlib.import_module("sentence_transformers")
        SentenceTransformer = sent_trans.SentenceTransformer

        KeyBERT = importlib.import_module("keybert").KeyBERT

        embedder = SentenceTransformer("all-MiniLM-L6-v2")
        kw_model = KeyBERT(embedder)

    # Notify user that models are ready (only on first load)
    try:
        st.success("Keyword-extraction models ready!", icon="✅")  # type: ignore[name-defined]
    except Exception:  # noqa: BLE001 (streamlit not available or other minor issue)
        pass

    return nlp, kw_model
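

# ---------------------------------------------------------------------------
# Illustrative warm-up sketch (not part of the original module): because
# ``_load_models`` is wrapped in ``lru_cache(maxsize=1)``, calling it once at
# application start-up means every later call returns the cached models
# instantly.  ``warm_up_models`` is a hypothetical helper name.
# ---------------------------------------------------------------------------
def warm_up_models() -> bool:
    """Best-effort eager load of the NLP models; returns True on success."""
    try:
        _load_models()
    except Exception:  # noqa: BLE001 - warm-up is purely best effort
        return False
    return True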


def keywords_for_df(df: pd.DataFrame, top_n: int = 5):
    """
    Extract keywords from a DataFrame containing Reddit posts.
    
    Args:
        df: DataFrame with a 'text' column containing post content
        top_n: Number of top keywords to return
        
    Returns:
        List of (keyword, score) tuples
    """
    if df.empty:
        return []

    # Attempt to load heavy models.  If this fails we degrade gracefully by returning
    # an empty list rather than crashing the whole application.
    try:
        nlp, kw_model = _load_models()
    except Exception as exc:  # noqa: BLE001 (broad, but we degrade gracefully)
        # Log the failure inside Streamlit if available; otherwise swallow silently.
        try:
            import streamlit as st  # noqa: WPS433

            st.warning(
                f"Keyword extraction disabled due to model loading error: {exc}",
                icon="⚠️",
            )
        except ModuleNotFoundError:
            pass

        return []

    # Join all text from the dataframe
    raw = " ".join(df["text"].astype(str))

    # Process with spaCy to extract noun chunks and named entities
    doc = nlp(raw.lower())

    # Combine noun chunks and relevant named entities
    cand = " ".join(
        [c.text for c in doc.noun_chunks]
        + [e.text for e in doc.ents if e.label_ in {"PRODUCT", "EVENT", "ORG", "GPE"}]
    )

    # Strip common Reddit boilerplate terms so they do not dominate the candidates
    for ex in [
        "blog",
        "topic",
        "locked",
        "author",
        "moderator",
        "error",
        "bot",
        "comments",
        "archive",
        "support",
        "discord",
    ]:
        cand = cand.replace(ex, " ")

    # Use KeyBERT to extract keywords with diversity
    return kw_model.extract_keywords(
        cand,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        use_mmr=True,
        diversity=0.8,
        top_n=top_n,
    )
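

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module).  The DataFrame
# below is made-up sample data; the only requirement the function imposes is a
# 'text' column, as documented above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = pd.DataFrame(
        {
            "text": [
                "The new budget GPU handles 1440p gaming surprisingly well.",
                "Has anyone compared driver stability between the latest releases?",
            ]
        }
    )
    for keyword, score in keywords_for_df(sample, top_n=3):
        print(f"{keyword}: {score:.3f}")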