Spaces:

hblim
/

reddit_sentiment_tracker

Running

File size: 18,656 Bytes

import streamlit as st
import pandas as pd
import numpy as np
import altair as alt
import yaml
from pathlib import Path

# Page configuration (browser-tab title, wide layout) is applied here, ahead of
# the local-module imports below, because those modules may issue Streamlit
# commands of their own when they are imported.
st.set_page_config(page_title="Reddit Sentiment Trends", layout="wide")

# Import from local modules AFTER page config is set
from data_utils import (
    load_summary,
    load_day,
    get_subreddit_colors,
    get_last_updated_hf_caption,
)
from text_analysis import keywords_for_df


# Page heading shown at the top of the app.
st.title("Reddit Sentiment Monitor")

# Short Markdown introduction; it links to the public dataset that the text
# says the daily pipeline writes its results to.
st.markdown(
    """
    **Welcome!** This page shows how Reddit's AI communities feel day-to-day.

    A daily pipeline grabs new posts and comments, scores their tone with a sentiment model, and saves the results to a public HuggingFace [dataset](https://huggingface.co/datasets/hblim/top_reddit_posts_daily). \n

    """
)


# ── Load & transform data ────────────────────────────────────────────────────
# Summary table; this script reads its "subreddit", "date",
# "community_weighted_sentiment" and "count" columns.
df = load_summary()
# Fetched here, but only referenced by the commented-out attribution line at
# the very bottom of the script.
last_update_caption = get_last_updated_hf_caption()

# Everything below derives date bounds and subreddit options from the summary,
# so an empty table cannot be rendered: show a message instead of a traceback.
if df.empty:
    st.warning("No sentiment data is available yet.")
    st.stop()

# Get colors for each subreddit
subreddits = df["subreddit"].unique()
subreddit_colors = get_subreddit_colors(subreddits)

# Load mean/std parameters for sentiment spike bands per subreddit
params_path = Path(__file__).resolve().parent.parent / "spike_params.yaml"
try:
    with params_path.open("r", encoding="utf-8") as f:
        spike_params = yaml.safe_load(f)
except FileNotFoundError:
    # The band is optional: without the file the chart is drawn without it.
    spike_params = {}
# safe_load() yields None for an empty document (and a list/scalar for
# unexpected content); normalise so later `in` / `[]` lookups cannot fail.
if not isinstance(spike_params, dict):
    spike_params = {}

# Define time format to use across all charts
time_format = "%m/%d/%Y"

# Get date range from the dataset for the form
min_date = df["date"].min().date()
max_date = df["date"].max().date()

# ── Community weighted sentiment line chart for all subreddits ───────────────
# Section heading plus a Markdown explanation of how the plotted metric is
# defined (per-item score of -1/+1, weighted by upvotes).
st.subheader("Daily Community-Weighted Sentiment")
st.markdown(
    """
    The line chart below plots the daily *community-weighted sentiment*, reflecting the average sentiment across all posts/comments in a subreddit community.
    
    To calculate the community-weighted sentiment:
    - First, each post or comment is assigned a sentiment score of −1 (negative) or +1 (positive) 
    - Then, the sentiment score is weighted by its upvotes so busier discussions matter more. 
    """
)
# Add date range selector for the time series
date_range = st.date_input(
    "Select date range for time series",
    (min_date, max_date),
    min_value=min_date,
    max_value=max_date
)
# In range mode the widget returns a tuple that holds only the start date while
# the user is still choosing the end date (and can be empty when cleared), so
# a blind two-way unpack raises ValueError in the middle of a selection.
if isinstance(date_range, (tuple, list)):
    picked_dates = list(date_range)
else:
    picked_dates = [date_range]

if len(picked_dates) >= 2:
    start_date, end_date = picked_dates[0], picked_dates[1]
elif len(picked_dates) == 1:
    # Only the start has been chosen so far: show everything from there on.
    start_date, end_date = picked_dates[0], max_date
else:
    start_date, end_date = min_date, max_date

# Compare on calendar dates (the widget yields datetime.date objects).
row_dates = df["date"].dt.date
filtered_df = df[(row_dates >= start_date) & (row_dates <= end_date)]

# Add a dropdown (selectbox) for choosing a single subreddit to display;
# "artificial" is preselected when present, otherwise the first subreddit.
subreddit_options = list(subreddits)
default_sub = "artificial" if "artificial" in subreddit_options else subreddit_options[0]
selected_subreddit = st.selectbox(
    "Select subreddit",
    options=subreddit_options,
    index=subreddit_options.index(default_sub)
)
plot_df = filtered_df[filtered_df["subreddit"] == selected_subreddit]

# ── Determine shading band and dynamic y-axis domain ────────────────────────
# Per-subreddit "mean"/"std" come from the spike-parameter mapping; anything
# that is not a mapping (missing entry, None, malformed YAML) means "no band".
mean_val = std_val = None
sub_params = spike_params.get(selected_subreddit) if isinstance(spike_params, dict) else None
if isinstance(sub_params, dict):
    mean_val = sub_params.get("mean")
    std_val = sub_params.get("std")

# Calculate band limits (mean ± 3·std) if parameters exist
band_low = band_high = None
if mean_val is not None and std_val is not None:
    band_low = mean_val - 3 * std_val
    band_high = mean_val + 3 * std_val

# Determine y-axis domain based on data and (optional) band
sent_min = plot_df["community_weighted_sentiment"].min()
sent_max = plot_df["community_weighted_sentiment"].max()

# min()/max() are NaN when the selection has no rows, and NaN would poison
# the axis domain, so only finite candidates take part in the comparison.
low_candidates = [float(v) for v in (sent_min, band_low) if v is not None and not pd.isna(v)]
high_candidates = [float(v) for v in (sent_max, band_high) if v is not None and not pd.isna(v)]
# With nothing to plot, fall back to [-1, 1] — the bounds of the per-item
# sentiment scores described in the section text.
y_min = min(low_candidates) if low_candidates else -1.0
y_max = max(high_candidates) if high_candidates else 1.0

# Add small padding so points are not flush with edges
padding = 0.05
y_domain = [y_min - padding, y_max + padding]

# Define hover selection for nearest point.
# selection_point()/add_params() are the Altair 5 replacements for the
# deprecated selection_single()/add_selection(); the bar chart further down
# already uses the xOffset channel, so Altair 5 is presumably what this app
# runs on. empty=False keeps the hover layers hidden until something is
# hovered (the former empty="none").
nearest = alt.selection_point(
    name="nearest",
    on="mouseover",
    nearest=True,
    fields=["date"],
    empty=False,
)

# Base chart with refreshed y-axis range; every hover/line layer derives from
# it so they share the same x/y encodings.
base = alt.Chart(plot_df).encode(
    x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format, labelPadding=15)),
    y=alt.Y(
        "community_weighted_sentiment:Q",
        title="Community Weighted Sentiment",
        scale=alt.Scale(domain=y_domain),
    ),
)
# Use a constant blue colour for all plot elements
line_colour = "#1f77b4"

# Series labels and their colours for the colour scale of the line layer.
line_legend_labels = [
    "daily community sentiment score",
    "historical 3σ sentiment range",
    "significant sentiment outlier",
]
line_legend_colours = [line_colour, line_colour, "red"]

# Draw line for the selected subreddit
line = (
    base.transform_calculate(legend='"daily community sentiment score"')
    .mark_line(color=line_colour)
    .encode(
        color=alt.Color(
            "legend:N",
            scale=alt.Scale(domain=line_legend_labels, range=line_legend_colours),
            legend=None  # hide default legend; a manual legend is rendered separately
        )
    )
)

# Invisible selectors to capture hover events
selectors = base.mark_point(opacity=0).add_params(nearest)

# Draw highlighted points on hover
points_hover = base.mark_point(size=60, color=line_colour).encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)

# Tooltip rule and popup, shown only for the hovered date
tooltips = base.mark_rule(color="gray").encode(
    tooltip=[
        alt.Tooltip("subreddit:N", title="Subreddit"),
        alt.Tooltip("date:T", title="Date", format=time_format),
        alt.Tooltip("community_weighted_sentiment:Q", title="Sentiment", format=".2f")
    ]
).transform_filter(nearest)

# Optional shaded band (mean ± 3σ) and the outlier markers that depend on it.
domain_labels = [
    "daily community sentiment score",
    "historical 3σ sentiment range",
    "significant sentiment outlier",
]
domain_colors = [line_colour, line_colour, "red"]


def _series_colour():
    """Build the colour channel used by the band and outlier layers.

    The built-in legend is suppressed on both layers.
    """
    return alt.Color(
        "legend:N",
        scale=alt.Scale(domain=domain_labels, range=domain_colors),
        legend=None,
    )


# Both layers stay None when no band parameters exist for the subreddit.
band = outliers = None

if band_low is not None:
    # A two-point frame is enough to stretch a flat band across the plotted dates.
    _band_start, _band_end = plot_df["date"].min(), plot_df["date"].max()
    band_df = pd.DataFrame({
        "date": [_band_start, _band_end],
        "low": [band_low] * 2,
        "high": [band_high] * 2,
    })
    _band_area = (
        alt.Chart(band_df)
        .transform_calculate(legend='"historical 3σ sentiment range"')
        .mark_area(opacity=0.15)
    )
    band = _band_area.encode(
        x="date:T",
        y=alt.Y("low:Q", scale=alt.Scale(domain=y_domain)),
        y2="high:Q",
        color=_series_colour(),
    )

    # Identify significant outliers: days strictly below or above the band.
    _daily_sentiment = plot_df["community_weighted_sentiment"]
    _outside_band = _daily_sentiment.lt(band_low) | _daily_sentiment.gt(band_high)
    outlier_df = plot_df[_outside_band].copy()
    if len(outlier_df) > 0:
        _outlier_marks = (
            alt.Chart(outlier_df)
            .transform_calculate(legend='"significant sentiment outlier"')
            .mark_point(shape="circle", size=100, fill="white", stroke="red", strokeWidth=2)
        )
        outliers = _outlier_marks.encode(
            x="date:T",
            y="community_weighted_sentiment:Q",
            color=_series_colour(),
        )

# Stack the layers bottom-to-top: optional band first (so it sits behind the
# line), then the line and hover helpers, then optional outlier markers on top.
layers = [
    *([band] if band is not None else []),
    line,
    selectors,
    points_hover,
    tooltips,
    *([outliers] if outliers is not None else []),
]

# Taller plot area; pan/zoom is enabled on the x-axis only (bind_y=False).
hover_chart = (
    alt.layer(*layers)
    .properties(height=400)
    .interactive(bind_y=False)
)

# ── Manual legend (two rows) ───────────────────────────────────────────────
# One record per legend row: outline colour, interior fill and marker shape.
legend_df = pd.DataFrame(
    [
        {
            "row": 0,
            "label": "significant sentiment outlier",
            "stroke": "red",
            "fill": "white",
            "shape": "circle",
        },
        {
            "row": 1,
            "label": "historical 3σ sentiment range",
            "stroke": "lightblue",
            "fill": "lightblue",
            "shape": "square",
        },
    ]
)

# Marker glyphs; stroke/fill bypass scales so the literal colour names are used.
_legend_marks = alt.Chart(legend_df).mark_point(size=100, filled=True)
legend_points = _legend_marks.encode(
    y=alt.Y("row:O", axis=None),
    x=alt.value(0),
    shape=alt.Shape("shape:N", legend=None),
    stroke=alt.Stroke("stroke:N", scale=None, legend=None),
    fill=alt.Fill("fill:N", scale=None, legend=None),
)

# Text labels placed to the right of each glyph.
_legend_labels = alt.Chart(legend_df).mark_text(
    align="left", baseline="middle", dx=15, color="black"
)
legend_text = _legend_labels.encode(
    y="row:O",
    x=alt.value(0),
    text="label:N",
)

manual_legend = alt.layer(legend_points, legend_text).properties(
    height=50,
    background="white",
    width=170,
)

# The legend is rendered as its own small chart directly above the main plot
# (rather than being concatenated into a single Altair chart).
st.altair_chart(manual_legend, use_container_width=False)
st.altair_chart(hover_chart, use_container_width=True)

# ── Bar chart for post counts by subreddit (side-by-side) ────────────────────
st.subheader("Daily Post Counts by Subreddit")

# Create grouped bar chart for post counts by date and subreddit. It is built
# from the full summary table, not the date-filtered frame used by the line chart.
bar_chart = alt.Chart(df).mark_bar().encode(
    x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format)),
    y=alt.Y("count:Q", title="Post Count"),
    xOffset="subreddit:N",  # This creates the side-by-side grouping
    color=alt.Color(
        "subreddit:N",
        # NOTE(review): pairs subreddits with colours by position, which
        # assumes subreddit_colors iterates in the same order as `subreddits`
        # — confirm against get_subreddit_colors.
        scale=alt.Scale(domain=list(subreddits), range=list(subreddit_colors.values())),
        legend=alt.Legend(title="Subreddit")
    ),
    # Titled, typed tooltips with the shared date format, matching the
    # tooltips of the sentiment line chart.
    tooltip=[
        alt.Tooltip("subreddit:N", title="Subreddit"),
        alt.Tooltip("date:T", title="Date", format=time_format),
        alt.Tooltip("count:Q", title="Post Count"),
    ],
).properties(height=400).interactive()

st.altair_chart(bar_chart, use_container_width=True)

# ── Latest metrics for each subreddit ─────────────────────────────────────────
st.subheader("Latest Metrics")

# One row per subreddit, taken from the chronologically sorted summary.
# NOTE(review): GroupBy.last() picks the last non-null value per column, so a
# missing value on the newest day would be filled from an earlier day — confirm
# that is acceptable.
latest_by_subreddit = (
    df.sort_values("date")
    .groupby("subreddit")
    .last()
    .reset_index()
)

# One column per subreddit: name, latest weighted sentiment, latest post count.
cols = st.columns(len(latest_by_subreddit))
for column, (_, row) in zip(cols, latest_by_subreddit.iterrows()):
    with column:
        st.markdown(f"**{row['subreddit']}**")
        st.metric("Community Weighted", f"{row['community_weighted_sentiment']:.2f}")
        st.metric("Posts", int(row["count"]))

# ── Analyze sentiment driving posts ─────────────────────────────────────
st.header("Analyze sentiment driving posts")

# Both inputs live in a form so the drill-down runs only when it is submitted.
# Note: this rebinds `selected_subreddit`, previously set by the chart selector.
with st.form("analysis_form"):
    subreddit_col, date_col = st.columns(2)
    selected_subreddit = subreddit_col.selectbox("Select Subreddit", options=subreddits)
    selected_date = date_col.date_input(
        "Select Date",
        value=max_date,
        min_value=min_date,
        max_value=max_date,
    )
    submit_button = st.form_submit_button("Analyze Posts")

def _numeric_scores(scores):
    """Return *scores* coerced to numbers; unparseable or missing values become 0."""
    return pd.to_numeric(scores, errors="coerce").fillna(0)


def _engagement_weights(frame, gamma_post):
    """Return one weight per row of *frame*.

    The weight is ``1 + log1p(max(score, 0))``; rows whose "type" is "post"
    are additionally multiplied by *gamma_post*.
    """
    base = 1 + np.log1p(_numeric_scores(frame["score"]).clip(lower=0))
    return base * np.where(frame["type"] == "post", gamma_post, 1.0)


def _weighted_sentiment(frame, weights):
    """Return the weighted mean of ``frame["sentiment"]`` rescaled with ``2*x - 1``.

    A non-positive weight total uses a mean of 0 before rescaling.
    """
    total = weights.sum()
    mean = (weights * frame["sentiment"]).sum() / total if total > 0 else 0
    return 2 * mean - 1


def _item_weight(score, is_post, gamma_post):
    """Return the weight of a single post/comment from its raw *score*.

    Non-numeric or negative scores count as 0; posts are scaled by *gamma_post*.
    """
    value = pd.to_numeric(score, errors="coerce")
    value = value if (not np.isnan(value) and value >= 0) else 0
    weight = 1 + np.log1p(value)
    return weight * gamma_post if is_post else weight


def _as_text(value):
    """Return *value* as ``str``; missing values (None/NaN) become an empty string."""
    return "" if pd.isna(value) else str(value)


if submit_button:
    date_str = selected_date.strftime("%Y-%m-%d")
    with st.spinner(f"Loading data for r/{selected_subreddit} on {date_str}..."):
        posts_df = load_day(date_str, selected_subreddit)

    if posts_df.empty:
        st.error(f"No posts found for r/{selected_subreddit} on {date_str}")
    else:
        # Separate posts and comments
        posts = posts_df[posts_df["type"] == "post"]
        comments = posts_df[posts_df["type"] == "comment"]

        # Down-weighting factor applied to posts relative to comments.
        gamma_post = 0.3

        # Overall summary metrics using engagement-adjusted sentiment (EAS)
        n_posts = len(posts)
        weights_day = _engagement_weights(posts_df, gamma_post)
        total_weight_day = weights_day.sum()
        # Rescaled with 2*x - 1 (maps a 0..1 mean onto -1..1).
        overall_eas = _weighted_sentiment(posts_df, weights_day)
        # Sum the coerced scores so a non-numeric score cannot break the total.
        overall_score = _numeric_scores(posts_df["score"]).sum()

        st.subheader(f"r/{selected_subreddit} on {date_str}")
        c1, c2, c3 = st.columns(3)
        c1.metric("Posts", n_posts)
        c2.metric("Daily Weighted Sentiment, All Posts", f"{overall_eas:.2f}")
        c3.metric("Total Score, All Posts", f"{overall_score:,}")

        if posts.empty:
            # Only comments were loaded: there are no threads to rank, and
            # sorting an empty, column-less analysis table would raise KeyError.
            st.info(f"No posts found for r/{selected_subreddit} on {date_str}; only comments are available.")
        else:
            # Wrap analysis and rendering of top posts in a spinner
            with st.spinner("Analyzing sentiment and rendering top posts..."):
                # Index the comments by parent once instead of re-filtering the
                # whole comment table for every post.
                comments_by_parent = {
                    parent: group for parent, group in comments.groupby("parent_id", sort=False)
                }
                no_comments = comments.iloc[0:0]

                # Build per-post analysis
                analysis_rows = []
                post_by_id = {}
                for _, post in posts.iterrows():
                    pid = post["post_id"]
                    # Keep the first row seen for each id for the detail views below.
                    post_by_id.setdefault(pid, post)
                    # Direct replies carry the post id prefixed with "t3_" as parent_id.
                    post_comments = comments_by_parent.get(f"t3_{pid}", no_comments)

                    # Combine post and comments for calculations
                    segment = pd.concat([pd.DataFrame([post]), post_comments], ignore_index=True)
                    weights_seg = _engagement_weights(segment, gamma_post)
                    ws = _weighted_sentiment(segment, weights_seg)

                    # Share of the day's total weight carried by this thread.
                    thread_weight_sum = weights_seg.sum()
                    contrib_weight = thread_weight_sum / total_weight_day if total_weight_day > 0 else 0

                    analysis_rows.append({
                        "post_id": pid,
                        "Post Keywords": "",  # placeholder; computed for top posts only
                        "Weighted Sentiment of Thread": ws,
                        "Contribution Weight": contrib_weight,
                        "Total Sentiment Contribution": contrib_weight * ws,
                        "# Comments": len(post_comments),
                        "Total Score": _numeric_scores(segment["score"]).sum(),
                    })

                analysis_df = pd.DataFrame(analysis_rows)
                # Determine top 5 posts by contribution weight
                top5 = (
                    analysis_df.sort_values("Contribution Weight", ascending=False)
                    .head(5)
                    .reset_index(drop=True)
                )

                # Compute keywords only for top posts
                for idx, row in top5.iterrows():
                    post_text = _as_text(post_by_id[row["post_id"]]["text"])
                    kw = keywords_for_df(pd.DataFrame({"text": [post_text]}), top_n=2)
                    keywords_list = [k for k, _ in kw][:2]
                    top5.at[idx, "Post Keywords"] = ", ".join(keywords_list)

                # Format numeric columns for display
                top5["Weighted Sentiment of Thread"] = top5["Weighted Sentiment of Thread"].map("{:.2f}".format)
                top5["Total Score"] = top5["Total Score"].map("{:,}".format)
                top5["Contribution Weight"] = top5["Contribution Weight"].map("{:.2%}".format)
                top5["Total Sentiment Contribution"] = top5["Total Sentiment Contribution"].map("{:.4f}".format)

                st.subheader("Top 5 Posts by Contribution Weight")
                st.dataframe(
                    top5[["Post Keywords", "Weighted Sentiment of Thread", "Contribution Weight", "Total Sentiment Contribution", "# Comments", "Total Score"]],
                    use_container_width=True
                )

                st.subheader("Post Details")
                # idx matches the row index displayed in the table above.
                for idx, row in top5.iterrows():
                    pid = row["post_id"]
                    post_obj = post_by_id[pid]
                    post_text = _as_text(post_obj["text"])
                    first_line = post_text.split("\n")[0][:50]
                    with st.expander(f"{idx} - {first_line}..."):
                        # Post metrics; sentiment rescaled with 2*x - 1
                        post_sent_norm = 2 * post_obj["sentiment"] - 1
                        post_score = post_obj["score"]
                        post_weight = _item_weight(post_score, is_post=True, gamma_post=gamma_post)
                        st.markdown("**Post:**")
                        st.markdown(
                            f"{post_text[:300]}{'...' if len(post_text) > 300 else ''} "
                            f"(Sentiment: {post_sent_norm:.2f}, Weight: {post_weight:.2f}, Score: {post_score:,})"
                        )
                        st.markdown("---")
                        # Display top 5 comments (by score) with metrics
                        top_comments = (
                            comments_by_parent.get(f"t3_{pid}", no_comments)
                            .sort_values("score", ascending=False)
                            .head(5)
                        )
                        st.markdown("**Top Comments:**")
                        # Number comments 1..5 by rank; the DataFrame index label
                        # is an arbitrary row id, not a position.
                        for rank, (_, comment) in enumerate(top_comments.iterrows(), start=1):
                            c_text = _as_text(comment["text"])
                            c_sent_norm = 2 * comment["sentiment"] - 1
                            c_score = comment["score"]
                            c_weight = _item_weight(c_score, is_post=False, gamma_post=gamma_post)
                            st.markdown(
                                f"{rank}. {c_text[:200]}{'...' if len(c_text) > 200 else ''} "
                                f"(Sentiment: {c_sent_norm:.2f}, Weight: {c_weight:.2f}, Score: {c_score:,})"
                            )

# Display the data source attribution
# st.markdown(last_update_caption, unsafe_allow_html=True)