import streamlit as st
import pandas as pd
import numpy as np
import altair as alt
import yaml
from pathlib import Path
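# NOTE: `yaml` is provided by the PyYAML package; it (along with streamlit,
# pandas, numpy, and altair) needs to be available in the app's environment.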
# Call page config BEFORE importing modules that use Streamlit commands
st.set_page_config(page_title="Reddit Sentiment Trends", layout="wide")

# Import from local modules AFTER page config is set
from data_utils import (
    load_summary,
    load_day,
    get_subreddit_colors,
    get_last_updated_hf_caption,
)
from text_analysis import keywords_for_df

st.title("Reddit Sentiment Monitor")
st.markdown(
    """
    **Welcome!** This page shows how Reddit's AI communities feel day-to-day.
    A daily pipeline grabs new posts and comments, scores their tone with a sentiment model, and saves the results to a public Hugging Face [dataset](https://huggingface.co/datasets/hblim/top_reddit_posts_daily).
    """
)
# ── Load & transform data ────────────────────────────────────────────────────
df = load_summary()
last_update_caption = get_last_updated_hf_caption()

# Get colors for each subreddit
subreddits = df["subreddit"].unique()
subreddit_colors = get_subreddit_colors(subreddits)

# Load mean/std parameters for sentiment spike bands per subreddit
params_path = Path(__file__).resolve().parent.parent / "spike_params.yaml"
try:
    with params_path.open("r") as f:
        # `or {}` guards against an empty YAML file, which safe_load returns as None
        spike_params = yaml.safe_load(f) or {}
except FileNotFoundError:
    spike_params = {}
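# Expected spike_params.yaml layout: one entry per subreddit holding that
# community's historical sentiment mean/std (the values below are illustrative):
#   artificial:
#     mean: 0.12
#     std: 0.05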
# Define time format to use across all charts
time_format = "%m/%d/%Y"

# Get date range from the dataset for the form
min_date = df["date"].min().date()
max_date = df["date"].max().date()
# ── Community-weighted sentiment line chart for all subreddits ───────────────
st.subheader("Daily Community-Weighted Sentiment")
st.markdown(
    """
    The line chart below plots the daily *community-weighted sentiment*, reflecting the average sentiment across all posts/comments in a subreddit community.
    To calculate the community-weighted sentiment:
    - First, each post or comment is assigned a sentiment score of −1 (negative) or +1 (positive).
    - Then, each score is weighted by its upvotes, so busier discussions matter more.
    """
)
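# In formula form (a sketch; the daily summary values themselves are computed
# upstream by the pipeline that produces `community_weighted_sentiment`):
#   weighted_sentiment = Σ(w_i · s_i) / Σ(w_i), with w_i derived from upvotes.
# The per-thread analysis further below uses w_i = 1 + log1p(max(score_i, 0)),
# down-weighted by gamma_post for the post body itself.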
# Add date range selector for the time series
date_range = st.date_input(
    "Select date range for time series",
    (min_date, max_date),
    min_value=min_date,
    max_value=max_date,
)
# st.date_input returns a 1-tuple while the user is still picking the second
# date, so fall back to a single-day range in that case
if len(date_range) == 2:
    start_date, end_date = date_range
else:
    start_date = end_date = date_range[0]
filtered_df = df[(df["date"].dt.date >= start_date) & (df["date"].dt.date <= end_date)]
# Add a dropdown (selectbox) for choosing a single subreddit to display
default_sub = "artificial" if "artificial" in subreddits else list(subreddits)[0]
selected_subreddit = st.selectbox(
    "Select subreddit",
    options=list(subreddits),
    index=list(subreddits).index(default_sub),
)
plot_df = filtered_df[filtered_df["subreddit"] == selected_subreddit]
# ── Determine shading band and dynamic y-axis domain ─────────────────────────
mean_val = std_val = None
if selected_subreddit in spike_params:
    mean_val = spike_params[selected_subreddit].get("mean")
    std_val = spike_params[selected_subreddit].get("std")

# Calculate band limits if parameters exist
band_low = band_high = None
if mean_val is not None and std_val is not None:
    band_low = mean_val - 3 * std_val
    band_high = mean_val + 3 * std_val

# Determine y-axis domain based on data and (optional) band
sent_min = plot_df["community_weighted_sentiment"].min()
sent_max = plot_df["community_weighted_sentiment"].max()
if band_low is not None:
    y_min = float(min(sent_min, band_low))
    y_max = float(max(sent_max, band_high))
else:
    y_min = float(sent_min)
    y_max = float(sent_max)

# Add small padding so points are not flush with edges
padding = 0.05
y_domain = [y_min - padding, y_max + padding]
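# NOTE: selection_single/add_selection are the Altair 4 names; Altair 5 keeps
# them as deprecated aliases of alt.selection_point(...) and .add_params(...).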
# Define hover selection for nearest point
nearest = alt.selection_single(
    name="nearest",
    on="mouseover",
    nearest=True,
    fields=["date"],
    empty="none",
)
# Base chart with refreshed y-axis range
base = alt.Chart(plot_df).encode(
    x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format, labelPadding=15)),
    y=alt.Y(
        "community_weighted_sentiment:Q",
        title="Community Weighted Sentiment",
        scale=alt.Scale(domain=y_domain),
    ),
)

# Use a constant blue colour for all plot elements
line_colour = "#1f77b4"

# Draw line for the selected subreddit
line = (
    base.transform_calculate(legend='"daily community sentiment score"')
    .mark_line(color=line_colour)
    .encode(
        color=alt.Color(
            "legend:N",
            scale=alt.Scale(
                domain=[
                    "daily community sentiment score",
                    "historical 3σ sentiment range",
                    "significant sentiment outlier",
                ],
                range=[line_colour, line_colour, "red"],
            ),
            legend=None,  # hide default legend; a custom manual legend is added below the chart
        )
    )
)
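# The transform_calculate above injects a constant "legend" field so that the
# line, band, and outlier layers all share one colour scale; the built-in
# legend is suppressed in favour of the manual legend rendered further down.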
# Invisible selectors to capture hover events
selectors = base.mark_point(opacity=0).add_selection(nearest)

# Draw highlighted points on hover
points_hover = base.mark_point(size=60, color=line_colour).encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)

# Tooltip rule and popup
tooltips = base.mark_rule(color="gray").encode(
    tooltip=[
        alt.Tooltip("subreddit:N", title="Subreddit"),
        alt.Tooltip("date:T", title="Date", format=time_format),
        alt.Tooltip("community_weighted_sentiment:Q", title="Sentiment", format=".2f"),
    ]
).transform_filter(nearest)
# Optional shaded band (mean ± 3σ)
band = None
outliers = None
domain_labels = [
    "daily community sentiment score",
    "historical 3σ sentiment range",
    "significant sentiment outlier",
]
domain_colors = [line_colour, line_colour, "red"]
if band_low is not None:
    band_df = pd.DataFrame({
        "date": [plot_df["date"].min(), plot_df["date"].max()],
        "low": [band_low, band_low],
        "high": [band_high, band_high],
    })
    band = (
        alt.Chart(band_df)
        .transform_calculate(legend='"historical 3σ sentiment range"')
        .mark_area(opacity=0.15)
        .encode(
            x="date:T",
            y=alt.Y("low:Q", scale=alt.Scale(domain=y_domain)),
            y2="high:Q",
            color=alt.Color(
                "legend:N",
                scale=alt.Scale(domain=domain_labels, range=domain_colors),
                legend=None,  # suppress built-in legend for band
            ),
        )
    )
    # Identify significant outliers outside the band
    outlier_df = plot_df[
        (plot_df["community_weighted_sentiment"] < band_low)
        | (plot_df["community_weighted_sentiment"] > band_high)
    ].copy()
    if not outlier_df.empty:
        outliers = (
            alt.Chart(outlier_df)
            .transform_calculate(legend='"significant sentiment outlier"')
            .mark_point(shape="circle", size=100, fill="white", stroke="red", strokeWidth=2)
            .encode(
                x="date:T",
                y="community_weighted_sentiment:Q",
                color=alt.Color(
                    "legend:N",
                    scale=alt.Scale(domain=domain_labels, range=domain_colors),
                    legend=None,  # suppress built-in legend for outliers
                ),
            )
        )
# Layer everything and make the chart interactive
layers = [line, selectors, points_hover, tooltips]
if band is not None:
    layers.insert(0, band)  # draw band behind the line
if outliers is not None:
    layers.append(outliers)
hover_chart = alt.layer(*layers).properties(
    height=400,  # taller plot area for readability
).interactive(bind_y=False)  # pan/zoom on x only; keep the fixed y-domain
# ── Manual legend (two rows) ─────────────────────────────────────────────────
legend_df = pd.DataFrame({
    "row": [0, 1],
    "label": ["significant sentiment outlier", "historical 3σ sentiment range"],
    "stroke": ["red", "lightblue"],  # outline colour
    "fill": ["white", "lightblue"],  # interior fill (blue only for the historical band)
    "shape": ["circle", "square"],
})
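# "lightblue" stands in for line_colour at the band's 0.15 opacity, so the
# swatch roughly matches how the band actually renders.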
legend_points = (
    alt.Chart(legend_df)
    .mark_point(size=100, filled=True)
    .encode(
        y=alt.Y("row:O", axis=None),
        x=alt.value(0),
        shape=alt.Shape("shape:N", legend=None),
        stroke=alt.Stroke("stroke:N", scale=None, legend=None),
        fill=alt.Fill("fill:N", scale=None, legend=None),
    )
)
legend_text = (
    alt.Chart(legend_df)
    .mark_text(align="left", baseline="middle", dx=15, color="black")
    .encode(
        y="row:O",
        x=alt.value(0),
        text="label:N",
    )
)
manual_legend = (
    legend_points + legend_text
).properties(height=50, width=170, background="white")
# Alternative (disabled): concatenate chart and manual legend vertically
# final_chart = alt.vconcat(
#     manual_legend,
#     hover_chart,
#     spacing=0,
# ).configure_view(strokeWidth=0)
st.altair_chart(manual_legend, use_container_width=False)
st.altair_chart(hover_chart, use_container_width=True)
# ── Bar chart of post counts by subreddit (side-by-side) ─────────────────────
st.subheader("Daily Post Counts by Subreddit")

# Create grouped bar chart for post counts by date and subreddit
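# NOTE: the `xOffset` encoding below requires Altair 5 / Vega-Lite 5; it has
# no direct equivalent in Altair 4.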
bar_chart = alt.Chart(df).mark_bar().encode(
    x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format)),
    y=alt.Y("count:Q", title="Post Count"),
    xOffset="subreddit:N",  # creates the side-by-side grouping
    color=alt.Color(
        "subreddit:N",
        scale=alt.Scale(domain=list(subreddits), range=list(subreddit_colors.values())),
        legend=alt.Legend(title="Subreddit"),
    ),
    tooltip=["date", "subreddit", "count"],
).properties(height=400).interactive()
st.altair_chart(bar_chart, use_container_width=True)
# ── Latest metrics for each subreddit ────────────────────────────────────────
st.subheader("Latest Metrics")

# Get the most recent row for each subreddit
latest_by_subreddit = df.sort_values("date").groupby("subreddit").last().reset_index()

# Display metrics in columns
cols = st.columns(len(latest_by_subreddit))
for i, (_, row) in enumerate(latest_by_subreddit.iterrows()):
    with cols[i]:
        st.markdown(f"**{row['subreddit']}**")
        st.metric("Community Weighted", f"{row['community_weighted_sentiment']:.2f}")
        st.metric("Posts", int(row["count"]))
# ── Analyze sentiment-driving posts ──────────────────────────────────────────
st.header("Analyze sentiment driving posts")
with st.form("analysis_form"):
    col1, col2 = st.columns(2)
    with col1:
        selected_subreddit = st.selectbox("Select Subreddit", options=subreddits)
    with col2:
        selected_date = st.date_input(
            "Select Date",
            value=max_date,
            min_value=min_date,
            max_value=max_date,
        )
    submit_button = st.form_submit_button("Analyze Posts")
if submit_button:
    date_str = selected_date.strftime("%Y-%m-%d")
    with st.spinner(f"Loading data for r/{selected_subreddit} on {date_str}..."):
        posts_df = load_day(date_str, selected_subreddit)
    if posts_df.empty:
        st.error(f"No posts found for r/{selected_subreddit} on {date_str}")
    else:
        # Separate posts and comments
        posts = posts_df[posts_df["type"] == "post"]
        comments = posts_df[posts_df["type"] == "comment"]

        # Overall summary metrics using engagement-adjusted sentiment (EAS)
        n_posts = len(posts)
        df_day = posts_df.copy()
        df_day["score_num"] = pd.to_numeric(df_day["score"], errors="coerce").fillna(0)
        weights_base_day = 1 + np.log1p(df_day["score_num"].clip(lower=0))
        gamma_post = 0.3  # a post body gets 30% of an equally-scored comment's weight
        weights_day = weights_base_day * np.where(df_day["type"] == "post", gamma_post, 1.0)
        total_weight_day = weights_day.sum()
        overall_eas = (weights_day * df_day["sentiment"]).sum() / total_weight_day if total_weight_day > 0 else 0
        # Map the weighted mean from [0, 1] to [-1, 1]
        overall_eas = 2 * overall_eas - 1
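        # NOTE: the 2*x - 1 mapping assumes the `sentiment` column returned by
        # load_day lies in [0, 1] (e.g. a positive-class probability).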
        overall_score = df_day["score"].sum()
        st.subheader(f"r/{selected_subreddit} on {date_str}")
        c1, c2, c3 = st.columns(3)
        c1.metric("Posts", n_posts)
        c2.metric("Daily Weighted Sentiment, All Posts", f"{overall_eas:.2f}")
        c3.metric("Total Score, All Posts", f"{overall_score:,}")
        # Wrap analysis and rendering of top posts in a spinner
        with st.spinner("Analyzing sentiment and rendering top posts..."):
            # Build per-post analysis
            analysis_rows = []
            for _, post in posts.iterrows():
                pid = post["post_id"]
                # Gather comments for this post
                post_comments = comments[comments["parent_id"] == f"t3_{pid}"]
                # Combine post and comments for calculations
                segment = pd.concat([pd.DataFrame([post]), post_comments], ignore_index=True)
                # Compute engagement-adjusted sentiment for this post thread
                segment_score_num = pd.to_numeric(segment["score"], errors="coerce").fillna(0)
                weights_base = 1 + np.log1p(segment_score_num.clip(lower=0))
                weights_seg = weights_base * np.where(segment["type"] == "post", gamma_post, 1.0)
                thread_weight_sum = weights_seg.sum()
                ws = (weights_seg * segment["sentiment"]).sum() / thread_weight_sum if thread_weight_sum > 0 else 0
                # Map the thread's weighted sentiment to [-1, 1]
                ws = 2 * ws - 1
                ts = segment["score"].sum()
                nc = len(post_comments)
                contrib_weight = thread_weight_sum / total_weight_day if total_weight_day > 0 else 0
                total_contribution = contrib_weight * ws
                analysis_rows.append({
                    "post_id": pid,
                    "Post Keywords": "",  # placeholder; computed for top posts only
                    "Weighted Sentiment of Thread": ws,
                    "Contribution Weight": contrib_weight,
                    "Total Sentiment Contribution": total_contribution,
                    "# Comments": nc,
                    "Total Score": ts,
                })
            analysis_df = pd.DataFrame(analysis_rows)
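            # Contribution weights across all threads sum to roughly 1 for the
            # day (exactly 1 when every comment's parent post appears in the
            # same day's data), so they read as percentage shares.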
            # Determine top 5 posts by contribution weight
            top5 = analysis_df.sort_values("Contribution Weight", ascending=False).head(5).copy()
            top5.reset_index(drop=True, inplace=True)
            # Compute keywords only for the top posts
            for idx, row in top5.iterrows():
                pid = row["post_id"]
                post_text = posts[posts["post_id"] == pid].iloc[0]["text"]
                kw = keywords_for_df(pd.DataFrame({"text": [post_text]}), top_n=2)
                keywords_list = [k for k, _ in kw][:2]
                top5.at[idx, "Post Keywords"] = ", ".join(keywords_list)
            # Format numeric columns for display
            top5["Weighted Sentiment of Thread"] = top5["Weighted Sentiment of Thread"].map("{:.2f}".format)
            top5["Total Score"] = top5["Total Score"].map("{:,}".format)
            top5["Contribution Weight"] = top5["Contribution Weight"].map("{:.2%}".format)
            top5["Total Sentiment Contribution"] = top5["Total Sentiment Contribution"].map("{:.4f}".format)
            st.subheader("Top 5 Posts by Contribution Weight")
            st.dataframe(
                top5[["Post Keywords", "Weighted Sentiment of Thread", "Contribution Weight", "Total Sentiment Contribution", "# Comments", "Total Score"]],
                use_container_width=True,
            )
st.subheader("Post Details") | |
for idx, row in top5.reset_index(drop=True).iterrows(): | |
pid = row["post_id"] | |
post_obj = posts[posts["post_id"] == pid].iloc[0] | |
post_text = post_obj["text"] | |
first_line = post_text.split("\n")[0][:50] | |
with st.expander(f"{idx} - {first_line}..."): | |
# Post Metrics | |
post_sent = post_obj["sentiment"] | |
# Normalize post sentiment to [-1,1] | |
post_sent_norm = 2 * post_sent - 1 | |
post_score = post_obj["score"] | |
ps = pd.to_numeric(post_score, errors="coerce") | |
post_score_num = ps if (not np.isnan(ps) and ps >= 0) else 0 | |
# Compute post weight | |
post_weight = (1 + np.log1p(post_score_num)) * gamma_post | |
st.markdown("**Post:**") | |
st.markdown(f"{post_text[:300]}{'...' if len(post_text) > 300 else ''}" | |
f"(Sentiment: {post_sent_norm:.2f}, Weight: {post_weight:.2f}, Score: {post_score:,})" | |
) | |
st.markdown("---") | |
                    # Display top 5 comments with metrics
                    top_comments = (
                        comments[comments["parent_id"] == f"t3_{pid}"]
                        .sort_values("score", ascending=False)
                        .head(5)
                    )
                    st.markdown("**Top Comments:**")
                    # enumerate gives a 1-based display rank; the raw DataFrame
                    # index would print arbitrary row labels here
                    for rank, (_, comment) in enumerate(top_comments.iterrows(), start=1):
                        c_text = comment["text"]
                        # Map comment sentiment to [-1, 1] and compute its weight
                        c_sent_norm = 2 * comment["sentiment"] - 1
                        c_score = comment["score"]
                        cs = pd.to_numeric(c_score, errors="coerce")
                        c_score_num = cs if (not np.isnan(cs) and cs >= 0) else 0
                        c_weight = 1 + np.log1p(c_score_num)
                        st.markdown(
                            f"{rank}. {c_text[:200]}{'...' if len(c_text) > 200 else ''} "
                            f"(Sentiment: {c_sent_norm:.2f}, Weight: {c_weight:.2f}, Score: {c_score:,})"
                        )
# Display the data source attribution
# st.markdown(last_update_caption, unsafe_allow_html=True)