import streamlit as st
import pandas as pd
import numpy as np
import altair as alt
import yaml
from pathlib import Path

# Call page config BEFORE importing modules that use Streamlit commands
st.set_page_config(page_title="Reddit Sentiment Trends", layout="wide")

# Import from local modules AFTER page config is set
from data_utils import (
    load_summary,
    load_day,
    get_subreddit_colors,
    get_last_updated_hf_caption,
)
from text_analysis import keywords_for_df

st.title("Reddit Sentiment Monitor")
st.markdown(
    """
**Welcome!** This page shows how Reddit's AI communities feel day-to-day.
A daily pipeline grabs new posts and comments, scores their tone with a
sentiment model, and saves the results to a public HuggingFace
[dataset](https://huggingface.co/datasets/hblim/top_reddit_posts_daily).
"""
)

# ── Load & transform data ────────────────────────────────────────────────────
df = load_summary()
last_update_caption = get_last_updated_hf_caption()

# Get colors for each subreddit
subreddits = df["subreddit"].unique()
subreddit_colors = get_subreddit_colors(subreddits)

# Load mean/std parameters for sentiment spike bands per subreddit
params_path = Path(__file__).resolve().parent.parent / "spike_params.yaml"
try:
    with params_path.open("r") as f:
        spike_params = yaml.safe_load(f) or {}  # safe_load returns None for an empty file
except FileNotFoundError:
    spike_params = {}

# Define time format to use across all charts
time_format = "%m/%d/%Y"

# Get date range from the dataset for the form
min_date = df["date"].min().date()
max_date = df["date"].max().date()

# ── Community weighted sentiment line chart for all subreddits ───────────────
st.subheader("Daily Community-Weighted Sentiment")
st.markdown(
    """
The line chart below plots the daily *community-weighted sentiment*, reflecting
the average sentiment across all posts/comments in a subreddit community.
To calculate the community-weighted sentiment:
- First, each post or comment is assigned a sentiment score of −1 (negative) or +1 (positive)
- Then, the sentiment score is weighted by its upvotes, so busier discussions matter more.
"""
)
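
# For reference, a minimal sketch of the community-weighted sentiment described
# above. Illustrative only: the dashboard reads the precomputed
# `community_weighted_sentiment` column from the summary dataset, the 0.5
# threshold for mapping model scores to ±1 labels is an assumption, and the
# log-damped weights mirror the scheme used in the analysis section below.
def _community_weighted_sentiment_sketch(day_df: pd.DataFrame) -> float:
    """Upvote-weighted mean of per-item ±1 sentiment labels (sketch)."""
    scores = pd.to_numeric(day_df["score"], errors="coerce").fillna(0)
    labels = np.where(day_df["sentiment"] >= 0.5, 1.0, -1.0)  # assumed label mapping
    weights = 1 + np.log1p(scores.clip(lower=0))  # log-damped upvote weights
    return float((weights * labels).sum() / weights.sum())
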
""" ) # Add date range selector for the time series date_range = st.date_input( "Select date range for time series", (min_date, max_date), min_value=min_date, max_value=max_date ) start_date, end_date = date_range filtered_df = df[(df["date"].dt.date >= start_date) & (df["date"].dt.date <= end_date)] # Add a dropdown (selectbox) for choosing a single subreddit to display default_sub = "artificial" if "artificial" in subreddits else list(subreddits)[0] selected_subreddit = st.selectbox( "Select subreddit", options=list(subreddits), index=list(subreddits).index(default_sub) ) plot_df = filtered_df[filtered_df["subreddit"] == selected_subreddit] # ── Determine shading band and dynamic y-axis domain ──────────────────────── mean_val = std_val = None if selected_subreddit in spike_params: mean_val = spike_params[selected_subreddit].get("mean") std_val = spike_params[selected_subreddit].get("std") # Calculate band limits if parameters exist band_low = band_high = None if mean_val is not None and std_val is not None: band_low = mean_val - 3 * std_val band_high = mean_val + 3 * std_val # Determine y-axis domain based on data and (optional) band sent_min = plot_df["community_weighted_sentiment"].min() sent_max = plot_df["community_weighted_sentiment"].max() if band_low is not None: y_min = float(min(sent_min, band_low)) y_max = float(max(sent_max, band_high)) else: y_min = float(sent_min) y_max = float(sent_max) # Add small padding so points are not flush with edges padding = 0.05 y_domain = [y_min - padding, y_max + padding] # Define hover selection for nearest point nearest = alt.selection_single( name="nearest", on="mouseover", nearest=True, fields=["date"], empty="none" ) # Base chart with refreshed y-axis range base = alt.Chart(plot_df).encode( x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format, labelPadding=15)), y=alt.Y( "community_weighted_sentiment:Q", title="Community Weighted Sentiment", scale=alt.Scale(domain=y_domain), ), ) # Use a constant blue colour for all plot elements line_colour = "#1f77b4" # Draw line for the selected subreddit line = ( base.transform_calculate(legend='"daily community sentiment score"') .mark_line(color=line_colour) .encode( color=alt.Color( "legend:N", scale=alt.Scale(domain=["daily community sentiment score", "historical 3σ sentiment range", "significant sentiment outlier"], range=[line_colour, line_colour, "red"]), legend=None # hide default legend; we will add a custom manual legend below the chart ) ) ) # Invisible selectors to capture hover events selectors = base.mark_point(opacity=0).add_selection(nearest) # Draw highlighted points on hover points_hover = base.mark_point(size=60, color=line_colour).encode( opacity=alt.condition(nearest, alt.value(1), alt.value(0)) ) # Tooltip rule and popup tooltips = base.mark_rule(color="gray").encode( tooltip=[ alt.Tooltip("subreddit:N", title="Subreddit"), alt.Tooltip("date:T", title="Date", format=time_format), alt.Tooltip("community_weighted_sentiment:Q", title="Sentiment", format=".2f") ] ).transform_filter(nearest) # Optional shaded band (mean ± 3σ) band = None outliers = None domain_labels = [ "daily community sentiment score", "historical 3σ sentiment range", "significant sentiment outlier", ] domain_colors = [line_colour, line_colour, "red"] if band_low is not None: band_df = pd.DataFrame({ "date": [plot_df["date"].min(), plot_df["date"].max()], "low": [band_low, band_low], "high": [band_high, band_high], }) band = ( alt.Chart(band_df) .transform_calculate(legend='"historical 3σ sentiment 
range"') .mark_area(opacity=0.15) .encode( x="date:T", y=alt.Y("low:Q", scale=alt.Scale(domain=y_domain)), y2="high:Q", color=alt.Color( "legend:N", scale=alt.Scale(domain=domain_labels, range=domain_colors), legend=None # suppress built-in legend for band ), ) ) # Identify significant outliers outside the band outlier_df = plot_df[(plot_df["community_weighted_sentiment"] < band_low) | (plot_df["community_weighted_sentiment"] > band_high)].copy() if not outlier_df.empty: outliers = ( alt.Chart(outlier_df) .transform_calculate(legend='"significant sentiment outlier"') .mark_point(shape="circle", size=100, fill="white", stroke="red", strokeWidth=2) .encode( x="date:T", y="community_weighted_sentiment:Q", color=alt.Color( "legend:N", scale=alt.Scale(domain=domain_labels, range=domain_colors), legend=None # suppress built-in legend for outlier ), ) ) # Layer everything and make interactive, with title showing subreddit layers = [line, selectors, points_hover, tooltips] if band is not None: layers.insert(0, band) # draw band behind the line if outliers is not None: layers.append(outliers) hover_chart = alt.layer(*layers).properties( height=400, # increased height for more spacious plot area ).interactive(bind_y=False) # ── Manual legend (two rows) ─────────────────────────────────────────────── legend_df = pd.DataFrame({ "row": [0, 1], "label": ["significant sentiment outlier", "historical 3σ sentiment range"], "stroke": ["red", "lightblue"], # outline colour "fill": ["white", "lightblue"], # interior fill (blue only for historical band) "shape": ["circle", "square"], }) legend_points = ( alt.Chart(legend_df) .mark_point(size=100, filled=True) .encode( y=alt.Y("row:O", axis=None), x=alt.value(0), shape=alt.Shape("shape:N", legend=None), stroke=alt.Stroke("stroke:N", scale=None, legend=None), fill=alt.Fill("fill:N", scale=None, legend=None), ) ) legend_text = ( alt.Chart(legend_df) .mark_text(align="left", baseline="middle", dx=15, color="black") .encode( y="row:O", x=alt.value(0), text="label:N", ) ) manual_legend = ( legend_points + legend_text ).properties(height=50, background="white",width=170) # # Concatenate chart and manual legend vertically # final_chart = alt.vconcat( # manual_legend, # hover_chart, # spacing=0 # ).configure_view(strokeWidth=0) st.altair_chart(manual_legend, use_container_width=False) st.altair_chart(hover_chart, use_container_width=True) # ── Bar chart for post counts by subreddit (side-by-side) ──────────────────── st.subheader("Daily Post Counts by Subreddit") # Create grouped bar chart for post counts by date and subreddit bar_chart = alt.Chart(df).mark_bar().encode( x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format)), y=alt.Y("count:Q", title="Post Count"), xOffset="subreddit:N", # This creates the side-by-side grouping color=alt.Color( "subreddit:N", scale=alt.Scale(domain=list(subreddits), range=list(subreddit_colors.values())), legend=alt.Legend(title="Subreddit") ), tooltip=["date", "subreddit", "count"] ).properties(height=400).interactive() st.altair_chart(bar_chart, use_container_width=True) # ── Latest metrics for each subreddit ───────────────────────────────────────── st.subheader("Latest Metrics") # Get the most recent data for each subreddit latest_by_subreddit = df.sort_values("date").groupby("subreddit").last().reset_index() # Display metrics in columns cols = st.columns(len(latest_by_subreddit)) for i, (_, row) in enumerate(latest_by_subreddit.iterrows()): with cols[i]: st.markdown(f"**{row['subreddit']}**") st.metric("Community 
Weighted", f"{row['community_weighted_sentiment']:.2f}") st.metric("Posts", int(row["count"])) # ── Analyze sentiment driving posts ───────────────────────────────────── st.header("Analyze sentiment driving posts") with st.form("analysis_form"): col1, col2 = st.columns(2) with col1: selected_subreddit = st.selectbox("Select Subreddit", options=subreddits) with col2: selected_date = st.date_input( "Select Date", value=max_date, min_value=min_date, max_value=max_date ) submit_button = st.form_submit_button("Analyze Posts") if submit_button: date_str = selected_date.strftime("%Y-%m-%d") with st.spinner(f"Loading data for r/{selected_subreddit} on {date_str}..."): posts_df = load_day(date_str, selected_subreddit) if posts_df.empty: st.error(f"No posts found for r/{selected_subreddit} on {date_str}") else: # Separate posts and comments posts = posts_df[posts_df["type"] == "post"] comments = posts_df[posts_df["type"] == "comment"] # Overall summary metrics using engagement-adjusted sentiment (EAS) n_posts = len(posts) df_day = posts_df.copy() df_day["score_num"] = pd.to_numeric(df_day["score"], errors="coerce").fillna(0) weights_base_day = 1 + np.log1p(df_day["score_num"].clip(lower=0)) gamma_post = 0.3 weights_day = weights_base_day * np.where(df_day["type"] == "post", gamma_post, 1.0) total_weight_day = weights_day.sum() overall_eas = (weights_day * df_day["sentiment"]).sum() / weights_day.sum() if weights_day.sum() > 0 else 0 # Normalize daily weighted sentiment to range [-1,1] overall_eas = 2 * overall_eas - 1 overall_score = df_day["score"].sum() st.subheader(f"r/{selected_subreddit} on {date_str}") c1, c2, c3 = st.columns(3) c1.metric("Posts", n_posts) c2.metric("Daily Weighted Sentiment, All Posts", f"{overall_eas:.2f}") c3.metric("Total Score, All Posts", f"{overall_score:,}") # Wrap analysis and rendering of top posts in a spinner with st.spinner("Analyzing sentiment and rendering top posts..."): # Build per-post analysis analysis_rows = [] for _, post in posts.iterrows(): pid = post["post_id"] text = post["text"] # Gather comments for this post post_comments = comments[comments["parent_id"] == f"t3_{pid}"] # Combine post and comments for calculations segment = pd.concat([pd.DataFrame([post]), post_comments], ignore_index=True) # Compute engagement-adjusted sentiment for this post thread segment_score_num = pd.to_numeric(segment["score"], errors="coerce").fillna(0) weights_base = 1 + np.log1p(segment_score_num.clip(lower=0)) gamma_post = 0.3 weights_seg = weights_base * np.where(segment["type"] == "post", gamma_post, 1.0) ws = (weights_seg * segment["sentiment"]).sum() / weights_seg.sum() if weights_seg.sum() > 0 else 0 # Normalize weighted sentiment of thread to range [-1,1] ws = 2 * ws - 1 ts = segment["score"].sum() nc = len(post_comments) thread_weight_sum = weights_seg.sum() contrib_weight = thread_weight_sum / total_weight_day if total_weight_day > 0 else 0 total_contribution = contrib_weight * ws analysis_rows.append({ "post_id": pid, "Post Keywords": "", # placeholder; will compute for top posts only "Weighted Sentiment of Thread": ws, "Contribution Weight": contrib_weight, "Total Sentiment Contribution": total_contribution, "# Comments": nc, "Total Score": ts }) analysis_df = pd.DataFrame(analysis_rows) # Determine top 5 posts by contribution weight top5 = analysis_df.sort_values("Contribution Weight", ascending=False).head(5).copy() top5.reset_index(drop=True, inplace=True) # Compute keywords only for top posts for idx, row in top5.iterrows(): pid = row["post_id"] post_text = 
posts[posts["post_id"] == pid].iloc[0]["text"] kw = keywords_for_df(pd.DataFrame({"text": [post_text]}), top_n=2) keywords_list = [k for k, _ in kw][:2] top5.at[idx, "Post Keywords"] = ", ".join(keywords_list) # Format numeric columns for df_part in (top5,): df_part["Weighted Sentiment of Thread"] = df_part["Weighted Sentiment of Thread"].map("{:.2f}".format) df_part["Total Score"] = df_part["Total Score"].map("{:,}".format) df_part["Contribution Weight"] = df_part["Contribution Weight"].map("{:.2%}".format) df_part["Total Sentiment Contribution"] = df_part["Total Sentiment Contribution"].map("{:.4f}".format) st.subheader("Top 5 Posts by Contribution Weight") st.dataframe( top5[["Post Keywords", "Weighted Sentiment of Thread", "Contribution Weight", "Total Sentiment Contribution", "# Comments", "Total Score"]], use_container_width=True ) st.subheader("Post Details") for idx, row in top5.reset_index(drop=True).iterrows(): pid = row["post_id"] post_obj = posts[posts["post_id"] == pid].iloc[0] post_text = post_obj["text"] first_line = post_text.split("\n")[0][:50] with st.expander(f"{idx} - {first_line}..."): # Post Metrics post_sent = post_obj["sentiment"] # Normalize post sentiment to [-1,1] post_sent_norm = 2 * post_sent - 1 post_score = post_obj["score"] ps = pd.to_numeric(post_score, errors="coerce") post_score_num = ps if (not np.isnan(ps) and ps >= 0) else 0 # Compute post weight post_weight = (1 + np.log1p(post_score_num)) * gamma_post st.markdown("**Post:**") st.markdown(f"{post_text[:300]}{'...' if len(post_text) > 300 else ''}" f"(Sentiment: {post_sent_norm:.2f}, Weight: {post_weight:.2f}, Score: {post_score:,})" ) st.markdown("---") # Display top 5 comments with metrics top_comments = ( comments[comments["parent_id"] == f"t3_{pid}"] .sort_values("score", ascending=False) .head(5) ) st.markdown("**Top Comments:**") for c_idx, comment in top_comments.iterrows(): c_text = comment["text"] # Normalize comment sentiment and compute weight c_sent_norm = 2 * comment["sentiment"] - 1 c_score = comment["score"] cs = pd.to_numeric(c_score, errors="coerce") c_score_num = cs if (not np.isnan(cs) and cs >= 0) else 0 c_weight = 1 + np.log1p(c_score_num) st.markdown( f"{c_idx}. {c_text[:200]}{'...' if len(c_text) > 200 else ''} " f"(Sentiment: {c_sent_norm:.2f}, Weight: {c_weight:.2f}, Score: {c_score:,})" ) # Display the data source attribution # st.markdown(last_update_caption, unsafe_allow_html=True)