"""Pure‑function helpers for daily aggregation.""" from __future__ import annotations import pandas as pd import numpy as np def summary_from_df(df: pd.DataFrame, gamma_post: float = 0.3) -> pd.DataFrame: """ Return a DataFrame with daily & subreddit aggregates. Expects columns: retrieved_at - UTC timestamp or ISO-date string subreddit - subreddit name sentiment - numeric score (e.g. −1 … 1) score - numeric weight / post score Output columns: date (datetime.date) subreddit (string) mean_sentiment community_weighted_sentiment count """ # Normalize retrieved_at to datetime and extract calendar day df = df.copy() df["date"] = pd.to_datetime(df["retrieved_at"]).dt.date # Group by date and subreddit grouped = df.groupby(["date", "subreddit"]) # Aggregate metrics result = grouped.agg( # First calculate raw mean_sentiment raw_mean_sentiment=("sentiment", "mean"), count=("sentiment", "count"), ).reset_index() # Apply transformation to raw_mean_sentiment to get values in range [-1, 1] instead of [0, 1] result["mean_sentiment"] = 2 * result["raw_mean_sentiment"] - 1 # Remove the raw mean column result = result.drop(columns="raw_mean_sentiment") # Calculate engagement-adjusted sentiment (EAS) for each group # 1. Ensure 'score' is numeric df["score_num"] = pd.to_numeric(df["score"], errors="coerce").fillna(0) # 2. Compute base weights (1 + log1p(score)) weights_base = 1 + np.log1p(df["score_num"].clip(lower=0)) # 3. Apply post weight multiplier weights = weights_base * np.where(df.get("type", None) == "post", gamma_post, 1.0) df["weight"] = weights # 4. Compute EAS per group: weighted average of sentiment community_weighted_sentiments = [] for (date, subreddit), group in grouped: w = group["weight"] s = group["sentiment"] eas = (w * s).sum() / w.sum() if w.sum() > 0 else 0 community_weighted_sentiments.append(eas) result["community_weighted_sentiment"] = community_weighted_sentiments # Normalize community_weighted_sentiment to range [-1,1] result["community_weighted_sentiment"] = 2 * result["community_weighted_sentiment"] - 1 # Ensure consistent column order result = result[["date", "subreddit", "mean_sentiment", "community_weighted_sentiment", "count"]] return result