"""Pure‑function helpers for daily aggregation."""

from __future__ import annotations
import pandas as pd
import numpy as np


def summary_from_df(df: pd.DataFrame, gamma_post: float = 0.3) -> pd.DataFrame:
    """
    Return a DataFrame with daily & subreddit aggregates.

    Expects columns:
        retrieved_at  - UTC timestamp or ISO-date string
        subreddit     - subreddit name
        sentiment     - numeric score (e.g. −1 … 1)
        score         - numeric weight / post score

    Output columns:
        date               (datetime.date)
        subreddit          (string)
        mean_sentiment
        community_weighted_sentiment
        count
    """
    # Normalize retrieved_at to datetime and extract the calendar day.
    df = df.copy()
    df["date"] = pd.to_datetime(df["retrieved_at"]).dt.date

    # Compute engagement weights *before* grouping, so the grouped frame
    # already carries them (mutating df after creating a GroupBy is fragile).
    # 1. Coerce 'score' to numeric; unparseable values count as 0.
    df["score_num"] = pd.to_numeric(df["score"], errors="coerce").fillna(0)
    # 2. Base weight grows logarithmically with score: 1 + log(1 + score).
    df["weight"] = 1 + np.log1p(df["score_num"].clip(lower=0))
    # 3. Re-weight posts (vs. comments) by gamma_post when a 'type' column exists.
    if "type" in df.columns:
        df["weight"] *= np.where(df["type"] == "post", gamma_post, 1.0)

    # Pre-multiply weight * sentiment so the weighted mean reduces to a ratio
    # of two plain sums, computable in a single groupby/agg pass instead of a
    # Python-level loop over groups.
    df["weighted_sentiment"] = df["weight"] * df["sentiment"]

    result = (
        df.groupby(["date", "subreddit"])
        .agg(
            raw_mean_sentiment=("sentiment", "mean"),
            count=("sentiment", "count"),
            weight_sum=("weight", "sum"),
            weighted_sum=("weighted_sentiment", "sum"),
        )
        .reset_index()
    )

    # Rescale the plain mean from [0, 1] to [-1, 1].
    result["mean_sentiment"] = 2 * result["raw_mean_sentiment"] - 1

    # Engagement-weighted mean, rescaled the same way. Weights are strictly
    # positive for any gamma_post > 0, so the denominator is never zero for a
    # non-empty group.
    result["community_weighted_sentiment"] = (
        2 * result["weighted_sum"] / result["weight_sum"] - 1
    )

    # Keep only the documented output columns, in a stable order.
    return result[
        ["date", "subreddit", "mean_sentiment", "community_weighted_sentiment", "count"]
    ]
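

# Minimal usage sketch with made-up sample rows (hypothetical data, for
# illustration only): builds a tiny frame with the expected columns and
# prints the per-day, per-subreddit aggregates.
if __name__ == "__main__":
    sample = pd.DataFrame(
        {
            "retrieved_at": [
                "2024-01-01T08:00:00Z",
                "2024-01-01T09:30:00Z",
                "2024-01-02T10:00:00Z",
            ],
            "subreddit": ["python", "python", "python"],
            "sentiment": [0.9, 0.4, 0.6],  # raw scores in [0, 1]
            "score": [10, 0, 5],
            "type": ["comment", "post", "comment"],
        }
    )
    print(summary_from_df(sample))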