Spaces:

hblim
/

reddit_sentiment_tracker

Running

App Files Files Community

reddit_sentiment_tracker / reddit_analysis /summarizer /aggregator.py

hblim

Clean codebase for HF Space (drop Prometheus binary data)

a6576f0 3 months ago

raw

history blame contribute delete

2.55 kB

	"""Pure‑function helpers for daily aggregation."""

	from __future__ import annotations
	import pandas as pd
	import numpy as np


	def summary_from_df(df: pd.DataFrame, gamma_post: float = 0.3) -> pd.DataFrame:
	    """Aggregate per-item sentiment into one row per (date, subreddit).

	    Expects columns:
	        retrieved_at - timestamp or ISO-date string (parsed with ``pd.to_datetime``)
	        subreddit    - subreddit name
	        sentiment    - numeric score; the ``2 * x - 1`` rescaling below maps
	                       [0, 1] onto [-1, 1], so inputs are presumably in [0, 1]
	        score        - numeric weight / post score (non-numeric values count as 0,
	                       negative values are clipped to 0)
	        type         - optional; rows equal to ``"post"`` get their weight
	                       multiplied by ``gamma_post``

	    Args:
	        df: Input rows. The frame is copied; the caller's object is not modified.
	        gamma_post: Weight multiplier applied to rows whose ``type`` is ``"post"``.

	    Returns:
	        DataFrame with columns, in this order:
	            date (datetime.date)
	            subreddit
	            mean_sentiment               - unweighted mean, rescaled by 2 * x - 1
	            community_weighted_sentiment - weighted mean with weights
	                                           (1 + log1p(score)) * type multiplier,
	                                           rescaled by 2 * x - 1
	            count                        - number of non-null sentiment values

	    Raises:
	        KeyError: if ``retrieved_at``, ``subreddit``, ``sentiment`` or ``score``
	            is missing from ``df``.
	    """
	    # Work on a copy so the helper columns added below never leak to the caller.
	    df = df.copy()
	    df["date"] = pd.to_datetime(df["retrieved_at"]).dt.date

	    # Engagement weight: 1 + log1p(score), with unusable scores treated as 0.
	    score_num = pd.to_numeric(df["score"], errors="coerce").fillna(0)
	    weight = 1 + np.log1p(score_num.clip(lower=0))
	    if "type" in df.columns:
	        weight = weight * np.where(df["type"] == "post", gamma_post, 1.0)

	    # Rows without a sentiment value must not contribute to the weighted mean:
	    # zero out both their numerator term and their weight, otherwise the
	    # denominator would count weight for values that are absent from the sum.
	    has_sentiment = df["sentiment"].notna()
	    df["_weight"] = weight.where(has_sentiment, 0.0)
	    df["_weighted_sentiment"] = (weight * df["sentiment"]).where(has_sentiment, 0.0)

	    # Single aggregation pass: every metric comes from the same groupby, so the
	    # rows are aligned by group key rather than by iteration order.
	    result = (
	        df.groupby(["date", "subreddit"])
	        .agg(
	            raw_mean_sentiment=("sentiment", "mean"),
	            count=("sentiment", "count"),
	            weight_total=("_weight", "sum"),
	            weighted_total=("_weighted_sentiment", "sum"),
	        )
	        .reset_index()
	    )

	    # Weighted mean per group. When the total weight is not positive the raw
	    # value falls back to 0, which becomes -1.0 after rescaling.
	    # NOTE(review): that fallback reads as "maximally negative" rather than
	    # "no data"; kept as-is for backward compatibility - confirm it is intended.
	    weight_total = result["weight_total"]
	    has_weight = weight_total > 0
	    safe_total = weight_total.where(has_weight, 1.0)
	    raw_weighted = (result["weighted_total"] / safe_total).where(has_weight, 0.0)

	    # Rescale both metrics with 2 * x - 1.
	    result["mean_sentiment"] = 2 * result["raw_mean_sentiment"] - 1
	    result["community_weighted_sentiment"] = 2 * raw_weighted - 1

	    # Selecting the public columns also drops the intermediate ones.
	    return result[
	        ["date", "subreddit", "mean_sentiment", "community_weighted_sentiment", "count"]
	    ]