# Analyze sentiment driving posts

In [1]:
import os
import glob
import datetime
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
import pyarrow

from huggingface_hub import HfApi

In [2]:
"""
Download a single subreddit-day Parquet file from
`hblim/top_reddit_posts_daily/data_scored_subreddit/`.

Prereqs
-------
pip install huggingface_hub pandas pyarrow
huggingface-cli login  # or set HF_TOKEN
"""

from __future__ import annotations

import re
from pathlib import Path
from typing import Optional

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download


def _sanitize(sub: str) -> str:
    """
    Apply the same cleaning rule that was used when the shards were created
    (lowercase + replace any char that isn't 0-9, a-z, _, -, . with '_').
    """
    return re.sub(r"[^\w\-.]", "_", sub.strip().lower())


def download_subreddit_day(
    date_str: str,              # "YYYY-MM-DD"
    subreddit: str,             # e.g. "MachineLearning"
    repo_id: str = "hblim/top_reddit_posts_daily",
    data_folder: str = "data_scored_subreddit",
    out_dir: str | Path = "downloads",
    token: Optional[str] = None,
) -> Path:
    """
    Returns the local path of the downloaded Parquet file.

    Raises FileNotFoundError if the shard isn't on the Hub.
    """
    api = HfApi(token=token)
    safe_sub = _sanitize(subreddit)

    # remote path is exactly how the splitter wrote it: YYYY-MM-DD__sub.parquet
    filename_in_repo = f"{data_folder}/{date_str}__{safe_sub}.parquet"

    # sanity check: make sure the file exists in the repo
    if filename_in_repo not in api.list_repo_files(repo_id, repo_type="dataset"):
        raise FileNotFoundError(
            f"No shard named '{filename_in_repo}' in {repo_id}. "
            "Maybe the date or subreddit is wrong?"
        )

    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename_in_repo,
        repo_type="dataset",
        cache_dir=str(Path(out_dir).expanduser()),
    )
    print(f"✅ Downloaded to: {local_path}")
    return Path(local_path)

In [84]:
subreddit = 'localllama'
date = '2025-06-06'
path = download_subreddit_day(
        date_str=date,
        subreddit=subreddit)
df = pd.read_parquet(path)

2025-06-06__localllama.parquet:   0%|          | 0.00/69.2k [00:00<?, ?B/s]

✅ Downloaded to: downloads\datasets--hblim--top_reddit_posts_daily\snapshots\5fc94d45ca6e670268f2e505350bbc08ec7d5d84\data_scored_subreddit\2025-06-06__localllama.parquet


In [85]:
import pandas as pd
import numpy as np

# Assume 'df' is already loaded in the notebook, e.g.:
# df = pd.read_csv("my_reddit_day.csv")

def compute_metrics_for_df(df, gamma_post=0.3):
    # 1. Ensure 'score' is numeric
    df['score_num'] = pd.to_numeric(df['score'], errors='coerce').fillna(0)

    # 2. Compute weights: log-scaled by score, with a lower multiplier for posts
    weights = (1 + np.log1p(df['score_num'].clip(lower=0)))
    weights *= np.where(df['type'] == 'post', gamma_post, 1.0)
    df['weight'] = weights

    # 3. Compute a thread_id for each row
    def thread_id(row):
        if row['type'] == 'post':
            return str(row['post_id'])
        pid = row['parent_id']
        if isinstance(pid, str) and pid.startswith('t3_'):
            return pid[3:]
        return str(pid)

    df['thread_id'] = df.apply(thread_id, axis=1)

    # 4. Overall daily weighted sentiment (EAS)
    day_eas = (df['weight'] * df['sentiment']).sum() / df['weight'].sum()

    # 5. Per-thread metrics
    grouped = df.groupby('thread_id')
    thread_metrics = grouped.apply(lambda group: pd.Series({
        'eas': (group['weight'] * group['sentiment']).sum() / group['weight'].sum(),
        'tot_weight': group['weight'].sum(),
        'title': (
            group.loc[group['type'] == 'post', 'text']
                 .iloc[0]
            if (group['type'] == 'post').any()
            else ''
        )
    })).reset_index()

    # 6. Contribution: how much each thread shifts the day sentiment from 0.5
    thread_metrics['contrib'] = thread_metrics['tot_weight'] * (thread_metrics['eas'] - 0.5)

    return day_eas, thread_metrics

# === Example usage on your preloaded DataFrame ===
day_eas_value, thread_df = compute_metrics_for_df(df)

# 7. Show the overall daily sentiment
daily_summary = pd.DataFrame([{
    'weighted_sentiment (EAS)': round(day_eas_value, 3)
}])
daily_summary

thread_top_pos = thread_df.sort_values('contrib', ascending=False).head(5).copy()
thread_top_neg = thread_df.sort_values('contrib').head(5).copy()


  thread_metrics = grouped.apply(lambda group: pd.Series({


In [86]:
# === Example usage on your preloaded DataFrame ===
day_eas_value, thread_df = compute_metrics_for_df(df)

# 7. Show the overall daily sentiment
daily_summary = pd.DataFrame([{
    'weighted_sentiment (EAS)': round(day_eas_value, 4)
}])
daily_summary

  thread_metrics = grouped.apply(lambda group: pd.Series({


Unnamed: 0,weighted_sentiment (EAS)
0,0.3186


In [87]:
# 8. Extract top 5 positive-contribution threads and top 5 negative-contribution threads
thread_top_pos = thread_df.sort_values('contrib', ascending=False).head(5).copy()
thread_top_neg = thread_df.sort_values('contrib').head(5).copy()

# (Optionally) truncate titles for display
# thread_top_pos['title'] = thread_top_pos['title'].str.slice(0, 90)
# thread_top_neg['title'] = thread_top_neg['title'].str.slice(0, 90)

In [89]:
thread_top_pos[['title', 'eas', 'tot_weight']]

Unnamed: 0,title,eas,tot_weight
32,"Is this the largest ""No synthetic data"" open weight LLM? (142B)\n\nFrom the GitHub page of https://huggingface.co/rednote-hilab/dots.llm1.base",0.579431,28.660264
14,Tokasaurus: An LLM Inference Engine for High-Throughput Workloads\n\n,0.740024,8.072325
21,Real-time conversation with a character on your local machine\n\nAnd also the voice split function\n\nSorry for my English =),0.551828,30.763515
37,"Offline verbal chat bot with modular tool calling!\n\nThis is an update from my original [post](https://www.reddit.com/r/LocalLLaMA/comments/1l2vrg2/fully_offline_verbal_chat_bot/) where I demoed my fully offline verbal chat bot. I've made a couple updates, and should be releasing it on github soon. \n\- Clipboard insertion: allows you to insert your clipboard to the prompt with just a key press \n\- Modular tool calling: allows the model to use tools that can be drag and dropped into a folder\n\nTo clarify how tool calling works: Behind the scenes the program parses the json headers of all files in the tools folder at startup, and then passes them along with the users message. This means you can simply drag and drop a tool, restart the app, and use it.\n\nPlease leave suggestions and ask any questions you might have!",0.764096,4.431766
31,"I thought Qwen3 was putting out some questionable content into my code...\n\nOh. \*\*SOLVED.\*\* See why, I think, at the end.\n\nOkay, so I was trying \`aider\`. Only tried a bit here and there, but I just switched to using \`Qwen\_Qwen3-14B-Q6\_K\_L.gguf\`. And I see this in my aider output:\n\n\`\`\`text \n\## Signoff: insurgent (razzin' frazzin' motherfu... stupid directx...) \n\`\`\` \nNow, please bear in mind, this is script that plots timestamps, like \`ls | plottimes\` and, aside from plotting time data as a \`heatmap\`, it has no special war or battle terminology, nor profane language in it. I am not familiar with this thing to know where or how that was generated, since it SEEMS to be from a trial run aider did of the code:\n\nhttps://preview.redd.it/zamjz1bdsb5f1.jpg?width=719&format=pjpg&auto=webp&s=5ca874f91bdd6fe7fc20f4eb797e5ddc22500dec\n\nBut, that seems to be the code running -- not LLM output directly.\n\nOdd!\n\n...scrolling back to see what's up there:\n\n...",0.719805,4.278161


In [90]:
thread_top_neg[['title', 'eas', 'tot_weight']]

Unnamed: 0,title,eas,tot_weight
23,"Cannot even run the smallest model on system RAM?\n\nI am a bit confused. I am trying to run small LLMs on my Unraid server within the Ollama docker, using just the CPU and 16GB of system RAM.\n\nGot Ollama up and running, but even when pulling the smallest models like Qwen 3 0.6B with Q4\_K\_M quantization, Ollama tells me I need way more RAM than I have left to spare. Why is that? Should this model not be running on any potato? Does this have to do with context overhead?\n\n \nSorry if this is a stupid question, I am trying to learn more about this and cannot find the solution anywhere else.",0.0,23.823146
36,"what's the case against flash attention?\n\nI accidently stumbled upon the -fa (flash attention) flag in llama.cpp's llama-server. I cannot speak to the speedup in performence as i haven't properly tested it, but the memory optimization is huge: 8B-F16-gguf model with 100k fit comfortably in 32GB vram gpu with some 2-3 GB to spare.\n\nA very brief search revealed that flash attention theoretically computes the same mathematical function, and in practice benchmarks show no change in the model's output quality.\n\nSo my question is, is flash attention really just free lunch? what's the catch? why is it not enabled by default?",0.0,22.075726
17,"It is possble to run non-reasoning deepseek-r1-0528?\n\nI know, stupid question, but couldn't find an answer to it!",0.0,17.520515
13,"Can a model be so radically altered that its origin can no longer be recognized? YES!\n\n**Phi-lthy4**( [https://huggingface.co/SicariusSicariiStuff/Phi-lthy4](https://huggingface.co/SicariusSicariiStuff/Phi-lthy4) ) has been consistently described as **exceptionally unique** by all who have tested it, **almost devoid of SLOP**, and it is now widely regarded as the **most unique roleplay model available**. It underwent an intensive continued pretraining (CPT) phase, extensive supervised fine-tuning (SFT) on high-quality organic datasets, and leveraged advanced techniques including model merging, parameter pruning, and upscaling.\n\nInterestingly, this distinctiveness was validated in a recent paper: [*Gradient-Based Model Fingerprinting for LLM Similarity Detection and Family Classification*](https://arxiv.org/html/2506.01631v1). Among a wide array of models tested, this one stood out as **unclassifiable** by traditional architecture-based fingerprinting—highlighting the extent of ...",0.211321,27.502412
12,China's Rednote Open-source dots.llm performance & cost\n\n\nhttps://github.com/rednote-hilab/dots.llm1/blob/main/dots1_tech_report.pdf,0.0,15.465402
