# Loading Reddit Data

This notebook has functions to read in parquet data from the Hugging Face dataset [hblim/top_reddit_posts_daily](https://huggingface.co/datasets/hblim/top_reddit_posts_daily).

I created this notebook to help me fix an issue where I overwrote `data_raw/2025-04-20.parquet` with 2025-04-23 data during testing.
- I went to the Hugging Face version history to see when the 2025-04-20 file was erroneously overwritten, and saw newer commits on 2025-04-23.
- I cloned the Hugging Face dataset repo locally and checked out the last correct version of the corrupted 2025-04-20.parquet file.
- I verified that the data looked good (e.g. retrieved date > created date) and was not duplicated anywhere else, and then pushed the correct version back to the main remote.

In [97]:
import os
import glob
import datetime
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
import pyarrow

from huggingface_hub import HfApi

Download all historical data

In [167]:
def load_reddit_data(source, folder='data_raw', repo_id="hblim/top_reddit_posts_daily"):
    """
    Load Reddit daily posts data either from Hugging Face Hub or local files.

    Args:
        source (str): "hub" to load from Hugging Face, "local" to load from
            ``<cwd>/top_reddit_posts_daily/<folder>``.
        folder (str): Folder that holds the daily parquet shards
            (e.g. "data_raw" or "data_scored").
        repo_id (str): Hugging Face dataset repo id, used when ``source`` is "hub".

    Returns:
        pd.DataFrame: Combined DataFrame of all posts, with an extra
            ``filedate`` column holding each shard's file name without
            its extension.

    Raises:
        ValueError: If ``source`` is not "hub" or "local", or if no parquet
            files are found for the requested folder.
    """
    if source == "hub":
        api = HfApi()
        all_files = api.list_repo_files(repo_id, repo_type="dataset")
        # Match on "<folder>/" so that sibling folders sharing the same prefix
        # (e.g. "data_raw_something/") are not picked up by accident.
        prefix = folder.rstrip("/") + "/"
        shards = sorted(f for f in all_files if f.startswith(prefix) and f.endswith(".parquet"))
        local_paths = [
            api.hf_hub_download(repo_id=repo_id, filename=shard, repo_type="dataset")
            for shard in shards
        ]

    elif source == "local":
        local_folder = os.path.join(os.getcwd(), "top_reddit_posts_daily", folder)
        local_paths = sorted(glob.glob(os.path.join(local_folder, "*.parquet")))

    else:
        raise ValueError(f"Unknown source: {source}. Use 'hub' or 'local'.")

    # pd.concat([]) fails with an opaque "No objects to concatenate"; say what is missing instead.
    if not local_paths:
        raise ValueError(f"No parquet files found for folder '{folder}' (source='{source}').")

    frames = []
    for local_path in local_paths:
        # The file name (without extension) is recorded as the shard's date.
        file_date = os.path.splitext(os.path.basename(local_path))[0]
        frames.append(pd.read_parquet(local_path).assign(filedate=file_date))

    df = pd.concat(frames, ignore_index=True)
    print(f"Total records across {df.filedate.nunique()} days: {len(df)}")
    return df

### Check if raw and raw-deduplicated data line up
If the raw data is duplicated anywhere, the de-duplicated data should have fewer posts than the raw data.

In [164]:
# Load every daily shard from the Hub, using the loader's default folder.
df = load_reddit_data(source="hub")

Total records across 13 days: 3666


In [158]:
# Number of non-null `subreddit` values per shard date.
df.groupby('filedate')['subreddit'].count()

filedate
2025-04-14    312
2025-04-15    258
2025-04-16    330
2025-04-17    324
2025-04-18    255
2025-04-19    232
2025-04-20    251
2025-04-21    233
2025-04-22    268
2025-04-23    331
2025-04-24    332
2025-04-25    309
2025-04-26    231
Name: subreddit, dtype: int64

In [150]:
# Parse timestamps so that sorting below is chronological, not lexicographic.
df["retrieved_at"] = pd.to_datetime(df["retrieved_at"], utc=True)

# Step 1: Find duplicate post_ids (keep=False marks every copy, not just the later ones)
duplicates = df[df.duplicated(subset=["post_id"], keep=False)]

# Report duplicates and their retrieved_at dates
duplicate_report = duplicates[["post_id", "retrieved_at"]]

# Step 2: Sort by retrieved_at FIRST, then de-duplicate keeping the first
# (i.e. earliest retrieved) occurrence of each post_id. The stable sort keeps
# the original row order for rows that share the same timestamp.
df_deduplicated = (
    df.sort_values(by="retrieved_at", kind="stable")
      .drop_duplicates(subset=["post_id"], keep="first")
      .reset_index(drop=True)
)

print(f"Total records across {df_deduplicated.filedate.nunique()} days: {len(df_deduplicated)}")

Total records across 13 days: 3666


In [153]:
# Posts per shard date before and after de-duplication.
summary1 = df.groupby('filedate').subreddit.count()
# reindex (rather than .loc) so a date whose posts were ALL dropped as
# duplicates shows up as 0 / not matching instead of raising a KeyError.
summary2 = df_deduplicated.groupby('filedate').subreddit.count().reindex(summary1.index, fill_value=0)

comparison = pd.DataFrame({
    'original': summary1,
    'deduplicated': summary2
})

# True where de-duplication removed nothing for that date.
comparison['matching'] = comparison['original'] == comparison['deduplicated']
comparison

Unnamed: 0_level_0,original,deduplicated,matching
filedate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-04-14,312,312,True
2025-04-15,258,258,True
2025-04-16,330,330,True
2025-04-17,324,324,True
2025-04-18,255,255,True
2025-04-19,232,232,True
2025-04-20,251,251,True
2025-04-21,233,233,True
2025-04-22,268,268,True
2025-04-23,331,331,True


### Check if raw and summary data line up

In [154]:
# No earlier cell defines `api` or `repo_id` at notebook level, so define them
# here to keep this cell runnable after Restart Kernel -> Run All.
repo_id = "hblim/top_reddit_posts_daily"
api = HfApi()

# Download the daily summary CSV from the dataset repo and read it.
df_summary = pd.read_csv(
    api.hf_hub_download(repo_id=repo_id, filename='subreddit_daily_summary.csv', repo_type="dataset")
)

In [155]:
# Daily totals from the summary CSV, and row counts in `df` for those same dates
summary1 = df_summary.groupby('date')['count'].sum()
summary2 = df.groupby('filedate')['subreddit'].count().loc[summary1.index]

# Put the two series side by side, one row per date in the CSV summary
merged = pd.DataFrame(dict(csv_counts=summary1, parquet_counts=summary2))

merged

Unnamed: 0_level_0,csv_counts,parquet_counts
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-04-14,312,312
2025-04-15,258,258
2025-04-16,330,330
2025-04-17,324,324
2025-04-18,255,255
2025-04-19,232,232
2025-04-20,251,251
2025-04-21,233,233
2025-04-22,234,268
2025-04-23,309,331


### Check if raw and scored data line up

In [168]:
# Raw posts per day. This must come from `data_raw`; loading `data_scored`
# here would compare the scored data with itself and always match.
df = load_reddit_data("hub", folder='data_raw')

summary1 = df.groupby('filedate').subreddit.count()

# Scored posts per day, aligned to the raw dates. reindex (rather than .loc)
# makes a date missing from the scored data show up as 0 / not matching
# instead of raising a KeyError.
df_scored = load_reddit_data("hub", folder='data_scored')
summary2 = df_scored.groupby('filedate').subreddit.count().reindex(summary1.index, fill_value=0)

comparison = pd.DataFrame({
    'raw': summary1,
    'scored': summary2
})

# True where the scored shard has as many rows as the raw shard.
comparison['matching'] = comparison['raw'] == comparison['scored']
comparison

2025-04-22.parquet:   0%|          | 0.00/65.9k [00:00<?, ?B/s]

Total records across 13 days: 3666
Total records across 13 days: 3666


Unnamed: 0_level_0,raw,scored,matching
filedate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-04-14,312,312,True
2025-04-15,258,258,True
2025-04-16,330,330,True
2025-04-17,324,324,True
2025-04-18,255,255,True
2025-04-19,232,232,True
2025-04-20,251,251,True
2025-04-21,233,233,True
2025-04-22,268,268,True
2025-04-23,331,331,True
