{
"cells": [
{
"cell_type": "markdown",
"id": "dd911245-61d7-49db-9960-570f7feefe2b",
"metadata": {},
"source": [
"# Loading Reddit Data\n",
"\n",
"This notebook has functions to read in parquet data from Hugging Face dataset [hblim/top_reddit_posts_daily](https://huggingface.co/datasets/hblim/top_reddit_posts_daily)\n",
"\n",
"I created this notebook to help me fix a issue where I overwrote `data_raw/2025-04-20.parquet` with 2025-04-23 data during testing\n",
"- I went to Hugging Face version history to see when the 2025-04-20 file was overwritten erroneously, and saw newer commits on 2025-04-23)\n",
"- I cloned the Hugging Face dataset repo locally and checked out last correct version for the corrupted 2025-04-20.parquet file\n",
"- Verified that the data looked good (e.g. retrieved date > created date), and not duplicated anywhere else, and then pushed the correct version back to the main remote"
]
},
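{
"cell_type": "markdown",
"id": "2b7e4f61-8c3a-4d95-b1e0-6f2a9c8d4e13",
"metadata": {},
"source": [
"The same recovery can also be done entirely through the `huggingface_hub` API, without a local clone: list the repo's commits, download the file pinned to the last good revision, and upload it back. The cell below is an illustrative sketch only; `good_revision` is a placeholder, not the actual commit hash used."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d4c2a8e-7b1f-4e36-a5c9-0e8f6b3d2a71",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch of the recovery via the Hub API (not run as part of this notebook)\n",
"from huggingface_hub import HfApi\n",
"\n",
"api = HfApi()\n",
"repo_id = \"hblim/top_reddit_posts_daily\"\n",
"\n",
"# Inspect the commit history to find the last commit before the bad overwrite\n",
"for commit in api.list_repo_commits(repo_id, repo_type=\"dataset\"):\n",
"    print(commit.commit_id[:8], commit.created_at, commit.title)\n",
"\n",
"good_revision = \"0123abcd\"  # placeholder: last good commit before the overwrite\n",
"\n",
"# Download the file as it existed at that revision...\n",
"restored_path = api.hf_hub_download(\n",
"    repo_id=repo_id,\n",
"    filename=\"data_raw/2025-04-20.parquet\",\n",
"    repo_type=\"dataset\",\n",
"    revision=good_revision,\n",
")\n",
"\n",
"# ...then push it back to main\n",
"api.upload_file(\n",
"    path_or_fileobj=restored_path,\n",
"    path_in_repo=\"data_raw/2025-04-20.parquet\",\n",
"    repo_id=repo_id,\n",
"    repo_type=\"dataset\",\n",
"    commit_message=\"Restore 2025-04-20 shard overwritten during testing\",\n",
")"
]
},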
{
"cell_type": "code",
"execution_count": 97,
"id": "d5071073-274b-480d-8503-28d2292422d3",
"metadata": {
"execution": {
"iopub.execute_input": "2025-04-26T20:34:47.775286Z",
"iopub.status.busy": "2025-04-26T20:34:47.775286Z",
"iopub.status.idle": "2025-04-26T20:34:47.779253Z",
"shell.execute_reply": "2025-04-26T20:34:47.779253Z",
"shell.execute_reply.started": "2025-04-26T20:34:47.775286Z"
}
},
"outputs": [],
"source": [
"import os\n",
"import glob\n",
"import datetime\n",
"from pathlib import Path\n",
"from dotenv import load_dotenv\n",
"import pandas as pd\n",
"import pyarrow\n",
"\n",
"from huggingface_hub import HfApi"
]
},
{
"cell_type": "markdown",
"id": "9085f661-ba37-4715-b87b-3699cb75db2f",
"metadata": {},
"source": [
"Download all historical data"
]
},
{
"cell_type": "code",
"execution_count": 167,
"id": "0c14356b-721c-4048-8efb-f69d8eae4900",
"metadata": {
"execution": {
"iopub.execute_input": "2025-04-26T21:37:43.533282Z",
"iopub.status.busy": "2025-04-26T21:37:43.533282Z",
"iopub.status.idle": "2025-04-26T21:37:43.539908Z",
"shell.execute_reply": "2025-04-26T21:37:43.539908Z",
"shell.execute_reply.started": "2025-04-26T21:37:43.533282Z"
}
},
"outputs": [],
"source": [
"def load_reddit_data(source,folder='data_raw'):\n",
" \"\"\"\n",
" Load Reddit daily posts data either from Hugging Face Hub or local files.\n",
"\n",
" Args:\n",
" repo_id (str): Hugging Face repo id (e.g., \"hblim/top_reddit_posts_daily\")\n",
" source (str): \"hub\" to load from Hugging Face, \"local\" to load from local files\n",
" Returns:\n",
" pd.DataFrame: Combined DataFrame of all posts\n",
" \"\"\"\n",
" df = []\n",
" \n",
" if source == \"hub\":\n",
" api = HfApi()\n",
" all_files = api.list_repo_files(\"hblim/top_reddit_posts_daily\", repo_type=\"dataset\")\n",
" parquet_files = sorted([f for f in all_files if f.startswith(folder) and f.endswith(\".parquet\")])\n",
"\n",
" for shard in parquet_files:\n",
" local_path = api.hf_hub_download(repo_id=repo_id, filename=shard, repo_type=\"dataset\")\n",
" file_date = os.path.splitext(os.path.basename(local_path))[0]\n",
" df.append(pd.read_parquet(local_path).assign(filedate=file_date))\n",
"\n",
" elif source == \"local\":\n",
" cwd = os.getcwd()\n",
" local_folder = os.path.join(cwd, \"top_reddit_posts_daily\", folder)\n",
" local_files = sorted(glob.glob(os.path.join(local_folder, \"*.parquet\")))\n",
"\n",
" for local_path in local_files:\n",
" file_date = os.path.splitext(os.path.basename(local_path))[0]\n",
" df.append(pd.read_parquet(local_path).assign(filedate=file_date))\n",
"\n",
" else:\n",
" raise ValueError(f\"Unknown source: {source}. Use 'hub' or 'local'.\")\n",
"\n",
" df = pd.concat(df, ignore_index=True)\n",
" print(f\"Total records across {df.filedate.nunique()} days: {len(df)}\")\n",
" return df"
]
},
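{
"cell_type": "markdown",
"id": "5a1d8e3b-9c4f-4a72-b6d0-3e7f2c9a1b84",
"metadata": {},
"source": [
"Usage note: the \"local\" branch assumes the dataset repo has been cloned into the working directory (e.g. `git clone https://huggingface.co/datasets/hblim/top_reddit_posts_daily`), so the same shards can be read without hitting the Hub:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f3b6d2c-4e5a-4b91-a7c8-1d9e0f2a3b45",
"metadata": {},
"outputs": [],
"source": [
"# Read the same shards from a local clone instead of the Hub\n",
"df_local = load_reddit_data(\"local\")"
]
},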
{
"cell_type": "markdown",
"id": "435fe428-0c99-4d77-9c9d-e9c9a974b16e",
"metadata": {},
"source": [
"### Check if raw and raw-deduplicated data line up\n",
"Is the raw data duplicated anywhere? Then the de-duplicated data should have fewer posts"
]
},
{
"cell_type": "code",
"execution_count": 164,
"id": "e5f35dcd-4267-4bef-bc6a-0e89937441c3",
"metadata": {
"execution": {
"iopub.execute_input": "2025-04-26T21:32:17.106045Z",
"iopub.status.busy": "2025-04-26T21:32:17.106045Z",
"iopub.status.idle": "2025-04-26T21:32:18.082758Z",
"shell.execute_reply": "2025-04-26T21:32:18.082758Z",
"shell.execute_reply.started": "2025-04-26T21:32:17.106045Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total records across 13 days: 3666\n"
]
}
],
"source": [
"df = load_reddit_data(\"hub\")"
]
},
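{
"cell_type": "markdown",
"id": "c4e7a2f9-6b8d-4c13-9e5a-7f0b3d1c8a26",
"metadata": {},
"source": [
"Before checking for duplicates, a quick sanity check on the restored 2025-04-20 shard: every post should be retrieved after it was created. This sketch assumes the creation timestamp column is named `created_utc` (as in the Reddit API); adjust the column name and parsing (e.g. `unit='s'` for epoch seconds) to match the actual schema:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9b5c7d1-2a4f-4f68-8c3e-6d0a9b4e2f57",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: retrieval time must come after creation time\n",
"# (assumes a `created_utc` column; adjust to the dataset's actual schema)\n",
"check = df[df.filedate == \"2025-04-20\"].copy()\n",
"check[\"retrieved_at\"] = pd.to_datetime(check[\"retrieved_at\"], utc=True)\n",
"check[\"created_utc\"] = pd.to_datetime(check[\"created_utc\"], utc=True)\n",
"assert (check[\"retrieved_at\"] > check[\"created_utc\"]).all(), \"found posts retrieved before creation\""
]
},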
{
"cell_type": "code",
"execution_count": 158,
"id": "ff824b99-c38f-4519-87df-54f9946cc20b",
"metadata": {
"execution": {
"iopub.execute_input": "2025-04-26T21:24:44.078709Z",
"iopub.status.busy": "2025-04-26T21:24:44.078709Z",
"iopub.status.idle": "2025-04-26T21:24:44.086147Z",
"shell.execute_reply": "2025-04-26T21:24:44.086147Z",
"shell.execute_reply.started": "2025-04-26T21:24:44.078709Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"filedate\n",
"2025-04-14 312\n",
"2025-04-15 258\n",
"2025-04-16 330\n",
"2025-04-17 324\n",
"2025-04-18 255\n",
"2025-04-19 232\n",
"2025-04-20 251\n",
"2025-04-21 233\n",
"2025-04-22 268\n",
"2025-04-23 331\n",
"2025-04-24 332\n",
"2025-04-25 309\n",
"2025-04-26 231\n",
"Name: subreddit, dtype: int64"
]
},
"execution_count": 158,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby('filedate').subreddit.count()"
]
},
{
"cell_type": "code",
"execution_count": 150,
"id": "6179d986-471b-40cd-bcf5-529f582315ee",
"metadata": {
"execution": {
"iopub.execute_input": "2025-04-26T21:16:54.824986Z",
"iopub.status.busy": "2025-04-26T21:16:54.824986Z",
"iopub.status.idle": "2025-04-26T21:16:54.842945Z",
"shell.execute_reply": "2025-04-26T21:16:54.842945Z",
"shell.execute_reply.started": "2025-04-26T21:16:54.824986Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total records across 13 days: 3666\n"
]
}
],
"source": [
"df[\"retrieved_at\"] = pd.to_datetime(df[\"retrieved_at\"],utc=True)\n",
"\n",
"# Step 1: Find duplicate post_ids\n",
"duplicates = df[df.duplicated(subset=[\"post_id\"], keep=False)]\n",
"\n",
"# Report duplicates and their retrieved_at dates\n",
"duplicate_report = duplicates[[\"post_id\", \"retrieved_at\"]]\n",
"\n",
"# Step 2: De-duplicate keeping the first occurrence, sorted by retrieved_at\n",
"df_deduplicated = df_deduplicated.sort_values(by=\"retrieved_at\").reset_index(drop=True)\n",
"df_deduplicated = df.drop_duplicates(subset=[\"post_id\"], keep=\"first\")\n",
"\n",
"print(f\"Total records across {df_deduplicated.filedate.nunique()} days: {len(df_deduplicated)}\")"
]
},
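{
"cell_type": "markdown",
"id": "1f6a3c8e-5d2b-4e97-a0c4-8b7d9e3f1a62",
"metadata": {},
"source": [
"If any post_ids repeat across shards, `duplicate_report` lists each copy with its retrieval time:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d2e9f4a-8c1b-4d56-b3a7-0f5e6c8d9b34",
"metadata": {},
"outputs": [],
"source": [
"# List duplicated posts with their retrieval times (empty if there are no duplicates)\n",
"duplicate_report.sort_values([\"post_id\", \"retrieved_at\"])"
]
},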
{
"cell_type": "code",
"execution_count": 153,
"id": "67a5fd89-8ddc-4247-ba22-8f411169487f",
"metadata": {
"execution": {
"iopub.execute_input": "2025-04-26T21:17:16.136315Z",
"iopub.status.busy": "2025-04-26T21:17:16.136315Z",
"iopub.status.idle": "2025-04-26T21:17:16.146070Z",
"shell.execute_reply": "2025-04-26T21:17:16.146070Z",
"shell.execute_reply.started": "2025-04-26T21:17:16.136315Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" original | \n",
" deduplicated | \n",
" matching | \n",
"
\n",
" \n",
" filedate | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 2025-04-14 | \n",
" 312 | \n",
" 312 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-15 | \n",
" 258 | \n",
" 258 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-16 | \n",
" 330 | \n",
" 330 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-17 | \n",
" 324 | \n",
" 324 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-18 | \n",
" 255 | \n",
" 255 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-19 | \n",
" 232 | \n",
" 232 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-20 | \n",
" 251 | \n",
" 251 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-21 | \n",
" 233 | \n",
" 233 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-22 | \n",
" 268 | \n",
" 268 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-23 | \n",
" 331 | \n",
" 331 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-24 | \n",
" 332 | \n",
" 332 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-25 | \n",
" 309 | \n",
" 309 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-26 | \n",
" 231 | \n",
" 231 | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" original deduplicated matching\n",
"filedate \n",
"2025-04-14 312 312 True\n",
"2025-04-15 258 258 True\n",
"2025-04-16 330 330 True\n",
"2025-04-17 324 324 True\n",
"2025-04-18 255 255 True\n",
"2025-04-19 232 232 True\n",
"2025-04-20 251 251 True\n",
"2025-04-21 233 233 True\n",
"2025-04-22 268 268 True\n",
"2025-04-23 331 331 True\n",
"2025-04-24 332 332 True\n",
"2025-04-25 309 309 True\n",
"2025-04-26 231 231 True"
]
},
"execution_count": 153,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summary1 = df.groupby('filedate').subreddit.count()\n",
"summary2 = df_deduplicated.groupby('filedate').subreddit.count().loc[summary1.index]\n",
"\n",
"comparison = pd.DataFrame({\n",
" 'original': summary1,\n",
" 'deduplicated': summary2\n",
"})\n",
"\n",
"comparison['matching'] = comparison['original'] == comparison['deduplicated']\n",
"comparison"
]
},
{
"cell_type": "markdown",
"id": "e566f098-1402-41cd-a7ec-83900f91e6fb",
"metadata": {},
"source": [
"### Check if raw and summary data line up"
]
},
{
"cell_type": "code",
"execution_count": 154,
"id": "056e51ff-dabd-474a-84c8-6a76f82a4488",
"metadata": {
"execution": {
"iopub.execute_input": "2025-04-26T21:17:19.652014Z",
"iopub.status.busy": "2025-04-26T21:17:19.645961Z",
"iopub.status.idle": "2025-04-26T21:17:19.790646Z",
"shell.execute_reply": "2025-04-26T21:17:19.790646Z",
"shell.execute_reply.started": "2025-04-26T21:17:19.652014Z"
}
},
"outputs": [],
"source": [
"df_summary = pd.read_csv(api.hf_hub_download(repo_id=repo_id, filename='subreddit_daily_summary.csv', repo_type=\"dataset\"))"
]
},
{
"cell_type": "code",
"execution_count": 155,
"id": "321eb761-6278-47e8-89f4-24b06f5ddeb3",
"metadata": {
"execution": {
"iopub.execute_input": "2025-04-26T21:17:20.795827Z",
"iopub.status.busy": "2025-04-26T21:17:20.795309Z",
"iopub.status.idle": "2025-04-26T21:17:20.805781Z",
"shell.execute_reply": "2025-04-26T21:17:20.804717Z",
"shell.execute_reply.started": "2025-04-26T21:17:20.795827Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" csv_counts | \n",
" parquet_counts | \n",
"
\n",
" \n",
" date | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 2025-04-14 | \n",
" 312 | \n",
" 312 | \n",
"
\n",
" \n",
" 2025-04-15 | \n",
" 258 | \n",
" 258 | \n",
"
\n",
" \n",
" 2025-04-16 | \n",
" 330 | \n",
" 330 | \n",
"
\n",
" \n",
" 2025-04-17 | \n",
" 324 | \n",
" 324 | \n",
"
\n",
" \n",
" 2025-04-18 | \n",
" 255 | \n",
" 255 | \n",
"
\n",
" \n",
" 2025-04-19 | \n",
" 232 | \n",
" 232 | \n",
"
\n",
" \n",
" 2025-04-20 | \n",
" 251 | \n",
" 251 | \n",
"
\n",
" \n",
" 2025-04-21 | \n",
" 233 | \n",
" 233 | \n",
"
\n",
" \n",
" 2025-04-22 | \n",
" 234 | \n",
" 268 | \n",
"
\n",
" \n",
" 2025-04-23 | \n",
" 309 | \n",
" 331 | \n",
"
\n",
" \n",
" 2025-04-24 | \n",
" 332 | \n",
" 332 | \n",
"
\n",
" \n",
" 2025-04-25 | \n",
" 309 | \n",
" 309 | \n",
"
\n",
" \n",
" 2025-04-26 | \n",
" 231 | \n",
" 231 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" csv_counts parquet_counts\n",
"date \n",
"2025-04-14 312 312\n",
"2025-04-15 258 258\n",
"2025-04-16 330 330\n",
"2025-04-17 324 324\n",
"2025-04-18 255 255\n",
"2025-04-19 232 232\n",
"2025-04-20 251 251\n",
"2025-04-21 233 233\n",
"2025-04-22 234 268\n",
"2025-04-23 309 331\n",
"2025-04-24 332 332\n",
"2025-04-25 309 309\n",
"2025-04-26 231 231"
]
},
"execution_count": 155,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First compute both series\n",
"summary1 = df_summary.groupby('date')['count'].sum()\n",
"summary2 = df.groupby('filedate').subreddit.count().loc[summary1.index]\n",
"\n",
"# Now merge into a single DataFrame\n",
"merged = pd.DataFrame({\n",
" 'csv_counts': summary1,\n",
" 'parquet_counts': summary2\n",
"})\n",
"\n",
"merged"
]
},
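{
"cell_type": "markdown",
"id": "b8c1d5f2-3e6a-4f09-9d7b-2a4c8e0f6d91",
"metadata": {},
"source": [
"Any dates where the summary CSV and the raw parquet counts disagree can be pulled out directly:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a9e7b3d-0c5f-4a28-8e6d-1b3f5a7c9e02",
"metadata": {},
"outputs": [],
"source": [
"# Show only the dates where the two sources disagree\n",
"merged[merged['csv_counts'] != merged['parquet_counts']]"
]
},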
{
"cell_type": "markdown",
"id": "96b1f688-c768-4aba-93f6-5247d85f8998",
"metadata": {},
"source": [
"### Check if raw and scored data line up"
]
},
{
"cell_type": "code",
"execution_count": 168,
"id": "4ef592c5-c36e-454a-bd59-d455a8a3e062",
"metadata": {
"execution": {
"iopub.execute_input": "2025-04-26T21:38:12.679864Z",
"iopub.status.busy": "2025-04-26T21:38:12.679864Z",
"iopub.status.idle": "2025-04-26T21:38:15.906491Z",
"shell.execute_reply": "2025-04-26T21:38:15.905943Z",
"shell.execute_reply.started": "2025-04-26T21:38:12.679864Z"
}
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "911485558cf84562889cd9245d5e9a24",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"2025-04-22.parquet: 0%| | 0.00/65.9k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total records across 13 days: 3666\n",
"Total records across 13 days: 3666\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" raw | \n",
" scored | \n",
" matching | \n",
"
\n",
" \n",
" filedate | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 2025-04-14 | \n",
" 312 | \n",
" 312 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-15 | \n",
" 258 | \n",
" 258 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-16 | \n",
" 330 | \n",
" 330 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-17 | \n",
" 324 | \n",
" 324 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-18 | \n",
" 255 | \n",
" 255 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-19 | \n",
" 232 | \n",
" 232 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-20 | \n",
" 251 | \n",
" 251 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-21 | \n",
" 233 | \n",
" 233 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-22 | \n",
" 268 | \n",
" 268 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-23 | \n",
" 331 | \n",
" 331 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-24 | \n",
" 332 | \n",
" 332 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-25 | \n",
" 309 | \n",
" 309 | \n",
" True | \n",
"
\n",
" \n",
" 2025-04-26 | \n",
" 231 | \n",
" 231 | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" raw scored matching\n",
"filedate \n",
"2025-04-14 312 312 True\n",
"2025-04-15 258 258 True\n",
"2025-04-16 330 330 True\n",
"2025-04-17 324 324 True\n",
"2025-04-18 255 255 True\n",
"2025-04-19 232 232 True\n",
"2025-04-20 251 251 True\n",
"2025-04-21 233 233 True\n",
"2025-04-22 268 268 True\n",
"2025-04-23 331 331 True\n",
"2025-04-24 332 332 True\n",
"2025-04-25 309 309 True\n",
"2025-04-26 231 231 True"
]
},
"execution_count": 168,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = load_reddit_data(\"hub\",folder='data_scored')\n",
"\n",
"summary1 = df.groupby('filedate').subreddit.count()\n",
"\n",
"df_scored = load_reddit_data(\"hub\",folder='data_scored')\n",
"summary2 = df_scored.groupby('filedate').subreddit.count().loc[summary1.index]\n",
"\n",
"comparison = pd.DataFrame({\n",
" 'raw': summary1,\n",
" 'scored': summary2\n",
"})\n",
"\n",
"comparison['matching'] = comparison['raw'] == comparison['scored']\n",
"comparison"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:reddit_streamlit]",
"language": "python",
"name": "conda-env-reddit_streamlit-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}