{ "cells": [
 { "cell_type": "markdown", "id": "dd911245-61d7-49db-9960-570f7feefe2b", "metadata": {}, "source": [ "# Loading Reddit Data\n", "\n", "This notebook has helper code to read in the parquet data from the Hugging Face dataset [hblim/top_reddit_posts_daily](https://huggingface.co/datasets/hblim/top_reddit_posts_daily)\n", "\n", "I created this notebook to help me fix an issue where I overwrote `data_raw/2025-04-20.parquet` with 2025-04-23 data during testing\n", "- I went to the Hugging Face version history to see when the 2025-04-20 file was erroneously overwritten, and saw newer commits on 2025-04-23\n", "- I cloned the Hugging Face dataset repo locally and checked out the last correct version of the corrupted 2025-04-20.parquet file\n", "- I verified that the data looked good (e.g. retrieved date > created date) and was not duplicated anywhere else, then pushed the correct version back to the main remote (a rough sketch of this restore step is in the next cells)" ] },
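 { "cell_type": "markdown", "id": "f3a9c0de-restore-sketch-note", "metadata": {}, "source": [ "The actual restore was done through a local git clone of the dataset repo. The cell below is only a rough, unexecuted sketch of how the same restore could be scripted with the `huggingface_hub` API; the commit hash is a placeholder, not the real commit from the repo history, and the cell is left unexecuted so it is not run by accident." ] },
 { "cell_type": "code", "execution_count": null, "id": "f3a9c0de-restore-sketch-code", "metadata": {}, "outputs": [], "source": [ "# Sketch only (not executed): restore data_raw/2025-04-20.parquet from an older revision\n", "from huggingface_hub import HfApi\n", "\n", "api = HfApi()\n", "repo_id = \"hblim/top_reddit_posts_daily\"\n", "\n", "# Inspect the commit history to find the last commit before the bad overwrite\n", "for commit in api.list_repo_commits(repo_id, repo_type=\"dataset\"):\n", "    print(commit.commit_id[:8], commit.created_at, commit.title)\n", "\n", "good_revision = \"0123abcd\"  # placeholder commit hash, not the real one\n", "\n", "# Download the file as it existed at that revision\n", "restored_path = api.hf_hub_download(\n", "    repo_id=repo_id,\n", "    filename=\"data_raw/2025-04-20.parquet\",\n", "    repo_type=\"dataset\",\n", "    revision=good_revision,\n", ")\n", "\n", "# After checking the contents, push the restored file back to main\n", "api.upload_file(\n", "    path_or_fileobj=restored_path,\n", "    path_in_repo=\"data_raw/2025-04-20.parquet\",\n", "    repo_id=repo_id,\n", "    repo_type=\"dataset\",\n", "    commit_message=\"Restore 2025-04-20 raw data overwritten during testing\",\n", ")" ] },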
 { "cell_type": "code", "execution_count": 97, "id": "d5071073-274b-480d-8503-28d2292422d3", "metadata": { "execution": { "iopub.execute_input": "2025-04-26T20:34:47.775286Z", "iopub.status.busy": "2025-04-26T20:34:47.775286Z", "iopub.status.idle": "2025-04-26T20:34:47.779253Z", "shell.execute_reply": "2025-04-26T20:34:47.779253Z", "shell.execute_reply.started": "2025-04-26T20:34:47.775286Z" } }, "outputs": [], "source": [ "import os\n", "import glob\n", "import datetime\n", "from pathlib import Path\n", "from dotenv import load_dotenv\n", "import pandas as pd\n", "import pyarrow\n", "\n", "from huggingface_hub import HfApi" ] },
 { "cell_type": "markdown", "id": "9085f661-ba37-4715-b87b-3699cb75db2f", "metadata": {}, "source": [ "Download all historical data" ] },
 { "cell_type": "code", "execution_count": 167, "id": "0c14356b-721c-4048-8efb-f69d8eae4900", "metadata": { "execution": { "iopub.execute_input": "2025-04-26T21:37:43.533282Z", "iopub.status.busy": "2025-04-26T21:37:43.533282Z", "iopub.status.idle": "2025-04-26T21:37:43.539908Z", "shell.execute_reply": "2025-04-26T21:37:43.539908Z", "shell.execute_reply.started": "2025-04-26T21:37:43.533282Z" } }, "outputs": [], "source": [ "def load_reddit_data(source, folder='data_raw', repo_id='hblim/top_reddit_posts_daily'):\n", "    \"\"\"\n", "    Load Reddit daily posts data either from the Hugging Face Hub or from local files.\n", "\n", "    Args:\n", "        source (str): \"hub\" to load from Hugging Face, \"local\" to load from local files\n", "        folder (str): dataset subfolder to read, e.g. \"data_raw\" or \"data_scored\"\n", "        repo_id (str): Hugging Face repo id (e.g., \"hblim/top_reddit_posts_daily\")\n", "    Returns:\n", "        pd.DataFrame: Combined DataFrame of all posts\n", "    \"\"\"\n", "    frames = []\n", "\n", "    if source == \"hub\":\n", "        api = HfApi()\n", "        all_files = api.list_repo_files(repo_id, repo_type=\"dataset\")\n", "        parquet_files = sorted([f for f in all_files if f.startswith(folder) and f.endswith(\".parquet\")])\n", "\n", "        for shard in parquet_files:\n", "            local_path = api.hf_hub_download(repo_id=repo_id, filename=shard, repo_type=\"dataset\")\n", "            file_date = os.path.splitext(os.path.basename(local_path))[0]\n", "            frames.append(pd.read_parquet(local_path).assign(filedate=file_date))\n", "\n", "    elif source == \"local\":\n", "        cwd = os.getcwd()\n", "        local_folder = os.path.join(cwd, \"top_reddit_posts_daily\", folder)\n", "        local_files = sorted(glob.glob(os.path.join(local_folder, \"*.parquet\")))\n", "\n", "        for local_path in local_files:\n", "            file_date = os.path.splitext(os.path.basename(local_path))[0]\n", "            frames.append(pd.read_parquet(local_path).assign(filedate=file_date))\n", "\n", "    else:\n", "        raise ValueError(f\"Unknown source: {source}. Use 'hub' or 'local'.\")\n", "\n", "    df = pd.concat(frames, ignore_index=True)\n", "    print(f\"Total records across {df.filedate.nunique()} days: {len(df)}\")\n", "    return df" ] },
 { "cell_type": "markdown", "id": "435fe428-0c99-4d77-9c9d-e9c9a974b16e", "metadata": {}, "source": [ "### Check if raw and raw-deduplicated data line up\n", "If the raw data were duplicated anywhere, the de-duplicated data would have fewer posts" ] },
 { "cell_type": "code", "execution_count": 164, "id": "e5f35dcd-4267-4bef-bc6a-0e89937441c3", "metadata": { "execution": { "iopub.execute_input": "2025-04-26T21:32:17.106045Z", "iopub.status.busy": "2025-04-26T21:32:17.106045Z", "iopub.status.idle": "2025-04-26T21:32:18.082758Z", "shell.execute_reply": "2025-04-26T21:32:18.082758Z", "shell.execute_reply.started": "2025-04-26T21:32:17.106045Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total records across 13 days: 3666\n" ] } ], "source": [ "df = load_reddit_data(\"hub\")" ] },
 { "cell_type": "code", "execution_count": 158, "id": "ff824b99-c38f-4519-87df-54f9946cc20b", "metadata": { "execution": { "iopub.execute_input": "2025-04-26T21:24:44.078709Z", "iopub.status.busy": "2025-04-26T21:24:44.078709Z", "iopub.status.idle": "2025-04-26T21:24:44.086147Z", "shell.execute_reply": "2025-04-26T21:24:44.086147Z", "shell.execute_reply.started": "2025-04-26T21:24:44.078709Z" } }, "outputs": [ { "data": { "text/plain": [ "filedate\n", "2025-04-14 312\n", "2025-04-15 258\n", "2025-04-16 330\n", "2025-04-17 324\n", "2025-04-18 255\n", "2025-04-19 232\n", "2025-04-20 251\n", "2025-04-21 233\n", "2025-04-22 268\n", "2025-04-23 331\n", "2025-04-24 332\n", "2025-04-25 309\n", "2025-04-26 231\n", "Name: subreddit, dtype: int64" ] }, "execution_count": 158, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('filedate').subreddit.count()" ] },
 { "cell_type": "code", "execution_count": 150, "id": "6179d986-471b-40cd-bcf5-529f582315ee", "metadata": { "execution": { "iopub.execute_input": "2025-04-26T21:16:54.824986Z", "iopub.status.busy": "2025-04-26T21:16:54.824986Z", "iopub.status.idle": "2025-04-26T21:16:54.842945Z", "shell.execute_reply": "2025-04-26T21:16:54.842945Z", "shell.execute_reply.started": "2025-04-26T21:16:54.824986Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total records across 13 days: 3666\n" ] } ], "source": [ "df[\"retrieved_at\"] = pd.to_datetime(df[\"retrieved_at\"], utc=True)\n", "\n", "# Step 1: Find duplicate post_ids\n", "duplicates = df[df.duplicated(subset=[\"post_id\"], keep=False)]\n", "\n", "# Report duplicates and their retrieved_at dates\n", "duplicate_report = duplicates[[\"post_id\", \"retrieved_at\"]]\n", "\n", "# Step 2: De-duplicate, keeping the earliest occurrence by retrieved_at\n", "df_deduplicated = df.sort_values(by=\"retrieved_at\").reset_index(drop=True)\n", "df_deduplicated = df_deduplicated.drop_duplicates(subset=[\"post_id\"], keep=\"first\")\n", "\n", "print(f\"Total records across {df_deduplicated.filedate.nunique()} days: {len(df_deduplicated)}\")" ] },
 { "cell_type": "code", "execution_count": 153, "id": "67a5fd89-8ddc-4247-ba22-8f411169487f", "metadata": { "execution": { "iopub.execute_input": "2025-04-26T21:17:16.136315Z", "iopub.status.busy": "2025-04-26T21:17:16.136315Z", "iopub.status.idle": "2025-04-26T21:17:16.146070Z", "shell.execute_reply": "2025-04-26T21:17:16.146070Z", "shell.execute_reply.started": "2025-04-26T21:17:16.136315Z" } }, "outputs": [ { "data": {
 "text/plain": [ " original deduplicated matching\n", "filedate \n", "2025-04-14 312 312 True\n", "2025-04-15 258 258 True\n", "2025-04-16 330 330 True\n", "2025-04-17 324 324 True\n", "2025-04-18 255 255 True\n", "2025-04-19 232 232 True\n", "2025-04-20 251 251 True\n", "2025-04-21 233 233 True\n", "2025-04-22 268 268 True\n", "2025-04-23 331 331 True\n", "2025-04-24 332 332 True\n", "2025-04-25 309 309 True\n", "2025-04-26 231 231 True" ] }, "execution_count": 153, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary1 = df.groupby('filedate').subreddit.count()\n", "summary2 = df_deduplicated.groupby('filedate').subreddit.count().loc[summary1.index]\n", "\n", "comparison = pd.DataFrame({\n", "    'original': summary1,\n", "    'deduplicated': summary2\n", "})\n", "\n", "comparison['matching'] = comparison['original'] == comparison['deduplicated']\n", "comparison" ] },
 { "cell_type": "markdown", "id": "e566f098-1402-41cd-a7ec-83900f91e6fb", "metadata": {}, "source": [ "### Check if raw and summary data line up" ] },
 { "cell_type": "code", "execution_count": 154, "id": "056e51ff-dabd-474a-84c8-6a76f82a4488", "metadata": { "execution": { "iopub.execute_input": "2025-04-26T21:17:19.652014Z", "iopub.status.busy": "2025-04-26T21:17:19.645961Z", "iopub.status.idle": "2025-04-26T21:17:19.790646Z", "shell.execute_reply": "2025-04-26T21:17:19.790646Z", "shell.execute_reply.started": "2025-04-26T21:17:19.652014Z" } }, "outputs": [], "source": [ "api = HfApi()\n", "repo_id = \"hblim/top_reddit_posts_daily\"\n", "df_summary = pd.read_csv(api.hf_hub_download(repo_id=repo_id, filename='subreddit_daily_summary.csv', repo_type=\"dataset\"))" ] },
 { "cell_type": "code", "execution_count": 155, "id": "321eb761-6278-47e8-89f4-24b06f5ddeb3", "metadata": { "execution": { "iopub.execute_input": "2025-04-26T21:17:20.795827Z", "iopub.status.busy": "2025-04-26T21:17:20.795309Z", "iopub.status.idle": "2025-04-26T21:17:20.805781Z", "shell.execute_reply": "2025-04-26T21:17:20.804717Z", "shell.execute_reply.started": "2025-04-26T21:17:20.795827Z" } }, "outputs": [ { "data": {
 "text/plain": [ " csv_counts parquet_counts\n", "date \n", "2025-04-14 312 312\n", "2025-04-15 258 258\n", "2025-04-16 330 330\n", "2025-04-17 324 324\n", "2025-04-18 255 255\n", "2025-04-19 232 232\n", "2025-04-20 251 251\n", "2025-04-21 233 233\n", "2025-04-22 234 268\n", "2025-04-23 309 331\n", "2025-04-24 332 332\n", "2025-04-25 309 309\n", "2025-04-26 231 231" ] }, "execution_count": 155, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First compute both series\n", "summary1 = df_summary.groupby('date')['count'].sum()\n", "summary2 = df.groupby('filedate').subreddit.count().loc[summary1.index]\n", "\n", "# Now merge into a single DataFrame\n", "merged = pd.DataFrame({\n", "    'csv_counts': summary1,\n", "    'parquet_counts': summary2\n", "})\n", "\n", "merged" ] },
 { "cell_type": "markdown", "id": "96b1f688-c768-4aba-93f6-5247d85f8998", "metadata": {}, "source": [ "### Check if raw and scored data line up" ] },
 { "cell_type": "code", "execution_count": 168, "id": "4ef592c5-c36e-454a-bd59-d455a8a3e062", "metadata": { "execution": { "iopub.execute_input": "2025-04-26T21:38:12.679864Z", "iopub.status.busy": "2025-04-26T21:38:12.679864Z", "iopub.status.idle": "2025-04-26T21:38:15.906491Z", "shell.execute_reply": "2025-04-26T21:38:15.905943Z", "shell.execute_reply.started": "2025-04-26T21:38:12.679864Z" } }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "911485558cf84562889cd9245d5e9a24", "version_major": 2, "version_minor": 0 }, "text/plain": [ "2025-04-22.parquet: 0%| | 0.00/65.9k [00:00<?, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": {
 "text/plain": [ " raw scored matching\n", "filedate \n", "2025-04-14 312 312 True\n", "2025-04-15 258 258 True\n", "2025-04-16 330 330 True\n", "2025-04-17 324 324 True\n", "2025-04-18 255 255 True\n", "2025-04-19 232 232 True\n", "2025-04-20 251 251 True\n", "2025-04-21 233 233 True\n", "2025-04-22 268 268 True\n", "2025-04-23 331 331 True\n", "2025-04-24 332 332 True\n", "2025-04-25 309 309 True\n", "2025-04-26 231 231 True" ] }, "execution_count": 168, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = load_reddit_data(\"hub\")\n", "\n", "summary1 = df.groupby('filedate').subreddit.count()\n", "\n", "df_scored = load_reddit_data(\"hub\", folder='data_scored')\n", "summary2 = df_scored.groupby('filedate').subreddit.count().loc[summary1.index]\n", "\n", "comparison = pd.DataFrame({\n", "    'raw': summary1,\n", "    'scored': summary2\n", "})\n", "\n", "comparison['matching'] = comparison['raw'] == comparison['scored']\n", "comparison" ] }
 ], "metadata": { "kernelspec": { "display_name": "Python [conda env:reddit_streamlit]", "language": "python", "name": "conda-env-reddit_streamlit-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.10" } }, "nbformat": 4, "nbformat_minor": 5 }