Spaces:

hblim
/

reddit_sentiment_tracker

Running

File size: 24,583 Bytes

a6576f0

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dd911245-61d7-49db-9960-570f7feefe2b",
   "metadata": {},
   "source": [
    "# Loading Reddit Data\n",
    "\n",
    "This notebook has functions to read in parquet data from Hugging Face dataset [hblim/top_reddit_posts_daily](https://huggingface.co/datasets/hblim/top_reddit_posts_daily)\n",
    "\n",
    "I created this notebook to help me fix a issue where I overwrote `data_raw/2025-04-20.parquet` with 2025-04-23 data during testing\n",
    "- I went to Hugging Face version history to see when the 2025-04-20 file was overwritten erroneously, and saw newer commits on 2025-04-23)\n",
    "- I cloned the Hugging Face dataset repo locally and checked out last correct version for the corrupted 2025-04-20.parquet file\n",
    "- Verified that the data looked good (e.g. retrieved date > created date), and not duplicated anywhere else, and then pushed the correct version back to the  main remote"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "d5071073-274b-480d-8503-28d2292422d3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-26T20:34:47.775286Z",
     "iopub.status.busy": "2025-04-26T20:34:47.775286Z",
     "iopub.status.idle": "2025-04-26T20:34:47.779253Z",
     "shell.execute_reply": "2025-04-26T20:34:47.779253Z",
     "shell.execute_reply.started": "2025-04-26T20:34:47.775286Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "import datetime\n",
    "from pathlib import Path\n",
    "from dotenv import load_dotenv\n",
    "import pandas as pd\n",
    "import pyarrow\n",
    "\n",
    "from huggingface_hub import HfApi"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9085f661-ba37-4715-b87b-3699cb75db2f",
   "metadata": {},
   "source": [
    "Download all historical data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "id": "0c14356b-721c-4048-8efb-f69d8eae4900",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-26T21:37:43.533282Z",
     "iopub.status.busy": "2025-04-26T21:37:43.533282Z",
     "iopub.status.idle": "2025-04-26T21:37:43.539908Z",
     "shell.execute_reply": "2025-04-26T21:37:43.539908Z",
     "shell.execute_reply.started": "2025-04-26T21:37:43.533282Z"
    }
   },
   "outputs": [],
   "source": [
    "def load_reddit_data(source,folder='data_raw'):\n",
    "    \"\"\"\n",
    "    Load Reddit daily posts data either from Hugging Face Hub or local files.\n",
    "\n",
    "    Args:\n",
    "        repo_id (str): Hugging Face repo id (e.g., \"hblim/top_reddit_posts_daily\")\n",
    "        source (str): \"hub\" to load from Hugging Face, \"local\" to load from local files\n",
    "    Returns:\n",
    "        pd.DataFrame: Combined DataFrame of all posts\n",
    "    \"\"\"\n",
    "    df = []\n",
    "    \n",
    "    if source == \"hub\":\n",
    "        api = HfApi()\n",
    "        all_files = api.list_repo_files(\"hblim/top_reddit_posts_daily\", repo_type=\"dataset\")\n",
    "        parquet_files = sorted([f for f in all_files if f.startswith(folder) and f.endswith(\".parquet\")])\n",
    "\n",
    "        for shard in parquet_files:\n",
    "            local_path = api.hf_hub_download(repo_id=repo_id, filename=shard, repo_type=\"dataset\")\n",
    "            file_date = os.path.splitext(os.path.basename(local_path))[0]\n",
    "            df.append(pd.read_parquet(local_path).assign(filedate=file_date))\n",
    "\n",
    "    elif source == \"local\":\n",
    "        cwd = os.getcwd()\n",
    "        local_folder = os.path.join(cwd, \"top_reddit_posts_daily\", folder)\n",
    "        local_files = sorted(glob.glob(os.path.join(local_folder, \"*.parquet\")))\n",
    "\n",
    "        for local_path in local_files:\n",
    "            file_date = os.path.splitext(os.path.basename(local_path))[0]\n",
    "            df.append(pd.read_parquet(local_path).assign(filedate=file_date))\n",
    "\n",
    "    else:\n",
    "        raise ValueError(f\"Unknown source: {source}. Use 'hub' or 'local'.\")\n",
    "\n",
    "    df = pd.concat(df, ignore_index=True)\n",
    "    print(f\"Total records across {df.filedate.nunique()} days: {len(df)}\")\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "435fe428-0c99-4d77-9c9d-e9c9a974b16e",
   "metadata": {},
   "source": [
    "### Check if raw and raw-deduplicated data line up\n",
    "Is the raw data duplicated anywhere? Then the de-duplicated data should have fewer posts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "id": "e5f35dcd-4267-4bef-bc6a-0e89937441c3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-26T21:32:17.106045Z",
     "iopub.status.busy": "2025-04-26T21:32:17.106045Z",
     "iopub.status.idle": "2025-04-26T21:32:18.082758Z",
     "shell.execute_reply": "2025-04-26T21:32:18.082758Z",
     "shell.execute_reply.started": "2025-04-26T21:32:17.106045Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total records across 13 days: 3666\n"
     ]
    }
   ],
   "source": [
    "df = load_reddit_data(\"hub\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "id": "ff824b99-c38f-4519-87df-54f9946cc20b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-26T21:24:44.078709Z",
     "iopub.status.busy": "2025-04-26T21:24:44.078709Z",
     "iopub.status.idle": "2025-04-26T21:24:44.086147Z",
     "shell.execute_reply": "2025-04-26T21:24:44.086147Z",
     "shell.execute_reply.started": "2025-04-26T21:24:44.078709Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "filedate\n",
       "2025-04-14    312\n",
       "2025-04-15    258\n",
       "2025-04-16    330\n",
       "2025-04-17    324\n",
       "2025-04-18    255\n",
       "2025-04-19    232\n",
       "2025-04-20    251\n",
       "2025-04-21    233\n",
       "2025-04-22    268\n",
       "2025-04-23    331\n",
       "2025-04-24    332\n",
       "2025-04-25    309\n",
       "2025-04-26    231\n",
       "Name: subreddit, dtype: int64"
      ]
     },
     "execution_count": 158,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby('filedate').subreddit.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "id": "6179d986-471b-40cd-bcf5-529f582315ee",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-26T21:16:54.824986Z",
     "iopub.status.busy": "2025-04-26T21:16:54.824986Z",
     "iopub.status.idle": "2025-04-26T21:16:54.842945Z",
     "shell.execute_reply": "2025-04-26T21:16:54.842945Z",
     "shell.execute_reply.started": "2025-04-26T21:16:54.824986Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total records across 13 days: 3666\n"
     ]
    }
   ],
   "source": [
    "df[\"retrieved_at\"] = pd.to_datetime(df[\"retrieved_at\"],utc=True)\n",
    "\n",
    "# Step 1: Find duplicate post_ids\n",
    "duplicates = df[df.duplicated(subset=[\"post_id\"], keep=False)]\n",
    "\n",
    "# Report duplicates and their retrieved_at dates\n",
    "duplicate_report = duplicates[[\"post_id\", \"retrieved_at\"]]\n",
    "\n",
    "# Step 2: De-duplicate keeping the first occurrence, sorted by retrieved_at\n",
    "df_deduplicated = df_deduplicated.sort_values(by=\"retrieved_at\").reset_index(drop=True)\n",
    "df_deduplicated = df.drop_duplicates(subset=[\"post_id\"], keep=\"first\")\n",
    "\n",
    "print(f\"Total records across {df_deduplicated.filedate.nunique()} days: {len(df_deduplicated)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "id": "67a5fd89-8ddc-4247-ba22-8f411169487f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-26T21:17:16.136315Z",
     "iopub.status.busy": "2025-04-26T21:17:16.136315Z",
     "iopub.status.idle": "2025-04-26T21:17:16.146070Z",
     "shell.execute_reply": "2025-04-26T21:17:16.146070Z",
     "shell.execute_reply.started": "2025-04-26T21:17:16.136315Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>original</th>\n",
       "      <th>deduplicated</th>\n",
       "      <th>matching</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>filedate</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2025-04-14</th>\n",
       "      <td>312</td>\n",
       "      <td>312</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-15</th>\n",
       "      <td>258</td>\n",
       "      <td>258</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-16</th>\n",
       "      <td>330</td>\n",
       "      <td>330</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-17</th>\n",
       "      <td>324</td>\n",
       "      <td>324</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-18</th>\n",
       "      <td>255</td>\n",
       "      <td>255</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-19</th>\n",
       "      <td>232</td>\n",
       "      <td>232</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-20</th>\n",
       "      <td>251</td>\n",
       "      <td>251</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-21</th>\n",
       "      <td>233</td>\n",
       "      <td>233</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-22</th>\n",
       "      <td>268</td>\n",
       "      <td>268</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-23</th>\n",
       "      <td>331</td>\n",
       "      <td>331</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-24</th>\n",
       "      <td>332</td>\n",
       "      <td>332</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-25</th>\n",
       "      <td>309</td>\n",
       "      <td>309</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-26</th>\n",
       "      <td>231</td>\n",
       "      <td>231</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            original  deduplicated  matching\n",
       "filedate                                    \n",
       "2025-04-14       312           312      True\n",
       "2025-04-15       258           258      True\n",
       "2025-04-16       330           330      True\n",
       "2025-04-17       324           324      True\n",
       "2025-04-18       255           255      True\n",
       "2025-04-19       232           232      True\n",
       "2025-04-20       251           251      True\n",
       "2025-04-21       233           233      True\n",
       "2025-04-22       268           268      True\n",
       "2025-04-23       331           331      True\n",
       "2025-04-24       332           332      True\n",
       "2025-04-25       309           309      True\n",
       "2025-04-26       231           231      True"
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary1 = df.groupby('filedate').subreddit.count()\n",
    "summary2 = df_deduplicated.groupby('filedate').subreddit.count().loc[summary1.index]\n",
    "\n",
    "comparison = pd.DataFrame({\n",
    "    'original': summary1,\n",
    "    'deduplicated': summary2\n",
    "})\n",
    "\n",
    "comparison['matching'] = comparison['original'] == comparison['deduplicated']\n",
    "comparison"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e566f098-1402-41cd-a7ec-83900f91e6fb",
   "metadata": {},
   "source": [
    "### Check if raw and summary data line up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "id": "056e51ff-dabd-474a-84c8-6a76f82a4488",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-26T21:17:19.652014Z",
     "iopub.status.busy": "2025-04-26T21:17:19.645961Z",
     "iopub.status.idle": "2025-04-26T21:17:19.790646Z",
     "shell.execute_reply": "2025-04-26T21:17:19.790646Z",
     "shell.execute_reply.started": "2025-04-26T21:17:19.652014Z"
    }
   },
   "outputs": [],
   "source": [
    "df_summary = pd.read_csv(api.hf_hub_download(repo_id=repo_id, filename='subreddit_daily_summary.csv', repo_type=\"dataset\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "id": "321eb761-6278-47e8-89f4-24b06f5ddeb3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-26T21:17:20.795827Z",
     "iopub.status.busy": "2025-04-26T21:17:20.795309Z",
     "iopub.status.idle": "2025-04-26T21:17:20.805781Z",
     "shell.execute_reply": "2025-04-26T21:17:20.804717Z",
     "shell.execute_reply.started": "2025-04-26T21:17:20.795827Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>csv_counts</th>\n",
       "      <th>parquet_counts</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2025-04-14</th>\n",
       "      <td>312</td>\n",
       "      <td>312</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-15</th>\n",
       "      <td>258</td>\n",
       "      <td>258</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-16</th>\n",
       "      <td>330</td>\n",
       "      <td>330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-17</th>\n",
       "      <td>324</td>\n",
       "      <td>324</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-18</th>\n",
       "      <td>255</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-19</th>\n",
       "      <td>232</td>\n",
       "      <td>232</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-20</th>\n",
       "      <td>251</td>\n",
       "      <td>251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-21</th>\n",
       "      <td>233</td>\n",
       "      <td>233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-22</th>\n",
       "      <td>234</td>\n",
       "      <td>268</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-23</th>\n",
       "      <td>309</td>\n",
       "      <td>331</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-24</th>\n",
       "      <td>332</td>\n",
       "      <td>332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-25</th>\n",
       "      <td>309</td>\n",
       "      <td>309</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-26</th>\n",
       "      <td>231</td>\n",
       "      <td>231</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            csv_counts  parquet_counts\n",
       "date                                  \n",
       "2025-04-14         312             312\n",
       "2025-04-15         258             258\n",
       "2025-04-16         330             330\n",
       "2025-04-17         324             324\n",
       "2025-04-18         255             255\n",
       "2025-04-19         232             232\n",
       "2025-04-20         251             251\n",
       "2025-04-21         233             233\n",
       "2025-04-22         234             268\n",
       "2025-04-23         309             331\n",
       "2025-04-24         332             332\n",
       "2025-04-25         309             309\n",
       "2025-04-26         231             231"
      ]
     },
     "execution_count": 155,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First compute both series\n",
    "summary1 = df_summary.groupby('date')['count'].sum()\n",
    "summary2 = df.groupby('filedate').subreddit.count().loc[summary1.index]\n",
    "\n",
    "# Now merge into a single DataFrame\n",
    "merged = pd.DataFrame({\n",
    "    'csv_counts': summary1,\n",
    "    'parquet_counts': summary2\n",
    "})\n",
    "\n",
    "merged"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "96b1f688-c768-4aba-93f6-5247d85f8998",
   "metadata": {},
   "source": [
    "### Check if raw and scored data line up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "id": "4ef592c5-c36e-454a-bd59-d455a8a3e062",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-26T21:38:12.679864Z",
     "iopub.status.busy": "2025-04-26T21:38:12.679864Z",
     "iopub.status.idle": "2025-04-26T21:38:15.906491Z",
     "shell.execute_reply": "2025-04-26T21:38:15.905943Z",
     "shell.execute_reply.started": "2025-04-26T21:38:12.679864Z"
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "911485558cf84562889cd9245d5e9a24",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "2025-04-22.parquet:   0%|          | 0.00/65.9k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total records across 13 days: 3666\n",
      "Total records across 13 days: 3666\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>raw</th>\n",
       "      <th>scored</th>\n",
       "      <th>matching</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>filedate</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2025-04-14</th>\n",
       "      <td>312</td>\n",
       "      <td>312</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-15</th>\n",
       "      <td>258</td>\n",
       "      <td>258</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-16</th>\n",
       "      <td>330</td>\n",
       "      <td>330</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-17</th>\n",
       "      <td>324</td>\n",
       "      <td>324</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-18</th>\n",
       "      <td>255</td>\n",
       "      <td>255</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-19</th>\n",
       "      <td>232</td>\n",
       "      <td>232</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-20</th>\n",
       "      <td>251</td>\n",
       "      <td>251</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-21</th>\n",
       "      <td>233</td>\n",
       "      <td>233</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-22</th>\n",
       "      <td>268</td>\n",
       "      <td>268</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-23</th>\n",
       "      <td>331</td>\n",
       "      <td>331</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-24</th>\n",
       "      <td>332</td>\n",
       "      <td>332</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-25</th>\n",
       "      <td>309</td>\n",
       "      <td>309</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2025-04-26</th>\n",
       "      <td>231</td>\n",
       "      <td>231</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            raw  scored  matching\n",
       "filedate                         \n",
       "2025-04-14  312     312      True\n",
       "2025-04-15  258     258      True\n",
       "2025-04-16  330     330      True\n",
       "2025-04-17  324     324      True\n",
       "2025-04-18  255     255      True\n",
       "2025-04-19  232     232      True\n",
       "2025-04-20  251     251      True\n",
       "2025-04-21  233     233      True\n",
       "2025-04-22  268     268      True\n",
       "2025-04-23  331     331      True\n",
       "2025-04-24  332     332      True\n",
       "2025-04-25  309     309      True\n",
       "2025-04-26  231     231      True"
      ]
     },
     "execution_count": 168,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = load_reddit_data(\"hub\",folder='data_scored')\n",
    "\n",
    "summary1 = df.groupby('filedate').subreddit.count()\n",
    "\n",
    "df_scored = load_reddit_data(\"hub\",folder='data_scored')\n",
    "summary2 = df_scored.groupby('filedate').subreddit.count().loc[summary1.index]\n",
    "\n",
    "comparison = pd.DataFrame({\n",
    "    'raw': summary1,\n",
    "    'scored': summary2\n",
    "})\n",
    "\n",
    "comparison['matching'] = comparison['raw'] == comparison['scored']\n",
    "comparison"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:reddit_streamlit]",
   "language": "python",
   "name": "conda-env-reddit_streamlit-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}