Split daily Reddit Parquet shards by subreddit and re-upload.

In [7]:
!pip install -q pyarrow fastparquet
!pip install -U huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Downloading huggingface_hub-0.32.4-py3-none-any.whl (512 kB)
Installing collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.30.2
    Uninstalling huggingface-hub-0.30.2:
      Successfully uninstalled huggingface-hub-0.30.2
Successfully installed huggingface_hub-0.32.4


In [11]:
from __future__ import annotations

import os
import re
import shutil
import tempfile
from pathlib import Path
from typing import Iterable

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download, CommitOperationAdd
from tqdm.auto import tqdm

from dotenv import load_dotenv

load_dotenv()

True

In [12]:
def _sanitize(name: str) -> str:
    """
    Make subreddit safe for filenames (removes slashes, spaces, etc.).
    """
    name = name.strip().lower()
    name = re.sub(r"[^\w\-\.]", "_", name)  # keep letters, numbers, _, -, .
    return name
    
def split_and_upload_by_subreddit(
    repo_id: str = "hblim/top_reddit_posts_daily",
    source_folder: str = "data_scored",
    target_folder: str = "data_scored_subreddit",
    overwrite: bool = False,
    batch_size: int = 20,
    token: str | None = None,
) -> None:
    """
    For every Parquet in `source_folder`, create one Parquet per subreddit
    and upload to `target_folder`.

    Each daily shard `<source_folder>/<date>.parquet` is downloaded, grouped by
    its `subreddit` column, and every group is written to
    `<target_folder>/<date>__<sanitized subreddit>.parquet`.

    Parameters
    ----------
    repo_id : str
        Hugging Face dataset repo id.
    source_folder : str
        Folder that already contains the daily Parquet files. Only files
        *inside* this folder are considered; sibling folders that merely share
        the prefix (e.g. `data_scored_subreddit` vs `data_scored`) are ignored.
    target_folder : str
        New folder to hold subreddit-level Parquet shards.
    overwrite : bool
        Re-process / re-upload even if the target file already exists.
    batch_size : int
        Flush the upload queue once it holds at least this many files
        (reduces commit spam). The check runs after each daily shard, so a
        single commit may contain somewhat more than `batch_size` files.
    token : str | None
        HF token; if None, uses the one stored by `huggingface-cli login`.

    Raises
    ------
    RuntimeError
        If no Parquet file is found inside `source_folder`.

    Notes
    -----
    Rows whose `subreddit` is null are not written anywhere, because pandas
    `groupby` drops null keys by default.
    """
    api = HfApi(token=token)

    # Match on "<folder>/" rather than the bare folder name: a plain
    # startswith("data_scored") would also pick up "data_scored_subreddit/..."
    # and re-split shards produced by an earlier run. An empty source folder
    # keeps its old meaning of "the whole repo".
    source_dir = source_folder.rstrip("/")
    source_prefix = f"{source_dir}/" if source_dir else ""

    # 1. discover daily Parquet files in the repo
    # (a set makes the "already uploaded?" check below O(1) per shard)
    files_in_repo: set[str] = set(api.list_repo_files(repo_id, repo_type="dataset"))
    daily_files = sorted(
        f for f in files_in_repo
        if f.startswith(source_prefix) and f.endswith(".parquet")
    )
    if not daily_files:
        raise RuntimeError(f"No Parquet files found in {source_folder}")

    print(f"Found {len(daily_files)} daily shards in {source_folder}")

    # Downloads and the generated shards both live in a temp dir, so nothing
    # is left on disk once the function returns.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_dir = Path(tmp_dir)

        upload_queue: list[tuple[Path, str]] = []
        pbar = tqdm(daily_files, desc="processing days", unit="file")

        for remote_path in pbar:
            file_date = Path(remote_path).stem  # e.g. 2025-05-31
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=remote_path,
                repo_type="dataset",
                cache_dir=tmp_dir,  # keep inside temp dir
            )
            df = pd.read_parquet(local_path)

            # 2. split by subreddit
            for subreddit, sub_df in df.groupby("subreddit", sort=False):
                safe_sub = _sanitize(subreddit)
                out_fname = f"{file_date}__{safe_sub}.parquet"
                out_repo_path = f"{target_folder}/{out_fname}"

                # skip if already in repo and not overwriting
                if not overwrite and out_repo_path in files_in_repo:
                    continue

                out_local = tmp_dir / out_fname
                sub_df.to_parquet(out_local, index=False)
                upload_queue.append((out_local, out_repo_path))

            # upload in batches to reduce commit churn
            if len(upload_queue) >= batch_size:
                _flush_upload_queue(api, repo_id, upload_queue)
                upload_queue.clear()

        # flush any leftovers (must happen before the temp dir is removed)
        if upload_queue:
            _flush_upload_queue(api, repo_id, upload_queue)

    print("✅ Done – all subreddit shards uploaded.")


def _flush_upload_queue(api: HfApi, repo_id: str,
                        queue: list[tuple[Path, str]]) -> None:
    """Upload a batch of files in one commit (works on ≥0.28)."""
    if not queue:
        return

    ops = [
        CommitOperationAdd(
            path_in_repo=dst,         # where the file will live in the repo
            path_or_fileobj=str(src)  # local temp file
        )
        for src, dst in queue
    ]

    api.create_commit(
        repo_id=repo_id,
        repo_type="dataset",
        operations=ops,
        commit_message=f"Add {len(queue)} subreddit parquet file(s)",
    )

In [13]:
# Run the split against the default dataset repo.
# Change repo_id (or additionally pass token=...) to target another repo / account.
split_and_upload_by_subreddit(
    repo_id="hblim/top_reddit_posts_daily",
    target_folder="data_scored_subreddit",
    source_folder="data_scored",
    batch_size=50,    # larger -> fewer, bigger commits
    overwrite=False,  # True regenerates shards that already exist in the repo
)

Found 35 daily shards in data_scored


processing days:   0%|          | 0/35 [00:00<?, ?file/s]

2025-05-01.parquet:   0%|          | 0.00/271k [00:00<?, ?B/s]

2025-05-02.parquet:   0%|          | 0.00/202k [00:00<?, ?B/s]

2025-05-03.parquet:   0%|          | 0.00/231k [00:00<?, ?B/s]

2025-05-04.parquet:   0%|          | 0.00/195k [00:00<?, ?B/s]

2025-05-05.parquet:   0%|          | 0.00/225k [00:00<?, ?B/s]

2025-05-06.parquet:   0%|          | 0.00/225k [00:00<?, ?B/s]

2025-05-07.parquet:   0%|          | 0.00/188k [00:00<?, ?B/s]

2025-05-08.parquet:   0%|          | 0.00/228k [00:00<?, ?B/s]

2025-05-09.parquet:   0%|          | 0.00/221k [00:00<?, ?B/s]

2025-05-10.parquet:   0%|          | 0.00/190k [00:00<?, ?B/s]

2025-05-11.parquet:   0%|          | 0.00/193k [00:00<?, ?B/s]

2025-05-12.parquet:   0%|          | 0.00/230k [00:00<?, ?B/s]

2025-05-13.parquet:   0%|          | 0.00/221k [00:00<?, ?B/s]

2025-05-01__localllama.parquet:   0%|          | 0.00/151k [00:00<?, ?B/s]

2025-05-01__artificial.parquet:   0%|          | 0.00/36.2k [00:00<?, ?B/s]

2025-05-01__singularity.parquet:   0%|          | 0.00/51.6k [00:00<?, ?B/s]

2025-05-01__openai.parquet:   0%|          | 0.00/59.1k [00:00<?, ?B/s]

Upload 52 LFS files:   0%|          | 0/52 [00:00<?, ?it/s]

2025-05-02__artificial.parquet:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

2025-05-02__localllama.parquet:   0%|          | 0.00/89.5k [00:00<?, ?B/s]

2025-05-02__singularity.parquet:   0%|          | 0.00/44.8k [00:00<?, ?B/s]

2025-05-02__openai.parquet:   0%|          | 0.00/66.8k [00:00<?, ?B/s]

2025-05-03__artificial.parquet:   0%|          | 0.00/25.3k [00:00<?, ?B/s]

2025-05-03__localllama.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

2025-05-03__singularity.parquet:   0%|          | 0.00/57.9k [00:00<?, ?B/s]

2025-05-03__openai.parquet:   0%|          | 0.00/60.2k [00:00<?, ?B/s]

2025-05-04__artificial.parquet:   0%|          | 0.00/23.6k [00:00<?, ?B/s]

2025-05-04__localllama.parquet:   0%|          | 0.00/83.6k [00:00<?, ?B/s]

2025-05-04__singularity.parquet:   0%|          | 0.00/42.0k [00:00<?, ?B/s]

2025-05-04__openai.parquet:   0%|          | 0.00/68.2k [00:00<?, ?B/s]

2025-05-05__artificial.parquet:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

2025-05-05__localllama.parquet:   0%|          | 0.00/108k [00:00<?, ?B/s]

2025-05-05__singularity.parquet:   0%|          | 0.00/62.5k [00:00<?, ?B/s]

2025-05-05__openai.parquet:   0%|          | 0.00/65.9k [00:00<?, ?B/s]

2025-05-06__artificial.parquet:   0%|          | 0.00/32.2k [00:00<?, ?B/s]

2025-05-06__localllama.parquet:   0%|          | 0.00/107k [00:00<?, ?B/s]

2025-05-06__singularity.parquet:   0%|          | 0.00/41.9k [00:00<?, ?B/s]

2025-05-06__openai.parquet:   0%|          | 0.00/68.4k [00:00<?, ?B/s]

2025-05-07__artificial.parquet:   0%|          | 0.00/32.8k [00:00<?, ?B/s]

2025-05-07__localllama.parquet:   0%|          | 0.00/89.0k [00:00<?, ?B/s]

2025-05-07__singularity.parquet:   0%|          | 0.00/45.4k [00:00<?, ?B/s]

2025-05-07__openai.parquet:   0%|          | 0.00/46.6k [00:00<?, ?B/s]

2025-05-08__artificial.parquet:   0%|          | 0.00/21.3k [00:00<?, ?B/s]

2025-05-08__localllama.parquet:   0%|          | 0.00/96.9k [00:00<?, ?B/s]

2025-05-08__singularity.parquet:   0%|          | 0.00/61.1k [00:00<?, ?B/s]

2025-05-08__openai.parquet:   0%|          | 0.00/72.3k [00:00<?, ?B/s]

2025-05-09__artificial.parquet:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

2025-05-09__localllama.parquet:   0%|          | 0.00/95.1k [00:00<?, ?B/s]

2025-05-09__singularity.parquet:   0%|          | 0.00/64.4k [00:00<?, ?B/s]

2025-05-09__openai.parquet:   0%|          | 0.00/66.6k [00:00<?, ?B/s]

2025-05-10__artificial.parquet:   0%|          | 0.00/27.6k [00:00<?, ?B/s]

2025-05-10__localllama.parquet:   0%|          | 0.00/74.8k [00:00<?, ?B/s]

2025-05-10__singularity.parquet:   0%|          | 0.00/49.6k [00:00<?, ?B/s]

2025-05-10__openai.parquet:   0%|          | 0.00/62.1k [00:00<?, ?B/s]

2025-05-11__artificial.parquet:   0%|          | 0.00/24.4k [00:00<?, ?B/s]

2025-05-11__localllama.parquet:   0%|          | 0.00/87.3k [00:00<?, ?B/s]

2025-05-11__singularity.parquet:   0%|          | 0.00/43.2k [00:00<?, ?B/s]

2025-05-11__openai.parquet:   0%|          | 0.00/61.2k [00:00<?, ?B/s]

2025-05-12__artificial.parquet:   0%|          | 0.00/34.5k [00:00<?, ?B/s]

2025-05-12__localllama.parquet:   0%|          | 0.00/91.9k [00:00<?, ?B/s]

2025-05-12__singularity.parquet:   0%|          | 0.00/67.2k [00:00<?, ?B/s]

2025-05-12__openai.parquet:   0%|          | 0.00/63.9k [00:00<?, ?B/s]

2025-05-13__artificial.parquet:   0%|          | 0.00/31.6k [00:00<?, ?B/s]

2025-05-13__localllama.parquet:   0%|          | 0.00/110k [00:00<?, ?B/s]

2025-05-13__singularity.parquet:   0%|          | 0.00/34.1k [00:00<?, ?B/s]

2025-05-13__openai.parquet:   0%|          | 0.00/77.2k [00:00<?, ?B/s]

2025-05-14.parquet:   0%|          | 0.00/252k [00:00<?, ?B/s]

2025-05-15.parquet:   0%|          | 0.00/238k [00:00<?, ?B/s]

2025-05-16.parquet:   0%|          | 0.00/215k [00:00<?, ?B/s]

2025-05-17.parquet:   0%|          | 0.00/211k [00:00<?, ?B/s]

2025-05-18.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

2025-05-19.parquet:   0%|          | 0.00/203k [00:00<?, ?B/s]

2025-05-20.parquet:   0%|          | 0.00/200k [00:00<?, ?B/s]

2025-05-21.parquet:   0%|          | 0.00/305k [00:00<?, ?B/s]

2025-05-22.parquet:   0%|          | 0.00/268k [00:00<?, ?B/s]

2025-05-23.parquet:   0%|          | 0.00/245k [00:00<?, ?B/s]

2025-05-24.parquet:   0%|          | 0.00/255k [00:00<?, ?B/s]

2025-05-25.parquet:   0%|          | 0.00/232k [00:00<?, ?B/s]

2025-05-26.parquet:   0%|          | 0.00/229k [00:00<?, ?B/s]

2025-05-14__singularity.parquet:   0%|          | 0.00/67.7k [00:00<?, ?B/s]

2025-05-14__openai.parquet:   0%|          | 0.00/77.2k [00:00<?, ?B/s]

Upload 52 LFS files:   0%|          | 0/52 [00:00<?, ?it/s]

2025-05-14__artificial.parquet:   0%|          | 0.00/44.0k [00:00<?, ?B/s]

2025-05-14__localllama.parquet:   0%|          | 0.00/86.7k [00:00<?, ?B/s]

2025-05-15__artificial.parquet:   0%|          | 0.00/27.5k [00:00<?, ?B/s]

2025-05-15__localllama.parquet:   0%|          | 0.00/91.4k [00:00<?, ?B/s]

2025-05-15__singularity.parquet:   0%|          | 0.00/87.4k [00:00<?, ?B/s]

2025-05-16__artificial.parquet:   0%|          | 0.00/28.3k [00:00<?, ?B/s]

2025-05-15__openai.parquet:   0%|          | 0.00/62.5k [00:00<?, ?B/s]

2025-05-16__localllama.parquet:   0%|          | 0.00/88.7k [00:00<?, ?B/s]

2025-05-16__singularity.parquet:   0%|          | 0.00/61.4k [00:00<?, ?B/s]

2025-05-16__openai.parquet:   0%|          | 0.00/61.3k [00:00<?, ?B/s]

2025-05-17__artificial.parquet:   0%|          | 0.00/31.0k [00:00<?, ?B/s]

2025-05-17__localllama.parquet:   0%|          | 0.00/83.0k [00:00<?, ?B/s]

2025-05-17__singularity.parquet:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

2025-05-17__openai.parquet:   0%|          | 0.00/75.9k [00:00<?, ?B/s]

2025-05-18__artificial.parquet:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

2025-05-18__localllama.parquet:   0%|          | 0.00/89.1k [00:00<?, ?B/s]

2025-05-18__singularity.parquet:   0%|          | 0.00/37.0k [00:00<?, ?B/s]

2025-05-18__openai.parquet:   0%|          | 0.00/59.9k [00:00<?, ?B/s]

2025-05-19__artificial.parquet:   0%|          | 0.00/34.9k [00:00<?, ?B/s]

2025-05-19__localllama.parquet:   0%|          | 0.00/83.4k [00:00<?, ?B/s]

2025-05-19__singularity.parquet:   0%|          | 0.00/74.1k [00:00<?, ?B/s]

2025-05-19__openai.parquet:   0%|          | 0.00/39.2k [00:00<?, ?B/s]

2025-05-20__artificial.parquet:   0%|          | 0.00/29.8k [00:00<?, ?B/s]

2025-05-20__localllama.parquet:   0%|          | 0.00/76.2k [00:00<?, ?B/s]

2025-05-20__singularity.parquet:   0%|          | 0.00/74.3k [00:00<?, ?B/s]

2025-05-20__openai.parquet:   0%|          | 0.00/44.1k [00:00<?, ?B/s]

2025-05-21__artificial.parquet:   0%|          | 0.00/30.5k [00:00<?, ?B/s]

2025-05-21__localllama.parquet:   0%|          | 0.00/103k [00:00<?, ?B/s]

2025-05-21__singularity.parquet:   0%|          | 0.00/134k [00:00<?, ?B/s]

2025-05-21__openai.parquet:   0%|          | 0.00/63.5k [00:00<?, ?B/s]

2025-05-22__artificial.parquet:   0%|          | 0.00/29.3k [00:00<?, ?B/s]

2025-05-22__localllama.parquet:   0%|          | 0.00/107k [00:00<?, ?B/s]

2025-05-22__singularity.parquet:   0%|          | 0.00/84.4k [00:00<?, ?B/s]

2025-05-22__openai.parquet:   0%|          | 0.00/72.0k [00:00<?, ?B/s]

2025-05-23__artificial.parquet:   0%|          | 0.00/44.2k [00:00<?, ?B/s]

2025-05-23__localllama.parquet:   0%|          | 0.00/97.4k [00:00<?, ?B/s]

2025-05-23__singularity.parquet:   0%|          | 0.00/80.9k [00:00<?, ?B/s]

2025-05-23__openai.parquet:   0%|          | 0.00/53.4k [00:00<?, ?B/s]

2025-05-24__artificial.parquet:   0%|          | 0.00/36.4k [00:00<?, ?B/s]

2025-05-24__localllama.parquet:   0%|          | 0.00/88.0k [00:00<?, ?B/s]

2025-05-24__singularity.parquet:   0%|          | 0.00/102k [00:00<?, ?B/s]

2025-05-24__openai.parquet:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

2025-05-25__artificial.parquet:   0%|          | 0.00/46.3k [00:00<?, ?B/s]

2025-05-25__localllama.parquet:   0%|          | 0.00/77.1k [00:00<?, ?B/s]

2025-05-25__singularity.parquet:   0%|          | 0.00/71.1k [00:00<?, ?B/s]

2025-05-25__openai.parquet:   0%|          | 0.00/63.2k [00:00<?, ?B/s]

2025-05-26__artificial.parquet:   0%|          | 0.00/29.4k [00:00<?, ?B/s]

2025-05-26__localllama.parquet:   0%|          | 0.00/103k [00:00<?, ?B/s]

2025-05-26__singularity.parquet:   0%|          | 0.00/64.8k [00:00<?, ?B/s]

2025-05-26__openai.parquet:   0%|          | 0.00/59.3k [00:00<?, ?B/s]

2025-05-27.parquet:   0%|          | 0.00/232k [00:00<?, ?B/s]

2025-05-28.parquet:   0%|          | 0.00/270k [00:00<?, ?B/s]

2025-05-29.parquet:   0%|          | 0.00/262k [00:00<?, ?B/s]

2025-05-30.parquet:   0%|          | 0.00/240k [00:00<?, ?B/s]

2025-05-31.parquet:   0%|          | 0.00/231k [00:00<?, ?B/s]

2025-06-01.parquet:   0%|          | 0.00/167k [00:00<?, ?B/s]

2025-06-02.parquet:   0%|          | 0.00/250k [00:00<?, ?B/s]

2025-06-03.parquet:   0%|          | 0.00/206k [00:00<?, ?B/s]

2025-06-04.parquet:   0%|          | 0.00/269k [00:00<?, ?B/s]

2025-05-27__artificial.parquet:   0%|          | 0.00/32.8k [00:00<?, ?B/s]

2025-05-27__singularity.parquet:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

Upload 36 LFS files:   0%|          | 0/36 [00:00<?, ?it/s]

2025-05-27__localllama.parquet:   0%|          | 0.00/86.4k [00:00<?, ?B/s]

2025-05-27__openai.parquet:   0%|          | 0.00/50.3k [00:00<?, ?B/s]

2025-05-28__artificial.parquet:   0%|          | 0.00/27.6k [00:00<?, ?B/s]

2025-05-28__localllama.parquet:   0%|          | 0.00/93.1k [00:00<?, ?B/s]

2025-05-28__singularity.parquet:   0%|          | 0.00/115k [00:00<?, ?B/s]

2025-05-28__openai.parquet:   0%|          | 0.00/62.4k [00:00<?, ?B/s]

2025-05-29__artificial.parquet:   0%|          | 0.00/26.9k [00:00<?, ?B/s]

2025-05-29__localllama.parquet:   0%|          | 0.00/123k [00:00<?, ?B/s]

2025-05-29__singularity.parquet:   0%|          | 0.00/100k [00:00<?, ?B/s]

2025-05-29__openai.parquet:   0%|          | 0.00/42.5k [00:00<?, ?B/s]

2025-05-30__artificial.parquet:   0%|          | 0.00/29.4k [00:00<?, ?B/s]

2025-05-30__localllama.parquet:   0%|          | 0.00/94.9k [00:00<?, ?B/s]

2025-05-30__singularity.parquet:   0%|          | 0.00/88.7k [00:00<?, ?B/s]

2025-05-30__openai.parquet:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

2025-05-31__artificial.parquet:   0%|          | 0.00/34.1k [00:00<?, ?B/s]

2025-05-31__localllama.parquet:   0%|          | 0.00/82.8k [00:00<?, ?B/s]

2025-05-31__singularity.parquet:   0%|          | 0.00/82.2k [00:00<?, ?B/s]

2025-05-31__openai.parquet:   0%|          | 0.00/58.9k [00:00<?, ?B/s]

2025-06-01__artificial.parquet:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

2025-06-01__localllama.parquet:   0%|          | 0.00/71.4k [00:00<?, ?B/s]

2025-06-01__singularity.parquet:   0%|          | 0.00/50.8k [00:00<?, ?B/s]

2025-06-01__openai.parquet:   0%|          | 0.00/52.1k [00:00<?, ?B/s]

2025-06-02__artificial.parquet:   0%|          | 0.00/30.2k [00:00<?, ?B/s]

2025-06-02__localllama.parquet:   0%|          | 0.00/102k [00:00<?, ?B/s]

2025-06-02__singularity.parquet:   0%|          | 0.00/81.2k [00:00<?, ?B/s]

2025-06-02__openai.parquet:   0%|          | 0.00/59.5k [00:00<?, ?B/s]

2025-06-03__artificial.parquet:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

2025-06-03__localllama.parquet:   0%|          | 0.00/76.1k [00:00<?, ?B/s]

2025-06-03__singularity.parquet:   0%|          | 0.00/57.1k [00:00<?, ?B/s]

2025-06-03__openai.parquet:   0%|          | 0.00/59.1k [00:00<?, ?B/s]

2025-06-04__artificial.parquet:   0%|          | 0.00/35.2k [00:00<?, ?B/s]

2025-06-04__localllama.parquet:   0%|          | 0.00/84.6k [00:00<?, ?B/s]

2025-06-04__singularity.parquet:   0%|          | 0.00/84.3k [00:00<?, ?B/s]

2025-06-04__openai.parquet:   0%|          | 0.00/88.2k [00:00<?, ?B/s]

✅ Done – all subreddit shards uploaded.
