import os

import pandas as pd
from datasets import DownloadConfig, load_dataset

from helpers.utils import extract_audio_identifier

# Identifier of the dataset passed to `load_dataset`.
DATA_FILE = "sawadogosalif/MooreFRCollections_BibleOnlyText"

# Load the "train" split and materialize it as a pandas DataFrame.
# The access token is read from the HF_TOKEN environment variable; indexing
# os.environ directly means a missing variable raises KeyError right here
# instead of failing later inside the download.
data = load_dataset(
    DATA_FILE,
    split="train",
    download_config=DownloadConfig(token=os.environ["HF_TOKEN"]),
).to_pandas()

# Derive the "chapter" and "page" columns from each row's source URL.
# Wrapping the helper's result in a Series makes `apply` expand it into
# columns, which are then assigned positionally to the two new names.
# NOTE(review): presumably extract_audio_identifier returns exactly two
# values (chapter, page) per URL -- confirm against helpers.utils.
data[["chapter", "page"]] = data["moore_source_url"].apply(
    lambda url: pd.Series(extract_audio_identifier(url))
)

# NOTE(review): not used in the visible part of the file; presumably the
# storage bucket targeted by later code -- confirm.
BUCKET_NAME = "moore-collection"