Spaces:

hannatoenbreker
/

whisper-dutch-small-gradio

Runtime error

File size: 31,227 Bytes

177499f

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset common_voice_11_0/nl to /Users/hannatoenbreker/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/nl/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9dfef4ac94534c36ad01f3ba3ac8f869",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b5b797b7d8a34411b7a38bdc35a1a4ad",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6c07d35f372a49ed84c090168e4092b1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "949b41580ea24e689fb90f15c4d0f98f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fe273167dd27440e976da47f2e4bea8e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reading metadata...: 30318it [00:00, 49815.73it/s]\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[2], line 6\u001b[0m\n\u001b[1;32m      2\u001b[0m token \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mhf_tlYASKaBhUkcufofQwneYwTzcyjYLkAkuN\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m      4\u001b[0m common_voice \u001b[39m=\u001b[39m DatasetDict()\n\u001b[0;32m----> 6\u001b[0m common_voice[\u001b[39m\"\u001b[39m\u001b[39mtrain\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m load_dataset(\u001b[39m\"\u001b[39;49m\u001b[39mmozilla-foundation/common_voice_11_0\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mnl\u001b[39;49m\u001b[39m\"\u001b[39;49m, split\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mtrain+validation\u001b[39;49m\u001b[39m\"\u001b[39;49m, use_auth_token\u001b[39m=\u001b[39;49mtoken)\n\u001b[1;32m      7\u001b[0m common_voice[\u001b[39m\"\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m load_dataset(\u001b[39m\"\u001b[39m\u001b[39mmozilla-foundation/common_voice_11_0\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mnl\u001b[39m\u001b[39m\"\u001b[39m, split\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m\"\u001b[39m, use_auth_token\u001b[39m=\u001b[39mtoken)\n\u001b[1;32m      9\u001b[0m \u001b[39mprint\u001b[39m(common_voice)\n",
      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/datasets/load.py:1782\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)\u001b[0m\n\u001b[1;32m   1779\u001b[0m try_from_hf_gcs \u001b[39m=\u001b[39m path \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m _PACKAGED_DATASETS_MODULES\n\u001b[1;32m   1781\u001b[0m \u001b[39m# Download and prepare data\u001b[39;00m\n\u001b[0;32m-> 1782\u001b[0m builder_instance\u001b[39m.\u001b[39;49mdownload_and_prepare(\n\u001b[1;32m   1783\u001b[0m     download_config\u001b[39m=\u001b[39;49mdownload_config,\n\u001b[1;32m   1784\u001b[0m     download_mode\u001b[39m=\u001b[39;49mdownload_mode,\n\u001b[1;32m   1785\u001b[0m     verification_mode\u001b[39m=\u001b[39;49mverification_mode,\n\u001b[1;32m   1786\u001b[0m     try_from_hf_gcs\u001b[39m=\u001b[39;49mtry_from_hf_gcs,\n\u001b[1;32m   1787\u001b[0m     num_proc\u001b[39m=\u001b[39;49mnum_proc,\n\u001b[1;32m   1788\u001b[0m )\n\u001b[1;32m   1790\u001b[0m \u001b[39m# Build dataset for splits\u001b[39;00m\n\u001b[1;32m   1791\u001b[0m keep_in_memory \u001b[39m=\u001b[39m (\n\u001b[1;32m   1792\u001b[0m     keep_in_memory \u001b[39mif\u001b[39;00m keep_in_memory \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m is_small_dataset(builder_instance\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size)\n\u001b[1;32m   1793\u001b[0m )\n",
      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/datasets/builder.py:872\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[0;34m(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[1;32m    870\u001b[0m     \u001b[39mif\u001b[39;00m num_proc \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m    871\u001b[0m         prepare_split_kwargs[\u001b[39m\"\u001b[39m\u001b[39mnum_proc\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m num_proc\n\u001b[0;32m--> 872\u001b[0m     \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_download_and_prepare(\n\u001b[1;32m    873\u001b[0m         dl_manager\u001b[39m=\u001b[39;49mdl_manager,\n\u001b[1;32m    874\u001b[0m         verification_mode\u001b[39m=\u001b[39;49mverification_mode,\n\u001b[1;32m    875\u001b[0m         \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mprepare_split_kwargs,\n\u001b[1;32m    876\u001b[0m         \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mdownload_and_prepare_kwargs,\n\u001b[1;32m    877\u001b[0m     )\n\u001b[1;32m    878\u001b[0m \u001b[39m# Sync info\u001b[39;00m\n\u001b[1;32m    879\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size \u001b[39m=\u001b[39m \u001b[39msum\u001b[39m(split\u001b[39m.\u001b[39mnum_bytes \u001b[39mfor\u001b[39;00m split \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39msplits\u001b[39m.\u001b[39mvalues())\n",
      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/datasets/builder.py:1649\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._download_and_prepare\u001b[0;34m(self, dl_manager, verification_mode, **prepare_splits_kwargs)\u001b[0m\n\u001b[1;32m   1648\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_download_and_prepare\u001b[39m(\u001b[39mself\u001b[39m, dl_manager, verification_mode, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_splits_kwargs):\n\u001b[0;32m-> 1649\u001b[0m     \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49m_download_and_prepare(\n\u001b[1;32m   1650\u001b[0m         dl_manager,\n\u001b[1;32m   1651\u001b[0m         verification_mode,\n\u001b[1;32m   1652\u001b[0m         check_duplicate_keys\u001b[39m=\u001b[39;49mverification_mode \u001b[39m==\u001b[39;49m VerificationMode\u001b[39m.\u001b[39;49mBASIC_CHECKS\n\u001b[1;32m   1653\u001b[0m         \u001b[39mor\u001b[39;49;00m verification_mode \u001b[39m==\u001b[39;49m VerificationMode\u001b[39m.\u001b[39;49mALL_CHECKS,\n\u001b[1;32m   1654\u001b[0m         \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mprepare_splits_kwargs,\n\u001b[1;32m   1655\u001b[0m     )\n",
      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/datasets/builder.py:967\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[0;34m(self, dl_manager, verification_mode, **prepare_split_kwargs)\u001b[0m\n\u001b[1;32m    963\u001b[0m split_dict\u001b[39m.\u001b[39madd(split_generator\u001b[39m.\u001b[39msplit_info)\n\u001b[1;32m    965\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m    966\u001b[0m     \u001b[39m# Prepare split will record examples associated to the split\u001b[39;00m\n\u001b[0;32m--> 967\u001b[0m     \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_prepare_split(split_generator, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mprepare_split_kwargs)\n\u001b[1;32m    968\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mOSError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m    969\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mOSError\u001b[39;00m(\n\u001b[1;32m    970\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mCannot find data file. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    971\u001b[0m         \u001b[39m+\u001b[39m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmanual_download_instructions \u001b[39mor\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m    972\u001b[0m         \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mOriginal error:\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m    973\u001b[0m         \u001b[39m+\u001b[39m \u001b[39mstr\u001b[39m(e)\n\u001b[1;32m    974\u001b[0m     ) \u001b[39mfrom\u001b[39;00m \u001b[39mNone\u001b[39m\n",
      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/datasets/builder.py:1488\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._prepare_split\u001b[0;34m(self, split_generator, check_duplicate_keys, file_format, num_proc, max_shard_size)\u001b[0m\n\u001b[1;32m   1486\u001b[0m gen_kwargs \u001b[39m=\u001b[39m split_generator\u001b[39m.\u001b[39mgen_kwargs\n\u001b[1;32m   1487\u001b[0m job_id \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mfor\u001b[39;00m job_id, done, content \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_prepare_split_single(\n\u001b[1;32m   1489\u001b[0m     gen_kwargs\u001b[39m=\u001b[39mgen_kwargs, job_id\u001b[39m=\u001b[39mjob_id, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39m_prepare_split_args\n\u001b[1;32m   1490\u001b[0m ):\n\u001b[1;32m   1491\u001b[0m     \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m   1492\u001b[0m         result \u001b[39m=\u001b[39m content\n",
      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/datasets/builder.py:1625\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._prepare_split_single\u001b[0;34m(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)\u001b[0m\n\u001b[1;32m   1615\u001b[0m     shard_id \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[1;32m   1616\u001b[0m     writer \u001b[39m=\u001b[39m writer_class(\n\u001b[1;32m   1617\u001b[0m         features\u001b[39m=\u001b[39mwriter\u001b[39m.\u001b[39m_features,\n\u001b[1;32m   1618\u001b[0m         path\u001b[39m=\u001b[39mfpath\u001b[39m.\u001b[39mreplace(\u001b[39m\"\u001b[39m\u001b[39mSSSSS\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mshard_id\u001b[39m:\u001b[39;00m\u001b[39m05d\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\u001b[39m.\u001b[39mreplace(\u001b[39m\"\u001b[39m\u001b[39mJJJJJ\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mjob_id\u001b[39m:\u001b[39;00m\u001b[39m05d\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m),\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1623\u001b[0m         embed_local_files\u001b[39m=\u001b[39membed_local_files,\n\u001b[1;32m   1624\u001b[0m     )\n\u001b[0;32m-> 1625\u001b[0m example \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49minfo\u001b[39m.\u001b[39;49mfeatures\u001b[39m.\u001b[39;49mencode_example(record) \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mfeatures \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m record\n\u001b[1;32m   1626\u001b[0m writer\u001b[39m.\u001b[39mwrite(example, key)\n\u001b[1;32m   1627\u001b[0m num_examples_progress_update \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n",
      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/datasets/features/features.py:1808\u001b[0m, in \u001b[0;36mFeatures.encode_example\u001b[0;34m(self, example)\u001b[0m\n\u001b[1;32m   1797\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mencode_example\u001b[39m(\u001b[39mself\u001b[39m, example):\n\u001b[1;32m   1798\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m   1799\u001b[0m \u001b[39m    Encode example into a format for Arrow.\u001b[39;00m\n\u001b[1;32m   1800\u001b[0m \n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1806\u001b[0m \u001b[39m        `dict[str, Any]`\u001b[39;00m\n\u001b[1;32m   1807\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1808\u001b[0m     example \u001b[39m=\u001b[39m cast_to_python_objects(example)\n\u001b[1;32m   1809\u001b[0m     \u001b[39mreturn\u001b[39;00m encode_nested_example(\u001b[39mself\u001b[39m, example)\n",
      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/datasets/features/features.py:439\u001b[0m, in \u001b[0;36mcast_to_python_objects\u001b[0;34m(obj, only_1d_for_numpy, optimize_list_casting)\u001b[0m\n\u001b[1;32m    419\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mcast_to_python_objects\u001b[39m(obj: Any, only_1d_for_numpy\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, optimize_list_casting\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Any:\n\u001b[1;32m    420\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m    421\u001b[0m \u001b[39m    Cast numpy/pytorch/tensorflow/pandas objects to python lists.\u001b[39;00m\n\u001b[1;32m    422\u001b[0m \u001b[39m    It works recursively.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    437\u001b[0m \u001b[39m        casted_obj: the casted object\u001b[39;00m\n\u001b[1;32m    438\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 439\u001b[0m     \u001b[39mreturn\u001b[39;00m _cast_to_python_objects(\n\u001b[1;32m    440\u001b[0m         obj, only_1d_for_numpy\u001b[39m=\u001b[39;49monly_1d_for_numpy, optimize_list_casting\u001b[39m=\u001b[39;49moptimize_list_casting\n\u001b[1;32m    441\u001b[0m     )[\u001b[39m0\u001b[39m]\n",
      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/datasets/features/features.py:384\u001b[0m, in \u001b[0;36m_cast_to_python_objects\u001b[0;34m(obj, only_1d_for_numpy, optimize_list_casting)\u001b[0m\n\u001b[1;32m    382\u001b[0m output \u001b[39m=\u001b[39m {}\n\u001b[1;32m    383\u001b[0m \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m obj\u001b[39m.\u001b[39mitems():\n\u001b[0;32m--> 384\u001b[0m     casted_v, has_changed_v \u001b[39m=\u001b[39m _cast_to_python_objects(\n\u001b[1;32m    385\u001b[0m         v, only_1d_for_numpy\u001b[39m=\u001b[39;49monly_1d_for_numpy, optimize_list_casting\u001b[39m=\u001b[39;49moptimize_list_casting\n\u001b[1;32m    386\u001b[0m     )\n\u001b[1;32m    387\u001b[0m     has_changed \u001b[39m|\u001b[39m\u001b[39m=\u001b[39m has_changed_v\n\u001b[1;32m    388\u001b[0m     output[k] \u001b[39m=\u001b[39m casted_v\n",
      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/datasets/features/features.py:327\u001b[0m, in \u001b[0;36m_cast_to_python_objects\u001b[0;34m(obj, only_1d_for_numpy, optimize_list_casting)\u001b[0m\n\u001b[1;32m    317\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[1;32m    318\u001b[0m         \u001b[39mreturn\u001b[39;00m (\n\u001b[1;32m    319\u001b[0m             [\n\u001b[1;32m    320\u001b[0m                 _cast_to_python_objects(\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    325\u001b[0m             \u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m    326\u001b[0m         )\n\u001b[0;32m--> 327\u001b[0m \u001b[39melif\u001b[39;00m config\u001b[39m.\u001b[39mTF_AVAILABLE \u001b[39mand\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mtensorflow\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m sys\u001b[39m.\u001b[39mmodules \u001b[39mand\u001b[39;00m \u001b[39misinstance\u001b[39m(obj, tf\u001b[39m.\u001b[39mTensor):\n\u001b[1;32m    328\u001b[0m     \u001b[39mif\u001b[39;00m obj\u001b[39m.\u001b[39mndim \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m    329\u001b[0m         \u001b[39mreturn\u001b[39;00m obj\u001b[39m.\u001b[39mnumpy()[()], \u001b[39mTrue\u001b[39;00m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset, DatasetDict\n",
    "\n",
    "common_voice = DatasetDict()\n",
    "\n",
    "common_voice[\"train\"] = load_dataset(\"mozilla-foundation/common_voice_11_0\", \"nl\", split=\"train+validation\", use_auth_token=True)\n",
    "common_voice[\"test\"] = load_dataset(\"mozilla-foundation/common_voice_11_0\", \"nl\", split=\"test\", use_auth_token=True)\n",
    "\n",
    "print(common_voice)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token is valid.\n",
      "Your token has been saved in your configured git credential helpers (osxkeychain).\n",
      "Your token has been saved to /Users/hannatoenbreker/.cache/huggingface/token\n",
      "Login successful\n"
     ]
    }
   ],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "common_voice = common_voice.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"path\", \"segment\", \"up_votes\"])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare feature extractor and tokenizer \n",
    "### The ASR pipeline can be de-composed into three components:\n",
    "\n",
    "- A feature extractor which pre-processes the raw audio-inputs\n",
    "- The model which performs the sequence-to-sequence mapping\n",
    "- A tokenizer which post-processes the model outputs to text format\n",
    "\n",
    "In 🤗 Transformers, the Whisper model has an associated feature extractor and tokenizer, called WhisperFeatureExtractor and WhisperTokenizer respectively."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import WhisperFeatureExtractor\n",
    "\n",
    "feature_extractor = WhisperFeatureExtractor.from_pretrained(\"openai/whisper-small\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import WhisperTokenizer\n",
    "\n",
    "tokenizer = WhisperTokenizer.from_pretrained(\"openai/whisper-small\", language=\"Dutch\", task=\"transcribe\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_str = common_voice[\"train\"][0][\"sentence\"]\n",
    "labels = tokenizer(input_str).input_ids\n",
    "decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)\n",
    "decoded_str = tokenizer.decode(labels, skip_special_tokens=True)\n",
    "\n",
    "print(f\"Input:                 {input_str}\")\n",
    "print(f\"Decoded w/ special:    {decoded_with_special}\")\n",
    "print(f\"Decoded w/out special: {decoded_str}\")\n",
    "print(f\"Are equal:             {input_str == decoded_str}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import WhisperProcessor\n",
    "\n",
    "processor = WhisperProcessor.from_pretrained(\"openai/whisper-small\", language=\"Dutch\", task=\"transcribe\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare our data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(common_voice[\"train\"][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Audio\n",
    "\n",
    "common_voice = common_voice.cast_column(\"audio\", Audio(sampling_rate=16000))\n",
    "print(common_voice[\"train\"][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_dataset(batch):\n",
    "    # load and resample audio data from 48 to 16kHz\n",
    "    audio = batch[\"audio\"]\n",
    "\n",
    "    # compute log-Mel input features from input audio array \n",
    "    batch[\"input_features\"] = feature_extractor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_features[0]\n",
    "\n",
    "    # encode target text to label ids \n",
    "    batch[\"labels\"] = tokenizer(batch[\"sentence\"]).input_ids\n",
    "    return batch\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names[\"train\"], num_proc=4)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define a Data Collator\n",
    "### We can leverage the WhisperProcessor we defined earlier to perform both the feature extractor and the tokenizer operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "from dataclasses import dataclass\n",
    "from typing import Any, Dict, List, Union\n",
    "\n",
    "@dataclass\n",
    "class DataCollatorSpeechSeq2SeqWithPadding:\n",
    "    processor: Any\n",
    "\n",
    "    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
    "        # split inputs and labels since they have to be of different lengths and need different padding methods\n",
    "        # first treat the audio inputs by simply returning torch tensors\n",
    "        input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n",
    "        batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n",
    "\n",
    "        # get the tokenized label sequences\n",
    "        label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
    "        # pad the labels to max length\n",
    "        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n",
    "\n",
    "        # replace padding with -100 to ignore loss correctly\n",
    "        labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
    "\n",
    "        # if bos token is appended in previous tokenization step,\n",
    "        # cut bos token here as it's append later anyways\n",
    "        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n",
    "            labels = labels[:, 1:]\n",
    "\n",
    "        batch[\"labels\"] = labels\n",
    "\n",
    "        return batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate our results with WER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import evaluate\n",
    "\n",
    "metric = evaluate.load(\"wer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_metrics(pred):\n",
    "    pred_ids = pred.predictions\n",
    "    label_ids = pred.label_ids\n",
    "\n",
    "    # replace -100 with the pad_token_id\n",
    "    label_ids[label_ids == -100] = tokenizer.pad_token_id\n",
    "\n",
    "    # we do not want to group tokens when computing the metrics\n",
    "    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n",
    "    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)\n",
    "\n",
    "    wer = 100 * metric.compute(predictions=pred_str, references=label_str)\n",
    "\n",
    "    return {\"wer\": wer}"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initializing our model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import WhisperForConditionalGeneration\n",
    "\n",
    "model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-small\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.config.forced_decoder_ids = None\n",
    "model.config.suppress_tokens = []"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training our model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Seq2SeqTrainingArguments\n",
    "\n",
    "training_args = Seq2SeqTrainingArguments(\n",
    "    output_dir=\"./whisper-dutch\",  # change to a repo name of your choice\n",
    "    per_device_train_batch_size=8,\n",
    "    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size\n",
    "    learning_rate=1e-5,\n",
    "    warmup_steps=500,\n",
    "    max_steps=4000,\n",
    "    gradient_checkpointing=True,\n",
    "    fp16=False,\n",
    "    evaluation_strategy=\"steps\",\n",
    "    per_device_eval_batch_size=8,\n",
    "    predict_with_generate=True,\n",
    "    generation_max_length=225,\n",
    "    save_steps=1000,\n",
    "    eval_steps=1000,\n",
    "    logging_steps=25,\n",
    "    report_to=[\"tensorboard\"],\n",
    "    load_best_model_at_end=True,\n",
    "    metric_for_best_model=\"wer\",\n",
    "    greater_is_better=False,\n",
    "    push_to_hub=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Seq2SeqTrainer\n",
    "\n",
    "trainer = Seq2SeqTrainer(\n",
    "    args=training_args,\n",
    "    model=model,\n",
    "    train_dataset=common_voice[\"train\"],\n",
    "    eval_dataset=common_voice[\"test\"],\n",
    "    data_collator=data_collator,\n",
    "    compute_metrics=compute_metrics,\n",
    "    tokenizer=processor.feature_extractor,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Share our results\n",
    "### If we would like to share our training results on the hub\n",
    "#### Keep in mind that we have to change the argument \"push to hub\" two blocks above this code to true"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kwargs = {\n",
    "    \"dataset_tags\": \"mozilla-foundation/common_voice_11_0\",\n",
    "    \"dataset\": \"Common Voice 11.0\",  # a 'pretty' name for the training dataset\n",
    "    \"dataset_args\": \"config: nl, split: test\",\n",
    "    \"language\": \"nl\",\n",
    "    \"model_name\": \"Whisper Dutch - RTL\",  # a 'pretty' name for your model\n",
    "    \"finetuned_from\": \"openai/whisper-small\",\n",
    "    \"tasks\": \"automatic-speech-recognition\",\n",
    "    \"tags\": \"hf-asr-leaderboard\",\n",
    "}\n",
    "\n",
    "trainer.push_to_hub(**kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import WhisperForConditionalGeneration, WhisperProcessor\n",
    "\n",
    "model = WhisperForConditionalGeneration.from_pretrained(\"hannatoenbreker/whisper-dutch\")\n",
    "processor = WhisperProcessor.from_pretrained(\"hannatoenbreker/whisper-dutch\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "import gradio as gr\n",
    "\n",
    "pipe = pipeline(model=\"hanna/whisper-dutch\")  # change to \"your-username/the-name-you-picked\"\n",
    "\n",
    "def transcribe(audio):\n",
    "    text = pipe(audio)[\"text\"]\n",
    "    return text\n",
    "\n",
    "iface = gr.Interface(\n",
    "    fn=transcribe, \n",
    "    inputs=gr.Audio(source=\"microphone\", type=\"filepath\"), \n",
    "    outputs=\"text\",\n",
    "    title=\"Whisper Small Dutch\",\n",
    "    description=\"Realtime demo for Dutch speech recognition using a fine-tuned Whisper small model.\",\n",
    ")\n",
    "\n",
    "iface.launch()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}