{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import shutil\n",
"from multiprocessing import Pool\n",
"\n",
"import librosa\n",
"import pandas as pd\n",
"import soundfile as sf\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from tqdm.auto import tqdm\n",
"\n",
"language = 'Tamil'\n",
"lang = 'ta'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # only for Odia and Marathi\n",
"\n",
"# def clean_id(text, prefix):\n",
"# if 'text' in text:\n",
"# text = text.strip()\n",
"# return text.replace('text', f\"{prefix}_\")\n",
"# return text\n",
"\n",
"# data_dir = f\"/nlsasfs/home/ai4bharat/manidl/ttsteam/datasets/Indic TTS Data/TTS_data_Phase2_to_be_copied/{language}\" \n",
"# for speaker in ['male', 'female']:\n",
"# data_dir_speaker = f\"{data_dir}/{speaker}/mono\"\n",
"# df = pd.read_csv(f'{data_dir_speaker}/txt.done.data', sep='\"', usecols=[0,1], header=None, names=['id', 'text'])\n",
"# df['id'] = df['id'].apply(lambda text: clean_id(text, f\"train_{language.lower()}{speaker}\"))\n",
"# df.to_csv(f'{data_dir_speaker}/txt.done.data', sep='\"', header=None, index=None)\n",
"\n",
"# wav_dir = f\"{data_dir_speaker}/wav\"\n",
"# for fn in os.listdir(wav_dir):\n",
"# fn_new = clean_id(fn, f\"train_{language.lower()}{speaker}\")\n",
"# fp_src = os.path.join(wav_dir, fn)\n",
"# fp_dst = os.path.join(wav_dir, fn_new)\n",
"# os.replace(fp_src, fp_dst)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# IndicTTS in LJSpeech format"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"data_dir = f\"/home/speech/ttsteam/datasets/indictts/{lang}\" # update the path\n",
"data_dir_new = f\"/home/speech/ttsteam/datasets/indictts/{lang}\" # update the path"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#os.makedirs(data_dir_new)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/speech/ttsteam/datasets/indictts/ta/wavs'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Seed the combined wavs/ folder with the male recordings.\n",
"# dirs_exist_ok=True (as in the female copy) keeps the cell re-runnable once wavs/ exists.\n",
"shutil.copytree(f'{data_dir}/male/mono/wav', f'{data_dir_new}/wavs', dirs_exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/speech/ttsteam/datasets/indictts/ta/wavs'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shutil.copytree(f'{data_dir}/female/mono/wav', f'{data_dir_new}/wavs', dirs_exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3717, 3)\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" train_tamilmale_00001 | \n",
" அது தஞ்சாவூர்க் கோட்டைக்குள் பிரவேசிக்கவும் சக... | \n",
" male | \n",
"
\n",
" \n",
" 1 | \n",
" train_tamilmale_00002 | \n",
" அதற்குத் தகுந்தபடி ஏதாவது கொஞ்சம் பேசி வேஷம் ப... | \n",
" male | \n",
"
\n",
" \n",
" 2 | \n",
" train_tamilmale_00003 | \n",
" ஆனால் அவன் எதிர்பார்த்த சந்தர்ப்பம் ஒன்றும் கி... | \n",
" male | \n",
"
\n",
" \n",
" 3 | \n",
" train_tamilmale_00004 | \n",
" அப்படியும் பல்லக்கு கீழே வைக்கப்படவில்லை ஒரே ம... | \n",
" male | \n",
"
\n",
" \n",
" 4 | \n",
" train_tamilmale_00005 | \n",
" கோட்டைக்குள் பல்லக்குப் போய்விட்டால் அப்புறம் ... | \n",
" male | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 \\\n",
"0 train_tamilmale_00001 அது தஞ்சாவூர்க் கோட்டைக்குள் பிரவேசிக்கவும் சக... \n",
"1 train_tamilmale_00002 அதற்குத் தகுந்தபடி ஏதாவது கொஞ்சம் பேசி வேஷம் ப... \n",
"2 train_tamilmale_00003 ஆனால் அவன் எதிர்பார்த்த சந்தர்ப்பம் ஒன்றும் கி... \n",
"3 train_tamilmale_00004 அப்படியும் பல்லக்கு கீழே வைக்கப்படவில்லை ஒரே ம... \n",
"4 train_tamilmale_00005 கோட்டைக்குள் பல்லக்குப் போய்விட்டால் அப்புறம் ... \n",
"\n",
" 2 \n",
"0 male \n",
"1 male \n",
"2 male \n",
"3 male \n",
"4 male "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metadata_male_fp = f\"{data_dir}/male/mono/txt.done.data\"\n",
"# Split each line on the double quote and keep the first two fields (utterance id, transcript).\n",
"metadata_male = pd.read_csv(metadata_male_fp, sep='\"', usecols=[0,1], header=None)\n",
"# Remove the opening parenthesis as a literal: regex=False makes the result independent of the\n",
"# pandas default for `regex`, and avoids the invalid escape sequence in the old pattern.\n",
"metadata_male[0] = metadata_male[0].str.replace('(', '', regex=False).str.strip()\n",
"metadata_male[1] = metadata_male[1].str.strip()\n",
"# Third column tags every row with the speaker.\n",
"metadata_male[2] = 'male'\n",
"print(metadata_male.shape)\n",
"metadata_male.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3243, 3)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" train_tamilfemale_00001 | \n",
" அதற்குத் தகுந்தபடி, ஏதாவது கொஞ்சம் பேசி, வேஷம்... | \n",
" female | \n",
"
\n",
" \n",
" 1 | \n",
" train_tamilfemale_00002 | \n",
" ஆனால், அவன் எதிர்பார்த்த சந்தர்ப்பம் ஒன்றும், ... | \n",
" female | \n",
"
\n",
" \n",
" 2 | \n",
" train_tamilfemale_00003 | \n",
" அப்படியும், பல்லக்கு கீழே வைக்கப்படவில்லை ஒரே ... | \n",
" female | \n",
"
\n",
" \n",
" 3 | \n",
" train_tamilfemale_00004 | \n",
" கோட்டைக்குள் பல்லக்குப் போய்விட்டால், அப்புறம்... | \n",
" female | \n",
"
\n",
" \n",
" 4 | \n",
" train_tamilfemale_00005 | \n",
" எடுத்த காரியத்தை முடிக்காமல், உயிரோடு திரும்பி... | \n",
" female | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 \\\n",
"0 train_tamilfemale_00001 அதற்குத் தகுந்தபடி, ஏதாவது கொஞ்சம் பேசி, வேஷம்... \n",
"1 train_tamilfemale_00002 ஆனால், அவன் எதிர்பார்த்த சந்தர்ப்பம் ஒன்றும், ... \n",
"2 train_tamilfemale_00003 அப்படியும், பல்லக்கு கீழே வைக்கப்படவில்லை ஒரே ... \n",
"3 train_tamilfemale_00004 கோட்டைக்குள் பல்லக்குப் போய்விட்டால், அப்புறம்... \n",
"4 train_tamilfemale_00005 எடுத்த காரியத்தை முடிக்காமல், உயிரோடு திரும்பி... \n",
"\n",
" 2 \n",
"0 female \n",
"1 female \n",
"2 female \n",
"3 female \n",
"4 female "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metadata_female_fp = f\"{data_dir}/female/mono/txt.done.data\"\n",
"# Split each line on the double quote and keep the first two fields (utterance id, transcript).\n",
"metadata_female = pd.read_csv(metadata_female_fp, sep='\"', usecols=[0,1], header=None)\n",
"# Remove the opening parenthesis as a literal: regex=False makes the result independent of the\n",
"# pandas default for `regex`, and avoids the invalid escape sequence in the old pattern.\n",
"metadata_female[0] = metadata_female[0].str.replace('(', '', regex=False).str.strip()\n",
"metadata_female[1] = metadata_female[1].str.strip()\n",
"# Third column tags every row with the speaker.\n",
"metadata_female[2] = 'female'\n",
"print(metadata_female.shape)\n",
"metadata_female.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" train_tamilmale_00001 | \n",
" அது தஞ்சாவூர்க் கோட்டைக்குள் பிரவேசிக்கவும் சக... | \n",
" male | \n",
"
\n",
" \n",
" 1 | \n",
" train_tamilmale_00002 | \n",
" அதற்குத் தகுந்தபடி ஏதாவது கொஞ்சம் பேசி வேஷம் ப... | \n",
" male | \n",
"
\n",
" \n",
" 2 | \n",
" train_tamilmale_00003 | \n",
" ஆனால் அவன் எதிர்பார்த்த சந்தர்ப்பம் ஒன்றும் கி... | \n",
" male | \n",
"
\n",
" \n",
" 3 | \n",
" train_tamilmale_00004 | \n",
" அப்படியும் பல்லக்கு கீழே வைக்கப்படவில்லை ஒரே ம... | \n",
" male | \n",
"
\n",
" \n",
" 4 | \n",
" train_tamilmale_00005 | \n",
" கோட்டைக்குள் பல்லக்குப் போய்விட்டால் அப்புறம் ... | \n",
" male | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 6955 | \n",
" train_tamilfemale_03239 | \n",
" பின்பு அவர், துரியோதனனுடன் சதுரங்கம் ஆடி, அதில... | \n",
" female | \n",
"
\n",
" \n",
" 6956 | \n",
" train_tamilfemale_03240 | \n",
" பெட்டிக்குள்ளிருந்த, கல்லில்தான் ஏதோ மாயசக்தி ... | \n",
" female | \n",
"
\n",
" \n",
" 6957 | \n",
" train_tamilfemale_03241 | \n",
" அது போல, சத்துருஜித்தின் தாத்தா யுதாஜித், உஜ்ஜ... | \n",
" female | \n",
"
\n",
" \n",
" 6958 | \n",
" train_tamilfemale_03242 | \n",
" அது கேட்டு கும்பகர்ணன், நீங்கள் சீதையைக் கவர்ந... | \n",
" female | \n",
"
\n",
" \n",
" 6959 | \n",
" train_tamilfemale_03243 | \n",
" அப்படி இல்லாமல், முன்பின் சற்றும் யோசியாது, அவ... | \n",
" female | \n",
"
\n",
" \n",
"
\n",
"
6960 rows × 3 columns
\n",
"
"
],
"text/plain": [
" 0 \\\n",
"0 train_tamilmale_00001 \n",
"1 train_tamilmale_00002 \n",
"2 train_tamilmale_00003 \n",
"3 train_tamilmale_00004 \n",
"4 train_tamilmale_00005 \n",
"... ... \n",
"6955 train_tamilfemale_03239 \n",
"6956 train_tamilfemale_03240 \n",
"6957 train_tamilfemale_03241 \n",
"6958 train_tamilfemale_03242 \n",
"6959 train_tamilfemale_03243 \n",
"\n",
" 1 2 \n",
"0 அது தஞ்சாவூர்க் கோட்டைக்குள் பிரவேசிக்கவும் சக... male \n",
"1 அதற்குத் தகுந்தபடி ஏதாவது கொஞ்சம் பேசி வேஷம் ப... male \n",
"2 ஆனால் அவன் எதிர்பார்த்த சந்தர்ப்பம் ஒன்றும் கி... male \n",
"3 அப்படியும் பல்லக்கு கீழே வைக்கப்படவில்லை ஒரே ம... male \n",
"4 கோட்டைக்குள் பல்லக்குப் போய்விட்டால் அப்புறம் ... male \n",
"... ... ... \n",
"6955 பின்பு அவர், துரியோதனனுடன் சதுரங்கம் ஆடி, அதில... female \n",
"6956 பெட்டிக்குள்ளிருந்த, கல்லில்தான் ஏதோ மாயசக்தி ... female \n",
"6957 அது போல, சத்துருஜித்தின் தாத்தா யுதாஜித், உஜ்ஜ... female \n",
"6958 அது கேட்டு கும்பகர்ணன், நீங்கள் சீதையைக் கவர்ந... female \n",
"6959 அப்படி இல்லாமல், முன்பின் சற்றும் யோசியாது, அவ... female \n",
"\n",
"[6960 rows x 3 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Stack the male rows followed by the female rows and renumber the index from 0.\n",
"metadata = pd.concat([metadata_male, metadata_female], ignore_index=True)\n",
"metadata"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"metadata.to_csv(f'{data_dir_new}/metadata.csv', sep='|', index=False, header=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Resampling"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# exist_ok=True so re-running the notebook does not fail when the output folder is already there.\n",
"os.makedirs(f'{data_dir_new}/wavs-20k', exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def resample_file(func_args):\n",
"    \"\"\"Load one audio file at a requested sample rate and write it to a new path.\n",
"\n",
"    Args:\n",
"        func_args: ``(fp_src, fp_dst, output_sr)`` packed into one argument, so the\n",
"            function can be mapped over a list of such tuples.\n",
"    \"\"\"\n",
"    src_path, dst_path, target_sr = func_args\n",
"    # librosa.load is asked for target_sr and returns the samples with their rate.\n",
"    samples, loaded_sr = librosa.load(src_path, sr=target_sr)\n",
"    sf.write(dst_path, samples, loaded_sr)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "95f0f148bf35446ba605cbcca45e9763",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/6960 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bde91a0151c14a498e25a7f0490184d2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/6960 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# List the folder once (sorted) so each source path is paired with the destination of the same\n",
"# file name; two separate os.listdir calls are not guaranteed to return the same order.\n",
"wav_fns = sorted(os.listdir(f'{data_dir_new}/wavs'))\n",
"fps_src = [f'{data_dir_new}/wavs/{fn}' for fn in wav_fns]\n",
"fps_dst = [f'{data_dir_new}/wavs-20k/{fn}' for fn in wav_fns]\n",
"# Target rate is 22050 Hz even though the output folder is named wavs-20k.\n",
"srs = [22050] * len(fps_src)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# audio_files = list(zip(fps_src, fps_dst, srs))\n",
"# with Pool(processes=16) as p:\n",
"# with tqdm(total=len(fps_src)) as pbar:\n",
"# for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)):\n",
"# pbar.update()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "be32de033d9141fc813cdb60fa746479",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/6960 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Serial counterpart of the commented-out Pool cell: process every (src, dst, sr) job in turn.\n",
"audio_files = list(zip(fps_src, fps_dst, srs))\n",
"for job in tqdm(audio_files):\n",
"    resample_file(job)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Format data for eval"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"data_dir = f\"/home/speech/ttsteam/datasets/indictts/{lang}\""
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(6960, 3)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" text | \n",
" speaker | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" train_tamilmale_00001 | \n",
" அது தஞ்சாவூர்க் கோட்டைக்குள் பிரவேசிக்கவும் சக... | \n",
" male | \n",
"
\n",
" \n",
" 1 | \n",
" train_tamilmale_00002 | \n",
" அதற்குத் தகுந்தபடி ஏதாவது கொஞ்சம் பேசி வேஷம் ப... | \n",
" male | \n",
"
\n",
" \n",
" 2 | \n",
" train_tamilmale_00003 | \n",
" ஆனால் அவன் எதிர்பார்த்த சந்தர்ப்பம் ஒன்றும் கி... | \n",
" male | \n",
"
\n",
" \n",
" 3 | \n",
" train_tamilmale_00004 | \n",
" அப்படியும் பல்லக்கு கீழே வைக்கப்படவில்லை ஒரே ம... | \n",
" male | \n",
"
\n",
" \n",
" 4 | \n",
" train_tamilmale_00005 | \n",
" கோட்டைக்குள் பல்லக்குப் போய்விட்டால் அப்புறம் ... | \n",
" male | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id text \\\n",
"0 train_tamilmale_00001 அது தஞ்சாவூர்க் கோட்டைக்குள் பிரவேசிக்கவும் சக... \n",
"1 train_tamilmale_00002 அதற்குத் தகுந்தபடி ஏதாவது கொஞ்சம் பேசி வேஷம் ப... \n",
"2 train_tamilmale_00003 ஆனால் அவன் எதிர்பார்த்த சந்தர்ப்பம் ஒன்றும் கி... \n",
"3 train_tamilmale_00004 அப்படியும் பல்லக்கு கீழே வைக்கப்படவில்லை ஒரே ம... \n",
"4 train_tamilmale_00005 கோட்டைக்குள் பல்லக்குப் போய்விட்டால் அப்புறம் ... \n",
"\n",
" speaker \n",
"0 male \n",
"1 male \n",
"2 male \n",
"3 male \n",
"4 male "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the combined pipe-separated metadata and name its three columns.\n",
"df = pd.read_csv(\n",
"    f'{data_dir}/metadata.csv',\n",
"    sep='|',\n",
"    names=['id', 'text', 'speaker'],\n",
")\n",
"print(df.shape)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6890 70\n"
]
}
],
"source": [
"# Hold out 1% of the rows as a test set, stratified on the speaker column, with a fixed seed.\n",
"df_train, df_test = train_test_split(\n",
"    df,\n",
"    test_size=0.01,\n",
"    stratify=df['speaker'],\n",
"    random_state=0,\n",
")\n",
"print(len(df_train), len(df_test))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Write metadata_train.csv and metadata_test.csv in the same pipe-separated, header-less layout.\n",
"for split_name, split_df in (('train', df_train), ('test', df_test)):\n",
"    split_df.to_csv(f'{data_dir}/metadata_{split_name}.csv', sep='|', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# One file per (split, speaker): metadata_<split>_<speaker>.csv, written male first, then female.\n",
"for speaker in ('male', 'female'):\n",
"    for split_name, split_df in (('train', df_train), ('test', df_test)):\n",
"        speaker_rows = split_df[split_df['speaker'] == speaker]\n",
"        speaker_rows.to_csv(\n",
"            f'{data_dir}/metadata_{split_name}_{speaker}.csv', sep='|', index=False, header=False\n",
"        )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# exist_ok=True keeps the cell re-runnable once the per-speaker test folders exist.\n",
"os.makedirs(f'{data_dir}/wavs-20k-test-male/', exist_ok=True)\n",
"os.makedirs(f'{data_dir}/wavs-20k-test-female/', exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "090f5c50a0a44b1c95b0a1a02a719e63",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/70 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Copy each held-out utterance's resampled wav into the folder for its speaker.\n",
"for rec in tqdm(df_test.itertuples(index=False), total=len(df_test)):\n",
"    wav_name = f'{rec.id}.wav'\n",
"    shutil.copyfile(\n",
"        f'{data_dir}/wavs-20k/{wav_name}',\n",
"        f'{data_dir}/wavs-20k-test-{rec.speaker}/{wav_name}',\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "tts-env1",
"language": "python",
"name": "tts-env1"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "fb04034de289f34c48b68cd19e605c0e979a38e1c3cfa37ec316349b950a8e21"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}