Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,485 Bytes
dd9600d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# @ hwang258@jh.edu
import os
import argparse
from tqdm import tqdm
import json
import glob
def parse_args():
    """Parse the command-line options of the manifest-creation script.

    Returns:
        argparse.Namespace: namespace with a single attribute, ``save_dir``
            (str), the root directory given on the command line.

    Raises:
        SystemExit: raised by argparse when ``--save_dir`` is missing or the
            command line is otherwise invalid.
    """
    parser = argparse.ArgumentParser(description="Create manifests for gigaspeech")
    # The script feeds save_dir straight into os.path.join(); leaving it unset
    # used to surface as an opaque TypeError from os.path.join(None, ...).
    # Marking it required produces a clear usage error instead.
    parser.add_argument('--save_dir', type=str, default=None, required=True, help="path to the manifest, phonemes, and encodec codes dirs")
    return parser.parse_args()
if __name__ == "__main__":
    # For every <save_dir>/jsons/*.json file, write a manifest
    # <save_dir>/manifest/<json stem>.txt with one "<segment_id>\t<tag>" line
    # per entry whose g2p text file AND t5 npz file both exist on disk.
    args = parse_args()
    phn_save_root = os.path.join(args.save_dir, "g2p")
    t5_save_root = os.path.join(args.save_dir, "t5")
    manifest_root = os.path.join(args.save_dir, "manifest")
    os.makedirs(manifest_root, exist_ok=True)

    # Second manifest column; this script writes the same value for every row.
    tag = "none"

    json_paths = glob.glob(os.path.join(args.save_dir, 'jsons', '*.json'))
    for json_path in json_paths:
        with open(json_path, 'r', encoding='utf-8') as json_file:
            # Expected to be a list of dicts, each carrying a 'segment_id' key.
            jsondata = json.load(json_file)

        savelines = []
        for entry in tqdm(jsondata):
            segment_id = entry['segment_id']
            has_phonemes = os.path.exists(os.path.join(phn_save_root, segment_id + ".txt"))
            # `and` short-circuits, so the second lookup is skipped when the
            # phoneme file is already missing.
            if has_phonemes and os.path.exists(os.path.join(t5_save_root, segment_id + ".npz")):
                savelines.append(segment_id + '\t' + tag + '\n')

        # Derive the manifest name with os.path helpers: splitting on '/' is
        # not portable, and str.replace('.json', '') would also strip a
        # '.json' that appears in the middle of the file name.
        manifest_name = os.path.splitext(os.path.basename(json_path))[0] + '.txt'
        with open(os.path.join(manifest_root, manifest_name), "w", encoding='utf-8') as f:
            # Single join instead of repeated += string concatenation.
            f.write(''.join(savelines))