File size: 4,701 Bytes
4b86b2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/bash
# Prepare test data + lhotse manifests for LibriSpeech/LibriMix, AMI and
# NOTSOFAR1. Requires: lhotse, chime-utils, wget, unzip, git, pip.
# Reads SRC_ROOT from configs/local_paths.sh.
set -euo pipefail  # abort on command failure, unset variables, pipeline errors

# Resolve paths relative to this script so it works from any CWD.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../configs/local_paths.sh"
# ${PYTHONPATH:-} keeps set -u happy when PYTHONPATH is not already set.
export PYTHONPATH="$SCRIPT_DIR/../:${PYTHONPATH:-}"

DATA_DIR="$SRC_ROOT/data"
DATA_SCRIPTS_PATH="$SRC_ROOT/scripts/data"
MANIFESTS_DIR="$DATA_DIR/manifests_new"

mkdir -p "$DATA_DIR" "$MANIFESTS_DIR" "$DATA_DIR/tmp"


# LS
librispeech_dir=$DATA_DIR/librispeech/LibriSpeech
lhotse download librispeech $DATA_DIR/librispeech
lhotse prepare librispeech $librispeech_dir $MANIFESTS_DIR


git clone https://github.com/JorisCos/LibriMix $DATA_DIR/tmp/LibriMix
pip install -r $DATA_DIR/tmp/LibriMix/requirements.txt

# Download and extract the WHAM! noise corpus (skipped once extracted).
wham_zip_file="$DATA_DIR/tmp/wham/wham_noise.zip"
wham_folder="$DATA_DIR/tmp/wham/wham_noise"
if [ ! -d "$wham_folder" ]; then
    mkdir -p "$DATA_DIR/tmp/wham"

    # -c resumes a partial download; --tries=0 retries indefinitely on
    # flaky connections.
    if [ ! -f "$wham_zip_file" ]; then
        wget -c --tries=0 --read-timeout=20 \
            https://my-bucket-a8b4b49c25c811ee9a7e8bba05fa24c7.s3.amazonaws.com/wham_noise.zip \
            -P "$DATA_DIR/tmp/wham"
    fi

    # -q quiet, -n never overwrite already-extracted files.
    unzip -qn "$wham_zip_file" -d "$DATA_DIR/tmp/wham"
    # Plain file: -f suffices (-r was superfluous); -- guards odd names.
    rm -f -- "$wham_zip_file"
fi


python $DATA_DIR/tmp/LibriMix/scripts/augment_train_noise.py --wham_dir $DATA_DIR/tmp/wham/wham_noise

for n_src in 2 3; do
  metadata_dir=$DATA_DIR/tmp/LibriMix/metadata/Libri$n_src"Mix"
  python $DATA_DIR/tmp/LibriMix/scripts/create_librimix_from_metadata.py --librispeech_dir $librispeech_dir \
    --wham_dir $DATA_DIR/tmp/wham/wham_noise \
    --metadata_dir $metadata_dir \
    --librimix_outdir $DATA_DIR/librimix \
    --n_src $n_src \
    --freqs 16k \
    --modes max \
    --types mix_clean mix_both mix_single
  for type in "clean" "both"; do
      python $DATA_SCRIPTS_PATH/lsmix_to_lhotse.py --ls_supset $MANIFESTS_DIR/librispeech_supervisions_test-clean.jsonl.gz \
          --mixture_wavs_dir $DATA_DIR/librimix/Libri${n_src}Mix/wav16k/max/test/mix_$type \
          --output_manifest $MANIFESTS_DIR/libri${n_src}mix_mix_${type}_sc_test_cutset.jsonl.gz \
          --type $type
      python $DATA_SCRIPTS_PATH/extract_supervisions.py \
          --cutset_path $MANIFESTS_DIR/libri${n_src}mix_mix_${type}_sc_test_cutset.jsonl.gz \
          --output_path $MANIFESTS_DIR/libri${n_src}mix_mix_${type}_sc_test_supervisions.jsonl.gz
  done
done


# AMI
lhotse download ami --mic sdm $DATA_DIR/ami
lhotse prepare ami --mic sdm --normalize-text none $DATA_DIR/ami $MANIFESTS_DIR
python3 $DATA_SCRIPTS_PATH/create_cutset.py --input_recset $MANIFESTS_DIR/ami-sdm_recordings_test.jsonl.gz --input_supset $MANIFESTS_DIR/ami-sdm_supervisions_test.jsonl.gz --output $MANIFESTS_DIR/ami-sdm_cutset_test.jsonl.gz


# NOTSOFAR1
chime-utils dgen notsofar1 $DATA_DIR/nsf $DATA_DIR/notsofar  --part="train,dev,eval"
chime-utils lhotse-prep notsofar1 -d eval_sc --txt-norm none -m sdm $DATA_DIR/notsofar $MANIFESTS_DIR
chime-utils lhotse-prep notsofar1 -d eval --txt-norm none -m mdm $DATA_DIR/notsofar $MANIFESTS_DIR

python3 $DATA_SCRIPTS_PATH/create_cutset.py --input_recset $MANIFESTS_DIR/notsofar1-sdm_recordings_eval_sc.jsonl.gz --input_supset $MANIFESTS_DIR/notsofar1-sdm_supervisions_eval_sc.jsonl.gz --output $MANIFESTS_DIR/notsofar1-sdm_cutset_eval_sc.jsonl.gz
python3 $DATA_SCRIPTS_PATH/create_cutset.py --input_recset $MANIFESTS_DIR/notsofar1-mdm_recordings_eval.jsonl.gz --input_supset $MANIFESTS_DIR/notsofar1-mdm_supervisions_eval.jsonl.gz --output $MANIFESTS_DIR/notsofar1-mdm_cutset_eval.jsonl.gz


# Extract supervisions
SC_SUP_MANIFESTS_DIR=$DATA_DIR/manifests_sups_test_sc
mkdir -p $SC_SUP_MANIFESTS_DIR
cp $MANIFESTS_DIR/ami-sdm_supervisions_test.jsonl.gz $SC_SUP_MANIFESTS_DIR/ami-sdm.jsonl.gz
cp $MANIFESTS_DIR/notsofar1-sdm_supervisions_eval_sc.jsonl.gz $SC_SUP_MANIFESTS_DIR/notsofar1-small-sdm.jsonl.gz
cp $MANIFESTS_DIR/libri2mix_mix_clean_sc_test_supervisions.jsonl.gz $SC_SUP_MANIFESTS_DIR/libri2mix_clean.jsonl.gz
cp $MANIFESTS_DIR/libri2mix_mix_both_sc_test_supervisions.jsonl.gz $SC_SUP_MANIFESTS_DIR/libri2mix_both.jsonl.gz
cp $MANIFESTS_DIR/libri3mix_mix_clean_sc_test_supervisions.jsonl.gz $SC_SUP_MANIFESTS_DIR/libri3mix_clean.jsonl.gz
cp $MANIFESTS_DIR/libri3mix_mix_both_sc_test_supervisions.jsonl.gz $SC_SUP_MANIFESTS_DIR/libri3mix_both.jsonl.gz


SC_SUP_JSON_DIR=$DATA_DIR/refs_test_sc
mkdir -p $SC_SUP_JSON_DIR
for input_file in "$SC_SUP_MANIFESTS_DIR"/*.jsonl.gz; do
    # Extract just the filename (no path)
    filename=$(basename "$input_file")

    # Replace suffix to form output filename
    output_filename="${filename/.jsonl.gz/.json}"

    # Full path to output file
    output_file="$SC_SUP_JSON_DIR/$output_filename"

    # Call the Python script
    python3 $DATA_SCRIPTS_PATH/supervision_to_hyp_json.py --input "$input_file" --output "$output_file"
done