| import os
|
| import sys
|
| import soundfile as sf
|
| from tqdm import tqdm
|
|
|
def is_significant_audio(file_path, silence_threshold=-40, silence_percent=90):
    """
    Check if an audio file contains significant non-silent parts.

    Args:
        file_path: Path of the audio file, read with ``soundfile.read``.
        silence_threshold: Level in dBFS below which a frame counts as silent.
        silence_percent: The file is rejected when at least this percentage
            of its frames is silent.

    Returns:
        True when less than ``silence_percent`` percent of the frames are
        silent. False for empty files, mostly silent files, and files that
        cannot be read (the error is printed, not raised).
    """
    try:
        data, _samplerate = sf.read(file_path)
        if len(data) == 0:
            return False
        silent_share = _silent_frame_percentage(data, silence_threshold)
        return silent_share < silence_percent
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # whole manifest run, so it is reported and treated as unusable.
        print(f"Error processing {file_path}: {e}")
        return False


def _silent_frame_percentage(data, silence_threshold):
    """Return the percentage (0-100) of frames quieter than the threshold.

    ``data`` is a sample array shaped ``(frames,)`` or ``(frames, channels)``
    with amplitudes relative to full scale 1.0 (the default output of
    ``soundfile.read``). ``silence_threshold`` is expressed in dBFS.
    """
    # Comparing linear amplitudes against the converted threshold is
    # equivalent to comparing 20*log10(|x|) with the dB value, but avoids
    # taking log10(0) for digital silence.
    amplitude_floor = 10.0 ** (silence_threshold / 20.0)
    magnitude = abs(data)
    if magnitude.ndim > 1:
        # A frame only counts as silent when every channel is silent.
        magnitude = magnitude.max(axis=1)
    silent = magnitude < amplitude_floor
    return float(silent.sum() * 100.0 / silent.size)
|
|
|
def filter_manifest(manifest_path, output_path, dataset_dir):
    """
    Read the manifest file, check for silence, and write filtered files.

    The first manifest line is copied to the output unchanged. Every later
    line is treated as a tab-separated row whose first field is an audio
    path relative to ``dataset_dir``; the row is kept only when
    ``is_significant_audio`` accepts that file.

    Args:
        manifest_path: Manifest file to read.
        output_path: Destination for the filtered manifest (overwritten).
        dataset_dir: Directory the manifest's audio paths are joined onto.
    """
    with open(manifest_path, 'r') as f:
        lines = f.readlines()

    # Slicing instead of indexing keeps an empty manifest from raising
    # IndexError; the output is then simply empty as well.
    filtered_lines = lines[:1]
    for line in tqdm(lines[1:], desc=f"Processing {manifest_path}"):
        # Strip the line terminator first so a row without a tab does not
        # carry "\n" into the file name.
        relative_path = line.rstrip("\r\n").split("\t")[0]
        if not relative_path:
            # Blank row or empty path field: nothing to check, drop it.
            continue
        file_path = os.path.join(dataset_dir, relative_path)
        if is_significant_audio(file_path):
            filtered_lines.append(line)
        else:
            print(f"Skipping file due to silence: {file_path}")

    with open(output_path, 'w') as f_out:
        f_out.writelines(filtered_lines)
|
|
|
if __name__ == "__main__":
    # Command line: TRAIN_MANIFEST VALID_MANIFEST OUTPUT_DIR [DATASET_DIR]
    # Filtered manifests are written to OUTPUT_DIR/train.tsv and
    # OUTPUT_DIR/valid.tsv.
    if len(sys.argv) < 4:
        # Fail with a usage hint instead of a bare IndexError traceback.
        sys.exit(
            f"Usage: {sys.argv[0]} TRAIN_MANIFEST VALID_MANIFEST OUTPUT_DIR [DATASET_DIR]"
        )

    train_manifest = sys.argv[1]
    valid_manifest = sys.argv[2]
    output_dir = sys.argv[3]
    # Optional fourth argument; "dataset" stays the default so existing
    # three-argument invocations behave as before.
    dataset_dir = sys.argv[4] if len(sys.argv) > 4 else "dataset"

    os.makedirs(output_dir, exist_ok=True)

    filter_manifest(train_manifest, os.path.join(output_dir, "train.tsv"), dataset_dir)
    filter_manifest(valid_manifest, os.path.join(output_dir, "valid.tsv"), dataset_dir)
|
|
|