|
import os
|
|
import sys
|
|
import soundfile as sf
|
|
from tqdm import tqdm
|
|
|
|
def is_significant_audio(file_path, silence_threshold=-40, silence_percent=90):
    """
    Check if an audio file contains significant non-silent parts.

    The signal is cut into short frames (about 20 ms each). A frame counts as
    silent when its mean power is below ``silence_threshold``; the file is
    considered significant when fewer than ``silence_percent`` percent of its
    frames are silent.

    Args:
        file_path: Path of the audio file, handed to ``sf.read``.
        silence_threshold: Silence level in dB relative to full scale.
            Assumes samples are floats in [-1.0, 1.0] (the default output of
            ``soundfile.read``); confirm if a different dtype is ever used.
        silence_percent: Percentage of silent frames at or above which the
            file is rejected.

    Returns:
        True when the file could be read, is not empty, and is not mostly
        silent. False otherwise, including when reading or analysis raises.
    """
    try:
        data, samplerate = sf.read(file_path)
        if len(data) == 0:
            return False

        # Per-sample power. Multi-channel audio arrives as (frames, channels);
        # average the channels so each time step has a single power value.
        power = data ** 2
        if power.ndim > 1:
            power = power.mean(axis=1)

        # The threshold is given in dB while power is linear, so convert the
        # threshold once instead of taking a log of every frame (which would
        # also have to special-case exact digital silence, log(0)).
        threshold_power = 10.0 ** (silence_threshold / 10.0)

        # Roughly 20 ms per frame, never shorter than one sample.
        frame_len = max(1, int(samplerate * 0.02))
        full_frames = len(power) // frame_len

        frame_power = power[:full_frames * frame_len]
        frame_power = frame_power.reshape(full_frames, frame_len).mean(axis=1)
        silent_frames = int((frame_power < threshold_power).sum())
        total_frames = full_frames

        # Samples left over after the last full frame form one shorter frame,
        # so very short files are still measured.
        tail = power[full_frames * frame_len:]
        if len(tail) > 0:
            total_frames += 1
            if tail.mean() < threshold_power:
                silent_frames += 1

        silence_ratio = silent_frames / total_frames * 100
        return silence_ratio < silence_percent
    except Exception as e:
        # Best effort: an unreadable or malformed file is treated as not
        # significant rather than aborting the whole filtering run.
        print(f"Error processing {file_path}: {e}")
        return False
|
|
|
|
def filter_manifest(manifest_path, output_path, dataset_dir):
    """
    Read the manifest file, check for silence, and write filtered files.

    The first line of the manifest is copied to the output unchanged (it is
    treated as a header). Each following line names an audio file in its
    first tab-separated column, relative to ``dataset_dir``; the line is kept
    only when ``is_significant_audio`` accepts that file.

    Args:
        manifest_path: Path of the tab-separated manifest to read.
        output_path: Path of the filtered manifest to write (overwritten).
        dataset_dir: Directory that the manifest's file paths are joined to.
    """
    with open(manifest_path, 'r') as f:
        lines = f.readlines()

    # An empty manifest has no header line to copy; produce an empty output
    # instead of failing on lines[0].
    if not lines:
        with open(output_path, 'w') as f_out:
            f_out.write("")
        return

    filtered_lines = [lines[0]]
    for line in tqdm(lines[1:], desc=f"Processing {manifest_path}"):
        # Strip the line ending before splitting: a line with a single column
        # would otherwise carry its "\n" into the file path.
        relative_path = line.rstrip("\r\n").split("\t")[0]
        if not relative_path:
            # Blank line or empty first column: there is no file to check.
            continue

        file_path = os.path.join(dataset_dir, relative_path)
        if is_significant_audio(file_path):
            filtered_lines.append(line)
        else:
            print(f"Skipping file due to silence: {file_path}")

    with open(output_path, 'w') as f_out:
        f_out.writelines(filtered_lines)
|
|
|
|
if __name__ == "__main__":
    # Command line:
    #   <train_manifest> <valid_manifest> <output_dir> [dataset_dir]
    # dataset_dir is optional and defaults to "dataset".
    if len(sys.argv) < 4:
        # Fail with a usage message instead of an IndexError traceback.
        sys.exit(
            f"Usage: {sys.argv[0]} <train_manifest> <valid_manifest> "
            "<output_dir> [dataset_dir]"
        )

    train_manifest = sys.argv[1]
    valid_manifest = sys.argv[2]
    output_dir = sys.argv[3]
    dataset_dir = sys.argv[4] if len(sys.argv) > 4 else "dataset"

    os.makedirs(output_dir, exist_ok=True)

    # The filtered manifests always get these fixed names inside output_dir.
    filter_manifest(train_manifest, os.path.join(output_dir, "train.tsv"), dataset_dir)
    filter_manifest(valid_manifest, os.path.join(output_dir, "valid.tsv"), dataset_dir)
|
|
|