"""Filter audio manifests, dropping entries whose audio is mostly silent.

Usage:
    script.py TRAIN_MANIFEST VALID_MANIFEST OUTPUT_DIR [DATASET_DIR]

Writes ``train.tsv`` and ``valid.tsv`` into OUTPUT_DIR. DATASET_DIR defaults
to ``dataset``.
"""

import os
import sys

import soundfile as sf
from tqdm import tqdm


def is_significant_audio(file_path, silence_threshold=-40, silence_percent=90):
    """
    Check if an audio file contains significant non-silent parts.

    A frame counts as silent when its power is below ``silence_threshold``,
    interpreted as decibels relative to full scale (soundfile's default float
    output, where full scale is 1.0). For multi-channel audio the power of a
    frame is the mean power across its channels.

    Args:
        file_path: Path of the audio file to inspect.
        silence_threshold: Power level in dB below which a frame is silent.
        silence_percent: The file is considered significant only if the
            percentage of silent frames is strictly below this value.

    Returns:
        True if the file has enough non-silent frames. False if it is mostly
        silent, empty, or could not be read or analysed (the error is printed).
    """
    try:
        data, _samplerate = sf.read(file_path)
        if len(data) == 0:
            return False  # Empty file

        # Per-frame power. Comparing a single file-wide mean against a dB
        # value can never flag silence, so the test is done frame by frame.
        frame_power = data ** 2
        if frame_power.ndim > 1:
            frame_power = frame_power.mean(axis=1)

        # Convert the dB threshold to linear power once instead of taking a
        # logarithm of every frame (which would also hit log(0) on digital
        # silence).
        power_floor = 10 ** (silence_threshold / 10)
        silent_frames = (frame_power < power_floor).sum()
        silence_ratio = silent_frames / len(frame_power) * 100
        return bool(silence_ratio < silence_percent)
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the
        # whole filtering run, so it is reported and treated as unusable.
        print(f"Error processing {file_path}: {e}")
        return False


def filter_manifest(manifest_path, output_path, dataset_dir):
    """
    Read the manifest file, check for silence, and write filtered files.

    The first line of the manifest is treated as a header and copied through
    unchanged. Every other line is tab-separated, and its first field is an
    audio path relative to ``dataset_dir``. Lines whose audio passes
    ``is_significant_audio`` are written to ``output_path`` verbatim; blank
    lines are dropped. An empty manifest produces an empty output file.

    Args:
        manifest_path: Path of the manifest to read (UTF-8).
        output_path: Path of the filtered manifest to write (UTF-8).
        dataset_dir: Directory that the manifest's audio paths are relative to.
    """
    with open(manifest_path, 'r', encoding="utf-8") as f:
        lines = f.readlines()

    # Slicing keeps the header when present and tolerates an empty manifest.
    filtered_lines = lines[:1]
    for line in tqdm(lines[1:], desc=f"Processing {manifest_path}"):
        # Strip the line ending first so a line without a tab does not carry
        # a trailing newline into the file path.
        relative_path = line.rstrip("\n").split("\t")[0]
        if not relative_path:
            continue  # Blank line: there is no file to check.

        file_path = os.path.join(dataset_dir, relative_path)
        if is_significant_audio(file_path):
            filtered_lines.append(line)
        else:
            print(f"Skipping file due to silence: {file_path}")

    with open(output_path, 'w', encoding="utf-8") as f_out:
        f_out.writelines(filtered_lines)


def main(argv=None):
    """
    Filter the train and valid manifests named on the command line.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. Expected: TRAIN_MANIFEST VALID_MANIFEST
            OUTPUT_DIR [DATASET_DIR].
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.exit(
            f"Usage: {os.path.basename(sys.argv[0])} "
            "TRAIN_MANIFEST VALID_MANIFEST OUTPUT_DIR [DATASET_DIR]"
        )

    train_manifest, valid_manifest, output_dir = args[:3]
    dataset_dir = args[3] if len(args) > 3 else "dataset"

    os.makedirs(output_dir, exist_ok=True)
    filter_manifest(train_manifest, os.path.join(output_dir, "train.tsv"), dataset_dir)
    filter_manifest(valid_manifest, os.path.join(output_dir, "valid.tsv"), dataset_dir)


if __name__ == "__main__":
    main()