# temp_checkpoints/Scripts/remove_silence_files.py
# Uploaded by MUSTAR ("Upload 2 files", commit 7105a54, verified)
import os
import sys
import soundfile as sf
from tqdm import tqdm
def is_significant_audio(file_path, silence_threshold=-40, silence_percent=90):
    """
    Check if an audio file contains significant non-silent parts.

    A sample counts as silent when its instantaneous power (squared
    amplitude, averaged over channels) is below ``silence_threshold``.
    The file is significant when fewer than ``silence_percent`` percent of
    its samples are silent.

    Args:
        file_path: Path of the audio file, passed to ``sf.read``.
        silence_threshold: Silence level in dB relative to an amplitude of
            1.0 (dBFS). Assumes ``sf.read`` yields samples normalised to
            [-1.0, 1.0], which is its documented default for float output.
        silence_percent: Percentage of silent samples at or above which the
            file is considered silent.

    Returns:
        True if the file has enough non-silent samples. False if it is
        mostly silent, empty, or could not be read (the error is printed).
    """
    try:
        data, _samplerate = sf.read(file_path)
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort a
        # whole manifest run, so report it and treat it as not significant.
        print(f"Error processing {file_path}: {e}")
        return False

    if len(data) == 0:
        return False  # Empty file

    # Per-sample power. Multi-channel data is (frames, channels); average
    # the channels so each frame is counted exactly once.
    power = data ** 2
    if power.ndim > 1:
        power = power.mean(axis=1)

    # Convert the dB threshold to linear power instead of taking log10 of
    # the signal, which would hit log(0) on digital silence.
    power_floor = 10.0 ** (silence_threshold / 10.0)

    silent_samples = (power < power_floor).sum()
    silence_ratio = silent_samples / len(data) * 100
    return bool(silence_ratio < silence_percent)
def filter_manifest(manifest_path, output_path, dataset_dir):
    """
    Read the manifest file, check for silence, and write filtered files.

    The first line of the manifest is treated as a header and copied
    verbatim. For every following line, the first tab-separated field is
    joined onto ``dataset_dir`` and checked with ``is_significant_audio``;
    lines whose audio passes are written unchanged to ``output_path``.

    Args:
        manifest_path: Path of the tab-separated manifest to read.
        output_path: Path of the filtered manifest to write (overwritten).
        dataset_dir: Directory that the manifest's file entries are
            relative to.
    """
    with open(manifest_path, 'r') as f:
        lines = f.readlines()

    if not lines:
        # An empty manifest has no header to keep; emit an empty output
        # instead of failing with IndexError on lines[0].
        with open(output_path, 'w'):
            pass
        return

    filtered_lines = [lines[0]]  # Keep the header
    for line in tqdm(lines[1:], desc=f"Processing {manifest_path}"):
        # Drop the line ending before splitting so an entry without a tab
        # does not carry "\n" into the file path.
        relative_path = line.rstrip("\r\n").split("\t")[0]
        if not relative_path:
            continue  # Blank line or empty path field: nothing to check.

        file_path = os.path.join(dataset_dir, relative_path)
        if is_significant_audio(file_path):
            filtered_lines.append(line)
        else:
            print(f"Skipping file due to silence: {file_path}")

    with open(output_path, 'w') as f_out:
        f_out.writelines(filtered_lines)
def main(argv=None):
    """
    Filter the train and valid manifests given on the command line.

    Expected arguments (after the program name):
        train_manifest  valid_manifest  output_dir  [dataset_dir]

    The filtered manifests are written to ``output_dir`` as ``train.tsv``
    and ``valid.tsv``. ``dataset_dir`` is optional and defaults to
    ``"dataset"``.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Raises:
        SystemExit: With a usage message when fewer than three positional
            arguments are supplied.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 4:
        prog = os.path.basename(argv[0]) if argv else "remove_silence_files.py"
        # Exit with a readable usage line instead of a bare IndexError.
        raise SystemExit(
            f"Usage: {prog} <train_manifest> <valid_manifest> <output_dir> [dataset_dir]"
        )

    train_manifest, valid_manifest, output_dir = argv[1:4]
    # Optional 4th argument; the default preserves the original behaviour.
    dataset_dir = argv[4] if len(argv) > 4 else "dataset"

    os.makedirs(output_dir, exist_ok=True)
    filter_manifest(train_manifest, os.path.join(output_dir, "train.tsv"), dataset_dir)
    filter_manifest(valid_manifest, os.path.join(output_dir, "valid.tsv"), dataset_dir)


if __name__ == "__main__":
    main()