Datasculptor's picture
Duplicate from AIGC-Audio/AudioGPT
98f685a
raw
history blame
8.27 kB
import numpy as np
import argparse
import csv
import os
import glob
import datetime
import time
import logging
import h5py
import librosa
from utilities import (create_folder, get_filename, create_logging,
float32_to_int16, pad_or_truncate, read_metadata)
import config
def split_unbalanced_csv_to_partial_csvs(args):
"""Split unbalanced csv to part csvs. Each part csv contains up to 50000 ids.
"""
unbalanced_csv_path = args.unbalanced_csv
unbalanced_partial_csvs_dir = args.unbalanced_partial_csvs_dir
create_folder(unbalanced_partial_csvs_dir)
with open(unbalanced_csv_path, 'r') as f:
lines = f.readlines()
lines = lines[3:] # Remove head info
audios_num_per_file = 50000
files_num = int(np.ceil(len(lines) / float(audios_num_per_file)))
for r in range(files_num):
lines_per_file = lines[r * audios_num_per_file :
(r + 1) * audios_num_per_file]
out_csv_path = os.path.join(unbalanced_partial_csvs_dir,
'unbalanced_train_segments_part{:02d}.csv'.format(r))
with open(out_csv_path, 'w') as f:
f.write('empty\n')
f.write('empty\n')
f.write('empty\n')
for line in lines_per_file:
f.write(line)
print('Write out csv to {}'.format(out_csv_path))
def download_wavs(args):
"""Download videos and extract audio in wav format.
"""
# Paths
csv_path = args.csv_path
audios_dir = args.audios_dir
mini_data = args.mini_data
if mini_data:
logs_dir = '_logs/download_dataset/{}'.format(get_filename(csv_path))
else:
logs_dir = '_logs/download_dataset_minidata/{}'.format(get_filename(csv_path))
create_folder(audios_dir)
create_folder(logs_dir)
create_logging(logs_dir, filemode='w')
logging.info('Download log is saved to {}'.format(logs_dir))
# Read csv
with open(csv_path, 'r') as f:
lines = f.readlines()
lines = lines[3:] # Remove csv head info
if mini_data:
lines = lines[0 : 10] # Download partial data for debug
download_time = time.time()
# Download
for (n, line) in enumerate(lines):
items = line.split(', ')
audio_id = items[0]
start_time = float(items[1])
end_time = float(items[2])
duration = end_time - start_time
logging.info('{} {} start_time: {:.1f}, end_time: {:.1f}'.format(
n, audio_id, start_time, end_time))
# Download full video of whatever format
video_name = os.path.join(audios_dir, '_Y{}.%(ext)s'.format(audio_id))
os.system("youtube-dl --quiet -o '{}' -x https://www.youtube.com/watch?v={}"\
.format(video_name, audio_id))
video_paths = glob.glob(os.path.join(audios_dir, '_Y' + audio_id + '.*'))
# If download successful
if len(video_paths) > 0:
video_path = video_paths[0] # Choose one video
# Add 'Y' to the head because some video ids are started with '-'
# which will cause problem
audio_path = os.path.join(audios_dir, 'Y' + audio_id + '.wav')
# Extract audio in wav format
os.system("ffmpeg -loglevel panic -i {} -ac 1 -ar 32000 -ss {} -t 00:00:{} {} "\
.format(video_path,
str(datetime.timedelta(seconds=start_time)), duration,
audio_path))
# Remove downloaded video
os.system("rm {}".format(video_path))
logging.info("Download and convert to {}".format(audio_path))
logging.info('Download finished! Time spent: {:.3f} s'.format(
time.time() - download_time))
logging.info('Logs can be viewed in {}'.format(logs_dir))
def pack_waveforms_to_hdf5(args):
"""Pack waveform and target of several audio clips to a single hdf5 file.
This can speed up loading and training.
"""
# Arguments & parameters
audios_dir = args.audios_dir
csv_path = args.csv_path
waveforms_hdf5_path = args.waveforms_hdf5_path
mini_data = args.mini_data
clip_samples = config.clip_samples
classes_num = config.classes_num
sample_rate = config.sample_rate
id_to_ix = config.id_to_ix
# Paths
if mini_data:
prefix = 'mini_'
waveforms_hdf5_path += '.mini'
else:
prefix = ''
create_folder(os.path.dirname(waveforms_hdf5_path))
logs_dir = '_logs/pack_waveforms_to_hdf5/{}{}'.format(prefix, get_filename(csv_path))
create_folder(logs_dir)
create_logging(logs_dir, filemode='w')
logging.info('Write logs to {}'.format(logs_dir))
# Read csv file
meta_dict = read_metadata(csv_path, classes_num, id_to_ix)
if mini_data:
mini_num = 10
for key in meta_dict.keys():
meta_dict[key] = meta_dict[key][0 : mini_num]
audios_num = len(meta_dict['audio_name'])
# Pack waveform to hdf5
total_time = time.time()
with h5py.File(waveforms_hdf5_path, 'w') as hf:
hf.create_dataset('audio_name', shape=((audios_num,)), dtype='S20')
hf.create_dataset('waveform', shape=((audios_num, clip_samples)), dtype=np.int16)
hf.create_dataset('target', shape=((audios_num, classes_num)), dtype=np.bool)
hf.attrs.create('sample_rate', data=sample_rate, dtype=np.int32)
# Pack waveform & target of several audio clips to a single hdf5 file
for n in range(audios_num):
audio_path = os.path.join(audios_dir, meta_dict['audio_name'][n])
if os.path.isfile(audio_path):
logging.info('{} {}'.format(n, audio_path))
(audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
audio = pad_or_truncate(audio, clip_samples)
hf['audio_name'][n] = meta_dict['audio_name'][n].encode()
hf['waveform'][n] = float32_to_int16(audio)
hf['target'][n] = meta_dict['target'][n]
else:
logging.info('{} File does not exist! {}'.format(n, audio_path))
logging.info('Write to {}'.format(waveforms_hdf5_path))
logging.info('Pack hdf5 time: {:.3f}'.format(time.time() - total_time))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest='mode')
parser_split = subparsers.add_parser('split_unbalanced_csv_to_partial_csvs')
parser_split.add_argument('--unbalanced_csv', type=str, required=True, help='Path of unbalanced_csv file to read.')
parser_split.add_argument('--unbalanced_partial_csvs_dir', type=str, required=True, help='Directory to save out split unbalanced partial csv.')
parser_download_wavs = subparsers.add_parser('download_wavs')
parser_download_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.')
parser_download_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.')
parser_download_wavs.add_argument('--mini_data', action='store_true', default=True, help='Set true to only download 10 audios for debugging.')
parser_pack_wavs = subparsers.add_parser('pack_waveforms_to_hdf5')
parser_pack_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.')
parser_pack_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.')
parser_pack_wavs.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path to save out packed hdf5.')
parser_pack_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only download 10 audios for debugging.')
args = parser.parse_args()
if args.mode == 'split_unbalanced_csv_to_partial_csvs':
split_unbalanced_csv_to_partial_csvs(args)
elif args.mode == 'download_wavs':
download_wavs(args)
elif args.mode == 'pack_waveforms_to_hdf5':
pack_waveforms_to_hdf5(args)
else:
raise Exception('Incorrect arguments!')