import time
import logging
import os
import random
import traceback
import numpy as np
import torch
import torch.utils.data
from tqdm import tqdm

from module import commons
from module.mel_processing import spectrogram_torch
from text import cleaned_text_to_sequence
from utils import load_wav_to_torch, load_filepaths_and_text
import torch.nn.functional as F
from functools import lru_cache
import requests
from scipy.io import wavfile
from io import BytesIO

# from config import exp_dir
from my_utils import load_audio


class TextAudioSpeakerLoader(torch.utils.data.Dataset):
    """Dataset yielding ``(ssl, spec, wav, text)`` tuples for one utterance.

    For every utterance name found in all three experiment sources
    (``2-name2text.txt``, ``4-cnhubert/<name>.pt`` and ``5-wav32k/<name>``) it:
      1) converts the stored phoneme string to a sequence of integer ids,
      2) loads the pre-computed SSL feature tensor from ``4-cnhubert``,
      3) loads the audio and computes its spectrogram.
    """

    def __init__(self, hparams, val=False):
        """Index the experiment directory and filter utterances by duration.

        Args:
            hparams: object exposing ``exp_dir``, ``max_wav_value``,
                ``sampling_rate``, ``filter_length``, ``hop_length`` and
                ``win_length``.
            val: when true, the duration filter is bypassed and
                ``random_slice`` takes its deterministic branch.

        Raises:
            AssertionError: if one of the expected paths is missing, or fewer
                than two utterances survive filtering.
            ValueError: if no utterance is present in all three sources.
        """
        exp_dir = hparams.exp_dir
        self.path2 = "%s/2-name2text.txt" % exp_dir
        self.path4 = "%s/4-cnhubert" % exp_dir
        self.path5 = "%s/5-wav32k" % exp_dir
        assert os.path.exists(self.path2)
        assert os.path.exists(self.path4)
        assert os.path.exists(self.path5)
        names4 = {name[:-3] for name in os.listdir(self.path4)}  # strip the ".pt" suffix
        names5 = set(os.listdir(self.path5))

        # name -> [phoneme string]; rows that do not have exactly 4 tab-separated fields are ignored.
        self.phoneme_data = {}
        with open(self.path2, "r", encoding="utf8") as f:
            lines = f.read().strip("\n").split("\n")
        for line in lines:
            fields = line.split("\t")
            if len(fields) != 4:
                continue
            self.phoneme_data[fields[0]] = [fields[1]]

        # Sort the intersection: set iteration order depends on per-process
        # string-hash randomization, which would make the seeded shuffle below
        # non-reproducible (and inconsistent between distributed workers).
        common_names = sorted(set(self.phoneme_data) & names4 & names5)
        if not common_names:
            raise ValueError(
                "no utterance is present in all of %s, %s and %s"
                % (self.path2, self.path4, self.path5)
            )
        self.audiopaths_sid_text = common_names
        min_num = 100
        if len(common_names) < min_num:
            # Small datasets are repeated (at least twice) to reach roughly min_num items.
            self.audiopaths_sid_text = []
            for _ in range(max(2, int(min_num / len(common_names)))):
                self.audiopaths_sid_text += common_names

        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        self.val = val

        # NOTE: this seeds the process-wide `random` module, not a private generator.
        random.seed(1234)
        random.shuffle(self.audiopaths_sid_text)

        print("phoneme_data_len:", len(self.phoneme_data.keys()))
        print("wav_data_len:", len(self.audiopaths_sid_text))

        audiopaths_sid_text_new = []
        lengths = []
        skipped_phone = 0
        skipped_dur = 0
        for audiopath in tqdm(self.audiopaths_sid_text):
            try:
                phoneme = self.phoneme_data[audiopath][0]
                phoneme = phoneme.split(' ')
                phoneme_ids = cleaned_text_to_sequence(phoneme)
            except Exception:
                print(f"{audiopath} not in self.phoneme_data !")
                skipped_phone += 1
                continue

            size = os.path.getsize("%s/%s" % (self.path5, audiopath))
            # Duration estimated from file size at 2 bytes per sample
            # (presumably 16-bit mono PCM; header bytes are not subtracted — confirm).
            duration = size / self.sampling_rate / 2

            if 54 > duration > 0.6 or self.val:
                audiopaths_sid_text_new.append([audiopath, phoneme_ids])
                # Approximate frame count, used by the bucket sampler.
                lengths.append(size // (2 * self.hop_length))
            else:
                skipped_dur += 1
                continue

        print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur)
        print("total left: ", len(audiopaths_sid_text_new))
        assert len(audiopaths_sid_text_new) > 1  # need at least enough items to form a batch; TODO
        self.audiopaths_sid_text = audiopaths_sid_text_new
        self.lengths = lengths

    def get_audio_text_speaker_pair(self, audiopath_sid_text):
        """Load one item and return ``(ssl, spec, wav, text)``.

        Args:
            audiopath_sid_text: ``[audiopath, phoneme_ids]`` pair as built in ``__init__``.

        On any loading error the traceback is printed and a zero-filled
        placeholder item is returned instead of raising, so one bad file does
        not abort an epoch.
        """
        audiopath, phoneme_ids = audiopath_sid_text
        # Built as a float tensor here; the collate function copies it into a LongTensor.
        text = torch.FloatTensor(phoneme_ids)
        try:
            spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
            with torch.no_grad():
                # NOTE(review): torch.load unpickles the file; only load trusted feature files.
                ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
                if ssl.shape[-1] != spec.shape[-1]:
                    # Extend ssl by one frame (replicating the last one) when lengths disagree.
                    typee = ssl.dtype
                    ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
                ssl.requires_grad = False
        except Exception:
            # `except Exception` (not bare `except`) so Ctrl-C / SystemExit still propagate.
            traceback.print_exc()
            # Placeholder shapes; presumably match the real feature sizes
            # (1025 spectrogram bins, 768 SSL channels) — confirm.
            spec = torch.zeros(1025, 100)
            wav = torch.zeros(1, 100 * self.hop_length)
            ssl = torch.zeros(1, 768, 100)
            text = text[-1:]
            print("load audio or ssl error!!!!!!", audiopath)
        return (ssl, spec, wav, text)

    def get_audio(self, filename):
        """Load ``filename`` and return ``(spec, audio)``, where audio has a leading axis of size 1."""
        # load_audio already returns samples normalized to [-1, 1], so no division by 32768 is needed.
        audio_array = load_audio(filename, self.sampling_rate)
        audio = torch.FloatTensor(audio_array)
        audio_norm = audio.unsqueeze(0)
        spec = spectrogram_torch(audio_norm, self.filter_length, self.sampling_rate, self.hop_length,
                                 self.win_length, center=False)
        spec = torch.squeeze(spec, 0)
        return spec, audio_norm

    def get_sid(self, sid):
        """Return ``sid`` as a one-element ``LongTensor``."""
        sid = torch.LongTensor([int(sid)])
        return sid

    def __getitem__(self, index):
        """Return the ``(ssl, spec, wav, text)`` tuple for item ``index``."""
        return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])

    def __len__(self):
        """Return the number of items left after filtering."""
        return len(self.audiopaths_sid_text)

    def random_slice(self, ssl, wav, mel):
        """Split an utterance into a reference mel part and a target part.

        In validation mode the first third of ``mel`` is the reference and the
        other inputs are returned unchanged. Otherwise a split point is drawn
        between one third and two thirds of the mel length and a coin flip
        decides which side is the reference; ``ssl``, ``wav`` and ``mel`` are
        cut to the other side.

        Returns:
            ``(reference_mel, ssl, wav, mel)``.

        Raises:
            AssertionError: if the ssl and wav frame counts differ by 3 or more.
        """
        assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ("first", ssl.shape, wav.shape)

        len_mel = mel.shape[1]
        if self.val:
            reference_mel = mel[:, :len_mel // 3]
            return reference_mel, ssl, wav, mel

        direction = random.randint(0, 1)
        sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2))

        if direction == 0:
            # Reference is the head, target is the tail.
            reference_mel = mel[:, :sep_point]
            ssl = ssl[:, :, sep_point:]
            wav2 = wav[:, sep_point * self.hop_length:]
            mel = mel[:, sep_point:]
        else:
            # Reference is the tail, target is the head.
            reference_mel = mel[:, sep_point:]
            ssl = ssl[:, :, :sep_point]
            wav2 = wav[:, :sep_point * self.hop_length]
            mel = mel[:, :sep_point]

        assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, (
            ssl.shape, wav.shape, wav2.shape, mel.shape, sep_point, self.hop_length,
            sep_point * self.hop_length, direction)
        return reference_mel, ssl, wav2, mel


class TextAudioSpeakerCollate():
    """Zero-pads model inputs and targets into batch tensors."""

    def __init__(self, return_ids=False):
        self.return_ids = return_ids

    def __call__(self, batch):
        """Collate a list of ``(ssl, spec, wav, text)`` items into padded tensors.

        Items are ordered by decreasing spectrogram length. As indexed below,
        ``ssl`` is 3-D with a leading axis of which only index 0 is copied,
        ``spec`` and ``wav`` are 2-D, and ``text`` is 1-D.

        Returns:
            ``(ssl_padded, ssl_lengths, spec_padded, spec_lengths,
            wav_padded, wav_lengths, text_padded, text_lengths)``.
        """
        batch_size = len(batch)
        _, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([x[1].size(1) for x in batch]),
            dim=0, descending=True)

        # ssl/spec lengths are padded up to the next even number strictly above the max.
        max_ssl_len = max(x[0].size(2) for x in batch)
        max_ssl_len = int(2 * ((max_ssl_len // 2) + 1))
        max_spec_len = max(x[1].size(1) for x in batch)
        max_spec_len = int(2 * ((max_spec_len // 2) + 1))
        max_wav_len = max(x[2].size(1) for x in batch)
        max_text_len = max(x[3].size(0) for x in batch)

        ssl_lengths = torch.zeros(batch_size, dtype=torch.long)
        spec_lengths = torch.zeros(batch_size, dtype=torch.long)
        wav_lengths = torch.zeros(batch_size, dtype=torch.long)
        text_lengths = torch.zeros(batch_size, dtype=torch.long)

        spec_padded = torch.zeros(batch_size, batch[0][1].size(0), max_spec_len, dtype=torch.float32)
        wav_padded = torch.zeros(batch_size, 1, max_wav_len, dtype=torch.float32)
        ssl_padded = torch.zeros(batch_size, batch[0][0].size(1), max_ssl_len, dtype=torch.float32)
        text_padded = torch.zeros(batch_size, max_text_len, dtype=torch.long)

        for i, src_idx in enumerate(ids_sorted_decreasing):
            ssl, spec, wav, text = batch[src_idx]

            ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
            ssl_lengths[i] = ssl.size(2)

            spec_padded[i, :, :spec.size(1)] = spec
            spec_lengths[i] = spec.size(1)

            wav_padded[i, :, :wav.size(1)] = wav
            wav_lengths[i] = wav.size(1)

            text_padded[i, :text.size(0)] = text
            text_lengths[i] = text.size(0)

        return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths


class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
    """
    Maintain similar input lengths in a batch.
    Length groups are specified by boundaries.
    Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.

    It removes samples which are not included in the boundaries.
    Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
    """

    def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
        """Build the length buckets from ``dataset.lengths``.

        Args:
            dataset: dataset exposing a ``lengths`` sequence (one entry per item).
            batch_size: number of items per batch on each replica.
            boundaries: ascending bucket edges; copied, so the caller's list is
                not modified when empty buckets are pruned.
            num_replicas, rank, shuffle: forwarded to ``DistributedSampler``.
        """
        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        self.lengths = dataset.lengths
        self.batch_size = batch_size
        self.boundaries = list(boundaries)

        self.buckets, self.num_samples_per_bucket = self._create_buckets()
        self.total_size = sum(self.num_samples_per_bucket)
        self.num_samples = self.total_size // self.num_replicas

    def _create_buckets(self):
        """Assign item indices to buckets, prune empty buckets, and compute padded bucket sizes.

        Returns:
            ``(buckets, num_samples_per_bucket)`` where each padded size is the
            bucket length rounded up to a multiple of ``num_replicas * batch_size``.
        """
        buckets = [[] for _ in range(len(self.boundaries) - 1)]
        for i, length in enumerate(self.lengths):
            idx_bucket = self._bisect(length)
            if idx_bucket != -1:
                buckets[idx_bucket].append(i)

        # Walk down to index 0 inclusive: an empty first bucket would otherwise
        # survive and make __iter__ divide by a bucket length of zero.
        for i in range(len(buckets) - 1, -1, -1):
            if len(buckets[i]) == 0:
                buckets.pop(i)
                self.boundaries.pop(i + 1)

        total_batch_size = self.num_replicas * self.batch_size
        num_samples_per_bucket = []
        for bucket in buckets:
            len_bucket = len(bucket)
            rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
            num_samples_per_bucket.append(len_bucket + rem)
        return buckets, num_samples_per_bucket

    def __iter__(self):
        """Yield this replica's batches (lists of dataset indices) for the current epoch."""
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)

        if self.shuffle:
            indices = [torch.randperm(len(bucket), generator=g).tolist() for bucket in self.buckets]
        else:
            indices = [list(range(len(bucket))) for bucket in self.buckets]

        batches = []
        for bucket, ids_bucket, num_samples_bucket in zip(self.buckets, indices, self.num_samples_per_bucket):
            len_bucket = len(bucket)

            # add extra samples to make it evenly divisible
            rem = num_samples_bucket - len_bucket
            ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]

            # subsample
            ids_bucket = ids_bucket[self.rank::self.num_replicas]

            # batching
            for j in range(len(ids_bucket) // self.batch_size):
                batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
                batches.append(batch)

        if self.shuffle:
            batch_ids = torch.randperm(len(batches), generator=g).tolist()
            batches = [batches[i] for i in batch_ids]
        self.batches = batches

        assert len(self.batches) * self.batch_size == self.num_samples
        return iter(self.batches)

    def _bisect(self, x, lo=0, hi=None):
        """Return the index ``i`` with ``boundaries[i] < x <= boundaries[i + 1]``, or -1 if none."""
        if hi is None:
            hi = len(self.boundaries) - 1

        if hi > lo:
            mid = (hi + lo) // 2
            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
                return mid
            elif x <= self.boundaries[mid]:
                return self._bisect(x, lo, mid)
            else:
                return self._bisect(x, mid + 1, hi)
        else:
            return -1

    def __len__(self):
        """Return the number of batches per replica."""
        return self.num_samples // self.batch_size