Spaces:

FunAudioLLM
/

CosyVoice-300M

Runtime error

File size: 2,078 Bytes

12bfd03

# 和 Encodec* 的 dataset.py 有点类似但是不完全一样
# 主要是 prob > 0.7 的时候多了 ans2
import glob
import random

import torch
import torchaudio
from torch.utils.data import Dataset


class NSynthDataset(Dataset):
    """Dataset to load NSynth data."""

    def __init__(self, audio_dir):
        super().__init__()
        self.filenames = []
        self.filenames.extend(glob.glob(audio_dir + "/*.wav"))
        print(len(self.filenames))
        _, self.sr = torchaudio.load(self.filenames[0])
        self.max_len = 24000  # 24000

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        #print(self.filenames[index])
        prob = random.random()  # (0,1)
        if prob > 0.7:
            # data augmentation
            ans1 = torch.zeros(1, self.max_len)
            ans2 = torch.zeros(1, self.max_len)
            audio1 = torchaudio.load(self.filenames[index])[0]
            index2 = random.randint(0, len(self.filenames) - 1)
            audio2 = torchaudio.load(self.filenames[index2])[0]
            if audio1.shape[1] > self.max_len:
                st = random.randint(0, audio1.shape[1] - self.max_len - 1)
                ed = st + self.max_len
                ans1 = audio1[:, st:ed]
            else:
                ans1[:, :audio1.shape[1]] = audio1
            if audio2.shape[1] > self.max_len:
                st = random.randint(0, audio2.shape[1] - self.max_len - 1)
                ed = st + self.max_len
                ans2 = audio2[:, st:ed]
            else:
                ans2[:, :audio2.shape[1]] = audio2
            ans = ans1 + ans2
            return ans
        else:
            ans = torch.zeros(1, self.max_len)
            audio = torchaudio.load(self.filenames[index])[0]
            if audio.shape[1] > self.max_len:
                st = random.randint(0, audio.shape[1] - self.max_len - 1)
                ed = st + self.max_len
                return audio[:, st:ed]
            else:
                ans[:, :audio.shape[1]] = audio
                return ans