|
|
|
|
|
|
|
|
|
|
|
import logging |
|
|
|
from fairseq.data.audio.frm_text_to_speech_dataset import FrmTextToSpeechDatasetCreator |
|
from fairseq.tasks import register_task |
|
from fairseq.tasks.text_to_speech import TextToSpeechTask |
|
|
|
|
|
# Set up root logging for this module's messages. Per the standard library,
# basicConfig does nothing when the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
@register_task("frm_text_to_speech")
class FrmTextToSpeechTask(TextToSpeechTask):
    """Text-to-speech task registered under the name ``frm_text_to_speech``.

    Extends ``TextToSpeechTask`` with additional command-line options and
    loads each split through ``FrmTextToSpeechDatasetCreator.from_tsv``,
    forwarding those options as keyword arguments.
    """

    @staticmethod
    def add_args(parser):
        """Register the parent task's arguments, then this task's options.

        Args:
            parser: argument parser to extend via ``add_argument``.
        """
        TextToSpeechTask.add_args(parser)
        parser.add_argument("--do_chunk", action="store_true", help="train on chunks")
        # Integer-valued options and their defaults.
        for flag, default in (
            ("--chunk_bound", -1),
            ("--chunk_init", 50),
            ("--chunk_incr", 5),
        ):
            parser.add_argument(flag, default=default, type=int)
        # Boolean switches (False unless given on the command line).
        for flag in ("--add_eos", "--dedup"):
            parser.add_argument(flag, action="store_true")
        parser.add_argument("--ref_fpu", default=-1, type=float)

    def load_dataset(self, split, **unused_kwargs):
        """Create the dataset for ``split`` and store it in ``self.datasets``.

        Args:
            split: name of the split to load; a name starting with
                ``"train"`` is passed on as a training split.
            **unused_kwargs: accepted for call compatibility and ignored.
        """
        training = split.startswith("train")
        task_args = self.args
        tokenizer = self.build_tokenizer(task_args)
        bpe = self.build_bpe(task_args)
        # Options added by this task that are handed to the dataset creator
        # under the same keyword names.
        forwarded = (
            "do_chunk",
            "chunk_bound",
            "chunk_init",
            "chunk_incr",
            "add_eos",
            "dedup",
            "ref_fpu",
        )
        dataset = FrmTextToSpeechDatasetCreator.from_tsv(
            task_args.data,
            self.data_cfg,
            split,
            self.src_dict,
            tokenizer,
            bpe,
            is_train_split=training,
            n_frames_per_step=task_args.n_frames_per_step,
            speaker_to_id=self.speaker_to_id,
            **{name: getattr(task_args, name) for name in forwarded},
        )
        self.datasets[split] = dataset
|
|