# Model configuration. Each `_target_` is the dotted import path of the object
# that section describes; the sibling keys are that object's settings
# (presumably passed as constructor keyword arguments by a Hydra-style
# `instantiate` call -- confirm against the loader).
_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture

# Log-mel spectrogram front end.
spec_transform:
  _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
  sample_rate: 44100
  n_mels: 160  # same value as backbone.input_channels below
  n_fft: 2048
  hop_length: 512  # same value as head.hop_length below
  win_length: 2048

# ConvNeXt encoder.
backbone:
  _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
  input_channels: 160  # same value as spec_transform.n_mels
  depths: [3, 3, 9, 3]  # four entries, same length as `dims`
  dims: [128, 256, 384, 512]  # last entry equals head.num_mels and quantizer.input_dim
  drop_path_rate: 0.2
  kernel_size: 7

# HiFi-GAN generator.
head:
  _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
  hop_length: 512  # same value as spec_transform.hop_length
  # 8 * 8 * 2 * 2 * 2 = 512, i.e. the product equals hop_length.
  # NOTE(review): presumably required so one input frame expands to one hop of
  # samples -- confirm against HiFiGANGenerator before changing either value.
  upsample_rates: [8, 8, 2, 2, 2]
  upsample_kernel_sizes: [16, 16, 4, 4, 4]  # one entry per upsample rate; each is 2x its rate
  resblock_kernel_sizes: [3, 7, 11]
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]  # one list per resblock kernel size
  num_mels: 512  # equals the last backbone dim, not spec_transform.n_mels
  upsample_initial_channel: 512
  pre_conv_kernel_size: 13
  post_conv_kernel_size: 13

# Finite scalar quantizer with downsampling.
quantizer:
  _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
  input_dim: 512  # equals the last backbone dim
  n_groups: 8
  n_codebooks: 1
  levels: [8, 5, 5, 5]
  downsample_factor: [2, 2]