from torch import Tensor, nn

from .utils import *  # expected to provide groupby and the XUNet1d / XDiffusion factories
from .sampler import *  # expected to provide UniformDistribution, LinearSchedule, VSampler

"""
Diffusion Classes (generic for 1d data)
"""
class Model1d(nn.Module):
def __init__(self, unet_type: str = "base", **kwargs):
super().__init__()
diffusion_kwargs, kwargs = groupby("diffusion_", kwargs)
self.unet = None
self.diffusion = None
def forward(self, x: Tensor, **kwargs) -> Tensor:
return self.diffusion(x, **kwargs)
def sample(self, *args, **kwargs) -> Tensor:
return self.diffusion.sample(*args, **kwargs)
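
# A sketch of the "diffusion_" prefix routing performed above, assuming the
# groupby helper splits a kwargs dict on a key prefix and strips it:
#
#   Model1d(channels=128, diffusion_type="v")
#   -> UNet kwargs:      {"channels": 128}
#   -> diffusion kwargs: {"type": "v"}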
"""
Audio Diffusion Classes (specific for 1d audio data)
"""
def get_default_model_kwargs():
return dict(
channels=128,
patch_size=16,
multipliers=[1, 2, 4, 4, 4, 4, 4],
factors=[4, 4, 4, 2, 2, 2],
num_blocks=[2, 2, 2, 2, 2, 2],
attentions=[0, 0, 0, 1, 1, 1, 1],
attention_heads=8,
attention_features=64,
attention_multiplier=2,
attention_use_rel_pos=False,
diffusion_type="v",
diffusion_sigma_distribution=UniformDistribution(),
)
def get_default_sampling_kwargs():
return dict(sigma_schedule=LinearSchedule(), sampler=VSampler(), clamp=True)

class AudioDiffusionModel(Model1d):
    def __init__(self, **kwargs):
        # Audio defaults are merged first, so user kwargs override them
        super().__init__(**{**get_default_model_kwargs(), **kwargs})

    def sample(self, *args, **kwargs):
        return super().sample(*args, **{**get_default_sampling_kwargs(), **kwargs})
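
# A usage sketch, assuming the upstream audio-diffusion-pytorch API; the
# in_channels value and shapes are illustrative:
#
#   model = AudioDiffusionModel(in_channels=1)
#   x = torch.randn(2, 1, 2 ** 18)     # [batch, in_channels, samples]
#   loss = model(x)                    # diffusion training loss
#   loss.backward()
#   noise = torch.randn(2, 1, 2 ** 18)
#   sampled = model.sample(noise=noise, num_steps=50)   # -> [2, 1, 2 ** 18]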

class AudioDiffusionConditional(Model1d):
    def __init__(
        self,
        embedding_features: int,
        embedding_max_length: int,
        embedding_mask_proba: float = 0.1,
        **kwargs,
    ):
        self.embedding_mask_proba = embedding_mask_proba
        default_kwargs = dict(
            **get_default_model_kwargs(),
            unet_type="cfg",  # UNet variant with classifier-free guidance support
            context_embedding_features=embedding_features,
            context_embedding_max_length=embedding_max_length,
        )
        super().__init__(**{**default_kwargs, **kwargs})

    def forward(self, *args, **kwargs):
        # Randomly mask the conditioning embedding during training (with
        # probability embedding_mask_proba), as required for classifier-free guidance
        default_kwargs = dict(embedding_mask_proba=self.embedding_mask_proba)
        return super().forward(*args, **{**default_kwargs, **kwargs})

    def sample(self, *args, **kwargs):
        default_kwargs = dict(
            **get_default_sampling_kwargs(),
            embedding_scale=5.0,  # classifier-free guidance scale
        )
        return super().sample(*args, **{**default_kwargs, **kwargs})
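
# A conditional usage sketch; shapes are assumptions, and the embedding would
# typically come from a frozen text encoder:
#
#   model = AudioDiffusionConditional(
#       in_channels=1,
#       embedding_max_length=64,
#       embedding_features=768,
#   )
#   embedding = torch.randn(2, 64, 768)   # [batch, max_length, features]
#   loss = model(torch.randn(2, 1, 2 ** 18), embedding=embedding)
#   sampled = model.sample(
#       noise=torch.randn(2, 1, 2 ** 18),
#       embedding=embedding,
#       embedding_scale=5.0,              # >1 strengthens the conditioning
#       num_steps=50,
#   )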