jhtonyKoo committed
Commit 6bbce1b (1 Parent(s): 71be77a)

modify app

app.py CHANGED
@@ -64,8 +64,8 @@ def process_audio_with_youtube(input_audio, input_youtube_url, reference_audio,
     return process_audio(input_audio, reference_audio)
 
 def process_audio(input_audio, reference_audio):
-    output_audio, predicted_params, sr = mastering_transfer.process_audio(
-        input_audio, reference_audio, reference_audio
+    output_audio, predicted_params, sr, normalized_input = mastering_transfer.process_audio(
+        input_audio, reference_audio
     )
 
     param_output = mastering_transfer.get_param_output_string(predicted_params)
@@ -88,7 +88,7 @@ def process_audio(input_audio, reference_audio):
     # Denormalize the audio to int16
     output_audio = denormalize_audio(output_audio, dtype=np.int16)
 
-    return (sr, output_audio), param_output
+    return (sr, output_audio), param_output, (sr, normalized_input)
 
 def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
     if ito_reference_audio is None:
@@ -182,13 +182,15 @@ with gr.Blocks() as demo:
         process_button = gr.Button("Process Mastering Style Transfer")
 
         with gr.Row():
-            output_audio = gr.Audio(label="Output Audio", type='numpy')
+            with gr.Column():
+                output_audio = gr.Audio(label="Output Audio", type='numpy')
+                normalized_input = gr.Audio(label="Normalized Input Audio", type='numpy')
             param_output = gr.Textbox(label="Predicted Parameters", lines=5)
 
         process_button.click(
            process_audio,
            inputs=[input_audio, reference_audio],
-           outputs=[output_audio, param_output]
+           outputs=[output_audio, param_output, normalized_input]
        )
 
    with gr.Tab("YouTube Audio"):
@@ -252,7 +254,7 @@ with gr.Blocks() as demo:
 
        ito_button.click(
            perform_ito,
-           inputs=[input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
+           inputs=[normalized_input, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
            outputs=[ito_output_audio, ito_param_output, ito_step_slider, ito_log, ito_loss_plot, all_results]
        ).then(
            update_ito_output,
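With this change the handler returns a third value, the fx-normalized input, which the UI shows in the new "Normalized Input Audio" player and later feeds into `perform_ito` in place of the raw upload. The following minimal Gradio sketch (illustration only, not part of the commit; `demo_process` and its silent buffers are hypothetical stand-ins) shows how a click handler's return values map positionally onto its `outputs` list:

import numpy as np
import gradio as gr

def demo_process(input_audio, reference_audio):
    # Stand-in for mastering_transfer.process_audio followed by denormalize_audio.
    sr = 44100
    output = np.zeros((sr, 2), dtype=np.int16)            # "mastered" audio
    normalized_input = np.zeros((sr, 2), dtype=np.int16)  # fx-normalized input
    return (sr, output), "no parameters", (sr, normalized_input)

with gr.Blocks() as demo:
    input_audio = gr.Audio(label="Input Audio", type='numpy')
    reference_audio = gr.Audio(label="Reference Audio", type='numpy')
    output_audio = gr.Audio(label="Output Audio", type='numpy')
    normalized_input = gr.Audio(label="Normalized Input Audio", type='numpy')
    param_output = gr.Textbox(label="Predicted Parameters", lines=5)
    process_button = gr.Button("Process Mastering Style Transfer")
    # The tuple returned by demo_process is unpacked in the same order as `outputs`.
    process_button.click(demo_process, inputs=[input_audio, reference_audio],
                         outputs=[output_audio, param_output, normalized_input])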
inference.py CHANGED
@@ -30,6 +30,11 @@ class MasteringStyleTransfer:
         self.effects_encoder = self.load_effects_encoder()
         self.mastering_converter = self.load_mastering_converter()
 
+        self.fx_normalizer = Audio_Effects_Normalizer(precomputed_feature_path=args.fx_norm_feature_path, \
+                                                        STEMS=['mixture'], \
+                                                        EFFECTS=['eq', 'imager', 'loudness'], \
+                                                        audio_extension=args.audio_extension)
+
     def load_effects_encoder(self):
         effects_encoder = Effects_Encoder(self.args.cfg_enc)
         reload_weights(effects_encoder, self.args.encoder_path, self.device)
@@ -60,68 +65,6 @@ class MasteringStyleTransfer:
         predicted_params = self.mastering_converter.get_last_predicted_params()
         return output_audio, predicted_params
 
-    # def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
-    #     fit_embedding = torch.nn.Parameter(initial_reference_feature)
-    #     optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
-
-    #     af_loss = AudioFeatureLoss(
-    #         weights=ito_config['af_weights'],
-    #         sample_rate=ito_config['sample_rate'],
-    #         stem_separation=False,
-    #         use_clap=False
-    #     )
-
-    #     min_loss = float('inf')
-    #     min_loss_step = 0
-    #     min_loss_output = None
-    #     min_loss_params = None
-    #     min_loss_embedding = None
-
-    #     loss_history = []
-    #     divergence_counter = 0
-    #     ito_log = []
-
-    #     for step in range(ito_config['num_steps']):
-    #         optimizer.zero_grad()
-
-    #         output_audio = self.mastering_converter(input_tensor, fit_embedding)
-    #         current_params = self.mastering_converter.get_last_predicted_params()
-
-    #         losses = af_loss(output_audio, reference_tensor)
-    #         total_loss = sum(losses.values())
-
-    #         loss_history.append(total_loss.item())
-
-    #         if total_loss < min_loss:
-    #             min_loss = total_loss.item()
-    #             min_loss_step = step
-    #             min_loss_output = output_audio.detach()
-    #             min_loss_params = current_params
-    #             min_loss_embedding = fit_embedding.detach().clone()
-
-    #         # Check for divergence
-    #         if len(loss_history) > 10 and total_loss > loss_history[-11]:
-    #             divergence_counter += 1
-    #         else:
-    #             divergence_counter = 0
-
-    #         # Log top 5 parameter differences
-    #         if step == 0:
-    #             initial_params = current_params
-    #         top_5_diff = self.get_top_n_diff_string(initial_params, current_params, top_n=5)
-    #         log_entry = f"Step {step + 1}\n Loss: {total_loss.item():.4f}\n{top_5_diff}\n"
-
-    #         if divergence_counter >= 10:
-    #             print(f"Optimization stopped early due to divergence at step {step}")
-    #             break
-
-    #         total_loss.backward()
-    #         optimizer.step()
-
-    #         yield log_entry, output_audio.detach(), current_params, step + 1, total_loss.item()
-
-    #     return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1
-
     def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
         fit_embedding = torch.nn.Parameter(initial_reference_feature)
         optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
@@ -167,11 +110,9 @@ class MasteringStyleTransfer:
             total_loss.backward()
             optimizer.step()
 
-            # yield all_results[-1]
-
         return all_results, min_loss_step
 
-    def preprocess_audio(self, audio, target_sample_rate=44100):
+    def preprocess_audio(self, audio, target_sample_rate=44100, is_input=False):
         sample_rate, data = audio
 
         # Normalize audio to -1 to 1 range
@@ -195,62 +136,119 @@ class MasteringStyleTransfer:
         else:
             raise ValueError(f"Unsupported audio shape: {data.shape}")
 
-        # Convert to torch tensor
-        data_tensor = torch.FloatTensor(data).unsqueeze(0)
-
         # Resample if necessary
         if sample_rate != target_sample_rate:
-            data_tensor = julius.resample_frac(data_tensor, sample_rate, target_sample_rate)
+            data = julius.resample_frac(torch.from_numpy(data), sample_rate, target_sample_rate).numpy()
+
+        # Apply fx normalization for input audio during mastering style transfer
+        if is_input:
+            data = self.fx_normalizer.normalize_audio(data, 'mixture')
+
+        # Convert to torch tensor
+        data_tensor = torch.FloatTensor(data).unsqueeze(0)
 
         return data_tensor.to(self.device)
 
-    def process_audio(self, input_audio, reference_audio, ito_reference_audio):
-        input_tensor = self.preprocess_audio(input_audio, self.args.sample_rate)
+    def process_audio(self, input_audio, reference_audio):
+        input_tensor = self.preprocess_audio(input_audio, self.args.sample_rate, is_input=True)
         reference_tensor = self.preprocess_audio(reference_audio, self.args.sample_rate)
-        ito_reference_tensor = self.preprocess_audio(ito_reference_audio, self.args.sample_rate)
 
         reference_feature = self.get_reference_embedding(reference_tensor)
 
         output_audio, predicted_params = self.mastering_style_transfer(input_tensor, reference_feature)
 
-        return output_audio, predicted_params, self.args.sample_rate
-
-    def print_predicted_params(self, predicted_params):
-        if predicted_params is None:
-            print("No predicted parameters available.")
-            return
-
-        print("Predicted Parameters:")
-        for fx_name, fx_params in predicted_params.items():
-            print(f"\n{fx_name.upper()}:")
-            if isinstance(fx_params, dict):
-                for param_name, param_value in fx_params.items():
-                    if isinstance(param_value, torch.Tensor):
-                        param_value = param_value.detach().cpu().numpy()
-                    print(f" {param_name}: {param_value}")
-            elif isinstance(fx_params, torch.Tensor):
-                param_value = fx_params.detach().cpu().numpy()
-                print(f" {param_value}")
-            else:
-                print(f" {fx_params}")
+        return output_audio, predicted_params, self.args.sample_rate, input_tensor
 
     def get_param_output_string(self, params):
         if params is None:
             return "No parameters available"
 
+        param_mapper = {
+            'EQ': {
+                'low_shelf_gain_db': ('Low Shelf Gain', 'dB', -20, 20),
+                'low_shelf_cutoff_freq': ('Low Shelf Cutoff', 'Hz', 20, 2000),
+                'low_shelf_q_factor': ('Low Shelf Q', '', 0.1, 5.0),
+                'band0_gain_db': ('Low-Mid Band Gain', 'dB', -20, 20),
+                'band0_cutoff_freq': ('Low-Mid Band Frequency', 'Hz', 80, 2000),
+                'band0_q_factor': ('Low-Mid Band Q', '', 0.1, 5.0),
+                'band1_gain_db': ('Mid Band Gain', 'dB', -20, 20),
+                'band1_cutoff_freq': ('Mid Band Frequency', 'Hz', 2000, 8000),
+                'band1_q_factor': ('Mid Band Q', '', 0.1, 5.0),
+                'band2_gain_db': ('High-Mid Band Gain', 'dB', -20, 20),
+                'band2_cutoff_freq': ('High-Mid Band Frequency', 'Hz', 8000, 12000),
+                'band2_q_factor': ('High-Mid Band Q', '', 0.1, 5.0),
+                'band3_gain_db': ('High Band Gain', 'dB', -20, 20),
+                'band3_cutoff_freq': ('High Band Frequency', 'Hz', 12000, 20000),  # Assuming sample_rate is 44100
+                'band3_q_factor': ('High Band Q', '', 0.1, 5.0),
+                'high_shelf_gain_db': ('High Shelf Gain', 'dB', -20, 20),
+                'high_shelf_cutoff_freq': ('High Shelf Cutoff', 'Hz', 4000, 20000),  # Assuming sample_rate is 44100
+                'high_shelf_q_factor': ('High Shelf Q', '', 0.1, 5.0),
+            },
+            'DISTORTION': {
+                'drive_db': ('Drive', 'dB', 0, 8),
+                'parallel_weight_factor': ('Dry/Wet Mix', '%', 0, 100),
+            },
+            'MULTIBAND_COMP': {
+                'low_cutoff': ('Low/Mid Crossover', 'Hz', 20, 1000),
+                'high_cutoff': ('Mid/High Crossover', 'Hz', 1000, 20000),
+                'parallel_weight_factor': ('Dry/Wet Mix', '%', 0, 100),
+                'low_shelf_comp_thresh': ('Low Band Comp Threshold', 'dB', -60, 0),
+                'low_shelf_comp_ratio': ('Low Band Comp Ratio', ':1', 1, 20),
+                'low_shelf_exp_thresh': ('Low Band Exp Threshold', 'dB', -60, 0),
+                'low_shelf_exp_ratio': ('Low Band Exp Ratio', ':1', 1, 20),
+                'low_shelf_at': ('Low Band Attack Time', 'ms', 5, 100),
+                'low_shelf_rt': ('Low Band Release Time', 'ms', 5, 100),
+                'mid_band_comp_thresh': ('Mid Band Comp Threshold', 'dB', -60, 0),
+                'mid_band_comp_ratio': ('Mid Band Comp Ratio', ':1', 1, 20),
+                'mid_band_exp_thresh': ('Mid Band Exp Threshold', 'dB', -60, 0),
+                'mid_band_exp_ratio': ('Mid Band Exp Ratio', ':1', 1, 20),
+                'mid_band_at': ('Mid Band Attack Time', 'ms', 5, 100),
+                'mid_band_rt': ('Mid Band Release Time', 'ms', 5, 100),
+                'high_shelf_comp_thresh': ('High Band Comp Threshold', 'dB', -60, 0),
+                'high_shelf_comp_ratio': ('High Band Comp Ratio', ':1', 1, 20),
+                'high_shelf_exp_thresh': ('High Band Exp Threshold', 'dB', -60, 0),
+                'high_shelf_exp_ratio': ('High Band Exp Ratio', ':1', 1, 20),
+                'high_shelf_at': ('High Band Attack Time', 'ms', 5, 100),
+                'high_shelf_rt': ('High Band Release Time', 'ms', 5, 100),
+            },
+            'GAIN': {
+                'gain_db': ('Output Gain', 'dB', -24, 24),
+            },
+            'IMAGER': {
+                'width': ('Stereo Width', '', 0, 1),
+            },
+            'LIMITER': {
+                'threshold': ('Threshold', 'dB', -60, 0),
+                'at': ('Attack Time', 'ms', 5, 100),
+                'rt': ('Release Time', 'ms', 5, 100),
+            },
+        }
+
         output = []
         for fx_name, fx_params in params.items():
-            output.append(f"{fx_name.upper()}:")
+            output.append(f"{fx_name}:")
             if isinstance(fx_params, dict):
                 for param_name, param_value in fx_params.items():
                     if isinstance(param_value, torch.Tensor):
                         param_value = param_value.item()
-                    output.append(f" {param_name}: {param_value:.2f}")
-            elif isinstance(fx_params, torch.Tensor):
-                output.append(f" {fx_params.item():.2f}")
+
+                    if fx_name in param_mapper and param_name in param_mapper[fx_name]:
+                        friendly_name, unit, min_val, max_val = param_mapper[fx_name][param_name]
+                        if fx_name == 'IMAGER' and param_name == 'width':
+                            # Convert width to a more intuitive scale
+                            width_percentage = param_value * 200
+                            output.append(f" {friendly_name}: {width_percentage:.2f}% (Range: 0-200%)")
+                        else:
+                            output.append(f" {friendly_name}: {param_value:.2f} {unit} (Range: {min_val}-{max_val})")
+                    else:
+                        output.append(f" {param_name}: {param_value:.2f}")
             else:
-                output.append(f" {fx_params:.2f}")
-
+                if fx_name == 'IMAGER':
+                    width_percentage = fx_params.item() * 200
+                    output.append(f" Stereo Width: {width_percentage:.2f}% (Range: 0-200%)")
+                else:
+                    output.append(f" {fx_params.item():.2f}")
+
         return "\n".join(output)
 
     def get_top_n_diff_string(self, initial_params, ito_params, top_n=5):
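The reworked `preprocess_audio` resamples the raw NumPy buffer with julius (round-tripping through a torch tensor) and, only when `is_input=True`, applies the new `Audio_Effects_Normalizer` before conversion to a batched tensor. A self-contained sketch of the resampling round-trip (illustration only, not part of the commit; the 22050 -> 44100 rates and the noise buffer are arbitrary example values):

import numpy as np
import torch
import julius

sample_rate, target_sample_rate = 22050, 44100
data = (0.1 * np.random.randn(sample_rate * 2)).astype(np.float32)  # two seconds of mono noise

if sample_rate != target_sample_rate:
    # julius operates on torch tensors, so convert, resample along the last axis, convert back
    data = julius.resample_frac(torch.from_numpy(data), sample_rate, target_sample_rate).numpy()

data_tensor = torch.FloatTensor(data).unsqueeze(0)  # add the leading batch dimension, as in the diff
print(data_tensor.shape)  # torch.Size([1, 88200])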
modules/common_audioeffects.py ADDED
@@ -0,0 +1,1537 @@
1
+ """
2
+ Audio effects for data augmentation.
3
+
4
+ Several audio effects can be combined into an augmentation chain.
5
+
6
+ Important note: We assume that the parallelization during training is done using
7
+ multi-processing and not multi-threading. Hence, we do not need the
8
+ `@sox.sox_context()` decorators as discussed in this
9
+ [thread](https://github.com/pseeth/soxbindings/issues/4).
10
+
11
+ AI Music Technology Group, Sony Group Corporation
12
+ AI Speech and Sound Group, Sony Europe
13
+
14
+
15
+ This implementation originally belongs to Sony Group Corporation,
16
+ which was introduced in the work "Automatic music mixing with deep learning and out-of-domain data".
17
+ Original repo link: https://github.com/sony/FxNorm-automix
18
+ This work modifies a few implementations from the original repo to suit the task.
19
+ """
20
+
21
+ from itertools import permutations
22
+ import logging
23
+ import numpy as np
24
+ import pymixconsole as pymc
25
+ from pymixconsole.parameter import Parameter
26
+ from pymixconsole.parameter_list import ParameterList
27
+ from pymixconsole.processor import Processor
28
+ from random import shuffle
29
+ from scipy.signal import oaconvolve
30
+ import soxbindings as sox
31
+ from typing import List, Optional, Tuple, Union
32
+ from numba import jit
33
+
34
+ # prevent pysox from logging warnings regarding non-optimal timestretch factors
35
+ logging.getLogger('sox').setLevel(logging.ERROR)
36
+
37
+
38
+ # Monkey-Patch `Processor` for convenience
39
+ # (a) Allow `None` as blocksize if processor can work on variable-length audio
40
+ def new_init(self, name, parameters, block_size, sample_rate, dtype='float32'):
41
+ """
42
+ Initialize processor.
43
+
44
+ Args:
45
+ self: Reference to object
46
+ name (str): Name of processor.
47
+ parameters (parameter_list): Parameters for this processor.
48
+ block_size (int): Size of blocks for blockwise processing.
49
+ Can also be `None` if full audio can be processed at once.
50
+ sample_rate (int): Sample rate of input audio. Use `None` if effect is independent of this value.
51
+ dtype (str): data type of samples
52
+ """
53
+ self.name = name
54
+ self.parameters = parameters
55
+ self.block_size = block_size
56
+ self.sample_rate = sample_rate
57
+ self.dtype = dtype
58
+
59
+
60
+ # (b) make code simpler
61
+ def new_update(self, parameter_name):
62
+ """
63
+ Update processor after randomization of parameters.
64
+
65
+ Args:
66
+ self: Reference to object.
67
+ parameter_name (str): Parameter whose value has changed.
68
+ """
69
+ pass
70
+
71
+
72
+ # (c) representation for nice print
73
+ def new_repr(self):
74
+ """
75
+ Create human-readable representation.
76
+
77
+ Args:
78
+ self: Reference to object.
79
+
80
+ Returns:
81
+ string representation of object.
82
+ """
83
+ return f'Processor(name={self.name!r}, parameters={self.parameters!r}'
84
+
85
+
86
+ Processor.__init__ = new_init
87
+ Processor.__repr__ = new_repr
88
+ Processor.update = new_update
89
+
90
+
91
+ class AugmentationChain:
92
+ """Basic audio Fx chain which is used for data augmentation."""
93
+
94
+ def __init__(self,
95
+ fxs: Optional[List[Tuple[Union[Processor, 'AugmentationChain'], float, bool]]] = [],
96
+ shuffle: Optional[bool] = False,
97
+ parallel: Optional[bool] = False,
98
+ parallel_weight_factor = None,
99
+ randomize_param_value=True):
100
+ """
101
+ Create augmentation chain from the dictionary `fxs`.
102
+
103
+ Args:
104
+ fxs (list of tuples): First tuple element is an instances of `pymc.processor` or `AugmentationChain` that
105
+ we want to use for data augmentation. Second element gives probability that effect should be applied.
106
+ Third element defines, whether the processed signal is normalized by the RMS of the input.
107
+ shuffle (bool): If `True` then order of Fx are changed whenever chain is applied.
108
+ """
109
+ self.fxs = fxs
110
+ self.shuffle = shuffle
111
+ self.parallel = parallel
112
+ self.parallel_weight_factor = parallel_weight_factor
113
+ self.randomize_param_value = randomize_param_value
114
+
115
+ def apply_processor(self, x, processor: Processor, rms_normalize):
116
+ """
117
+ Pass audio in `x` through `processor` and output the respective processed audio.
118
+
119
+ Args:
120
+ x (Numpy array): Input audio of shape `n_samples` x `n_channels`.
121
+ processor (Processor): Audio effect that we want to apply.
122
+ rms_normalize (bool): If `True`, the processed signal is normalized by the RMS of the signal.
123
+
124
+ Returns:
125
+ Numpy array: Processed audio of shape `n_samples` x `n_channels` (same size as `x`)
126
+ """
127
+
128
+ n_samples_input = x.shape[0]
129
+
130
+ if processor.block_size is None:
131
+ y = processor.process(x)
132
+ else:
133
+ # make sure that n_samples is a multiple of `processor.block_size`
134
+ if x.shape[0] % processor.block_size != 0:
135
+ n_pad = processor.block_size - x.shape[0] % processor.block_size
136
+ x = np.pad(x, ((0, n_pad), (0, 0)), mode='reflect')
137
+
138
+ y = np.zeros_like(x)
139
+ for idx in range(0, x.shape[0], processor.block_size):
140
+ y[idx:idx+processor.block_size, :] = processor.process(x[idx:idx+processor.block_size, :])
141
+
142
+ if rms_normalize:
143
+ # normalize output energy such that it is the same as the input energy
144
+ scale = np.sqrt(np.mean(np.square(x)) / np.maximum(1e-7, np.mean(np.square(y))))
145
+ y *= scale
146
+
147
+ # return audio of same length as x
148
+ return y[:n_samples_input, :]
149
+
150
+ def apply_same_processor(self, x_list, processor: Processor, rms_normalize):
151
+ for i in range(len(x_list)):
152
+ x_list[i] = self.apply_processor(x_list[i], processor, rms_normalize)
153
+
154
+ return x_list
155
+
156
+ def __call__(self, x_list):
157
+ """
158
+ Apply the same augmentation chain to audio tracks in list `x_list`.
159
+
160
+ Args:
161
+ x_list (list of Numpy array) : List of audio samples of shape `n_samples` x `n_channels`.
162
+
163
+ Returns:
164
+ y_list (list of Numpy array) : List of processed audio of same shape as `x_list` where the same effects have been applied.
165
+ """
166
+ # randomly shuffle effect order if `self.shuffle` is True
167
+ if self.shuffle:
168
+ shuffle(self.fxs)
169
+
170
+ # apply effects with probabilities given in `self.fxs`
171
+ y_list = x_list.copy()
172
+ for fx, p, rms_normalize in self.fxs:
173
+ if np.random.rand() < p:
174
+ if isinstance(fx, Processor):
175
+ # randomize all effect parameters (also calls `update()` for each processor)
176
+ if self.randomize_param_value:
177
+ fx.randomize()
178
+ else:
179
+ fx.update(None)
180
+
181
+ # apply processor
182
+ y_list = self.apply_same_processor(y_list, fx, rms_normalize)
183
+ else:
184
+ y_list = fx(y_list)
185
+
186
+ if self.parallel:
187
+ # weighting factor of input signal in the range of (0.0 ~ 0.5)
188
+ weight_in = self.parallel_weight_factor if self.parallel_weight_factor else np.random.rand() / 2.
189
+ for i in range(len(y_list)):
190
+ y_list[i] = weight_in*x_list[i] + (1-weight_in)*y_list[i]
191
+
192
+ return y_list
193
+
194
+ def __repr__(self):
195
+ """
196
+ Human-readable representation.
197
+
198
+ Returns:
199
+ string representation of object.
200
+ """
201
+ return f'AugmentationChain(fxs={self.fxs!r}, shuffle={self.shuffle!r})'
202
+
203
+
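A minimal usage sketch of `AugmentationChain` (illustration only, not part of the added file), assuming the module is importable as `modules.common_audioeffects` and using the `Gain` and `Distortion` processors defined further down; each `fxs` entry is a `(processor, apply_probability, rms_normalize)` tuple:

import numpy as np
from modules.common_audioeffects import AugmentationChain, Distortion, Gain

chain = AugmentationChain(
    fxs=[
        (Gain(), 1.0, False),                        # always apply a random gain
        (Distortion(sample_rate=44100), 0.5, True),  # distortion half the time, RMS-matched to the input
    ],
    shuffle=True,  # randomize the effect order on every call
)

stems = [(0.1 * np.random.randn(44100, 2)).astype(np.float32)]  # one 1-second stereo stem
augmented = chain(stems)  # the same randomized effects are applied to every stem in the list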
204
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% DISTORTION %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
205
+ def hard_clip(x, threshold_dB, drive):
206
+ """
207
+ Hard clip distortion.
208
+
209
+ Args:
210
+ x: input audio
211
+ threshold_dB: threshold
212
+ drive: drive
213
+
214
+ Returns:
215
+ (Numpy array): distorted audio
216
+ """
217
+ drive_linear = np.power(10., drive / 20.).astype(np.float32)
218
+ threshold_linear = 10. ** (threshold_dB / 20.)
219
+ return np.clip(x * drive_linear, -threshold_linear, threshold_linear)
220
+
221
+
222
+ def overdrive(x, drive, colour, sample_rate):
223
+ """
224
+ Overdrive distortion.
225
+
226
+ Args:
227
+ x: input audio
228
+ drive: Controls the amount of distortion (dB).
229
+ colour: Controls the amount of even harmonic content in the output (dB)
230
+ sample_rate: sampling rate
231
+
232
+ Returns:
233
+ (Numpy array): distorted audio
234
+ """
235
+ scale = np.max(np.abs(x))
236
+ if scale > 0.9:
237
+ clips = True
238
+ x = x * (0.9 / scale)
239
+ else:
240
+ clips = False
241
+
242
+ tfm = sox.Transformer()
243
+ tfm.overdrive(gain_db=drive, colour=colour)
244
+ y = tfm.build_array(input_array=x, sample_rate_in=sample_rate).astype(np.float32)
245
+
246
+ if clips:
247
+ y *= scale / 0.9 # rescale output to original scale
248
+ return y
249
+
250
+
251
+ def hyperbolic_tangent(x, drive):
252
+ """
253
+ Hyperbolic Tanh distortion.
254
+
255
+ Args:
256
+ x: input audio
257
+ drive: drive
258
+
259
+ Returns:
260
+ (Numpy array): distorted audio
261
+ """
262
+ drive_linear = np.power(10., drive / 20.).astype(np.float32)
263
+ return np.tanh(2. * x * drive_linear)
264
+
265
+
266
+ def soft_sine(x, drive):
267
+ """
268
+ Soft sine distortion.
269
+
270
+ Args:
271
+ x: input audio
272
+ drive: drive
273
+
274
+ Returns:
275
+ (Numpy array): distorted audio
276
+ """
277
+ drive_linear = np.power(10., drive / 20.).astype(np.float32)
278
+ y = np.clip(x * drive_linear, -np.pi/4.0, np.pi/4.0)
279
+ return np.sin(2. * y)
280
+
281
+
282
+ def bit_crusher(x, bits):
283
+ """
284
+ Bit crusher distortion.
285
+
286
+ Args:
287
+ x: input audio
288
+ bits: bits
289
+
290
+ Returns:
291
+ (Numpy array): distorted audio
292
+ """
293
+ return np.rint(x * (2 ** bits)) / (2 ** bits)
294
+
295
+
296
+ class Distortion(Processor):
297
+ """
298
+ Distortion processor.
299
+
300
+ Processor parameters:
301
+ mode (str): Currently supports the following five modes: hard_clip, overdrive, soft_sine, tanh, bit_crusher.
302
+ Each mode has different parameters such as threshold, factor, or bits.
303
+ threshold (float): threshold
304
+ drive (float): drive
305
+ factor (float): factor
306
+ limit_range (float): limit range
307
+ bits (int): bits
308
+ """
309
+
310
+ def __init__(self, sample_rate, name='Distortion', parameters=None):
311
+ """
312
+ Initialize processor.
313
+
314
+ Args:
315
+ sample_rate (int): sample rate.
316
+ name (str): Name of processor.
317
+ parameters (parameter_list): Parameters for this processor.
318
+ """
319
+ super().__init__(name, None, block_size=None, sample_rate=sample_rate)
320
+ if not parameters:
321
+ self.parameters = ParameterList()
322
+ self.parameters.add(Parameter('mode', 'hard_clip', 'string',
323
+ options=['hard_clip',
324
+ 'overdrive',
325
+ 'soft_sine',
326
+ 'tanh',
327
+ 'bit_crusher']))
328
+ self.parameters.add(Parameter('threshold', 0.0, 'float',
329
+ units='dB', maximum=0.0, minimum=-20.0))
330
+ self.parameters.add(Parameter('drive', 0.0, 'float',
331
+ units='dB', maximum=20.0, minimum=0.0))
332
+ self.parameters.add(Parameter('colour', 20.0, 'float',
333
+ maximum=100.0, minimum=0.0))
334
+ self.parameters.add(Parameter('bits', 12, 'int',
335
+ maximum=12, minimum=8))
336
+
337
+ def process(self, x):
338
+ """
339
+ Process audio.
340
+
341
+ Args:
342
+ x (Numpy array): input audio of size `n_samples x n_channels`.
343
+
344
+ Returns:
345
+ (Numpy array): distorted audio of size `n_samples x n_channels`.
346
+ """
347
+ if self.parameters.mode.value == 'hard_clip':
348
+ y = hard_clip(x, self.parameters.threshold.value, self.parameters.drive.value)
349
+ elif self.parameters.mode.value == 'overdrive':
350
+ y = overdrive(x, self.parameters.drive.value,
351
+ self.parameters.colour.value, self.sample_rate)
352
+ elif self.parameters.mode.value == 'soft_sine':
353
+ y = soft_sine(x, self.parameters.drive.value)
354
+ elif self.parameters.mode.value == 'tanh':
355
+ y = hyperbolic_tangent(x, self.parameters.drive.value)
356
+ elif self.parameters.mode.value == 'bit_crusher':
357
+ y = bit_crusher(x, self.parameters.bits.value)
358
+
359
+ # If the output has low amplitude (some distortion settings can "crush" down the amplitude)
360
+ # then it's normalised to the input's amplitude
361
+ x_max = np.max(np.abs(x)) + 1e-8
362
+ o_max = np.max(np.abs(y)) + 1e-8
363
+ if x_max > o_max:
364
+ y = y*(x_max/o_max)
365
+
366
+ return y
367
+
368
+
369
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% EQUALISER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
370
+ class Equaliser(Processor):
371
+ """
372
+ Five band parametric equaliser (two shelves and three central bands).
373
+
374
+ All gains are set in dB values and range from `MIN_GAIN` dB to `MAX_GAIN` dB.
375
+ This processor is implemented as a cascade of five biquad IIR filters
376
+ that are implemented using the infamous cookbook formulae from RBJ.
377
+
378
+ Processor parameters:
379
+ low_shelf_gain (float), low_shelf_freq (float)
380
+ first_band_gain (float), first_band_freq (float), first_band_q (float)
381
+ second_band_gain (float), second_band_freq (float), second_band_q (float)
382
+ third_band_gain (float), third_band_freq (float), third_band_q (float)
383
+
384
+ original from https://github.com/csteinmetz1/pymixconsole/blob/master/pymixconsole/processors/equaliser.py
385
+ """
386
+
387
+ def __init__(self, n_channels,
388
+ sample_rate,
389
+ gain_range=(-15.0, 15.0),
390
+ q_range=(0.1, 2.0),
391
+ bands=['low_shelf', 'first_band', 'second_band', 'third_band', 'high_shelf'],
392
+ hard_clip=False,
393
+ name='Equaliser', parameters=None):
394
+ """
395
+ Initialize processor.
396
+
397
+ Args:
398
+ n_channels (int): Number of audio channels.
399
+ sample_rate (int): Sample rate of audio.
400
+ gain_range (tuple of floats): minimum and maximum gain that can be used.
401
+ q_range (tuple of floats): minimum and maximum q value.
402
+ hard_clip (bool): Whether we clip to [-1, 1.] after processing.
403
+ name (str): Name of processor.
404
+ parameters (parameter_list): Parameters for this processor.
405
+ """
406
+ super().__init__(name, parameters=parameters, block_size=None, sample_rate=sample_rate)
407
+
408
+ self.n_channels = n_channels
409
+
410
+ MIN_GAIN, MAX_GAIN = gain_range
411
+ MIN_Q, MAX_Q = q_range
412
+
413
+ if not parameters:
414
+ self.parameters = ParameterList()
415
+ # low shelf parameters -------
416
+ self.parameters.add(Parameter('low_shelf_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
417
+ self.parameters.add(Parameter('low_shelf_freq', 80.0, 'float', minimum=30.0, maximum=200.0))
418
+ # first band parameters ------
419
+ self.parameters.add(Parameter('first_band_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
420
+ self.parameters.add(Parameter('first_band_freq', 400.0, 'float', minimum=200.0, maximum=1000.0))
421
+ self.parameters.add(Parameter('first_band_q', 0.7, 'float', minimum=MIN_Q, maximum=MAX_Q))
422
+ # second band parameters -----
423
+ self.parameters.add(Parameter('second_band_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
424
+ self.parameters.add(Parameter('second_band_freq', 2000.0, 'float', minimum=1000.0, maximum=3000.0))
425
+ self.parameters.add(Parameter('second_band_q', 0.7, 'float', minimum=MIN_Q, maximum=MAX_Q))
426
+ # third band parameters ------
427
+ self.parameters.add(Parameter('third_band_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
428
+ self.parameters.add(Parameter('third_band_freq', 4000.0, 'float', minimum=3000.0, maximum=8000.0))
429
+ self.parameters.add(Parameter('third_band_q', 0.7, 'float', minimum=MIN_Q, maximum=MAX_Q))
430
+ # high shelf parameters ------
431
+ self.parameters.add(Parameter('high_shelf_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
432
+ self.parameters.add(Parameter('high_shelf_freq', 8000.0, 'float', minimum=5000.0, maximum=10000.0))
433
+
434
+ self.bands = bands
435
+ self.filters = self.setup_filters()
436
+ self.hard_clip = hard_clip
437
+
438
+ def setup_filters(self):
439
+ """
440
+ Create IIR filters.
441
+
442
+ Returns:
443
+ IIR filters
444
+ """
445
+ filters = {}
446
+
447
+ for band in self.bands:
448
+
449
+ G = getattr(self.parameters, band + '_gain').value
450
+ fc = getattr(self.parameters, band + '_freq').value
451
+ rate = self.sample_rate
452
+
453
+ if band in ['low_shelf', 'high_shelf']:
454
+ Q = 0.707
455
+ filter_type = band
456
+ else:
457
+ Q = getattr(self.parameters, band + '_q').value
458
+ filter_type = 'peaking'
459
+
460
+ filters[band] = pymc.components.iirfilter.IIRfilter(G, Q, fc, rate, filter_type, n_channels=self.n_channels)
461
+
462
+ return filters
463
+
464
+ def update_filter(self, band):
465
+ """
466
+ Update filters.
467
+
468
+ Args:
469
+ band (str): Band that should be updated.
470
+ """
471
+ self.filters[band].G = getattr(self.parameters, band + '_gain').value
472
+ self.filters[band].fc = getattr(self.parameters, band + '_freq').value
473
+ self.filters[band].rate = self.sample_rate
474
+
475
+ if band in ['first_band', 'second_band', 'third_band']:
476
+ self.filters[band].Q = getattr(self.parameters, band + '_q').value
477
+
478
+ def update(self, parameter_name=None):
479
+ """
480
+ Update processor after randomization of parameters.
481
+
482
+ Args:
483
+ parameter_name (str): Parameter whose value has changed.
484
+ """
485
+ if parameter_name is not None:
486
+ bands = ['_'.join(parameter_name.split('_')[:2])]
487
+ else:
488
+ bands = self.bands
489
+
490
+ for band in bands:
491
+ self.update_filter(band)
492
+
493
+ for _band, iirfilter in self.filters.items():
494
+ iirfilter.reset_state()
495
+
496
+ def reset_state(self):
497
+ """Reset state."""
498
+ for _band, iirfilter in self.filters.items():
499
+ iirfilter.reset_state()
500
+
501
+ def process(self, x):
502
+ """
503
+ Process audio.
504
+
505
+ Args:
506
+ x (Numpy array): input audio of size `n_samples x n_channels`.
507
+
508
+ Returns:
509
+ (Numpy array): equalized audio of size `n_samples x n_channels`.
510
+ """
511
+ for _band, iirfilter in self.filters.items():
512
+ iirfilter.reset_state()
513
+ x = iirfilter.apply_filter(x)
514
+
515
+ if self.hard_clip:
516
+ x = np.clip(x, -1.0, 1.0)
517
+
518
+ # make sure that we have float32 as IIR filtering returns float64
519
+ x = x.astype(np.float32)
520
+
521
+ # make sure that we have two dimensions (if `n_channels == 1`)
522
+ if x.ndim == 1:
523
+ x = x[:, np.newaxis]
524
+
525
+ return x
526
+
527
+
528
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% COMPRESSOR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
529
+ @jit(nopython=True)
530
+ def compressor_process(x, threshold, attack_time, release_time, ratio, makeup_gain, sample_rate, yL_prev):
531
+ """
532
+ Apply compressor.
533
+
534
+ Args:
535
+ x (Numpy array): audio data.
536
+ threshold: threshold in dB.
537
+ attack_time: attack_time in ms.
538
+ release_time: release_time in ms.
539
+ ratio: ratio.
540
+ makeup_gain: makeup_gain.
541
+ sample_rate: sample rate.
542
+ yL_prev: internal state of the envelope gain.
543
+
544
+ Returns:
545
+ compressed audio.
546
+ """
547
+ M = x.shape[0]
548
+ x_g = np.zeros(M)
549
+ x_l = np.zeros(M)
550
+ y_g = np.zeros(M)
551
+ y_l = np.zeros(M)
552
+ c = np.zeros(M)
553
+ yL_prev = 0.
554
+
555
+ alpha_attack = np.exp(-1/(0.001 * sample_rate * attack_time))
556
+ alpha_release = np.exp(-1/(0.001 * sample_rate * release_time))
557
+
558
+ for i in np.arange(M):
559
+ if np.abs(x[i]) < 0.000001:
560
+ x_g[i] = -120.0
561
+ else:
562
+ x_g[i] = 20 * np.log10(np.abs(x[i]))
563
+
564
+ if ratio > 1:
565
+ if x_g[i] >= threshold:
566
+ y_g[i] = threshold + (x_g[i] - threshold) / ratio
567
+ else:
568
+ y_g[i] = x_g[i]
569
+ elif ratio < 1:
570
+ if x_g[i] <= threshold:
571
+ y_g[i] = threshold + (x_g[i] - threshold) / (1/ratio)
572
+ else:
573
+ y_g[i] = x_g[i]
574
+
575
+ x_l[i] = x_g[i] - y_g[i]
576
+
577
+ if x_l[i] > yL_prev:
578
+ y_l[i] = alpha_attack * yL_prev + (1 - alpha_attack) * x_l[i]
579
+ else:
580
+ y_l[i] = alpha_release * yL_prev + (1 - alpha_release) * x_l[i]
581
+
582
+ c[i] = np.power(10.0, (makeup_gain - y_l[i]) / 20.0)
583
+ yL_prev = y_l[i]
584
+
585
+ y = x * c
586
+
587
+ return y, yL_prev
588
+
589
+
590
+ class Compressor(Processor):
591
+ """
592
+ Single band stereo dynamic range compressor.
593
+
594
+ Processor parameters:
595
+ threshold (float)
596
+ attack_time (float)
597
+ release_time (float)
598
+ ratio (float)
599
+ makeup_gain (float)
600
+ """
601
+
602
+ def __init__(self, sample_rate, name='Compressor', parameters=None):
603
+ """
604
+ Initialize processor.
605
+
606
+ Args:
607
+ sample_rate (int): Sample rate of input audio.
608
+ name (str): Name of processor.
609
+ parameters (parameter_list): Parameters for this processor.
610
+ """
611
+ super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)
612
+
613
+ if not parameters:
614
+ self.parameters = ParameterList()
615
+ self.parameters.add(Parameter('threshold', -20.0, 'float', units='dB', minimum=-80.0, maximum=-5.0))
616
+ self.parameters.add(Parameter('attack_time', 2.0, 'float', units='ms', minimum=1., maximum=20.0))
617
+ self.parameters.add(Parameter('release_time', 100.0, 'float', units='ms', minimum=50.0, maximum=500.0))
618
+ self.parameters.add(Parameter('ratio', 4.0, 'float', minimum=4., maximum=40.0))
619
+ # we remove makeup_gain parameter inside the Compressor
620
+
621
+ # store internal state (for block-wise processing)
622
+ self.yL_prev = None
623
+
624
+ def process(self, x):
625
+ """
626
+ Process audio.
627
+
628
+ Args:
629
+ x (Numpy array): input audio of size `n_samples x n_channels`.
630
+
631
+ Returns:
632
+ (Numpy array): compressed audio of size `n_samples x n_channels`.
633
+ """
634
+ if self.yL_prev is None:
635
+ self.yL_prev = [0.] * x.shape[1]
636
+
637
+ if not self.parameters.threshold.value == 0.0 or not self.parameters.ratio.value == 1.0:
638
+ y = np.zeros_like(x)
639
+
640
+ for ch in range(x.shape[1]):
641
+ y[:, ch], self.yL_prev[ch] = compressor_process(x[:, ch],
642
+ self.parameters.threshold.value,
643
+ self.parameters.attack_time.value,
644
+ self.parameters.release_time.value,
645
+ self.parameters.ratio.value,
646
+ 0.0, # makeup_gain = 0
647
+ self.sample_rate,
648
+ self.yL_prev[ch])
649
+ else:
650
+ y = x
651
+
652
+ return y
653
+
654
+ def update(self, parameter_name=None):
655
+ """
656
+ Update processor after randomization of parameters.
657
+
658
+ Args:
659
+ parameter_name (str): Parameter whose value has changed.
660
+ """
661
+ self.yL_prev = None
662
+
663
+
664
+ # %%%%%%%%%%%%%%%%%%%%%%%%%% CONVOLUTIONAL REVERB %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
665
+ class ConvolutionalReverb(Processor):
666
+ """
667
+ Convolutional Reverb.
668
+
669
+ Processor parameters:
670
+ wet_dry (float): Wet/dry ratio.
671
+ decay (float): Applies a fade out to the impulse response.
672
+ pre_delay (float): Value in ms. Shifts the IR in time:
673
+ A positive value produces a traditional delay between the dry signal and the wet.
674
+ A negative delay is, in reality, zero delay, but effectively trims off the start of IR,
675
+ so the reverb response begins at a point further in.
676
+ """
677
+
678
+ def __init__(self, impulse_responses, sample_rate, name='ConvolutionalReverb', parameters=None):
679
+ """
680
+ Initialize processor.
681
+
682
+ Args:
683
+ impulse_responses (list): List with impulse responses created by `common_dataprocessing.create_dataset`
684
+ sample_rate (int): Sample rate that we should assume (used for fade-out computation)
685
+ name (str): Name of processor.
686
+ parameters (parameter_list): Parameters for this processor.
687
+
688
+ Raises:
689
+ ValueError: if no impulse responses are provided.
690
+ """
691
+ super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)
692
+
693
+ if impulse_responses is None:
694
+ raise ValueError('List of impulse responses must be provided for ConvolutionalReverb processor.')
695
+ self.impulse_responses = impulse_responses
696
+
697
+ if not parameters:
698
+ self.parameters = ParameterList()
699
+ self.max_ir_num = len(max(impulse_responses, key=len))
700
+ self.parameters.add(Parameter('index', 0, 'int', minimum=0, maximum=len(impulse_responses)))
701
+ self.parameters.add(Parameter('index_ir', 0, 'int', minimum=0, maximum=self.max_ir_num))
702
+ self.parameters.add(Parameter('wet', 1.0, 'float', minimum=1.0, maximum=1.0))
703
+ self.parameters.add(Parameter('dry', 0.0, 'float', minimum=0.0, maximum=0.0))
704
+ self.parameters.add(Parameter('decay', 1.0, 'float', minimum=1.0, maximum=1.0))
705
+ self.parameters.add(Parameter('pre_delay', 0, 'int', units='ms', minimum=0, maximum=0))
706
+
707
+ def update(self, parameter_name=None):
708
+ """
709
+ Update processor after randomization of parameters.
710
+
711
+ Args:
712
+ parameter_name (str): Parameter whose value has changed.
713
+ """
714
+ # we sample IR with a uniform random distribution according to RT60 values
715
+ chosen_ir_duration = self.impulse_responses[self.parameters.index.value]
716
+ chosen_ir_idx = self.parameters.index_ir.value % len(chosen_ir_duration)
717
+ self.h = np.copy(chosen_ir_duration[chosen_ir_idx]['impulse_response']())
718
+
719
+ # fade out the impulse based on the decay setting (starting from peak value)
720
+ if self.parameters.decay.value < 1.:
721
+ idx_peak = np.argmax(np.max(np.abs(self.h), axis=1), axis=0)
722
+ fstart = np.minimum(self.h.shape[0],
723
+ idx_peak + int(self.parameters.decay.value * (self.h.shape[0] - idx_peak)))
724
+ fstop = np.minimum(self.h.shape[0], fstart + int(0.020*self.sample_rate)) # constant 20 ms fade out
725
+ flen = fstop - fstart
726
+
727
+ fade = np.arange(1, flen+1, dtype=self.dtype)/flen
728
+ fade = np.power(0.1, fade * 5)
729
+ self.h[fstart:fstop, :] *= fade[:, np.newaxis]
730
+ self.h = self.h[:fstop]
731
+
732
+ def process(self, x):
733
+ """
734
+ Process audio.
735
+
736
+ Args:
737
+ x (Numpy array): input audio of size `n_samples x n_channels`.
738
+
739
+ Returns:
740
+ (Numpy array): reverbed audio of size `n_samples x n_channels`.
741
+ """
742
+ # reshape IR to the correct size
743
+ n_channels = x.shape[1]
744
+ if self.h.shape[1] == 1 and n_channels > 1:
745
+ self.h = np.hstack([self.h] * n_channels) # repeat mono IR for multi-channel input
746
+ if self.h.shape[1] > 1 and n_channels == 1:
747
+ self.h = self.h[:, np.random.randint(self.h.shape[1]), np.newaxis] # randomly choose one IR channel
748
+
749
+ if self.parameters.wet.value == 0.0:
750
+ return x
751
+ else:
752
+ # perform convolution to get wet signal
753
+ y = oaconvolve(x, self.h, mode='full', axes=0)
754
+
755
+ # cut out wet signal (compensating for the delay that the IR is introducing + predelay)
756
+ idx = np.argmax(np.max(np.abs(self.h), axis=1), axis=0)
757
+ idx += int(0.001 * np.abs(self.parameters.pre_delay.value) * self.sample_rate)
758
+
759
+ idx = np.clip(idx, 0, self.h.shape[0]-1)
760
+
761
+ y = y[idx:idx+x.shape[0], :]
762
+
763
+ # return weighted sum of dry and wet signal
764
+ return self.parameters.dry.value * x + self.parameters.wet.value * y
765
+
766
+
767
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%% HAAS EFFECT %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
768
+ def haas_process(x, delay, feedback, wet_channel):
769
+ """
770
+ Add Haas effect to audio.
771
+
772
+ Args:
773
+ x (Numpy array): input audio.
774
+ delay: Delay that we apply to one of the channels (in samples).
775
+ feedback: Feedback value.
776
+ wet_channel: Which channel we process (`left` or `right`).
777
+
778
+ Returns:
779
+ (Numpy array): Audio with Haas effect.
780
+ """
781
+ y = np.copy(x)
782
+ if wet_channel == 'left':
783
+ y[:, 0] += feedback * np.roll(x[:, 0], delay)
784
+ elif wet_channel == 'right':
785
+ y[:, 1] += feedback * np.roll(x[:, 1], delay)
786
+
787
+ return y
788
+
789
+
790
+ class Haas(Processor):
791
+ """
792
+ Haas Effect Processor.
793
+
794
+ Randomly selects one channel and applies a short delay to it.
795
+
796
+ Processor parameters:
797
+ delay (int)
798
+ feedback (float)
799
+ wet_channel (string)
800
+ """
801
+
802
+ def __init__(self, sample_rate, delay_range=(-0.040, 0.040), name='Haas', parameters=None,
803
+ ):
804
+ """
805
+ Initialize processor.
806
+
807
+ Args:
808
+ sample_rate (int): Sample rate of input audio.
809
+ delay_range (tuple of floats): minimum/maximum delay for Haas effect.
810
+ name (str): Name of processor.
811
+ parameters (parameter_list): Parameters for this processor.
812
+ """
813
+ super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)
814
+
815
+ if not parameters:
816
+ self.parameters = ParameterList()
817
+ self.parameters.add(Parameter('delay', int(delay_range[1] * sample_rate), 'int', units='samples',
818
+ minimum=int(delay_range[0] * sample_rate),
819
+ maximum=int(delay_range[1] * sample_rate)))
820
+ self.parameters.add(Parameter('feedback', 0.35, 'float', minimum=0.33, maximum=0.66))
821
+ self.parameters.add(Parameter('wet_channel', 'left', 'string', options=['left', 'right']))
822
+
823
+ def process(self, x):
824
+ """
825
+ Process audio.
826
+
827
+ Args:
828
+ x (Numpy array): input audio of size `n_samples x n_channels`.
829
+
830
+ Returns:
831
+ (Numpy array): audio with Haas effect of size `n_samples x n_channels`.
832
+ """
833
+ assert x.shape[1] == 1 or x.shape[1] == 2, 'Haas effect only works with monaural or stereo audio.'
834
+
835
+ if x.shape[1] < 2:
836
+ x = np.repeat(x, 2, axis=1)
837
+
838
+ y = haas_process(x, self.parameters.delay.value,
839
+ self.parameters.feedback.value, self.parameters.wet_channel.value)
840
+
841
+ return y
842
+
843
+ def update(self, parameter_name=None):
844
+ """
845
+ Update processor after randomization of parameters.
846
+
847
+ Args:
848
+ parameter_name (str): Parameter whose value has changed.
849
+ """
850
+ self.reset_state()
851
+
852
+ def reset_state(self):
853
+ """Reset state."""
854
+ self.read_idx = 0
855
+ self.write_idx = self.parameters.delay.value
856
+ self.buffer = np.zeros((65536, 2))
857
+
858
+
859
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% PANNER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
860
+ class Panner(Processor):
861
+ """
862
+ Simple stereo panner.
863
+
864
+ If input is mono, output is stereo.
865
+ Original edited from https://github.com/csteinmetz1/pymixconsole/blob/master/pymixconsole/processors/panner.py
866
+ """
867
+
868
+ def __init__(self, name='Panner', parameters=None):
869
+ """
870
+ Initialize processor.
871
+
872
+ Args:
873
+ name (str): Name of processor.
874
+ parameters (parameter_list): Parameters for this processor.
875
+ """
876
+ # default processor class constructor
877
+ super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=None)
878
+
879
+ if not parameters:
880
+ self.parameters = ParameterList()
881
+ self.parameters.add(Parameter('pan', 0.5, 'float', minimum=0., maximum=1.))
882
+ self.parameters.add(Parameter('pan_law', '-4.5dB', 'string',
883
+ options=['-4.5dB', 'linear', 'constant_power']))
884
+
885
+ # setup the coefficients based on default params
886
+ self.update()
887
+
888
+ def _calculate_pan_coefficents(self):
889
+ """
890
+ Calculate panning coefficients from the chosen pan law.
891
+
892
+ Based on the set pan law determine the gain value
893
+ to apply for the left and right channel to achieve panning effect.
894
+ This operates on the assumption that the input channel is mono.
895
+ The output data will be stereo at the moment, but could be expanded
896
+ to a higher channel count format.
897
+ The panning value is in the range [0, 1], where
898
+ 0 means the signal is panned completely to the left, and
899
+ 1 means the signal is panned completely to the right.
900
+
901
+ Raises:
902
+ ValueError: `self.parameters.pan_law` is not supported.
903
+ """
904
+ self.gains = np.zeros(2, dtype=self.dtype)
905
+
906
+ # first scale the linear [0, 1] to [0, pi/2]
907
+ theta = self.parameters.pan.value * (np.pi/2)
908
+
909
+ if self.parameters.pan_law.value == 'linear':
910
+ self.gains[0] = ((np.pi/2) - theta) * (2/np.pi)
911
+ self.gains[1] = theta * (2/np.pi)
912
+ elif self.parameters.pan_law.value == 'constant_power':
913
+ self.gains[0] = np.cos(theta)
914
+ self.gains[1] = np.sin(theta)
915
+ elif self.parameters.pan_law.value == '-4.5dB':
916
+ self.gains[0] = np.sqrt(((np.pi/2) - theta) * (2/np.pi) * np.cos(theta))
917
+ self.gains[1] = np.sqrt(theta * (2/np.pi) * np.sin(theta))
918
+ else:
919
+ raise ValueError(f'Invalid pan_law {self.parameters.pan_law.value}.')
920
+
921
+
922
+ def process(self, x):
923
+ """
924
+ Process audio.
925
+
926
+ Args:
927
+ x (Numpy array): input audio of size `n_samples x n_channels`.
928
+
929
+ Returns:
930
+ (Numpy array): panned audio of size `n_samples x n_channels`.
931
+ """
932
+ assert x.shape[1] == 1 or x.shape[1] == 2, 'Panner only works with monaural or stereo audio.'
933
+
934
+ if x.shape[1] < 2:
935
+ x = np.repeat(x, 2, axis=1)
936
+
937
+
938
+ return x * self.gains
939
+
940
+ def update(self, parameter_name=None):
941
+ """
942
+ Update processor after randomization of parameters.
943
+
944
+ Args:
945
+ parameter_name (str): Parameter whose value has changed.
946
+ """
947
+ self._calculate_pan_coefficents()
948
+
949
+ def reset_state(self):
950
+ """Reset state."""
951
+ self._output_buffer = np.empty([self.block_size, 2])
952
+ self.update()
953
+
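Evaluating the pan-law formulas from `_calculate_pan_coefficents` at the centre position shows why the default law is called "-4.5dB": each channel receives a gain of about 0.595, i.e. roughly -4.5 dB. A small worked example (illustration only, not part of the added file):

import numpy as np

pan = 0.5                      # centre position
theta = pan * (np.pi / 2)

linear = (((np.pi / 2) - theta) * (2 / np.pi), theta * (2 / np.pi))            # (0.5, 0.5)
constant_power = (np.cos(theta), np.sin(theta))                                # (~0.707, ~0.707)
minus_4_5_db = (np.sqrt(((np.pi / 2) - theta) * (2 / np.pi) * np.cos(theta)),
                np.sqrt(theta * (2 / np.pi) * np.sin(theta)))                  # (~0.595, ~0.595)

print(linear, constant_power, minus_4_5_db)
print(20 * np.log10(minus_4_5_db[0]))  # about -4.5 dB per channel at centre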
954
+
955
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% STEREO IMAGER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
956
+ class MidSideImager(Processor):
957
+ def __init__(self, name='IMAGER', parameters=None):
958
+ super().__init__(name, parameters=parameters, block_size=None, sample_rate=None)
959
+
960
+ if not parameters:
961
+ self.parameters = ParameterList()
962
+ # values of 0.0~1.0 indicate making the signal more centered while 1.0~2.0 makes the signal wider
963
+ self.parameters.add(Parameter("bal", 0.0, "float", processor=self, minimum=0.0, maximum=2.0))
964
+
965
+ def process(self, data):
966
+ """
967
+ # input shape : [signal length, 2]
968
+ ### note! stereo imager won't work if the input signal is a mono signal (left==right)
969
+ ### if you want to apply stereo imager to a mono signal, first stereoize it with Haas effects
970
+ """
971
+
972
+ # to mid-side channels
973
+ mid, side = self.lr_to_ms(data[:,0], data[:,1])
974
+ # apply mid-side weights according to energy
975
+ mid_e, side_e = np.sum(mid**2), np.sum(side**2)
976
+ total_e = mid_e + side_e
977
+ # apply weights
978
+ max_side_multiplier = np.sqrt(total_e / (side_e + 1e-3))
979
+ # compute current multiply factor
980
+ cur_bal = round(getattr(self.parameters, "bal").value, 3)
981
+ side_gain = cur_bal if cur_bal <= 1. else max_side_multiplier * (cur_bal-1)
982
+ # multiply weighting factor
983
+ new_side = side * side_gain
984
+ new_side_e = side_e * (side_gain ** 2)
985
+ left_mid_e = total_e - new_side_e
986
+ mid_gain = np.sqrt(left_mid_e / (mid_e + 1e-3))
987
+ new_mid = mid * mid_gain
988
+ # convert back to left-right channels
989
+ left, right = self.ms_to_lr(new_mid, new_side)
990
+ imaged = np.stack([left, right], 1)
991
+
992
+ return imaged
993
+
994
+ # left-right channeled signal to mid-side signal
995
+ def lr_to_ms(self, left, right):
996
+ mid = left + right
997
+ side = left - right
998
+ return mid, side
999
+
1000
+ # mid-side channeled signal to left-right signal
1001
+ def ms_to_lr(self, mid, side):
1002
+ left = (mid + side) / 2
1003
+ right = (mid - side) / 2
1004
+ return left, right
1005
+
1006
+ def update(self, parameter_name=None):
1007
+ return parameter_name
1008
+
1009
+
1010
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% GAIN %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1011
+ class Gain(Processor):
1012
+ """
1013
+ Gain Processor.
1014
+
1015
+ Applies gain in dB and can also randomly inverts polarity.
1016
+
1017
+ Processor parameters:
1018
+ gain (float): Gain that should be applied (dB scale).
1019
+ invert (bool): If True, then we also invert the waveform.
1020
+ """
1021
+
1022
+ def __init__(self, name='Gain', parameters=None):
1023
+ """
1024
+ Initialize processor.
1025
+
1026
+ Args:
1027
+ name (str): Name of processor.
1028
+ parameters (parameter_list): Parameters for this processor.
1029
+ """
1030
+ super().__init__(name, parameters=parameters, block_size=None, sample_rate=None)
1031
+
1032
+ if not parameters:
1033
+ self.parameters = ParameterList()
1034
+ # self.parameters.add(Parameter('gain', 1.0, 'float', units='dB', minimum=-12.0, maximum=6.0))
1035
+ self.parameters.add(Parameter('gain', 1.0, 'float', units='dB', minimum=-6.0, maximum=9.0))
1036
+ self.parameters.add(Parameter('invert', False, 'bool'))
1037
+
1038
+ def process(self, x):
1039
+ """
1040
+ Process audio.
1041
+
1042
+ Args:
1043
+ x (Numpy array): input audio of size `n_samples x n_channels`.
1044
+
1045
+ Returns:
1046
+ (Numpy array): gain-augmented audio of size `n_samples x n_channels`.
1047
+ """
1048
+ gain = 10 ** (self.parameters.gain.value / 20.)
1049
+ if self.parameters.invert.value:
1050
+ gain = -gain
1051
+ return gain * x
1052
+
1053
+
1054
+ # %%%%%%%%%%%%%%%%%%%%%%% SIMPLE CHANNEL SWAP %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1055
+ class SwapChannels(Processor):
1056
+ """
1057
+ Swap channels in multi-channel audio.
1058
+
1059
+ Processor parameters:
1060
+ index (int) Selects the permutation that we are using.
1061
+ Please note that "no permutation" is one of the permutations in `self.permutations` at index `0`.
1062
+ """
1063
+
1064
+ def __init__(self, n_channels, name='SwapChannels', parameters=None):
1065
+ """
1066
+ Initialize processor.
1067
+
1068
+ Args:
1069
+ n_channels (int): Number of channels in audio that we want to process.
1070
+ name (str): Name of processor.
1071
+ parameters (parameter_list): Parameters for this processor.
1072
+ """
1073
+ super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=None)
1074
+
1075
+ self.permutations = tuple(permutations(range(n_channels), n_channels))
1076
+
1077
+ if not parameters:
1078
+ self.parameters = ParameterList()
1079
+ self.parameters.add(Parameter('index', 0, 'int', minimum=0, maximum=len(self.permutations)))
1080
+
1081
+ def process(self, x):
1082
+ """
1083
+ Process audio.
1084
+
1085
+ Args:
1086
+ x (Numpy array): input audio of size `n_samples x n_channels`.
1087
+
1088
+ Returns:
1089
+ (Numpy array): channel-swapped audio of size `n_samples x n_channels`.
1090
+ """
1091
+ return x[:, self.permutations[self.parameters.index.value]]
1092
+
1093
+
1094
+ # %%%%%%%%%%%%%%%%%%%%%%% Monauralize %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1095
+ class Monauralize(Processor):
1096
+ """
1097
+ Monauralizes audio (i.e., removes spatial information).
1098
+
1099
+ Processor parameters:
1100
+ seed_channel (int): channel that we use for overwriting the others.
1101
+ """
1102
+
1103
+ def __init__(self, n_channels, name='Monauralize', parameters=None):
1104
+ """
1105
+ Initialize processor.
1106
+
1107
+ Args:
1108
+ n_channels (int): Number of channels in audio that we want to process.
1109
+ name (str): Name of processor.
1110
+ parameters (parameter_list): Parameters for this processor.
1111
+ """
1112
+ super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=None)
1113
+
1114
+ if not parameters:
1115
+ self.parameters = ParameterList()
1116
+ self.parameters.add(Parameter('seed_channel', 0, 'int', minimum=0, maximum=n_channels))
1117
+
1118
+ def process(self, x):
1119
+ """
1120
+ Process audio.
1121
+
1122
+ Args:
1123
+ x (Numpy array): input audio of size `n_samples x n_channels`.
1124
+
1125
+ Returns:
1126
+ (Numpy array): monauralized audio of size `n_samples x n_channels`.
1127
+ """
1128
+ return np.tile(x[:, [self.parameters.seed_channel.value]], (1, x.shape[1]))
1129
+
1130
+
1131
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% PITCH SHIFT %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1132
+ class PitchShift(Processor):
1133
+ """
1134
+ Simple pitch shifter using SoX and soxbindings (https://github.com/pseeth/soxbindings).
1135
+
1136
+ Processor parameters:
1137
+ steps (float): Pitch shift as positive/negative semitones
1138
+ quick (bool): If True, this effect will run faster but with lower sound quality.
1139
+ """
1140
+
1141
+ def __init__(self, sample_rate, fix_length=True, name='PitchShift', parameters=None):
1142
+ """
1143
+ Initialize processor.
1144
+
1145
+ Args:
1146
+ sample_rate (int): Sample rate of input audio.
1147
+ fix_length (bool): If True, then output has same length as input.
1148
+ name (str): Name of processor.
1149
+ parameters (parameter_list): Parameters for this processor.
1150
+ """
1151
+ super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)
1152
+
1153
+ if not parameters:
1154
+ self.parameters = ParameterList()
1155
+ self.parameters.add(Parameter('steps', 0.0, 'float', minimum=-6., maximum=6.))
1156
+ self.parameters.add(Parameter('quick', False, 'bool'))
1157
+
1158
+ self.fix_length = fix_length
1159
+ self.clips = False
1160
+
1161
+ def process(self, x):
1162
+ """
1163
+ Process audio.
1164
+
1165
+ Args:
1166
+ x (Numpy array): input audio of size `n_samples x n_channels`.
1167
+
1168
+ Returns:
1169
+ (Numpy array): pitch-shifted audio of size `n_samples x n_channels`.
1170
+ """
1171
+ if self.parameters.steps.value == 0.0:
1172
+ y = x
1173
+ else:
1174
+ scale = np.max(np.abs(x))
1175
+ if scale > 0.9:
1176
+ clips = True
1177
+ x = x * (0.9 / scale)
1178
+ else:
1179
+ clips = False
1180
+
1181
+ tfm = sox.Transformer()
1182
+ tfm.pitch(self.parameters.steps.value, quick=bool(self.parameters.quick.value))
1183
+ y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)
1184
+
1185
+ if clips:
1186
+ y *= scale / 0.9 # rescale output to original scale
1187
+
1188
+ if self.fix_length:
1189
+ n_samples_input = x.shape[0]
1190
+ n_samples_output = y.shape[0]
1191
+ if n_samples_input < n_samples_output:
1192
+ idx1 = (n_samples_output - n_samples_input) // 2
1193
+ idx2 = idx1 + n_samples_input
1194
+ y = y[idx1:idx2]
1195
+ elif n_samples_input > n_samples_output:
1196
+ n_pad = n_samples_input - n_samples_output
1197
+ y = np.pad(y, ((n_pad//2, n_pad - n_pad//2), (0, 0)))
1198
+
1199
+ return y
1200
+
1201
+
1202
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% TIME STRETCH %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1203
+ class TimeStretch(Processor):
1204
+ """
1205
+ Simple time stretcher using SoX and soxbindings (https://github.com/pseeth/soxbindings).
1206
+
1207
+ Processor parameters:
1208
+ factor (float): Time stretch factor.
1209
+ quick (bool): If True, this effect will run faster but with lower sound quality.
1210
+ stretch_type (str): Algorithm used for stretching (`tempo` or `stretch`).
1211
+ audio_type (str): Sets which time segments are most optimal when finding
1212
+ the best overlapping points for time stretching.
1213
+ """
1214
+
1215
+ def __init__(self, sample_rate, fix_length=True, name='TimeStretch', parameters=None):
1216
+ """
1217
+ Initialize processor.
1218
+
1219
+ Args:
1220
+ sample_rate (int): Sample rate of input audio.
1221
+ fix_length (bool): If True, then output has same length as input.
1222
+ name (str): Name of processor.
1223
+ parameters (parameter_list): Parameters for this processor.
1224
+ """
1225
+ super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)
1226
+
1227
+ if not parameters:
1228
+ self.parameters = ParameterList()
1229
+ self.parameters.add(Parameter('factor', 1.0, 'float', minimum=1/1.33, maximum=1.33))
1230
+ self.parameters.add(Parameter('quick', False, 'bool'))
1231
+ self.parameters.add(Parameter('stretch_type', 'tempo', 'string', options=['tempo', 'stretch']))
1232
+ self.parameters.add(Parameter('audio_type', 'l', 'string', options=['m', 's', 'l']))
1233
+
1234
+ self.fix_length = fix_length
1235
+
1236
+ def process(self, x):
1237
+ """
1238
+ Process audio.
1239
+
1240
+ Args:
1241
+ x (Numpy array): input audio of size `n_samples x n_channels`.
1242
+
1243
+ Returns:
1244
+ (Numpy array): time-stretched audio of size `n_samples x n_channels`.
1245
+ """
1246
+ if self.parameters.factor.value == 1.0:
1247
+ y = x
1248
+ else:
1249
+ scale = np.max(np.abs(x))
1250
+ if scale > 0.9:
1251
+ clips = True
1252
+ x = x * (0.9 / scale)
1253
+ else:
1254
+ clips = False
1255
+
1256
+ tfm = sox.Transformer()
1257
+ if self.parameters.stretch_type.value == 'stretch':
1258
+ tfm.stretch(self.parameters.factor.value)
1259
+ elif self.parameters.stretch_type.value == 'tempo':
1260
+ tfm.tempo(self.parameters.factor.value,
1261
+ audio_type=self.parameters.audio_type.value,
1262
+ quick=bool(self.parameters.quick.value))
1263
+ y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)
1264
+
1265
+ if clips:
1266
+ y *= scale / 0.9 # rescale output to original scale
1267
+
1268
+ if self.fix_length:
1269
+ n_samples_input = x.shape[0]
1270
+ n_samples_output = y.shape[0]
1271
+ if n_samples_input < n_samples_output:
1272
+ idx1 = (n_samples_output - n_samples_input) // 2
1273
+ idx2 = idx1 + n_samples_input
1274
+ y = y[idx1:idx2]
1275
+ elif n_samples_input > n_samples_output:
1276
+ n_pad = n_samples_input - n_samples_output
1277
+ y = np.pad(y, ((n_pad//2, n_pad - n_pad//2), (0, 0)))
1278
+
1279
+ return y
1280
+
1281
+
1282
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% PLAYBACK SPEED %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1283
+ class PlaybackSpeed(Processor):
1284
+ """
1285
+ Simple playback speed effect using SoX and soxbindings (https://github.com/pseeth/soxbindings).
1286
+
1287
+ Processor parameters:
1288
+ factor (float): Playback speed factor.
1289
+ """
1290
+
1291
+ def __init__(self, sample_rate, fix_length=True, name='PlaybackSpeed', parameters=None):
1292
+ """
1293
+ Initialize processor.
1294
+
1295
+ Args:
1296
+ sample_rate (int): Sample rate of input audio.
1297
+ fix_length (bool): If True, then output has same length as input.
1298
+ name (str): Name of processor.
1299
+ parameters (parameter_list): Parameters for this processor.
1300
+ """
1301
+ super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)
1302
+
1303
+ if not parameters:
1304
+ self.parameters = ParameterList()
1305
+ self.parameters.add(Parameter('factor', 1.0, 'float', minimum=1./1.33, maximum=1.33))
1306
+
1307
+ self.fix_length = fix_length
1308
+
1309
+ def process(self, x):
1310
+ """
1311
+ Process audio.
1312
+
1313
+ Args:
1314
+ x (Numpy array): input audio of size `n_samples x n_channels`.
1315
+
1316
+ Returns:
1317
+ (Numpy array): resampled audio of size `n_samples x n_channels`.
1318
+ """
1319
+ if self.parameters.factor.value == 1.0:
1320
+ y = x
1321
+ else:
1322
+ scale = np.max(np.abs(x))
1323
+ if scale > 0.9:
1324
+ clips = True
1325
+ x = x * (0.9 / scale)
1326
+ else:
1327
+ clips = False
1328
+
1329
+ tfm = sox.Transformer()
1330
+ tfm.speed(self.parameters.factor.value)
1331
+ y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)
1332
+
1333
+ if clips:
1334
+ y *= scale / 0.9 # rescale output to original scale
1335
+
1336
+ if self.fix_length:
1337
+ n_samples_input = x.shape[0]
1338
+ n_samples_output = y.shape[0]
1339
+ if n_samples_input < n_samples_output:
1340
+ idx1 = (n_samples_output - n_samples_input) // 2
1341
+ idx2 = idx1 + n_samples_input
1342
+ y = y[idx1:idx2]
1343
+ elif n_samples_input > n_samples_output:
1344
+ n_pad = n_samples_input - n_samples_output
1345
+ y = np.pad(y, ((n_pad//2, n_pad - n_pad//2), (0, 0)))
1346
+
1347
+ return y
1348
+
1349
+
1350
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% BEND %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1351
+ class Bend(Processor):
1352
+ """
1353
+ Simple bend effect using SoX and soxbindings (https://github.com/pseeth/soxbindings).
1354
+
1355
+ Processor parameters:
1356
+ n_bends (int): Number of segments or intervals to pitch shift
1357
+ """
1358
+
1359
+ def __init__(self, sample_rate, pitch_range=(-600, 600), fix_length=True, name='Bend', parameters=None):
1360
+ """
1361
+ Initialize processor.
1362
+
1363
+ Args:
1364
+ sample_rate (int): Sample rate of input audio.
1365
+ pitch_range (tuple of ints): min and max pitch bending ranges in cents
1366
+ fix_length (bool): If True, then output has same length as input.
1367
+ name (str): Name of processor.
1368
+ parameters (parameter_list): Parameters for this processor.
1369
+ """
1370
+ super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)
1371
+
1372
+ if not parameters:
1373
+ self.parameters = ParameterList()
1374
+ self.parameters.add(Parameter('n_bends', 2, 'int', minimum=2, maximum=10))
1375
+ self.pitch_range_min, self.pitch_range_max = pitch_range
1376
+
1377
+ def process(self, x):
1378
+ """
1379
+ Process audio.
1380
+
1381
+ Args:
1382
+ x (Numpy array): input audio of size `n_samples x n_channels`.
1383
+
1384
+ Returns:
1385
+ (Numpy array): pitch-bent audio of size `n_samples x n_channels`.
1386
+ """
1387
+ n_bends = self.parameters.n_bends.value
1388
+ max_length = x.shape[0] / self.sample_rate
1389
+
1390
+ # Generates random non-overlapping segments
1391
+ delta = 1. / self.sample_rate
1392
+ boundaries = np.sort(delta + np.random.rand(n_bends-1) * (max_length - delta))
1393
+
1394
+ start, end = np.zeros(n_bends), np.zeros(n_bends)
1395
+ start[0] = delta
1396
+ for i, b in enumerate(boundaries):
1397
+ end[i] = b
1398
+ start[i+1] = b
1399
+ end[-1] = max_length
1400
+
1401
+ # randomly sample pitch-shifts in cents
1402
+ cents = np.random.randint(self.pitch_range_min, self.pitch_range_max+1, n_bends)
1403
+
1404
+ # remove segment if cent value is zero or start == end (as SoX does not allow such values)
1405
+ idx_keep = np.logical_and(cents != 0, start != end)
1406
+ n_bends, start, end, cents = sum(idx_keep), start[idx_keep], end[idx_keep], cents[idx_keep]
1407
+
1408
+ scale = np.max(np.abs(x))
1409
+ if scale > 0.9:
1410
+ clips = True
1411
+ x = x * (0.9 / scale)
1412
+ else:
1413
+ clips = False
1414
+
1415
+ tfm = sox.Transformer()
1416
+ tfm.bend(n_bends=int(n_bends), start_times=list(start), end_times=list(end), cents=list(cents))
1417
+ y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)
1418
+
1419
+ if clips:
1420
+ y *= scale / 0.9 # rescale output to original scale
1421
+
1422
+ return y
1423
+
1424
+
1425
+
1426
+
1427
+
1428
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ALGORITHMIC REVERB %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1429
+ class AlgorithmicReverb(Processor):
1430
+ def __init__(self, name="algoreverb", parameters=None, sample_rate=44100, **kwargs):
1431
+
1432
+ super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate, **kwargs)
1433
+
1434
+ if not parameters:
1435
+ self.parameters = ParameterList()
1436
+ self.parameters.add(Parameter("room_size", 0.5, "float", minimum=0.05, maximum=0.85))
1437
+ self.parameters.add(Parameter("damping", 0.1, "float", minimum=0.0, maximum=1.0))
1438
+ self.parameters.add(Parameter("dry_mix", 0.9, "float", minimum=0.0, maximum=1.0))
1439
+ self.parameters.add(Parameter("wet_mix", 0.1, "float", minimum=0.0, maximum=1.0))
1440
+ self.parameters.add(Parameter("width", 0.7, "float", minimum=0.0, maximum=1.0))
1441
+
1442
+ # Tuning
1443
+ self.stereospread = 23
1444
+ self.scalegain = 0.2
1445
+
1446
+
1447
+ def process(self, data):
1448
+
1449
+ if data.ndim >= 2:
1450
+ dataL = data[:,0]
1451
+ if data.shape[1] == 2:
1452
+ dataR = data[:,1]
1453
+ else:
1454
+ dataR = data[:,0]
1455
+ else:
1456
+ dataL = data
1457
+ dataR = data
1458
+
1459
+ output = np.zeros((data.shape[0], 2))
1460
+
1461
+ xL, xR = self.process_filters(dataL.copy(), dataR.copy())
1462
+
1463
+ wet1_g = self.parameters.wet_mix.value * ((self.parameters.width.value/2) + 0.5)
1464
+ wet2_g = self.parameters.wet_mix.value * ((1-self.parameters.width.value)/2)
1465
+ dry_g = self.parameters.dry_mix.value
1466
+
1467
+ output[:,0] = (wet1_g * xL) + (wet2_g * xR) + (dry_g * dataL)
1468
+ output[:,1] = (wet1_g * xR) + (wet2_g * xL) + (dry_g * dataR)
1469
+
1470
+ return output
1471
+
1472
+ def process_filters(self, dataL, dataR):
1473
+
1474
+ xL = self.combL1.process(dataL.copy() * self.scalegain)
1475
+ xL += self.combL2.process(dataL.copy() * self.scalegain)
1476
+ xL += self.combL3.process(dataL.copy() * self.scalegain)
1477
+ xL += self.combL4.process(dataL.copy() * self.scalegain)
1478
+ xL += self.combL5.process(dataL.copy() * self.scalegain)
1479
+ xL += self.combL6.process(dataL.copy() * self.scalegain)
1480
+ xL += self.combL7.process(dataL.copy() * self.scalegain)
1481
+ xL += self.combL8.process(dataL.copy() * self.scalegain)
1482
+
1483
+ xR = self.combR1.process(dataR.copy() * self.scalegain)
1484
+ xR += self.combR2.process(dataR.copy() * self.scalegain)
1485
+ xR += self.combR3.process(dataR.copy() * self.scalegain)
1486
+ xR += self.combR4.process(dataR.copy() * self.scalegain)
1487
+ xR += self.combR5.process(dataR.copy() * self.scalegain)
1488
+ xR += self.combR6.process(dataR.copy() * self.scalegain)
1489
+ xR += self.combR7.process(dataR.copy() * self.scalegain)
1490
+ xR += self.combR8.process(dataR.copy() * self.scalegain)
1491
+
1492
+ yL1 = self.allpassL1.process(xL)
1493
+ yL2 = self.allpassL2.process(yL1)
1494
+ yL3 = self.allpassL3.process(yL2)
1495
+ yL4 = self.allpassL4.process(yL3)
1496
+
1497
+ yR1 = self.allpassR1.process(xR)
1498
+ yR2 = self.allpassR2.process(yR1)
1499
+ yR3 = self.allpassR3.process(yR2)
1500
+ yR4 = self.allpassR4.process(yR3)
1501
+
1502
+ return yL4, yR4
1503
+
1504
+ def update(self, parameter_name):
1505
+
1506
+ rs = self.parameters.room_size.value
1507
+ dp = self.parameters.damping.value
1508
+ ss = self.stereospread
1509
+
1510
+ # initialize allpass and feedback comb-filters
1511
+ # (with coefficients optimized for fs=44.1kHz)
1512
+ self.allpassL1 = pymc.components.allpass.Allpass(556, rs, self.block_size)
1513
+ self.allpassR1 = pymc.components.allpass.Allpass(556+ss, rs, self.block_size)
1514
+ self.allpassL2 = pymc.components.allpass.Allpass(441, rs, self.block_size)
1515
+ self.allpassR2 = pymc.components.allpass.Allpass(441+ss, rs, self.block_size)
1516
+ self.allpassL3 = pymc.components.allpass.Allpass(341, rs, self.block_size)
1517
+ self.allpassR3 = pymc.components.allpass.Allpass(341+ss, rs, self.block_size)
1518
+ self.allpassL4 = pymc.components.allpass.Allpass(225, rs, self.block_size)
1519
+ self.allpassR4 = pymc.components.allpass.Allpass(225+ss, rs, self.block_size)
1520
+
1521
+ self.combL1 = pymc.components.comb.Comb(1116, dp, rs, self.block_size)
1522
+ self.combR1 = pymc.components.comb.Comb(1116+ss, dp, rs, self.block_size)
1523
+ self.combL2 = pymc.components.comb.Comb(1188, dp, rs, self.block_size)
1524
+ self.combR2 = pymc.components.comb.Comb(1188+ss, dp, rs, self.block_size)
1525
+ self.combL3 = pymc.components.comb.Comb(1277, dp, rs, self.block_size)
1526
+ self.combR3 = pymc.components.comb.Comb(1277+ss, dp, rs, self.block_size)
1527
+ self.combL4 = pymc.components.comb.Comb(1356, dp, rs, self.block_size)
1528
+ self.combR4 = pymc.components.comb.Comb(1356+ss, dp, rs, self.block_size)
1529
+ self.combL5 = pymc.components.comb.Comb(1422, dp, rs, self.block_size)
1530
+ self.combR5 = pymc.components.comb.Comb(1422+ss, dp, rs, self.block_size)
1531
+ self.combL6 = pymc.components.comb.Comb(1491, dp, rs, self.block_size)
1532
+ self.combR6 = pymc.components.comb.Comb(1491+ss, dp, rs, self.block_size)
1533
+ self.combL7 = pymc.components.comb.Comb(1557, dp, rs, self.block_size)
1534
+ self.combR7 = pymc.components.comb.Comb(1557+ss, dp, rs, self.block_size)
1535
+ self.combL8 = pymc.components.comb.Comb(1617, dp, rs, self.block_size)
1536
+ self.combR8 = pymc.components.comb.Comb(1617+ss, dp, rs, self.block_size)
1537
+
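The `fix_length` handling shared by PitchShift, TimeStretch and PlaybackSpeed above center-trims or zero-pads the processed signal back to the input length; a standalone sketch of that logic:

import numpy as np

def fix_length(y, n_samples_input):
    """Center-trim or zero-pad `y` (n_samples x n_channels) to `n_samples_input` samples."""
    n_samples_output = y.shape[0]
    if n_samples_input < n_samples_output:
        idx1 = (n_samples_output - n_samples_input) // 2
        return y[idx1:idx1 + n_samples_input]
    elif n_samples_input > n_samples_output:
        n_pad = n_samples_input - n_samples_output
        return np.pad(y, ((n_pad // 2, n_pad - n_pad // 2), (0, 0)))
    return y

print(fix_length(np.ones((10, 2)), 6).shape)   # (6, 2)
print(fix_length(np.ones((10, 2)), 14).shape)  # (14, 2)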
modules/common_miscellaneous.py ADDED
@@ -0,0 +1,219 @@
1
+ """
2
+ Common miscellaneous functions.
3
+
4
+ AI Music Technology Group, Sony Group Corporation
5
+ AI Speech and Sound Group, Sony Europe
6
+
7
+ This implementation originally belongs to Sony Group Corporation,
8
+ which has been introduced in the work "Automatic music mixing with deep learning and out-of-domain data".
9
+ Original repo link: https://github.com/sony/FxNorm-automix
10
+ """
11
+ import os
12
+ import psutil
13
+ import sys
14
+ import numpy as np
15
+ import librosa
16
+ import torch
17
+ import math
18
+
19
+
20
+ def uprint(s):
21
+ """
22
+ Unbuffered print to stdout.
23
+
24
+ We also flush stderr to have the log-file in sync.
25
+
26
+ Args:
27
+ s: string to print
28
+ """
29
+ print(s)
30
+ sys.stdout.flush()
31
+ sys.stderr.flush()
32
+
33
+
34
+ def recursive_getattr(obj, attr):
35
+ """
36
+ Run `getattr` recursively (e.g., for `fc1.weight`).
37
+
38
+ Args:
39
+ obj: object
40
+ attr: attribute to get
41
+
42
+ Returns:
43
+ object
44
+ """
45
+ for a in attr.split('.'):
46
+ obj = getattr(obj, a)
47
+ return obj
48
+
49
+
50
+ def compute_stft(samples, hop_length, fft_size, stft_window):
51
+ """
52
+ Compute the STFT of `samples`, applying the analysis window `stft_window` of size `fft_size` and shifting each frame by `hop_length`.
53
+
54
+ Args:
55
+ samples: num samples x channels
56
+ hop_length: window shift in samples
57
+ fft_size: FFT size which is also the window size
58
+ stft_window: STFT analysis window
59
+
60
+ Returns:
61
+ stft: frames x channels x freqbins
62
+ """
63
+ n_channels = samples.shape[1]
64
+ n_frames = 1+int((samples.shape[0] - fft_size)/hop_length)
65
+ stft = np.empty((n_frames, n_channels, fft_size//2+1), dtype=np.complex64)
66
+
67
+ # convert into f_contiguous (such that [:,n] slicing is c_contiguous)
68
+ samples = np.asfortranarray(samples)
69
+
70
+ for n in range(n_channels):
71
+ # compute STFT (output has size `n_frames x N_BINS`)
72
+ stft[:, n, :] = librosa.stft(samples[:, n],
73
+ n_fft=fft_size,
74
+ hop_length=hop_length,
75
+ window=stft_window,
76
+ center=False).transpose()
77
+ return stft
78
+
79
+
80
+ def compute_istft(stft, hop_length, stft_window):
81
+ """
82
+ Compute the inverse STFT of `stft`.
83
+
84
+ Args:
85
+ stft: frames x channels x freqbins
86
+ hop_length: window shift in samples
87
+ stft_window: STFT synthesis window
88
+
89
+ Returns:
90
+ samples: num samples x channels
91
+ """
92
+ for n in range(stft.shape[1]):
93
+ s = librosa.istft(stft[:, n, :].transpose(),
94
+ hop_length=hop_length, window=stft_window, center=False)
95
+ if n == 0:
96
+ samples = s
97
+ else:
98
+ samples = np.column_stack((samples, s))
99
+
100
+ # ensure that we have a 2d array (monaural files are just loaded as vectors)
101
+ if samples.ndim == 1:
102
+ samples = samples[:, np.newaxis]
103
+
104
+ return samples
105
+
106
+
107
+ def get_size(obj):
108
+ """
109
+ Recursively find size of objects (in bytes).
110
+
111
+ Args:
112
+ obj: object
113
+
114
+ Returns:
115
+ size of object
116
+ """
117
+ size = sys.getsizeof(obj)
118
+
119
+ import functools
120
+
121
+ if isinstance(obj, dict):
122
+ size += sum([get_size(v) for v in obj.values()])
123
+ size += sum([get_size(k) for k in obj.keys()])
124
+ elif isinstance(obj, functools.partial):
125
+ size += sum([get_size(v) for v in obj.keywords.values()])
126
+ size += sum([get_size(k) for k in obj.keywords.keys()])
127
+ elif isinstance(obj, list):
128
+ size += sum([get_size(i) for i in obj])
129
+ elif isinstance(obj, tuple):
130
+ size += sum([get_size(i) for i in obj])
131
+ return size
132
+
133
+
134
+ def get_process_memory():
135
+ """
136
+ Return memory consumption in GBytes.
137
+
138
+ Returns:
139
+ memory used by the process
140
+ """
141
+ return psutil.Process(os.getpid()).memory_info()[0] / (2 ** 30)
142
+
143
+
144
+ def check_complete_convolution(input_size, kernel_size, stride=1,
145
+ padding=0, dilation=1, note=''):
146
+ """
147
+ Check whether the convolution is complete.
148
+
149
+ Prints True if no time steps are left over in a Conv1d
150
+
151
+ Args:
152
+ input_size: size of input
153
+ kernel_size: size of kernel
154
+ stride: stride
155
+ padding: padding
156
+ dilation: dilation
157
+ note: string for additional notes
158
+ """
159
+ is_complete = ((input_size + 2*padding - dilation * (kernel_size - 1) - 1)
160
+ / stride + 1).is_integer()
161
+ uprint(f'{note} {is_complete}')
162
+
163
+
164
+ def pad_to_shape(x: torch.Tensor, y: int) -> torch.Tensor:
165
+ """
166
+ Right-pad or right-trim first argument last dimension to have same size as second argument.
167
+
168
+ Args:
169
+ x: Tensor to be padded.
170
+ y: Size to pad/trim x last dimension to
171
+
172
+ Returns:
173
+ `x` padded to match `y`'s dimension.
174
+ """
175
+ inp_len = y
176
+ output_len = x.shape[-1]
177
+ return torch.nn.functional.pad(x, [0, inp_len - output_len])
178
+
179
+
180
+ def valid_length(input_size, kernel_size, stride=1, padding=0, dilation=1):
181
+ """
182
+ Return the nearest valid upper length to use with the model so that there is no time steps left over in a 1DConv.
183
+
184
+ For all layers, size of the (input - kernel_size) % stride = 0.
185
+ Here valid means that there is no left over frame neglected and discarded.
186
+
187
+ Args:
188
+ input_size: size of input
189
+ kernel_size: size of kernel
190
+ stride: stride
191
+ padding: padding
192
+ dilation: dilation
193
+
194
+ Returns:
195
+ valid length for convolution
196
+ """
197
+ length = math.ceil((input_size + 2*padding - dilation * (kernel_size - 1) - 1)/stride) + 1
198
+ length = (length - 1) * stride - 2*padding + dilation * (kernel_size - 1) + 1
199
+
200
+ return int(length)
201
+
202
+
203
+ def td_length_from_fd(fd_length: int, fft_size: int, fft_hop: int) -> int:
204
+ """
205
+ Return the length in time domain, given the length in frequency domain.
206
+
207
+ Return the necessary length in the time domain of a signal to be transformed into
208
+ a signal of length `fd_length` in time-frequency domain with the given STFT
209
+ parameters `fft_size` and `fft_hop`. No padding is assumed.
210
+
211
+ Args:
212
+ fd_length: length in frequency domain
213
+ fft_size: size of FFT
214
+ fft_hop: hop length
215
+
216
+ Returns:
217
+ length in time domain
218
+ """
219
+ return (fd_length - 1) * fft_hop + fft_size
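A hedged round-trip sketch for `compute_stft` / `compute_istft` above, assuming the module is importable as `modules.common_miscellaneous` and using the sqrt-Hann window that the rest of this commit uses:

import numpy as np
from modules.common_miscellaneous import compute_stft, compute_istft  # import path assumed from this commit's layout

fft_size, hop = 4096, 1024
window = np.sqrt(np.hanning(fft_size + 1)[:-1])
samples = np.random.randn(10 * fft_size, 2).astype(np.float32)   # num samples x channels

spec = compute_stft(samples, hop, fft_size, window)               # frames x channels x freqbins
recon = compute_istft(spec, hop, window)                           # num samples x channels
print(spec.shape, recon.shape)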
modules/data_normalization.py ADDED
@@ -0,0 +1,342 @@
1
+ """
2
+ Implementation of the 'audio effects chain normalization'
3
+ """
4
+ import numpy as np
5
+ import scipy
6
+ import soundfile as sf
7
+ import pyloudnorm
8
+
9
+ from glob import glob
10
+ import os
11
+ import sys
12
+ currentdir = os.path.dirname(os.path.realpath(__file__))
13
+ sys.path.append(currentdir)
14
+ from utils_data_normalization import *
15
+ from normalization_imager import *
16
+
17
+
18
+ '''
19
+ Audio Effects Chain Normalization
20
+ process: normalizes input stems according to given precomputed features
21
+ '''
22
+ class Audio_Effects_Normalizer:
23
+ def __init__(self, precomputed_feature_path=None, \
24
+ STEMS=['drums', 'bass', 'other', 'vocals'], \
25
+ EFFECTS=['eq', 'compression', 'imager', 'loudness'], \
26
+ audio_extension='wav'):
27
+ self.STEMS = STEMS # Stems to be normalized
28
+ self.EFFECTS = EFFECTS # Effects to be normalized, order matters
29
+ self.audio_extension = audio_extension
30
+ self.precomputed_feature_path = precomputed_feature_path
31
+
32
+ # Audio settings
33
+ self.SR = 44100
34
+ self.SUBTYPE = 'PCM_16'
35
+
36
+ # General Settings
37
+ self.FFT_SIZE = 2**16
38
+ self.HOP_LENGTH = self.FFT_SIZE//4
39
+
40
+ # Loudness
41
+ self.NTAPS = 1001
42
+ self.LUFS = -30
43
+ self.MIN_DB = -40 # Min amplitude to apply EQ matching
44
+
45
+ # Compressor
46
+ self.COMP_USE_EXPANDER = False
47
+ self.COMP_PEAK_NORM = -10.0
48
+ self.COMP_TRUE_PEAK = False
49
+ self.COMP_PERCENTILE = 75 # features_mean (v1) was done with 25
50
+ self.COMP_MIN_TH = -40
51
+ self.COMP_MAX_RATIO = 20
52
+ comp_settings = {key:{} for key in self.STEMS}
53
+ for key in comp_settings:
54
+ if key=='vocals':
55
+ comp_settings[key]['attack'] = 7.5
56
+ comp_settings[key]['release'] = 400.0
57
+ comp_settings[key]['ratio'] = 4
58
+ comp_settings[key]['n_mels'] = 128
59
+ elif key=='drums':
60
+ comp_settings[key]['attack'] = 10.0
61
+ comp_settings[key]['release'] = 180.0
62
+ comp_settings[key]['ratio'] = 6
63
+ comp_settings[key]['n_mels'] = 128
64
+ elif key=='bass':
65
+ comp_settings[key]['attack'] = 10.0
66
+ comp_settings[key]['release'] = 500.0
67
+ comp_settings[key]['ratio'] = 5
68
+ comp_settings[key]['n_mels'] = 16
69
+ elif key=='other' or key=='mixture':
70
+ comp_settings[key]['attack'] = 15.0
71
+ comp_settings[key]['release'] = 666.0
72
+ comp_settings[key]['ratio'] = 4
73
+ comp_settings[key]['n_mels'] = 128
74
+ self.comp_settings = comp_settings
75
+
76
+ if precomputed_feature_path!=None and os.path.isfile(precomputed_feature_path):
77
+ # Load Pre-computed Audio Effects Features
78
+ features_mean = np.load(precomputed_feature_path, allow_pickle='TRUE')[()]
79
+ self.features_mean = self.smooth_feature(features_mean)
80
+
81
+
82
+ # compute audio effects' mean feature values
83
+ def compute_mean(self, base_dir_path, save_feat=True, single_file=False):
84
+
85
+ audio_path_dict = {}
86
+ for cur_stem in self.STEMS:
87
+ # if single_file=True, base_dir_path = the target file path
88
+ audio_path_dict[cur_stem] = [base_dir_path] if single_file else glob(os.path.join(base_dir_path, "**", f"{cur_stem}.{self.audio_extension}"), recursive=True)
89
+
90
+ features_dict = {}
91
+ features_mean = {}
92
+ for effect in self.EFFECTS:
93
+ features_dict[effect] = {key:[] for key in self.STEMS}
94
+ features_mean[effect] = {key:[] for key in self.STEMS}
95
+
96
+ stems_names = self.STEMS.copy()
97
+ for effect in self.EFFECTS:
98
+ print(f'{effect} ...')
99
+ j=0
100
+ for key in self.STEMS:
101
+ print(f'{key} ...')
102
+ i = []
103
+ for i_, p_ in enumerate(audio_path_dict[key]):
104
+ i.append(i_)
105
+ i = np.asarray(i) + j
106
+ j += len(i)
107
+
108
+ features_ = []
109
+ for cur_i, cur_audio_path in enumerate(audio_path_dict[key]):
110
+ print(f'getting {effect} features for {key}- stem {cur_i} of {len(audio_path_dict[key])-1} {cur_audio_path}')
111
+ features_.append(self.get_norm_feature(cur_audio_path, cur_i, effect, key))
112
+
113
+ features_dict[effect][key] = features_
114
+
115
+ print(effect, key, len(features_dict[effect][key]))
116
+ s = np.asarray(features_dict[effect][key])
117
+ s = np.mean(s, axis=0)
118
+ features_mean[effect][key] = s
119
+
120
+ if effect == 'eq':
121
+ assert len(s)==1+self.FFT_SIZE//2, len(s)
122
+ elif effect == 'compression':
123
+ assert len(s)==2, len(s)
124
+ elif effect == 'panning':
125
+ assert len(s)==1+self.FFT_SIZE//2, len(s)
126
+ elif effect == 'loudness':
127
+ assert len(s)==1, len(s)
128
+
129
+ if effect == 'eq':
130
+ if key in ['other', 'vocals', 'mixture']:
131
+ f = 401
132
+ else:
133
+ f = 151
134
+ features_mean[effect][key] = scipy.signal.savgol_filter(features_mean[effect][key],
135
+ f, 1, mode='mirror')
136
+ elif effect == 'panning':
137
+ features_mean[effect][key] = scipy.signal.savgol_filter(features_mean[effect][key],
138
+ 501, 1, mode='mirror')
139
+ if save_feat:
140
+ np.save(self.precomputed_feature_path, features_mean)
141
+ self.features_mean = self.smooth_feature(features_mean)
142
+ print('---feature mean computation completed---')
143
+
144
+ return self.features_mean
145
+
146
+
147
+ def get_norm_feature(self, path, i, effect, stem):
148
+
149
+ if isinstance(path, str):
150
+ audio, fs = sf.read(path)
151
+ assert(fs == self.SR)
152
+ else:
153
+ audio = path
154
+ fs = self.SR
155
+ all_zeros = not np.any(audio)
156
+
157
+ if all_zeros == False:
158
+
159
+ audio = np.pad(audio, ((self.FFT_SIZE, self.FFT_SIZE), (0, 0)), mode='constant')
160
+
161
+ max_db = amp_to_db(np.max(np.abs(audio)))
162
+
163
+ if max_db > self.MIN_DB:
164
+
165
+ if effect == 'loudness':
166
+ meter = pyln.Meter(self.SR)
167
+ loudness = meter.integrated_loudness(audio)
168
+ return [loudness]
169
+
170
+ elif effect == 'eq':
171
+ audio = lufs_normalize(audio, self.SR, self.LUFS, log=False)
172
+ audio_spec = compute_stft(audio,
173
+ self.HOP_LENGTH,
174
+ self.FFT_SIZE,
175
+ np.sqrt(np.hanning(self.FFT_SIZE+1)[:-1]))
176
+ audio_spec = np.abs(audio_spec)
177
+ audio_spec_avg = np.mean(audio_spec, axis=(0,1))
178
+ return audio_spec_avg
179
+
180
+ elif effect == 'panning':
181
+ phi = get_SPS(audio,
182
+ n_fft=self.FFT_SIZE,
183
+ hop_length=self.HOP_LENGTH,
184
+ smooth=False,
185
+ frames=False)
186
+ return(phi[1])
187
+
188
+ elif effect == 'compression':
189
+ x = pyln.normalize.peak(audio, self.COMP_PEAK_NORM)
190
+ peak_std = get_mean_peak(x,
191
+ sr=self.SR,
192
+ true_peak=self.COMP_TRUE_PEAK,
193
+ percentile=self.COMP_PERCENTILE,
194
+ n_mels=self.comp_settings[stem]['n_mels'])
195
+
196
+ if peak_std is not None:
197
+ return peak_std
198
+ else:
199
+ return None
200
+
201
+ elif effect == 'imager':
202
+ mid, side = lr_to_ms(audio[:,0], audio[:,1])
203
+ return print_balance(mid, side, verbose=False)
204
+
205
+ else:
206
+ print(f'{path} is silence...')
207
+ return None
208
+
209
+ else:
210
+
211
+ print(f'{path} is only zeros...')
212
+ return None
213
+
214
+
215
+ # normalize current audio input with the order of designed audio FX
216
+ def normalize_audio(self, audio, src):
217
+ assert src in self.STEMS
218
+
219
+ normalized_audio = audio
220
+ for cur_effect in self.EFFECTS:
221
+ normalized_audio = self.normalize_audio_per_effect(normalized_audio, src=src, effect=cur_effect)
222
+
223
+ return normalized_audio
224
+
225
+
226
+ # normalize current audio input with current targeted audio FX
227
+ def normalize_audio_per_effect(self, audio, src, effect):
228
+ audio = audio.astype(dtype=np.float32)
229
+ audio_track = np.pad(audio, ((self.FFT_SIZE, self.FFT_SIZE), (0, 0)), mode='constant')
230
+
231
+ assert len(audio_track.shape) == 2 # Always expects two dimensions
232
+
233
+ if audio_track.shape[1] == 1: # Converts mono to stereo with repeated channels
234
+ audio_track = np.repeat(audio_track, 2, axis=-1)
235
+
236
+ output_audio = audio_track.copy()
237
+
238
+ max_db = amp_to_db(np.max(np.abs(output_audio)))
239
+ if max_db > self.MIN_DB:
240
+
241
+ if effect == 'eq':
242
+ # normalize each channel
243
+ for ch in range(audio_track.shape[1]):
244
+ audio_eq_matched = get_eq_matching(output_audio[:, ch],
245
+ self.features_mean[effect][src],
246
+ sr=self.SR,
247
+ n_fft=self.FFT_SIZE,
248
+ hop_length=self.HOP_LENGTH,
249
+ min_db=self.MIN_DB,
250
+ ntaps=self.NTAPS,
251
+ lufs=self.LUFS)
252
+ np.copyto(output_audio[:,ch], audio_eq_matched)
253
+
254
+ elif effect == 'compression':
255
+ assert(len(self.features_mean[effect][src])==2)
256
+ # normalize each channel
257
+ for ch in range(audio_track.shape[1]):
258
+ try:
259
+ audio_comp_matched = get_comp_matching(output_audio[:, ch],
260
+ self.features_mean[effect][src][0],
261
+ self.features_mean[effect][src][1],
262
+ self.comp_settings[src]['ratio'],
263
+ self.comp_settings[src]['attack'],
264
+ self.comp_settings[src]['release'],
265
+ sr=self.SR,
266
+ min_db=self.MIN_DB,
267
+ min_th=self.COMP_MIN_TH,
268
+ comp_peak_norm=self.COMP_PEAK_NORM,
269
+ max_ratio=self.COMP_MAX_RATIO,
270
+ n_mels=self.comp_settings[src]['n_mels'],
271
+ true_peak=self.COMP_TRUE_PEAK,
272
+ percentile=self.COMP_PERCENTILE,
273
+ expander=self.COMP_USE_EXPANDER)
274
+
275
+ np.copyto(output_audio[:,ch], audio_comp_matched[:, 0])
276
+ except:
277
+ break
278
+
279
+ elif effect == 'loudness':
280
+ output_audio = lufs_normalize(output_audio, self.SR, self.features_mean[effect][src], log=False)
281
+
282
+ elif effect == 'imager':
283
+ # threshold of applying Haas effects
284
+ mono_threshold = 0.99 if src=='bass' else 0.975
285
+ audio_imager_matched = normalize_imager(output_audio, \
286
+ target_side_mid_bal=self.features_mean[effect][src][0], \
287
+ mono_threshold=mono_threshold, \
288
+ sr=self.SR)
289
+
290
+ np.copyto(output_audio, audio_imager_matched)
291
+
292
+ output_audio = output_audio[self.FFT_SIZE:self.FFT_SIZE+audio.shape[0]]
293
+
294
+ return output_audio
295
+
296
+
297
+ def smooth_feature(self, feature_dict_):
298
+
299
+ for effect in self.EFFECTS:
300
+ for key in self.STEMS:
301
+ if effect == 'eq':
302
+ if key in ['other', 'vocals', 'mixture']:
303
+ f = 401
304
+ else:
305
+ f = 151
306
+ feature_dict_[effect][key] = scipy.signal.savgol_filter(feature_dict_[effect][key],
307
+ f, 1, mode='mirror')
308
+ elif effect == 'panning':
309
+ feature_dict_[effect][key] = scipy.signal.savgol_filter(feature_dict_[effect][key],
310
+ 501, 1, mode='mirror')
311
+ return feature_dict_
312
+
313
+
314
+ # compute "normalization" based on a single sample
315
+ def feature_matching(self, src_aud_path, ref_aud_path):
316
+ # compute mean features from reference audio
317
+ mean_feature = self.compute_mean(ref_aud_path, save_feat=False, single_file=True)
318
+ print(mean_feature)
319
+
320
+ src_aud, sr = sf.read(src_aud_path)
321
+ normalized_audio = self.normalize_audio(src_aud, 'mixture')
322
+
323
+ return normalized_audio
324
+
325
+
326
+
327
+ def lufs_normalize(x, sr, lufs, log=True):
328
+
329
+ # measure the loudness first
330
+ meter = pyloudnorm.Meter(sr) # create BS.1770 meter
331
+ loudness = meter.integrated_loudness(x+1e-10)
332
+ if log:
333
+ print("original loudness: ", loudness," max value: ", np.max(np.abs(x)))
334
+
335
+ loudness_normalized_audio = pyloudnorm.normalize.loudness(x, loudness, lufs)
336
+
337
+ maxabs_amp = np.maximum(1.0, 1e-6 + np.max(np.abs(loudness_normalized_audio)))
338
+ loudness_normalized_audio /= maxabs_amp
339
+
340
+ loudness = meter.integrated_loudness(loudness_normalized_audio)
341
+ if log:
342
+ print("new loudness: ", loudness," max value: ", np.max(np.abs(loudness_normalized_audio)))
+
+ return loudness_normalized_audio
modules/fx_utils.py ADDED
@@ -0,0 +1,308 @@
1
+ import warnings
2
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
3
+
4
+ import numpy as np
5
+ import scipy
6
+ import math
7
+ import librosa
8
+ import librosa.display
9
+ import fnmatch
10
+ import os
11
+ from functools import partial
12
+ import pyloudnorm
13
+ from scipy.signal import lfilter
14
+ from sklearn.metrics import mean_absolute_error, mean_squared_error
15
+ from sklearn.metrics.pairwise import paired_distances
16
+
17
+
18
+ import matplotlib.pyplot as plt
19
+
20
+ def db(x):
21
+ """Computes the decible energy of a signal"""
22
+ return 20*np.log10(np.sqrt(np.mean(np.square(x))))
23
+
24
+ def melspectrogram(y, mirror_pad=False):
25
+ """Compute melspectrogram feature extraction
26
+
27
+ Keyword arguments:
28
+ y -- input audio as a 1-D numpy array
29
+ mirror_pad -- pre- and post-pend mirrored signal segments
31
+
32
+ Returns freq x time
33
+
34
+
35
+ Assumes the input sampling rate is 22050Hz
36
+ """
37
+
38
+ # Extract mel.
39
+ fftsize = 1024
40
+ window = 1024
41
+ hop = 512
42
+ melBin = 128
43
+ sr = 22050
44
+
45
+ # mirror pad signal
46
+ # first embedding centered on time 0
47
+ # last embedding centered on end of signal
48
+ if mirror_pad:
49
+ y = np.insert(y, 0, y[0:int(half_frame_length_sec * sr)][::-1])
50
+ y = np.insert(y, len(y), y[-int(half_frame_length_sec * sr):][::-1])
51
+
52
+ S = librosa.core.stft(y,n_fft=fftsize,hop_length=hop,win_length=window)
53
+ X = np.abs(S)
54
+ mel_basis = librosa.filters.mel(sr,n_fft=fftsize,n_mels=melBin)
55
+ mel_S = np.dot(mel_basis,X)
56
+
57
+ # value log compression
58
+ mel_S = np.log10(1+10*mel_S)
59
+ mel_S = mel_S.astype(np.float32)
60
+
61
+
62
+ return mel_S
63
+
64
+
65
+ def getFilesPath(directory, extension):
66
+
67
+ n_path=[]
68
+ for path, subdirs, files in os.walk(directory):
69
+ for name in files:
70
+ if fnmatch.fnmatch(name, extension):
71
+ n_path.append(os.path.join(path,name))
72
+ n_path.sort()
73
+
74
+ return n_path
75
+
76
+
77
+
78
+ def getRandomTrim(x, length, pad=0, start=None):
79
+
80
+ length = length+pad
81
+ if x.shape[0] <= length:
82
+ x_ = x
83
+ while(x.shape[0] <= length):
84
+ x_ = np.concatenate((x_,x_))
85
+ else:
86
+ if start is None:
87
+ start = np.random.randint(0, x.shape[0]-length, size=None)
88
+ end = length+start
89
+ if end > x.shape[0]:
90
+ x_ = x[start:]
91
+ x_ = np.concatenate((x_, x[:length-x.shape[0]]))
92
+ else:
93
+ x_ = x[start:length+start]
94
+
95
+ return x_[:length]
96
+
97
+ def fadeIn(x, length=128):
98
+
99
+ w = scipy.signal.hann(length*2, sym=True)
100
+ w1 = w[0:length]
101
+ ones = np.ones(int(x.shape[0]-length))
102
+ w = np.append(w1, ones)
103
+
104
+ return x*w
105
+
106
+ def fadeOut(x, length=128):
107
+
108
+ w = scipy.signal.hann(length*2, sym=True)
109
+ w2 = w[length:length*2]
110
+ ones = np.ones(int(x.shape[0]-length))
111
+ w = np.append(ones, w2)
112
+
113
+ return x*w
114
+
115
+
116
+ def plotTimeFreq(audio, sr, n_fft=512, hop_length=128, ylabels=None):
117
+
118
+ n = len(audio)
119
+ # plt.figure(figsize=(14, 4*n))
120
+ colors = list(plt.cm.viridis(np.linspace(0,1,n)))
121
+
122
+ X = []
123
+ X_db = []
124
+ maxs = np.zeros((n,))
125
+ mins = np.zeros((n,))
126
+ maxs_t = np.zeros((n,))
127
+ for i, x in enumerate(audio):
128
+
129
+ if x.ndim == 2 and x.shape[-1] == 2:
130
+ x = librosa.core.to_mono(x.T)
131
+ X_ = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)
132
+ X_db_ = librosa.amplitude_to_db(abs(X_))
133
+ X.append(X_)
134
+ X_db.append(X_db_)
135
+ maxs[i] = np.max(X_db_)
136
+ mins[i] = np.min(X_db_)
137
+ maxs_t[i] = np.max(np.abs(x))
138
+ vmax = np.max(maxs)
139
+ vmin = np.min(mins)
140
+ tmax = np.max(maxs_t)
141
+ for i, x in enumerate(audio):
142
+
143
+ if x.ndim == 2 and x.shape[-1] == 2:
144
+ x = librosa.core.to_mono(x.T)
145
+
146
+ plt.subplot(n, 2, 2*i+1)
147
+ librosa.display.waveplot(x, sr=sr, color=colors[i])
148
+ if ylabels:
149
+ plt.ylabel(ylabels[i])
150
+
151
+ plt.ylim(-tmax,tmax)
152
+ plt.subplot(n, 2, 2*i+2)
153
+ librosa.display.specshow(X_db[i], sr=sr, x_axis='time', y_axis='log',
154
+ hop_length=hop_length, cmap='GnBu', vmax=vmax, vmin=vmin)
155
+ # plt.colorbar(format='%+2.0f dB')
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+ def slicing(x, win_length, hop_length, center = True, windowing = False, pad = 0):
165
+ # Pad the time series so that frames are centered
166
+ if center:
167
+ # x = np.pad(x, int((win_length-hop_length+pad) // 2), mode='constant')
168
+ x = np.pad(x, ((int((win_length-hop_length+pad)//2), int((win_length+hop_length+pad)//2)),), mode='constant')
169
+
170
+ # Window the time series.
171
+ y_frames = librosa.util.frame(x, frame_length=win_length, hop_length=hop_length)
172
+ if windowing:
173
+ window = scipy.signal.hann(win_length, sym=False)
174
+ else:
175
+ window = 1.0
176
+ f = []
177
+ for i in range(len(y_frames.T)):
178
+ f.append(y_frames.T[i]*window)
179
+ return np.float32(np.asarray(f))
180
+
181
+
182
+ def overlap(x, x_len, win_length, hop_length, windowing = True, rate = 1):
183
+ x = x.reshape(x.shape[0],x.shape[1]).T
184
+ if windowing:
185
+ window = scipy.signal.hann(win_length, sym=False)
186
+ rate = rate*hop_length/win_length
187
+ else:
188
+ window = 1
189
+ rate = 1
190
+ n_frames = x_len / hop_length
191
+ expected_signal_len = int(win_length + hop_length * (n_frames))
192
+ y = np.zeros(expected_signal_len)
193
+ for i in range(int(n_frames)):
194
+ sample = i * hop_length
195
+ w = x[:, i]
196
+ y[sample:(sample + win_length)] = y[sample:(sample + win_length)] + w*window
197
+ y = y[int(win_length // 2):-int(win_length // 2)]
198
+ return np.float32(y*rate)
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+ def highpassFiltering(x_list, f0, sr):
207
+
208
+ b1, a1 = scipy.signal.butter(4, f0/(sr/2),'highpass')
209
+ x_f = []
210
+ for x in x_list:
211
+ x_f_ = scipy.signal.filtfilt(b1, a1, x).copy(order='F')
212
+ x_f.append(x_f_)
213
+ return x_f
214
+
215
+ def lineartodB(x):
216
+ return 20*np.log10(x)
217
+ def dBtoLinear(x):
218
+ return np.power(10,x/20)
219
+
220
+ def lufs_normalize(x, sr, lufs, log=True):
221
+
222
+ # measure the loudness first
223
+ meter = pyloudnorm.Meter(sr) # create BS.1770 meter
224
+ loudness = meter.integrated_loudness(x+1e-10)
225
+ if log:
226
+ print("original loudness: ", loudness," max value: ", np.max(np.abs(x)))
227
+
228
+ loudness_normalized_audio = pyloudnorm.normalize.loudness(x, loudness, lufs)
229
+
230
+ maxabs_amp = np.maximum(1.0, 1e-6 + np.max(np.abs(loudness_normalized_audio)))
231
+ loudness_normalized_audio /= maxabs_amp
232
+
233
+ loudness = meter.integrated_loudness(loudness_normalized_audio)
234
+ if log:
235
+ print("new loudness: ", loudness," max value: ", np.max(np.abs(loudness_normalized_audio)))
236
+
237
+
238
+ return loudness_normalized_audio
239
+
240
+ import soxbindings as sox
241
+
242
+ def lufs_normalize_compand(x, sr, lufs):
243
+
244
+ tfm = sox.Transformer()
245
+ tfm.compand(attack_time = 0.001,
246
+ decay_time = 0.01,
247
+ soft_knee_db = 1.0,
248
+ tf_points = [(-70, -70), (-0.1, -20), (0, 0)])
249
+
250
+ x = tfm.build_array(input_array=x, sample_rate_in=sr).astype(np.float32)
251
+
252
+ # measure the loudness first
253
+ meter = pyloudnorm.Meter(sr) # create BS.1770 meter
254
+ loudness = meter.integrated_loudness(x)
255
+ print("original loudness: ", loudness," max value: ", np.max(np.abs(x)))
256
+
257
+ loudness_normalized_audio = pyloudnorm.normalize.loudness(x, loudness, lufs)
258
+
259
+ maxabs_amp = np.maximum(1.0, 1e-6 + np.max(np.abs(loudness_normalized_audio)))
260
+ loudness_normalized_audio /= maxabs_amp
261
+
262
+ loudness = meter.integrated_loudness(loudness_normalized_audio)
263
+ print("new loudness: ", loudness," max value: ", np.max(np.abs(loudness_normalized_audio)))
264
+
265
+ return loudness_normalized_audio
266
+
267
+
268
+
269
+
270
+
271
+ def getDistances(x,y):
272
+
273
+ distances = {}
274
+ distances['mae'] = mean_absolute_error(x, y)
275
+ distances['mse'] = mean_squared_error(x, y)
276
+ distances['euclidean'] = np.mean(paired_distances(x, y, metric='euclidean'))
277
+ distances['manhattan'] = np.mean(paired_distances(x, y, metric='manhattan'))
278
+ distances['cosine'] = np.mean(paired_distances(x, y, metric='cosine'))
279
+
280
+ distances['mae'] = round(distances['mae'], 5)
281
+ distances['mse'] = round(distances['mse'], 5)
282
+ distances['euclidean'] = round(distances['euclidean'], 5)
283
+ distances['manhattan'] = round(distances['manhattan'], 5)
284
+ distances['cosine'] = round(distances['cosine'], 5)
285
+
286
+ return distances
287
+
288
+ def getMFCC(x, sr, mels=128, mfcc=13, mean_norm=False):
289
+
290
+ melspec = librosa.feature.melspectrogram(y=x, sr=sr, S=None,
291
+ n_fft=1024, hop_length=256,
292
+ n_mels=mels, power=2.0)
293
+ melspec_dB = librosa.power_to_db(melspec, ref=np.max)
294
+ mfcc = librosa.feature.mfcc(S=melspec_dB, sr=sr, n_mfcc=mfcc)
295
+ if mean_norm:
296
+ mfcc -= (np.mean(mfcc, axis=0))
297
+ return mfcc
298
+
299
+
300
+ def getMSE_MFCC(y_true, y_pred, sr, mels=128, mfcc=13, mean_norm=False):
301
+
302
+ ratio = np.mean(np.abs(y_true))/np.mean(np.abs(y_pred))
303
+ y_pred = ratio*y_pred
304
+
305
+ y_mfcc = getMFCC(y_true, sr, mels=mels, mfcc=mfcc, mean_norm=mean_norm)
306
+ z_mfcc = getMFCC(y_pred, sr, mels=mels, mfcc=mfcc, mean_norm=mean_norm)
307
+
308
+ return getDistances(y_mfcc[:,:], z_mfcc[:,:])
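A short, hedged example of `lufs_normalize` above (file path and target loudness are hypothetical; import path assumes this commit's layout):

import soundfile as sf
from modules.fx_utils import lufs_normalize  # import path assumed

x, sr = sf.read('track.wav')                 # hypothetical input file
y = lufs_normalize(x, sr, lufs=-14.0)        # normalize to roughly -14 LUFS; scaled back down if it would clip
sf.write('track_-14lufs.wav', y, sr)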
modules/normalization_imager.py ADDED
@@ -0,0 +1,123 @@
1
+ """
2
+ Implementation of the normalization process of stereo-imaging and panning effects
3
+ """
4
+ import numpy as np
5
+ import sys
6
+ import os
7
+
8
+ currentdir = os.path.dirname(os.path.realpath(__file__))
9
+ sys.path.append(currentdir)
10
+ from common_audioeffects import AugmentationChain, Haas
11
+
12
+
13
+ '''
14
+ ### normalization algorithm for stereo imaging and panning effects ###
15
+ process :
16
+ 1. inputs 2-channeled audio
17
+ 2. apply Haas effects if the input audio is almost mono
18
+ 3. normalize mid-side channels according to target precomputed feature value
19
+ 4. normalize left-right channels 50-50
20
+ 5. normalize mid-side channels again
21
+ '''
22
+ def normalize_imager(data, \
23
+ target_side_mid_bal=0.9, \
24
+ mono_threshold=0.95, \
25
+ sr=44100, \
26
+ eps=1e-04, \
27
+ verbose=False):
28
+
29
+ # to mid-side channels
30
+ mid, side = lr_to_ms(data[:,0], data[:,1])
31
+
32
+ if verbose:
33
+ print_balance(data[:,0], data[:,1])
34
+ print_balance(mid, side)
35
+ print()
36
+
37
+ # apply mid-side weights according to energy
38
+ mid_e, side_e = np.sum(mid**2), np.sum(side**2)
39
+ total_e = mid_e + side_e
40
+ # apply haas effect to almost-mono signal
41
+ if mid_e/total_e > mono_threshold:
42
+ aug_chain = AugmentationChain(fxs=[(Haas(sample_rate=sr), 1, True)])
43
+ data = aug_chain([data])[0]
44
+ mid, side = lr_to_ms(data[:,0], data[:,1])
45
+
46
+ if verbose:
47
+ print_balance(data[:,0], data[:,1])
48
+ print_balance(mid, side)
49
+ print()
50
+
51
+ # normalize mid-side channels (stereo imaging)
52
+ new_mid, new_side = process_balance(mid, side, tgt_e1_bal=target_side_mid_bal, eps=eps)
53
+ left, right = ms_to_lr(new_mid, new_side)
54
+ imaged = np.stack([left, right], 1)
55
+
56
+ if verbose:
57
+ print_balance(new_mid, new_side)
58
+ print_balance(left, right)
59
+ print()
60
+
61
+ # normalize panning to have the balance of left-right channels 50-50
62
+ left, right = process_balance(left, right, tgt_e1_bal=0.5, eps=eps)
63
+ mid, side = lr_to_ms(left, right)
64
+
65
+ if verbose:
66
+ print_balance(mid, side)
67
+ print_balance(left, right)
68
+ print()
69
+
70
+ # normalize again mid-side channels (stereo imaging)
71
+ new_mid, new_side = process_balance(mid, side, tgt_e1_bal=target_side_mid_bal, eps=eps)
72
+ left, right = ms_to_lr(new_mid, new_side)
73
+ imaged = np.stack([left, right], 1)
74
+
75
+ if verbose:
76
+ print_balance(new_mid, new_side)
77
+ print_balance(left, right)
78
+ print()
79
+
80
+ return imaged
81
+
82
+
83
+ # balance out 2 input data's energy according to given balance
84
+ # tgt_e1_bal range = [0.0, 1.0]
85
+ # tgt_e2_bal = 1.0 - tgt_e1_bal_range
86
+ def process_balance(data_1, data_2, tgt_e1_bal=0.5, eps=1e-04):
87
+
88
+ e_1, e_2 = np.sum(data_1**2), np.sum(data_2**2)
89
+ total_e = e_1 + e_2
90
+
91
+ tgt_1_gain = np.sqrt(tgt_e1_bal * total_e / (e_1 + eps))
92
+
93
+ new_data_1 = data_1 * tgt_1_gain
94
+ new_e_1 = e_1 * (tgt_1_gain ** 2)
95
+ left_e_1 = total_e - new_e_1
96
+ tgt_2_gain = np.sqrt(left_e_1 / (e_2 + 1e-3))
97
+ new_data_2 = data_2 * tgt_2_gain
98
+
99
+ return new_data_1, new_data_2
100
+
101
+
102
+ # left-right channeled signal to mid-side signal
103
+ def lr_to_ms(left, right):
104
+ mid = left + right
105
+ side = left - right
106
+ return mid, side
107
+
108
+
109
+ # mid-side channeled signal to left-right signal
110
+ def ms_to_lr(mid, side):
111
+ left = (mid + side) / 2
112
+ right = (mid - side) / 2
113
+ return left, right
114
+
115
+
116
+ # print energy balance of 2 inputs
117
+ def print_balance(data_1, data_2, verbose=True):
118
+ e_1, e_2 = np.sum(data_1**2), np.sum(data_2**2)
119
+ total_e = e_1 + e_2
120
+ if verbose:
121
+ print(total_e, e_1/total_e, e_2/total_e)
122
+ return e_1/total_e, e_2/total_e
123
+
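A small standalone check of `process_balance` above: the outputs hit the requested energy balance while keeping (approximately) the same total energy (import path assumes this commit's layout):

import numpy as np
from modules.normalization_imager import process_balance, lr_to_ms  # import path assumed

rng = np.random.default_rng(0)
left, right = rng.standard_normal(44100), 0.3 * rng.standard_normal(44100)
mid, side = lr_to_ms(left, right)

new_mid, new_side = process_balance(mid, side, tgt_e1_bal=0.9)
e_mid, e_side = np.sum(new_mid**2), np.sum(new_side**2)
print(round(e_mid / (e_mid + e_side), 3))                                      # ~0.9
print(np.isclose(e_mid + e_side, np.sum(mid**2) + np.sum(side**2), rtol=1e-3))  # True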
modules/utils_data_normalization.py ADDED
@@ -0,0 +1,992 @@
1
+ import os
2
+
3
+ import sys
4
+ import time
5
+ import numpy as np
6
+ import scipy
7
+ import librosa
8
+ import pyloudnorm as pyln
9
+
10
+ sys.setrecursionlimit(int(1e6))
11
+
12
+ import sklearn
13
+
14
+ currentdir = os.path.dirname(os.path.realpath(__file__))
15
+ sys.path.append(currentdir)
16
+ from common_miscellaneous import compute_stft, compute_istft
17
+ from common_audioeffects import Panner, Compressor, AugmentationChain, ConvolutionalReverb, Equaliser, AlgorithmicReverb
18
+ import fx_utils
19
+
20
+ import soundfile as sf
21
+ import aubio
22
+
23
+ import time
24
+
25
+ import warnings
26
+
27
+ import torch
28
+ import torchaudio.functional as F
29
+
30
+ # Functions
31
+
32
+ def print_dict(dict_):
33
+ for i in dict_:
34
+ print(i)
35
+ for j in dict_[i]:
36
+ print('\t', j)
37
+
38
+ def amp_to_db(x):
39
+ return 20*np.log10(x + 1e-30)
40
+
41
+ def db_to_amp(x):
42
+ return 10**(x/20)
43
+
44
+ def get_running_stats(x, features, N=20):
45
+ mean = []
46
+ std = []
47
+ for i in range(len(features)):
48
+ mean_, std_ = running_mean_std(x[:,i], N)
49
+ mean.append(mean_)
50
+ std.append(std_)
51
+ mean = np.asarray(mean)
52
+ std = np.asarray(std)
53
+
54
+ return mean, std
55
+
56
+ def running_mean_std(x, N):
57
+
58
+ with warnings.catch_warnings():
59
+ warnings.simplefilter("ignore", category=RuntimeWarning)
60
+ cumsum = np.cumsum(np.insert(x, 0, 0))
61
+ cumsum2 = np.cumsum(np.insert(x**2, 0, 0))
62
+ mean = (cumsum[N:] - cumsum[:-N]) / float(N)
63
+
64
+ std = np.sqrt(((cumsum2[N:] - cumsum2[:-N]) / N) - (mean * mean))
65
+
66
+ return mean, std
67
+
68
+ def get_eq_matching(audio_t, ref_spec, sr=44100, n_fft=65536, hop_length=16384,
69
+ min_db=-50, ntaps=101, lufs=-30):
70
+
71
+ audio_t = np.copy(audio_t)
72
+ max_db = amp_to_db(np.max(np.abs(audio_t)))
73
+ if max_db > min_db:
74
+
75
+ audio_t = fx_utils.lufs_normalize(audio_t, sr, lufs, log=False)
76
+ audio_D = compute_stft(np.expand_dims(audio_t, 1),
77
+ hop_length,
78
+ n_fft,
79
+ np.sqrt(np.hanning(n_fft+1)[:-1]))
80
+ audio_D = np.abs(audio_D)
81
+ audio_D_avg = np.mean(audio_D, axis=0)[0]
82
+
83
+ m = ref_spec.shape[0]
84
+
85
+ Ts = 1.0/sr # sampling interval
86
+ n = m # length of the signal
87
+ kk = np.arange(n)
88
+ T = n/sr
89
+ frq = kk/T # two sides frequency range
90
+ frq /=2
91
+
92
+ diff_eq = amp_to_db(ref_spec)-amp_to_db(audio_D_avg)
93
+ diff_eq = db_to_amp(diff_eq)
94
+ diff_eq = np.sqrt(diff_eq)
95
+
96
+ diff_filter = scipy.signal.firwin2(ntaps,
97
+ frq/np.max(frq),
98
+ diff_eq,
99
+ nfreqs=None, window='hamming',
100
+ nyq=None, antisymmetric=False)
101
+
102
+ output = scipy.signal.filtfilt(diff_filter, 1, audio_t,
103
+ axis=-1, padtype='odd', padlen=None,
104
+ method='pad', irlen=None)
105
+
106
+ else:
107
+ output = audio_t
108
+
109
+ return output
110
+
111
+ def get_eq_matching_gpu(audio_t, ref_spec, sr=44100, n_fft=65536, hop_length=16384,
112
+ min_db=-50, ntaps=101, lufs=-30):
113
+
114
+ audio_t = np.copy(audio_t)
115
+ max_db = amp_to_db(np.max(np.abs(audio_t)))
116
+ if max_db > min_db:
117
+
118
+
119
+ start_time = time.time()
120
+
121
+ audio_t = fx_utils.lufs_normalize(audio_t, sr, lufs, log=False)
122
+ # audio_D = compute_stft(np.expand_dims(audio_t, 1),
123
+ # hop_length,
124
+ # n_fft,
125
+ # np.sqrt(np.hanning(n_fft+1)[:-1]))
126
+ audio_D = compute_stft(audio_t,
127
+ hop_length,
128
+ n_fft,
129
+ np.sqrt(np.hanning(n_fft+1)[:-1]))
130
+ audio_D = np.abs(audio_D)
131
+ # audio_D_avg = np.mean(audio_D, axis=0)
132
+ audio_D_avg = np.mean(audio_D, axis=0)[0]
133
+
134
+ m = ref_spec.shape[0]
135
+
136
+ Ts = 1.0/sr # sampling interval
137
+ n = m # length of the signal
138
+ kk = np.arange(n)
139
+ T = n/sr
140
+ frq = kk/T # two sides frequency range
141
+ frq /=2
142
+
143
+ diff_eq_l = amp_to_db(ref_spec)-amp_to_db(audio_D_avg)
144
+ diff_eq_l = db_to_amp(diff_eq_l)
145
+ diff_eq_l = np.sqrt(diff_eq_l)
146
+ diff_eq_r = amp_to_db(ref_spec)-amp_to_db(audio_D_avg)
147
+ diff_eq_r = db_to_amp(diff_eq_r)
148
+ diff_eq_r = np.sqrt(diff_eq_r)
149
+
150
+ diff_filter_l = scipy.signal.firwin2(ntaps,
151
+ frq/np.max(frq),
152
+ diff_eq_l,
153
+ nfreqs=None, window='hamming',
154
+ nyq=None, antisymmetric=False)
155
+ diff_filter_r = scipy.signal.firwin2(ntaps,
156
+ frq/np.max(frq),
157
+ diff_eq_r,
158
+ nfreqs=None, window='hamming',
159
+ nyq=None, antisymmetric=False)
160
+ diff_filter = np.stack((diff_filter_l, diff_filter_r), axis=0)
161
+
162
+ # output = scipy.signal.filtfilt(diff_filter, 1, audio_t,
163
+ # axis=-1, padtype='odd', padlen=None,
164
+ # method='pad', irlen=None)
165
+
166
+ print(f"\t\tall previous: {time.time()-start_time}")
167
+
168
+ start_time = time.time()
169
+
170
+ # device = torch.cuda()
171
+ audio_t = torch.from_numpy(audio_t.transpose()).float().cuda()
172
+ diff_filter = torch.from_numpy(diff_filter).float().cuda()
173
+ denom_coef = torch.ones(diff_filter.size()).cuda()
174
+ print(f'input to gpu - audio shape: {audio_t.shape}')
175
+ # audio_t = F.filtfilt(waveform=audio_t, a_coeffs=denom_coef, b_coeffs=diff_filter, clamp=False).transpose()
176
+ audio_t = F.filtfilt(waveform=audio_t, a_coeffs=denom_coef, b_coeffs=diff_filter, clamp=False)
177
+ audio_t = audio_t.transpose(1, 0)
178
+ print(audio_t.shape)
179
+ print('filtered')
180
+ print(f"\t\tgpu filtfilt: {time.time()-start_time}")
181
+ print(torch.mean(audio_t))
182
+ output = audio_t.detach()
183
+ print(f"\t\t1gpu filtfilt: {time.time()-start_time}")
184
+ output = audio_t.cpu()
185
+ print(f"\t\t2gpu filtfilt: {time.time()-start_time}")
186
+ output = audio_t.detach().cpu().numpy()
187
+ print(f"\t\t3gpu filtfilt: {time.time()-start_time}")
188
+
189
+
190
+ else:
191
+ output = audio_t
192
+
193
+ return output
194
+
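+ # Stereo Panning Spectrum: phi is a per-bin inter-channel similarity in [0, 1] and
+ # SPS = (1 - phi) * sign(phi_l - phi_r) encodes how far and to which side each bin is panned.
+ # Returns time-averaged phi/SPS (optionally Savitzky-Golay smoothed) plus the frame-wise values.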
195
+ def get_SPS(x, n_fft=2048, hop_length=1024, smooth=False, frames=False):
196
+
197
+ x = np.copy(x)
198
+ eps = 1e-20
199
+
200
+ audio_D = compute_stft(x,
201
+ hop_length,
202
+ n_fft,
203
+ np.sqrt(np.hanning(n_fft+1)[:-1]))
204
+
205
+ audio_D_l = np.abs(audio_D[:, 0, :] + eps)
206
+ audio_D_r = np.abs(audio_D[:, 1, :] + eps)
207
+
208
+ phi = 2 * (np.abs(audio_D_l*np.conj(audio_D_r)))/(np.abs(audio_D_l)**2+np.abs(audio_D_r)**2)
209
+
210
+ phi_l = np.abs(audio_D_l*np.conj(audio_D_r))/(np.abs(audio_D_l)**2)
211
+ phi_r = np.abs(audio_D_r*np.conj(audio_D_l))/(np.abs(audio_D_r)**2)
212
+ delta = phi_l - phi_r
213
+ delta_ = np.sign(delta)
214
+ SPS = (1-phi)*delta_
215
+
216
+ phi_mean = np.mean(phi, axis=0)
217
+ if smooth:
218
+ phi_mean = scipy.signal.savgol_filter(phi_mean, 501, 1, mode='mirror')
219
+
220
+ SPS_mean = np.mean(SPS, axis=0)
221
+ if smooth:
222
+ SPS_mean = scipy.signal.savgol_filter(SPS_mean, 501, 1, mode='mirror')
223
+
224
+
225
+ return SPS_mean, phi_mean, SPS, phi
226
+
227
+
228
+ def get_mean_side(sps, freqs=[50,2500], sr=44100, n_fft=2048):
229
+
230
+ sign = np.sign(sps+ 1e-10)
231
+
232
+ idx1 = freqs[0]
233
+ idx2 = freqs[1]
234
+
235
+ f1 = int(np.floor(idx1*n_fft/sr))
236
+ f2 = int(np.floor(idx2*n_fft/sr))
237
+
238
+ sign_mean = np.mean(sign[f1:f2])/np.abs(np.mean(sign[f1:f2]))
239
+
240
+
241
+ return sign_mean
242
+
243
+ def get_panning_param_values(phi, side):
244
+
245
+ p = np.zeros_like(phi)
246
+
247
+ g = (np.clip(phi+1e-30, 0, 1))/2
248
+
249
+ for i, g_ in enumerate(g):
250
+
251
+ if side > 0:
252
+ p[i] = 1 - g_
253
+
254
+ elif side < 0:
255
+ p[i] = g_
256
+
257
+ else:
258
+ p[i] = 0.5
259
+
260
+ g_l = 1-p
261
+ g_r = p
262
+
263
+ return p, [g_l, g_r]
264
+
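+ # Panning matching: estimates the dominant side of the input, applies an initial static pan, then rescales
+ # the left/right STFT magnitudes per bin (per frame when frames=True) towards the gains implied by ref_phi;
+ # bins above max_freq_pan and frames quieter than min_db_f are left untouched.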
265
+ def get_panning_matching(audio, ref_phi,
266
+ sr=44100, n_fft=2048, hop_length=1024,
267
+ min_db_f=-10, max_freq_pan=16000, frames=True):
268
+
269
+ eps = 1e-20
270
+ window = np.sqrt(np.hanning(n_fft+1)[:-1])
271
+ audio = np.copy(audio)
272
+ audio_t = np.pad(audio, ((n_fft, n_fft), (0, 0)), mode='constant')
273
+
274
+ sps_mean_, phi_mean_, _, _ = get_SPS(audio_t, n_fft=n_fft, hop_length=hop_length, smooth=True)
275
+
276
+ side = get_mean_side(sps_mean_, sr=sr, n_fft=n_fft)
277
+
278
+ if side > 0:
279
+ alpha = 0.7
280
+ else:
281
+ alpha = 0.3
282
+
283
+ processor = Panner()
284
+ processor.parameters.pan.value = alpha
285
+ processor.parameters.pan_law.value = 'linear'
286
+ processor.update()
287
+ audio_t_ = processor.process(audio_t)
288
+
289
+ sps_mean_, phi_mean, sps_frames, phi_frames = get_SPS(audio_t_, n_fft=n_fft,
290
+ hop_length=hop_length,
291
+ smooth=True, frames=frames)
292
+
293
+ if frames:
294
+
295
+ p_i_ = []
296
+ g_i_ = []
297
+ p_ref = []
298
+ g_ref = []
299
+ for i in range(len(sps_frames)):
300
+ sps_ = sps_frames[i]
301
+ phi_ = phi_frames[i]
302
+ p_, g_ = get_panning_param_values(phi_, side)
303
+ p_i_.append(p_)
304
+ g_i_.append(g_)
305
+ p_, g_ = get_panning_param_values(ref_phi, side)
306
+ p_ref.append(p_)
307
+ g_ref.append(g_)
308
+ ratio = (np.asarray(g_ref)/(np.asarray(g_i_)+eps))
309
+ g_l = ratio[:,0,:]
310
+ g_r = ratio[:,1,:]
311
+
312
+
313
+ else:
314
+ p, g = get_panning_param_values(ref_phi, side)
315
+ p_i, g_i = get_panning_param_values(phi_mean, side)
316
+ ratio = (np.asarray(g)/np.asarray(g_i))
317
+ g_l = ratio[0]
318
+ g_r = ratio[1]
319
+
320
+ audio_new_D = compute_stft(audio_t_,
321
+ hop_length,
322
+ n_fft,
323
+ window)
324
+
325
+ audio_new_D_mono = audio_new_D.copy()
326
+ audio_new_D_mono = audio_new_D_mono[:, 0, :] + audio_new_D_mono[:, 1, :]
327
+ audio_new_D_mono = np.abs(audio_new_D_mono)
328
+
329
+ audio_new_D_phase = np.angle(audio_new_D)
330
+ audio_new_D = np.abs(audio_new_D)
331
+
332
+ audio_new_D_l = audio_new_D[:, 0, :]
333
+ audio_new_D_r = audio_new_D[:, 1, :]
334
+
335
+ if frames:
336
+ for i, frame in enumerate(audio_new_D_mono):
337
+ max_db = amp_to_db(np.max(np.abs(frame)))
338
+ if max_db < min_db_f:
339
+ g_r[i] = np.ones_like(frame)
340
+ g_l[i] = np.ones_like(frame)
341
+
342
+ idx1 = max_freq_pan
343
+ f1 = int(np.floor(idx1*n_fft/sr))
344
+ ones = np.ones_like(g_l)
345
+ g_l[f1:] = ones[f1:]
346
+ g_r[f1:] = ones[f1:]
347
+
348
+ audio_new_D_l = audio_new_D_l*g_l
349
+ audio_new_D_r = audio_new_D_r*g_r
350
+
351
+ audio_new_D_l = np.expand_dims(audio_new_D_l, 0)
352
+ audio_new_D_r = np.expand_dims(audio_new_D_r, 0)
353
+
354
+ audio_new_D_ = np.concatenate((audio_new_D_l,audio_new_D_r))
355
+
356
+ audio_new_D_ = np.moveaxis(audio_new_D_, 0, 1)
357
+
358
+ audio_new_D_ = audio_new_D_ * (np.cos(audio_new_D_phase) + np.sin(audio_new_D_phase)*1j)
359
+
360
+ audio_new_t = compute_istft(audio_new_D_,
361
+ hop_length,
362
+ window)
363
+
364
+ audio_new_t = audio_new_t[n_fft:n_fft+audio.shape[0]]
365
+
366
+ return audio_new_t
367
+
368
+
369
+
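+ # Onset-based peak statistics: onsets are detected with aubio ('hfc'), the absolute peak between consecutive
+ # onsets is taken, peaks above the given percentile are kept, and their mean and standard deviation in dB
+ # are returned (averaged across channels); returns None when no onsets are found.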
370
+ def get_mean_peak(audio, sr=44100, true_peak=False, n_mels=128, percentile=75):
371
+
372
+ # Returns mean peak value in dB after the 1Q is removed.
373
+ # Input should be in the shape samples x channel
374
+
375
+ audio_ = audio
376
+ window_size = 2**10 # FFT size
377
+ hop_size = window_size
378
+
379
+ peak = []
380
+ std = []
381
+ for ch in range(audio_.shape[-1]):
382
+ x = np.ascontiguousarray(audio_[:, ch])
383
+
384
+ if true_peak:
385
+ x = librosa.resample(x, sr, 4*sr)
386
+ sr = 4*sr
387
+ window_size = 4*window_size
388
+ hop_size = 4*hop_size
389
+
390
+ onset_func = aubio.onset('hfc', buf_size=window_size, hop_size=hop_size, samplerate=sr)
391
+
392
+ frames = np.float32(librosa.util.frame(x, frame_length=window_size, hop_length=hop_size))
393
+
394
+ onset_times = []
395
+ for frame in frames.T:
396
+
397
+ if onset_func(frame):
398
+
399
+ onset_time = onset_func.get_last()
400
+ onset_times.append(onset_time)
401
+
402
+ samples=[]
403
+ if onset_times:
404
+ for i, p in enumerate(onset_times[:-1]):
405
+ samples.append(onset_times[i]+np.argmax(np.abs(x[onset_times[i]:onset_times[i+1]])))
406
+ samples.append(onset_times[-1]+np.argmax(np.abs(x[onset_times[-1]:])))
407
+
408
+ p_value = []
409
+ for p in samples:
410
+ p_ = amp_to_db(np.abs(x[p]))
411
+ p_value.append(p_)
412
+ p_value_=[]
413
+ for p in p_value:
414
+ if p > np.percentile(p_value, percentile):
415
+ p_value_.append(p)
416
+ if p_value_:
417
+ peak.append(np.mean(p_value_))
418
+ std.append(np.std(p_value_))
419
+ elif p_value:
420
+ peak.append(np.mean(p_value))
421
+ std.append(np.std(p_value))
422
+ else:
423
+ return None
424
+ return [np.mean(peak), np.mean(std)]
425
+
426
+ def compress(processor, audio, sr, th, ratio, attack, release):
427
+
428
+ eps = 1e-20
429
+ x = audio
430
+
431
+ processor.parameters.threshold.value = th
432
+ processor.parameters.ratio.value = ratio
433
+ processor.parameters.attack_time.value = attack
434
+ processor.parameters.release_time.value = release
435
+ processor.update()
436
+ output = processor.process(x)
437
+
438
+ if np.max(np.abs(output)) >= 1.0:
439
+ output = np.clip(output, -1.0, 1.0)
440
+
441
+ return output
442
+
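+ # Dynamics matching: the input is peak-normalised and its mean peak measured; if it already lies within
+ # ref_peak +/- ref_std it is returned as is, otherwise a grid search over thresholds and ratios applies
+ # downward compression (peak too high) or, when expander=True, upward expansion with ratio 1/rt (peak too
+ # low) until the mean peak falls inside the reference range.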
443
+ def get_comp_matching(audio,
444
+ ref_peak, ref_std,
445
+ ratio, attack, release, sr=44100,
446
+ min_db=-50, comp_peak_norm=-10.0,
447
+ min_th=-40, max_ratio=20, n_mels=128,
448
+ true_peak=False, percentile=75, expander=True):
449
+
450
+ x = audio.copy()
451
+
452
+ if x.ndim < 2:
453
+ x = np.expand_dims(x, 1)
454
+
455
+ max_db = amp_to_db(np.max(np.abs(x)))
456
+ if max_db > min_db:
457
+
458
+ x = pyln.normalize.peak(x, comp_peak_norm)
459
+
460
+ peak, std = get_mean_peak(x, sr,
461
+ n_mels=n_mels,
462
+ true_peak=true_peak,
463
+ percentile=percentile)
464
+
465
+ if peak > (ref_peak - ref_std) and peak < (ref_peak + ref_std):
466
+ return x
467
+
468
+ # DownwardCompress
469
+ elif peak > (ref_peak - ref_std):
470
+ processor = Compressor(sample_rate=sr)
471
+ # print('compress')
472
+ ratios = np.linspace(ratio, max_ratio, max_ratio-ratio+1)
473
+ ths = np.linspace(-1-9, min_th, 2*np.abs(min_th)-1-18)
474
+ for rt in ratios:
475
+ for th in ths:
476
+ y = compress(processor, x, sr, th, rt, attack, release)
477
+ peak, std = get_mean_peak(y, sr,
478
+ n_mels=n_mels,
479
+ true_peak=true_peak,
480
+ percentile=percentile)
481
+ if peak < (ref_peak + ref_std):
482
+ break
483
+ else:
484
+ continue
485
+ break
486
+
487
+ return y
488
+
489
+ # Upward Expand
490
+ elif peak < (ref_peak + ref_std):
491
+
492
+ if expander:
493
+ processor = Compressor(sample_rate=sr)
494
+ ratios = np.linspace(ratio, max_ratio, max_ratio-ratio+1)
495
+ ths = np.linspace(-1, min_th, 2*np.abs(min_th)-1)[::-1]
496
+
497
+ for rt in ratios:
498
+ for th in ths:
499
+ y = compress(processor, x, sr, th, 1/rt, attack, release)
500
+ peak, std = get_mean_peak(y, sr,
501
+ n_mels=n_mels,
502
+ true_peak=true_peak,
503
+ percentile=percentile)
504
+ if peak > (ref_peak - ref_std):
505
+ break
506
+ else:
507
+ continue
508
+ break
509
+
510
+ return y
511
+
512
+ else:
513
+ return x
514
+ else:
515
+ return x
516
+
517
+
518
+
519
+ # REVERB
520
+
521
+
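+ # Reverb send: an Equaliser (gain taken from eq_parameters) feeding either a ConvolutionalReverb (when
+ # impulse responses are supplied) or an AlgorithmicReverb, wrapped in an AugmentationChain with the given
+ # shuffle/parallel options.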
522
+ def get_reverb_send(audio, eq_parameters, rv_parameters, impulse_responses=None,
523
+ eq_prob=1.0, rv_prob=1.0, parallel=True, shuffle=False, sr=44100, bands=['low_shelf', 'high_shelf']):
524
+
525
+ x = audio.copy()
526
+
527
+ if x.ndim < 2:
528
+ x = np.expand_dims(x, 1)
529
+
530
+ channels = x.shape[-1]
531
+ eq_gain = eq_parameters.low_shelf_gain.value
532
+
533
+
534
+ eq = Equaliser(n_channels=channels,
535
+ sample_rate=sr,
536
+ gain_range=(eq_gain, eq_gain),
537
+ bands=bands,
538
+ hard_clip=False,
539
+ name='Equaliser', parameters=eq_parameters)
540
+ eq.randomize()
541
+
542
+ if impulse_responses:
543
+
544
+ reverb = ConvolutionalReverb(impulse_responses=impulse_responses,
545
+ sample_rate=sr,
546
+ parameters=rv_parameters)
547
+
548
+ else:
549
+
550
+ reverb = AlgorithmicReverb(sample_rate=sr,
551
+ parameters=rv_parameters)
552
+
553
+ reverb.randomize()
554
+
555
+ fxchain = AugmentationChain([
556
+ (eq, eq_prob, False),
557
+ (reverb, rv_prob, False)
558
+ ],
559
+ shuffle=shuffle, parallel=parallel)
560
+
561
+ output = fxchain(x)
562
+
563
+ return output
564
+
565
+
566
+
567
+ # FUNCTIONS TO COMPUTE FEATURES
568
+
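+ # Loudness distance: MAPE between target and output integrated loudness (BS.1770 via pyloudnorm) and
+ # between their sample peaks in dB. args_ = (audio_out, audio_tar, idx, sr).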
569
+ def compute_loudness_features(args_):
570
+
571
+ audio_out_ = args_[0]
572
+ audio_tar_ = args_[1]
573
+ idx = args_[2]
574
+ sr = args_[3]
575
+
576
+ loudness_ = {key:[] for key in ['d_lufs', 'd_peak',]}
577
+
578
+ peak_tar = np.max(np.abs(audio_tar_))
579
+ peak_tar_db = 20.0 * np.log10(peak_tar)
580
+
581
+ peak_out = np.max(np.abs(audio_out_))
582
+ peak_out_db = 20.0 * np.log10(peak_out)
583
+
584
+ with warnings.catch_warnings():
585
+ warnings.simplefilter("ignore", category=RuntimeWarning)
586
+ meter = pyln.Meter(sr) # create BS.1770 meter
587
+ loudness_tar = meter.integrated_loudness(audio_tar_)
588
+ loudness_out = meter.integrated_loudness(audio_out_)
589
+
590
+ loudness_['d_lufs'].append(sklearn.metrics.mean_absolute_percentage_error([loudness_tar], [loudness_out]))
591
+ loudness_['d_peak'].append(sklearn.metrics.mean_absolute_percentage_error([peak_tar_db], [peak_out_db]))
592
+
593
+ return loudness_
594
+
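+ # Spectral distance: both signals are peak-normalised, running means of spectral centroid, bandwidth,
+ # contrast (low/mid/high), rolloff and flatness are computed per channel, and the MAPE of each against the
+ # target is reported together with their overall mean ('mape_mean').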
595
+ def compute_spectral_features(args_):
596
+
597
+ audio_out_ = args_[0]
598
+ audio_tar_ = args_[1]
599
+ idx = args_[2]
600
+ sr = args_[3]
601
+ fft_size = args_[4]
602
+ hop_length = args_[5]
603
+ channels = args_[6]
604
+
605
+ audio_out_ = pyln.normalize.peak(audio_out_, -1.0)
606
+ audio_tar_ = pyln.normalize.peak(audio_tar_, -1.0)
607
+
608
+ spec_out_ = compute_stft(audio_out_,
609
+ hop_length,
610
+ fft_size,
611
+ np.sqrt(np.hanning(fft_size+1)[:-1]))
612
+ spec_out_ = np.transpose(spec_out_, axes=[1, -1, 0])
613
+ spec_out_ = np.abs(spec_out_)
614
+
615
+ spec_tar_ = compute_stft(audio_tar_,
616
+ hop_length,
617
+ fft_size,
618
+ np.sqrt(np.hanning(fft_size+1)[:-1]))
619
+ spec_tar_ = np.transpose(spec_tar_, axes=[1, -1, 0])
620
+ spec_tar_ = np.abs(spec_tar_)
621
+
622
+ spectral_ = {key:[] for key in ['centroid_mean',
623
+ 'bandwidth_mean',
624
+ 'contrast_l_mean',
625
+ 'contrast_m_mean',
626
+ 'contrast_h_mean',
627
+ 'rolloff_mean',
628
+ 'flatness_mean',
629
+ 'mape_mean',
630
+ ]}
631
+
632
+ centroid_mean_ = []
633
+ centroid_std_ = []
634
+ bandwidth_mean_ = []
635
+ bandwidth_std_ = []
636
+ contrast_l_mean_ = []
637
+ contrast_l_std_ = []
638
+ contrast_m_mean_ = []
639
+ contrast_m_std_ = []
640
+ contrast_h_mean_ = []
641
+ contrast_h_std_ = []
642
+ rolloff_mean_ = []
643
+ rolloff_std_ = []
644
+ flatness_mean_ = []
645
+
646
+ for ch in range(channels):
647
+ tar = spec_tar_[ch]
648
+ out = spec_out_[ch]
649
+
650
+ tar_sc = librosa.feature.spectral_centroid(y=None, sr=sr, S=tar,
651
+ n_fft=fft_size, hop_length=hop_length)
652
+
653
+ out_sc = librosa.feature.spectral_centroid(y=None, sr=sr, S=out,
654
+ n_fft=fft_size, hop_length=hop_length)
655
+
656
+ tar_bw = librosa.feature.spectral_bandwidth(y=None, sr=sr, S=tar,
657
+ n_fft=fft_size, hop_length=hop_length,
658
+ centroid=tar_sc, norm=True, p=2)
659
+
660
+ out_bw = librosa.feature.spectral_bandwidth(y=None, sr=sr, S=out,
661
+ n_fft=fft_size, hop_length=hop_length,
662
+ centroid=out_sc, norm=True, p=2)
663
+ # l = 0-250, m = 1-2-3 = 250 - 2000, h = 2000 - SR/2
664
+ tar_ct = librosa.feature.spectral_contrast(y=None, sr=sr, S=tar,
665
+ n_fft=fft_size, hop_length=hop_length,
666
+ fmin=250.0, n_bands=4, quantile=0.02, linear=False)
667
+
668
+ out_ct = librosa.feature.spectral_contrast(y=None, sr=sr, S=out,
669
+ n_fft=fft_size, hop_length=hop_length,
670
+ fmin=250.0, n_bands=4, quantile=0.02, linear=False)
671
+
672
+ tar_ro = librosa.feature.spectral_rolloff(y=None, sr=sr, S=tar,
673
+ n_fft=fft_size, hop_length=hop_length,
674
+ roll_percent=0.85)
675
+
676
+ out_ro = librosa.feature.spectral_rolloff(y=None, sr=sr, S=out,
677
+ n_fft=fft_size, hop_length=hop_length,
678
+ roll_percent=0.85)
679
+
680
+ tar_ft = librosa.feature.spectral_flatness(y=None, S=tar,
681
+ n_fft=fft_size, hop_length=hop_length,
682
+ amin=1e-10, power=2.0)
683
+
684
+ out_ft = librosa.feature.spectral_flatness(y=None, S=out,
685
+ n_fft=fft_size, hop_length=hop_length,
686
+ amin=1e-10, power=2.0)
687
+
688
+
689
+ eps = 1e-0
690
+ N = 40
691
+ mean_sc_tar, std_sc_tar = get_running_stats(tar_sc.T+eps, [0], N=N)
692
+ mean_sc_out, std_sc_out = get_running_stats(out_sc.T+eps, [0], N=N)
693
+
694
+ assert np.isnan(mean_sc_tar).any() == False, f'NAN values mean_sc_tar {idx}'
695
+ assert np.isnan(mean_sc_out).any() == False, f'NAN values mean_sc_out {idx}'
696
+
697
+
698
+ mean_bw_tar, std_bw_tar = get_running_stats(tar_bw.T+eps, [0], N=N)
699
+ mean_bw_out, std_bw_out = get_running_stats(out_bw.T+eps, [0], N=N)
700
+
701
+ assert np.isnan(mean_bw_tar).any() == False, f'NAN values tar mean bw {idx}'
702
+ assert np.isnan(mean_bw_out).any() == False, f'NAN values out mean bw {idx}'
703
+
704
+ mean_ct_tar, std_ct_tar = get_running_stats(tar_ct.T, list(range(tar_ct.shape[0])), N=N)
705
+ mean_ct_out, std_ct_out = get_running_stats(out_ct.T, list(range(out_ct.shape[0])), N=N)
706
+
707
+ assert np.isnan(mean_ct_tar).any() == False, f'NAN values tar mean ct {idx}'
708
+ assert np.isnan(mean_ct_out).any() == False, f'NAN values out mean ct {idx}'
709
+
710
+ mean_ro_tar, std_ro_tar = get_running_stats(tar_ro.T+eps, [0], N=N)
711
+ mean_ro_out, std_ro_out = get_running_stats(out_ro.T+eps, [0], N=N)
712
+
713
+ assert np.isnan(mean_ro_tar).any() == False, f'NAN values tar mean ro {idx}'
714
+ assert np.isnan(mean_ro_out).any() == False, f'NAN values out mean ro {idx}'
715
+
716
+ mean_ft_tar, std_ft_tar = get_running_stats(tar_ft.T, [0], N=800) # gives very high numbers due to N (80) value
717
+ mean_ft_out, std_ft_out = get_running_stats(out_ft.T, [0], N=800)
718
+
719
+ mape_mean_sc = sklearn.metrics.mean_absolute_percentage_error(mean_sc_tar[0], mean_sc_out[0])
720
+
721
+ mape_mean_bw = sklearn.metrics.mean_absolute_percentage_error(mean_bw_tar[0], mean_bw_out[0])
722
+
723
+ mape_mean_ct_l = sklearn.metrics.mean_absolute_percentage_error(mean_ct_tar[0], mean_ct_out[0])
724
+
725
+ mape_mean_ct_m = sklearn.metrics.mean_absolute_percentage_error(np.mean(mean_ct_tar[1:4], axis=0),
726
+ np.mean(mean_ct_out[1:4], axis=0))
727
+
728
+ mape_mean_ct_h = sklearn.metrics.mean_absolute_percentage_error(mean_ct_tar[-1], mean_ct_out[-1])
729
+
730
+ mape_mean_ro = sklearn.metrics.mean_absolute_percentage_error(mean_ro_tar[0], mean_ro_out[0])
731
+
732
+ mape_mean_ft = sklearn.metrics.mean_absolute_percentage_error(mean_ft_tar[0], mean_ft_out[0])
733
+
734
+ centroid_mean_.append(mape_mean_sc)
735
+ bandwidth_mean_.append(mape_mean_bw)
736
+ contrast_l_mean_.append(mape_mean_ct_l)
737
+ contrast_m_mean_.append(mape_mean_ct_m)
738
+ contrast_h_mean_.append(mape_mean_ct_h)
739
+ rolloff_mean_.append(mape_mean_ro)
740
+ flatness_mean_.append(mape_mean_ft)
741
+
742
+ spectral_['centroid_mean'].append(np.mean(centroid_mean_))
743
+
744
+ spectral_['bandwidth_mean'].append(np.mean(bandwidth_mean_))
745
+
746
+ spectral_['contrast_l_mean'].append(np.mean(contrast_l_mean_))
747
+
748
+ spectral_['contrast_m_mean'].append(np.mean(contrast_m_mean_))
749
+
750
+ spectral_['contrast_h_mean'].append(np.mean(contrast_h_mean_))
751
+
752
+ spectral_['rolloff_mean'].append(np.mean(rolloff_mean_))
753
+
754
+ spectral_['flatness_mean'].append(np.mean(flatness_mean_))
755
+
756
+ spectral_['mape_mean'].append(np.mean([np.mean(centroid_mean_),
757
+ np.mean(bandwidth_mean_),
758
+ np.mean(contrast_l_mean_),
759
+ np.mean(contrast_m_mean_),
760
+ np.mean(contrast_h_mean_),
761
+ np.mean(rolloff_mean_),
762
+ np.mean(flatness_mean_),
763
+ ]))
764
+
765
+ return spectral_
766
+
767
+ # PANNING
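+ # Band-wise RMS of the panning spectrum, evaluated per frame.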
768
+ def get_panning_rms_frame(sps_frame, freqs=[0,22050], sr=44100, n_fft=2048):
769
+
770
+ idx1 = freqs[0]
771
+ idx2 = freqs[1]
772
+
773
+ f1 = int(np.floor(idx1*n_fft/sr))
774
+ f2 = int(np.floor(idx2*n_fft/sr))
775
+
776
+ p_rms = np.sqrt((1/(f2-f1)) * np.sum(sps_frame[f1:f2]**2))
777
+
778
+ return p_rms
779
+ def get_panning_rms(sps, freqs=[[0, 22050]], sr=44100, n_fft=2048):
780
+
781
+ p_rms = []
782
+ for frame in sps:
783
+ p_rms_ = []
784
+ for f in freqs:
785
+ rms = get_panning_rms_frame(frame, freqs=f, sr=sr, n_fft=n_fft)
786
+ p_rms_.append(rms)
787
+ p_rms.append(p_rms_)
788
+
789
+ return np.asarray(p_rms)
790
+
791
+
792
+
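+ # Panning distance: frame-wise RMS of the stereo panning spectrum in four bands (full, low, mid, high);
+ # frames whose target RMS is exactly zero are dropped for numerical stability, and the MAPE of the
+ # running means is reported per band plus their average.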
793
+ def compute_panning_features(args_):
794
+
795
+ audio_out_ = args_[0]
796
+ audio_tar_ = args_[1]
797
+ idx = args_[2]
798
+ sr = args_[3]
799
+ fft_size = args_[4]
800
+ hop_length = args_[5]
801
+
802
+ audio_out_ = pyln.normalize.peak(audio_out_, -1.0)
803
+ audio_tar_ = pyln.normalize.peak(audio_tar_, -1.0)
804
+
805
+ panning_ = {}
806
+
807
+ freqs=[[0, sr//2], [0, 250], [250, 2500], [2500, sr//2]]
808
+
809
+ _, _, sps_frames_tar, _ = get_SPS(audio_tar_, n_fft=fft_size,
810
+ hop_length=hop_length,
811
+ smooth=True, frames=True)
812
+
813
+ _, _, sps_frames_out, _ = get_SPS(audio_out_, n_fft=fft_size,
814
+ hop_length=hop_length,
815
+ smooth=True, frames=True)
816
+
817
+
818
+ p_rms_tar = get_panning_rms(sps_frames_tar,
819
+ freqs=freqs,
820
+ sr=sr,
821
+ n_fft=fft_size)
822
+
823
+ p_rms_out = get_panning_rms(sps_frames_out,
824
+ freqs=freqs,
825
+ sr=sr,
826
+ n_fft=fft_size)
827
+
828
+ # to avoid num instability, deletes frames with zero rms from target
829
+ if np.min(p_rms_tar) == 0.0:
830
+ id_zeros = np.where(p_rms_tar.T[0] == 0)
831
+ p_rms_tar_ = []
832
+ p_rms_out_ = []
833
+ for i in range(len(freqs)):
834
+ temp_tar = np.delete(p_rms_tar.T[i], id_zeros)
835
+ temp_out = np.delete(p_rms_out.T[i], id_zeros)
836
+ p_rms_tar_.append(temp_tar)
837
+ p_rms_out_.append(temp_out)
838
+ p_rms_tar_ = np.asarray(p_rms_tar_)
839
+ p_rms_tar = p_rms_tar_.T
840
+ p_rms_out_ = np.asarray(p_rms_out_)
841
+ p_rms_out = p_rms_out_.T
842
+
843
+ N = 40
844
+
845
+ mean_tar, std_tar = get_running_stats(p_rms_tar, freqs, N=N)
846
+ mean_out, std_out = get_running_stats(p_rms_out, freqs, N=N)
847
+
848
+ panning_['P_t_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_tar[0], mean_out[0])]
849
+ panning_['P_l_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_tar[1], mean_out[1])]
850
+ panning_['P_m_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_tar[2], mean_out[2])]
851
+ panning_['P_h_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_tar[3], mean_out[3])]
852
+
853
+ panning_['mape_mean'] = [np.mean([panning_['P_t_mean'],
854
+ panning_['P_l_mean'],
855
+ panning_['P_m_mean'],
856
+ panning_['P_h_mean'],
857
+ ])]
858
+
859
+ return panning_
860
+
861
+ # DYNAMIC
862
+
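+ # Frame-wise dynamics: RMS level in dB, dynamic spread (mean deviation of sample levels from the frame RMS)
+ # and a crest measure (peak level in dB over RMS level in dB), each averaged across channels.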
863
+ def get_rms_dynamic_crest(x, frame_length, hop_length):
864
+
865
+ rms = []
866
+ dynamic_spread = []
867
+ crest = []
868
+ for ch in range(x.shape[-1]):
869
+ frames = librosa.util.frame(x[:, ch], frame_length=frame_length, hop_length=hop_length)
870
+ rms_ = []
871
+ dynamic_spread_ = []
872
+ crest_ = []
873
+ for i in frames.T:
874
+ x_rms = amp_to_db(np.sqrt(np.sum(i**2)/frame_length))
875
+ x_d = np.sum(amp_to_db(np.abs(i)) - x_rms)/frame_length
876
+ x_c = amp_to_db(np.max(np.abs(i)))/x_rms
877
+
878
+ rms_.append(x_rms)
879
+ dynamic_spread_.append(x_d)
880
+ crest_.append(x_c)
881
+ rms.append(rms_)
882
+ dynamic_spread.append(dynamic_spread_)
883
+ crest.append(crest_)
884
+
885
+ rms = np.asarray(rms)
886
+ dynamic_spread = np.asarray(dynamic_spread)
887
+ crest = np.asarray(crest)
888
+
889
+ rms = np.mean(rms, axis=0)
890
+ dynamic_spread = np.mean(dynamic_spread, axis=0)
891
+ crest = np.mean(crest, axis=0)
892
+
893
+ rms = np.expand_dims(rms, axis=0)
894
+ dynamic_spread = np.expand_dims(dynamic_spread, axis=0)
895
+ crest = np.expand_dims(crest, axis=0)
896
+
897
+ return rms, dynamic_spread, crest
898
+
899
+ def lowpassFiltering(x, f0, sr):
900
+
901
+ b1, a1 = scipy.signal.butter(4, f0/(sr/2),'lowpass')
902
+ x_f = []
903
+ for ch in range(x.shape[-1]):
904
+ x_f_ = scipy.signal.filtfilt(b1, a1, x[:, ch]).copy(order='F')
905
+ x_f.append(x_f_)
906
+ return np.asarray(x_f).T
907
+
908
+
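+ # Low-frequency weighting: ratio of the STFT magnitude of a low-passed copy (4th-order Butterworth at f0)
+ # to the full-band magnitude, summed over frequency and averaged over channels, giving one value per frame.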
909
+ def get_low_freq_weighting(x, sr, n_fft, hop_length, f0 = 1000):
910
+
911
+ x_low = lowpassFiltering(x, f0, sr)
912
+
913
+ X_low = compute_stft(x_low,
914
+ hop_length,
915
+ n_fft,
916
+ np.sqrt(np.hanning(n_fft+1)[:-1]))
917
+ X_low = np.transpose(X_low, axes=[1, -1, 0])
918
+ X_low = np.abs(X_low)
919
+
920
+ X = compute_stft(x,
921
+ hop_length,
922
+ n_fft,
923
+ np.sqrt(np.hanning(n_fft+1)[:-1]))
924
+ X = np.transpose(X, axes=[1, -1, 0])
925
+ X = np.abs(X)
926
+
927
+ eps = 1e-5
928
+ ratio = (X_low)/(X+eps)
929
+ ratio = np.sum(ratio, axis = 1)
930
+ ratio = np.mean(ratio, axis = 0)
931
+
932
+ return np.expand_dims(ratio, axis=0)
933
+
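+ # Dynamic distance: running means of RMS and dynamic spread (each mapped through 1 - x), the crest measure
+ # and the low-frequency weighting are compared between target and output via MAPE (plus MSE for the
+ # low-frequency ratio).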
934
+ def compute_dynamic_features(args_):
935
+
936
+ audio_out_ = args_[0]
937
+ audio_tar_ = args_[1]
938
+ idx = args_[2]
939
+ sr = args_[3]
940
+ fft_size = args_[4]
941
+ hop_length = args_[5]
942
+
943
+ audio_out_ = pyln.normalize.peak(audio_out_, -1.0)
944
+ audio_tar_ = pyln.normalize.peak(audio_tar_, -1.0)
945
+
946
+ dynamic_ = {}
947
+
948
+ with warnings.catch_warnings():
949
+ warnings.simplefilter("ignore", category=UserWarning)
950
+
951
+ rms_tar, dyn_tar, crest_tar = get_rms_dynamic_crest(audio_tar_, fft_size, hop_length)
952
+ rms_out, dyn_out, crest_out = get_rms_dynamic_crest(audio_out_, fft_size, hop_length)
953
+
954
+ low_ratio_tar = get_low_freq_weighting(audio_tar_, sr, fft_size, hop_length, f0=1000)
955
+
956
+ low_ratio_out = get_low_freq_weighting(audio_out_, sr, fft_size, hop_length, f0=1000)
957
+
958
+ N = 40
959
+
960
+ eps = 1e-10
961
+
962
+ rms_tar = (-1*rms_tar) + 1.0
963
+ rms_out = (-1*rms_out) + 1.0
964
+ dyn_tar = (-1*dyn_tar) + 1.0
965
+ dyn_out = (-1*dyn_out) + 1.0
966
+
967
+ mean_rms_tar, std_rms_tar = get_running_stats(rms_tar.T, [0], N=N)
968
+ mean_rms_out, std_rms_out = get_running_stats(rms_out.T, [0], N=N)
969
+
970
+ mean_dyn_tar, std_dyn_tar = get_running_stats(dyn_tar.T, [0], N=N)
971
+ mean_dyn_out, std_dyn_out = get_running_stats(dyn_out.T, [0], N=N)
972
+
973
+ mean_crest_tar, std_crest_tar = get_running_stats(crest_tar.T, [0], N=N)
974
+ mean_crest_out, std_crest_out = get_running_stats(crest_out.T, [0], N=N)
975
+
976
+ mean_low_ratio_tar, std_low_ratio_tar = get_running_stats(low_ratio_tar.T, [0], N=N)
977
+ mean_low_ratio_out, std_low_ratio_out = get_running_stats(low_ratio_out.T, [0], N=N)
978
+
979
+ dynamic_['rms_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_rms_tar, mean_rms_out)]
980
+ dynamic_['dyn_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_dyn_tar, mean_dyn_out)]
981
+ dynamic_['crest_mean'] = [sklearn.metrics.mean_absolute_percentage_error(mean_crest_tar, mean_crest_out)]
982
+
983
+ dynamic_['l_ratio_mean_mape'] = [sklearn.metrics.mean_absolute_percentage_error(mean_low_ratio_tar, mean_low_ratio_out)]
984
+ dynamic_['l_ratio_mean_l2'] = [sklearn.metrics.mean_squared_error(mean_low_ratio_tar, mean_low_ratio_out)]
985
+
986
+ dynamic_['mape_mean'] = [np.mean([dynamic_['rms_mean'],
987
+ dynamic_['dyn_mean'],
988
+ dynamic_['crest_mean'],
989
+ ])]
990
+
991
+ return dynamic_
992
+
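+
+ if __name__ == "__main__":
+     # --- Editor's illustrative sketch, not part of the original file -----------------------------
+     # A minimal, hedged example of how the matching functions above might be chained.
+     # "input.wav" and "reference.wav" are hypothetical placeholder files; the compressor settings are
+     # arbitrary illustration values (units follow the Compressor parameters in common_audioeffects).
+     x, sr = sf.read("input.wav")        # expected shape: (samples, 2)
+     ref, _ = sf.read("reference.wav")   # expected shape: (samples, 2)
+
+     # Panning matching: the time-averaged phi of the reference is the target panning spectrum.
+     _, ref_phi, _, _ = get_SPS(ref, n_fft=2048, hop_length=1024, smooth=True)
+     y = get_panning_matching(x, ref_phi, sr=sr, n_fft=2048, hop_length=1024)
+
+     # Dynamics matching: reference peak statistics measured after the same -10 dB peak normalisation
+     # that get_comp_matching applies internally (get_mean_peak may return None if no onsets are found).
+     ref_peak, ref_std = get_mean_peak(pyln.normalize.peak(ref, -10.0), sr)
+     y = get_comp_matching(y, ref_peak, ref_std, ratio=4, attack=0.1, release=0.3, sr=sr)
+
+     sf.write("output_matched.wav", y, sr)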