#!/usr/bin/env python3
import re
import sys
import torch.nn as nn
import torch
from PIL import Image
import numpy as np

from . import rect_to_square, square_to_rect

CHORD_BORDER = 8  # chord border size in pixels

# my distillation of all output from polyffusion's chord finder, run on the POP909 dataset transposed +/- 12 semitones
NOTE_NAMES = ['C','C#','D','E','Eb','F','F#','G', 'Ab', 'A', 'Bb', 'B']  # these are from polyffusion's chord finder. yes, mixing # & b is weird
#NOTE_NAMES2 = ['A','Ab','B','Bb','C','C#','D','E','Eb','F','F#','G']  # how they are in all_chords.txt file
CHORD_TYPES = ['aug', 'dim', 'dim7', 'hdim7',
    'maj', 'maj(11)', 'maj13', 'maj/3', 'maj/5', 'maj6', 'maj6(9)', 'maj7', 'maj7/3', 'maj7/5', 'maj7/7', 'maj(9)', 'maj9', 'maj9(11)',
    'min', 'min(11)', 'min11', 'min13', 'min/5', 'min6', 'min6(9)', 'min7', 'min7/5', 'min7/b7', 'min(9)', 'min9', 'min/b3', 'minmaj7',
    'sus2', 'sus4', 'sus4(b7)', 'sus4(b7,9)', '7', '7/3', '7/5', '7(#9)', '7/b7', '9', '11', '13']  # 44 chord types
CHORD_IND_PAIRS = [(note, chord) for note in NOTE_NAMES for chord in CHORD_TYPES]
POSSIBLE_CHORDS = [f"{note}:{chord}" for (note, chord) in CHORD_IND_PAIRS]
#POSSIBLE_CHORDS = [f"{note}:{chord}" for note in NOTE_NAMES for chord in CHORD_TYPES]
POSSIBLE_CHORDS += ['N']  # N for "no chord"
assert len(POSSIBLE_CHORDS) == 12*44+1, f"There should be {12*44+1} possible chords, but there are {len(POSSIBLE_CHORDS)}. Check the NOTE_NAMES and CHORD_TYPES lists."


def to_base_9(n):
    # converts a decimal integer to base 9, as a list of 3 digits, most-significant first
    if n == 0: return [0, 0, 0]
    digits = []
    while n:
        digits.append(n % 9)
        n //= 9
    while len(digits) < 3:  # add leading zeros
        digits.append(0)
    return digits[::-1]


def chord_num_to_color(cn, scale=30):
    # "embeddings" for chords, from (0,0,30) up to (240,240,240) in each (RGB) channel, in steps of 30
    color = to_base_9(cn+1)
    return tuple(x*scale for x in color)


def color_to_chord_num(color, scale=30, warnings_on=False):
    # reverse of chord_num_to_color; note the color tuple is read in reverse (least-significant digit first)
    out = sum([x//scale * 9**i for i, x in enumerate(color[::-1])]) - 1
    if out < 0:
        if warnings_on: print(f"color_to_chord_num: Warning: out should be equal to or greater than 0: color = {color}, out = {out}. Wrapping around to {len(POSSIBLE_CHORDS)+out}")
        out = len(POSSIBLE_CHORDS) + out
    return out
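
# Example (illustrative only; values follow from to_base_9 and scale=30 above):
#     chord_num_to_color(0)                 # -> (0, 0, 30), i.e. to_base_9(1) scaled by 30
#     color_to_chord_num((0, 0, 30))        # -> 0
#     chord_num_to_color(528)               # -> (180, 120, 210), the 'N' (no-chord) entry
#     color_to_chord_num((180, 120, 210))   # -> 528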


def simplify_chord(chord_name):
    """Simplifies chord names by applying a few rules:
    1. get rid of the ones with parentheses, e.g. change "A:maj(11)" to just "A:maj"
    2. remove the notes in the bass, e.g. map "A:7/3", "A:7/5" and "A:7/b7" all to just "A:7"
    3. remove suspension markings, e.g. sus2, sus4
    4. maybe? high-numbered added notes like "G:min11" & "G:min13" -> "G:min"
    """
    chord_name = re.sub(r'\(.*', '', chord_name)  # 1
    chord_name = re.sub(r'\/.*', '', chord_name)  # 2
    chord_name = re.sub(r'sus.*', '', chord_name)  # 3
    return chord_name
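
# Example (illustrative only, showing what the three regex rules above actually do):
#     simplify_chord('C:maj(11)')   # -> 'C:maj'  (rule 1)
#     simplify_chord('A:7/b7')      # -> 'A:7'    (rule 2)
#     simplify_chord('D:sus2')      # -> 'D:'     (rule 3 strips the whole type, leaving a bare root)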


def get_unique_indices(data):
    """Returns the indices of non-repeating values in a list
    Args:
        data: A list of any data type.
              Example: data = [0, 1, 4, 1, 5, 5, 5, 6, 10, 6, 6, 5]
    Returns:
        A list of indices for non-repeating values.
        Example: result = [0, 1, 2, 3, 6, 7, 8, 10, 11]
    """
    return [i for i, (val, next_val) in enumerate(zip(data, data[1:])) if val != next_val] + [len(data) - 1]


def get_nonrepeated_values(data, indices=None):
    """Returns the non-repeating values in a list
    Args:
        data: A list of any data type.
              Example: data = [0, 1, 4, 1, 5, 5, 5, 6, 10, 6, 6, 5]
    Returns:
        A list of non-repeating values.
        Example: returns [0, 1, 4, 1, 5, 6, 10, 6, 5]
    """
    if indices is None:
        indices = get_unique_indices(data)
    return [data[i] for i in indices]
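
# Example (illustrative only, using the data from the docstrings above):
#     data = [0, 1, 4, 1, 5, 5, 5, 6, 10, 6, 6, 5]
#     get_unique_indices(data)       # -> [0, 1, 2, 3, 6, 7, 8, 10, 11]
#     get_nonrepeated_values(data)   # -> [0, 1, 4, 1, 5, 6, 10, 6, 5]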


def most_freq_or_first(arr, debug=False):
    "returns the most frequent value in the array; if multiple values are tied for most frequent, returns the smallest of them"
    assert len(arr.shape) == 1, "arr must be 1D"
    savearr = arr.copy()
    if debug:
        print("most_freq_or_first: arr = ", arr)
    if savearr.min() < 0:  # if there are negative values, we need to shift them up to 0
        arr = arr - savearr.min()
    bc = np.bincount(arr)
    try:
        if np.any(arr < 0): bc[arr < 0] = 0  # don't include negative arr values when checking for most frequent
        bc[bc != bc.max()] = 0  # only interested in most frequent values
    except Exception as e:
        print("Exception ", e)
        print("most_freq_or_first: arr.shape = ", arr.shape)
        print("most_freq_or_first: arr = ", arr)
        print("most_freq_or_first: bc.shape = ", bc.shape)
        raise e
    out = np.argmax(bc)
    # shift numbers back down
    if savearr.min() < 0:
        out = out + savearr.min()
    assert out.max() <= arr.max(), f"out.max() = {out.max()} should be <= arr.max() = {arr.max()}"
    return out


def most_freq_or_first_every(arr,
                             every=4,  # pixels per chord label. 4 = every quarter note
                             ):
    "used to grab most frequent chord labels, assuming we're starting on a beat. arr = chord label indices, e.g. in 0..528"
    assert len(arr.shape) == 1, "arr must be 1D"
    remainder = len(arr) % every
    if remainder != 0:
        arr = np.pad(arr, (0, every - remainder), mode='constant', constant_values=(0, arr[-remainder]))
        #print("most_freq_or_first_every: Warning: Padding arr with last beat value on end. new arr =", arr)
    check = arr.reshape((-1, every))
    out = np.array([most_freq_or_first(a) for a in arr.reshape((-1, every))])
    if out.max() > arr.max():
        for i, c in enumerate(check):
            mfof = most_freq_or_first(c)
            if mfof > c.max():
                print(f"i={i}, c={c}, most_freq_or_first(c)={mfof}")
        raise ValueError(f"out.max() = {out.max()} should be less than or equal to arr.max() = {arr.max()}")
    return out
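
# Example (illustrative only; inputs are made-up chord-index arrays):
#     most_freq_or_first(np.array([5, 2, 2, 2, 9]))                          # -> 2 (appears 3 times)
#     most_freq_or_first_every(np.array([1, 1, 2, 1, 3, 3, 3, 0]), every=4)  # -> array([1, 3]), one label per group of 4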


def chord_str_to_pair(chord_str):
    "converts a chord string to a pair of (note, chord) indices"
    if chord_str == 'N':
        return (-1, -1)
    note, chord_type = chord_str.split(':')
    note_ind = NOTE_NAMES.index(note)
    chord_type_ind = CHORD_TYPES.index(chord_type)
    return (note_ind, chord_type_ind)


def chords_str_to_pairs(chords_str):
    for chord_str in chords_str.split(','):
        yield chord_str_to_pair(chord_str)


def chords_str_to_inds(chords_str):
    for chord_str in chords_str.split(','):
        yield POSSIBLE_CHORDS.index(chord_str)


def pair_to_chord_index(pair):
    "converts a pair of (note, chord_type) indices to a single chord index"
    note_ind, chord_type_ind = pair
    return note_ind*len(CHORD_TYPES) + chord_type_ind


def chord_index_to_pair(ci):
    "converts a single chord index to a pair of (note, chord) indices"
    note_ind = ci // len(CHORD_TYPES)
    chord_type_ind = ci % len(CHORD_TYPES)
    return (note_ind, chord_type_ind)


def chord_index_to_str(ci):
    "converts a single chord index to a chord string"
    return POSSIBLE_CHORDS[ci]
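
# Example (illustrative only; indices follow from the NOTE_NAMES and CHORD_TYPES lists above):
#     chord_str_to_pair('C:maj')    # -> (0, 4)
#     pair_to_chord_index((0, 4))   # -> 4
#     chord_index_to_pair(4)        # -> (0, 4)
#     chord_index_to_str(4)         # -> 'C:maj'
#     chord_str_to_pair('N')        # -> (-1, -1), the no-chord sentinel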


class ChordEmbedding(nn.Module):
    def __init__(self, chord_emb_dim=8, note_emb_dim=8, type_emb_dim=8, debug=False):
        super(ChordEmbedding, self).__init__()
        self.emb_note = nn.Embedding(len(NOTE_NAMES)+1, note_emb_dim)  # +1 for "N", i.e. "no chord"
        self.emb_type = nn.Embedding(len(CHORD_TYPES), type_emb_dim)
        self.compactify = nn.Linear(note_emb_dim + type_emb_dim, chord_emb_dim)
        self.chord_emb_dim = chord_emb_dim
        self.debug = debug
        self.zero_vec = torch.zeros((1, self.chord_emb_dim))

    def forward(self, chord_inds:torch.Tensor, debug=False):
        """chord_inds should have dimensions (B) where B is the batch size; each value is the index of the chord in the vocabulary.
        Wherever an index equals len(POSSIBLE_CHORDS), we want to return a zero vector; otherwise we return the embedding."""
        if chord_inds.max() > len(POSSIBLE_CHORDS):
            torch.set_printoptions(threshold=10000)
            print(f"\nchord_inds.max() = {chord_inds.max()} but len(POSSIBLE_CHORDS) = {len(POSSIBLE_CHORDS)}. \nchord_inds = {chord_inds}")
            raise ValueError("chord_inds.max() should not exceed len(POSSIBLE_CHORDS)")
        note_inds, type_inds = chord_inds // len(CHORD_TYPES), chord_inds % len(CHORD_TYPES)
        # note that for the 'N' chord, where chord_ind == len(POSSIBLE_CHORDS)-1, we get note_inds = len(NOTE_NAMES) and type_inds = 0. that's why self.emb_note has len(NOTE_NAMES)+1 entries
        if debug:
            print("note_inds, type_inds = ", note_inds, type_inds)
            print("note_inds.max(), type_inds.max() = ", note_inds.max(), type_inds.max())
        note_emb = self.emb_note(note_inds)
        type_emb = self.emb_type(type_inds)
        if debug: print("\nnote_emb.shape, type_emb.shape = ", note_emb.shape, type_emb.shape)
        combined_emb = torch.cat((note_emb, type_emb), dim=1)
        if debug: print("combined_emb.shape = ", combined_emb.shape)
        x = self.compactify(combined_emb)
        if debug: print("ce: x.shape, self.chord_emb_dim = ", x.shape, self.chord_emb_dim)
        return x
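
# Usage sketch (illustrative only; shapes assume the default chord_emb_dim=8):
#     ce = ChordEmbedding()
#     inds = torch.tensor([4, 312, 528])   # 'C:maj', 'G:maj', 'N'
#     ce(inds).shape                       # -> torch.Size([3, 8])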


class ChordAE(nn.Module):
    """Maybe not needed: Autoencoder for training chord embeddings?
    Note: we don't really need an AE for the full model, we can get by with just the encoder (and no decoder),
    but the AE is useful for exploring how few dimensions we can get away with"""
    def __init__(self, chord_vocab_size=len(POSSIBLE_CHORDS), chord_emb_dim=8):
        super(ChordAE, self).__init__()
        self.encoder = ChordEmbedding(chord_emb_dim)
        self.decoder = nn.Linear(chord_emb_dim, chord_vocab_size)  # could do better maybe

    def forward(self, x, debug=False):
        x = self.encoder(x)
        x = self.decoder(x)
        return x


def abs_seq_to_rel_seq(seq:torch.Tensor):
    """converts a batch of absolute sequences of chord indices to a batch of relative sequences of chord indices:
    subtract the note of the first element in each batch from all the other note indices, modulo len(NOTE_NAMES);
    leave the first element unchanged, and overwrite any 'N' chords with... something else? TODO
    """
    assert len(seq.shape) == 2, f"seq should be 2D, but seq.shape = {seq.shape}"
    # decompose seq into two tensors, one of notes and one of chord types
    note_inds, type_inds = seq // len(CHORD_TYPES), seq % len(CHORD_TYPES)
    # for note_inds < 12, subtract the first element in the sequence, modulo len(NOTE_NAMES) i.e. 12
    note_inds2 = note_inds.clone()
    note_inds2[:,1:] = (note_inds2[:,1:] - note_inds2[:,0].unsqueeze(1)) % len(NOTE_NAMES)
    # 'N' chords: wherever note_inds == 12, overwrite note_inds2 with 12
    note_inds2[note_inds == len(NOTE_NAMES)] = len(NOTE_NAMES)
    # recompose seq
    changes_seq = note_inds2 * len(CHORD_TYPES) + type_inds  # these are no longer chords, they are chord *changes* relative to the first chord
    return changes_seq
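
# Worked example (illustrative only): a batch of one sequence 'D:maj', 'G:maj', 'A:min'
# (indices 92, 312, 414). The first chord stays absolute; later roots are re-expressed as
# intervals above the first root, i.e. 'D:maj', 'F:maj', 'G:min' (indices 92, 224, 326):
#     abs_seq_to_rel_seq(torch.tensor([[92, 312, 414]]))   # -> tensor([[92, 224, 326]])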


class ChordSeqEncoder(nn.Module):
    """Encoder for sequences of chords:
    We embed the first chord, then we embed the CHANGES in chords thereafter (using modulo-12 arithmetic on the bass note)
    (4 chords per bar x 32 bars = 128 chords),
    and then pass the sequence of chord embeddings through some sequence model
    (LSTM for now, could use a Transformer or something else later)
    to generate a [256]-dimensional embedding of the sequence of chord embeddings
    """
    def __init__(self, chord_emb_dim=8, seq_len=512//4, seq_emb_dim=256, hidden_dim=512, dropout=0.2):
        super(ChordSeqEncoder, self).__init__()
        self.chord_encoder = ChordEmbedding()
        self.seq_encoder = nn.LSTM(chord_emb_dim, seq_emb_dim, batch_first=True, num_layers=2, dropout=dropout)
        self.seq_len = seq_len

    def forward(self, bs):
        "bs should have dimensions (B, S) where B is the batch size and S is the length of the sequence of chord indices"
        B, S = bs.shape
        changes_seq = abs_seq_to_rel_seq(bs)  # convert to relative sequence of chord indices
        # get chord embeddings for every chord in the batch in the sequence
        x = self.chord_encoder(changes_seq.flatten())
        # reshape x into (B, S, E) where B is the batch size, S is the sequence length, and E is the chord embedding dimension
        x = x.view(B, S, -1)
        E = x.shape[-1]
        #print("before seq_encoder, x.shape = ", x.shape)
        #x, _ = self.seq_encoder(x)
        output, (hidden, cell) = self.seq_encoder(x)
        # output of forward should be a 2-D tensor of shape (B, SE) where SE = seq_emb_dim
        x = hidden[0, :, :]  # the first LSTM layer's hidden state at the final timestep
        #print("after seq_encoder, x.shape = ", x.shape)
        return x
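
# Usage sketch (illustrative only; random indices stand in for a real chord sequence):
#     cse = ChordSeqEncoder()
#     bs = torch.randint(0, len(POSSIBLE_CHORDS), (2, 128))   # (B, S) = (2, 128)
#     cse(bs).shape                                           # -> torch.Size([2, 256])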


class ChordSeqAE(nn.Module):
    """
    Chord Sequence Autoencoder. For pretraining a ChordSeqEncoder
    """
    def __init__(self, chord_emb_dim=8, seq_len=512//4, seq_emb_dim=256,
                 hidden_dim=512, chord_vocab_size=len(POSSIBLE_CHORDS),
                 vae_scale=0.1):
        super(ChordSeqAE, self).__init__()
        self.encoder = ChordSeqEncoder(chord_emb_dim=chord_emb_dim, seq_len=seq_len, seq_emb_dim=seq_emb_dim, hidden_dim=hidden_dim)
        # the decoder is a sequence of linear layers with a ReLU in between
        self.decoder = nn.Sequential(
            nn.Linear(seq_emb_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, seq_len*chord_vocab_size)
        )
        self.chord_vocab_size = chord_vocab_size
        self.vae_scale = vae_scale

    def forward(self, bs, debug=False):
        "bs should have dimensions (B, S) where B is the batch size and S is the length of the sequence of chord indices"
        if debug: print("ChordSeqAE: bs.shape = ", bs.shape)
        B, S = bs.shape
        x = self.encoder(bs)
        if debug: print("ChordSeqAE: encoded x.shape = ", x.shape)
        if self.vae_scale > 0 and self.training:
            x = x + self.vae_scale*(x.max() - x.min()) * torch.randn_like(x)
        x = self.decoder(x)
        x = x.view(B, S, -1)
        if debug: print("ChordSeqAE: decoded x.shape = ", x.shape)
        return x
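
# Usage sketch (illustrative only): reconstruct per-position chord logits from the sequence embedding.
#     csae = ChordSeqAE()
#     bs = torch.randint(0, len(POSSIBLE_CHORDS), (2, 128))   # (B, S); S must equal seq_len here
#     csae(bs).shape                                          # -> torch.Size([2, 128, 529]), i.e. (B, S, chord_vocab_size)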


def chord_seq_from_img(img:Image.Image,
                       every=8,  # was imagining every beat (every=4), but looking at the data, it seems like the smallest chord label is 8 pixels wide
                       debug=False):
    """extracts a sequence of chord indices from a pianoroll image
    hopefully the dataloader will mean we can just do one image and it'll batch them
    """
    if debug: print("img.size, img.min, img.max = ", img.size, np.array(img).min(), np.array(img).max())
    if img.size[0] == img.size[1]:  # if image is square, make it rectangular
        img = square_to_rect(img)
    img_arr = np.array(img)
    top_row = img_arr[CHORD_BORDER//2]  # all x's along y = CHORD_BORDER/2
    if debug:
        img.save("chord_seq_from_img.png")
        print("img_arr.shape = ", img_arr.shape)
        print("top_row.shape = ", top_row.shape)
        print("top_row = ", top_row)
    chord_seq = np.array([color_to_chord_num(tuple(c)) for c in top_row])
    if chord_seq.max() >= len(POSSIBLE_CHORDS):
        print(f"chord_seq.max = {chord_seq.max()} should be less than len(POSSIBLE_CHORDS) = {len(POSSIBLE_CHORDS)}\nchord_seq = {chord_seq}")
        indices = np.where(chord_seq >= len(POSSIBLE_CHORDS))[0]
        print("indices, chord_seq[indices], top_row[indices] = ", indices, chord_seq[indices], top_row[indices])
        raise ValueError("chord_seq.max() should be less than len(POSSIBLE_CHORDS)")
    chord_seq_beats = most_freq_or_first_every(chord_seq, every=every)
    assert chord_seq_beats.max() <= chord_seq.max(), f"chord_seq_beats.max() = {chord_seq_beats.max()} should be <= chord_seq.max() = {chord_seq.max()}"
    if debug: print("chord_seq_beats, len(POSSIBLE_CHORDS) = ", chord_seq_beats, len(POSSIBLE_CHORDS))
    assert chord_seq_beats.max() < len(POSSIBLE_CHORDS), f"chord_seq_beats.max() should be less than len(POSSIBLE_CHORDS) = {len(POSSIBLE_CHORDS)}"
    return torch.tensor(chord_seq_beats)


def chord_seq_from_img_tensor_batch(img_tensor_batch:torch.Tensor, every=8, debug=False):
    """extracts a sequence of chord indices from a batch of pianoroll images"""
    batch_size = img_tensor_batch.shape[0]
    itb = (img_tensor_batch + 1.0) * 127.5  # rescale from -1..1 to 0..255
    chord_seqs = []
    for i in range(batch_size):  # TODO: may be a faster way to do this with tensor ops
        # converting to images and back is slow
        img = Image.fromarray(np.round(itb[i].cpu().permute(1,2,0).numpy()).astype(np.uint8))
        img = square_to_rect(img)
        chord_seq = chord_seq_from_img(img, every=every)
        chord_seqs.append(chord_seq)
    return torch.stack(chord_seqs).to(img_tensor_batch.device)


def img_batch_to_seq_emb(img_tensor_batch:torch.Tensor, chord_seq_encoder:nn.Module, every=8, debug=False):
    """converts a batch of pianoroll images to a batch of chord sequence embeddings"""
    chord_seq_batch = chord_seq_from_img_tensor_batch(img_tensor_batch, every=every, debug=debug)
    cs_emb = chord_seq_encoder(chord_seq_batch)
    return cs_emb


# TODO: test it!
if __name__ == '__main__':
    # FOR TESTING/DEV ONLY
    import sys, random

    def make_image_tensor_batch(batch_size=2):
        """FOR TESTING/DEV ONLY: makes a batch of random chord-endowed pianoroll (square) images,
        so I can iterate on other parts of this faster w/o having to spin up crowson's training code every time while I write code here.
        shape = (B, 3, 256, 256), normalization = -1.0 to 1.0
        """
        img_batch = torch.zeros((batch_size, 3, 256, 256))
        for i in range(batch_size):
            n = i+1  # np.random.randint(0, 909)
            img_filename = f"/data/POP909-Dataset/images_128_rg_chords_TOTAL/{n:03}_TOTAL.png"  # place to grab images from
            img = Image.open(img_filename).convert('RGB')
            # crop to 512 pixels wide
            img = img.crop((0, 0, 512, 128))
            img = rect_to_square(img)
            img_batch[i] = torch.tensor(np.array(img)).permute(2,0,1).float() / 127.5 - 1.0  # normalization done by dataloader makes images -1 to 1
        return img_batch

    # quick check of the chord-number <-> color mapping
    for cn in range(len(POSSIBLE_CHORDS)):
        color = chord_num_to_color(cn)
        print("cn, color = ", cn, color)
        cn2 = color_to_chord_num(color)
        assert cn2 == cn, f"cn2={cn2} should be cn={cn}, color={color}"

    if len(sys.argv) <= 1:
        print("Testing suite. Usage: python chords.py <some_arg>")
        sys.exit(1)
    some_arg = sys.argv[1]

    batch_size = 2
    img_tensor_batch = make_image_tensor_batch(batch_size=batch_size)
    print("img_tensor_batch.shape = ", img_tensor_batch.shape)
    print("img_tensor_batch.min(), img_tensor_batch.max() = ", img_tensor_batch.min(), img_tensor_batch.max())
    chord_seq_batch = chord_seq_from_img_tensor_batch(img_tensor_batch, every=8, debug=False)
    print("chord_seq_batch.shape = ", chord_seq_batch.shape)
    print(f"chord_seq_batch = \n{chord_seq_batch}")
    cse = ChordSeqEncoder()
    cs_emb = cse(chord_seq_batch)
    print("cs_emb.shape = ", cs_emb.shape)
    #print(f"cs_emb = \n{cs_emb}")
    sys.exit(0)

    # everything below is older scratch/testing code, unreachable unless the sys.exit(0) above is removed
    img_filename = some_arg
    img = Image.open(img_filename).convert('RGB')
    chord_ind_seq = chord_seq_from_img(img, debug=False)
    print("chord_ind_seq = ", chord_ind_seq)
    print("len(chord_ind_seq) = ", len(chord_ind_seq))
    chord_embedder = ChordEmbedding()
    #print("chord_embeddings = ", chord_embedder(chord_ind_seq))
    sys.exit(0)

    #chords_str = some_arg
    #cis = chords_str_to_inds(chords_str)
    cis = chord_ind_seq
    for ci in cis:
        print("\n-------")
        #ci = pair_to_chord_index(pair)
        pair = chord_index_to_pair(ci)
        print(f"Input: ci = {ci}, pair = {pair}")
        color = chord_num_to_color(ci)
        print(color)
        cn2 = color_to_chord_num(color)
        out_str = chord_index_to_str(cn2)
        print(f"Output: cn2 = {cn2}, out_str = {out_str}")
        print("Embedding: ")
        with torch.no_grad():
            x = torch.tensor([ci])
            print(chord_embedder(x))