#!/usr/bin/env python3
import re
import sys
import torch.nn as nn
import torch
from PIL import Image
import numpy as np

from . import rect_to_square, square_to_rect

CHORD_BORDER = 8  # chord border size in pixels

# my distillation of all output from polyffusion's chord finder, run on the POP909 dataset transposed +/- 12 semitones
NOTE_NAMES = ['C','C#','D','E','Eb','F','F#','G', 'Ab', 'A', 'Bb', 'B']  # these are from polyffusion's chord finder. yes, mixing # & b is weird
#NOTE_NAMES2 = ['A','Ab','B','Bb','C','C#','D','E','Eb','F','F#','G']  # how they are in all_chords.txt file
CHORD_TYPES = ['aug', 'dim', 'dim7', 'hdim7',
    'maj', 'maj(11)', 'maj13', 'maj/3', 'maj/5', 'maj6', 'maj6(9)', 'maj7', 'maj7/3', 'maj7/5', 'maj7/7', 'maj(9)', 'maj9', 'maj9(11)',
    'min', 'min(11)', 'min11', 'min13', 'min/5', 'min6', 'min6(9)', 'min7', 'min7/5', 'min7/b7', 'min(9)', 'min9', 'min/b3', 'minmaj7',
    'sus2', 'sus4', 'sus4(b7)', 'sus4(b7,9)', '7', '7/3', '7/5', '7(#9)', '7/b7', '9', '11', '13']  # 44 chord types
CHORD_IND_PAIRS = [(note, chord) for note in NOTE_NAMES for chord in CHORD_TYPES]
POSSIBLE_CHORDS = [f"{note}:{chord}" for (note, chord) in CHORD_IND_PAIRS]
#POSSIBLE_CHORDS = [f"{note}:{chord}" for note in NOTE_NAMES for chord in CHORD_TYPES]
POSSIBLE_CHORDS += ['N']  # N for "no chord"
assert len(POSSIBLE_CHORDS) == 12*44+1, f"There should be {12*44+1} possible chords, but there are {len(POSSIBLE_CHORDS)}. Check the NOTE_NAMES and CHORD_TYPES lists."


def to_base_9(n):
    # converts a decimal integer to base 9, as a list of 3 digits, most-significant first
    if n == 0: return [0, 0, 0]
    digits = []
    while n:
        digits.append(n % 9)
        n //= 9
    while len(digits) < 3:  # add leading zeros
        digits.append(0)
    return digits[::-1]


def chord_num_to_color(cn, scale=30):
    # "embeddings" for chords, from (0,0,30) up to (240,240,240) in each (RGB) channel, in steps of 30
    color = to_base_9(cn+1)
    return tuple(x*scale for x in color)


def color_to_chord_num(color, scale=30, warnings_on=False):
    # reverse of chord_num_to_color; note the color tuple is read in reverse (least-significant digit first)
    out = sum([x//scale * 9**i for i, x in enumerate(color[::-1])]) - 1
    if out < 0:
        if warnings_on: print(f"color_to_chord_num: Warning: out should be equal to or greater than 0: color = {color}, out = {out}. Wrapping around to {len(POSSIBLE_CHORDS)+out}")
        out = len(POSSIBLE_CHORDS) + out
    return out
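
# Example (illustrative only; values follow from to_base_9 and scale=30 above):
#     chord_num_to_color(0)                 # -> (0, 0, 30), i.e. to_base_9(1) scaled by 30
#     color_to_chord_num((0, 0, 30))        # -> 0
#     chord_num_to_color(528)               # -> (180, 120, 210), the 'N' (no-chord) entry
#     color_to_chord_num((180, 120, 210))   # -> 528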


def simplify_chord(chord_name):
    """Simplifies chord names by applying a few rules:
    1. get rid of the ones with parentheses, e.g. change "A:maj(11)" to just "A:maj"
    2. remove the notes in the bass, e.g. map "A:7/3", "A:7/5" and "A:7/b7" all to just "A:7"
    3. remove suspension markings, e.g. sus2, sus4
    4. maybe? high-numbered added notes like "G:min11" & "G:min13" -> "G:min"
    """
    chord_name = re.sub(r'\(.*', '', chord_name)  # 1
    chord_name = re.sub(r'\/.*', '', chord_name)  # 2
    chord_name = re.sub(r'sus.*', '', chord_name)  # 3
    return chord_name
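
# Example (illustrative only, showing what the three regex rules above actually do):
#     simplify_chord('C:maj(11)')   # -> 'C:maj'  (rule 1)
#     simplify_chord('A:7/b7')      # -> 'A:7'    (rule 2)
#     simplify_chord('D:sus2')      # -> 'D:'     (rule 3 strips the whole type, leaving a bare root)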


def get_unique_indices(data):
    """Returns the indices of non-repeating values in a list
    Args:
        data: A list of any data type.
              Example: data = [0, 1, 4, 1, 5, 5, 5, 6, 10, 6, 6, 5]
    Returns:
        A list of indices for non-repeating values.
        Example: result = [0, 1, 2, 3, 6, 7, 8, 10, 11]
    """
    return [i for i, (val, next_val) in enumerate(zip(data, data[1:])) if val != next_val] + [len(data) - 1]


def get_nonrepeated_values(data, indices=None):
    """Returns the non-repeating values in a list
    Args:
        data: A list of any data type.
              Example: data = [0, 1, 4, 1, 5, 5, 5, 6, 10, 6, 6, 5]
    Returns:
        A list of non-repeating values.
        Example: returns [0, 1, 4, 1, 5, 6, 10, 6, 5]
    """
    if indices is None:
        indices = get_unique_indices(data)
    return [data[i] for i in indices]
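
# Example (illustrative only, using the data from the docstrings above):
#     data = [0, 1, 4, 1, 5, 5, 5, 6, 10, 6, 6, 5]
#     get_unique_indices(data)       # -> [0, 1, 2, 3, 6, 7, 8, 10, 11]
#     get_nonrepeated_values(data)   # -> [0, 1, 4, 1, 5, 6, 10, 6, 5]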


def most_freq_or_first(arr, debug=False):
    "returns the most frequent value in the array; if multiple values are tied for most frequent, returns the smallest of them"
    assert len(arr.shape) == 1, "arr must be 1D"
    savearr = arr.copy()
    if debug:
        print("most_freq_or_first: arr = ", arr)
    if savearr.min() < 0:  # if there are negative values, we need to shift them up to 0
        arr = arr - savearr.min()
    bc = np.bincount(arr)
    try:
        if np.any(arr < 0): bc[arr < 0] = 0  # don't include negative arr values when checking for most frequent
        bc[bc != bc.max()] = 0  # only interested in most frequent values
    except Exception as e:
        print("Exception ", e)
        print("most_freq_or_first: arr.shape = ", arr.shape)
        print("most_freq_or_first: arr = ", arr)
        print("most_freq_or_first: bc.shape = ", bc.shape)
        raise e
    out = np.argmax(bc)
    # shift numbers back down
    if savearr.min() < 0:
        out = out + savearr.min()
    assert out.max() <= arr.max(), f"out.max() = {out.max()} should be <= arr.max() = {arr.max()}"
    return out


def most_freq_or_first_every(arr,
                             every=4,  # pixels per chord label. 4 = every quarter note
                             ):
    "used to grab most frequent chord labels, assuming we're starting on a beat. arr = chord label indices, e.g. in 0..528"
    assert len(arr.shape) == 1, "arr must be 1D"
    remainder = len(arr) % every
    if remainder != 0:
        arr = np.pad(arr, (0, every - remainder), mode='constant', constant_values=(0, arr[-remainder]))
        #print("most_freq_or_first_every: Warning: Padding arr with last beat value on end. new arr =", arr)
    check = arr.reshape((-1, every))
    out = np.array([most_freq_or_first(a) for a in arr.reshape((-1, every))])
    if out.max() > arr.max():
        for i, c in enumerate(check):
            mfof = most_freq_or_first(c)
            if mfof > c.max():
                print(f"i={i}, c={c}, most_freq_or_first(c)={mfof}")
        raise ValueError(f"out.max() = {out.max()} should be less than or equal to arr.max() = {arr.max()}")
    return out
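
# Example (illustrative only; inputs are made-up chord-index arrays):
#     most_freq_or_first(np.array([5, 2, 2, 2, 9]))                          # -> 2 (appears 3 times)
#     most_freq_or_first_every(np.array([1, 1, 2, 1, 3, 3, 3, 0]), every=4)  # -> array([1, 3]), one label per group of 4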


def chord_str_to_pair(chord_str):
    "converts a chord string to a pair of (note, chord) indices"
    if chord_str == 'N':
        return (-1, -1)
    note, chord_type = chord_str.split(':')
    note_ind = NOTE_NAMES.index(note)
    chord_type_ind = CHORD_TYPES.index(chord_type)
    return (note_ind, chord_type_ind)


def chords_str_to_pairs(chords_str):
    for chord_str in chords_str.split(','):
        yield chord_str_to_pair(chord_str)


def chords_str_to_inds(chords_str):
    for chord_str in chords_str.split(','):
        yield POSSIBLE_CHORDS.index(chord_str)


def pair_to_chord_index(pair):
    "converts a pair of (note, chord_type) indices to a single chord index"
    note_ind, chord_type_ind = pair
    return note_ind*len(CHORD_TYPES) + chord_type_ind


def chord_index_to_pair(ci):
    "converts a single chord index to a pair of (note, chord) indices"
    note_ind = ci // len(CHORD_TYPES)
    chord_type_ind = ci % len(CHORD_TYPES)
    return (note_ind, chord_type_ind)


def chord_index_to_str(ci):
    "converts a single chord index to a chord string"
    return POSSIBLE_CHORDS[ci]
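
# Example (illustrative only; indices follow from the NOTE_NAMES and CHORD_TYPES lists above):
#     chord_str_to_pair('C:maj')    # -> (0, 4)
#     pair_to_chord_index((0, 4))   # -> 4
#     chord_index_to_pair(4)        # -> (0, 4)
#     chord_index_to_str(4)         # -> 'C:maj'
#     chord_str_to_pair('N')        # -> (-1, -1), the no-chord sentinel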


class ChordEmbedding(nn.Module):
    def __init__(self, chord_emb_dim=8, note_emb_dim=8, type_emb_dim=8, debug=False):
        super(ChordEmbedding, self).__init__()
        self.emb_note = nn.Embedding(len(NOTE_NAMES)+1, note_emb_dim)  # +1 for "N", i.e. "no chord"
        self.emb_type = nn.Embedding(len(CHORD_TYPES), type_emb_dim)
        self.compactify = nn.Linear(note_emb_dim + type_emb_dim, chord_emb_dim)
        self.chord_emb_dim = chord_emb_dim
        self.debug = debug
        self.zero_vec = torch.zeros((1, self.chord_emb_dim))

    def forward(self, chord_inds:torch.Tensor, debug=False):
        """chord_inds should have dimensions (B) where B is the batch size; each value is the index of the chord in the vocabulary.
        Wherever an index equals len(POSSIBLE_CHORDS), we want to return a zero vector; otherwise we return the embedding."""
        if chord_inds.max() > len(POSSIBLE_CHORDS):
            torch.set_printoptions(threshold=10000)
            print(f"\nchord_inds.max() = {chord_inds.max()} but len(POSSIBLE_CHORDS) = {len(POSSIBLE_CHORDS)}. \nchord_inds = {chord_inds}")
            raise ValueError("chord_inds.max() should not exceed len(POSSIBLE_CHORDS)")
        note_inds, type_inds = chord_inds // len(CHORD_TYPES), chord_inds % len(CHORD_TYPES)
        # note that for the 'N' chord, where chord_ind == len(POSSIBLE_CHORDS)-1, we get note_inds = len(NOTE_NAMES) and type_inds = 0. that's why self.emb_note has len(NOTE_NAMES)+1 entries
        if debug:
            print("note_inds, type_inds = ", note_inds, type_inds)
            print("note_inds.max(), type_inds.max() = ", note_inds.max(), type_inds.max())
        note_emb = self.emb_note(note_inds)
        type_emb = self.emb_type(type_inds)
        if debug: print("\nnote_emb.shape, type_emb.shape = ", note_emb.shape, type_emb.shape)
        combined_emb = torch.cat((note_emb, type_emb), dim=1)
        if debug: print("combined_emb.shape = ", combined_emb.shape)
        x = self.compactify(combined_emb)
        if debug: print("ce: x.shape, self.chord_emb_dim = ", x.shape, self.chord_emb_dim)
        return x
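
# Usage sketch (illustrative only; shapes assume the default chord_emb_dim=8):
#     ce = ChordEmbedding()
#     inds = torch.tensor([4, 312, 528])   # 'C:maj', 'G:maj', 'N'
#     ce(inds).shape                       # -> torch.Size([3, 8])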


class ChordAE(nn.Module):
    """Maybe not needed: Autoencoder for training chord embeddings?
    Note: we don't really need an AE for the full model, we can get by with just the encoder (and no decoder),
    but the AE is useful for exploring how few dimensions we can get away with"""
    def __init__(self, chord_vocab_size=len(POSSIBLE_CHORDS), chord_emb_dim=8):
        super(ChordAE, self).__init__()
        self.encoder = ChordEmbedding(chord_emb_dim)
        self.decoder = nn.Linear(chord_emb_dim, chord_vocab_size)  # could do better maybe

    def forward(self, x, debug=False):
        x = self.encoder(x)
        x = self.decoder(x)
        return x


def abs_seq_to_rel_seq(seq:torch.Tensor):
    """converts a batch of absolute sequences of chord indices to a batch of relative sequences of chord indices:
    subtract the note of the first element in each batch from all the other note indices, modulo len(NOTE_NAMES);
    leave the first element unchanged, and overwrite any 'N' chords with... something else? TODO
    """
    assert len(seq.shape) == 2, f"seq should be 2D, but seq.shape = {seq.shape}"
    # decompose seq into two tensors, one of notes and one of chord types
    note_inds, type_inds = seq // len(CHORD_TYPES), seq % len(CHORD_TYPES)
    # for note_inds < 12, subtract the first element in the sequence, modulo len(NOTE_NAMES) i.e. 12
    note_inds2 = note_inds.clone()
    note_inds2[:,1:] = (note_inds2[:,1:] - note_inds2[:,0].unsqueeze(1)) % len(NOTE_NAMES)
    # 'N' chords: wherever note_inds == 12, overwrite note_inds2 with 12
    note_inds2[note_inds == len(NOTE_NAMES)] = len(NOTE_NAMES)
    # recompose seq
    changes_seq = note_inds2 * len(CHORD_TYPES) + type_inds  # these are no longer chords, they are chord *changes* relative to the first chord
    return changes_seq
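
# Worked example (illustrative only): a batch of one sequence 'D:maj', 'G:maj', 'A:min'
# (indices 92, 312, 414). The first chord stays absolute; later roots are re-expressed as
# intervals above the first root, i.e. 'D:maj', 'F:maj', 'G:min' (indices 92, 224, 326):
#     abs_seq_to_rel_seq(torch.tensor([[92, 312, 414]]))   # -> tensor([[92, 224, 326]])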


class ChordSeqEncoder(nn.Module):
    """Encoder for sequences of chords:
    We embed the first chord, then we embed the CHANGES in chords thereafter (using modulo-12 arithmetic on the bass note)
    (4 chords per bar x 32 bars = 128 chords),
    and then pass the sequence of chord embeddings through some sequence model
    (LSTM for now, could use a Transformer or something else later)
    to generate a [256]-dimensional embedding of the sequence of chord embeddings
    """
    def __init__(self, chord_emb_dim=8, seq_len=512//4, seq_emb_dim=256, hidden_dim=512, dropout=0.2):
        super(ChordSeqEncoder, self).__init__()
        self.chord_encoder = ChordEmbedding()
        self.seq_encoder = nn.LSTM(chord_emb_dim, seq_emb_dim, batch_first=True, num_layers=2, dropout=dropout)
        self.seq_len = seq_len

    def forward(self, bs):
        "bs should have dimensions (B, S) where B is the batch size and S is the length of the sequence of chord indices"
        B, S = bs.shape
        changes_seq = abs_seq_to_rel_seq(bs)  # convert to relative sequence of chord indices
        # get chord embeddings for every chord in the batch in the sequence
        x = self.chord_encoder(changes_seq.flatten())
        # reshape x into (B, S, E) where B is the batch size, S is the sequence length, and E is the chord embedding dimension
        x = x.view(B, S, -1)
        E = x.shape[-1]
        #print("before seq_encoder, x.shape = ", x.shape)
        #x, _ = self.seq_encoder(x)
        output, (hidden, cell) = self.seq_encoder(x)
        # output of forward should be a 2-D tensor of shape (B, SE) where SE = seq_emb_dim
        x = hidden[0, :, :]  # the first LSTM layer's hidden state at the final timestep
        #print("after seq_encoder, x.shape = ", x.shape)
        return x
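
# Usage sketch (illustrative only; random indices stand in for a real chord sequence):
#     cse = ChordSeqEncoder()
#     bs = torch.randint(0, len(POSSIBLE_CHORDS), (2, 128))   # (B, S) = (2, 128)
#     cse(bs).shape                                           # -> torch.Size([2, 256])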


class ChordSeqAE(nn.Module):
    """
    Chord Sequence Autoencoder. For pretraining a ChordSeqEncoder
    """
    def __init__(self, chord_emb_dim=8, seq_len=512//4, seq_emb_dim=256,
                 hidden_dim=512, chord_vocab_size=len(POSSIBLE_CHORDS),
                 vae_scale=0.1):
        super(ChordSeqAE, self).__init__()
        self.encoder = ChordSeqEncoder(chord_emb_dim=chord_emb_dim, seq_len=seq_len, seq_emb_dim=seq_emb_dim, hidden_dim=hidden_dim)
        # the decoder is a sequence of linear layers with a ReLU in between
        self.decoder = nn.Sequential(
            nn.Linear(seq_emb_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, seq_len*chord_vocab_size)
        )
        self.chord_vocab_size = chord_vocab_size
        self.vae_scale = vae_scale

    def forward(self, bs, debug=False):
        "bs should have dimensions (B, S) where B is the batch size and S is the length of the sequence of chord indices"
        if debug: print("ChordSeqAE: bs.shape = ", bs.shape)
        B, S = bs.shape
        x = self.encoder(bs)
        if debug: print("ChordSeqAE: encoded x.shape = ", x.shape)
        if self.vae_scale > 0 and self.training:
            x = x + self.vae_scale*(x.max() - x.min()) * torch.randn_like(x)
        x = self.decoder(x)
        x = x.view(B, S, -1)
        if debug: print("ChordSeqAE: decoded x.shape = ", x.shape)
        return x
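
# Usage sketch (illustrative only): reconstruct per-position chord logits from the sequence embedding.
#     csae = ChordSeqAE()
#     bs = torch.randint(0, len(POSSIBLE_CHORDS), (2, 128))   # (B, S); S must equal seq_len here
#     csae(bs).shape                                          # -> torch.Size([2, 128, 529]), i.e. (B, S, chord_vocab_size)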


def chord_seq_from_img(img:Image.Image,
                       every=8,  # was imagining every beat (every=4), but looking at the data, it seems like the smallest chord label is 8 pixels wide
                       debug=False):
    """extracts a sequence of chord indices from a pianoroll image
    hopefully the dataloader will mean we can just do one image and it'll batch them
    """
    if debug: print("img.size, img.min, img.max = ", img.size, np.array(img).min(), np.array(img).max())
    if img.size[0] == img.size[1]:  # if image is square, make it rectangular
        img = square_to_rect(img)
    img_arr = np.array(img)
    top_row = img_arr[CHORD_BORDER//2]  # all x's along y = CHORD_BORDER/2
    if debug:
        img.save("chord_seq_from_img.png")
        print("img_arr.shape = ", img_arr.shape)
        print("top_row.shape = ", top_row.shape)
        print("top_row = ", top_row)
    chord_seq = np.array([color_to_chord_num(tuple(c)) for c in top_row])
    if chord_seq.max() >= len(POSSIBLE_CHORDS):
        print(f"chord_seq.max = {chord_seq.max()} should be less than len(POSSIBLE_CHORDS) = {len(POSSIBLE_CHORDS)}\nchord_seq = {chord_seq}")
        indices = np.where(chord_seq >= len(POSSIBLE_CHORDS))[0]
        print("indices, chord_seq[indices], top_row[indices] = ", indices, chord_seq[indices], top_row[indices])
        raise ValueError("chord_seq.max() should be less than len(POSSIBLE_CHORDS)")
    chord_seq_beats = most_freq_or_first_every(chord_seq, every=every)
    assert chord_seq_beats.max() <= chord_seq.max(), f"chord_seq_beats.max() = {chord_seq_beats.max()} should be <= chord_seq.max() = {chord_seq.max()}"
    if debug: print("chord_seq_beats, len(POSSIBLE_CHORDS) = ", chord_seq_beats, len(POSSIBLE_CHORDS))
    assert chord_seq_beats.max() < len(POSSIBLE_CHORDS), f"chord_seq_beats.max() should be less than len(POSSIBLE_CHORDS) = {len(POSSIBLE_CHORDS)}"
    return torch.tensor(chord_seq_beats)


def chord_seq_from_img_tensor_batch(img_tensor_batch:torch.Tensor, every=8, debug=False):
    """extracts a sequence of chord indices from a batch of pianoroll images"""
    batch_size = img_tensor_batch.shape[0]
    itb = (img_tensor_batch + 1.0) * 127.5  # rescale from -1..1 to 0..255
    chord_seqs = []
    for i in range(batch_size):  # TODO: may be a faster way to do this with tensor ops
        # converting to images and back is slow
        img = Image.fromarray(np.round(itb[i].cpu().permute(1,2,0).numpy()).astype(np.uint8))
        img = square_to_rect(img)
        chord_seq = chord_seq_from_img(img, every=every)
        chord_seqs.append(chord_seq)
    return torch.stack(chord_seqs).to(img_tensor_batch.device)


def img_batch_to_seq_emb(img_tensor_batch:torch.Tensor, chord_seq_encoder:nn.Module, every=8, debug=False):
    """converts a batch of pianoroll images to a batch of chord sequence embeddings"""
    chord_seq_batch = chord_seq_from_img_tensor_batch(img_tensor_batch, every=every, debug=debug)
    cs_emb = chord_seq_encoder(chord_seq_batch)
    return cs_emb


# TODO: test it!
if __name__ == '__main__':
    # FOR TESTING/DEV ONLY
    import sys, random

    def make_image_tensor_batch(batch_size=2):
        """FOR TESTING/DEV ONLY: makes a batch of random chord-endowed pianoroll (square) images,
        so I can iterate on other parts of this faster w/o having to spin up crowson's training code every time while I write code here.
        shape = (B, 3, 256, 256), normalization = -1.0 to 1.0
        """
        img_batch = torch.zeros((batch_size, 3, 256, 256))
        for i in range(batch_size):
            n = i+1  # np.random.randint(0, 909)
            img_filename = f"/data/POP909-Dataset/images_128_rg_chords_TOTAL/{n:03}_TOTAL.png"  # place to grab images from
            img = Image.open(img_filename).convert('RGB')
            # crop to 512 pixels wide
            img = img.crop((0, 0, 512, 128))
            img = rect_to_square(img)
            img_batch[i] = torch.tensor(np.array(img)).permute(2,0,1).float() / 127.5 - 1.0  # normalization done by dataloader makes images -1 to 1
        return img_batch

    # quick check of the chord-number <-> color mapping
    for cn in range(len(POSSIBLE_CHORDS)):
        color = chord_num_to_color(cn)
        print("cn, color = ", cn, color)
        cn2 = color_to_chord_num(color)
        assert cn2 == cn, f"cn2={cn2} should be cn={cn}, color={color}"

    if len(sys.argv) <= 1:
        print("Testing suite. Usage: python chords.py <some_arg>")
        sys.exit(1)
    some_arg = sys.argv[1]

    batch_size = 2
    img_tensor_batch = make_image_tensor_batch(batch_size=batch_size)
    print("img_tensor_batch.shape = ", img_tensor_batch.shape)
    print("img_tensor_batch.min(), img_tensor_batch.max() = ", img_tensor_batch.min(), img_tensor_batch.max())
    chord_seq_batch = chord_seq_from_img_tensor_batch(img_tensor_batch, every=8, debug=False)
    print("chord_seq_batch.shape = ", chord_seq_batch.shape)
    print(f"chord_seq_batch = \n{chord_seq_batch}")
    cse = ChordSeqEncoder()
    cs_emb = cse(chord_seq_batch)
    print("cs_emb.shape = ", cs_emb.shape)
    #print(f"cs_emb = \n{cs_emb}")
    sys.exit(0)

    # everything below is older scratch/testing code, unreachable unless the sys.exit(0) above is removed
    img_filename = some_arg
    img = Image.open(img_filename).convert('RGB')
    chord_ind_seq = chord_seq_from_img(img, debug=False)
    print("chord_ind_seq = ", chord_ind_seq)
    print("len(chord_ind_seq) = ", len(chord_ind_seq))
    chord_embedder = ChordEmbedding()
    #print("chord_embeddings = ", chord_embedder(chord_ind_seq))
    sys.exit(0)

    #chords_str = some_arg
    #cis = chords_str_to_inds(chords_str)
    cis = chord_ind_seq
    for ci in cis:
        print("\n-------")
        #ci = pair_to_chord_index(pair)
        pair = chord_index_to_pair(ci)
        print(f"Input: ci = {ci}, pair = {pair}")
        color = chord_num_to_color(ci)
        print(color)
        cn2 = color_to_chord_num(color)
        out_str = chord_index_to_str(cn2)
        print(f"Output: cn2 = {cn2}, out_str = {out_str}")
        print("Embedding: ")
        with torch.no_grad():
            x = torch.tensor([ci])
            print(chord_embedder(x))