Spaces:

Smotto
/

Vocal-Isolator

Running

App Files Files Community

Vocal-Isolator / src /models /MDX_net /kimvocal.py

Jarod Castillo

init

88b57c0 about 1 year ago

raw

history blame

No virus

2.37 kB

	# Standard Library Imports

	# Third Party Imports
	import torch
	import onnxruntime as ort

	# Local Imports
	from src.models.MDX_net.mdx_net import Conv_TDF_net_trimm
	from src.loader import Loader

	# Global Variables
	from src.constants import EXECUTION_PROVIDER_LIST, COMPUTATION_DEVICE, ONNX_MODEL_PATH


	class KimVocal:
	    """
	    Vocal isolation backed by the ONNX model stored at ``ONNX_MODEL_PATH``.

	    TODO: Put something here for flexibility purposes (model types).
	    """

	    def __init__(self):
	        pass

	    def demix_vocals(self, music_tensor, sample_rate, model, streamlit_progressbar):
	        """
	        Extract the vocal stem from a mix by running an ONNX model chunk by chunk.

	        The mix is zero-padded, cut into windows of ``model.chunk_size`` samples
	        that advance by ``chunk_size - 2 * overlap`` samples, and each window is
	        passed through ``model.stft`` -> ONNX session -> ``model.stft.inverse``.
	        The ``overlap`` samples at both edges of every result are dropped before
	        the pieces are concatenated.

	        Args:
	            music_tensor (torch.Tensor): Input mix, indexed as
	                ``(rows, samples)``. The zero padding is hard-coded to 2 rows,
	                so the tensor must have 2 rows (presumably stereo channels --
	                confirm against the caller).
	            sample_rate: Currently unused; kept for interface compatibility.
	            model: Object providing ``overlap``, ``chunk_size`` and a callable
	                ``stft`` with an ``inverse`` method (presumably a
	                ``Conv_TDF_net_trimm`` -- confirm against the caller).
	            streamlit_progressbar: Object whose ``progress(fraction)`` method is
	                called once per chunk with a value in ``[0, 1)``.

	        Returns:
	            torch.Tensor: The concatenated model output with the trailing
	            padding removed, moved to the CPU.

	        Raises:
	            ValueError: If ``model.chunk_size`` is not larger than
	                ``2 * model.overlap``, in which case no chunk could contribute
	                any samples.
	        """
	        number_of_samples = music_tensor.shape[1]
	        overlap = model.overlap
	        chunk_size = model.chunk_size

	        # Samples each chunk actually contributes once both overlaps are trimmed.
	        gen_size = chunk_size - 2 * overlap
	        if gen_size <= 0:
	            # Without this guard the modulo below divides by zero (gen_size == 0)
	            # or the chunk loop can never advance (gen_size < 0).
	            raise ValueError(
	                f"model.chunk_size ({chunk_size}) must be greater than "
	                f"2 * model.overlap ({2 * overlap})"
	            )

	        # pad_size is always in [1, gen_size], so the padded length is a whole
	        # number of steps and the final `:-pad_size` trim is never `:-0`.
	        pad_size = gen_size - number_of_samples % gen_size
	        total_length = number_of_samples + pad_size
	        mix_padded = torch.cat(
	            [torch.zeros(2, overlap), music_tensor, torch.zeros(2, pad_size + overlap)],
	            1,
	        )

	        # Start running the session for the model.
	        # NOTE(review): a new session is created on every call; consider caching
	        # it if this method is invoked repeatedly.
	        ort_session = ort.InferenceSession(ONNX_MODEL_PATH, providers=EXECUTION_PROVIDER_LIST)

	        # TODO: any way to optimize against silence? I think that's what skips are for, gotta double check.
	        # Process one chunk at a time (batch_size=1).
	        demixed_chunks = []
	        for start in range(0, total_length, gen_size):
	            # Progress Bar
	            streamlit_progressbar.progress(start / total_length)

	            # Computation (inference only, so no autograd graph is needed).
	            chunk = mix_padded[:, start : start + chunk_size]
	            with torch.no_grad():
	                spectrogram = model.stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
	                prediction = ort_session.run(None, {"input": spectrogram.cpu().numpy()})[0]
	                waveform = model.stft.inverse(torch.tensor(prediction)).squeeze(0)

	            # Explicit end index: `overlap:-overlap` would yield an empty slice
	            # when overlap == 0.
	            demixed_chunks.append(waveform[..., overlap : waveform.shape[-1] - overlap])

	        vocals_output = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()

	        return vocals_output


	if __name__ == "__main__":
	    # KimVocal defines only __init__ and demix_vocals, so the previous
	    # `KimVocal().main()` call always failed with AttributeError. Exit with a
	    # clear message (and a non-zero status) instead of a traceback.
	    raise SystemExit(
	        "kimvocal.py is not a standalone script: import KimVocal and call "
	        "demix_vocals(music_tensor, sample_rate, model, streamlit_progressbar)."
	    )