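"""Chunked inference script for a pretrained look2hear (Apollo) model.

Loads a wav file at 44.1 kHz, runs it through the model in overlapping chunks
on the GPU, recombines the chunks by windowed overlap-add and writes the result
as 16-bit PCM.
"""
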
import os
import torch
import librosa
import look2hear.models
import soundfile as sf
from tqdm.auto import tqdm
import argparse
import numpy as np
import warnings
warnings.filterwarnings("ignore")

def load_audio(file_path):
    # librosa returns a float numpy array; keep the original channel layout (mono=False)
    audio, samplerate = librosa.load(file_path, mono=False, sr=44100)
    print(f'INPUT audio.shape = {audio.shape} | samplerate = {samplerate}')
    # audio = dBgain(audio, -6)
    return torch.from_numpy(audio), samplerate

def save_audio(file_path, audio, samplerate=44100):
    # audio = dBgain(audio, +6)
    sf.write(file_path, audio.T, samplerate, subtype="PCM_16")

def process_chunk(chunk):
    # Add a batch dimension and run the model on the GPU without gradients.
    # The trailing squeezes drop the leading singleton dims of the model output
    # (this assumes the pretrained Apollo checkpoint's output layout).
    chunk = chunk.unsqueeze(0).cuda()
    with torch.no_grad():
        return model(chunk).squeeze(0).squeeze(0).cpu()

def _getWindowingArray(window_size, fade_size):
    # No fade-in at the start; the last fade_size samples are zeroed so the
    # unreliable ending of each chunk is dropped before overlap-add.
    fadein = torch.linspace(1, 1, fade_size)
    fadeout = torch.linspace(0, 0, fade_size)
    window = torch.ones(window_size)
    window[-fade_size:] *= fadeout
    window[:fade_size] *= fadein
    return window
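
# For instance, _getWindowingArray(10, 3) yields
# tensor([1., 1., 1., 1., 1., 1., 1., 0., 0., 0.]): all ones except the final
# fade_size samples, which are zeroed.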

def dBgain(audio, volume_gain_dB):
    # Convert a gain in dB to a linear factor and apply it
    gain = 10 ** (volume_gain_dB / 20)
    return audio * gain
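
# For example, dBgain(audio, -6) scales the signal by 10 ** (-6 / 20) ≈ 0.501,
# i.e. it roughly halves the amplitude.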

def main(input_wav, output_wav):
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"

    global model
    model = look2hear.models.BaseModel.from_pretrain(
        "/kaggle/working/Apollo/model/pytorch_model.bin",
        sr=44100, win=20, feature_dim=256, layer=6
    ).cuda()

    test_data, samplerate = load_audio(input_wav)

    C = chunk_size * samplerate  # chunk_size seconds to samples
    N = overlap
    step = C // N
    fade_size = 2 * 44100  # 2 seconds
    print(f"N = {N} | C = {C} | step = {step} | fade_size = {fade_size}")
    border = C - step
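    # With the default arguments (chunk_size=3, overlap=2) at 44100 Hz this gives
    # C = 132300, step = 66150, border = 66150 and fade_size = 88200 samples.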

    # Pad the input if necessary
    if test_data.shape[1] > 2 * border and (border > 0):
        test_data = torch.nn.functional.pad(test_data, (border, border), mode='reflect')

    windowingArray = _getWindowingArray(C, fade_size)

    # Overlap-add buffers: `result` accumulates windowed chunks, `counter`
    # accumulates the window weights used for normalisation at the end.
    result = torch.zeros((1,) + tuple(test_data.shape), dtype=torch.float32)
    counter = torch.zeros((1,) + tuple(test_data.shape), dtype=torch.float32)

    i = 0
    progress_bar = tqdm(total=test_data.shape[1], desc="Processing audio chunks", leave=False)

    while i < test_data.shape[1]:
        part = test_data[:, i:i + C]
        length = part.shape[-1]
        if length < C:
            if length > C // 2 + 1:
                part = torch.nn.functional.pad(input=part, pad=(0, C - length), mode='reflect')
            else:
                part = torch.nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode='constant', value=0)

        out = process_chunk(part)

        window = windowingArray
        if i == 0:  # First audio chunk, no fade-in
            window[:fade_size] = 1
        elif i + C >= test_data.shape[1]:  # Last audio chunk, no fade-out
            window[-fade_size:] = 1

        result[..., i:i + length] += out[..., :length] * window[..., :length]
        counter[..., i:i + length] += window[..., :length]

        i += step
        progress_bar.update(step)

    progress_bar.close()

    final_output = result / counter
    final_output = final_output.squeeze(0).numpy()
    np.nan_to_num(final_output, copy=False, nan=0.0)

    # Remove padding if added earlier
    if test_data.shape[1] > 2 * border and (border > 0):
        final_output = final_output[..., border:-border]

    save_audio(output_wav, final_output, samplerate)
    print(f'Success! Output file saved as {output_wav}')

    # Memory clearing
    model.cpu()
    del model
    torch.cuda.empty_cache()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Audio Inference Script")
    parser.add_argument("--in_wav", type=str, required=True, help="Path to input wav file")
    parser.add_argument("--out_wav", type=str, required=True, help="Path to output wav file")
    parser.add_argument("--chunk_size", type=int, help="Chunk size in seconds", default=3)
    parser.add_argument("--overlap", type=int, help="Overlap factor (hop = chunk_size // overlap)", default=2)
    args = parser.parse_args()

    chunk_size = args.chunk_size
    overlap = args.overlap
    print(f'chunk_size = {chunk_size}, overlap = {overlap}')

    main(args.in_wav, args.out_wav)
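
# Example invocation (paths are illustrative; the model checkpoint path above
# assumes a Kaggle working directory):
#   python inference.py --in_wav input.wav --out_wav output.wav --chunk_size 3 --overlap 2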