import os
import sys
import torch
import shutil
import librosa
import warnings
import subprocess
import numpy as np
import gradio as gr
import librosa.display
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from collections import Counter
from PIL import Image
from model import EvalNet
from utils import get_modelist, find_mp3_files, download

TRANSLATE = {
    "Symphony": "Symphony",
    "Opera": "Opera",
    "Solo": "Solo",
    "Chamber": "Chamber",
    "Pop_vocal_ballad": "Pop vocal ballad",
    "Adult_contemporary": "Adult contemporary",
    "Teen_pop": "Teen pop",
    "Contemporary_dance_pop": "Contemporary dance pop",
    "Dance_pop": "Dance pop",
    "Classic_indie_pop": "Classic indie pop",
    "Chamber_cabaret_and_art_pop": "Chamber cabaret & art pop",
    "Soul_or_r_and_b": "Soul / R&B",
    "Adult_alternative_rock": "Adult alternative rock",
    "Uplifting_anthemic_rock": "Uplifting anthemic rock",
    "Soft_rock": "Soft rock",
    "Acoustic_pop": "Acoustic pop",
}
CLASSES = list(TRANSLATE.keys())
CACHE_DIR = "./__pycache__/tmp"


def most_common_element(input_list):
    # Majority vote: return the most frequent element of the list
    counter = Counter(input_list)
    mce, _ = counter.most_common(1)[0]
    return mce


def mp3_to_mel(audio_path: str, width=11.4):
    # Slice the log-mel spectrogram into windows of `width` seconds and save
    # each window as a JPEG under CACHE_DIR
    y, sr = librosa.load(audio_path)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    dur = librosa.get_duration(y=y, sr=sr)
    total_frames = log_mel_spec.shape[1]
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        librosa.display.specshow(log_mel_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{CACHE_DIR}/mel_{round(dur, 2)}_{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def mp3_to_cqt(audio_path: str, width=11.4):
    # Same windowing as mp3_to_mel, applied to a constant-Q transform
    y, sr = librosa.load(audio_path)
    cqt_spec = librosa.cqt(y=y, sr=sr)
    log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
    dur = librosa.get_duration(y=y, sr=sr)
    total_frames = log_cqt_spec.shape[1]
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        librosa.display.specshow(log_cqt_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{CACHE_DIR}/cqt_{round(dur, 2)}_{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def mp3_to_chroma(audio_path: str, width=11.4):
    # Same windowing, applied to a chromagram
    y, sr = librosa.load(audio_path)
    chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
    log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
    dur = librosa.get_duration(y=y, sr=sr)
    total_frames = log_chroma_spec.shape[1]
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        librosa.display.specshow(log_chroma_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{CACHE_DIR}/chroma_{round(dur, 2)}_{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def embed_img(img_path, input_size=224):
    # Resize to the network input size and normalize each channel to [-1, 1]
    transform = transforms.Compose(
        [
            transforms.Resize([input_size, input_size]),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )
    img = Image.open(img_path).convert("RGB")
    return transform(img).unsqueeze(0)


def inference(mp3_path, log_name: str, folder_path=CACHE_DIR):
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)

    if not mp3_path:
        return None, "Please input an audio!"
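    # log_name encodes both the backbone and the spectrogram type, e.g.
    # "VGG19_BN_cqt" -> spec "cqt"; the suffix selects the matching converter.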
    spec = log_name.split("_")[-1]
    os.makedirs(folder_path, exist_ok=True)
    try:
        network = EvalNet(log_name)
        # Explicit dispatch instead of eval("mp3_to_%s" % spec)
        converter = {"mel": mp3_to_mel, "cqt": mp3_to_cqt, "chroma": mp3_to_chroma}
        converter[spec](mp3_path)
    except Exception as e:
        # Bail out early: without spectrogram slices there is nothing to classify
        print(f"Error converting {mp3_path}: {e}")
        return None, f"Error converting audio: {e}"

    # Classify every slice, then take a majority vote over the predictions
    outputs = []
    all_files = os.listdir(folder_path)
    for file_name in all_files:
        if file_name.lower().endswith(".jpg"):
            file_path = os.path.join(folder_path, file_name)
            input_tensor = embed_img(file_path)
            output: torch.Tensor = network.model(input_tensor)
            pred_id = torch.max(output.data, 1)[1]
            outputs.append(int(pred_id))

    if not outputs:
        shutil.rmtree(folder_path)
        return None, "No spectrogram slices were generated!"

    max_count_item = most_common_element(outputs)
    shutil.rmtree(folder_path)
    return os.path.basename(mp3_path), TRANSLATE[CLASSES[max_count_item]]


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    # On Linux, fetch and unpack a static ffmpeg build so MP3 decoding works
    ffmpeg = "ffmpeg-release-amd64-static"
    if sys.platform.startswith("linux"):
        if not os.path.exists(f"./{ffmpeg}.tar.xz"):
            download(
                f"https://www.modelscope.cn/studio/ccmusic-database/music_genre/resolve/master/{ffmpeg}.tar.xz"
            )

        folder_path = f"{os.getcwd()}/{ffmpeg}"
        if not os.path.exists(folder_path):
            subprocess.call(f"tar -xvf {ffmpeg}.tar.xz", shell=True)

        os.environ["PATH"] = f"{folder_path}:{os.environ.get('PATH', '')}"

    models = get_modelist(assign_model="VGG19_BN_cqt")
    examples = [[mp3, models[0]] for mp3 in find_mp3_files()]

    with gr.Blocks() as demo:
        gr.Interface(
            fn=inference,
            inputs=[
                gr.Audio(label="Upload MP3", type="filepath"),
                gr.Dropdown(choices=models, label="Select a model", value=models[0]),
            ],
            outputs=[
                gr.Textbox(label="Audio filename", show_copy_button=True),
                gr.Textbox(label="Genre recognition", show_copy_button=True),
            ],
            examples=examples,
            cache_examples=False,
            allow_flagging="never",
            title="It is recommended to keep recordings within 15 s; longer clips slow down recognition.",
        )

    demo.launch()
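
# Minimal usage sketch outside the Gradio UI (assumes a local "./test.mp3" and
# that "VGG19_BN_cqt" is among the checkpoints returned by get_modelist):
#   filename, genre = inference("./test.mp3", "VGG19_BN_cqt")
#   print(filename, genre)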