import os
import sys
import torch
import shutil
import librosa
import warnings
import subprocess
import numpy as np
import gradio as gr
import librosa.display
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from collections import Counter
from PIL import Image
from model import EvalNet
from utils import get_modelist, find_mp3_files, download

TRANSLATE = {
    "Symphony": "Symphony",
    "Opera": "Opera",
    "Solo": "Solo",
    "Chamber": "Chamber",
    "Pop_vocal_ballad": "Pop vocal ballad",
    "Adult_contemporary": "Adult contemporary",
    "Teen_pop": "Teen pop",
    "Contemporary_dance_pop": "Contemporary dance pop",
    "Dance_pop": "Dance pop",
    "Classic_indie_pop": "Classic indie pop",
    "Chamber_cabaret_and_art_pop": "Chamber cabaret & art pop",
    "Soul_or_r_and_b": "Soul / R&B",
    "Adult_alternative_rock": "Adult alternative rock",
    "Uplifting_anthemic_rock": "Uplifting anthemic rock",
    "Soft_rock": "Soft rock",
    "Acoustic_pop": "Acoustic pop",
}
CLASSES = list(TRANSLATE.keys())
CACHE_DIR = "./__pycache__/tmp"


def most_common_element(input_list):
    # Majority vote: return the most frequent element of the list
    counter = Counter(input_list)
    mce, _ = counter.most_common(1)[0]
    return mce


def mp3_to_mel(audio_path: str, width=11.4):
    # Slice the log-mel spectrogram into windows of `width` seconds and save
    # each window as a JPEG under CACHE_DIR
    y, sr = librosa.load(audio_path)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    dur = librosa.get_duration(y=y, sr=sr)
    total_frames = log_mel_spec.shape[1]
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        librosa.display.specshow(log_mel_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{CACHE_DIR}/mel_{round(dur, 2)}_{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def mp3_to_cqt(audio_path: str, width=11.4):
    # Same windowing as mp3_to_mel, applied to a constant-Q transform
    y, sr = librosa.load(audio_path)
    cqt_spec = librosa.cqt(y=y, sr=sr)
    log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
    dur = librosa.get_duration(y=y, sr=sr)
    total_frames = log_cqt_spec.shape[1]
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        librosa.display.specshow(log_cqt_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{CACHE_DIR}/cqt_{round(dur, 2)}_{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def mp3_to_chroma(audio_path: str, width=11.4):
    # Same windowing, applied to a chromagram
    y, sr = librosa.load(audio_path)
    chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
    log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
    dur = librosa.get_duration(y=y, sr=sr)
    total_frames = log_chroma_spec.shape[1]
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        librosa.display.specshow(log_chroma_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{CACHE_DIR}/chroma_{round(dur, 2)}_{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def embed_img(img_path, input_size=224):
    # Resize to the network input size and normalize each channel to [-1, 1]
    transform = transforms.Compose(
        [
            transforms.Resize([input_size, input_size]),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )
    img = Image.open(img_path).convert("RGB")
    return transform(img).unsqueeze(0)


def inference(mp3_path, log_name: str, folder_path=CACHE_DIR):
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)

    if not mp3_path:
        return None, "Please input an audio!"
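    # log_name encodes both the backbone and the spectrogram type, e.g.
    # "VGG19_BN_cqt" -> spec "cqt"; the suffix selects the matching converter.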
    spec = log_name.split("_")[-1]
    os.makedirs(folder_path, exist_ok=True)
    try:
        network = EvalNet(log_name)
        # Explicit dispatch instead of eval("mp3_to_%s" % spec)
        converter = {"mel": mp3_to_mel, "cqt": mp3_to_cqt, "chroma": mp3_to_chroma}
        converter[spec](mp3_path)
    except Exception as e:
        # Bail out early: without spectrogram slices there is nothing to classify
        print(f"Error converting {mp3_path}: {e}")
        return None, f"Error converting audio: {e}"

    # Classify every slice, then take a majority vote over the predictions
    outputs = []
    all_files = os.listdir(folder_path)
    for file_name in all_files:
        if file_name.lower().endswith(".jpg"):
            file_path = os.path.join(folder_path, file_name)
            input_tensor = embed_img(file_path)
            output: torch.Tensor = network.model(input_tensor)
            pred_id = torch.max(output.data, 1)[1]
            outputs.append(int(pred_id))

    if not outputs:
        shutil.rmtree(folder_path)
        return None, "No spectrogram slices were generated!"

    max_count_item = most_common_element(outputs)
    shutil.rmtree(folder_path)
    return os.path.basename(mp3_path), TRANSLATE[CLASSES[max_count_item]]


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    # On Linux, fetch and unpack a static ffmpeg build so MP3 decoding works
    ffmpeg = "ffmpeg-release-amd64-static"
    if sys.platform.startswith("linux"):
        if not os.path.exists(f"./{ffmpeg}.tar.xz"):
            download(
                f"https://www.modelscope.cn/studio/ccmusic-database/music_genre/resolve/master/{ffmpeg}.tar.xz"
            )

        folder_path = f"{os.getcwd()}/{ffmpeg}"
        if not os.path.exists(folder_path):
            subprocess.call(f"tar -xvf {ffmpeg}.tar.xz", shell=True)

        os.environ["PATH"] = f"{folder_path}:{os.environ.get('PATH', '')}"

    models = get_modelist(assign_model="VGG19_BN_cqt")
    examples = [[mp3, models[0]] for mp3 in find_mp3_files()]

    with gr.Blocks() as demo:
        gr.Interface(
            fn=inference,
            inputs=[
                gr.Audio(label="Upload MP3", type="filepath"),
                gr.Dropdown(choices=models, label="Select a model", value=models[0]),
            ],
            outputs=[
                gr.Textbox(label="Audio filename", show_copy_button=True),
                gr.Textbox(label="Genre recognition", show_copy_button=True),
            ],
            examples=examples,
            cache_examples=False,
            allow_flagging="never",
            title="It is recommended to keep recordings within 15 s; longer clips slow down recognition.",
        )

    demo.launch()
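
# Minimal usage sketch outside the Gradio UI (assumes a local "./test.mp3" and
# that "VGG19_BN_cqt" is among the checkpoints returned by get_modelist):
#   filename, genre = inference("./test.mp3", "VGG19_BN_cqt")
#   print(filename, genre)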