import torch
from torch import nn


# モデルの定義
class AudioClassifier(nn.Module):
    """Feed-forward classifier mapping a fixed-size feature vector to label logits.

    The network is an input block, ``num_hidden_layers`` hidden blocks and a
    final linear projection onto ``len(label2id)`` classes. Every block is
    ``Linear -> BatchNorm1d -> Mish -> Dropout``.

    Attributes are kept public (``num_classes``, ``device``, ``label2id``,
    ``id2label``) so callers can inspect the label mapping.
    """

    def __init__(
        self,
        label2id: dict,
        feature_dim: int = 256,
        hidden_dim: int = 256,
        device="cpu",
        dropout_rate: float = 0.5,
        num_hidden_layers: int = 2,
    ):
        """Build the layers and the label lookup tables.

        Args:
            label2id: Mapping from label to integer class id. The ids must
                cover ``0 .. len(label2id) - 1`` because inference looks up
                output indices in the inverted mapping.
            feature_dim: Size of the input feature vector.
            hidden_dim: Width of every hidden block.
            device: Device that inference inputs are moved to. The constructor
                only stores this value; it does not move the module itself,
                so the caller is responsible for calling ``.to(device)``.
            dropout_rate: Dropout probability used in every block.
            num_hidden_layers: Number of hidden blocks after the input block.
        """
        super().__init__()
        self.num_classes = len(label2id)
        self.device = device
        self.label2id = label2id
        self.id2label = {v: k for k, v in self.label2id.items()}
        # Input block: projects the features to the hidden width.
        self.fc1 = self._make_block(feature_dim, hidden_dim, dropout_rate)
        # Hidden blocks, all with the same width.
        self.hidden_layers = nn.ModuleList(
            [
                self._make_block(hidden_dim, hidden_dim, dropout_rate)
                for _ in range(num_hidden_layers)
            ]
        )
        # Final layer producing one logit per class.
        self.fc_last = nn.Linear(hidden_dim, self.num_classes)

    @staticmethod
    def _make_block(in_dim, out_dim, dropout_rate):
        """Return one ``Linear -> BatchNorm1d -> Mish -> Dropout`` block."""
        return nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.Mish(),
            nn.Dropout(dropout_rate),
        )

    def forward(self, x):
        """Return the raw class logits for a batch of feature vectors."""
        # Input block.
        x = self.fc1(x)

        # Hidden blocks, applied in order.
        for layer in self.hidden_layers:
            x = layer(x)

        # Classification layer (no softmax here; callers get logits).
        return self.fc_last(x)

    def infer_from_features(self, features):
        """Classify a single feature vector.

        Note that this switches the module to evaluation mode via
        ``self.eval()`` and leaves it in that mode afterwards.

        Args:
            features: One feature vector of length ``feature_dim`` (anything
                ``torch.as_tensor`` accepts). A batch dimension is added
                internally.

        Returns:
            A list of ``(label, probability)`` tuples covering every class,
            sorted by descending probability. Probabilities are NumPy scalars.
        """
        # Convert to a float tensor and add the batch dimension. as_tensor
        # avoids the copy (and the warning) torch.tensor() incurs when the
        # input is already a tensor.
        features = (
            torch.as_tensor(features, dtype=torch.float32)
            .unsqueeze(0)
            .to(self.device)
        )

        # Evaluation mode: BatchNorm uses running statistics (required for a
        # batch of one) and Dropout is disabled.
        self.eval()

        with torch.no_grad():
            output = self.forward(features)

        # Turn logits into probabilities.
        probs = torch.softmax(output, dim=1)

        # Sort classes by probability, highest first.
        probs, indices = torch.sort(probs, descending=True)
        # Drop only the batch axis. A bare squeeze() would also collapse the
        # class axis when there is a single class, leaving 0-d arrays that
        # cannot be iterated.
        probs = probs[0].cpu().numpy()
        indices = indices[0].cpu().numpy()
        return [(self.id2label[i], p) for i, p in zip(indices, probs)]

    def infer_from_file(self, file_path):
        """Extract features from ``file_path`` and classify them.

        Feature extraction is delegated to the module-level
        ``extract_features`` function, run on ``self.device``. The result has
        the same format as ``infer_from_features``.
        """
        feature = extract_features(file_path, device=self.device)
        return self.infer_from_features(feature)


from pyannote.audio import Inference, Model

# Load the pretrained embedding model once, at import time, so that every call
# to extract_features() reuses the same instance. Importing this module
# therefore triggers the model load.
emb_model = Model.from_pretrained("pyannote/wespeaker-voxceleb-resnet34-LM")
# NOTE(review): window="whole" presumably yields a single embedding for the
# entire file rather than sliding-window chunks; confirm against the
# pyannote.audio documentation.
inference = Inference(emb_model, window="whole")


def extract_features(file_path, device="cpu"):
    """Run the shared module-level ``inference`` object on ``file_path``.

    The inference object is moved to ``device`` before every call, and
    whatever it returns for the file is passed back unchanged.
    """
    target_device = torch.device(device)
    inference.to(target_device)
    embedding = inference(file_path)
    return embedding