# -*- coding: utf-8 -*-
from torch import nn
from transformers import AutoModel


class MonoBertRelevanceClassifierOneLayer(nn.Module):

    def __init__(self, bert_name, n_classes, freeze_bert=False, dropout=0.3, is_output_probability=True):
        super(MonoBertRelevanceClassifierOneLayer, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_name)
        # dropout layer
        self.dropout = nn.Dropout(p=dropout)
        # pooler used when the backbone (e.g. DistilBertModel) does not return a pooled output
        self.pooler = BertPooler(self.bert.config)
        # relu activation function
        self.relu = nn.ReLU()
        # dense layer 1 (output layer)
        self.fc1 = nn.Linear(int(self.bert.config.hidden_size), n_classes)
        # softmax activation function
        self.softmax = nn.Softmax(dim=1)
        # whether to apply softmax on the output or not
        self.is_output_probability = is_output_probability
        # freeze BERT layers
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

    def forward(self, input_ids, attention_mask):
        # Pass the inputs to the backbone.
        # last_hidden_state: FloatTensor of shape (batch_size, sequence_length, hidden_size),
        #   the sequence of hidden states at the output of the last layer of the model.
        # cls_output (pooler_output): FloatTensor of shape (batch_size, hidden_size),
        #   the last-layer hidden state of the first token ([CLS]), further processed by a Linear layer
        #   and a Tanh activation; those weights were trained on the next sentence prediction objective
        #   during pretraining.
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        if len(bert_output) == 1:
            # DistilBertModel (and some sentence transformers) return only the last hidden state
            last_hidden_state = bert_output[0]
            cls_output = self.pooler(last_hidden_state)
        else:
            # length should be two: last hidden state and pooler output
            last_hidden_state, cls_output = bert_output
        # output layer
        x = self.fc1(cls_output)
        x = self.dropout(x)
        if self.is_output_probability:
            # apply softmax activation
            x = self.softmax(x)
        return x


class MonoBertRelevanceClassifierTwoLayers(nn.Module):

    def __init__(self, bert_name, n_classes, freeze_bert=False, dropout=0.3, is_output_probability=True):
        super(MonoBertRelevanceClassifierTwoLayers, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_name)
        # pooler used when the backbone (e.g. DistilBertModel) does not return a pooled output
        self.pooler = BertPooler(self.bert.config)
        # dropout layer
        self.dropout = nn.Dropout(p=dropout)
        # relu activation function
        self.relu = nn.ReLU()
        # dense layer 1
        first_layer_neurons_count = int(self.bert.config.hidden_size)
        second_layer_neurons_count = int(first_layer_neurons_count / 3)
        self.fc1 = nn.Linear(first_layer_neurons_count, second_layer_neurons_count)
        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(second_layer_neurons_count, n_classes)
        # softmax activation function
        self.softmax = nn.Softmax(dim=1)
        # whether to apply softmax on the output or not
        self.is_output_probability = is_output_probability
        # freeze BERT layers
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

    def forward(self, input_ids, attention_mask):
        # Pass the inputs to the backbone; see MonoBertRelevanceClassifierOneLayer.forward
        # for the meaning of last_hidden_state and cls_output (pooler_output).
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        if len(bert_output) == 1:
            # DistilBertModel (and some sentence transformers) return only the last hidden state
            last_hidden_state = bert_output[0]
            cls_output = self.pooler(last_hidden_state)
        else:
            # length should be two: last hidden state and pooler output
            last_hidden_state, cls_output = bert_output
        # hidden layer
        x = self.fc1(cls_output)
        x = self.relu(x)
        x = self.dropout(x)
        # output layer
        x = self.fc2(x)
        if self.is_output_probability:
            # apply softmax activation
            x = self.softmax(x)
        return x


class BertPooler(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
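

# Minimal usage sketch (not part of the original module): it shows how the two-layer
# classifier might be instantiated and scored on one query/document pair, in the
# monoBERT style where both texts are encoded as a single sequence pair.
# The checkpoint name and the example texts below are placeholder assumptions.
if __name__ == "__main__":
    import torch
    from transformers import AutoTokenizer

    model_name = "bert-base-uncased"  # assumed checkpoint; swap in the checkpoint actually used
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = MonoBertRelevanceClassifierTwoLayers(model_name, n_classes=2, freeze_bert=True)
    model.eval()

    # query and candidate document encoded together as a sentence pair
    encoded = tokenizer(
        "what is relevance ranking?",
        "Relevance ranking orders documents by how well they match a query.",
        truncation=True, padding="max_length", max_length=128, return_tensors="pt",
    )

    with torch.no_grad():
        probs = model(input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"])

    # probs has shape (1, n_classes); softmax probabilities because is_output_probability=True
    print(probs)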