# from transformers.models.led.modeling_led import LEDEncoder
from transformers import LEDConfig, LEDModel, LEDPreTrainedModel
from transformers.modeling_outputs import TokenClassifierOutput
import torch.nn as nn


class CustomLEDForResultsIdModel(LEDPreTrainedModel):
    def __init__(self, config: LEDConfig, checkpoint=None):
        super().__init__(config)
        self.num_labels = config.num_labels
        print("Configs")
        print(config.num_labels)
        print(config.dropout)

        # Load the model from the given checkpoint and keep only its encoder;
        # otherwise build a freshly initialized encoder from the config.
        if checkpoint:
            self.led = LEDModel.from_pretrained(checkpoint, config=config).get_encoder()
        else:
            self.led = LEDModel(config).get_encoder()
        # self.model = LEDEncoder.from_pretrained(checkpoint, config=config)

        # Token-classification head on top of the encoder hidden states.
        # The classifier uses PyTorch's default Linear initialization.
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(self.led.config.d_model, self.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        global_attention_mask=None,
        return_loss=True,
    ):
        # Run the encoder body and classify every token position.
        outputs = self.led(
            input_ids=input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
        )
        sequence_output = self.dropout(outputs.last_hidden_state)
        logits = self.classifier(sequence_output)

        # Leftover from a span-extraction (start/end) head, kept for reference:
        # start_logits, end_logits = logits.split(1, dim=-1)
        # start_logits = start_logits.squeeze(-1).contiguous()
        # end_logits = end_logits.squeeze(-1).contiguous()

        # Cross-entropy over all token positions when labels are provided.
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(loss=loss, logits=logits)
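

# --- Usage sketch (illustrative only) ---
# A minimal example of driving the model above, assuming the public
# "allenai/led-base-16384" checkpoint; the label count (3) and the example
# sentence are placeholders, not values from the original training setup.
if __name__ == "__main__":
    import torch
    from transformers import AutoTokenizer

    checkpoint = "allenai/led-base-16384"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    config = LEDConfig.from_pretrained(checkpoint, num_labels=3)

    model = CustomLEDForResultsIdModel(config, checkpoint=checkpoint)
    model.eval()

    encoded = tokenizer("The intervention reduced symptoms significantly.", return_tensors="pt")
    # LED expects global attention on at least one token; the first token is a common choice.
    global_attention_mask = torch.zeros_like(encoded["attention_mask"])
    global_attention_mask[:, 0] = 1

    with torch.no_grad():
        out = model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            global_attention_mask=global_attention_mask,
        )
    print(out.logits.shape)  # (batch_size, sequence_length, num_labels)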