from typing import Optional, Tuple

import torch
from torch import nn
from transformers import BertPreTrainedModel


class ParagramSPModel(BertPreTrainedModel):
    """Sentence encoder that averages word embeddings over an attention mask.

    The model holds a single embedding table. ``forward`` looks up one vector
    per input id and returns both the per-token vectors and their masked mean
    along dimension 1.
    """

    def __init__(self, config):
        """Create the embedding table described by ``config``.

        Args:
            config: Model configuration. ``vocab_size``, ``hidden_size`` and
                ``pad_token_id`` are read from it.
        """
        super().__init__(config)
        self.config = config
        self.word_embeddings = nn.Embedding(
            config.vocab_size,
            config.hidden_size,
            padding_idx=config.pad_token_id,
        )

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Embed ``input_ids`` and mean-pool the result over unmasked positions.

        Args:
            input_ids: Token ids; indexed as a 2-D ``(batch, seq_len)`` tensor.
            attention_mask: Per-position weights with the same two leading
                dimensions as ``input_ids``; positions with weight 0 do not
                contribute to the mean. When ``None``, every position is
                treated as attended.
            return_dict: Accepted for call-site compatibility but not used;
                a plain tuple is always returned.

        Returns:
            A 3-tuple ``(embeddings, mean_pooled_embeddings, embeddings)``.
            The first and third entries are the same per-token embedding
            tensor; the second is the masked mean over dimension 1. Rows
            whose mask sums to zero yield an all-zero pooled vector.
        """
        # NOTE(review): the duplicated first/third entry presumably mirrors a
        # (last_hidden_state, pooled_output, hidden_states) layout expected by
        # callers -- confirm before changing the tuple shape.
        embeddings = self.word_embeddings(input_ids)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # Add a trailing axis so the mask broadcasts across the hidden dimension.
        mask = attention_mask[:, :, None]
        summed_embeddings = (embeddings * mask).sum(dim=1)

        # Clamp the denominator so a fully masked row gives 0 / tiny == 0
        # instead of 0 / 0 == NaN. ``tiny`` is small enough that any row with
        # a non-zero mask sum is left untouched.
        pooled_dtype = summed_embeddings.dtype
        mask_totals = mask.sum(dim=1).to(pooled_dtype)
        mask_totals = mask_totals.clamp(min=torch.finfo(pooled_dtype).tiny)

        mean_pooled_embeddings = summed_embeddings / mask_totals
        return (embeddings, mean_pooled_embeddings, embeddings)