import torch
from torch import nn
from transformers import BertPreTrainedModel


class ParagramSPModel(BertPreTrainedModel):
    """Word-embedding sentence encoder that mean-pools token embeddings.

    The model owns a single ``nn.Embedding`` table. ``forward`` drops every
    non-positive token id, looks up the remaining ids and averages their
    embeddings per row.
    """

    def __init__(self, config):
        """Build the embedding table from ``config``.

        Args:
            config: Model configuration; ``vocab_size``, ``hidden_size`` and
                ``pad_token_id`` are read from it.
        """
        super().__init__(config)
        self.config = config
        self.word_embeddings = nn.Embedding(
            config.vocab_size,
            config.hidden_size,
            padding_idx=config.pad_token_id,
        )

        # Initialize weights and apply final processing
        self.post_init()

    def _compact_input_ids(self, input_ids):
        """Left-align the positive ids of each row and build the matching mask.

        Args:
            input_ids: 2-D integer tensor of token ids, one row per sequence.

        Returns:
            A ``(filtered_ids, mask)`` pair, both shaped like ``input_ids``.
            ``filtered_ids`` holds the kept ids followed by padding; ``mask``
            is a boolean tensor that is True exactly on the kept positions.
        """
        keep = input_ids > 0

        # A stable sort on the "dropped" flag moves kept ids to the front of
        # each row without changing their relative order.
        _, order = torch.sort((~keep).long(), dim=1, stable=True)
        compacted = input_ids.gather(1, order)

        kept_counts = keep.sum(dim=1, keepdim=True)
        positions = torch.arange(
            input_ids.shape[1], device=input_ids.device
        ).unsqueeze(0)
        mask = positions < kept_counts

        pad_id = self.config.pad_token_id
        if pad_id is None:
            # No pad id configured: fall back to 0, the id used for empty rows.
            pad_id = 0
        filtered = compacted.masked_fill(~mask, pad_id)

        # A row with nothing left still needs one id to look up; use id 0 in
        # its first slot. The mask stays all-False for such rows.
        if filtered.shape[1] > 0:
            empty_rows = kept_counts.squeeze(1) == 0
            filtered[empty_rows, 0] = 0

        return filtered, mask

    def filter_input_ids(self, input_ids):
        """Drop non-positive ids from each row and re-pad to the original width.

        Args:
            input_ids: 2-D integer tensor of token ids, one row per sequence.

        Returns:
            Tensor with the same shape, dtype and device as ``input_ids`` in
            which each row's positive ids are moved to the front (order
            preserved) and the remainder is filled with the configured
            ``pad_token_id``. Rows without any positive id start with id 0.
        """
        filtered, _ = self._compact_input_ids(input_ids)
        return filtered

    def forward(self, input_ids, attention_mask=None):
        """Embed the filtered ids and mean-pool them per row.

        Args:
            input_ids: 2-D integer tensor of token ids, one row per sequence.
            attention_mask: Accepted for interface compatibility but ignored;
                the mask is always derived from the filtered ids.

        Returns:
            A 3-tuple ``(embeddings, mean_pooled_embeddings, embeddings)``.
            ``embeddings`` are the per-token vectors of the filtered ids and
            ``mean_pooled_embeddings`` is their masked average per row.
            NOTE(review): the duplicated first/third element presumably
            mirrors a BERT-style output layout - confirm with callers.
        """
        input_ids, token_mask = self._compact_input_ids(input_ids)

        embeddings = self.word_embeddings(input_ids)
        weights = token_mask[:, :, None].to(embeddings.dtype)
        masked_embeddings = embeddings * weights

        # Clamp so a row with no kept tokens pools to zeros instead of NaN.
        token_counts = weights.sum(dim=1).clamp(min=1)
        mean_pooled_embeddings = masked_embeddings.sum(dim=1) / token_counts

        return (embeddings, mean_pooled_embeddings, embeddings)