from typing import List, Optional

import nltk
from nltk.corpus import stopwords
from transformers import PretrainedConfig

# Fetch the NLTK resources when this module is imported. 'stopwords' backs the
# default stop-word list below; 'punkt' is not referenced in this file and is
# presumably needed by tokenization code elsewhere -- TODO confirm.
nltk.download('stopwords')
nltk.download('punkt')


class GZIPEmbeddingConfig(PretrainedConfig):
    """Configuration object for the ``gzipembed`` model type.

    The constructor only records the options below as attributes and then
    forwards any remaining keyword arguments to ``PretrainedConfig``. How each
    option is interpreted is decided by the model code, which is not part of
    this file; the descriptions are therefore inferred from the option names
    and should be confirmed against the model implementation.

    Args:
        normalize: Flag stored as ``self.normalize`` (presumably toggles
            normalization of the produced embeddings).
        normalized_corpus: Flag stored as ``self.normalized_corpus``
            (presumably states whether the corpus is already normalized).
        reduction: Flag stored as ``self.reduction`` (presumably enables
            dimensionality reduction).
        reduced_dimension: Value stored as ``self.reduced_dimension``
            (presumably the target size when ``reduction`` is enabled).
        remove_stop_words: Flag stored as ``self.remove_stop_words``.
        stop_words: Stop-word list. When omitted (``None``), NLTK's English
            stop-word list is loaded fresh for this instance.
        corpus: Corpus entries. When omitted (``None``), a new empty list is
            created for this instance.
        **kwargs: Passed through unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "gzipembed"

    def __init__(
        self,
        normalize=True,
        normalized_corpus=True,
        reduction=False,
        reduced_dimension=0,
        remove_stop_words=True,
        stop_words: Optional[List[str]] = None,
        corpus: Optional[list] = None,
        **kwargs,
    ):
        # Build the defaults per instance: a default evaluated in the signature
        # is a single object shared (and mutated) across every config created
        # without an explicit value.
        self.corpus = [] if corpus is None else corpus
        self.normalize = normalize
        self.normalized_corpus = normalized_corpus
        self.reduction = reduction
        # No trailing comma here: the original stored a 1-tuple such as (0,)
        # instead of the value that was passed in.
        self.reduced_dimension = reduced_dimension
        self.remove_stop_words = remove_stop_words
        self.stop_words = (
            stopwords.words('english') if stop_words is None else stop_words
        )
        super().__init__(**kwargs)