yuchenxie committed on
Commit
e4c7225
·
verified ·
1 Parent(s): 5ab02e1

Create preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +11 -0
preprocessing.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import GPT2Tokenizer
2
+
3
class ArlowGPTPreprocessor:
    """
    Data preprocessor for the ArlowGPT model.

    Wraps a tokenizer and exposes a single helper that calls it with
    ``return_tensors="pt"`` and ``padding=True``.
    """

    def __init__(self, tokenizer: "GPT2Tokenizer"):
        """
        Store the tokenizer used by ``preprocess_text``.

        Args:
            tokenizer: Tokenizer that is called on the raw text.

        Note:
            ``preprocess_text`` always requests padding, and GPT-2
            tokenizers ship without a padding token, which makes the
            tokenizer raise ``ValueError`` as soon as padding is requested.
            When the tokenizer has no pad token but does have an EOS token,
            the EOS token is reused for padding. This sets ``pad_token`` on
            the tokenizer object that was passed in; a pad token that is
            already configured is left untouched.
        """
        self.tokenizer = tokenizer
        if (
            getattr(tokenizer, "pad_token", None) is None
            and getattr(tokenizer, "eos_token", None) is not None
        ):
            tokenizer.pad_token = tokenizer.eos_token

    def preprocess_text(self, text: str):
        """
        Tokenize ``text`` with the stored tokenizer.

        Args:
            text: Raw input text handed to the tokenizer unchanged.

        Returns:
            Whatever the tokenizer returns when called with
            ``return_tensors="pt"`` and ``padding=True``.
        """
        return self.tokenizer(text, return_tensors="pt", padding=True)