from my_tokenize import Database
from yeni_tokenize import TokenizerProcessor
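
# NOTE: my_tokenize and yeni_tokenize are project-local modules that are not
# shown in this file. From the way they are used below, Database is assumed to
# expose MongoDB-backed get_input_texts() / get_output_texts(), and
# TokenizerProcessor is assumed to wrap a pretrained tokenizer with encode()
# and pad_and_truncate_pairs(). A hypothetical sketch of both interfaces is
# included at the end of this file.
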
class DataPipeline:
    """Fetches text pairs from the database and tokenizes/encodes them."""

    def __init__(self, tokenizer_name='bert-base-uncased', max_length=512):
        self.tokenizer_processor = TokenizerProcessor(tokenizer_name)
        self.max_length = max_length

    def prepare_data(self):
        # Pull the input/output text pairs and encode them to a fixed length.
        input_texts = Database.get_input_texts()
        output_texts = Database.get_output_texts()
        return self.tokenizer_processor.pad_and_truncate_pairs(
            input_texts, output_texts, self.max_length
        )

    def tokenize_texts(self, texts):
        # Calls the wrapped tokenizer directly on each text.
        return [self.tokenizer_processor.tokenizer(text) for text in texts]

    def encode_texts(self, texts):
        # Encodes each text individually, padding/truncating to max_length.
        return [self.tokenizer_processor.encode(text, self.max_length) for text in texts]

# Initialize the pipeline with the chosen tokenizer
pipeline = DataPipeline(tokenizer_name='bert-base-cased', max_length=512)

# Fetch the input texts from MongoDB
input_texts = Database.get_input_texts()

# Tokenize the texts
tokenized_texts = pipeline.tokenize_texts(input_texts)
print("Tokenized Texts:")
for text, tokens in zip(input_texts, tokenized_texts):
    print(f"Original Text: {text}")
    print(f"Tokenized Text: {tokens}")

# Encode the texts
encoded_texts = pipeline.encode_texts(input_texts)
print("Encoded Texts:")
for text, encoded in zip(input_texts, encoded_texts):
    print(f"Original Text: {text}")
    print(f"Encoded Text: {encoded['input_ids'].squeeze().tolist()}")