from my_tokenize import Database
from yeni_tokenize import TokenizerProcessor


class DataPipeline:
    def __init__(self, tokenizer_name='bert-base-uncased', max_length=512):
        self.tokenizer_processor = TokenizerProcessor(tokenizer_name)
        self.max_length = max_length

    def prepare_data(self):
        # Fetch input/output text pairs from the database and encode them together.
        input_texts = Database.get_input_texts()
        output_texts = Database.get_output_texts()
        encoded_data = self.tokenizer_processor.pad_and_truncate_pairs(
            input_texts, output_texts, self.max_length
        )
        return encoded_data

    def tokenize_texts(self, texts):
        # Run the underlying tokenizer on each text individually.
        return [self.tokenizer_processor.tokenizer(text) for text in texts]

    def encode_texts(self, texts):
        # Encode each text, padded/truncated to max_length.
        return [self.tokenizer_processor.encode(text, self.max_length) for text in texts]


# Initialize the tokenizer pipeline
pipeline = DataPipeline(tokenizer_name='bert-base-cased', max_length=512)

# Fetch the input texts from MongoDB
input_texts = Database.get_input_texts()

# Tokenize the texts
tokenized_texts = pipeline.tokenize_texts(input_texts)
print("Tokenized Texts:")
for text, tokens in zip(input_texts, tokenized_texts):
    print(f"Original Text: {text}")
    print(f"Tokenized Text: {tokens}")

# Encode the texts
encoded_texts = pipeline.encode_texts(input_texts)
print("Encoded Texts:")
for text, encoded in zip(input_texts, encoded_texts):
    print(f"Original Text: {text}")
    print(f"Encoded Text: {encoded['input_ids'].squeeze().tolist()}")
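
# ---------------------------------------------------------------------------
# The two local modules imported above (my_tokenize, yeni_tokenize) are not
# shown in this snippet. The sketches below are assumptions inferred only from
# how they are called: TokenizerProcessor is assumed to wrap a Hugging Face
# AutoTokenizer, and Database is assumed to read text pairs from MongoDB via
# pymongo. The connection string, the 'texts' collection, and the 'input' /
# 'output' field names are hypothetical, not the project's actual schema.
# ---------------------------------------------------------------------------
from pymongo import MongoClient
from transformers import AutoTokenizer


class TokenizerProcessor:
    """Hypothetical sketch of yeni_tokenize.TokenizerProcessor."""

    def __init__(self, tokenizer_name='bert-base-uncased'):
        # Exposed as .tokenizer so DataPipeline.tokenize_texts can call it directly.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def encode(self, text, max_length):
        # Single-text encoding padded/truncated to max_length; return_tensors='pt'
        # is what makes encoded['input_ids'].squeeze().tolist() work downstream.
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def pad_and_truncate_pairs(self, input_texts, output_texts, max_length):
        # Encode source and target texts separately, then attach the target
        # input_ids as labels for sequence-to-sequence style training.
        inputs = self.tokenizer(
            input_texts,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        targets = self.tokenizer(
            output_texts,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        inputs['labels'] = targets['input_ids']
        return inputs


class Database:
    """Hypothetical sketch of my_tokenize.Database reading pairs from MongoDB."""

    _collection = MongoClient('mongodb://localhost:27017')['mydb']['texts']

    @classmethod
    def get_input_texts(cls):
        return [doc['input'] for doc in cls._collection.find({}, {'input': 1})]

    @classmethod
    def get_output_texts(cls):
        return [doc['output'] for doc in cls._collection.find({}, {'output': 1})]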