from datasets import load_dataset
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForQuestionAnswering, BertConfig, AutoModelForCausalLM
from pymongo import MongoClient
import torchtext
torchtext.disable_torchtext_deprecation_warning()
from torchtext.data import get_tokenizer
from yeni_tokenize import TokenizerProcessor


class Database:

    # MongoDB connection settings

    @staticmethod
    def get_collection(database_name='yeniDatabase', collection_name='test', host='localhost', port=27017):
        """
        Connect to MongoDB and return the selected collection.
        """
        client = MongoClient(f'mongodb://{host}:{port}/')
        db = client[database_name]
        collection = db[collection_name]
        return collection

    @staticmethod
    def get_mongodb():
        # Returns the MongoDB connection settings (URL, database name, collection name).
        return 'mongodb://localhost:27017/', 'yeniDatabase', 'train'

    @staticmethod
    def get_input_texts():
        # Get the MongoDB connection settings
        mongo_url, db_name, collection_name = Database.get_mongodb()
        # Connect to MongoDB
        client = MongoClient(mongo_url)
        db = client[db_name]
        collection = db[collection_name]
        # Define the query: only documents that have a "Prompt" field
        query = {"Prompt": {"$exists": True}}
        # Run the query, projecting only the "Prompt" field
        cursor = collection.find(query, {"Prompt": 1, "_id": 0})
        # Convert the cursor into a plain list of prompt strings
        input_texts_from_db = [doc['Prompt'] for doc in cursor]
        return input_texts_from_db

    # Debug helper: print every prompt fetched from the database
    # (run it after the class is fully defined, e.g. from a __main__ block).
    # input_text = Database.get_input_texts()
    # print("printing the texts:")
    # for text in input_text:
    #     print(text)

    
    @staticmethod
    def get_output_texts():
        # Get the MongoDB connection settings
        mongo_url, db_name, collection_name = Database.get_mongodb()
        # Connect to MongoDB
        client = MongoClient(mongo_url)
        db = client[db_name]
        collection = db[collection_name]
        # Define the query: only documents that have a "Response" field
        query = {"Response": {"$exists": True}}
        # Run the query, projecting only the "Response" field
        cursor = collection.find(query, {"Response": 1, "_id": 0})
        # Convert the cursor into a plain list of response strings
        output_texts_from_db = [doc['Response'] for doc in cursor]
        return output_texts_from_db

    @staticmethod
    def get_average_prompt_token_length():
        # Get the MongoDB connection settings
        mongo_url, db_name, collection_name = Database.get_mongodb()
        # Connect to MongoDB
        client = MongoClient(mongo_url)
        db = client[db_name]
        collection = db[collection_name]
        # Fetch all documents, projecting only the 'Prompt_token_length' field
        docs = collection.find({}, {'Prompt_token_length': 1})
        # Sum and count the 'Prompt_token_length' values
        total_length = 0
        count = 0
        for doc in docs:
            if 'Prompt_token_length' in doc:
                total_length += doc['Prompt_token_length']
                count += 1
        # Compute the average (0 if no document carries the field)
        average_length = total_length / count if count > 0 else 0
        return int(average_length)
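
# A minimal usage sketch of the Database helpers above, assuming a local MongoDB
# instance whose 'yeniDatabase.train' collection holds 'Prompt', 'Response' and
# 'Prompt_token_length' fields; the function name print_dataset_summary is
# hypothetical and only illustrates how the helpers fit together.
def print_dataset_summary():
    prompts = Database.get_input_texts()
    responses = Database.get_output_texts()
    avg_prompt_len = Database.get_average_prompt_token_length()
    print(f"Fetched {len(prompts)} prompts and {len(responses)} responses.")
    print(f"Average prompt token length: {avg_prompt_len} tokens.")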


# Load the tokenizer and the model
"""

class TokenizerProcessor:

    def __init__(self, tokenizer_name='bert-base-uncased'):

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)



    def tokenize_and_encode(self, input_texts, output_texts, max_length=100):

        encoded = self.tokenizer.batch_encode_plus(

            text_pair=list(zip(input_texts, output_texts)),

            padding='max_length',

            truncation=True,

            max_length=max_length,

            return_attention_mask=True,

            return_tensors='pt'

        )

        return encoded



    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")

not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")



paraphrase_classification_logits = model(**paraphrase)[0]

not_paraphrase_classification_logits = model(**not_paraphrase)[0]

    def custom_padding(self, input_ids_list, max_length=100, pad_token_id=0):

        padded_inputs = []

        for ids in input_ids_list:

            if len(ids) < max_length:

                padded_ids = ids + [pad_token_id] * (max_length - len(ids))

            else:

                padded_ids = ids[:max_length]

            padded_inputs.append(padded_ids)

        return padded_inputs



    def pad_and_truncate_pairs(self, input_texts, output_texts, max_length=100):



        #input ve output verilerinin uzunluğunu eşitleme 

        inputs = self.tokenizer(input_texts, padding=False, truncation=False, return_tensors=None)

        outputs = self.tokenizer(output_texts, padding=False, truncation=False, return_tensors=None)

        

        input_ids = self.custom_padding(inputs['input_ids'], max_length, self.tokenizer.pad_token_id)

        output_ids = self.custom_padding(outputs['input_ids'], max_length, self.tokenizer.pad_token_id)

        

        input_ids_tensor = torch.tensor(input_ids)

        output_ids_tensor = torch.tensor(output_ids)

        

        input_attention_mask = (input_ids_tensor != self.tokenizer.pad_token_id).long()

        output_attention_mask = (output_ids_tensor != self.tokenizer.pad_token_id).long()

        

        return {

            'input_ids': input_ids_tensor,

            'input_attention_mask': input_attention_mask,

            'output_ids': output_ids_tensor,

            'output_attention_mask': output_attention_mask

        }

    

"""
    # I need to pull the sentences one by one from the input and output data (see the sketch below)
    #def tokenize_and_pad_sequences(sequence_1,sequence2,)
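
# A minimal sketch of the pair-by-pair tokenization the comment above asks for,
# assuming a standard BertTokenizer; the function name tokenize_and_pad_pair is
# hypothetical and not part of the original code.
def tokenize_and_pad_pair(tokenizer, input_text, output_text, max_length=100):
    # encode_plus handles the [CLS]/[SEP] layout, padding and truncation for a
    # single (input, output) sentence pair and returns PyTorch tensors.
    return tokenizer.encode_plus(
        input_text,
        output_text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
        return_tensors='pt',
    )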


"""class DataPipeline:

    def __init__(self, tokenizer_name='bert-base-uncased', max_length=100):

        self.tokenizer_processor = TokenizerProcessor(tokenizer_name)

        self.max_length = max_length



    def prepare_data(self):

        input_texts = Database.get_input_texts()

        output_texts = Database.get_output_texts()

        encoded_data = self.tokenizer_processor.pad_and_truncate_pairs(input_texts, output_texts, self.max_length)

        return encoded_data

    

    def tokenize_texts(self, texts):

        return [self.tokenize(text) for text in texts]

    

    def encode_texts(self, texts):

        return [self.encode(text, self.max_length) for text in texts]



# Example Usage

if __name__ == "__main__":

    data_pipeline = DataPipeline()

    encoded_data = data_pipeline.prepare_data()

    print(encoded_data)

"""