{ "cells": [ { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "#birleştirilcek dosyaların listesi \n", "train_files=['C:\\\\gitProjects\\\\oak\\\\data\\\\train-00000-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00001-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00002-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00003-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00004-of-00007.parquet']\n", "test_files=['C:\\\\gitProjects\\\\oak\\\\data\\\\train-00005-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00006-of-00007.parquet']\n", "\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "ename": "ImportError", "evalue": "cannot import name 'Automodel' from 'transformers' (c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\__init__.py)", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[11], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Automodel \n", "\u001b[1;31mImportError\u001b[0m: cannot import name 'Automodel' from 'transformers' (c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\__init__.py)" ] } ], "source": [ "import datasets\n", "import transformers\n", "from datasets import Dataset\n", "from transformers import Automodel " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Package Version\n", "----------------- -----------\n", "asttokens 2.4.1\n", "colorama 0.4.6\n", "comm 0.2.2\n", "debugpy 1.8.2\n", "decorator 5.1.1\n", "executing 2.0.1\n", "ipykernel 6.29.5\n", "ipython 8.26.0\n", "jedi 0.19.1\n", "jupyter_client 8.6.2\n", "jupyter_core 5.7.2\n", "matplotlib-inline 0.1.7\n", "nest-asyncio 1.6.0\n", "packaging 24.1\n", "parso 0.8.4\n", "pip 24.2\n", "platformdirs 4.2.2\n", "prompt_toolkit 3.0.47\n", "psutil 6.0.0\n", "pure_eval 0.2.3\n", "Pygments 2.18.0\n", "python-dateutil 2.9.0.post0\n", "pywin32 306\n", "pyzmq 26.0.3\n", "setuptools 65.5.0\n", "six 1.16.0\n", "stack-data 0.6.3\n", "tornado 6.4.1\n", "traitlets 5.14.3\n", "typing_extensions 4.12.2\n", "wcwidth 0.2.13\n", "Collecting transformers\n", " Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)\n", "Collecting filelock (from transformers)\n", " Using cached filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)\n", "Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)\n", " Using cached huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)\n", "Collecting numpy>=1.17 (from transformers)\n", " Using cached numpy-2.0.1-cp311-cp311-win_amd64.whl.metadata (60 kB)\n", "Requirement already satisfied: packaging>=20.0 in c:\\gitprojects\\deneme\\.venv\\lib\\site-packages (from transformers) (24.1)\n", "Collecting pyyaml>=5.1 (from transformers)\n", " Using cached PyYAML-6.0.1-cp311-cp311-win_amd64.whl.metadata (2.1 kB)\n", "Collecting regex!=2019.12.17 (from transformers)\n", " Downloading regex-2024.7.24-cp311-cp311-win_amd64.whl.metadata (41 kB)\n", "Collecting requests (from transformers)\n", " Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)\n", "Collecting safetensors>=0.4.1 (from transformers)\n", " Downloading safetensors-0.4.3-cp311-none-win_amd64.whl.metadata (3.9 kB)\n", "Collecting tokenizers<0.20,>=0.19 (from transformers)\n", " Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl.metadata (6.9 kB)\n", "Collecting tqdm>=4.27 (from transformers)\n", " Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)\n", "Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.23.2->transformers)\n", " Using cached fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\gitprojects\\deneme\\.venv\\lib\\site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n", "Requirement already satisfied: colorama in c:\\gitprojects\\deneme\\.venv\\lib\\site-packages (from tqdm>=4.27->transformers) (0.4.6)\n", "Collecting charset-normalizer<4,>=2 (from requests->transformers)\n", " Using cached charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl.metadata (34 kB)\n", "Collecting idna<4,>=2.5 (from requests->transformers)\n", " Using cached idna-3.7-py3-none-any.whl.metadata (9.9 kB)\n", "Collecting urllib3<3,>=1.21.1 (from requests->transformers)\n", " Using cached urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)\n", "Collecting certifi>=2017.4.17 (from requests->transformers)\n", " Using cached certifi-2024.7.4-py3-none-any.whl.metadata (2.2 kB)\n", "Downloading transformers-4.43.3-py3-none-any.whl (9.4 MB)\n", " ---------------------------------------- 0.0/9.4 MB ? eta -:--:--\n", " ---------------------------------------- 0.0/9.4 MB ? eta -:--:--\n", " - -------------------------------------- 0.3/9.4 MB ? eta -:--:--\n", " -- ------------------------------------- 0.5/9.4 MB 932.9 kB/s eta 0:00:10\n", " -- ------------------------------------- 0.5/9.4 MB 932.9 kB/s eta 0:00:10\n", " --- ------------------------------------ 0.8/9.4 MB 838.9 kB/s eta 0:00:11\n", " ---- ----------------------------------- 1.0/9.4 MB 825.2 kB/s eta 0:00:11\n", " ---- ----------------------------------- 1.0/9.4 MB 825.2 kB/s eta 0:00:11\n", " ----- ---------------------------------- 1.3/9.4 MB 818.6 kB/s eta 0:00:10\n", " ------ --------------------------------- 1.6/9.4 MB 822.8 kB/s eta 0:00:10\n", " ------ --------------------------------- 1.6/9.4 MB 822.8 kB/s eta 0:00:10\n", " ------- -------------------------------- 1.8/9.4 MB 838.9 kB/s eta 0:00:10\n", " -------- ------------------------------- 2.1/9.4 MB 851.1 kB/s eta 0:00:09\n", " -------- ------------------------------- 2.1/9.4 MB 851.1 kB/s eta 0:00:09\n", " ---------- ----------------------------- 2.4/9.4 MB 860.5 kB/s eta 0:00:09\n", " ----------- ---------------------------- 2.6/9.4 MB 878.0 kB/s eta 0:00:08\n", " ------------ --------------------------- 2.9/9.4 MB 897.4 kB/s eta 0:00:08\n", " ------------- -------------------------- 3.1/9.4 MB 913.7 kB/s eta 0:00:07\n", " -------------- ------------------------- 3.4/9.4 MB 911.0 kB/s eta 0:00:07\n", " -------------- ------------------------- 3.4/9.4 MB 911.0 kB/s eta 0:00:07\n", " --------------- ------------------------ 3.7/9.4 MB 908.8 kB/s eta 0:00:07\n", " ---------------- ----------------------- 3.9/9.4 MB 910.4 kB/s eta 0:00:07\n", " ----------------- ---------------------- 4.2/9.4 MB 918.5 kB/s eta 0:00:06\n", " ----------------- ---------------------- 4.2/9.4 MB 918.5 kB/s eta 0:00:06\n", " ------------------ --------------------- 4.5/9.4 MB 916.2 kB/s eta 0:00:06\n", " -------------------- ------------------- 4.7/9.4 MB 926.1 kB/s eta 0:00:06\n", " --------------------- ------------------ 5.0/9.4 MB 935.1 kB/s eta 0:00:05\n", " ---------------------- ----------------- 5.2/9.4 MB 940.5 kB/s eta 0:00:05\n", " ----------------------- ---------------- 5.5/9.4 MB 950.7 kB/s eta 0:00:05\n", " ------------------------ --------------- 5.8/9.4 MB 957.4 kB/s eta 0:00:04\n", " ------------------------- -------------- 6.0/9.4 MB 966.3 kB/s eta 0:00:04\n", " -------------------------- ------------- 6.3/9.4 MB 974.5 kB/s eta 0:00:04\n", " --------------------------- ------------ 6.6/9.4 MB 984.6 kB/s eta 0:00:03\n", " ---------------------------- ----------- 6.8/9.4 MB 991.6 kB/s eta 0:00:03\n", " ------------------------------ --------- 7.1/9.4 MB 1.0 MB/s eta 0:00:03\n", " ------------------------------- -------- 7.3/9.4 MB 1.0 MB/s eta 0:00:03\n", " -------------------------------- ------- 7.6/9.4 MB 1.0 MB/s eta 0:00:02\n", " --------------------------------- ------ 7.9/9.4 MB 1.0 MB/s eta 0:00:02\n", " ---------------------------------- ----- 8.1/9.4 MB 1.0 MB/s eta 0:00:02\n", " ----------------------------------- ---- 8.4/9.4 MB 1.0 MB/s eta 0:00:01\n", " ------------------------------------ --- 8.7/9.4 MB 1.1 MB/s eta 0:00:01\n", " ------------------------------------- -- 8.9/9.4 MB 1.1 MB/s eta 0:00:01\n", " ---------------------------------------- 9.4/9.4 MB 1.1 MB/s eta 0:00:00\n", "Using cached huggingface_hub-0.24.5-py3-none-any.whl (417 kB)\n", "Using cached numpy-2.0.1-cp311-cp311-win_amd64.whl (16.6 MB)\n", "Using cached PyYAML-6.0.1-cp311-cp311-win_amd64.whl (144 kB)\n", "Downloading regex-2024.7.24-cp311-cp311-win_amd64.whl (269 kB)\n", "Downloading safetensors-0.4.3-cp311-none-win_amd64.whl (287 kB)\n", "Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl (2.2 MB)\n", " ---------------------------------------- 0.0/2.2 MB ? eta -:--:--\n", " ---------------------------------------- 0.0/2.2 MB ? eta -:--:--\n", " --------- ------------------------------ 0.5/2.2 MB 1.4 MB/s eta 0:00:02\n", " -------------- ------------------------- 0.8/2.2 MB 1.3 MB/s eta 0:00:02\n", " ------------------ --------------------- 1.0/2.2 MB 1.3 MB/s eta 0:00:01\n", " ----------------------- ---------------- 1.3/2.2 MB 1.4 MB/s eta 0:00:01\n", " ---------------------------- ----------- 1.6/2.2 MB 1.4 MB/s eta 0:00:01\n", " --------------------------------- ------ 1.8/2.2 MB 1.4 MB/s eta 0:00:01\n", " ---------------------------------------- 2.2/2.2 MB 1.4 MB/s eta 0:00:00\n", "Using cached tqdm-4.66.4-py3-none-any.whl (78 kB)\n", "Using cached filelock-3.15.4-py3-none-any.whl (16 kB)\n", "Using cached requests-2.32.3-py3-none-any.whl (64 kB)\n", "Using cached certifi-2024.7.4-py3-none-any.whl (162 kB)\n", "Using cached charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl (99 kB)\n", "Using cached fsspec-2024.6.1-py3-none-any.whl (177 kB)\n", "Using cached idna-3.7-py3-none-any.whl (66 kB)\n", "Using cached urllib3-2.2.2-py3-none-any.whl (121 kB)\n", "Installing collected packages: urllib3, tqdm, safetensors, regex, pyyaml, numpy, idna, fsspec, filelock, charset-normalizer, certifi, requests, huggingface-hub, tokenizers, transformers\n", "Successfully installed certifi-2024.7.4 charset-normalizer-3.3.2 filelock-3.15.4 fsspec-2024.6.1 huggingface-hub-0.24.5 idna-3.7 numpy-2.0.1 pyyaml-6.0.1 regex-2024.7.24 requests-2.32.3 safetensors-0.4.3 tokenizers-0.19.1 tqdm-4.66.4 transformers-4.43.3 urllib3-2.2.2\n" ] } ], "source": [ "!pip list dataset\n", "!pip install transformers" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#dosyaları yükleyin ve birleştirin\n", "train_dfs=[pd.read_parquet(file) for file in train_files]\n", "test_dfs=[pd.read_parquet(file) for file in test_files]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#parque dosyalarının birleştirilmesi\n", "train_df=pd.concat(train_dfs,ignore_index=True)\n", "test_df=pd.concat(test_dfs,ignore_index=True)\n", "\n", "print(train_df.head())\n", "print(train_df.head())\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#train ve test dosyaları oluşturma \n", "train_df.to_parquet('C:\\\\gitProjects\\\\train_Egitim\\\\merged_train.parquet')\n", "test_df.to_parquet('C:\\\\gitProjects\\\\test_Egitim\\\\merged_train.parquet')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#test ve train yollarını belirleme ve test, traindeki önemli sütunları alma\n", "train_file_path=('C:\\\\gitProjects\\\\train_Egitim\\\\merged_train.parquet')\n", "test_file_path=('C:\\\\gitProjects\\\\test_Egitim\\\\merged_train.parquet')\n", "\n", "train_df=pd.read_parquet(train_file_path,columns=['Prompt_ID','Prompt','Response','Category','Subcategory','Prompt_token_length'])\n", "test_df=pd.read_parquet(test_file_path,columns=['Prompt_ID','Prompt','Response','Category','Subcategory','Prompt_token_length'])\n", "\n", "print(train_df.head())\n", "print(test_df.head())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#verileri bart ile eğitme burada koleksiyon içerisindeki veriler tanımlanmalı \n", "# Load model directly\n", "from transformers import AutoModel,AutoTokenizer\n", "from transformers import (WEIGHTS_NAME, BertConfig,\n", " BertForQuestionAnswering, BertTokenizer)\n", "from torch.utils.data import DataLoader, SequentialSampler, TensorDataset\n", "\n", "#from utils import (get_answer, input_to_squad_example,squad_examples_to_features, to_list)\n", "import collections\n", "# Load model directly\n", "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"philschmid/bart-large-cnn-samsum\")\n", "model = AutoModelForSeq2SeqLM.from_pretrained(\"philschmid/bart-large-cnn-samsum\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pymongo import MongoClient\n", "import pandas as pd\n", "\n", "# MongoDB connection settings\n", "\n", "def get_mongodb(database_name='yeniDatabase', collection_name='train', host='localhost', port=27017):\n", " \"\"\"\n", " MongoDB connection and collection selection\n", " \"\"\"\n", " client = MongoClient(f'mongodb://{host}:{port}/')\n", " db = client[database_name]\n", " collection = db[collection_name]\n", " return collection\n", "\n", "# Function to load dataset into MongoDB\n", "def dataset_read():\n", " train_file_path = ('C:\\\\gitProjects\\\\train_Egitim\\\\merged_train.parquet')\n", " data = pd.read_parquet(train_file_path, columns=['Prompt_ID', 'Prompt', 'Response', 'Category', 'Subcategory', 'Prompt_token_length'])\n", " data_dict = data.to_dict(\"records\")\n", "\n", " # Get the MongoDB collection\n", " source_collection = get_mongodb(database_name='yeniDatabase', collection_name='train') # Collection for translation\n", "\n", " # Insert data into MongoDB\n", " source_collection.insert_many(data_dict)\n", "\n", " print(\"Data successfully loaded into MongoDB.\")\n", " return source_collection\n", "\n", "# Call the function to load the dataset into MongoDB\n", "source_collection = dataset_read()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test ve train verilerini mongodb ye yükleme" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_mongodb(database_name='yeniDatabase', collection_name='test', mongo_url='mongodb://localhost:27017/'):\n", " \"\"\"\n", " MongoDB connection and collection selection\n", " \"\"\"\n", " client = MongoClient(mongo_url)\n", " db = client[database_name]\n", " collection = db[collection_name]\n", " return collection\n", "\n", "# Function to load dataset into MongoDB\n", "def dataset_read():\n", " train_file_path = ('C:\\\\gitProjects\\\\test_Egitim\\\\merged_train.parquet')\n", " data = pd.read_parquet(train_file_path, columns=['Prompt_ID', 'Prompt', 'Response', 'Category', 'Subcategory', 'Prompt_token_length'])\n", " data_dict = data.to_dict(\"records\")\n", "\n", " # Get the MongoDB collection\n", " source_collection = get_mongodb(database_name='yeniDatabase', collection_name='test') # Collection for translation\n", "\n", " # Insert data into MongoDB\n", " source_collection.insert_many(data_dict)\n", "\n", " print(\"Data successfully loaded into MongoDB.\")\n", " return source_collection\n", "\n", "# Call the function to load the dataset into MongoDB\n", "source_collection = dataset_read()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Model eğitimi \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# uygulama için kullanılcak olan özelliklerin tanımlanması\n", "from transformers import BertTokenizer,BertForQuestionAnswering,BertConfig\n", "class QA:\n", " def __init__(self,model_path: str):\n", " self.max_seq_length = 384 #max seq\n", " self.doc_stride = 128 #stride \n", " self.do_lower_case = False\n", " self.max_query_length = 30\n", " self.n_best_size = 3\n", " self.max_answer_length = 30\n", " self.version_2_with_negative = False\n", " #modelin yüklenmesi\n", " self.model, self.tokenizer = self.load_model(model_path)\n", " #hangi işlmecinin kullanıldığının belirlenmesi\n", " if torch.cuda.is_available():\n", " self.device = 'cuda'\n", " else:\n", " self.device = 'cpu'\n", " self.model.to(self.device)\n", " self.model.eval()\n", " \n", " # This function is used to load the model\n", " def load_model(self,model_path: str,do_lower_case=False):\n", " config = BertConfig.from_pretrained(model_path + \"C:\\\\gitProjects\\\\train_Egitim\")\n", " tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=do_lower_case)\n", " model = BertForQuestionAnswering.from_pretrained(model_path, from_tf=False, config=config)\n", " return model, tokenizer\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pymongo import MongoClient\n", "\n", "def get_mongodb():\n", " # MongoDB bağlantı bilgilerini döndürecek şekilde tanımlanmalıdır.\n", " return 'mongodb://localhost:27017/', 'yeniDatabase', 'test'\n", "\n", "def get_average_prompt_token_length():\n", " # MongoDB bağlantı bilgilerini alma\n", " mongo_url, db_name, collection_name = get_mongodb()\n", "\n", " # MongoDB'ye bağlanma\n", " client = MongoClient(mongo_url)\n", " db = client[db_name]\n", " collection = db[collection_name]\n", "\n", " # Tüm dökümanları çekme ve 'prompt_token_length' alanını alma\n", " docs = collection.find({}, {'Prompt_token_length': 1})\n", "\n", " # 'prompt_token_length' değerlerini toplama ve sayma\n", " total_length = 0\n", " count = 0\n", "\n", " for doc in docs:\n", " if 'Prompt_token_length' in doc:\n", " total_length += doc['Prompt_token_length']\n", " count += 1\n", " \n", " # Ortalama hesaplama\n", " if count > 0:\n", " average_length = total_length / count\n", " else:\n", " average_length = 0 # Eğer 'prompt_token_length' alanı olan döküman yoksa\n", "\n", " return int(average_length)\n", "\n", "# Ortalama prompt token uzunluğunu al ve yazdır\n", "average_length = get_average_prompt_token_length()\n", "print(f\"Ortalama prompt token uzunluğu: {average_length}\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pymongo import MongoClient\n", "from transformers import BertTokenizer\n", "\n", "#getmongodb oluştumak yerine içeriği değiştirilmeli \n", "def get_mongodb():\n", " # MongoDB bağlantı bilgilerini döndürecek şekilde tanımlanmalıdır.\n", " return 'mongodb://localhost:27017/', 'yeniDatabase', 'train'\n", "\n", "def get_input_texts():\n", " # MongoDB bağlantı bilgilerini alma\n", " mongo_url, db_name, collection_name = get_mongodb()\n", "\n", " # MongoDB'ye bağlanma\n", " client = MongoClient(mongo_url)\n", " db = client[db_name]\n", " collection = db[collection_name]\n", " \n", " #input texleri mongodb üzerinde 'Prompt' lara denk gelir.\n", "\n", " # Sorguyu tanımlama\n", " query = {\"Prompt\": {\"$exists\": True}}\n", "\n", " # Sorguyu çalıştırma ve dökümanları çekme\n", " cursor = collection.find(query, {\"Prompt\": 1, \"_id\": 0}) # 'input_text' alanını almak için \"_id\": 0 ekleyin\n", "\n", " # Cursor'ı döküman listesine dönüştürme\n", " input_texts_from_db = list(cursor)\n", "\n", " # Input text'leri döndürme\n", " return input_texts_from_db\n", "\n", "input_texts_from_db= get_input_texts()\n", "# Input text'leri al ve yazdır\n", "\n", "#tokenizer ı yükle\n", "tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')\n", " \n", "#encode etmek için gerekli olan bilgiler \n", "input_texts=[doc[\"Prompt\"] for doc in input_texts_from_db ]\n", "\n", "#encoding işleminde inputlar \n", "\n", "# Tokenize the input texts\n", "encoded_inputs = tokenizer.batch_encode_plus(\n", " input_texts,\n", " padding=True,\n", " truncation=True,\n", " max_length=100,\n", " return_attention_mask=True,\n", " return_tensors='pt'\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"encoded_inputs:{encoded_inputs}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "#maskeleme yönetmiyle eğitim\n", "# Define the number of epochs and learning rate\n", "num_epochs = 3\n", "learning_rate = 1e-4\n", "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n", "\n", "#Iterate over the epochs\n", "for epoch in range(num_epochs):\n", " total_loss = 0\n", " for input_ids, attention_mask, labels in encoded_inputs:\n", " #reset gradients\n", " optimizer.zero_grad()\n", " #forward pass \n", " outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)\n", " loss = outputs.loss\n", " #backward pass \n", " loss.backward()\n", " #update optimizer \n", " optimizer.step()\n", " #accumulate total loss\n", " total_loss += loss.item()\n", " #calculate average loss\n", " average_loss = total_loss / len(encoded_inputs)\n", " #print the loss for current epoch\n", " print(f\"Epoch {epoch+1} - Loss: {average_loss:.4f}\")\n", "\n", " #tüm bu verileri tutan bir \"batch_of_attention_masks\" verisini tanımlamam gerek" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import DataLoader,TensorDataset\n", "import torch\n", "from transformers import BertTokenizer\n", "\n", "#hdef değerlerle karşılaştırma yapabilmek için ve doğruluğu ölçmek için\n", "\n", "# Assuming you have tokenized input texts and labels\n", "#attetion mask bert dilinde modelin sadece gerçek tokenler üzerinde çalışmasını sağlar.\n", "input_ids = encoded_inputs['input_ids'] # Replace with your tokenized input texts\n", "attention_masks = encoded_inputs['attention_mask']\n", "\n", "\n", "labels = torch.tensor([1]*len(input_ids))\n", "\n", "# Create a TensorDataset\n", "dataset = TensorDataset(input_ids, attention_masks, labels)\n", "\n", "batch_size=10000\n", "# Create a data loader\n", "data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)\n", "\n", "for batch in data_loader:\n", " input_ids,attention_masks,labels\n", " print(f\"ınput ıds :{input_texts}\")\n", " print(f\"attetion masks: {attention_masks}\")\n", " print(f\"labels:{labels}\")\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ " # This function performs the prediction and return the reponse to the flask app\n", " # This function performs the prediction and return the reponse to the flask app\n", "RawResult = collection.namedtuple(\"RawResult\",[\"unique_id\", \"start_logits\", \"end_logits\"])\n", "\n", "def predict(self,passage :str,question :str): \n", " example = input_to_squad_example(passage,question) \n", " features = squad_examples_to_features(example,self.tokenizer,self.max_seq_length,self.doc_stride,self.max_query_length) \n", " all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n", " all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n", " all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)\n", " all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n", " dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,\n", " all_example_index)\n", " eval_sampler = SequentialSampler(dataset)\n", " eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=1)\n", " \n", " all_results = []\n", " for batch in eval_dataloader:\n", " batch = tuple(t.to(self.device) for t in batch)\n", " with torch.no_grad():\n", " inputs = {'input_ids': batch[0],\n", " 'attention_mask': batch[1],\n", " 'token_type_ids': batch[2] \n", " } \n", " example_indices = batch[3] \n", " outputs = self.model(**inputs)\n", " \n", " for i, example_index in enumerate(example_indices):\n", " eval_feature = features[example_index.item()]\n", " unique_id = int(eval_feature.unique_id)\n", " result = RawResult(unique_id = unique_id,\n", " start_logits = to_list(outputs[0][i]),\n", " end_logits = to_list(outputs[1][i]))\n", " all_results.append(result)\n", " \n", " answer = get_answer(example,features,all_results,self.n_best_size,self.max_answer_length,self.do_lower_case)\n", " \n", " return answer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.batch_encode_plus()\n", "torch.utils.data.DataLoader\n", "input_ids = torch.tensor(batch_of_tokenized_input_texts)\n", "attention_mask = torch.tensor(batch_of_attention_masks)\n", "labels = torch.tensor(batch_of_labels)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.save_pretrained(output_model_path)\n", "tokenizer.save_pretrained(output_model_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from app import train_model_route\n", "\n", "#ön yüzle ilişkilendirme\n", "\n", "train_model_route\n", "\n", "#title category ile ilişkilendirlecek\n", "\n", "\n", "#subheadingler subcategroy ile ilişkilendirieck\n", "\n", "#prompt token uzunlukları kontrol edilerek bütün tokenlerin aynı uzunlukta olması sağlanmalıdır.\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n" ] } ], "metadata": { "kernelspec": { "display_name": "myenv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }