First version of the your-model-name model and tokenizer.

Browse files

Files changed (9) hide show

__init__.py +0 -0
__pycache__/preprocess.cpython-37.pyc +0 -0
main.py +60 -0
preprocess.py +96 -0
test-squad-trained/config.json +23 -0
test-squad-trained/pytorch_model.bin +3 -0
test-squad-trained/special_tokens_map.json +1 -0
test-squad-trained/tokenizer_config.json +1 -0
test-squad-trained/vocab.txt +0 -0

__init__.py ADDED Viewed

File without changes

__pycache__/preprocess.cpython-37.pyc ADDED Viewed

Binary file (3.56 kB). View file

main.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from preprocess import Model, SquadDataset
+from transformers import DistilBertForQuestionAnswering
+from torch.utils.data import DataLoader
+from transformers import AdamW
+import torch
+import subprocess
+# Fine-tune DistilBERT for extractive question answering on records pulled
+# through preprocess.Model, save the result locally, then publish it with git.
+data = Model()
+# NOTE(review): the training and validation splits are both read from the same
+# container ("livecheckcontainer") -- confirm this is intended.
+train_contexts, train_questions, train_answers = data.ArrangeData("livecheckcontainer")
+val_contexts, val_questions, val_answers = data.ArrangeData("livecheckcontainer")
+print(train_answers)
+# Attach an 'answer_end' character offset to every answer.
+train_answers, train_contexts = data.add_end_idx(train_answers, train_contexts)
+val_answers, val_contexts = data.add_end_idx(val_answers, val_contexts)
+# Tokenize (context, question) pairs; this also populates data.tokenizer.
+train_encodings, val_encodings = data.Tokenizer(train_contexts, train_questions, val_contexts, val_questions)
+# Convert character offsets into token indices (start_positions / end_positions).
+train_encodings  = data.add_token_positions(train_encodings, train_answers)
+val_encodings = data.add_token_positions(val_encodings, val_answers)
+train_dataset = SquadDataset(train_encodings)
+# NOTE(review): val_dataset is built but never used below (no evaluation loop).
+val_dataset = SquadDataset(val_encodings)
+model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+# Use the GPU when one is available, otherwise fall back to the CPU.
+device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+model.to(device)
+model.train()
+train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
+optim = AdamW(model.parameters(), lr=5e-5)
+# Two passes over the training data.
+for epoch in range(2):
+    print(epoch)
+    for batch in train_loader:
+        optim.zero_grad()
+        input_ids = batch['input_ids'].to(device)
+        attention_mask = batch['attention_mask'].to(device)
+        start_positions = batch['start_positions'].to(device)
+        end_positions = batch['end_positions'].to(device)
+        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
+        # The first element of the output is used as the training loss.
+        loss = outputs[0]
+        loss.backward()
+        optim.step()
+print("Done")
+model.eval()
+# Write the fine-tuned weights and the tokenizer files into the same directory.
+model.save_pretrained("test-squad-trained")
+data.tokenizer.save_pretrained("test-squad-trained")
+# Stage, commit and push everything in the working tree.
+# NOTE(review): subprocess.call return codes are ignored, so a failed commit
+# is still followed by a push attempt.
+subprocess.call(["git", "add","--all"])
+subprocess.call(["git", "status"])
+subprocess.call(["git", "commit", "-m", "First version of the your-model-name model and tokenizer."])
+subprocess.call(["git", "push"])

preprocess.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import json
+from os import close
+from pathlib import Path
+from azure.cosmos import CosmosClient, PartitionKey, exceptions
+from transformers import DistilBertTokenizerFast
+import torch
+class Model:
+    def __init__(self) -> None:
+        self.endPoint = "https://productdevelopmentstorage.documents.azure.com:443/"
+        self.primaryKey = "nVds9dPOkPuKu8RyWqigA1DIah4SVZtl1DIM0zDuRKd95an04QC0qv9TQIgrdtgluZo7Z0HXACFQgKgOQEAx1g=="
+        self.client = CosmosClient(self.endPoint, self.primaryKey)
+        self.tokenizer = None
+    def GetData(self, type):
+        database = self.client.get_database_client("squadstorage")
+        container = database.get_container_client(type)
+        item_list = list(container.read_all_items(max_item_count=10))
+        return item_list
+    def ArrangeData(self, type):
+        squad_dict = self.GetData(type)
+        contexts = []
+        questions = []
+        answers = []
+        for i in squad_dict:
+            contexts.append(i["context"])
+            questions.append(i["question"])
+            answers.append(i["answers"])
+        return contexts, questions, answers
+    def add_end_idx(self, answers, contexts):
+        for answer, context in zip(answers, contexts):
+            gold_text = answer['text'][0]
+            start_idx = answer['answer_start'][0]
+            end_idx = start_idx + len(gold_text)
+            if context[start_idx:end_idx] == gold_text:
+                answer['answer_end'] = end_idx
+            elif context[start_idx-1:end_idx-1] == gold_text:
+                answer['answer_start'] = start_idx - 1
+                answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
+            elif context[start_idx-2:end_idx-2] == gold_text:
+                answer['answer_start'] = start_idx - 2
+                answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters
+        return answers, contexts
+    def Tokenizer(self, train_contexts, train_questions, val_contexts, val_questions):
+        self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
+        train_encodings = self.tokenizer(train_contexts, train_questions, truncation=True, padding=True)
+        val_encodings = self.tokenizer(val_contexts, val_questions, truncation=True, padding=True)
+        return train_encodings, val_encodings
+    def add_token_positions(self, encodings, answers):
+        start_positions = []
+        end_positions = []
+        for i in range(len(answers)):
+            start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'][0]))
+            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
+            # if start position is None, the answer passage has been truncated
+            if start_positions[-1] is None:
+                start_positions[-1] = self.tokenizer.model_max_length
+            if end_positions[-1] is None:
+                end_positions[-1] = self.tokenizer.model_max_length
+        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
+        return encodings
+    # train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
+    # val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')
+class SquadDataset(torch.utils.data.Dataset):
+    def __init__(self, encodings):
+        self.encodings = encodings
+    def __getitem__(self, idx):
+        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+    def __len__(self):
+        return len(self.encodings.input_ids)

test-squad-trained/config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "_name_or_path": "distilbert-base-uncased",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForQuestionAnswering"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "transformers_version": "4.3.2",
+  "vocab_size": 30522
+}

test-squad-trained/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:423cba4a34bfc72ad38bc33a07f81fd45f433c8e8f15383b8b35c95be8a1b26e
+size 265498527

test-squad-trained/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}

test-squad-trained/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "name_or_path": "distilbert-base-uncased"}

test-squad-trained/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff