alekeik1 committed
Commit 9e4713f (1 parent: 527772c)

feat(main): init train and dvc

.dvc/.gitignore ADDED
@@ -0,0 +1,3 @@
+ /config.local
+ /tmp
+ /cache
.dvc/config ADDED
@@ -0,0 +1,9 @@
+ [core]
+     remote = ya-s3
+     autostage = true
+ ['remote "ya-s3"']
+     url = s3://shad-ml-2-hw-5/dvc
+     endpointurl = https://storage.yandexcloud.net
+
+ [cache]
+     type = reflink,hardlink,symlink,copy
.dvcignore ADDED
@@ -0,0 +1,3 @@
+ # Add patterns of files dvc should ignore, which could improve
+ # the performance. Learn more at
+ # https://dvc.org/doc/user-guide/dvcignore
.idea/jsonSchemas.xml ADDED
@@ -0,0 +1,25 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="JsonSchemaMappingsProjectConfiguration">
+     <state>
+       <map>
+         <entry key="dvc.yaml">
+           <value>
+             <SchemaInfo>
+               <option name="name" value="dvc.yaml" />
+               <option name="relativePathToSchema" value="https://raw.githubusercontent.com/iterative/dvcyaml-schema/master/schema.json" />
+               <option name="applicationDefined" value="true" />
+               <option name="patterns">
+                 <list>
+                   <Item>
+                     <option name="path" value="dvc.yaml" />
+                   </Item>
+                 </list>
+               </option>
+             </SchemaInfo>
+           </value>
+         </entry>
+       </map>
+     </state>
+   </component>
+ </project>
data/models/.gitignore ADDED
@@ -0,0 +1 @@
+ /model.torch
data/raw/.gitignore ADDED
@@ -0,0 +1 @@
+ /arxivData.json
data/raw/arxivData.json.dvc ADDED
@@ -0,0 +1,4 @@
+ outs:
+ - md5: a314e2f4eab544a46e6f95802ecde647
+   size: 72422946
+   path: arxivData.json
dvc.lock ADDED
@@ -0,0 +1,24 @@
+ schema: '2.0'
+ stages:
+   train:
+     cmd: poetry run train
+     deps:
+     - path: data/raw/arxivData.json
+       md5: a314e2f4eab544a46e6f95802ecde647
+       size: 72422946
+     - path: shad_mlops_transformers/model.py
+       md5: 9b932a6cb0cb46fc7c656e7c80c442e0
+       size: 2008
+       isexec: true
+     - path: shad_mlops_transformers/trainer.py
+       md5: 61acf28399fadfd2495dc48242c594ba
+       size: 3650
+     params:
+       shad_mlops_transformers/config.py:
+         Config.batch_size: 32
+         Config.random_seed: 42
+         Config.test_size: 0.2
+     outs:
+     - path: data/models/model.torch
+       md5: f110836b7b7585efdbfcb8ab7d5df76c
+       size: 438187413
dvc.yaml ADDED
@@ -0,0 +1,16 @@
+ stages:
+   train:
+     cmd: poetry run train
+     deps:
+     - shad_mlops_transformers/trainer.py
+     - shad_mlops_transformers/model.py
+     - data/raw/arxivData.json
+     params:
+     - shad_mlops_transformers/config.py:
+       - Config.batch_size
+       - Config.random_seed
+       - Config.test_size
+     outs:
+     # NOTE must match the config (weights_path)
+     - data/models/model.torch
+
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -12,6 +12,11 @@ python = "^3.10"
  streamlit = "^1.21.0"
  torch = "^1.13"
  transformers = "^4.27.4"
+ pydantic = "^1.10.7"
+ scikit-learn = "^1.2.2"
+ numpy = "^1.24.2"
+ loguru = "^0.7.0"
+ dvc = {version = "^2.54.0", extras = ["s3"]}


  [tool.poetry.group.dev.dependencies]
@@ -20,6 +25,9 @@ black = "^23.3.0"
  docformatter = "^1.6.0"
  isort = "^5.12.0"

+ [tool.poetry.scripts]
+ train = "shad_mlops_transformers.trainer:main"
+
  [build-system]
  requires = ["poetry-core"]
  build-backend = "poetry.core.masonry.api"
shad_mlops_transformers/__init__.py CHANGED
File without changes
shad_mlops_transformers/config.py ADDED
@@ -0,0 +1,17 @@
+ from pathlib import Path
+
+ from pydantic import BaseSettings
+
+ basedir = Path(__file__).parent
+
+
+ class Config(BaseSettings):
+     data_dir: Path = basedir.parent / "data"
+     raw_data_dir: Path = data_dir / "raw"
+     batch_size: int = 32
+     random_seed: int = 42
+     test_size: float = 0.2
+     weights_path: Path = data_dir / "models" / "model.torch"
+
+
+ config = Config()
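
A minimal usage sketch, not part of the commit: because Config extends pydantic's BaseSettings, each field can (assuming pydantic v1's default behaviour) be overridden through an environment variable with the same, case-insensitive name, so hyperparameters can be changed without editing the file. The variable names below are illustrative assumptions.

# Hedged sketch: override Config fields via environment variables (pydantic v1 BaseSettings defaults).
import os

os.environ["BATCH_SIZE"] = "64"   # matches Config.batch_size (case-insensitive)
os.environ["TEST_SIZE"] = "0.1"   # matches Config.test_size

from shad_mlops_transformers.config import Config

cfg = Config()
print(cfg.batch_size, cfg.test_size)  # 64 0.1 instead of the defaults 32 and 0.2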
shad_mlops_transformers/main.py CHANGED
File without changes
shad_mlops_transformers/model.py CHANGED
@@ -1,8 +1,56 @@
- from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
-
- tokenizer = AutoTokenizer.from_pretrained("Davlan/distilbert-base-multilingual-cased-ner-hrl")
- model = AutoModelForTokenClassification.from_pretrained("Davlan/distilbert-base-multilingual-cased-ner-hrl")
- nlp = pipeline("ner", model=model, tokenizer=tokenizer)
- example = "Nader Jokhadar had given Syria the lead with a well-struck header in the seventh minute."
- ner_results = nlp(example)
- print(ner_results)
+ from collections import OrderedDict
+ from pathlib import Path
+
+ import torch
+ import torch.nn as nn
+ from transformers import AutoModel, AutoTokenizer
+
+ from shad_mlops_transformers.config import config
+
+ # example = ["Nader Jokhadar had given Syria the lead with a well-struck header in the seventh minute."]
+ # model_name = "bert-base-uncased"
+ # model_name = "Davlan/distilbert-base-multilingual-cased-ner-hrl"
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
+ # model = AutoModel.from_pretrained(model_name)
+ # nlp = pipeline("ner", model=model, tokenizer=tokenizer)
+ # toks = tokenizer(example, padding=True, truncation=True, return_tensors="pt")
+ # with torch.no_grad():
+ #     p = model(**toks)
+ # print(p)
+
+
+ class DocumentClassifier(nn.Module):
+     def __init__(self, n_classes: int = 2):
+         super().__init__()
+         self.model_name = "bert-base-uncased"
+         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+         self.encoder = AutoModel.from_pretrained(self.model_name)
+         self.n_classes = n_classes
+         self.model = nn.Sequential(
+             OrderedDict(
+                 [
+                     ("fc", nn.Linear(in_features=self.encoder.pooler.dense.out_features, out_features=n_classes)),
+                     ("sm", nn.Softmax()),
+                 ]
+             )
+         )
+         self.trainable_params = self.model.parameters()
+
+     def forward(self, text):
+         tok_info = self.tokenize(text)
+         with torch.no_grad():
+             embeddings = self.encoder(**tok_info)["pooler_output"]
+         return self.model(embeddings)
+
+     def tokenize(self, x: str) -> dict:
+         return self.tokenizer(x, padding=True, truncation=True, return_tensors="pt")
+
+     def from_file(self, path: Path = config.weights_path) -> "DocumentClassifier":
+         self.load_state_dict(torch.load(path))
+         return self
+
+
+ if __name__ == "__main__":
+     data = ["This article describes machine learning"]
+     model = DocumentClassifier(n_classes=61).from_file()
+     model(data)
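
An illustrative sketch (an assumption drawn from the code above, not part of the commit): since forward() runs the BERT encoder under torch.no_grad(), gradients only reach the linear head in self.model, so the classifier effectively trains a softmax head on top of frozen pooled embeddings.

# Hedged sketch: check that only the classification head receives gradients.
from shad_mlops_transformers.model import DocumentClassifier

clf = DocumentClassifier(n_classes=2)
out = clf(["A short abstract about graph algorithms."])  # shape: (1, 2)
out.sum().backward()

# The encoder ran under no_grad(), so its weights accumulate no gradient;
# the linear head ("fc") does.
print(clf.encoder.embeddings.word_embeddings.weight.grad)  # None
print(clf.model.fc.weight.grad is not None)                # True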
shad_mlops_transformers/trainer.py ADDED
@@ -0,0 +1,103 @@
+ import json
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ from loguru import logger
+ from sklearn.model_selection import train_test_split
+ from torch.utils.data import DataLoader, Dataset
+ from tqdm import tqdm
+
+ from shad_mlops_transformers.config import config
+ from shad_mlops_transformers.model import DocumentClassifier
+
+
+ class ArxivDataset(Dataset):
+     def __init__(self, raw_data: list[dict]):
+         """Read the whole dataset once and keep it in memory."""
+         logger.info("reading data")
+         self.x = []
+         self.y = []
+         # self.data = []
+         whitelist_labels = ["math", "cs"]
+         i = 0
+         self.class_mapper = {}
+         for item in raw_data:
+             tmp_y = []
+             # may future readers forgive me, but for some reason the tag list is stored as a stringified Python dict!
+             for tag_desc in eval(item["tag"].replace("'", '"')):
+                 real_tag: str = tag_desc["term"]
+                 # for now, only take tags from the whitelist
+                 if not any([real_tag.startswith(x) for x in whitelist_labels]):
+                     continue
+                 if real_tag not in self.class_mapper:
+                     self.class_mapper[real_tag] = i
+                     i += 1
+                 tmp_y.append(self.class_mapper[real_tag])
+                 # take only one tag
+                 break
+             # if there was at least one valid tag, add the example to the dataset
+             if len(tmp_y):
+                 # NOTE take only the first tag
+                 # self.data.append({"label": tmp_y[0], "text": item["summary"]})
+                 self.x.append(item["summary"])
+                 self.y.append(tmp_y[0])
+         self.classes = sorted(list(self.class_mapper.keys()))
+         logger.info("[Done] reading data")
+
+     def __getitem__(self, i):
+         # return self.data[i]
+         return self.x[i], self.y[i]
+
+     def __len__(self):
+         # return len(self.data)
+         return len(self.x)
+
+
+ def make_train_val():
+     with open(config.raw_data_dir / "arxivData.json", "r") as f:
+         _raw_json = json.load(f)
+     return train_test_split(_raw_json, test_size=config.test_size, shuffle=True, random_state=config.random_seed)
+
+
+ def train_model(model: DocumentClassifier, optimizer: torch.optim.Optimizer, loader: DataLoader, criterion):
+     model.train()
+     losses_tr = []
+     for text, true_label in tqdm(loader):
+         optimizer.zero_grad()
+         pred = model(text)
+         loss = criterion(pred, true_label)
+
+         loss.backward()
+         optimizer.step()
+         losses_tr.append(loss.item())
+         break
+
+     return model, optimizer, np.mean(losses_tr)
+
+
+ def collator(x):
+     return x[0]
+
+
+ def save_model(model: DocumentClassifier):
+     config.weights_path.parent.mkdir(parents=True, exist_ok=True)
+     torch.save(model.state_dict(), config.weights_path)
+
+
+ def main():
+     train, val = make_train_val()
+     dataset_train = ArxivDataset(train)
+     dataset_val = ArxivDataset(val)
+     loader_train = DataLoader(dataset_train, batch_size=config.batch_size, shuffle=True, drop_last=True)
+     loader_val = DataLoader(dataset_val, batch_size=config.batch_size, shuffle=True, drop_last=True)
+
+     model = DocumentClassifier(n_classes=len(dataset_train.classes))
+     optimizer = torch.optim.Adam(model.trainable_params)
+     loss = nn.CrossEntropyLoss()
+     train_model(model=model, optimizer=optimizer, loader=loader_train, criterion=loss)
+     save_model(model)
+
+
+ if __name__ == "__main__":
+     main()
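
For reference, a hedged sketch of the tag format that the eval(...) call in ArxivDataset works around: the arXiv dump stores each paper's tag list as a stringified Python literal. The record below is made up for illustration, and ast.literal_eval is shown as a safer alternative parse, not what the trainer itself uses.

# Hypothetical record mimicking the stringified tag list in arxivData.json.
import ast

record = {
    "summary": "We study transformer models for document classification.",
    "tag": "[{'term': 'cs.LG'}, {'term': 'stat.ML'}]",
}

# Safer alternative to eval() for parsing the Python-literal string.
tags = [t["term"] for t in ast.literal_eval(record["tag"])]
kept = [t for t in tags if t.startswith(("math", "cs"))]
print(kept)  # ['cs.LG'] -- only whitelisted prefixes survive; the trainer keeps just the first one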
shad_mlops_transformers/ui.py CHANGED
File without changes