feat(main): init train and dvc
- .dvc/.gitignore +3 -0
- .dvc/config +9 -0
- .dvcignore +3 -0
- .idea/jsonSchemas.xml +25 -0
- data/models/.gitignore +1 -0
- data/raw/.gitignore +1 -0
- data/raw/arxivData.json.dvc +4 -0
- dvc.lock +24 -0
- dvc.yaml +16 -0
- poetry.lock +0 -0
- pyproject.toml +8 -0
- shad_mlops_transformers/__init__.py +0 -0
- shad_mlops_transformers/config.py +17 -0
- shad_mlops_transformers/main.py +0 -0
- shad_mlops_transformers/model.py +56 -8
- shad_mlops_transformers/trainer.py +103 -0
- shad_mlops_transformers/ui.py +0 -0
.dvc/.gitignore
ADDED
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
.dvc/config
ADDED
@@ -0,0 +1,9 @@
+[core]
+    remote = ya-s3
+    autostage = true
+['remote "ya-s3"']
+    url = s3://shad-ml-2-hw-5/dvc
+    endpointurl = https://storage.yandexcloud.net
+
+[cache]
+    type = reflink,hardlink,symlink,copy
.dvcignore
ADDED
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
.idea/jsonSchemas.xml
ADDED
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JsonSchemaMappingsProjectConfiguration">
+    <state>
+      <map>
+        <entry key="dvc.yaml">
+          <value>
+            <SchemaInfo>
+              <option name="name" value="dvc.yaml" />
+              <option name="relativePathToSchema" value="https://raw.githubusercontent.com/iterative/dvcyaml-schema/master/schema.json" />
+              <option name="applicationDefined" value="true" />
+              <option name="patterns">
+                <list>
+                  <Item>
+                    <option name="path" value="dvc.yaml" />
+                  </Item>
+                </list>
+              </option>
+            </SchemaInfo>
+          </value>
+        </entry>
+      </map>
+    </state>
+  </component>
+</project>
data/models/.gitignore
ADDED
@@ -0,0 +1 @@
+/model.torch
data/raw/.gitignore
ADDED
@@ -0,0 +1 @@
+/arxivData.json
data/raw/arxivData.json.dvc
ADDED
@@ -0,0 +1,4 @@
+outs:
+- md5: a314e2f4eab544a46e6f95802ecde647
+  size: 72422946
+  path: arxivData.json
dvc.lock
ADDED
@@ -0,0 +1,24 @@
+schema: '2.0'
+stages:
+  train:
+    cmd: poetry run train
+    deps:
+    - path: data/raw/arxivData.json
+      md5: a314e2f4eab544a46e6f95802ecde647
+      size: 72422946
+    - path: shad_mlops_transformers/model.py
+      md5: 9b932a6cb0cb46fc7c656e7c80c442e0
+      size: 2008
+      isexec: true
+    - path: shad_mlops_transformers/trainer.py
+      md5: 61acf28399fadfd2495dc48242c594ba
+      size: 3650
+    params:
+      shad_mlops_transformers/config.py:
+        Config.batch_size: 32
+        Config.random_seed: 42
+        Config.test_size: 0.2
+    outs:
+    - path: data/models/model.torch
+      md5: f110836b7b7585efdbfcb8ab7d5df76c
+      size: 438187413
dvc.yaml
ADDED
@@ -0,0 +1,16 @@
+stages:
+  train:
+    cmd: poetry run train
+    deps:
+      - shad_mlops_transformers/trainer.py
+      - shad_mlops_transformers/model.py
+      - data/raw/arxivData.json
+    params:
+      - shad_mlops_transformers/config.py:
+          - Config.batch_size
+          - Config.random_seed
+          - Config.test_size
+    outs:
+      # NOTE: must match the config
+      - data/models/model.torch
+
poetry.lock
CHANGED
The diff for this file is too large to render. See raw diff.
pyproject.toml
CHANGED
@@ -12,6 +12,11 @@ python = "^3.10"
 streamlit = "^1.21.0"
 torch = "^1.13"
 transformers = "^4.27.4"
+pydantic = "^1.10.7"
+scikit-learn = "^1.2.2"
+numpy = "^1.24.2"
+loguru = "^0.7.0"
+dvc = {version = "^2.54.0", extras = ["s3"]}


 [tool.poetry.group.dev.dependencies]
@@ -20,6 +25,9 @@ black = "^23.3.0"
 docformatter = "^1.6.0"
 isort = "^5.12.0"

+[tool.poetry.scripts]
+train = "shad_mlops_transformers.trainer:main"
+
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
shad_mlops_transformers/__init__.py
CHANGED
File without changes
shad_mlops_transformers/config.py
ADDED
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+from pydantic import BaseSettings
+
+basedir = Path(__file__).parent
+
+
+class Config(BaseSettings):
+    data_dir: Path = basedir.parent / "data"
+    raw_data_dir: Path = data_dir / "raw"
+    batch_size: int = 32
+    random_seed: int = 42
+    test_size: float = 0.2
+    weights_path: Path = data_dir / "models" / "model.torch"
+
+
+config = Config()
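Note: since Config subclasses pydantic's BaseSettings, its defaults can be overridden with keyword arguments or via environment variables matching the field names. A minimal sketch; the override values below are purely illustrative and not part of the commit:

from shad_mlops_transformers.config import Config

# Hypothetical overrides for illustration only; the committed defaults stay as above.
cfg = Config(batch_size=64, test_size=0.1)

# BaseSettings also picks up environment variables by field name,
# e.g. exporting BATCH_SIZE=64 before startup changes cfg.batch_size.
print(cfg.batch_size, cfg.weights_path)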
shad_mlops_transformers/main.py
CHANGED
File without changes
shad_mlops_transformers/model.py
CHANGED
@@ -1,8 +1,56 @@
-from
-
-
-
-
-
-
-
+from collections import OrderedDict
+from pathlib import Path
+
+import torch
+import torch.nn as nn
+from transformers import AutoModel, AutoTokenizer
+
+from shad_mlops_transformers.config import config
+
+# example = ["Nader Jokhadar had given Syria the lead with a well-struck header in the seventh minute."]
+# model_name = "bert-base-uncased"
+# model_name = "Davlan/distilbert-base-multilingual-cased-ner-hrl"
+# tokenizer = AutoTokenizer.from_pretrained(model_name)
+# model = AutoModel.from_pretrained(model_name)
+# nlp = pipeline("ner", model=model, tokenizer=tokenizer)
+# toks = tokenizer(example, padding=True, truncation=True, return_tensors="pt")
+# with torch.no_grad():
+#     p = model(**toks)
+# print(p)
+
+
+class DocumentClassifier(nn.Module):
+    def __init__(self, n_classes: int = 2):
+        super().__init__()
+        self.model_name = "bert-base-uncased"
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self.encoder = AutoModel.from_pretrained(self.model_name)
+        self.n_classes = n_classes
+        self.model = nn.Sequential(
+            OrderedDict(
+                [
+                    ("fc", nn.Linear(in_features=self.encoder.pooler.dense.out_features, out_features=n_classes)),
+                    ("sm", nn.Softmax()),
+                ]
+            )
+        )
+        self.trainable_params = self.model.parameters()
+
+    def forward(self, text):
+        tok_info = self.tokenize(text)
+        with torch.no_grad():
+            embeddings = self.encoder(**tok_info)["pooler_output"]
+        return self.model(embeddings)
+
+    def tokenize(self, x: str) -> dict:
+        return self.tokenizer(x, padding=True, truncation=True, return_tensors="pt")
+
+    def from_file(self, path: Path = config.weights_path) -> "DocumentClassifier":
+        self.load_state_dict(torch.load(path))
+        return self
+
+
+if __name__ == "__main__":
+    data = ["This article describes machine learning"]
+    model = DocumentClassifier(n_classes=61).from_file()
+    model(data)
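Note: DocumentClassifier outputs softmax scores per class, so recovering the arXiv tag for a prediction needs the index-to-tag mapping built by the training dataset, which this commit does not persist alongside the weights. A hedged usage sketch; n_classes is taken from the __main__ example above, everything else is illustrative and not part of the commit:

import torch

from shad_mlops_transformers.model import DocumentClassifier

# Illustrative inference sketch, not committed code.
model = DocumentClassifier(n_classes=61).from_file()
model.eval()

texts = ["This article describes machine learning"]
with torch.no_grad():
    scores = model(texts)              # shape: (len(texts), n_classes)
predicted_idx = scores.argmax(dim=-1)  # class index per document
print(predicted_idx.tolist())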
shad_mlops_transformers/trainer.py
ADDED
@@ -0,0 +1,103 @@
+import json
+
+import numpy as np
+import torch
+import torch.nn as nn
+from loguru import logger
+from sklearn.model_selection import train_test_split
+from torch.utils.data import DataLoader, Dataset
+from tqdm import tqdm
+
+from shad_mlops_transformers.config import config
+from shad_mlops_transformers.model import DocumentClassifier
+
+
+class ArxivDataset(Dataset):
+    def __init__(self, raw_data: list[dict]):
+        """Read and store the whole dataset in a single pass."""
+        logger.info("reading data")
+        self.x = []
+        self.y = []
+        # self.data = []
+        whitelist_labels = ["math", "cs"]
+        i = 0
+        self.class_mapper = {}
+        for item in raw_data:
+            tmp_y = []
+            # may posterity forgive me, but someone stuffed a Python dict into a string here!
+            for tag_desc in eval(item["tag"].replace("'", '"')):
+                real_tag: str = tag_desc["term"]
+                # for now we only take tags from the whitelist
+                if not any([real_tag.startswith(x) for x in whitelist_labels]):
+                    continue
+                if real_tag not in self.class_mapper:
+                    self.class_mapper[real_tag] = i
+                    i += 1
+                tmp_y.append(self.class_mapper[real_tag])
+                # take only one tag
+                break
+            # if there was at least one valid tag, add the item to the dataset
+            if len(tmp_y):
+                # NOTE take only one tag
+                # self.data.append({"label": tmp_y[0], "text": item["summary"]})
+                self.x.append(item["summary"])
+                self.y.append(tmp_y[0])
+        self.classes = sorted(list(self.class_mapper.keys()))
+        logger.info("[Done] reading data")
+
+    def __getitem__(self, i):
+        # return self.data[i]
+        return self.x[i], self.y[i]
+
+    def __len__(self):
+        # return len(self.data)
+        return len(self.x)
+
+
+def make_train_val():
+    with open(config.raw_data_dir / "arxivData.json", "r") as f:
+        _raw_json = json.load(f)
+    return train_test_split(_raw_json, test_size=config.test_size, shuffle=True, random_state=config.random_seed)
+
+
+def train_model(model: DocumentClassifier, optimizer: torch.optim.Optimizer, loader: DataLoader, criterion):
+    model.train()
+    losses_tr = []
+    for text, true_label in tqdm(loader):
+        optimizer.zero_grad()
+        pred = model(text)
+        loss = criterion(pred, true_label)
+
+        loss.backward()
+        optimizer.step()
+        losses_tr.append(loss.item())
+        break
+
+    return model, optimizer, np.mean(losses_tr)
+
+
+def collator(x):
+    return x[0]
+
+
+def save_model(model: DocumentClassifier):
+    config.weights_path.parent.mkdir(parents=True, exist_ok=True)
+    torch.save(model.state_dict(), config.weights_path)
+
+
+def main():
+    train, val = make_train_val()
+    dataset_train = ArxivDataset(train)
+    dataset_val = ArxivDataset(val)
+    loader_train = DataLoader(dataset_train, batch_size=config.batch_size, shuffle=True, drop_last=True)
+    loader_val = DataLoader(dataset_val, batch_size=config.batch_size, shuffle=True, drop_last=True)
+
+    model = DocumentClassifier(n_classes=len(dataset_train.classes))
+    optimizer = torch.optim.Adam(model.trainable_params)
+    loss = nn.CrossEntropyLoss()
+    train_model(model=model, optimizer=optimizer, loader=loader_train, criterion=loss)
+    save_model(model)
+
+
+if __name__ == "__main__":
+    main()
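Note: as committed, train_model exits after the first batch (the break inside the loop) and main never touches loader_val. If a validation pass were added later, it might look like the sketch below; validate_model is a hypothetical helper, not code from this commit:

import numpy as np
import torch


def validate_model(model, loader, criterion):
    """Hypothetical evaluation pass over loader_val (not in the commit)."""
    model.eval()
    losses, correct, total = [], 0, 0
    with torch.no_grad():
        for text, true_label in loader:
            pred = model(text)
            losses.append(criterion(pred, true_label).item())
            correct += (pred.argmax(dim=-1) == true_label).sum().item()
            total += true_label.numel()
    return np.mean(losses), correct / total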
shad_mlops_transformers/ui.py
CHANGED
File without changes