use bert model
Files changed:
- added_tokens.json +0 -3
- app.py +4 -4
- bpe.codes +0 -0
- phoBERT.py +2 -2
- phoBertModel.pth +0 -3
- special_tokens_map.json +0 -9
- tokenizer_config.json +0 -54
- vocab.txt +0 -0
added_tokens.json DELETED
@@ -1,3 +0,0 @@
-{
-  "<mask>": 64000
-}
app.py CHANGED
@@ -14,7 +14,7 @@ import unicodedata as ud
 
 from underthesea import word_tokenize
 
-
+from phoBERT import BERT_predict
 
 # Load tokenizer
 # fp = Path(__file__).with_name('tokenizer.pkl')
@@ -86,14 +86,14 @@ def judge(x):
 
     lstm_pred = LSTM_predict(x)
     gru_pred = GRU_predict(x)
-
+    bert_pred = BERT_predict(x)
     #print(result)
 
     return_result = 'Result'
     result_lstm = np.round(lstm_pred, 2)
     result_gru = np.round(gru_pred, 2)
-
+    result_bert = np.round(bert_pred, 2)
     for i in range(6):
-        result.append((result_lstm[i]+result_gru[i])/2)
+        result.append((result_lstm[i]+result_gru[i]+result_bert[i])/3)
 
     return (result)
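For context, this change turns judge() into a three-way ensemble: the LSTM, GRU, and BERT scores for each of the six classes are averaged instead of the previous two-way mean. A minimal standalone sketch of that step, assuming each *_predict callable returns a length-6 array of class probabilities (the ensemble_predict helper name is hypothetical, not part of the repo):

import numpy as np

def ensemble_predict(x, predictors):
    # Each predictor maps an input text to 6 class scores.
    preds = [np.round(p(x), 2) for p in predictors]  # same rounding as app.py
    # Element-wise mean across models, one value per class.
    return [sum(scores) / len(preds) for scores in zip(*preds)]

# Equivalent to the diff:
# result = ensemble_predict(x, [LSTM_predict, GRU_predict, BERT_predict])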
bpe.codes DELETED
The diff for this file is too large to render. See raw diff.
phoBERT.py CHANGED
@@ -5,7 +5,7 @@ import __main__
 
 
 #phobert = AutoModel.from_pretrained("vinai/phobert-base")
-tokenizer = AutoTokenizer.from_pretrained("./")
+tokenizer = AutoTokenizer.from_pretrained("./bert/bert_tokenizer")
 
 class PhoBertModel(torch.nn.Module):
     def __init__(self):
@@ -35,7 +35,7 @@ class PhoBertModel(torch.nn.Module):
 setattr(__main__, "PhoBertModel", PhoBertModel)
 
 def getModel():
-    model = torch.load('phoBertModel.pth', map_location=torch.device('cpu'))
+    model = torch.load('./bert/phoBertModel.pth', map_location=torch.device('cpu'))
     model.eval()
     return model
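Only the new file locations are visible here; the BERT_predict function that app.py now imports is defined elsewhere in phoBERT.py. A hedged sketch of how it plausibly combines the relocated tokenizer and checkpoint (the forward signature, sigmoid output, and 6-class shape are assumptions inferred from app.py's usage, and max_length=256 mirrors model_max_length in the deleted tokenizer_config.json):

import torch

def BERT_predict_sketch(text):
    # Hypothetical reconstruction; the real BERT_predict is not shown in this diff.
    model = getModel()  # loads ./bert/phoBertModel.pth onto CPU, sets eval mode
    enc = tokenizer(text, truncation=True, max_length=256, return_tensors='pt')
    with torch.no_grad():
        output = model(enc['input_ids'], enc['attention_mask'])  # assumed signature
    return torch.sigmoid(output).squeeze().tolist()  # assumed: 6 per-class scores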
phoBertModel.pth DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d5fca9d837d05b1e8330798e32a59b5200bf677d5cf2f178727dcd131c86230b
-size 542499629
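(Note: the three deleted lines above are a Git LFS pointer, not the weights themselves; the ~542 MB checkpoint is now loaded from ./bert/phoBertModel.pth, per the phoBERT.py change.)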
special_tokens_map.json DELETED
@@ -1,9 +0,0 @@
-{
-  "bos_token": "<s>",
-  "cls_token": "<s>",
-  "eos_token": "</s>",
-  "mask_token": "<mask>",
-  "pad_token": "<pad>",
-  "sep_token": "</s>",
-  "unk_token": "<unk>"
-}
tokenizer_config.json DELETED
@@ -1,54 +0,0 @@
-{
-  "added_tokens_decoder": {
-    "0": {
-      "content": "<s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "<pad>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "</s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "3": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "64000": {
-      "content": "<mask>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<s>",
-  "clean_up_tokenization_spaces": true,
-  "cls_token": "<s>",
-  "eos_token": "</s>",
-  "mask_token": "<mask>",
-  "model_max_length": 256,
-  "pad_token": "<pad>",
-  "sep_token": "</s>",
-  "tokenizer_class": "PhobertTokenizer",
-  "unk_token": "<unk>"
-}
vocab.txt DELETED
The diff for this file is too large to render. See raw diff.
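The deleted root-level files (added_tokens.json, bpe.codes, special_tokens_map.json, tokenizer_config.json, vocab.txt) are the standard output of a Hugging Face tokenizer's save_pretrained, which is presumably how the replacement ./bert/bert_tokenizer directory was populated. A sketch of regenerating them, assuming the tokenizer originated from vinai/phobert-base (taken from the commented-out line in phoBERT.py):

from transformers import AutoTokenizer

# Rebuild the tokenizer directory this commit points phoBERT.py at.
tok = AutoTokenizer.from_pretrained("vinai/phobert-base")
tok.save_pretrained("./bert/bert_tokenizer")  # writes vocab.txt, bpe.codes, tokenizer_config.json, ...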