Commit · 8abcc73
1 Parent(s): 4758dea
re-initialize
suparkanbun/models/gloss.orig.txt ADDED
The diff for this file is too large to render. See raw diff.
suparkanbun/models/labelPOS.txt ADDED
@@ -0,0 +1,129 @@
+n,代名詞,人称,他,PRON,Person=1|PronType=Prs
+n,代名詞,人称,他,PRON,Person=2|PronType=Prs
+n,代名詞,人称,他,PRON,Person=3|PronType=Prs
+n,代名詞,人称,他,PRON,PronType=Prs
+n,代名詞,人称,他,PRON,PronType=Prs|Reflex=Yes
+n,代名詞,人称,止格,PRON,Person=1|PronType=Prs
+n,代名詞,人称,止格,PRON,Person=2|PronType=Prs
+n,代名詞,人称,止格,PRON,Person=3|PronType=Prs
+n,代名詞,人称,止格,PRON,PronType=Prs
+n,代名詞,人称,起格,PRON,Person=1|PronType=Prs
+n,代名詞,人称,起格,PRON,Person=2|PronType=Prs
+n,代名詞,人称,起格,PRON,Person=3|PronType=Prs
+n,代名詞,人称,起格,PRON,PronType=Prs
+n,代名詞,指示,*,PRON,PronType=Dem
+n,代名詞,疑問,*,PRON,PronType=Int
+n,名詞,不可譲,属性,NOUN,_
+n,名詞,不可譲,疾病,NOUN,_
+n,名詞,不可譲,身体,NOUN,_
+n,名詞,主体,動物,NOUN,_
+n,名詞,主体,国名,PROPN,Case=Loc|NameType=Nat
+n,名詞,主体,書物,NOUN,_
+n,名詞,主体,機関,NOUN,_
+n,名詞,主体,集団,NOUN,_
+n,名詞,人,その他の人名,PROPN,NameType=Prs
+n,名詞,人,人,NOUN,_
+n,名詞,人,名,PROPN,NameType=Giv
+n,名詞,人,姓氏,PROPN,NameType=Sur
+n,名詞,人,役割,NOUN,_
+n,名詞,人,複合的人名,PROPN,NameType=Prs
+n,名詞,人,関係,NOUN,_
+n,名詞,制度,儀礼,NOUN,_
+n,名詞,制度,場,NOUN,Case=Loc
+n,名詞,可搬,乗り物,NOUN,_
+n,名詞,可搬,伝達,NOUN,_
+n,名詞,可搬,成果物,NOUN,_
+n,名詞,可搬,糧食,NOUN,_
+n,名詞,可搬,道具,NOUN,_
+n,名詞,固定物,地名,PROPN,Case=Loc|NameType=Geo
+n,名詞,固定物,地形,NOUN,Case=Loc
+n,名詞,固定物,建造物,NOUN,Case=Loc
+n,名詞,固定物,樹木,NOUN,_
+n,名詞,固定物,関係,NOUN,Case=Loc
+n,名詞,外観,人,NOUN,_
+n,名詞,天象,天文,NOUN,_
+n,名詞,天象,怪異,NOUN,_
+n,名詞,天象,気象,NOUN,_
+n,名詞,度量衡,*,NOUN,NounType=Clf
+n,名詞,思考,*,NOUN,_
+n,名詞,描写,形質,NOUN,_
+n,名詞,描写,態度,NOUN,_
+n,名詞,数量,*,NOUN,_
+n,名詞,時,*,NOUN,Case=Tem
+n,名詞,行為,*,NOUN,_
+n,数詞,干支,*,NUM,NumType=Ord
+n,数詞,数,*,NUM,_
+n,数詞,数字,*,NUM,_
+p,助詞,句末,*,PART,_
+p,助詞,句頭,*,PART,_
+p,助詞,接続,並列,CCONJ,_
+p,助詞,接続,体言化,PART,_
+p,助詞,接続,属格,SCONJ,_
+p,助詞,提示,*,PART,_
+p,感嘆詞,*,*,INTJ,_
+p,接尾辞,*,*,PART,_
+s,文字,*,*,SYM,_
+s,記号,一般,*,SYM,_
+s,記号,句点,*,PUNCT,_
+s,記号,読点,*,PUNCT,_
+v,前置詞,基盤,*,ADP,_
+v,前置詞,源泉,*,ADP,_
+v,前置詞,経由,*,ADP,_
+v,前置詞,関係,*,ADP,_
+v,副詞,判断,推定,ADV,_
+v,副詞,判断,確定,ADV,_
+v,副詞,判断,逆接,ADV,_
+v,副詞,否定,体言否定,ADV,Polarity=Neg
+v,副詞,否定,有界,ADV,Polarity=Neg
+v,副詞,否定,無界,ADV,Polarity=Neg
+v,副詞,否定,禁止,ADV,Polarity=Neg
+v,副詞,描写,*,ADV,_
+v,副詞,時相,変化,ADV,AdvType=Tim
+v,副詞,時相,完了,ADV,AdvType=Tim|Aspect=Perf
+v,副詞,時相,将来,ADV,AdvType=Tim|Tense=Fut
+v,副詞,時相,恒常,ADV,AdvType=Tim
+v,副詞,時相,現在,ADV,AdvType=Tim|Tense=Pres
+v,副詞,時相,終局,ADV,AdvType=Tim
+v,副詞,時相,継起,ADV,AdvType=Tim
+v,副詞,時相,緊接,ADV,AdvType=Tim
+v,副詞,時相,過去,ADV,AdvType=Tim|Tense=Past
+v,副詞,疑問,原因,ADV,AdvType=Cau
+v,副詞,疑問,反語,ADV,_
+v,副詞,疑問,所在,ADV,_
+v,副詞,程度,やや高度,ADV,AdvType=Deg|Degree=Cmp
+v,副詞,程度,極度,ADV,AdvType=Deg|Degree=Sup
+v,副詞,程度,軽度,ADV,AdvType=Deg|Degree=Pos
+v,副詞,範囲,共同,ADV,_
+v,副詞,範囲,総括,ADV,_
+v,副詞,範囲,限定,ADV,_
+v,副詞,頻度,偶発,ADV,_
+v,副詞,頻度,重複,ADV,_
+v,副詞,頻度,頻繁,ADV,_
+v,助動詞,受動,*,AUX,Voice=Pass
+v,助動詞,可能,*,AUX,Mood=Pot
+v,助動詞,必要,*,AUX,Mood=Nec
+v,助動詞,願望,*,AUX,Mood=Des
+v,動詞,変化,制度,VERB,_
+v,動詞,変化,性質,VERB,_
+v,動詞,変化,生物,VERB,_
+v,動詞,存在,存在,VERB,Polarity=Neg
+v,動詞,存在,存在,VERB,VerbType=Cop
+v,動詞,存在,存在,VERB,_
+v,動詞,描写,境遇,VERB,Degree=Pos
+v,動詞,描写,形質,VERB,Degree=Pos
+v,動詞,描写,態度,VERB,Degree=Pos
+v,動詞,描写,量,VERB,Degree=Pos
+v,動詞,行為,交流,VERB,_
+v,動詞,行為,伝達,VERB,_
+v,動詞,行為,使役,VERB,_
+v,動詞,行為,儀礼,VERB,_
+v,動詞,行為,分類,VERB,Degree=Equ
+v,動詞,行為,動作,VERB,_
+v,動詞,行為,姿勢,VERB,_
+v,動詞,行為,役割,VERB,_
+v,動詞,行為,得失,VERB,_
+v,動詞,行為,態度,VERB,_
+v,動詞,行為,生産,VERB,_
+v,動詞,行為,移動,VERB,_
+v,動詞,行為,設置,VERB,_
+v,動詞,行為,飲食,VERB,_
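
Each line of labelPOS.txt is one POS tag in the form XPOS,UPOS,FEATS: a four-field Kanbun XPOS (word class n/p/s/v, part of speech, subclass, sub-subclass, with * as a placeholder), the Universal Dependencies UPOS, and a FEATS string (_ when empty). Since the XPOS itself contains commas, the last two fields are best split off from the right. A minimal Python sketch (parse_label is an illustrative name, not part of the repository):

def parse_label(line):
    # the XPOS ("n,代名詞,人称,他") itself contains commas,
    # so peel UPOS and FEATS off from the right
    xpos, upos, feats = line.rsplit(",", 2)
    feats = {} if feats == "_" else dict(kv.split("=") for kv in feats.split("|"))
    return xpos, upos, feats

print(parse_label("n,代名詞,人称,他,PRON,Person=1|PronType=Prs"))
# -> ('n,代名詞,人称,他', 'PRON', {'Person': '1', 'PronType': 'Prs'})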
suparkanbun/models/lzh_kyoto.conllu ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86c46887798cd5d93f500ef99674897876501177366ad2b3e4ad861f3a362beb
+size 24744523
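
lzh_kyoto.conllu is checked in as a Git LFS pointer: the repository itself stores only this three-line stub, and the 24744523-byte treebank is fetched separately (for example with git lfs pull). A hedged sketch of verifying a downloaded copy against the pointer (the paths are illustrative):

import hashlib, os

def lfs_pointer_matches(pointer_path, blob_path):
    # the pointer is three "key value" lines: version, oid, size
    ptr = dict(line.split(" ", 1) for line in open(pointer_path).read().splitlines() if line)
    digest = hashlib.sha256(open(blob_path, "rb").read()).hexdigest()
    return ptr["oid"] == "sha256:" + digest and int(ptr["size"]) == os.path.getsize(blob_path)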
suparkanbun/models/mkmodel.sh ADDED
@@ -0,0 +1,341 @@
+#! /bin/sh
+# pip3 install transformers seqeval datasets supar
+test -f run_ner.py || curl -LO https://raw.githubusercontent.com/huggingface/transformers/v4.0.1/examples/token-classification/run_ner.py
+
+python3 -c '
+from suparkanbun.simplify import simplify
+c=[]
+h=[0]
+while True:
+  try:
+    s=input()
+  except:
+    quit()
+  t=s.strip().split("\t")
+  if len(t)==10:
+    if t[0]!="#":
+      t[0]=str(len(c)+1)
+      i=len(t[1])
+      if i>1:
+        form=t[1]
+        lemma=t[2]
+        head=t[6]
+        deprel=t[7]
+        for j in range(0,i-1):
+          t[1]=form[j]
+          if t[1] in simplify:
+            t[1]=simplify[t[1]]
+          t[2]=lemma[j]
+          t[6]="-1"
+          t[7]="compound"
+          c.append(list(t))
+          t[0]=str(len(c)+1)
+        t[1]=form[i-1]
+        t[2]=lemma[i-1]
+        t[6]=head
+        t[7]=deprel
+      if t[1] in simplify:
+        t[1]=simplify[t[1]]
+      c.append(list(t))
+      h.append(len(c))
+  elif s.strip()=="":
+    for t in c:
+      t[6]=str(int(t[0])+1 if t[6]=="-1" else h[int(t[6])])
+      print("\t".join(t))
+    print("")
+    c=[]
+    h=[0]
+' < lzh_kyoto.conllu | tee simplified.conllu | python3 -c '
+tokens=[]
+tags=[]
+while True:
+  try:
+    s=input()
+  except:
+    if len(tokens)>0:
+      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+    quit()
+  t=s.split("\t")
+  if len(t)==10:
+    p=t[4]+","+t[3]+","+t[5]
+    for c in t[1]:
+      tokens.append(c)
+      tags.append(p)
+  elif len(tokens)>80:
+    print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+    tokens=[]
+    tags=[]
+' | tee simplifiedPOS.json | nawk '
+{
+  if(NR%10>0)
+    printf("%s\n",$0)>"trainPOS.json";
+  else
+    printf("%s\n",$0)>"validPOS.json";
+}'
+sed 's/^.*"tags":\[//' trainPOS.json | tr '"' '\012' | sort -u | egrep '^[nvps],' > labelPOS.txt
+if [ ! -d guwenbert-base.pos ]
+then mkdir -p guwenbert-base.pos
+     python3 run_ner.py --model_name_or_path ethanyt/guwenbert-base --train_file trainPOS.json --validation_file validPOS.json --output_dir guwenbert-base.pos --do_train --do_eval
+fi
+if [ ! -d guwenbert-large.pos ]
+then mkdir -p guwenbert-large.pos
+     python3 run_ner.py --model_name_or_path ethanyt/guwenbert-large --train_file trainPOS.json --validation_file validPOS.json --output_dir guwenbert-large.pos --do_train --do_eval
+fi
+
+nawk '
+BEGIN{
+  f[0]="test.conllu";
+  f[1]="dev.conllu";
+  for(i=2;i<10;i++)
+    f[i]="train.conllu";
+}
+{
+  printf("%s\n",$0)>f[i%10];
+  if($0=="")
+    i++;
+}' simplified.conllu
+if [ ! -f guwenbert-base.pos/guwenbert-base.supar ]
+then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p guwenbert-base.pos/guwenbert-base.supar -c biaffine-dep-en -f bert --bert ethanyt/guwenbert-base --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+fi
+if [ ! -f guwenbert-large.pos/guwenbert-large.supar ]
+then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p guwenbert-large.pos/guwenbert-large.supar -c biaffine-dep-en -f bert --bert ethanyt/guwenbert-large --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+fi
+
+python3 -c '
+tokens=[]
+tags=[]
+i=0
+while True:
+  try:
+    s=input()
+  except:
+    if len(tokens)>0:
+      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+    quit()
+  t=s.split("\t")
+  if len(t)==10:
+    for c in t[1]:
+      tokens.append(c)
+    i+=1
+  else:
+    if i==1:
+      tags.append("S")
+    elif i==2:
+      tags+=["B","E"]
+    elif i==3:
+      tags+=["B","E2","E"]
+    else:
+      tags+=["B"]+["M"]*(i-4)+["E3","E2","E"]
+    i=0
+    if len(tokens)>80:
+      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+      tokens=[]
+      tags=[]
+' < simplified.conllu | tee simplifiedDanku.json | nawk '
+{
+  if(NR%10>0)
+    printf("%s\n",$0)>"trainDanku.json";
+  else
+    printf("%s\n",$0)>"validDanku.json";
+}'
+sed 's/^.*"tags":\[//' trainDanku.json | tr '"' '\012' | sort -u | egrep '^[A-Z]' > labelDanku.txt
+if [ ! -d guwenbert-base.danku ]
+then mkdir -p guwenbert-base.danku
+     python3 run_ner.py --model_name_or_path ethanyt/guwenbert-base --train_file trainDanku.json --validation_file validDanku.json --output_dir guwenbert-base.danku --do_train --do_eval
+fi
+if [ ! -d guwenbert-large.danku ]
+then mkdir -p guwenbert-large.danku
+     python3 run_ner.py --model_name_or_path ethanyt/guwenbert-large --train_file trainDanku.json --validation_file validDanku.json --output_dir guwenbert-large.danku --do_train --do_eval
+fi
+
+python3 -c '
+c=[]
+h=[0]
+while True:
+  try:
+    s=input()
+  except:
+    quit()
+  t=s.strip().split("\t")
+  if len(t)==10:
+    if t[0]!="#":
+      t[0]=str(len(c)+1)
+      i=len(t[1])
+      if i>1:
+        form=t[1]
+        lemma=t[2]
+        head=t[6]
+        deprel=t[7]
+        for j in range(0,i-1):
+          t[1]=form[j]
+          t[2]=lemma[j]
+          t[6]="-1"
+          t[7]="compound"
+          c.append(list(t))
+          t[0]=str(len(c)+1)
+        t[1]=form[i-1]
+        t[2]=lemma[i-1]
+        t[6]=head
+        t[7]=deprel
+      c.append(list(t))
+      h.append(len(c))
+  elif s.strip()=="":
+    for t in c:
+      t[6]=str(int(t[0])+1 if t[6]=="-1" else h[int(t[6])])
+      print("\t".join(t))
+    print("")
+    c=[]
+    h=[0]
+' < lzh_kyoto.conllu | tee traditional.conllu | python3 -c '
+tokens=[]
+tags=[]
+while True:
+  try:
+    s=input()
+  except:
+    if len(tokens)>0:
+      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+    quit()
+  t=s.split("\t")
+  if len(t)==10:
+    p=t[4]+","+t[3]+","+t[5]
+    for c in t[1]:
+      tokens.append(c)
+      tags.append(p)
+  elif len(tokens)>80:
+    print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+    tokens=[]
+    tags=[]
+' | tee traditionalPOS.json | nawk '
+{
+  if(NR%10>0)
+    printf("%s\n",$0)>>"trainPOS.json";
+  else
+    printf("%s\n",$0)>>"validPOS.json";
+}'
+if [ ! -d roberta-classical-chinese-base-char.pos ]
+then mkdir -p roberta-classical-chinese-base-char.pos
+     python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-base-char --train_file trainPOS.json --validation_file validPOS.json --output_dir roberta-classical-chinese-base-char.pos --do_train --do_eval
+fi
+if [ ! -d roberta-classical-chinese-large-char.pos ]
+then mkdir -p roberta-classical-chinese-large-char.pos
+     python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-large-char --train_file trainPOS.json --validation_file validPOS.json --output_dir roberta-classical-chinese-large-char.pos --do_train --do_eval
+fi
+
+nawk '
+BEGIN{
+  f[0]="test.conllu";
+  f[1]="dev.conllu";
+  for(i=2;i<10;i++)
+    f[i]="train.conllu";
+}
+{
+  printf("%s\n",$0)>>f[i%10];
+  if($0=="")
+    i++;
+}' traditional.conllu
+if [ ! -f roberta-classical-chinese-base-char.pos/roberta-classical-chinese-base-char.supar ]
+then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p roberta-classical-chinese-base-char.pos/roberta-classical-chinese-base-char.supar -c biaffine-dep-en -f bert --bert KoichiYasuoka/roberta-classical-chinese-base-char --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+fi
+if [ ! -f roberta-classical-chinese-large-char.pos/roberta-classical-chinese-large-char.supar ]
+then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p roberta-classical-chinese-large-char.pos/roberta-classical-chinese-large-char.supar -c biaffine-dep-en -f bert --bert KoichiYasuoka/roberta-classical-chinese-large-char --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+fi
+
+python3 -c '
+tokens=[]
+tags=[]
+i=0
+while True:
+  try:
+    s=input()
+  except:
+    if len(tokens)>0:
+      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+    quit()
+  t=s.split("\t")
+  if len(t)==10:
+    for c in t[1]:
+      tokens.append(c)
+    i+=1
+  else:
+    if i==1:
+      tags.append("S")
+    elif i==2:
+      tags+=["B","E"]
+    elif i==3:
+      tags+=["B","E2","E"]
+    else:
+      tags+=["B"]+["M"]*(i-4)+["E3","E2","E"]
+    i=0
+    if len(tokens)>80:
+      print("{\"tokens\":[\""+"\",\"".join(tokens)+"\"],\"tags\":[\""+"\",\"".join(tags)+"\"]}")
+      tokens=[]
+      tags=[]
+' < traditional.conllu | tee traditionalDanku.json | nawk '
+{
+  if(NR%10>0)
+    printf("%s\n",$0)>>"trainDanku.json";
+  else
+    printf("%s\n",$0)>>"validDanku.json";
+}'
+if [ ! -d roberta-classical-chinese-base-char.danku ]
+then mkdir -p roberta-classical-chinese-base-char.danku
+     python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-base-char --train_file trainDanku.json --validation_file validDanku.json --output_dir roberta-classical-chinese-base-char.danku --do_train --do_eval
+fi
+if [ ! -d roberta-classical-chinese-large-char.danku ]
+then mkdir -p roberta-classical-chinese-large-char.danku
+     python3 run_ner.py --model_name_or_path KoichiYasuoka/roberta-classical-chinese-large-char --train_file trainDanku.json --validation_file validDanku.json --output_dir roberta-classical-chinese-large-char.danku --do_train --do_eval
+fi
+
+nawk '
+{
+  if(NR%10>0)
+    printf("%s\n",$0)>"trainPOS.json";
+  else
+    printf("%s\n",$0)>"validPOS.json";
+}' traditionalPOS.json
+if [ ! -d sikubert.pos ]
+then mkdir -p sikubert.pos
+     python3 run_ner.py --model_name_or_path SIKU-BERT/sikubert --train_file trainPOS.json --validation_file validPOS.json --output_dir sikubert.pos --do_train --do_eval
+fi
+if [ ! -d sikuroberta.pos ]
+then mkdir -p sikuroberta.pos
+     python3 run_ner.py --model_name_or_path SIKU-BERT/sikuroberta --train_file trainPOS.json --validation_file validPOS.json --output_dir sikuroberta.pos --do_train --do_eval
+fi
+
+nawk '
+BEGIN{
+  f[0]="test.conllu";
+  f[1]="dev.conllu";
+  for(i=2;i<10;i++)
+    f[i]="train.conllu";
+}
+{
+  printf("%s\n",$0)>f[i%10];
+  if($0=="")
+    i++;
+}' traditional.conllu
+if [ ! -f sikubert.pos/sikubert.supar ]
+then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p sikubert.pos/sikubert.supar -c biaffine-dep-en -f bert --bert SIKU-BERT/sikubert --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+fi
+if [ ! -f sikuroberta.pos/sikuroberta.supar ]
+then python3 -m supar.cmds.biaffine_dep train -b -d 0 -p sikuroberta.pos/sikuroberta.supar -c biaffine-dep-en -f bert --bert SIKU-BERT/sikuroberta --train train.conllu --dev dev.conllu --test test.conllu --embed='' --proj
+fi
+
+nawk '
+{
+  if(NR%10>0)
+    printf("%s\n",$0)>"trainDanku.json";
+  else
+    printf("%s\n",$0)>"validDanku.json";
+}' traditionalDanku.json
+if [ ! -d sikubert.danku ]
+then mkdir -p sikubert.danku
+     python3 run_ner.py --model_name_or_path SIKU-BERT/sikubert --train_file trainDanku.json --validation_file validDanku.json --output_dir sikubert.danku --do_train --do_eval
+fi
+if [ ! -d sikuroberta.danku ]
+then mkdir -p sikuroberta.danku
+     python3 run_ner.py --model_name_or_path SIKU-BERT/sikuroberta --train_file trainDanku.json --validation_file validDanku.json --output_dir sikuroberta.danku --do_train --do_eval
+fi
+
+exit 0
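
mkmodel.sh derives two token-classification datasets from the lzh_kyoto.conllu treebank (once simplified, once in the original traditional characters) and trains each model family on them: a per-character POS set whose tags are the XPOS,UPOS,FEATS triples collected into labelPOS.txt, and a "danku" sentence-segmentation set, where a sentence of n characters is tagged S (n=1), B E (n=2), B E2 E (n=3), or B M ... M E3 E2 E (n>=4). The tagging branches in the script are equivalent to this small Python sketch (danku_tags is an illustrative name, not part of the repository):

def danku_tags(n):
    # sentence-boundary tags for a sentence of n characters,
    # mirroring the i==1 / i==2 / i==3 / else branches in mkmodel.sh
    if n == 1:
        return ["S"]
    if n == 2:
        return ["B", "E"]
    if n == 3:
        return ["B", "E2", "E"]
    return ["B"] + ["M"] * (n - 4) + ["E3", "E2", "E"]

print(danku_tags(6))  # ['B', 'M', 'M', 'E3', 'E2', 'E']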
suparkanbun/models/splitter.sh ADDED
@@ -0,0 +1,6 @@
+#! /bin/sh
+for F
+do split -a 2 -b 83886080 --numeric-suffixes=01 $F $F.
+   ls -1 $F.0[1-9] | sed 's/^\(.*\)0\([1-9]\)$/mv & \1\2/' | sh
+done
+exit 0
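
splitter.sh cuts each argument file into 83886080-byte (80 MiB) chunks named F.01, F.02, ..., then strips the leading zero from suffixes 01-09, leaving F.1 ... F.9, F.10, and so on. A hedged Python sketch of putting such chunks back together (the file name "model" is illustrative):

import glob, shutil

# rejoin chunks produced by splitter.sh: model.1, model.2, ..., model.10, ...
parts = sorted(glob.glob("model.[0-9]*"), key=lambda p: int(p.rsplit(".", 1)[1]))
with open("model", "wb") as out:
    for part in parts:
        with open(part, "rb") as f:
            shutil.copyfileobj(f, out)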