#! /bin/sh
#
# Prepare Thai Universal Dependencies data and run esupar training on it.
#
# Steps:
#   1. Clone the spaCy-Thai repository unless the corpus directory $D exists.
#   2. Split every "$D"/*-ud-*.conllu file into {train,dev,test}.upos and
#      {train,dev,test}.conllu in the current directory.
#   3. Run `python3 -m esupar.train` twice: first from $S to $T on
#      train.upos, then from $T to $T on the three .conllu files.
#
# Requires: git, awk (nawk preferred), python3 with the esupar module.

# Stop at the first failing step (failed clone, missing corpus files, failed
# training run) instead of continuing with missing or partial data.
set -eu

# Model identifier passed as the first argument of the first training run.
S=KoichiYasuoka/modernbert-base-thai-wikipedia
# Model identifier passed as the output of the first run and as both
# arguments of the second run.
T=KoichiYasuoka/modernbert-base-thai-wikipedia-upos
# Directory (inside the cloned repository) holding the *-ud-*.conllu files.
D=spaCy-Thai/UD_Thai-Corpora

test -d "$D" || git clone --depth=1 https://github.com/KoichiYasuoka/spaCy-Thai

# Keep using nawk where it is installed; fall back to plain awk elsewhere.
if command -v nawk >/dev/null 2>&1; then
  AWK=nawk
else
  AWK=awk
fi

"$AWK" '
BEGIN {
  FS = OFS = "\t";
}
{
  if (NF == 10 && $1 ~ /^[1-9][0-9]*$/) {
    # Token line: exactly 10 tab-separated fields and a plain positive
    # integer in the first field. Lines whose first field is anything else
    # (for example "1-2" or "1.1") are dropped.
    u = u $0 "\n";
  } else if ($0 ~ /^# text =/) {
    # Keep the "# text =" comment line; every other comment is dropped.
    u = u $0 "\n";
  } else if ($0 == "") {
    # A blank line ends the sentence accumulated in u.
    # The output split is chosen from the input file name.
    f = (FILENAME ~ /test/) ? "test" : (FILENAME ~ /dev/) ? "dev" : "train";
    # u already ends with a newline, so print adds the blank separator line.
    print u > (f ".upos");
    # Only sentences containing a token with head 0 and relation "root"
    # are also written to the .conllu file.
    if (u ~ /\t0\troot\t/)
      print u > (f ".conllu");
    u = "";
  }
}' "$D"/*-ud-*.conllu

# NOTE(review): the meaning of the positional arguments after the two model
# names (-1 and 32, /tmp and ///) is defined by esupar.train and is not
# visible here; they are passed through unchanged. Confirm before editing.
python3 -m esupar.train "$S" "$T" -1 /tmp train.upos
python3 -m esupar.train "$T" "$T" 32 /// train.conllu dev.conllu test.conllu
|