KoichiYasuoka's picture
bug fix
8f85b68
raw
history blame contribute delete
649 Bytes
#! /bin/sh
# Converts the CoNLL-U files of the spaCy-Thai repository into training
# files and runs esupar.train on them.
#
# S and T are handed to esupar.train further down as its first and second
# argument (presumably the source and the target model id -- confirm against
# the esupar documentation).
S=KoichiYasuoka/modernbert-base-thai-wikipedia
T=KoichiYasuoka/modernbert-base-thai-wikipedia-upos
# Directory whose *-ud-*.conllu files are read by the awk step.
D=spaCy-Thai/UD_Thai-Corpora
# Clone the corpus only when the directory is not there yet.  A failed clone
# ends the script with the status of git: every later step needs the files
# under $D, so continuing would only produce confusing follow-up errors.
test -d "$D" || git clone --depth=1 https://github.com/KoichiYasuoka/spaCy-Thai || exit
# Split the corpus files under $D into sentence blocks and write them to
#   {train,dev,test}.upos    every sentence
#   {train,dev,test}.conllu  only sentences that contain a "0 root" token
# The split is taken from the input file name ("test", then "dev", otherwise
# "train").  Only "# text =" comments and token lines whose ID is a plain
# positive integer are kept; all other lines are discarded.
#
# nawk is preferred; plain awk is used where nawk is not installed.
if command -v nawk >/dev/null 2>&1; then
  AWK=nawk
else
  AWK=awk
fi
"$AWK" '
BEGIN {
  FS = OFS = "\t";
}

# Map an input file name to the split it belongs to.
function split_of(name) {
  if (name ~ /test/)
    return "test";
  if (name ~ /dev/)
    return "dev";
  return "train";
}

# Write the buffered sentence u for the file called name, then clear it.
# u already ends with a newline, so the record separator added by print
# yields the blank line that terminates a sentence.
function emit(name,   f) {
  if (u == "")
    return;                       # nothing buffered: do not write empty records
  f = split_of(name);
  # The target is parenthesized because an unparenthesized concatenation
  # after ">" is parsed differently by different awk implementations.
  print u > (f ".upos");
  if (u ~ /\t0\troot\t/)
    print u > (f ".conllu");
  u = "";
}

# A new input file starts: flush a sentence the previous file left
# unterminated, so it neither gets lost nor merges with the next one.
FNR == 1 && NR > 1 {
  emit(prev);
}

{
  prev = FILENAME;
}

# Token line: ten tab-separated fields and an integer ID (this skips
# ranges such as 1-2 and decimal IDs such as 1.1).
NF == 10 && $1 ~ /^[1-9][0-9]*$/ {
  u = u $0 "\n";
  next;
}

/^# text =/ {
  u = u $0 "\n";
  next;
}

# A blank line ends the current sentence.
$0 == "" {
  emit(FILENAME);
}

# Flush a last sentence that has no trailing blank line.
END {
  emit(prev);
}' "$D"/*-ud-*.conllu
# First run: esupar.train gets $S, $T, -1, /tmp and train.upos.
# NOTE(review): the meaning of "-1" and "/tmp" is defined by esupar.train and
# is not visible here (presumably a batch-size/mode value and a working
# directory) -- confirm against the esupar documentation.
# The second run takes $T as its input, so stop with the status of the first
# run when that one fails instead of continuing on a missing or stale model.
python3 -m esupar.train "$S" "$T" -1 /tmp train.upos || exit
# Second run: $T is both the first and the second argument, followed by 32,
# "///" and the train/dev/test .conllu files.
# NOTE(review): "32" and "///" are likewise interpreted by esupar.train --
# confirm their meaning before changing them.
python3 -m esupar.train "$T" "$T" 32 /// train.conllu dev.conllu test.conllu