#! /usr/bin/python3 src="jerteh/gpt2-vrabac" tgt="KoichiYasuoka/gpt2-small-serbian-upos" import os from transformers import AutoTokenizer,AutoConfig,GPT2ForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer from tokenizers.pre_tokenizers import Sequence,Punctuation for d in ["UD_Serbian-SET","UD_Croatian-SET"]: os.system("test -d "+d+" || git clone --depth=1 https://github.com/UniversalDependencies/"+d) os.system("for F in train dev test ; do cat UD_*-SET/*-$F.conllu > $F.conllu ; done") class UPOSFileDataset(object): def __init__(self,conllu,tokenizer): self.conllu=open(conllu,"r",encoding="utf-8") self.tokenizer=tokenizer self.seeks=[0] label=set(["SYM"]) s=self.conllu.readline() while s!="": if s=="\n": self.seeks.append(self.conllu.tell()) else: w=s.split("\t") if len(w)==10: if w[0].isdecimal(): label.add(w[3] if w[5]=="_" else w[3]+"|"+w[5]) s=self.conllu.readline() lid={} for i,l in enumerate(sorted(label)): lid[l],lid["B-"+l],lid["I-"+l]=i*3,i*3+1,i*3+2 self.label2id=lid def __call__(*args): lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))} for t in args: t.label2id=lid return lid def __del__(self): self.conllu.close() __len__=lambda self:len(self.seeks)-1 def __getitem__(self,i): self.conllu.seek(self.seeks[i]) form,upos,sp=[],[],False while self.conllu.tell()",pad_token="",sep_token="",unk_token="",mask_token="",bos_token="",eos_token="",model_max_length=1024) tkz.backend_tokenizer.pre_tokenizer=Sequence([Punctuation(),tkz.backend_tokenizer.pre_tokenizer]) trainDS=UPOSFileDataset("train.conllu",tkz) devDS=UPOSFileDataset("dev.conllu",tkz) testDS=UPOSFileDataset("test.conllu",tkz) lid=trainDS(devDS,testDS) cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True) 
# Fine-tuning configuration: 3 epochs, 24 sequences per device, with
# checkpoints written under `tgt` (any existing output directory is
# overwritten).  save_safetensors=False is passed through unchanged.
arg = TrainingArguments(
    output_dir=tgt,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=24,
    learning_rate=5e-05,
    warmup_ratio=0.1,
    save_total_limit=2,
    save_safetensors=False,
)

# Batches are assembled by the token-classification collator built around
# the tokenizer prepared earlier in the script.
collator = DataCollatorForTokenClassification(tkz)

# Load the pretrained `src` checkpoint under the label-aware config `cfg`.
# ignore_mismatched_sizes=True is forwarded so loading tolerates weight
# shapes in the checkpoint that differ from what `cfg` requests.
model = GPT2ForTokenClassification.from_pretrained(
    src,
    config=cfg,
    ignore_mismatched_sizes=True,
)

# Only the training split is handed to the Trainer; no eval dataset is set.
trn = Trainer(
    model=model,
    args=arg,
    data_collator=collator,
    train_dataset=trainDS,
)

# Run training, then store both the model and the tokenizer under `tgt`.
trn.train()
trn.save_model(tgt)
tkz.save_pretrained(tgt)