import json

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Load configuration
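# (assumed layout of config.json, inferred from the keys read below:
#  model_name, num_labels, learning_rate, batch_size, num_epochs, e.g.
#  {"model_name": "bert-base-uncased", "num_labels": 2,
#   "learning_rate": 2e-5, "batch_size": 16, "num_epochs": 3})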
with open('../config/config.json') as f:
    config = json.load(f)

# Load dataset
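# (the CSVs are assumed to have a 'text' column, tokenized below, and an
#  integer 'label' column that the Trainer uses as the target)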
dataset = load_dataset('csv', data_files={'train': '../data/train.csv', 'validation': '../data/valid.csv'})

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(config['model_name'], num_labels=config['num_labels'])
tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
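# Note: padding="max_length" pads every example to the model's maximum input
# length; dynamic per-batch padding via transformers.DataCollatorWithPadding
# is a common lighter-weight alternative.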

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=config['learning_rate'],
    per_device_train_batch_size=config['batch_size'],
    num_train_epochs=config['num_epochs'],
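    # renamed to eval_strategy in newer transformers releases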
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
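    # passing the tokenizer lets save_model() also export the tokenizer files;
    # recent transformers versions name this argument processing_class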
    tokenizer=tokenizer
)

trainer.train()
trainer.save_model('../model')