import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)


def load_train_data(data_file: str = "datasets/Canstralian/ShellCommands.csv"):
    """Load the training CSV as a dataset collection with a ``train`` split.

    Args:
        data_file: Location of the CSV file to load. Defaults to the
            shell-command dataset this script was written for.

    Returns:
        The object returned by ``load_dataset``; the CSV rows are exposed
        under the ``"train"`` key.
    """
    # The CSV builder is told explicitly which split the file belongs to,
    # so callers index the result with ["train"].
    return load_dataset("csv", data_files={"train": data_file})


def load_model_and_tokenizer(model_name: str, num_labels: int = 2):
    """Load a sequence-classification model and its matching tokenizer.

    Args:
        model_name: Identifier or path of the pretrained checkpoint, passed
            unchanged to both ``from_pretrained`` calls.
        num_labels: Number of output classes for the classification head.
            Defaults to 2, the value this script originally hard-coded.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


def preprocess_function(examples, tokenizer, text_column="text"):
    """Tokenize one batch of examples.

    Args:
        examples: Mapping from column name to the batch of values for that
            column.
        tokenizer: Callable applied to the text batch; it is invoked with
            ``padding=True`` and ``truncation=True``.
        text_column: Key in ``examples`` holding the raw text. Defaults to
            ``"text"``.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(examples[text_column], padding=True, truncation=True)


def fine_tune(model_name="WhiteRabbitNeo/WhiteRabbitNeo-13B-v1"):
    """Fine-tune a sequence-classification model on the training CSV.

    Loads the dataset and the model/tokenizer pair, tokenizes the ``text``
    column, and runs a three-epoch ``Trainer`` job that writes its output
    to ``./results``.

    Args:
        model_name: Identifier or path of the pretrained checkpoint to
            fine-tune.

    Note:
        The dataset is assumed to provide ``text`` and ``labels`` columns;
        that schema is not checked here -- TODO confirm against the CSV.
    """
    train_data = load_train_data()
    model, tokenizer = load_model_and_tokenizer(model_name)

    # Padding requires a pad token, and some tokenizers ship without one.
    # Fall back to the EOS token and mirror the id into the model config so
    # the classification head can locate padded positions.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    train_data = train_data.map(
        lambda batch: preprocess_function(batch, tokenizer),
        batched=True,
    )
    train_data.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )

    # No evaluation strategy is set: this script supplies no evaluation
    # dataset, so requesting per-epoch evaluation would make Trainer fail.
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data['train'],
        # map(batched=True) tokenizes chunk by chunk and padding=True pads
        # each chunk only to its own longest row, so rows from different
        # chunks can differ in length. Re-pad per training batch so they
        # can always be stacked.
        data_collator=DataCollatorWithPadding(tokenizer),
    )
    trainer.train()


if __name__ == "__main__":
    # Start training only when run as a script; importing this module for
    # its helpers must not kick off a fine-tuning job.
    fine_tune()