InnerI committed
Commit
58e0a19
1 Parent(s): fbaad9e

Create train.py

Files changed (1)
  1. train.py +56 -0
train.py ADDED
@@ -0,0 +1,56 @@
+ from datasets import load_dataset
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
+
+ # Load the tokenizer and set the padding token
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')  # Load the GPT-2 tokenizer
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token  # GPT-2 defines no pad token, so fall back to the EOS token
+
+ # Tokenization function with padding and truncation
+ def tokenize_function(examples):
+     return tokenizer(
+         examples['Question'],  # Text column name in the dataset
+         padding='max_length',  # Pad every example to the same length
+         truncation=True,       # Truncate longer examples
+         max_length=128         # Maximum sequence length
+     )
+
+ # Load the dataset
+ dataset = load_dataset('InnerI/synCAI_144kda')
+
+ # Tokenize the dataset with batched processing
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+ # Load the model
+ model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
+
+ # Data collator for causal language modeling
+ data_collator = DataCollatorForLanguageModeling(
+     tokenizer=tokenizer,
+     mlm=False  # False = standard (causal) language modeling, not masked
+ )
+
+ # Training arguments: output directory, checkpointing, and batch settings
+ training_args = TrainingArguments(
+     output_dir="InnerI/synCAI-144k-gpt2.5",  # Checkpoint directory
+     overwrite_output_dir=True,
+     num_train_epochs=1,               # Number of training epochs
+     per_device_train_batch_size=4,    # Batch size per device
+     save_steps=10_000,                # Save a checkpoint every 10,000 steps
+     save_total_limit=2,               # Keep at most two checkpoints
+     prediction_loss_only=True,        # Record only the loss during training
+ )
+
+ # Initialize the Trainer with the model, arguments, collator, and train split
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     data_collator=data_collator,
+     train_dataset=tokenized_datasets['train'],
+ )
+
+ # Start training the model
+ trainer.train()
+
+ # Save the fine-tuned model to the specified output directory
+ trainer.save_model("CAI-gpt2.5")
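
A quick way to sanity-check the result after training is to reload the saved weights and generate a completion. The sketch below is not part of the commit: the prompt text is hypothetical, and because the script does not pass the tokenizer to Trainer, save_model() stores only the model weights, so the base tokenizer is reloaded separately.

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Reload the fine-tuned weights from the directory written by trainer.save_model()
model = GPT2LMHeadModel.from_pretrained("CAI-gpt2.5")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")  # tokenizer was not saved, so reuse the base one

prompt = "Question: What is consciousness?"  # hypothetical prompt; match the dataset's 'Question' format
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))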