# Spaces: Runtime error
# (The line above appears to be a status banner captured from the hosting page; it is not part of the script.)
import os | |
from uuid import uuid4 | |
import pandas as pd | |
from datasets import load_dataset | |
import subprocess | |
from transformers import AutoTokenizer | |
### Read environment variables
# from dotenv import load_dotenv,find_dotenv
# load_dotenv(find_dotenv(),override=True)
### Functions
def max_token_len(dataset, tokenizer=None, text_column='text'):
    """Return the largest token count found in any row of ``dataset``.

    Args:
        dataset: Iterable of mapping-like rows, each containing ``text_column``.
        tokenizer: Callable that maps a string to a mapping holding an
            ``'input_ids'`` sequence. When omitted, the module-level
            ``tokenizer`` is used, which keeps existing one-argument calls
            working unchanged.
        text_column: Key of the field that is tokenized in every row.

    Returns:
        The maximum ``len(tokenizer(text)['input_ids'])`` over all rows, or
        ``0`` when ``dataset`` is empty.
    """
    def _token_count(row):
        # Resolve the fallback lazily: an empty dataset never needs a
        # tokenizer, exactly like the original loop.
        active = tokenizer if tokenizer is not None else globals()['tokenizer']
        return len(active(row[text_column])['input_ids'])

    return max(map(_token_count, dataset), default=0)
### Model details
# Alternative checkpoints, kept for quick switching:
# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
# model_name = 'distilbert-base-uncased'
model_name = 'mistralai/Mistral-7B-v0.1'

# Load the tokenizer that belongs to the chosen checkpoint and report the
# maximum sequence length it advertises.
# NOTE(review): some tokenizers report a very large placeholder value here;
# confirm that it is a meaningful limit for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)
### Repo name, dataset initialization, and data directory
# Fetch the dataset by name.
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)

# Directory that receives the CSV exports; created if it does not exist yet.
data_directory = './fine_tune_data/'
os.makedirs(data_directory, exist_ok=True)

# Train split: export the 'text' column to CSV, then measure its longest sample.
train_data = 'train_data'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(
    train_filename + '.csv',
    columns=['text'],
    index=False,
)
max_token_length_train = max_token_len(dataset['train'])
print('Max token length train: ' + str(max_token_length_train))

# Validation split: same export and measurement.
validation_data = 'validation_data'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(
    validation_filename + '.csv',
    columns=['text'],
    index=False,
)
max_token_length_validation = max_token_len(dataset['validation'])
print('Max token length validation: ' + str(max_token_length_validation))

# The longest sample across both splits must fit the limit reported by the
# tokenizer; abort before any training is started otherwise.
max_token_length = max(max_token_length_train, max_token_length_validation)
if max_token_length > model_max_length:
    raise ValueError("Maximum token length exceeds model limits.")

# NOTE(review): block size is twice the longest sample - confirm this is the
# intended sizing.
block_size = 2 * max_token_length
# Define project parameters
username = 'ai-aerospace'
# Each name gets its own freshly generated UUID suffix (two separate uuid4()
# calls, project first), so the two suffixes differ from each other.
project_name = f'./llms/ams_data_train-100_{uuid4()}'
repo_name = f'ams-data-train-100-{uuid4()}'
### Set training params
model_params = {
    # Project, model and data locations
    "project_name": project_name,
    "model_name": model_name,
    "repo_id": username + '/' + repo_name,
    "train_data": train_data,
    "validation_data": validation_data,
    "data_directory": data_directory,
    # Sequence sizing derived from the measured dataset
    "block_size": block_size,
    "model_max_length": max_token_length,
    # Logging, evaluation and checkpointing
    "logging_steps": -1,
    "evaluation_strategy": "epoch",
    "save_total_limit": 1,
    "save_strategy": "epoch",
    # Optimisation settings
    "mixed_precision": "fp16",
    "lr": 0.00003,
    "epochs": 3,
    "batch_size": 2,
    "warmup_ratio": 0.1,
    "gradient_accumulation": 1,
    "optimizer": "adamw_torch",
    "scheduler": "linear",
    "weight_decay": 0,
    "max_grad_norm": 1,
    "seed": 42,
    # Quantization and LoRA settings
    "quantization": "int4",
    "target_modules": "",
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
}

# Mirror every parameter into the process environment as a string.
os.environ.update({name: str(setting) for name, setting in model_params.items()})
### Feed into and run autotrain command
# Set .venv and execute the autotrain script
# To see all parameters: autotrain llm --help
# !autotrain llm --train --project_name my-llm --model TinyLlama/TinyLlama-1.1B-Chat-v0.1 --data_path . --use-peft --use_int4 --learning_rate 2e-4 --train_batch_size 6 --num_train_epochs 3 --trainer sft
#
# The command is built as an argument list and run without a shell. Compared
# with the earlier shell-string version this fixes:
#   * the missing line continuation after --token, which made the shell split
#     the command in two and try to execute "--block_size ..." on its own;
#   * --token receiving the literal text HUGGINGFACE_TOKEN instead of the
#     value of that environment variable;
#   * --num_train_epochs being fed the learning rate;
#   * --optimizer being fed the gradient accumulation count;
#   * --target_modules being emitted without a value when it is empty.

# The Hub token is read from the environment so it never lives in the source.
hub_token = os.environ.get('HUGGINGFACE_TOKEN')
if not hub_token:
    # Fail before training starts rather than at the final push to the Hub.
    raise RuntimeError(
        "HUGGINGFACE_TOKEN environment variable must be set to push the model to the Hub."
    )

command = [
    'autotrain', 'llm', '--train',
    '--trainer', 'sft',
    '--project_name', str(model_params['project_name']),
    '--model', str(model_params['model_name']),
    '--data_path', str(model_params['data_directory']),
    '--train_split', str(model_params['train_data']),
    '--valid_split', str(model_params['validation_data']),
    '--repo_id', str(model_params['repo_id']),
    '--push_to_hub',
    '--token', hub_token,
    '--block_size', str(model_params['block_size']),
    '--model_max_length', str(model_params['model_max_length']),
    '--logging_steps', str(model_params['logging_steps']),
    '--evaluation_strategy', str(model_params['evaluation_strategy']),
    '--save_total_limit', str(model_params['save_total_limit']),
    '--save_strategy', str(model_params['save_strategy']),
    '--fp16',
    '--lr', str(model_params['lr']),
    '--num_train_epochs', str(model_params['epochs']),
    '--batch_size', str(model_params['batch_size']),
    '--warmup_ratio', str(model_params['warmup_ratio']),
    '--gradient_accumulation', str(model_params['gradient_accumulation']),
    '--optimizer', str(model_params['optimizer']),
    '--scheduler', str(model_params['scheduler']),
    '--weight_decay', str(model_params['weight_decay']),
    '--max_grad_norm', str(model_params['max_grad_norm']),
    '--seed', str(model_params['seed']),
    '--use_int4',
    '--use-peft',
    '--lora_r', str(model_params['lora_r']),
    '--lora_alpha', str(model_params['lora_alpha']),
    '--lora_dropout', str(model_params['lora_dropout']),
]

# Only pass --target_modules when a value is configured; a flag without a
# value would otherwise consume the following option or be rejected.
if model_params['target_modules']:
    command += ['--target_modules', str(model_params['target_modules'])]

# Run autotrain directly (no shell); check=True raises CalledProcessError on
# a non-zero exit status.
subprocess.run(command, check=True)