"""Streamlit app for fine-tuning a causal language model on a Hugging Face dataset."""

import streamlit as st
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
import os

# Set up Streamlit app
st.title("Train Your Custom Model with Hugging Face Transformers")

# Display instructions
st.write("Upload your training data and train the model using a custom dataset.")

# Initialize all pipeline state to None so the guard conditions below
# never raise NameError when an earlier stage is skipped or fails.
dataset = None
tokenizer = None
model = None
tokenized_dataset = None

# --- Dataset loading ---------------------------------------------------------
st.sidebar.header("Dataset")
dataset_name = st.sidebar.text_input(
    "Enter Dataset Name (e.g., 'mjpsm/CodingInColor')", "mjpsm/CodingInColor"
)
if dataset_name:
    st.sidebar.text("Loading dataset...")
    try:
        dataset = load_dataset(dataset_name)
        st.sidebar.text("Dataset loaded successfully!")
    except Exception as e:
        st.error(f"Failed to load dataset: {e}")
        dataset = None

# Display a sample from the dataset so the user can verify the expected
# 'Question' / 'Answer' columns are present.
if dataset and 'train' in dataset:
    st.subheader("Sample Data")
    st.write(dataset['train'][0])

# --- Model and tokenizer selection ------------------------------------------
st.sidebar.header("Model")
model_name = st.sidebar.text_input(
    "Enter Model Name (e.g., 'mjpsm/CodingInColor')", "mjpsm/CodingInColor"
)
if model_name:
    st.sidebar.text("Loading model...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # GPT-style causal LMs often ship without a pad token; reuse EOS so
        # that padding="max_length" tokenization below works.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name)
        st.sidebar.text("Model loaded successfully!")
    except Exception as e:
        st.error(f"Failed to load model: {e}")
        model = None
        tokenizer = None

# --- Preprocessing -----------------------------------------------------------
if dataset and tokenizer:
    def preprocess_function(examples):
        """Tokenize Question/Answer pairs and attach causal-LM labels.

        Trainer needs a 'labels' field to compute a loss; for causal LM
        fine-tuning the labels are simply a copy of the input ids.
        """
        tokenized = tokenizer(
            examples['Question'],
            examples['Answer'],
            padding="max_length",
            truncation=True,
        )
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    st.sidebar.text("Preprocessing data...")
    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    st.sidebar.text("Data preprocessed!")

# --- Training ----------------------------------------------------------------
if model and tokenized_dataset:
    st.sidebar.header("Training Configuration")
    num_epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
    batch_size = st.sidebar.slider("Batch Size", 1, 32, 8)

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        # No eval_dataset is passed to Trainer below, so per-epoch evaluation
        # must stay disabled — evaluation_strategy="epoch" without an eval
        # dataset makes Trainer raise a ValueError at construction time.
        evaluation_strategy="no",
        save_strategy="epoch",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        # If the dataset gains a test split, pass
        # eval_dataset=tokenized_dataset["test"] and switch
        # evaluation_strategy back to "epoch".
    )

    if st.sidebar.button("Start Training"):
        st.write("Training in progress...")
        trainer.train()
        st.write("Training complete!")

        # Persist the fine-tuned artifacts locally, then publish to the Hub.
        model.save_pretrained("./trained_model")
        tokenizer.save_pretrained("./trained_model")
        st.write("Model and tokenizer saved!")

        st.write("Pushing model to Hugging Face Hub...")
        model.push_to_hub(model_name)
        tokenizer.push_to_hub(model_name)
        st.write("Model pushed to Hugging Face Hub!")