# dataset_preparation_sequences.py import os import numpy as np from sklearn.model_selection import train_test_split from tensorflow.keras.utils import to_categorical import pickle def load_sequences(preprocessed_dir='preprocessed_sequences'): """ Loads preprocessed sequences and their labels. Args: preprocessed_dir (str): Directory containing preprocessed sequences. Returns: tuple: Lists of sequences and labels, label mapping dictionary. """ X = [] y = [] label_map = {} classes = sorted(os.listdir(preprocessed_dir)) for idx, cls in enumerate(classes): label_map[cls] = idx cls_path = os.path.join(preprocessed_dir, cls) if not os.path.isdir(cls_path): continue sequence_files = [f for f in os.listdir(cls_path) if f.endswith('.npy')] for seq_file in sequence_files: seq_path = os.path.join(cls_path, seq_file) sequence = np.load(seq_path) X.append(sequence) y.append(idx) # X remains a list of numpy arrays with varying shapes y = np.array(y) y = to_categorical(y, num_classes=len(label_map)) return X, y, label_map def pad_sequences_fixed(X, max_seq_length): """ Pads or truncates sequences to a fixed length. Args: X (list of numpy.ndarray): List of sequences with shape (frames, height, width, channels). max_seq_length (int): Desired sequence length. Returns: numpy.ndarray: Padded/truncated sequences. """ padded_X = [] for seq in X: if seq.shape[0] < max_seq_length: pad_width = max_seq_length - seq.shape[0] padding = np.zeros((pad_width, *seq.shape[1:]), dtype=seq.dtype) padded_seq = np.concatenate((seq, padding), axis=0) else: padded_seq = seq[:max_seq_length] padded_X.append(padded_seq) return np.array(padded_X) def save_dataset(X_train, X_test, y_train, y_test, label_map, output_path='dataset_sequences.pkl'): """ Saves the dataset into a pickle file. Args: X_train, X_test, y_train, y_test: Split data. label_map (dict): Mapping from class names to indices. output_path (str): Path to save the pickle file. """ with open(output_path, 'wb') as f: pickle.dump({ 'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test, 'label_map': label_map }, f) print(f"Dataset saved to {output_path}.") def load_dataset_pickle(pickle_path='dataset_sequences.pkl'): """ Loads the dataset from a pickle file. Args: pickle_path (str): Path to the pickle file. Returns: tuple: Split data and label mapping. """ with open(pickle_path, 'rb') as f: data = pickle.load(f) return data['X_train'], data['X_test'], data['y_train'], data['y_test'], data['label_map'] if __name__ == "__main__": # Load sequences X, y, label_map = load_sequences(preprocessed_dir='preprocessed_sequences') print(f"Total samples: {len(X)}") # Find the maximum sequence length for padding max_seq_length = max([seq.shape[0] for seq in X]) print(f"Maximum sequence length: {max_seq_length}") # Pad sequences to have the same length X_padded = pad_sequences_fixed(X, max_seq_length) print(f"Padded sequences shape: {X_padded.shape}") # Split into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42) print(f"Training samples: {X_train.shape[0]}") print(f"Testing samples: {X_test.shape[0]}") # Save the dataset save_dataset(X_train, X_test, y_train, y_test, label_map, output_path='dataset_sequences.pkl')