| |
| import numpy as np |
| import pandas as pd |
| import re |
| from sklearn.preprocessing import LabelEncoder |
| from sklearn.model_selection import train_test_split |
| from tensorflow.keras.preprocessing.text import Tokenizer |
| from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
|
| |
def read_data(path):
    """Load a CSV file into a DataFrame.

    Prints a diagnostic and returns None on any failure (missing file,
    empty file, parse error); returns the DataFrame otherwise.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File not found at: {path}")
        return None
    except pd.errors.EmptyDataError:
        # A zero-byte CSV raises before a DataFrame exists, so the
        # df.empty check below would never run; report it as empty here
        # instead of falling into the generic error message.
        print("The file is empty.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    if df.empty:
        print("The file is empty.")
        return None
    return df
|
|
| |
def clean_text(text):
    """Normalise a raw description: lowercase it, blank out digit runs and
    punctuation, then trim surrounding whitespace.

    Note: interior whitespace is not collapsed; downstream whitespace
    tokenisation is unaffected by that.
    """
    lowered = text.lower()
    without_digits = re.sub(r"\d+", " ", lowered)
    words_only = re.sub(r"[^\w\s]", " ", without_digits)
    return words_only.strip()
|
|
| |
def preprocess_data(file_path, max_len=10, vocab_size=250,
                    text_column='Transaction Description',
                    label_column='Category'):
    """Load, clean, tokenize and label-encode the transaction dataset.

    Parameters
    ----------
    file_path : str
        Path to the CSV file read via ``read_data``.
    max_len : int
        Length every token sequence is padded/truncated to.
    vocab_size : int
        Maximum vocabulary size kept by the tokenizer.
    text_column : str
        Column holding the raw transaction descriptions.
    label_column : str
        Column holding the category labels.

    Returns
    -------
    tuple
        ``(padded_sequences, labels, tokenizer, label_encoder)``, or
        ``(None, None, None, None)`` when loading fails or a required
        column is missing.
    """
    df = read_data(file_path)
    if df is None:
        print("Data loading failed.")
        return None, None, None, None

    # Fail early — in the same print-and-return-None style as read_data —
    # rather than letting pandas raise a bare KeyError on column access.
    missing = [c for c in (text_column, label_column) if c not in df.columns]
    if missing:
        print(f"Missing required column(s): {missing}")
        return None, None, None, None

    # Normalise the free-text descriptions before tokenizing.
    df[text_column] = df[text_column].apply(clean_text)

    # Fit the tokenizer on the cleaned corpus; out-of-vocabulary words map
    # to the "<OOV>" token instead of being silently dropped.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(df[text_column])

    # Convert each description to integer ids, then pad/truncate at the
    # end ('post') so all sequences share length max_len.
    sequences = tokenizer.texts_to_sequences(df[text_column])
    padded_sequences = pad_sequences(sequences, maxlen=max_len,
                                     padding='post', truncating='post')

    # Encode the string categories as integer class ids.
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df[label_column])

    return padded_sequences, labels, tokenizer, label_encoder
|
|
| |
def split_data(sequences, labels, test_size=0.2, random_state=42):
    """Partition features and labels into train/test subsets.

    A fixed ``random_state`` keeps the split reproducible across runs.
    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    train_x, test_x, train_y, test_y = train_test_split(
        sequences,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    return train_x, test_x, train_y, test_y
|
|
| |
def main():
    """Run the preprocessing pipeline end to end and report the shapes."""
    # Hard-coded dataset location (Windows path).
    data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"

    sequences, labels, tokenizer, label_encoder = preprocess_data(data_path)

    # Guard clause: bail out early when preprocessing returned nothing.
    if sequences is None:
        print("Data preprocessing failed.")
        return

    print("Data preprocessing successful!")
    X_train, X_test, y_train, y_test = split_data(sequences, labels)
    print(f"Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}")
    print(f"Testing data shape: {X_test.shape}, Testing labels shape: {y_test.shape}")


if __name__ == "__main__":
    main()
|
|