Spaces:

patrol114
/

mistralai-Mistral-Nemo-Instruct-2407

Runtime error

File size: 17,473 Bytes

import os
import pickle
import re
import nltk
import psutil
import numpy as np
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Layer, Bidirectional, Dense, LayerNormalization, Dropout, Embedding, LSTM, Conv1D, MaxPooling1D, BatchNormalization, GRU, MultiHeadAttention
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
from sklearn.utils import shuffle
from typing import List, Optional, Set
from gensim.models import KeyedVectors
from pathlib import Path
import tempfile
import zipfile
import requests
from transformers import AutoTokenizer, AutoModel
import random

# Environment configuration (runs at import time).
# Enable on-demand GPU memory allocation for every visible GPU.
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("Dynamiczne zarządzanie pamięcią ustawione dla wszystkich GPU.")
    except RuntimeError as e:
        # Reported instead of raised so the script keeps going.
        print(f"Błąd podczas ustawiania dynamicznego zarządzania pamięcią: {e}")

# NOTE(review): this is set after `import tensorflow`; the variable usually has
# to be set before the import to affect TensorFlow's native logging — confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
tf.keras.mixed_precision.set_global_policy('float32')
# NLTK resources used by word_tokenize, WordNetLemmatizer and stopwords below.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Output directory used by TextProcessor.save_model_and_tokenizer.
ZAPISZ_KATALOG = "mozgi"
# Base directory for TensorBoard logs in TextProcessor.create_and_train_model.
KATALOG_LOGOW = "logs"
# NOTE(review): `directory` and `log_dir` are not referenced by any code visible
# in this file (main() defines its own local `directory`) — possibly unused.
directory = "test"
log_dir = Path('logs')
tf.keras.backend.clear_session()
# Shared lemmatizer instance used by TextProcessor.preprocess_text.
lemmatizer = WordNetLemmatizer()
# NOTE(review): TextProcessor builds its own stop-word set in __init__; this
# module-level set does not appear to be read anywhere in the visible code.
stop_words = set(stopwords.words('english'))

class TextProcessor:
    class PositionalEncoding(Layer):
        """Add a sinusoidal positional encoding to the input tensor.

        The encoding is built from the sines of the even-indexed angle columns
        followed by the cosines of the odd-indexed ones (concatenated, not
        interleaved) and is broadcast-added to ``inputs``.

        Args:
            d_model: Size of the last (feature) axis the encoding is built for.
            **kwargs: Forwarded to ``Layer``.
        """

        def __init__(self, d_model, **kwargs):
            super().__init__(**kwargs)
            self.d_model = d_model

        def get_angles(self, position, i):
            """Return ``position / 10000 ** (2 * (i // 2) / d_model)``."""
            angles = 1 / np.power(10000, (2 * (i // 2)) / np.float32(self.d_model))
            return position * angles

        def call(self, inputs):
            """Return ``inputs`` with the positional encoding added.

            The sequence length is read from axis 1 of ``inputs`` at run time
            and handled with TensorFlow ops only, so the layer also works when
            that length is a symbolic tensor (graph mode); feeding a symbolic
            length to ``np.arange`` fails there.
            """
            seq_len = tf.shape(inputs)[1]
            positions = tf.cast(tf.range(seq_len), tf.float32)[:, tf.newaxis]

            # The per-feature rates depend only on d_model, so they can be
            # computed with NumPy once per call and embedded as a constant.
            rates = self.get_angles(
                position=np.float32(1.0),
                i=np.arange(self.d_model)[np.newaxis, :]
            )
            angle_rads = positions * tf.constant(rates, dtype=tf.float32)

            sines = tf.sin(angle_rads[:, 0::2])
            cosines = tf.cos(angle_rads[:, 1::2])
            pos_encoding = tf.concat([sines, cosines], axis=-1)
            return inputs + tf.cast(pos_encoding, inputs.dtype)

        def get_config(self):
            """Include ``d_model`` so the layer can be re-created from config."""
            config = super().get_config()
            config.update({"d_model": self.d_model})
            return config

    class WrappedMultiHeadAttention(Layer):
        """Self-attention layer: a ``MultiHeadAttention`` fed the same tensor twice.

        Args:
            num_heads: Number of attention heads.
            d_model: Passed to ``MultiHeadAttention`` as ``key_dim``.
            rate: Dropout rate passed to ``MultiHeadAttention``.
            **kwargs: Forwarded to ``Layer``.
        """

        def __init__(self, num_heads, d_model, rate=0.2, **kwargs):
            super().__init__(**kwargs)
            attention_options = {
                "num_heads": num_heads,
                "key_dim": d_model,
                "dropout": rate,
            }
            self.attention = MultiHeadAttention(**attention_options)

        def call(self, inputs):
            """Attend from ``inputs`` to ``inputs`` and return the result."""
            query = value = inputs
            return self.attention(query, value)

    class TransformerBlock(Layer):
        """Positional encoding, self-attention and feed-forward with residuals.

        Args:
            num_heads: Number of attention heads.
            d_model: Feature size; also the output width of the feed-forward net.
            dff: Hidden width of the feed-forward net.
            rate: Dropout rate used in attention and after both sub-layers.
            **kwargs: Forwarded to ``Layer``.
        """

        def __init__(self, num_heads, d_model, dff, rate=0.2, **kwargs):
            super().__init__(**kwargs)
            self.attention = TextProcessor.WrappedMultiHeadAttention(num_heads, d_model, rate)
            self.ffn = Sequential([
                Dense(dff, activation='relu'),
                Dense(d_model)
            ])
            self.layernorm1 = LayerNormalization(epsilon=1e-6)
            self.layernorm2 = LayerNormalization(epsilon=1e-6)
            self.dropout1 = Dropout(rate)
            self.dropout2 = Dropout(rate)
            self.pos_encoding = TextProcessor.PositionalEncoding(d_model)

        def call(self, inputs, training=None):
            """Run the block on ``inputs``.

            Args:
                inputs: Tensor to transform.
                training: Forwarded to both dropout layers. Defaults to
                    ``None`` so the method can be called without the argument.

            Returns:
                The layer-normalised output of the second residual connection.
            """
            inputs = self.pos_encoding(inputs)
            attn_output = self.attention(inputs)
            attn_output = self.dropout1(attn_output, training=training)
            # First residual connection: attention output added to its input.
            out1 = self.layernorm1(inputs + attn_output)
            ffn_output = self.ffn(out1)
            ffn_output = self.dropout2(ffn_output, training=training)
            # Second residual connection around the feed-forward net.
            return self.layernorm2(out1 + ffn_output)

    class TextGenerationCallback(tf.keras.callbacks.Callback):
        """Print a sampled continuation of a seed prompt every few epochs.

        Args:
            tokenizer: Object providing ``encode(text, return_tensors='tf')``
                and ``decode(ids)``.
            input_sequence_length: Number of tokens to sample per generated text.
            model_name: Name shown in the printed message.
            model: Model called on the encoded prompt.
            temperature: Divisor applied to the logits before sampling.
        """

        def __init__(self, tokenizer, input_sequence_length, model_name, model, temperature=1.0):
            super().__init__()
            self.tokenizer = tokenizer
            self.input_sequence_length = input_sequence_length
            self.model_name = model_name
            # Use the Callback API rather than assigning `self.model`: in newer
            # Keras releases `model` is a property without a setter.
            self.set_model(model)
            self.temperature = temperature
            self.generated_text_interval = 5
            self.seed_texts = ["Dlaczego Python jest popularny?", "Co to jest AI?", "Wyjaśnij sieci neuronowe", "Dlaczego dane są ważne?"]
            self.current_seed_text_index = 0

        def on_epoch_end(self, epoch, logs=None):
            """Generate and print text on every ``generated_text_interval``-th epoch."""
            if epoch % self.generated_text_interval == 0:
                seed_text = self.seed_texts[self.current_seed_text_index]
                # Rotate through the seed prompts.
                self.current_seed_text_index = (self.current_seed_text_index + 1) % len(self.seed_texts)
                generated_text = self.generate_text(seed_text, self.temperature, self.input_sequence_length)
                print(f"\nWygenerowany tekst z modelu '{self.model_name}' po epoce {epoch + 1}:\n{generated_text}\n")

        def generate_text(self, seed_text, temperature=1.0, num_words=50):
            """Sample ``num_words`` tokens, feeding each one back into the prompt.

            Args:
                seed_text: Prompt to continue.
                temperature: Positive divisor applied to the last-step logits.
                num_words: Number of tokens to sample.

            Returns:
                The decoded sampled tokens joined with single spaces.

            Raises:
                ValueError: If ``temperature`` is not greater than zero.
            """
            if temperature <= 0:
                # Dividing the logits by zero or a negative value would make
                # the sampling distribution meaningless.
                raise ValueError("Parametr temperature musi być większy od zera.")

            # NOTE(review): assumes self.model returns an object exposing
            # `.logits` shaped (batch, seq, vocab) — TODO confirm against the
            # model actually passed to this callback.
            result = []
            for _ in range(num_words):
                encoded_text = self.tokenizer.encode(seed_text, return_tensors='tf')
                predictions = self.model(encoded_text)
                predictions = predictions.logits[:, -1, :] / temperature
                predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
                # Decode once and reuse it for both the prompt and the result.
                piece = self.tokenizer.decode([predicted_id])
                seed_text += piece
                result.append(piece)
            return ' '.join(result)

    def __init__(
        self,
        directory: str,
        oov_token: str = '<OOV>',
        glove_file: str = None,
        gpt2_model_dir: str = 'gpt2',
        model_name: str = 'gpt2',
        input_sequence_length: int = 100,
        output_sequence_length: int = 100,
        batch_size: int = 32,
        lowercase: bool = False,
        handle_numbers: bool = True,
        handle_special_characters: bool = False,
        handle_stop_words: bool = True,
        lemmatize: bool = True,
        handle_python_code: bool = True,
        lstm_units: int = 128,
        dropout_rate: float = 0.2,
        epochs: int = 100,
        learning_rate: float = 0.00001,
        amsgrad: bool = True,
        kernel_regularizer: float = 0.001,
        recurrent_regularizer: float = 0.001,
        bias_regularizer: float = 0.001,
        num_difficult_sequences: int = 50,
        stop_words: Optional[Set[str]] = None,
        log_dir: Optional[str] = 'logs',
    ):
        """Store the configuration and load the GloVe and GPT-2 resources.

        Every argument is stored on the instance under the same name
        (``gpt2_model_dir`` is wrapped in ``Path``). When ``stop_words`` is
        ``None`` the NLTK English stop-word list is used. The constructor
        finishes by calling ``load_models()``.
        """
        # Data locations and model identity.
        self.directory = directory
        self.glove_file = glove_file
        self.gpt2_model_dir = Path(gpt2_model_dir)
        self.model_name = model_name
        self.log_dir = log_dir

        # Text preprocessing switches.
        self.oov_token = oov_token
        self.lowercase = lowercase
        self.handle_numbers = handle_numbers
        self.handle_special_characters = handle_special_characters
        self.handle_stop_words = handle_stop_words
        self.lemmatize = lemmatize
        self.handle_python_code = handle_python_code
        if stop_words is None:
            stop_words = set(stopwords.words('english'))
        self.stop_words = stop_words

        # Sequence shape and training hyperparameters.
        self.input_sequence_length = input_sequence_length
        self.output_sequence_length = output_sequence_length
        self.batch_size = batch_size
        self.lstm_units = lstm_units
        self.dropout_rate = dropout_rate
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.amsgrad = amsgrad
        self.kernel_regularizer = kernel_regularizer
        self.recurrent_regularizer = recurrent_regularizer
        self.bias_regularizer = bias_regularizer
        self.num_difficult_sequences = num_difficult_sequences

        # State filled in later by the loading and training methods.
        self.tokenizer = None
        self.embedding_matrix = None
        self.vocab_size = 0
        self.model = None
        self.processed_texts = []
        self.glove_model = None
        self.gpt2_model = None
        self.gpt2_tokenizer = None

        self.load_models()

    def create_tokenizer(self, texts: List[str]) -> None:
        if not texts:
            raise ValueError("Lista tekstów jest pusta lub None.")

        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        print("Tokenizacja zakończona. Liczba unikalnych tokenów:", len(self.tokenizer.get_vocab()))

    def load_models(self):
        """Load the GloVe vectors, then obtain the GPT-2 model and tokenizer.

        If ``gpt2_model_dir`` does not exist, the model and tokenizer named by
        ``model_name`` are fetched and saved into that directory; otherwise
        ``load_gpt2_model()`` is called.
        """
        print("Ładowanie modelu GloVe...")
        self.glove_model = self.load_glove_model()
        print("Model GloVe załadowany.")

        print("Ładowanie modelu GPT-2...")
        local_copy = Path(self.gpt2_model_dir)
        if local_copy.exists():
            # NOTE(review): load_gpt2_model() loads by self.model_name, not
            # from this directory — confirm that is intended.
            self.load_gpt2_model()
        else:
            print(f"Model GPT-2 ({self.model_name}) nie jest dostępny lokalnie. Pobieranie...")
            hub_id = self.model_name
            self.gpt2_model = AutoModel.from_pretrained(hub_id)
            self.gpt2_tokenizer = AutoTokenizer.from_pretrained(hub_id)
            for artifact in (self.gpt2_model, self.gpt2_tokenizer):
                artifact.save_pretrained(local_copy)
        print("Model GPT-2 załadowany.")

    def download_file(self, url, save_path, chunk_size=8192, timeout=(10, 60)):
        """Stream ``url`` into ``save_path``, printing a progress bar when possible.

        Args:
            url: Address to download.
            save_path: Destination file path; overwritten if it exists.
            chunk_size: Number of bytes requested per streamed chunk.
            timeout: ``requests`` timeout (connect, read) in seconds, so a
                stalled server cannot hang the process forever.

        Raises:
            requests.HTTPError: If the server answers with an error status.
                Without this check an error page would be written to
                ``save_path`` as if it were the requested file.
        """
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            content_length = response.headers.get('content-length')
            # A missing or zero length disables the progress bar and avoids
            # dividing by zero.
            total_length = int(content_length) if content_length else 0

            dl = 0
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if not chunk:
                        continue
                    f.write(chunk)
                    if total_length > 0:
                        dl += len(chunk)
                        done = min(50, int(50 * dl / total_length))
                        print("\r[%s%s]" % ('=' * done, ' ' * (50 - done)), end='')

        if total_length > 0:
            # Finish the carriage-return progress line.
            print()

    def load_glove_model(self):
        glove_file = "glove.6B.100d.txt"
        if not os.path.exists(glove_file):
            print(f"Plik {glove_file} nie został znaleziony. Rozpoczynam pobieranie...")
            try:
                url = "http://nlp.stanford.edu/data/glove.6B.zip"
                with tempfile.NamedTemporaryFile(delete=False) as tmp_zip:
                    self.download_file(url, tmp_zip.name)
                    with zipfile.ZipFile(tmp_zip.name) as zf:
                        zf.extractall('.')
                        glove_file = 'glove.6B.100d.txt'
                print("Pobrano i wypakowano plik GloVe.")
            except Exception as e:
                print(f"Błąd podczas pobierania lub wypakowywania pliku GloVe: {e}")
                return None

        glove_model = {}
        with open(glove_file, 'r', encoding='utf-8') as f:
            for line in f:
                split_line = line.split()
                word = split_line[0]
                embedding = np.array([float(val) for val in split_line[1:]])
                glove_model[word] = embedding

        return glove_model

    def load_gpt2_model(self):
        """Load the model and tokenizer named by ``model_name``.

        Failures are printed rather than raised.
        """
        # NOTE(review): this loads by name and never reads `gpt2_model_dir`,
        # although load_models() saves a copy there — confirm this is intended.
        hub_id = self.model_name
        try:
            self.gpt2_model = AutoModel.from_pretrained(hub_id)
            self.gpt2_tokenizer = AutoTokenizer.from_pretrained(hub_id)
            print("Standardowy model GPT-2 załadowany pomyślnie.")
        except Exception as e:
            print(f"Błąd podczas wczytywania standardowego modelu GPT-2: {e}")

    def preprocess_text(self, text_input):
        """Tokenize ``text_input`` and apply the configured clean-up steps.

        Steps, in order: NLTK word tokenization, optional lower-casing,
        optional lemmatization, optional stop-word removal.

        Args:
            text_input: ``str``, UTF-8 ``bytes``, or a ``tf.Tensor`` whose
                ``numpy()`` value is UTF-8 bytes.

        Returns:
            The remaining tokens joined with single spaces.
        """
        # NOTE(review): handle_numbers, handle_special_characters and
        # handle_python_code are stored by the constructor but not used here.
        if isinstance(text_input, bytes):
            text = text_input.decode('utf-8')
        elif isinstance(text_input, tf.Tensor):
            text = text_input.numpy().decode('utf-8')
        else:
            text = text_input

        tokens = word_tokenize(text)
        if self.lowercase:
            tokens = [token.lower() for token in tokens]
        if self.lemmatize:
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
        if self.handle_stop_words:
            known_stop_words = self.stop_words
            # Also compare the lower-cased token: the default stop-word list
            # is lower-case, so "The" would otherwise survive whenever
            # `lowercase` is off. The exact-match test is kept so custom
            # mixed-case stop words still work.
            tokens = [
                token for token in tokens
                if token not in known_stop_words and token.lower() not in known_stop_words
            ]

        return ' '.join(tokens)

    def create_embedding_matrix(self, vocab_size, embedding_dim=100):
        embedding_matrix = np.zeros((vocab_size, embedding_dim))
        missed_embeddings = 0

        all_embeddings = np.stack(list(self.glove_model.values()))
        mean_embedding = np.mean(all_embeddings, axis=0)

        for word, idx in self.tokenizer.get_vocab().items():
            embedding_vector = self.glove_model.get(word)

            if embedding_vector is not None:
                embedding_matrix[idx] = embedding_vector
            else:
                missed_embeddings += 1
                embedding_matrix[idx] = mean_embedding

        print(f"Liczba słów bez dostępnego wektora embeddingu: {missed_embeddings}")

        return embedding_matrix

    def create_sequences(self):
        """Build padded next-token training data from the ``.txt`` files.

        Every text is encoded and expanded into its prefixes of length 2 and
        up; the last token of each prefix is the target and the tokens before
        it are the input.

        Returns:
            ``(X_train, X_val, y_train, y_val, embedding_matrix, vocab_size,
            max_sequence_len)`` where the ``y`` arrays are one-hot over
            ``vocab_size`` and inputs have ``max_sequence_len - 1`` columns.

        Raises:
            ValueError: If no text encodes to at least two tokens.
        """
        processed_texts, _ = self._load_and_preprocess_files(self.directory, ['.txt'])

        self.create_tokenizer(processed_texts)
        vocab_size = len(self.tokenizer.get_vocab())
        embedding_matrix = self.create_embedding_matrix(vocab_size)
        # Keep the instance attributes declared in __init__ in sync.
        self.vocab_size = vocab_size
        self.embedding_matrix = embedding_matrix

        sequences = []
        for text in processed_texts:
            encoded = self.tokenizer.encode(text)
            # encoded[:i + 1] makes every token after the first a target once,
            # including the final one, and avoids a sample whose whole input
            # is padding.
            sequences.extend(encoded[:i + 1] for i in range(1, len(encoded)))

        if not sequences:
            raise ValueError("Brak sekwencji treningowych. Teksty są zbyt krótkie.")

        max_sequence_len = max(len(seq) for seq in sequences)
        # Pad with the dedicated pad token; id 0 is a regular vocabulary entry.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0
        sequences = np.array(
            pad_sequences(sequences, maxlen=max_sequence_len, padding='pre', value=pad_id)
        )

        X, y = sequences[:, :-1], sequences[:, -1]
        # NOTE(review): one-hot targets need len(y) * vocab_size floats; this
        # may be very large for a big corpus.
        y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        return X_train, X_val, y_train, y_val, embedding_matrix, vocab_size, max_sequence_len

    def _load_and_preprocess_files(self, directory, file_formats):
        processed_texts = []
        word_counts = {}

        if not os.path.isdir(directory):
            raise FileNotFoundError(f"Błąd: Podana ścieżka '{directory}' nie jest katalogiem.")

        files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f)) and any(f.endswith(format) for format in file_formats)]
        if not files:
            raise FileNotFoundError("Brak plików w podanym formacie w katalogu.")

        for file in files:
            file_path = os.path.join(directory, file)
            with open(file_path, "r", encoding='utf-8') as f:
                lines = f.readlines()
                if not lines:
                    print(f"Plik {file} jest pusty.")
                    continue

                for line in lines:
                    processed_line = self.preprocess_text(line)
                    processed_texts.append(processed_line)
                    word_count = len(processed_line.split())
                    word_counts[file] = word_counts.get(file, 0) + word_count
                print(f"Przetworzono plik: {file}, liczba słów: {word_count}")

        if not processed_texts:
            raise ValueError("Brak przetworzonych tekstów. Proszę sprawdzić zawartość katalogu.")
        else:
            print(f"Liczba przetworzonych tekstów: {len(processed_texts)}")

        return processed_texts, word_counts

    def create_and_train_model(self):
        """Build, train and save the next-token prediction model.

        The network is a frozen pretrained embedding, a bidirectional LSTM,
        dropout and a softmax over the vocabulary. Training uses the
        constructor's ``learning_rate``, ``amsgrad``, ``batch_size`` and
        ``epochs``, with TensorBoard logging and early stopping on
        ``val_loss``. The trained model is stored on ``self.model`` and saved
        through ``save_model_and_tokenizer()``.
        """
        # NOTE(review): kernel_regularizer, recurrent_regularizer and
        # bias_regularizer are stored by the constructor but not applied here.
        X_train, X_val, y_train, y_val, embedding_matrix, vocab_size, max_sequence_len = self.create_sequences()

        model = Sequential()
        model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_sequence_len - 1, trainable=False))
        model.add(Bidirectional(LSTM(self.lstm_units)))
        model.add(Dropout(self.dropout_rate))
        model.add(Dense(vocab_size, activation='softmax'))

        # Honour the optimizer settings given to the constructor instead of
        # the string shortcut 'adam', which silently ignores them.
        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate, amsgrad=self.amsgrad)
        model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        model.summary()

        # Use the configured log directory; fall back to the module default.
        log_dir = os.path.join(self.log_dir or KATALOG_LOGOW, self.model_name)
        tensorboard_callback = TensorBoard(log_dir=log_dir)

        early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

        model.fit(
            X_train,
            y_train,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_data=(X_val, y_val),
            callbacks=[tensorboard_callback, early_stopping_callback],
        )

        self.model = model
        self.save_model_and_tokenizer()

    def save_model_and_tokenizer(self):
        """Save the model as ``.h5`` and pickle the tokenizer under ``ZAPISZ_KATALOG``.

        Files are named after ``model_name``. Missing directories are created,
        including sub-directories implied by a ``model_name`` that contains a
        path separator.
        """
        model_path = Path(f'{ZAPISZ_KATALOG}/{self.model_name}.h5')
        tokenizer_path = Path(f'{ZAPISZ_KATALOG}/{self.model_name}_tokenizer.pkl')
        # exist_ok avoids the check-then-create race; parents covers names
        # such as "org/model".
        model_path.parent.mkdir(parents=True, exist_ok=True)

        self.model.save(str(model_path))
        with open(tokenizer_path, 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print("Model i tokenizer zapisane.")

def main():
    """Ask for a model name, then train and save a model on the "test" directory."""
    print("Witaj w AI Code Generator!")
    directory = "test"
    # The name is also used to load a pretrained model and to name the saved
    # files, so blank input falls back to TextProcessor's default "gpt2".
    model_name = input("Podaj nazwę modelu: ").strip() or "gpt2"

    processor = TextProcessor(
        directory=directory,
        model_name=model_name,
        input_sequence_length=100,
        output_sequence_length=100,
        epochs=10,
    )

    processor.create_and_train_model()
    print("Model utworzony i wytrenowany pomyślnie!")


if __name__ == "__main__":
    main()