import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential, load_model
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
# pad_sequences and to_categorical live in keras.utils in current Keras releases
# (pad_sequences was previously under keras.preprocessing.sequence)
from keras.utils import pad_sequences, to_categorical

import banglanltk as bn
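# banglanltk is a third-party Bengali NLP toolkit; it is assumed to be
# installed (e.g. `pip install banglanltk`) and supplies the word_tokenize
# function used below.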

# Load the dataset
data = pd.read_excel("b-nersuzi.xlsx", sheet_name="b-ner")

# Forward-fill missing values so every row carries its sentence number
# (DataFrame.fillna(method='ffill') is deprecated in recent pandas)
data = data.ffill()

# Group the data by sentence and collect the (word, tag) pairs for each one
agg_func = lambda s: list(zip(s['Word'].values.tolist(), s['Tag'].values.tolist()))
agg_data = (data.groupby('Sentence #')
                .apply(agg_func)
                .reset_index()
                .rename(columns={0: 'Sentence_POS_Tag_Pair'}))
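# Each row of agg_data now holds one sentence as a list of
# [(word_1, tag_1), (word_2, tag_2), ...] pairs.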

# Define a function to preprocess the data
def preprocess_data(data):
    # Rebuild the raw sentence and tag strings from the (word, tag) pairs
    data['Sentence'] = data['Sentence_POS_Tag_Pair'].apply(lambda pairs: " ".join(str(p[0]) for p in pairs))
    data['Tag'] = data['Sentence_POS_Tag_Pair'].apply(lambda pairs: " ".join(str(p[1]) for p in pairs))
    data['tokenised_sentences'] = data['Sentence'].apply(bn.word_tokenize)
    data['tag_list'] = data['Tag'].apply(lambda x: x.split())
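    # NOTE: this assumes bn.word_tokenize reproduces the whitespace split used
    # to rebuild the sentence; if the tokenizer splits words differently, the
    # tokens and tags fall out of alignment.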
    return data

# Preprocess the data
agg_data = preprocess_data(agg_data)

# Separate sentences and tags
tokenized_sentences = agg_data['tokenised_sentences'].tolist()
tags_list = agg_data['tag_list'].tolist()

# Create word-to-index and tag-to-index mappings. The vocabularies are
# sorted so the mappings are deterministic across runs; otherwise a model
# reloaded from disk would see different indices than it was trained on,
# because set iteration order varies between processes.
words = sorted(set(word for sent in tokenized_sentences for word in sent))
word_to_idx = {word: i + 1 for i, word in enumerate(words)}  # index 0 is reserved for padding
num_words = len(words) + 1  # +1 for the padding index
tags = sorted(set(tag for tag_list in tags_list for tag in tag_list))
tag_to_idx = {tag: i for i, tag in enumerate(tags)}
num_tags = len(tags)
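# Caveat: pad_sequences below fills tag positions with 0, which is also the
# index of a real tag, so padded positions are scored as that tag during
# training and evaluation.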

# Encode sentences and tags
max_len = max(len(sent) for sent in tokenized_sentences)
encoded_sentences = [[word_to_idx[word] for word in sent] for sent in tokenized_sentences]
encoded_sentences = pad_sequences(encoded_sentences, maxlen=max_len, padding='post')
encoded_tags = [[tag_to_idx[tag] for tag in tag_list] for tag_list in tags_list]
encoded_tags = pad_sequences(encoded_tags, maxlen=max_len, padding='post')
encoded_tags = [to_categorical(tag, num_classes=num_tags) for tag in encoded_tags]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(encoded_sentences, encoded_tags, test_size=0.2, random_state=42)
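# Shapes: X_* are (n_sentences, max_len) integer matrices; y_* are lists of
# (max_len, num_tags) one-hot matrices.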

# Reload a previously saved model if one exists; otherwise build, compile, and train one
model_path = "best_model.h5"
if os.path.exists(model_path):
    model = load_model(model_path)
else:
    model = Sequential()
    model.add(Embedding(input_dim=num_words, output_dim=50, input_length=max_len))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True)))
    model.add(TimeDistributed(Dense(units=num_tags, activation='softmax')))
    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Callback: save the model whenever validation accuracy reaches 99% or above
    class SaveModelCallback(keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            logs = logs or {}
            # val_accuracy may be absent, so default to 0.0 instead of comparing None
            if logs.get('val_accuracy', 0.0) >= 0.99:
                self.model.save(model_path)
                print("\nValidation accuracy reached 99% or above. Model saved.")

    # Train the model
    history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=7, validation_split=0.1, callbacks=[SaveModelCallback()])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, np.array(y_test))
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Function to predict entities in a given sentence
def predict_entities(input_sentence):
    tokenized_input = bn.word_tokenize(input_sentence)
    encoded_input = [word_to_idx.get(word, 0) for word in tokenized_input]  # unseen words fall back to the padding index 0
    padded_input = pad_sequences([encoded_input], maxlen=max_len, padding='post')
    predictions = model.predict(padded_input)
    predicted_tags = np.argmax(predictions, axis=-1)
    reverse_tag_map = {v: k for k, v in tag_to_idx.items()}
    predicted_tags = [reverse_tag_map[idx] for idx in predicted_tags[0][:len(tokenized_input)]]
    tagged_sentence = [(word, tag) for word, tag in zip(tokenized_input, predicted_tags)]
    return tagged_sentence

# Test user input
user_input = input("Enter a Bengali sentence: ")
predicted_tags = predict_entities(user_input)
for word, tag in predicted_tags:
    print(f"{word}: {tag}")