sujitbabu9088
committed on
Commit
•
d45eea6
1
Parent(s):
4de4a7f
Upload 2 files
Browse files — Python and Excel file
- .gitattributes +1 -0
- b-nersuzi.xlsx +3 -0
- model.py +100 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
b-nersuzi.xlsx filter=lfs diff=lfs merge=lfs -text
|
b-nersuzi.xlsx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4897255c258736c596081aae9d95f5ba800b5e35322473773ae166626476314d
|
3 |
+
size 5461454
|
model.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
from sklearn.model_selection import train_test_split
|
5 |
+
from keras.preprocessing.sequence import pad_sequences
|
6 |
+
from keras.utils import to_categorical
|
7 |
+
from keras.models import Sequential, load_model
|
8 |
+
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
|
9 |
+
import keras
|
10 |
+
import os
|
11 |
+
import banglanltk as bn
|
12 |
+
|
13 |
+
# Load the dataset
data = pd.read_excel("b-nersuzi.xlsx", sheet_name="b-ner")

# Forward-fill missing values. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling and produces the same result.
data = data.ffill()


# Group the data by sentence and collect word-tag pairs
def agg_func(s):
    """Return the (word, tag) pairs of one sentence group as a list of tuples."""
    return list(zip(s["Word"].values.tolist(), s['Tag'].values.tolist()))


agg_data = (
    data.groupby(['Sentence #'])
    .apply(agg_func)
    .reset_index()
    .rename(columns={0: 'Sentence_POS_Tag_Pair'})
)
|
22 |
+
|
23 |
+
# Define a function to preprocess the data
|
24 |
+
def preprocess_data(data):
    """Derive sentence, tag and token columns from the word-tag pair column.

    Reads ``data['Sentence_POS_Tag_Pair']`` (a sequence of pairs per row) and
    adds four columns to ``data`` in place:

    * ``Sentence`` - first items of the pairs, stringified and space-joined
    * ``Tag`` - second items of the pairs, stringified and space-joined
    * ``tokenised_sentences`` - ``bn.word_tokenize`` applied to ``Sentence``
    * ``tag_list`` - ``Tag`` split on whitespace

    Returns the same (mutated) DataFrame.
    """
    def _joined(position):
        # Build a row-level function that joins one side of every pair.
        def _join_row(pairs):
            return " ".join(str(pair[position]) for pair in pairs)
        return _join_row

    pair_column = data['Sentence_POS_Tag_Pair']
    data['Sentence'] = pair_column.apply(_joined(0))
    data['Tag'] = pair_column.apply(_joined(1))
    # NOTE(review): tokens come from re-tokenising the joined text, so their
    # count may differ from the tag count for some rows - confirm on the data.
    data['tokenised_sentences'] = data['Sentence'].apply(bn.word_tokenize)
    data['tag_list'] = data['Tag'].apply(str.split)
    return data
|
30 |
+
|
31 |
+
# Preprocess the data
agg_data = preprocess_data(agg_data)

# Separate sentences and tags
tokenized_sentences = agg_data['tokenised_sentences'].tolist()
tags_list = agg_data['tag_list'].tolist()

# Create word-to-index and tag-to-index mappings.
# The vocabularies are enumerated in sorted order: iterating a set of strings
# directly yields a different order on every interpreter run, which would make
# the indices disagree with a model reloaded from a previous run.
words = set(word for sent in tokenized_sentences for word in sent)
word_to_idx = {word: i + 1 for i, word in enumerate(sorted(words))}  # 0 is reserved for padding
num_words = len(words) + 1  # Add 1 for padding
tags = set(tag for tag_list in tags_list for tag in tag_list)
tag_to_idx = {tag: i for i, tag in enumerate(sorted(tags))}
num_tags = len(tags)

# Encode sentences and tags
max_len = max(len(sent) for sent in tokenized_sentences)
encoded_sentences = [[word_to_idx[word] for word in sent] for sent in tokenized_sentences]
encoded_sentences = pad_sequences(encoded_sentences, maxlen=max_len, padding='post')
encoded_tags = [[tag_to_idx[tag] for tag in tag_list] for tag_list in tags_list]
# NOTE(review): tag sequences are padded with 0, which is also the index of a
# real tag (the first one in sorted order) - confirm this is intended.
encoded_tags = pad_sequences(encoded_tags, maxlen=max_len, padding='post')
encoded_tags = [to_categorical(tag, num_classes=num_tags) for tag in encoded_tags]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(encoded_sentences, encoded_tags, test_size=0.2, random_state=42)
|
56 |
+
|
57 |
+
# Define the LSTM model: reuse a previously saved model when one exists on
# disk, otherwise build and compile a fresh bidirectional LSTM tagger.
model_path = "best_model.h5"
if not os.path.exists(model_path):
    model = Sequential([
        Embedding(input_dim=num_words, output_dim=50, input_length=max_len),
        Bidirectional(LSTM(units=100, return_sequences=True)),
        TimeDistributed(Dense(units=num_tags, activation='softmax')),
    ])
    # Compile the model
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
else:
    model = load_model(model_path)
|
68 |
+
|
69 |
+
# Define callback to save the model when validation accuracy reaches 99% or above
|
70 |
+
class SaveModelCallback(keras.callbacks.Callback):
    """Save the model to ``best_model.h5`` after any epoch whose validation
    accuracy is at least 0.99."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's ``val_accuracy`` and save the model if it is high enough.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict for the epoch; may be ``None`` or lack
                ``val_accuracy``, in which case nothing is saved.
        """
        # Avoid a shared mutable default and tolerate a missing metric instead
        # of raising TypeError on ``None >= 0.99``.
        val_accuracy = (logs or {}).get('val_accuracy')
        if val_accuracy is not None and val_accuracy >= 0.99:
            self.model.save("best_model.h5")
            print("\nValidation accuracy reached 99% or above. Model saved.")
|
75 |
+
|
76 |
+
# Train the model, holding out 10% of the training data for validation so the
# save callback has a validation accuracy to inspect.
history = model.fit(
    X_train,
    np.array(y_train),
    batch_size=32,
    epochs=7,
    validation_split=0.1,
    callbacks=[SaveModelCallback()],
)

# Evaluate the model on the held-out test split and report both metrics
test_metrics = model.evaluate(X_test, np.array(y_test))
loss, accuracy = test_metrics
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)
|
83 |
+
|
84 |
+
# Function to predict entities in a given sentence
|
85 |
+
def predict_entities(input_sentence):
    """Tag each token of ``input_sentence`` with the model's predicted label.

    The sentence is tokenised with ``bn.word_tokenize``, encoded with
    ``word_to_idx`` (tokens not in the vocabulary map to 0), padded to
    ``max_len`` and passed through ``model``.

    Args:
        input_sentence: Raw sentence text.

    Returns:
        A list of ``(token, tag)`` tuples. When the sentence has more than
        ``max_len`` tokens, only the first ``max_len`` tokens are tagged.
    """
    tokenized_input = bn.word_tokenize(input_sentence)
    if not tokenized_input:
        # Nothing to tag; skip the model call.
        return []

    encoded_input = [word_to_idx.get(word, 0) for word in tokenized_input]
    # truncating='post' keeps the leading tokens of an over-long sentence so
    # that predictions stay aligned with ``tokenized_input``; the default
    # ('pre') would drop the leading tokens and shift every tag.
    padded_input = pad_sequences(
        [encoded_input], maxlen=max_len, padding='post', truncating='post'
    )
    predictions = model.predict(padded_input)
    predicted_indices = np.argmax(predictions, axis=-1)[0][:len(tokenized_input)]
    reverse_tag_map = {idx: tag for tag, idx in tag_to_idx.items()}
    predicted_tags = [reverse_tag_map[idx] for idx in predicted_indices]
    # zip stops at the shorter sequence, i.e. at max_len tokens when truncated.
    return list(zip(tokenized_input, predicted_tags))
|
95 |
+
|
96 |
+
# Test user input: read one sentence from stdin and print one "token: tag"
# line per tagged token.
user_input = input("Enter a Bengali sentence: ")
predicted_tags = predict_entities(user_input)
for tagged_pair in predicted_tags:
    word, tag = tagged_pair
    print(f"{word}: {tag}")
|