sujitbabu9088 committed on
Commit
d45eea6
1 Parent(s): 4de4a7f

Upload 2 files


Python and Excel files

Files changed (3)
  1. .gitattributes +1 -0
  2. b-nersuzi.xlsx +3 -0
  3. model.py +100 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ b-nersuzi.xlsx filter=lfs diff=lfs merge=lfs -text
b-nersuzi.xlsx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4897255c258736c596081aae9d95f5ba800b5e35322473773ae166626476314d
+ size 5461454
model.py ADDED
@@ -0,0 +1,100 @@
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from sklearn.model_selection import train_test_split
+ from keras.preprocessing.sequence import pad_sequences
+ from keras.utils import to_categorical
+ from keras.models import Sequential, load_model
+ from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
+ import keras
+ import os
+ import banglanltk as bn
+
+ # Load the dataset
+ data = pd.read_excel("b-nersuzi.xlsx", sheet_name="b-ner")
+
+ # Fill missing values by carrying the previous row's value forward
+ data = data.ffill()
+
+ # Group the data by sentence and collect (word, tag) pairs
+ agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(), s["Tag"].values.tolist())]
+ agg_data = data.groupby(["Sentence #"]).apply(agg_func).reset_index().rename(columns={0: "Sentence_POS_Tag_Pair"})
+
+ # Define a function to preprocess the data
+ def preprocess_data(data):
+     data['Sentence'] = data['Sentence_POS_Tag_Pair'].apply(lambda sentence: " ".join(map(str, [s[0] for s in sentence])))
+     data['Tag'] = data['Sentence_POS_Tag_Pair'].apply(lambda sentence: " ".join(map(str, [s[1] for s in sentence])))
+     data['tokenised_sentences'] = data['Sentence'].apply(bn.word_tokenize)
+     data['tag_list'] = data['Tag'].apply(lambda x: x.split())
+     return data
+
+ # Preprocess the data
+ agg_data = preprocess_data(agg_data)
+
+ # Separate sentences and tags
+ tokenized_sentences = agg_data['tokenised_sentences'].tolist()
+ tags_list = agg_data['tag_list'].tolist()
+
+ # Create word-to-index and tag-to-index mappings
+ words = set(word for sent in tokenized_sentences for word in sent)
+ word_to_idx = {word: i + 1 for i, word in enumerate(words)}
+ num_words = len(words) + 1  # Add 1 for the padding index 0
+ tags = set(tag for tag_list in tags_list for tag in tag_list)
+ tag_to_idx = {tag: i for i, tag in enumerate(tags)}
+ num_tags = len(tags)
+
+ # Encode sentences and tags, padding every sequence to the longest sentence
+ max_len = max(len(sent) for sent in tokenized_sentences)
+ encoded_sentences = [[word_to_idx[word] for word in sent] for sent in tokenized_sentences]
+ encoded_sentences = pad_sequences(encoded_sentences, maxlen=max_len, padding='post')
+ encoded_tags = [[tag_to_idx[tag] for tag in tag_list] for tag_list in tags_list]
+ encoded_tags = pad_sequences(encoded_tags, maxlen=max_len, padding='post')
+ encoded_tags = [to_categorical(tag, num_classes=num_tags) for tag in encoded_tags]
+
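+ # At this point encoded_sentences is a (num_sentences, max_len) integer array
+ # and encoded_tags is a list of (max_len, num_tags) one-hot arrays.
+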
+ # Split data into train and test sets
+ X_train, X_test, y_train, y_test = train_test_split(encoded_sentences, encoded_tags, test_size=0.2, random_state=42)
+
+ # Define the LSTM model, reusing a previously saved model if one exists
+ model_path = "best_model.h5"
+ if os.path.exists(model_path):
+     model = load_model(model_path)
+ else:
+     model = Sequential()
+     model.add(Embedding(input_dim=num_words, output_dim=50, input_length=max_len))
+     model.add(Bidirectional(LSTM(units=100, return_sequences=True)))
+     model.add(TimeDistributed(Dense(units=num_tags, activation='softmax')))
+     # Compile the freshly built model (a loaded model is already compiled)
+     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
+
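+ # The model maps (batch, max_len) token ids to (batch, max_len, num_tags)
+ # per-token tag probabilities: a 50-dim embedding feeding a 100-unit BiLSTM.
+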
+ # Define callback to save the model when validation accuracy reaches 99% or above
+ class SaveModelCallback(keras.callbacks.Callback):
+     def on_epoch_end(self, epoch, logs=None):
+         # Guard against a missing metric: logs may be None or lack 'val_accuracy'
+         if (logs or {}).get('val_accuracy', 0) >= 0.99:
+             self.model.save("best_model.h5")
+             print("\nValidation accuracy reached 99% or above. Model saved.")
+
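+ # Note: assuming a standard Keras install, the built-in
+ # keras.callbacks.ModelCheckpoint("best_model.h5", monitor='val_accuracy', save_best_only=True)
+ # is an alternative that saves whenever the monitored metric improves.
+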
+ # Train the model
+ history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=7, validation_split=0.1, callbacks=[SaveModelCallback()])
+
+ # Evaluate the model on the held-out test set
+ loss, accuracy = model.evaluate(X_test, np.array(y_test))
+ print("Test Loss:", loss)
+ print("Test Accuracy:", accuracy)
+
+ # Function to predict entities in a given sentence
+ def predict_entities(input_sentence):
+     tokenized_input = bn.word_tokenize(input_sentence)
+     # Unknown words fall back to the padding index 0
+     encoded_input = [word_to_idx.get(word, 0) for word in tokenized_input]
+     padded_input = pad_sequences([encoded_input], maxlen=max_len, padding='post')
+     predictions = model.predict(padded_input)
+     predicted_tags = np.argmax(predictions, axis=-1)
+     reverse_tag_map = {v: k for k, v in tag_to_idx.items()}
+     # Keep only predictions for the real (non-padded) tokens
+     predicted_tags = [reverse_tag_map[idx] for idx in predicted_tags[0][:len(tokenized_input)]]
+     tagged_sentence = [(word, tag) for word, tag in zip(tokenized_input, predicted_tags)]
+     return tagged_sentence
+
+ # Test user input
+ user_input = input("Enter a Bengali sentence: ")
+ predicted_tags = predict_entities(user_input)
+ for word, tag in predicted_tags:
+     print(f"{word}: {tag}")
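+ # Hypothetical session (actual tags depend on the dataset's tag set):
+ #   Enter a Bengali sentence: সে ঢাকায় থাকে
+ #   সে: O
+ #   ঢাকায়: B-LOC
+ #   থাকে: O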