|
|
|
""" |
|
Created on Thu Feb 8 20:22:57 2024 |
|
|
|
@author: Dhrumit Patel |
|
""" |
|
|
|
""" |
|
Milestone Project 2: SkimLit |
|
|
|
The purpose is to build an NLP model to make reading medical abstracts easier. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
""" |
|
Get the data |
|
|
|
Since we will be replicating the paper (PubMed 200K RCT), let's download the dataset they used. |
|
|
|
We can do so from the author's GitHub:
|
|
|
git clone https://github.com/Franck-Dernoncourt/pubmed-rct |
|
dir pubmed-rct |
|
|
|
# Check what files are in the PubMed_20K dataset |
|
cd pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign |
|
dir |
|
|
|
It contains 3 files: dev.txt, test.txt and train.txt
|
""" |
|
|
|
|
|
data_dir = "pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/" |
|
|
|
|
|
import os |
|
filenames = [data_dir + filename for filename in os.listdir(data_dir)] |
|
filenames |
|
|
|
""" |
|
Preprocess the data |
|
""" |
|
|
|
|
|
def get_lines(filename): |
|
""" |
|
Reads filename (a text filename) and returns the lines of text as a list. |
|
|
|
Args: |
|
filename (str): a string containing the target filepath. |
|
|
|
Returns: |
|
A list of strings with one string per line from the target filename. |
|
""" |
|
with open(filename, "r") as f: |
|
return f.readlines() |
|
|
|
|
|
train_lines = get_lines(filename=data_dir + "train.txt") |
|
train_lines[:20] |
|
|
|
len(train_lines) |
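"""
For reference, each abstract in the raw files looks roughly like this: an ID line
starting with "###", then one LABEL\tsentence line per sentence, with a blank line
separating abstracts. Illustration only (not real data):

###<abstract_id>
OBJECTIVE\tTo investigate ...
METHODS\tA total of ...
RESULTS\t...

The preprocessing function below parses this structure into one dictionary per sentence.
"""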
|
|
|
|
|
|
|
def preprocess_text_with_line_numbers(filename): |
|
""" |
|
Returns a list of dictionaries of abstract line data. |
|
|
|
Takes in filename, reads its contents, and sorts through each line, |
|
extracting things like the target label, the text of the sentence, |
|
how many sentences are in the current abstract and what sentence
|
number the target line is. |
|
""" |
|
input_lines = get_lines(filename) |
|
abstract_lines = "" |
|
abstract_samples = [] |
|
|
|
|
|
for line in input_lines: |
|
if line.startswith("###"): |
|
abstract_id = line |
|
abstract_lines = "" |
|
|
|
elif line.isspace(): |
|
abstract_line_split = abstract_lines.splitlines() |
|
|
|
|
|
for abstract_line_number, abstract_line in enumerate(abstract_line_split): |
|
line_data = {} |
|
target_text_split = abstract_line.split("\t") |
|
line_data["target"] = target_text_split[0] |
|
line_data["text"] = target_text_split[1].lower() |
|
line_data["line_number"] = abstract_line_number |
|
line_data["total_lines"] = len(abstract_line_split) - 1 |
|
abstract_samples.append(line_data) |
|
|
|
else: |
|
abstract_lines += line |
|
|
|
return abstract_samples |
|
|
|
|
|
train_samples = preprocess_text_with_line_numbers(filename = data_dir + "train.txt") |
|
val_samples = preprocess_text_with_line_numbers(filename = data_dir + "dev.txt") |
|
test_samples = preprocess_text_with_line_numbers(filename = data_dir + "test.txt") |
|
|
|
len(train_samples), len(val_samples), len(test_samples) |
|
|
|
|
|
train_samples[:14] |
|
|
|
""" |
|
Now that our data is in the format of a list of dictionaries, let's turn it
into a DataFrame to visualize it further.
|
""" |
|
import pandas as pd |
|
train_df = pd.DataFrame(train_samples) |
|
val_df = pd.DataFrame(val_samples) |
|
test_df = pd.DataFrame(test_samples) |
|
|
|
train_df[:14] |
|
|
|
|
|
train_df["target"].value_counts() |
|
|
|
|
|
train_df["total_lines"].plot.hist() |
|
|
|
""" |
|
Get list of sentences |
|
""" |
|
|
|
train_sentences = train_df["text"].tolist() |
|
val_sentences = val_df["text"].tolist() |
|
test_sentences = test_df["text"].tolist() |
|
|
|
len(train_sentences), len(val_sentences), len(test_sentences) |
|
|
|
|
|
train_sentences[:10] |
|
|
|
""" |
|
Making numeric labels (ML models require numeric labels) |
|
""" |
|
|
|
from sklearn.preprocessing import OneHotEncoder |
|
one_hot_encoder = OneHotEncoder(sparse=False)  # NOTE: for scikit-learn >= 1.2 use OneHotEncoder(sparse_output=False)
|
train_labels_one_hot = one_hot_encoder.fit_transform(train_df["target"].to_numpy().reshape(-1, 1)) |
|
val_labels_one_hot = one_hot_encoder.transform(val_df["target"].to_numpy().reshape(-1, 1)) |
|
test_labels_one_hot = one_hot_encoder.transform(test_df["target"].to_numpy().reshape(-1, 1)) |
|
|
|
|
|
train_labels_one_hot, val_labels_one_hot, test_labels_one_hot |
|
|
|
""" |
|
Label encode labels |
|
""" |
|
|
|
from sklearn.preprocessing import LabelEncoder |
|
label_encoder = LabelEncoder() |
|
train_labels_encoded = label_encoder.fit_transform(train_df["target"].to_numpy()) |
|
val_labels_encoded = label_encoder.transform(val_df["target"].to_numpy()) |
|
test_labels_encoded = label_encoder.transform(test_df["target"].to_numpy()) |
|
|
|
|
|
train_labels_encoded, val_labels_encoded, test_labels_encoded |
|
|
|
|
|
num_classes = len(label_encoder.classes_) |
|
class_names = label_encoder.classes_ |
|
num_classes, class_names |
|
|
|
""" |
|
Starting a series of Modelling experiments |
|
""" |
|
|
|
""" |
|
Model 0: Getting a baseline model (TF-IDF Multinomial Naive Bayes Classifier) |
|
""" |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.naive_bayes import MultinomialNB |
|
from sklearn.pipeline import Pipeline |
|
|
|
|
|
model_0 = Pipeline([ |
|
("tf-idf", TfidfVectorizer()), |
|
("clf", MultinomialNB()) |
|
]) |
|
|
|
|
|
model_0.fit(train_sentences, train_labels_encoded) |
|
|
|
|
|
model_0.score(val_sentences, val_labels_encoded) |
|
|
|
|
|
baseline_preds = model_0.predict(val_sentences) |
|
baseline_preds |
|
|
|
""" |
|
Import calculate_results() to compute classification evaluation metrics (accuracy, precision, recall, f1-score)
|
""" |
|
from helper_functions import calculate_results |
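"""
helper_functions.py is assumed to sit alongside this script. If it isn't available,
a minimal sketch of an equivalent calculate_results() (accuracy as a percentage,
weighted precision/recall/f1) could look like the commented-out fallback below.
"""
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support
#
# def calculate_results(y_true, y_pred):
#     """Returns a dict of accuracy (in %), precision, recall and f1-score."""
#     accuracy = accuracy_score(y_true, y_pred) * 100  # percentage, matching the /100 rescale used later
#     precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
#     return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}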
|
|
|
|
|
baseline_results = calculate_results(y_true=val_labels_encoded, y_pred=baseline_preds) |
|
baseline_results |
|
|
|
|
|
train_sentences[:10] |
|
|
|
""" |
|
Preparing our data (the text) for deep sequence models

Before we start building deeper models, we need to create vectorization and embedding layers.
|
""" |
|
|
|
import numpy as np |
|
import tensorflow as tf |
|
from tensorflow.keras import layers |
|
|
|
|
|
sent_lens = [len(sentence.split()) for sentence in train_sentences] |
|
avg_sent_len = np.mean(sent_lens) |
|
avg_sent_len |
|
|
|
|
|
import matplotlib.pyplot as plt |
|
plt.hist(sent_lens, bins=20) |
|
|
|
|
|
output_seq_length = int(np.percentile(sent_lens, 95)) |
|
output_seq_length |
|
|
|
|
|
max(sent_lens) |
|
|
|
""" |
|
Create a TextVectorization layer
|
|
|
We want to make a layer which maps our texts from words to numbers |
|
""" |
|
|
|
|
|
max_tokens = 68000 |
|
|
|
|
|
# In newer TensorFlow versions this layer is also available as tf.keras.layers.TextVectorization
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
|
text_vectorizer = TextVectorization(max_tokens=max_tokens, |
|
output_sequence_length=output_seq_length) |
|
|
|
|
|
text_vectorizer.adapt(train_sentences) |
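"""
Optional sanity check: vectorize a random training sentence to see how words map to
integer token IDs (0 is reserved for padding, 1 for unknown/OOV tokens).
"""
import random
target_sentence = random.choice(train_sentences)
print(f"Text:\n{target_sentence}")
print(f"\nVectorized:\n{text_vectorizer([target_sentence])}")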
|
|
|
|
|
rct_20k_text_vocab = text_vectorizer.get_vocabulary() |
|
print(f"Number of words in vocab: {len(rct_20k_text_vocab)}") |
|
print(f"Most common words in the vocab: {rct_20k_text_vocab[:5]}") |
|
print(f"Least common words in the vocab: {rct_20k_text_vocab[-5:]}") |
|
|
|
|
|
text_vectorizer.get_config() |
|
|
|
from tensorflow.keras import layers
|
""" |
|
Create a custom text embedding layer |
|
""" |
|
token_embed = layers.Embedding(input_dim=len(rct_20k_text_vocab), |
|
output_dim=128, |
|
mask_zero=True, |
|
name = "token_embedding") |
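"""
Optional sanity check: embedding a vectorized sentence should give a tensor of shape
(1, output_seq_length, 128), i.e. one 128-dimensional vector per token.
"""
sample_token_embedding = token_embed(text_vectorizer(["this is a test sentence"]))
sample_token_embedding.shape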
|
|
|
|
|
""" |
|
Creating datasets (making sure our data loads as fast as possible) |
|
|
|
We are going to set up our data to run as fast as possible using the TensorFlow tf.data API.
|
""" |
|
|
|
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels_one_hot)) |
|
valid_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels_one_hot)) |
|
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels_one_hot)) |
|
|
|
train_dataset |
|
|
|
|
|
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE) |
|
valid_dataset = valid_dataset.batch(32).prefetch(tf.data.AUTOTUNE) |
|
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE) |
|
|
|
train_dataset, len(train_dataset) |
|
""" |
|
Model 1: Conv1D with token embeddings |
|
""" |
|
|
|
inputs = layers.Input(shape=(1,), dtype=tf.string) |
|
text_vectors = text_vectorizer(inputs) |
|
token_embeddings = token_embed(text_vectors) |
|
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(token_embeddings) |
|
x = layers.GlobalAveragePooling1D()(x) |
|
outputs = layers.Dense(num_classes, activation="softmax")(x) |
|
|
|
model_1 = tf.keras.Model(inputs, outputs) |
|
|
|
|
|
model_1.compile(loss="categorical_crossentropy", |
|
optimizer=tf.keras.optimizers.Adam(), |
|
metrics=["accuracy"]) |
|
|
|
model_1.summary() |
|
|
|
|
|
history_model_1 = model_1.fit(train_dataset, |
|
epochs=3, |
|
steps_per_epoch=int(0.1 * len(train_dataset)), |
|
validation_data=valid_dataset, |
|
validation_steps=int(0.1 * len(valid_dataset))) |
|
|
|
|
|
model_1.evaluate(valid_dataset) |
|
|
|
|
|
model_1_pred_probs = model_1.predict(valid_dataset) |
|
model_1_pred_probs, model_1_pred_probs.shape |
|
|
|
|
|
model_1_preds = tf.argmax(model_1_pred_probs, axis=1) |
|
model_1_preds |
|
class_names |
|
class_names[model_1_preds] |
|
|
|
|
|
model_1_results = calculate_results(y_true=val_labels_encoded, y_pred=model_1_preds) |
|
model_1_results |
|
|
|
""" |
|
Model 2: Feature extraction with pretrained token embeddings |
|
|
|
Now let's use pretrained word embeddings from TensorFlow Hub,
more specifically the Universal Sentence Encoder (USE).

The original paper used GloVe embeddings, however we are going to stick with the
later-created USE pretrained embeddings.
|
""" |
|
|
|
import tensorflow_hub as hub |
|
tf_hub_embedding_layer = hub.KerasLayer("https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2", |
|
trainable=False, |
|
name="universal_sentence_encoder") |
|
|
|
|
|
""" |
|
Building and fitting an NLP feature extraction model using pretrained embeddings from TensorFlow Hub
|
""" |
|
|
|
inputs = layers.Input(shape=[], dtype=tf.string) |
|
pretrained_embedding = tf_hub_embedding_layer(inputs) |
|
x = layers.Dense(128, activation="relu")(pretrained_embedding) |
|
|
|
outputs = layers.Dense(num_classes, activation="softmax")(x) |
|
|
|
model_2 = tf.keras.Model(inputs, outputs, name="model_2_USE_feature_extractor") |
|
|
|
|
|
model_2.compile(loss="categorical_crossentropy", |
|
optimizer=tf.keras.optimizers.Adam(), |
|
metrics=["accuracy"]) |
|
|
|
model_2.summary() |
|
|
|
|
|
with tf.device('/CPU:0'): |
|
history_model_2 = model_2.fit(train_dataset, |
|
epochs=3, |
|
steps_per_epoch=int(0.1 * len(train_dataset)), |
|
validation_data=valid_dataset, |
|
validation_steps=int(0.1 * len(valid_dataset))) |
|
|
|
|
|
with tf.device('/CPU:0'): |
|
model_2.evaluate(valid_dataset) |
|
|
|
|
|
with tf.device('/CPU:0'): |
|
model_2_pred_probs = model_2.predict(valid_dataset) |
|
model_2_pred_probs, model_2_pred_probs.shape |
|
|
|
|
|
model_2_preds = tf.argmax(model_2_pred_probs, axis=1) |
|
model_2_preds |
|
class_names[model_2_preds] |
|
|
|
|
|
model_2_results = calculate_results(y_true=val_labels_encoded, y_pred=model_2_preds) |
|
model_2_results |
|
|
|
""" |
|
Model 3: Conv1D with character embeddings |
|
|
|
The paper we are replicating states that they used a combination of token and character-level embeddings.

Previously, we created token-level embeddings; we will need to follow similar steps for characters if we want to use char-level embeddings.
|
""" |
|
|
|
""" |
|
Creating a character-level tokenizer
|
""" |
|
train_sentences[:5] |
|
|
|
|
|
def split_chars(text):
    """Splits a string into space-separated characters, e.g. 'abc' -> 'a b c'."""
    return " ".join(list(text))
|
|
|
|
|
|
|
train_chars = [split_chars(sentence) for sentence in train_sentences] |
|
val_chars = [split_chars(sentence) for sentence in val_sentences] |
|
test_chars = [split_chars(sentence) for sentence in test_sentences] |
|
|
|
train_chars, val_chars, test_chars |
|
|
|
|
|
char_lens = [len(sentence) for sentence in train_sentences] |
|
mean_char_len = np.mean(char_lens) |
|
mean_char_len |
|
|
|
|
|
import matplotlib.pyplot as plt |
|
plt.hist(char_lens, bins=7) |
|
|
|
|
|
output_seq_char_len = int(np.percentile(char_lens, 95)) |
|
output_seq_char_len |
|
|
|
|
|
import string |
|
alphabet = string.ascii_lowercase + string.digits + string.punctuation |
|
alphabet |
|
len(alphabet) |
|
|
|
|
|
NUM_CHAR_TOKENS = len(alphabet) + 2  # + 2 for the space character and the OOV ([UNK]) token
|
char_vectorizer = TextVectorization(max_tokens=NUM_CHAR_TOKENS, |
|
output_sequence_length=output_seq_char_len, |
|
standardize="lower_and_strip_punctuation",  # note: standardization strips punctuation, so punctuation characters won't end up in the vocab
|
name="char_vectorizer") |
|
|
|
|
|
char_vectorizer.adapt(train_chars) |
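"""
Optional sanity check: vectorize one character-split sequence to confirm the char
vectorizer maps individual characters to integer IDs.
"""
example_chars = train_chars[0]
print(f"Charified text:\n{example_chars}")
print(f"\nVectorized:\n{char_vectorizer([example_chars])}")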
|
|
|
|
|
char_vocab = char_vectorizer.get_vocabulary() |
|
print(f"Number of different characters in character vocab: {len(char_vocab)}") |
|
print(f"5 most common character: {char_vocab[:5]}") |
|
print(f"5 least common characters: {char_vocab[-5:]}") |
|
|
|
""" |
|
Creating a character-level embedding |
|
""" |
|
|
|
char_embed = layers.Embedding(input_dim=len(char_vocab), |
|
output_dim=25, |
|
mask_zero=True, |
|
name="char_embed") |
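"""
Optional sanity check: embedding a vectorized char sequence should give a tensor of
shape (1, output_seq_char_len, 25), i.e. one 25-dimensional vector per character.
"""
example_char_embedding = char_embed(char_vectorizer([train_chars[0]]))
example_char_embedding.shape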
|
|
|
|
|
""" |
|
Model 3: Building a Conv1D model to fit on character embeddings |
|
""" |
|
|
|
inputs = layers.Input(shape=(1,), dtype="string") |
|
char_vectors = char_vectorizer(inputs) |
|
char_embeddings = char_embed(char_vectors) |
|
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(char_embeddings) |
|
x = layers.GlobalMaxPool1D()(x) |
|
outputs = layers.Dense(num_classes, activation="softmax")(x) |
|
|
|
model_3 = tf.keras.Model(inputs, outputs, name="model_3_conv1d_char_embeddings") |
|
|
|
|
|
model_3.compile(loss="categorical_crossentropy", |
|
optimizer=tf.keras.optimizers.Adam(), |
|
metrics=["accuracy"]) |
|
|
|
model_3.summary() |
|
|
|
|
|
train_char_dataset = tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE) |
|
val_char_dataset = tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE) |
|
test_char_dataset = tf.data.Dataset.from_tensor_slices((test_chars, test_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE) |
|
|
|
train_char_dataset, val_char_dataset, test_char_dataset |
|
|
|
|
|
model_3_history = model_3.fit(train_char_dataset, |
|
epochs=3, |
|
steps_per_epoch=int(0.1 * len(train_char_dataset)),
|
validation_data=val_char_dataset, |
|
validation_steps=int(0.1 * len(val_char_dataset))) |
|
|
|
|
|
model_3.evaluate(val_char_dataset) |
|
|
|
|
|
model_3_pred_probs = model_3.predict(val_char_dataset) |
|
model_3_pred_probs, model_3_pred_probs.shape |
|
|
|
|
|
model_3_preds = tf.argmax(model_3_pred_probs, axis=1) |
|
model_3_preds |
|
class_names[model_3_preds] |
|
|
|
|
|
model_3_results = calculate_results(y_true=val_labels_encoded, y_pred=model_3_preds) |
|
model_3_results |
|
|
|
baseline_results |
|
|
|
""" |
|
Model 4: Combining pretrained token embeddings + character embeddings (hybrid embedding layer)
|
|
|
1. Create a token level embedding model (similar to model_1) |
|
2. Create a character level model (similar to model_3 with a slight modification) |
|
3. Combine 1 & 2 with a concatenate layer (layers.Concatenate)

4. Build a series of output layers on top of point 3.
|
5. Construct a model which takes token and character level sequences as input and produces sequence label probabilities as output. |
|
""" |
|
|
|
|
|
token_inputs = layers.Input(shape=[], dtype=tf.string, name="token_inputs") |
|
token_embeddings = tf_hub_embedding_layer(token_inputs) |
|
token_outputs = layers.Dense(128, activation="relu")(token_embeddings) |
|
token_model = tf.keras.Model(inputs=token_inputs, outputs=token_outputs) |
|
|
|
|
|
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_input") |
|
char_vectors = char_vectorizer(char_inputs) |
|
char_embeddings = char_embed(char_vectors) |
|
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings) |
|
char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm) |
|
|
|
|
|
token_char_concat = layers.Concatenate(name="token_char_hybrid")([token_model.output, char_model.output]) |
|
|
|
|
|
combined_dropout = layers.Dropout(0.5)(token_char_concat) |
|
combined_dense = layers.Dense(128, activation="relu")(combined_dropout) |
|
final_dropout = layers.Dropout(0.5)(combined_dense) |
|
output_layer = layers.Dense(num_classes, activation="softmax")(final_dropout) |
|
|
|
|
|
model_4 = tf.keras.Model(inputs=[token_model.input, char_model.input], |
|
outputs=output_layer, |
|
name="model_4_token_and_char_embeddings") |
|
|
|
|
|
model_4.summary() |
|
|
|
|
|
from tensorflow.keras.utils import plot_model
|
plot_model(model_4, show_shapes=True) |
|
|
|
|
|
model_4.compile(loss="categorical_crossentropy", |
|
optimizer=tf.keras.optimizers.Adam(), |
|
metrics=["accuracy"]) |
|
|
|
""" |
|
Combining token and character data into tf.data.Dataset |
|
""" |
|
|
|
|
|
train_char_token_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars)) |
|
train_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot) |
|
train_char_token_dataset = tf.data.Dataset.zip((train_char_token_data, train_char_token_labels)) |
|
|
|
|
|
train_char_token_dataset = train_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE) |
|
|
|
|
|
val_char_token_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars)) |
|
val_char_token_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot) |
|
val_char_token_dataset = tf.data.Dataset.zip((val_char_token_data, val_char_token_labels)) |
|
|
|
|
|
val_char_token_dataset = val_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE) |
|
|
|
|
|
train_char_token_dataset, val_char_token_dataset |
|
|
|
|
|
with tf.device('/CPU:0'): |
|
history_model_4 = model_4.fit(train_char_token_dataset, |
|
epochs=3, |
|
steps_per_epoch=int(0.1 * len(train_char_token_dataset)), |
|
validation_data=val_char_token_dataset, |
|
validation_steps=int(0.1 * len(val_char_token_dataset))) |
|
|
|
|
|
with tf.device('/CPU:0'): |
|
model_4.evaluate(val_char_token_dataset) |
|
|
|
|
|
model_4_pred_probs = model_4.predict(val_char_token_dataset) |
|
model_4_pred_probs, model_4_pred_probs.shape |
|
|
|
|
|
model_4_preds = tf.argmax(model_4_pred_probs, axis=1) |
|
model_4_preds |
|
|
|
model_4_preds |
|
class_names[model_4_preds] |
|
|
|
|
|
model_4_results = calculate_results(y_true=val_labels_encoded, y_pred=model_4_preds) |
|
model_4_results |
|
|
|
""" |
|
Model 5: Transfer learning with pretrained token embeddings + character embeddings + |
|
positional embeddings |
|
""" |
|
train_df.head() |
|
|
|
""" |
|
Create positional embeddings |
|
""" |
|
|
|
train_df["line_number"].value_counts() |
|
|
|
|
|
train_df["line_number"].plot.hist() |
|
|
|
|
|
# depth=15: the value counts and histogram above show the vast majority of lines have a line_number below 15
train_line_numbers_one_hot = tf.one_hot(train_df["line_number"].to_numpy(), depth=15)
|
val_line_numbers_one_hot = tf.one_hot(val_df["line_number"].to_numpy(), depth=15) |
|
test_line_numbers_one_hot = tf.one_hot(test_df["line_number"].to_numpy(), depth=15) |
|
train_line_numbers_one_hot[:10], train_line_numbers_one_hot.shape |
|
train_line_numbers_one_hot[0].shape |
|
train_line_numbers_one_hot[0].dtype |
|
|
|
|
|
train_df["total_lines"].value_counts() |
|
|
|
|
|
train_df["total_lines"].plot.hist() |
|
|
|
|
|
np.percentile(train_df["total_lines"], 98) |
|
|
|
|
|
# depth=20: roughly the 98th percentile of total_lines computed above
train_total_lines_one_hot = tf.one_hot(train_df["total_lines"].to_numpy(), depth=20)
|
val_total_lines_one_hot = tf.one_hot(val_df["total_lines"].to_numpy(), depth=20) |
|
test_total_lines_one_hot = tf.one_hot(test_df["total_lines"].to_numpy(), depth=20) |
|
train_total_lines_one_hot[:10], train_total_lines_one_hot.shape |
|
train_total_lines_one_hot[0].shape |
|
train_total_lines_one_hot[0].dtype |
|
|
|
""" |
|
Building a tribrid embedding model |
|
|
|
1. Create a token-level model |
|
2. Create a character-level model |
|
3. Create a model for the "line_number" feature |
|
4. Create a model for the "total_lines" feature |
|
5. Combine the outputs of 1 & 2 using tf.keras.layers.Concatenate |
|
6. Combine the outputs of 3,4,5 using tf.keras.layers.Concatenate |
|
7. Create an output layer to accept the tribrid embedding and output label probabilities. |
|
8. Combine the inputs of 1, 2, 3, 4 and the output of 7 into a tf.keras.Model
|
""" |
|
|
|
token_inputs = layers.Input(shape=[], dtype="string", name="token_inputs") |
|
token_embeddings = tf_hub_embedding_layer(token_inputs) |
|
token_outputs = layers.Dense(128, activation="relu")(token_embeddings) |
|
token_model = tf.keras.Model(inputs=token_inputs, outputs=token_outputs) |
|
|
|
|
|
char_inputs = layers.Input(shape=(1,), dtype="string", name="char_inputs") |
|
char_vectors = char_vectorizer(char_inputs) |
|
char_embeddings = char_embed(char_vectors) |
|
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings) |
|
char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm) |
|
|
|
|
|
line_number_inputs = layers.Input(shape=(15,), dtype=tf.float32, name="line_number_input") |
|
x = layers.Dense(32, activation="relu")(line_number_inputs) |
|
line_number_model = tf.keras.Model(inputs=line_number_inputs, outputs=x) |
|
|
|
|
|
total_lines_inputs = layers.Input(shape=(20,), dtype=tf.float32, name="total_lines_input") |
|
y = layers.Dense(32, activation="relu")(total_lines_inputs) |
|
total_lines_model = tf.keras.Model(inputs=total_lines_inputs, outputs=y) |
|
|
|
|
|
combined_embeddings = layers.Concatenate(name="char_token_hybrid_embedding")([token_model.output, char_model.output]) |
|
z = layers.Dense(256, activation="relu")(combined_embeddings) |
|
z = layers.Dropout(0.5)(z) |
|
|
|
|
|
tribrid_embeddings = layers.Concatenate(name="char_token_positional_embedding")([line_number_model.output, total_lines_model.output, z]) |
|
|
|
|
|
output_layer = layers.Dense(num_classes, activation="softmax", name="output_layer")(tribrid_embeddings) |
|
|
|
|
|
model_5 = tf.keras.Model(inputs=[line_number_model.input, |
|
total_lines_model.input, |
|
token_model.input, |
|
char_model.input], outputs=output_layer, name="model_5_tribrid_embedding_model") |
|
|
|
|
|
model_5.summary() |
|
|
|
from tensorflow.keras.utils import plot_model |
|
plot_model(model_5, show_shapes=True) |
|
|
|
|
|
|
|
# Label smoothing spreads a small amount of probability mass across classes to reduce overconfident predictions
model_5.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
|
optimizer=tf.keras.optimizers.Adam(), |
|
metrics=["accuracy"]) |
|
|
|
""" |
|
Create tribrid embeddings datasets using tf.data |
|
""" |
|
|
|
|
|
train_char_token_pos_data = tf.data.Dataset.from_tensor_slices((train_line_numbers_one_hot, |
|
train_total_lines_one_hot, |
|
train_sentences, |
|
train_chars)) |
|
train_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot) |
|
train_char_token_pos_dataset = tf.data.Dataset.zip((train_char_token_pos_data, train_char_token_pos_labels)) |
|
|
|
train_char_token_pos_dataset = train_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE) |
|
|
|
|
|
|
|
val_char_token_pos_data = tf.data.Dataset.from_tensor_slices((val_line_numbers_one_hot, |
|
val_total_lines_one_hot, |
|
val_sentences, |
|
val_chars)) |
|
val_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot) |
|
val_char_token_pos_dataset = tf.data.Dataset.zip((val_char_token_pos_data, val_char_token_pos_labels)) |
|
|
|
val_char_token_pos_dataset = val_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE) |
|
|
|
|
|
train_char_token_pos_dataset, val_char_token_pos_dataset |
|
|
|
|
|
with tf.device('/CPU:0'): |
|
history_model_5 = model_5.fit(train_char_token_pos_dataset, |
|
epochs=3, |
|
steps_per_epoch=int(0.1 * len(train_char_token_pos_dataset)), |
|
validation_data=val_char_token_pos_dataset, |
|
validation_steps=int(0.1 * len(val_char_token_pos_dataset))) |
|
|
|
with tf.device('/CPU:0'): |
|
|
|
model_5.evaluate(val_char_token_pos_dataset) |
|
|
|
|
|
model_5_pred_probs = model_5.predict(val_char_token_pos_dataset) |
|
model_5_pred_probs, model_5_pred_probs.shape |
|
|
|
|
|
model_5_preds = tf.argmax(model_5_pred_probs, axis=1) |
|
model_5_preds |
|
|
|
model_5_preds |
|
class_names[model_5_preds] |
|
|
|
|
|
model_5_results = calculate_results(y_true=val_labels_encoded, y_pred=model_5_preds) |
|
model_5_results |
|
|
|
""" |
|
Compare model results |
|
""" |
|
|
|
|
|
all_model_results = pd.DataFrame({"model_0_baseline": baseline_results, |
|
"model_1_custom_token_embedding": model_1_results, |
|
"model_2_pretrained_token_embedding": model_2_results, |
|
"model_3_custom_char_embedding": model_3_results, |
|
"model_4_hybrid_char_token_embedding": model_4_results, |
|
"model_5_pos_char_token_embedding": model_5_results}) |
|
|
|
all_model_results = all_model_results.transpose() |
|
all_model_results |
|
|
|
|
|
# calculate_results returns accuracy as a percentage; scale it to [0, 1] to match the other metrics
all_model_results["accuracy"] = all_model_results["accuracy"] / 100
|
|
|
all_model_results |
|
|
|
|
|
all_model_results.plot(kind="bar", figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0)) |
|
|
|
|
|
all_model_results.sort_values("f1", ascending=True)["f1"].plot(kind="bar", figsize=(10, 7)) |
|
|
|
""" |
|
Save and load model |
|
""" |
|
|
|
# Saves in the TensorFlow SavedModel format (a directory); with Keras 3 a ".keras" or ".h5" filename may be required instead
model_5.save("skimlit_tribrid_model_me")
|
|
|
|
|
from tensorflow.keras.models import load_model
|
with tf.device('/CPU:0'): |
|
loaded_model = load_model("skimlit_tribrid_model_me") |
|
|
|
|
|
with tf.device('/CPU:0'): |
|
loaded_pred_probs = loaded_model.predict(val_char_token_pos_dataset) |
|
loaded_pred_probs, loaded_pred_probs.shape |
|
|
|
|
|
loaded_preds = tf.argmax(loaded_pred_probs, axis=1) |
|
loaded_preds |
|
|
|
loaded_preds[:10] |
|
class_names[loaded_preds] |
|
|
|
|
|
loaded_model_results = calculate_results(y_true=val_labels_encoded, y_pred=loaded_preds) |
|
loaded_model_results |
|
|
|
# Exact equality on float metrics can be brittle; np.isclose on the values is a safer check
assert np.all(np.isclose(list(model_5_results.values()), list(loaded_model_results.values())))
|
|
|
|
|
loaded_model.summary() |
|
|
|
|
|
""" |
|
Optional: instead of the model saved above, you can load a previously trained version of the model (e.g. downloaded from Google Drive).
|
""" |
|
import tensorflow as tf |
|
import tensorflow_hub as hub |
|
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization |
|
|
|
import os |
|
url = "https://drive.google.com/file/d/1DYr3Ew9tU6dph_fI0JeTZ6GbdzZpWr8K/view?usp=sharing"

# NOTE: the SavedModel at `url` needs to be downloaded and extracted to ./skimlit_tribrid_model
# (e.g. manually or with a tool such as gdown) before load_model() below will work.
loaded_gs_model = load_model("skimlit_tribrid_model")
|
|
|
|
|
loaded_gs_model.evaluate(val_char_token_pos_dataset)

loaded_pred_probs = loaded_gs_model.predict(val_char_token_pos_dataset)
loaded_preds = tf.argmax(loaded_pred_probs, axis=1)
loaded_preds[:10]
|
|
|
|
|
loaded_model_results = calculate_results(val_labels_encoded, loaded_preds) |
|
loaded_model_results |
|
|
|
|
|
loaded_model.summary() |
|
|
|
|
|
test_pos_char_token_data = tf.data.Dataset.from_tensor_slices((test_line_numbers_one_hot, test_total_lines_one_hot, test_sentences, test_chars)) |
|
test_pos_char_token_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot) |
|
|
|
test_pos_char_token_dataset = tf.data.Dataset.zip((test_pos_char_token_data, test_pos_char_token_labels)) |
|
|
|
test_pos_char_token_dataset = test_pos_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE) |
|
|
|
|
|
with tf.device('/CPU:0'): |
|
test_pred_probs = loaded_model.predict(test_pos_char_token_dataset, verbose=1) |
|
test_preds = tf.argmax(test_pred_probs, axis=1) |
|
test_preds[:10] |
|
|
|
|
|
loaded_model_test_results = calculate_results(y_true=test_labels_encoded, y_pred=test_preds) |
|
loaded_model_test_results |
|
|
|
|
|
test_pred_classes = [label_encoder.classes_[pred] for pred in test_preds] |
|
test_pred_classes |
|
|
|
|
|
|
|
|
|
test_df["prediction"] = test_pred_classes |
|
|
|
|
|
test_df["pred_prob"] = tf.reduce_max(test_pred_probs, axis=1).numpy() |
|
|
|
|
|
|
|
test_df["correct"] = test_df["prediction"] == test_df["target"] |
|
|
|
|
|
test_df.head(20) |
|
|
|
|
|
top_100_wrong = test_df[test_df["correct"] == False].sort_values("pred_prob", ascending=False)[:100] |
|
top_100_wrong |
|
|
|
|
|
for row in top_100_wrong[0:10].itertuples(): |
|
|
|
_, target, text, line_number, total_lines, prediction, pred_prob, _ = row |
|
|
|
|
|
print(f"Target: {target}, Pred: {prediction}, Prob: {pred_prob}, Line number: {line_number}, Total lines: {total_lines}\n") |
|
|
|
|
|
print(f"Text:\n{text}\n") |
|
|
|
|
|
print("-----------------------------------------------------------------------\n") |
|
|
|
|
|
import json |
|
import requests |
|
|
|
|
|
# NOTE: this is the GitHub page (blob) URL; json.loads() below likely needs the raw file instead
# (e.g. the raw.githubusercontent.com version of this path, or the blob URL with "?raw=true" appended).
url = "https://github.com/Dhrumit1314/Skimlit_NLP/blob/main/abstract_data.json"

response = requests.get(url)
|
|
|
|
|
if response.status_code == 200: |
|
|
|
example_abstracts = json.loads(response.text) |
|
print("Example abstracts loaded successfully.") |
|
else: |
|
print(f"Failed to download example abstracts. Status code: {response.status_code}") |
|
|
|
|
|
abstracts = pd.DataFrame(example_abstracts) |
|
abstracts |
|
|
|
|
|
from spacy.lang.en import English |
|
|
|
|
|
nlp = English() |
|
|
|
|
|
sentencizer = nlp.add_pipe("sentencizer")  # add a rule-based sentence splitter (spaCy 3.x API)
|
|
|
|
|
example_abstract = example_abstracts[0]["abstract"] |
|
example_abstract |
|
|
|
|
|
doc = nlp(example_abstract) |
|
doc |
|
|
|
|
|
abstract_lines = [str(sent) for sent in list(doc.sents)] |
|
|
|
abstract_lines |
|
|
|
|
|
total_lines_in_sample = len(abstract_lines) |
|
|
|
|
|
sample_lines = [] |
|
|
|
|
|
for i, line in enumerate(abstract_lines): |
|
|
|
sample_dict = {} |
|
|
|
|
|
sample_dict["text"] = str(line) |
|
|
|
|
|
sample_dict["line_number"] = i |
|
|
|
|
|
sample_dict["total_lines"] = total_lines_in_sample - 1 |
|
|
|
|
|
sample_lines.append(sample_dict) |
|
|
|
|
|
sample_lines |
|
|
|
|
|
test_abstract_line_numbers = [line["line_number"] for line in sample_lines] |
|
|
|
|
|
test_abstract_line_numbers_one_hot = tf.one_hot(test_abstract_line_numbers, depth=15) |
|
|
|
|
|
test_abstract_line_numbers_one_hot |
|
|
|
|
|
test_abstract_total_lines = [line["total_lines"] for line in sample_lines] |
|
|
|
|
|
test_abstract_total_lines_one_hot = tf.one_hot(test_abstract_total_lines, depth=20) |
|
test_abstract_total_lines_one_hot |
|
|
|
|
|
abstract_chars = [split_chars(sentence) for sentence in abstract_lines] |
|
abstract_chars |
|
|
|
import tensorflow as tf |
|
import time |
|
|
|
|
|
line_numbers_depth = 15  # must match the one-hot depth used during training
total_lines_depth = 20   # must match the one-hot depth used during training
|
|
|
|
|
test_abstract_line_numbers_one_hot = tf.one_hot(test_abstract_line_numbers, depth=line_numbers_depth) |
|
test_abstract_total_lines_one_hot = tf.one_hot(test_abstract_total_lines, depth=total_lines_depth) |
|
test_abstract_abstract_lines = tf.constant(abstract_lines) |
|
test_abstract_abstract_chars = tf.constant(abstract_chars) |
|
|
|
|
|
start_time = time.time() |
|
|
|
with tf.device('/CPU:0'): |
|
|
|
test_abstract_pred_probs = model_5.predict(x=(test_abstract_line_numbers_one_hot, test_abstract_total_lines_one_hot, tf.constant(abstract_lines), tf.constant(abstract_chars))) |
|
|
|
end_time = time.time() |
|
|
|
|
|
|
|
print("Prediction Probabilities:", test_abstract_pred_probs) |
|
|
|
|
|
print("Time taken for predictions: {:.2f} seconds".format(end_time - start_time)) |
|
|
|
|
|
test_abstract_preds = tf.argmax(test_abstract_pred_probs, axis=1) |
|
test_abstract_preds |
|
|
|
|
|
test_abstract_pred_classes = [label_encoder.classes_[i] for i in test_abstract_preds] |
|
test_abstract_pred_classes |
|
|
|
|
|
for i, line in enumerate(abstract_lines): |
|
print(f"{test_abstract_pred_classes[i]}: {line}") |