""" |
Created on Thu Feb 8 20:22:57 2024 |
@author: Dhrumit Patel |
""" |
""" |
Milestone Project 2: SkimLit |
The purpose is to build an NLP model to make reading medical abstracts easier. |
""" |
""" |
Get the data |
Since we will be replicating the paper (PubMed 200K RCT), let's download the dataset they used. |
We can do so from the authors' GitHub repository:
git clone https://github.com/Franck-Dernoncourt/pubmed-rct |
dir pubmed-rct |
# Check what files are in the PubMed_20K dataset |
cd pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign |
dir |
Contains 3 files dev.txt, test.txt, train.txt |
""" |
import os

# Folder holding the PubMed 20k RCT split files, relative to the working directory.
data_dir = "pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

# data_dir already ends with "/", so each entry name is appended directly.
filenames = [f"{data_dir}{entry}" for entry in os.listdir(data_dir)]
filenames
""" |
Preprocess the data |
""" |
def get_lines(filename):
    """
    Read a text file and return its lines as a list.

    Args:
        filename (str): a string containing the target filepath.

    Returns:
        list[str]: one string per line of the file, each keeping its
        trailing newline (as produced by ``readlines``).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(filename, "r", encoding="utf-8") as f:
        return f.readlines()
# Raw lines of the training split, kept for quick interactive inspection.
train_lines = get_lines(data_dir + "train.txt")
train_lines[0:20]
len(train_lines)
def _abstract_to_samples(abstract_line_split):
    """Turn the "<label>\\t<sentence>" lines of one abstract into sample dicts."""
    last_index = len(abstract_line_split) - 1
    samples = []
    for abstract_line_number, abstract_line in enumerate(abstract_line_split):
        # Split on the first tab only, so a tab inside the sentence does not
        # truncate the text.
        target_text_split = abstract_line.split("\t", 1)
        samples.append({
            "target": target_text_split[0],
            "text": target_text_split[1].lower(),
            "line_number": abstract_line_number,
            "total_lines": last_index,
        })
    return samples


def preprocess_text_with_line_numbers(filename):
    """
    Return a list of dictionaries of abstract line data.

    Reads ``filename`` and walks through its lines. A line starting with "###"
    or a whitespace-only line ends the abstract collected so far; every other
    line is expected to look like "<label>\\t<sentence>".

    Args:
        filename (str): path of the text file to parse.

    Returns:
        list[dict]: one dictionary per sentence with the keys
            "target": the label before the tab,
            "text": the lower-cased sentence after the tab,
            "line_number": 0-based position of the sentence in its abstract,
            "total_lines": number of sentences in the abstract minus one
            (i.e. the index of the last sentence).

    Raises:
        IndexError: if a sentence line does not contain a tab character.
    """
    input_lines = get_lines(filename)
    abstract_samples = []
    pending = []  # sentence lines of the abstract currently being read

    for line in input_lines:
        if line.startswith("###") or line.isspace():
            # Abstract boundary: emit what was collected and clear the buffer,
            # so repeated blank lines cannot emit the same abstract twice.
            if pending:
                abstract_samples.extend(
                    _abstract_to_samples("".join(pending).splitlines())
                )
                pending = []
        else:
            pending.append(line)

    # A file that does not end with a blank line still yields its last abstract.
    if pending:
        abstract_samples.extend(_abstract_to_samples("".join(pending).splitlines()))

    return abstract_samples
# Parse each split file into a list of per-sentence dictionaries.
train_samples, val_samples, test_samples = (
    preprocess_text_with_line_numbers(filename=data_dir + split_file)
    for split_file in ("train.txt", "dev.txt", "test.txt")
)
len(train_samples), len(val_samples), len(test_samples)
train_samples[0:14]
""" |
Now that our data is in the format of a list of dictionaries, how about
we turn it into a DataFrame to visualize it further?
""" |
import pandas as pd

# One DataFrame per split; each row is one sentence dictionary.
train_df = pd.DataFrame(data=train_samples)
val_df = pd.DataFrame(data=val_samples)
test_df = pd.DataFrame(data=test_samples)
train_df.head(14)

# Label distribution and distribution of abstract lengths in the training split.
train_df["target"].value_counts()
train_df["total_lines"].plot.hist()
""" |
Get list of sentences |
""" |
# Pull the "text" column of every split out as a plain Python list.
train_sentences, val_sentences, test_sentences = (
    split_df["text"].tolist() for split_df in (train_df, val_df, test_df)
)
len(train_sentences), len(val_sentences), len(test_sentences)
train_sentences[0:10]
""" |
Making numeric labels (ML models require numeric labels) |
""" |
from sklearn.preprocessing import OneHotEncoder

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old name
# was removed in 1.4. Try the current keyword first and fall back for older
# releases so the script runs on both.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

# The encoder is fitted on the training labels only; validation and test labels
# are transformed with that fitted encoder. reshape(-1, 1) turns the label
# vector into the single-column 2D array the encoder expects.
train_labels_one_hot = one_hot_encoder.fit_transform(train_df["target"].to_numpy().reshape(-1, 1))
val_labels_one_hot = one_hot_encoder.transform(val_df["target"].to_numpy().reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.transform(test_df["target"].to_numpy().reshape(-1, 1))
train_labels_one_hot, val_labels_one_hot, test_labels_one_hot
""" |
Label encode labels |
""" |
from sklearn.preprocessing import LabelEncoder

# Integer-encode the string labels; the mapping is learned from the training split.
label_encoder = LabelEncoder()
label_encoder.fit(train_df["target"].to_numpy())
train_labels_encoded = label_encoder.transform(train_df["target"].to_numpy())
val_labels_encoded = label_encoder.transform(val_df["target"].to_numpy())
test_labels_encoded = label_encoder.transform(test_df["target"].to_numpy())
train_labels_encoded, val_labels_encoded, test_labels_encoded

# Class names in encoded order, and how many there are.
class_names = label_encoder.classes_
num_classes = len(label_encoder.classes_)
num_classes, class_names
""" |
Starting a series of Modelling experiments |
""" |
""" |
Model 0: Getting a baseline model (TF-IDF Multinomial Naive Bayes Classifier) |
""" |
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier.
model_0 = Pipeline(steps=[
    ("tf-idf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])
model_0.fit(X=train_sentences, y=train_labels_encoded)
model_0.score(X=val_sentences, y=val_labels_encoded)
baseline_preds = model_0.predict(X=val_sentences)
baseline_preds
"""
For classification evaluation metrics (accuracy, precision, recall, f1-score)
"""
from helper_functions import calculate_results

baseline_results = calculate_results(y_true=val_labels_encoded,
                                     y_pred=baseline_preds)
baseline_results
train_sentences[0:10]
""" |
Preparing our data (the text) for deep sequence model |
Before we start building deeper models, we have to create vectorization and embedding layers.
""" |
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Number of whitespace-separated tokens in every training sentence.
sent_lens = [len(tokens) for tokens in map(str.split, train_sentences)]
avg_sent_len = np.mean(sent_lens)
avg_sent_len
plt.hist(sent_lens, bins=20)

# Sequence length that covers 95% of the training sentences.
output_seq_length = int(np.percentile(sent_lens, q=95))
output_seq_length
max(sent_lens)
""" |
Create a TextVectorizer layer |
We want to make a layer which maps our texts from words to numbers |
""" |
# Upper bound on the vocabulary size kept by the text vectorizer.
max_tokens = 68000

# Prefer the stable import path; the `experimental.preprocessing` location is a
# deprecated alias that newer TensorFlow releases no longer provide.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Maps each sentence to a fixed-length sequence of integer token ids.
text_vectorizer = TextVectorization(max_tokens=max_tokens,
                                    output_sequence_length=output_seq_length)
text_vectorizer.adapt(train_sentences)

rct_20k_text_vocab = text_vectorizer.get_vocabulary()
print(f"Number of words in vocab: {len(rct_20k_text_vocab)}")
print(f"Most common words in the vocab: {rct_20k_text_vocab[:5]}")
print(f"Least common words in the vocab: {rct_20k_text_vocab[-5:]}")
text_vectorizer.get_config()

# Keep `layers` bound to tf.keras (the API every model below is built with)
# rather than rebinding it to the standalone `keras` package.
from tensorflow.keras import layers
"""
Create a custom text embedding layer
"""
# One 128-dimensional vector per vocabulary entry; mask_zero=True makes the
# layer treat token id 0 as padding.
token_embed = layers.Embedding(input_dim=len(rct_20k_text_vocab),
                               output_dim=128,
                               mask_zero=True,
                               name="token_embedding")
""" |
Creating datasets (making sure our data loads as fast as possible) |
We are going to set up our data to run as fast as possible with the TensorFlow tf.data API.
""" |
# Pair every sentence with its one-hot label, one dataset per split.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_sentences, train_labels_one_hot))
valid_dataset = tf.data.Dataset.from_tensor_slices(
    (val_sentences, val_labels_one_hot))
test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_sentences, test_labels_one_hot))
train_dataset

# Batches of 32 with prefetching so the next batch is prepared during training.
train_dataset = train_dataset.batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)
valid_dataset = valid_dataset.batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)
train_dataset, len(train_dataset)
""" |
Model 1: Conv1D with token embeddings |
""" |
# Graph: raw string -> token ids -> token embeddings -> Conv1D -> average pool -> softmax.
inputs = layers.Input(shape=(1,), dtype=tf.string)
text_vectors = text_vectorizer(inputs)
token_embeddings = token_embed(text_vectors)
x = layers.Conv1D(filters=64, kernel_size=5, padding="same", activation="relu")(token_embeddings)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(units=num_classes, activation="softmax")(x)
model_1 = tf.keras.Model(inputs=inputs, outputs=outputs)

model_1.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model_1.summary()

# Each epoch runs on 10% of the training batches and validates on 10% of the
# validation batches.
history_model_1 = model_1.fit(
    train_dataset,
    epochs=3,
    steps_per_epoch=int(0.1 * len(train_dataset)),
    validation_data=valid_dataset,
    validation_steps=int(0.1 * len(valid_dataset)),
)
model_1.evaluate(valid_dataset)

# Class probabilities on the full validation set, reduced to predicted class indices.
model_1_pred_probs = model_1.predict(valid_dataset)
model_1_pred_probs, model_1_pred_probs.shape
model_1_preds = tf.argmax(model_1_pred_probs, axis=1)
model_1_preds
class_names
class_names[model_1_preds]
model_1_results = calculate_results(y_true=val_labels_encoded,
                                    y_pred=model_1_preds)
model_1_results
""" |
Model 2: Feature extraction with pretrained token embeddings |
Now let's use pretrained word embeddings from TensorFlow Hub,
more specifically the Universal Sentence Encoder (USE).
The paper originally used GloVe embeddings; however, we are going to stick with the
later-created USE pretrained embeddings.
""" |
import tensorflow_hub as hub

# Pretrained Universal Sentence Encoder wrapped as a Keras layer; its weights
# are frozen (trainable=False).
tf_hub_embedding_layer = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    trainable=False,
    name="universal_sentence_encoder",
)
""" |
Building and fitting an NLP feature extraction model using pretrained embeddings from TensorFlow Hub
""" |
# Graph: raw string -> frozen USE sentence embedding -> Dense(128) -> softmax.
inputs = layers.Input(shape=[], dtype=tf.string)
pretrained_embedding = tf_hub_embedding_layer(inputs)
x = layers.Dense(units=128, activation="relu")(pretrained_embedding)
outputs = layers.Dense(units=num_classes, activation="softmax")(x)
model_2 = tf.keras.Model(inputs=inputs,
                         outputs=outputs,
                         name="model_2_USE_feature_extractor")

model_2.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model_2.summary()

# Training, evaluation and prediction for this model are all pinned to the CPU.
with tf.device('/CPU:0'):
    history_model_2 = model_2.fit(
        train_dataset,
        epochs=3,
        steps_per_epoch=int(0.1 * len(train_dataset)),
        validation_data=valid_dataset,
        validation_steps=int(0.1 * len(valid_dataset)),
    )

with tf.device('/CPU:0'):
    model_2.evaluate(valid_dataset)
    model_2_pred_probs = model_2.predict(valid_dataset)
model_2_pred_probs, model_2_pred_probs.shape

# Most probable class index for every validation sentence.
model_2_preds = tf.argmax(model_2_pred_probs, axis=1)
model_2_preds
class_names[model_2_preds]
model_2_results = calculate_results(y_true=val_labels_encoded,
                                    y_pred=model_2_preds)
model_2_results
""" |
Model 3: Conv1D with character embeddings |
The paper we are replicating states that they used a combination of token- and character-level embeddings.
Previously we built token-level embeddings; we need to follow similar steps for characters if we want to use character-level embeddings.
""" |
""" |
Creating a character-level tokenizer
""" |
# Peek at the first few training sentences before splitting them into characters.
train_sentences[0:5]
def split_chars(text):
    """
    Return ``text`` with a single space inserted between consecutive characters.

    Args:
        text (str): the sentence to split into characters.

    Returns:
        str: the characters of ``text`` joined by single spaces, e.g.
        "abc" -> "a b c". An empty string is returned unchanged.
    """
    # str.join iterates the string character by character, so no intermediate
    # list is needed.
    return " ".join(text)
# Character-level (space-separated) version of every sentence in each split.
train_chars = list(map(split_chars, train_sentences))
val_chars = list(map(split_chars, val_sentences))
test_chars = list(map(split_chars, test_sentences))
train_chars, val_chars, test_chars

# Character count of every training sentence (measured on the unsplit text).
char_lens = [len(sentence) for sentence in train_sentences]
mean_char_len = np.mean(char_lens)
mean_char_len
import matplotlib.pyplot as plt
plt.hist(char_lens, bins=7)

# Character sequence length that covers 95% of the training sentences.
output_seq_char_len = int(np.percentile(char_lens, q=95))
output_seq_char_len

import string
alphabet = string.ascii_lowercase + string.digits + string.punctuation
alphabet
len(alphabet)

# Alphabet size plus two extra slots.
# NOTE(review): the +2 presumably reserves the padding and out-of-vocabulary
# tokens - confirm.
NUM_CHAR_TOKENS = len(alphabet) + 2
char_vectorizer = TextVectorization(
    max_tokens=NUM_CHAR_TOKENS,
    output_sequence_length=output_seq_char_len,
    standardize="lower_and_strip_punctuation",
    name="char_vectorizer",
)
char_vectorizer.adapt(train_chars)

char_vocab = char_vectorizer.get_vocabulary()
print(f"Number of different characters in character vocab: {len(char_vocab)}")
print(f"5 most common character: {char_vocab[:5]}")
print(f"5 least common characters: {char_vocab[-5:]}")
""" |
Creating a character-level embedding |
""" |
# One 25-dimensional vector per character-vocabulary entry; id 0 is masked as padding.
char_embed = layers.Embedding(
    input_dim=len(char_vocab),
    output_dim=25,
    mask_zero=True,
    name="char_embed",
)
"""
Model 3: Building a Conv1D model to fit on character embeddings
"""
# Graph: raw string -> character ids -> char embeddings -> Conv1D -> max pool -> softmax.
inputs = layers.Input(shape=(1,), dtype="string")
char_vectors = char_vectorizer(inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Conv1D(filters=64, kernel_size=5, padding="same", activation="relu")(char_embeddings)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(units=num_classes, activation="softmax")(x)
model_3 = tf.keras.Model(inputs=inputs,
                         outputs=outputs,
                         name="model_3_conv1d_char_embeddings")

model_3.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model_3.summary()

# Character-level datasets: (space-separated characters, one-hot label), batched by 32.
train_char_dataset = (
    tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
val_char_dataset = (
    tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
test_char_dataset = (
    tf.data.Dataset.from_tensor_slices((test_chars, test_labels_one_hot))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
train_char_dataset, val_char_dataset, test_char_dataset

# Each epoch runs on 10% of the training batches and validates on 10% of the
# validation batches.
model_3_history = model_3.fit(
    train_char_dataset,
    epochs=3,
    steps_per_epoch=int(0.1 * len(train_char_dataset)),
    validation_data=val_char_dataset,
    validation_steps=int(0.1 * len(val_char_dataset)),
)
model_3.evaluate(val_char_dataset)

model_3_pred_probs = model_3.predict(val_char_dataset)
model_3_pred_probs, model_3_pred_probs.shape
model_3_preds = tf.argmax(model_3_pred_probs, axis=1)
model_3_preds
class_names[model_3_preds]
model_3_results = calculate_results(y_true=val_labels_encoded,
                                    y_pred=model_3_preds)
model_3_results
baseline_results
""" |
Model 4: Combining pretrained token embeddings + characters embeddings (hybrid embedding layer) |
1. Create a token level embedding model (similar to model_1) |
2. Create a character level model (similar to model_3 with a slight modification) |
3. Combine 1 & 2 with a concatenate (layers.Concatenate) |
4. Build a series of output layers on top of point 3.
5. Construct a model which takes token and character level sequences as input and produces sequence label probabilities as output. |
""" |
# 1. Token branch: raw sentence -> frozen USE embedding -> Dense(128).
token_inputs = layers.Input(shape=[], dtype=tf.string, name="token_inputs")
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_outputs = layers.Dense(128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(inputs=token_inputs, outputs=token_outputs)

# 2. Character branch: space-separated characters -> ids -> embeddings -> BiLSTM.
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_input")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm)

# 3. Concatenate the outputs of both branches.
token_char_concat = layers.Concatenate(name="token_char_hybrid")(
    [token_model.output, char_model.output])

# 4. Output head: dropout -> Dense(128) -> dropout -> softmax over the classes.
combined_dropout = layers.Dropout(0.5)(token_char_concat)
combined_dense = layers.Dense(128, activation="relu")(combined_dropout)
final_dropout = layers.Dropout(0.5)(combined_dense)
output_layer = layers.Dense(num_classes, activation="softmax")(final_dropout)

# 5. The full model takes (token input, character input), in that order.
model_4 = tf.keras.Model(inputs=[token_model.input, char_model.input],
                         outputs=output_layer,
                         name="model_4_token_and_char_embeddings")
model_4.summary()

# Import plot_model from tf.keras - the API the model was built with and the
# same import the tribrid model section uses - rather than standalone `keras`.
from tensorflow.keras.utils import plot_model
plot_model(model_4, show_shapes=True)

model_4.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
""" |
Combining token and character data into tf.data.Dataset |
""" |
# Features are (sentence, characters) pairs - the same order as model_4's inputs -
# zipped with the one-hot labels and batched by 32.
train_char_token_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars))
train_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_char_token_dataset = tf.data.Dataset.zip(
    (train_char_token_data, train_char_token_labels)
).batch(32).prefetch(tf.data.AUTOTUNE)

val_char_token_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars))
val_char_token_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_dataset = tf.data.Dataset.zip(
    (val_char_token_data, val_char_token_labels)
).batch(32).prefetch(tf.data.AUTOTUNE)
train_char_token_dataset, val_char_token_dataset

# Each epoch runs on 10% of the training batches and validates on 10% of the
# validation batches; the model is pinned to the CPU.
with tf.device('/CPU:0'):
    history_model_4 = model_4.fit(
        train_char_token_dataset,
        epochs=3,
        steps_per_epoch=int(0.1 * len(train_char_token_dataset)),
        validation_data=val_char_token_dataset,
        validation_steps=int(0.1 * len(val_char_token_dataset)),
    )

with tf.device('/CPU:0'):
    model_4.evaluate(val_char_token_dataset)
    model_4_pred_probs = model_4.predict(val_char_token_dataset)
model_4_pred_probs, model_4_pred_probs.shape

# Most probable class index for every validation sentence.
model_4_preds = tf.argmax(model_4_pred_probs, axis=1)
model_4_preds
class_names[model_4_preds]
model_4_results = calculate_results(y_true=val_labels_encoded,
                                    y_pred=model_4_preds)
model_4_results
""" |
Model 5: Transfer learning with pretrained token embeddings + character embeddings + |
positional embeddings |
""" |
train_df.head()
"""
Create positional embeddings
"""
train_df["line_number"].value_counts()
train_df["line_number"].plot.hist()

# One-hot encode the sentence position with 15 slots; tf.one_hot yields an
# all-zero row for any index outside [0, depth).
train_line_numbers_one_hot, val_line_numbers_one_hot, test_line_numbers_one_hot = (
    tf.one_hot(split_df["line_number"].to_numpy(), depth=15)
    for split_df in (train_df, val_df, test_df)
)
train_line_numbers_one_hot[:10], train_line_numbers_one_hot.shape
train_line_numbers_one_hot[0].shape
train_line_numbers_one_hot[0].dtype

train_df["total_lines"].value_counts()
train_df["total_lines"].plot.hist()
np.percentile(train_df["total_lines"], 98)

# One-hot encode the abstract length feature with 20 slots.
train_total_lines_one_hot, val_total_lines_one_hot, test_total_lines_one_hot = (
    tf.one_hot(split_df["total_lines"].to_numpy(), depth=20)
    for split_df in (train_df, val_df, test_df)
)
train_total_lines_one_hot[:10], train_total_lines_one_hot.shape
train_total_lines_one_hot[0].shape
train_total_lines_one_hot[0].dtype
""" |
Building a tribrid embedding model |
1. Create a token-level model |
2. Create a character-level model |
3. Create a model for the "line_number" feature |
4. Create a model for the "total_lines" feature |
5. Combine the outputs of 1 & 2 using tf.keras.layers.Concatenate |
6. Combine the outputs of 3,4,5 using tf.keras.layers.Concatenate |
7. Create an output layer to accept the tribrid embedding and output label probabilities. |
8. Combine the inputs of 1,2,3,4 and outputs of 7 into tf.keras.Model |
""" |
# 1. Token branch: raw sentence -> frozen USE embedding -> Dense(128).
token_inputs = layers.Input(shape=[], dtype="string", name="token_inputs")
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_outputs = layers.Dense(units=128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(inputs=token_inputs, outputs=token_outputs)

# 2. Character branch: space-separated characters -> ids -> embeddings -> BiLSTM.
char_inputs = layers.Input(shape=(1,), dtype="string", name="char_inputs")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(units=24))(char_embeddings)
char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm)

# 3. "line_number" branch: 15-wide one-hot vector -> Dense(32).
line_number_inputs = layers.Input(shape=(15,), dtype=tf.float32, name="line_number_input")
x = layers.Dense(units=32, activation="relu")(line_number_inputs)
line_number_model = tf.keras.Model(inputs=line_number_inputs, outputs=x)

# 4. "total_lines" branch: 20-wide one-hot vector -> Dense(32).
total_lines_inputs = layers.Input(shape=(20,), dtype=tf.float32, name="total_lines_input")
y = layers.Dense(units=32, activation="relu")(total_lines_inputs)
total_lines_model = tf.keras.Model(inputs=total_lines_inputs, outputs=y)

# 5. Hybrid token + character embedding, followed by Dense(256) and dropout.
combined_embeddings = layers.Concatenate(name="char_token_hybrid_embedding")(
    [token_model.output, char_model.output])
z = layers.Dense(units=256, activation="relu")(combined_embeddings)
z = layers.Dropout(rate=0.5)(z)

# 6. Join both positional branches with the hybrid embedding.
tribrid_embeddings = layers.Concatenate(name="char_token_positional_embedding")(
    [line_number_model.output, total_lines_model.output, z])

# 7. Softmax over the classes.
output_layer = layers.Dense(units=num_classes,
                            activation="softmax",
                            name="output_layer")(tribrid_embeddings)

# 8. Inputs are ordered (line_number, total_lines, token, char).
model_5 = tf.keras.Model(
    inputs=[
        line_number_model.input,
        total_lines_model.input,
        token_model.input,
        char_model.input,
    ],
    outputs=output_layer,
    name="model_5_tribrid_embedding_model",
)
model_5.summary()

from tensorflow.keras.utils import plot_model
plot_model(model_5, show_shapes=True)

# Categorical cross-entropy with label smoothing of 0.2.
model_5.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
    metrics=["accuracy"],
)
""" |
Create tribrid embeddings datasets using tf.data |
""" |
# Feature tuples follow model_5's input order:
# (line_number one-hot, total_lines one-hot, sentence, characters).
train_char_token_pos_data = tf.data.Dataset.from_tensor_slices((
    train_line_numbers_one_hot,
    train_total_lines_one_hot,
    train_sentences,
    train_chars,
))
train_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_char_token_pos_dataset = tf.data.Dataset.zip(
    (train_char_token_pos_data, train_char_token_pos_labels)
).batch(32).prefetch(tf.data.AUTOTUNE)

val_char_token_pos_data = tf.data.Dataset.from_tensor_slices((
    val_line_numbers_one_hot,
    val_total_lines_one_hot,
    val_sentences,
    val_chars,
))
val_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_pos_dataset = tf.data.Dataset.zip(
    (val_char_token_pos_data, val_char_token_pos_labels)
).batch(32).prefetch(tf.data.AUTOTUNE)
train_char_token_pos_dataset, val_char_token_pos_dataset

# Each epoch runs on 10% of the training batches and validates on 10% of the
# validation batches; the model is pinned to the CPU.
with tf.device('/CPU:0'):
    history_model_5 = model_5.fit(
        train_char_token_pos_dataset,
        epochs=3,
        steps_per_epoch=int(0.1 * len(train_char_token_pos_dataset)),
        validation_data=val_char_token_pos_dataset,
        validation_steps=int(0.1 * len(val_char_token_pos_dataset)),
    )

with tf.device('/CPU:0'):
    model_5.evaluate(val_char_token_pos_dataset)
    model_5_pred_probs = model_5.predict(val_char_token_pos_dataset)
model_5_pred_probs, model_5_pred_probs.shape

# Most probable class index for every validation sentence.
model_5_preds = tf.argmax(model_5_pred_probs, axis=1)
model_5_preds
class_names[model_5_preds]
model_5_results = calculate_results(y_true=val_labels_encoded,
                                    y_pred=model_5_preds)
model_5_results
""" |
Compare model results |
""" |
# Built with one column per model, then transposed so every model becomes a row.
all_model_results = pd.DataFrame({
    "model_0_baseline": baseline_results,
    "model_1_custom_token_embedding": model_1_results,
    "model_2_pretrained_token_embedding": model_2_results,
    "model_3_custom_char_embedding": model_3_results,
    "model_4_hybrid_char_token_embedding": model_4_results,
    "model_5_pos_char_token_embedding": model_5_results,
})
all_model_results = all_model_results.T
all_model_results

# Divide accuracy by 100.
# NOTE(review): presumably calculate_results reports accuracy as a percentage
# while the other metrics are fractions - confirm against helper_functions.
all_model_results["accuracy"] = all_model_results["accuracy"] / 100
all_model_results

# Grouped bar chart of every metric, then the F1 scores alone in ascending order.
all_model_results.plot(kind="bar", figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0))
all_model_results.sort_values(by="f1", ascending=True)["f1"].plot(kind="bar", figsize=(10, 7))
""" |
Save and load model |
""" |
# Persist the trained tribrid model to a directory next to the script.
model_5.save("skimlit_tribrid_model_me")

# Load through tf.keras - the API model_5 was built and saved with - instead of
# the standalone `keras` package, so saving and loading cannot diverge when the
# two packages are at different versions.
from tensorflow.keras.models import load_model

with tf.device('/CPU:0'):
    loaded_model = load_model("skimlit_tribrid_model_me")

with tf.device('/CPU:0'):
    loaded_pred_probs = loaded_model.predict(val_char_token_pos_dataset)
loaded_pred_probs, loaded_pred_probs.shape

# Most probable class index for every validation sentence.
loaded_preds = tf.argmax(loaded_pred_probs, axis=1)
loaded_preds
loaded_preds[:10]
class_names[loaded_preds]
loaded_model_results = calculate_results(y_true=val_labels_encoded,
                                         y_pred=loaded_preds)
loaded_model_results

# Sanity check: the reloaded model must score exactly like the in-memory model.
assert model_5_results == loaded_model_results
loaded_model.summary()
""" |
Optional - for the loaded model you can use your own trained model |
""" |
import tensorflow as tf
import tensorflow_hub as hub
# Prefer the stable import path; the `experimental.preprocessing` location is a
# deprecated alias that newer TensorFlow releases no longer provide.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import os

# Shareable Google Drive link, kept for reference only: nothing in this section
# downloads from it.
url = "https://drive.google.com/file/d/1DYr3Ew9tU6dph_fI0JeTZ6GbdzZpWr8K/view?usp=sharing"

# The model directory is expected to already exist next to the script.
loaded_gs_model = load_model("skimlit_tribrid_model")
loaded_gs_model.evaluate(val_char_token_pos_dataset)

# Predict with the model loaded above, so the metrics below describe
# loaded_gs_model rather than predictions left over from an earlier model.
loaded_pred_probs = loaded_gs_model.predict(val_char_token_pos_dataset)
loaded_preds = tf.argmax(loaded_pred_probs, axis=1)
loaded_preds[:10]
loaded_model_results = calculate_results(val_labels_encoded, loaded_preds)
loaded_model_results
loaded_gs_model.summary()
# Test-set features in the tribrid model's input order, zipped with labels, batched by 32.
test_pos_char_token_data = tf.data.Dataset.from_tensor_slices((
    test_line_numbers_one_hot,
    test_total_lines_one_hot,
    test_sentences,
    test_chars,
))
test_pos_char_token_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_pos_char_token_dataset = tf.data.Dataset.zip(
    (test_pos_char_token_data, test_pos_char_token_labels)
).batch(32).prefetch(tf.data.AUTOTUNE)

with tf.device('/CPU:0'):
    test_pred_probs = loaded_model.predict(test_pos_char_token_dataset, verbose=1)
test_preds = tf.argmax(test_pred_probs, axis=1)
test_preds[:10]
loaded_model_test_results = calculate_results(y_true=test_labels_encoded,
                                              y_pred=test_preds)
loaded_model_test_results

# Attach the predicted class name, its probability and a correctness flag to test_df.
test_pred_classes = [label_encoder.classes_[pred] for pred in test_preds]
test_pred_classes
test_df["prediction"] = test_pred_classes
test_df["pred_prob"] = tf.reduce_max(test_pred_probs, axis=1).numpy()
test_df["correct"] = test_df["prediction"] == test_df["target"]
test_df.head(20)

# The 100 wrong predictions the model was most confident about.
wrong_predictions = test_df[test_df["correct"] == False]
top_100_wrong = wrong_predictions.sort_values("pred_prob", ascending=False)[:100]
top_100_wrong

# Print the ten most confident mistakes. Each row tuple is
# (index, target, text, line_number, total_lines, prediction, pred_prob, correct).
for row in top_100_wrong[0:10].itertuples():
    _, target, text, line_number, total_lines, prediction, pred_prob, _ = row
    print(f"Target: {target}, Pred: {prediction}, Prob: {pred_prob}, Line number: {line_number}, Total lines: {total_lines}\n")
    print(f"Text:\n{text}\n")
    print("-----------------------------------------------------------------------\n")
import json
import requests

# Fetch from the raw-content host: the github.com "/blob/" address serves an
# HTML page, which json.loads cannot parse.
url = "https://raw.githubusercontent.com/Dhrumit1314/Skimlit_NLP/main/abstract_data.json"
# A timeout keeps the script from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

# Stop with a clear error instead of continuing with `example_abstracts`
# undefined when the download fails.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to download example abstracts. Status code: {response.status_code}"
    )

example_abstracts = json.loads(response.text)
print("Example abstracts loaded successfully.")

abstracts = pd.DataFrame(example_abstracts)
abstracts
from spacy.lang.en import English

# Blank English pipeline whose only component is the sentence splitter.
nlp = English()
sentencizer = nlp.add_pipe("sentencizer")

example_abstract = example_abstracts[0]["abstract"]
example_abstract
doc = nlp(example_abstract)
doc

# One string per detected sentence.
abstract_lines = [str(sent) for sent in doc.sents]
abstract_lines

# Same per-sentence features as the training data: text, 0-based position and
# the index of the last sentence in the abstract.
total_lines_in_sample = len(abstract_lines)
sample_lines = [
    {
        "text": str(line),
        "line_number": i,
        "total_lines": total_lines_in_sample - 1,
    }
    for i, line in enumerate(abstract_lines)
]
sample_lines

# One-hot positional features, with the same depths as the model inputs (15 and 20).
test_abstract_line_numbers = [sample["line_number"] for sample in sample_lines]
test_abstract_line_numbers_one_hot = tf.one_hot(test_abstract_line_numbers, depth=15)
test_abstract_line_numbers_one_hot
test_abstract_total_lines = [sample["total_lines"] for sample in sample_lines]
test_abstract_total_lines_one_hot = tf.one_hot(test_abstract_total_lines, depth=20)
test_abstract_total_lines_one_hot

# Character-level version of every sentence.
abstract_chars = list(map(split_chars, abstract_lines))
abstract_chars
import time

import tensorflow as tf

# One-hot depths; they match the widths of the model's positional inputs.
line_numbers_depth = 15
total_lines_depth = 20

# The four model inputs for the example abstract, as tensors.
test_abstract_line_numbers_one_hot = tf.one_hot(test_abstract_line_numbers,
                                                depth=line_numbers_depth)
test_abstract_total_lines_one_hot = tf.one_hot(test_abstract_total_lines,
                                               depth=total_lines_depth)
test_abstract_abstract_lines = tf.constant(abstract_lines)
test_abstract_abstract_chars = tf.constant(abstract_chars)

# Time a single predict call on the CPU.
start_time = time.time()
with tf.device('/CPU:0'):
    test_abstract_pred_probs = model_5.predict(x=(
        test_abstract_line_numbers_one_hot,
        test_abstract_total_lines_one_hot,
        test_abstract_abstract_lines,
        test_abstract_abstract_chars,
    ))
end_time = time.time()
print("Prediction Probabilities:", test_abstract_pred_probs)
print("Time taken for predictions: {:.2f} seconds".format(end_time - start_time))

# Most probable class per sentence, mapped back to the label names.
test_abstract_preds = tf.argmax(test_abstract_pred_probs, axis=1)
test_abstract_preds
test_abstract_pred_classes = [label_encoder.classes_[i] for i in test_abstract_preds]
test_abstract_pred_classes

# Print every sentence prefixed with its predicted label.
for i, line in enumerate(abstract_lines):
    print(f"{test_abstract_pred_classes[i]}: {line}")