SkimLit_NLP / SkimLit_NLP.py

Upload 8 files

ed00d9b verified 9 months ago

40.3 kB

	# -- coding: utf-8 --
	"""
	Created on Thu Feb 8 20:22:57 2024

	@author: Dhrumit Patel
	"""

	"""
	Milestone Project 2: SkimLit

	The purpose is to build an NLP model to make reading medical abstracts easier.
	"""

	# Check for GPU?
	# !nvidia-smi
	# !nvidia-smi -L

	"""
	Get the data

	Since we will be replicating the paper (PubMed 200K RCT), let's download the dataset they used.

	We can do so from author's github

	git clone https://github.com/Franck-Dernoncourt/pubmed-rct
	dir pubmed-rct

	# Check what files are in the PubMed_20K dataset
	cd pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign
	dir

	Contains 3 files dev.txt, test.txt, train.txt
	"""

	# Start our experiments using the 20k dataset with numbers replaced by "@" sign
	data_dir = "pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

	# Check all the filenames in the target directory
	import os
	# data_dir already ends with "/", so plain concatenation yields usable relative paths
	filenames = [data_dir + filename for filename in os.listdir(data_dir)]
	# Bare expression: only displays a value when run interactively (notebook/REPL cell)
	filenames

	"""
	Preprocess the data
	"""

	# Create a function to read the lines of a document
	def get_lines(filename):
	    """
	    Read a text file and return its lines as a list.

	    The file is decoded as UTF-8 explicitly so the result does not depend
	    on the platform's default locale encoding.

	    Args:
	        filename (str): a string containing the target filepath.

	    Returns:
	        A list of strings with one string per line from the target file
	        (each string keeps its trailing newline, as returned by readlines()).
	    """
	    with open(filename, "r", encoding="utf-8") as f:
	        return f.readlines()

	# Let's read in the training lines
	train_lines = get_lines(filename=data_dir + "train.txt") # read the lines within the training file
	# Peek at the first 20 raw lines (interactive display only)
	train_lines[:20]

	# Total number of raw lines in the training file (interactive display only)
	len(train_lines)


	# Let's write a function to preprocess our data as above (List of dictionaries)
	def _abstract_to_samples(abstract_lines):
	    """Convert the labelled lines of ONE abstract into per-line sample dictionaries."""
	    last_line_index = len(abstract_lines) - 1 # "total_lines" is zero-based, like "line_number"
	    samples = []
	    for abstract_line_number, abstract_line in enumerate(abstract_lines):
	        target_text_split = abstract_line.split("\t") # Split target label from text
	        samples.append({
	            "target": target_text_split[0], # Target label
	            "text": target_text_split[1].lower(), # Lower-cased sentence text
	            "line_number": abstract_line_number, # Position of the line inside the abstract
	            "total_lines": last_line_index, # Index of the last line in the abstract
	        })
	    return samples


	def preprocess_text_with_line_numbers(filename):
	    """
	    Returns a list of dictionaries of abstract line data.

	    Takes in filename, reads its contents, and sorts through each line,
	    extracting the target label, the text of the sentence, how many
	    sentences are in the current abstract and what sentence number the
	    target line is.

	    Expected file layout: an ID line starting with "###", then one
	    "LABEL<TAB>sentence" line per sentence, then a whitespace-only line
	    that terminates the abstract.

	    Args:
	        filename (str): path of the text file to parse.

	    Returns:
	        A list of dictionaries with keys "target", "text", "line_number"
	        and "total_lines" (in that order), one dictionary per sentence.
	    """
	    abstract_lines = [] # Labelled lines of the abstract currently being read
	    abstract_samples = [] # Dictionaries for every sentence seen so far

	    for line in get_lines(filename):
	        if line.startswith("###"): # ID line: start collecting a new abstract
	            abstract_lines = []
	        elif line.isspace(): # Blank line: the current abstract is complete
	            abstract_samples.extend(_abstract_to_samples(abstract_lines))
	            # Reset so repeated blank lines cannot emit the same abstract twice
	            abstract_lines = []
	        else: # Otherwise the line contains a labelled sentence
	            # Keep one entry per file line (only the line ending is removed), so
	            # unusual separator characters inside a sentence cannot split it
	            abstract_lines.append(line.rstrip("\r\n"))

	    # Flush the final abstract when the file does not end with a blank line
	    abstract_samples.extend(_abstract_to_samples(abstract_lines))

	    return abstract_samples

	# Get data from file and preprocess it
	# Each *_samples value is a list with one dictionary per abstract sentence
	train_samples = preprocess_text_with_line_numbers(filename = data_dir + "train.txt")
	val_samples = preprocess_text_with_line_numbers(filename = data_dir + "dev.txt") # dev is another name for validation dataset
	test_samples = preprocess_text_with_line_numbers(filename = data_dir + "test.txt")

	# Number of sentences per split (interactive display only)
	len(train_samples), len(val_samples), len(test_samples)

	# Check the first abstract of our training data
	train_samples[:14]

	"""
	Now that our data is in the format of a list of dictionaries, How about
	we turn it into a DataFrame to further visualize it?
	"""
	import pandas as pd
	# One row per sentence; columns come from the sample dictionaries' keys
	train_df = pd.DataFrame(train_samples)
	val_df = pd.DataFrame(val_samples)
	test_df = pd.DataFrame(test_samples)

	# First 14 rows (interactive display only)
	train_df[:14]

	# Distribution of labels in training data
	train_df["target"].value_counts()

	# Let's check length of different lines (Number of sentences per abstract (X-axis) vs Number of occurrences (Y-axis))
	train_df["total_lines"].plot.hist()

	"""
	Get list of sentences
	"""
	# Convert abstract text lines into lists
	# (plain Python lists of strings, one entry per sentence)
	train_sentences = train_df["text"].tolist()
	val_sentences = val_df["text"].tolist()
	test_sentences = test_df["text"].tolist()

	# Number of sentences per split (interactive display only)
	len(train_sentences), len(val_sentences), len(test_sentences)

	# View the first 10 lines of training sentences
	train_sentences[:10]

	"""
	Making numeric labels (ML models require numeric labels)
	"""
	# One hot encode labels
	from sklearn.preprocessing import OneHotEncoder
	# We want a dense (non-sparse) matrix. scikit-learn 1.2 renamed the `sparse`
	# argument to `sparse_output` and 1.4 removed the old name, so try the new
	# spelling first and fall back to the old one on older releases.
	try:
	    one_hot_encoder = OneHotEncoder(sparse_output=False)
	except TypeError:
	    one_hot_encoder = OneHotEncoder(sparse=False)
	# The encoder expects a 2-D column, hence reshape(-1, 1). It is fitted on the
	# training labels only and then reused unchanged for validation and test.
	train_labels_one_hot = one_hot_encoder.fit_transform(train_df["target"].to_numpy().reshape(-1, 1))
	val_labels_one_hot = one_hot_encoder.transform(val_df["target"].to_numpy().reshape(-1, 1))
	test_labels_one_hot = one_hot_encoder.transform(test_df["target"].to_numpy().reshape(-1, 1))

	# Check what one hot encoded labels look like
	train_labels_one_hot, val_labels_one_hot, test_labels_one_hot

	"""
	Label encode labels
	"""
	# Extract labels ("target" columns) and encode them into integers
	from sklearn.preprocessing import LabelEncoder
	label_encoder = LabelEncoder()
	# Fit on the training labels only; validation/test reuse the same mapping
	train_labels_encoded = label_encoder.fit_transform(train_df["target"].to_numpy())
	val_labels_encoded = label_encoder.transform(val_df["target"].to_numpy())
	test_labels_encoded = label_encoder.transform(test_df["target"].to_numpy())

	# Check what label encoded labels look like
	train_labels_encoded, val_labels_encoded, test_labels_encoded

	# Get class names and number of classes from LabelEncoder instance
	num_classes = len(label_encoder.classes_)
	# class_names is indexed later with predicted integer labels to recover the label strings
	class_names = label_encoder.classes_
	num_classes, class_names

	"""
	Starting a series of Modelling experiments
	"""

	"""
	Model 0: Getting a baseline model (TF-IDF Multinomial Naive Bayes Classifier)
	"""
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.pipeline import Pipeline

	# Create a pipeline
	# Step 1 turns raw sentences into TF-IDF features, step 2 classifies them
	model_0 = Pipeline([
	("tf-idf", TfidfVectorizer()),
	("clf", MultinomialNB())
	])

	# Fit the pipeline on the training data
	# (integer-encoded labels here, not the one-hot labels)
	model_0.fit(train_sentences, train_labels_encoded)

	# Evaluate baseline model on validation dataset
	model_0.score(val_sentences, val_labels_encoded)

	# Make predictions using our baseline model
	baseline_preds = model_0.predict(val_sentences)
	baseline_preds

	"""
	For classification evaluation metrics (accuracy, precision, recall, f1-score)
	"""
	# calculate_results comes from the project-local helper_functions module (not shown here)
	from helper_functions import calculate_results

	# Calculate baseline results
	baseline_results = calculate_results(y_true=val_labels_encoded, y_pred=baseline_preds)
	baseline_results


	# Reminder of what the raw training sentences look like (interactive display only)
	train_sentences[:10]

	"""
	Preparing our data (the text) for deep sequence model

	Before we start building deeper models, we need to create vectorization and embedding layers
	"""

	import numpy as np
	import tensorflow as tf
	from tensorflow.keras import layers

	# How long is each sentence on average
	# (length is measured in whitespace-separated tokens)
	sent_lens = [len(sentence.split()) for sentence in train_sentences]
	avg_sent_len = np.mean(sent_lens)
	avg_sent_len

	# What's the distribution look like?
	import matplotlib.pyplot as plt
	plt.hist(sent_lens, bins=20)

	# How long of a sentence length covers 95% of examples?
	# This value is used below as the vectorizer's output_sequence_length
	output_seq_length = int(np.percentile(sent_lens, 95))
	output_seq_length

	# Maximum sequence length in the training set
	max(sent_lens)

	"""
	Create a TextVectorizer layer

	We want to make a layer which maps our texts from words to numbers
	"""

	# How many words are in our vocab? This is taken from Table2 from paper
	max_tokens = 68000 # Came from paper by authors

	# Create text vectorizer
	# NOTE(review): the `experimental.preprocessing` import path looks like a legacy alias — confirm it exists in the TensorFlow version in use
	from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
	text_vectorizer = TextVectorization(max_tokens=max_tokens, # Number of words in vocabulary
	output_sequence_length=output_seq_length) # Desired output length of vectorized sequences

	# Adapt text vectorizer to training sentences
	text_vectorizer.adapt(train_sentences)

	# How many words in our training vocabulary?
	rct_20k_text_vocab = text_vectorizer.get_vocabulary()
	print(f"Number of words in vocab: {len(rct_20k_text_vocab)}")
	print(f"Most common words in the vocab: {rct_20k_text_vocab[:5]}")
	print(f"Least common words in the vocab: {rct_20k_text_vocab[-5:]}")

	# Get the config of our text vectorizer
	text_vectorizer.get_config()

	# NOTE(review): this rebinds `layers` (previously tensorflow.keras.layers) to the standalone keras package — confirm both resolve to the same Keras
	from keras import layers
	"""
	Create a custom text embedding layer
	"""
	# Trainable token embedding sized to the adapted vocabulary; reused by model_1 below
	token_embed = layers.Embedding(input_dim=len(rct_20k_text_vocab),
	output_dim=128, # Note: Different embedding sizes result in drastically different numbers of parameters to train
	mask_zero=True, # Use masking to handle variable sequences lengths(save space)
	name = "token_embedding")


	"""
	Creating datasets (making sure our data loads as fast as possible)

	We are going to set up our data to run as fast as possible with the TensorFlow tf.data API.
	"""
	# Turn our data into TensorFlow datasets
	# Each element is a (sentence, one-hot label) pair
	train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels_one_hot))
	valid_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels_one_hot))
	test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels_one_hot))

	train_dataset

	# Take the TensorSliceDataset's and turn them into prefetched datasets
	# No shuffling is applied, so batch order follows the original sentence order
	train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
	valid_dataset = valid_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
	test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

	# len() of a batched dataset is its number of batches
	train_dataset, len(train_dataset)
	"""
	Model 1: Conv1D with token embeddings
	"""
	# Create 1D Conv model to process sequences
	inputs = layers.Input(shape=(1,), dtype=tf.string)
	text_vectors = text_vectorizer(inputs) # Vectorize text inputs
	token_embeddings = token_embed(text_vectors) # Create embedding
	x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(token_embeddings)
	x = layers.GlobalAveragePooling1D()(x) # Condense the output of our feature vector from Conv layer
	outputs = layers.Dense(num_classes, activation="softmax")(x)

	model_1 = tf.keras.Model(inputs, outputs)

	# Compile the model
	# categorical_crossentropy matches the one-hot labels carried by the datasets
	model_1.compile(loss="categorical_crossentropy",
	optimizer=tf.keras.optimizers.Adam(),
	metrics=["accuracy"])

	model_1.summary()

	# Fit the model
	history_model_1 = model_1.fit(train_dataset,
	epochs=3,
	steps_per_epoch=int(0.1 * len(train_dataset)), # It will only look on 10% of batches for training (to speed up training)
	validation_data=valid_dataset,
	validation_steps=int(0.1 * len(valid_dataset)))

	# Evaluate on whole validation dataset
	model_1.evaluate(valid_dataset)

	# Make predictions on the validation dataset (our model predicts probabilities for each class)
	model_1_pred_probs = model_1.predict(valid_dataset)
	model_1_pred_probs, model_1_pred_probs.shape

	# Convert pred probs to classes
	# argmax over axis 1 picks the most probable class index per sentence
	model_1_preds = tf.argmax(model_1_pred_probs, axis=1)
	model_1_preds
	class_names
	class_names[model_1_preds]

	# Calculate model_1 results
	model_1_results = calculate_results(y_true=val_labels_encoded, y_pred=model_1_preds)
	model_1_results

	"""
	Model 2: Feature extraction with pretrained token embeddings

	Now let's use pretrained word embeddings from TensorFlow Hub,
	more specifically the Universal Sentence Encoder (USE).

	The paper originally used GloVe embeddings; however, we are going to stick with the more
	recently created USE pretrained embeddings.
	"""
	# Download pretrained TensorFlow Hub USE
	import tensorflow_hub as hub
	# trainable=False keeps the pretrained weights frozen (feature extraction only)
	tf_hub_embedding_layer = hub.KerasLayer("https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
	trainable=False,
	name="universal_sentence_encoder")


	"""
	Building and fitting an NLP feature extraction model using pretrained embeddings TensorFlow Hub
	"""
	# Define feature extraction model using TF Hub layer
	inputs = layers.Input(shape=[], dtype=tf.string)
	pretrained_embedding = tf_hub_embedding_layer(inputs) # Tokenize text and create embedding of each sequence (512 long vector)
	x = layers.Dense(128, activation="relu")(pretrained_embedding)
	# Note: you could add more layers if you wanted to
	outputs = layers.Dense(num_classes, activation="softmax")(x) # Create the output layer

	model_2 = tf.keras.Model(inputs, outputs, name="model_2_USE_feature_extractor")

	# Compile the model
	model_2.compile(loss="categorical_crossentropy",
	                optimizer=tf.keras.optimizers.Adam(),
	                metrics=["accuracy"])

	model_2.summary()

	# Fit model_2 to the data
	# Training, evaluation and prediction are all pinned to the CPU device
	with tf.device('/CPU:0'):
	    history_model_2 = model_2.fit(train_dataset,
	                                  epochs=3,
	                                  steps_per_epoch=int(0.1 * len(train_dataset)), # Only 10% of the batches per epoch
	                                  validation_data=valid_dataset,
	                                  validation_steps=int(0.1 * len(valid_dataset)))

	# Evaluate on the whole validation dataset
	with tf.device('/CPU:0'):
	    model_2.evaluate(valid_dataset)

	# Make predictions with feature extraction model
	with tf.device('/CPU:0'):
	    model_2_pred_probs = model_2.predict(valid_dataset)
	model_2_pred_probs, model_2_pred_probs.shape

	# Convert the prediction probabilities found with feature extraction model to labels
	model_2_preds = tf.argmax(model_2_pred_probs, axis=1)
	model_2_preds
	class_names[model_2_preds]

	# Calculate results from TF Hub pretrained embeddings results on val set
	model_2_results = calculate_results(y_true=val_labels_encoded, y_pred=model_2_preds)
	model_2_results

	"""
	Model 3: Conv1D with character embeddings

	The paper we are replicating states that they used a combination of token-level and character-level embeddings.
	So far we have only built token-level embeddings, so we need to repeat similar steps for characters if we want to use char-level embeddings.
	"""

	"""
	Creating a character-level tokenizer
	"""
	train_sentences[:5]

	# Make function to split sentences into characters
	def split_chars(text):
	    """
	    Return *text* with a single space inserted between every character.

	    Existing spaces are treated like any other character, so "a b"
	    becomes "a   b". An empty string is returned unchanged.

	    Args:
	        text (str): the sentence to split into characters.

	    Returns:
	        A string of the characters of *text* joined by single spaces.
	    """
	    # str.join iterates the string character by character; no list() needed
	    return " ".join(text)


	# Split sequence-level data splits into character-level data splits
	train_chars = [split_chars(sentence) for sentence in train_sentences]
	val_chars = [split_chars(sentence) for sentence in val_sentences]
	test_chars = [split_chars(sentence) for sentence in test_sentences]

	train_chars, val_chars, test_chars

	# What's the average character length?
	# Measured on the original sentences, not on the space-separated versions
	char_lens = [len(sentence) for sentence in train_sentences]
	mean_char_len = np.mean(char_lens)
	mean_char_len

	# Check the distribution of our sequences at a character-level
	import matplotlib.pyplot as plt
	plt.hist(char_lens, bins=7)

	# Find what length of characters covers 95% of sequences
	output_seq_char_len = int(np.percentile(char_lens, 95))
	output_seq_char_len

	# Get all keyboard characters
	import string
	alphabet = string.ascii_lowercase + string.digits + string.punctuation
	alphabet
	len(alphabet)

	# Create char-level token vectorizer instances
	NUM_CHAR_TOKENS = len(alphabet) + 2 # add 2 for space and OOV token (OOV = out of vocab, '[UNK]')
	# NOTE(review): the alphabet counts punctuation, but this standardize mode presumably strips it before tokenizing — confirm the intended vocabulary size
	char_vectorizer = TextVectorization(max_tokens=NUM_CHAR_TOKENS,
	output_sequence_length=output_seq_char_len,
	standardize="lower_and_strip_punctuation", # Default
	name="char_vectorizer")

	# Adapt character vectorizer to training character
	char_vectorizer.adapt(train_chars)

	# Check character vocab stats
	char_vocab = char_vectorizer.get_vocabulary()
	print(f"Number of different characters in character vocab: {len(char_vocab)}")
	print(f"5 most common character: {char_vocab[:5]}")
	print(f"5 least common characters: {char_vocab[-5:]}")

	"""
	Creating a character-level embedding
	"""
	# Create char embedding layer
	# This single layer instance is reused by model_3, model_4 and model_5 below
	char_embed = layers.Embedding(input_dim=len(char_vocab), # Number of different characters
	output_dim=25, # This is the size of char embedding in the paper
	mask_zero=True,
	name="char_embed")


	"""
	Model 3: Building a Conv1D model to fit on character embeddings
	"""
	# Make Conv1D on chars only
	inputs = layers.Input(shape=(1,), dtype="string")
	char_vectors = char_vectorizer(inputs)
	char_embeddings = char_embed(char_vectors)
	x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(char_embeddings)
	# Max pooling here, whereas model_1 used average pooling
	x = layers.GlobalMaxPool1D()(x)
	outputs = layers.Dense(num_classes, activation="softmax")(x)

	model_3 = tf.keras.Model(inputs, outputs, name="model_3_conv1d_char_embeddings")

	# Compile the model
	model_3.compile(loss="categorical_crossentropy",
	optimizer=tf.keras.optimizers.Adam(),
	metrics=["accuracy"])

	model_3.summary()

	# Create char level dataset
	# Same labels as the token datasets; only the text input is the space-separated characters
	train_char_dataset = tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
	val_char_dataset = tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
	test_char_dataset = tf.data.Dataset.from_tensor_slices((test_chars, test_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)

	train_char_dataset, val_char_dataset, test_char_dataset

	# Fit the model on chars only
	# 10% of the batches per epoch, as for the other models
	model_3_history = model_3.fit(train_char_dataset,
	epochs=3,
	steps_per_epoch=int(0.1 * len (train_char_dataset)),
	validation_data=val_char_dataset,
	validation_steps=int(0.1 * len(val_char_dataset)))

	# Evaluate the model_3
	model_3.evaluate(val_char_dataset)

	# Make predictions with character model only
	model_3_pred_probs = model_3.predict(val_char_dataset)
	model_3_pred_probs, model_3_pred_probs.shape

	# Convert prediction to class labels
	model_3_preds = tf.argmax(model_3_pred_probs, axis=1)
	model_3_preds
	class_names[model_3_preds]

	# Calculate results for Conv1D model chars only
	model_3_results = calculate_results(y_true=val_labels_encoded, y_pred=model_3_preds)
	model_3_results

	# Show the baseline again for comparison (interactive display only)
	baseline_results

	"""
	Model 4: Combining pretrained token embeddings + characters embeddings (hybrid embedding layer)

	1. Create a token level embedding model (similar to model_1)
	2. Create a character level model (similar to model_3 with a slight modification)
	3. Combine 1 & 2 with a concatenate (layers.Concatenate)
	4. Build a series of output layer on top point 3.
	5. Construct a model which takes token and character level sequences as input and produces sequence label probabilities as output.
	"""

	# 1. Setup token inputs/model
	token_inputs = layers.Input(shape=[], dtype=tf.string, name="token_inputs")
	token_embeddings = tf_hub_embedding_layer(token_inputs)
	token_outputs = layers.Dense(128, activation="relu")(token_embeddings)
	token_model = tf.keras.Model(inputs=token_inputs, outputs=token_outputs)

	# 2. Setup char inputs/model
	char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_input")
	char_vectors = char_vectorizer(char_inputs)
	char_embeddings = char_embed(char_vectors)
	char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings) # bi-LSTM as given in paper
	char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm)

	# 3. Concatenate token and char inputs (create hybrid token embedding)
	token_char_concat = layers.Concatenate(name="token_char_hybrid")([token_model.output, char_model.output])

	# 4. Create output layers - adding in dropout (according to the paper)
	combined_dropout = layers.Dropout(0.5)(token_char_concat)
	combined_dense = layers.Dense(128, activation="relu")(combined_dropout)
	final_dropout = layers.Dropout(0.5)(combined_dense)
	output_layer = layers.Dense(num_classes, activation="softmax")(final_dropout)

	# 5. Construct model with char and token inputs
	# Input order is (tokens, chars); the dataset fed to this model must use the same order
	model_4 = tf.keras.Model(inputs=[token_model.input, char_model.input],
	outputs=output_layer,
	name="model_4_token_and_char_embeddings")

	# Get a summary of our model
	model_4.summary()

	# Plot hybrid token and character model
	from keras.utils import plot_model
	plot_model(model_4, show_shapes=True)

	# Compile token char model
	model_4.compile(loss="categorical_crossentropy",
	optimizer=tf.keras.optimizers.Adam(), # Paper says SGD optimizer
	metrics=["accuracy"])

	"""
	Combining token and character data into tf.data.Dataset
	"""
	# Combine chars and tokens into a dataset
	# The (sentences, chars) order must match model_4's inputs=[token_model.input, char_model.input]

	train_char_token_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars)) # make data
	train_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot) # make labels
	train_char_token_dataset = tf.data.Dataset.zip((train_char_token_data, train_char_token_labels)) # Combine data and labels

	# Prefetch and batch train data
	train_char_token_dataset = train_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

	# For validation dataset
	val_char_token_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars))
	val_char_token_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
	val_char_token_dataset = tf.data.Dataset.zip((val_char_token_data, val_char_token_labels))

	# Prefetch and batch val data
	val_char_token_dataset = val_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

	# Check out training char and token embedding dataset
	train_char_token_dataset, val_char_token_dataset

	# Fitting a model on token and character-level sequences
	with tf.device('/CPU:0'):
	    history_model_4 = model_4.fit(train_char_token_dataset,
	                                  epochs=3,
	                                  steps_per_epoch=int(0.1 * len(train_char_token_dataset)), # Only 10% of the batches per epoch
	                                  validation_data=val_char_token_dataset,
	                                  validation_steps=int(0.1 * len(val_char_token_dataset)))

	# Evaluate on the whole validation dataset
	with tf.device('/CPU:0'):
	    model_4.evaluate(val_char_token_dataset)

	# Make predictions using the token-character model hybrid
	# Pinned to the CPU like fit/evaluate above, so all three run on the same device
	with tf.device('/CPU:0'):
	    model_4_pred_probs = model_4.predict(val_char_token_dataset)
	model_4_pred_probs, model_4_pred_probs.shape

	# Converting prediction probabilities to labels
	model_4_preds = tf.argmax(model_4_pred_probs, axis=1)
	model_4_preds

	model_4_preds
	class_names[model_4_preds]

	# Get results of token char hybrid model
	model_4_results = calculate_results(y_true=val_labels_encoded, y_pred=model_4_preds)
	model_4_results

	"""
	Model 5: Transfer learning with pretrained token embeddings + character embeddings +
	positional embeddings
	"""
	train_df.head()

	"""
	Create positional embeddings
	"""
	# How many different line numbers are there?
	train_df["line_number"].value_counts()

	# Check the distribution of "line_number" column
	train_df["line_number"].plot.hist()

	# Use TensorFlow to create one-hot encoded tensors of our "line_number" column
	# NOTE(review): line numbers >= 15 fall outside depth=15 and presumably encode as all-zero rows — confirm this is intended
	train_line_numbers_one_hot = tf.one_hot(train_df["line_number"].to_numpy(), depth=15)
	val_line_numbers_one_hot = tf.one_hot(val_df["line_number"].to_numpy(), depth=15)
	test_line_numbers_one_hot = tf.one_hot(test_df["line_number"].to_numpy(), depth=15)
	train_line_numbers_one_hot[:10], train_line_numbers_one_hot.shape
	train_line_numbers_one_hot[0].shape
	train_line_numbers_one_hot[0].dtype

	# How many different numbers of lines are there?
	train_df["total_lines"].value_counts()

	# Check the distribution of "total_lines" column
	train_df["total_lines"].plot.hist()

	# Check the coverage of "total_lines": what value covers 98% of the training rows?
	np.percentile(train_df["total_lines"], 98)

	# Use TensorFlow One-hot encoded tensors for our "total_lines" column
	# depth=20 must match the shape=(20,) of the "total_lines_input" layer defined for model_5
	train_total_lines_one_hot = tf.one_hot(train_df["total_lines"].to_numpy(), depth=20)
	val_total_lines_one_hot = tf.one_hot(val_df["total_lines"].to_numpy(), depth=20)
	test_total_lines_one_hot = tf.one_hot(test_df["total_lines"].to_numpy(), depth=20)
	train_total_lines_one_hot[:10], train_total_lines_one_hot.shape
	train_total_lines_one_hot[0].shape
	train_total_lines_one_hot[0].dtype

	"""
	Building a tribrid embedding model

	1. Create a token-level model
	2. Create a character-level model
	3. Create a model for the "line_number" feature
	4. Create a model for the "total_lines" feature
	5. Combine the outputs of 1 & 2 using tf.keras.layers.Concatenate
	6. Combine the outputs of 3,4,5 using tf.keras.layers.Concatenate
	7. Create an output layer to accept the tribrid embedding and output label probabilities.
	8. Combine the inputs of 1,2,3,4 and outputs of 7 into tf.keras.Model
	"""
	# 1. Token inputs
	token_inputs = layers.Input(shape=[], dtype="string", name="token_inputs")
	token_embeddings = tf_hub_embedding_layer(token_inputs)
	token_outputs = layers.Dense(128, activation="relu")(token_embeddings)
	token_model = tf.keras.Model(inputs=token_inputs, outputs=token_outputs)

	# 2. Char inputs
	char_inputs = layers.Input(shape=(1,), dtype="string", name="char_inputs")
	char_vectors = char_vectorizer(char_inputs)
	char_embeddings = char_embed(char_vectors)
	char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings)
	char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm)

	# 3. Create a model for "line_number" feature
	# shape=(15,) matches the depth=15 one-hot encoding of "line_number"
	line_number_inputs = layers.Input(shape=(15,), dtype=tf.float32, name="line_number_input")
	x = layers.Dense(32, activation="relu")(line_number_inputs)
	line_number_model = tf.keras.Model(inputs=line_number_inputs, outputs=x)

	# 4. Create a model for "total_lines" feature
	# shape=(20,) matches the depth=20 one-hot encoding of "total_lines"
	total_lines_inputs = layers.Input(shape=(20,), dtype=tf.float32, name="total_lines_input")
	y = layers.Dense(32, activation="relu")(total_lines_inputs)
	total_lines_model = tf.keras.Model(inputs=total_lines_inputs, outputs=y)

	# 5. Combine the outputs of token and char embeddings into a hybrid embedding
	combined_embeddings = layers.Concatenate(name="char_token_hybrid_embedding")([token_model.output, char_model.output])
	z = layers.Dense(256, activation="relu")(combined_embeddings)
	z = layers.Dropout(0.5)(z)

	# 6. Combine positional embedding with combined token and char embeddings
	tribrid_embeddings = layers.Concatenate(name="char_token_positional_embedding")([line_number_model.output, total_lines_model.output, z])

	# 7. Create output layer
	output_layer = layers.Dense(num_classes, activation="softmax", name="output_layer")(tribrid_embeddings)

	# 8. Put together model with all kinds of inputs
	# Input order is (line_number, total_lines, tokens, chars); datasets fed to model_5 must follow it
	model_5 = tf.keras.Model(inputs=[line_number_model.input,
	total_lines_model.input,
	token_model.input,
	char_model.input], outputs=output_layer, name="model_5_tribrid_embedding_model")

	# Get a summary of our tribrid model
	model_5.summary()

	from tensorflow.keras.utils import plot_model
	plot_model(model_5, show_shapes=True)


	# Compile token char and positional embedding model
	model_5.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2), # Helps to prevent overfitting
	optimizer=tf.keras.optimizers.Adam(),
	metrics=["accuracy"])

	"""
	Create tribrid embeddings datasets using tf.data
	"""

	# Create training and validation datasets (with all 4 kinds of input data)
	# Tuple order (line_number, total_lines, sentences, chars) must match model_5's input order
	train_char_token_pos_data = tf.data.Dataset.from_tensor_slices((train_line_numbers_one_hot,
	                                                                train_total_lines_one_hot,
	                                                                train_sentences,
	                                                                train_chars))
	train_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
	train_char_token_pos_dataset = tf.data.Dataset.zip((train_char_token_pos_data, train_char_token_pos_labels))

	train_char_token_pos_dataset = train_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE)


	# Do the same as above for the validation dataset
	val_char_token_pos_data = tf.data.Dataset.from_tensor_slices((val_line_numbers_one_hot,
	                                                              val_total_lines_one_hot,
	                                                              val_sentences,
	                                                              val_chars))
	val_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
	val_char_token_pos_dataset = tf.data.Dataset.zip((val_char_token_pos_data, val_char_token_pos_labels))

	val_char_token_pos_dataset = val_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

	# Check the input shapes
	train_char_token_pos_dataset, val_char_token_pos_dataset

	# Fit the model
	with tf.device('/CPU:0'):
	    history_model_5 = model_5.fit(train_char_token_pos_dataset,
	                                  epochs=3,
	                                  steps_per_epoch=int(0.1 * len(train_char_token_pos_dataset)), # Only 10% of the batches per epoch
	                                  validation_data=val_char_token_pos_dataset,
	                                  validation_steps=int(0.1 * len(val_char_token_pos_dataset)))

	with tf.device('/CPU:0'):
	    # Evaluate our model on whole validation dataset
	    model_5.evaluate(val_char_token_pos_dataset)

	# Make predictions with the char token pos model
	# Pinned to the CPU like fit/evaluate above, so all three run on the same device
	with tf.device('/CPU:0'):
	    model_5_pred_probs = model_5.predict(val_char_token_pos_dataset)
	model_5_pred_probs, model_5_pred_probs.shape

	# Convert prediction probabilities to the labels
	model_5_preds = tf.argmax(model_5_pred_probs, axis=1)
	model_5_preds

	model_5_preds
	class_names[model_5_preds]

	# Calculate results of char token pos model
	model_5_results = calculate_results(y_true=val_labels_encoded, y_pred=model_5_preds)
	model_5_results

	"""
	Compare model results
	"""

	# Combine model results into a dataframe
	# Each *_results value becomes one column; the metric names form the index
	all_model_results = pd.DataFrame({"model_0_baseline": baseline_results,
	"model_1_custom_token_embedding": model_1_results,
	"model_2_pretrained_token_embedding": model_2_results,
	"model_3_custom_char_embedding": model_3_results,
	"model_4_hybrid_char_token_embedding": model_4_results,
	"model_5_pos_char_token_embedding": model_5_results})

	# Transpose so that each model is a row and each metric a column
	all_model_results = all_model_results.transpose()
	all_model_results

	# Reduce the accuracy to same scale as other metrics
	# NOTE(review): assumes calculate_results reports accuracy as a percentage (0-100) — confirm in helper_functions
	all_model_results["accuracy"] = all_model_results["accuracy"]/100

	all_model_results

	# Plot and compare all model results
	all_model_results.plot(kind="bar", figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0))

	# Sort models results using f1-score
	all_model_results.sort_values("f1", ascending=True)["f1"].plot(kind="bar", figsize=(10, 7))

	"""
	Save and load model
	"""
	# Save the best performing model to SavedModel format (default)
	model_5.save("skimlit_tribrid_model_me")

	# Load in best performing model
	from keras.models import load_model
	with tf.device('/CPU:0'):
	    loaded_model = load_model("skimlit_tribrid_model_me")

	# Make predictions with our loaded model on the validation set
	with tf.device('/CPU:0'):
	    loaded_pred_probs = loaded_model.predict(val_char_token_pos_dataset)
	loaded_pred_probs, loaded_pred_probs.shape

	# Convert prediction probabilities to labels
	loaded_preds = tf.argmax(loaded_pred_probs, axis=1)
	loaded_preds

	loaded_preds[:10]
	class_names[loaded_preds]

	# Calculate the results of our loaded model
	loaded_model_results = calculate_results(y_true=val_labels_encoded, y_pred=loaded_preds)
	loaded_model_results

	# The reloaded model must reproduce the in-memory model's metrics. Compare
	# with a floating-point tolerance rather than exact equality, so harmless
	# rounding differences after a save/load round trip do not trip the check.
	# If nothing displays in console, the check passed.
	assert loaded_model_results.keys() == model_5_results.keys()
	assert all(np.isclose(model_5_results[metric], loaded_model_results[metric])
	           for metric in model_5_results)

	# Check the loaded model summary
	loaded_model.summary()


	"""
	Optional - for the loaded model you can use your own trained model
	"""
	import tensorflow as tf
	import tensorflow_hub as hub
	from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

	import os
	# Share link of the pretrained model; it has to be downloaded and extracted
	# manually to "skimlit_tribrid_model" before this section is run.
	url = "https://drive.google.com/file/d/1DYr3Ew9tU6dph_fI0JeTZ6GbdzZpWr8K/view?usp=sharing"

	# Load in downloaded online model
	with tf.device('/CPU:0'):
	    loaded_gs_model = load_model("skimlit_tribrid_model")

	# Evaluate the online loaded model
	with tf.device('/CPU:0'):
	    loaded_gs_model.evaluate(val_char_token_pos_dataset)
	    # Predict with the downloaded model itself. Previously the probabilities
	    # computed earlier by `loaded_model` were reused, so the results below
	    # never reflected `loaded_gs_model`.
	    loaded_pred_probs = loaded_gs_model.predict(val_char_token_pos_dataset)
	loaded_preds = tf.argmax(loaded_pred_probs, axis=1)
	loaded_preds[:10]

	# Evaluate loaded model's predictions
	loaded_model_results = calculate_results(val_labels_encoded, loaded_preds)
	loaded_model_results

	# Check loaded model summary (of the downloaded model, not the locally trained one)
	loaded_gs_model.summary()

	# Create test dataset batch and prefetched
	# Tuple order (line_number, total_lines, sentences, chars) matches the model's input order
	test_pos_char_token_data = tf.data.Dataset.from_tensor_slices((test_line_numbers_one_hot,
	                                                               test_total_lines_one_hot,
	                                                               test_sentences,
	                                                               test_chars))
	test_pos_char_token_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)

	test_pos_char_token_dataset = tf.data.Dataset.zip((test_pos_char_token_data, test_pos_char_token_labels))

	test_pos_char_token_dataset = test_pos_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

	# Make predictions on the test dataset
	with tf.device('/CPU:0'):
	    test_pred_probs = loaded_model.predict(test_pos_char_token_dataset, verbose=1)
	test_preds = tf.argmax(test_pred_probs, axis=1)
	test_preds[:10]

	# Evaluate loaded model test predictions
	loaded_model_test_results = calculate_results(y_true=test_labels_encoded, y_pred=test_preds)
	loaded_model_test_results

	# Get list of class names of test predictions
	# (maps each predicted integer label back to its label string)
	test_pred_classes = [label_encoder.classes_[pred] for pred in test_preds]
	test_pred_classes

	# Enrich the test dataframe with three prediction columns:
	#   "prediction" - predicted class name for each row
	#   "pred_prob"  - largest prediction probability for each row
	#   "correct"    - True where the prediction equals the "target" column
	test_df["prediction"] = test_pred_classes
	test_df["pred_prob"] = tf.reduce_max(test_pred_probs, axis=1).numpy()
	test_df["correct"] = test_df["prediction"].eq(test_df["target"])

	# Preview the first 20 enriched rows
	test_df.head(20)

	# Keep the 100 wrong predictions the model was most confident about
	# (100 is an arbitrary cut-off; every wrong row could be inspected instead)
	top_100_wrong = (
	    test_df.loc[~test_df["correct"]]
	    .sort_values(by="pred_prob", ascending=False)
	    .head(100)
	)
	top_100_wrong

	# Print the ten most confident mistakes; each tuple from itertuples() is
	# unpacked positionally as (index, target, text, line_number, total_lines,
	# prediction, pred_prob, correct)
	for _, target, text, line_number, total_lines, prediction, pred_prob, _ in top_100_wrong.head(10).itertuples():
	    # Summary line for this sample
	    print(f"Target: {target}, Pred: {prediction}, Prob: {pred_prob}, Line number: {line_number}, Total lines: {total_lines}\n")

	    # The sentence that was misclassified
	    print(f"Text:\n{text}\n")

	    # Visual separator between samples
	    print("-----------------------------------------------------------------------\n")


	import json
	import requests

	# Download and open example abstracts (copy and pasted from PubMed).
	# The raw-content URL is required: the github.com ".../blob/..." address
	# returns the HTML file-viewer page, which json.loads() cannot parse.
	url = "https://raw.githubusercontent.com/Dhrumit1314/Skimlit_NLP/main/abstract_data.json"

	# A timeout stops the script hanging forever on a stalled connection
	response = requests.get(url, timeout=30)

	# Check if the download was successful (status code 200)
	if response.status_code == 200:
	    # Load the JSON data from the response
	    example_abstracts = json.loads(response.text)
	    print("Example abstracts loaded successfully.")
	else:
	    # Fail here with a clear message rather than with a NameError below,
	    # where `example_abstracts` would be undefined
	    raise RuntimeError(f"Failed to download example abstracts. Status code: {response.status_code}")

	# See what our example abstracts look like
	abstracts = pd.DataFrame(example_abstracts)
	abstracts

	# Import necessary library
	from spacy.lang.en import English

	# Build a blank English spaCy pipeline and attach the "sentencizer" component
	nlp = English()
	sentencizer = nlp.add_pipe("sentencizer")

	# Take the "abstract" field of the first downloaded example
	example_abstract = example_abstracts[0]["abstract"]
	example_abstract

	# Run the pipeline over the abstract to obtain a spaCy doc
	doc = nlp(example_abstract)
	doc

	# Convert every sentence found in the doc to a plain string
	abstract_lines = list(map(str, doc.sents))
	# Show the sentences that were detected
	abstract_lines

	# Number of sentences detected in the sample abstract
	total_lines_in_sample = len(abstract_lines)

	# One feature dictionary per sentence:
	#   "text"        - the sentence itself
	#   "line_number" - zero-based position of the sentence in the abstract
	#   "total_lines" - sentence count minus one (same value for every sentence)
	sample_lines = [
	    {
	        "text": str(sentence),
	        "line_number": position,
	        "total_lines": total_lines_in_sample - 1,
	    }
	    for position, sentence in enumerate(abstract_lines)
	]

	# Show the per-sentence feature dictionaries
	sample_lines

	# Collect the "line_number" feature of every sentence in the sample abstract
	test_abstract_line_numbers = [features["line_number"] for features in sample_lines]

	# One-hot encode with depth 15 (stated to match the depth used for the training data)
	test_abstract_line_numbers_one_hot = tf.one_hot(indices=test_abstract_line_numbers, depth=15)

	# Show the encoded line numbers
	test_abstract_line_numbers_one_hot

	# Collect the "total_lines" feature of every sentence in the sample abstract
	test_abstract_total_lines = [features["total_lines"] for features in sample_lines]

	# One-hot encode with depth 20 (stated to match the depth used for the training data)
	test_abstract_total_lines_one_hot = tf.one_hot(indices=test_abstract_total_lines, depth=20)
	test_abstract_total_lines_one_hot

	# Run each sentence through split_chars to get its character-level form
	abstract_chars = list(map(split_chars, abstract_lines))
	abstract_chars

	import tensorflow as tf
	import time

	# Define the depths for one-hot encoding
	line_numbers_depth = 15
	total_lines_depth = 20

	# Prepare the four input features once, in the order they are passed to predict():
	# (line-number one-hot, total-lines one-hot, sentences, characters)
	test_abstract_line_numbers_one_hot = tf.one_hot(test_abstract_line_numbers, depth=line_numbers_depth)
	test_abstract_total_lines_one_hot = tf.one_hot(test_abstract_total_lines, depth=total_lines_depth)
	test_abstract_abstract_lines = tf.constant(abstract_lines)
	test_abstract_abstract_chars = tf.constant(abstract_chars)

	# Make predictions on the sample abstract features and time the call.
	# perf_counter() is monotonic, so the measured interval cannot be skewed by
	# system clock adjustments the way time.time() can.
	start_time = time.perf_counter()

	with tf.device('/CPU:0'):
	    # Note - Here you can use loaded_model if you want
	    # Reuse the tensors prepared above instead of rebuilding identical ones
	    test_abstract_pred_probs = model_5.predict(
	        x=(
	            test_abstract_line_numbers_one_hot,
	            test_abstract_total_lines_one_hot,
	            test_abstract_abstract_lines,
	            test_abstract_abstract_chars,
	        )
	    )

	end_time = time.perf_counter()

	# Display the prediction probabilities
	print("Prediction Probabilities:", test_abstract_pred_probs)

	# Display the time taken for predictions
	print(f"Time taken for predictions: {end_time - start_time:.2f} seconds")

	# Pick, for every sentence, the index of the highest prediction probability
	test_abstract_preds = tf.math.argmax(test_abstract_pred_probs, axis=1)
	test_abstract_preds

	# Look up the class name that corresponds to each predicted index
	test_abstract_pred_classes = [label_encoder.classes_[class_index] for class_index in test_abstract_preds]
	test_abstract_pred_classes

	# Print every sentence of the abstract prefixed with its predicted label
	for pred_class, line in zip(test_abstract_pred_classes, abstract_lines):
	    print(f"{pred_class}: {line}")