# NewsClassifier / main.py
import pandas as pd
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
# Specify the file path
file_path = 'news classifier/News_Category_Dataset_v3.json'
# Lists to store data
links = []
headlines = []
categories = []
short_descriptions = []
authors = []
dates = []
# Open and read the file
with open(file_path, 'r') as file:
    for line in file:
        data = json.loads(line)
        links.append(data['link'])
        headlines.append(data['headline'])
        categories.append(data['category'])
        short_descriptions.append(data['short_description'])
        authors.append(data['authors'])
        dates.append(data['date'])
# Create a DataFrame
df = pd.DataFrame({
    'link': links,
    'headline': headlines,
    'category': categories,
    'short_description': short_descriptions,
    'authors': authors,
    'date': dates
})
# Combine headline and short_description
df['text'] = df['headline'] + ' ' + df['short_description']
x = df['text']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# One-hot encoding of the labels
label_binarizer = LabelBinarizer()
y_train_one_hot = label_binarizer.fit_transform(y_train)
y_test_one_hot = label_binarizer.transform(y_test)
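# classes_ records the column order of the one-hot matrix, so argmax indices
# can be decoded back to category names later (e.g. after model.predict).
print(f'{len(label_binarizer.classes_)} categories:', label_binarizer.classes_[:5], '...')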
# Batch size
batch_size = 32
# Define the model
VOCAB_SIZE = 20000 # Increased vocabulary size
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(X_train.to_list())
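# Optional sanity check: a TextVectorization vocabulary reserves index 0 for
# the padding token '' and index 1 for '[UNK]'.
print('Vocabulary size:', len(encoder.get_vocabulary()))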
# Model with dropout to reduce overfitting
class AttentionLayer(tf.keras.layers.Layer):
    """Additive self-attention that pools a sequence into a single vector."""
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        # The Embedding layer below uses mask_zero=True; declaring masking
        # support prevents Keras from raising when the mask reaches this layer.
        # (The mask is not applied inside the softmax here, so padded steps can
        # still receive some attention weight.)
        self.supports_masking = True

    def build(self, input_shape):
        # Trainable scoring weights: W projects each timestep to a scalar score;
        # b is a scalar bias, broadcast over timesteps so the layer also works
        # with variable-length sequences (where input_shape[1] is None).
        self.W = self.add_weight(name='att_weight', shape=(input_shape[-1], 1),
                                 initializer='random_normal')
        self.b = self.add_weight(name='att_bias', shape=(1,),
                                 initializer='zeros')
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        # Score each timestep, softmax over the time axis, then return the
        # attention-weighted sum of the sequence.
        et = tf.nn.tanh(tf.matmul(x, self.W) + self.b)  # (batch, time, 1)
        at = tf.nn.softmax(et, axis=1)                  # (batch, time, 1)
        at = tf.transpose(at, perm=[0, 2, 1])           # (batch, 1, time)
        output = tf.matmul(at, x)                       # (batch, 1, features)
        return tf.squeeze(output, axis=1)               # (batch, features)

    def compute_mask(self, inputs, mask=None):
        # The time axis is pooled away, so no mask is propagated further.
        return None

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
# Model with added attention layer
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=256, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True, dropout=0.5)),
    AttentionLayer(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(len(label_binarizer.classes_), activation='softmax')
])
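# Optional smoke test (uses a made-up input string): running one raw string
# through the untrained model builds it and confirms the layer shapes line up;
# the output shape should be (1, number_of_categories).
_ = model(tf.constant(['sample headline and description text']))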
# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),  # Adam's default rate
              metrics=['accuracy'])
# Train the model
# Build tf.data pipelines; .values converts the pandas Series to NumPy arrays
# that from_tensor_slices can consume directly.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train.values, y_train_one_hot)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test.values, y_test_one_hot)).batch(batch_size)
history = model.fit(train_dataset, epochs=10, validation_data=test_dataset)
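# Example usage (a sketch): decode a few test predictions back to category
# names. Raw strings go straight into model.predict because TextVectorization
# is part of the model; classes_ maps argmax indices back to labels.
sample_texts = X_test.iloc[:5].to_list()
pred_probs = model.predict(tf.constant(sample_texts))
pred_labels = label_binarizer.classes_[pred_probs.argmax(axis=1)]
for text, label in zip(sample_texts, pred_labels):
    print(f'{label} <- {text[:60]}')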
# Save the model
model.save('news_classifier_optimized')
# Final run reached roughly 63% validation accuracy after 10 epochs.
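# Reloading later (a sketch): passing the custom layer via custom_objects lets
# Keras revive it as AttentionLayer rather than a generic deserialized layer.
restored = tf.keras.models.load_model(
    'news_classifier_optimized',
    custom_objects={'AttentionLayer': AttentionLayer})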