Spaces:

Arsalan8
/

NewsClassifier

Sleeping

App Files Files Community

NewsClassifier / main.py

Arsalan8

Upload 9 files

1b5c1d0 verified about 1 year ago

raw

history blame contribute delete

3.59 kB

	import pandas as pd
	import json
	import tensorflow as tf
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import LabelBinarizer

	# Specify the file path
	file_path = 'news classifier/News_Category_Dataset_v3.json'

	# Lists to store data
	links = []
	headlines = []
	categories = []
	short_descriptions = []
	authors = []
	dates = []

	# Open and read the file
	with open(file_path, 'r') as file:
	for line in file:
	data = json.loads(line)
	links.append(data['link'])
	headlines.append(data['headline'])
	categories.append(data['category'])
	short_descriptions.append(data['short_description'])
	authors.append(data['authors'])
	dates.append(data['date'])
	# Create a DataFrame
	df = pd.DataFrame({
	'link': links,
	'headline': headlines,
	'category': categories,
	'short_description': short_descriptions,
	'authors': authors,
	'date': dates
	})

	# Combine headline and short_description
	df['text'] = df['headline'] + ' ' + df['short_description']

	x = df['text']
	y = df['category']








	X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

	# One-hot encoding of the labels
	label_binarizer = LabelBinarizer()
	y_train_one_hot = label_binarizer.fit_transform(y_train)
	y_test_one_hot = label_binarizer.transform(y_test)

	# Batch size
	batch_size = 32

	# Define the model
	VOCAB_SIZE = 20000 # Increased vocabulary size
	encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
	encoder.adapt(X_train.to_list())

	# Model with dropout to reduce overfitting

	class AttentionLayer(tf.keras.layers.Layer):
	def __init__(self, **kwargs):
	super(AttentionLayer, self).__init__(**kwargs)

	def build(self, input_shape):
	# Create a trainable weight variable for this layer.
	self.W = self.add_weight(name='att_weight', shape=(input_shape[-1], 1),
	initializer='normal')
	self.b = self.add_weight(name='att_bias', shape=(input_shape[1], 1),
	initializer='zeros')
	super(AttentionLayer, self).build(input_shape)

	def call(self, x):
	et = tf.nn.tanh(tf.matmul(x, self.W) + self.b)
	at = tf.nn.softmax(et, axis=1)
	at = tf.transpose(at, perm=[0, 2, 1])
	output = tf.matmul(at, x)
	return tf.squeeze(output, axis=1)

	def compute_output_shape(self, input_shape):
	return (input_shape[0], input_shape[-1])
	# Model with added attention layer



	model = tf.keras.Sequential([
	encoder,
	tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=256, mask_zero=True),
	tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True, dropout=0.5)),
	AttentionLayer(),
	tf.keras.layers.Dense(256, activation='relu'),
	tf.keras.layers.Dropout(0.5),
	tf.keras.layers.Dense(256, activation='relu'),
	tf.keras.layers.Dropout(0.5),
	tf.keras.layers.Dense(len(label_binarizer.classes_), activation='softmax')
	])

	# Compile the model
	model.compile(loss='categorical_crossentropy',
	optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), # Increase learning rate
	metrics=['accuracy'])

	# Train the model
	train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train_one_hot)).batch(batch_size)
	test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test_one_hot)).batch(batch_size)
	history = model.fit(train_dataset, epochs=10, validation_data=test_dataset)

	# Save the model
	model.save('news_classifier_optimized')
	#63% accuracy