{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pathlib import Path as pp\n", "from pandas.core.frame import DataFrame as df\n", "import os\n", "\n", "# directories holding the raw data files for each class (relative to this notebook)\n", "right_dir: pp = pp(\"../data/right\")\n", "wrong_dir: pp = pp(\"../data/wrong\")\n", "\n", "# empty placeholders; the loading cells below rebind these names\n", "right_df : df = df()\n", "wrong_df : df = df()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Merge every file in the RIGHT data directory into one dataframe.\n", "# os.scandir() yields entries in arbitrary order, so sort by name to keep the row\n", "# order (and therefore the seeded train/test split) reproducible between runs.\n", "rdf_list = []\n", "for file in sorted(os.scandir(right_dir.absolute()), key=lambda entry: entry.name):\n", "    # os.path.exists() was always true for a listed entry; is_file() actually\n", "    # filters out sub-directories, which read_csv cannot parse\n", "    if file.is_file():\n", "        rdf_list.append(pd.read_csv(file.path))\n", "right_df = pd.concat(rdf_list)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexcontent
00Feeling annoyed? Follow this thread
11Thread, why you should vote for BJP. #PhirEKBa...
22The biggest festival of democracy is here! EC ...
33Slogan competition \\nShare your slogan idea ( ...
444 जून की करो तैयारी,\\n\\nआ रहे हैं भगवाधारी....
.........
9292मैं बता रहा हूँ, बेंगलुरू मामले में भी बहुत ते...
9393ईरान में करीब 1200 भारतीय फंसे हैं। 800 छात्र ...
9494वैसे तो TikTok ने PMCares फंड में LAC पर चीन स...
9595बंगाल में भीड़ से खचाखच भरी रैलियां हो सकती है...
9696Thank You PM \\n@narendramodi\\n ji for follow b...
\n", "

1675 rows × 2 columns

\n", "
" ], "text/plain": [ " index content\n", "0 0 Feeling annoyed? Follow this thread \n", "1 1 Thread, why you should vote for BJP. #PhirEKBa...\n", "2 2 The biggest festival of democracy is here! EC ...\n", "3 3 Slogan competition \\nShare your slogan idea ( ...\n", "4 4 4 जून की करो तैयारी,\\n\\nआ रहे हैं भगवाधारी....\n", ".. ... ...\n", "92 92 मैं बता रहा हूँ, बेंगलुरू मामले में भी बहुत ते...\n", "93 93 ईरान में करीब 1200 भारतीय फंसे हैं। 800 छात्र ...\n", "94 94 वैसे तो TikTok ने PMCares फंड में LAC पर चीन स...\n", "95 95 बंगाल में भीड़ से खचाखच भरी रैलियां हो सकती है...\n", "96 96 Thank You PM \\n@narendramodi\\n ji for follow b...\n", "\n", "[1675 rows x 2 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Merge every file in the WRONG data directory into one dataframe.\n", "# Entries are sorted by name because os.scandir() yields them in arbitrary order,\n", "# which would otherwise make the row order differ between runs.\n", "ldf_list = []\n", "for file in sorted(os.scandir(wrong_dir.absolute()), key=lambda entry: entry.name):\n", "    if file.is_file():  # skip sub-directories, which read_csv cannot parse\n", "        ldf_list.append(pd.read_csv(file.path))\n", "wrong_df = pd.concat(ldf_list)\n", "wrong_df" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# add category to the dataframes\n", "right_df['category'] = 'RIGHT'\n", "wrong_df['category'] = 'WRONG'\n", "\n", "# combine both classes into one dataset; ignore_index gives every row a unique\n", "# label instead of repeating each source file's own 0..n-1 index\n", "frame = [right_df, wrong_df]\n", "final_dataset: df = pd.concat(frame, ignore_index=True)\n", "\n", "# drop() returns a new frame, so the result has to be assigned back (the bare call\n", "# was a no-op); errors='ignore' keeps the cell usable if a file lacks the column\n", "final_dataset = final_dataset.drop(columns='index', errors='ignore')\n", "final_dataset['content'] = final_dataset['content'].astype(str) # all rows in 'content' column must be of type str\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-03-29 19:54:59.568730: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the 
appropriate compiler flags.\n", "2024-03-29 19:55:07.003956: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" ] } ], "source": [ "# TensorFlow / Keras names used by the cells below.\n", "# NOTE(review): this cell previously contained only a dangling `.to_pandas()`, which is a\n", "# SyntaxError and left Tokenizer / pad_sequences / keras undefined. The imports are\n", "# reconstructed from how those names are used later - confirm against the project history.\n", "from tensorflow import keras\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "from tensorflow.keras.preprocessing.text import Tokenizer" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Tokenization of dataset: fit the vocabulary on every text in the dataset\n", "tokenizer = Tokenizer()\n", "tokenizer.fit_on_texts(final_dataset['content'])\n", "\n", "vocab_size = len(tokenizer.word_index) + 1 # +1 because word_index starts at 1; id 0 is the padding value\n", "max_length = 200 # every sequence is padded / truncated to this many tokens\n", "embedding_dim = 50 # TODO: need to adjust accordingly\n", "\n", "# texts -> lists of token ids -> fixed-width array, zero-padded at the end\n", "X = tokenizer.texts_to_sequences(final_dataset['content'])\n", "X = pad_sequences(X, maxlen=max_length, padding='post')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Encode the labels as floats for the sigmoid output: RIGHT -> 1.0, WRONG -> 0.0\n", "# (mapped straight to numbers instead of to strings that are then parsed back)\n", "labels = final_dataset['category'].map({'RIGHT': 1.0, 'WRONG': 0.0}).astype('float32').values\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/xd/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/keras/src/layers/core/embedding.py:81: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n", " super().__init__(**kwargs)\n", "2024-03-29 19:55:16.544096: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2024-03-29 19:55:21.152411: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. 
See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2024-03-29 19:55:21.152806: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2024-03-29 19:55:21.154795: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2024-03-29 19:55:21.155560: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2024-03-29 19:55:21.156123: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2024-03-29 19:55:21.347819: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. 
See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2024-03-29 19:55:21.348262: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2024-03-29 19:55:21.348616: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2024-03-29 19:55:21.365839: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1754 MB memory: -> device: 0, name: NVIDIA GeForce 920MX, pci bus id: 0000:01:00.0, compute capability: 5.0\n" ] } ], "source": [ "# Build the model: token embedding -> average over the sequence -> small dense head -> sigmoid\n", "model = keras.Sequential([\n", "    # declare the input explicitly; passing input_shape= to a layer makes Keras emit a\n", "    # UserWarning asking for an Input object as the first layer instead\n", "    keras.Input(shape=(max_length,)),\n", "    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),\n", "    keras.layers.GlobalAveragePooling1D(),\n", "    keras.layers.Dense(16, activation='relu'),\n", "    keras.layers.Dense(1, activation='sigmoid')\n", "])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Compile the model (binary cross-entropy matches the single sigmoid output)\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(3140, 200)\n", "(1347, 200)\n", "(3140,)\n", "(1347,)\n", "(4487,)\n" ] } ], "source": [ "# split the dataset into train (70%) and test (30%); the fixed seed keeps the split repeatable\n", "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, 
random_state=42, shuffle=True)\n", "print(X_train.shape)\n", "print(X_test.shape)\n", "print(y_train.shape)\n", "print(y_test.shape)\n", "\n", "print(labels.shape)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "I0000 00:00:1711722330.932635 76533 service.cc:145] XLA service 0x7bbdd40053b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n", "I0000 00:00:1711722330.932689 76533 service.cc:153] StreamExecutor device (0): NVIDIA GeForce 920MX, Compute Capability 5.0\n", "2024-03-29 19:55:31.370178: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", "2024-03-29 19:55:33.092199: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m17/99\u001b[0m \u001b[32m━━━\u001b[0m\u001b[37m━━━━━━━━━━━━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.5972 - loss: 0.6806" ] }, { "name": "stderr", "output_type": "stream", "text": [ "I0000 00:00:1711722338.226902 76533 device_compiler.h:188] Compiled cluster using XLA! 
This line is logged at most once for the lifetime of the process.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m15s\u001b[0m 56ms/step - accuracy: 0.6186 - loss: 0.6631 - val_accuracy: 0.6511 - val_loss: 0.6453\n", "Epoch 2/10\n", "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.6242 - loss: 0.6554 - val_accuracy: 0.6511 - val_loss: 0.6393\n", "Epoch 3/10\n", "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.6134 - loss: 0.6635 - val_accuracy: 0.6511 - val_loss: 0.6375\n", "Epoch 4/10\n", "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.6270 - loss: 0.6466 - val_accuracy: 0.6511 - val_loss: 0.6305\n", "Epoch 5/10\n", "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.6063 - loss: 0.6464 - val_accuracy: 0.6548 - val_loss: 0.6168\n", "Epoch 6/10\n", "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.6640 - loss: 0.6361 - val_accuracy: 0.6600 - val_loss: 0.5972\n", "Epoch 7/10\n", "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 8ms/step - accuracy: 0.6912 - loss: 0.5938 - val_accuracy: 0.7053 - val_loss: 0.5723\n", "Epoch 8/10\n", "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.7263 - loss: 0.5345 - val_accuracy: 0.5880 - val_loss: 0.6528\n", "Epoch 9/10\n", "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 6ms/step - accuracy: 0.7473 - loss: 0.5100 - val_accuracy: 0.7669 - val_loss: 
0.5012\n", "Epoch 10/10\n", "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 6ms/step - accuracy: 0.8396 - loss: 0.4156 - val_accuracy: 0.7194 - val_loss: 0.5071\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# train the model\n", "# NOTE(review): the test split doubles as validation data here, so the accuracy from\n", "# evaluate() on the same split is not an independent estimate\n", "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m43/43\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 4ms/step - accuracy: 0.7254 - loss: 0.5047\n" ] }, { "data": { "text/plain": [ "0.7193763852119446" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Evaluate the model on the held-out split\n", "loss, accuracy = model.evaluate(X_test, y_test)\n", "accuracy" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# save the model\n", "# create the target directory up front instead of catching the failure and retrying;\n", "# exist_ok=True makes the cell safe to re-run\n", "os.makedirs(\"../models\", exist_ok=True)\n", "model.save(\"../models/right-wrong-BC.keras\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 347ms/step\n" ] }, { "data": { "text/plain": [ "0.71704614" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use model\n", "def preprocess_text(text):\n", "    \"\"\"Convert one raw string into a padded batch of token ids for model.predict().\n", "\n", "    Uses the already-fitted `tokenizer` and the same `max_length` / 'post' padding\n", "    that were applied to the training data.\n", "    \"\"\"\n", "    # wrap in a list: texts_to_sequences expects a collection of texts\n", "    tokenized_text = tokenizer.texts_to_sequences([text])\n", "    padded_text = pad_sequences(tokenized_text, maxlen=max_length, padding='post')\n", "    return padded_text\n", "\n", "# load model\n", 
"model = keras.models.load_model(\"../models/right-wrong-BC.keras\") # reload the file written by the save cell\n", "\n", "# Preprocess the custom input text\n", "preprocessed_text = preprocess_text(\"Modi ji is Moon Pappu on Bangkok honeymoon\")\n", "\n", "# Make predictions\n", "predictions = model.predict(preprocessed_text)\n", "\n", "# sigmoid output for the single input; labels were encoded RIGHT=1, WRONG=0\n", "predictions[0][0]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }