File size: 113,500 Bytes
79f04c1 |
1 |
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":8565891,"sourceType":"datasetVersion","datasetId":5120988}],"dockerImageVersionId":30716,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:18.221392Z","iopub.execute_input":"2024-05-31T15:59:18.221694Z","iopub.status.idle":"2024-05-31T15:59:20.313965Z","shell.execute_reply.started":"2024-05-31T15:59:18.221668Z","shell.execute_reply":"2024-05-31T15:59:20.312942Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"df=pd.read_csv(\"/kaggle/input/quora-duplicate-questions-copy/train.csv\")","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:20.316233Z","iopub.execute_input":"2024-05-31T15:59:20.316770Z","iopub.status.idle":"2024-05-31T15:59:22.476112Z","shell.execute_reply.started":"2024-05-31T15:59:20.316735Z","shell.execute_reply":"2024-05-31T15:59:22.475105Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"df.shape","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.477876Z","iopub.execute_input":"2024-05-31T15:59:22.478273Z","iopub.status.idle":"2024-05-31T15:59:22.487296Z","shell.execute_reply.started":"2024-05-31T15:59:22.478236Z","shell.execute_reply":"2024-05-31T15:59:22.486116Z"},"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"(404290, 6)"},"metadata":{}}]},{"cell_type":"code","source":"df.head()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.488561Z","iopub.execute_input":"2024-05-31T15:59:22.488954Z","iopub.status.idle":"2024-05-31T15:59:22.514564Z","shell.execute_reply.started":"2024-05-31T15:59:22.488922Z","shell.execute_reply":"2024-05-31T15:59:22.513359Z"},"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" id qid1 qid2 question1 \\\n0 0 1 2 What is the step by step guide to invest in sh... \n1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n2 2 5 6 How can I increase the speed of my internet co... \n3 3 7 8 Why am I mentally very lonely? How can I solve... \n4 4 9 10 Which one dissolve in water quikly sugar, salt... \n\n question2 is_duplicate \n0 What is the step by step guide to invest in sh... 0 \n1 What would happen if the Indian government sto... 0 \n2 How can Internet speed be increased by hacking... 0 \n3 Find the remainder when [math]23^{24}[/math] i... 0 \n4 Which fish would survive in salt water? 0 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>qid1</th>\n <th>qid2</th>\n <th>question1</th>\n <th>question2</th>\n <th>is_duplicate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>1</td>\n <td>2</td>\n <td>What is the step by step guide to invest in sh...</td>\n <td>What is the step by step guide to invest in sh...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>3</td>\n <td>4</td>\n <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n <td>What would happen if the Indian government sto...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>5</td>\n <td>6</td>\n <td>How can I increase the speed of my internet co...</td>\n <td>How can Internet speed be increased by hacking...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>7</td>\n <td>8</td>\n <td>Why am I mentally very lonely? How can I solve...</td>\n <td>Find the remainder when [math]23^{24}[/math] i...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>9</td>\n <td>10</td>\n <td>Which one dissolve in water quikly sugar, salt...</td>\n <td>Which fish would survive in salt water?</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"df.isnull().sum()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.517703Z","iopub.execute_input":"2024-05-31T15:59:22.518337Z","iopub.status.idle":"2024-05-31T15:59:22.596705Z","shell.execute_reply.started":"2024-05-31T15:59:22.518297Z","shell.execute_reply":"2024-05-31T15:59:22.595501Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":"id 0\nqid1 0\nqid2 0\nquestion1 1\nquestion2 2\nis_duplicate 0\ndtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"df.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.598173Z","iopub.execute_input":"2024-05-31T15:59:22.598466Z","iopub.status.idle":"2024-05-31T15:59:22.736840Z","shell.execute_reply.started":"2024-05-31T15:59:22.598442Z","shell.execute_reply":"2024-05-31T15:59:22.735647Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"df.duplicated().sum()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.741851Z","iopub.execute_input":"2024-05-31T15:59:22.744101Z","iopub.status.idle":"2024-05-31T15:59:23.191640Z","shell.execute_reply.started":"2024-05-31T15:59:22.744061Z","shell.execute_reply":"2024-05-31T15:59:23.190396Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"0"},"metadata":{}}]},{"cell_type":"code","source":"df=df.head(200000)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.192948Z","iopub.execute_input":"2024-05-31T15:59:23.193273Z","iopub.status.idle":"2024-05-31T15:59:23.198226Z","shell.execute_reply.started":"2024-05-31T15:59:23.193238Z","shell.execute_reply":"2024-05-31T15:59:23.197248Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"def preprocess(q):\n q=str(q).lower().strip()\n \n q=q.replace('%',' percent ')\n q=q.replace('@',' at ')\n q=q.replace('$',' dollar ')\n \n q=q.replace('[math]','')\n \n q=q.replace(',000,000,000 ','b ')\n q=q.replace(',000,000 ','m ')\n q=q.replace(',000 ','k ')\n \n import re\n q=re.sub(r'([0-9]+)000000000',r'\\1b',q)\n q=re.sub(r'([0-9]+)000000',r'\\1m',q)\n q=re.sub(r'([0-9]+)000',r'\\1k',q)\n \n contractions = { \n \"ain't\": \"am not\",\n \"aren't\": \"are not\",\n \"can't\": \"can not\",\n \"can't've\": \"can not have\",\n \"'cause\": \"because\",\n \"could've\": \"could have\",\n \"couldn't\": \"could not\",\n \"couldn't've\": \"could not have\",\n \"didn't\": \"did not\",\n \"doesn't\": \"does not\",\n \"don't\": \"do not\",\n \"hadn't\": \"had not\",\n \"hadn't've\": \"had not have\",\n \"hasn't\": \"has not\",\n \"haven't\": \"have not\",\n \"he'd\": \"he would\",\n \"he'd've\": \"he would have\",\n \"he'll\": \"he will\",\n \"he'll've\": \"he will have\",\n \"he's\": \"he is\",\n \"how'd\": \"how did\",\n \"how'd'y\": \"how do you\",\n \"how'll\": \"how will\",\n \"how's\": \"how is\",\n \"i'd\": \"i would\",\n \"i'd've\": \"i would have\",\n \"i'll\": \"i will\",\n \"i'll've\": \"i will have\",\n \"i'm\": \"i am\",\n \"i've\": \"i have\",\n \"isn't\": \"is not\",\n \"it'd\": \"it would\",\n \"it'd've\": \"it would have\",\n \"it'll\": \"it will\",\n \"it'll've\": \"it will have\",\n \"it's\": \"it is\",\n \"let's\": \"let us\",\n \"ma'am\": \"madam\",\n \"mayn't\": \"may not\",\n \"might've\": \"might have\",\n \"mightn't\": \"might not\",\n \"mightn't've\": \"might not have\",\n \"must've\": \"must have\",\n \"mustn't\": \"must not\",\n \"mustn't've\": \"must not have\",\n \"needn't\": \"need not\",\n \"needn't've\": \"need not have\",\n \"o'clock\": \"of the clock\",\n \"oughtn't\": \"ought not\",\n \"oughtn't've\": \"ought not have\",\n \"shan't\": \"shall not\",\n \"sha'n't\": \"shall not\",\n \"shan't've\": \"shall not have\",\n \"she'd\": \"she would\",\n \"she'd've\": \"she would have\",\n \"she'll\": \"she will\",\n \"she'll've\": \"she will have\",\n \"she's\": \"she is\",\n \"should've\": \"should have\",\n \"shouldn't\": \"should not\",\n \"shouldn't've\": \"should not have\",\n \"so've\": \"so have\",\n \"so's\": \"so as\",\n \"that'd\": \"that would\",\n \"that'd've\": \"that would have\",\n \"that's\": \"that is\",\n \"there'd\": \"there would\",\n \"there'd've\": \"there would have\",\n \"there's\": \"there is\",\n \"they'd\": \"they would\",\n \"they'd've\": \"they would have\",\n \"they'll\": \"they will\",\n \"they'll've\": \"they will have\",\n \"they're\": \"they are\",\n \"they've\": \"they have\",\n \"to've\": \"to have\",\n \"wasn't\": \"was not\",\n \"we'd\": \"we would\",\n \"we'd've\": \"we would have\",\n \"we'll\": \"we will\",\n \"we'll've\": \"we will have\",\n \"we're\": \"we are\",\n \"we've\": \"we have\",\n \"weren't\": \"were not\",\n \"what'll\": \"what will\",\n \"what'll've\": \"what will have\",\n \"what're\": \"what are\",\n \"what's\": \"what is\",\n \"what've\": \"what have\",\n \"when's\": \"when is\",\n \"when've\": \"when have\",\n \"where'd\": \"where did\",\n \"where's\": \"where is\",\n \"where've\": \"where have\",\n \"who'll\": \"who will\",\n \"who'll've\": \"who will have\",\n \"who's\": \"who is\",\n \"who've\": \"who have\",\n \"why's\": \"why is\",\n \"why've\": \"why have\",\n \"will've\": \"will have\",\n \"won't\": \"will not\",\n \"won't've\": \"will not have\",\n \"would've\": \"would have\",\n \"wouldn't\": \"would not\",\n \"wouldn't've\": \"would not have\",\n \"y'all\": \"you all\",\n \"y'all'd\": \"you all would\",\n \"y'all'd've\": \"you all would have\",\n \"y'all're\": \"you all are\",\n \"y'all've\": \"you all have\",\n \"you'd\": \"you would\",\n \"you'd've\": \"you would have\",\n \"you'll\": \"you will\",\n \"you'll've\": \"you will have\",\n \"you're\": \"you are\",\n \"you've\": \"you have\"\n }\n\n q_decontracted = []\n\n for word in q.split():\n if word in contractions:\n word = contractions[word]\n\n q_decontracted.append(word)\n\n q = ' '.join(q_decontracted)\n q = q.replace(\"'ve\", \" have\")\n q = q.replace(\"n't\", \" not\")\n q = q.replace(\"'re\", \" are\")\n q = q.replace(\"'ll\", \" will\")\n \n q=re.sub(re.compile('<.*?>'),'',q)\n \n import string\n q=q.translate(str.maketrans('', '', string.punctuation))\n \n return q","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.199690Z","iopub.execute_input":"2024-05-31T15:59:23.199977Z","iopub.status.idle":"2024-05-31T15:59:23.217455Z","shell.execute_reply.started":"2024-05-31T15:59:23.199952Z","shell.execute_reply":"2024-05-31T15:59:23.216322Z"},"trusted":true},"execution_count":9,"outputs":[]},{"cell_type":"code","source":"df['is_duplicate'].value_counts().plot(kind='bar')","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.218752Z","iopub.execute_input":"2024-05-31T15:59:23.219131Z","iopub.status.idle":"2024-05-31T15:59:23.586693Z","shell.execute_reply.started":"2024-05-31T15:59:23.219097Z","shell.execute_reply":"2024-05-31T15:59:23.585550Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"<Axes: xlabel='is_duplicate'>"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<Figure size 640x480 with 1 Axes>","image/png":"iVBORw0KGgoAAAANSUhEUgAAAkIAAAGrCAYAAAAsBPjXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAtHklEQVR4nO3de1TVdb7/8ReIXFLZeAlwn0iZclSOJgWJWFmOjLvROoeyM1KUVIyeOtCo5LUM7WpDYyWlMjYVzUlX5pmJMXRIBkdplFBR8jJizqRh42ysUdhJIyJ8f3+0+P7cal5qI8rn+VhrryXf73t/v5/Nmj0+25evfpZlWQIAADCQf1svAAAAoK0QQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwVkBbL+Bi1tzcrAMHDqhLly7y8/Nr6+UAAIBzYFmWvvrqKzmdTvn7n/k1H0LoDA4cOKCoqKi2XgYAAPgO9u/fryuuuOKMM4TQGXTp0kXSN7/I0NDQNl4NAAA4Fx6PR1FRUfbf42dCCJ1By9thoaGhhBAAAJeYc/lYCx+WBgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgrIC2XgAuTr1nrGzrJeAC2vf86LZeAgC0CV4RAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYKzzDqHS0lLdfvvtcjqd8vPzU0FBgb2vsbFR06dP18CBA9WpUyc5nU6NGzdOBw4c8DrGoUOHlJqaqtDQUIWFhSk9PV1Hjhzxmtm2bZtuuukmBQcHKyoqSjk5OaesZfny5erXr5+Cg4M1cOBArVq1ymu/ZVnKzs5Wz549FRISoqSkJO3Zs+d8HzIAAGinzjuE6uvrNWjQIC1YsOCUfV9//bW2bNmiJ554Qlu2bNHvfvc77d69W//xH//hNZeamqqdO3equLhYhYWFKi0t1YQJE+z9Ho9HI0eOVK9evVRRUaEXXnhBc+bM0eLFi+2ZDRs26O6771Z6erq2bt2q5ORkJScna8eOHfZMTk6OcnNzlZeXp/LycnXq1Ekul0tHjx4934cNAADaIT/LsqzvfGc/P7333ntKTk7+1plNmzZp8ODB+uyzz3TllVdq165diomJ0aZNmxQfHy9JKioq0qhRo/T555/L6XRq0aJFevzxx+V2uxUYGChJmjFjhgoKClRVVSVJGjt2rOrr61VYWGifa8iQIYqNjVVeXp4sy5LT6dSjjz6qKVOmSJLq6uoUERGh/Px8paSknLLWhoYGNTQ02D97PB5FRUWprq5OoaGh3/XXdEniX583C//6PID2xOPxyOFwnNPf363+GaG6ujr5+fkpLCxMklRWVqawsDA7giQpKSlJ/v7+Ki8vt2eGDRtmR5AkuVwu7d69W4cPH7ZnkpKSvM7lcrlUVlYmSdq7d6/cbrfXjMPhUEJCgj1zsrlz58rhcNi3qKio7/8LAAAAF61WDaGjR49q+vTpuvvuu+0ic7vdCg8P95oLCAhQt27d5Ha77ZmIiAivmZafzzZz4v4T73e6mZPNnDlTdXV19m3//v3n/ZgBAMClI6C1DtzY2Kif/vSnsixLixYtaq3T+FRQUJCCgoLaehkAAOACaZVXhFoi6LPPPlNxcbHX+3ORkZE6ePCg1/zx48d16NAhRUZG2jM1NTVeMy0/n23mxP0n3u90MwAAwGw+D6GWCNqzZ4/++Mc/qnv37l77ExMTVVtbq4qKCnvbmjVr1NzcrISEBHumtLRUjY2N9kxxcbH69u2rrl272jMlJSVexy4uLlZiYqIkKTo6WpGRkV4zHo9H5eXl9gwAADDbeYfQkSNHVFlZqcrKSknffCi5srJS1dXVamxs1F133aXNmzdryZIlampqktvtltvt1rFjxyRJ/fv316233qrx48dr48aNWr9+vTIzM5WSkiKn0ylJuueeexQYGKj09HTt3LlTy5Yt0/z585WVlWWvY+LEiSoqKtK8efNUVVWlOXPmaPPmzcrMzJT0zTfaJk2apGeeeUYrVqzQ9u3bNW7cODmdzjN+yw0AAJjjvL8+v3btWg0fPvyU7WlpaZozZ46io6NPe78//elPuuWWWyR9c0HFzMxMvf/++/L399eYMWOUm5urzp072/Pbtm1TRkaGNm3apB49euiRRx7R9OnTvY65fPlyzZo1S/v27VOfPn2Uk5OjUaNG2fsty9Ls2bO1ePFi1dbW6sYbb9TChQv1wx/+8Jwe6/l8/a694evzZuHr8wDak/P5+/t7XUeovSOEYApCCEB7clFdRwgAAOBiRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjHXeIVRaWqrbb79dTqdTfn5+Kigo8NpvWZays7PVs2dPhYSEKCkpSXv27PGaOXTokFJTUxUaGqqwsDClp6fryJEjXjPbtm3TTTfdpODgYEVFRSknJ+eUtSxfvlz9+vVTcHCwBg4cqFWrVp33WgAAgLnOO4Tq6+s1aNAgLViw4LT7c3JylJubq7y8PJWXl6tTp05yuVw6evSoPZOamqqdO3equLhYhYWFKi0t1YQJE+z9Ho9HI0eOVK9evVRRUaEXXnhBc+bM0eLFi+2ZDRs26O6771Z6erq2bt2q5ORkJScna8eOHee1FgAAYC4/y7Ks73xnPz+99957Sk5OlvTNKzBOp1OPPvqopkyZIkmqq6tTRESE8vPzlZKSol27dikmJkabNm1SfHy8JKmoqEijRo3S559/LqfTqUWLFunxxx+X2+1WYGCgJGnGjBkqKChQVVWVJGns2LGqr69XYWGhvZ4hQ4YoNjZWeXl557SWkzU0NKihocH+2ePxKCoqSnV1dQoNDf2uv6ZLUu8ZK9t6CbiA9j0/uq2XAAA+4/F45HA4zunvb59+Rmjv3r1yu91KSkqytzkcDiUkJKisrEySVFZWprCwMDuCJCkpKUn+/v4qLy+3Z4YNG2ZHkCS5XC7t3r1bhw8ftmdOPE/LTMt5zmUtJ5s7d64cDod9i4qK+j6/DgAAcJHzaQi53W5JUkREhNf2iIgIe5/b7VZ4eLjX/oCAAHXr1s1r5nTHOPEc3zZz4v6zreVkM2fOVF1dnX3bv3//OTxqAABwqQpo6wVcTIKCghQUFNTWywAAABeIT18RioyMlCTV1NR4ba+pqbH3RUZG6uDBg177jx8/rkOHDnnNnO4YJ57j22ZO3H+2tQAAALP5NISio6MVGRmpkpISe5vH41F5ebkSExMlSYmJiaqtrVVFRYU9s2bNGjU3NyshIcGeKS0tVWNjoz1TXFysvn37qmvXrvbMiedpmWk5z7msBQAAmO28Q+jIkSOqrKxUZWWlpG8+lFxZWanq6mr5+flp0qRJeuaZZ7RixQpt375d48aNk9PptL9Z1r9/f916660aP368Nm7cqPXr1yszM1MpKSlyOp2SpHvuuUeBgYFKT0/Xzp07tWzZMs2fP19ZWVn2OiZOnKiioiLNmzdPVVVVmjNnjjZv3qzMzExJOqe1AAAAs533Z4Q2b96s4cOH2z+3xElaWpry8/M1bdo01dfXa8KECaqtrdWNN96ooqIiBQcH2/dZsmSJMjMzNWLECPn7+2vMmDHKzc219zscDq1evVoZGRmKi4tTjx49lJ2d7XWtoaFDh2rp0qWaNWuWHnvsMfXp00cFBQUaMGCAPXMuawEAAOb6XtcRau/O5zoE7Q3XETIL1xEC0J602XWEAAAALiWEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYPg+hpqYmPfHEE4qOjlZISIiuuuoqPf3007Isy56xLEvZ2dnq2bOnQkJClJSUpD179ngd59ChQ0pNTVVoaKjCwsKUnp6uI0eOeM1s27ZNN910k4KDgxUVFaWcnJxT1rN8+XL169dPwcHBGjhwoFatWuXrhwwAAC5RPg+hX/ziF1q0aJFeffVV7dq1S7/4xS+Uk5OjV155xZ7JyclRbm6u8vLyVF5erk6dOsnlcuno0aP2TGpqqnbu3Kni4mIVFhaqtLRUEyZMsPd7PB6NHDlSvXr1UkVFhV544QXNmTNHixcvtmc2bNigu+++W+np6dq6dauSk5OVnJysHTt2+PphAwCAS5CfdeJLNT5w2223KSIiQq+//rq9bcyYMQoJCdHbb78ty7LkdDr16KOPasqUKZKkuro6RUREKD8/XykpKdq1a5diYmK0adMmxcfHS5KKioo0atQoff7553I6nVq0aJEef/xxud1uBQYGSpJmzJihgoICVVVVSZLGjh2r+vp6FRYW2msZMmSIYmNjlZeXd8raGxoa1NDQYP/s8XgUFRWluro6hYaG+vLXdNHrPWNlWy8BF9C+50e39RIAwGc8Ho8cDsc5/f3t81eEhg4dqpKSEn3yySeSpI8//lh//vOf9ZOf/ESStHfvXrndbiUlJdn3cTgcSkhIUFlZmSSprKxMYWFhdgRJUlJSkvz9/VVeXm7PDBs2zI4gSXK5XNq9e7cOHz5sz5x4npaZlvOcbO7cuXI4HPYtKirq+/46AADARSzA1wecMWOGPB6P+vXrpw4dOqipqUnPPvusUlNTJUlut1uSFBER4XW/iIgIe5/b7VZ4eLj3QgMC1K1bN6+Z6OjoU47Rsq9r165yu91nPM/JZs6cqaysLPvnlleEAABA++TzEHr33Xe1ZMkSLV26VP/+7/+uyspKTZo0SU6nU2lpab4+nU8FBQUpKCiorZcBAAAuEJ+H0NSpUzVjxgylpKRIkgYOHKjPPvtMc+fOVVpamiIjIyVJNTU16tmzp32/mpoaxcbGSpIiIyN18OBBr+MeP35chw4dsu8fGRmpmpoar5mWn88207IfAACYzeefEfr666/l7+992A4dOqi5uVmSFB0drcjISJWUlNj7PR6PysvLlZiYKElKTExUbW2tKioq7Jk1a9aoublZCQkJ9kxpaakaGxvtmeLiYvXt21ddu3a1Z048T8tMy3kAAIDZfB5Ct99+u5599lmtXLlS+/bt03vvvacXX3xRd9xxhyTJz89PkyZN0jPPPKMVK1Zo+/btGjdunJxOp5KTkyVJ/fv316233qrx48dr48aNWr9+vTIzM5WSkiKn0ylJuueeexQYGKj09HTt3LlTy5Yt0/z5870+4zNx4kQVFRVp3rx5qqqq0pw5c7R582ZlZmb6+mEDAIBLkM/fGnvllVf0xBNP6H/+53908OBBOZ1O/fd//7eys7PtmWnTpqm+vl4TJkxQbW2tbrzxRhUVFSk4ONieWbJkiTIzMzVixAj5+/trzJgxys3Ntfc7HA6tXr1aGRkZiouLU48ePZSdne11raGhQ4dq6dKlmjVrlh577DH16dNHBQUFGjBggK8fNgAAuAT5/DpC7cn5XIegveE6QmbhOkIA2pM2vY4QAADApYIQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsn19ZGgBwceOCqWbhgqlnxitCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAY7VKCP3973/Xvffeq+7duyskJEQDBw7U5s2b7f2WZSk7O1s9e/ZUSEiIkpKStGfPHq9jHDp0SKmpqQoNDVVYWJjS09N15MgRr5lt27bppptuUnBwsKKiopSTk3PKWpYvX65+/fopODhYAwcO1KpVq1rjIQMAgEuQz0Po8OHDuuGGG9SxY0f94Q9/0F/+8hfNmzdPXbt2tWdycnKUm5urvLw8lZeXq1OnTnK5XDp69Kg9k5qaqp07d6q4uFiFhYUqLS3VhAkT7P0ej0cjR45Ur169VFFRoRdeeEFz5szR4sWL7ZkNGzbo7rvvVnp6urZu3ark5GQlJydrx44dvn7YAADgEuRnWZblywPOmDFD69ev14cffnja/ZZlyel06tFHH9WUKVMkSXV1dYqIiFB+fr5SUlK0a9cuxcTEaNOmTYqPj5ckFRUVadSoUfr888/ldDq1aNEiPf7443K73QoMDLTPXVBQoKqqKknS2LFjVV9fr8LCQvv8Q4YMUWxsrPLy8k5ZW0NDgxoaGuyfPR6PoqKiVFdXp9DQUN/8gi4RvWesbOsl4ALa9/zotl4CLiCe32Yx8fnt8XjkcDjO6e9vn78itGLFCsXHx+u//uu/FB4ermuvvVavvfaavX/v3r1yu91KSkqytzkcDiUkJKisrEySVFZWprCwMDuCJCkpKUn+/v4qLy+3Z4YNG2ZHkCS5XC7t3r1bhw8ftmdOPE/LTMt5TjZ37lw5HA77FhUV9T1/GwAA4GLm8xD69NNPtWjRIvXp00cffPCBHn74Yf385z/XW2+9JUlyu92SpIiICK/7RURE2PvcbrfCw8O99gcEBKhbt25eM6c7xonn+LaZlv0nmzlzpurq6uzb/v37z/vxAwCAS0eArw/Y3Nys+Ph4Pffcc5Kka6+9Vjt27FBeXp7S0tJ8fTqfCgoKUlBQUFsvAwAAXCA+f0WoZ8+eiomJ8drWv39/VVdXS5IiIyMlSTU1NV4zNTU19r7IyEgdPHjQa//x48d16NAhr5nTHePEc3zbTMt+AABgNp+H0A033KDdu3d7bfvkk0/Uq1cvSVJ0dLQiIyNVUlJi7/d4PCovL1diYqIkKTExUbW1taqoqLBn1qxZo+bmZiUkJNgzpaWlamxstGeKi4vVt29f+xtqiYmJXudpmWk5DwAAMJvPQ2jy5Mn66KOP9Nxzz+mvf/2rli5dqsWLFysjI0OS5Ofnp0mTJumZZ57RihUrtH37do0bN05Op1PJycmSvnkF6dZbb9X48eO1ceNGrV+/XpmZmUpJSZHT6ZQk3XPPPQoMDFR6erp27typZcuWaf78+crKyrLXMnHiRBUVFWnevHmqqqrSnDlztHnzZmVmZvr6YQMAgEuQzz8jdP311+u9997TzJkz9dRTTyk6Olovv/yyUlNT7Zlp06apvr5eEyZMUG1trW688UYVFRUpODjYnlmyZIkyMzM1YsQI+fv7a8yYMcrNzbX3OxwOrV69WhkZGYqLi1OPHj2UnZ3tda2hoUOHaunSpZo1a5Yee+wx9enTRwUFBRowYICvHzYAALgE+fw6Qu3J+VyHoL3hOiNmMfE6Iybj+W0WE5/fbXodIQAAgEsFIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwVquH0PPPPy8/Pz9NmjTJ3nb06FFlZGSoe/fu6ty5s8aMGaOamhqv+1VXV2v06NG67LLLFB4erqlTp+r48eNeM2vXrtV1112noKAgXX311crPzz/l/AsWLFDv3r0VHByshIQEbdy4sTUeJgAAuAS1aght2rRJv/rVr3TNNdd4bZ88ebLef/99LV++XOvWrdOBAwd055132vubmpo0evRoHTt2TBs2bNBbb72l/Px8ZWdn2zN79+7V6NGjNXz4cFVWVmrSpEn62c9+pg8++MCeWbZsmbKysjR79mxt2bJFgwYNksvl0sGDB1vzYQMAgEtEq4XQkSNHlJqaqtdee01du3a1t9fV1en111/Xiy++qB/96EeKi4vTm2++qQ0bNuijjz6SJK1evVp/+ctf9Pbbbys2NlY/+clP9PTTT2vBggU6duyYJCkvL0/R0dGaN2+e+vfvr8zMTN1111166aWX7HO9+OKLGj9+vB544AHFxMQoLy9Pl112md54443WetgAAOAS0mohlJGRodGjRyspKclre0VFhRobG7229+vXT1deeaXKysokSWVlZRo4cKAiIiLsGZfLJY/Ho507d9ozJx/b5XLZxzh27JgqKiq8Zvz9/ZWUlGTPnKyhoUEej8frBgAA2q+A1jjoO++8oy1btmjTpk2n7HO73QoMDFRYWJjX9oiICLndbnvmxAhq2d+y70wzHo9H//rXv3T48GE1NTWddqaqquq06547d66efPLJc3+gAADgkubzV4T279+viRMnasmSJQoODvb14VvVzJkzVVdXZ9/279/f1ksCAACtyOchVFFRoYMHD+q6665TQECAAgICtG7dOuXm5iogIEARERE6duyYamtrve5XU1OjyMhISVJkZOQp3yJr+flsM6GhoQoJCVGPHj3UoUOH0860HONkQUFBCg0N9boBAID2y+chNGLECG3fvl2VlZX2LT4+XqmpqfafO3bsqJKSEvs+u3fvVnV1tRITEyVJiYmJ2r59u9e3u4qLixUaGqqYmBh75sRjtMy0HCMwMFBxcXFeM83NzSopKbFnAACA2Xz+GaEuXbpowIABXts6deqk7t2729vT09OVlZWlbt26KTQ0VI888ogSExM1ZMgQSdLIkSMVExOj++67Tzk5OXK73Zo1a5YyMjIUFBQkSXrooYf06quvatq0aXrwwQe1Zs0avfvuu1q5cqV93qysLKWlpSk+Pl6DBw/Wyy+/rPr6ej3wwAO+ftgAAOAS1Coflj6bl156Sf7+/hozZowaGhrkcrm0cOFCe3+HDh1UWFiohx9+WImJierUqZPS0tL01FNP2TPR0dFauXKlJk+erPnz5+uKK67Qr3/9a7lcLntm7Nix+uKLL5SdnS23263Y2FgVFRWd8gFqAABgJj/Lsqy2XsTFyuPxyOFwqK6uzrjPC/WesfLsQ2g39j0/uq2XgAuI57dZTHx+n8/f3/xbYwAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYPg+huXPn6vrrr1eXLl0UHh6u5ORk7d6922vm6NGjysjIUPfu3dW5c2eNGTNGNTU1XjPV1dUaPXq0LrvsMoWHh2vq1Kk6fvy418zatWt13XXXKSgoSFdffbXy8/NPWc+CBQvUu3dvBQcHKyEhQRs3bvT1QwYAAJcon4fQunXrlJGRoY8++kjFxcVqbGzUyJEjVV9fb89MnjxZ77//vpYvX65169bpwIEDuvPOO+39TU1NGj16tI4dO6YNGzborbfeUn5+vrKzs+2ZvXv3avTo0Ro+fLgqKys1adIk/exnP9MHH3xgzyxbtkxZWVmaPXu2tmzZokGDBsnlcungwYO+ftgAAOAS5GdZltWaJ/jiiy8UHh6udevWadiwYaqrq9Pll1+upUuX6q677pIkVVVVqX///iorK9OQIUP0hz/8QbfddpsOHDigiIgISVJeXp6mT5+uL774QoGBgZo+fbpWrlypHTt22OdKSUlRbW2tioqKJEkJCQm6/vrr9eqrr0qSmpubFRUVpUceeUQzZsw469o9Ho8cDofq6uoUGhrq61/NRa33jJVtvQRcQPueH93WS8AFxPPbLCY+v8/n7+9W/4xQXV2dJKlbt26SpIqKCjU2NiopKcme6devn6688kqVlZVJksrKyjRw4EA7giTJ5XLJ4/Fo586d9syJx2iZaTnGsWPHVFFR4TXj7++vpKQke+ZkDQ0N8ng8XjcAANB+tWoINTc3a9KkSbrhhhs0YMAASZLb7VZgYKDCwsK8ZiMiIuR2u+2ZEyOoZX/LvjPNeDwe/etf/9KXX36ppqam0860HONkc+fOlcPhsG9RUVHf7YEDAIBLQquGUEZGhnbs2KF33nmnNU/jMzNnzlRdXZ19279/f1svCQAAtKKA1jpwZmamCgsLVVpaqiuuuMLeHhkZqWPHjqm2ttbrVaGamhpFRkbaMyd/u6vlW2Unzpz8TbOamhqFhoYqJCREHTp0UIcOHU4703KMkwUFBSkoKOi7PWAAAHDJ8fkrQpZlKTMzU++9957WrFmj6Ohor/1xcXHq2LGjSkpK7G27d+9WdXW1EhMTJUmJiYnavn2717e7iouLFRoaqpiYGHvmxGO0zLQcIzAwUHFxcV4zzc3NKikpsWcAAIDZfP6KUEZGhpYuXarf//736tKli/15HIfDoZCQEDkcDqWnpysrK0vdunVTaGioHnnkESUmJmrIkCGSpJEjRyomJkb33XefcnJy5Ha7NWvWLGVkZNiv2Dz00EN69dVXNW3aND344INas2aN3n33Xa1c+f+/DZGVlaW0tDTFx8dr8ODBevnll1VfX68HHnjA1w8bAABcgnweQosWLZIk3XLLLV7b33zzTd1///2SpJdeekn+/v4aM2aMGhoa5HK5tHDhQnu2Q4cOKiws1MMPP6zExER16tRJaWlpeuqpp+yZ6OhorVy5UpMnT9b8+fN1xRVX6Ne//rVcLpc9M3bsWH3xxRfKzs6W2+1WbGysioqKTvkANQAAMFOrX0foUsZ1hGAKE68zYjKe32Yx8fl9UV1HCAAA4GJFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMZUQILViwQL1791ZwcLASEhK0cePGtl4SAAC4CLT7EFq2bJmysrI0e/ZsbdmyRYMGDZLL5dLBgwfbemkAAKCNtfsQevHFFzV+/Hg98MADiomJUV5eni677DK98cYbbb00AADQxgLaegGt6dixY6qoqNDMmTPtbf7+/kpKSlJZWdkp8w0NDWpoaLB/rqurkyR5PJ7WX+xFprnh67ZeAi4gE/83bjKe32Yx8fnd8pgtyzrrbLsOoS+//FJNTU2KiIjw2h4REaGqqqpT5ufOnasnn3zylO1RUVGttkbgYuB4ua1XAKC1mPz8/uqrr+RwOM44065D6HzNnDlTWVlZ9s/Nzc06dOiQunfvLj8/vzZcGS4Ej8ejqKgo7d+/X6GhoW29HAA+xPPbLJZl6auvvpLT6TzrbLsOoR49eqhDhw6qqanx2l5TU6PIyMhT5oOCghQUFOS1LSwsrDWXiItQaGgo/0cJtFM8v81xtleCWrTrD0sHBgYqLi5OJSUl9rbm5maVlJQoMTGxDVcGAAAuBu36FSFJysrKUlpamuLj4zV48GC9/PLLqq+v1wMPPNDWSwMAAG2s3YfQ2LFj9cUXXyg7O1tut1uxsbEqKio65QPUQFBQkGbPnn3K26MALn08v/Ft/Kxz+W4ZAABAO9SuPyMEAABwJoQQAAAwFiEEAACMRQgBAABjEUIAAMBY7f7r88C3+fLLL/XGG2+orKxMbrdbkhQZGamhQ4fq/vvv1+WXX97GKwQAtDZeEYKRNm3apB/+8IfKzc2Vw+HQsGHDNGzYMDkcDuXm5qpfv37avHlzWy8TQCvZv3+/HnzwwbZeBi4CXEcIRhoyZIgGDRqkvLy8U/5BXcuy9NBDD2nbtm0qKytroxUCaE0ff/yxrrvuOjU1NbX1UtDGeGsMRvr444+Vn59/SgRJkp+fnyZPnqxrr722DVYGwBdWrFhxxv2ffvrpBVoJLnaEEIwUGRmpjRs3ql+/fqfdv3HjRv4ZFuASlpycLD8/P53pTY/T/YcQzEMIwUhTpkzRhAkTVFFRoREjRtjRU1NTo5KSEr322mv65S9/2carBPBd9ezZUwsXLtR//ud/nnZ/ZWWl4uLiLvCqcDEihGCkjIwM9ejRQy+99JIWLlxof06gQ4cOiouLU35+vn7605+28SoBfFdxcXGqqKj41hA626tFMAcflobxGhsb9eWXX0qSevTooY4dO7bxigB8Xx9++KHq6+t16623nnZ/fX29Nm/erJtvvvkCrwwXG0IIAAAYi+sIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBaDW33HKLJk2adEkcd+3atfLz81Ntba0kKT8/X2FhYT49B4CLD9cRAtBqfve7312ylyMYO3asRo0a5bPjrV27VsOHD9fhw4cJLOAiQggBaDXdunVr6yV8ZyEhIQoJCWnrZQBoZbw1BqDVnPgW1sKFC9WnTx8FBwcrIiJCd9111zkdo76+XuPGjVPnzp3Vs2dPzZs375QZPz8/FRQUeG0LCwtTfn6+JGnfvn3y8/PTO++8o6FDhyo4OFgDBgzQunXrvvW8p3tr7P3339f111+v4OBg9ejRQ3fccYe973//938VHx+vLl26KDIyUvfcc48OHjxon3/48OGSpK5du8rPz0/333+/JKm5uVlz585VdHS0QkJCNGjQIP3f//3fOf1uAHx/hBCAVrd582b9/Oc/11NPPaXdu3erqKhIw4YNO6f7Tp06VevWrdPvf/97rV69WmvXrtWWLVu+0zqmTp2qRx99VFu3blViYqJuv/12/fOf/zyn+65cuVJ33HGHRo0apa1bt6qkpESDBw+29zc2Nurpp5/Wxx9/rIKCAu3bt8+OnaioKP32t7+VJO3evVv/+Mc/NH/+fEnS3Llz9Zvf/EZ5eXnauXOnJk+erHvvvfeMkQbAd3hrDECrq66uVqdOnXTbbbepS5cu6tWrl6699tqz3u/IkSN6/fXX9fbbb2vEiBGSpLfeektXXHHFd1pHZmamxowZI0latGiRioqK9Prrr2vatGlnve+zzz6rlJQUPfnkk/a2QYMG2X9+8MEH7T//4Ac/UG5urq6//nodOXJEnTt3tt8mDA8Pt19pamho0HPPPac//vGPSkxMtO/75z//Wb/61a/45x+AC4BXhAC0uh//+Mfq1auXfvCDH+i+++7TkiVL9PXXX5/1fn/729907NgxJSQk2Nu6deumvn37fqd1tMSGJAUEBCg+Pl67du06p/tWVlbaMXY6FRUVuv3223XllVeqS5cudsRUV1d/633++te/6uuvv9aPf/xjde7c2b795je/0d/+9rdzfFQAvg9eEQLQ6rp06aItW7Zo7dq1Wr16tbKzszVnzhxt2rTJJ9+gOt2/JN7Y2Pi9j3uiM31wur6+Xi6XSy6XS0uWLNHll1+u6upquVwuHTt27Fvvd+TIEUnfvO32b//2b177goKCfLNwAGfEK0IALoiAgAAlJSUpJydH27Zt0759+7RmzZoz3ueqq65Sx44dVV5ebm87fPiwPvnkE6+5yy+/XP/4xz/sn/fs2XPaV5w++ugj+8/Hjx9XRUWF+vfvf07rv+aaa1RSUnLafVVVVfrnP/+p559/XjfddJP69etnf1C6RWBgoCSpqanJ3hYTE6OgoCBVV1fr6quv9rpFRUWd07oAfD+8IgSg1RUWFurTTz/VsGHD1LVrV61atUrNzc1nfYurc+fOSk9P19SpU9W9e3eFh4fr8ccfl7+/93/D/ehHP9Krr76qxMRENTU1afr06ae9ftGCBQvUp08f9e/fXy+99JIOHz7s9dmeM5k9e7ZGjBihq666SikpKTp+/LhWrVql6dOn68orr1RgYKBeeeUVPfTQQ9qxY4eefvppr/v36tVLfn5+Kiws1KhRoxQSEqIuXbpoypQpmjx5spqbm3XjjTeqrq5O69evV2hoqNLS0s5pbQC+BwsAWsnNN99sTZw40frwww+tm2++2eratasVEhJiXXPNNdayZcvO6RhfffWVde+991qXXXaZFRERYeXk5NjHbfH3v//dGjlypNWpUyerT58+1qpVqyyHw2G9+eablmVZ1t69ey1J1tKlS63BgwdbgYGBVkxMjLVmzRr7GH/6058sSdbhw4cty7KsN99803I4HF5r+e1vf2vFxsZagYGBVo8ePaw777zT3rd06VKrd+/eVlBQkJWYmGitWLHCkmRt3brVnnnqqaesyMhIy8/Pz0pLS7Msy7Kam5utl19+2erbt6/VsWNH6/LLL7dcLpe1bt26c/49A/ju/CzrpDfWAaCd2bdvn6Kjo7V161bFxsa29XIAXET4jBAAADAWIQSgzVRXV3t9bfzk25m+eg4AvsBbYwDazPHjx7Vv375v3d+7d28FBPCdDgCthxACAADG4q0xAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMb6fwthdCyLFVegAAAAAElFTkSuQmCC"},"metadata":{}}]},{"cell_type":"code","source":"qid=pd.Series(df['qid1'].tolist()+df['qid2'].tolist())","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.588197Z","iopub.execute_input":"2024-05-31T15:59:23.588543Z","iopub.status.idle":"2024-05-31T15:59:23.745422Z","shell.execute_reply.started":"2024-05-31T15:59:23.588515Z","shell.execute_reply":"2024-05-31T15:59:23.743914Z"},"trusted":true},"execution_count":11,"outputs":[]},{"cell_type":"code","source":"np.unique(qid).shape[0]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.747034Z","iopub.execute_input":"2024-05-31T15:59:23.747847Z","iopub.status.idle":"2024-05-31T15:59:23.793227Z","shell.execute_reply.started":"2024-05-31T15:59:23.747809Z","shell.execute_reply":"2024-05-31T15:59:23.791954Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"301629"},"metadata":{}}]},{"cell_type":"code","source":"df['question1']=df['question1'].apply(preprocess)\ndf['question2']=df['question2'].apply(preprocess)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.794871Z","iopub.execute_input":"2024-05-31T15:59:23.795807Z","iopub.status.idle":"2024-05-31T15:59:38.168654Z","shell.execute_reply.started":"2024-05-31T15:59:23.795769Z","shell.execute_reply":"2024-05-31T15:59:38.167619Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"code","source":"qid.shape[0]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.174320Z","iopub.execute_input":"2024-05-31T15:59:38.174626Z","iopub.status.idle":"2024-05-31T15:59:38.181066Z","shell.execute_reply.started":"2024-05-31T15:59:38.174601Z","shell.execute_reply":"2024-05-31T15:59:38.180099Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"400000"},"metadata":{}}]},{"cell_type":"code","source":"x=qid.value_counts()>1","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.182483Z","iopub.execute_input":"2024-05-31T15:59:38.182810Z","iopub.status.idle":"2024-05-31T15:59:38.222431Z","shell.execute_reply.started":"2024-05-31T15:59:38.182777Z","shell.execute_reply":"2024-05-31T15:59:38.221528Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"x[x]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.223680Z","iopub.execute_input":"2024-05-31T15:59:38.223986Z","iopub.status.idle":"2024-05-31T15:59:38.232481Z","shell.execute_reply.started":"2024-05-31T15:59:38.223961Z","shell.execute_reply":"2024-05-31T15:59:38.231296Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"2559 True\n4044 True\n30782 True\n17978 True\n2561 True\n ... \n41258 True\n64963 True\n22576 True\n141425 True\n47459 True\nName: count, Length: 47906, dtype: bool"},"metadata":{}}]},{"cell_type":"code","source":"x[x].shape[0]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.233700Z","iopub.execute_input":"2024-05-31T15:59:38.234161Z","iopub.status.idle":"2024-05-31T15:59:38.242383Z","shell.execute_reply.started":"2024-05-31T15:59:38.234134Z","shell.execute_reply":"2024-05-31T15:59:38.241358Z"},"trusted":true},"execution_count":17,"outputs":[{"execution_count":17,"output_type":"execute_result","data":{"text/plain":"47906"},"metadata":{}}]},{"cell_type":"code","source":"plt.hist(qid.value_counts().values,bins=100)\nplt.yscale('log')\nplt.show()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.243819Z","iopub.execute_input":"2024-05-31T15:59:38.244313Z","iopub.status.idle":"2024-05-31T15:59:38.987011Z","shell.execute_reply.started":"2024-05-31T15:59:38.244277Z","shell.execute_reply":"2024-05-31T15:59:38.986059Z"},"trusted":true},"execution_count":18,"outputs":[{"output_type":"display_data","data":{"text/plain":"<Figure size 640x480 with 1 Axes>","image/png":"iVBORw0KGgoAAAANSUhEUgAAAicAAAGdCAYAAADJ6dNTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAikUlEQVR4nO3de3BU5f3H8c8mmEUUwiVlQ4AQbb00Ihubm6k6Bc00ExlUqA5/2BphSke7abFbW8MfhTqjhmkrk9o5Y6oWcUanpHTGeKGiGJFUi+aC8dJUCm3UFMwGxpIlURO7e35/OC6/JYBsssk+J+f9mtkZzyXP+eaZNXzmnOd5jse2bVsAAACGSEt1AQAAAP8f4QQAABiFcAIAAIxCOAEAAEYhnAAAAKMQTgAAgFEIJwAAwCiEEwAAYJRJqS4gUdFoVIcOHdLUqVPl8XhSXQ4AADgDtm3r2LFjysnJUVra6e+NOC6cHDp0SPPnz091GQAAYAS6u7s1b968057juHAydepUSZ//ctOmTUtxNQAA4EyEw2HNnz8/9u/46TgunHzxKGfatGmEEwAAHOZMhmQwIBYAABjFMeHEsizl5+eruLg41aUAAIAx5LFt2051EYkIh8PKzMxUX18fj3UAAHCIRP79dsydEwAA4A6EEwAAYBTCCQAAMArhBAAAGIVwAgAAjEI4AQAARnFMOGGdEwAA3IF1TgAAwJhjnRMAAOBYhBMAAGAUx72VeKzl1WyP235v49IUVQIAgDtx5wQAABiFcAIAAIxCOAEAAEYhnAAAAKMQTgAAgFEIJwAAwCiOCScsXw8AgDs4JpwEAgF1dnaqtbU11aUAAIAx5JhwAgAA3IFwAgAAjEI4AQAARiGcAAAAoxBOAACAUQgnAADAKIQTAABgFMIJAAAwCuEEAAAYhXACAACMQjgBAABGmZSKi+bl5WnatGlKS0vTjBkztGvXrlSUAQAADJSScCJJf/vb33Tuueem6vIAAMBQPNYBAABGSTicNDc3a9myZcrJyZHH41FjY+OwcyzLUl5eniZPnqzS0lK1tLTEHfd4PPrWt76l4uJiPfHEEyMuHgAATDwJh5OBgQH5/X5ZlnXS4w0NDQoGg9qwYYP27t0rv9+viooK9fb2xs555ZVX1N7erqefflr33Xef3nrrrZH/BgAAYEJJOJxUVlbqnnvu0fLly096fNOmTVqzZo1WrVql/Px81dfXa8qUKdq8eXPsnLlz50qS5syZo2uvvVZ79+495fUGBwcVDofjPgAAYOJK6piToaEhtbe3q7y8/PgF0tJUXl6uPXv2SPr8zsuxY8ckSf39/XrppZd0ySWXnLLN2tpaZWZmxj7z589PZskAAMAwSQ0nR44cUSQSkc/ni9vv8/nU09MjSQqFQrryyivl9/t1+eWX65ZbblFxcfEp21y3bp36+vpin+7u7mSWDAAADDPuU4nPP/98vfnmm2d8vtfrldfrlWVZsixLkUhkDKsDAACpltQ7J1lZWUpPT1coFIrbHwqFlJ2dPaq2A4GAOjs71draOqp2AACA2ZIaTjIyMlRYWKimpqbYvmg0qqamJpWVlSXzUgAAYIJK+LFOf3+/Dhw4ENvu6upSR0eHZs6cqdzcXAWDQVVVVamoqEglJSWqq6vTwMCAVq1aldTCAQDAxJRwOGlra9OSJUti28FgUJJUVVWlLVu2aOXKlTp8+LDWr1+vnp4eFRQUaMeOHcMGySaKMScAALiDx7ZtO9VFJCIcDiszM1N9fX2aNm1a0tvPq9ket/3exqVJvwYAAG6TyL/fvFsHAAAYxTHhxLIs5efnn3ZNFAAA4HyOCSdMJQYAwB0cE04AAIA7EE4AAIBRHBNOGHMCAIA7OCacMOYEAAB3cEw4AQAA7kA4AQAARiGcAAAAozgmnDAgFgAAd3BMOGFALAAA7uCYcAIAANyBcAIAAIxCOAEAAEYhnAAAAKM4JpwwWwcAAHdwTDhhtg4AAO7gmHACAADcgXACAACMQjgBAABGIZwAAACjEE4AAIBRCCcAAMAok1JdwJmyLEuWZSkSiYzrdfNqtg/b997GpeNaAwAAbuKYOyescwIAgDs4JpwAAAB3IJwAAACjEE4AAIBRCCcAAMAohBMAAGAUwgkAADAK4QQAABiFcAIAAIxCOAEAAEZxTDixLEv5+fkqLi5OdSkAAGAMOSacsHw9AADu4JhwAgAA3IFwAgAAjEI4AQAARiGcAAAAoxBOAACAUQgnAADAKIQTAABgFMIJAAAwCuEEAAAYhXACAACMQjgBAABGSVk4+fjjj7VgwQLdeeedqSoBAAAYKGXh5N5779Xll1+eqssDAABDpSSc7N+/X++++64qKytTcXkAAGCwhMNJc3Ozli1bppycHHk8HjU2Ng47x7Is5eXlafLkySotLVVLS0vc8TvvvFO1tbUjLhoAAExcCYeTgYEB+f1+WZZ10uMNDQ0KBoPasGGD9u7dK7/fr4qKCvX29kqSnnrqKV144YW68MILR1c5AACYkCYl+gOVlZWnfRyzadMmrVmzRqtWrZIk1dfXa/v27dq8ebNqamr02muvaevWrdq2bZv6+/v12Wefadq0aVq/fv1J2xscHNTg4GBsOxwOJ1oyAABwkKSOORkaGlJ7e7vKy8uPXyAtTeXl5dqzZ48kqba2Vt3d3Xrvvff0m9/8RmvWrDllMPni/MzMzNhn/vz5ySwZAAAYJqnh5MiRI4pEIvL5fHH7fT6fenp6RtTmunXr1NfXF/t0d3cno1QAAGCohB/rJNOtt976ped4vV55vd6xLwYAABghqXdOsrKylJ6erlAoFLc/FAopOzt7VG1blqX8/HwVFxePqh0AAGC2pIaTjIwMFRYWqqmpKbYvGo2qqalJZWVlo2o7EAios7NTra2toy1z1PJqtsd9AABA8iT8WKe/v18HDhyIbXd1damjo0MzZ85Ubm6ugsGgqqqqVFRUpJKSEtXV1WlgYCA2ewcAAOB0Eg4nbW1tWrJkSWw7GAxKkqqqqrRlyxatXLlShw8f1vr169XT06OCggLt2LFj2CDZRFmWJcuyFIlERtUOAAAwm8e2bTvVRSQiHA4rMzNTfX19mjZtWtLbH8ljmvc2Lk16HQAATCSJ/Pudshf/AQAAnIxjwgmzdQAAcAfHhBOTZusAAICx45hwAgAA3IFwAgAAjEI4AQAARnFMOGFALAAA7uCYcMKAWAAA3MEx4QQAALgD4QQAABiFcAIAAIzimHDCgFgAANzBMeGEAbEAALiDY8IJAABwB8IJAAAwCuEEAAAYhXACAACM4phwwmwdAADcwTHhhNk6AAC4g2PCCQAAcIdJqS5gosqr2R63/d7GpSmqBAAAZ+HOCQAAMArhBAAAGIVwAgAAjOKYcMJUYgAA3MEx4YSpxAAAuINjwgkAAHAHwgkAADAK4QQAABiFcAIAAIxCOAEAAEYhnAAAAKMQTgAAgFEIJwAAwCiEEwAAYBTHhBOWrwcAwB0cE05Yvh4AAHdwTDgBAADuMCnVBbhFXs32Yfve27g0BZUAAGA27pwAAACjEE4AAIBRCCcAAMAohBMAAGAUwgkAADAK4QQAABiFcAIAAIxCOAEAAEYhnAAAAKOMezg5evSoioqKVFBQoIULF+rhhx8e7xIAAIDBxn35+qlTp6q5uVlTpkzRwMCAFi5cqBUrVmjWrFnjXQoAADDQuN85SU9P15QpUyRJg4ODsm1btm2PdxkAAMBQCYeT5uZmLVu2TDk5OfJ4PGpsbBx2jmVZysvL0+TJk1VaWqqWlpa440ePHpXf79e8efP0s5/9TFlZWSP+BQAAwMSS8GOdgYEB+f1+rV69WitWrBh2vKGhQcFgUPX19SotLVVdXZ0qKiq0b98+zZ49W5I0ffp0vfnmmwqFQlqxYoVuvPFG+Xy+0f82DsebiwEAGMGdk8rKSt1zzz1avnz5SY9v2rRJa9as0apVq5Sfn6/6+npNmTJFmzdvHnauz+eT3+/XX//611Neb3BwUOFwOO4DAAAmrqSOORkaGlJ7e7vKy8uPXyAtTeXl5dqzZ48kKRQK6dixY5Kkvr4+NTc366KLLjplm7W1tcrMzIx95s+fn8ySAQCAYZIaTo4cOaJIJDLsEY3P51NPT48k6f3339dVV10lv9+vq666Sj/60Y906aWXnrLNdevWqa+vL/bp7u5OZskAAMAw4z6VuKSkRB0dHWd8vtfrldfrHbuCAACAUZJ65yQrK0vp6ekKhUJx+0OhkLKzs0fVtmVZys/PV3Fx8ajaAQAAZktqOMnIyFBhYaGamppi+6LRqJqamlRWVjaqtgOBgDo7O9Xa2jraMgEAgMESfqzT39+vAwcOxLa7urrU0dGhmTNnKjc3V8FgUFVVVSoqKlJJSYnq6uo0MDCgVatWJbVwAAAwMSUcTtra2rRkyZLYdjAYlCRVVVVpy5YtWrlypQ4fPqz169erp6dHBQUF2rFjx6jXMbEsS5ZlKRKJjKodAABgtoTDyeLFi790ufnq6mpVV1ePuKiTCQQCCgQCCofDyszMTGrbAADAHOP+bh0AAIDTIZwAAACjjPs6JyPl1jEnJ75vh3ftAAAmOsfcOWEqMQAA7uCYcAIAANyBcAIAAIzimHDC8vUAALiDY8IJY04AAHAHx4QTAADgDo6ZSoxTY7oxAGAi4c4JAAAwimPCCQNiAQBwB8eEEwbEAgDgDo4JJwAAwB0IJwAAwCiEEwAAYBTCCQAAMIpjwgmzdQAAcAfHLMIWCAQUCAQUDoeVmZmZ6nKMduKibBILswEAnMMxd04AAIA7EE4AAIBRCCcAAMAohBMAAGAUwgkAADCKY2brWJYly7IUiURSXcqEwIweAICpHHPnhBf/AQDgDo4JJwAAwB0IJwAAwCiEEwAAYBTCCQAAMArhBAAAGIVwAgAAjOKYdU4w9k5c+4R1TwAAqcCdEwAAYBTCCQAAMIpjwollWcrPz1dxcXGqSwEAAGPIMeGE5esBAHAHx4QTAADgDoQTAABgFMIJAAAwCuEEAAAYhXACAACMwgqxSAiryAIAxhp3TgAAgFEIJwAAwCiEEwAAYBTGnGBUThyDIjEOBQAwOoQTjDkCDAAgEeP+WKe7u1uLFy9Wfn6+Fi1apG3bto13CQAAwGDjfudk0qRJqqurU0FBgXp6elRYWKhrr71W55xzzniXAgAADDTu4WTOnDmaM2eOJCk7O1tZWVn66KOPCCcAAEDSCB7rNDc3a9myZcrJyZHH41FjY+OwcyzLUl5eniZPnqzS0lK1tLSctK329nZFIhHNnz8/4cIBAMDElHA4GRgYkN/vl2VZJz3e0NCgYDCoDRs2aO/evfL7/aqoqFBvb2/ceR999JFuueUWPfTQQyOrHAAATEgJP9aprKxUZWXlKY9v2rRJa9as0apVqyRJ9fX12r59uzZv3qyamhpJ0uDgoG644QbV1NTom9/85mmvNzg4qMHBwdh2OBxOtGQAAOAgSZ2tMzQ0pPb2dpWXlx+/QFqaysvLtWfPHkmSbdu69dZbdfXVV+t73/vel7ZZW1urzMzM2IdHQAAATGxJDSdHjhxRJBKRz+eL2+/z+dTT0yNJevXVV9XQ0KDGxkYVFBSooKBAb7/99inbXLdunfr6+mKf7u7uZJYMAAAMM+6zda688kpFo9EzPt/r9crr9Y5hRQAAwCRJvXOSlZWl9PR0hUKhuP2hUEjZ2dmjatuyLOXn56u4uHhU7QAAALMlNZxkZGSosLBQTU1NsX3RaFRNTU0qKysbVduBQECdnZ1qbW0dbZkAAMBgCT/W6e/v14EDB2LbXV1d6ujo0MyZM5Wbm6tgMKiqqioVFRWppKREdXV1GhgYiM3eAaTh79vhXTsAgC8kHE7a2tq0ZMmS2HYwGJQkVVVVacuWLVq5cqUOHz6s9evXq6enRwUFBdqxY8ewQbKJsixLlmUpEomMqh0AAGC2hMPJ4sWLZdv2ac+prq5WdXX1iIs6mUAgoEAgoHA4rMzMzKS2DQAAzDHubyUGAAA4HcIJAAAwimPCCVOJAQBwB8eEE6YSAwDgDo4JJwAAwB3Gffl64EyxFgoAuJNj7pww5gQAAHdwTDhhzAkAAO7gmHACAADcgTEncIwTx6BIjEMBgImIOycAAMAojgknDIgFAMAdHBNOGBALAIA7OCacAAAAdyCcAAAAoxBOAACAUZhKjAmPZfABwFkcc+eE2ToAALiDY8IJs3UAAHAHHutgQjnZKrIAAGdxzJ0TAADgDoQTAABgFMIJAAAwCuEEAAAYhXACAACM4phwwjonAAC4g2PCCeucAADgDo4JJwAAwB0IJwAAwCiEEwAAYBTCCQAAMArv1gFO4mTv6Hlv49IUVAIA7kM4AcQLAwHAJDzWAQAARiGcAAAAoxBOAACAURwTTli+HgAAd3BMOGH5egAA3MEx4QQAALgD4QQAABiFcAIAAIxCOAEAAEYhnAAAAKOwfD0wQrx/BwDGBndOAACAUQgnAADAKIQTAABgFMIJAAAwSkrCyfLlyzVjxgzdeOONqbg8AAAwWEpm66xdu1arV6/WY489lorLAyl14iwfZvgAQLyU3DlZvHixpk6dmopLAwAAwyUcTpqbm7Vs2TLl5OTI4/GosbFx2DmWZSkvL0+TJ09WaWmpWlpaklErAABwgYTDycDAgPx+vyzLOunxhoYGBYNBbdiwQXv37pXf71dFRYV6e3tHXSwAAJj4Eh5zUllZqcrKylMe37Rpk9asWaNVq1ZJkurr67V9+3Zt3rxZNTU1CRc4ODiowcHB2HY4HE64DQAA4BxJHRA7NDSk9vZ2rVu3LrYvLS1N5eXl2rNnz4jarK2t1d13352sEgHjsAw+AMRL6oDYI0eOKBKJyOfzxe33+Xzq6emJbZeXl+umm27SX/7yF82bN++0wWXdunXq6+uLfbq7u5NZMgAAMExKphK/+OKLZ3yu1+uV1+sdw2oAAIBJkhpOsrKylJ6erlAoFLc/FAopOzt7VG1bliXLshSJREbVDjCWWMMEAEYvqY91MjIyVFhYqKampti+aDSqpqYmlZWVjartQCCgzs5Otba2jrZMAABgsITvnPT39+vAgQOx7a6uLnV0dGjmzJnKzc1VMBhUVVWVioqKVFJSorq6Og0MDMRm7wAAAJxOwuGkra1NS5YsiW0Hg0FJUlVVlbZs2aKVK1fq8OHDWr9+vXp6elRQUKAdO3YMGySbKB7rAPF4hARgoko4nCxevFi2bZ/2nOrqalVXV4+4qJMJBAIKBAIKh8PKzMxMatsAAMAcKXm3DgAAwKkQTgAAgFEcE04sy1J+fr6Ki4tTXQoAABhDjgknTCUGAMAdHBNOAACAOxBOAACAURwTThhzAgCAOzgmnDDmBAAAd3BMOAEAAO5AOAEAAEYhnAAAAKMk/G6dVOHFf3CzE1/yl8x2eGEgANM45s4JA2IBAHAHx4QTAADgDoQTAABgFMIJAAAwCuEEAAAYhdk6AIY5cVYPM3oAjCfH3Dlhtg4AAO7gmHACAADcgXACAACMQjgBAABGIZwAAACjEE4AAIBRCCcAAMAorHMCTGDJeptxsq7FeikAzoRj7pywzgkAAO7gmHACAADcgXACAACMQjgBAABGIZwAAACjEE4AAIBRCCcAAMAohBMAAGAUwgkAADAK4QQAABjFMeHEsizl5+eruLg41aUAAIAx5JhwwvL1AAC4g2PCCQAAcAfCCQAAMArhBAAAGIVwAgAAjEI4AQAARiGcAAAAoxBOAACAUQgnAADAKIQTAABgFMIJAAAwCuEEAAAYJSXh5Nlnn9VFF12kCy64QI888kgqSgAAAIaaNN4X/N///qdgMKhdu3YpMzNThYWFWr58uWbNmjXepQAAAAON+52TlpYWXXLJJZo7d67OPfdcVVZW6oUXXhjvMgAAgKESDifNzc1atmyZcnJy5PF41NjYOOwcy7KUl5enyZMnq7S0VC0tLbFjhw4d0ty5c2Pbc+fO1cGDB0dWPQAAmHASDicDAwPy+/2yLOukxxsaGhQMBrVhwwbt3btXfr9fFRUV6u3tHVGBg4ODCofDcR8AADBxJTzmpLKyUpWVlac8vmnTJq1Zs0arVq2SJNXX12v79u3avHmzampqlJOTE3en5ODBgyopKTlle7W1tbr77rsTLRNAEuXVbB+2772NS8es7S+71kh+JtXGsg8nqhP7bKT9lax2JioTv5tJHXMyNDSk9vZ2lZeXH79AWprKy8u1Z88eSVJJSYneeecdHTx4UP39/XruuedUUVFxyjbXrVunvr6+2Ke7uzuZJQMAAMMkdbbOkSNHFIlE5PP54vb7fD69++67n19w0iTdf//9WrJkiaLRqH7+85+fdqaO1+uV1+tNZpkAAMBg4z6VWJKuu+46XXfddQn9jGVZsixLkUhkjKoCAAAmSOpjnaysLKWnpysUCsXtD4VCys7OHlXbgUBAnZ2dam1tHVU7AADAbEkNJxkZGSosLFRTU1NsXzQaVVNTk8rKypJ5KQAAMEEl/Finv79fBw4ciG13dXWpo6NDM2fOVG5uroLBoKqqqlRUVKSSkhLV1dVpYGAgNnsHAADgdBIOJ21tbVqyZElsOxgMSpKqqqq0ZcsWrVy5UocPH9b69evV09OjgoIC7dixY9gg2UQx5gQAAHdIOJwsXrxYtm2f9pzq6mpVV1ePuKiTCQQCCgQCCofDyszMTGrbAADAHCl5KzEAAMCpOCacWJal/Px8FRcXp7oUAAAwhhwTTphKDACAOzgmnAAAAHcgnAAAAKM4Jpww5gQAAHdwTDhhzAkAAO6Qkhf/jcYXa6yEw+ExaT86+HHCP3OyWs6knRN/biQ/w7W4VrKvNdKfc8K1UulkNZtWo2lO7LOR9ley2pmoxuu7+UWbX7ZWmiR57DM5yyD/+c9/NH/+/FSXAQAARqC7u1vz5s077TmOCyfRaFSHDh3S1KlT5fF4Ev75cDis+fPnq7u7W9OmTRuDCp2DvjiOvohHfxxHXxxHX8SjP447k76wbVvHjh1TTk6O0tJOP6rEcY910tLSvjRxnYlp06a5/sv0BfriOPoiHv1xHH1xHH0Rj/447sv64kxfP+OYAbEAAMAdCCcAAMAorgsnXq9XGzZskNfrTXUpKUdfHEdfxKM/jqMvjqMv4tEfxyW7Lxw3IBYAAExsrrtzAgAAzEY4AQAARiGcAAAAoxBOAACAUVwVTizLUl5eniZPnqzS0lK1tLSkuqRx0dzcrGXLliknJ0cej0eNjY1xx23b1vr16zVnzhydffbZKi8v1/79+1NT7Birra1VcXGxpk6dqtmzZ+uGG27Qvn374s759NNPFQgENGvWLJ177rn6zne+o1AolKKKx86DDz6oRYsWxRZNKisr03PPPRc77pZ+OJmNGzfK4/HojjvuiO1zU3/88pe/lMfjiftcfPHFseNu6gtJOnjwoL773e9q1qxZOvvss3XppZeqra0tdtwtf0Pz8vKGfS88Ho8CgYCk5H4vXBNOGhoaFAwGtWHDBu3du1d+v18VFRXq7e1NdWljbmBgQH6/X5ZlnfT4r371Kz3wwAOqr6/X66+/rnPOOUcVFRX69NNPx7nSsbd7924FAgG99tpr2rlzpz777DN9+9vf1sDAQOycn/zkJ3rmmWe0bds27d69W4cOHdKKFStSWPXYmDdvnjZu3Kj29na1tbXp6quv1vXXX6+///3vktzTDydqbW3V73//ey1atChuv9v645JLLtGHH34Y+7zyyiuxY27qi//+97+64oordNZZZ+m5555TZ2en7r//fs2YMSN2jlv+hra2tsZ9J3bu3ClJuummmyQl+Xthu0RJSYkdCARi25FIxM7JybFra2tTWNX4k2Q/+eSTse1oNGpnZ2fbv/71r2P7jh49anu9XvuPf/xjCiocX729vbYke/fu3bZtf/67n3XWWfa2bdti5/zjH/+wJdl79uxJVZnjZsaMGfYjjzzi2n44duyYfcEFF9g7d+60v/Wtb9lr1661bdt934sNGzbYfr//pMfc1hd33XWXfeWVV57yuJv/hq5du9b+6le/akej0aR/L1xx52RoaEjt7e0qLy+P7UtLS1N5ebn27NmTwspSr6urSz09PXF9k5mZqdLSUlf0TV9fnyRp5syZkqT29nZ99tlncf1x8cUXKzc3d0L3RyQS0datWzUwMKCysjLX9kMgENDSpUvjfm/Jnd+L/fv3KycnR+eff75uvvlmffDBB5Lc1xdPP/20ioqKdNNNN2n27Nm67LLL9PDDD8eOu/Vv6NDQkB5//HGtXr1aHo8n6d8LV4STI0eOKBKJyOfzxe33+Xzq6elJUVVm+OL3d2PfRKNR3XHHHbriiiu0cOFCSZ/3R0ZGhqZPnx537kTtj7ffflvnnnuuvF6vbrvtNj355JPKz893XT9I0tatW7V3717V1tYOO+a2/igtLdWWLVu0Y8cOPfjgg+rq6tJVV12lY8eOua4v/v3vf+vBBx/UBRdcoOeff1633367fvzjH+uxxx6T5N6/oY2NjTp69KhuvfVWScn/f8RxbyUGkiUQCOidd96Je5buNhdddJE6OjrU19enP//5z6qqqtLu3btTXda46+7u1tq1a7Vz505Nnjw51eWkXGVlZey/Fy1apNLSUi1YsEB/+tOfdPbZZ6ewsvEXjUZVVFSk++67T5J02WWX6Z133lF9fb2qqqpSXF3q/OEPf1BlZaVycnLGpH1X3DnJyspSenr6sFHDoVBI2dnZKarKDF/8/m7rm+rqaj377LPatWuX5s2bF9ufnZ2toaEhHT16NO78idofGRkZ+trXvqbCwkLV1tbK7/frt7/9rev6ob29Xb29vfrGN76hSZMmadKkSdq9e7ceeOABTZo0ST6fz1X9caLp06frwgsv1IEDB1z33ZgzZ47y8/Pj9n3961+PPeZy49/Q999/Xy+++KK+//3vx/Yl+3vhinCSkZGhwsJCNTU1xfZFo1E1NTWprKwshZWl3nnnnafs7Oy4vgmHw3r99dcnZN/Ytq3q6mo9+eSTeumll3TeeefFHS8sLNRZZ50V1x/79u3TBx98MCH740TRaFSDg4Ou64drrrlGb7/9tjo6OmKfoqIi3XzzzbH/dlN/nKi/v1//+te/NGfOHNd9N6644ophyw3885//1IIFCyS572+oJD366KOaPXu2li5dGtuX9O9FEgfuGm3r1q221+u1t2zZYnd2dto/+MEP7OnTp9s9PT2pLm3MHTt2zH7jjTfsN954w5Zkb9q0yX7jjTfs999/37Zt2964caM9ffp0+6mnnrLfeust+/rrr7fPO+88+5NPPklx5cl3++2325mZmfbLL79sf/jhh7HPxx9/HDvntttus3Nzc+2XXnrJbmtrs8vKyuyysrIUVj02ampq7N27d9tdXV32W2+9ZdfU1Ngej8d+4YUXbNt2Tz+cyv+frWPb7uqPn/70p/bLL79sd3V12a+++qpdXl5uZ2Vl2b29vbZtu6svWlpa7EmTJtn33nuvvX//fvuJJ56wp0yZYj/++OOxc9z0NzQSidi5ubn2XXfdNexYMr8Xrgkntm3bv/vd7+zc3Fw7IyPDLikpsV977bVUlzQudu3aZUsa9qmqqrJt+/OpcL/4xS9sn89ne71e+5prrrH37duX2qLHyMn6QZL96KOPxs755JNP7B/+8If2jBkz7ClTptjLly+3P/zww9QVPUZWr15tL1iwwM7IyLC/8pWv2Ndcc00smNi2e/rhVE4MJ27qj5UrV9pz5syxMzIy7Llz59orV660Dxw4EDvupr6wbdt+5pln7IULF9per9e++OKL7YceeijuuJv+hj7//PO2pJP+fsn8Xnhs27ZHeGcHAAAg6Vwx5gQAADgH4QQAABiFcAIAAIxCOAEAAEYhnAAAAKMQTgAAgFEIJwAAwCiEEwAAYBTCCQAAMArhBAAAGIVwAgAAjEI4AQAARvk/g4HrboKNJNoAAAAASUVORK5CYII="},"metadata":{}}]},{"cell_type":"code","source":"df.drop(columns=['id','qid1','qid2'],inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.988119Z","iopub.execute_input":"2024-05-31T15:59:38.988422Z","iopub.status.idle":"2024-05-31T15:59:39.009014Z","shell.execute_reply.started":"2024-05-31T15:59:38.988396Z","shell.execute_reply":"2024-05-31T15:59:39.008163Z"},"trusted":true},"execution_count":19,"outputs":[]},{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.010182Z","iopub.execute_input":"2024-05-31T15:59:39.010484Z","iopub.status.idle":"2024-05-31T15:59:39.023858Z","shell.execute_reply.started":"2024-05-31T15:59:39.010459Z","shell.execute_reply":"2024-05-31T15:59:39.022788Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":" question1 \\\n0 what is the step by step guide to invest in sh... \n1 what is the story of kohinoor kohinoor diamond \n2 how can i increase the speed of my internet co... \n3 why am i mentally very lonely how can i solve it \n4 which one dissolve in water quikly sugar salt ... \n... ... \n199996 which of these tv shows should i watch next \n199997 should i change my name \n199998 should i buy the new macbook 2016 or one from ... \n199999 what is your review of love 2011 movie \n200000 can pakistan hit indian air craft carrier in a... \n\n question2 is_duplicate \n0 what is the step by step guide to invest in sh... 0 \n1 what would happen if the indian government sto... 0 \n2 how can internet speed be increased by hacking... 0 \n3 find the remainder when 2324math is divided by... 0 \n4 which fish would survive in salt water 0 \n... ... ... \n199996 what are some thriller shows i should watch next 0 \n199997 should i legally change my first name 0 \n199998 should i buy the new macbook pro 2016 or the m... 1 \n199999 what is your review of love birds 2011 movie 0 \n200000 can pakistan destroy an indian aircraft carrie... 0 \n\n[200000 rows x 3 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>question1</th>\n <th>question2</th>\n <th>is_duplicate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>what is the step by step guide to invest in sh...</td>\n <td>what is the step by step guide to invest in sh...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>what is the story of kohinoor kohinoor diamond</td>\n <td>what would happen if the indian government sto...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>how can i increase the speed of my internet co...</td>\n <td>how can internet speed be increased by hacking...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>why am i mentally very lonely how can i solve it</td>\n <td>find the remainder when 2324math is divided by...</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>which one dissolve in water quikly sugar salt ...</td>\n <td>which fish would survive in salt water</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>199996</th>\n <td>which of these tv shows should i watch next</td>\n <td>what are some thriller shows i should watch next</td>\n <td>0</td>\n </tr>\n <tr>\n <th>199997</th>\n <td>should i change my name</td>\n <td>should i legally change my first name</td>\n <td>0</td>\n </tr>\n <tr>\n <th>199998</th>\n <td>should i buy the new macbook 2016 or one from ...</td>\n <td>should i buy the new macbook pro 2016 or the m...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>199999</th>\n <td>what is your review of love 2011 movie</td>\n <td>what is your review of love birds 2011 movie</td>\n <td>0</td>\n </tr>\n <tr>\n <th>200000</th>\n <td>can pakistan hit indian air craft carrier in a...</td>\n <td>can pakistan destroy an indian aircraft carrie...</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>200000 rows × 3 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"df['q1_len']=df['question1'].str.len()\ndf['q2_len']=df['question2'].str.len()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.025425Z","iopub.execute_input":"2024-05-31T15:59:39.025810Z","iopub.status.idle":"2024-05-31T15:59:39.159384Z","shell.execute_reply.started":"2024-05-31T15:59:39.025774Z","shell.execute_reply":"2024-05-31T15:59:39.158511Z"},"trusted":true},"execution_count":21,"outputs":[]},{"cell_type":"code","source":"df['q1_num_words']=df['question1'].apply(lambda row: len(row.split(\" \")))\ndf['q2_num_words']=df['question2'].apply(lambda row: len(row.split(\" \")))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.160903Z","iopub.execute_input":"2024-05-31T15:59:39.161222Z","iopub.status.idle":"2024-05-31T15:59:39.633424Z","shell.execute_reply.started":"2024-05-31T15:59:39.161194Z","shell.execute_reply":"2024-05-31T15:59:39.632355Z"},"trusted":true},"execution_count":22,"outputs":[]},{"cell_type":"code","source":"def common_words(row):\n w1=set(map(lambda word: word.lower().strip(),row['question1'].split(\" \")))\n w2=set(map(lambda word: word.lower().strip(),row['question2'].split(\" \")))\n return len(w1 & w2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.634722Z","iopub.execute_input":"2024-05-31T15:59:39.635031Z","iopub.status.idle":"2024-05-31T15:59:39.641017Z","shell.execute_reply.started":"2024-05-31T15:59:39.635005Z","shell.execute_reply":"2024-05-31T15:59:39.639990Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"def total_words(row):\n w1=set(map(lambda word: word.lower().strip(),row['question1'].split(\" \")))\n w2=set(map(lambda word: word.lower().strip(),row['question2'].split(\" \")))\n return len(w1) + len(w2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.642318Z","iopub.execute_input":"2024-05-31T15:59:39.642632Z","iopub.status.idle":"2024-05-31T15:59:39.651761Z","shell.execute_reply.started":"2024-05-31T15:59:39.642600Z","shell.execute_reply":"2024-05-31T15:59:39.650721Z"},"trusted":true},"execution_count":24,"outputs":[]},{"cell_type":"code","source":"df['word_common']=df.apply(common_words,axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.652945Z","iopub.execute_input":"2024-05-31T15:59:39.653278Z","iopub.status.idle":"2024-05-31T15:59:43.993662Z","shell.execute_reply.started":"2024-05-31T15:59:39.653253Z","shell.execute_reply":"2024-05-31T15:59:43.992797Z"},"trusted":true},"execution_count":25,"outputs":[]},{"cell_type":"code","source":"df['word_total']=df.apply(total_words,axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:43.994773Z","iopub.execute_input":"2024-05-31T15:59:43.995058Z","iopub.status.idle":"2024-05-31T15:59:48.205245Z","shell.execute_reply.started":"2024-05-31T15:59:43.995018Z","shell.execute_reply":"2024-05-31T15:59:48.204131Z"},"trusted":true},"execution_count":26,"outputs":[]},{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.206484Z","iopub.execute_input":"2024-05-31T15:59:48.206787Z","iopub.status.idle":"2024-05-31T15:59:48.221588Z","shell.execute_reply.started":"2024-05-31T15:59:48.206762Z","shell.execute_reply":"2024-05-31T15:59:48.220559Z"},"trusted":true},"execution_count":27,"outputs":[{"execution_count":27,"output_type":"execute_result","data":{"text/plain":" question1 \\\n0 what is the step by step guide to invest in sh... \n1 what is the story of kohinoor kohinoor diamond \n2 how can i increase the speed of my internet co... \n3 why am i mentally very lonely how can i solve it \n4 which one dissolve in water quikly sugar salt ... \n... ... \n199996 which of these tv shows should i watch next \n199997 should i change my name \n199998 should i buy the new macbook 2016 or one from ... \n199999 what is your review of love 2011 movie \n200000 can pakistan hit indian air craft carrier in a... \n\n question2 is_duplicate \\\n0 what is the step by step guide to invest in sh... 0 \n1 what would happen if the indian government sto... 0 \n2 how can internet speed be increased by hacking... 0 \n3 find the remainder when 2324math is divided by... 0 \n4 which fish would survive in salt water 0 \n... ... ... \n199996 what are some thriller shows i should watch next 0 \n199997 should i legally change my first name 0 \n199998 should i buy the new macbook pro 2016 or the m... 1 \n199999 what is your review of love birds 2011 movie 0 \n200000 can pakistan destroy an indian aircraft carrie... 0 \n\n q1_len q2_len q1_num_words q2_num_words word_common word_total \n0 65 56 14 12 11 23 \n1 46 83 8 13 4 18 \n2 72 58 14 10 4 24 \n3 48 51 11 9 0 19 \n4 73 38 13 7 4 20 \n... ... ... ... ... ... ... \n199996 43 48 9 9 5 18 \n199997 23 37 5 7 5 12 \n199998 50 61 11 13 9 21 \n199999 38 44 8 9 8 17 \n200000 146 60 27 10 6 34 \n\n[200000 rows x 9 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>question1</th>\n <th>question2</th>\n <th>is_duplicate</th>\n <th>q1_len</th>\n <th>q2_len</th>\n <th>q1_num_words</th>\n <th>q2_num_words</th>\n <th>word_common</th>\n <th>word_total</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>what is the step by step guide to invest in sh...</td>\n <td>what is the step by step guide to invest in sh...</td>\n <td>0</td>\n <td>65</td>\n <td>56</td>\n <td>14</td>\n <td>12</td>\n <td>11</td>\n <td>23</td>\n </tr>\n <tr>\n <th>1</th>\n <td>what is the story of kohinoor kohinoor diamond</td>\n <td>what would happen if the indian government sto...</td>\n <td>0</td>\n <td>46</td>\n <td>83</td>\n <td>8</td>\n <td>13</td>\n <td>4</td>\n <td>18</td>\n </tr>\n <tr>\n <th>2</th>\n <td>how can i increase the speed of my internet co...</td>\n <td>how can internet speed be increased by hacking...</td>\n <td>0</td>\n <td>72</td>\n <td>58</td>\n <td>14</td>\n <td>10</td>\n <td>4</td>\n <td>24</td>\n </tr>\n <tr>\n <th>3</th>\n <td>why am i mentally very lonely how can i solve it</td>\n <td>find the remainder when 2324math is divided by...</td>\n <td>0</td>\n <td>48</td>\n <td>51</td>\n <td>11</td>\n <td>9</td>\n <td>0</td>\n <td>19</td>\n </tr>\n <tr>\n <th>4</th>\n <td>which one dissolve in water quikly sugar salt ...</td>\n <td>which fish would survive in salt water</td>\n <td>0</td>\n <td>73</td>\n <td>38</td>\n <td>13</td>\n <td>7</td>\n <td>4</td>\n <td>20</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>199996</th>\n <td>which of these tv shows should i watch next</td>\n <td>what are some thriller shows i should watch next</td>\n <td>0</td>\n <td>43</td>\n <td>48</td>\n <td>9</td>\n <td>9</td>\n <td>5</td>\n <td>18</td>\n </tr>\n <tr>\n <th>199997</th>\n <td>should i change my name</td>\n <td>should i legally change my first name</td>\n <td>0</td>\n <td>23</td>\n <td>37</td>\n <td>5</td>\n <td>7</td>\n <td>5</td>\n <td>12</td>\n </tr>\n <tr>\n <th>199998</th>\n <td>should i buy the new macbook 2016 or one from ...</td>\n <td>should i buy the new macbook pro 2016 or the m...</td>\n <td>1</td>\n <td>50</td>\n <td>61</td>\n <td>11</td>\n <td>13</td>\n <td>9</td>\n <td>21</td>\n </tr>\n <tr>\n <th>199999</th>\n <td>what is your review of love 2011 movie</td>\n <td>what is your review of love birds 2011 movie</td>\n <td>0</td>\n <td>38</td>\n <td>44</td>\n <td>8</td>\n <td>9</td>\n <td>8</td>\n <td>17</td>\n </tr>\n <tr>\n <th>200000</th>\n <td>can pakistan hit indian air craft carrier in a...</td>\n <td>can pakistan destroy an indian aircraft carrie...</td>\n <td>0</td>\n <td>146</td>\n <td>60</td>\n <td>27</td>\n <td>10</td>\n <td>6</td>\n <td>34</td>\n </tr>\n </tbody>\n</table>\n<p>200000 rows × 9 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"df['word_share']=round(df['word_common']/df['word_total'],2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.223003Z","iopub.execute_input":"2024-05-31T15:59:48.223347Z","iopub.status.idle":"2024-05-31T15:59:48.235292Z","shell.execute_reply.started":"2024-05-31T15:59:48.223321Z","shell.execute_reply":"2024-05-31T15:59:48.234269Z"},"trusted":true},"execution_count":28,"outputs":[]},{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.236483Z","iopub.execute_input":"2024-05-31T15:59:48.236780Z","iopub.status.idle":"2024-05-31T15:59:48.255263Z","shell.execute_reply.started":"2024-05-31T15:59:48.236750Z","shell.execute_reply":"2024-05-31T15:59:48.254206Z"},"trusted":true},"execution_count":29,"outputs":[{"execution_count":29,"output_type":"execute_result","data":{"text/plain":" question1 \\\n0 what is the step by step guide to invest in sh... \n1 what is the story of kohinoor kohinoor diamond \n2 how can i increase the speed of my internet co... \n3 why am i mentally very lonely how can i solve it \n4 which one dissolve in water quikly sugar salt ... \n... ... \n199996 which of these tv shows should i watch next \n199997 should i change my name \n199998 should i buy the new macbook 2016 or one from ... \n199999 what is your review of love 2011 movie \n200000 can pakistan hit indian air craft carrier in a... \n\n question2 is_duplicate \\\n0 what is the step by step guide to invest in sh... 0 \n1 what would happen if the indian government sto... 0 \n2 how can internet speed be increased by hacking... 0 \n3 find the remainder when 2324math is divided by... 0 \n4 which fish would survive in salt water 0 \n... ... ... \n199996 what are some thriller shows i should watch next 0 \n199997 should i legally change my first name 0 \n199998 should i buy the new macbook pro 2016 or the m... 1 \n199999 what is your review of love birds 2011 movie 0 \n200000 can pakistan destroy an indian aircraft carrie... 0 \n\n q1_len q2_len q1_num_words q2_num_words word_common word_total \\\n0 65 56 14 12 11 23 \n1 46 83 8 13 4 18 \n2 72 58 14 10 4 24 \n3 48 51 11 9 0 19 \n4 73 38 13 7 4 20 \n... ... ... ... ... ... ... \n199996 43 48 9 9 5 18 \n199997 23 37 5 7 5 12 \n199998 50 61 11 13 9 21 \n199999 38 44 8 9 8 17 \n200000 146 60 27 10 6 34 \n\n word_share \n0 0.48 \n1 0.22 \n2 0.17 \n3 0.00 \n4 0.20 \n... ... \n199996 0.28 \n199997 0.42 \n199998 0.43 \n199999 0.47 \n200000 0.18 \n\n[200000 rows x 10 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>question1</th>\n <th>question2</th>\n <th>is_duplicate</th>\n <th>q1_len</th>\n <th>q2_len</th>\n <th>q1_num_words</th>\n <th>q2_num_words</th>\n <th>word_common</th>\n <th>word_total</th>\n <th>word_share</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>what is the step by step guide to invest in sh...</td>\n <td>what is the step by step guide to invest in sh...</td>\n <td>0</td>\n <td>65</td>\n <td>56</td>\n <td>14</td>\n <td>12</td>\n <td>11</td>\n <td>23</td>\n <td>0.48</td>\n </tr>\n <tr>\n <th>1</th>\n <td>what is the story of kohinoor kohinoor diamond</td>\n <td>what would happen if the indian government sto...</td>\n <td>0</td>\n <td>46</td>\n <td>83</td>\n <td>8</td>\n <td>13</td>\n <td>4</td>\n <td>18</td>\n <td>0.22</td>\n </tr>\n <tr>\n <th>2</th>\n <td>how can i increase the speed of my internet co...</td>\n <td>how can internet speed be increased by hacking...</td>\n <td>0</td>\n <td>72</td>\n <td>58</td>\n <td>14</td>\n <td>10</td>\n <td>4</td>\n <td>24</td>\n <td>0.17</td>\n </tr>\n <tr>\n <th>3</th>\n <td>why am i mentally very lonely how can i solve it</td>\n <td>find the remainder when 2324math is divided by...</td>\n <td>0</td>\n <td>48</td>\n <td>51</td>\n <td>11</td>\n <td>9</td>\n <td>0</td>\n <td>19</td>\n <td>0.00</td>\n </tr>\n <tr>\n <th>4</th>\n <td>which one dissolve in water quikly sugar salt ...</td>\n <td>which fish would survive in salt water</td>\n <td>0</td>\n <td>73</td>\n <td>38</td>\n <td>13</td>\n <td>7</td>\n <td>4</td>\n <td>20</td>\n <td>0.20</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>199996</th>\n <td>which of these tv shows should i watch next</td>\n <td>what are some thriller shows i should watch next</td>\n <td>0</td>\n <td>43</td>\n <td>48</td>\n <td>9</td>\n <td>9</td>\n <td>5</td>\n <td>18</td>\n <td>0.28</td>\n </tr>\n <tr>\n <th>199997</th>\n <td>should i change my name</td>\n <td>should i legally change my first name</td>\n <td>0</td>\n <td>23</td>\n <td>37</td>\n <td>5</td>\n <td>7</td>\n <td>5</td>\n <td>12</td>\n <td>0.42</td>\n </tr>\n <tr>\n <th>199998</th>\n <td>should i buy the new macbook 2016 or one from ...</td>\n <td>should i buy the new macbook pro 2016 or the m...</td>\n <td>1</td>\n <td>50</td>\n <td>61</td>\n <td>11</td>\n <td>13</td>\n <td>9</td>\n <td>21</td>\n <td>0.43</td>\n </tr>\n <tr>\n <th>199999</th>\n <td>what is your review of love 2011 movie</td>\n <td>what is your review of love birds 2011 movie</td>\n <td>0</td>\n <td>38</td>\n <td>44</td>\n <td>8</td>\n <td>9</td>\n <td>8</td>\n <td>17</td>\n <td>0.47</td>\n </tr>\n <tr>\n <th>200000</th>\n <td>can pakistan hit indian air craft carrier in a...</td>\n <td>can pakistan destroy an indian aircraft carrie...</td>\n <td>0</td>\n <td>146</td>\n <td>60</td>\n <td>27</td>\n <td>10</td>\n <td>6</td>\n <td>34</td>\n <td>0.18</td>\n </tr>\n </tbody>\n</table>\n<p>200000 rows × 10 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"ndf1=df[['question1','question2']]\nndf2=df.drop(columns=['question1','question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.256585Z","iopub.execute_input":"2024-05-31T15:59:48.256964Z","iopub.status.idle":"2024-05-31T15:59:48.280291Z","shell.execute_reply.started":"2024-05-31T15:59:48.256930Z","shell.execute_reply":"2024-05-31T15:59:48.279254Z"},"trusted":true},"execution_count":30,"outputs":[]},{"cell_type":"code","source":"ndf1","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.281704Z","iopub.execute_input":"2024-05-31T15:59:48.282112Z","iopub.status.idle":"2024-05-31T15:59:48.295253Z","shell.execute_reply.started":"2024-05-31T15:59:48.282079Z","shell.execute_reply":"2024-05-31T15:59:48.294063Z"},"trusted":true},"execution_count":31,"outputs":[{"execution_count":31,"output_type":"execute_result","data":{"text/plain":" question1 \\\n0 what is the step by step guide to invest in sh... \n1 what is the story of kohinoor kohinoor diamond \n2 how can i increase the speed of my internet co... \n3 why am i mentally very lonely how can i solve it \n4 which one dissolve in water quikly sugar salt ... \n... ... \n199996 which of these tv shows should i watch next \n199997 should i change my name \n199998 should i buy the new macbook 2016 or one from ... \n199999 what is your review of love 2011 movie \n200000 can pakistan hit indian air craft carrier in a... \n\n question2 \n0 what is the step by step guide to invest in sh... \n1 what would happen if the indian government sto... \n2 how can internet speed be increased by hacking... \n3 find the remainder when 2324math is divided by... \n4 which fish would survive in salt water \n... ... \n199996 what are some thriller shows i should watch next \n199997 should i legally change my first name \n199998 should i buy the new macbook pro 2016 or the m... \n199999 what is your review of love birds 2011 movie \n200000 can pakistan destroy an indian aircraft carrie... \n\n[200000 rows x 2 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>question1</th>\n <th>question2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>what is the step by step guide to invest in sh...</td>\n <td>what is the step by step guide to invest in sh...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>what is the story of kohinoor kohinoor diamond</td>\n <td>what would happen if the indian government sto...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>how can i increase the speed of my internet co...</td>\n <td>how can internet speed be increased by hacking...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>why am i mentally very lonely how can i solve it</td>\n <td>find the remainder when 2324math is divided by...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>which one dissolve in water quikly sugar salt ...</td>\n <td>which fish would survive in salt water</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>199996</th>\n <td>which of these tv shows should i watch next</td>\n <td>what are some thriller shows i should watch next</td>\n </tr>\n <tr>\n <th>199997</th>\n <td>should i change my name</td>\n <td>should i legally change my first name</td>\n </tr>\n <tr>\n <th>199998</th>\n <td>should i buy the new macbook 2016 or one from ...</td>\n <td>should i buy the new macbook pro 2016 or the m...</td>\n </tr>\n <tr>\n <th>199999</th>\n <td>what is your review of love 2011 movie</td>\n <td>what is your review of love birds 2011 movie</td>\n </tr>\n <tr>\n <th>200000</th>\n <td>can pakistan hit indian air craft carrier in a...</td>\n <td>can pakistan destroy an indian aircraft carrie...</td>\n </tr>\n </tbody>\n</table>\n<p>200000 rows × 2 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"ndf2","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.296620Z","iopub.execute_input":"2024-05-31T15:59:48.297003Z","iopub.status.idle":"2024-05-31T15:59:48.316699Z","shell.execute_reply.started":"2024-05-31T15:59:48.296972Z","shell.execute_reply":"2024-05-31T15:59:48.315499Z"},"trusted":true},"execution_count":32,"outputs":[{"execution_count":32,"output_type":"execute_result","data":{"text/plain":" is_duplicate q1_len q2_len q1_num_words q2_num_words word_common \\\n0 0 65 56 14 12 11 \n1 0 46 83 8 13 4 \n2 0 72 58 14 10 4 \n3 0 48 51 11 9 0 \n4 0 73 38 13 7 4 \n... ... ... ... ... ... ... \n199996 0 43 48 9 9 5 \n199997 0 23 37 5 7 5 \n199998 1 50 61 11 13 9 \n199999 0 38 44 8 9 8 \n200000 0 146 60 27 10 6 \n\n word_total word_share \n0 23 0.48 \n1 18 0.22 \n2 24 0.17 \n3 19 0.00 \n4 20 0.20 \n... ... ... \n199996 18 0.28 \n199997 12 0.42 \n199998 21 0.43 \n199999 17 0.47 \n200000 34 0.18 \n\n[200000 rows x 8 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>is_duplicate</th>\n <th>q1_len</th>\n <th>q2_len</th>\n <th>q1_num_words</th>\n <th>q2_num_words</th>\n <th>word_common</th>\n <th>word_total</th>\n <th>word_share</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>65</td>\n <td>56</td>\n <td>14</td>\n <td>12</td>\n <td>11</td>\n <td>23</td>\n <td>0.48</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0</td>\n <td>46</td>\n <td>83</td>\n <td>8</td>\n <td>13</td>\n <td>4</td>\n <td>18</td>\n <td>0.22</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0</td>\n <td>72</td>\n <td>58</td>\n <td>14</td>\n <td>10</td>\n <td>4</td>\n <td>24</td>\n <td>0.17</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0</td>\n <td>48</td>\n <td>51</td>\n <td>11</td>\n <td>9</td>\n <td>0</td>\n <td>19</td>\n <td>0.00</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0</td>\n <td>73</td>\n <td>38</td>\n <td>13</td>\n <td>7</td>\n <td>4</td>\n <td>20</td>\n <td>0.20</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>199996</th>\n <td>0</td>\n <td>43</td>\n <td>48</td>\n <td>9</td>\n <td>9</td>\n <td>5</td>\n <td>18</td>\n <td>0.28</td>\n </tr>\n <tr>\n <th>199997</th>\n <td>0</td>\n <td>23</td>\n <td>37</td>\n <td>5</td>\n <td>7</td>\n <td>5</td>\n <td>12</td>\n <td>0.42</td>\n </tr>\n <tr>\n <th>199998</th>\n <td>1</td>\n <td>50</td>\n <td>61</td>\n <td>11</td>\n <td>13</td>\n <td>9</td>\n <td>21</td>\n <td>0.43</td>\n </tr>\n <tr>\n <th>199999</th>\n <td>0</td>\n <td>38</td>\n <td>44</td>\n <td>8</td>\n <td>9</td>\n <td>8</td>\n <td>17</td>\n <td>0.47</td>\n </tr>\n <tr>\n <th>200000</th>\n <td>0</td>\n <td>146</td>\n <td>60</td>\n <td>27</td>\n <td>10</td>\n <td>6</td>\n <td>34</td>\n <td>0.18</td>\n </tr>\n </tbody>\n</table>\n<p>200000 rows × 8 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"from nltk.corpus import stopwords\n\ndef fetch_token_features(row):\n \n q1 = row['question1']\n q2 = row['question2']\n \n SAFE_DIV = 0.0001 \n\n STOP_WORDS = stopwords.words(\"english\")\n \n token_features = [0.0]*8\n \n # Converting the Sentence into Tokens: \n q1_tokens = q1.split()\n q2_tokens = q2.split()\n \n if len(q1_tokens) == 0 or len(q2_tokens) == 0:\n return token_features\n\n # Get the non-stopwords in Questions\n q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])\n q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])\n \n #Get the stopwords in Questions\n q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])\n q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])\n \n # Get the common non-stopwords from Question pair\n common_word_count = len(q1_words.intersection(q2_words))\n \n # Get the common stopwords from Question pair\n common_stop_count = len(q1_stops.intersection(q2_stops))\n \n # Get the common Tokens from Question pair\n common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))\n \n \n token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)\n token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)\n token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)\n token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)\n token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)\n token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)\n \n # Last word of both question is same or not\n token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])\n \n # First word of both question is same or not\n token_features[7] = int(q1_tokens[0] == q2_tokens[0])\n \n return token_features","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.318063Z","iopub.execute_input":"2024-05-31T15:59:48.318457Z","iopub.status.idle":"2024-05-31T15:59:49.088959Z","shell.execute_reply.started":"2024-05-31T15:59:48.318418Z","shell.execute_reply":"2024-05-31T15:59:49.088070Z"},"trusted":true},"execution_count":33,"outputs":[]},{"cell_type":"code","source":"token_features = df.apply(fetch_token_features, axis=1)\n\ndf[\"cwc_min\"] = list(map(lambda x: x[0], token_features))\ndf[\"cwc_max\"] = list(map(lambda x: x[1], token_features))\ndf[\"csc_min\"] = list(map(lambda x: x[2], token_features))\ndf[\"csc_max\"] = list(map(lambda x: x[3], token_features))\ndf[\"ctc_min\"] = list(map(lambda x: x[4], token_features))\ndf[\"ctc_max\"] = list(map(lambda x: x[5], token_features))\ndf[\"last_word_eq\"] = list(map(lambda x: x[6], token_features))\ndf[\"first_word_eq\"] = list(map(lambda x: x[7], token_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:49.090329Z","iopub.execute_input":"2024-05-31T15:59:49.090993Z","iopub.status.idle":"2024-05-31T16:00:39.809819Z","shell.execute_reply.started":"2024-05-31T15:59:49.090955Z","shell.execute_reply":"2024-05-31T16:00:39.808943Z"},"trusted":true},"execution_count":34,"outputs":[]},{"cell_type":"code","source":"pip install distance","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:00:39.811472Z","iopub.execute_input":"2024-05-31T16:00:39.811883Z","iopub.status.idle":"2024-05-31T16:00:54.852520Z","shell.execute_reply.started":"2024-05-31T16:00:39.811847Z","shell.execute_reply":"2024-05-31T16:00:54.851156Z"},"trusted":true},"execution_count":35,"outputs":[{"name":"stdout","text":"Collecting distance\n Downloading Distance-0.1.3.tar.gz (180 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m180.3/180.3 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n\u001b[?25hBuilding wheels for collected packages: distance\n Building wheel for distance (setup.py) ... \u001b[?25ldone\n\u001b[?25h Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16258 sha256=cd544d5c1039ea6345ff5a69695ae0ef0e616e019bdaf0ccaadf6d5845ffc9ac\n Stored in directory: /root/.cache/pip/wheels/e8/bb/de/f71bf63559ea9a921059a5405806f7ff6ed612a9231c4a9309\nSuccessfully built distance\n\u001b[33mWARNING: Error parsing requirements for aiohttp: [Errno 2] No such file or directory: '/opt/conda/lib/python3.10/site-packages/aiohttp-3.9.1.dist-info/METADATA'\u001b[0m\u001b[33m\n\u001b[0mInstalling collected packages: distance\nSuccessfully installed distance-0.1.3\nNote: you may need to restart the kernel to use updated packages.\n","output_type":"stream"}]},{"cell_type":"code","source":"import distance\n\ndef fetch_length_features(row):\n \n q1 = row['question1']\n q2 = row['question2']\n \n length_features = [0.0]*3\n \n # Converting the Sentence into Tokens: \n q1_tokens = q1.split()\n q2_tokens = q2.split()\n \n if len(q1_tokens) == 0 or len(q2_tokens) == 0:\n return length_features\n \n # Absolute length features\n length_features[0] = abs(len(q1_tokens) - len(q2_tokens))\n \n # Average Token Length of both Questions\n length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2\n \n # Find the longest common substring\n strs = list(distance.lcsubstrings(q1, q2))\n if strs:\n length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)\n else:\n length_features[2] = 0.0\n \n return length_features","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:00:54.854466Z","iopub.execute_input":"2024-05-31T16:00:54.855416Z","iopub.status.idle":"2024-05-31T16:00:54.868756Z","shell.execute_reply.started":"2024-05-31T16:00:54.855371Z","shell.execute_reply":"2024-05-31T16:00:54.867731Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"code","source":"length_features = df.apply(fetch_length_features, axis=1)\n\ndf['abs_len_diff'] = list(map(lambda x: x[0], length_features))\ndf['mean_len'] = list(map(lambda x: x[1], length_features))\ndf['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:00:54.870202Z","iopub.execute_input":"2024-05-31T16:00:54.870885Z","iopub.status.idle":"2024-05-31T16:03:34.399606Z","shell.execute_reply.started":"2024-05-31T16:00:54.870849Z","shell.execute_reply":"2024-05-31T16:03:34.398480Z"},"trusted":true},"execution_count":37,"outputs":[]},{"cell_type":"code","source":"# Fuzzy Features\nfrom fuzzywuzzy import fuzz\n\ndef fetch_fuzzy_features(row):\n \n q1 = row['question1']\n q2 = row['question2']\n \n fuzzy_features = [0.0]*4\n \n # fuzz_ratio\n fuzzy_features[0] = fuzz.QRatio(q1, q2)\n\n # fuzz_partial_ratio\n fuzzy_features[1] = fuzz.partial_ratio(q1, q2)\n\n # token_sort_ratio\n fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)\n\n # token_set_ratio\n fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)\n\n return fuzzy_features","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:03:34.401346Z","iopub.execute_input":"2024-05-31T16:03:34.401740Z","iopub.status.idle":"2024-05-31T16:03:34.415928Z","shell.execute_reply.started":"2024-05-31T16:03:34.401703Z","shell.execute_reply":"2024-05-31T16:03:34.414845Z"},"trusted":true},"execution_count":38,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n","output_type":"stream"}]},{"cell_type":"code","source":"fuzzy_features = df.apply(fetch_fuzzy_features, axis=1)\n\n# Creating new feature columns for fuzzy features\ndf['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))\ndf['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))\ndf['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))\ndf['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:03:34.417222Z","iopub.execute_input":"2024-05-31T16:03:34.417541Z","iopub.status.idle":"2024-05-31T16:12:37.749312Z","shell.execute_reply.started":"2024-05-31T16:03:34.417507Z","shell.execute_reply":"2024-05-31T16:12:37.748091Z"},"trusted":true},"execution_count":39,"outputs":[]},{"cell_type":"code","source":"ndf1=df[['question1','question2']]\nndf2=df.drop(columns=['question1','question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:12:37.750792Z","iopub.execute_input":"2024-05-31T16:12:37.751247Z","iopub.status.idle":"2024-05-31T16:12:37.796876Z","shell.execute_reply.started":"2024-05-31T16:12:37.751211Z","shell.execute_reply":"2024-05-31T16:12:37.796026Z"},"trusted":true},"execution_count":40,"outputs":[]},{"cell_type":"code","source":"from sklearn.feature_extraction.text import TfidfVectorizer","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:12:37.806016Z","iopub.execute_input":"2024-05-31T16:12:37.806372Z","iopub.status.idle":"2024-05-31T16:12:37.811136Z","shell.execute_reply.started":"2024-05-31T16:12:37.806344Z","shell.execute_reply":"2024-05-31T16:12:37.810107Z"},"trusted":true},"execution_count":41,"outputs":[]},{"cell_type":"code","source":"cv=TfidfVectorizer(max_features=1000)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:11.173544Z","iopub.execute_input":"2024-05-31T16:13:11.173929Z","iopub.status.idle":"2024-05-31T16:13:11.179093Z","shell.execute_reply.started":"2024-05-31T16:13:11.173896Z","shell.execute_reply":"2024-05-31T16:13:11.177928Z"},"trusted":true},"execution_count":45,"outputs":[]},{"cell_type":"code","source":"\nquestions=list(ndf1['question1'])+list(ndf1['question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:12.853466Z","iopub.execute_input":"2024-05-31T16:13:12.853846Z","iopub.status.idle":"2024-05-31T16:13:12.911399Z","shell.execute_reply.started":"2024-05-31T16:13:12.853814Z","shell.execute_reply":"2024-05-31T16:13:12.910271Z"},"trusted":true},"execution_count":46,"outputs":[]},{"cell_type":"code","source":"","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"q1_arr,q2_arr=np.vsplit(cv.fit_transform(questions).toarray(),2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:17.135566Z","iopub.execute_input":"2024-05-31T16:13:17.135964Z","iopub.status.idle":"2024-05-31T16:13:26.663959Z","shell.execute_reply.started":"2024-05-31T16:13:17.135933Z","shell.execute_reply":"2024-05-31T16:13:26.662828Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"code","source":"\ntemp_df=pd.concat([pd.DataFrame(q1_arr,index=ndf1.index),pd.DataFrame(q2_arr,index=ndf1.index)],axis=1)\ntemp_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:26.665906Z","iopub.execute_input":"2024-05-31T16:13:26.666258Z","iopub.status.idle":"2024-05-31T16:13:34.625571Z","shell.execute_reply.started":"2024-05-31T16:13:26.666230Z","shell.execute_reply":"2024-05-31T16:13:34.624673Z"},"trusted":true},"execution_count":48,"outputs":[{"execution_count":48,"output_type":"execute_result","data":{"text/plain":"(200000, 2000)"},"metadata":{}}]},{"cell_type":"code","source":"q1_arr=\"\"\nq2_arr=\"\"","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:34.627158Z","iopub.execute_input":"2024-05-31T16:13:34.627867Z","iopub.status.idle":"2024-05-31T16:13:34.791511Z","shell.execute_reply.started":"2024-05-31T16:13:34.627828Z","shell.execute_reply":"2024-05-31T16:13:34.790413Z"},"trusted":true},"execution_count":49,"outputs":[]},{"cell_type":"code","source":"temp_df=pd.concat([ndf2,temp_df],axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:34.794367Z","iopub.execute_input":"2024-05-31T16:13:34.795113Z","iopub.status.idle":"2024-05-31T16:13:38.803374Z","shell.execute_reply.started":"2024-05-31T16:13:34.795080Z","shell.execute_reply":"2024-05-31T16:13:38.802436Z"},"trusted":true},"execution_count":50,"outputs":[]},{"cell_type":"code","source":"temp_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.804531Z","iopub.execute_input":"2024-05-31T16:13:38.804807Z","iopub.status.idle":"2024-05-31T16:13:38.810942Z","shell.execute_reply.started":"2024-05-31T16:13:38.804784Z","shell.execute_reply":"2024-05-31T16:13:38.809885Z"},"trusted":true},"execution_count":51,"outputs":[{"execution_count":51,"output_type":"execute_result","data":{"text/plain":"(200000, 2023)"},"metadata":{}}]},{"cell_type":"code","source":"temp_df['is_duplicate']","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.812454Z","iopub.execute_input":"2024-05-31T16:13:38.813079Z","iopub.status.idle":"2024-05-31T16:13:38.827266Z","shell.execute_reply.started":"2024-05-31T16:13:38.813018Z","shell.execute_reply":"2024-05-31T16:13:38.826128Z"},"trusted":true},"execution_count":52,"outputs":[{"execution_count":52,"output_type":"execute_result","data":{"text/plain":"0 0\n1 0\n2 0\n3 0\n4 0\n ..\n199996 0\n199997 0\n199998 1\n199999 0\n200000 0\nName: is_duplicate, Length: 200000, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"temp_df.columns = temp_df.columns.astype(str)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.828616Z","iopub.execute_input":"2024-05-31T16:13:38.829513Z","iopub.status.idle":"2024-05-31T16:13:38.838173Z","shell.execute_reply.started":"2024-05-31T16:13:38.829475Z","shell.execute_reply":"2024-05-31T16:13:38.837202Z"},"trusted":true},"execution_count":53,"outputs":[]},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.839553Z","iopub.execute_input":"2024-05-31T16:13:38.839956Z","iopub.status.idle":"2024-05-31T16:13:38.849624Z","shell.execute_reply.started":"2024-05-31T16:13:38.839921Z","shell.execute_reply":"2024-05-31T16:13:38.848631Z"},"trusted":true},"execution_count":54,"outputs":[]},{"cell_type":"code","source":"\nx_train,x_test,y_train,y_test=train_test_split(temp_df.drop(columns='is_duplicate'),temp_df['is_duplicate'],test_size=0.1,random_state=3)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.851064Z","iopub.execute_input":"2024-05-31T16:13:38.851910Z","iopub.status.idle":"2024-05-31T16:13:42.573240Z","shell.execute_reply.started":"2024-05-31T16:13:38.851873Z","shell.execute_reply":"2024-05-31T16:13:42.572007Z"},"trusted":true},"execution_count":55,"outputs":[]},{"cell_type":"code","source":"temp_df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:45.458273Z","iopub.execute_input":"2024-05-31T16:13:45.458942Z","iopub.status.idle":"2024-05-31T16:13:45.547209Z","shell.execute_reply.started":"2024-05-31T16:13:45.458905Z","shell.execute_reply":"2024-05-31T16:13:45.546132Z"},"trusted":true},"execution_count":56,"outputs":[{"execution_count":56,"output_type":"execute_result","data":{"text/plain":" is_duplicate q1_len q2_len q1_num_words q2_num_words word_common \\\n0 0 65 56 14 12 11 \n1 0 46 83 8 13 4 \n2 0 72 58 14 10 4 \n3 0 48 51 11 9 0 \n4 0 73 38 13 7 4 \n... ... ... ... ... ... ... \n199996 0 43 48 9 9 5 \n199997 0 23 37 5 7 5 \n199998 1 50 61 11 13 9 \n199999 0 38 44 8 9 8 \n200000 0 146 60 27 10 6 \n\n word_total word_share cwc_min cwc_max ... 990 991 992 993 \\\n0 23 0.48 0.999980 0.833319 ... 0.0 0.0 0.0 0.0 \n1 18 0.22 0.666644 0.249997 ... 0.0 0.0 0.0 0.0 \n2 24 0.17 0.399992 0.333328 ... 0.0 0.0 0.0 0.0 \n3 19 0.00 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 \n4 20 0.20 0.399992 0.199998 ... 0.0 0.0 0.0 0.0 \n... ... ... ... ... ... ... ... ... ... \n199996 18 0.28 0.749981 0.749981 ... 0.0 0.0 0.0 0.0 \n199997 12 0.42 0.999950 0.499988 ... 0.0 0.0 0.0 0.0 \n199998 21 0.43 0.833319 0.833319 ... 0.0 0.0 0.0 0.0 \n199999 17 0.47 0.999975 0.799984 ... 0.0 0.0 0.0 0.0 \n200000 34 0.18 0.666656 0.222221 ... 0.0 0.0 0.0 0.0 \n\n 994 995 996 997 998 999 \n0 0.0 0.0 0.0 0.000000 0.0 0.0 \n1 0.0 0.0 0.0 0.000000 0.0 0.0 \n2 0.0 0.0 0.0 0.000000 0.0 0.0 \n3 0.0 0.0 0.0 0.000000 0.0 0.0 \n4 0.0 0.0 0.0 0.000000 0.0 0.0 \n... ... ... ... ... ... ... \n199996 0.0 0.0 0.0 0.000000 0.0 0.0 \n199997 0.0 0.0 0.0 0.000000 0.0 0.0 \n199998 0.0 0.0 0.0 0.000000 0.0 0.0 \n199999 0.0 0.0 0.0 0.344384 0.0 0.0 \n200000 0.0 0.0 0.0 0.000000 0.0 0.0 \n\n[200000 rows x 2023 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>is_duplicate</th>\n <th>q1_len</th>\n <th>q2_len</th>\n <th>q1_num_words</th>\n <th>q2_num_words</th>\n <th>word_common</th>\n <th>word_total</th>\n <th>word_share</th>\n <th>cwc_min</th>\n <th>cwc_max</th>\n <th>...</th>\n <th>990</th>\n <th>991</th>\n <th>992</th>\n <th>993</th>\n <th>994</th>\n <th>995</th>\n <th>996</th>\n <th>997</th>\n <th>998</th>\n <th>999</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>65</td>\n <td>56</td>\n <td>14</td>\n <td>12</td>\n <td>11</td>\n <td>23</td>\n <td>0.48</td>\n <td>0.999980</td>\n <td>0.833319</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0</td>\n <td>46</td>\n <td>83</td>\n <td>8</td>\n <td>13</td>\n <td>4</td>\n <td>18</td>\n <td>0.22</td>\n <td>0.666644</td>\n <td>0.249997</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0</td>\n <td>72</td>\n <td>58</td>\n <td>14</td>\n <td>10</td>\n <td>4</td>\n <td>24</td>\n <td>0.17</td>\n <td>0.399992</td>\n <td>0.333328</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0</td>\n <td>48</td>\n <td>51</td>\n <td>11</td>\n <td>9</td>\n <td>0</td>\n <td>19</td>\n <td>0.00</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0</td>\n <td>73</td>\n <td>38</td>\n <td>13</td>\n <td>7</td>\n <td>4</td>\n <td>20</td>\n <td>0.20</td>\n <td>0.399992</td>\n <td>0.199998</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>199996</th>\n <td>0</td>\n <td>43</td>\n <td>48</td>\n <td>9</td>\n <td>9</td>\n <td>5</td>\n <td>18</td>\n <td>0.28</td>\n <td>0.749981</td>\n <td>0.749981</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>199997</th>\n <td>0</td>\n <td>23</td>\n <td>37</td>\n <td>5</td>\n <td>7</td>\n <td>5</td>\n <td>12</td>\n <td>0.42</td>\n <td>0.999950</td>\n <td>0.499988</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>199998</th>\n <td>1</td>\n <td>50</td>\n <td>61</td>\n <td>11</td>\n <td>13</td>\n <td>9</td>\n <td>21</td>\n <td>0.43</td>\n <td>0.833319</td>\n <td>0.833319</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>199999</th>\n <td>0</td>\n <td>38</td>\n <td>44</td>\n <td>8</td>\n <td>9</td>\n <td>8</td>\n <td>17</td>\n <td>0.47</td>\n <td>0.999975</td>\n <td>0.799984</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.344384</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>200000</th>\n <td>0</td>\n <td>146</td>\n <td>60</td>\n <td>27</td>\n <td>10</td>\n <td>6</td>\n <td>34</td>\n <td>0.18</td>\n <td>0.666656</td>\n <td>0.222221</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n </tbody>\n</table>\n<p>200000 rows × 2023 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"temp_df.columns = temp_df.columns.astype(str)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:45.723629Z","iopub.execute_input":"2024-05-31T16:13:45.724341Z","iopub.status.idle":"2024-05-31T16:13:45.729720Z","shell.execute_reply.started":"2024-05-31T16:13:45.724307Z","shell.execute_reply":"2024-05-31T16:13:45.728559Z"},"trusted":true},"execution_count":57,"outputs":[]},{"cell_type":"code","source":"from sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:45.948658Z","iopub.execute_input":"2024-05-31T16:13:45.949487Z","iopub.status.idle":"2024-05-31T16:13:46.109297Z","shell.execute_reply.started":"2024-05-31T16:13:45.949447Z","shell.execute_reply":"2024-05-31T16:13:46.108278Z"},"trusted":true},"execution_count":58,"outputs":[]},{"cell_type":"code","source":"\nrf=RandomForestClassifier()\nrf.fit(x_train,y_train)\ny_pred=rf.predict(x_test)\naccuracy_score(y_test,y_pred)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:46.859901Z","iopub.execute_input":"2024-05-31T16:13:46.860985Z","iopub.status.idle":"2024-05-31T16:18:53.609846Z","shell.execute_reply.started":"2024-05-31T16:13:46.860950Z","shell.execute_reply":"2024-05-31T16:18:53.608761Z"},"trusted":true},"execution_count":59,"outputs":[{"execution_count":59,"output_type":"execute_result","data":{"text/plain":"0.8151"},"metadata":{}}]},{"cell_type":"code","source":"import pickle\nmodel_pkl_file = \"RF.pkl\" \n\nwith open(model_pkl_file, 'wb') as file: \n pickle.dump(rf, file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:53.611939Z","iopub.execute_input":"2024-05-31T16:18:53.612764Z","iopub.status.idle":"2024-05-31T16:18:54.365381Z","shell.execute_reply.started":"2024-05-31T16:18:53.612727Z","shell.execute_reply":"2024-05-31T16:18:54.364401Z"},"trusted":true},"execution_count":60,"outputs":[]},{"cell_type":"code","source":"model_pkl_file = \"BOW.pkl\" \n\nwith open(model_pkl_file, 'wb') as file: \n pickle.dump(cv, file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:54.366547Z","iopub.execute_input":"2024-05-31T16:18:54.366856Z","iopub.status.idle":"2024-05-31T16:18:54.400438Z","shell.execute_reply.started":"2024-05-31T16:18:54.366830Z","shell.execute_reply":"2024-05-31T16:18:54.399515Z"},"trusted":true},"execution_count":61,"outputs":[]},{"cell_type":"code","source":"import pickle\nwith open(\"/kaggle/working/BOW.pkl\", 'rb') as file: \n cv = pickle.load(file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:54.401578Z","iopub.execute_input":"2024-05-31T16:18:54.401866Z","iopub.status.idle":"2024-05-31T16:18:54.429997Z","shell.execute_reply.started":"2024-05-31T16:18:54.401842Z","shell.execute_reply":"2024-05-31T16:18:54.429102Z"},"trusted":true},"execution_count":62,"outputs":[]},{"cell_type":"code","source":"with open(\"/kaggle/working/RF.pkl\", 'rb') as file: \n rf = pickle.load(file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:54.432348Z","iopub.execute_input":"2024-05-31T16:18:54.432667Z","iopub.status.idle":"2024-05-31T16:18:55.121525Z","shell.execute_reply.started":"2024-05-31T16:18:54.432641Z","shell.execute_reply":"2024-05-31T16:18:55.120436Z"},"trusted":true},"execution_count":63,"outputs":[]},{"cell_type":"code","source":"df=pd.read_csv(\"/kaggle/input/quora-duplicate-questions-copy/train.csv\")\ndf=df.tail(204290)\ndf.dropna(inplace=True)\ndf.drop(columns=['id','qid1','qid2'],inplace=True)\ndf['question1']=df['question1'].apply(preprocess)\ndf['question2']=df['question2'].apply(preprocess)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:55.122843Z","iopub.execute_input":"2024-05-31T16:18:55.123148Z","iopub.status.idle":"2024-05-31T16:19:11.234145Z","shell.execute_reply.started":"2024-05-31T16:18:55.123124Z","shell.execute_reply":"2024-05-31T16:19:11.233103Z"},"trusted":true},"execution_count":64,"outputs":[]},{"cell_type":"code","source":"\ndf['q1_len']=df['question1'].str.len()\ndf['q2_len']=df['question2'].str.len()\ndf['q1_num_words']=df['question1'].apply(lambda row: len(row.split(\" \")))\ndf['q2_num_words']=df['question2'].apply(lambda row: len(row.split(\" \")))\ndef common_words(row):\n w1=set(map(lambda word: word.lower().strip(),row['question1'].split(\" \")))\n w2=set(map(lambda word: word.lower().strip(),row['question2'].split(\" \")))\n return len(w1 & w2)\ndef total_words(row):\n w1=set(map(lambda word: word.lower().strip(),row['question1'].split(\" \")))\n w2=set(map(lambda word: word.lower().strip(),row['question2'].split(\" \")))\n return len(w1) + len(w2)\n\ndf['word_common']=df.apply(common_words,axis=1)\ndf['word_total']=df.apply(total_words,axis=1)\ndf['word_share']=round(df['word_common']/df['word_total'],2)\n\n","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:19:11.235593Z","iopub.execute_input":"2024-05-31T16:19:11.236383Z","iopub.status.idle":"2024-05-31T16:19:20.397449Z","shell.execute_reply.started":"2024-05-31T16:19:11.236343Z","shell.execute_reply":"2024-05-31T16:19:20.396339Z"},"trusted":true},"execution_count":65,"outputs":[]},{"cell_type":"code","source":"token_features = df.apply(fetch_token_features, axis=1)\n\ndf[\"cwc_min\"] = list(map(lambda x: x[0], token_features))\ndf[\"cwc_max\"] = list(map(lambda x: x[1], token_features))\ndf[\"csc_min\"] = list(map(lambda x: x[2], token_features))\ndf[\"csc_max\"] = list(map(lambda x: x[3], token_features))\ndf[\"ctc_min\"] = list(map(lambda x: x[4], token_features))\ndf[\"ctc_max\"] = list(map(lambda x: x[5], token_features))\ndf[\"last_word_eq\"] = list(map(lambda x: x[6], token_features))\ndf[\"first_word_eq\"] = list(map(lambda x: x[7], token_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:19:20.398704Z","iopub.execute_input":"2024-05-31T16:19:20.399012Z","iopub.status.idle":"2024-05-31T16:20:13.569480Z","shell.execute_reply.started":"2024-05-31T16:19:20.398986Z","shell.execute_reply":"2024-05-31T16:20:13.568221Z"},"trusted":true},"execution_count":66,"outputs":[]},{"cell_type":"code","source":"length_features = df.apply(fetch_length_features, axis=1)\n\ndf['abs_len_diff'] = list(map(lambda x: x[0], length_features))\ndf['mean_len'] = list(map(lambda x: x[1], length_features))\ndf['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:20:13.570901Z","iopub.execute_input":"2024-05-31T16:20:13.571264Z","iopub.status.idle":"2024-05-31T16:22:55.402008Z","shell.execute_reply.started":"2024-05-31T16:20:13.571234Z","shell.execute_reply":"2024-05-31T16:22:55.400892Z"},"trusted":true},"execution_count":67,"outputs":[]},{"cell_type":"code","source":"fuzzy_features = df.apply(fetch_fuzzy_features, axis=1)\n\n# Creating new feature columns for fuzzy features\ndf['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))\ndf['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))\ndf['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))\ndf['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:22:55.403684Z","iopub.execute_input":"2024-05-31T16:22:55.403981Z","iopub.status.idle":"2024-05-31T16:32:18.985961Z","shell.execute_reply.started":"2024-05-31T16:22:55.403956Z","shell.execute_reply":"2024-05-31T16:32:18.984852Z"},"trusted":true},"execution_count":68,"outputs":[]},{"cell_type":"code","source":"ndf2=df.drop(columns=['question1','question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:18.987276Z","iopub.execute_input":"2024-05-31T16:32:18.987559Z","iopub.status.idle":"2024-05-31T16:32:19.008186Z","shell.execute_reply.started":"2024-05-31T16:32:18.987536Z","shell.execute_reply":"2024-05-31T16:32:19.007170Z"},"trusted":true},"execution_count":69,"outputs":[]},{"cell_type":"code","source":"questions=list(df['question1'])+list(df['question2'])\nq1_arr,q2_arr=np.vsplit(cv.fit_transform(questions).toarray(),2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:19.009434Z","iopub.execute_input":"2024-05-31T16:32:19.009730Z","iopub.status.idle":"2024-05-31T16:32:28.784256Z","shell.execute_reply.started":"2024-05-31T16:32:19.009705Z","shell.execute_reply":"2024-05-31T16:32:28.783401Z"},"trusted":true},"execution_count":70,"outputs":[]},{"cell_type":"code","source":"tenp_df=\"\"\nndf1=\"\"","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:28.785527Z","iopub.execute_input":"2024-05-31T16:32:28.785898Z","iopub.status.idle":"2024-05-31T16:32:28.794742Z","shell.execute_reply.started":"2024-05-31T16:32:28.785866Z","shell.execute_reply":"2024-05-31T16:32:28.793759Z"},"trusted":true},"execution_count":71,"outputs":[]},{"cell_type":"code","source":"\ntemp_df=pd.concat([pd.DataFrame(q1_arr,index=ndf2.index),pd.DataFrame(q2_arr,index=ndf2.index)],axis=1)\ntemp_df.shape\n","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:28.798176Z","iopub.execute_input":"2024-05-31T16:32:28.798534Z","iopub.status.idle":"2024-05-31T16:32:37.549598Z","shell.execute_reply.started":"2024-05-31T16:32:28.798507Z","shell.execute_reply":"2024-05-31T16:32:37.548558Z"},"trusted":true},"execution_count":72,"outputs":[{"execution_count":72,"output_type":"execute_result","data":{"text/plain":"(204288, 2000)"},"metadata":{}}]},{"cell_type":"code","source":"q1_arr=\"\"\nq2_arr=\"\"","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:37.550761Z","iopub.execute_input":"2024-05-31T16:32:37.551083Z","iopub.status.idle":"2024-05-31T16:32:37.719975Z","shell.execute_reply.started":"2024-05-31T16:32:37.551032Z","shell.execute_reply":"2024-05-31T16:32:37.718778Z"},"trusted":true},"execution_count":73,"outputs":[]},{"cell_type":"code","source":"temp_df=pd.concat([ndf2,temp_df],axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:37.721428Z","iopub.execute_input":"2024-05-31T16:32:37.721758Z","iopub.status.idle":"2024-05-31T16:32:41.807913Z","shell.execute_reply.started":"2024-05-31T16:32:37.721730Z","shell.execute_reply":"2024-05-31T16:32:41.807000Z"},"trusted":true},"execution_count":74,"outputs":[]},{"cell_type":"code","source":"temp_df.columns = temp_df.columns.astype(str)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:41.808947Z","iopub.execute_input":"2024-05-31T16:32:41.809256Z","iopub.status.idle":"2024-05-31T16:32:41.814768Z","shell.execute_reply.started":"2024-05-31T16:32:41.809230Z","shell.execute_reply":"2024-05-31T16:32:41.813818Z"},"trusted":true},"execution_count":75,"outputs":[]},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split\nx_train,x_test,y_train,y_test=train_test_split(temp_df.drop(columns='is_duplicate'),temp_df['is_duplicate'],test_size=0.1,random_state=3)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:41.815878Z","iopub.execute_input":"2024-05-31T16:32:41.816183Z","iopub.status.idle":"2024-05-31T16:32:45.660976Z","shell.execute_reply.started":"2024-05-31T16:32:41.816159Z","shell.execute_reply":"2024-05-31T16:32:45.659752Z"},"trusted":true},"execution_count":76,"outputs":[]},{"cell_type":"code","source":"rf.fit(x_train,y_train)\ny_pred=rf.predict(x_test)\naccuracy_score(y_test,y_pred)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:45.662778Z","iopub.execute_input":"2024-05-31T16:32:45.663346Z","iopub.status.idle":"2024-05-31T16:37:57.160516Z","shell.execute_reply.started":"2024-05-31T16:32:45.663299Z","shell.execute_reply":"2024-05-31T16:37:57.159407Z"},"trusted":true},"execution_count":77,"outputs":[{"execution_count":77,"output_type":"execute_result","data":{"text/plain":"0.8166821675069754"},"metadata":{}}]},{"cell_type":"code","source":"model_pkl_file = \"RF.pkl\" \n\nwith open(model_pkl_file, 'wb') as file: \n pickle.dump(rf, file)\n \nmodel_pkl_file = \"BOW.pkl\" \n\nwith open(model_pkl_file, 'wb') as file: \n pickle.dump(cv, file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:37:57.161922Z","iopub.execute_input":"2024-05-31T16:37:57.162258Z","iopub.status.idle":"2024-05-31T16:37:58.112704Z","shell.execute_reply.started":"2024-05-31T16:37:57.162230Z","shell.execute_reply":"2024-05-31T16:37:58.111599Z"},"trusted":true},"execution_count":78,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]} |