diff --git "a/TF-IDF.ipynb" "b/TF-IDF.ipynb" new file mode 100644--- /dev/null +++ "b/TF-IDF.ipynb" @@ -0,0 +1 @@ +{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":8565891,"sourceType":"datasetVersion","datasetId":5120988}],"dockerImageVersionId":30716,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:18.221392Z","iopub.execute_input":"2024-05-31T15:59:18.221694Z","iopub.status.idle":"2024-05-31T15:59:20.313965Z","shell.execute_reply.started":"2024-05-31T15:59:18.221668Z","shell.execute_reply":"2024-05-31T15:59:20.312942Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"df=pd.read_csv(\"/kaggle/input/quora-duplicate-questions-copy/train.csv\")","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:20.316233Z","iopub.execute_input":"2024-05-31T15:59:20.316770Z","iopub.status.idle":"2024-05-31T15:59:22.476112Z","shell.execute_reply.started":"2024-05-31T15:59:20.316735Z","shell.execute_reply":"2024-05-31T15:59:22.475105Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"df.shape","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.477876Z","iopub.execute_input":"2024-05-31T15:59:22.478273Z","iopub.status.idle":"2024-05-31T15:59:22.487296Z","shell.execute_reply.started":"2024-05-31T15:59:22.478236Z","shell.execute_reply":"2024-05-31T15:59:22.486116Z"},"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"(404290, 6)"},"metadata":{}}]},{"cell_type":"code","source":"df.head()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.488561Z","iopub.execute_input":"2024-05-31T15:59:22.488954Z","iopub.status.idle":"2024-05-31T15:59:22.514564Z","shell.execute_reply.started":"2024-05-31T15:59:22.488922Z","shell.execute_reply":"2024-05-31T15:59:22.513359Z"},"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" id qid1 qid2 question1 \\\n0 0 1 2 What is the step by step guide to invest in sh... \n1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n2 2 5 6 How can I increase the speed of my internet co... \n3 3 7 8 Why am I mentally very lonely? How can I solve... \n4 4 9 10 Which one dissolve in water quikly sugar, salt... \n\n question2 is_duplicate \n0 What is the step by step guide to invest in sh... 0 \n1 What would happen if the Indian government sto... 0 \n2 How can Internet speed be increased by hacking... 0 \n3 Find the remainder when [math]23^{24}[/math] i... 0 \n4 Which fish would survive in salt water? 0 ","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idqid1qid2question1question2is_duplicate
0012What is the step by step guide to invest in sh...What is the step by step guide to invest in sh...0
1134What is the story of Kohinoor (Koh-i-Noor) Dia...What would happen if the Indian government sto...0
2256How can I increase the speed of my internet co...How can Internet speed be increased by hacking...0
3378Why am I mentally very lonely? How can I solve...Find the remainder when [math]23^{24}[/math] i...0
44910Which one dissolve in water quikly sugar, salt...Which fish would survive in salt water?0
\n
"},"metadata":{}}]},{"cell_type":"code","source":"df.isnull().sum()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.517703Z","iopub.execute_input":"2024-05-31T15:59:22.518337Z","iopub.status.idle":"2024-05-31T15:59:22.596705Z","shell.execute_reply.started":"2024-05-31T15:59:22.518297Z","shell.execute_reply":"2024-05-31T15:59:22.595501Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":"id 0\nqid1 0\nqid2 0\nquestion1 1\nquestion2 2\nis_duplicate 0\ndtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"df.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.598173Z","iopub.execute_input":"2024-05-31T15:59:22.598466Z","iopub.status.idle":"2024-05-31T15:59:22.736840Z","shell.execute_reply.started":"2024-05-31T15:59:22.598442Z","shell.execute_reply":"2024-05-31T15:59:22.735647Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"df.duplicated().sum()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.741851Z","iopub.execute_input":"2024-05-31T15:59:22.744101Z","iopub.status.idle":"2024-05-31T15:59:23.191640Z","shell.execute_reply.started":"2024-05-31T15:59:22.744061Z","shell.execute_reply":"2024-05-31T15:59:23.190396Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"0"},"metadata":{}}]},{"cell_type":"code","source":"df=df.head(200000)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.192948Z","iopub.execute_input":"2024-05-31T15:59:23.193273Z","iopub.status.idle":"2024-05-31T15:59:23.198226Z","shell.execute_reply.started":"2024-05-31T15:59:23.193238Z","shell.execute_reply":"2024-05-31T15:59:23.197248Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"def preprocess(q):\n q=str(q).lower().strip()\n \n q=q.replace('%',' percent ')\n q=q.replace('@',' at ')\n q=q.replace('$',' dollar ')\n \n q=q.replace('[math]','')\n \n q=q.replace(',000,000,000 ','b ')\n q=q.replace(',000,000 ','m ')\n q=q.replace(',000 ','k ')\n \n import re\n q=re.sub(r'([0-9]+)000000000',r'\\1b',q)\n q=re.sub(r'([0-9]+)000000',r'\\1m',q)\n q=re.sub(r'([0-9]+)000',r'\\1k',q)\n \n contractions = { \n \"ain't\": \"am not\",\n \"aren't\": \"are not\",\n \"can't\": \"can not\",\n \"can't've\": \"can not have\",\n \"'cause\": \"because\",\n \"could've\": \"could have\",\n \"couldn't\": \"could not\",\n \"couldn't've\": \"could not have\",\n \"didn't\": \"did not\",\n \"doesn't\": \"does not\",\n \"don't\": \"do not\",\n \"hadn't\": \"had not\",\n \"hadn't've\": \"had not have\",\n \"hasn't\": \"has not\",\n \"haven't\": \"have not\",\n \"he'd\": \"he would\",\n \"he'd've\": \"he would have\",\n \"he'll\": \"he will\",\n \"he'll've\": \"he will have\",\n \"he's\": \"he is\",\n \"how'd\": \"how did\",\n \"how'd'y\": \"how do you\",\n \"how'll\": \"how will\",\n \"how's\": \"how is\",\n \"i'd\": \"i would\",\n \"i'd've\": \"i would have\",\n \"i'll\": \"i will\",\n \"i'll've\": \"i will have\",\n \"i'm\": \"i am\",\n \"i've\": \"i have\",\n \"isn't\": \"is not\",\n \"it'd\": \"it would\",\n \"it'd've\": \"it would have\",\n \"it'll\": \"it will\",\n \"it'll've\": \"it will have\",\n \"it's\": \"it is\",\n \"let's\": \"let us\",\n \"ma'am\": \"madam\",\n \"mayn't\": \"may not\",\n \"might've\": \"might have\",\n \"mightn't\": \"might not\",\n \"mightn't've\": \"might not have\",\n \"must've\": \"must have\",\n \"mustn't\": \"must not\",\n \"mustn't've\": \"must not have\",\n \"needn't\": \"need not\",\n \"needn't've\": \"need not have\",\n \"o'clock\": \"of the clock\",\n \"oughtn't\": \"ought not\",\n \"oughtn't've\": \"ought not have\",\n \"shan't\": \"shall not\",\n \"sha'n't\": \"shall not\",\n \"shan't've\": \"shall not have\",\n \"she'd\": \"she would\",\n \"she'd've\": \"she would have\",\n \"she'll\": \"she will\",\n \"she'll've\": \"she will have\",\n \"she's\": \"she is\",\n \"should've\": \"should have\",\n \"shouldn't\": \"should not\",\n \"shouldn't've\": \"should not have\",\n \"so've\": \"so have\",\n \"so's\": \"so as\",\n \"that'd\": \"that would\",\n \"that'd've\": \"that would have\",\n \"that's\": \"that is\",\n \"there'd\": \"there would\",\n \"there'd've\": \"there would have\",\n \"there's\": \"there is\",\n \"they'd\": \"they would\",\n \"they'd've\": \"they would have\",\n \"they'll\": \"they will\",\n \"they'll've\": \"they will have\",\n \"they're\": \"they are\",\n \"they've\": \"they have\",\n \"to've\": \"to have\",\n \"wasn't\": \"was not\",\n \"we'd\": \"we would\",\n \"we'd've\": \"we would have\",\n \"we'll\": \"we will\",\n \"we'll've\": \"we will have\",\n \"we're\": \"we are\",\n \"we've\": \"we have\",\n \"weren't\": \"were not\",\n \"what'll\": \"what will\",\n \"what'll've\": \"what will have\",\n \"what're\": \"what are\",\n \"what's\": \"what is\",\n \"what've\": \"what have\",\n \"when's\": \"when is\",\n \"when've\": \"when have\",\n \"where'd\": \"where did\",\n \"where's\": \"where is\",\n \"where've\": \"where have\",\n \"who'll\": \"who will\",\n \"who'll've\": \"who will have\",\n \"who's\": \"who is\",\n \"who've\": \"who have\",\n \"why's\": \"why is\",\n \"why've\": \"why have\",\n \"will've\": \"will have\",\n \"won't\": \"will not\",\n \"won't've\": \"will not have\",\n \"would've\": \"would have\",\n \"wouldn't\": \"would not\",\n \"wouldn't've\": \"would not have\",\n \"y'all\": \"you all\",\n \"y'all'd\": \"you all would\",\n \"y'all'd've\": \"you all would have\",\n \"y'all're\": \"you all are\",\n \"y'all've\": \"you all have\",\n \"you'd\": \"you would\",\n \"you'd've\": \"you would have\",\n \"you'll\": \"you will\",\n \"you'll've\": \"you will have\",\n \"you're\": \"you are\",\n \"you've\": \"you have\"\n }\n\n q_decontracted = []\n\n for word in q.split():\n if word in contractions:\n word = contractions[word]\n\n q_decontracted.append(word)\n\n q = ' '.join(q_decontracted)\n q = q.replace(\"'ve\", \" have\")\n q = q.replace(\"n't\", \" not\")\n q = q.replace(\"'re\", \" are\")\n q = q.replace(\"'ll\", \" will\")\n \n q=re.sub(re.compile('<.*?>'),'',q)\n \n import string\n q=q.translate(str.maketrans('', '', string.punctuation))\n \n return q","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.199690Z","iopub.execute_input":"2024-05-31T15:59:23.199977Z","iopub.status.idle":"2024-05-31T15:59:23.217455Z","shell.execute_reply.started":"2024-05-31T15:59:23.199952Z","shell.execute_reply":"2024-05-31T15:59:23.216322Z"},"trusted":true},"execution_count":9,"outputs":[]},{"cell_type":"code","source":"df['is_duplicate'].value_counts().plot(kind='bar')","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.218752Z","iopub.execute_input":"2024-05-31T15:59:23.219131Z","iopub.status.idle":"2024-05-31T15:59:23.586693Z","shell.execute_reply.started":"2024-05-31T15:59:23.219097Z","shell.execute_reply":"2024-05-31T15:59:23.585550Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":""},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"
","image/png":"iVBORw0KGgoAAAANSUhEUgAAAkIAAAGrCAYAAAAsBPjXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAtHklEQVR4nO3de1TVdb7/8ReIXFLZeAlwn0iZclSOJgWJWFmOjLvROoeyM1KUVIyeOtCo5LUM7WpDYyWlMjYVzUlX5pmJMXRIBkdplFBR8jJizqRh42ysUdhJIyJ8f3+0+P7cal5qI8rn+VhrryXf73t/v5/Nmj0+25evfpZlWQIAADCQf1svAAAAoK0QQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwVkBbL+Bi1tzcrAMHDqhLly7y8/Nr6+UAAIBzYFmWvvrqKzmdTvn7n/k1H0LoDA4cOKCoqKi2XgYAAPgO9u/fryuuuOKMM4TQGXTp0kXSN7/I0NDQNl4NAAA4Fx6PR1FRUfbf42dCCJ1By9thoaGhhBAAAJeYc/lYCx+WBgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgrIC2XgAuTr1nrGzrJeAC2vf86LZeAgC0CV4RAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYKzzDqHS0lLdfvvtcjqd8vPzU0FBgb2vsbFR06dP18CBA9WpUyc5nU6NGzdOBw4c8DrGoUOHlJqaqtDQUIWFhSk9PV1Hjhzxmtm2bZtuuukmBQcHKyoqSjk5OaesZfny5erXr5+Cg4M1cOBArVq1ymu/ZVnKzs5Wz549FRISoqSkJO3Zs+d8HzIAAGinzjuE6uvrNWjQIC1YsOCUfV9//bW2bNmiJ554Qlu2bNHvfvc77d69W//xH//hNZeamqqdO3equLhYhYWFKi0t1YQJE+z9Ho9HI0eOVK9evVRRUaEXXnhBc+bM0eLFi+2ZDRs26O6771Z6erq2bt2q5ORkJScna8eOHfZMTk6OcnNzlZeXp/LycnXq1Ekul0tHjx4934cNAADaIT/LsqzvfGc/P7333ntKTk7+1plNmzZp8ODB+uyzz3TllVdq165diomJ0aZNmxQfHy9JKioq0qhRo/T555/L6XRq0aJFevzxx+V2uxUYGChJmjFjhgoKClRVVSVJGjt2rOrr61VYWGifa8iQIYqNjVVeXp4sy5LT6dSjjz6qKVOmSJLq6uoUERGh/Px8paSknLLWhoYGNTQ02D97PB5FRUWprq5OoaGh3/XXdEniX583C//6PID2xOPxyOFwnNPf363+GaG6ujr5+fkpLCxMklRWVqawsDA7giQpKSlJ/v7+Ki8vt2eGDRtmR5AkuVwu7d69W4cPH7ZnkpKSvM7lcrlUVlYmSdq7d6/cbrfXjMPhUEJCgj1zsrlz58rhcNi3qKio7/8LAAAAF61WDaGjR49q+vTpuvvuu+0ic7vdCg8P95oLCAhQt27d5Ha77ZmIiAivmZafzzZz4v4T73e6mZPNnDlTdXV19m3//v3n/ZgBAMClI6C1DtzY2Kif/vSnsixLixYtaq3T+FRQUJCCgoLaehkAAOACaZVXhFoi6LPPPlNxcbHX+3ORkZE6ePCg1/zx48d16NAhRUZG2jM1NTVeMy0/n23mxP0n3u90MwAAwGw+D6GWCNqzZ4/++Mc/qnv37l77ExMTVVtbq4qKCnvbmjVr1NzcrISEBHumtLRUjY2N9kxxcbH69u2rrl272jMlJSVexy4uLlZiYqIkKTo6WpGRkV4zHo9H5eXl9gwAADDbeYfQkSNHVFlZqcrKSknffCi5srJS1dXVamxs1F133aXNmzdryZIlampqktvtltvt1rFjxyRJ/fv316233qrx48dr48aNWr9+vTIzM5WSkiKn0ylJuueeexQYGKj09HTt3LlTy5Yt0/z585WVlWWvY+LEiSoqKtK8efNUVVWlOXPmaPPmzcrMzJT0zTfaJk2apGeeeUYrVqzQ9u3bNW7cODmdzjN+yw0AAJjjvL8+v3btWg0fPvyU7WlpaZozZ46io6NPe78//elPuuWWWyR9c0HFzMxMvf/++/L399eYMWOUm5urzp072/Pbtm1TRkaGNm3apB49euiRRx7R9OnTvY65fPlyzZo1S/v27VOfPn2Uk5OjUaNG2fsty9Ls2bO1ePFi1dbW6sYbb9TChQv1wx/+8Jwe6/l8/a694evzZuHr8wDak/P5+/t7XUeovSOEYApCCEB7clFdRwgAAOBiRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjHXeIVRaWqrbb79dTqdTfn5+Kigo8NpvWZays7PVs2dPhYSEKCkpSXv27PGaOXTokFJTUxUaGqqwsDClp6fryJEjXjPbtm3TTTfdpODgYEVFRSknJ+eUtSxfvlz9+vVTcHCwBg4cqFWrVp33WgAAgLnOO4Tq6+s1aNAgLViw4LT7c3JylJubq7y8PJWXl6tTp05yuVw6evSoPZOamqqdO3equLhYhYWFKi0t1YQJE+z9Ho9HI0eOVK9evVRRUaEXXnhBc+bM0eLFi+2ZDRs26O6771Z6erq2bt2q5ORkJScna8eOHee1FgAAYC4/y7Ks73xnPz+99957Sk5OlvTNKzBOp1OPPvqopkyZIkmqq6tTRESE8vPzlZKSol27dikmJkabNm1SfHy8JKmoqEijRo3S559/LqfTqUWLFunxxx+X2+1WYGCgJGnGjBkqKChQVVWVJGns2LGqr69XYWGhvZ4hQ4YoNjZWeXl557SWkzU0NKihocH+2ePxKCoqSnV1dQoNDf2uv6ZLUu8ZK9t6CbiA9j0/uq2XAAA+4/F45HA4zunvb59+Rmjv3r1yu91KSkqytzkcDiUkJKisrEySVFZWprCwMDuCJCkpKUn+/v4qLy+3Z4YNG2ZHkCS5XC7t3r1bhw8ftmdOPE/LTMt5zmUtJ5s7d64cDod9i4qK+j6/DgAAcJHzaQi53W5JUkREhNf2iIgIe5/b7VZ4eLjX/oCAAHXr1s1r5nTHOPEc3zZz4v6zreVkM2fOVF1dnX3bv3//OTxqAABwqQpo6wVcTIKCghQUFNTWywAAABeIT18RioyMlCTV1NR4ba+pqbH3RUZG6uDBg177jx8/rkOHDnnNnO4YJ57j22ZO3H+2tQAAALP5NISio6MVGRmpkpISe5vH41F5ebkSExMlSYmJiaqtrVVFRYU9s2bNGjU3NyshIcGeKS0tVWNjoz1TXFysvn37qmvXrvbMiedpmWk5z7msBQAAmO28Q+jIkSOqrKxUZWWlpG8+lFxZWanq6mr5+flp0qRJeuaZZ7RixQpt375d48aNk9PptL9Z1r9/f916660aP368Nm7cqPXr1yszM1MpKSlyOp2SpHvuuUeBgYFKT0/Xzp07tWzZMs2fP19ZWVn2OiZOnKiioiLNmzdPVVVVmjNnjjZv3qzMzExJOqe1AAAAs533Z4Q2b96s4cOH2z+3xElaWpry8/M1bdo01dfXa8KECaqtrdWNN96ooqIiBQcH2/dZsmSJMjMzNWLECPn7+2vMmDHKzc219zscDq1evVoZGRmKi4tTjx49lJ2d7XWtoaFDh2rp0qWaNWuWHnvsMfXp00cFBQUaMGCAPXMuawEAAOb6XtcRau/O5zoE7Q3XETIL1xEC0J602XWEAAAALiWEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYPg+hpqYmPfHEE4qOjlZISIiuuuoqPf3007Isy56xLEvZ2dnq2bOnQkJClJSUpD179ngd59ChQ0pNTVVoaKjCwsKUnp6uI0eOeM1s27ZNN910k4KDgxUVFaWcnJxT1rN8+XL169dPwcHBGjhwoFatWuXrhwwAAC5RPg+hX/ziF1q0aJFeffVV7dq1S7/4xS+Uk5OjV155xZ7JyclRbm6u8vLyVF5erk6dOsnlcuno0aP2TGpqqnbu3Kni4mIVFhaqtLRUEyZMsPd7PB6NHDlSvXr1UkVFhV544QXNmTNHixcvtmc2bNigu+++W+np6dq6dauSk5OVnJysHTt2+PphAwCAS5CfdeJLNT5w2223KSIiQq+//rq9bcyYMQoJCdHbb78ty7LkdDr16KOPasqUKZKkuro6RUREKD8/XykpKdq1a5diYmK0adMmxcfHS5KKioo0atQoff7553I6nVq0aJEef/xxud1uBQYGSpJmzJihgoICVVVVSZLGjh2r+vp6FRYW2msZMmSIYmNjlZeXd8raGxoa1NDQYP/s8XgUFRWluro6hYaG+vLXdNHrPWNlWy8BF9C+50e39RIAwGc8Ho8cDsc5/f3t81eEhg4dqpKSEn3yySeSpI8//lh//vOf9ZOf/ESStHfvXrndbiUlJdn3cTgcSkhIUFlZmSSprKxMYWFhdgRJUlJSkvz9/VVeXm7PDBs2zI4gSXK5XNq9e7cOHz5sz5x4npaZlvOcbO7cuXI4HPYtKirq+/46AADARSzA1wecMWOGPB6P+vXrpw4dOqipqUnPPvusUlNTJUlut1uSFBER4XW/iIgIe5/b7VZ4eLj3QgMC1K1bN6+Z6OjoU47Rsq9r165yu91nPM/JZs6cqaysLPvnlleEAABA++TzEHr33Xe1ZMkSLV26VP/+7/+uyspKTZo0SU6nU2lpab4+nU8FBQUpKCiorZcBAAAuEJ+H0NSpUzVjxgylpKRIkgYOHKjPPvtMc+fOVVpamiIjIyVJNTU16tmzp32/mpoaxcbGSpIiIyN18OBBr+MeP35chw4dsu8fGRmpmpoar5mWn88207IfAACYzeefEfr666/l7+992A4dOqi5uVmSFB0drcjISJWUlNj7PR6PysvLlZiYKElKTExUbW2tKioq7Jk1a9aoublZCQkJ9kxpaakaGxvtmeLiYvXt21ddu3a1Z048T8tMy3kAAIDZfB5Ct99+u5599lmtXLlS+/bt03vvvacXX3xRd9xxhyTJz89PkyZN0jPPPKMVK1Zo+/btGjdunJxOp5KTkyVJ/fv316233qrx48dr48aNWr9+vTIzM5WSkiKn0ylJuueeexQYGKj09HTt3LlTy5Yt0/z5870+4zNx4kQVFRVp3rx5qqqq0pw5c7R582ZlZmb6+mEDAIBLkM/fGnvllVf0xBNP6H/+53908OBBOZ1O/fd//7eys7PtmWnTpqm+vl4TJkxQbW2tbrzxRhUVFSk4ONieWbJkiTIzMzVixAj5+/trzJgxys3Ntfc7HA6tXr1aGRkZiouLU48ePZSdne11raGhQ4dq6dKlmjVrlh577DH16dNHBQUFGjBggK8fNgAAuAT5/DpC7cn5XIegveE6QmbhOkIA2pM2vY4QAADApYIQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsn19ZGgBwceOCqWbhgqlnxitCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAY7VKCP3973/Xvffeq+7duyskJEQDBw7U5s2b7f2WZSk7O1s9e/ZUSEiIkpKStGfPHq9jHDp0SKmpqQoNDVVYWJjS09N15MgRr5lt27bppptuUnBwsKKiopSTk3PKWpYvX65+/fopODhYAwcO1KpVq1rjIQMAgEuQz0Po8OHDuuGGG9SxY0f94Q9/0F/+8hfNmzdPXbt2tWdycnKUm5urvLw8lZeXq1OnTnK5XDp69Kg9k5qaqp07d6q4uFiFhYUqLS3VhAkT7P0ej0cjR45Ur169VFFRoRdeeEFz5szR4sWL7ZkNGzbo7rvvVnp6urZu3ark5GQlJydrx44dvn7YAADgEuRnWZblywPOmDFD69ev14cffnja/ZZlyel06tFHH9WUKVMkSXV1dYqIiFB+fr5SUlK0a9cuxcTEaNOmTYqPj5ckFRUVadSoUfr888/ldDq1aNEiPf7443K73QoMDLTPXVBQoKqqKknS2LFjVV9fr8LCQvv8Q4YMUWxsrPLy8k5ZW0NDgxoaGuyfPR6PoqKiVFdXp9DQUN/8gi4RvWesbOsl4ALa9/zotl4CLiCe32Yx8fnt8XjkcDjO6e9vn78itGLFCsXHx+u//uu/FB4ermuvvVavvfaavX/v3r1yu91KSkqytzkcDiUkJKisrEySVFZWprCwMDuCJCkpKUn+/v4qLy+3Z4YNG2ZHkCS5XC7t3r1bhw8ftmdOPE/LTMt5TjZ37lw5HA77FhUV9T1/GwAA4GLm8xD69NNPtWjRIvXp00cffPCBHn74Yf385z/XW2+9JUlyu92SpIiICK/7RURE2PvcbrfCw8O99gcEBKhbt25eM6c7xonn+LaZlv0nmzlzpurq6uzb/v37z/vxAwCAS0eArw/Y3Nys+Ph4Pffcc5Kka6+9Vjt27FBeXp7S0tJ8fTqfCgoKUlBQUFsvAwAAXCA+f0WoZ8+eiomJ8drWv39/VVdXS5IiIyMlSTU1NV4zNTU19r7IyEgdPHjQa//x48d16NAhr5nTHePEc3zbTMt+AABgNp+H0A033KDdu3d7bfvkk0/Uq1cvSVJ0dLQiIyNVUlJi7/d4PCovL1diYqIkKTExUbW1taqoqLBn1qxZo+bmZiUkJNgzpaWlamxstGeKi4vVt29f+xtqiYmJXudpmWk5DwAAMJvPQ2jy5Mn66KOP9Nxzz+mvf/2rli5dqsWLFysjI0OS5Ofnp0mTJumZZ57RihUrtH37do0bN05Op1PJycmSvnkF6dZbb9X48eO1ceNGrV+/XpmZmUpJSZHT6ZQk3XPPPQoMDFR6erp27typZcuWaf78+crKyrLXMnHiRBUVFWnevHmqqqrSnDlztHnzZmVmZvr6YQMAgEuQzz8jdP311+u9997TzJkz9dRTTyk6Olovv/yyUlNT7Zlp06apvr5eEyZMUG1trW688UYVFRUpODjYnlmyZIkyMzM1YsQI+fv7a8yYMcrNzbX3OxwOrV69WhkZGYqLi1OPHj2UnZ3tda2hoUOHaunSpZo1a5Yee+wx9enTRwUFBRowYICvHzYAALgE+fw6Qu3J+VyHoL3hOiNmMfE6Iybj+W0WE5/fbXodIQAAgEsFIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwVquH0PPPPy8/Pz9NmjTJ3nb06FFlZGSoe/fu6ty5s8aMGaOamhqv+1VXV2v06NG67LLLFB4erqlTp+r48eNeM2vXrtV1112noKAgXX311crPzz/l/AsWLFDv3r0VHByshIQEbdy4sTUeJgAAuAS1aght2rRJv/rVr3TNNdd4bZ88ebLef/99LV++XOvWrdOBAwd055132vubmpo0evRoHTt2TBs2bNBbb72l/Px8ZWdn2zN79+7V6NGjNXz4cFVWVmrSpEn62c9+pg8++MCeWbZsmbKysjR79mxt2bJFgwYNksvl0sGDB1vzYQMAgEtEq4XQkSNHlJqaqtdee01du3a1t9fV1en111/Xiy++qB/96EeKi4vTm2++qQ0bNuijjz6SJK1evVp/+ctf9Pbbbys2NlY/+clP9PTTT2vBggU6duyYJCkvL0/R0dGaN2+e+vfvr8zMTN1111166aWX7HO9+OKLGj9+vB544AHFxMQoLy9Pl112md54443WetgAAOAS0mohlJGRodGjRyspKclre0VFhRobG7229+vXT1deeaXKysokSWVlZRo4cKAiIiLsGZfLJY/Ho507d9ozJx/b5XLZxzh27JgqKiq8Zvz9/ZWUlGTPnKyhoUEej8frBgAA2q+A1jjoO++8oy1btmjTpk2n7HO73QoMDFRYWJjX9oiICLndbnvmxAhq2d+y70wzHo9H//rXv3T48GE1NTWddqaqquq06547d66efPLJc3+gAADgkubzV4T279+viRMnasmSJQoODvb14VvVzJkzVVdXZ9/279/f1ksCAACtyOchVFFRoYMHD+q6665TQECAAgICtG7dOuXm5iogIEARERE6duyYamtrve5XU1OjyMhISVJkZOQp3yJr+flsM6GhoQoJCVGPHj3UoUOH0860HONkQUFBCg0N9boBAID2y+chNGLECG3fvl2VlZX2LT4+XqmpqfafO3bsqJKSEvs+u3fvVnV1tRITEyVJiYmJ2r59u9e3u4qLixUaGqqYmBh75sRjtMy0HCMwMFBxcXFeM83NzSopKbFnAACA2Xz+GaEuXbpowIABXts6deqk7t2729vT09OVlZWlbt26KTQ0VI888ogSExM1ZMgQSdLIkSMVExOj++67Tzk5OXK73Zo1a5YyMjIUFBQkSXrooYf06quvatq0aXrwwQe1Zs0avfvuu1q5cqV93qysLKWlpSk+Pl6DBw/Wyy+/rPr6ej3wwAO+ftgAAOAS1Coflj6bl156Sf7+/hozZowaGhrkcrm0cOFCe3+HDh1UWFiohx9+WImJierUqZPS0tL01FNP2TPR0dFauXKlJk+erPnz5+uKK67Qr3/9a7lcLntm7Nix+uKLL5SdnS23263Y2FgVFRWd8gFqAABgJj/Lsqy2XsTFyuPxyOFwqK6uzrjPC/WesfLsQ2g39j0/uq2XgAuI57dZTHx+n8/f3/xbYwAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYPg+huXPn6vrrr1eXLl0UHh6u5ORk7d6922vm6NGjysjIUPfu3dW5c2eNGTNGNTU1XjPV1dUaPXq0LrvsMoWHh2vq1Kk6fvy418zatWt13XXXKSgoSFdffbXy8/NPWc+CBQvUu3dvBQcHKyEhQRs3bvT1QwYAAJcon4fQunXrlJGRoY8++kjFxcVqbGzUyJEjVV9fb89MnjxZ77//vpYvX65169bpwIEDuvPOO+39TU1NGj16tI4dO6YNGzborbfeUn5+vrKzs+2ZvXv3avTo0Ro+fLgqKys1adIk/exnP9MHH3xgzyxbtkxZWVmaPXu2tmzZokGDBsnlcungwYO+ftgAAOAS5GdZltWaJ/jiiy8UHh6udevWadiwYaqrq9Pll1+upUuX6q677pIkVVVVqX///iorK9OQIUP0hz/8QbfddpsOHDigiIgISVJeXp6mT5+uL774QoGBgZo+fbpWrlypHTt22OdKSUlRbW2tioqKJEkJCQm6/vrr9eqrr0qSmpubFRUVpUceeUQzZsw469o9Ho8cDofq6uoUGhrq61/NRa33jJVtvQRcQPueH93WS8AFxPPbLCY+v8/n7+9W/4xQXV2dJKlbt26SpIqKCjU2NiopKcme6devn6688kqVlZVJksrKyjRw4EA7giTJ5XLJ4/Fo586d9syJx2iZaTnGsWPHVFFR4TXj7++vpKQke+ZkDQ0N8ng8XjcAANB+tWoINTc3a9KkSbrhhhs0YMAASZLb7VZgYKDCwsK8ZiMiIuR2u+2ZEyOoZX/LvjPNeDwe/etf/9KXX36ppqam0860HONkc+fOlcPhsG9RUVHf7YEDAIBLQquGUEZGhnbs2KF33nmnNU/jMzNnzlRdXZ19279/f1svCQAAtKKA1jpwZmamCgsLVVpaqiuuuMLeHhkZqWPHjqm2ttbrVaGamhpFRkbaMyd/u6vlW2Unzpz8TbOamhqFhoYqJCREHTp0UIcOHU4703KMkwUFBSkoKOi7PWAAAHDJ8fkrQpZlKTMzU++9957WrFmj6Ohor/1xcXHq2LGjSkpK7G27d+9WdXW1EhMTJUmJiYnavn2717e7iouLFRoaqpiYGHvmxGO0zLQcIzAwUHFxcV4zzc3NKikpsWcAAIDZfP6KUEZGhpYuXarf//736tKli/15HIfDoZCQEDkcDqWnpysrK0vdunVTaGioHnnkESUmJmrIkCGSpJEjRyomJkb33XefcnJy5Ha7NWvWLGVkZNiv2Dz00EN69dVXNW3aND344INas2aN3n33Xa1c+f+/DZGVlaW0tDTFx8dr8ODBevnll1VfX68HHnjA1w8bAABcgnweQosWLZIk3XLLLV7b33zzTd1///2SpJdeekn+/v4aM2aMGhoa5HK5tHDhQnu2Q4cOKiws1MMPP6zExER16tRJaWlpeuqpp+yZ6OhorVy5UpMnT9b8+fN1xRVX6Ne//rVcLpc9M3bsWH3xxRfKzs6W2+1WbGysioqKTvkANQAAMFOrX0foUsZ1hGAKE68zYjKe32Yx8fl9UV1HCAAA4GJFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMZUQILViwQL1791ZwcLASEhK0cePGtl4SAAC4CLT7EFq2bJmysrI0e/ZsbdmyRYMGDZLL5dLBgwfbemkAAKCNtfsQevHFFzV+/Hg98MADiomJUV5eni677DK98cYbbb00AADQxgLaegGt6dixY6qoqNDMmTPtbf7+/kpKSlJZWdkp8w0NDWpoaLB/rqurkyR5PJ7WX+xFprnh67ZeAi4gE/83bjKe32Yx8fnd8pgtyzrrbLsOoS+//FJNTU2KiIjw2h4REaGqqqpT5ufOnasnn3zylO1RUVGttkbgYuB4ua1XAKC1mPz8/uqrr+RwOM44065D6HzNnDlTWVlZ9s/Nzc06dOiQunfvLj8/vzZcGS4Ej8ejqKgo7d+/X6GhoW29HAA+xPPbLJZl6auvvpLT6TzrbLsOoR49eqhDhw6qqanx2l5TU6PIyMhT5oOCghQUFOS1LSwsrDWXiItQaGgo/0cJtFM8v81xtleCWrTrD0sHBgYqLi5OJSUl9rbm5maVlJQoMTGxDVcGAAAuBu36FSFJysrKUlpamuLj4zV48GC9/PLLqq+v1wMPPNDWSwMAAG2s3YfQ2LFj9cUXXyg7O1tut1uxsbEqKio65QPUQFBQkGbPnn3K26MALn08v/Ft/Kxz+W4ZAABAO9SuPyMEAABwJoQQAAAwFiEEAACMRQgBAABjEUIAAMBY7f7r88C3+fLLL/XGG2+orKxMbrdbkhQZGamhQ4fq/vvv1+WXX97GKwQAtDZeEYKRNm3apB/+8IfKzc2Vw+HQsGHDNGzYMDkcDuXm5qpfv37avHlzWy8TQCvZv3+/HnzwwbZeBi4CXEcIRhoyZIgGDRqkvLy8U/5BXcuy9NBDD2nbtm0qKytroxUCaE0ff/yxrrvuOjU1NbX1UtDGeGsMRvr444+Vn59/SgRJkp+fnyZPnqxrr722DVYGwBdWrFhxxv2ffvrpBVoJLnaEEIwUGRmpjRs3ql+/fqfdv3HjRv4ZFuASlpycLD8/P53pTY/T/YcQzEMIwUhTpkzRhAkTVFFRoREjRtjRU1NTo5KSEr322mv65S9/2carBPBd9ezZUwsXLtR//ud/nnZ/ZWWl4uLiLvCqcDEihGCkjIwM9ejRQy+99JIWLlxof06gQ4cOiouLU35+vn7605+28SoBfFdxcXGqqKj41hA626tFMAcflobxGhsb9eWXX0qSevTooY4dO7bxigB8Xx9++KHq6+t16623nnZ/fX29Nm/erJtvvvkCrwwXG0IIAAAYi+sIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBaDW33HKLJk2adEkcd+3atfLz81Ntba0kKT8/X2FhYT49B4CLD9cRAtBqfve7312ylyMYO3asRo0a5bPjrV27VsOHD9fhw4cJLOAiQggBaDXdunVr6yV8ZyEhIQoJCWnrZQBoZbw1BqDVnPgW1sKFC9WnTx8FBwcrIiJCd9111zkdo76+XuPGjVPnzp3Vs2dPzZs375QZPz8/FRQUeG0LCwtTfn6+JGnfvn3y8/PTO++8o6FDhyo4OFgDBgzQunXrvvW8p3tr7P3339f111+v4OBg9ejRQ3fccYe973//938VHx+vLl26KDIyUvfcc48OHjxon3/48OGSpK5du8rPz0/333+/JKm5uVlz585VdHS0QkJCNGjQIP3f//3fOf1uAHx/hBCAVrd582b9/Oc/11NPPaXdu3erqKhIw4YNO6f7Tp06VevWrdPvf/97rV69WmvXrtWWLVu+0zqmTp2qRx99VFu3blViYqJuv/12/fOf/zyn+65cuVJ33HGHRo0apa1bt6qkpESDBw+29zc2Nurpp5/Wxx9/rIKCAu3bt8+OnaioKP32t7+VJO3evVv/+Mc/NH/+fEnS3Llz9Zvf/EZ5eXnauXOnJk+erHvvvfeMkQbAd3hrDECrq66uVqdOnXTbbbepS5cu6tWrl6699tqz3u/IkSN6/fXX9fbbb2vEiBGSpLfeektXXHHFd1pHZmamxowZI0latGiRioqK9Prrr2vatGlnve+zzz6rlJQUPfnkk/a2QYMG2X9+8MEH7T//4Ac/UG5urq6//nodOXJEnTt3tt8mDA8Pt19pamho0HPPPac//vGPSkxMtO/75z//Wb/61a/45x+AC4BXhAC0uh//+Mfq1auXfvCDH+i+++7TkiVL9PXXX5/1fn/729907NgxJSQk2Nu6deumvn37fqd1tMSGJAUEBCg+Pl67du06p/tWVlbaMXY6FRUVuv3223XllVeqS5cudsRUV1d/633++te/6uuvv9aPf/xjde7c2b795je/0d/+9rdzfFQAvg9eEQLQ6rp06aItW7Zo7dq1Wr16tbKzszVnzhxt2rTJJ9+gOt2/JN7Y2Pi9j3uiM31wur6+Xi6XSy6XS0uWLNHll1+u6upquVwuHTt27Fvvd+TIEUnfvO32b//2b177goKCfLNwAGfEK0IALoiAgAAlJSUpJydH27Zt0759+7RmzZoz3ueqq65Sx44dVV5ebm87fPiwPvnkE6+5yy+/XP/4xz/sn/fs2XPaV5w++ugj+8/Hjx9XRUWF+vfvf07rv+aaa1RSUnLafVVVVfrnP/+p559/XjfddJP69etnf1C6RWBgoCSpqanJ3hYTE6OgoCBVV1fr6quv9rpFRUWd07oAfD+8IgSg1RUWFurTTz/VsGHD1LVrV61atUrNzc1nfYurc+fOSk9P19SpU9W9e3eFh4fr8ccfl7+/93/D/ehHP9Krr76qxMRENTU1afr06ae9ftGCBQvUp08f9e/fXy+99JIOHz7s9dmeM5k9e7ZGjBihq666SikpKTp+/LhWrVql6dOn68orr1RgYKBeeeUVPfTQQ9qxY4eefvppr/v36tVLfn5+Kiws1KhRoxQSEqIuXbpoypQpmjx5spqbm3XjjTeqrq5O69evV2hoqNLS0s5pbQC+BwsAWsnNN99sTZw40frwww+tm2++2eratasVEhJiXXPNNdayZcvO6RhfffWVde+991qXXXaZFRERYeXk5NjHbfH3v//dGjlypNWpUyerT58+1qpVqyyHw2G9+eablmVZ1t69ey1J1tKlS63BgwdbgYGBVkxMjLVmzRr7GH/6058sSdbhw4cty7KsN99803I4HF5r+e1vf2vFxsZagYGBVo8ePaw777zT3rd06VKrd+/eVlBQkJWYmGitWLHCkmRt3brVnnnqqaesyMhIy8/Pz0pLS7Msy7Kam5utl19+2erbt6/VsWNH6/LLL7dcLpe1bt26c/49A/ju/CzrpDfWAaCd2bdvn6Kjo7V161bFxsa29XIAXET4jBAAADAWIQSgzVRXV3t9bfzk25m+eg4AvsBbYwDazPHjx7Vv375v3d+7d28FBPCdDgCthxACAADG4q0xAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMb6fwthdCyLFVegAAAAAElFTkSuQmCC"},"metadata":{}}]},{"cell_type":"code","source":"qid=pd.Series(df['qid1'].tolist()+df['qid2'].tolist())","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.588197Z","iopub.execute_input":"2024-05-31T15:59:23.588543Z","iopub.status.idle":"2024-05-31T15:59:23.745422Z","shell.execute_reply.started":"2024-05-31T15:59:23.588515Z","shell.execute_reply":"2024-05-31T15:59:23.743914Z"},"trusted":true},"execution_count":11,"outputs":[]},{"cell_type":"code","source":"np.unique(qid).shape[0]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.747034Z","iopub.execute_input":"2024-05-31T15:59:23.747847Z","iopub.status.idle":"2024-05-31T15:59:23.793227Z","shell.execute_reply.started":"2024-05-31T15:59:23.747809Z","shell.execute_reply":"2024-05-31T15:59:23.791954Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"301629"},"metadata":{}}]},{"cell_type":"code","source":"df['question1']=df['question1'].apply(preprocess)\ndf['question2']=df['question2'].apply(preprocess)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.794871Z","iopub.execute_input":"2024-05-31T15:59:23.795807Z","iopub.status.idle":"2024-05-31T15:59:38.168654Z","shell.execute_reply.started":"2024-05-31T15:59:23.795769Z","shell.execute_reply":"2024-05-31T15:59:38.167619Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"code","source":"qid.shape[0]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.174320Z","iopub.execute_input":"2024-05-31T15:59:38.174626Z","iopub.status.idle":"2024-05-31T15:59:38.181066Z","shell.execute_reply.started":"2024-05-31T15:59:38.174601Z","shell.execute_reply":"2024-05-31T15:59:38.180099Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"400000"},"metadata":{}}]},{"cell_type":"code","source":"x=qid.value_counts()>1","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.182483Z","iopub.execute_input":"2024-05-31T15:59:38.182810Z","iopub.status.idle":"2024-05-31T15:59:38.222431Z","shell.execute_reply.started":"2024-05-31T15:59:38.182777Z","shell.execute_reply":"2024-05-31T15:59:38.221528Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"x[x]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.223680Z","iopub.execute_input":"2024-05-31T15:59:38.223986Z","iopub.status.idle":"2024-05-31T15:59:38.232481Z","shell.execute_reply.started":"2024-05-31T15:59:38.223961Z","shell.execute_reply":"2024-05-31T15:59:38.231296Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"2559 True\n4044 True\n30782 True\n17978 True\n2561 True\n ... \n41258 True\n64963 True\n22576 True\n141425 True\n47459 True\nName: count, Length: 47906, dtype: bool"},"metadata":{}}]},{"cell_type":"code","source":"x[x].shape[0]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.233700Z","iopub.execute_input":"2024-05-31T15:59:38.234161Z","iopub.status.idle":"2024-05-31T15:59:38.242383Z","shell.execute_reply.started":"2024-05-31T15:59:38.234134Z","shell.execute_reply":"2024-05-31T15:59:38.241358Z"},"trusted":true},"execution_count":17,"outputs":[{"execution_count":17,"output_type":"execute_result","data":{"text/plain":"47906"},"metadata":{}}]},{"cell_type":"code","source":"plt.hist(qid.value_counts().values,bins=100)\nplt.yscale('log')\nplt.show()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.243819Z","iopub.execute_input":"2024-05-31T15:59:38.244313Z","iopub.status.idle":"2024-05-31T15:59:38.987011Z","shell.execute_reply.started":"2024-05-31T15:59:38.244277Z","shell.execute_reply":"2024-05-31T15:59:38.986059Z"},"trusted":true},"execution_count":18,"outputs":[{"output_type":"display_data","data":{"text/plain":"
","image/png":"iVBORw0KGgoAAAANSUhEUgAAAicAAAGdCAYAAADJ6dNTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAikUlEQVR4nO3de3BU5f3H8c8mmEUUwiVlQ4AQbb00Ihubm6k6Bc00ExlUqA5/2BphSke7abFbW8MfhTqjhmkrk9o5Y6oWcUanpHTGeKGiGJFUi+aC8dJUCm3UFMwGxpIlURO7e35/OC6/JYBsssk+J+f9mtkZzyXP+eaZNXzmnOd5jse2bVsAAACGSEt1AQAAAP8f4QQAABiFcAIAAIxCOAEAAEYhnAAAAKMQTgAAgFEIJwAAwCiEEwAAYJRJqS4gUdFoVIcOHdLUqVPl8XhSXQ4AADgDtm3r2LFjysnJUVra6e+NOC6cHDp0SPPnz091GQAAYAS6u7s1b968057juHAydepUSZ//ctOmTUtxNQAA4EyEw2HNnz8/9u/46TgunHzxKGfatGmEEwAAHOZMhmQwIBYAABjFMeHEsizl5+eruLg41aUAAIAx5LFt2051EYkIh8PKzMxUX18fj3UAAHCIRP79dsydEwAA4A6EEwAAYBTCCQAAMArhBAAAGIVwAgAAjEI4AQAARnFMOGGdEwAA3IF1TgAAwJhjnRMAAOBYhBMAAGAUx72VeKzl1WyP235v49IUVQIAgDtx5wQAABiFcAIAAIxCOAEAAEYhnAAAAKMQTgAAgFEIJwAAwCiOCScsXw8AgDs4JpwEAgF1dnaqtbU11aUAAIAx5JhwAgAA3IFwAgAAjEI4AQAARiGcAAAAoxBOAACAUQgnAADAKIQTAABgFMIJAAAwCuEEAAAYhXACAACMQjgBAABGmZSKi+bl5WnatGlKS0vTjBkztGvXrlSUAQAADJSScCJJf/vb33Tuueem6vIAAMBQPNYBAABGSTicNDc3a9myZcrJyZHH41FjY+OwcyzLUl5eniZPnqzS0lK1tLTEHfd4PPrWt76l4uJiPfHEEyMuHgAATDwJh5OBgQH5/X5ZlnXS4w0NDQoGg9qwYYP27t0rv9+viooK9fb2xs555ZVX1N7erqefflr33Xef3nrrrZH/BgAAYEJJOJxUVlbqnnvu0fLly096fNOmTVqzZo1WrVql/Px81dfXa8qUKdq8eXPsnLlz50qS5syZo2uvvVZ79+495fUGBwcVDofjPgAAYOJK6piToaEhtbe3q7y8/PgF0tJUXl6uPXv2SPr8zsuxY8ckSf39/XrppZd0ySWXnLLN2tpaZWZmxj7z589PZskAAMAwSQ0nR44cUSQSkc/ni9vv8/nU09MjSQqFQrryyivl9/t1+eWX65ZbblFxcfEp21y3bp36+vpin+7u7mSWDAAADDPuU4nPP/98vfnmm2d8vtfrldfrlWVZsixLkUhkDKsDAACpltQ7J1lZWUpPT1coFIrbHwqFlJ2dPaq2A4GAOjs71draOqp2AACA2ZIaTjIyMlRYWKimpqbYvmg0qqamJpWVlSXzUgAAYIJK+LFOf3+/Dhw4ENvu6upSR0eHZs6cqdzcXAWDQVVVVamoqEglJSWqq6vTwMCAVq1aldTCAQDAxJRwOGlra9OSJUti28FgUJJUVVWlLVu2aOXKlTp8+LDWr1+vnp4eFRQUaMeOHcMGySaKMScAALiDx7ZtO9VFJCIcDiszM1N9fX2aNm1a0tvPq9ket/3exqVJvwYAAG6TyL/fvFsHAAAYxTHhxLIs5efnn3ZNFAAA4HyOCSdMJQYAwB0cE04AAIA7EE4AAIBRHBNOGHMCAIA7OCacMOYEAAB3cEw4AQAA7kA4AQAARiGcAAAAozgmnDAgFgAAd3BMOGFALAAA7uCYcAIAANyBcAIAAIxCOAEAAEYhnAAAAKM4JpwwWwcAAHdwTDhhtg4AAO7gmHACAADcgXACAACMQjgBAABGIZwAAACjEE4AAIBRCCcAAMAok1JdwJmyLEuWZSkSiYzrdfNqtg/b997GpeNaAwAAbuKYOyescwIAgDs4JpwAAAB3IJwAAACjEE4AAIBRCCcAAMAohBMAAGAUwgkAADAK4QQAABiFcAIAAIxCOAEAAEZxTDixLEv5+fkqLi5OdSkAAGAMOSacsHw9AADu4JhwAgAA3IFwAgAAjEI4AQAARiGcAAAAoxBOAACAUQgnAADAKIQTAABgFMIJAAAwCuEEAAAYhXACAACMQjgBAABGSVk4+fjjj7VgwQLdeeedqSoBAAAYKGXh5N5779Xll1+eqssDAABDpSSc7N+/X++++64qKytTcXkAAGCwhMNJc3Ozli1bppycHHk8HjU2Ng47x7Is5eXlafLkySotLVVLS0vc8TvvvFO1tbUjLhoAAExcCYeTgYEB+f1+WZZ10uMNDQ0KBoPasGGD9u7dK7/fr4qKCvX29kqSnnrqKV144YW68MILR1c5AACYkCYl+gOVlZWnfRyzadMmrVmzRqtWrZIk1dfXa/v27dq8ebNqamr02muvaevWrdq2bZv6+/v12Wefadq0aVq/fv1J2xscHNTg4GBsOxwOJ1oyAABwkKSOORkaGlJ7e7vKy8uPXyAtTeXl5dqzZ48kqba2Vt3d3Xrvvff0m9/8RmvWrDllMPni/MzMzNhn/vz5ySwZAAAYJqnh5MiRI4pEIvL5fHH7fT6fenp6RtTmunXr1NfXF/t0d3cno1QAAGCohB/rJNOtt976ped4vV55vd6xLwYAABghqXdOsrKylJ6erlAoFLc/FAopOzt7VG1blqX8/HwVFxePqh0AAGC2pIaTjIwMFRYWqqmpKbYvGo2qqalJZWVlo2o7EAios7NTra2toy1z1PJqtsd9AABA8iT8WKe/v18HDhyIbXd1damjo0MzZ85Ubm6ugsGgqqqqVFRUpJKSEtXV1WlgYCA2ewcAAOB0Eg4nbW1tWrJkSWw7GAxKkqqqqrRlyxatXLlShw8f1vr169XT06OCggLt2LFj2CDZRFmWJcuyFIlERtUOAAAwm8e2bTvVRSQiHA4rMzNTfX19mjZtWtLbH8ljmvc2Lk16HQAATCSJ/Pudshf/AQAAnIxjwgmzdQAAcAfHhBOTZusAAICx45hwAgAA3IFwAgAAjEI4AQAARnFMOGFALAAA7uCYcMKAWAAA3MEx4QQAALgD4QQAABiFcAIAAIzimHDCgFgAANzBMeGEAbEAALiDY8IJAABwB8IJAAAwCuEEAAAYhXACAACM4phwwmwdAADcwTHhhNk6AAC4g2PCCQAAcIdJqS5gosqr2R63/d7GpSmqBAAAZ+HOCQAAMArhBAAAGIVwAgAAjOKYcMJUYgAA3MEx4YSpxAAAuINjwgkAAHAHwgkAADAK4QQAABiFcAIAAIxCOAEAAEYhnAAAAKMQTgAAgFEIJwAAwCiEEwAAYBTHhBOWrwcAwB0cE05Yvh4AAHdwTDgBAADuMCnVBbhFXs32Yfve27g0BZUAAGA27pwAAACjEE4AAIBRCCcAAMAohBMAAGAUwgkAADAK4QQAABiFcAIAAIxCOAEAAEYhnAAAAKOMezg5evSoioqKVFBQoIULF+rhhx8e7xIAAIDBxn35+qlTp6q5uVlTpkzRwMCAFi5cqBUrVmjWrFnjXQoAADDQuN85SU9P15QpUyRJg4ODsm1btm2PdxkAAMBQCYeT5uZmLVu2TDk5OfJ4PGpsbBx2jmVZysvL0+TJk1VaWqqWlpa440ePHpXf79e8efP0s5/9TFlZWSP+BQAAwMSS8GOdgYEB+f1+rV69WitWrBh2vKGhQcFgUPX19SotLVVdXZ0qKiq0b98+zZ49W5I0ffp0vfnmmwqFQlqxYoVuvPFG+Xy+0f82DsebiwEAGMGdk8rKSt1zzz1avnz5SY9v2rRJa9as0apVq5Sfn6/6+npNmTJFmzdvHnauz+eT3+/XX//611Neb3BwUOFwOO4DAAAmrqSOORkaGlJ7e7vKy8uPXyAtTeXl5dqzZ48kKRQK6dixY5Kkvr4+NTc366KLLjplm7W1tcrMzIx95s+fn8ySAQCAYZIaTo4cOaJIJDLsEY3P51NPT48k6f3339dVV10lv9+vq666Sj/60Y906aWXnrLNdevWqa+vL/bp7u5OZskAAMAw4z6VuKSkRB0dHWd8vtfrldfrHbuCAACAUZJ65yQrK0vp6ekKhUJx+0OhkLKzs0fVtmVZys/PV3Fx8ajaAQAAZktqOMnIyFBhYaGamppi+6LRqJqamlRWVjaqtgOBgDo7O9Xa2jraMgEAgMESfqzT39+vAwcOxLa7urrU0dGhmTNnKjc3V8FgUFVVVSoqKlJJSYnq6uo0MDCgVatWJbVwAAAwMSUcTtra2rRkyZLYdjAYlCRVVVVpy5YtWrlypQ4fPqz169erp6dHBQUF2rFjx6jXMbEsS5ZlKRKJjKodAABgtoTDyeLFi790ufnq6mpVV1ePuKiTCQQCCgQCCofDyszMTGrbAADAHOP+bh0AAIDTIZwAAACjjPs6JyPl1jEnJ75vh3ftAAAmOsfcOWEqMQAA7uCYcAIAANyBcAIAAIzimHDC8vUAALiDY8IJY04AAHAHx4QTAADgDo6ZSoxTY7oxAGAi4c4JAAAwimPCCQNiAQBwB8eEEwbEAgDgDo4JJwAAwB0IJwAAwCiEEwAAYBTCCQAAMIpjwgmzdQAAcAfHLMIWCAQUCAQUDoeVmZmZ6nKMduKibBILswEAnMMxd04AAIA7EE4AAIBRCCcAAMAohBMAAGAUwgkAADCKY2brWJYly7IUiURSXcqEwIweAICpHHPnhBf/AQDgDo4JJwAAwB0IJwAAwCiEEwAAYBTCCQAAMArhBAAAGIVwAgAAjOKYdU4w9k5c+4R1TwAAqcCdEwAAYBTCCQAAMIpjwollWcrPz1dxcXGqSwEAAGPIMeGE5esBAHAHx4QTAADgDoQTAABgFMIJAAAwCuEEAAAYhXACAACMwgqxSAiryAIAxhp3TgAAgFEIJwAAwCiEEwAAYBTGnGBUThyDIjEOBQAwOoQTjDkCDAAgEeP+WKe7u1uLFy9Wfn6+Fi1apG3bto13CQAAwGDjfudk0qRJqqurU0FBgXp6elRYWKhrr71W55xzzniXAgAADDTu4WTOnDmaM2eOJCk7O1tZWVn66KOPCCcAAEDSCB7rNDc3a9myZcrJyZHH41FjY+OwcyzLUl5eniZPnqzS0lK1tLSctK329nZFIhHNnz8/4cIBAMDElHA4GRgYkN/vl2VZJz3e0NCgYDCoDRs2aO/evfL7/aqoqFBvb2/ceR999JFuueUWPfTQQyOrHAAATEgJP9aprKxUZWXlKY9v2rRJa9as0apVqyRJ9fX12r59uzZv3qyamhpJ0uDgoG644QbV1NTom9/85mmvNzg4qMHBwdh2OBxOtGQAAOAgSZ2tMzQ0pPb2dpWXlx+/QFqaysvLtWfPHkmSbdu69dZbdfXVV+t73/vel7ZZW1urzMzM2IdHQAAATGxJDSdHjhxRJBKRz+eL2+/z+dTT0yNJevXVV9XQ0KDGxkYVFBSooKBAb7/99inbXLdunfr6+mKf7u7uZJYMAAAMM+6zda688kpFo9EzPt/r9crr9Y5hRQAAwCRJvXOSlZWl9PR0hUKhuP2hUEjZ2dmjatuyLOXn56u4uHhU7QAAALMlNZxkZGSosLBQTU1NsX3RaFRNTU0qKysbVduBQECdnZ1qbW0dbZkAAMBgCT/W6e/v14EDB2LbXV1d6ujo0MyZM5Wbm6tgMKiqqioVFRWppKREdXV1GhgYiM3eAaTh79vhXTsAgC8kHE7a2tq0ZMmS2HYwGJQkVVVVacuWLVq5cqUOHz6s9evXq6enRwUFBdqxY8ewQbKJsixLlmUpEomMqh0AAGC2hMPJ4sWLZdv2ac+prq5WdXX1iIs6mUAgoEAgoHA4rMzMzKS2DQAAzDHubyUGAAA4HcIJAAAwimPCCVOJAQBwB8eEE6YSAwDgDo4JJwAAwB3Gffl64EyxFgoAuJNj7pww5gQAAHdwTDhhzAkAAO7gmHACAADcgTEncIwTx6BIjEMBgImIOycAAMAojgknDIgFAMAdHBNOGBALAIA7OCacAAAAdyCcAAAAoxBOAACAUZhKjAmPZfABwFkcc+eE2ToAALiDY8IJs3UAAHAHHutgQjnZKrIAAGdxzJ0TAADgDoQTAABgFMIJAAAwCuEEAAAYhXACAACM4phwwjonAAC4g2PCCeucAADgDo4JJwAAwB0IJwAAwCiEEwAAYBTCCQAAMArv1gFO4mTv6Hlv49IUVAIA7kM4AcQLAwHAJDzWAQAARiGcAAAAoxBOAACAURwTTli+HgAAd3BMOGH5egAA3MEx4QQAALgD4QQAABiFcAIAAIxCOAEAAEYhnAAAAKOwfD0wQrx/BwDGBndOAACAUQgnAADAKIQTAABgFMIJAAAwSkrCyfLlyzVjxgzdeOONqbg8AAAwWEpm66xdu1arV6/WY489lorLAyl14iwfZvgAQLyU3DlZvHixpk6dmopLAwAAwyUcTpqbm7Vs2TLl5OTI4/GosbFx2DmWZSkvL0+TJ09WaWmpWlpaklErAABwgYTDycDAgPx+vyzLOunxhoYGBYNBbdiwQXv37pXf71dFRYV6e3tHXSwAAJj4Eh5zUllZqcrKylMe37Rpk9asWaNVq1ZJkurr67V9+3Zt3rxZNTU1CRc4ODiowcHB2HY4HE64DQAA4BxJHRA7NDSk9vZ2rVu3LrYvLS1N5eXl2rNnz4jarK2t1d13352sEgHjsAw+AMRL6oDYI0eOKBKJyOfzxe33+Xzq6emJbZeXl+umm27SX/7yF82bN++0wWXdunXq6+uLfbq7u5NZMgAAMExKphK/+OKLZ3yu1+uV1+sdw2oAAIBJkhpOsrKylJ6erlAoFLc/FAopOzt7VG1bliXLshSJREbVDjCWWMMEAEYvqY91MjIyVFhYqKampti+aDSqpqYmlZWVjartQCCgzs5Otba2jrZMAABgsITvnPT39+vAgQOx7a6uLnV0dGjmzJnKzc1VMBhUVVWVioqKVFJSorq6Og0MDMRm7wAAAJxOwuGkra1NS5YsiW0Hg0FJUlVVlbZs2aKVK1fq8OHDWr9+vXp6elRQUKAdO3YMGySbKB7rAPF4hARgoko4nCxevFi2bZ/2nOrqalVXV4+4qJMJBAIKBAIKh8PKzMxMatsAAMAcKXm3DgAAwKkQTgAAgFEcE04sy1J+fr6Ki4tTXQoAABhDjgknTCUGAMAdHBNOAACAOxBOAACAURwTThhzAgCAOzgmnDDmBAAAd3BMOAEAAO5AOAEAAEYhnAAAAKMk/G6dVOHFf3CzE1/yl8x2eGEgANM45s4JA2IBAHAHx4QTAADgDoQTAABgFMIJAAAwCuEEAAAYhdk6AIY5cVYPM3oAjCfH3Dlhtg4AAO7gmHACAADcgXACAACMQjgBAABGIZwAAACjEE4AAIBRCCcAAMAorHMCTGDJeptxsq7FeikAzoRj7pywzgkAAO7gmHACAADcgXACAACMQjgBAABGIZwAAACjEE4AAIBRCCcAAMAohBMAAGAUwgkAADAK4QQAABjFMeHEsizl5+eruLg41aUAAIAx5JhwwvL1AAC4g2PCCQAAcAfCCQAAMArhBAAAGIVwAgAAjEI4AQAARiGcAAAAoxBOAACAUQgnAADAKIQTAABgFMIJAAAwCuEEAAAYJSXh5Nlnn9VFF12kCy64QI888kgqSgAAAIaaNN4X/N///qdgMKhdu3YpMzNThYWFWr58uWbNmjXepQAAAAON+52TlpYWXXLJJZo7d67OPfdcVVZW6oUXXhjvMgAAgKESDifNzc1atmyZcnJy5PF41NjYOOwcy7KUl5enyZMnq7S0VC0tLbFjhw4d0ty5c2Pbc+fO1cGDB0dWPQAAmHASDicDAwPy+/2yLOukxxsaGhQMBrVhwwbt3btXfr9fFRUV6u3tHVGBg4ODCofDcR8AADBxJTzmpLKyUpWVlac8vmnTJq1Zs0arVq2SJNXX12v79u3avHmzampqlJOTE3en5ODBgyopKTlle7W1tbr77rsTLRNAEuXVbB+2772NS8es7S+71kh+JtXGsg8nqhP7bKT9lax2JioTv5tJHXMyNDSk9vZ2lZeXH79AWprKy8u1Z88eSVJJSYneeecdHTx4UP39/XruuedUUVFxyjbXrVunvr6+2Ke7uzuZJQMAAMMkdbbOkSNHFIlE5PP54vb7fD69++67n19w0iTdf//9WrJkiaLRqH7+85+fdqaO1+uV1+tNZpkAAMBg4z6VWJKuu+46XXfddQn9jGVZsixLkUhkjKoCAAAmSOpjnaysLKWnpysUCsXtD4VCys7OHlXbgUBAnZ2dam1tHVU7AADAbEkNJxkZGSosLFRTU1NsXzQaVVNTk8rKypJ5KQAAMEEl/Finv79fBw4ciG13dXWpo6NDM2fOVG5uroLBoKqqqlRUVKSSkhLV1dVpYGAgNnsHAADgdBIOJ21tbVqyZElsOxgMSpKqqqq0ZcsWrVy5UocPH9b69evV09OjgoIC7dixY9gg2UQx5gQAAHdIOJwsXrxYtm2f9pzq6mpVV1ePuKiTCQQCCgQCCofDyszMTGrbAADAHCl5KzEAAMCpOCacWJal/Px8FRcXp7oUAAAwhhwTTphKDACAOzgmnAAAAHcgnAAAAKM4Jpww5gQAAHdwTDhhzAkAAO6Qkhf/jcYXa6yEw+ExaT86+HHCP3OyWs6knRN/biQ/w7W4VrKvNdKfc8K1UulkNZtWo2lO7LOR9ley2pmoxuu7+UWbX7ZWmiR57DM5yyD/+c9/NH/+/FSXAQAARqC7u1vz5s077TmOCyfRaFSHDh3S1KlT5fF4Ev75cDis+fPnq7u7W9OmTRuDCp2DvjiOvohHfxxHXxxHX8SjP447k76wbVvHjh1TTk6O0tJOP6rEcY910tLSvjRxnYlp06a5/sv0BfriOPoiHv1xHH1xHH0Rj/447sv64kxfP+OYAbEAAMAdCCcAAMAorgsnXq9XGzZskNfrTXUpKUdfHEdfxKM/jqMvjqMv4tEfxyW7Lxw3IBYAAExsrrtzAgAAzEY4AQAARiGcAAAAoxBOAACAUVwVTizLUl5eniZPnqzS0lK1tLSkuqRx0dzcrGXLliknJ0cej0eNjY1xx23b1vr16zVnzhydffbZKi8v1/79+1NT7Birra1VcXGxpk6dqtmzZ+uGG27Qvn374s759NNPFQgENGvWLJ177rn6zne+o1AolKKKx86DDz6oRYsWxRZNKisr03PPPRc77pZ+OJmNGzfK4/HojjvuiO1zU3/88pe/lMfjiftcfPHFseNu6gtJOnjwoL773e9q1qxZOvvss3XppZeqra0tdtwtf0Pz8vKGfS88Ho8CgYCk5H4vXBNOGhoaFAwGtWHDBu3du1d+v18VFRXq7e1NdWljbmBgQH6/X5ZlnfT4r371Kz3wwAOqr6/X66+/rnPOOUcVFRX69NNPx7nSsbd7924FAgG99tpr2rlzpz777DN9+9vf1sDAQOycn/zkJ3rmmWe0bds27d69W4cOHdKKFStSWPXYmDdvnjZu3Kj29na1tbXp6quv1vXXX6+///3vktzTDydqbW3V73//ey1atChuv9v645JLLtGHH34Y+7zyyiuxY27qi//+97+64oordNZZZ+m5555TZ2en7r//fs2YMSN2jlv+hra2tsZ9J3bu3ClJuummmyQl+Xthu0RJSYkdCARi25FIxM7JybFra2tTWNX4k2Q/+eSTse1oNGpnZ2fbv/71r2P7jh49anu9XvuPf/xjCiocX729vbYke/fu3bZtf/67n3XWWfa2bdti5/zjH/+wJdl79uxJVZnjZsaMGfYjjzzi2n44duyYfcEFF9g7d+60v/Wtb9lr1661bdt934sNGzbYfr//pMfc1hd33XWXfeWVV57yuJv/hq5du9b+6le/akej0aR/L1xx52RoaEjt7e0qLy+P7UtLS1N5ebn27NmTwspSr6urSz09PXF9k5mZqdLSUlf0TV9fnyRp5syZkqT29nZ99tlncf1x8cUXKzc3d0L3RyQS0datWzUwMKCysjLX9kMgENDSpUvjfm/Jnd+L/fv3KycnR+eff75uvvlmffDBB5Lc1xdPP/20ioqKdNNNN2n27Nm67LLL9PDDD8eOu/Vv6NDQkB5//HGtXr1aHo8n6d8LV4STI0eOKBKJyOfzxe33+Xzq6elJUVVm+OL3d2PfRKNR3XHHHbriiiu0cOFCSZ/3R0ZGhqZPnx537kTtj7ffflvnnnuuvF6vbrvtNj355JPKz893XT9I0tatW7V3717V1tYOO+a2/igtLdWWLVu0Y8cOPfjgg+rq6tJVV12lY8eOua4v/v3vf+vBBx/UBRdcoOeff1633367fvzjH+uxxx6T5N6/oY2NjTp69KhuvfVWScn/f8RxbyUGkiUQCOidd96Je5buNhdddJE6OjrU19enP//5z6qqqtLu3btTXda46+7u1tq1a7Vz505Nnjw51eWkXGVlZey/Fy1apNLSUi1YsEB/+tOfdPbZZ6ewsvEXjUZVVFSk++67T5J02WWX6Z133lF9fb2qqqpSXF3q/OEPf1BlZaVycnLGpH1X3DnJyspSenr6sFHDoVBI2dnZKarKDF/8/m7rm+rqaj377LPatWuX5s2bF9ufnZ2toaEhHT16NO78idofGRkZ+trXvqbCwkLV1tbK7/frt7/9rev6ob29Xb29vfrGN76hSZMmadKkSdq9e7ceeOABTZo0ST6fz1X9caLp06frwgsv1IEDB1z33ZgzZ47y8/Pj9n3961+PPeZy49/Q999/Xy+++KK+//3vx/Yl+3vhinCSkZGhwsJCNTU1xfZFo1E1NTWprKwshZWl3nnnnafs7Oy4vgmHw3r99dcnZN/Ytq3q6mo9+eSTeumll3TeeefFHS8sLNRZZ50V1x/79u3TBx98MCH740TRaFSDg4Ou64drrrlGb7/9tjo6OmKfoqIi3XzzzbH/dlN/nKi/v1//+te/NGfOHNd9N6644ophyw3885//1IIFCyS572+oJD366KOaPXu2li5dGtuX9O9FEgfuGm3r1q221+u1t2zZYnd2dto/+MEP7OnTp9s9PT2pLm3MHTt2zH7jjTfsN954w5Zkb9q0yX7jjTfs999/37Zt2964caM9ffp0+6mnnrLfeust+/rrr7fPO+88+5NPPklx5cl3++2325mZmfbLL79sf/jhh7HPxx9/HDvntttus3Nzc+2XXnrJbmtrs8vKyuyysrIUVj02ampq7N27d9tdXV32W2+9ZdfU1Ngej8d+4YUXbNt2Tz+cyv+frWPb7uqPn/70p/bLL79sd3V12a+++qpdXl5uZ2Vl2b29vbZtu6svWlpa7EmTJtn33nuvvX//fvuJJ56wp0yZYj/++OOxc9z0NzQSidi5ubn2XXfdNexYMr8Xrgkntm3bv/vd7+zc3Fw7IyPDLikpsV977bVUlzQudu3aZUsa9qmqqrJt+/OpcL/4xS9sn89ne71e+5prrrH37duX2qLHyMn6QZL96KOPxs755JNP7B/+8If2jBkz7ClTptjLly+3P/zww9QVPUZWr15tL1iwwM7IyLC/8pWv2Ndcc00smNi2e/rhVE4MJ27qj5UrV9pz5syxMzIy7Llz59orV660Dxw4EDvupr6wbdt+5pln7IULF9per9e++OKL7YceeijuuJv+hj7//PO2pJP+fsn8Xnhs27ZHeGcHAAAg6Vwx5gQAADgH4QQAABiFcAIAAIxCOAEAAEYhnAAAAKMQTgAAgFEIJwAAwCiEEwAAYBTCCQAAMArhBAAAGIVwAgAAjEI4AQAARvk/g4HrboKNJNoAAAAASUVORK5CYII="},"metadata":{}}]},{"cell_type":"code","source":"df.drop(columns=['id','qid1','qid2'],inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.988119Z","iopub.execute_input":"2024-05-31T15:59:38.988422Z","iopub.status.idle":"2024-05-31T15:59:39.009014Z","shell.execute_reply.started":"2024-05-31T15:59:38.988396Z","shell.execute_reply":"2024-05-31T15:59:39.008163Z"},"trusted":true},"execution_count":19,"outputs":[]},{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.010182Z","iopub.execute_input":"2024-05-31T15:59:39.010484Z","iopub.status.idle":"2024-05-31T15:59:39.023858Z","shell.execute_reply.started":"2024-05-31T15:59:39.010459Z","shell.execute_reply":"2024-05-31T15:59:39.022788Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":" question1 \\\n0 what is the step by step guide to invest in sh... \n1 what is the story of kohinoor kohinoor diamond \n2 how can i increase the speed of my internet co... \n3 why am i mentally very lonely how can i solve it \n4 which one dissolve in water quikly sugar salt ... \n... ... \n199996 which of these tv shows should i watch next \n199997 should i change my name \n199998 should i buy the new macbook 2016 or one from ... \n199999 what is your review of love 2011 movie \n200000 can pakistan hit indian air craft carrier in a... \n\n question2 is_duplicate \n0 what is the step by step guide to invest in sh... 0 \n1 what would happen if the indian government sto... 0 \n2 how can internet speed be increased by hacking... 0 \n3 find the remainder when 2324math is divided by... 0 \n4 which fish would survive in salt water 0 \n... ... ... \n199996 what are some thriller shows i should watch next 0 \n199997 should i legally change my first name 0 \n199998 should i buy the new macbook pro 2016 or the m... 1 \n199999 what is your review of love birds 2011 movie 0 \n200000 can pakistan destroy an indian aircraft carrie... 0 \n\n[200000 rows x 3 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
question1question2is_duplicate
0what is the step by step guide to invest in sh...what is the step by step guide to invest in sh...0
1what is the story of kohinoor kohinoor diamondwhat would happen if the indian government sto...0
2how can i increase the speed of my internet co...how can internet speed be increased by hacking...0
3why am i mentally very lonely how can i solve itfind the remainder when 2324math is divided by...0
4which one dissolve in water quikly sugar salt ...which fish would survive in salt water0
............
199996which of these tv shows should i watch nextwhat are some thriller shows i should watch next0
199997should i change my nameshould i legally change my first name0
199998should i buy the new macbook 2016 or one from ...should i buy the new macbook pro 2016 or the m...1
199999what is your review of love 2011 moviewhat is your review of love birds 2011 movie0
200000can pakistan hit indian air craft carrier in a...can pakistan destroy an indian aircraft carrie...0
\n

200000 rows × 3 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"df['q1_len']=df['question1'].str.len()\ndf['q2_len']=df['question2'].str.len()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.025425Z","iopub.execute_input":"2024-05-31T15:59:39.025810Z","iopub.status.idle":"2024-05-31T15:59:39.159384Z","shell.execute_reply.started":"2024-05-31T15:59:39.025774Z","shell.execute_reply":"2024-05-31T15:59:39.158511Z"},"trusted":true},"execution_count":21,"outputs":[]},{"cell_type":"code","source":"df['q1_num_words']=df['question1'].apply(lambda row: len(row.split(\" \")))\ndf['q2_num_words']=df['question2'].apply(lambda row: len(row.split(\" \")))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.160903Z","iopub.execute_input":"2024-05-31T15:59:39.161222Z","iopub.status.idle":"2024-05-31T15:59:39.633424Z","shell.execute_reply.started":"2024-05-31T15:59:39.161194Z","shell.execute_reply":"2024-05-31T15:59:39.632355Z"},"trusted":true},"execution_count":22,"outputs":[]},{"cell_type":"code","source":"def common_words(row):\n w1=set(map(lambda word: word.lower().strip(),row['question1'].split(\" \")))\n w2=set(map(lambda word: word.lower().strip(),row['question2'].split(\" \")))\n return len(w1 & w2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.634722Z","iopub.execute_input":"2024-05-31T15:59:39.635031Z","iopub.status.idle":"2024-05-31T15:59:39.641017Z","shell.execute_reply.started":"2024-05-31T15:59:39.635005Z","shell.execute_reply":"2024-05-31T15:59:39.639990Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"def total_words(row):\n w1=set(map(lambda word: word.lower().strip(),row['question1'].split(\" \")))\n w2=set(map(lambda word: word.lower().strip(),row['question2'].split(\" \")))\n return len(w1) + len(w2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.642318Z","iopub.execute_input":"2024-05-31T15:59:39.642632Z","iopub.status.idle":"2024-05-31T15:59:39.651761Z","shell.execute_reply.started":"2024-05-31T15:59:39.642600Z","shell.execute_reply":"2024-05-31T15:59:39.650721Z"},"trusted":true},"execution_count":24,"outputs":[]},{"cell_type":"code","source":"df['word_common']=df.apply(common_words,axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.652945Z","iopub.execute_input":"2024-05-31T15:59:39.653278Z","iopub.status.idle":"2024-05-31T15:59:43.993662Z","shell.execute_reply.started":"2024-05-31T15:59:39.653253Z","shell.execute_reply":"2024-05-31T15:59:43.992797Z"},"trusted":true},"execution_count":25,"outputs":[]},{"cell_type":"code","source":"df['word_total']=df.apply(total_words,axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:43.994773Z","iopub.execute_input":"2024-05-31T15:59:43.995058Z","iopub.status.idle":"2024-05-31T15:59:48.205245Z","shell.execute_reply.started":"2024-05-31T15:59:43.995018Z","shell.execute_reply":"2024-05-31T15:59:48.204131Z"},"trusted":true},"execution_count":26,"outputs":[]},{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.206484Z","iopub.execute_input":"2024-05-31T15:59:48.206787Z","iopub.status.idle":"2024-05-31T15:59:48.221588Z","shell.execute_reply.started":"2024-05-31T15:59:48.206762Z","shell.execute_reply":"2024-05-31T15:59:48.220559Z"},"trusted":true},"execution_count":27,"outputs":[{"execution_count":27,"output_type":"execute_result","data":{"text/plain":" question1 \\\n0 what is the step by step guide to invest in sh... \n1 what is the story of kohinoor kohinoor diamond \n2 how can i increase the speed of my internet co... \n3 why am i mentally very lonely how can i solve it \n4 which one dissolve in water quikly sugar salt ... \n... ... \n199996 which of these tv shows should i watch next \n199997 should i change my name \n199998 should i buy the new macbook 2016 or one from ... \n199999 what is your review of love 2011 movie \n200000 can pakistan hit indian air craft carrier in a... \n\n question2 is_duplicate \\\n0 what is the step by step guide to invest in sh... 0 \n1 what would happen if the indian government sto... 0 \n2 how can internet speed be increased by hacking... 0 \n3 find the remainder when 2324math is divided by... 0 \n4 which fish would survive in salt water 0 \n... ... ... \n199996 what are some thriller shows i should watch next 0 \n199997 should i legally change my first name 0 \n199998 should i buy the new macbook pro 2016 or the m... 1 \n199999 what is your review of love birds 2011 movie 0 \n200000 can pakistan destroy an indian aircraft carrie... 0 \n\n q1_len q2_len q1_num_words q2_num_words word_common word_total \n0 65 56 14 12 11 23 \n1 46 83 8 13 4 18 \n2 72 58 14 10 4 24 \n3 48 51 11 9 0 19 \n4 73 38 13 7 4 20 \n... ... ... ... ... ... ... \n199996 43 48 9 9 5 18 \n199997 23 37 5 7 5 12 \n199998 50 61 11 13 9 21 \n199999 38 44 8 9 8 17 \n200000 146 60 27 10 6 34 \n\n[200000 rows x 9 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
question1question2is_duplicateq1_lenq2_lenq1_num_wordsq2_num_wordsword_commonword_total
0what is the step by step guide to invest in sh...what is the step by step guide to invest in sh...0655614121123
1what is the story of kohinoor kohinoor diamondwhat would happen if the indian government sto...04683813418
2how can i increase the speed of my internet co...how can internet speed be increased by hacking...072581410424
3why am i mentally very lonely how can i solve itfind the remainder when 2324math is divided by...04851119019
4which one dissolve in water quikly sugar salt ...which fish would survive in salt water07338137420
..............................
199996which of these tv shows should i watch nextwhat are some thriller shows i should watch next0434899518
199997should i change my nameshould i legally change my first name0233757512
199998should i buy the new macbook 2016 or one from ...should i buy the new macbook pro 2016 or the m...150611113921
199999what is your review of love 2011 moviewhat is your review of love birds 2011 movie0384489817
200000can pakistan hit indian air craft carrier in a...can pakistan destroy an indian aircraft carrie...0146602710634
\n

200000 rows × 9 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"df['word_share']=round(df['word_common']/df['word_total'],2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.223003Z","iopub.execute_input":"2024-05-31T15:59:48.223347Z","iopub.status.idle":"2024-05-31T15:59:48.235292Z","shell.execute_reply.started":"2024-05-31T15:59:48.223321Z","shell.execute_reply":"2024-05-31T15:59:48.234269Z"},"trusted":true},"execution_count":28,"outputs":[]},{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.236483Z","iopub.execute_input":"2024-05-31T15:59:48.236780Z","iopub.status.idle":"2024-05-31T15:59:48.255263Z","shell.execute_reply.started":"2024-05-31T15:59:48.236750Z","shell.execute_reply":"2024-05-31T15:59:48.254206Z"},"trusted":true},"execution_count":29,"outputs":[{"execution_count":29,"output_type":"execute_result","data":{"text/plain":" question1 \\\n0 what is the step by step guide to invest in sh... \n1 what is the story of kohinoor kohinoor diamond \n2 how can i increase the speed of my internet co... \n3 why am i mentally very lonely how can i solve it \n4 which one dissolve in water quikly sugar salt ... \n... ... \n199996 which of these tv shows should i watch next \n199997 should i change my name \n199998 should i buy the new macbook 2016 or one from ... \n199999 what is your review of love 2011 movie \n200000 can pakistan hit indian air craft carrier in a... \n\n question2 is_duplicate \\\n0 what is the step by step guide to invest in sh... 0 \n1 what would happen if the indian government sto... 0 \n2 how can internet speed be increased by hacking... 0 \n3 find the remainder when 2324math is divided by... 0 \n4 which fish would survive in salt water 0 \n... ... ... \n199996 what are some thriller shows i should watch next 0 \n199997 should i legally change my first name 0 \n199998 should i buy the new macbook pro 2016 or the m... 1 \n199999 what is your review of love birds 2011 movie 0 \n200000 can pakistan destroy an indian aircraft carrie... 0 \n\n q1_len q2_len q1_num_words q2_num_words word_common word_total \\\n0 65 56 14 12 11 23 \n1 46 83 8 13 4 18 \n2 72 58 14 10 4 24 \n3 48 51 11 9 0 19 \n4 73 38 13 7 4 20 \n... ... ... ... ... ... ... \n199996 43 48 9 9 5 18 \n199997 23 37 5 7 5 12 \n199998 50 61 11 13 9 21 \n199999 38 44 8 9 8 17 \n200000 146 60 27 10 6 34 \n\n word_share \n0 0.48 \n1 0.22 \n2 0.17 \n3 0.00 \n4 0.20 \n... ... \n199996 0.28 \n199997 0.42 \n199998 0.43 \n199999 0.47 \n200000 0.18 \n\n[200000 rows x 10 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
question1question2is_duplicateq1_lenq2_lenq1_num_wordsq2_num_wordsword_commonword_totalword_share
0what is the step by step guide to invest in sh...what is the step by step guide to invest in sh...06556141211230.48
1what is the story of kohinoor kohinoor diamondwhat would happen if the indian government sto...046838134180.22
2how can i increase the speed of my internet co...how can internet speed be increased by hacking...0725814104240.17
3why am i mentally very lonely how can i solve itfind the remainder when 2324math is divided by...048511190190.00
4which one dissolve in water quikly sugar salt ...which fish would survive in salt water073381374200.20
.................................
199996which of these tv shows should i watch nextwhat are some thriller shows i should watch next04348995180.28
199997should i change my nameshould i legally change my first name02337575120.42
199998should i buy the new macbook 2016 or one from ...should i buy the new macbook pro 2016 or the m...1506111139210.43
199999what is your review of love 2011 moviewhat is your review of love birds 2011 movie03844898170.47
200000can pakistan hit indian air craft carrier in a...can pakistan destroy an indian aircraft carrie...01466027106340.18
\n

200000 rows × 10 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"ndf1=df[['question1','question2']]\nndf2=df.drop(columns=['question1','question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.256585Z","iopub.execute_input":"2024-05-31T15:59:48.256964Z","iopub.status.idle":"2024-05-31T15:59:48.280291Z","shell.execute_reply.started":"2024-05-31T15:59:48.256930Z","shell.execute_reply":"2024-05-31T15:59:48.279254Z"},"trusted":true},"execution_count":30,"outputs":[]},{"cell_type":"code","source":"ndf1","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.281704Z","iopub.execute_input":"2024-05-31T15:59:48.282112Z","iopub.status.idle":"2024-05-31T15:59:48.295253Z","shell.execute_reply.started":"2024-05-31T15:59:48.282079Z","shell.execute_reply":"2024-05-31T15:59:48.294063Z"},"trusted":true},"execution_count":31,"outputs":[{"execution_count":31,"output_type":"execute_result","data":{"text/plain":" question1 \\\n0 what is the step by step guide to invest in sh... \n1 what is the story of kohinoor kohinoor diamond \n2 how can i increase the speed of my internet co... \n3 why am i mentally very lonely how can i solve it \n4 which one dissolve in water quikly sugar salt ... \n... ... \n199996 which of these tv shows should i watch next \n199997 should i change my name \n199998 should i buy the new macbook 2016 or one from ... \n199999 what is your review of love 2011 movie \n200000 can pakistan hit indian air craft carrier in a... \n\n question2 \n0 what is the step by step guide to invest in sh... \n1 what would happen if the indian government sto... \n2 how can internet speed be increased by hacking... \n3 find the remainder when 2324math is divided by... \n4 which fish would survive in salt water \n... ... \n199996 what are some thriller shows i should watch next \n199997 should i legally change my first name \n199998 should i buy the new macbook pro 2016 or the m... \n199999 what is your review of love birds 2011 movie \n200000 can pakistan destroy an indian aircraft carrie... \n\n[200000 rows x 2 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
question1question2
0what is the step by step guide to invest in sh...what is the step by step guide to invest in sh...
1what is the story of kohinoor kohinoor diamondwhat would happen if the indian government sto...
2how can i increase the speed of my internet co...how can internet speed be increased by hacking...
3why am i mentally very lonely how can i solve itfind the remainder when 2324math is divided by...
4which one dissolve in water quikly sugar salt ...which fish would survive in salt water
.........
199996which of these tv shows should i watch nextwhat are some thriller shows i should watch next
199997should i change my nameshould i legally change my first name
199998should i buy the new macbook 2016 or one from ...should i buy the new macbook pro 2016 or the m...
199999what is your review of love 2011 moviewhat is your review of love birds 2011 movie
200000can pakistan hit indian air craft carrier in a...can pakistan destroy an indian aircraft carrie...
\n

200000 rows × 2 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"ndf2","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.296620Z","iopub.execute_input":"2024-05-31T15:59:48.297003Z","iopub.status.idle":"2024-05-31T15:59:48.316699Z","shell.execute_reply.started":"2024-05-31T15:59:48.296972Z","shell.execute_reply":"2024-05-31T15:59:48.315499Z"},"trusted":true},"execution_count":32,"outputs":[{"execution_count":32,"output_type":"execute_result","data":{"text/plain":" is_duplicate q1_len q2_len q1_num_words q2_num_words word_common \\\n0 0 65 56 14 12 11 \n1 0 46 83 8 13 4 \n2 0 72 58 14 10 4 \n3 0 48 51 11 9 0 \n4 0 73 38 13 7 4 \n... ... ... ... ... ... ... \n199996 0 43 48 9 9 5 \n199997 0 23 37 5 7 5 \n199998 1 50 61 11 13 9 \n199999 0 38 44 8 9 8 \n200000 0 146 60 27 10 6 \n\n word_total word_share \n0 23 0.48 \n1 18 0.22 \n2 24 0.17 \n3 19 0.00 \n4 20 0.20 \n... ... ... \n199996 18 0.28 \n199997 12 0.42 \n199998 21 0.43 \n199999 17 0.47 \n200000 34 0.18 \n\n[200000 rows x 8 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
is_duplicateq1_lenq2_lenq1_num_wordsq2_num_wordsword_commonword_totalword_share
006556141211230.48
1046838134180.22
20725814104240.17
3048511190190.00
4073381374200.20
...........................
19999604348995180.28
19999702337575120.42
1999981506111139210.43
19999903844898170.47
20000001466027106340.18
\n

200000 rows × 8 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"from nltk.corpus import stopwords\n\ndef fetch_token_features(row):\n \n q1 = row['question1']\n q2 = row['question2']\n \n SAFE_DIV = 0.0001 \n\n STOP_WORDS = stopwords.words(\"english\")\n \n token_features = [0.0]*8\n \n # Converting the Sentence into Tokens: \n q1_tokens = q1.split()\n q2_tokens = q2.split()\n \n if len(q1_tokens) == 0 or len(q2_tokens) == 0:\n return token_features\n\n # Get the non-stopwords in Questions\n q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])\n q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])\n \n #Get the stopwords in Questions\n q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])\n q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])\n \n # Get the common non-stopwords from Question pair\n common_word_count = len(q1_words.intersection(q2_words))\n \n # Get the common stopwords from Question pair\n common_stop_count = len(q1_stops.intersection(q2_stops))\n \n # Get the common Tokens from Question pair\n common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))\n \n \n token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)\n token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)\n token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)\n token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)\n token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)\n token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)\n \n # Last word of both question is same or not\n token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])\n \n # First word of both question is same or not\n token_features[7] = int(q1_tokens[0] == q2_tokens[0])\n \n return token_features","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.318063Z","iopub.execute_input":"2024-05-31T15:59:48.318457Z","iopub.status.idle":"2024-05-31T15:59:49.088959Z","shell.execute_reply.started":"2024-05-31T15:59:48.318418Z","shell.execute_reply":"2024-05-31T15:59:49.088070Z"},"trusted":true},"execution_count":33,"outputs":[]},{"cell_type":"code","source":"token_features = df.apply(fetch_token_features, axis=1)\n\ndf[\"cwc_min\"] = list(map(lambda x: x[0], token_features))\ndf[\"cwc_max\"] = list(map(lambda x: x[1], token_features))\ndf[\"csc_min\"] = list(map(lambda x: x[2], token_features))\ndf[\"csc_max\"] = list(map(lambda x: x[3], token_features))\ndf[\"ctc_min\"] = list(map(lambda x: x[4], token_features))\ndf[\"ctc_max\"] = list(map(lambda x: x[5], token_features))\ndf[\"last_word_eq\"] = list(map(lambda x: x[6], token_features))\ndf[\"first_word_eq\"] = list(map(lambda x: x[7], token_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:49.090329Z","iopub.execute_input":"2024-05-31T15:59:49.090993Z","iopub.status.idle":"2024-05-31T16:00:39.809819Z","shell.execute_reply.started":"2024-05-31T15:59:49.090955Z","shell.execute_reply":"2024-05-31T16:00:39.808943Z"},"trusted":true},"execution_count":34,"outputs":[]},{"cell_type":"code","source":"pip install distance","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:00:39.811472Z","iopub.execute_input":"2024-05-31T16:00:39.811883Z","iopub.status.idle":"2024-05-31T16:00:54.852520Z","shell.execute_reply.started":"2024-05-31T16:00:39.811847Z","shell.execute_reply":"2024-05-31T16:00:54.851156Z"},"trusted":true},"execution_count":35,"outputs":[{"name":"stdout","text":"Collecting distance\n Downloading Distance-0.1.3.tar.gz (180 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m180.3/180.3 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n\u001b[?25hBuilding wheels for collected packages: distance\n Building wheel for distance (setup.py) ... \u001b[?25ldone\n\u001b[?25h Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16258 sha256=cd544d5c1039ea6345ff5a69695ae0ef0e616e019bdaf0ccaadf6d5845ffc9ac\n Stored in directory: /root/.cache/pip/wheels/e8/bb/de/f71bf63559ea9a921059a5405806f7ff6ed612a9231c4a9309\nSuccessfully built distance\n\u001b[33mWARNING: Error parsing requirements for aiohttp: [Errno 2] No such file or directory: '/opt/conda/lib/python3.10/site-packages/aiohttp-3.9.1.dist-info/METADATA'\u001b[0m\u001b[33m\n\u001b[0mInstalling collected packages: distance\nSuccessfully installed distance-0.1.3\nNote: you may need to restart the kernel to use updated packages.\n","output_type":"stream"}]},{"cell_type":"code","source":"import distance\n\ndef fetch_length_features(row):\n \n q1 = row['question1']\n q2 = row['question2']\n \n length_features = [0.0]*3\n \n # Converting the Sentence into Tokens: \n q1_tokens = q1.split()\n q2_tokens = q2.split()\n \n if len(q1_tokens) == 0 or len(q2_tokens) == 0:\n return length_features\n \n # Absolute length features\n length_features[0] = abs(len(q1_tokens) - len(q2_tokens))\n \n # Average Token Length of both Questions\n length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2\n \n # Find the longest common substring\n strs = list(distance.lcsubstrings(q1, q2))\n if strs:\n length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)\n else:\n length_features[2] = 0.0\n \n return length_features","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:00:54.854466Z","iopub.execute_input":"2024-05-31T16:00:54.855416Z","iopub.status.idle":"2024-05-31T16:00:54.868756Z","shell.execute_reply.started":"2024-05-31T16:00:54.855371Z","shell.execute_reply":"2024-05-31T16:00:54.867731Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"code","source":"length_features = df.apply(fetch_length_features, axis=1)\n\ndf['abs_len_diff'] = list(map(lambda x: x[0], length_features))\ndf['mean_len'] = list(map(lambda x: x[1], length_features))\ndf['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:00:54.870202Z","iopub.execute_input":"2024-05-31T16:00:54.870885Z","iopub.status.idle":"2024-05-31T16:03:34.399606Z","shell.execute_reply.started":"2024-05-31T16:00:54.870849Z","shell.execute_reply":"2024-05-31T16:03:34.398480Z"},"trusted":true},"execution_count":37,"outputs":[]},{"cell_type":"code","source":"# Fuzzy Features\nfrom fuzzywuzzy import fuzz\n\ndef fetch_fuzzy_features(row):\n \n q1 = row['question1']\n q2 = row['question2']\n \n fuzzy_features = [0.0]*4\n \n # fuzz_ratio\n fuzzy_features[0] = fuzz.QRatio(q1, q2)\n\n # fuzz_partial_ratio\n fuzzy_features[1] = fuzz.partial_ratio(q1, q2)\n\n # token_sort_ratio\n fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)\n\n # token_set_ratio\n fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)\n\n return fuzzy_features","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:03:34.401346Z","iopub.execute_input":"2024-05-31T16:03:34.401740Z","iopub.status.idle":"2024-05-31T16:03:34.415928Z","shell.execute_reply.started":"2024-05-31T16:03:34.401703Z","shell.execute_reply":"2024-05-31T16:03:34.414845Z"},"trusted":true},"execution_count":38,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n","output_type":"stream"}]},{"cell_type":"code","source":"fuzzy_features = df.apply(fetch_fuzzy_features, axis=1)\n\n# Creating new feature columns for fuzzy features\ndf['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))\ndf['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))\ndf['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))\ndf['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:03:34.417222Z","iopub.execute_input":"2024-05-31T16:03:34.417541Z","iopub.status.idle":"2024-05-31T16:12:37.749312Z","shell.execute_reply.started":"2024-05-31T16:03:34.417507Z","shell.execute_reply":"2024-05-31T16:12:37.748091Z"},"trusted":true},"execution_count":39,"outputs":[]},{"cell_type":"code","source":"ndf1=df[['question1','question2']]\nndf2=df.drop(columns=['question1','question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:12:37.750792Z","iopub.execute_input":"2024-05-31T16:12:37.751247Z","iopub.status.idle":"2024-05-31T16:12:37.796876Z","shell.execute_reply.started":"2024-05-31T16:12:37.751211Z","shell.execute_reply":"2024-05-31T16:12:37.796026Z"},"trusted":true},"execution_count":40,"outputs":[]},{"cell_type":"code","source":"from sklearn.feature_extraction.text import TfidfVectorizer","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:12:37.806016Z","iopub.execute_input":"2024-05-31T16:12:37.806372Z","iopub.status.idle":"2024-05-31T16:12:37.811136Z","shell.execute_reply.started":"2024-05-31T16:12:37.806344Z","shell.execute_reply":"2024-05-31T16:12:37.810107Z"},"trusted":true},"execution_count":41,"outputs":[]},{"cell_type":"code","source":"cv=TfidfVectorizer(max_features=1000)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:11.173544Z","iopub.execute_input":"2024-05-31T16:13:11.173929Z","iopub.status.idle":"2024-05-31T16:13:11.179093Z","shell.execute_reply.started":"2024-05-31T16:13:11.173896Z","shell.execute_reply":"2024-05-31T16:13:11.177928Z"},"trusted":true},"execution_count":45,"outputs":[]},{"cell_type":"code","source":"\nquestions=list(ndf1['question1'])+list(ndf1['question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:12.853466Z","iopub.execute_input":"2024-05-31T16:13:12.853846Z","iopub.status.idle":"2024-05-31T16:13:12.911399Z","shell.execute_reply.started":"2024-05-31T16:13:12.853814Z","shell.execute_reply":"2024-05-31T16:13:12.910271Z"},"trusted":true},"execution_count":46,"outputs":[]},{"cell_type":"code","source":"","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"q1_arr,q2_arr=np.vsplit(cv.fit_transform(questions).toarray(),2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:17.135566Z","iopub.execute_input":"2024-05-31T16:13:17.135964Z","iopub.status.idle":"2024-05-31T16:13:26.663959Z","shell.execute_reply.started":"2024-05-31T16:13:17.135933Z","shell.execute_reply":"2024-05-31T16:13:26.662828Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"code","source":"\ntemp_df=pd.concat([pd.DataFrame(q1_arr,index=ndf1.index),pd.DataFrame(q2_arr,index=ndf1.index)],axis=1)\ntemp_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:26.665906Z","iopub.execute_input":"2024-05-31T16:13:26.666258Z","iopub.status.idle":"2024-05-31T16:13:34.625571Z","shell.execute_reply.started":"2024-05-31T16:13:26.666230Z","shell.execute_reply":"2024-05-31T16:13:34.624673Z"},"trusted":true},"execution_count":48,"outputs":[{"execution_count":48,"output_type":"execute_result","data":{"text/plain":"(200000, 2000)"},"metadata":{}}]},{"cell_type":"code","source":"q1_arr=\"\"\nq2_arr=\"\"","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:34.627158Z","iopub.execute_input":"2024-05-31T16:13:34.627867Z","iopub.status.idle":"2024-05-31T16:13:34.791511Z","shell.execute_reply.started":"2024-05-31T16:13:34.627828Z","shell.execute_reply":"2024-05-31T16:13:34.790413Z"},"trusted":true},"execution_count":49,"outputs":[]},{"cell_type":"code","source":"temp_df=pd.concat([ndf2,temp_df],axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:34.794367Z","iopub.execute_input":"2024-05-31T16:13:34.795113Z","iopub.status.idle":"2024-05-31T16:13:38.803374Z","shell.execute_reply.started":"2024-05-31T16:13:34.795080Z","shell.execute_reply":"2024-05-31T16:13:38.802436Z"},"trusted":true},"execution_count":50,"outputs":[]},{"cell_type":"code","source":"temp_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.804531Z","iopub.execute_input":"2024-05-31T16:13:38.804807Z","iopub.status.idle":"2024-05-31T16:13:38.810942Z","shell.execute_reply.started":"2024-05-31T16:13:38.804784Z","shell.execute_reply":"2024-05-31T16:13:38.809885Z"},"trusted":true},"execution_count":51,"outputs":[{"execution_count":51,"output_type":"execute_result","data":{"text/plain":"(200000, 2023)"},"metadata":{}}]},{"cell_type":"code","source":"temp_df['is_duplicate']","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.812454Z","iopub.execute_input":"2024-05-31T16:13:38.813079Z","iopub.status.idle":"2024-05-31T16:13:38.827266Z","shell.execute_reply.started":"2024-05-31T16:13:38.813018Z","shell.execute_reply":"2024-05-31T16:13:38.826128Z"},"trusted":true},"execution_count":52,"outputs":[{"execution_count":52,"output_type":"execute_result","data":{"text/plain":"0 0\n1 0\n2 0\n3 0\n4 0\n ..\n199996 0\n199997 0\n199998 1\n199999 0\n200000 0\nName: is_duplicate, Length: 200000, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"temp_df.columns = temp_df.columns.astype(str)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.828616Z","iopub.execute_input":"2024-05-31T16:13:38.829513Z","iopub.status.idle":"2024-05-31T16:13:38.838173Z","shell.execute_reply.started":"2024-05-31T16:13:38.829475Z","shell.execute_reply":"2024-05-31T16:13:38.837202Z"},"trusted":true},"execution_count":53,"outputs":[]},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.839553Z","iopub.execute_input":"2024-05-31T16:13:38.839956Z","iopub.status.idle":"2024-05-31T16:13:38.849624Z","shell.execute_reply.started":"2024-05-31T16:13:38.839921Z","shell.execute_reply":"2024-05-31T16:13:38.848631Z"},"trusted":true},"execution_count":54,"outputs":[]},{"cell_type":"code","source":"\nx_train,x_test,y_train,y_test=train_test_split(temp_df.drop(columns='is_duplicate'),temp_df['is_duplicate'],test_size=0.1,random_state=3)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.851064Z","iopub.execute_input":"2024-05-31T16:13:38.851910Z","iopub.status.idle":"2024-05-31T16:13:42.573240Z","shell.execute_reply.started":"2024-05-31T16:13:38.851873Z","shell.execute_reply":"2024-05-31T16:13:42.572007Z"},"trusted":true},"execution_count":55,"outputs":[]},{"cell_type":"code","source":"temp_df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:45.458273Z","iopub.execute_input":"2024-05-31T16:13:45.458942Z","iopub.status.idle":"2024-05-31T16:13:45.547209Z","shell.execute_reply.started":"2024-05-31T16:13:45.458905Z","shell.execute_reply":"2024-05-31T16:13:45.546132Z"},"trusted":true},"execution_count":56,"outputs":[{"execution_count":56,"output_type":"execute_result","data":{"text/plain":" is_duplicate q1_len q2_len q1_num_words q2_num_words word_common \\\n0 0 65 56 14 12 11 \n1 0 46 83 8 13 4 \n2 0 72 58 14 10 4 \n3 0 48 51 11 9 0 \n4 0 73 38 13 7 4 \n... ... ... ... ... ... ... \n199996 0 43 48 9 9 5 \n199997 0 23 37 5 7 5 \n199998 1 50 61 11 13 9 \n199999 0 38 44 8 9 8 \n200000 0 146 60 27 10 6 \n\n word_total word_share cwc_min cwc_max ... 990 991 992 993 \\\n0 23 0.48 0.999980 0.833319 ... 0.0 0.0 0.0 0.0 \n1 18 0.22 0.666644 0.249997 ... 0.0 0.0 0.0 0.0 \n2 24 0.17 0.399992 0.333328 ... 0.0 0.0 0.0 0.0 \n3 19 0.00 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 \n4 20 0.20 0.399992 0.199998 ... 0.0 0.0 0.0 0.0 \n... ... ... ... ... ... ... ... ... ... \n199996 18 0.28 0.749981 0.749981 ... 0.0 0.0 0.0 0.0 \n199997 12 0.42 0.999950 0.499988 ... 0.0 0.0 0.0 0.0 \n199998 21 0.43 0.833319 0.833319 ... 0.0 0.0 0.0 0.0 \n199999 17 0.47 0.999975 0.799984 ... 0.0 0.0 0.0 0.0 \n200000 34 0.18 0.666656 0.222221 ... 0.0 0.0 0.0 0.0 \n\n 994 995 996 997 998 999 \n0 0.0 0.0 0.0 0.000000 0.0 0.0 \n1 0.0 0.0 0.0 0.000000 0.0 0.0 \n2 0.0 0.0 0.0 0.000000 0.0 0.0 \n3 0.0 0.0 0.0 0.000000 0.0 0.0 \n4 0.0 0.0 0.0 0.000000 0.0 0.0 \n... ... ... ... ... ... ... \n199996 0.0 0.0 0.0 0.000000 0.0 0.0 \n199997 0.0 0.0 0.0 0.000000 0.0 0.0 \n199998 0.0 0.0 0.0 0.000000 0.0 0.0 \n199999 0.0 0.0 0.0 0.344384 0.0 0.0 \n200000 0.0 0.0 0.0 0.000000 0.0 0.0 \n\n[200000 rows x 2023 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
is_duplicateq1_lenq2_lenq1_num_wordsq2_num_wordsword_commonword_totalword_sharecwc_mincwc_max...990991992993994995996997998999
006556141211230.480.9999800.833319...0.00.00.00.00.00.00.00.0000000.00.0
1046838134180.220.6666440.249997...0.00.00.00.00.00.00.00.0000000.00.0
20725814104240.170.3999920.333328...0.00.00.00.00.00.00.00.0000000.00.0
3048511190190.000.0000000.000000...0.00.00.00.00.00.00.00.0000000.00.0
4073381374200.200.3999920.199998...0.00.00.00.00.00.00.00.0000000.00.0
..................................................................
19999604348995180.280.7499810.749981...0.00.00.00.00.00.00.00.0000000.00.0
19999702337575120.420.9999500.499988...0.00.00.00.00.00.00.00.0000000.00.0
1999981506111139210.430.8333190.833319...0.00.00.00.00.00.00.00.0000000.00.0
19999903844898170.470.9999750.799984...0.00.00.00.00.00.00.00.3443840.00.0
20000001466027106340.180.6666560.222221...0.00.00.00.00.00.00.00.0000000.00.0
\n

200000 rows × 2023 columns

\n
"},"metadata":{}}]},{"cell_type":"code","source":"temp_df.columns = temp_df.columns.astype(str)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:45.723629Z","iopub.execute_input":"2024-05-31T16:13:45.724341Z","iopub.status.idle":"2024-05-31T16:13:45.729720Z","shell.execute_reply.started":"2024-05-31T16:13:45.724307Z","shell.execute_reply":"2024-05-31T16:13:45.728559Z"},"trusted":true},"execution_count":57,"outputs":[]},{"cell_type":"code","source":"from sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:45.948658Z","iopub.execute_input":"2024-05-31T16:13:45.949487Z","iopub.status.idle":"2024-05-31T16:13:46.109297Z","shell.execute_reply.started":"2024-05-31T16:13:45.949447Z","shell.execute_reply":"2024-05-31T16:13:46.108278Z"},"trusted":true},"execution_count":58,"outputs":[]},{"cell_type":"code","source":"\nrf=RandomForestClassifier()\nrf.fit(x_train,y_train)\ny_pred=rf.predict(x_test)\naccuracy_score(y_test,y_pred)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:46.859901Z","iopub.execute_input":"2024-05-31T16:13:46.860985Z","iopub.status.idle":"2024-05-31T16:18:53.609846Z","shell.execute_reply.started":"2024-05-31T16:13:46.860950Z","shell.execute_reply":"2024-05-31T16:18:53.608761Z"},"trusted":true},"execution_count":59,"outputs":[{"execution_count":59,"output_type":"execute_result","data":{"text/plain":"0.8151"},"metadata":{}}]},{"cell_type":"code","source":"import pickle\nmodel_pkl_file = \"RF.pkl\" \n\nwith open(model_pkl_file, 'wb') as file: \n pickle.dump(rf, file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:53.611939Z","iopub.execute_input":"2024-05-31T16:18:53.612764Z","iopub.status.idle":"2024-05-31T16:18:54.365381Z","shell.execute_reply.started":"2024-05-31T16:18:53.612727Z","shell.execute_reply":"2024-05-31T16:18:54.364401Z"},"trusted":true},"execution_count":60,"outputs":[]},{"cell_type":"code","source":"model_pkl_file = \"BOW.pkl\" \n\nwith open(model_pkl_file, 'wb') as file: \n pickle.dump(cv, file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:54.366547Z","iopub.execute_input":"2024-05-31T16:18:54.366856Z","iopub.status.idle":"2024-05-31T16:18:54.400438Z","shell.execute_reply.started":"2024-05-31T16:18:54.366830Z","shell.execute_reply":"2024-05-31T16:18:54.399515Z"},"trusted":true},"execution_count":61,"outputs":[]},{"cell_type":"code","source":"import pickle\nwith open(\"/kaggle/working/BOW.pkl\", 'rb') as file: \n cv = pickle.load(file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:54.401578Z","iopub.execute_input":"2024-05-31T16:18:54.401866Z","iopub.status.idle":"2024-05-31T16:18:54.429997Z","shell.execute_reply.started":"2024-05-31T16:18:54.401842Z","shell.execute_reply":"2024-05-31T16:18:54.429102Z"},"trusted":true},"execution_count":62,"outputs":[]},{"cell_type":"code","source":"with open(\"/kaggle/working/RF.pkl\", 'rb') as file: \n rf = pickle.load(file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:54.432348Z","iopub.execute_input":"2024-05-31T16:18:54.432667Z","iopub.status.idle":"2024-05-31T16:18:55.121525Z","shell.execute_reply.started":"2024-05-31T16:18:54.432641Z","shell.execute_reply":"2024-05-31T16:18:55.120436Z"},"trusted":true},"execution_count":63,"outputs":[]},{"cell_type":"code","source":"df=pd.read_csv(\"/kaggle/input/quora-duplicate-questions-copy/train.csv\")\ndf=df.tail(204290)\ndf.dropna(inplace=True)\ndf.drop(columns=['id','qid1','qid2'],inplace=True)\ndf['question1']=df['question1'].apply(preprocess)\ndf['question2']=df['question2'].apply(preprocess)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:55.122843Z","iopub.execute_input":"2024-05-31T16:18:55.123148Z","iopub.status.idle":"2024-05-31T16:19:11.234145Z","shell.execute_reply.started":"2024-05-31T16:18:55.123124Z","shell.execute_reply":"2024-05-31T16:19:11.233103Z"},"trusted":true},"execution_count":64,"outputs":[]},{"cell_type":"code","source":"\ndf['q1_len']=df['question1'].str.len()\ndf['q2_len']=df['question2'].str.len()\ndf['q1_num_words']=df['question1'].apply(lambda row: len(row.split(\" \")))\ndf['q2_num_words']=df['question2'].apply(lambda row: len(row.split(\" \")))\ndef common_words(row):\n w1=set(map(lambda word: word.lower().strip(),row['question1'].split(\" \")))\n w2=set(map(lambda word: word.lower().strip(),row['question2'].split(\" \")))\n return len(w1 & w2)\ndef total_words(row):\n w1=set(map(lambda word: word.lower().strip(),row['question1'].split(\" \")))\n w2=set(map(lambda word: word.lower().strip(),row['question2'].split(\" \")))\n return len(w1) + len(w2)\n\ndf['word_common']=df.apply(common_words,axis=1)\ndf['word_total']=df.apply(total_words,axis=1)\ndf['word_share']=round(df['word_common']/df['word_total'],2)\n\n","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:19:11.235593Z","iopub.execute_input":"2024-05-31T16:19:11.236383Z","iopub.status.idle":"2024-05-31T16:19:20.397449Z","shell.execute_reply.started":"2024-05-31T16:19:11.236343Z","shell.execute_reply":"2024-05-31T16:19:20.396339Z"},"trusted":true},"execution_count":65,"outputs":[]},{"cell_type":"code","source":"token_features = df.apply(fetch_token_features, axis=1)\n\ndf[\"cwc_min\"] = list(map(lambda x: x[0], token_features))\ndf[\"cwc_max\"] = list(map(lambda x: x[1], token_features))\ndf[\"csc_min\"] = list(map(lambda x: x[2], token_features))\ndf[\"csc_max\"] = list(map(lambda x: x[3], token_features))\ndf[\"ctc_min\"] = list(map(lambda x: x[4], token_features))\ndf[\"ctc_max\"] = list(map(lambda x: x[5], token_features))\ndf[\"last_word_eq\"] = list(map(lambda x: x[6], token_features))\ndf[\"first_word_eq\"] = list(map(lambda x: x[7], token_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:19:20.398704Z","iopub.execute_input":"2024-05-31T16:19:20.399012Z","iopub.status.idle":"2024-05-31T16:20:13.569480Z","shell.execute_reply.started":"2024-05-31T16:19:20.398986Z","shell.execute_reply":"2024-05-31T16:20:13.568221Z"},"trusted":true},"execution_count":66,"outputs":[]},{"cell_type":"code","source":"length_features = df.apply(fetch_length_features, axis=1)\n\ndf['abs_len_diff'] = list(map(lambda x: x[0], length_features))\ndf['mean_len'] = list(map(lambda x: x[1], length_features))\ndf['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:20:13.570901Z","iopub.execute_input":"2024-05-31T16:20:13.571264Z","iopub.status.idle":"2024-05-31T16:22:55.402008Z","shell.execute_reply.started":"2024-05-31T16:20:13.571234Z","shell.execute_reply":"2024-05-31T16:22:55.400892Z"},"trusted":true},"execution_count":67,"outputs":[]},{"cell_type":"code","source":"fuzzy_features = df.apply(fetch_fuzzy_features, axis=1)\n\n# Creating new feature columns for fuzzy features\ndf['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))\ndf['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))\ndf['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))\ndf['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:22:55.403684Z","iopub.execute_input":"2024-05-31T16:22:55.403981Z","iopub.status.idle":"2024-05-31T16:32:18.985961Z","shell.execute_reply.started":"2024-05-31T16:22:55.403956Z","shell.execute_reply":"2024-05-31T16:32:18.984852Z"},"trusted":true},"execution_count":68,"outputs":[]},{"cell_type":"code","source":"ndf2=df.drop(columns=['question1','question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:18.987276Z","iopub.execute_input":"2024-05-31T16:32:18.987559Z","iopub.status.idle":"2024-05-31T16:32:19.008186Z","shell.execute_reply.started":"2024-05-31T16:32:18.987536Z","shell.execute_reply":"2024-05-31T16:32:19.007170Z"},"trusted":true},"execution_count":69,"outputs":[]},{"cell_type":"code","source":"questions=list(df['question1'])+list(df['question2'])\nq1_arr,q2_arr=np.vsplit(cv.fit_transform(questions).toarray(),2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:19.009434Z","iopub.execute_input":"2024-05-31T16:32:19.009730Z","iopub.status.idle":"2024-05-31T16:32:28.784256Z","shell.execute_reply.started":"2024-05-31T16:32:19.009705Z","shell.execute_reply":"2024-05-31T16:32:28.783401Z"},"trusted":true},"execution_count":70,"outputs":[]},{"cell_type":"code","source":"tenp_df=\"\"\nndf1=\"\"","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:28.785527Z","iopub.execute_input":"2024-05-31T16:32:28.785898Z","iopub.status.idle":"2024-05-31T16:32:28.794742Z","shell.execute_reply.started":"2024-05-31T16:32:28.785866Z","shell.execute_reply":"2024-05-31T16:32:28.793759Z"},"trusted":true},"execution_count":71,"outputs":[]},{"cell_type":"code","source":"\ntemp_df=pd.concat([pd.DataFrame(q1_arr,index=ndf2.index),pd.DataFrame(q2_arr,index=ndf2.index)],axis=1)\ntemp_df.shape\n","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:28.798176Z","iopub.execute_input":"2024-05-31T16:32:28.798534Z","iopub.status.idle":"2024-05-31T16:32:37.549598Z","shell.execute_reply.started":"2024-05-31T16:32:28.798507Z","shell.execute_reply":"2024-05-31T16:32:37.548558Z"},"trusted":true},"execution_count":72,"outputs":[{"execution_count":72,"output_type":"execute_result","data":{"text/plain":"(204288, 2000)"},"metadata":{}}]},{"cell_type":"code","source":"q1_arr=\"\"\nq2_arr=\"\"","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:37.550761Z","iopub.execute_input":"2024-05-31T16:32:37.551083Z","iopub.status.idle":"2024-05-31T16:32:37.719975Z","shell.execute_reply.started":"2024-05-31T16:32:37.551032Z","shell.execute_reply":"2024-05-31T16:32:37.718778Z"},"trusted":true},"execution_count":73,"outputs":[]},{"cell_type":"code","source":"temp_df=pd.concat([ndf2,temp_df],axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:37.721428Z","iopub.execute_input":"2024-05-31T16:32:37.721758Z","iopub.status.idle":"2024-05-31T16:32:41.807913Z","shell.execute_reply.started":"2024-05-31T16:32:37.721730Z","shell.execute_reply":"2024-05-31T16:32:41.807000Z"},"trusted":true},"execution_count":74,"outputs":[]},{"cell_type":"code","source":"temp_df.columns = temp_df.columns.astype(str)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:41.808947Z","iopub.execute_input":"2024-05-31T16:32:41.809256Z","iopub.status.idle":"2024-05-31T16:32:41.814768Z","shell.execute_reply.started":"2024-05-31T16:32:41.809230Z","shell.execute_reply":"2024-05-31T16:32:41.813818Z"},"trusted":true},"execution_count":75,"outputs":[]},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split\nx_train,x_test,y_train,y_test=train_test_split(temp_df.drop(columns='is_duplicate'),temp_df['is_duplicate'],test_size=0.1,random_state=3)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:41.815878Z","iopub.execute_input":"2024-05-31T16:32:41.816183Z","iopub.status.idle":"2024-05-31T16:32:45.660976Z","shell.execute_reply.started":"2024-05-31T16:32:41.816159Z","shell.execute_reply":"2024-05-31T16:32:45.659752Z"},"trusted":true},"execution_count":76,"outputs":[]},{"cell_type":"code","source":"rf.fit(x_train,y_train)\ny_pred=rf.predict(x_test)\naccuracy_score(y_test,y_pred)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:45.662778Z","iopub.execute_input":"2024-05-31T16:32:45.663346Z","iopub.status.idle":"2024-05-31T16:37:57.160516Z","shell.execute_reply.started":"2024-05-31T16:32:45.663299Z","shell.execute_reply":"2024-05-31T16:37:57.159407Z"},"trusted":true},"execution_count":77,"outputs":[{"execution_count":77,"output_type":"execute_result","data":{"text/plain":"0.8166821675069754"},"metadata":{}}]},{"cell_type":"code","source":"model_pkl_file = \"RF.pkl\" \n\nwith open(model_pkl_file, 'wb') as file: \n pickle.dump(rf, file)\n \nmodel_pkl_file = \"BOW.pkl\" \n\nwith open(model_pkl_file, 'wb') as file: \n pickle.dump(cv, file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:37:57.161922Z","iopub.execute_input":"2024-05-31T16:37:57.162258Z","iopub.status.idle":"2024-05-31T16:37:58.112704Z","shell.execute_reply.started":"2024-05-31T16:37:57.162230Z","shell.execute_reply":"2024-05-31T16:37:58.111599Z"},"trusted":true},"execution_count":78,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]} \ No newline at end of file