File size: 113,500 Bytes
79f04c1
1
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":8565891,"sourceType":"datasetVersion","datasetId":5120988}],"dockerImageVersionId":30716,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:18.221392Z","iopub.execute_input":"2024-05-31T15:59:18.221694Z","iopub.status.idle":"2024-05-31T15:59:20.313965Z","shell.execute_reply.started":"2024-05-31T15:59:18.221668Z","shell.execute_reply":"2024-05-31T15:59:20.312942Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"df=pd.read_csv(\"/kaggle/input/quora-duplicate-questions-copy/train.csv\")","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:20.316233Z","iopub.execute_input":"2024-05-31T15:59:20.316770Z","iopub.status.idle":"2024-05-31T15:59:22.476112Z","shell.execute_reply.started":"2024-05-31T15:59:20.316735Z","shell.execute_reply":"2024-05-31T15:59:22.475105Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"df.shape","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.477876Z","iopub.execute_input":"2024-05-31T15:59:22.478273Z","iopub.status.idle":"2024-05-31T15:59:22.487296Z","shell.execute_reply.started":"2024-05-31T15:59:22.478236Z","shell.execute_reply":"2024-05-31T15:59:22.486116Z"},"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"(404290, 
6)"},"metadata":{}}]},{"cell_type":"code","source":"df.head()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.488561Z","iopub.execute_input":"2024-05-31T15:59:22.488954Z","iopub.status.idle":"2024-05-31T15:59:22.514564Z","shell.execute_reply.started":"2024-05-31T15:59:22.488922Z","shell.execute_reply":"2024-05-31T15:59:22.513359Z"},"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":"   id  qid1  qid2                                          question1  \\\n0   0     1     2  What is the step by step guide to invest in sh...   \n1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   \n2   2     5     6  How can I increase the speed of my internet co...   \n3   3     7     8  Why am I mentally very lonely? How can I solve...   \n4   4     9    10  Which one dissolve in water quikly sugar, salt...   \n\n                                           question2  is_duplicate  \n0  What is the step by step guide to invest in sh...             0  \n1  What would happen if the Indian government sto...             0  \n2  How can Internet speed be increased by hacking...             0  \n3  Find the remainder when [math]23^{24}[/math] i...             0  \n4            Which fish would survive in salt water?             
0  ","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>qid1</th>\n      <th>qid2</th>\n      <th>question1</th>\n      <th>question2</th>\n      <th>is_duplicate</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>1</td>\n      <td>2</td>\n      <td>What is the step by step guide to invest in sh...</td>\n      <td>What is the step by step guide to invest in sh...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>3</td>\n      <td>4</td>\n      <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n      <td>What would happen if the Indian government sto...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>5</td>\n      <td>6</td>\n      <td>How can I increase the speed of my internet co...</td>\n      <td>How can Internet speed be increased by hacking...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>7</td>\n      <td>8</td>\n      <td>Why am I mentally very lonely? 
How can I solve...</td>\n      <td>Find the remainder when [math]23^{24}[/math] i...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>9</td>\n      <td>10</td>\n      <td>Which one dissolve in water quikly sugar, salt...</td>\n      <td>Which fish would survive in salt water?</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"df.isnull().sum()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.517703Z","iopub.execute_input":"2024-05-31T15:59:22.518337Z","iopub.status.idle":"2024-05-31T15:59:22.596705Z","shell.execute_reply.started":"2024-05-31T15:59:22.518297Z","shell.execute_reply":"2024-05-31T15:59:22.595501Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":"id              0\nqid1            0\nqid2            0\nquestion1       1\nquestion2       2\nis_duplicate    0\ndtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"df.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.598173Z","iopub.execute_input":"2024-05-31T15:59:22.598466Z","iopub.status.idle":"2024-05-31T15:59:22.736840Z","shell.execute_reply.started":"2024-05-31T15:59:22.598442Z","shell.execute_reply":"2024-05-31T15:59:22.735647Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"df.duplicated().sum()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.741851Z","iopub.execute_input":"2024-05-31T15:59:22.744101Z","iopub.status.idle":"2024-05-31T15:59:23.191640Z","shell.execute_reply.started":"2024-05-31T15:59:22.744061Z","shell.execute_reply":"2024-05-31T15:59:23.190396Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"0"},"metadata":{}}]},{"cell_type":"code","source":"df=df.head(200000)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59
:23.192948Z","iopub.execute_input":"2024-05-31T15:59:23.193273Z","iopub.status.idle":"2024-05-31T15:59:23.198226Z","shell.execute_reply.started":"2024-05-31T15:59:23.193238Z","shell.execute_reply":"2024-05-31T15:59:23.197248Z"},"trusted":true},"execution_count":8,"outputs":[]},
{"cell_type":"code","source":"import re\nimport string\n\n# Lookup table for whole-token English contractions. Built once when the cell\n# runs instead of on every preprocess() call.\n_CONTRACTIONS = {\n    \"ain't\": \"am not\",\n    \"aren't\": \"are not\",\n    \"can't\": \"can not\",\n    \"can't've\": \"can not have\",\n    \"'cause\": \"because\",\n    \"could've\": \"could have\",\n    \"couldn't\": \"could not\",\n    \"couldn't've\": \"could not have\",\n    \"didn't\": \"did not\",\n    \"doesn't\": \"does not\",\n    \"don't\": \"do not\",\n    \"hadn't\": \"had not\",\n    \"hadn't've\": \"had not have\",\n    \"hasn't\": \"has not\",\n    \"haven't\": \"have not\",\n    \"he'd\": \"he would\",\n    \"he'd've\": \"he would have\",\n    \"he'll\": \"he will\",\n    \"he'll've\": \"he will have\",\n    \"he's\": \"he is\",\n    \"how'd\": \"how did\",\n    \"how'd'y\": \"how do you\",\n    \"how'll\": \"how will\",\n    \"how's\": \"how is\",\n    \"i'd\": \"i would\",\n    \"i'd've\": \"i would have\",\n    \"i'll\": \"i will\",\n    \"i'll've\": \"i will have\",\n    \"i'm\": \"i am\",\n    \"i've\": \"i have\",\n    \"isn't\": \"is not\",\n    \"it'd\": \"it would\",\n    \"it'd've\": \"it would have\",\n    \"it'll\": \"it will\",\n    \"it'll've\": \"it will have\",\n    \"it's\": \"it is\",\n    \"let's\": \"let us\",\n    \"ma'am\": \"madam\",\n    \"mayn't\": \"may not\",\n    \"might've\": \"might have\",\n    \"mightn't\": \"might not\",\n    \"mightn't've\": \"might not have\",\n    \"must've\": \"must have\",\n    \"mustn't\": \"must not\",\n    \"mustn't've\": \"must not have\",\n    \"needn't\": \"need not\",\n    \"needn't've\": \"need not have\",\n    \"o'clock\": \"of the clock\",\n    \"oughtn't\": \"ought not\",\n    \"oughtn't've\": \"ought not have\",\n    \"shan't\": \"shall not\",\n    \"sha'n't\": \"shall not\",\n    \"shan't've\": \"shall not have\",\n    \"she'd\": \"she would\",\n    \"she'd've\": \"she would have\",\n    \"she'll\": \"she will\",\n    \"she'll've\": \"she will have\",\n    \"she's\": \"she is\",\n    \"should've\": \"should have\",\n    \"shouldn't\": \"should not\",\n    \"shouldn't've\": \"should not have\",\n    \"so've\": \"so have\",\n    \"so's\": \"so as\",\n    \"that'd\": \"that would\",\n    \"that'd've\": \"that would have\",\n    \"that's\": \"that is\",\n    \"there'd\": \"there would\",\n    \"there'd've\": \"there would have\",\n    \"there's\": \"there is\",\n    \"they'd\": \"they would\",\n    \"they'd've\": \"they would have\",\n    \"they'll\": \"they will\",\n    \"they'll've\": \"they will have\",\n    \"they're\": \"they are\",\n    \"they've\": \"they have\",\n    \"to've\": \"to have\",\n    \"wasn't\": \"was not\",\n    \"we'd\": \"we would\",\n    \"we'd've\": \"we would have\",\n    \"we'll\": \"we will\",\n    \"we'll've\": \"we will have\",\n    \"we're\": \"we are\",\n    \"we've\": \"we have\",\n    \"weren't\": \"were not\",\n    \"what'll\": \"what will\",\n    \"what'll've\": \"what will have\",\n    \"what're\": \"what are\",\n    \"what's\": \"what is\",\n    \"what've\": \"what have\",\n    \"when's\": \"when is\",\n    \"when've\": \"when have\",\n    \"where'd\": \"where did\",\n    \"where's\": \"where is\",\n    \"where've\": \"where have\",\n    \"who'll\": \"who will\",\n    \"who'll've\": \"who will have\",\n    \"who's\": \"who is\",\n    \"who've\": \"who have\",\n    \"why's\": \"why is\",\n    \"why've\": \"why have\",\n    \"will've\": \"will have\",\n    \"won't\": \"will not\",\n    \"won't've\": \"will not have\",\n    \"would've\": \"would have\",\n    \"wouldn't\": \"would not\",\n    \"wouldn't've\": \"would not have\",\n    \"y'all\": \"you all\",\n    \"y'all'd\": \"you all would\",\n    \"y'all'd've\": \"you all would have\",\n    \"y'all're\": \"you all are\",\n    \"y'all've\": \"you all have\",\n    \"you'd\": \"you would\",\n    \"you'd've\": \"you would have\",\n    \"you'll\": \"you will\",\n    \"you'll've\": \"you will have\",\n    \"you're\": \"you are\",\n    \"you've\": \"you have\"\n}\n\n# Non-greedy match for HTML/XML tags such as <b> or </b>.\n_HTML_TAG_RE = re.compile('<.*?>')\n# Translation table that deletes every ASCII punctuation character.\n_PUNCT_TABLE = str.maketrans('', '', string.punctuation)\n\n\ndef preprocess(q):\n    \"\"\"Normalise one question string before feature extraction.\n\n    Steps, in order: lowercase and strip; spell out %, @ and $; drop the\n    [math] and [/math] markers; abbreviate round thousands, millions and\n    billions to k, m and b; expand English contractions; remove HTML tags;\n    delete ASCII punctuation; collapse whitespace.\n\n    Parameters\n    ----------\n    q : object\n        Raw question. It is converted with str() first, so non-string\n        values are accepted.\n\n    Returns\n    -------\n    str\n        Lowercase text whose words are separated by single spaces, with no\n        leading or trailing whitespace.\n    \"\"\"\n    q = str(q).lower().strip()\n\n    # Spell out symbols that carry meaning.\n    q = q.replace('%', ' percent ')\n    q = q.replace('@', ' at ')\n    q = q.replace('$', ' dollar ')\n\n    # Drop the opening AND the closing marker. Removing only '[math]' left\n    # '[/math]' behind, which punctuation stripping then reduced to a stray\n    # 'math' glued to the formula.\n    q = q.replace('[math]', '').replace('[/math]', '')\n\n    # Abbreviate comma-grouped round numbers that are followed by a space.\n    q = q.replace(',000,000,000 ', 'b ')\n    q = q.replace(',000,000 ', 'm ')\n    q = q.replace(',000 ', 'k ')\n\n    # Same for plain digit runs. The (?![0-9]) lookahead stops the zeros from\n    # matching inside a longer number, so 20001 is no longer turned into 2k1.\n    q = re.sub(r'([0-9]+)000000000(?![0-9])', r'\\1b', q)\n    q = re.sub(r'([0-9]+)000000(?![0-9])', r'\\1m', q)\n    q = re.sub(r'([0-9]+)000(?![0-9])', r'\\1k', q)\n\n    # Expand whole-token contractions; split()/join also collapses whitespace.\n    q = ' '.join(_CONTRACTIONS.get(word, word) for word in q.split())\n    # Catch the common suffixes on words that are not in the table.\n    q = q.replace(\"'ve\", ' have')\n    q = q.replace(\"n't\", ' not')\n    q = q.replace(\"'re\", ' are')\n    q = q.replace(\"'ll\", ' will')\n\n    q = _HTML_TAG_RE.sub('', q)\n    q = q.translate(_PUNCT_TABLE)\n\n    # Removing tags and free-standing punctuation can leave double or trailing\n    # spaces; collapse them so split-based features never see empty tokens.\n    return ' '.join(q.split())","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.199690Z","iopub.execute_input":"2024-05-31T15:59:23.199977Z","iopub.status.idle":"2024-05-31T15:59:23.217455Z","shell.execute_reply.started":"2024-05-31T15:59:23.199952Z","shell.execute_reply":"2024-05-31T15:59:23.216322Z"},"trusted":true},"execution_count":9,"outputs":[]},
{"cell_type":"code","source":"df['is_duplicate'].value_counts().plot(kind='bar')","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.218752Z","iopub.execute_input":"2024-05-31T15:59:23.219131Z","iopub.status.idle":"2024-05-31T15:59:23.586693Z","shell.execute_reply.started":"2024-05-31T15:59:23.219097Z","shell.execute_reply":"2024-05-31T15:59:23.585550Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"<Axes: xlabel='is_duplicate'>"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<Figure size 640x480 with 1 Axes>","image/png":""},"metadata":{}}]},
{"cell_type":"code","source":"qid=pd.Series(df['qid1'].tolist()+df['qid2'].tolist())","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.588197Z","iopub.execute_input":"2024-05-31T15:59:23.588543Z","iopub.status.idle":"2024-05-31T15:59:23.745422Z","shell.execute_reply.started":"2024-05-31T15:59:23.588515Z","shell.execute_reply":"2024-05-31T15:59:23.743914Z"},"trusted":true},"execution_count":11,"outputs":[]},
{"cell_type":"code","source":"np.unique(qid).shape[0]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.747034Z","iopub.execute_input":"2024-05-31T15:59:23.747847Z","iopub.status.idle":"2024-05-31T15:59:23.793227Z","shell.execute_reply.started":"2024-05-31T15:59:23.747809Z","shell.execute_reply":"2024-05-31T15:59:23.791954Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"301629"},"metadata":{}}]},
{"cell_type":"code","source":"df['question1']=df['question1'].apply(prepro
cess)\ndf['question2']=df['question2'].apply(preprocess)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:23.794871Z","iopub.execute_input":"2024-05-31T15:59:23.795807Z","iopub.status.idle":"2024-05-31T15:59:38.168654Z","shell.execute_reply.started":"2024-05-31T15:59:23.795769Z","shell.execute_reply":"2024-05-31T15:59:38.167619Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"code","source":"qid.shape[0]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.174320Z","iopub.execute_input":"2024-05-31T15:59:38.174626Z","iopub.status.idle":"2024-05-31T15:59:38.181066Z","shell.execute_reply.started":"2024-05-31T15:59:38.174601Z","shell.execute_reply":"2024-05-31T15:59:38.180099Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"400000"},"metadata":{}}]},{"cell_type":"code","source":"x=qid.value_counts()>1","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.182483Z","iopub.execute_input":"2024-05-31T15:59:38.182810Z","iopub.status.idle":"2024-05-31T15:59:38.222431Z","shell.execute_reply.started":"2024-05-31T15:59:38.182777Z","shell.execute_reply":"2024-05-31T15:59:38.221528Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"x[x]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.223680Z","iopub.execute_input":"2024-05-31T15:59:38.223986Z","iopub.status.idle":"2024-05-31T15:59:38.232481Z","shell.execute_reply.started":"2024-05-31T15:59:38.223961Z","shell.execute_reply":"2024-05-31T15:59:38.231296Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"2559      True\n4044      True\n30782     True\n17978     True\n2561      True\n          ... 
\n41258     True\n64963     True\n22576     True\n141425    True\n47459     True\nName: count, Length: 47906, dtype: bool"},"metadata":{}}]},{"cell_type":"code","source":"x[x].shape[0]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.233700Z","iopub.execute_input":"2024-05-31T15:59:38.234161Z","iopub.status.idle":"2024-05-31T15:59:38.242383Z","shell.execute_reply.started":"2024-05-31T15:59:38.234134Z","shell.execute_reply":"2024-05-31T15:59:38.241358Z"},"trusted":true},"execution_count":17,"outputs":[{"execution_count":17,"output_type":"execute_result","data":{"text/plain":"47906"},"metadata":{}}]},{"cell_type":"code","source":"plt.hist(qid.value_counts().values,bins=100)\nplt.yscale('log')\nplt.show()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.243819Z","iopub.execute_input":"2024-05-31T15:59:38.244313Z","iopub.status.idle":"2024-05-31T15:59:38.987011Z","shell.execute_reply.started":"2024-05-31T15:59:38.244277Z","shell.execute_reply":"2024-05-31T15:59:38.986059Z"},"trusted":true},"execution_count":18,"outputs":[{"output_type":"display_data","data":{"text/plain":"<Figure size 640x480 with 1 
Axes>","image/png":""},"metadata":{}}]},{"cell_type":"code","source":"df.drop(columns=['id','qid1','qid2'],inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:38.988119Z","iopub.execute_input":"2024-05-31T15:59:38.988422Z","iopub.status.idle":"2024-05-31T15:59:39.009014Z","shell.execute_reply.started":"2024-05-31T15:59:38.988396Z","shell.execute_reply":"2024-05-31T15:59:39.008163Z"},"trusted":true},"execution_count":19,"outputs":[]},{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.010182Z","iopub.execute_input":"2024-05-31T15:59:39.010484Z","iopub.status.idle":"2024-05-31T15:59:39.023858Z","shell.execute_reply.started":"2024-05-31T15:59:39.010459Z","shell.execute_reply":"2024-05-31T15:59:39.022788Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":"                                                question1  \\\n0       what is the step by step guide to invest in sh...   \n1          what is the story of kohinoor kohinoor diamond   \n2       how can i increase the speed of my internet co...   \n3        why am i mentally very lonely how can i solve it   \n4       which one dissolve in water quikly sugar salt ...   \n...                                                   ...   \n199996        which of these tv shows should i watch next   \n199997                            should i change my name   \n199998  should i buy the new macbook 2016 or one from ...   \n199999             what is your review of love 2011 movie   \n200000  can pakistan hit indian air craft carrier in a...   \n\n                                                question2  is_duplicate  \n0       what is the step by step guide to invest in sh...             0  \n1       what would happen if the indian government sto...             0  \n2       how can internet speed be increased by hacking...             
0  \n3       find the remainder when 2324math is divided by...             0  \n4                  which fish would survive in salt water             0  \n...                                                   ...           ...  \n199996   what are some thriller shows i should watch next             0  \n199997              should i legally change my first name             0  \n199998  should i buy the new macbook pro 2016 or the m...             1  \n199999       what is your review of love birds 2011 movie             0  \n200000  can pakistan destroy an indian aircraft carrie...             0  \n\n[200000 rows x 3 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>question1</th>\n      <th>question2</th>\n      <th>is_duplicate</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>what is the step by step guide to invest in sh...</td>\n      <td>what is the step by step guide to invest in sh...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>what is the story of kohinoor kohinoor diamond</td>\n      <td>what would happen if the indian government sto...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>how can i increase the speed of my internet co...</td>\n      <td>how can internet speed be increased by hacking...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>why am i mentally very lonely how can i solve it</td>\n      <td>find the remainder when 2324math is divided by...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>which one dissolve in water quikly sugar salt ...</td>\n      <td>which fish would 
survive in salt water</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>199996</th>\n      <td>which of these tv shows should i watch next</td>\n      <td>what are some thriller shows i should watch next</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>199997</th>\n      <td>should i change my name</td>\n      <td>should i legally change my first name</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>199998</th>\n      <td>should i buy the new macbook 2016 or one from ...</td>\n      <td>should i buy the new macbook pro 2016 or the m...</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>199999</th>\n      <td>what is your review of love 2011 movie</td>\n      <td>what is your review of love birds 2011 movie</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>200000</th>\n      <td>can pakistan hit indian air craft carrier in a...</td>\n      <td>can pakistan destroy an indian aircraft carrie...</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n<p>200000 rows × 3 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"df['q1_len']=df['question1'].str.len()\ndf['q2_len']=df['question2'].str.len()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.025425Z","iopub.execute_input":"2024-05-31T15:59:39.025810Z","iopub.status.idle":"2024-05-31T15:59:39.159384Z","shell.execute_reply.started":"2024-05-31T15:59:39.025774Z","shell.execute_reply":"2024-05-31T15:59:39.158511Z"},"trusted":true},"execution_count":21,"outputs":[]},{"cell_type":"code","source":"df['q1_num_words']=df['question1'].apply(lambda row: len(row.split(\" \")))\ndf['q2_num_words']=df['question2'].apply(lambda row: len(row.split(\" 
\")))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.160903Z","iopub.execute_input":"2024-05-31T15:59:39.161222Z","iopub.status.idle":"2024-05-31T15:59:39.633424Z","shell.execute_reply.started":"2024-05-31T15:59:39.161194Z","shell.execute_reply":"2024-05-31T15:59:39.632355Z"},"trusted":true},"execution_count":22,"outputs":[]},
{"cell_type":"code","source":"def common_words(row):\n    \"\"\"Return how many distinct tokens question1 and question2 share.\n\n    Each text is split on single space characters; every piece is lowercased\n    and stripped before the two token sets are intersected. Consecutive or\n    trailing spaces therefore produce an empty-string token, which counts as\n    shared when it occurs in both questions.\n\n    Parameters\n    ----------\n    row : mapping with string values under 'question1' and 'question2'\n        (for example one DataFrame row passed by df.apply(..., axis=1)).\n\n    Returns\n    -------\n    int\n        Size of the intersection of the two token sets.\n    \"\"\"\n    tokens_q1 = {piece.lower().strip() for piece in row['question1'].split(' ')}\n    tokens_q2 = {piece.lower().strip() for piece in row['question2'].split(' ')}\n    shared = tokens_q1.intersection(tokens_q2)\n    return len(shared)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.634722Z","iopub.execute_input":"2024-05-31T15:59:39.635031Z","iopub.status.idle":"2024-05-31T15:59:39.641017Z","shell.execute_reply.started":"2024-05-31T15:59:39.635005Z","shell.execute_reply":"2024-05-31T15:59:39.639990Z"},"trusted":true},"execution_count":23,"outputs":[]},
{"cell_type":"code","source":"def total_words(row):\n    \"\"\"Return the sum of the distinct-token counts of question1 and question2.\n\n    Each text is split on single space characters and every piece is\n    lowercased and stripped. A token that occurs in both questions is counted\n    once per question, and an empty-string token (from consecutive or\n    trailing spaces, or an empty question) is counted like any other.\n\n    Parameters\n    ----------\n    row : mapping with string values under 'question1' and 'question2'\n        (for example one DataFrame row passed by df.apply(..., axis=1)).\n\n    Returns\n    -------\n    int\n        Number of distinct tokens in question1 plus the number in question2.\n    \"\"\"\n    sizes = [\n        len({piece.lower().strip() for piece in row[column].split(' ')})\n        for column in ('question1', 'question2')\n    ]\n    return sum(sizes)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.642318Z","iopub.execute_input":"2024-05-31T15:59:39.642632Z","iopub.status.idle":"2024-05-31T15:59:39.651761Z","shell.execute_reply.started":"2024-05-31T15:59:39.642600Z","shell.execute_reply":"2024-05-31T15:59:39.650721Z"},"trusted":true},"execution_count":24,"outputs":[]},
{"cell_type":"code","source":"df['word_common']=df.apply(common_words,axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:39.652945Z","iopub.execute_input":"2024-05-31T15:59:39.653278Z","iopub.status.idle":"2024-05-31T15:59:43.993662Z","shell.execute_reply.started":"2024-05-31T15:59:39.653253Z","shell.execute_reply":"2024-05-31T15:59:43.992797Z"},"trusted":true},"execution_count":25,"outputs":[]},
{"cell_type":"code","source":"df['word_total']=df.apply(total_words,axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:43.994773Z","iopub.execute_input":"2024-05-31T15:59:43.995058Z","iopub.status.idle":"2024-05-31T15:59:48.205245Z","shell.execute_reply.started":"2024-05-31T15:59:43.995018Z","shell.execute_reply":"2024-05-31T15:59:48.204131Z"},"trusted":true},"execution_count":26,"outputs":[]},
{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.206484Z","iopub.execute_input":"2024-05-31T15:59:48.206787Z","iopub.status.idle":"2024-05-31T15:59:48.221588Z","shell.execute_reply.started":"2024-05-31T15:59:48.206762Z","shell.execute_reply":"2024-05-31T15:59:48.220559Z"},"trusted":true},"execution_count":27,"outputs":[{"execution_count":27,"output_type":"execute_result","data":{"text/plain":"                                                question1  \\\n0       what is the step by step guide to invest in sh...   \n1          what is the story of kohinoor kohinoor diamond   \n2       how can i increase the speed of my internet co...   
\n3        why am i mentally very lonely how can i solve it   \n4       which one dissolve in water quikly sugar salt ...   \n...                                                   ...   \n199996        which of these tv shows should i watch next   \n199997                            should i change my name   \n199998  should i buy the new macbook 2016 or one from ...   \n199999             what is your review of love 2011 movie   \n200000  can pakistan hit indian air craft carrier in a...   \n\n                                                question2  is_duplicate  \\\n0       what is the step by step guide to invest in sh...             0   \n1       what would happen if the indian government sto...             0   \n2       how can internet speed be increased by hacking...             0   \n3       find the remainder when 2324math is divided by...             0   \n4                  which fish would survive in salt water             0   \n...                                                   ...           ...   \n199996   what are some thriller shows i should watch next             0   \n199997              should i legally change my first name             0   \n199998  should i buy the new macbook pro 2016 or the m...             1   \n199999       what is your review of love birds 2011 movie             0   \n200000  can pakistan destroy an indian aircraft carrie...             0   \n\n        q1_len  q2_len  q1_num_words  q2_num_words  word_common  word_total  \n0           65      56            14            12           11          23  \n1           46      83             8            13            4          18  \n2           72      58            14            10            4          24  \n3           48      51            11             9            0          19  \n4           73      38            13             7            4          20  \n...        ...     ...           ...           ...          ...         ...  
\n199996      43      48             9             9            5          18  \n199997      23      37             5             7            5          12  \n199998      50      61            11            13            9          21  \n199999      38      44             8             9            8          17  \n200000     146      60            27            10            6          34  \n\n[200000 rows x 9 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>question1</th>\n      <th>question2</th>\n      <th>is_duplicate</th>\n      <th>q1_len</th>\n      <th>q2_len</th>\n      <th>q1_num_words</th>\n      <th>q2_num_words</th>\n      <th>word_common</th>\n      <th>word_total</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>what is the step by step guide to invest in sh...</td>\n      <td>what is the step by step guide to invest in sh...</td>\n      <td>0</td>\n      <td>65</td>\n      <td>56</td>\n      <td>14</td>\n      <td>12</td>\n      <td>11</td>\n      <td>23</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>what is the story of kohinoor kohinoor diamond</td>\n      <td>what would happen if the indian government sto...</td>\n      <td>0</td>\n      <td>46</td>\n      <td>83</td>\n      <td>8</td>\n      <td>13</td>\n      <td>4</td>\n      <td>18</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>how can i increase the speed of my internet co...</td>\n      <td>how can internet speed be increased by hacking...</td>\n      <td>0</td>\n      <td>72</td>\n      <td>58</td>\n      <td>14</td>\n      <td>10</td>\n      <td>4</td>\n      <td>24</td>\n    </tr>\n    <tr>\n      
<th>3</th>\n      <td>why am i mentally very lonely how can i solve it</td>\n      <td>find the remainder when 2324math is divided by...</td>\n      <td>0</td>\n      <td>48</td>\n      <td>51</td>\n      <td>11</td>\n      <td>9</td>\n      <td>0</td>\n      <td>19</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>which one dissolve in water quikly sugar salt ...</td>\n      <td>which fish would survive in salt water</td>\n      <td>0</td>\n      <td>73</td>\n      <td>38</td>\n      <td>13</td>\n      <td>7</td>\n      <td>4</td>\n      <td>20</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>199996</th>\n      <td>which of these tv shows should i watch next</td>\n      <td>what are some thriller shows i should watch next</td>\n      <td>0</td>\n      <td>43</td>\n      <td>48</td>\n      <td>9</td>\n      <td>9</td>\n      <td>5</td>\n      <td>18</td>\n    </tr>\n    <tr>\n      <th>199997</th>\n      <td>should i change my name</td>\n      <td>should i legally change my first name</td>\n      <td>0</td>\n      <td>23</td>\n      <td>37</td>\n      <td>5</td>\n      <td>7</td>\n      <td>5</td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <th>199998</th>\n      <td>should i buy the new macbook 2016 or one from ...</td>\n      <td>should i buy the new macbook pro 2016 or the m...</td>\n      <td>1</td>\n      <td>50</td>\n      <td>61</td>\n      <td>11</td>\n      <td>13</td>\n      <td>9</td>\n      <td>21</td>\n    </tr>\n    <tr>\n      <th>199999</th>\n      <td>what is your review of love 2011 movie</td>\n      <td>what is your review of love birds 2011 movie</td>\n      <td>0</td>\n      <td>38</td>\n      <td>44</td>\n      <td>8</td>\n      <td>9</td>\n      <td>8</td>\n      <td>17</td>\n    </tr>\n    <tr>\n      <th>200000</th>\n      
<td>can pakistan hit indian air craft carrier in a...</td>\n      <td>can pakistan destroy an indian aircraft carrie...</td>\n      <td>0</td>\n      <td>146</td>\n      <td>60</td>\n      <td>27</td>\n      <td>10</td>\n      <td>6</td>\n      <td>34</td>\n    </tr>\n  </tbody>\n</table>\n<p>200000 rows × 9 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"df['word_share']=round(df['word_common']/df['word_total'],2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.223003Z","iopub.execute_input":"2024-05-31T15:59:48.223347Z","iopub.status.idle":"2024-05-31T15:59:48.235292Z","shell.execute_reply.started":"2024-05-31T15:59:48.223321Z","shell.execute_reply":"2024-05-31T15:59:48.234269Z"},"trusted":true},"execution_count":28,"outputs":[]},{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.236483Z","iopub.execute_input":"2024-05-31T15:59:48.236780Z","iopub.status.idle":"2024-05-31T15:59:48.255263Z","shell.execute_reply.started":"2024-05-31T15:59:48.236750Z","shell.execute_reply":"2024-05-31T15:59:48.254206Z"},"trusted":true},"execution_count":29,"outputs":[{"execution_count":29,"output_type":"execute_result","data":{"text/plain":"                                                question1  \\\n0       what is the step by step guide to invest in sh...   \n1          what is the story of kohinoor kohinoor diamond   \n2       how can i increase the speed of my internet co...   \n3        why am i mentally very lonely how can i solve it   \n4       which one dissolve in water quikly sugar salt ...   \n...                                                   ...   \n199996        which of these tv shows should i watch next   \n199997                            should i change my name   \n199998  should i buy the new macbook 2016 or one from ...   \n199999             what is your review of love 2011 movie   \n200000  can pakistan hit indian air craft carrier in a...   
\n\n                                                question2  is_duplicate  \\\n0       what is the step by step guide to invest in sh...             0   \n1       what would happen if the indian government sto...             0   \n2       how can internet speed be increased by hacking...             0   \n3       find the remainder when 2324math is divided by...             0   \n4                  which fish would survive in salt water             0   \n...                                                   ...           ...   \n199996   what are some thriller shows i should watch next             0   \n199997              should i legally change my first name             0   \n199998  should i buy the new macbook pro 2016 or the m...             1   \n199999       what is your review of love birds 2011 movie             0   \n200000  can pakistan destroy an indian aircraft carrie...             0   \n\n        q1_len  q2_len  q1_num_words  q2_num_words  word_common  word_total  \\\n0           65      56            14            12           11          23   \n1           46      83             8            13            4          18   \n2           72      58            14            10            4          24   \n3           48      51            11             9            0          19   \n4           73      38            13             7            4          20   \n...        ...     ...           ...           ...          ...         ...   
\n199996      43      48             9             9            5          18   \n199997      23      37             5             7            5          12   \n199998      50      61            11            13            9          21   \n199999      38      44             8             9            8          17   \n200000     146      60            27            10            6          34   \n\n        word_share  \n0             0.48  \n1             0.22  \n2             0.17  \n3             0.00  \n4             0.20  \n...            ...  \n199996        0.28  \n199997        0.42  \n199998        0.43  \n199999        0.47  \n200000        0.18  \n\n[200000 rows x 10 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>question1</th>\n      <th>question2</th>\n      <th>is_duplicate</th>\n      <th>q1_len</th>\n      <th>q2_len</th>\n      <th>q1_num_words</th>\n      <th>q2_num_words</th>\n      <th>word_common</th>\n      <th>word_total</th>\n      <th>word_share</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>what is the step by step guide to invest in sh...</td>\n      <td>what is the step by step guide to invest in sh...</td>\n      <td>0</td>\n      <td>65</td>\n      <td>56</td>\n      <td>14</td>\n      <td>12</td>\n      <td>11</td>\n      <td>23</td>\n      <td>0.48</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>what is the story of kohinoor kohinoor diamond</td>\n      <td>what would happen if the indian government sto...</td>\n      <td>0</td>\n      <td>46</td>\n      <td>83</td>\n      <td>8</td>\n      <td>13</td>\n      <td>4</td>\n      <td>18</td>\n      
<td>0.22</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>how can i increase the speed of my internet co...</td>\n      <td>how can internet speed be increased by hacking...</td>\n      <td>0</td>\n      <td>72</td>\n      <td>58</td>\n      <td>14</td>\n      <td>10</td>\n      <td>4</td>\n      <td>24</td>\n      <td>0.17</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>why am i mentally very lonely how can i solve it</td>\n      <td>find the remainder when 2324math is divided by...</td>\n      <td>0</td>\n      <td>48</td>\n      <td>51</td>\n      <td>11</td>\n      <td>9</td>\n      <td>0</td>\n      <td>19</td>\n      <td>0.00</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>which one dissolve in water quikly sugar salt ...</td>\n      <td>which fish would survive in salt water</td>\n      <td>0</td>\n      <td>73</td>\n      <td>38</td>\n      <td>13</td>\n      <td>7</td>\n      <td>4</td>\n      <td>20</td>\n      <td>0.20</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>199996</th>\n      <td>which of these tv shows should i watch next</td>\n      <td>what are some thriller shows i should watch next</td>\n      <td>0</td>\n      <td>43</td>\n      <td>48</td>\n      <td>9</td>\n      <td>9</td>\n      <td>5</td>\n      <td>18</td>\n      <td>0.28</td>\n    </tr>\n    <tr>\n      <th>199997</th>\n      <td>should i change my name</td>\n      <td>should i legally change my first name</td>\n      <td>0</td>\n      <td>23</td>\n      <td>37</td>\n      <td>5</td>\n      <td>7</td>\n      <td>5</td>\n      <td>12</td>\n      <td>0.42</td>\n    </tr>\n    <tr>\n      <th>199998</th>\n      <td>should i buy the new macbook 2016 or one from ...</td>\n      <td>should i buy the new macbook pro 2016 or the m...</td>\n      
<td>1</td>\n      <td>50</td>\n      <td>61</td>\n      <td>11</td>\n      <td>13</td>\n      <td>9</td>\n      <td>21</td>\n      <td>0.43</td>\n    </tr>\n    <tr>\n      <th>199999</th>\n      <td>what is your review of love 2011 movie</td>\n      <td>what is your review of love birds 2011 movie</td>\n      <td>0</td>\n      <td>38</td>\n      <td>44</td>\n      <td>8</td>\n      <td>9</td>\n      <td>8</td>\n      <td>17</td>\n      <td>0.47</td>\n    </tr>\n    <tr>\n      <th>200000</th>\n      <td>can pakistan hit indian air craft carrier in a...</td>\n      <td>can pakistan destroy an indian aircraft carrie...</td>\n      <td>0</td>\n      <td>146</td>\n      <td>60</td>\n      <td>27</td>\n      <td>10</td>\n      <td>6</td>\n      <td>34</td>\n      <td>0.18</td>\n    </tr>\n  </tbody>\n</table>\n<p>200000 rows × 10 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"ndf1=df[['question1','question2']]\nndf2=df.drop(columns=['question1','question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.256585Z","iopub.execute_input":"2024-05-31T15:59:48.256964Z","iopub.status.idle":"2024-05-31T15:59:48.280291Z","shell.execute_reply.started":"2024-05-31T15:59:48.256930Z","shell.execute_reply":"2024-05-31T15:59:48.279254Z"},"trusted":true},"execution_count":30,"outputs":[]},{"cell_type":"code","source":"ndf1","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.281704Z","iopub.execute_input":"2024-05-31T15:59:48.282112Z","iopub.status.idle":"2024-05-31T15:59:48.295253Z","shell.execute_reply.started":"2024-05-31T15:59:48.282079Z","shell.execute_reply":"2024-05-31T15:59:48.294063Z"},"trusted":true},"execution_count":31,"outputs":[{"execution_count":31,"output_type":"execute_result","data":{"text/plain":"                                                question1  \\\n0       what is the step by step guide to invest in sh...   
\n1          what is the story of kohinoor kohinoor diamond   \n2       how can i increase the speed of my internet co...   \n3        why am i mentally very lonely how can i solve it   \n4       which one dissolve in water quikly sugar salt ...   \n...                                                   ...   \n199996        which of these tv shows should i watch next   \n199997                            should i change my name   \n199998  should i buy the new macbook 2016 or one from ...   \n199999             what is your review of love 2011 movie   \n200000  can pakistan hit indian air craft carrier in a...   \n\n                                                question2  \n0       what is the step by step guide to invest in sh...  \n1       what would happen if the indian government sto...  \n2       how can internet speed be increased by hacking...  \n3       find the remainder when 2324math is divided by...  \n4                  which fish would survive in salt water  \n...                                                   ...  \n199996   what are some thriller shows i should watch next  \n199997              should i legally change my first name  \n199998  should i buy the new macbook pro 2016 or the m...  \n199999       what is your review of love birds 2011 movie  \n200000  can pakistan destroy an indian aircraft carrie...  
\n\n[200000 rows x 2 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>question1</th>\n      <th>question2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>what is the step by step guide to invest in sh...</td>\n      <td>what is the step by step guide to invest in sh...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>what is the story of kohinoor kohinoor diamond</td>\n      <td>what would happen if the indian government sto...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>how can i increase the speed of my internet co...</td>\n      <td>how can internet speed be increased by hacking...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>why am i mentally very lonely how can i solve it</td>\n      <td>find the remainder when 2324math is divided by...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>which one dissolve in water quikly sugar salt ...</td>\n      <td>which fish would survive in salt water</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>199996</th>\n      <td>which of these tv shows should i watch next</td>\n      <td>what are some thriller shows i should watch next</td>\n    </tr>\n    <tr>\n      <th>199997</th>\n      <td>should i change my name</td>\n      <td>should i legally change my first name</td>\n    </tr>\n    <tr>\n      <th>199998</th>\n      <td>should i buy the new macbook 2016 or one from ...</td>\n      <td>should i buy the new macbook pro 2016 or the m...</td>\n    </tr>\n    <tr>\n      <th>199999</th>\n      <td>what is your review of love 2011 movie</td>\n      <td>what is 
your review of love birds 2011 movie</td>\n    </tr>\n    <tr>\n      <th>200000</th>\n      <td>can pakistan hit indian air craft carrier in a...</td>\n      <td>can pakistan destroy an indian aircraft carrie...</td>\n    </tr>\n  </tbody>\n</table>\n<p>200000 rows × 2 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"ndf2","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.296620Z","iopub.execute_input":"2024-05-31T15:59:48.297003Z","iopub.status.idle":"2024-05-31T15:59:48.316699Z","shell.execute_reply.started":"2024-05-31T15:59:48.296972Z","shell.execute_reply":"2024-05-31T15:59:48.315499Z"},"trusted":true},"execution_count":32,"outputs":[{"execution_count":32,"output_type":"execute_result","data":{"text/plain":"        is_duplicate  q1_len  q2_len  q1_num_words  q2_num_words  word_common  \\\n0                  0      65      56            14            12           11   \n1                  0      46      83             8            13            4   \n2                  0      72      58            14            10            4   \n3                  0      48      51            11             9            0   \n4                  0      73      38            13             7            4   \n...              ...     ...     ...           ...           ...          ...   \n199996             0      43      48             9             9            5   \n199997             0      23      37             5             7            5   \n199998             1      50      61            11            13            9   \n199999             0      38      44             8             9            8   \n200000             0     146      60            27            10            6   \n\n        word_total  word_share  \n0               23        0.48  \n1               18        0.22  \n2               24        0.17  \n3               19        0.00  \n4               20        0.20  \n...            ...         ...  
\n199996          18        0.28  \n199997          12        0.42  \n199998          21        0.43  \n199999          17        0.47  \n200000          34        0.18  \n\n[200000 rows x 8 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>is_duplicate</th>\n      <th>q1_len</th>\n      <th>q2_len</th>\n      <th>q1_num_words</th>\n      <th>q2_num_words</th>\n      <th>word_common</th>\n      <th>word_total</th>\n      <th>word_share</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>65</td>\n      <td>56</td>\n      <td>14</td>\n      <td>12</td>\n      <td>11</td>\n      <td>23</td>\n      <td>0.48</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0</td>\n      <td>46</td>\n      <td>83</td>\n      <td>8</td>\n      <td>13</td>\n      <td>4</td>\n      <td>18</td>\n      <td>0.22</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0</td>\n      <td>72</td>\n      <td>58</td>\n      <td>14</td>\n      <td>10</td>\n      <td>4</td>\n      <td>24</td>\n      <td>0.17</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0</td>\n      <td>48</td>\n      <td>51</td>\n      <td>11</td>\n      <td>9</td>\n      <td>0</td>\n      <td>19</td>\n      <td>0.00</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0</td>\n      <td>73</td>\n      <td>38</td>\n      <td>13</td>\n      <td>7</td>\n      <td>4</td>\n      <td>20</td>\n      <td>0.20</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      
<th>199996</th>\n      <td>0</td>\n      <td>43</td>\n      <td>48</td>\n      <td>9</td>\n      <td>9</td>\n      <td>5</td>\n      <td>18</td>\n      <td>0.28</td>\n    </tr>\n    <tr>\n      <th>199997</th>\n      <td>0</td>\n      <td>23</td>\n      <td>37</td>\n      <td>5</td>\n      <td>7</td>\n      <td>5</td>\n      <td>12</td>\n      <td>0.42</td>\n    </tr>\n    <tr>\n      <th>199998</th>\n      <td>1</td>\n      <td>50</td>\n      <td>61</td>\n      <td>11</td>\n      <td>13</td>\n      <td>9</td>\n      <td>21</td>\n      <td>0.43</td>\n    </tr>\n    <tr>\n      <th>199999</th>\n      <td>0</td>\n      <td>38</td>\n      <td>44</td>\n      <td>8</td>\n      <td>9</td>\n      <td>8</td>\n      <td>17</td>\n      <td>0.47</td>\n    </tr>\n    <tr>\n      <th>200000</th>\n      <td>0</td>\n      <td>146</td>\n      <td>60</td>\n      <td>27</td>\n      <td>10</td>\n      <td>6</td>\n      <td>34</td>\n      <td>0.18</td>\n    </tr>\n  </tbody>\n</table>\n<p>200000 rows × 8 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"from nltk.corpus import stopwords\n\n# Built once at cell level as a frozenset, so every per-row call gets O(1)\n# membership tests instead of re-reading the corpus list for each row.\nSTOP_WORDS = frozenset(stopwords.words(\"english\"))\n\n\ndef fetch_token_features(row, stop_words=None):\n    \"\"\"Compute the eight token-overlap features for one question pair.\n\n    Parameters\n    ----------\n    row : mapping\n        Must provide string entries 'question1' and 'question2'.\n    stop_words : container of str, optional\n        Words treated as stop words. Defaults to the cell-level STOP_WORDS.\n\n    Returns\n    -------\n    list\n        [cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max,\n        last_word_eq, first_word_eq]; eight 0.0 entries when either question\n        has no whitespace-separated tokens.\n    \"\"\"\n    if stop_words is None:\n        stop_words = STOP_WORDS\n\n    SAFE_DIV = 0.0001  # keeps every ratio finite when a count is zero\n\n    token_features = [0.0]*8\n\n    # Converting the Sentence into Tokens:\n    q1_tokens = row['question1'].split()\n    q2_tokens = row['question2'].split()\n\n    if not q1_tokens or not q2_tokens:\n        return token_features\n\n    # Unique non-stopwords of each question\n    q1_words = {word for word in q1_tokens if word not in stop_words}\n    q2_words = {word for word in q2_tokens if word not in stop_words}\n\n    # Unique stopwords of each question\n    q1_stops = {word for word in q1_tokens if word in stop_words}\n    q2_stops = {word for word in q2_tokens if word in stop_words}\n\n    # Shared non-stopwords, shared stopwords and shared tokens of the pair\n    common_word_count = 
len(q1_words & q2_words)\n    common_stop_count = len(q1_stops & q2_stops)\n    common_token_count = len(set(q1_tokens) & set(q2_tokens))\n\n    # Each shared count relative to the smaller (min) and larger (max) side\n    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)\n    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)\n    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)\n    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)\n    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)\n    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)\n\n    # Last word of both question is same or not\n    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])\n\n    # First word of both question is same or not\n    token_features[7] = int(q1_tokens[0] == q2_tokens[0])\n\n    return token_features","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:48.318063Z","iopub.execute_input":"2024-05-31T15:59:48.318457Z","iopub.status.idle":"2024-05-31T15:59:49.088959Z","shell.execute_reply.started":"2024-05-31T15:59:48.318418Z","shell.execute_reply":"2024-05-31T15:59:49.088070Z"},"trusted":true},"execution_count":33,"outputs":[]},{"cell_type":"code","source":"token_features = df.apply(fetch_token_features, axis=1)\n\ndf[\"cwc_min\"]       = list(map(lambda x: x[0], token_features))\ndf[\"cwc_max\"]       = list(map(lambda x: x[1], token_features))\ndf[\"csc_min\"]       = list(map(lambda x: x[2], token_features))\ndf[\"csc_max\"]       = list(map(lambda x: x[3], token_features))\ndf[\"ctc_min\"]       = list(map(lambda x: x[4], token_features))\ndf[\"ctc_max\"]       = list(map(lambda x: x[5], token_features))\ndf[\"last_word_eq\"]  = list(map(lambda x: 
x[6], token_features))\ndf[\"first_word_eq\"] = list(map(lambda x: x[7], token_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:49.090329Z","iopub.execute_input":"2024-05-31T15:59:49.090993Z","iopub.status.idle":"2024-05-31T16:00:39.809819Z","shell.execute_reply.started":"2024-05-31T15:59:49.090955Z","shell.execute_reply":"2024-05-31T16:00:39.808943Z"},"trusted":true},"execution_count":34,"outputs":[]},{"cell_type":"code","source":"# %pip installs into the running kernel's environment; pinned to the version this run installed.\n%pip install distance==0.1.3","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:00:39.811472Z","iopub.execute_input":"2024-05-31T16:00:39.811883Z","iopub.status.idle":"2024-05-31T16:00:54.852520Z","shell.execute_reply.started":"2024-05-31T16:00:39.811847Z","shell.execute_reply":"2024-05-31T16:00:54.851156Z"},"trusted":true},"execution_count":35,"outputs":[{"name":"stdout","text":"Collecting distance\n  Downloading Distance-0.1.3.tar.gz (180 kB)\n\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m180.3/180.3 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n\u001b[?25hBuilding wheels for collected packages: distance\n  Building wheel for distance (setup.py) ... 
\u001b[?25ldone\n\u001b[?25h  Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16258 sha256=cd544d5c1039ea6345ff5a69695ae0ef0e616e019bdaf0ccaadf6d5845ffc9ac\n  Stored in directory: /root/.cache/pip/wheels/e8/bb/de/f71bf63559ea9a921059a5405806f7ff6ed612a9231c4a9309\nSuccessfully built distance\n\u001b[33mWARNING: Error parsing requirements for aiohttp: [Errno 2] No such file or directory: '/opt/conda/lib/python3.10/site-packages/aiohttp-3.9.1.dist-info/METADATA'\u001b[0m\u001b[33m\n\u001b[0mInstalling collected packages: distance\nSuccessfully installed distance-0.1.3\nNote: you may need to restart the kernel to use updated packages.\n","output_type":"stream"}]},{"cell_type":"code","source":"import distance\n\ndef fetch_length_features(row):\n    \"\"\"Return [abs_len_diff, mean_len, longest_substr_ratio] for one question pair.\n\n    abs_len_diff is the absolute difference of the two token counts, mean_len\n    is their mean, and longest_substr_ratio is the length of a longest common\n    substring divided by (character length of the shorter question + 1).\n    All three are 0.0 when either question has no whitespace-separated tokens.\n    \"\"\"\n    first, second = row['question1'], row['question2']\n\n    # Token counts of both questions; an empty side short-circuits to zeros\n    n_first = len(first.split())\n    n_second = len(second.split())\n    if n_first == 0 or n_second == 0:\n        return [0.0]*3\n\n    # Find the longest common substring; with none found the ratio stays 0.0\n    shared = list(distance.lcsubstrings(first, second))\n    ratio = len(shared[0]) / (min(len(first), len(second)) + 1) if shared else 0.0\n\n    return [abs(n_first - n_second), (n_first + n_second) / 2, ratio]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:00:54.854466Z","iopub.execute_input":"2024-05-31T16:00:54.855416Z","iopub.status.idle":"2024-05-31T16:00:54.868756Z","shell.execute_reply.started":"2024-05-31T16:00:54.855371Z","shell.execute_reply":"2024-05-31T16:00:54.867731Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"code","source":"length_features = df.apply(fetch_length_features, axis=1)\n\ndf['abs_len_diff'] 
= list(map(lambda x: x[0], length_features))\ndf['mean_len'] = list(map(lambda x: x[1], length_features))\ndf['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:00:54.870202Z","iopub.execute_input":"2024-05-31T16:00:54.870885Z","iopub.status.idle":"2024-05-31T16:03:34.399606Z","shell.execute_reply.started":"2024-05-31T16:00:54.870849Z","shell.execute_reply":"2024-05-31T16:03:34.398480Z"},"trusted":true},"execution_count":37,"outputs":[]},{"cell_type":"code","source":"# Fuzzy Features\nfrom fuzzywuzzy import fuzz\n\ndef fetch_fuzzy_features(row):\n    \"\"\"Return the four fuzzywuzzy scores for one question pair.\n\n    The result is [QRatio, partial_ratio, token_sort_ratio, token_set_ratio],\n    each computed by the same-named `fuzz` function on the row's\n    'question1' and 'question2' entries.\n    \"\"\"\n    first, second = row['question1'], row['question2']\n\n    # The order fixes the position of each score in the returned list\n    scorers = (\n        fuzz.QRatio,            # fuzz_ratio\n        fuzz.partial_ratio,     # fuzz_partial_ratio\n        fuzz.token_sort_ratio,  # token_sort_ratio\n        fuzz.token_set_ratio,   # token_set_ratio\n    )\n    return [scorer(first, second) for scorer in scorers]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:03:34.401346Z","iopub.execute_input":"2024-05-31T16:03:34.401740Z","iopub.status.idle":"2024-05-31T16:03:34.415928Z","shell.execute_reply.started":"2024-05-31T16:03:34.401703Z","shell.execute_reply":"2024-05-31T16:03:34.414845Z"},"trusted":true},"execution_count":38,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n  warnings.warn('Using slow pure-python SequenceMatcher. 
Install python-Levenshtein to remove this warning')\n","output_type":"stream"}]},{"cell_type":"code","source":"fuzzy_features = df.apply(fetch_fuzzy_features, axis=1)\n\n# Creating new feature columns for fuzzy features\ndf['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))\ndf['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))\ndf['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))\ndf['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:03:34.417222Z","iopub.execute_input":"2024-05-31T16:03:34.417541Z","iopub.status.idle":"2024-05-31T16:12:37.749312Z","shell.execute_reply.started":"2024-05-31T16:03:34.417507Z","shell.execute_reply":"2024-05-31T16:12:37.748091Z"},"trusted":true},"execution_count":39,"outputs":[]},{"cell_type":"code","source":"ndf1=df[['question1','question2']]\nndf2=df.drop(columns=['question1','question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:12:37.750792Z","iopub.execute_input":"2024-05-31T16:12:37.751247Z","iopub.status.idle":"2024-05-31T16:12:37.796876Z","shell.execute_reply.started":"2024-05-31T16:12:37.751211Z","shell.execute_reply":"2024-05-31T16:12:37.796026Z"},"trusted":true},"execution_count":40,"outputs":[]},{"cell_type":"code","source":"from sklearn.feature_extraction.text import 
TfidfVectorizer","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:12:37.806016Z","iopub.execute_input":"2024-05-31T16:12:37.806372Z","iopub.status.idle":"2024-05-31T16:12:37.811136Z","shell.execute_reply.started":"2024-05-31T16:12:37.806344Z","shell.execute_reply":"2024-05-31T16:12:37.810107Z"},"trusted":true},"execution_count":41,"outputs":[]},{"cell_type":"code","source":"cv=TfidfVectorizer(max_features=1000)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:11.173544Z","iopub.execute_input":"2024-05-31T16:13:11.173929Z","iopub.status.idle":"2024-05-31T16:13:11.179093Z","shell.execute_reply.started":"2024-05-31T16:13:11.173896Z","shell.execute_reply":"2024-05-31T16:13:11.177928Z"},"trusted":true},"execution_count":45,"outputs":[]},{"cell_type":"code","source":"\nquestions=list(ndf1['question1'])+list(ndf1['question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:12.853466Z","iopub.execute_input":"2024-05-31T16:13:12.853846Z","iopub.status.idle":"2024-05-31T16:13:12.911399Z","shell.execute_reply.started":"2024-05-31T16:13:12.853814Z","shell.execute_reply":"2024-05-31T16:13:12.910271Z"},"trusted":true},"execution_count":46,"outputs":[]},{"cell_type":"code","source":"","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"q1_arr,q2_arr=np.vsplit(cv.fit_transform(questions).toarray(),2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:17.135566Z","iopub.execute_input":"2024-05-31T16:13:17.135964Z","iopub.status.idle":"2024-05-31T16:13:26.663959Z","shell.execute_reply.started":"2024-05-31T16:13:17.135933Z","shell.execute_reply":"2024-05-31T16:13:26.662828Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"code","source":"\ntemp_df=pd.concat([pd.DataFrame(q1_arr,index=ndf1.index),pd.DataFrame(q2_arr,index=ndf1.index)],axis=1)\ntemp_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:26.665906Z","iopub.execute_input":"2024-05-31T16:13
:26.666258Z","iopub.status.idle":"2024-05-31T16:13:34.625571Z","shell.execute_reply.started":"2024-05-31T16:13:26.666230Z","shell.execute_reply":"2024-05-31T16:13:34.624673Z"},"trusted":true},"execution_count":48,"outputs":[{"execution_count":48,"output_type":"execute_result","data":{"text/plain":"(200000, 2000)"},"metadata":{}}]},{"cell_type":"code","source":"q1_arr=\"\"\nq2_arr=\"\"","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:34.627158Z","iopub.execute_input":"2024-05-31T16:13:34.627867Z","iopub.status.idle":"2024-05-31T16:13:34.791511Z","shell.execute_reply.started":"2024-05-31T16:13:34.627828Z","shell.execute_reply":"2024-05-31T16:13:34.790413Z"},"trusted":true},"execution_count":49,"outputs":[]},{"cell_type":"code","source":"temp_df=pd.concat([ndf2,temp_df],axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:34.794367Z","iopub.execute_input":"2024-05-31T16:13:34.795113Z","iopub.status.idle":"2024-05-31T16:13:38.803374Z","shell.execute_reply.started":"2024-05-31T16:13:34.795080Z","shell.execute_reply":"2024-05-31T16:13:38.802436Z"},"trusted":true},"execution_count":50,"outputs":[]},{"cell_type":"code","source":"temp_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.804531Z","iopub.execute_input":"2024-05-31T16:13:38.804807Z","iopub.status.idle":"2024-05-31T16:13:38.810942Z","shell.execute_reply.started":"2024-05-31T16:13:38.804784Z","shell.execute_reply":"2024-05-31T16:13:38.809885Z"},"trusted":true},"execution_count":51,"outputs":[{"execution_count":51,"output_type":"execute_result","data":{"text/plain":"(200000, 
2023)"},"metadata":{}}]},{"cell_type":"code","source":"temp_df['is_duplicate']","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.812454Z","iopub.execute_input":"2024-05-31T16:13:38.813079Z","iopub.status.idle":"2024-05-31T16:13:38.827266Z","shell.execute_reply.started":"2024-05-31T16:13:38.813018Z","shell.execute_reply":"2024-05-31T16:13:38.826128Z"},"trusted":true},"execution_count":52,"outputs":[{"execution_count":52,"output_type":"execute_result","data":{"text/plain":"0         0\n1         0\n2         0\n3         0\n4         0\n         ..\n199996    0\n199997    0\n199998    1\n199999    0\n200000    0\nName: is_duplicate, Length: 200000, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"temp_df.columns = temp_df.columns.astype(str)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.828616Z","iopub.execute_input":"2024-05-31T16:13:38.829513Z","iopub.status.idle":"2024-05-31T16:13:38.838173Z","shell.execute_reply.started":"2024-05-31T16:13:38.829475Z","shell.execute_reply":"2024-05-31T16:13:38.837202Z"},"trusted":true},"execution_count":53,"outputs":[]},{"cell_type":"code","source":"from sklearn.model_selection import 
train_test_split","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.839553Z","iopub.execute_input":"2024-05-31T16:13:38.839956Z","iopub.status.idle":"2024-05-31T16:13:38.849624Z","shell.execute_reply.started":"2024-05-31T16:13:38.839921Z","shell.execute_reply":"2024-05-31T16:13:38.848631Z"},"trusted":true},"execution_count":54,"outputs":[]},{"cell_type":"code","source":"\nx_train,x_test,y_train,y_test=train_test_split(temp_df.drop(columns='is_duplicate'),temp_df['is_duplicate'],test_size=0.1,random_state=3)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:38.851064Z","iopub.execute_input":"2024-05-31T16:13:38.851910Z","iopub.status.idle":"2024-05-31T16:13:42.573240Z","shell.execute_reply.started":"2024-05-31T16:13:38.851873Z","shell.execute_reply":"2024-05-31T16:13:42.572007Z"},"trusted":true},"execution_count":55,"outputs":[]},{"cell_type":"code","source":"temp_df","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:45.458273Z","iopub.execute_input":"2024-05-31T16:13:45.458942Z","iopub.status.idle":"2024-05-31T16:13:45.547209Z","shell.execute_reply.started":"2024-05-31T16:13:45.458905Z","shell.execute_reply":"2024-05-31T16:13:45.546132Z"},"trusted":true},"execution_count":56,"outputs":[{"execution_count":56,"output_type":"execute_result","data":{"text/plain":"        is_duplicate  q1_len  q2_len  q1_num_words  q2_num_words  word_common  \\\n0                  0      65      56            14            12           11   \n1                  0      46      83             8            13            4   \n2                  0      72      58            14            10            4   \n3                  0      48      51            11             9            0   \n4                  0      73      38            13             7            4   \n...              ...     ...     ...           ...           ...          ...   
\n199996             0      43      48             9             9            5   \n199997             0      23      37             5             7            5   \n199998             1      50      61            11            13            9   \n199999             0      38      44             8             9            8   \n200000             0     146      60            27            10            6   \n\n        word_total  word_share   cwc_min   cwc_max  ...  990  991  992  993  \\\n0               23        0.48  0.999980  0.833319  ...  0.0  0.0  0.0  0.0   \n1               18        0.22  0.666644  0.249997  ...  0.0  0.0  0.0  0.0   \n2               24        0.17  0.399992  0.333328  ...  0.0  0.0  0.0  0.0   \n3               19        0.00  0.000000  0.000000  ...  0.0  0.0  0.0  0.0   \n4               20        0.20  0.399992  0.199998  ...  0.0  0.0  0.0  0.0   \n...            ...         ...       ...       ...  ...  ...  ...  ...  ...   \n199996          18        0.28  0.749981  0.749981  ...  0.0  0.0  0.0  0.0   \n199997          12        0.42  0.999950  0.499988  ...  0.0  0.0  0.0  0.0   \n199998          21        0.43  0.833319  0.833319  ...  0.0  0.0  0.0  0.0   \n199999          17        0.47  0.999975  0.799984  ...  0.0  0.0  0.0  0.0   \n200000          34        0.18  0.666656  0.222221  ...  0.0  0.0  0.0  0.0   \n\n        994  995  996       997  998  999  \n0       0.0  0.0  0.0  0.000000  0.0  0.0  \n1       0.0  0.0  0.0  0.000000  0.0  0.0  \n2       0.0  0.0  0.0  0.000000  0.0  0.0  \n3       0.0  0.0  0.0  0.000000  0.0  0.0  \n4       0.0  0.0  0.0  0.000000  0.0  0.0  \n...     ...  ...  ...       ...  ...  ...  
\n199996  0.0  0.0  0.0  0.000000  0.0  0.0  \n199997  0.0  0.0  0.0  0.000000  0.0  0.0  \n199998  0.0  0.0  0.0  0.000000  0.0  0.0  \n199999  0.0  0.0  0.0  0.344384  0.0  0.0  \n200000  0.0  0.0  0.0  0.000000  0.0  0.0  \n\n[200000 rows x 2023 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>is_duplicate</th>\n      <th>q1_len</th>\n      <th>q2_len</th>\n      <th>q1_num_words</th>\n      <th>q2_num_words</th>\n      <th>word_common</th>\n      <th>word_total</th>\n      <th>word_share</th>\n      <th>cwc_min</th>\n      <th>cwc_max</th>\n      <th>...</th>\n      <th>990</th>\n      <th>991</th>\n      <th>992</th>\n      <th>993</th>\n      <th>994</th>\n      <th>995</th>\n      <th>996</th>\n      <th>997</th>\n      <th>998</th>\n      <th>999</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>65</td>\n      <td>56</td>\n      <td>14</td>\n      <td>12</td>\n      <td>11</td>\n      <td>23</td>\n      <td>0.48</td>\n      <td>0.999980</td>\n      <td>0.833319</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0</td>\n      <td>46</td>\n      <td>83</td>\n      <td>8</td>\n      <td>13</td>\n      <td>4</td>\n      <td>18</td>\n      <td>0.22</td>\n      <td>0.666644</td>\n      <td>0.249997</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      
<td>0.0</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0</td>\n      <td>72</td>\n      <td>58</td>\n      <td>14</td>\n      <td>10</td>\n      <td>4</td>\n      <td>24</td>\n      <td>0.17</td>\n      <td>0.399992</td>\n      <td>0.333328</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0</td>\n      <td>48</td>\n      <td>51</td>\n      <td>11</td>\n      <td>9</td>\n      <td>0</td>\n      <td>19</td>\n      <td>0.00</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0</td>\n      <td>73</td>\n      <td>38</td>\n      <td>13</td>\n      <td>7</td>\n      <td>4</td>\n      <td>20</td>\n      <td>0.20</td>\n      <td>0.399992</td>\n      <td>0.199998</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>199996</th>\n      <td>0</td>\n  
    <td>43</td>\n      <td>48</td>\n      <td>9</td>\n      <td>9</td>\n      <td>5</td>\n      <td>18</td>\n      <td>0.28</td>\n      <td>0.749981</td>\n      <td>0.749981</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>199997</th>\n      <td>0</td>\n      <td>23</td>\n      <td>37</td>\n      <td>5</td>\n      <td>7</td>\n      <td>5</td>\n      <td>12</td>\n      <td>0.42</td>\n      <td>0.999950</td>\n      <td>0.499988</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>199998</th>\n      <td>1</td>\n      <td>50</td>\n      <td>61</td>\n      <td>11</td>\n      <td>13</td>\n      <td>9</td>\n      <td>21</td>\n      <td>0.43</td>\n      <td>0.833319</td>\n      <td>0.833319</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>199999</th>\n      <td>0</td>\n      <td>38</td>\n      <td>44</td>\n      <td>8</td>\n      <td>9</td>\n      <td>8</td>\n      <td>17</td>\n      <td>0.47</td>\n      <td>0.999975</td>\n      <td>0.799984</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.344384</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>200000</th>\n      <td>0</td>\n      <td>146</td>\n      <td>60</td>\n      <td>27</td>\n      <td>10</td>\n      <td>6</td>\n      <td>34</td>\n      
<td>0.18</td>\n      <td>0.666656</td>\n      <td>0.222221</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>200000 rows × 2023 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"temp_df.columns = temp_df.columns.astype(str)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:45.723629Z","iopub.execute_input":"2024-05-31T16:13:45.724341Z","iopub.status.idle":"2024-05-31T16:13:45.729720Z","shell.execute_reply.started":"2024-05-31T16:13:45.724307Z","shell.execute_reply":"2024-05-31T16:13:45.728559Z"},"trusted":true},"execution_count":57,"outputs":[]},{"cell_type":"code","source":"from sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:45.948658Z","iopub.execute_input":"2024-05-31T16:13:45.949487Z","iopub.status.idle":"2024-05-31T16:13:46.109297Z","shell.execute_reply.started":"2024-05-31T16:13:45.949447Z","shell.execute_reply":"2024-05-31T16:13:46.108278Z"},"trusted":true},"execution_count":58,"outputs":[]},{"cell_type":"code","source":"\nrf=RandomForestClassifier()\nrf.fit(x_train,y_train)\ny_pred=rf.predict(x_test)\naccuracy_score(y_test,y_pred)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:13:46.859901Z","iopub.execute_input":"2024-05-31T16:13:46.860985Z","iopub.status.idle":"2024-05-31T16:18:53.609846Z","shell.execute_reply.started":"2024-05-31T16:13:46.860950Z","shell.execute_reply":"2024-05-31T16:18:53.608761Z"},"trusted":true},"execution_count":59,"outputs":[{"execution_count":59,"output_type":"execute_result","data":{"text/plain":"0.8151"},"metadata":{}}]},{"cell_type":"code","source":"import pickle\nmodel_pkl_file = \"RF.pkl\"  \n\nwith open(model_pkl_file, 'wb') as file:  \n    pickle.dump(rf, 
file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:53.611939Z","iopub.execute_input":"2024-05-31T16:18:53.612764Z","iopub.status.idle":"2024-05-31T16:18:54.365381Z","shell.execute_reply.started":"2024-05-31T16:18:53.612727Z","shell.execute_reply":"2024-05-31T16:18:54.364401Z"},"trusted":true},"execution_count":60,"outputs":[]},{"cell_type":"code","source":"# Persist `cv` under the relative file name BOW.pkl.\nmodel_pkl_file = \"BOW.pkl\"\n\nwith open(model_pkl_file, 'wb') as file:\n    pickle.dump(cv, file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:54.366547Z","iopub.execute_input":"2024-05-31T16:18:54.366856Z","iopub.status.idle":"2024-05-31T16:18:54.400438Z","shell.execute_reply.started":"2024-05-31T16:18:54.366830Z","shell.execute_reply":"2024-05-31T16:18:54.399515Z"},"trusted":true},"execution_count":61,"outputs":[]},{"cell_type":"code","source":"import pickle\n\n# Read `cv` back from the absolute Kaggle path.\n# NOTE(review): assumes the relative BOW.pkl written earlier lands in /kaggle/working - confirm.\nwith open(\"/kaggle/working/BOW.pkl\", 'rb') as file:\n    cv = pickle.load(file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:54.401578Z","iopub.execute_input":"2024-05-31T16:18:54.401866Z","iopub.status.idle":"2024-05-31T16:18:54.429997Z","shell.execute_reply.started":"2024-05-31T16:18:54.401842Z","shell.execute_reply":"2024-05-31T16:18:54.429102Z"},"trusted":true},"execution_count":62,"outputs":[]},{"cell_type":"code","source":"with open(\"/kaggle/working/RF.pkl\", 'rb') as file:  \n        rf = 
pickle.load(file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:54.432348Z","iopub.execute_input":"2024-05-31T16:18:54.432667Z","iopub.status.idle":"2024-05-31T16:18:55.121525Z","shell.execute_reply.started":"2024-05-31T16:18:54.432641Z","shell.execute_reply":"2024-05-31T16:18:55.120436Z"},"trusted":true},"execution_count":63,"outputs":[]},{"cell_type":"code","source":"# Second batch: the last 204290 rows of train.csv, without rows that contain\n# missing values and without the id columns; both question texts then go\n# through preprocess.\ndf = (\n    pd.read_csv(\"/kaggle/input/quora-duplicate-questions-copy/train.csv\")\n    .tail(204290)\n    .dropna()\n    .drop(columns=['id', 'qid1', 'qid2'])\n)\nfor question_col in ('question1', 'question2'):\n    df[question_col] = df[question_col].apply(preprocess)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:18:55.122843Z","iopub.execute_input":"2024-05-31T16:18:55.123148Z","iopub.status.idle":"2024-05-31T16:19:11.234145Z","shell.execute_reply.started":"2024-05-31T16:18:55.123124Z","shell.execute_reply":"2024-05-31T16:19:11.233103Z"},"trusted":true},"execution_count":64,"outputs":[]},{"cell_type":"code","source":"\ndf['q1_len']=df['question1'].str.len()\ndf['q2_len']=df['question2'].str.len()\ndf['q1_num_words']=df['question1'].apply(lambda row: len(row.split(\" \")))\ndf['q2_num_words']=df['question2'].apply(lambda row: len(row.split(\" \")))\ndef common_words(row):\n    w1=set(map(lambda word: word.lower().strip(),row['question1'].split(\" \")))\n    w2=set(map(lambda word: word.lower().strip(),row['question2'].split(\" \")))\n    return len(w1 & w2)\ndef total_words(row):\n    w1=set(map(lambda word: word.lower().strip(),row['question1'].split(\" \")))\n    w2=set(map(lambda word: word.lower().strip(),row['question2'].split(\" \")))\n    return len(w1) + 
len(w2)\n\ndf['word_common']=df.apply(common_words,axis=1)\ndf['word_total']=df.apply(total_words,axis=1)\ndf['word_share']=round(df['word_common']/df['word_total'],2)\n\n","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:19:11.235593Z","iopub.execute_input":"2024-05-31T16:19:11.236383Z","iopub.status.idle":"2024-05-31T16:19:20.397449Z","shell.execute_reply.started":"2024-05-31T16:19:11.236343Z","shell.execute_reply":"2024-05-31T16:19:20.396339Z"},"trusted":true},"execution_count":65,"outputs":[]},{"cell_type":"code","source":"token_features = df.apply(fetch_token_features, axis=1)\n\n# Position i of each row's token-feature result feeds the i-th column listed here.\nTOKEN_FEATURE_COLUMNS = [\n    \"cwc_min\", \"cwc_max\",\n    \"csc_min\", \"csc_max\",\n    \"ctc_min\", \"ctc_max\",\n    \"last_word_eq\", \"first_word_eq\",\n]\nfor feature_idx, feature_name in enumerate(TOKEN_FEATURE_COLUMNS):\n    df[feature_name] = [row_features[feature_idx] for row_features in token_features]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:19:20.398704Z","iopub.execute_input":"2024-05-31T16:19:20.399012Z","iopub.status.idle":"2024-05-31T16:20:13.569480Z","shell.execute_reply.started":"2024-05-31T16:19:20.398986Z","shell.execute_reply":"2024-05-31T16:20:13.568221Z"},"trusted":true},"execution_count":66,"outputs":[]},{"cell_type":"code","source":"length_features = df.apply(fetch_length_features, axis=1)\n\ndf['abs_len_diff'] = list(map(lambda x: x[0], length_features))\ndf['mean_len'] = list(map(lambda x: x[1], length_features))\ndf['longest_substr_ratio'] = list(map(lambda x: x[2], 
length_features))","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:20:13.570901Z","iopub.execute_input":"2024-05-31T16:20:13.571264Z","iopub.status.idle":"2024-05-31T16:22:55.402008Z","shell.execute_reply.started":"2024-05-31T16:20:13.571234Z","shell.execute_reply":"2024-05-31T16:22:55.400892Z"},"trusted":true},"execution_count":67,"outputs":[]},{"cell_type":"code","source":"fuzzy_features = df.apply(fetch_fuzzy_features, axis=1)\n\n# One new column per position of each row's fuzzy-feature result.\nFUZZY_FEATURE_COLUMNS = ['fuzz_ratio', 'fuzz_partial_ratio', 'token_sort_ratio', 'token_set_ratio']\nfor feature_idx, feature_name in enumerate(FUZZY_FEATURE_COLUMNS):\n    df[feature_name] = [row_features[feature_idx] for row_features in fuzzy_features]","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:22:55.403684Z","iopub.execute_input":"2024-05-31T16:22:55.403981Z","iopub.status.idle":"2024-05-31T16:32:18.985961Z","shell.execute_reply.started":"2024-05-31T16:22:55.403956Z","shell.execute_reply":"2024-05-31T16:32:18.984852Z"},"trusted":true},"execution_count":68,"outputs":[]},{"cell_type":"code","source":"# Numeric features only: the raw question texts are left behind in df.\nndf2 = df.drop(columns=['question1', 'question2'])","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:18.987276Z","iopub.execute_input":"2024-05-31T16:32:18.987559Z","iopub.status.idle":"2024-05-31T16:32:19.008186Z","shell.execute_reply.started":"2024-05-31T16:32:18.987536Z","shell.execute_reply":"2024-05-31T16:32:19.007170Z"},"trusted":true},"execution_count":69,"outputs":[]},{"cell_type":"code","source":"# Stack all question1 texts above all question2 texts so one vectorizer call\n# covers both, then cut the dense result back into its two equal halves.\n# NOTE(review): fit_transform re-fits `cv`, which was just reloaded from BOW.pkl;\n# presumably this replaces whatever it learned before - confirm that is intended.\nquestions = df['question1'].tolist() + df['question2'].tolist()\nq1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(), 2)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:19.009434Z","iopub.execute_input":"2024-05-31T16:32:19.009730Z","iopub.status.idle":"2024-05-31T16:32:28.784256Z","shell.execute_reply.started":"2024-05-31T16:32:19.009705Z","shell.execute_reply":"2024-05-31T16:32:28.783401Z"},"trusted":true},"execution_coun
t":70,"outputs":[]},{"cell_type":"code","source":"# Drop the references to the earlier `temp_df` and `ndf1` objects so their memory\n# can be reclaimed before the new feature frame is built.\n# (The first name used to be misspelled as tenp_df, which left temp_df alive.)\ntemp_df=\"\"\nndf1=\"\"","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:28.785527Z","iopub.execute_input":"2024-05-31T16:32:28.785898Z","iopub.status.idle":"2024-05-31T16:32:28.794742Z","shell.execute_reply.started":"2024-05-31T16:32:28.785866Z","shell.execute_reply":"2024-05-31T16:32:28.793759Z"},"trusted":true},"execution_count":71,"outputs":[]},{"cell_type":"code","source":"\ntemp_df=pd.concat([pd.DataFrame(q1_arr,index=ndf2.index),pd.DataFrame(q2_arr,index=ndf2.index)],axis=1)\ntemp_df.shape\n","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:28.798176Z","iopub.execute_input":"2024-05-31T16:32:28.798534Z","iopub.status.idle":"2024-05-31T16:32:37.549598Z","shell.execute_reply.started":"2024-05-31T16:32:28.798507Z","shell.execute_reply":"2024-05-31T16:32:37.548558Z"},"trusted":true},"execution_count":72,"outputs":[{"execution_count":72,"output_type":"execute_result","data":{"text/plain":"(204288, 2000)"},"metadata":{}}]},{"cell_type":"code","source":"q1_arr=\"\"\nq2_arr=\"\"","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:37.550761Z","iopub.execute_input":"2024-05-31T16:32:37.551083Z","iopub.status.idle":"2024-05-31T16:32:37.719975Z","shell.execute_reply.started":"2024-05-31T16:32:37.551032Z","shell.execute_reply":"2024-05-31T16:32:37.718778Z"},"trusted":true},"execution_count":73,"outputs":[]},{"cell_type":"code","source":"temp_df=pd.concat([ndf2,temp_df],axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:37.721428Z","iopub.execute_input":"2024-05-31T16:32:37.721758Z","iopub.status.idle":"2024-05-31T16:32:41.807913Z","shell.execute_reply.started":"2024-05-31T16:32:37.721730Z","shell.execute_reply":"2024-05-31T16:32:41.807000Z"},"trusted":true},"execution_count":74,"outputs":[]},{"cell_type":"code","source":"temp_df.columns = 
temp_df.columns.astype(str)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:41.808947Z","iopub.execute_input":"2024-05-31T16:32:41.809256Z","iopub.status.idle":"2024-05-31T16:32:41.814768Z","shell.execute_reply.started":"2024-05-31T16:32:41.809230Z","shell.execute_reply":"2024-05-31T16:32:41.813818Z"},"trusted":true},"execution_count":75,"outputs":[]},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split\n\n# Hold out 10% of the rows for evaluation; the fixed random_state keeps the split repeatable.\nx_train, x_test, y_train, y_test = train_test_split(\n    temp_df.drop(columns='is_duplicate'),\n    temp_df['is_duplicate'],\n    test_size=0.1,\n    random_state=3,\n)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:41.815878Z","iopub.execute_input":"2024-05-31T16:32:41.816183Z","iopub.status.idle":"2024-05-31T16:32:45.660976Z","shell.execute_reply.started":"2024-05-31T16:32:41.816159Z","shell.execute_reply":"2024-05-31T16:32:45.659752Z"},"trusted":true},"execution_count":76,"outputs":[]},{"cell_type":"code","source":"# NOTE(review): `rf` was created without warm_start, so this fit() presumably\n# retrains the forest from scratch on this batch instead of adding to what the\n# earlier fit learned - confirm that is intended.\nrf.fit(x_train, y_train)\ny_pred = rf.predict(x_test)\naccuracy_score(y_test, y_pred)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:32:45.662778Z","iopub.execute_input":"2024-05-31T16:32:45.663346Z","iopub.status.idle":"2024-05-31T16:37:57.160516Z","shell.execute_reply.started":"2024-05-31T16:32:45.663299Z","shell.execute_reply":"2024-05-31T16:37:57.159407Z"},"trusted":true},"execution_count":77,"outputs":[{"execution_count":77,"output_type":"execute_result","data":{"text/plain":"0.8166821675069754"},"metadata":{}}]},{"cell_type":"code","source":"model_pkl_file = \"RF.pkl\"  \n\nwith open(model_pkl_file, 'wb') as file:  \n    pickle.dump(rf, file)\n    \nmodel_pkl_file = \"BOW.pkl\"  \n\nwith open(model_pkl_file, 'wb') as file:  \n    pickle.dump(cv, 
file)","metadata":{"execution":{"iopub.status.busy":"2024-05-31T16:37:57.161922Z","iopub.execute_input":"2024-05-31T16:37:57.162258Z","iopub.status.idle":"2024-05-31T16:37:58.112704Z","shell.execute_reply.started":"2024-05-31T16:37:57.162230Z","shell.execute_reply":"2024-05-31T16:37:58.111599Z"},"trusted":true},"execution_count":78,"outputs":[]}]}