{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":8565891,"sourceType":"datasetVersion","datasetId":5120988}],"dockerImageVersionId":30716,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:18.221392Z","iopub.execute_input":"2024-05-31T15:59:18.221694Z","iopub.status.idle":"2024-05-31T15:59:20.313965Z","shell.execute_reply.started":"2024-05-31T15:59:18.221668Z","shell.execute_reply":"2024-05-31T15:59:20.312942Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"df=pd.read_csv(\"/kaggle/input/quora-duplicate-questions-copy/train.csv\")","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:20.316233Z","iopub.execute_input":"2024-05-31T15:59:20.316770Z","iopub.status.idle":"2024-05-31T15:59:22.476112Z","shell.execute_reply.started":"2024-05-31T15:59:20.316735Z","shell.execute_reply":"2024-05-31T15:59:22.475105Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"df.shape","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.477876Z","iopub.execute_input":"2024-05-31T15:59:22.478273Z","iopub.status.idle":"2024-05-31T15:59:22.487296Z","shell.execute_reply.started":"2024-05-31T15:59:22.478236Z","shell.execute_reply":"2024-05-31T15:59:22.486116Z"},"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"(404290, 6)"},"metadata":{}}]},{"cell_type":"code","source":"df.head()","metadata":{"execution":{"iopub.status.busy":"2024-05-31T15:59:22.488561Z","iopub.execute_input":"2024-05-31T15:59:22.488954Z","iopub.status.idle":"2024-05-31T15:59:22.514564Z","shell.execute_reply.started":"2024-05-31T15:59:22.488922Z","shell.execute_reply":"2024-05-31T15:59:22.513359Z"},"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" id qid1 qid2 question1 \\\n0 0 1 2 What is the step by step guide to invest in sh... \n1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n2 2 5 6 How can I increase the speed of my internet co... \n3 3 7 8 Why am I mentally very lonely? How can I solve... \n4 4 9 10 Which one dissolve in water quikly sugar, salt... \n\n question2 is_duplicate \n0 What is the step by step guide to invest in sh... 0 \n1 What would happen if the Indian government sto... 0 \n2 How can Internet speed be increased by hacking... 0 \n3 Find the remainder when [math]23^{24}[/math] i... 0 \n4 Which fish would survive in salt water? 0 ","text/html":"
\n | id | \nqid1 | \nqid2 | \nquestion1 | \nquestion2 | \nis_duplicate | \n
---|---|---|---|---|---|---|
0 | \n0 | \n1 | \n2 | \nWhat is the step by step guide to invest in sh... | \nWhat is the step by step guide to invest in sh... | \n0 | \n
1 | \n1 | \n3 | \n4 | \nWhat is the story of Kohinoor (Koh-i-Noor) Dia... | \nWhat would happen if the Indian government sto... | \n0 | \n
2 | \n2 | \n5 | \n6 | \nHow can I increase the speed of my internet co... | \nHow can Internet speed be increased by hacking... | \n0 | \n
3 | \n3 | \n7 | \n8 | \nWhy am I mentally very lonely? How can I solve... | \nFind the remainder when [math]23^{24}[/math] i... | \n0 | \n
4 | \n4 | \n9 | \n10 | \nWhich one dissolve in water quikly sugar, salt... | \nWhich fish would survive in salt water? | \n0 | \n
\n | question1 | \nquestion2 | \nis_duplicate | \n
---|---|---|---|
0 | \nwhat is the step by step guide to invest in sh... | \nwhat is the step by step guide to invest in sh... | \n0 | \n
1 | \nwhat is the story of kohinoor kohinoor diamond | \nwhat would happen if the indian government sto... | \n0 | \n
2 | \nhow can i increase the speed of my internet co... | \nhow can internet speed be increased by hacking... | \n0 | \n
3 | \nwhy am i mentally very lonely how can i solve it | \nfind the remainder when 2324math is divided by... | \n0 | \n
4 | \nwhich one dissolve in water quikly sugar salt ... | \nwhich fish would survive in salt water | \n0 | \n
... | \n... | \n... | \n... | \n
199996 | \nwhich of these tv shows should i watch next | \nwhat are some thriller shows i should watch next | \n0 | \n
199997 | \nshould i change my name | \nshould i legally change my first name | \n0 | \n
199998 | \nshould i buy the new macbook 2016 or one from ... | \nshould i buy the new macbook pro 2016 or the m... | \n1 | \n
199999 | \nwhat is your review of love 2011 movie | \nwhat is your review of love birds 2011 movie | \n0 | \n
200000 | \ncan pakistan hit indian air craft carrier in a... | \ncan pakistan destroy an indian aircraft carrie... | \n0 | \n
200000 rows × 3 columns
\n\n | question1 | \nquestion2 | \nis_duplicate | \nq1_len | \nq2_len | \nq1_num_words | \nq2_num_words | \nword_common | \nword_total | \n
---|---|---|---|---|---|---|---|---|---|
0 | \nwhat is the step by step guide to invest in sh... | \nwhat is the step by step guide to invest in sh... | \n0 | \n65 | \n56 | \n14 | \n12 | \n11 | \n23 | \n
1 | \nwhat is the story of kohinoor kohinoor diamond | \nwhat would happen if the indian government sto... | \n0 | \n46 | \n83 | \n8 | \n13 | \n4 | \n18 | \n
2 | \nhow can i increase the speed of my internet co... | \nhow can internet speed be increased by hacking... | \n0 | \n72 | \n58 | \n14 | \n10 | \n4 | \n24 | \n
3 | \nwhy am i mentally very lonely how can i solve it | \nfind the remainder when 2324math is divided by... | \n0 | \n48 | \n51 | \n11 | \n9 | \n0 | \n19 | \n
4 | \nwhich one dissolve in water quikly sugar salt ... | \nwhich fish would survive in salt water | \n0 | \n73 | \n38 | \n13 | \n7 | \n4 | \n20 | \n
... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
199996 | \nwhich of these tv shows should i watch next | \nwhat are some thriller shows i should watch next | \n0 | \n43 | \n48 | \n9 | \n9 | \n5 | \n18 | \n
199997 | \nshould i change my name | \nshould i legally change my first name | \n0 | \n23 | \n37 | \n5 | \n7 | \n5 | \n12 | \n
199998 | \nshould i buy the new macbook 2016 or one from ... | \nshould i buy the new macbook pro 2016 or the m... | \n1 | \n50 | \n61 | \n11 | \n13 | \n9 | \n21 | \n
199999 | \nwhat is your review of love 2011 movie | \nwhat is your review of love birds 2011 movie | \n0 | \n38 | \n44 | \n8 | \n9 | \n8 | \n17 | \n
200000 | \ncan pakistan hit indian air craft carrier in a... | \ncan pakistan destroy an indian aircraft carrie... | \n0 | \n146 | \n60 | \n27 | \n10 | \n6 | \n34 | \n
200000 rows × 9 columns
\n\n | question1 | \nquestion2 | \nis_duplicate | \nq1_len | \nq2_len | \nq1_num_words | \nq2_num_words | \nword_common | \nword_total | \nword_share | \n
---|---|---|---|---|---|---|---|---|---|---|
0 | \nwhat is the step by step guide to invest in sh... | \nwhat is the step by step guide to invest in sh... | \n0 | \n65 | \n56 | \n14 | \n12 | \n11 | \n23 | \n0.48 | \n
1 | \nwhat is the story of kohinoor kohinoor diamond | \nwhat would happen if the indian government sto... | \n0 | \n46 | \n83 | \n8 | \n13 | \n4 | \n18 | \n0.22 | \n
2 | \nhow can i increase the speed of my internet co... | \nhow can internet speed be increased by hacking... | \n0 | \n72 | \n58 | \n14 | \n10 | \n4 | \n24 | \n0.17 | \n
3 | \nwhy am i mentally very lonely how can i solve it | \nfind the remainder when 2324math is divided by... | \n0 | \n48 | \n51 | \n11 | \n9 | \n0 | \n19 | \n0.00 | \n
4 | \nwhich one dissolve in water quikly sugar salt ... | \nwhich fish would survive in salt water | \n0 | \n73 | \n38 | \n13 | \n7 | \n4 | \n20 | \n0.20 | \n
... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
199996 | \nwhich of these tv shows should i watch next | \nwhat are some thriller shows i should watch next | \n0 | \n43 | \n48 | \n9 | \n9 | \n5 | \n18 | \n0.28 | \n
199997 | \nshould i change my name | \nshould i legally change my first name | \n0 | \n23 | \n37 | \n5 | \n7 | \n5 | \n12 | \n0.42 | \n
199998 | \nshould i buy the new macbook 2016 or one from ... | \nshould i buy the new macbook pro 2016 or the m... | \n1 | \n50 | \n61 | \n11 | \n13 | \n9 | \n21 | \n0.43 | \n
199999 | \nwhat is your review of love 2011 movie | \nwhat is your review of love birds 2011 movie | \n0 | \n38 | \n44 | \n8 | \n9 | \n8 | \n17 | \n0.47 | \n
200000 | \ncan pakistan hit indian air craft carrier in a... | \ncan pakistan destroy an indian aircraft carrie... | \n0 | \n146 | \n60 | \n27 | \n10 | \n6 | \n34 | \n0.18 | \n
200000 rows × 10 columns
\n\n | question1 | \nquestion2 | \n
---|---|---|
0 | \nwhat is the step by step guide to invest in sh... | \nwhat is the step by step guide to invest in sh... | \n
1 | \nwhat is the story of kohinoor kohinoor diamond | \nwhat would happen if the indian government sto... | \n
2 | \nhow can i increase the speed of my internet co... | \nhow can internet speed be increased by hacking... | \n
3 | \nwhy am i mentally very lonely how can i solve it | \nfind the remainder when 2324math is divided by... | \n
4 | \nwhich one dissolve in water quikly sugar salt ... | \nwhich fish would survive in salt water | \n
... | \n... | \n... | \n
199996 | \nwhich of these tv shows should i watch next | \nwhat are some thriller shows i should watch next | \n
199997 | \nshould i change my name | \nshould i legally change my first name | \n
199998 | \nshould i buy the new macbook 2016 or one from ... | \nshould i buy the new macbook pro 2016 or the m... | \n
199999 | \nwhat is your review of love 2011 movie | \nwhat is your review of love birds 2011 movie | \n
200000 | \ncan pakistan hit indian air craft carrier in a... | \ncan pakistan destroy an indian aircraft carrie... | \n
200000 rows × 2 columns
\n\n | is_duplicate | \nq1_len | \nq2_len | \nq1_num_words | \nq2_num_words | \nword_common | \nword_total | \nword_share | \n
---|---|---|---|---|---|---|---|---|
0 | \n0 | \n65 | \n56 | \n14 | \n12 | \n11 | \n23 | \n0.48 | \n
1 | \n0 | \n46 | \n83 | \n8 | \n13 | \n4 | \n18 | \n0.22 | \n
2 | \n0 | \n72 | \n58 | \n14 | \n10 | \n4 | \n24 | \n0.17 | \n
3 | \n0 | \n48 | \n51 | \n11 | \n9 | \n0 | \n19 | \n0.00 | \n
4 | \n0 | \n73 | \n38 | \n13 | \n7 | \n4 | \n20 | \n0.20 | \n
... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
199996 | \n0 | \n43 | \n48 | \n9 | \n9 | \n5 | \n18 | \n0.28 | \n
199997 | \n0 | \n23 | \n37 | \n5 | \n7 | \n5 | \n12 | \n0.42 | \n
199998 | \n1 | \n50 | \n61 | \n11 | \n13 | \n9 | \n21 | \n0.43 | \n
199999 | \n0 | \n38 | \n44 | \n8 | \n9 | \n8 | \n17 | \n0.47 | \n
200000 | \n0 | \n146 | \n60 | \n27 | \n10 | \n6 | \n34 | \n0.18 | \n
200000 rows × 8 columns
\n\n | is_duplicate | \nq1_len | \nq2_len | \nq1_num_words | \nq2_num_words | \nword_common | \nword_total | \nword_share | \ncwc_min | \ncwc_max | \n... | \n990 | \n991 | \n992 | \n993 | \n994 | \n995 | \n996 | \n997 | \n998 | \n999 | \n
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n0 | \n65 | \n56 | \n14 | \n12 | \n11 | \n23 | \n0.48 | \n0.999980 | \n0.833319 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.000000 | \n0.0 | \n0.0 | \n
1 | \n0 | \n46 | \n83 | \n8 | \n13 | \n4 | \n18 | \n0.22 | \n0.666644 | \n0.249997 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.000000 | \n0.0 | \n0.0 | \n
2 | \n0 | \n72 | \n58 | \n14 | \n10 | \n4 | \n24 | \n0.17 | \n0.399992 | \n0.333328 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.000000 | \n0.0 | \n0.0 | \n
3 | \n0 | \n48 | \n51 | \n11 | \n9 | \n0 | \n19 | \n0.00 | \n0.000000 | \n0.000000 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.000000 | \n0.0 | \n0.0 | \n
4 | \n0 | \n73 | \n38 | \n13 | \n7 | \n4 | \n20 | \n0.20 | \n0.399992 | \n0.199998 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.000000 | \n0.0 | \n0.0 | \n
... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
199996 | \n0 | \n43 | \n48 | \n9 | \n9 | \n5 | \n18 | \n0.28 | \n0.749981 | \n0.749981 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.000000 | \n0.0 | \n0.0 | \n
199997 | \n0 | \n23 | \n37 | \n5 | \n7 | \n5 | \n12 | \n0.42 | \n0.999950 | \n0.499988 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.000000 | \n0.0 | \n0.0 | \n
199998 | \n1 | \n50 | \n61 | \n11 | \n13 | \n9 | \n21 | \n0.43 | \n0.833319 | \n0.833319 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.000000 | \n0.0 | \n0.0 | \n
199999 | \n0 | \n38 | \n44 | \n8 | \n9 | \n8 | \n17 | \n0.47 | \n0.999975 | \n0.799984 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.344384 | \n0.0 | \n0.0 | \n
200000 | \n0 | \n146 | \n60 | \n27 | \n10 | \n6 | \n34 | \n0.18 | \n0.666656 | \n0.222221 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.000000 | \n0.0 | \n0.0 | \n
200000 rows × 2023 columns
\n