gyanbardhan123 committed on
Commit
515506a
1 Parent(s): 292bba5

Upload 4 files

Files changed (4)
  1. bow-00.ipynb +0 -0
  2. packages.txt +1 -0
  3. requirements.txt +8 -0
  4. x.py +363 -0
bow-00.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
packages.txt ADDED
@@ -0,0 +1 @@
+ libgl1
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ numpy
+ pandas
+ streamlit
+ distance
+ nltk
+ scipy==1.12
+ fuzzywuzzy
+ scikit-learn==1.2.2
x.py ADDED
@@ -0,0 +1,363 @@
+ import os
+ import pickle
+ import re
+ import string
+
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+
+
+ def preprocess(q):
+     q = str(q).lower().strip()
+
+     # Replace special symbols with words
+     q = q.replace('%', ' percent ')
+     q = q.replace('@', ' at ')
+     q = q.replace('$', ' dollar ')
+
+     # Remove the '[math]' markup
+     q = q.replace('[math]', '')
+
+     # Normalise large numbers to b/m/k suffixes
+     q = q.replace(',000,000,000 ', 'b ')
+     q = q.replace(',000,000 ', 'm ')
+     q = q.replace(',000 ', 'k ')
+     q = re.sub(r'([0-9]+)000000000', r'\1b', q)
+     q = re.sub(r'([0-9]+)000000', r'\1m', q)
+     q = re.sub(r'([0-9]+)000', r'\1k', q)
+
+     contractions = {
+         "ain't": "am not",
+         "aren't": "are not",
+         "can't": "can not",
+         "can't've": "can not have",
+         "'cause": "because",
+         "could've": "could have",
+         "couldn't": "could not",
+         "couldn't've": "could not have",
+         "didn't": "did not",
+         "doesn't": "does not",
+         "don't": "do not",
+         "hadn't": "had not",
+         "hadn't've": "had not have",
+         "hasn't": "has not",
+         "haven't": "have not",
+         "he'd": "he would",
+         "he'd've": "he would have",
+         "he'll": "he will",
+         "he'll've": "he will have",
+         "he's": "he is",
+         "how'd": "how did",
+         "how'd'y": "how do you",
+         "how'll": "how will",
+         "how's": "how is",
+         "i'd": "i would",
+         "i'd've": "i would have",
+         "i'll": "i will",
+         "i'll've": "i will have",
+         "i'm": "i am",
+         "i've": "i have",
+         "isn't": "is not",
+         "it'd": "it would",
+         "it'd've": "it would have",
+         "it'll": "it will",
+         "it'll've": "it will have",
+         "it's": "it is",
+         "let's": "let us",
+         "ma'am": "madam",
+         "mayn't": "may not",
+         "might've": "might have",
+         "mightn't": "might not",
+         "mightn't've": "might not have",
+         "must've": "must have",
+         "mustn't": "must not",
+         "mustn't've": "must not have",
+         "needn't": "need not",
+         "needn't've": "need not have",
+         "o'clock": "of the clock",
+         "oughtn't": "ought not",
+         "oughtn't've": "ought not have",
+         "shan't": "shall not",
+         "sha'n't": "shall not",
+         "shan't've": "shall not have",
+         "she'd": "she would",
+         "she'd've": "she would have",
+         "she'll": "she will",
+         "she'll've": "she will have",
+         "she's": "she is",
+         "should've": "should have",
+         "shouldn't": "should not",
+         "shouldn't've": "should not have",
+         "so've": "so have",
+         "so's": "so as",
+         "that'd": "that would",
+         "that'd've": "that would have",
+         "that's": "that is",
+         "there'd": "there would",
+         "there'd've": "there would have",
+         "there's": "there is",
+         "they'd": "they would",
+         "they'd've": "they would have",
+         "they'll": "they will",
+         "they'll've": "they will have",
+         "they're": "they are",
+         "they've": "they have",
+         "to've": "to have",
+         "wasn't": "was not",
+         "we'd": "we would",
+         "we'd've": "we would have",
+         "we'll": "we will",
+         "we'll've": "we will have",
+         "we're": "we are",
+         "we've": "we have",
+         "weren't": "were not",
+         "what'll": "what will",
+         "what'll've": "what will have",
+         "what're": "what are",
+         "what's": "what is",
+         "what've": "what have",
+         "when's": "when is",
+         "when've": "when have",
+         "where'd": "where did",
+         "where's": "where is",
+         "where've": "where have",
+         "who'll": "who will",
+         "who'll've": "who will have",
+         "who's": "who is",
+         "who've": "who have",
+         "why's": "why is",
+         "why've": "why have",
+         "will've": "will have",
+         "won't": "will not",
+         "won't've": "will not have",
+         "would've": "would have",
+         "wouldn't": "would not",
+         "wouldn't've": "would not have",
+         "y'all": "you all",
+         "y'all'd": "you all would",
+         "y'all'd've": "you all would have",
+         "y'all're": "you all are",
+         "y'all've": "you all have",
+         "you'd": "you would",
+         "you'd've": "you would have",
+         "you'll": "you will",
+         "you'll've": "you will have",
+         "you're": "you are",
+         "you've": "you have"
+     }
+
+     # Expand contractions word by word
+     q_decontracted = []
+     for word in q.split():
+         if word in contractions:
+             word = contractions[word]
+         q_decontracted.append(word)
+     q = ' '.join(q_decontracted)
+
+     # Catch any contraction suffixes the dict missed
+     q = q.replace("'ve", " have")
+     q = q.replace("n't", " not")
+     q = q.replace("'re", " are")
+     q = q.replace("'ll", " will")
+
+     # Strip HTML tags, then punctuation
+     q = re.sub(re.compile('<.*?>'), '', q)
+     q = q.translate(str.maketrans('', '', string.punctuation))
+
+     return q
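+
+ # Illustrative example (not part of the original file):
+ #   preprocess("I can't pay $5,000 now") -> 'i can not pay dollar 5k now'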
+
+ def common_words(row):
+     # Unique words shared by both questions
+     w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
+     w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))
+     return len(w1 & w2)
+
+
+ def total_words(row):
+     # Total count of unique words across the two questions
+     w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
+     w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))
+     return len(w1) + len(w2)
+
+ import nltk
+
+ nltk.download("stopwords")
+ from nltk.corpus import stopwords
+
+ # Load the stopword list once at module level rather than per row
+ STOP_WORDS = stopwords.words("english")
+
+
+ def fetch_token_features(row):
+     q1 = row['question1']
+     q2 = row['question2']
+
+     # Guards against division by zero in the ratios below
+     SAFE_DIV = 0.0001
+
+     token_features = [0.0] * 8
+
+     # Converting the sentences into tokens
+     q1_tokens = q1.split()
+     q2_tokens = q2.split()
+
+     if len(q1_tokens) == 0 or len(q2_tokens) == 0:
+         return token_features
+
+     # Non-stopwords in each question
+     q1_words = set(word for word in q1_tokens if word not in STOP_WORDS)
+     q2_words = set(word for word in q2_tokens if word not in STOP_WORDS)
+
+     # Stopwords in each question
+     q1_stops = set(word for word in q1_tokens if word in STOP_WORDS)
+     q2_stops = set(word for word in q2_tokens if word in STOP_WORDS)
+
+     # Common non-stopwords, stopwords, and tokens across the pair
+     common_word_count = len(q1_words.intersection(q2_words))
+     common_stop_count = len(q1_stops.intersection(q2_stops))
+     common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
+
+     token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
+     token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
+     token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
+     token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
+     token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
+     token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
+
+     # Whether the last and first words of both questions match
+     token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
+     token_features[7] = int(q1_tokens[0] == q2_tokens[0])
+
+     return token_features
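+
+ # For reference, the feature order matches the column names assigned in
+ # all_prep below: [cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max,
+ # last_word_eq, first_word_eq]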
+
+ import distance
+
+
+ def fetch_length_features(row):
+     q1 = row['question1']
+     q2 = row['question2']
+
+     length_features = [0.0] * 3
+
+     # Converting the sentences into tokens
+     q1_tokens = q1.split()
+     q2_tokens = q2.split()
+
+     if len(q1_tokens) == 0 or len(q2_tokens) == 0:
+         return length_features
+
+     # Absolute difference in token counts
+     length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
+
+     # Average token length of both questions
+     length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2
+
+     # Longest common substring, as a ratio of the shorter question's length
+     strs = list(distance.lcsubstrings(q1, q2))
+     if strs:
+         length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)
+     else:
+         length_features[2] = 0.0
+
+     return length_features
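+
+ # Illustrative: distance.lcsubstrings returns the set of longest common
+ # substrings, e.g. lcsubstrings("what is ml", "what is ai") -> {'what is '}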
+
+ # Fuzzy features
+ from fuzzywuzzy import fuzz
+
+
+ def fetch_fuzzy_features(row):
+     q1 = row['question1']
+     q2 = row['question2']
+
+     fuzzy_features = [0.0] * 4
+
+     # fuzz_ratio
+     fuzzy_features[0] = fuzz.QRatio(q1, q2)
+
+     # fuzz_partial_ratio
+     fuzzy_features[1] = fuzz.partial_ratio(q1, q2)
+
+     # token_sort_ratio
+     fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)
+
+     # token_set_ratio
+     fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)
+
+     return fuzzy_features
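+
+ # Illustrative: fuzz.token_sort_ratio("how do i learn python",
+ #     "python how do i learn") -> 100, since token order is ignored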
+
+ def all_prep(df):
+     # Basic length features
+     df['q1_len'] = df['question1'].str.len()
+     df['q2_len'] = df['question2'].str.len()
+
+     df['q1_num_words'] = df['question1'].apply(lambda row: len(row.split(" ")))
+     df['q2_num_words'] = df['question2'].apply(lambda row: len(row.split(" ")))
+
+     df['word_common'] = df.apply(common_words, axis=1)
+     df['word_total'] = df.apply(total_words, axis=1)
+     df['word_share'] = round(df['word_common'] / df['word_total'], 2)
+
+     # Token-based features
+     token_features = df.apply(fetch_token_features, axis=1)
+     df["cwc_min"] = list(map(lambda x: x[0], token_features))
+     df["cwc_max"] = list(map(lambda x: x[1], token_features))
+     df["csc_min"] = list(map(lambda x: x[2], token_features))
+     df["csc_max"] = list(map(lambda x: x[3], token_features))
+     df["ctc_min"] = list(map(lambda x: x[4], token_features))
+     df["ctc_max"] = list(map(lambda x: x[5], token_features))
+     df["last_word_eq"] = list(map(lambda x: x[6], token_features))
+     df["first_word_eq"] = list(map(lambda x: x[7], token_features))
+
+     # Length-based features
+     length_features = df.apply(fetch_length_features, axis=1)
+     df['abs_len_diff'] = list(map(lambda x: x[0], length_features))
+     df['mean_len'] = list(map(lambda x: x[1], length_features))
+     df['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))
+
+     # Fuzzy-matching features
+     fuzzy_features = df.apply(fetch_fuzzy_features, axis=1)
+     df['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))
+     df['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))
+     df['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))
+     df['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))
+
+     ndf2 = df.drop(columns=['question1', 'question2'])
+
+     # Bag-of-words vectors from the pickled vectorizer (presumably a fitted
+     # CountVectorizer produced by bow-00.ipynb)
+     with open("BOW.pkl", 'rb') as file:
+         cv = pickle.load(file)
+
+     questions = list(df['question1']) + list(df['question2'])
+     q1_arr, q2_arr = np.vsplit(cv.transform(questions).toarray(), 2)
+     temp_df = pd.concat([pd.DataFrame(q1_arr, index=ndf2.index),
+                          pd.DataFrame(q2_arr, index=ndf2.index)], axis=1)
+     temp_df = pd.concat([ndf2, temp_df], axis=1)
+     temp_df.columns = temp_df.columns.astype(str)
+
+     return temp_df
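+
+ # Illustrative: for a single question pair, all_prep returns one row with
+ # 22 handcrafted features plus the two bag-of-words blocks from BOW.pkl.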
+
+
+ def clear_text():
+     st.session_state["text1"] = ""
+     st.session_state["text2"] = ""
+
+
+ def main():
+     st.title('Duplicate Question')
+
+     q1 = st.text_input("Enter Question1", key="text1")
+     q2 = st.text_input("Enter Question2", key="text2")
+
+     # Build a one-row frame of the preprocessed question pair
+     df = pd.DataFrame({'question1': [preprocess(q1)],
+                        'question2': [preprocess(q2)]})
+
+     # Load the pickled classifier (RF.pkl; the name suggests a random forest)
+     with open("RF.pkl", 'rb') as file:
+         rf = pickle.load(file)
+
+     if st.button('Find'):
+         z = rf.predict(all_prep(df))[0]
+         if z == 1:
+             st.success("Duplicate")
+         else:
+             st.success("Not Duplicate")
+     st.button("Clear", on_click=clear_text)
+
+
+ if __name__ == '__main__':
+     main()
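
To try the app locally, a minimal sketch (assuming the pickled BOW.pkl vectorizer and RF.pkl model, presumably produced by bow-00.ipynb, sit next to x.py):

    pip install -r requirements.txt
    streamlit run x.py

packages.txt lists the system package (libgl1) that a Hugging Face Space installs via apt before starting the app.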