gamingflexer committed on
Commit
0492a6a
·
1 Parent(s): 80ed9e0

Function added research

Browse files
src/plagiarism/preprocessing.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk.corpus import stopwords
2
+ from nltk.stem import WordNetLemmatizer
3
+
4
+
5
def remove_numbers(words_list: list) -> list:
    """Return the entries of ``words_list`` that are not digit-only strings.

    An entry is dropped when ``str.isdigit()`` is true for it, so mixed
    tokens such as ``"b2"`` and the empty string are kept. The relative
    order of the surviving entries is preserved and the input list is not
    modified.
    """
    kept = []
    for token in words_list:
        if token.isdigit():
            # Purely numeric token: skip it.
            continue
        kept.append(token)
    return kept
8
+
9
def remove_stop_words(words_list: list) -> list:
    """Return ``words_list`` without its English stop words.

    Each entry is lower-cased before being compared against NLTK's English
    stop-word list, so the check is case-insensitive. Surviving entries
    keep their original casing and order.
    """
    english_stop_words = set(stopwords.words('english'))
    filtered = []
    for token in words_list:
        if token.lower() in english_stop_words:
            continue
        filtered.append(token)
    return filtered
13
+
14
def lemmatize(words_list: list) -> list:
    """Return a new list holding the WordNet lemma of every entry.

    Each word is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments; the output has the same length and order as ``words_list``.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, words_list))
src/plagiarism/similarity_algos.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import difflib
3
+
4
+ from preprocessing import remove_numbers, remove_stop_words, lemmatize
5
+
6
+
7
def difflib_overlap(word_token1: list, word_token2: list) -> float:
    """Return the difflib similarity of two token sequences as a percentage.

    The score is ``difflib.SequenceMatcher.ratio()`` scaled to 0-100 and
    rounded to three decimals. Because it is based on matching
    subsequences, token order matters.

    Args:
        word_token1: First sequence of (hashable) tokens.
        word_token2: Second sequence of (hashable) tokens.

    Returns:
        Similarity percentage in the range 0.0-100.0. Two empty sequences
        score 100.0, which is what ``SequenceMatcher.ratio()`` reports for
        them.
    """
    # autojunk=False: the default heuristic stops using tokens that occur in
    # more than ~1% of a 200+ item sequence as match anchors. In natural
    # language that hits common words and can drag the score of
    # near-identical documents towards zero.
    matcher = difflib.SequenceMatcher(
        a=word_token1, b=word_token2, autojunk=False
    )

    return round(matcher.ratio() * 100, 3)
14
+
15
+
16
def calculate_overlap(word_token1: list, word_token2: list) -> float:
    """Return the percentage of tokens in the first list found in the second.

    Every occurrence in ``word_token1`` is counted, so repeated words weigh
    in proportionally. The measure is asymmetric: it is relative to the
    length of ``word_token1`` only.

    Args:
        word_token1: Tokens of the text being checked.
        word_token2: Tokens of the text it is compared against.

    Returns:
        Overlap percentage in the range 0.0-100.0, rounded to three
        decimals. An empty ``word_token1`` yields 0.0.
    """
    # Nothing to compare: avoid dividing by zero below.
    if not word_token1:
        return 0.0

    # Build the lookup set once so each membership test is O(1).
    vocabulary = set(word_token2)
    shared_count = sum(1 for word in word_token1 if word in vocabulary)

    return round(shared_count / len(word_token1) * 100, 3)
28
+
29
+
30
def calculate_jaccard(word_tokens1: list, word_tokens2: list) -> float:
    """Return the Jaccard similarity (intersection over union) of two token lists.

    Both lists are first passed through the preprocessing helpers
    ``remove_numbers``, ``remove_stop_words`` and ``lemmatize`` (in that
    order). The score is then computed on the resulting sets of distinct
    tokens.

    Args:
        word_tokens1: Tokens of the first text.
        word_tokens2: Tokens of the second text.

    Returns:
        Jaccard score in the range 0.0-1.0, rounded to three decimals.
        Returns 0.0 when no tokens survive preprocessing on either side.
    """
    tokens1 = set(lemmatize(remove_stop_words(remove_numbers(word_tokens1))))
    tokens2 = set(lemmatize(remove_stop_words(remove_numbers(word_tokens2))))

    union = tokens1 | tokens2

    # Both inputs were empty or held only numbers/stop words: there is
    # nothing to compare, so report no similarity instead of dividing by zero.
    if not union:
        return 0.0

    intersection = tokens1 & tokens2

    return round(len(intersection) / len(union), 3)