File size: 1,506 Bytes
0492a6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

import difflib

from preprocessing import remove_numbers, remove_stop_words, lemmatize


def difflib_overlap(word_token1: list, word_token2: list) -> float:
    """Return the difflib sequence-matching similarity of two token lists.

    The two lists are compared with ``difflib.SequenceMatcher`` (no junk
    filter) and its ``ratio()`` is scaled from 0..1 to a percentage.

    Args:
        word_token1: First sequence of tokens; elements must be hashable,
            as required by ``SequenceMatcher``.
        word_token2: Second sequence of tokens; same requirement.

    Returns:
        The similarity as a percentage in the range 0-100, rounded to
        three decimal places.
    """
    matcher = difflib.SequenceMatcher(None, word_token1, word_token2)
    similarity = matcher.ratio()

    # ratio() is a fraction in [0, 1]; report it as a percentage.
    return round(similarity * 100, 3)


def calculate_overlap(word_token1: list, word_token2: list) -> float:
    """Return the percentage of tokens in the first list found in the second.

    Every occurrence in ``word_token1`` is counted, so a repeated token
    that appears in ``word_token2`` contributes once per repetition. The
    measure is asymmetric: it is normalised by ``len(word_token1)`` only.

    Args:
        word_token1: Tokens whose coverage is being measured. Elements
            must be hashable.
        word_token2: Tokens to look the first list's elements up in.
            Elements must be hashable.

    Returns:
        The share of ``word_token1`` entries present in ``word_token2``
        as a percentage (0-100), rounded to three decimal places.
        Returns ``0.0`` when ``word_token1`` is empty.
    """
    # Nothing to compare: avoid dividing by zero and report no overlap.
    if not word_token1:
        return 0.0

    # Build the lookup set once so each membership test is O(1).
    reference_words = set(word_token2)
    overlap_count = sum(1 for word in word_token1 if word in reference_words)

    overlap_percentage = overlap_count / len(word_token1) * 100

    return round(overlap_percentage, 3)


def calculate_jaccard(word_tokens1: list, word_tokens2: list) -> float:
    """Return the Jaccard similarity (intersection over union) of two token lists.

    Both inputs are first passed, in order, through ``remove_numbers``,
    ``remove_stop_words`` and ``lemmatize`` from the ``preprocessing``
    module; the score is computed on the distinct tokens that remain.

    Args:
        word_tokens1: First list of word tokens.
        word_tokens2: Second list of word tokens.

    Returns:
        ``len(intersection) / len(union)`` of the two preprocessed token
        sets, rounded to three decimal places. Returns ``0.0`` when no
        tokens remain on either side after preprocessing (empty union).
    """
    list1, list2 = remove_numbers(word_tokens1), remove_numbers(word_tokens2)
    list1, list2 = remove_stop_words(list1), remove_stop_words(list2)
    list1, list2 = lemmatize(list1), lemmatize(list2)

    # Work on distinct tokens so union/intersection are plain set operations.
    tokens1, tokens2 = set(list1), set(list2)
    union = tokens1 | tokens2

    # Both sides empty after preprocessing: avoid dividing by zero.
    if not union:
        return 0.0

    intersection = tokens1 & tokens2

    jaccard_score = len(intersection) / len(union)

    return round(jaccard_score, 3)