Spaces:
Runtime error
Runtime error
import difflib | |
from preprocessing import remove_numbers, remove_stop_words, lemmatize | |
def difflib_overlap(word_token1: list, word_token2: list) -> float: | |
"""Get similarity percentage from matching sequences between two strings""" | |
seq = difflib.SequenceMatcher(a=word_token1, b=word_token2) | |
# Return similarity percentage based on difflib library Sequence Matcher | |
return round(seq.ratio() * 100, 3) | |
def calculate_overlap(word_token1: list, word_token2: list) -> float: | |
"""Get similarity percentage from usage of similar words in two strings""" | |
overlapping_words = [] | |
for word in word_token1: | |
if word in word_token2: | |
overlapping_words.append(word) | |
overlap_percentage = len(overlapping_words) / len(word_token1) * 100 | |
return round(overlap_percentage, 3) | |
def calculate_jaccard(word_tokens1: list, word_tokens2: list) -> float: | |
"""Calculates intersection over union and return Jaccard similarity score""" | |
list1, list2 = remove_numbers(word_tokens1), remove_numbers(word_tokens2) | |
list1, list2 = remove_stop_words(list1), remove_stop_words(list2) | |
list1, list2 = lemmatize(list1), lemmatize(list2) | |
# Combine both tokens to find union | |
both_tokens = list1 + list2 | |
union = set(both_tokens) | |
# Calculate intersection | |
intersection = set() | |
for word in list1: | |
if word in list2: | |
intersection.add(word) | |
jaccard_score = len(intersection) / len(union) | |
return round(jaccard_score, 3) | |