Spaces:

asach
/

arxiv-plagiarism-checker-Ilm

Runtime error

gamingflexer

Function added research

0492a6a 12 months ago

1.51 kB


	import difflib

	from preprocessing import remove_numbers, remove_stop_words, lemmatize


	def difflib_overlap(word_token1: list, word_token2: list) -> float:
	    """Return the similarity of two token sequences as a percentage.

	    The score is ``difflib.SequenceMatcher.ratio()`` computed over the two
	    sequences, multiplied by 100 and rounded to three decimal places.
	    """
	    # isjunk=None: no element is treated as junk by a user-supplied predicate.
	    matcher = difflib.SequenceMatcher(None, word_token1, word_token2)
	    similarity = matcher.ratio()

	    return round(similarity * 100, 3)


	def calculate_overlap(word_token1: list, word_token2: list) -> float:
	    """Return the percentage of tokens in ``word_token1`` found in ``word_token2``.

	    Every occurrence in ``word_token1`` is counted, so a repeated word
	    contributes once per repetition. The measure is asymmetric: it is
	    relative to the length of ``word_token1`` only.

	    Tokens must be hashable (e.g. strings), because ``word_token2`` is
	    converted to a set for constant-time membership tests.

	    Returns:
	        The overlap percentage rounded to three decimal places, or ``0.0``
	        when ``word_token1`` is empty (there is nothing to compare, and
	        dividing by its length would raise ``ZeroDivisionError``).
	    """
	    if not word_token1:
	        return 0.0

	    # Build the lookup set once instead of scanning the list per token.
	    vocabulary = set(word_token2)
	    shared_count = sum(1 for word in word_token1 if word in vocabulary)

	    overlap_percentage = shared_count / len(word_token1) * 100

	    return round(overlap_percentage, 3)


	def calculate_jaccard(word_tokens1: list, word_tokens2: list) -> float:
	    """Return the Jaccard similarity (intersection over union) of two token lists.

	    Both inputs are first passed through ``remove_numbers``,
	    ``remove_stop_words`` and ``lemmatize`` (in that order); the score is
	    then computed on the sets of distinct remaining tokens.

	    NOTE(review): assumes the preprocessing helpers return iterables of
	    hashable tokens (e.g. lists of strings) -- confirm against
	    ``preprocessing``.

	    Returns:
	        The Jaccard score rounded to three decimal places, or ``0.0`` when
	        no tokens survive preprocessing on either side (an empty union
	        would otherwise raise ``ZeroDivisionError``).
	    """
	    list1, list2 = remove_numbers(word_tokens1), remove_numbers(word_tokens2)
	    list1, list2 = remove_stop_words(list1), remove_stop_words(list2)
	    list1, list2 = lemmatize(list1), lemmatize(list2)

	    unique1, unique2 = set(list1), set(list2)

	    union = unique1 | unique2
	    if not union:
	        # Nothing left to compare; report no measurable similarity.
	        return 0.0

	    intersection = unique1 & unique2

	    jaccard_score = len(intersection) / len(union)

	    return round(jaccard_score, 3)