import re

from nltk.corpus import stopwords

# NOTE: requires the NLTK stopword corpus, e.g. nltk.download('stopwords')


def find_common_subsequences(sentence, str_list):
    stop_words = set(stopwords.words('english'))
    sentence = sentence.lower()
    str_list = [s.lower() for s in str_list]

    def is_present(subseq, str_list):
        # True only if the subsequence occurs as whole words in every string
        subseq_regex = re.compile(r'\b' + re.escape(subseq) + r'\b')
        return all(subseq_regex.search(s) for s in str_list)

    def remove_stop_words_and_special_chars(sentence):
        sentence = re.sub(r'[^\w\s]', '', sentence)
        words = sentence.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return " ".join(filtered_words)

    cleaned_sentence = remove_stop_words_and_special_chars(sentence)
    cleaned_str_list = [remove_stop_words_and_special_chars(s) for s in str_list]

    words = cleaned_sentence.split()
    common_grams = []
    added_phrases = set()

    for n in range(5, 0, -1):  # Check n-grams from size 5 down to 1
        for i in range(len(words) - n + 1):
            subseq = " ".join(words[i:i + n])
            # Skip n-grams that are already covered by a longer phrase added earlier
            if is_present(subseq, cleaned_str_list) and not any(subseq in phrase for phrase in added_phrases):
                common_grams.append((i, subseq))
                added_phrases.add(subseq)

    # Sort by the first appearance in the cleaned sentence
    common_grams.sort(key=lambda x: x[0])

    # Assign 1-based indices based on the sorted order
    indexed_common_grams = [(index + 1, subseq) for index, (_, subseq) in enumerate(common_grams)]

    return indexed_common_grams
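
# A minimal sanity check for find_common_subsequences (hypothetical toy input, not
# from the example below). Assuming the standard NLTK English stopword list, "the",
# "on", "a", "that" and "my" are dropped, so the full cleaned trigram survives and
# its shorter sub-phrases are skipped:
#
# toy_sentence = "The cat sat on the mat."
# toy_list = ["A cat sat on a mat!", "That cat sat on my mat."]
# print(find_common_subsequences(toy_sentence, toy_list))
# # Expected (given the stopword assumption above): [(1, 'cat sat mat')]
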
def find_common_gram_positions(str_list, common_grams):
    # Initialize a list to hold positions for each sentence
    positions = []

    for sentence in str_list:
        # Number each word in the sentence (punctuation stripped, lowercased;
        # stopwords are NOT removed here, so positions refer to the full sentence)
        words = re.sub(r'[^\w\s]', '', sentence).lower().split()
        word_positions = {word: [] for word in words}
        for idx, word in enumerate(words):
            word_positions[word].append(idx + 1)  # Store 1-based index positions

        # Create a list to store positions of common grams for the current sentence
        sentence_positions = []
        for gram in common_grams:
            # Clean the gram for matching
            cleaned_gram = re.sub(r'[^\w\s]', '', gram).lower()
            gram_words = cleaned_gram.split()

            # Check for the position of the common gram in the current sentence
            if all(word in word_positions for word in gram_words):
                # Use the first occurrence of the gram's first word as the start position
                start_idx = word_positions[gram_words[0]][0]
                sentence_positions.append(start_idx)
            else:
                sentence_positions.append(-1)  # Common gram not found

        # Append the positions for the current sentence to the main positions list
        positions.append(sentence_positions)

    return positions
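
# A minimal sanity check for find_common_gram_positions (hypothetical toy input).
# Positions are 1-based over every word of the raw sentence (stopwords are kept
# here), and each gram is located by the first occurrence of its first word:
#
# toy_sentences = ["The quick brown fox.", "A very quick brown fox."]
# print(find_common_gram_positions(toy_sentences, ["quick brown fox"]))
# # Expected: [[2], [3]]
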
# # Example usage
# sentence = "Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time since the Holocaust” to be Jewish in the United States."
# str_list = [
# 'During a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump stated that being Jewish in the United States has never been more hazardous since the Holocaust.',
# 'At a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump declared that being Jewish in the United States has never been more hazardous since the Holocaust.',
# 'Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania, and stated that being Jewish in the United States has never been more perilous since the Holocaust.',
# 'Donald Trump made the statement at a campaign rally in Wilkes-Barre, Pennsylvania, saying that being Jewish in the United States has never been more dangerous since the Holocaust.',
# 'Last month, Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania and stated that being Jewish in the United States has never been more hazardous than during World War II.',
# 'In Wilkes-Barre, Pennsylvania, Donald Trump spoke at a campaign rally and claimed that the Holocaust was always more hazardous for Jews in the United States.',
# 'A campaign rally in Wilkes-Barre, Pennsylvania saw Donald Trump declare that being Jewish in the United States has never been more perilous since WWII.',
# 'Speaking at a campaign rally in Wilkes-Barre, Pennsylvania today, Donald Trump declared that being Jewish has never been more hazardous in the United States since the Holocaust.',
# 'During his campaign rally in Wilkes-Barre, Pennsylvania today Donald Trump stated: "There has never been a safer place for being Jewish in the United States since the Holocaust."',
# 'At a campaign rally in Wilkes-Barre, Pennsylvania (pictured), Donald Trump said, "There has never been... gotten worse for being Jewish in America since the Holocaust."'
# ]
# # Find common subsequences
# common_grams = find_common_subsequences(sentence, str_list)
# # Extract the subsequences from the common grams for position checking
# subsequences = [subseq for _, subseq in common_grams]
# # Find positions of the common grams
# common_gram_positions = find_common_gram_positions(str_list, subsequences)
# print(common_grams)
# print(common_gram_positions)