File size: 5,255 Bytes
7baf701
 
 
 
 
 
 
 
 
3b4471f
 
7baf701
 
 
 
 
 
 
436c4c1
 
7baf701
436c4c1
7baf701
 
 
436c4c1
7baf701
436c4c1
 
d814758
7baf701
 
d814758
 
3b4471f
d814758
 
 
81f410f
 
436c4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f150bd
436c4c1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import re
from nltk.corpus import stopwords

def find_common_subsequences(sentence, str_list):
    """Find word n-grams (n = 5 down to 1) of *sentence* that appear, as
    whole-word phrases, in every string of *str_list*.

    Both the sentence and the candidate strings are lowercased, stripped of
    punctuation, and filtered of English stop words before matching.

    Returns a list of ``(index, subsequence)`` tuples numbered 1..k in order
    of first appearance in the (cleaned) sentence.
    """
    stop_words = set(stopwords.words('english'))
    sentence = sentence.lower()
    str_list = [s.lower() for s in str_list]

    def is_present(subseq, str_list):
        # Whole-word phrase match so e.g. "art" does not hit inside "party".
        subseq_regex = re.compile(r'\b' + re.escape(subseq) + r'\b')
        return all(subseq_regex.search(s) for s in str_list)

    def remove_stop_words_and_special_chars(sentence):
        sentence = re.sub(r'[^\w\s]', '', sentence)
        words = sentence.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return " ".join(filtered_words)

    def is_covered(subseq):
        # A shorter n-gram is redundant only when it occurs as a whole-word
        # phrase inside an already-kept longer gram. A plain substring test
        # (`subseq in phrase`) wrongly suppressed grams whose text merely
        # appears inside a longer word (e.g. "war" inside "warsaw").
        pattern = re.compile(r'\b' + re.escape(subseq) + r'\b')
        return any(pattern.search(phrase) for phrase in added_phrases)

    cleaned_sentence = remove_stop_words_and_special_chars(sentence)
    cleaned_str_list = [remove_stop_words_and_special_chars(s) for s in str_list]

    words = cleaned_sentence.split()
    common_grams = []
    added_phrases = set()

    for n in range(5, 0, -1):  # Check n-grams from size 5 to 1
        for i in range(len(words) - n + 1):
            subseq = " ".join(words[i:i + n])
            if is_present(subseq, cleaned_str_list) and not is_covered(subseq):
                common_grams.append((i, subseq))
                added_phrases.add(subseq)

    # Sort by the first appearance in the original sentence
    common_grams.sort(key=lambda x: x[0])

    # Assign indices based on the sorted order
    indexed_common_grams = [(index + 1, subseq) for index, (_, subseq) in enumerate(common_grams)]

    return indexed_common_grams

def find_common_gram_positions(str_list, common_grams):
    """Locate each common gram in each sentence of *str_list*.

    For every sentence, produces a list parallel to *common_grams* holding
    the 1-based word position at which the gram starts as a contiguous run
    of words, or -1 when the gram does not occur contiguously.

    Sentences and grams are normalized identically (punctuation stripped,
    lowercased) before comparison.
    """
    positions = []

    for sentence in str_list:
        # Normalize the sentence the same way the grams are normalized.
        words = re.sub(r'[^\w\s]', '', sentence).lower().split()

        sentence_positions = []
        for gram in common_grams:
            cleaned_gram = re.sub(r'[^\w\s]', '', gram).lower()
            gram_words = cleaned_gram.split()
            n = len(gram_words)

            if n == 0:
                # An empty/punctuation-only gram cannot be located.
                # (The previous version crashed on gram_words[0] here.)
                sentence_positions.append(-1)
                continue

            # Scan for the first contiguous occurrence of the full gram.
            # Merely checking that each word appears somewhere (as before)
            # reported wrong positions when words repeat, and false
            # positives when the words are present but scattered.
            start_idx = -1
            for i in range(len(words) - n + 1):
                if words[i:i + n] == gram_words:
                    start_idx = i + 1  # 1-based word position
                    break
            sentence_positions.append(start_idx)

        positions.append(sentence_positions)

    return positions


# # Example usage
# sentence = "Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time since the Holocaust” to be Jewish in the United States."
# str_list = [
#     'During a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump stated that being Jewish in the United States has never been more hazardous since the Holocaust.',
#     'At a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump declared that being Jewish in the United States has never been more hazardous since the Holocaust.',
#     'Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania, and stated that being Jewish in the United States has never been more perilous since the Holocaust.',
#     'Donald Trump made the statement at a campaign rally in Wilkes-Barre, Pennsylvania, saying that being Jewish in the United States has never been more dangerous since the Holocaust.',
#     'Last month, Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania and stated that being Jewish in the United States has never been more hazardous than during World War II.',
#     'In Wilkes-Barre, Pennsylvania, Donald Trump spoke at a campaign rally and claimed that the Holocaust was always more hazardous for Jews in the United States.',
#     'A campaign rally in Wilkes-Barre, Pennsylvania saw Donald Trump declare that being Jewish in the United States has never been more perilous since WWII.',
#     'Speaking at a campaign rally in Wilkes-Barre, Pennsylvania today, Donald Trump declared that being Jewish has never been more hazardous in the United States since the Holocaust.',
#     'During his campaign rally in Wilkes-Barre, Pennsylvania today Donald Trump stated: "There has never been a safer place for being Jewish in the United States since the Holocaust."',
#     'At a campaign rally in Wilkes-Barre, Pennsylvania (pictured), Donald Trump said, "There has never been... gotten worse for being Jewish in America since the Holocaust."'
# ]

# # Find common subsequences
# common_grams = find_common_subsequences(sentence, str_list)
# # Extract the subsequences from the common grams for position checking
# subsequences = [subseq for _, subseq in common_grams]

# # Find positions of the common grams
# common_gram_positions = find_common_gram_positions(str_list, subsequences)


# print(common_grams)