Upload folder using huggingface_hub
- .gitignore +2 -1
- .gradio/certificate.pem +31 -0
- app.py +88 -10
- detectability.py +343 -0
- distortion.py +385 -0
- euclidean_distance.py +261 -0
- gpt_mask_filling.py +70 -0
- highlighter.py +18 -0
- lcs.py +64 -11
- masking_methods.py +146 -94
- paraphraser.py +83 -31
- requirements.txt +3 -1
- sampling_methods.py +22 -35
- threeD_plot.py +137 -0
- tree.py +0 -338
- vocabulary_split.py +57 -0
- watermark_detector.py +75 -0
.gitignore
CHANGED
@@ -1 +1,2 @@
-
+.env
+__pycache__/
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
app.py
CHANGED
@@ -1,19 +1,22 @@
 import nltk
 nltk.download('stopwords')
-from transformers import AutoTokenizer
-from transformers import AutoModelForSeq2SeqLM
+# from transformers import AutoTokenizer
+# from transformers import AutoModelForSeq2SeqLM
 import plotly.graph_objs as go
 from transformers import pipeline
-from matplotlib.colors import ListedColormap, rgb2hex
 import random
 import gradio as gr
 from tree import generate_subplot1, generate_subplot2
 from paraphraser import generate_paraphrase
-from lcs import find_common_subsequences
-from highlighter import highlight_common_words, highlight_common_words_dict
+from lcs import find_common_subsequences, find_common_gram_positions
+from highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html
 from entailment import analyze_entailment
 from masking_methods import mask_non_stopword, mask_non_stopword_pseudorandom, high_entropy_words
 from sampling_methods import sample_word
+from detectability import SentenceDetectabilityCalculator
+from distortion import SentenceDistortionCalculator
+from euclidean_distance import SentenceEuclideanDistanceCalculator
+from threeD_plot import gen_three_D_plot
 
 
 # Function for the Gradio interface
@@ -21,8 +24,10 @@ def model(prompt):
     user_prompt = prompt
     paraphrased_sentences = generate_paraphrase(user_prompt)
     analyzed_paraphrased_sentences, selected_sentences, discarded_sentences = analyze_entailment(user_prompt, paraphrased_sentences, 0.7)
-
+    print(analyze_entailment(user_prompt, paraphrased_sentences, 0.7))
     common_grams = find_common_subsequences(user_prompt, selected_sentences)
+    subsequences = [subseq for _, subseq in common_grams]
+    common_grams_position = find_common_gram_positions(selected_sentences, subsequences)
 
     masked_sentences = []
     masked_words = []
@@ -51,7 +56,8 @@ def model(prompt):
         sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='temperature', temperature=1.0))
         sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='greedy', temperature=1.0))
 
-
+
+
 
     colors = ["red", "blue", "brown", "green"]
 
@@ -83,7 +89,60 @@ def model(prompt):
         masked_index += 3
         sampled_index += 12
 
-    return [highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + trees1 + trees2
+    reparaphrased_sentences = generate_paraphrase(sampled_sentences)
+
+    len_reparaphrased_sentences = len(reparaphrased_sentences)
+
+    reparaphrased_sentences_list = []
+
+    # Process the sentences in batches of 10
+    for i in range(0, len_reparaphrased_sentences, 10):
+        # Get the current batch of 10 sentences
+        batch = reparaphrased_sentences[i:i + 10]
+
+        # Check if the batch has exactly 10 sentences
+        if len(batch) == 10:
+            # Build the HTML block for this batch and store it
+            html_block = reparaphrased_sentences_html(batch)
+            reparaphrased_sentences_list.append(html_block)
+
+    distortion_list = []
+    detectability_list = []
+    euclidean_dist_list = []
+
+    distortion_calculator = SentenceDistortionCalculator(user_prompt, reparaphrased_sentences)
+    distortion_calculator.calculate_all_metrics()
+    distortion_calculator.normalize_metrics()
+    distortion_calculator.calculate_combined_distortion()
+
+    distortion = distortion_calculator.get_combined_distortions()
+
+    for each in distortion.items():
+        distortion_list.append(each[1])
+
+    detectability_calculator = SentenceDetectabilityCalculator(user_prompt, reparaphrased_sentences)
+    detectability_calculator.calculate_all_metrics()
+    detectability_calculator.normalize_metrics()
+    detectability_calculator.calculate_combined_detectability()
+
+    detectability = detectability_calculator.get_combined_detectabilities()
+
+    for each in detectability.items():
+        detectability_list.append(each[1])
+
+    euclidean_dist_calculator = SentenceEuclideanDistanceCalculator(user_prompt, reparaphrased_sentences)
+    euclidean_dist_calculator.calculate_all_metrics()
+    euclidean_dist_calculator.normalize_metrics()
+
+    # Read the euclidean calculator's own normalized metrics (the original reused the detectability getter here by mistake)
+    euclidean_dist = euclidean_dist_calculator.get_normalized_metrics()
+
+    for each in euclidean_dist.items():
+        euclidean_dist_list.append(each[1])
+
+    three_D_plot = gen_three_D_plot(detectability_list, distortion_list, euclidean_dist_list)
+
+    return [highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + trees1 + trees2 + reparaphrased_sentences_list + [three_D_plot]
 
 
 with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
@@ -127,8 +186,27 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
                     tree2 = gr.Plot()
                     tree2_tabs.append(tree2)
 
-    submit_button.click(model, inputs=user_input, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs)
+    # Adding the "Re-paraphrased Sentences" section
+    with gr.Row():
+        gr.Markdown("### Re-paraphrased Sentences")  # Label for re-paraphrased sentences
+
+    # Adding tabs for the re-paraphrased sentences
+    with gr.Row():
+        with gr.Tabs():
+            reparaphrased_sentences_tabs = []
+            for i in range(120):  # 120 tabs for 120 batches of sentences
+                with gr.TabItem(f"Sentence {i+1}"):
+                    reparaphrased_sent_html = gr.HTML()  # Placeholder for each batch
+                    reparaphrased_sentences_tabs.append(reparaphrased_sent_html)
+
+    with gr.Row():
+        gr.Markdown("### 3D Plot for Sweet Spot")
+    with gr.Row():
+        three_D_plot = gr.Plot()
+
+
+    submit_button.click(model, inputs=user_input, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs + reparaphrased_sentences_tabs + [three_D_plot])
     clear_button.click(lambda: "", inputs=None, outputs=user_input)
-    clear_button.click(lambda: "", inputs=None, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs)
+    clear_button.click(lambda: "", inputs=None, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs + reparaphrased_sentences_tabs + [three_D_plot])
 
 demo.launch(share=True)
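One behavioral note on the new batching code in `model()`: the `if len(batch) == 10` guard silently drops a trailing batch of fewer than ten re-paraphrased sentences, and the interface pre-allocates exactly 120 `gr.HTML` tabs, so the returned list only lines up with the `outputs` list when the pipeline yields full batches for all 120 tabs. A minimal sketch of a padding variant (hypothetical helper; `render` stands in for `reparaphrased_sentences_html`):

```python
# Sketch: batch sentences for a fixed tab grid without dropping short batches.
# NUM_TABS and BATCH_SIZE mirror the hard-coded 120 and 10 in app.py.
NUM_TABS = 120
BATCH_SIZE = 10

def batch_html_blocks(sentences, render):
    """Render sentences in BATCH_SIZE chunks, then pad with empty strings
    so the result always matches the NUM_TABS Gradio output components."""
    blocks = [
        render(sentences[i:i + BATCH_SIZE])
        for i in range(0, len(sentences), BATCH_SIZE)
    ]
    return (blocks + [""] * NUM_TABS)[:NUM_TABS]
```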
detectability.py
ADDED
@@ -0,0 +1,343 @@
+# Import necessary libraries
+import nltk
+
+import numpy as np
+import torch
+import matplotlib.pyplot as plt
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import BertModel, BertTokenizer
+from sentence_transformers import SentenceTransformer
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
+
+# Download NLTK data if not already present
+nltk.download('punkt', quiet=True)
+detectability_val={}
+class SentenceDetectabilityCalculator:
+    """
+    A class to calculate and analyze detectability metrics between an original sentence and paraphrased sentences.
+    """
+
+    def __init__(self, original_sentence, paraphrased_sentences):
+        """
+        Initialize the calculator with the original sentence and a list of paraphrased sentences.
+        """
+        self.original_sentence = original_sentence
+        self.paraphrased_sentences = paraphrased_sentences
+
+        # Raw metric dictionaries
+        self.bleu_scores = {}
+        self.cosine_similarities = {}
+        self.sts_scores = {}
+
+        # Normalized metric dictionaries
+        self.normalized_bleu = {}
+        self.normalized_cosine = {}
+        self.normalized_sts = {}
+
+        # Combined detectability dictionary
+        self.combined_detectabilities = {}
+
+        # Load pre-trained BERT and SentenceTransformer for Cosine Similarity and STS Score
+        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
+        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        self.sts_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+
+    def calculate_all_metrics(self):
+        """
+        Calculate all detectability metrics for each paraphrased sentence.
+        """
+        original_embedding = self._get_sentence_embedding(self.original_sentence)
+        sts_original_embedding = self.sts_model.encode(self.original_sentence)
+
+        for idx, paraphrased_sentence in enumerate(self.paraphrased_sentences):
+            key = f"Sentence_{idx+1}"
+
+            # BLEU Score
+            self.bleu_scores[key] = self._calculate_bleu(self.original_sentence, paraphrased_sentence)
+
+            # Cosine Similarity
+            paraphrase_embedding = self._get_sentence_embedding(paraphrased_sentence)
+            self.cosine_similarities[key] = cosine_similarity([original_embedding], [paraphrase_embedding])[0][0]
+
+            # STS Score
+            sts_paraphrase_embedding = self.sts_model.encode(paraphrased_sentence)
+            self.sts_scores[key] = cosine_similarity([sts_original_embedding], [sts_paraphrase_embedding])[0][0]
+
+    def normalize_metrics(self):
+        """
+        Normalize all metrics to be between 0 and 1.
+        """
+        self.normalized_bleu = self._normalize_dict(self.bleu_scores)
+        self.normalized_cosine = self._normalize_dict(self.cosine_similarities)
+        self.normalized_sts = self._normalize_dict(self.sts_scores)
+
+    def calculate_combined_detectability(self):
+        """
+        Calculate the combined detectability using the root mean square of the normalized metrics.
+        """
+        for key in self.normalized_bleu.keys():
+            rms = np.sqrt(
+                (
+                    self.normalized_bleu[key] ** 2 +
+                    self.normalized_cosine[key] ** 2 +
+                    self.normalized_sts[key] ** 2
+                ) / 3
+            )
+            self.combined_detectabilities[key] = rms
+
+    def plot_metrics(self):
+        """
+        Plot each normalized metric and the combined detectability in separate graphs.
+        """
+        keys = list(self.normalized_bleu.keys())
+        indices = np.arange(len(keys))
+
+        # Prepare data for plotting
+        metrics = {
+            'BLEU Score': [self.normalized_bleu[key] for key in keys],
+            'Cosine Similarity': [self.normalized_cosine[key] for key in keys],
+            'STS Score': [self.normalized_sts[key] for key in keys],
+            'Combined Detectability': [self.combined_detectabilities[key] for key in keys]
+        }
+
+        # Plot each metric separately
+        for metric_name, values in metrics.items():
+            plt.figure(figsize=(12, 6))
+            plt.plot(indices, values, marker='o', color=np.random.rand(3,))
+            plt.xlabel('Sentence Index')
+            plt.ylabel('Normalized Value (0-1)')
+            plt.title(f'Normalized {metric_name}')
+            plt.grid(True)
+            plt.tight_layout()
+            plt.show()
+
+    # Private methods for metric calculations
+    def _calculate_bleu(self, reference, candidate):
+        """
+        Calculate the BLEU score between the original and paraphrased sentence using smoothing.
+        """
+        reference_tokens = nltk.word_tokenize(reference)
+        candidate_tokens = nltk.word_tokenize(candidate)
+        smoothing = SmoothingFunction().method1
+        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)
+
+    def _get_sentence_embedding(self, sentence):
+        """
+        Get sentence embedding using BERT.
+        """
+        tokens = self.bert_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
+        with torch.no_grad():
+            outputs = self.bert_model(**tokens)
+        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+
+    def _normalize_dict(self, metric_dict):
+        """
+        Normalize the values in a dictionary to be between 0 and 1.
+        """
+        values = np.array(list(metric_dict.values()))
+        min_val = values.min()
+        max_val = values.max()
+        # Avoid division by zero if all values are the same
+        if max_val - min_val == 0:
+            normalized_values = np.zeros_like(values)
+        else:
+            normalized_values = (values - min_val) / (max_val - min_val)
+        return dict(zip(metric_dict.keys(), normalized_values))
+
+    # Getter methods
+    def get_normalized_metrics(self):
+        """
+        Get all normalized metrics as a dictionary.
+        """
+        return {
+            'BLEU Score': self.normalized_bleu,
+            'Cosine Similarity': self.normalized_cosine,
+            'STS Score': self.normalized_sts
+        }
+
+    def get_combined_detectabilities(self):
+        """
+        Get the dictionary of combined detectability values.
+        """
+        return self.combined_detectabilities
+
+
+# Example usage
+if __name__ == "__main__":
+    # Original sentence
+    original_sentence = "The quick brown fox jumps over the lazy dog"
+
+    # Paraphrased sentences
+    paraphrased_sentences = [
+        # Original 1: "A swift auburn fox leaps across a sleepy canine."
+        "The swift auburn fox leaps across a sleepy canine.",
+        "A quick auburn fox leaps across a sleepy canine.",
+        "A swift ginger fox leaps across a sleepy canine.",
+        "A swift auburn fox bounds across a sleepy canine.",
+        "A swift auburn fox leaps across a tired canine.",
+        "Three swift auburn foxes leap across a sleepy canine.",
+        "The vulpine specimen rapidly traverses over a dormant dog.",
+        "Like lightning, the russet hunter soars over the drowsy guardian.",
+        "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
+        "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
+        "A swift auburn predator navigates across a lethargic pet.",
+        "Subject A (fox) demonstrates velocity over Subject B (dog).",
+
+        # Original 2: "The agile russet fox bounds over an idle hound."
+        "Some agile russet foxes bound over an idle hound.",
+        "The nimble russet fox bounds over an idle hound.",
+        "The agile brown fox bounds over an idle hound.",
+        "The agile russet fox jumps over an idle hound.",
+        "The agile russet fox bounds over a lazy hound.",
+        "Two agile russet foxes bound over an idle hound.",
+        "A dexterous vulpine surpasses a stationary canine.",
+        "Quick as thought, the copper warrior sails over the guardian.",
+        "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
+        "A dexterous V. vulpes exceeds the plane of an inactive canine.",
+        "An agile russet hunter maneuvers above a resting hound.",
+        "Test subject F-1 achieves displacement superior to subject D-1.",
+
+        # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
+        "The nimble mahogany vulpine vaults above a drowsy dog.",
+        "A swift mahogany vulpine vaults above a drowsy dog.",
+        "A nimble reddish vulpine vaults above a drowsy dog.",
+        "A nimble mahogany fox vaults above a drowsy dog.",
+        "A nimble mahogany vulpine leaps above a drowsy dog.",
+        "Four nimble mahogany vulpines vault above a drowsy dog.",
+        "An agile specimen of reddish fur surpasses a somnolent canine.",
+        "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
+        "Tha quick brown beastie jumps o'er the tired pup, aye.",
+        "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
+        "A nimble rust-colored predator crosses above a drowsy pet.",
+        "Observed: Subject Red executes vertical motion over Subject Gray.",
+
+        # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
+        "A speedy copper-colored fox hops over the lethargic pup.",
+        "The quick copper-colored fox hops over the lethargic pup.",
+        "The speedy bronze fox hops over the lethargic pup.",
+        "The speedy copper-colored fox jumps over the lethargic pup.",
+        "The speedy copper-colored fox hops over the tired pup.",
+        "Multiple speedy copper-colored foxes hop over the lethargic pup.",
+        "A rapid vulpine of bronze hue traverses an inactive young canine.",
+        "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
+        "Tha fast copper beastie leaps o'er the sleepy wee dog.",
+        "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
+        "A fleet copper-toned predator moves past a sluggish young dog.",
+        "Field note: Adult fox subject exceeds puppy subject vertically.",
+
+        # Original 5: "A rapid tawny fox springs over a sluggish dog."
+        "The rapid tawny fox springs over a sluggish dog.",
+        "A quick tawny fox springs over a sluggish dog.",
+        "A rapid golden fox springs over a sluggish dog.",
+        "A rapid tawny fox jumps over a sluggish dog.",
+        "A rapid tawny fox springs over a lazy dog.",
+        "Six rapid tawny foxes spring over a sluggish dog.",
+        "An expeditious yellowish vulpine surpasses a torpid canine.",
+        "Fast as a bullet, the golden hunter vaults over the idle guard.",
+        "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
+        "One V. vulpes displays rapid transit over one inactive C. familiaris.",
+        "A speedy yellow-brown predator bypasses a motionless dog.",
+        "Log entry: Vulpine subject achieves swift vertical displacement.",
+
+        # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
+        "A fleet-footed chestnut fox soars above an indolent canine.",
+        "The swift chestnut fox soars above an indolent canine.",
+        "The fleet-footed brown fox soars above an indolent canine.",
+        "The fleet-footed chestnut fox leaps above an indolent canine.",
+        "The fleet-footed chestnut fox soars above a lazy canine.",
+        "Several fleet-footed chestnut foxes soar above an indolent canine.",
+        "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
+        "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
+        "Tha quick brown beastie sails o'er the sleepy hound, ken.",
+        "Single agile V. vulpes achieves elevation above stationary canine.",
+        "A nimble brown predator glides over an unmoving domestic animal.",
+        "Research note: Brown subject displays superior vertical mobility.",
+
+        # Original 7: "A fast ginger fox hurdles past a slothful dog."
+        "The fast ginger fox hurdles past a slothful dog.",
+        "A quick ginger fox hurdles past a slothful dog.",
+        "A fast red fox hurdles past a slothful dog.",
+        "A fast ginger fox jumps past a slothful dog.",
+        "A fast ginger fox hurdles past a lazy dog.",
+        "Five fast ginger foxes hurdle past a slothful dog.",
+        "A rapid orange vulpine bypasses a lethargic canine.",
+        "Quick as lightning, the flame-colored hunter races past the lazy guard.",
+        "Tha swift ginger beastie leaps past the tired doggy, ye see.",
+        "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
+        "A speedy red-orange predator overtakes a motionless dog.",
+        "Data point: Orange subject demonstrates rapid transit past Gray subject.",
+
+        # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
+        "A spry rusty-colored fox jumps across a dozing hound.",
+        "The agile rusty-colored fox jumps across a dozing hound.",
+        "The spry reddish fox jumps across a dozing hound.",
+        "The spry rusty-colored fox leaps across a dozing hound.",
+        "The spry rusty-colored fox jumps across a sleeping hound.",
+        "Multiple spry rusty-colored foxes jump across a dozing hound.",
+        "An agile rust-toned vulpine traverses a somnolent canine.",
+        "Nimble as thought, the copper hunter bounds over the resting guard.",
+        "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
+        "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
+        "A lithe rust-tinted predator moves past a slumbering dog.",
+        "Observation: Russet subject exhibits agility over dormant subject.",
+
+        # Original 9: "A quick tan fox leaps over an inactive dog."
+        "The quick tan fox leaps over an inactive dog.",
+        "A swift tan fox leaps over an inactive dog.",
+        "A quick beige fox leaps over an inactive dog.",
+        "A quick tan fox jumps over an inactive dog.",
+        "A quick tan fox leaps over a motionless dog.",
+        "Seven quick tan foxes leap over an inactive dog.",
+        "A rapid light-brown vulpine surpasses a stationary canine.",
+        "Fast as wind, the sand-colored hunter soars over the still guard.",
+        "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
+        "One agile fawn V. vulpes traverses one immobile C. familiaris.",
+        "A fleet tan-colored predator bypasses an unmoving dog.",
+        "Field report: Tan subject demonstrates movement over static subject.",
+
+        # Original 10: "The brisk auburn vulpine bounces over a listless canine."
+        "Some brisk auburn vulpines bounce over a listless canine.",
+        "The quick auburn vulpine bounces over a listless canine.",
+        "The brisk russet vulpine bounces over a listless canine.",
+        "The brisk auburn fox bounces over a listless canine.",
+        "The brisk auburn vulpine jumps over a listless canine.",
+        "Five brisk auburn vulpines bounce over a listless canine.",
+        "The expeditious specimen supersedes a quiescent Canis lupus.",
+        "Swift as wind, the russet hunter vaults over the idle guardian.",
+        "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
+        "One V. vulpes achieves displacement over inactive C. familiaris.",
+        "A high-velocity auburn predator traverses an immobile animal.",
+        "Final observation: Red subject shows mobility over Gray subject."
+    ]
+
+
+    # Initialize the calculator
+    calculator = SentenceDetectabilityCalculator(original_sentence, paraphrased_sentences)
+
+    # Calculate all metrics
+    calculator.calculate_all_metrics()
+
+    # Normalize the metrics
+    calculator.normalize_metrics()
+
+    # Calculate combined detectability
+    calculator.calculate_combined_detectability()
+
+    # Retrieve the normalized metrics and combined detectabilities
+    normalized_metrics = calculator.get_normalized_metrics()
+    combined_detectabilities = calculator.get_combined_detectabilities()
+    detectability_val=combined_detectabilities
+
+    # Display the results
+    # print("Normalized Metrics:")
+    # for metric_name, metric_dict in normalized_metrics.items():
+    #     print(f"\n{metric_name}:")
+    #     for key, value in metric_dict.items():
+    #         print(f"{key}: {value:.4f}")
+
+    print("\nCombined Detectabilities:")
+    for each in combined_detectabilities.items():
+        print(f"{each[1]}")
+
+    # Plot the metrics
+    # calculator.plot_metrics()
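The combined score above is the root mean square of three min-max-normalized similarity metrics; because `_normalize_dict` rescales within the current batch, combined detectabilities are comparable across sentences of one run but not across runs. A toy check of the same combination, runnable without any model downloads (the tuples are stand-ins for the class's normalized BLEU/cosine/STS values):

```python
import numpy as np

# Stand-in normalized (BLEU, cosine, STS) triples for three sentences.
normalized = {
    "Sentence_1": (0.0, 0.2, 0.1),
    "Sentence_2": (0.5, 0.6, 0.4),
    "Sentence_3": (1.0, 1.0, 1.0),
}

# Same RMS combination as calculate_combined_detectability().
combined = {k: np.sqrt((b**2 + c**2 + s**2) / 3) for k, (b, c, s) in normalized.items()}
print(combined)  # Sentence_3 -> 1.0: most similar to the original, hence most "detectable"
```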
distortion.py
ADDED
@@ -0,0 +1,385 @@
+# Import necessary libraries
+import nltk
+import numpy as np
+import torch
+import matplotlib.pyplot as plt
+from scipy.special import rel_entr
+from collections import Counter
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+distortion_val={}
+# Download NLTK data if not already present
+nltk.download('punkt', quiet=True)
+
+class SentenceDistortionCalculator:
+    """
+    A class to calculate and analyze distortion metrics between an original sentence and modified sentences.
+    """
+
+    def __init__(self, original_sentence, modified_sentences):
+        """
+        Initialize the calculator with the original sentence and a list of modified sentences.
+        """
+        self.original_sentence = original_sentence
+        self.modified_sentences = modified_sentences
+
+        # Raw metric dictionaries
+        self.levenshtein_distances = {}
+        self.word_level_changes = {}
+        self.kl_divergences = {}
+        self.perplexities = {}
+
+        # Normalized metric dictionaries
+        self.normalized_levenshtein = {}
+        self.normalized_word_changes = {}
+        self.normalized_kl_divergences = {}
+        self.normalized_perplexities = {}
+
+        # Combined distortion dictionary
+        self.combined_distortions = {}
+
+        # Initialize GPT-2 model and tokenizer for perplexity calculation
+        self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+        self.model = GPT2LMHeadModel.from_pretrained("gpt2")
+        self.model.eval()  # Set model to evaluation mode
+
+    def calculate_all_metrics(self):
+        """
+        Calculate all distortion metrics for each modified sentence.
+        """
+        for idx, modified_sentence in enumerate(self.modified_sentences):
+            key = f"Sentence_{idx+1}"
+            self.levenshtein_distances[key] = self._calculate_levenshtein_distance(modified_sentence)
+            self.word_level_changes[key] = self._calculate_word_level_change(modified_sentence)
+            self.kl_divergences[key] = self._calculate_kl_divergence(modified_sentence)
+            self.perplexities[key] = self._calculate_perplexity(modified_sentence)
+
+    def normalize_metrics(self):
+        """
+        Normalize all metrics to be between 0 and 1.
+        """
+        self.normalized_levenshtein = self._normalize_dict(self.levenshtein_distances)
+        self.normalized_word_changes = self._normalize_dict(self.word_level_changes)
+        self.normalized_kl_divergences = self._normalize_dict(self.kl_divergences)
+        self.normalized_perplexities = self._normalize_dict(self.perplexities)
+
+    def calculate_combined_distortion(self):
+        """
+        Calculate the combined distortion using the root mean square of the normalized metrics.
+        """
+        for key in self.normalized_levenshtein.keys():
+            rms = np.sqrt(
+                (
+                    self.normalized_levenshtein[key] ** 2 +
+                    self.normalized_word_changes[key] ** 2 +
+                    self.normalized_kl_divergences[key] ** 2 +
+                    self.normalized_perplexities[key] ** 2
+                ) / 4
+            )
+            self.combined_distortions[key] = rms
+
+    def plot_metrics(self):
+        """
+        Plot each normalized metric and the combined distortion in separate graphs.
+        """
+        import matplotlib.pyplot as plt
+
+        keys = list(self.normalized_levenshtein.keys())
+        indices = np.arange(len(keys))
+
+        # Prepare data for plotting
+        metrics = {
+            'Levenshtein Distance': [self.normalized_levenshtein[key] for key in keys],
+            'Word-Level Changes': [self.normalized_word_changes[key] for key in keys],
+            'KL Divergence': [self.normalized_kl_divergences[key] for key in keys],
+            'Perplexity': [self.normalized_perplexities[key] for key in keys],
+            'Combined Distortion': [self.combined_distortions[key] for key in keys]
+        }
+
+        # Plot each metric separately
+        for metric_name, values in metrics.items():
+            plt.figure(figsize=(12, 6))
+            plt.plot(indices, values, marker='o', color=np.random.rand(3,))
+            plt.xlabel('Sentence Index')
+            plt.ylabel('Normalized Value (0-1)')
+            plt.title(f'Normalized {metric_name}')
+            plt.grid(True)
+            plt.tight_layout()
+            plt.show()
+
+    # Private methods for metric calculations
+    def _calculate_levenshtein_distance(self, modified_sentence):
+        """
+        Calculate the Levenshtein Distance between the original and modified sentence.
+        """
+        return nltk.edit_distance(self.original_sentence, modified_sentence)
+
+    def _calculate_word_level_change(self, modified_sentence):
+        """
+        Calculate the proportion of word-level changes between the original and modified sentence.
+        """
+        original_words = self.original_sentence.split()
+        modified_words = modified_sentence.split()
+        total_words = max(len(original_words), len(modified_words))
+        changed_words = sum(1 for o, m in zip(original_words, modified_words) if o != m)
+        # Account for extra words in the modified sentence
+        changed_words += abs(len(original_words) - len(modified_words))
+        distortion = changed_words / total_words
+        return distortion
+
+    def _calculate_kl_divergence(self, modified_sentence):
+        """
+        Calculate the KL Divergence between the word distributions of the original and modified sentence.
+        """
+        original_counts = Counter(self.original_sentence.lower().split())
+        modified_counts = Counter(modified_sentence.lower().split())
+        all_words = set(original_counts.keys()).union(set(modified_counts.keys()))
+        original_probs = np.array([original_counts.get(word, 0) for word in all_words], dtype=float)
+        modified_probs = np.array([modified_counts.get(word, 0) for word in all_words], dtype=float)
+
+        # Add smoothing to avoid division by zero
+        original_probs += 1e-10
+        modified_probs += 1e-10
+
+        # Normalize to create probability distributions
+        original_probs /= original_probs.sum()
+        modified_probs /= modified_probs.sum()
+
+        kl_divergence = np.sum(rel_entr(original_probs, modified_probs))
+        return kl_divergence
+
+    def _calculate_perplexity(self, sentence):
+        """
+        Calculate the perplexity of a sentence using GPT-2.
+        """
+        encodings = self.tokenizer(sentence, return_tensors='pt')
+        max_length = self.model.config.n_positions
+        stride = max_length
+
+        lls = []
+        for i in range(0, encodings.input_ids.size(1), stride):
+            begin_loc = i
+            end_loc = min(i + stride, encodings.input_ids.size(1))
+            trg_len = end_loc - begin_loc
+
+            input_ids = encodings.input_ids[:, begin_loc:end_loc]
+            target_ids = input_ids.clone()
+
+            with torch.no_grad():
+                outputs = self.model(input_ids, labels=target_ids)
+                log_likelihood = outputs.loss * trg_len
+
+            lls.append(log_likelihood)
+
+        ppl = torch.exp(torch.stack(lls).sum() / end_loc)
+        return ppl.item()
+
+    def _normalize_dict(self, metric_dict):
+        """
+        Normalize the values in a dictionary to be between 0 and 1.
+        """
+        values = np.array(list(metric_dict.values()))
+        min_val = values.min()
+        max_val = values.max()
+        # Avoid division by zero if all values are the same
+        if max_val - min_val == 0:
+            normalized_values = np.zeros_like(values)
+        else:
+            normalized_values = (values - min_val) / (max_val - min_val)
+        return dict(zip(metric_dict.keys(), normalized_values))
+
+    # Getter methods
+    def get_normalized_metrics(self):
+        """
+        Get all normalized metrics as a dictionary.
+        """
+        return {
+            'Levenshtein Distance': self.normalized_levenshtein,
+            'Word-Level Changes': self.normalized_word_changes,
+            'KL Divergence': self.normalized_kl_divergences,
+            'Perplexity': self.normalized_perplexities
+        }
+
+    def get_combined_distortions(self):
+        """
+        Get the dictionary of combined distortion values.
+        """
+        return self.combined_distortions
+
+# # Example usage
+# if __name__ == "__main__":
+#     # Original sentence
+#     original_sentence = "The quick brown fox jumps over the lazy dog"
+
+
+#     paraphrased_sentences = [
+#         # Original 1: "A swift auburn fox leaps across a sleepy canine."
+#         "The swift auburn fox leaps across a sleepy canine.",
+#         "A quick auburn fox leaps across a sleepy canine.",
+#         "A swift ginger fox leaps across a sleepy canine.",
+#         "A swift auburn fox bounds across a sleepy canine.",
+#         "A swift auburn fox leaps across a tired canine.",
+#         "Three swift auburn foxes leap across a sleepy canine.",
+#         "The vulpine specimen rapidly traverses over a dormant dog.",
+#         "Like lightning, the russet hunter soars over the drowsy guardian.",
+#         "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
+#         "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
+#         "A swift auburn predator navigates across a lethargic pet.",
+#         "Subject A (fox) demonstrates velocity over Subject B (dog).",
+
+#         # Original 2: "The agile russet fox bounds over an idle hound."
+#         "Some agile russet foxes bound over an idle hound.",
+#         "The nimble russet fox bounds over an idle hound.",
+#         "The agile brown fox bounds over an idle hound.",
+#         "The agile russet fox jumps over an idle hound.",
+#         "The agile russet fox bounds over a lazy hound.",
+#         "Two agile russet foxes bound over an idle hound.",
+#         "A dexterous vulpine surpasses a stationary canine.",
+#         "Quick as thought, the copper warrior sails over the guardian.",
+#         "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
+#         "A dexterous V. vulpes exceeds the plane of an inactive canine.",
+#         "An agile russet hunter maneuvers above a resting hound.",
+#         "Test subject F-1 achieves displacement superior to subject D-1.",
+
+#         # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
+#         "The nimble mahogany vulpine vaults above a drowsy dog.",
+#         "A swift mahogany vulpine vaults above a drowsy dog.",
+#         "A nimble reddish vulpine vaults above a drowsy dog.",
+#         "A nimble mahogany fox vaults above a drowsy dog.",
+#         "A nimble mahogany vulpine leaps above a drowsy dog.",
+#         "Four nimble mahogany vulpines vault above a drowsy dog.",
+#         "An agile specimen of reddish fur surpasses a somnolent canine.",
+#         "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
+#         "Tha quick brown beastie jumps o'er the tired pup, aye.",
+#         "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
+#         "A nimble rust-colored predator crosses above a drowsy pet.",
+#         "Observed: Subject Red executes vertical motion over Subject Gray.",
+
+#         # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
+#         "A speedy copper-colored fox hops over the lethargic pup.",
+#         "The quick copper-colored fox hops over the lethargic pup.",
+#         "The speedy bronze fox hops over the lethargic pup.",
+#         "The speedy copper-colored fox jumps over the lethargic pup.",
+#         "The speedy copper-colored fox hops over the tired pup.",
+#         "Multiple speedy copper-colored foxes hop over the lethargic pup.",
+#         "A rapid vulpine of bronze hue traverses an inactive young canine.",
+#         "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
+#         "Tha fast copper beastie leaps o'er the sleepy wee dog.",
+#         "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
+#         "A fleet copper-toned predator moves past a sluggish young dog.",
+#         "Field note: Adult fox subject exceeds puppy subject vertically.",
+
+#         # Original 5: "A rapid tawny fox springs over a sluggish dog."
+#         "The rapid tawny fox springs over a sluggish dog.",
+#         "A quick tawny fox springs over a sluggish dog.",
+#         "A rapid golden fox springs over a sluggish dog.",
+#         "A rapid tawny fox jumps over a sluggish dog.",
+#         "A rapid tawny fox springs over a lazy dog.",
+#         "Six rapid tawny foxes spring over a sluggish dog.",
+#         "An expeditious yellowish vulpine surpasses a torpid canine.",
+#         "Fast as a bullet, the golden hunter vaults over the idle guard.",
+#         "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
+#         "One V. vulpes displays rapid transit over one inactive C. familiaris.",
+#         "A speedy yellow-brown predator bypasses a motionless dog.",
+#         "Log entry: Vulpine subject achieves swift vertical displacement.",
+
+#         # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
+#         "A fleet-footed chestnut fox soars above an indolent canine.",
+#         "The swift chestnut fox soars above an indolent canine.",
+#         "The fleet-footed brown fox soars above an indolent canine.",
+#         "The fleet-footed chestnut fox leaps above an indolent canine.",
+#         "The fleet-footed chestnut fox soars above a lazy canine.",
+#         "Several fleet-footed chestnut foxes soar above an indolent canine.",
+#         "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
+#         "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
+#         "Tha quick brown beastie sails o'er the sleepy hound, ken.",
+#         "Single agile V. vulpes achieves elevation above stationary canine.",
+#         "A nimble brown predator glides over an unmoving domestic animal.",
+#         "Research note: Brown subject displays superior vertical mobility.",
+
+#         # Original 7: "A fast ginger fox hurdles past a slothful dog."
+#         "The fast ginger fox hurdles past a slothful dog.",
+#         "A quick ginger fox hurdles past a slothful dog.",
+#         "A fast red fox hurdles past a slothful dog.",
+#         "A fast ginger fox jumps past a slothful dog.",
+#         "A fast ginger fox hurdles past a lazy dog.",
+#         "Five fast ginger foxes hurdle past a slothful dog.",
+#         "A rapid orange vulpine bypasses a lethargic canine.",
+#         "Quick as lightning, the flame-colored hunter races past the lazy guard.",
+#         "Tha swift ginger beastie leaps past the tired doggy, ye see.",
+#         "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
+#         "A speedy red-orange predator overtakes a motionless dog.",
+#         "Data point: Orange subject demonstrates rapid transit past Gray subject.",
+
+#         # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
+#         "A spry rusty-colored fox jumps across a dozing hound.",
+#         "The agile rusty-colored fox jumps across a dozing hound.",
+#         "The spry reddish fox jumps across a dozing hound.",
+#         "The spry rusty-colored fox leaps across a dozing hound.",
+#         "The spry rusty-colored fox jumps across a sleeping hound.",
+#         "Multiple spry rusty-colored foxes jump across a dozing hound.",
+#         "An agile rust-toned vulpine traverses a somnolent canine.",
+#         "Nimble as thought, the copper hunter bounds over the resting guard.",
+#         "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
+#         "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
+#         "A lithe rust-tinted predator moves past a slumbering dog.",
+#         "Observation: Russet subject exhibits agility over dormant subject.",
+
+#         # Original 9: "A quick tan fox leaps over an inactive dog."
+#         "The quick tan fox leaps over an inactive dog.",
+#         "A swift tan fox leaps over an inactive dog.",
+#         "A quick beige fox leaps over an inactive dog.",
+#         "A quick tan fox jumps over an inactive dog.",
+#         "A quick tan fox leaps over a motionless dog.",
+#         "Seven quick tan foxes leap over an inactive dog.",
+#         "A rapid light-brown vulpine surpasses a stationary canine.",
+#         "Fast as wind, the sand-colored hunter soars over the still guard.",
+#         "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
+#         "One agile fawn V. vulpes traverses one immobile C. familiaris.",
+#         "A fleet tan-colored predator bypasses an unmoving dog.",
+#         "Field report: Tan subject demonstrates movement over static subject.",
+
+#         # Original 10: "The brisk auburn vulpine bounces over a listless canine."
+#         "Some brisk auburn vulpines bounce over a listless canine.",
+#         "The quick auburn vulpine bounces over a listless canine.",
+#         "The brisk russet vulpine bounces over a listless canine.",
+#         "The brisk auburn fox bounces over a listless canine.",
+#         "The brisk auburn vulpine jumps over a listless canine.",
+#         "Five brisk auburn vulpines bounce over a listless canine.",
+#         "The expeditious specimen supersedes a quiescent Canis lupus.",
+#         "Swift as wind, the russet hunter vaults over the idle guardian.",
+#         "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
+#         "One V. vulpes achieves displacement over inactive C. familiaris.",
+#         "A high-velocity auburn predator traverses an immobile animal.",
+#         "Final observation: Red subject shows mobility over Gray subject."
+#     ]
+
+
+#     # Initialize the calculator
+#     calculator = SentenceDistortionCalculator(original_sentence, paraphrased_sentences)
+
+#     # Calculate all metrics
+#     calculator.calculate_all_metrics()
+
+#     # Normalize the metrics
+#     calculator.normalize_metrics()
+
+#     # Calculate combined distortion
+#     calculator.calculate_combined_distortion()
+
+#     # Retrieve the normalized metrics and combined distortions
+#     normalized_metrics = calculator.get_normalized_metrics()
+#     combined_distortions = calculator.get_combined_distortions()
+#     distortion_val=combined_distortions
+#     # Display the results
+#     print("Normalized Metrics:")
+#     for metric_name, metric_dict in normalized_metrics.items():
+#         print(f"\n{metric_name}:")
+#         for key, value in metric_dict.items():
+#             print(f"{key}: {value:.4f}")
+
+#     print("\nCombined Distortions:")
+#     for key, value in combined_distortions.items():
+#         print(f"{key}: {value:.4f}")
+
+#     # Plot the metrics
+#     calculator.plot_metrics()
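Of the four distortion metrics, the KL divergence is the one with a smoothing subtlety: `_calculate_kl_divergence` adds 1e-10 to both bag-of-words count vectors before normalizing, so words present in only one sentence contribute a large but finite term. A self-contained sketch of the same computation, runnable without the GPT-2 download (the function name is illustrative; it mirrors the private method above):

```python
import numpy as np
from collections import Counter
from scipy.special import rel_entr

def bow_kl(original: str, modified: str, eps: float = 1e-10) -> float:
    """Smoothed KL divergence between bag-of-words distributions,
    mirroring SentenceDistortionCalculator._calculate_kl_divergence."""
    o, m = Counter(original.lower().split()), Counter(modified.lower().split())
    vocab = sorted(set(o) | set(m))
    p = np.array([o.get(w, 0) for w in vocab], dtype=float) + eps
    q = np.array([m.get(w, 0) for w in vocab], dtype=float) + eps
    p, q = p / p.sum(), q / q.sum()
    return float(np.sum(rel_entr(p, q)))

print(bow_kl("The quick brown fox jumps over the lazy dog",
             "A swift auburn fox leaps across a sleepy canine"))
```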
euclidean_distance.py
ADDED
@@ -0,0 +1,261 @@
+# Import necessary libraries
+import numpy as np
+import matplotlib.pyplot as plt
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import euclidean_distances
+euclidean_val={}
+class SentenceEuclideanDistanceCalculator:
+    """
+    A class to calculate and analyze Euclidean distance between an original sentence and paraphrased sentences.
+    """
+
+    def __init__(self, original_sentence, paraphrased_sentences):
+        """
+        Initialize the calculator with the original sentence and a list of paraphrased sentences.
+        """
+        self.original_sentence = original_sentence
+        self.paraphrased_sentences = paraphrased_sentences
+
+        # Euclidean distance dictionary
+        self.euclidean_distances = {}
+
+        # Normalized Euclidean distances
+        self.normalized_euclidean = {}
+
+        # Load SentenceTransformer model for embedding calculation
+        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+    def calculate_all_metrics(self):
+        """
+        Calculate Euclidean distance between the original and each paraphrased sentence.
+        """
+        original_embedding = self._get_sentence_embedding(self.original_sentence)
+
+        for idx, paraphrased_sentence in enumerate(self.paraphrased_sentences):
+            key = f"Sentence_{idx+1}"
+
+            # Euclidean Distance
+            paraphrase_embedding = self._get_sentence_embedding(paraphrased_sentence)
+            self.euclidean_distances[key] = euclidean_distances([original_embedding], [paraphrase_embedding])[0][0]
+
+    def normalize_metrics(self):
+        """
+        Normalize all metrics to be between 0 and 1.
+        """
+        self.normalized_euclidean = self._normalize_dict(self.euclidean_distances)
+
+    def plot_metrics(self):
+        """
+        Plot the normalized Euclidean distances in a graph.
+        """
+        keys = list(self.normalized_euclidean.keys())
+        indices = np.arange(len(keys))
+
+        # Prepare data for plotting
+        plt.figure(figsize=(12, 6))
+        plt.plot(indices, [self.normalized_euclidean[key] for key in keys], marker='o', color=np.random.rand(3,))
+        plt.xlabel('Sentence Index')
+        plt.ylabel('Normalized Euclidean Distance (0-1)')
+        plt.title('Normalized Euclidean Distance')
+        plt.grid(True)
+        plt.tight_layout()
+        plt.show()
+
+    # Private methods for metric calculations
+    def _get_sentence_embedding(self, sentence):
+        """
+        Get sentence embedding using the SentenceTransformer model.
+        """
+        return self.model.encode(sentence)
+
+    def _normalize_dict(self, metric_dict):
+        """
+        Normalize the values in a dictionary to be between 0 and 1.
+        """
+        values = np.array(list(metric_dict.values()))
+        min_val = values.min()
+        max_val = values.max()
+        # Avoid division by zero if all values are the same
+        if max_val - min_val == 0:
+            normalized_values = np.zeros_like(values)
+        else:
+            normalized_values = (values - min_val) / (max_val - min_val)
+        return dict(zip(metric_dict.keys(), normalized_values))
+
+    # Getter methods
+    def get_normalized_metrics(self):
+        """
+        Get the normalized Euclidean distances as a dictionary.
+        """
+        return self.normalized_euclidean
+
+
+# # Example usage
+# if __name__ == "__main__":
+#     # Original sentence
+#     original_sentence = "The quick brown fox jumps over the lazy dog"
+
+#     # Paraphrased sentences
+#     paraphrased_sentences = [
+#         # Original 1: "A swift auburn fox leaps across a sleepy canine."
+#         "The swift auburn fox leaps across a sleepy canine.",
+#         "A quick auburn fox leaps across a sleepy canine.",
+#         "A swift ginger fox leaps across a sleepy canine.",
+#         "A swift auburn fox bounds across a sleepy canine.",
+#         "A swift auburn fox leaps across a tired canine.",
+#         "Three swift auburn foxes leap across a sleepy canine.",
+#         "The vulpine specimen rapidly traverses over a dormant dog.",
+#         "Like lightning, the russet hunter soars over the drowsy guardian.",
+#         "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
+#         "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
+#         "A swift auburn predator navigates across a lethargic pet.",
+#         "Subject A (fox) demonstrates velocity over Subject B (dog).",
+
+#         # Original 2: "The agile russet fox bounds over an idle hound."
+#         "Some agile russet foxes bound over an idle hound.",
+#         "The nimble russet fox bounds over an idle hound.",
+#         "The agile brown fox bounds over an idle hound.",
+#         "The agile russet fox jumps over an idle hound.",
+#         "The agile russet fox bounds over a lazy hound.",
+#         "Two agile russet foxes bound over an idle hound.",
+#         "A dexterous vulpine surpasses a stationary canine.",
+#         "Quick as thought, the copper warrior sails over the guardian.",
+#         "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
+#         "A dexterous V. vulpes exceeds the plane of an inactive canine.",
+#         "An agile russet hunter maneuvers above a resting hound.",
+#         "Test subject F-1 achieves displacement superior to subject D-1.",
+
+#         # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
+#         "The nimble mahogany vulpine vaults above a drowsy dog.",
+#         "A swift mahogany vulpine vaults above a drowsy dog.",
+#         "A nimble reddish vulpine vaults above a drowsy dog.",
+#         "A nimble mahogany fox vaults above a drowsy dog.",
+#         "A nimble mahogany vulpine leaps above a drowsy dog.",
+#         "Four nimble mahogany vulpines vault above a drowsy dog.",
+#         "An agile specimen of reddish fur surpasses a somnolent canine.",
+#         "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
+#         "Tha quick brown beastie jumps o'er the tired pup, aye.",
+#         "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
+#         "A nimble rust-colored predator crosses above a drowsy pet.",
+#         "Observed: Subject Red executes vertical motion over Subject Gray.",
+
+#         # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
+#         "A speedy copper-colored fox hops over the lethargic pup.",
+#         "The quick copper-colored fox hops over the lethargic pup.",
+#         "The speedy bronze fox hops over the lethargic pup.",
+#         "The speedy copper-colored fox jumps over the lethargic pup.",
+#         "The speedy copper-colored fox hops over the tired pup.",
+#         "Multiple speedy copper-colored foxes hop over the lethargic pup.",
+#         "A rapid vulpine of bronze hue traverses an inactive young canine.",
+#         "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
# "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
|
151 |
+
# "Tha fast copper beastie leaps o'er the sleepy wee dog.",
|
152 |
+
# "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
|
153 |
+
# "A fleet copper-toned predator moves past a sluggish young dog.",
|
154 |
+
# "Field note: Adult fox subject exceeds puppy subject vertically.",
|
155 |
+
|
156 |
+
# # Original 5: "A rapid tawny fox springs over a sluggish dog."
|
157 |
+
# "The rapid tawny fox springs over a sluggish dog.",
|
158 |
+
# "A quick tawny fox springs over a sluggish dog.",
|
159 |
+
# "A rapid golden fox springs over a sluggish dog.",
|
160 |
+
# "A rapid tawny fox jumps over a sluggish dog.",
|
161 |
+
# "A rapid tawny fox springs over a lazy dog.",
|
162 |
+
# "Six rapid tawny foxes spring over a sluggish dog.",
|
163 |
+
# "An expeditious yellowish vulpine surpasses a torpid canine.",
|
164 |
+
# "Fast as a bullet, the golden hunter vaults over the idle guard.",
|
165 |
+
# "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
|
166 |
+
# "One V. vulpes displays rapid transit over one inactive C. familiaris.",
|
167 |
+
# "A speedy yellow-brown predator bypasses a motionless dog.",
|
168 |
+
# "Log entry: Vulpine subject achieves swift vertical displacement.",
|
169 |
+
|
170 |
+
# # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
|
171 |
+
# "A fleet-footed chestnut fox soars above an indolent canine.",
|
172 |
+
# "The swift chestnut fox soars above an indolent canine.",
|
173 |
+
# "The fleet-footed brown fox soars above an indolent canine.",
|
174 |
+
# "The fleet-footed chestnut fox leaps above an indolent canine.",
|
175 |
+
# "The fleet-footed chestnut fox soars above a lazy canine.",
|
176 |
+
# "Several fleet-footed chestnut foxes soar above an indolent canine.",
|
177 |
+
# "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
|
178 |
+
# "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
|
179 |
+
# "Tha quick brown beastie sails o'er the sleepy hound, ken.",
|
180 |
+
# "Single agile V. vulpes achieves elevation above stationary canine.",
|
181 |
+
# "A nimble brown predator glides over an unmoving domestic animal.",
|
182 |
+
# "Research note: Brown subject displays superior vertical mobility.",
|
183 |
+
|
184 |
+
# # Original 7: "A fast ginger fox hurdles past a slothful dog."
|
185 |
+
# "The fast ginger fox hurdles past a slothful dog.",
|
186 |
+
# "A quick ginger fox hurdles past a slothful dog.",
|
187 |
+
# "A fast red fox hurdles past a slothful dog.",
|
188 |
+
# "A fast ginger fox jumps past a slothful dog.",
|
189 |
+
# "A fast ginger fox hurdles past a lazy dog.",
|
190 |
+
# "Five fast ginger foxes hurdle past a slothful dog.",
|
191 |
+
# "A rapid orange vulpine bypasses a lethargic canine.",
|
192 |
+
# "Quick as lightning, the flame-colored hunter races past the lazy guard.",
|
193 |
+
# "Tha swift ginger beastie leaps past the tired doggy, ye see.",
|
194 |
+
# "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
|
195 |
+
# "A speedy red-orange predator overtakes a motionless dog.",
|
196 |
+
# "Data point: Orange subject demonstrates rapid transit past Gray subject.",
|
197 |
+
|
198 |
+
# # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
|
199 |
+
# "A spry rusty-colored fox jumps across a dozing hound.",
|
200 |
+
# "The agile rusty-colored fox jumps across a dozing hound.",
|
201 |
+
# "The spry reddish fox jumps across a dozing hound.",
|
202 |
+
# "The spry rusty-colored fox leaps across a dozing hound.",
|
203 |
+
# "The spry rusty-colored fox jumps across a sleeping hound.",
|
204 |
+
# "Multiple spry rusty-colored foxes jump across a dozing hound.",
|
205 |
+
# "An agile rust-toned vulpine traverses a somnolent canine.",
|
206 |
+
# "Nimble as thought, the copper hunter bounds over the resting guard.",
|
207 |
+
# "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
|
208 |
+
# "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
|
209 |
+
# "A lithe rust-tinted predator moves past a slumbering dog.",
|
210 |
+
# "Observation: Russet subject exhibits agility over dormant subject.",
|
211 |
+
|
212 |
+
# # Original 9: "A quick tan fox leaps over an inactive dog."
|
213 |
+
# "The quick tan fox leaps over an inactive dog.",
|
214 |
+
# "A swift tan fox leaps over an inactive dog.",
|
215 |
+
# "A quick beige fox leaps over an inactive dog.",
|
216 |
+
# "A quick tan fox jumps over an inactive dog.",
|
217 |
+
# "A quick tan fox leaps over a motionless dog.",
|
218 |
+
# "Seven quick tan foxes leap over an inactive dog.",
|
219 |
+
# "A rapid light-brown vulpine surpasses a stationary canine.",
|
220 |
+
# "Fast as wind, the sand-colored hunter soars over the still guard.",
|
221 |
+
# "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
|
222 |
+
# "One agile fawn V. vulpes traverses one immobile C. familiaris.",
|
223 |
+
# "A fleet tan-colored predator bypasses an unmoving dog.",
|
224 |
+
# "Field report: Tan subject demonstrates movement over static subject.",
|
225 |
+
|
226 |
+
# # Original 10: "The brisk auburn vulpine bounces over a listless canine."
|
227 |
+
# "Some brisk auburn vulpines bounce over a listless canine.",
|
228 |
+
# "The quick auburn vulpine bounces over a listless canine.",
|
229 |
+
# "The brisk russet vulpine bounces over a listless canine.",
|
230 |
+
# "The brisk auburn fox bounces over a listless canine.",
|
231 |
+
# "The brisk auburn vulpine jumps over a listless canine.",
|
232 |
+
# "Five brisk auburn vulpines bounce over a listless canine.",
|
233 |
+
# "The expeditious specimen supersedes a quiescent Canis lupus.",
|
234 |
+
# "Swift as wind, the russet hunter vaults over the idle guardian.",
|
235 |
+
# "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
|
236 |
+
# "One V. vulpes achieves displacement over inactive C. familiaris.",
|
237 |
+
# "A high-velocity auburn predator traverses an immobile animal.",
|
238 |
+
# "Final observation: Red subject shows mobility over Gray subject."
|
239 |
+
# ]
|
240 |
+
|
241 |
+
|
242 |
+
# # Initialize the calculator
|
243 |
+
# calculator = SentenceEuclideanDistanceCalculator(original_sentence, paraphrased_sentences)
|
244 |
+
|
245 |
+
# # Calculate Euclidean distances
|
246 |
+
# calculator.calculate_all_metrics()
|
247 |
+
|
248 |
+
# # Normalize the distances
|
249 |
+
# calculator.normalize_metrics()
|
250 |
+
|
251 |
+
# # Retrieve the normalized Euclidean distances
|
252 |
+
# normalized_metrics = calculator.get_normalized_metrics()
|
253 |
+
# euclidean_val=normalized_metrics
|
254 |
+
|
255 |
+
# # Display the results
|
256 |
+
# print("Normalized Euclidean Distances:")
|
257 |
+
# for key, value in normalized_metrics.items():
|
258 |
+
# print(f"{key}: {value:.4f}")
|
259 |
+
|
260 |
+
# # Plot the metrics
|
261 |
+
# calculator.plot_metrics()
|
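Note: a minimal usage sketch of the calculator above (the sentences are illustrative; it assumes the sentence-transformers, scikit-learn, numpy and matplotlib dependencies the class relies on are installed):

# Minimal sketch, assuming euclidean_distance.py is importable from the app directory
from euclidean_distance import SentenceEuclideanDistanceCalculator

calc = SentenceEuclideanDistanceCalculator(
    "The quick brown fox jumps over the lazy dog",
    ["A swift auburn fox leaps across a sleepy canine.",
     "The agile russet fox bounds over an idle hound."],
)
calc.calculate_all_metrics()          # raw embedding distances per paraphrase
calc.normalize_metrics()              # min-max scale: smallest distance -> 0.0, largest -> 1.0
print(calc.get_normalized_metrics())  # e.g. {'Sentence_1': 0.0, 'Sentence_2': 1.0}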
gpt_mask_filling.py
ADDED
@@ -0,0 +1,70 @@
import openai
import os
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.getenv("API_KEY")


# Takes in a sentence and returns a list of dicts consisting of key-value pairs
# of masked words and lists of the possible replacements
def predict_masked_words(sentence, n_suggestions=5):

    prompt = (
        f"Given a sentence with one or more masked words, indicated by [MASK], generate {n_suggestions} possible words to fill each mask. "
        "Return the results as a list of dictionaries, where each dictionary key is a masked word and its value is a list of 5 potential words to fill that mask.\n\n"
        "Example input: \"The [MASK] fox [MASK] over the [MASK] dog.\"\n\n"
        "Example output:\n"
        "[\n"
        " {\n"
        " \"[MASK]1\": [\"quick\", \"sly\", \"red\", \"clever\", \"sneaky\"]\n"
        " },\n"
        " {\n"
        " \"[MASK]2\": [\"jumped\", \"leaped\", \"hopped\", \"sprang\", \"bounded\"]\n"
        " },\n"
        " {\n"
        " \"[MASK]3\": [\"lazy\", \"sleeping\", \"brown\", \"tired\", \"old\"]\n"
        " }\n"
        "]\n\n"
        "Example input: \"The [MASK] [MASK] ran swiftly across the [MASK] field.\"\n\n"
        "Example output:\n"
        "[\n"
        " {\n"
        " \"[MASK]1\": [\"tall\", \"fierce\", \"young\", \"old\", \"beautiful\"]\n"
        " },\n"
        " {\n"
        " \"[MASK]2\": [\"lion\", \"tiger\", \"horse\", \"cheetah\", \"deer\"]\n"
        " },\n"
        " {\n"
        " \"[MASK]3\": [\"green\", \"wide\", \"sunny\", \"open\", \"empty\"]\n"
        " }\n"
        "]\n\n"
        "Example input: \"It was a [MASK] day when the train arrived at the station.\"\n\n"
        "Example output:\n"
        "[\n"
        " {\n"
        " \"[MASK]1\": [\"sunny\", \"rainy\", \"cloudy\", \"foggy\", \"stormy\"]\n"
        " },\n"
        "]\n\n"
        "Now, please process the following sentence:\n"
        f"{sentence}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.7
    )

    content = response['choices'][0]['message']['content']
    print(content)
    return content


# sentence = "Evacuations and storm [MASK] began on Sunday night as forecasters projected that Hurricane Dorian would hit into Florida’s west coast on Wednesday as a major hurricane packing life-threatening winds and storm surge."
# predict_masked_words(sentence, n_suggestions=5)
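Note: predict_masked_words returns the model's raw text, which still needs parsing before the suggestions can be used programmatically. A minimal sketch of such a parser, not part of the committed file; json.loads is strict, and the third few-shot example in the prompt even contains a trailing comma that strict JSON rejects, so the fallback matters:

# Hypothetical helper (not in gpt_mask_filling.py): parse the model's bracketed
# output into Python objects, assuming it followed the prompt's format.
import json

def parse_mask_suggestions(raw_output):
    try:
        return json.loads(raw_output)  # expected: list of {"[MASK]n": [...]} dicts
    except json.JSONDecodeError:
        return []  # fall back to an empty list if the model drifted from the format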
highlighter.py
CHANGED
@@ -83,4 +83,22 @@ def highlight_common_words_dict(common_words, sentences, title):
         <h3 style="margin-top: 0; font-size: 1em; color: #111827;">{title}</h3>
         <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
     </div>
     '''
+
+def reparaphrased_sentences_html(sentences):
+
+    formatted_sentences = []
+
+    for idx, sentence in enumerate(sentences, start=1):
+        # Add index to each sentence
+        sentence_with_idx = f"{idx}. {sentence}"
+        formatted_sentences.append(sentence_with_idx)
+
+    final_html = "<br><br>".join(formatted_sentences)
+
+    return f'''
+    <div style="border: solid 1px #ccc; padding: 16px; background-color: #FFFFFF; color: #374151;
+                box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
+        <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
+    </div>
+    '''
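For reference, a small illustrative call (the sentences are made up); the new helper wraps the numbered sentences in the same card-style markup the other helpers in this file emit:

# Sketch only: number two paraphrases and render them as one HTML card
from highlighter import reparaphrased_sentences_html

html = reparaphrased_sentences_html(["First paraphrase.", "Second paraphrase."])
# html contains "1. First paraphrase.<br><br>2. Second paraphrase." inside the styled divs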
lcs.py
CHANGED
@@ -4,7 +4,6 @@ from nltk.corpus import stopwords
 def find_common_subsequences(sentence, str_list):
     stop_words = set(stopwords.words('english'))
     sentence = sentence.lower()
-
     str_list = [s.lower() for s in str_list]
 
     def is_present(subseq, str_list):
@@ -17,17 +16,17 @@ def find_common_subsequences(sentence, str_list):
         filtered_words = [word for word in words if word.lower() not in stop_words]
         return " ".join(filtered_words)
 
-
-
+    cleaned_sentence = remove_stop_words_and_special_chars(sentence)
+    cleaned_str_list = [remove_stop_words_and_special_chars(s) for s in str_list]
 
-    words = sentence.split()
+    words = cleaned_sentence.split()
     common_grams = []
     added_phrases = set()
 
-    for n in range(5, 0, -1):
+    for n in range(5, 0, -1):  # Check n-grams from size 5 to 1
         for i in range(len(words) - n + 1):
-            subseq = " ".join(words[i:i+n])
-            if is_present(subseq, str_list) and not any(subseq in phrase for phrase in added_phrases):
+            subseq = " ".join(words[i:i + n])
+            if is_present(subseq, cleaned_str_list) and not any(subseq in phrase for phrase in added_phrases):
                 common_grams.append((i, subseq))
                 added_phrases.add(subseq)
 
@@ -39,8 +38,62 @@ def find_common_subsequences(sentence, str_list):
 
     return indexed_common_grams
 
-
-#
-
-# print(
+def find_common_gram_positions(str_list, common_grams):
+    # Initialize a list to hold positions for each sentence
+    positions = []
+
+    for sentence in str_list:
+        # Number each word in the sentence
+        words = re.sub(r'[^\w\s]', '', sentence).lower().split()
+        word_positions = {word: [] for word in words}
+
+        for idx, word in enumerate(words):
+            word_positions[word].append(idx + 1)  # Store 1-based index positions
+
+        # Create a list to store positions of common grams for the current sentence
+        sentence_positions = []
+
+        for gram in common_grams:
+            # Clean the gram for matching
+            cleaned_gram = re.sub(r'[^\w\s]', '', gram).lower()
+            gram_words = cleaned_gram.split()
+
+            # Check for the position of the common gram in the current sentence
+            if all(word in word_positions for word in gram_words):
+                # Get the position of the first word of the common gram
+                start_idx = word_positions[gram_words[0]][0]
+                sentence_positions.append(start_idx)
+            else:
+                sentence_positions.append(-1)  # Common gram not found
+
+        # Append the positions for the current sentence to the main positions list
+        positions.append(sentence_positions)
+
+    return positions
+
+
+# # Example usage
+# sentence = "Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time since the Holocaust” to be Jewish in the United States."
+# str_list = [
+#     'During a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump stated that being Jewish in the United States has never been more hazardous since the Holocaust.',
+#     'At a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump declared that being Jewish in the United States has never been more hazardous since the Holocaust.',
+#     'Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania, and stated that being Jewish in the United States has never been more perilous since the Holocaust.',
+#     'Donald Trump made the statement at a campaign rally in Wilkes-Barre, Pennsylvania, saying that being Jewish in the United States has never been more dangerous since the Holocaust.',
+#     'Last month, Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania and stated that being Jewish in the United States has never been more hazardous than during World War II.',
+#     'In Wilkes-Barre, Pennsylvania, Donald Trump spoke at a campaign rally and claimed that the Holocaust was always more hazardous for Jews in the United States.',
+#     'A campaign rally in Wilkes-Barre, Pennsylvania saw Donald Trump declare that being Jewish in the United States has never been more perilous since WWII.',
+#     'Speaking at a campaign rally in Wilkes-Barre, Pennsylvania today, Donald Trump declared that being Jewish has never been more hazardous in the United States since the Holocaust.',
+#     'During his campaign rally in Wilkes-Barre, Pennsylvania today Donald Trump stated: "There has never been a safer place for being Jewish in the United States since the Holocaust."',
+#     'At a campaign rally in Wilkes-Barre, Pennsylvania (pictured), Donald Trump said, "There has never been... gotten worse for being Jewish in America since the Holocaust."'
+# ]

+# # Find common subsequences
+# common_grams = find_common_subsequences(sentence, str_list)
+# # Extract the subsequences from the common grams for position checking
+# subsequences = [subseq for _, subseq in common_grams]
+
+# # Find positions of the common grams
+# common_gram_positions = find_common_gram_positions(str_list, subsequences)
 
+# print(common_grams)
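A small walk-through of the two helpers on toy strings (the sentences are illustrative; the outputs in the comments follow from the matching rules above):

# Toy trace: n-grams are matched longest-first after stop-word removal, and
# positions are reported as 1-based indices of each gram's first word.
from lcs import find_common_subsequences, find_common_gram_positions

sentence = "the quick brown fox jumps"
candidates = ["a quick brown fox rests", "one quick brown fox sleeps"]

grams = find_common_subsequences(sentence, candidates)
# pairs each shared n-gram with an index, here the 3-gram "quick brown fox"

positions = find_common_gram_positions(candidates, [g for _, g in grams])
# [[2], [2]] -- "quick" is the 2nd word of both candidate sentences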
masking_methods.py
CHANGED
@@ -1,73 +1,31 @@
-
-# from transformers import pipeline
-# import random
-# from nltk.corpus import stopwords
-# import math
-
-# # Masking Model
-# def mask_non_stopword(sentence):
-#     stop_words = set(stopwords.words('english'))
-#     words = sentence.split()
-#     non_stop_words = [word for word in words if word.lower() not in stop_words]
-#     if not non_stop_words:
-#         return sentence
-#     word_to_mask = random.choice(non_stop_words)
-#     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
-#     return masked_sentence
-
-# def mask_non_stopword_pseudorandom(sentence):
-#     stop_words = set(stopwords.words('english'))
-#     words = sentence.split()
-#     non_stop_words = [word for word in words if word.lower() not in stop_words]
-#     if not non_stop_words:
-#         return sentence
-#     random.seed(10)
-#     word_to_mask = random.choice(non_stop_words)
-#     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
-#     return masked_sentence
-
-# def high_entropy_words(sentence, non_melting_points):
-#     stop_words = set(stopwords.words('english'))
-#     words = sentence.split()
-
-#     non_melting_words = set()
-#     for _, point in non_melting_points:
-#         non_melting_words.update(point.lower().split())
-
-#     candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]
-
-#     if not candidate_words:
-#         return sentence
-
-#     max_entropy = -float('inf')
-#     max_entropy_word = None
-
-#     for word in candidate_words:
-#         masked_sentence = sentence.replace(word, '[MASK]', 1)
-#         predictions = fill_mask(masked_sentence)
-
-#         # Calculate entropy based on top 5 predictions
-#         entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])
-
-#         if entropy > max_entropy:
-#             max_entropy = entropy
-#             max_entropy_word = word
-
-#     return sentence.replace(max_entropy_word, '[MASK]', 1)
-
-
-# # Load tokenizer and model for masked language model
-# tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
-# model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
-# fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
-
+import torch
 from transformers import AutoTokenizer, AutoModelForMaskedLM
 from transformers import pipeline
 import random
 from nltk.corpus import stopwords
 import math
+from vocabulary_split import split_vocabulary, filter_logits
+
+# Load tokenizer and model for masked language model
+tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
+
+# Get permissible vocabulary
+permissible, _ = split_vocabulary(seed=42)
+permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])
+
+def get_logits_for_mask(model, tokenizer, sentence):
+    inputs = tokenizer(sentence, return_tensors="pt")
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    logits = outputs.logits
+    mask_token_logits = logits[0, mask_token_index, :]
+    return mask_token_logits.squeeze()
 
-# Masking Model
 def mask_non_stopword(sentence):
     stop_words = set(stopwords.words('english'))
     words = sentence.split()
@@ -76,10 +34,10 @@ def mask_non_stopword(sentence):
         return sentence, None, None
     word_to_mask = random.choice(non_stop_words)
     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
-
-
-
-    return masked_sentence,
+    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+    filtered_logits = filter_logits(logits, permissible_indices)
+    words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
+    return masked_sentence, filtered_logits.tolist(), words
 
 def mask_non_stopword_pseudorandom(sentence):
     stop_words = set(stopwords.words('english'))
@@ -87,54 +45,148 @@ def mask_non_stopword_pseudorandom(sentence):
     non_stop_words = [word for word in words if word.lower() not in stop_words]
     if not non_stop_words:
         return sentence, None, None
-    random.seed(10)
+    random.seed(10)  # Fixed seed for pseudo-randomness
     word_to_mask = random.choice(non_stop_words)
     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
-
-
-
-    return masked_sentence,
+    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+    filtered_logits = filter_logits(logits, permissible_indices)
+    words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
+    return masked_sentence, filtered_logits.tolist(), words
+
+# New function: mask words between LCS points
+def mask_between_lcs(sentence, lcs_points):
+    words = sentence.split()
+    masked_indices = []
+
+    # Mask between first word and first LCS point
+    if lcs_points and lcs_points[0] > 0:
+        idx = random.randint(0, lcs_points[0]-1)
+        words[idx] = '[MASK]'
+        masked_indices.append(idx)
+
+    # Mask between LCS points
+    for i in range(len(lcs_points) - 1):
+        start, end = lcs_points[i], lcs_points[i+1]
+        if end - start > 1:
+            mask_index = random.randint(start + 1, end - 1)
+            words[mask_index] = '[MASK]'
+            masked_indices.append(mask_index)
+
+    # Mask between last LCS point and last word
+    if lcs_points and lcs_points[-1] < len(words) - 1:
+        idx = random.randint(lcs_points[-1]+1, len(words)-1)
+        words[idx] = '[MASK]'
+        masked_indices.append(idx)
+
+    masked_sentence = ' '.join(words)
+    # logits has shape (num_masks, vocab_size) when several [MASK] tokens are present;
+    # with a single mask, the squeeze() in get_logits_for_mask drops that axis
+    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+
+    # Now process each masked token separately
+    top_words_list = []
+    logits_list = []
+    for i in range(len(masked_indices)):
+        logits_i = logits[i]
+        if logits_i.dim() > 1:
+            logits_i = logits_i.squeeze()
+        filtered_logits_i = filter_logits(logits_i, permissible_indices)
+        logits_list.append(filtered_logits_i.tolist())
+        top_5_indices = filtered_logits_i.topk(5).indices.tolist()
+        top_words = [tokenizer.decode([j]) for j in top_5_indices]
+        top_words_list.append(top_words)
+
+    return masked_sentence, logits_list, top_words_list
+
 
 def high_entropy_words(sentence, non_melting_points):
     stop_words = set(stopwords.words('english'))
     words = sentence.split()
 
     non_melting_words = set()
     for _, point in non_melting_points:
         non_melting_words.update(point.lower().split())
 
     candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]
 
     if not candidate_words:
         return sentence, None, None
 
     max_entropy = -float('inf')
     max_entropy_word = None
     max_logits = None
 
     for word in candidate_words:
         masked_sentence = sentence.replace(word, '[MASK]', 1)
-
+        logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+        filtered_logits = filter_logits(logits, permissible_indices)
 
         # Calculate entropy based on top 5 predictions
-
-
+        probs = torch.softmax(filtered_logits, dim=-1)
+        top_5_probs = probs.topk(5).values
+        entropy = -torch.sum(top_5_probs * torch.log(top_5_probs))
+
         if entropy > max_entropy:
             max_entropy = entropy
             max_entropy_word = word
-            max_logits =
-
+            max_logits = filtered_logits
+
+    if max_entropy_word is None:
+        return sentence, None, None
+
     masked_sentence = sentence.replace(max_entropy_word, '[MASK]', 1)
-    words = [
-
-    return masked_sentence, words, logits
+    words = [tokenizer.decode([i]) for i in max_logits.argsort()[-5:]]
+    return masked_sentence, max_logits.tolist(), words
 
-#
-
-
-
-    print(f"
-    print(f"logits content: {b}")
+# New function: mask based on part of speech
+def mask_by_pos(sentence, pos_to_mask=['NN', 'VB', 'JJ']):
+    # nltk.pos_tag returns Penn Treebank tags (NN*/VB*/JJ*), so the defaults use
+    # the two-letter Penn prefixes; universal tags like 'NOUN' would never match pos[:2]
+    import nltk
+    nltk.download('averaged_perceptron_tagger', quiet=True)
+
+    words = nltk.word_tokenize(sentence)
+    pos_tags = nltk.pos_tag(words)
+
+    maskable_words = [word for word, pos in pos_tags if pos[:2] in pos_to_mask]
+
+    if not maskable_words:
+        return sentence, None, None
+
+    word_to_mask = random.choice(maskable_words)
+    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
+
+    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+    filtered_logits = filter_logits(logits, permissible_indices)
+    words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
+
+    return masked_sentence, filtered_logits.tolist(), words
+
+# New function: mask named entities
+def mask_named_entity(sentence):
+    import nltk
+    nltk.download('maxent_ne_chunker', quiet=True)
+    nltk.download('words', quiet=True)
+
+    words = nltk.word_tokenize(sentence)
+    pos_tags = nltk.pos_tag(words)
+    named_entities = nltk.ne_chunk(pos_tags)
+
+    # ne_chunk marks named-entity chunks as nltk.Tree subtrees, while leaves()
+    # yields plain (word, tag) tuples; collect the words inside those subtrees
+    maskable_words = [leaf[0] for subtree in named_entities
+                      if isinstance(subtree, nltk.Tree) for leaf in subtree.leaves()]
+
+    if not maskable_words:
+        return sentence, None, None
+
+    word_to_mask = random.choice(maskable_words)
+    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
+
+    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+    filtered_logits = filter_logits(logits, permissible_indices)
+    words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
+
+    return masked_sentence, filtered_logits.tolist(), words
+
+
+# sentence = "This is a sample sentence with some LCS points"
+# lcs_points = [2, 5, 8]  # Indices of LCS points
+# masked_sentence, logits_list, top_words_list = mask_between_lcs(sentence, lcs_points)
 
+# print("Masked Sentence:", masked_sentence)
+# for idx, top_words in enumerate(top_words_list):
+#     print(f"Top words for mask {idx+1}:", top_words)
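Note: split_vocabulary and filter_logits are imported from the newly added vocabulary_split.py, whose body this hunk does not display. A minimal sketch of what filter_logits plausibly does, inferred only from its call sites above; this is an assumption, not the committed implementation:

# Sketch only -- assumed behavior: suppress tokens outside the permissible half
# of the vocabulary so softmax assigns them ~0 probability mass.
import torch

def filter_logits_sketch(logits, permissible_indices):
    masked = logits.clone()
    masked[~permissible_indices] = float('-inf')  # non-permissible tokens become unsampleable
    return masked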
paraphraser.py
CHANGED
@@ -1,31 +1,83 @@
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-
-# Function to Initialize the Model
-def init_model():
-    para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
-    para_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
-    return para_tokenizer, para_model
-
-# Function to Paraphrase the Text
-def paraphrase(question, para_tokenizer, para_model, num_beams=10, num_beam_groups=10, num_return_sequences=10, repetition_penalty=10.0, diversity_penalty=3.0, no_repeat_ngram_size=2, temperature=0.7, max_length=64):
-    input_ids = para_tokenizer(
-        f'paraphrase: {question}',
-        return_tensors="pt", padding="longest",
-        max_length=max_length,
-        truncation=True,
-    ).input_ids
-    outputs = para_model.generate(
-        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
-        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
-        num_beams=num_beams, num_beam_groups=num_beam_groups,
-        max_length=max_length, diversity_penalty=diversity_penalty
-    )
-    res = para_tokenizer.batch_decode(outputs, skip_special_tokens=True)
-    return res
-
-def generate_paraphrase(question):
-    para_tokenizer, para_model = init_model()
-    res = paraphrase(question, para_tokenizer, para_model)
-    return res
-
-# print(generate_paraphrase("Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time since the Holocaust” to be Jewish in the United States."))
+# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+# # Function to Initialize the Model
+# def init_model():
+#     para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+#     para_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+#     return para_tokenizer, para_model
+
+# # Function to Paraphrase the Text
+# def paraphrase(question, para_tokenizer, para_model, num_beams=10, num_beam_groups=10, num_return_sequences=10, repetition_penalty=10.0, diversity_penalty=3.0, no_repeat_ngram_size=2, temperature=0.7, max_length=64):
+#     input_ids = para_tokenizer(
+#         f'paraphrase: {question}',
+#         return_tensors="pt", padding="longest",
+#         max_length=max_length,
+#         truncation=True,
+#     ).input_ids
+#     outputs = para_model.generate(
+#         input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
+#         num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
+#         num_beams=num_beams, num_beam_groups=num_beam_groups,
+#         max_length=max_length, diversity_penalty=diversity_penalty
+#     )
+#     res = para_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+#     return res
+
+# def generate_paraphrase(question):
+#     para_tokenizer, para_model = init_model()
+#     res = paraphrase(question, para_tokenizer, para_model)
+#     return res
+
+# print(generate_paraphrase("Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time since the Holocaust” to be Jewish in the United States."))
+
+'''
+Accepts a sentence or list of sentences and returns a list of all their paraphrases generated with GPT-4.
+'''
+
+from openai import OpenAI
+from dotenv import load_dotenv
+load_dotenv()
+import os
+
+key = os.getenv("OPENAI_API_KEY")
+
+# Initialize the OpenAI client
+client = OpenAI(
+    api_key=key  # Replace with your actual API key
+)
+
+# Function to paraphrase sentences using GPT-4
+def generate_paraphrase(sentences, model="gpt-4o", num_paraphrases=10, max_tokens=150, temperature=0.7):
+    # Ensure sentences is a list even if a single sentence is passed
+    if isinstance(sentences, str):
+        sentences = [sentences]
+
+    paraphrased_sentences_list = []
+
+    for sentence in sentences:
+        full_prompt = f"Paraphrase the following text: '{sentence}'"
+        try:
+            chat_completion = client.chat.completions.create(
+                messages=[
+                    {
+                        "role": "user",
+                        "content": full_prompt,
+                    }
+                ],
+                model=model,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                n=num_paraphrases  # Number of paraphrased sentences to generate
+            )
+            # Extract the paraphrased sentences from the response
+            paraphrased_sentences = [choice.message.content.strip() for choice in chat_completion.choices]
+            # Append paraphrased sentences to the list
+            paraphrased_sentences_list.extend(paraphrased_sentences)
+        except Exception as e:
+            print(f"Error paraphrasing sentence '{sentence}': {e}")
+
+    return paraphrased_sentences_list
+
+result = generate_paraphrase("Mayor Eric Adams did not attend the first candidate forum for the New York City mayoral race, but his record — and the criminal charges he faces — received plenty of attention on Saturday from the Democrats who are running to unseat him.")
+
+print(len(result))
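Note: the module-level smoke test at the bottom (result = generate_paraphrase(...)) fires a real API call every time this file is imported, and app.py does import generate_paraphrase from this module. A guarded version of the same check, offered as a suggestion rather than what the commit does:

# Suggested guard (not in the committed file): run the smoke test only when the
# module is executed directly, not when app.py imports generate_paraphrase.
if __name__ == "__main__":
    result = generate_paraphrase("An example sentence to paraphrase.")  # illustrative input
    print(len(result))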
requirements.txt
CHANGED
@@ -14,4 +14,6 @@ nltk
 tenacity
 pandas
 graphviz==0.20.3
-gradio
+gradio==4.29.0
+openai
+python-dotenv
sampling_methods.py
CHANGED
@@ -1,55 +1,42 @@
-# import torch
-# import random
-
-# def sample_word(words, logits, sampling_technique='inverse_transform', temperature=1.0):
-#     if sampling_technique == 'inverse_transform':
-#         probs = torch.softmax(torch.tensor(logits), dim=-1)
-#         cumulative_probs = torch.cumsum(probs, dim=-1)
-#         random_prob = random.random()
-#         sampled_index = torch.where(cumulative_probs >= random_prob)[0][0]
-#     elif sampling_technique == 'exponential_minimum':
-#         probs = torch.softmax(torch.tensor(logits), dim=-1)
-#         exp_probs = torch.exp(-torch.log(probs))
-#         random_probs = torch.rand_like(exp_probs)
-#         sampled_index = torch.argmax(random_probs * exp_probs)
-#     elif sampling_technique == 'temperature':
-#         scaled_logits = torch.tensor(logits) / temperature
-#         probs = torch.softmax(scaled_logits, dim=-1)
-#         sampled_index = torch.multinomial(probs, 1).item()
-#     elif sampling_technique == 'greedy':
-#         sampled_index = torch.argmax(torch.tensor(logits)).item()
-#     else:
-#         raise ValueError("Invalid sampling technique. Choose 'inverse_transform', 'exponential_minimum', 'temperature', or 'greedy'.")
-
-#     sampled_word = words[sampled_index]
-#     return sampled_word
-
 import torch
 import random
+from vocabulary_split import split_vocabulary, filter_logits
+# from transformers import AutoTokenizer, AutoModelForMaskedLM
+from masking_methods import tokenizer
+
+# Load tokenizer and model for masked language model
+# tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+# model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+
+# Get permissible vocabulary
+permissible, _ = split_vocabulary(seed=42)
+permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])
 
 def sample_word(sentence, words, logits, sampling_technique='inverse_transform', temperature=1.0):
+    filtered_logits = filter_logits(torch.tensor(logits), permissible_indices)
+
     if sampling_technique == 'inverse_transform':
-        probs = torch.softmax(torch.tensor(logits), dim=-1)
+        probs = torch.softmax(filtered_logits / temperature, dim=-1)
         cumulative_probs = torch.cumsum(probs, dim=-1)
         random_prob = random.random()
         sampled_index = torch.where(cumulative_probs >= random_prob)[0][0]
     elif sampling_technique == 'exponential_minimum':
-        probs = torch.softmax(torch.tensor(logits), dim=-1)
+        probs = torch.softmax(filtered_logits / temperature, dim=-1)
        exp_probs = torch.exp(-torch.log(probs))
         random_probs = torch.rand_like(exp_probs)
         sampled_index = torch.argmax(random_probs * exp_probs)
     elif sampling_technique == 'temperature':
-        scaled_logits = torch.tensor(logits) / temperature
-        probs = torch.softmax(scaled_logits, dim=-1)
+        probs = torch.softmax(filtered_logits / temperature, dim=-1)
         sampled_index = torch.multinomial(probs, 1).item()
     elif sampling_technique == 'greedy':
-        sampled_index = torch.argmax(torch.tensor(logits)).item()
+        sampled_index = torch.argmax(filtered_logits).item()
     else:
         raise ValueError("Invalid sampling technique. Choose 'inverse_transform', 'exponential_minimum', 'temperature', or 'greedy'.")
-
-    sampled_word = words[sampled_index]
-
+
+    sampled_word = tokenizer.decode([sampled_index])
+
     # Replace [MASK] with the sampled word
     filled_sentence = sentence.replace('[MASK]', sampled_word)
+
     return filled_sentence
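For intuition, a self-contained trace of the inverse-transform branch on toy logits; it mirrors the arithmetic in sample_word above without loading any model (note that importing this module pulls in the BERT tokenizer from masking_methods as a side effect, which the toy avoids):

# Toy inverse-transform sampling: draw a uniform number and pick the first
# bucket of the cumulative distribution it falls into.
import torch
import random

logits = torch.tensor([2.0, 1.0, 0.5])
probs = torch.softmax(logits, dim=-1)        # ~[0.63, 0.23, 0.14]
cumulative = torch.cumsum(probs, dim=-1)     # ~[0.63, 0.86, 1.00]
r = random.random()                          # uniform draw in [0, 1)
index = torch.where(cumulative >= r)[0][0]   # first index whose cumulative prob covers r
print(int(index))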
threeD_plot.py
ADDED
@@ -0,0 +1,137 @@
# import numpy as np
# import plotly.graph_objects as go
# from scipy.interpolate import griddata

# def gen_three_D_plot(detectability_val, distortion_val, euclidean_val):
#     detectability = np.array(detectability_val)
#     distortion = np.array(distortion_val)
#     euclidean = np.array(euclidean_val)

#     # Find the closest point to the origin
#     distances_to_origin = np.linalg.norm(np.array([distortion, detectability, euclidean]).T, axis=1)
#     closest_point_index = np.argmin(distances_to_origin)

#     # Determine the closest points to each axis
#     closest_to_x_axis = np.argmin(distortion)
#     closest_to_y_axis = np.argmin(detectability)
#     closest_to_z_axis = np.argmin(euclidean)

#     # Use the detected closest point as the "sweet spot"
#     sweet_spot_detectability = detectability[closest_point_index]
#     sweet_spot_distortion = distortion[closest_point_index]
#     sweet_spot_euclidean = euclidean[closest_point_index]

#     # Create a meshgrid from the data
#     x_grid, y_grid = np.meshgrid(np.linspace(min(detectability), max(detectability), 30),
#                                  np.linspace(min(distortion), max(distortion), 30))

#     # Interpolate z values (Euclidean distances) to fit the grid
#     z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='linear')

#     if z_grid is None:
#         raise ValueError("griddata could not generate a valid interpolation. Check your input data.")

#     # Create the 3D contour plot with the Plasma color scale
#     fig = go.Figure(data=go.Surface(
#         z=z_grid,
#         x=x_grid,
#         y=y_grid,
#         contours={
#             "z": {"show": True, "start": min(euclidean), "end": max(euclidean), "size": 0.1, "usecolormap": True}
#         },
#         colorscale='Plasma'
#     ))

#     # Add a marker for the sweet spot
#     fig.add_trace(go.Scatter3d(
#         x=[sweet_spot_detectability],
#         y=[sweet_spot_distortion],
#         z=[sweet_spot_euclidean],
#         mode='markers+text',
#         marker=dict(size=10, color='red', symbol='circle'),
#         text=["Sweet Spot"],
#         textposition="top center"
#     ))

#     # Set axis labels
#     fig.update_layout(
#         scene=dict(
#             xaxis_title='Detectability Score',
#             yaxis_title='Distortion Score',
#             zaxis_title='Euclidean Distance'
#         ),
#         margin=dict(l=0, r=0, b=0, t=0)
#     )

#     return fig


import numpy as np
import plotly.graph_objects as go
from scipy.interpolate import griddata

def gen_three_D_plot(detectability_val, distortion_val, euclidean_val):
    detectability = np.array(detectability_val)
    distortion = np.array(distortion_val)
    euclidean = np.array(euclidean_val)

    # Normalize the values to range [0, 1]
    norm_detectability = (detectability - min(detectability)) / (max(detectability) - min(detectability))
    norm_distortion = (distortion - min(distortion)) / (max(distortion) - min(distortion))
    norm_euclidean = (euclidean - min(euclidean)) / (max(euclidean) - min(euclidean))

    # Composite score: maximize detectability, minimize distortion and Euclidean distance
    # We subtract distortion and euclidean as we want them minimized.
    composite_score = norm_detectability - (norm_distortion + norm_euclidean)

    # Find the index of the maximum score (sweet spot)
    sweet_spot_index = np.argmax(composite_score)

    # Sweet spot values
    sweet_spot_detectability = detectability[sweet_spot_index]
    sweet_spot_distortion = distortion[sweet_spot_index]
    sweet_spot_euclidean = euclidean[sweet_spot_index]

    # Create a meshgrid from the data
    x_grid, y_grid = np.meshgrid(np.linspace(min(detectability), max(detectability), 30),
                                 np.linspace(min(distortion), max(distortion), 30))

    # Interpolate z values (Euclidean distances) to fit the grid
    z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='linear')

    if z_grid is None:
        raise ValueError("griddata could not generate a valid interpolation. Check your input data.")

    # Create the 3D contour plot with the Plasma color scale
    fig = go.Figure(data=go.Surface(
        z=z_grid,
        x=x_grid,
        y=y_grid,
        contours={
            "z": {"show": True, "start": min(euclidean), "end": max(euclidean), "size": 0.1, "usecolormap": True}
        },
        colorscale='Plasma'
    ))

    # Add a marker for the sweet spot
    fig.add_trace(go.Scatter3d(
        x=[sweet_spot_detectability],
        y=[sweet_spot_distortion],
        z=[sweet_spot_euclidean],
        mode='markers+text',
        marker=dict(size=10, color='red', symbol='circle'),
        text=["Sweet Spot"],
        textposition="top center"
    ))

    # Set axis labels
    fig.update_layout(
        scene=dict(
            xaxis_title='Detectability Score',
            yaxis_title='Distortion Score',
            zaxis_title='Euclidean Distance'
        ),
        margin=dict(l=0, r=0, b=0, t=0)
    )

    return fig
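A worked trace of the composite score on made-up values, mirroring the normalization in the new version above:

# Each metric is min-max scaled, then the point with high detectability and
# low distortion/Euclidean distance wins the composite score.
import numpy as np

det = np.array([0.2, 0.9, 0.5])
dis = np.array([0.4, 0.1, 0.3])
euc = np.array([0.6, 0.2, 0.5])

norm = lambda v: (v - v.min()) / (v.max() - v.min())
score = norm(det) - (norm(dis) + norm(euc))  # [-2.0, 1.0, -0.99]
print(np.argmax(score))                      # -> 1, the "sweet spot" index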
tree.py
CHANGED
@@ -1,341 +1,3 @@
-# import plotly.graph_objects as go
-# import textwrap
-# import re
-# from collections import defaultdict
-
-# def generate_subplot1(paraphrased_sentence, scheme_sentences, highlight_info):
-#     # Combine nodes into one list with appropriate labels
-#     nodes = [paraphrased_sentence] + scheme_sentences
-#     nodes[0] += ' L0'  # Paraphrased sentence is level 0
-#     for i in range(1, len(nodes)):
-#         nodes[i] += ' L1'  # Scheme sentences are level 1
-
-#     # Define the highlight_words function
-#     def highlight_words(sentence, color_map):
-#         for word, color in color_map.items():
-#             sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
-#         return sentence
-
-#     # Clean and wrap nodes, and highlight specified words globally
-#     cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
-#     global_color_map = dict(highlight_info)
-#     highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
-#     wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=50)) for node in highlighted_nodes]
-
-#     # Function to determine tree levels and create edges dynamically
-#     def get_levels_and_edges(nodes):
-#         levels = {}
-#         edges = []
-#         for i, node in enumerate(nodes):
-#             level = int(node.split()[-1][1])
-#             levels[i] = level
-
-#         # Add edges from L0 to all L1 nodes
-#         root_node = next(i for i, level in levels.items() if level == 0)
-#         for i, level in levels.items():
-#             if level == 1:
-#                 edges.append((root_node, i))
-
-#         return levels, edges
-
-#     # Get levels and dynamic edges
-#     levels, edges = get_levels_and_edges(nodes)
-#     max_level = max(levels.values(), default=0)
-
-#     # Calculate positions
-#     positions = {}
-#     level_heights = defaultdict(int)
-#     for node, level in levels.items():
-#         level_heights[level] += 1
-
-#     y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
-#     x_gap = 2
-#     l1_y_gap = 10
-
-#     for node, level in levels.items():
-#         if level == 1:
-#             positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
-#         else:
-#             positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
-#         y_offsets[level] += 1
-
-#     # Function to highlight words in a wrapped node string
-#     def color_highlighted_words(node, color_map):
-#         parts = re.split(r'(\{\{.*?\}\})', node)
-#         colored_parts = []
-#         for part in parts:
-#             match = re.match(r'\{\{(.*?)\}\}', part)
-#             if match:
-#                 word = match.group(1)
-#                 color = color_map.get(word, 'black')
-#                 colored_parts.append(f"<span style='color: {color};'>{word}</span>")
-#             else:
-#                 colored_parts.append(part)
-#         return ''.join(colored_parts)
-
-#     # Define the text for each edge
-#     edge_texts = [
-#         "Highest Entropy Masking",
-#         "Pseudo-random Masking",
-#         "Random Masking",
-#         "Greedy Sampling",
-#         "Temperature Sampling",
-#         "Exponential Minimum Sampling",
-#         "Inverse Transform Sampling",
-#         "Greedy Sampling",
-#         "Temperature Sampling",
-#         "Exponential Minimum Sampling",
-#         "Inverse Transform Sampling",
-#         "Greedy Sampling",
-#         "Temperature Sampling",
-#         "Exponential Minimum Sampling",
-#         "Inverse Transform Sampling"
-#     ]
-
-#     # Create figure
-#     fig1 = go.Figure()
-
-#     # Add nodes to the figure
-#     for i, node in enumerate(wrapped_nodes):
-#         colored_node = color_highlighted_words(node, global_color_map)
-#         x, y = positions[i]
-#         fig1.add_trace(go.Scatter(
-#             x=[-x],  # Reflect the x coordinate
-#             y=[y],
-#             mode='markers',
-#             marker=dict(size=10, color='blue'),
-#             hoverinfo='none'
-#         ))
-#         fig1.add_annotation(
-#             x=-x,  # Reflect the x coordinate
-#             y=y,
-#             text=colored_node,
-#             showarrow=False,
-#             xshift=15,
-#             align="center",
-#             font=dict(size=12),
-#             bordercolor='black',
-#             borderwidth=1,
-#             borderpad=2,
-#             bgcolor='white',
-#             width=300,
-#             height=120
-#         )
-
-#     # Add edges and text above each edge
-#     for i, edge in enumerate(edges):
-#         x0, y0 = positions[edge[0]]
-#         x1, y1 = positions[edge[1]]
-#         fig1.add_trace(go.Scatter(
-#             x=[-x0, -x1],  # Reflect the x coordinates
-#             y=[y0, y1],
-#             mode='lines',
-#             line=dict(color='black', width=1)
-#         ))
-
-#         # Calculate the midpoint of the edge
-#         mid_x = (-x0 + -x1) / 2
-#         mid_y = (y0 + y1) / 2
-
-#         # Adjust y position to shift text upwards
-#         text_y_position = mid_y + 0.8  # Increase this value to shift the text further upwards
-
-#         # Add text annotation above the edge
-#         fig1.add_annotation(
-#             x=mid_x,
-#             y=text_y_position,
-#             text=edge_texts[i],  # Use the text specific to this edge
-#             showarrow=False,
-#             font=dict(size=12),
-#             align="center"
-#         )
-
-#     fig1.update_layout(
-#         showlegend=False,
-#         margin=dict(t=20, b=20, l=20, r=20),
-#         xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
-#         yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
-#         width=1435,  # Adjusted width to accommodate more levels
-#         height=1000  # Adjusted height to accommodate more levels
-#     )
-
-#     return fig1
-
-
-
-# def generate_subplot2(scheme_sentences, sampled_sentence, highlight_info):
-#     # Combine nodes into one list with appropriate labels
-#     nodes = scheme_sentences + sampled_sentence
-#     para_len = len(scheme_sentences)
-
-#     # Reassign levels: L1 -> L0, L2 -> L1
-#     for i in range(para_len):
-#         nodes[i] += ' L0'  # Scheme sentences are now level 0
-#     for i in range(para_len, len(nodes)):
-#         nodes[i] += ' L1'  # Sampled sentences are now level 1
-
-#     # Define the highlight_words function
-#     def highlight_words(sentence, color_map):
-#         for word, color in color_map.items():
-#             sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
-#         return sentence
-
-#     # Clean and wrap nodes, and highlight specified words globally
-#     cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
-#     global_color_map = dict(highlight_info)
-#     highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
-#     wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=80)) for node in highlighted_nodes]
-
-#     # Function to determine tree levels and create edges dynamically
-#     def get_levels_and_edges(nodes):
-#         levels = {}
-#         edges = []
-#         for i, node in enumerate(nodes):
-#             level = int(node.split()[-1][1])
-#             levels[i] = level
-
-#         # Add edges from L0 to all L1 nodes
-#         l0_indices = [i for i, level in levels.items() if level == 0]
-#         l1_indices = [i for i, level in levels.items() if level == 1]
-
-#         # Ensure there are exactly 3 L0 nodes
-#         if len(l0_indices) < 3:
-#             raise ValueError("There should be exactly 3 L0 nodes to attach edges correctly.")
-
-#         # Split L1 nodes into 3 groups of 4 for attaching to L0 nodes
-#         for i, l1_node in enumerate(l1_indices):
-#             if i < 4:
-#                 edges.append((l0_indices[0], l1_node))  # Connect to the first L0 node
-#             elif i < 8:
-#                 edges.append((l0_indices[1], l1_node))  # Connect to the second L0 node
-#             else:
-#                 edges.append((l0_indices[2], l1_node))  # Connect to the third L0 node
-
-#         return levels, edges
-
-#     # Get levels and dynamic edges
-#     levels, edges = get_levels_and_edges(nodes)
|
218 |
-
|
219 |
-
# # Calculate positions
|
220 |
-
# positions = {}
|
221 |
-
# level_heights = defaultdict(int)
|
222 |
-
# for node, level in levels.items():
|
223 |
-
# level_heights[level] += 1
|
224 |
-
|
225 |
-
# y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
|
226 |
-
# x_gap = 2
|
227 |
-
# l1_y_gap = 10
|
228 |
-
|
229 |
-
# for node, level in levels.items():
|
230 |
-
# if level == 1:
|
231 |
-
# positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
|
232 |
-
# else:
|
233 |
-
# positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
|
234 |
-
# y_offsets[level] += 1
|
235 |
-
|
236 |
-
# # Function to highlight words in a wrapped node string
|
237 |
-
# def color_highlighted_words(node, color_map):
|
238 |
-
# parts = re.split(r'(\{\{.*?\}\})', node)
|
239 |
-
# colored_parts = []
|
240 |
-
# for part in parts:
|
241 |
-
# match = re.match(r'\{\{(.*?)\}\}', part)
|
242 |
-
# if match:
|
243 |
-
# word = match.group(1)
|
244 |
-
# color = color_map.get(word, 'black')
|
245 |
-
# colored_parts.append(f"<span style='color: {color};'>{word}</span>")
|
246 |
-
# else:
|
247 |
-
# colored_parts.append(part)
|
248 |
-
# return ''.join(colored_parts)
|
249 |
-
|
250 |
-
# # Define the text for each edge
|
251 |
-
# edge_texts = [
|
252 |
-
# "Highest Entropy Masking",
|
253 |
-
# "Pseudo-random Masking",
|
254 |
-
# "Random Masking",
|
255 |
-
# "Greedy Sampling",
|
256 |
-
# "Temperature Sampling",
|
257 |
-
# "Exponential Minimum Sampling",
|
258 |
-
# "Inverse Transform Sampling",
|
259 |
-
# "Greedy Sampling",
|
260 |
-
# "Temperature Sampling",
|
261 |
-
# "Exponential Minimum Sampling",
|
262 |
-
# "Inverse Transform Sampling",
|
263 |
-
# "Greedy Sampling",
|
264 |
-
# "Temperature Sampling",
|
265 |
-
# "Exponential Minimum Sampling",
|
266 |
-
# "Inverse Transform Sampling"
|
267 |
-
# ]
|
268 |
-
|
269 |
-
# # Create figure
|
270 |
-
# fig2 = go.Figure()
|
271 |
-
|
272 |
-
# # Add nodes to the figure
|
273 |
-
# for i, node in enumerate(wrapped_nodes):
|
274 |
-
# colored_node = color_highlighted_words(node, global_color_map)
|
275 |
-
# x, y = positions[i]
|
276 |
-
# fig2.add_trace(go.Scatter(
|
277 |
-
# x=[-x], # Reflect the x coordinate
|
278 |
-
# y=[y],
|
279 |
-
# mode='markers',
|
280 |
-
# marker=dict(size=10, color='blue'),
|
281 |
-
# hoverinfo='none'
|
282 |
-
# ))
|
283 |
-
# fig2.add_annotation(
|
284 |
-
# x=-x, # Reflect the x coordinate
|
285 |
-
# y=y,
|
286 |
-
# text=colored_node,
|
287 |
-
# showarrow=False,
|
288 |
-
# xshift=15,
|
289 |
-
# align="center",
|
290 |
-
# font=dict(size=12),
|
291 |
-
# bordercolor='black',
|
292 |
-
# borderwidth=1,
|
293 |
-
# borderpad=2,
|
294 |
-
# bgcolor='white',
|
295 |
-
# width=450,
|
296 |
-
# height=65
|
297 |
-
# )
|
298 |
-
|
299 |
-
# # Add edges and text above each edge
|
300 |
-
# for i, edge in enumerate(edges):
|
301 |
-
# x0, y0 = positions[edge[0]]
|
302 |
-
# x1, y1 = positions[edge[1]]
|
303 |
-
# fig2.add_trace(go.Scatter(
|
304 |
-
# x=[-x0, -x1], # Reflect the x coordinates
|
305 |
-
# y=[y0, y1],
|
306 |
-
# mode='lines',
|
307 |
-
# line=dict(color='black', width=1)
|
308 |
-
# ))
|
309 |
-
|
310 |
-
# # Calculate the midpoint of the edge
|
311 |
-
# mid_x = (-x0 + -x1) / 2
|
312 |
-
# mid_y = (y0 + y1) / 2
|
313 |
-
|
314 |
-
# # Adjust y position to shift text upwards
|
315 |
-
# text_y_position = mid_y + 0.8 # Increase this value to shift the text further upwards
|
316 |
-
|
317 |
-
# # Add text annotation above the edge
|
318 |
-
# fig2.add_annotation(A surprising aspect of tests, specifically self-testing soon after exposure to new material, is that they can significantly improve your ability to learn, apply, and maintain new knowledge.
|
319 |
-
# x=mid_x,
|
320 |
-
# y=text_y_position,
|
321 |
-
# text=edge_texts[i], # Use the text specific to this edge
|
322 |
-
# showarrow=False,
|
323 |
-
# font=dict(size=12),
|
324 |
-
# align="center"
|
325 |
-
# )
|
326 |
-
|
327 |
-
# fig2.update_layout(
|
328 |
-
# showlegend=False,
|
329 |
-
# margin=dict(t=20, b=20, l=20, r=20),
|
330 |
-
# xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
|
331 |
-
# yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
|
332 |
-
# width=1435, # Adjusted width to accommodate more levels
|
333 |
-
# height=1000 # Adjusted height to accommodate more levels
|
334 |
-
# )
|
335 |
-
|
336 |
-
# return fig2
|
337 |
-
|
338 |
-
|
339 |
import plotly.graph_objects as go
|
340 |
import textwrap
|
341 |
import re
vocabulary_split.py
ADDED
@@ -0,0 +1,57 @@
+import random
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+import torch
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
+def split_vocabulary(seed=42):
+    # Initialize the tokenizer and model
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+    model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
+
+    # Get the full vocabulary
+    vocab = list(tokenizer.get_vocab().items())
+
+    # Initialize the random number generator
+    random.seed(seed)
+
+    # Split the vocabulary into permissible and non-permissible buckets
+    permissible = {}
+    non_permissible = {}
+
+    for word, index in vocab:
+        if random.random() < 0.5:  # 50% chance of being permissible
+            permissible[word] = index
+        else:
+            non_permissible[word] = index
+
+    return permissible, non_permissible
+
+def get_logits_for_mask(model, tokenizer, sentence):
+    inputs = tokenizer(sentence, return_tensors="pt")
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    logits = outputs.logits
+    mask_token_logits = logits[0, mask_token_index, :]
+    return mask_token_logits.squeeze()
+
+def filter_logits(logits, permissible_indices):
+    filtered_logits = logits.clone()
+    if filtered_logits.dim() > 1:
+        filtered_logits = filtered_logits.squeeze()
+    if filtered_logits.shape != permissible_indices.shape:
+        permissible_indices = permissible_indices[:filtered_logits.shape[0]]
+    filtered_logits[~permissible_indices] = float('-inf')
+    return filtered_logits
+
+# Usage example
+permissible, non_permissible = split_vocabulary(seed=42)
+permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])
+
+# When sampling:
+sentence = "The [MASK] is bright today."
+logits = get_logits_for_mask(model, tokenizer, sentence)
+filtered_logits = filter_logits(logits, permissible_indices)
+# Use filtered_logits for sampling
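
Note (not part of the commit): the usage example above stops at "Use filtered_logits for sampling". A minimal sketch of that last step, assuming greedy selection over the filtered distribution; `best_token_id`, `replacement`, and `watermarked_sentence` are illustrative names that reuse this file's variables. Because every non-permissible entry has been set to -inf, any sampling rule applied to `filtered_logits` can only emit tokens from the permissible half.

best_token_id = torch.argmax(filtered_logits).item()  # highest-scoring token from the permissible half
replacement = tokenizer.decode([best_token_id]).strip()  # map the token id back to a surface form
watermarked_sentence = sentence.replace("[MASK]", replacement, 1)  # fill the mask with a permissible word
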
watermark_detector.py
ADDED
@@ -0,0 +1,75 @@
+import nltk
+from nltk.corpus import stopwords
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+from vocabulary_split import split_vocabulary, filter_logits
+import torch
+from lcs import find_common_subsequences
+from paraphraser import generate_paraphrase
+
+nltk.download('punkt', quiet=True)
+nltk.download('stopwords', quiet=True)
+
+tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+
+permissible, _ = split_vocabulary(seed=42)
+permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])
+
+def get_non_melting_points(original_sentence):
+    paraphrased_sentences = generate_paraphrase(original_sentence)
+    common_subsequences = find_common_subsequences(original_sentence, paraphrased_sentences)
+    return common_subsequences
+
+def get_word_between_points(sentence, start_point, end_point):
+    words = nltk.word_tokenize(sentence)
+    stop_words = set(stopwords.words('english'))
+    start_index = sentence.index(start_point[1])
+    end_index = sentence.index(end_point[1])
+
+    for word in words[start_index+1:end_index]:
+        if word.lower() not in stop_words:
+            return word, words.index(word)
+    return None, None
+
+def get_logits_for_mask(sentence):
+    inputs = tokenizer(sentence, return_tensors="pt")
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    logits = outputs.logits
+    mask_token_logits = logits[0, mask_token_index, :]
+    return mask_token_logits.squeeze()
+
+def detect_watermark(sentence):
+    non_melting_points = get_non_melting_points(sentence)
+
+    if len(non_melting_points) < 2:
+        return False, "Not enough non-melting points found."
+
+    word_to_check, index = get_word_between_points(sentence, non_melting_points[0], non_melting_points[1])
+
+    if word_to_check is None:
+        return False, "No suitable word found between non-melting points."
+
+    words = nltk.word_tokenize(sentence)
+    masked_sentence = ' '.join(words[:index] + ['[MASK]'] + words[index+1:])
+
+    logits = get_logits_for_mask(masked_sentence)
+    filtered_logits = filter_logits(logits, permissible_indices)
+
+    top_predictions = filtered_logits.argsort()[-5:]
+    predicted_words = [tokenizer.decode([i]) for i in top_predictions]
+
+    if word_to_check in predicted_words:
+        return True, f"Watermark detected. The word '{word_to_check}' is in the permissible vocabulary."
+    else:
+        return False, f"No watermark detected. The word '{word_to_check}' is not in the permissible vocabulary."
+
+# Example usage
+# if __name__ == "__main__":
+#     test_sentence = "The quick brown fox jumps over the lazy dog."
+#     is_watermarked, message = detect_watermark(test_sentence)
+#     print(f"Is the sentence watermarked? {is_watermarked}")
+#     print(f"Detection message: {message}")
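
Note (not part of the commit): a self-contained driver mirroring the commented example above; the candidate strings are hypothetical. The decision rule is simply whether the word between the first two non-melting points reappears among the top-5 predictions once the mask logits are restricted to the permissible half of the vocabulary.

from watermark_detector import detect_watermark

candidates = [
    "The quick brown fox jumps over the lazy dog.",  # plain text, expected to be unwatermarked
    "The sun is bright today.",                      # hypothetical output of the watermarking pipeline
]
for candidate in candidates:
    flagged, reason = detect_watermark(candidate)
    print(f"watermarked={flagged}: {reason}")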