bhavanishankarpullela committed
Commit 2dc7757 • 1 Parent(s): b817ab5
Upload 9 files
- ST/inference/codes/alignment.py +105 -0
- ST/inference/codes/bleu_significance.py +96 -0
- ST/inference/codes/dictionary_creation.py +69 -0
- ST/inference/codes/evaluate_exactmatch.py +78 -0
- ST/inference/codes/evaluate_sari.py +39 -0
- ST/inference/codes/german_synthetic_switching.py +63 -0
- ST/inference/codes/getTranslationBleu.py +103 -0
- ST/inference/codes/syntheticCodeSwitching.py +110 -0
- ST/inference/codes/wilcoxon.py +51 -0
ST/inference/codes/alignment.py
ADDED
@@ -0,0 +1,105 @@
import torch
import itertools
from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM
import spacy
import random
import os
import csv
os.environ["CUDA_VISIBLE_DEVICES"] = "7" # SET the GPUs you want to use


class TextAligner:
    def __init__(self):
        self.model = AutoModel.from_pretrained("aneuraz/awesome-align-with-co")
        self.tokenizer = AutoTokenizer.from_pretrained("aneuraz/awesome-align-with-co")
        self.align_layer = 8
        self.threshold = 1e-3

    def align_texts(self, original, translated):
        sent_src, sent_tgt = original.strip().split(), translated.strip().split()
        token_src, token_tgt = [self.tokenizer.tokenize(word) for word in sent_src], [self.tokenizer.tokenize(word) for word in sent_tgt]
        wid_src, wid_tgt = [self.tokenizer.convert_tokens_to_ids(x) for x in token_src], [self.tokenizer.convert_tokens_to_ids(x) for x in token_tgt]
        ids_src, ids_tgt = self.tokenizer.prepare_for_model(list(itertools.chain(*wid_src)), return_tensors='pt', model_max_length=self.tokenizer.model_max_length, truncation=True)['input_ids'], self.tokenizer.prepare_for_model(list(itertools.chain(*wid_tgt)), return_tensors='pt', truncation=True, model_max_length=self.tokenizer.model_max_length)['input_ids']

        sub2word_map_src = [i for i, word_list in enumerate(token_src) for _ in word_list]
        sub2word_map_tgt = [i for i, word_list in enumerate(token_tgt) for _ in word_list]

        self.model.eval()
        with torch.no_grad():
            out_src = self.model(ids_src.unsqueeze(0), output_hidden_states=True)[2][self.align_layer][0, 1:-1]
            out_tgt = self.model(ids_tgt.unsqueeze(0), output_hidden_states=True)[2][self.align_layer][0, 1:-1]

            dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))

            softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
            softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)

            softmax_inter = (softmax_srctgt > self.threshold) * (softmax_tgtsrc > self.threshold)

        align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
        align_words = {(sent_src[sub2word_map_src[i]], sent_tgt[sub2word_map_tgt[j]]) for i, j in align_subwords}

        return align_words

# Load the NLLB model for translation
#tel_Telu
#hin_Deva
#mar_Deva
#ben_Beng
#vie_Latn
#ces_Latn
# tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B", src_lang="hin_Deva")
# model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")

# def get_translation(word):
#     """Fetch the English translation for a given Telugu word using the NLLB model."""
#     inputs = tokenizer(word, return_tensors="pt")
#     translated_tokens = model.generate(
#         **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=100
#     )
#     english_phrase = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
#     return english_phrase

class CodeSwitcher(TextAligner):
    def __init__(self):
        super().__init__()
        self.nlp = spacy.load("en_core_web_sm")

    def switch_content_words(self, source_sentences, ratio=0.5):
        english_translations = ['Uhh tell me, is this not a service?', 'But getting stuck here would not help.', 'Bajpai was also included among these economists.', 'I thought, um...should I go see a movie this evening?', 'He...means, he bought his new car.']
        mixed_sentences = []

        for source, english in zip(source_sentences, english_translations):
            aligned_pairs = self.align_texts(source, english)
            print(aligned_pairs)
            aligned_dict = dict(aligned_pairs)
            print(aligned_dict)

            doc = self.nlp(english)
            content_word_tags = ["NN", "NNS", "NNP", "NNPS", "JJ", "JJR", "JJS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "RB", "RBR", "RBS", "NOUN", "VERB", "ADJ", "ADV"]
            content_words = [token.text for token in doc if token.tag_ in content_word_tags]

            num_to_switch = int(len(content_words) * ratio)
            words_to_switch = random.sample(content_words, min(num_to_switch, len(content_words)))

            new_sentence = []
            for word in source.split():
                aligned_english_word = aligned_dict.get(word, None)
                if aligned_english_word and aligned_english_word in words_to_switch:
                    new_sentence.append(aligned_english_word)
                else:
                    new_sentence.append(word)

            mixed_sentences.append(' '.join(new_sentence))

        return mixed_sentences

# Usage:
switcher = CodeSwitcher()
hindi_sentences = ['अ मुझे बताइए ये सेवा नहीं है क्या?', 'लेकिन यहां पर पर अटकने से काम नहीं होगा।', 'बाजपेयी भी इन अर अर्थशास्त्रियों में शामिल थे।', 'मैंने सोचा कि, उम्म...क्या मैं आज शाम को फिल्म देखने जाऊँ?', 'उसके...मतलब, उसने अपनी नई कार खरीदी है।']
# french_sentences = ["Je veux que tu envoies la photo photo d' écran à Mireille avec Lucie en cc.", "Envoie un mail à euh jena@polonium.com.", "envoie une un message à Alice.", "Peux-tu euuh envoyer cet SMS sur le chien de maman?", "Dis à Karen par euh SMS que j'arrive en en joignant mon heure d'arrivée.", "Mets en pièce jointe mes coordonnées GPS au courriel pour euh Lucie.", "écris une un mail à Alice.", "Merci d'écrire un email à Pierre."]
print(switcher.switch_content_words(hindi_sentences, 0.6))
print("-----------------")
print(switcher.switch_content_words(hindi_sentences, 0.8))
print("-----------------")
print(switcher.switch_content_words(hindi_sentences, 1.0))
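Illustrative usage (not part of the committed file; the toy sentence pair and the expected output are assumptions): TextAligner.align_texts returns an unordered set of (source word, English word) pairs, which CodeSwitcher turns into a lookup dict.

aligner = TextAligner()
pairs = aligner.align_texts("अ मुझे बताइए ये सेवा नहीं है क्या?", "Uhh tell me, is this not a service?")
print(pairs)  # e.g. {('सेवा', 'service'), ('नहीं', 'not'), ...}; the exact pairs depend on the model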
ST/inference/codes/bleu_significance.py
ADDED
@@ -0,0 +1,96 @@
import pandas as pd
# from evaluate import load
from scipy import stats
from nltk.translate.bleu_score import sentence_bleu
import string

# Load data from the CSV files
df1 = pd.read_csv('MT0_xxl_ape/result_mr')
df2 = pd.read_csv('MT0_xxl_ape/result_mr_50p')
df_reference = pd.read_csv('MT0_xxl_ape/result_mr')

# bleu = load("sacrebleu")

sentences1 = df1['pred_label']
sentences2 = df2['pred_label']
reference_sentences = df_reference['ref']

def process_sentence(sentence):
    if not isinstance(sentence, str):
        return ""
    # Keep only the first line, strip surrounding spaces, and lowercase
    sentence = sentence.split('\n')[0]
    sentence = sentence.strip()
    sentence = sentence.lower()

    # Remove punctuation marks in the sentence
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, "")
    sentence = sentence.strip()

    if sentence == "":
        return sentence

    # Drop a trailing danda (Devanagari full stop), which string.punctuation does not cover
    if sentence[-1] == '।':
        print(sentence)
        sentence = sentence[:-1]
        print(sentence)

    return sentence

# Calculate sentence-level BLEU scores
# NLTK's sentence_bleu expects token lists, so split each sentence into words
def calculate_bleu(sentences, reference):
    return [sentence_bleu([reference[i].split()], sentences[i].split()) for i in range(len(sentences))]

sentences1 = [process_sentence(s) for s in list(sentences1)]
sentences2 = [process_sentence(s) for s in list(sentences2)]
reference_sentences = [process_sentence(s) for s in list(reference_sentences)]

bleu_scores1 = calculate_bleu(sentences1, reference_sentences)
bleu_scores2 = calculate_bleu(sentences2, reference_sentences)

# Check for normality (Shapiro-Wilk)
def check_normality(data):
    stat, p = stats.shapiro(data)
    if p > 0.05:
        return True
    else:
        return False

is_normal1 = check_normality(bleu_scores1)
is_normal2 = check_normality(bleu_scores2)

# Check for equal variances (Levene's test)
def check_variance(data1, data2):
    stat, p = stats.levene(data1, data2)
    if p > 0.05:
        return True
    else:
        return False

is_equal_var = check_variance(bleu_scores1, bleu_scores2)

# Decide and perform the significance test
def perform_significance_test():
    if is_normal1 and is_normal2:
        if is_equal_var:
            t_stat, p = stats.ttest_ind(bleu_scores1, bleu_scores2)
            return "T-test", p
        else:
            t_stat, p = stats.ttest_ind(bleu_scores1, bleu_scores2, equal_var=False)
            return "Welch's T-test", p
    else:
        u_stat, p = stats.mannwhitneyu(bleu_scores1, bleu_scores2)
        return "Mann-Whitney U test", p

test_name, p_value = perform_significance_test()

# Output results
print(f"Test used: {test_name}")
print(f"P-value: {p_value}")
if p_value < 0.05:
    print("The difference in BLEU scores is statistically significant.")
else:
    print("The difference in BLEU scores is not statistically significant.")
ST/inference/codes/dictionary_creation.py
ADDED
@@ -0,0 +1,69 @@
import requests
import bz2
import xml.etree.ElementTree as ET
import os
import pickle
from tqdm import tqdm
import mwparserfromhell

# Step 1: Download the latest dump
DUMP_URL = "https://dumps.wikimedia.org/tewiktionary/latest/tewiktionary-latest-pages-articles.xml.bz2"
response = requests.get(DUMP_URL, stream=True)

print("Downloading the latest dump...")
total_size = int(response.headers.get('content-length', 0))
progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)

dump_file = "tewiktionary-latest-pages-articles.xml.bz2"
with open(dump_file, 'wb') as file:
    for chunk in response.iter_content(chunk_size=8192):
        progress_bar.update(len(chunk))
        file.write(chunk)
progress_bar.close()

# Step 2: Extract the dump
print("\nExtracting the dump...")
with bz2.open(dump_file, 'rb') as source, open(dump_file[:-4], 'wb') as dest:
    for line in source:
        dest.write(line)

# Step 3: Parse the XML dump and extract translations
print("Parsing the XML dump to extract translations...")
tree = ET.parse(dump_file[:-4])
root = tree.getroot()

ns = {'ns': 'http://www.mediawiki.org/xml/export-0.10/'}

translations = {}

for page in root.findall('ns:page', ns):
    title = page.find('ns:title', ns).text
    revision = page.find('ns:revision', ns)
    # Compare against None explicitly: ElementTree elements with no children are falsy
    if revision is not None:
        text_data = revision.find('ns:text', ns)
        if text_data is not None and text_data.text:
            # Parse the wikitext
            wikicode = mwparserfromhell.parse(text_data.text)
            links = [link.title for link in wikicode.filter_wikilinks() if link.title.startswith("en:")]
            if links:
                english_translations = [str(link.split(':')[1]) for link in links]
                translations[title] = english_translations

# Display the first 1000 translations
print("\nDisplaying the first 1000 translations:")
for i, (telugu_word, english_words) in enumerate(translations.items()):
    if i >= 1000:
        break
    print(f"Telugu Word: {telugu_word}, English Translations: {', '.join(english_words)}")

# Save the translations to a pickle file
print("\nSaving translations to pickle file...")
pickle_filename = "telugu_english_translations.pkl"
with open(pickle_filename, 'wb') as file:
    pickle.dump(translations, file)

print(f"Translations saved to {pickle_filename}")

# Optional: Remove the downloaded files if you want
# os.remove(dump_file)
# os.remove(dump_file[:-4])
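Illustrative usage (not part of the committed file; the sample lookup word and its translation are assumptions): the saved pickle maps each Telugu headword to the English link targets found on its page and can be reloaded like this.

import pickle
with open("telugu_english_translations.pkl", "rb") as f:
    te_en = pickle.load(f)
print(te_en.get('అమ్మ', []))  # e.g. ['mother'] if the dump links that page to en:mother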
ST/inference/codes/evaluate_exactmatch.py
ADDED
@@ -0,0 +1,78 @@
from evaluate import load
import pandas as pd
import string

# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "3"

exact_match_metric = load("exact_match")
bleu = load("sacrebleu")
# meteor = load('meteor')
# comet = load('comet')
# bertscore = load('bertscore')

# import torch

# # Check if CUDA (GPU) is available
# if torch.cuda.is_available():
#     device = torch.device('cuda')
#     print("Using GPU:", torch.cuda.get_device_name(0))
# else:
#     device = torch.device('cpu')
#     print("Using CPU")


# # Optimize for Tensor Cores if available
# if 'A100' in torch.cuda.get_device_name(0):
#     # Set the precision for matrix multiplications
#     # Choose 'medium' for a balance between performance and precision
#     # Or 'high' if you need higher precision
#     torch.set_float32_matmul_precision('medium')


df = pd.read_csv("MT0_xxl_results/result_m_eng_l")
reference = list(df.label)
predicted = list(df.pred_label)
# source = list(df.disfluent)

def process_sentence(sentence):
    if not isinstance(sentence, str):
        return ""
    # Remove spaces before and after the sentence
    sentence = sentence.split('\n')[0]
    sentence = sentence.strip()
    sentence = sentence.lower()

    # Remove punctuation marks in the sentence
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, "")
    sentence = sentence.strip()

    if sentence == "":
        return sentence

    if sentence[-1] == '।':
        print(sentence)
        sentence = sentence[:-1]
        print(sentence)

    return sentence

reference = [process_sentence(s) for s in list(df.label)]
# source = [process_sentence(s) for s in list(df.disfluent)]
predicted = [process_sentence(s) for s in list(df.pred_label)]


results = {}
results['exact_match'] = exact_match_metric.compute(predictions=predicted, references=reference)
results['bleu'] = bleu.compute(predictions=predicted, references=reference)
# results['meteor'] = meteor.compute(predictions=predicted, references=reference)
# results['comet'] = comet.compute(sources=source, predictions=predicted, references=reference)
# results['bertscore'] = bertscore.compute(predictions=predicted, references=reference)

print(results)
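Illustrative input format (assumed for orientation; the actual result CSVs are not part of this commit): the script above only relies on a label and a pred_label column, for example:

label,pred_label
this is a test,this is a test
he bought his new car,he bought a new car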
ST/inference/codes/evaluate_sari.py
ADDED
@@ -0,0 +1,39 @@
from evaluate import load
import pandas as pd
import string

# Load SARI metric
sari = load("sari")

# Read the CSV
df = pd.read_csv("MT0_xxl_results/result_pt_80p")

def process_sentence(sentence):
    if not isinstance(sentence, str):
        return ""
    sentence = sentence.split('\n')[0]
    sentence = sentence.strip().lower()
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, "")
    sentence = sentence.strip()
    if sentence and sentence[-1] == '।':
        sentence = sentence[:-1]
    return sentence

# Process predictions
original = [process_sentence(s) for s in df['original']]
predicted = [process_sentence(s) for s in df['pred_label']]

# Assuming columns "ref1", "ref2", ... "refN" are reference columns
# Change ["ref1", "ref2", "refN"] to your actual column names
reference_columns = ["label1", "label2", "label3", "label4"]
references = []

for _, row in df.iterrows():
    current_references = [process_sentence(row[col]) for col in reference_columns]
    references.append(current_references)

# Compute SARI score
results = {}
results['sari'] = sari.compute(sources=original, predictions=predicted, references=references)
print(results)
ST/inference/codes/german_synthetic_switching.py
ADDED
@@ -0,0 +1,63 @@
import random
import spacy

# Load the spaCy model for the source language (Portuguese is active here)
# nlp = spacy.load('de_core_news_sm')
# nlp = spacy.load('de_core_news_sm')
# nlp = spacy.load('tr_core_news_trf') # Turkish
# nlp = spacy.load('hi_core_news_sm') # Hindi
# nlp = spacy.load("fr_core_news_sm")
nlp = spacy.load('pt_core_news_sm')
# nlp = spacy.load('es_core_news_sm')

def load_german_english_dict(file_path):
    """
    Load the German-English dictionary from a file.

    Args:
    - file_path (str): Path to the dictionary file.

    Returns:
    - dict: German-English dictionary.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return {line.split()[0]: line.split()[1] for line in lines}

def translate_content_words(sentence, dictionary, probability=0.5):
    """
    Randomly translate content words from German to English.

    Args:
    - sentence (str): German sentence to translate.
    - dictionary (dict): Bilingual German-English dictionary.
    - probability (float): Probability to translate a word.

    Returns:
    - str: Sentence with randomly translated content words.
    """
    doc = nlp(sentence.lower())
    translated_sentence = []

    for token in doc:
        # Check if the token is a content word
        if token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV']:
            # Randomly decide whether to translate
            if random.random() < probability:
                # Translate if word is in the dictionary, otherwise keep the original word
                translated_sentence.append(dictionary.get(token.text, token.text))
            else:
                translated_sentence.append(token.text)
        else:
            translated_sentence.append(token.text)

    return ' '.join(translated_sentence)

# Load the dictionary from the file
german_english_dict = load_german_english_dict('Dictionary/portuguese_english_dict.txt')

# Example usage
sentence = "비교 가능한 유속을 유지할 수있을 때 그 결과가 높습니다."
print(translate_content_words(sentence, german_english_dict, 0.5))
print(translate_content_words(sentence, german_english_dict, 0.8))
print(translate_content_words(sentence, german_english_dict, 1.0))
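A note on the dictionary format assumed by load_german_english_dict (the sample pairs below are illustrative, not from the committed files): each line must contain a whitespace-separated source and target word, and only the first two fields of a line are read, so multi-word translations are truncated to their first token.

casa house
comprar buy
rapidamente quickly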
ST/inference/codes/getTranslationBleu.py
ADDED
@@ -0,0 +1,103 @@
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os

from evaluate import load
import pandas as pd
import string
os.environ["CUDA_VISIBLE_DEVICES"] = "2" # SET the GPUs you want to use

exact_match_metric = load("exact_match")
bleu = load("sacrebleu")

df = pd.read_csv("Annotations/ep1_transcripts.csv")
df2 = pd.read_csv("Annotations/ep1_translations.csv")
input_ = []
reference = []

# Step 3: Iterate through rows of the DataFrame and filter out rows with "contentType" as "overlap"
for index, row in df.iterrows():
    if row['contentType'] != 'overlap':
        # Append the values to input_ and reference if "contentType" is not "overlap"
        input_.append(row['asr_transcript'])
        reference.append(row['translation'])

# Load the NLLB model for translation
#tel_Telu
#hin_Deva
#mar_Deva
#ben_Beng
#vie_Latn
#ces_Latn
#por_Latn
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="tel_Telu", use_safetensors=True)
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

def get_translation(word):
    """Fetch the English translation for a given Telugu word using the NLLB model."""
    inputs = tokenizer(word, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=1500
    )
    english_phrase = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    return english_phrase

# df = pd.read_csv("Annotations/ep1.csv")
# input_ = list(df.transcript)
# reference = list(df.translation)
# source = list(df.disfluent)

# # Step 1: Identify indices of "NULL" in input_
# null_indices = [i for i, transcript in enumerate(input_) if transcript.strip().lower() == "null"]

# # Step 2: Remove corresponding elements from input_ and reference
# input_ = [transcript for i, transcript in enumerate(input_) if i not in null_indices]
# reference = [translation for i, translation in enumerate(reference) if i not in null_indices]

def process_sentence(sentence):
    if not isinstance(sentence, str):
        return ""
    # Remove spaces before and after the sentence
    sentence = sentence.split('\n')[0]
    sentence = sentence.strip()
    sentence = sentence.lower()

    # Remove punctuation marks in the sentence
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, "")
    sentence = sentence.strip()

    if sentence == "":
        return sentence

    if sentence[-1] == '।':
        print(sentence)
        sentence = sentence[:-1]
        print(sentence)

    return sentence


processed_input = [process_sentence(s) for s in input_]
processed_ref = [process_sentence(s) for s in reference]
translated = []

for i in processed_input:
    translated_sentence = get_translation(i)
    print(translated_sentence)
    translated.append(process_sentence(translated_sentence))

results = {}
# results['exact_match'] = exact_match_metric.compute(predictions=predicted, references=reference)
results['bleu'] = bleu.compute(predictions=translated, references=processed_ref)
# results['meteor'] = meteor.compute(predictions=predicted, references=reference)
# results['comet'] = comet.compute(sources=source, predictions=predicted, references=reference)
# results['bertscore'] = bertscore.compute(predictions=predicted, references=reference)

print(results)
df2['cascaded_pred'] = translated
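The script stores the cascaded predictions in df2 but does not write them anywhere; a typical follow-up step (illustrative only, with an assumed output path) would be:

df2.to_csv("Annotations/ep1_translations_with_cascaded_pred.csv", index=False)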
ST/inference/codes/syntheticCodeSwitching.py
ADDED
@@ -0,0 +1,110 @@
import random
import spacy
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os
import csv
os.environ["CUDA_VISIBLE_DEVICES"] = "3" # SET the GPUs you want to use

# Load the NLLB model for translation
#tel_Telu
#hin_Deva
#mar_Deva
#ben_Beng
#vie_Latn
#ces_Latn
#por_Latn
tokenizer = AutoTokenizer.from_pretrained("models/nllb-200-3.3B", src_lang="por_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained("models/nllb-200-3.3B")

def get_translation(word):
    """Fetch the English translation for a given Telugu word using the NLLB model."""
    inputs = tokenizer(word, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=100
    )
    english_phrase = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    return english_phrase

def fetch_translation(word, content=1):
    """Fetch the English translation for a given Telugu word using the NLLB model."""
    inputs = tokenizer(word, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=100
    )
    english_phrase = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    if content:
        # Extract content words from the translated phrase
        content_words = [token.text for token in nlp_en(english_phrase) if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"]]

        # Return the first content word, or the whole phrase if no content words are found
        return content_words[0] if content_words else english_phrase

    return english_phrase

def get_pos_tag_english(word):
    """Get the POS tag of the translated English word using spaCy."""
    doc = nlp_en(word)
    return doc[0].pos_

def code_switch(sentence, ratio=0.5, content=1):
    """Perform code switching based on the given ratio."""
    words = sentence.split()
    new_sentence = []

    for word in words:
        english_word = fetch_translation(word)
        if content:
            pos_tag = get_pos_tag_english(english_word)
            if pos_tag in ["NOUN", "VERB", "ADJ", "ADV"] and random.random() < ratio:
                new_sentence.append(english_word)
            else:
                new_sentence.append(word)
        else:
            if random.random() < ratio:
                new_sentence.append(english_word)
            else:
                new_sentence.append(word)

    return ' '.join(new_sentence)

# Load spaCy's English model for POS tagging
nlp_en = spacy.load("en_core_web_sm")

# Test
telugu_sentences = ['నేను ఉహుహు... ప్రపంచ కప్ మ్యాచ్ చూడాలనుకుంటున్నాను','అరేరే నేను నా మొబైల్ ఎక్కడ పెట్టానో మరచిపోయాను!', 'ఆయన బాగా బాగా ఆడుతున్నాడు క్రికెట్.', 'మన ప్రయాణం శుక్రవా శనివారం నాడు కాదండి?', '3:30 కాదు కాదు, 4:30 కి మన ప్రయాణం.', 'నా పుస్తకం నాకు పేపర్ కావాలి.']
hindi_sentences = ['अ मुझे बताइए ये सेवा नहीं है क्या?', 'लेकिन यहां पर पर अटकने से काम नहीं होगा।', 'बाजपेयी भी इन अर अर्थशास्त्रियों में शामिल थे।', 'मैंने सोचा कि, उम्म...क्या मैं आज शाम को फिल्म देखने जाऊँ?', 'अरे, यह कुत्ता हमारे पास क्यों आ रहा है?', 'क्या हमें कल.. हमें कल चलना चाहिए।', 'उसके...मतलब, उसने अपनी नई कार खरीदी है।']
marathi_sentences = ['अ मला सांगा, ही सेवा नाही का?', 'माझ्या माझ्या कामाची चर्चा आहे उद्या.', 'या अर अर्थतज्ज्ञांमध्ये वाजपेयींचाही समावेश होता.', 'मी विचार केला कि, अं...मी आज सायंकाळी मी चित्रपट पहायला जाऊ का?', 'अरे हा कुत्रा आमच्याजवळ का येतोय?', 'का आपण उद्या.. आपण उद्या जायला हवं.', 'त्याची...म्हणजे, त्याने त्याची नवीन खरेदी केली आहे.']
bengali_sentences = ['আহ আমাকে বলুন, এটা কি পরিষেবা নয়?', 'কিন্তু কিন্তু এখানে আটকালে তো কাজ হবেনা।', 'বাজপেয়ী জিও এইসব অর্থ অর্থনীতি তে অন্তর্ভুক্ত ছিলেন।', 'আমি ভাবলাম কি যে, আঃ আজকে সন্ধায় কি আমি সিনেমা দেখতে যাব?', 'আরেহ এই কুকুর টা আমাদের দিকে কেন আসছে?', 'আমিাদের কি কাল, আমাদের কাল যাওয়া উচিত।', 'ওনার, মানে উনি নিজের নতুন গাড়ি কিনেছেন।']
viet_sentences = ['tôi cần thuê à tôi muốn bay một chuyến khứ hồi từ đà nẵng đến đà lạt.', 'sân bay ừm không hãng hàng không nào có đường bay từ bắc kinh ờ ý tôi là thượng thượng hải đến washington dc mà cần nối chuyến qua các thành phố khác.', 'cho tôi biết tất cả các máy bay à chuyến bay từ huế đến quy nhơn.', 'đà nẵng đến ờ hồ chí minh í lộn đến cà mau.', 'có bao nhiêu ghế à ý tôi là hạng ghế.', 'chuyến bay nào rời buôn ma thuột vào ngày mùng 4 ờ không ngày 5 tháng 7 sau 7 giờ tối và đến cần thơ.']
czech_sentences = ['Strávily jsme měsíc v hlavním městě Jemenu Sané , kde jsme se zúčastnily kurzu arabštiny.', 'Musíme být úspěšní poprvé sámi.']
#XNLI
telugu_xnli = ['మా నంబర్లో ఒకరు మీ సూచనలను సూక్ష్మంగా అమలు చేస్తారు.', 'నా బృందంలోని సభ్యుడు మీ ఆర్డర్లను చాలా ఖచ్చితత్వంతో అమలు చేస్తారు.', 'స్వలింగ సంపర్కులు మరియు లెస్బియన్లు.', 'భిన్న లింగ సంపర్కులు.', 'వేద వైపు తిరిగి నవ్వాడు.', 'తల్లితో కలిసి తన వెనకే మెల్లగా నడుస్తున్న వేదను చూసి నవ్వాడు.', 'నీకు ఎలా తెలుసు ? ఇదంతా మళ్లీ వారి సమాచారం.', 'ఈ సమాచారం వారికే చెందుతుంది.', 'జాతీయ ఉద్యానవనాలు మరియు నిర్జన ప్రాంతాలలో సహజ పరిస్థితులకు తిరిగి రావాలనే కాంగ్రెస్ నిర్దేశించిన లక్ష్యం వైపు రాష్ట్రాలు తమ రాష్ట్ర అమలు ప్రణాళికలలో సహేతుకమైన పురోగతిని చూపాలి.', 'ఏదైనా మెరుగుదల ఉండాల్సిన అవసరం లేదు.', 'ఆమె తిరిగి నవ్వింది.', 'ఆమె నవ్వు ఆపుకోలేక చాలా సంతోషించింది.']
hindi_xnli = ['हमारा एक नंबर आपके निर्देशों का सूक्ष्मता से पालन करेगा।', 'मेरी टीम का एक सदस्य आपके आदेशों को अत्यंत सटीकता के साथ निष्पादित करेगा।', 'समलैंगिक और लेस्बियन।', 'विषमलैंगिक।', 'वह मुड़ा और वेदा की ओर देखकर मुस्कुराया।', 'वह वेदा को देखकर मुस्कुराया जो अपनी माँ के साथ उसके पीछे धीरे-धीरे चल रही थी।', 'आपको कैसे मालूम ? ये सब उनकी जानकारी है।', 'ये जानकारी उनकी है।', 'राज्यों को राष्ट्रीय उद्यानों और जंगल क्षेत्रों में प्राकृतिक परिस्थितियों में लौटने के कांग्रेस द्वारा निर्धारित लक्ष्य की दिशा में अपनी राज्य कार्यान्वयन योजनाओं में उचित प्रगति दिखानी चाहिए।', 'इसमें कोई सुधार होना जरूरी नहीं है।', 'वह वापस मुस्कुराई।', 'वह इतनी खुश थी कि वह मुस्कुराना बंद नहीं कर पा रही थी।']
marathi_xnli = ['आमचा एक नंबर तुमच्या सूचनांची काटेकोरपणे अंम��बजावणी करेल.', 'माझ्या टीमचा एक सदस्य तुमच्या ऑर्डर्स अत्यंत अचूकतेने अंमलात आणेल.', 'समलिंगी आणि समलैंगिक.', 'भिन्नलिंगी.', 'तो वळून वेदाकडे हसला.', 'तो वेदाकडे बघून हसला जो आईसोबत त्याच्या मागे हळू चालत होता.', 'तुला कसे माहीत ? ही सर्व त्यांची माहिती आहे.', 'ही माहिती त्यांच्या मालकीची आहे.', 'राष्ट्रीय उद्याने आणि वाळवंट भागात नैसर्गिक परिस्थितीत परत येण्याच्या कॉंग्रेसने अनिवार्य केलेल्या उद्दिष्टाच्या दिशेने राज्यांनी त्यांच्या राज्य अंमलबजावणी योजनांमध्ये वाजवी प्रगती दर्शविली पाहिजे.', 'त्यात काही सुधारणा होणे आवश्यक नाही.', 'ती परत हसली.', 'तिला इतका आनंद झाला होता की तिला हसू आवरता आले नाही.']
bengali_xnli = ['আমাদের নম্বরগুলির মধ্যে একটি আপনার নির্দেশাবলী মিনিটে কার্যকর করবে।', 'আমার দলের একজন সদস্য আপনার আদেশগুলি অত্যন্ত নির্ভুলতার সাথে কার্যকর করবে।', 'সমকামী এবং সমকামীরা।', 'বিষমকামী।', 'সে ঘুরে বেদের দিকে তাকিয়ে হাসল।', 'সে বেদাকে দেখে হাসল যে তার মায়ের সাথে তার পিছনে ধীরে ধীরে হাঁটছিল।', 'তুমি কিভাবে জান ? এসবই তাদের তথ্য।', 'এই তথ্য তাদের অন্তর্গত।', 'জাতীয় উদ্যান এবং মরুভূমি অঞ্চলে প্রাকৃতিক পরিস্থিতিতে ফিরে আসার কংগ্রেসের নির্দেশিত লক্ষ্যের দিকে রাজ্যগুলিকে অবশ্যই তাদের রাষ্ট্রীয় বাস্তবায়ন পরিকল্পনায় যুক্তিসঙ্গত অগ্রগতি দেখাতে হবে।', 'এর জন্য কোনো উন্নতির প্রয়োজন নেই।', 'সে ফিরে হাসল।', 'সে এত খুশি ছিল যে সে হাসি থামাতে পারেনি।']
#Sentiment
telugu_sentiment = ['అణు కార్యక్రమాన్ని పౌర అవసరాలు, సైనిక అవసరాలుగా విడదీసినందున ఆ ఒప్పందంపై సంతకం పెట్టాల్సిన అవసరం లేదని భారత్ వాదన.', 'ప్రజలకు అన్నివిధాలా తోడ్పాటును అందించాలని సూచించారు.', 'అవసరాలకు అనుగుణంగా అనేక చట్టాలను మార్చుకోవడం సాధ్యం కావడం లేదు.', 'అనంతరం ప్రాంతాల వారీగా చేపట్టే కార్యక్రమాలపై చర్చించారు.', 'జోరుగా టీఆరెస్ సభ్యత్వ నమోదు.', 'కారణం ఏమిటో గానీ తెలుగు రాష్ట్రాల్లో ఏ ఒక్కరికీ కేంద్రంలో మంత్రిపదవి దక్కలేదు.']
hindi_sentiment = ['असम में ब्रह्मपुत्र नदी के किनारे स्थित इस पार्क में गैंडे के साथ - साथ हाथी , चीता , बाघ , हिरण , डॉल्फिन , सांभर आदि देखे जा सकते हैं ।', 'इसका 13 मेगा पिक्सल कैमरा जो इस डिवाईस का हिरो है ।', 'कुल मिलाकर जी3 स्टाइलस का परफॉर्मेंस अच्छा नहीं कहा जा सकता ।', 'इसके अन्दर लगी 3120 एमएएच क�� बैटरी , पूरे डेढ़ दिन तक चलती है ।', 'इसके बेंचमार्क स्कोर्स बहुत ही आशाजनक थे क्योंकि यह स्मार्टफोन प्रतियोगिता को दूर कर देता है ।', 'जिसका नुकसान ये होता है कि अगर आप फिल्म देख रहे है या गेम खेल रहे है तो स्पीकर्स आपके हाथों से ढक जाते हैं ।']
# marathi_sentiment =
# bengali_sentiment =
hindi_qa = ['पैंथर्स डिफ़ेंस ने कितने अंक दिए?', 'डिवीजनल राउंड में ब्रोंकोस से कौन हारा?', 'वर्तमान में ब्रॉनकोस फ्रैंचाइज़ी में जॉन एलवे की क्या भूमिका है?', 'लेडी गागा ने कितने ग्रैमी जीते हैं?']
marathi_ape = ['हळूहळू खायला आणि प्यायला मदत होते आणि लहान, वारंवार जेवण होते.', 'कधी कधी खांद्यावरून बाहेर पडणाऱ्या आगीचे चित्रण केले जात नाही.', 'पिंपळाच्या आकाराचे मातीचे शरीर, संपूर्ण शरीरावर लाल कापड चिकटवले जाते.', 'या कालखंडात आणखी एक महत्त्वाची गोष्ट म्हणजे तांत्रिकवादाची वाढ.', 'या कामांच्या माध्यमातून माहिम बेट परेल आणि वरळीशी जोडले गेले होते.']
portuguese_simple = ['Comportamento semelhante tiveram outros mercados de capitais no mundo.', '- O CPC está abaixo do que queremos, apesar do aumento quando comparado com janeiro.', 'As coisas vão voltar à normalidade.', 'O presidente foi recebido por uma platéia reunida por PT, PC do B e PSB , partidos da base do governo.', '- Havia um Fiat Doblò estacionado em frente a uma panificadora.']
for sentence in portuguese_simple:
    for ratio in [0.3, 0.5, 0.8]:
        print(f"Ratio: {ratio*100}%")
        print(code_switch(sentence, ratio))
        print("-----------------------------")

# for sentence in marathi_sentences:
#     translation = get_translation(sentence)
#     print(translation)
ST/inference/codes/wilcoxon.py
ADDED
@@ -0,0 +1,51 @@
import csv
import string
from scipy.stats import wilcoxon
import numpy as np

def process_sentence(sentence):
    if not isinstance(sentence, str):
        return ""

    sentence = sentence.split('\n')[0]
    sentence = sentence.strip()
    sentence = sentence.lower()

    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, "")
    sentence = sentence.strip()

    if sentence and sentence[-1] == '।':
        sentence = sentence[:-1]

    return sentence

# Read CSV and generate exact match scores for Prompt A
with open('MT0_xxl_results/result_vi', 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    scores_a = [1 if process_sentence(row['pred_label']) == process_sentence(row['label']) else 0 for row in reader]

# Read CSV and generate exact match scores for Prompt B
with open('MT0_xxl_results/result_vi_80p', 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    scores_b = [1 if process_sentence(row['pred_label']) == process_sentence(row['label']) else 0 for row in reader]

# Count the number of 1s in each list
count_a = scores_a.count(1)
count_b = scores_b.count(1)

# Print the counts
print(f"Number of exact matches for Prompt A: {count_a}")
print(f"Number of exact matches for Prompt B: {count_b}")

# Conduct Wilcoxon Signed Rank test
w_stat, p_val = wilcoxon(scores_a, scores_b)

# Print the results
print(f"Wilcoxon Signed Rank statistic: {w_stat}")
print(f"P-value: {p_val}")

if p_val < 0.05:
    print("The difference in score distributions between the prompts is statistically significant.")
else:
    print("The difference in score distributions between the prompts is not statistically significant.")