Spaces:

naacl-anonymous
/

selective_pre_translation

Runtime error

App Files Files Community

Anonymous committed on Oct 1, 2024

Commit

f475b49

1 Parent(s): 5da8fd9

fix

Browse files

Files changed (4) hide show

tasks/ner.py +1 -2
tasks/nli.py +1 -66
tasks/qa.py +1 -114
utils/languages_by_word_count.csv +0 -119

tasks/ner.py CHANGED Viewed

@@ -1,11 +1,10 @@
-from typing import List, Dict, Any, Union
 import numpy as np
 from datasets import load_dataset, Dataset
 from easygoogletranslate import EasyGoogleTranslate
 from langchain.prompts import PromptTemplate, FewShotPromptTemplate
 LANGAUGE_TO_PREFIX = {
     "chinese_simplified": "zh-CN",

+from typing import List, Dict, Union
 import numpy as np
 from datasets import load_dataset, Dataset
 from easygoogletranslate import EasyGoogleTranslate
 from langchain.prompts import PromptTemplate, FewShotPromptTemplate
 LANGAUGE_TO_PREFIX = {
     "chinese_simplified": "zh-CN",

tasks/nli.py CHANGED Viewed

@@ -1,14 +1,10 @@
-import time
 import csv
 import json
 import multiprocessing as mp
 import os
 from typing import Any, Dict, List, NewType, Optional, Union
-import openai
 import numpy as np
-import requests
 import yaml
 from datasets import Dataset, DatasetDict, load_dataset
 from easygoogletranslate import EasyGoogleTranslate
@@ -46,68 +42,7 @@ NUMBER_TO_TAG = {0: "entailment", 1: "neutral", 2: "contradiction"}
 PARAMS = NewType("PARAMS", Dict[str, Any])
-def gemini_completion(prompt):
-    # Define the endpoint URL
-    genai.configure(api_key="AIzaSyBnghQNoOS2qiacHjqutK1RpPV5y-gv7Pg")
-    model = genai.GenerativeModel("models/gemini-1.0-pro-latest")
-    return model.generate_content(prompt).text
-def gpt3x_completion(
-    prompt: Union[str, List[Dict[str, str]]],
-    model: str = "chatgpt",
-    # run_details: Any = {},
-    # num_evals_per_sec: int = 2,
-    # **model_params,
-) -> str:
-    import os
-    import openai
-    os.environ["OPENAI_API_KEY"] = ''
-    def get_entities_chatGPT(final_prompt):
-        response = openai.ChatCompletion.create(
-            engine="gpt35-16k",
-            temperature=0,
-            messages=[
-                {"role": "user", "content": final_prompt}
-            ]
-        )
-        return response['choices'][0]['message']['content']
-    return get_entities_chatGPT(final_prompt=prompt)
-def mixtral_completion(prompt):
-    url = "https://api.together.xyz/v1/chat/completions"
-    # Define your Together API key
-    together_api_key = ""  # Replace with your actual API key
-    # Define the request payload
-    payload = {
-        "temperature": 0,
-        "max_tokens": 30,
-        "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-        "messages": [{"role": "user", "content": f"{prompt}"}],
-    }
-    # Define request headers
-    headers = {
-        "Authorization": f"Bearer {together_api_key}",
-        "Content-Type": "application/json",
-    }
-    # Send POST request
-    response = requests.post(url, json=payload, headers=headers)
-    # Check response status
-    if response.status_code == 200:
-        # Print the response content (API output)
-        return response.json()["choices"][0]["message"]["content"]
-    else:
-        # Print error message if request fails
-        print(f"Error: {response.status_code} - {response.text}")
 def read_parameters(args_path) -> PARAMS:

 import csv
 import json
 import multiprocessing as mp
 import os
 from typing import Any, Dict, List, NewType, Optional, Union
 import numpy as np
 import yaml
 from datasets import Dataset, DatasetDict, load_dataset
 from easygoogletranslate import EasyGoogleTranslate
 PARAMS = NewType("PARAMS", Dict[str, Any])
 def read_parameters(args_path) -> PARAMS:

tasks/qa.py CHANGED Viewed

@@ -3,13 +3,9 @@ import json
 import logging
 import multiprocessing as mp
 import os
-import subprocess
 import re
 import string
 import sys
-import subprocess
-import time
 import unicodedata
 from typing import Any, Dict, List, NewType, Optional, Union
@@ -29,26 +25,11 @@ from yaml.loader import SafeLoader
 def gemini_completion(prompt):
     # Define the endpoint URL
-    genai.configure(api_key="AIzaSyCSvECR2K_ca3QcMBcCHbxMzBpZe3y82iI")
     model = genai.GenerativeModel("models/gemini-1.0-pro-latest")
     return model.generate_content(prompt).text
-# checkpoint = "bigscience/mt0-base"
-# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-#
-# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-# model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
-# model.to("cuda:04")
-os.environ["OPENAI_API_KEY"] = (
-    "sk-proj-YeuUCE17wxVRRjD61Rn8T3BlbkFJr337RfppJB8fadACBXwG"
-)
-OPENAI_API_KEY = "sk-proj-YeuUCE17wxVRRjD61Rn8T3BlbkFJr337RfppJB8fadACBXwG"
-openai.api_key = "sk-proj-YeuUCE17wxVRRjD61Rn8T3BlbkFJr337RfppJB8fadACBXwG"
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 def get_entities_gpt3_long(prompt):
     response = openai.ChatCompletion.create(
@@ -64,15 +45,7 @@ def gpt3x_completion(
         # num_evals_per_sec: int = 2,
         # **model_params,
 ) -> str:
-    import os
     import openai
-    os.environ["OPENAI_API_KEY"] = '07d805ec4fbd484ebc923a3a41e1773d'
-    OPENAI_API_KEY = '07d805ec4fbd484ebc923a3a41e1773d'
-    openai.api_type = "azure"
-    openai.api_base = 'https://hebsum-itaim-uks.openai.azure.com/'
-    openai.api_version = "2023-03-15-preview"
-    openai.api_key = '07d805ec4fbd484ebc923a3a41e1773d'
     def get_entities_chatGPT(final_prompt):
         response = openai.ChatCompletion.create(
             engine="gpt35-16k",
@@ -92,37 +65,6 @@ def mt0_completion(prompt):
     return tokenizer.decode(outputs[0])
-def mixtral_completion(prompt):
-    url = "https://api.together.xyz/v1/chat/completions"
-    # Define your Together API key
-    together_api_key = "851cfc39f3d7a246a2342259f5f6fbba4721c6002123365fba2254c9c9c424ad"  # Replace with your actual API key
-    # Define the request payload
-    payload = {
-        "temperature": 0,
-        "max_tokens": 30,
-        "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-        "messages": [{"role": "user", "content": f"{prompt}"}],
-    }
-    # Define request headers
-    headers = {
-        "Authorization": f"Bearer {together_api_key}",
-        "Content-Type": "application/json",
-    }
-    # Send POST request
-    response = requests.post(url, json=payload, headers=headers)
-    # Check response status
-    if response.status_code == 200:
-        # Print the response content (API output)
-        return response.json()["choices"][0]["message"]["content"]
-    else:
-        # Print error message if request fails
-        print(f"Error: {response.status_code} - {response.text}")
 XQUAD_LANG2CODES = {
     "bengali": "bn",
@@ -614,58 +556,6 @@ def run_one_configuration(params: Optional[PARAMS] = None):
                                      + config_header
                                      + ".csv",
             )
-            #
-            # normalized_prediction = normalize_answer(pred)
-            # batched_predictions.append(normalized_prediction)
-            #
-            # if config["output"] != params["selected_language"]:
-            #     pred = _translate_prediction_to_output_language(
-            #         prediction=normalized_prediction,
-            #         prediction_language=config["output"],
-            #         output_language=params["selected_language"],
-            #     )
-            #     print(
-            #         f"Translated the prediciton from {config['output']} to {params['selected_language']}"
-            #     )
-            #
-            # logger.info("Starting evaluation")
-            #
-            # if dataset_name == "xquad":
-            #     prediction = {"prediction_text": pred, "id": test_example["id"]}
-            #
-            # reference = {}
-            # reference["answers"] = test_example["answers"]
-            # reference["id"] = test_example["id"]
-            # if reference["answers"]["text"][0] == "":
-            #     reference["answers"]["text"] = []
-            #     reference["answers"]["answer_start"] = []
-            #
-            # if params["metric"] == "squad":
-            #     results = squad_metric.compute(
-            #         predictions=[prediction], references=[reference]
-            #     )
-            # else:
-            #     results = squad_metric.compute(
-            #         predictions=[prediction],
-            #         references=[reference],
-            #         no_answer_threshold=0.9,
-            #     )
-            #
-            # f1_sum += results["f1"]
-            # if metric == "squad":
-            #     em_sum += results["exact_match"]
-            # else:
-            #     em_sum += results["exact"]
-            # avg_f1 = f1_sum / (idx + 1)
-            # avg_em = em_sum / (idx + 1)
-            #
-            # preds.append(prediction)
-            # labels.append(reference)
-            # f1s.append(results["f1"])
-            # if metric == "squad":
-            #     ems.append(results["exact_match"])
-            # else:
-            #     ems.append(results["exact"])
         except Exception as e:
             print(f"Found an exception {e}, continue to the next example")
@@ -682,9 +572,6 @@ def run_one_configuration(params: Optional[PARAMS] = None):
     )
-# if __name__ == "__main__":
-#     run_one_configuration()
 def run_one_configuration_paralle(params: Optional[PARAMS] = None, zero: bool = False):
     if not params:

 import logging
 import multiprocessing as mp
 import os
 import re
 import string
 import sys
 import unicodedata
 from typing import Any, Dict, List, NewType, Optional, Union
 def gemini_completion(prompt):
     # Define the endpoint URL
+    genai.configure(api_key="")
     model = genai.GenerativeModel("models/gemini-1.0-pro-latest")
     return model.generate_content(prompt).text
 def get_entities_gpt3_long(prompt):
     response = openai.ChatCompletion.create(
         # num_evals_per_sec: int = 2,
         # **model_params,
 ) -> str:
     import openai
     def get_entities_chatGPT(final_prompt):
         response = openai.ChatCompletion.create(
             engine="gpt35-16k",
     return tokenizer.decode(outputs[0])
 XQUAD_LANG2CODES = {
     "bengali": "bn",
                                      + config_header
                                      + ".csv",
             )
         except Exception as e:
             print(f"Found an exception {e}, continue to the next example")
     )
 def run_one_configuration_paralle(params: Optional[PARAMS] = None, zero: bool = False):
     if not params:

utils/languages_by_word_count.csv DELETED Viewed

@@ -1,119 +0,0 @@
-Unnamed: 0,Language,number of words,percentage of total words
-0,English,181014683608,92.64708%
-1,French,3553061536,1.81853%
-2,German,2870869396,1.46937%
-3,Spanish,1510070974,0.77289%
-4,Italian,1187784217,0.60793%
-5,Portuguese,1025413869,0.52483%
-6,Dutch,669055061,0.34244%
-7,Russian,368157074,0.18843%
-8,Romanian,308182352,0.15773%
-9,Polish,303812362,0.15550%
-10,Finnish,221644679,0.11344%
-11,Danish,221551540,0.11339%
-12,Swedish,220920577,0.11307%
-13,Japanese,217047918,0.11109%
-14,Norwegian,212193299,0.10860%
-15,Chinese,193517396,0.09905%
-16,Czech,139918438,0.07161%
-17,Hungarian,127224375,0.06512%
-18,Indonesian,116930321,0.05985%
-19,Turkish,116141938,0.05944%
-20,Croatian,101613675,0.05201%
-21,Vietnamese,83077650,0.04252%
-22,Greek,61607673,0.03153%
-23,Arabic,60839973,0.03114%
-24,Serbian,52875283,0.02706%
-25,Chinese (Traditional),38583893,0.01975%
-26,Catalan,35126650,0.01798%
-27,Korean,33147663,0.01697%
-28,Slovak,27957963,0.01431%
-29,Thai,26806557,0.01372%
-30,Slovenian,26037337,0.01333%
-31,Estonian,20718080,0.01060%
-32,Persian,16731301,0.00856%
-33,Hebrew,15027640,0.00769%
-34,Ukrainian,14905898,0.00763%
-35,Malay,13389340,0.00685%
-36,Latvian,13290098,0.00680%
-37,Bosnian,13160941,0.00674%
-38,Lithuanian,12921255,0.00661%
-39,Icelandic,12792837,0.00655%
-40,Hindi,9434632,0.00483%
-41,Albanian,9253803,0.00474%
-42,Filipino,8650331,0.00443%
-43,Galician,6947527,0.00356%
-44,Javanese,6604056,0.00338%
-45,Bulgarian,5919807,0.00303%
-46,Afrikaans,5461216,0.00280%
-47,Tamil,5163171,0.00264%
-48,Marathi,3660217,0.00187%
-49,Welsh,3459671,0.00177%
-50,Malayalam,3227746,0.00165%
-51,Bangla,3003033,0.00154%
-52,Irish,2878943,0.00147%
-53,Azerbaijani,2496202,0.00128%
-54,Kannada,1913389,0.00098%
-55,Burmese,1853421,0.00095%
-56,Telugu,1638366,0.00084%
-57,Uzbek,1458861,0.00075%
-58,Kinyarwanda,1430208,0.00073%
-59,Cebuano,1329456,0.00068%
-60,Nepali,1120450,0.00057%
-61,Kurdish,1091032,0.00056%
-62,Basque,1048905,0.00054%
-63,Khmer,1041164,0.00053%
-64,Georgian,924256,0.00047%
-65,Scottish Gaelic,841970,0.00043%
-66,Armenian,840171,0.00043%
-67,Maltese,748610,0.00038%
-68,Sinhala,708343,0.00036%
-69,Punjabi,703086,0.00036%
-70,Urdu,689768,0.00035%
-71,Kazakh,670231,0.00034%
-72,Swahili,585858,0.00030%
-73,Southern Sotho,538257,0.00028%
-74,Belarusian,533405,0.00027%
-75,Macedonian,529413,0.00027%
-76,Malagasy,507043,0.00026%
-77,Gujarati,494798,0.00025%
-78,Lao,449476,0.00023%
-79,Haitian Creole,430911,0.00022%
-80,Ganda,261217,0.00013%
-81,Yiddish,227609,0.00012%
-82,Tajik,210167,0.00011%
-83,Sundanese,208819,0.00011%
-84,Hmong,175972,0.00009%
-85,Nyanja,161994,0.00008%
-86,Odia,131688,0.00007%
-87,Divehi,112819,0.00006%
-88,Kyrgyz,91289,0.00005%
-89,Bihari languages,48094,0.00002%
-90,Unknown language [xx] (Gothic),48025,0.00002%
-91,Unknown language [xx] (Runic),37558,0.00002%
-92,Inuktitut,31142,0.00002%
-93,Syriac,21482,0.00001%
-94,Mongolian,7779,0.00000%
-95,Unknown language [xx] (Phoenician),4343,0.00000%
-96,Unknown language [xx] (Unknown Script [Qaai]),4185,0.00000%
-97,Unknown language [xx] (Egyptian hieroglyphs),3395,0.00000%
-98,Unknown language [xx] (N’Ko),3338,0.00000%
-99,Unknown language [xx] (Tifinagh),3277,0.00000%
-100,Unknown language [xx] (Chakma),2608,0.00000%
-101,Unknown language [xx] (Yi),2357,0.00000%
-102,Cherokee,2315,0.00000%
-103,Unknown language [xx] (Phags-pa),1750,0.00000%
-104,Unknown language [xx] (Tai Viet),1622,0.00000%
-105,Unknown language [xx] (Deseret),1504,0.00000%
-106,Unknown language [xx] (Javanese),1448,0.00000%
-107,Unknown language [xx] (Sundanese),780,0.00000%
-108,Unknown language [xx] (Coptic),707,0.00000%
-109,Unknown language [xx] (Glagolitic),673,0.00000%
-110,Unknown language [xx] (Ol Chiki),573,0.00000%
-111,Unknown language [xx] (Shavian),542,0.00000%
-112,Unknown language [xx] (Samaritan),313,0.00000%
-113,Unknown language [xx] (Avestan),213,0.00000%
-114,Unknown language [xx] (Bopomofo),188,0.00000%
-115,Unknown language [xx] (Linear B),156,0.00000%
-116,Unknown language [xx] (Ogham),84,0.00000%
-117,Unknown language [xx] (Cham),49,0.00000%