Anonymous committed on
Commit
f475b49
·
1 Parent(s): 5da8fd9
Files changed (4) hide show
  1. tasks/ner.py +1 -2
  2. tasks/nli.py +1 -66
  3. tasks/qa.py +1 -114
  4. utils/languages_by_word_count.csv +0 -119
tasks/ner.py CHANGED
@@ -1,11 +1,10 @@
1
- from typing import List, Dict, Any, Union
2
 
3
  import numpy as np
4
  from datasets import load_dataset, Dataset
5
  from easygoogletranslate import EasyGoogleTranslate
6
  from langchain.prompts import PromptTemplate, FewShotPromptTemplate
7
 
8
-
9
  LANGAUGE_TO_PREFIX = {
10
 
11
  "chinese_simplified": "zh-CN",
 
1
+ from typing import List, Dict, Union
2
 
3
  import numpy as np
4
  from datasets import load_dataset, Dataset
5
  from easygoogletranslate import EasyGoogleTranslate
6
  from langchain.prompts import PromptTemplate, FewShotPromptTemplate
7
 
 
8
  LANGAUGE_TO_PREFIX = {
9
 
10
  "chinese_simplified": "zh-CN",
tasks/nli.py CHANGED
@@ -1,14 +1,10 @@
1
-
2
- import time
3
-
4
  import csv
5
  import json
6
  import multiprocessing as mp
7
  import os
8
  from typing import Any, Dict, List, NewType, Optional, Union
9
- import openai
10
  import numpy as np
11
- import requests
12
  import yaml
13
  from datasets import Dataset, DatasetDict, load_dataset
14
  from easygoogletranslate import EasyGoogleTranslate
@@ -46,68 +42,7 @@ NUMBER_TO_TAG = {0: "entailment", 1: "neutral", 2: "contradiction"}
46
  PARAMS = NewType("PARAMS", Dict[str, Any])
47
 
48
 
49
- def gemini_completion(prompt):
50
- # Define the endpoint URL
51
- genai.configure(api_key="AIzaSyBnghQNoOS2qiacHjqutK1RpPV5y-gv7Pg")
52
- model = genai.GenerativeModel("models/gemini-1.0-pro-latest")
53
- return model.generate_content(prompt).text
54
-
55
-
56
-
57
- def gpt3x_completion(
58
- prompt: Union[str, List[Dict[str, str]]],
59
- model: str = "chatgpt",
60
- # run_details: Any = {},
61
- # num_evals_per_sec: int = 2,
62
- # **model_params,
63
- ) -> str:
64
- import os
65
- import openai
66
- os.environ["OPENAI_API_KEY"] = ''
67
-
68
-
69
- def get_entities_chatGPT(final_prompt):
70
- response = openai.ChatCompletion.create(
71
- engine="gpt35-16k",
72
- temperature=0,
73
- messages=[
74
- {"role": "user", "content": final_prompt}
75
- ]
76
- )
77
- return response['choices'][0]['message']['content']
78
-
79
- return get_entities_chatGPT(final_prompt=prompt)
80
-
81
- def mixtral_completion(prompt):
82
- url = "https://api.together.xyz/v1/chat/completions"
83
-
84
- # Define your Together API key
85
- together_api_key = "" # Replace with your actual API key
86
 
87
- # Define the request payload
88
- payload = {
89
- "temperature": 0,
90
- "max_tokens": 30,
91
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
92
- "messages": [{"role": "user", "content": f"{prompt}"}],
93
- }
94
-
95
- # Define request headers
96
- headers = {
97
- "Authorization": f"Bearer {together_api_key}",
98
- "Content-Type": "application/json",
99
- }
100
-
101
- # Send POST request
102
- response = requests.post(url, json=payload, headers=headers)
103
-
104
- # Check response status
105
- if response.status_code == 200:
106
- # Print the response content (API output)
107
- return response.json()["choices"][0]["message"]["content"]
108
- else:
109
- # Print error message if request fails
110
- print(f"Error: {response.status_code} - {response.text}")
111
 
112
 
113
  def read_parameters(args_path) -> PARAMS:
 
 
 
 
1
  import csv
2
  import json
3
  import multiprocessing as mp
4
  import os
5
  from typing import Any, Dict, List, NewType, Optional, Union
6
+
7
  import numpy as np
 
8
  import yaml
9
  from datasets import Dataset, DatasetDict, load_dataset
10
  from easygoogletranslate import EasyGoogleTranslate
 
42
  PARAMS = NewType("PARAMS", Dict[str, Any])
43
 
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
 
48
  def read_parameters(args_path) -> PARAMS:
tasks/qa.py CHANGED
@@ -3,13 +3,9 @@ import json
3
  import logging
4
  import multiprocessing as mp
5
  import os
6
- import subprocess
7
  import re
8
-
9
  import string
10
  import sys
11
- import subprocess
12
- import time
13
  import unicodedata
14
  from typing import Any, Dict, List, NewType, Optional, Union
15
 
@@ -29,26 +25,11 @@ from yaml.loader import SafeLoader
29
 
30
  def gemini_completion(prompt):
31
  # Define the endpoint URL
32
- genai.configure(api_key="AIzaSyCSvECR2K_ca3QcMBcCHbxMzBpZe3y82iI")
33
  model = genai.GenerativeModel("models/gemini-1.0-pro-latest")
34
  return model.generate_content(prompt).text
35
 
36
 
37
- # checkpoint = "bigscience/mt0-base"
38
- # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
39
- #
40
- # tokenizer = AutoTokenizer.from_pretrained(checkpoint)
41
- # model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
42
- # model.to("cuda:04")
43
-
44
- os.environ["OPENAI_API_KEY"] = (
45
- "sk-proj-YeuUCE17wxVRRjD61Rn8T3BlbkFJr337RfppJB8fadACBXwG"
46
- )
47
- OPENAI_API_KEY = "sk-proj-YeuUCE17wxVRRjD61Rn8T3BlbkFJr337RfppJB8fadACBXwG"
48
- openai.api_key = "sk-proj-YeuUCE17wxVRRjD61Rn8T3BlbkFJr337RfppJB8fadACBXwG"
49
-
50
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
51
-
52
 
53
  def get_entities_gpt3_long(prompt):
54
  response = openai.ChatCompletion.create(
@@ -64,15 +45,7 @@ def gpt3x_completion(
64
  # num_evals_per_sec: int = 2,
65
  # **model_params,
66
  ) -> str:
67
- import os
68
  import openai
69
- os.environ["OPENAI_API_KEY"] = '07d805ec4fbd484ebc923a3a41e1773d'
70
- OPENAI_API_KEY = '07d805ec4fbd484ebc923a3a41e1773d'
71
- openai.api_type = "azure"
72
- openai.api_base = 'https://hebsum-itaim-uks.openai.azure.com/'
73
- openai.api_version = "2023-03-15-preview"
74
- openai.api_key = '07d805ec4fbd484ebc923a3a41e1773d'
75
-
76
  def get_entities_chatGPT(final_prompt):
77
  response = openai.ChatCompletion.create(
78
  engine="gpt35-16k",
@@ -92,37 +65,6 @@ def mt0_completion(prompt):
92
  return tokenizer.decode(outputs[0])
93
 
94
 
95
- def mixtral_completion(prompt):
96
- url = "https://api.together.xyz/v1/chat/completions"
97
-
98
- # Define your Together API key
99
- together_api_key = "851cfc39f3d7a246a2342259f5f6fbba4721c6002123365fba2254c9c9c424ad" # Replace with your actual API key
100
-
101
- # Define the request payload
102
- payload = {
103
- "temperature": 0,
104
- "max_tokens": 30,
105
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
106
- "messages": [{"role": "user", "content": f"{prompt}"}],
107
- }
108
-
109
- # Define request headers
110
- headers = {
111
- "Authorization": f"Bearer {together_api_key}",
112
- "Content-Type": "application/json",
113
- }
114
-
115
- # Send POST request
116
- response = requests.post(url, json=payload, headers=headers)
117
-
118
- # Check response status
119
- if response.status_code == 200:
120
- # Print the response content (API output)
121
- return response.json()["choices"][0]["message"]["content"]
122
- else:
123
- # Print error message if request fails
124
- print(f"Error: {response.status_code} - {response.text}")
125
-
126
 
127
  XQUAD_LANG2CODES = {
128
  "bengali": "bn",
@@ -614,58 +556,6 @@ def run_one_configuration(params: Optional[PARAMS] = None):
614
  + config_header
615
  + ".csv",
616
  )
617
- #
618
- # normalized_prediction = normalize_answer(pred)
619
- # batched_predictions.append(normalized_prediction)
620
- #
621
- # if config["output"] != params["selected_language"]:
622
- # pred = _translate_prediction_to_output_language(
623
- # prediction=normalized_prediction,
624
- # prediction_language=config["output"],
625
- # output_language=params["selected_language"],
626
- # )
627
- # print(
628
- # f"Translated the prediciton from {config['output']} to {params['selected_language']}"
629
- # )
630
- #
631
- # logger.info("Starting evaluation")
632
- #
633
- # if dataset_name == "xquad":
634
- # prediction = {"prediction_text": pred, "id": test_example["id"]}
635
- #
636
- # reference = {}
637
- # reference["answers"] = test_example["answers"]
638
- # reference["id"] = test_example["id"]
639
- # if reference["answers"]["text"][0] == "":
640
- # reference["answers"]["text"] = []
641
- # reference["answers"]["answer_start"] = []
642
- #
643
- # if params["metric"] == "squad":
644
- # results = squad_metric.compute(
645
- # predictions=[prediction], references=[reference]
646
- # )
647
- # else:
648
- # results = squad_metric.compute(
649
- # predictions=[prediction],
650
- # references=[reference],
651
- # no_answer_threshold=0.9,
652
- # )
653
- #
654
- # f1_sum += results["f1"]
655
- # if metric == "squad":
656
- # em_sum += results["exact_match"]
657
- # else:
658
- # em_sum += results["exact"]
659
- # avg_f1 = f1_sum / (idx + 1)
660
- # avg_em = em_sum / (idx + 1)
661
- #
662
- # preds.append(prediction)
663
- # labels.append(reference)
664
- # f1s.append(results["f1"])
665
- # if metric == "squad":
666
- # ems.append(results["exact_match"])
667
- # else:
668
- # ems.append(results["exact"])
669
 
670
  except Exception as e:
671
  print(f"Found an exception {e}, continue to the next example")
@@ -682,9 +572,6 @@ def run_one_configuration(params: Optional[PARAMS] = None):
682
  )
683
 
684
 
685
- # if __name__ == "__main__":
686
- # run_one_configuration()
687
-
688
 
689
  def run_one_configuration_paralle(params: Optional[PARAMS] = None, zero: bool = False):
690
  if not params:
 
3
  import logging
4
  import multiprocessing as mp
5
  import os
 
6
  import re
 
7
  import string
8
  import sys
 
 
9
  import unicodedata
10
  from typing import Any, Dict, List, NewType, Optional, Union
11
 
 
25
 
26
  def gemini_completion(prompt):
27
  # Define the endpoint URL
28
+ genai.configure(api_key="")
29
  model = genai.GenerativeModel("models/gemini-1.0-pro-latest")
30
  return model.generate_content(prompt).text
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  def get_entities_gpt3_long(prompt):
35
  response = openai.ChatCompletion.create(
 
45
  # num_evals_per_sec: int = 2,
46
  # **model_params,
47
  ) -> str:
 
48
  import openai
 
 
 
 
 
 
 
49
  def get_entities_chatGPT(final_prompt):
50
  response = openai.ChatCompletion.create(
51
  engine="gpt35-16k",
 
65
  return tokenizer.decode(outputs[0])
66
 
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  XQUAD_LANG2CODES = {
70
  "bengali": "bn",
 
556
  + config_header
557
  + ".csv",
558
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
 
560
  except Exception as e:
561
  print(f"Found an exception {e}, continue to the next example")
 
572
  )
573
 
574
 
 
 
 
575
 
576
  def run_one_configuration_paralle(params: Optional[PARAMS] = None, zero: bool = False):
577
  if not params:
utils/languages_by_word_count.csv DELETED
@@ -1,119 +0,0 @@
1
- Unnamed: 0,Language,number of words,percentage of total words
2
- 0,English,181014683608,92.64708%
3
- 1,French,3553061536,1.81853%
4
- 2,German,2870869396,1.46937%
5
- 3,Spanish,1510070974,0.77289%
6
- 4,Italian,1187784217,0.60793%
7
- 5,Portuguese,1025413869,0.52483%
8
- 6,Dutch,669055061,0.34244%
9
- 7,Russian,368157074,0.18843%
10
- 8,Romanian,308182352,0.15773%
11
- 9,Polish,303812362,0.15550%
12
- 10,Finnish,221644679,0.11344%
13
- 11,Danish,221551540,0.11339%
14
- 12,Swedish,220920577,0.11307%
15
- 13,Japanese,217047918,0.11109%
16
- 14,Norwegian,212193299,0.10860%
17
- 15,Chinese,193517396,0.09905%
18
- 16,Czech,139918438,0.07161%
19
- 17,Hungarian,127224375,0.06512%
20
- 18,Indonesian,116930321,0.05985%
21
- 19,Turkish,116141938,0.05944%
22
- 20,Croatian,101613675,0.05201%
23
- 21,Vietnamese,83077650,0.04252%
24
- 22,Greek,61607673,0.03153%
25
- 23,Arabic,60839973,0.03114%
26
- 24,Serbian,52875283,0.02706%
27
- 25,Chinese (Traditional),38583893,0.01975%
28
- 26,Catalan,35126650,0.01798%
29
- 27,Korean,33147663,0.01697%
30
- 28,Slovak,27957963,0.01431%
31
- 29,Thai,26806557,0.01372%
32
- 30,Slovenian,26037337,0.01333%
33
- 31,Estonian,20718080,0.01060%
34
- 32,Persian,16731301,0.00856%
35
- 33,Hebrew,15027640,0.00769%
36
- 34,Ukrainian,14905898,0.00763%
37
- 35,Malay,13389340,0.00685%
38
- 36,Latvian,13290098,0.00680%
39
- 37,Bosnian,13160941,0.00674%
40
- 38,Lithuanian,12921255,0.00661%
41
- 39,Icelandic,12792837,0.00655%
42
- 40,Hindi,9434632,0.00483%
43
- 41,Albanian,9253803,0.00474%
44
- 42,Filipino,8650331,0.00443%
45
- 43,Galician,6947527,0.00356%
46
- 44,Javanese,6604056,0.00338%
47
- 45,Bulgarian,5919807,0.00303%
48
- 46,Afrikaans,5461216,0.00280%
49
- 47,Tamil,5163171,0.00264%
50
- 48,Marathi,3660217,0.00187%
51
- 49,Welsh,3459671,0.00177%
52
- 50,Malayalam,3227746,0.00165%
53
- 51,Bangla,3003033,0.00154%
54
- 52,Irish,2878943,0.00147%
55
- 53,Azerbaijani,2496202,0.00128%
56
- 54,Kannada,1913389,0.00098%
57
- 55,Burmese,1853421,0.00095%
58
- 56,Telugu,1638366,0.00084%
59
- 57,Uzbek,1458861,0.00075%
60
- 58,Kinyarwanda,1430208,0.00073%
61
- 59,Cebuano,1329456,0.00068%
62
- 60,Nepali,1120450,0.00057%
63
- 61,Kurdish,1091032,0.00056%
64
- 62,Basque,1048905,0.00054%
65
- 63,Khmer,1041164,0.00053%
66
- 64,Georgian,924256,0.00047%
67
- 65,Scottish Gaelic,841970,0.00043%
68
- 66,Armenian,840171,0.00043%
69
- 67,Maltese,748610,0.00038%
70
- 68,Sinhala,708343,0.00036%
71
- 69,Punjabi,703086,0.00036%
72
- 70,Urdu,689768,0.00035%
73
- 71,Kazakh,670231,0.00034%
74
- 72,Swahili,585858,0.00030%
75
- 73,Southern Sotho,538257,0.00028%
76
- 74,Belarusian,533405,0.00027%
77
- 75,Macedonian,529413,0.00027%
78
- 76,Malagasy,507043,0.00026%
79
- 77,Gujarati,494798,0.00025%
80
- 78,Lao,449476,0.00023%
81
- 79,Haitian Creole,430911,0.00022%
82
- 80,Ganda,261217,0.00013%
83
- 81,Yiddish,227609,0.00012%
84
- 82,Tajik,210167,0.00011%
85
- 83,Sundanese,208819,0.00011%
86
- 84,Hmong,175972,0.00009%
87
- 85,Nyanja,161994,0.00008%
88
- 86,Odia,131688,0.00007%
89
- 87,Divehi,112819,0.00006%
90
- 88,Kyrgyz,91289,0.00005%
91
- 89,Bihari languages,48094,0.00002%
92
- 90,Unknown language [xx] (Gothic),48025,0.00002%
93
- 91,Unknown language [xx] (Runic),37558,0.00002%
94
- 92,Inuktitut,31142,0.00002%
95
- 93,Syriac,21482,0.00001%
96
- 94,Mongolian,7779,0.00000%
97
- 95,Unknown language [xx] (Phoenician),4343,0.00000%
98
- 96,Unknown language [xx] (Unknown Script [Qaai]),4185,0.00000%
99
- 97,Unknown language [xx] (Egyptian hieroglyphs),3395,0.00000%
100
- 98,Unknown language [xx] (N’Ko),3338,0.00000%
101
- 99,Unknown language [xx] (Tifinagh),3277,0.00000%
102
- 100,Unknown language [xx] (Chakma),2608,0.00000%
103
- 101,Unknown language [xx] (Yi),2357,0.00000%
104
- 102,Cherokee,2315,0.00000%
105
- 103,Unknown language [xx] (Phags-pa),1750,0.00000%
106
- 104,Unknown language [xx] (Tai Viet),1622,0.00000%
107
- 105,Unknown language [xx] (Deseret),1504,0.00000%
108
- 106,Unknown language [xx] (Javanese),1448,0.00000%
109
- 107,Unknown language [xx] (Sundanese),780,0.00000%
110
- 108,Unknown language [xx] (Coptic),707,0.00000%
111
- 109,Unknown language [xx] (Glagolitic),673,0.00000%
112
- 110,Unknown language [xx] (Ol Chiki),573,0.00000%
113
- 111,Unknown language [xx] (Shavian),542,0.00000%
114
- 112,Unknown language [xx] (Samaritan),313,0.00000%
115
- 113,Unknown language [xx] (Avestan),213,0.00000%
116
- 114,Unknown language [xx] (Bopomofo),188,0.00000%
117
- 115,Unknown language [xx] (Linear B),156,0.00000%
118
- 116,Unknown language [xx] (Ogham),84,0.00000%
119
- 117,Unknown language [xx] (Cham),49,0.00000%