Spaces:

sparse-generative-ai
/

open-moe-llm-leaderboard

Running

App Files Files Community

AppleSwing committed on May 4

Commit

4045483

•

1 Parent(s): 28b6090

Fix bugs in gsm8k

Browse files

Files changed (5) hide show

backend-cli.py +1 -1
src/backend/envs.py +1 -1
src/backend/hflm_with_measurement.py +50 -21
src/backend/tasks/gsm8k/gsm8k-custom.yaml +44 -0
src/display/utils.py +1 -1

backend-cli.py CHANGED Viewed

@@ -152,7 +152,7 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
     monitor_thread.start()
     original_apply = RegexFilter.apply
-    if task.benchmark == "gsm8k":
         RegexFilter.apply = tuple_input_decorator(RegexFilter.apply)
     else:
         RegexFilter.apply = original_apply

     monitor_thread.start()
     original_apply = RegexFilter.apply
+    if task.benchmark in ["gsm8k", "gsm8k_cot", "gsm8k_cot_self_consistency", "gsm8k_custom"]:
         RegexFilter.apply = tuple_input_decorator(RegexFilter.apply)
     else:
         RegexFilter.apply = original_apply

src/backend/envs.py CHANGED Viewed

@@ -57,7 +57,7 @@ class Tasks(Enum):
     # task20 = Task("race", "acc", "RACE", 0)
     task21 = Task("mmlu", "acc", "MMLU", 5)
-    task22 = Task("gsm8k", "em", "GSM8K", 5)
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")

     # task20 = Task("race", "acc", "RACE", 0)
     task21 = Task("mmlu", "acc", "MMLU", 5)
+    task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")

src/backend/hflm_with_measurement.py CHANGED Viewed

@@ -295,6 +295,8 @@ class HFLMWithMeasurement(HFLM):
         # and we don't want a warning from HF
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample", None)
         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
@@ -302,22 +304,40 @@ class HFLMWithMeasurement(HFLM):
         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
         # build stopping criteria
-        stopping_criteria = stop_sequences_criteria(
-            self.tokenizer, stop, context.shape[1], context.shape[0]
-        )
-        stop_watch = StopWatch(self.tokenizer)
-        start = time()
-        res = self.model.generate(
-            input_ids=context,
-            max_length=max_length,
-            stopping_criteria=stopping_criteria,
-            pad_token_id=self.tokenizer.pad_token_id,
-            use_cache=True,
-            streamer=stop_watch,
-            **generation_kwargs,
-        )
-        end = time()
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
@@ -408,6 +428,11 @@ class HFLMWithMeasurement(HFLM):
                 until = [eos]
             else:
                 until.append(eos)
             if "max_gen_toks" in kwargs.keys():
                 max_gen_toks = kwargs.pop("max_gen_toks")
             else:
@@ -427,6 +452,8 @@ class HFLMWithMeasurement(HFLM):
                 left_truncate_len=max_ctx_len,
                 truncation=self.truncation,
             )
             context_enc = context_enc.to(self.device)
             attn_masks = attn_masks.to(self.device)
@@ -445,16 +472,18 @@ class HFLMWithMeasurement(HFLM):
             for cont_toks, context in zip(cont_toks_list, contexts):
                 # discard context + left-padding toks if using causal decoder-only LM
                 if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
                     cont_toks = cont_toks[context_enc.shape[1] :]
                 s = self.tok_decode(cont_toks)
                 # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
-                for term in until:
-                    if len(term) > 0:
-                        # ignore '' separator,
-                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
-                        s = s.split(term)[0]
                 res.append((s, end_to_end_time, prefilling_time, token_per_sec))

         # and we don't want a warning from HF
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample", None)
+        is_gsm8k = generation_kwargs.get("is_gsm8k", False)
         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
+        generation_kwargs.pop("is_gsm8k")
+        if not is_gsm8k:
         # build stopping criteria
+            stopping_criteria = stop_sequences_criteria(
+                self.tokenizer, stop, context.shape[1], context.shape[0]
+            )
+            stop_watch = StopWatch(self.tokenizer)
+            start = time()
+            res = self.model.generate(
+                input_ids=context,
+                max_length=max_length,
+                stopping_criteria=stopping_criteria,
+                pad_token_id=self.tokenizer.pad_token_id,
+                use_cache=True,
+                streamer=stop_watch,
+                **generation_kwargs,
+            )
+            end = time()
+        else:
+            # print("Using GSM8K")
+            stop_watch = StopWatch(self.tokenizer)
+            start = time()
+            res = self.model.generate(
+                input_ids=context,
+                max_length=max_length,
+                eos_token_id=stop,
+                pad_token_id=self.tokenizer.pad_token_id,
+                use_cache=True,
+                streamer=stop_watch,
+                **generation_kwargs,
+            )
+            end = time()
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
                 until = [eos]
             else:
                 until.append(eos)
+            is_gsm8k = kwargs.get("is_gsm8k", False)
+            if is_gsm8k:
+                until = [self.tokenizer.eos_token_id, self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
             if "max_gen_toks" in kwargs.keys():
                 max_gen_toks = kwargs.pop("max_gen_toks")
             else:
                 left_truncate_len=max_ctx_len,
                 truncation=self.truncation,
             )
+            # print("context: ", self.tok_decode(context_enc[0]))
             context_enc = context_enc.to(self.device)
             attn_masks = attn_masks.to(self.device)
             for cont_toks, context in zip(cont_toks_list, contexts):
                 # discard context + left-padding toks if using causal decoder-only LM
                 if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                    # print("After Generation: ", self.tok_decode(cont_toks))
                     cont_toks = cont_toks[context_enc.shape[1] :]
                 s = self.tok_decode(cont_toks)
                 # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
+                if not is_gsm8k:
+                    for term in until:
+                        if len(term) > 0:
+                            # ignore '' separator,
+                            # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
+                            s = s.split(term)[0]
                 res.append((s, end_to_end_time, prefilling_time, token_per_sec))

src/backend/tasks/gsm8k/gsm8k-custom.yaml ADDED Viewed

	@@ -0,0 +1,44 @@

+group:
+  - math_word_problems
+task: gsm8k_custom
+dataset_path: gsm8k
+dataset_name: main
+output_type: generate_until
+training_split: train
+fewshot_split: train
+test_split: test
+doc_to_text: "Question: {{question}}\nAnswer:"
+doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: false
+    regexes_to_ignore:
+      - ","
+      - "\\$"
+      - "(?s).*#### "
+      - "\\.$"
+generation_kwargs:
+  until:
+    - "<|eot_id|>"
+  do_sample: false
+  temperature: 0.0
+  is_gsm8k: true
+repeats: 1
+num_fewshot: 5
+filter_list:
+  # - name: "strict-match"
+  #   filter:
+  #     - function: "regex"
+  #       regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+  #     - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
+      - function: "take_first"
+metadata:
+  version: 3.0

src/display/utils.py CHANGED Viewed

@@ -75,7 +75,7 @@ class Tasks(Enum):
     # # XXX include me back at some point
     selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
-    gsm8k = Task("gsm8k", "em", "GSM8K") #GSM8K/EM (5-shot)
 # These classes are for user facing column names,

     # # XXX include me back at some point
     selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
+    gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
 # These classes are for user facing column names,