future-xy committed
Commit 2088911 • Parent(s): a549d9d
support mmlu
requirements.txt
CHANGED
@@ -18,7 +18,7 @@ tqdm
 wandb
 transformers>=4.36.0
 tokenizers>=0.15.0
-lm_eval[ifeval] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git
+lm_eval[ifeval] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@0.4.2
 accelerate
 sentencepiece
 langdetect
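Pinning `lm_eval[ifeval]` to the `0.4.2` tag instead of tracking the harness's default branch keeps the evaluation code reproducible across backend runs. A quick sanity check of the pin (commands shown for illustration only, not part of the commit):

pip install "lm_eval[ifeval] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@0.4.2"
pip show lm_eval   # should report the pinned 0.4.2 version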
src/backend/hflm_with_measurement.py
CHANGED
@@ -68,6 +68,226 @@ class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

+    def _loglikelihood_tokens(
+        self,
+        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
+        disable_tqdm: bool = False,
+        override_bs: int = None,
+    ) -> List[Tuple[float, bool]]:
+        # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
+        res = []
+
+        def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
+            """Defines the key for the sorted method"""
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+
+            toks = req[1] + req[2]
+            return -len(toks), tuple(toks)
+
+        def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
+            """Defines the key to group and lookup one-token continuations"""
+            # Use with group_by="contexts" (optional)
+            # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
+            # speeds up some multiple-choice tasks proportionally to the number of choices.
+            # groups requests by context+continuation[:-1] and infer on one request/group.
+            return req[-2] + req[-1][:-1]
+
+        re_ord = Collator(
+            requests,
+            sort_fn=_collate,
+            group_by="contexts"
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+            and self.logits_cache
+            else None,
+            group_fn=_lookup_one_token_cont,
+        )
+
+        # automatic (variable) batch size detection for vectorization
+        # pull longest context sample from request
+        n_reordered_requests = len(re_ord)
+        batch_size = (
+            self.batch_size
+            if self.batch_size != "auto"
+            else override_bs
+            if override_bs is not None
+            else 0
+        )
+        batch_fn = (
+            self._batch_scheduler
+            if self.batch_size == "auto"
+            and n_reordered_requests > 0
+            and not override_bs
+            else None
+        )
+
+        chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
+        pbar = tqdm(
+            total=len(requests),
+            disable=(disable_tqdm or (self.rank != 0)),
+            desc="Running loglikelihood requests",
+        )
+        for chunk in chunks:
+            inps = []
+            cont_toks_list = []
+            inplens = []
+
+            conts = []
+            encoder_attns = []
+
+            padding_len_inp = None
+            padding_len_cont = None
+            # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
+            # tensors, then we pack them together into a batch, call the model, and then pick it all apart
+            # again because vectorizing is annoying
+
+            for _, context_enc, continuation_enc in chunk:
+                # sanity check
+                assert len(context_enc) > 0
+                assert len(continuation_enc) > 0
+                assert len(continuation_enc) <= self.max_length
+
+                # how this all works (illustrated on a causal decoder-only setup):
+                #          CTX      CONT
+                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
+                # model  \               \
+                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
+                # cont_toks    4 5 6 7 8 9        [:, -len(continuation_enc):, :self.vocab_size] slice
+
+                # when too long to fit in context, truncate from the left
+                if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                    inp = torch.tensor(
+                        (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
+                        dtype=torch.long,
+                        device=self.device,
+                    )
+                    (inplen,) = inp.shape
+                elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                    inp = torch.tensor(
+                        (context_enc)[-self.max_length :],
+                        dtype=torch.long,
+                        device=self.device,
+                    )
+                    (inplen,) = inp.shape
+
+                    # build encoder attn masks
+                    encoder_attns.append(torch.ones_like(inp))
+
+                    cont = torch.tensor(
+                        (continuation_enc)[-self.max_length :],
+                        # TODO: left-shift these?
+                        # TODO: our code assumes we never end up truncating conts for either model type
+                        dtype=torch.long,
+                        device=self.device,
+                    )
+                    (contlen,) = cont.shape
+
+                    conts.append(cont)
+
+                    padding_len_cont = (
+                        max(padding_len_cont, contlen)
+                        if padding_len_cont is not None
+                        else contlen
+                    )
+
+                padding_len_inp = (
+                    max(padding_len_inp, inplen)
+                    if padding_len_inp is not None
+                    else inplen
+                )
+
+                inps.append(inp)  # [1, inp_length]
+                cont_toks_list.append(continuation_enc)
+                inplens.append(inplen)
+
+            # create encoder attn mask and batched conts, if seq2seq
+            call_kwargs = {}
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                batched_inps = pad_and_concat(
+                    padding_len_inp, inps, padding_side="right"
+                )  # [batch, padding_len_inp]
+            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                # TODO: left-pad encoder inps and mask?
+                batched_inps = pad_and_concat(
+                    padding_len_inp, inps
+                )  # [batch, padding_len_inp]
+                batched_conts = pad_and_concat(
+                    padding_len_cont, conts
+                )  # [batch, padding_len_cont]
+                batched_encoder_mask = pad_and_concat(
+                    padding_len_inp, encoder_attns
+                )  # [batch, padding_len_inp]
+                call_kwargs = {
+                    "attn_mask": batched_encoder_mask,
+                    "labels": batched_conts,
+                }
+
+            start = time()
+            intermediate_res = self._model_call(batched_inps, **call_kwargs)
+            end = time()
+            multi_logits = F.log_softmax(
+                intermediate_res, dim=-1
+            )  # [batch, padding_length (inp or cont), vocab]
+            per_sample_time = (end - start) / len(multi_logits)
+
+            for (request_str, ctx_tokens, _), logits, inplen, cont_toks in zip(
+                chunk, multi_logits, inplens, cont_toks_list
+            ):
+                # Slice to original seq length
+                contlen = len(cont_toks)
+                # take only logits in the continuation
+                # (discard context toks if decoder-only ; discard right-padding)
+                # also discards + checks for "virtual tokens" in the causal LM's input window
+                # from prompt/prefix tuning tokens, if applicable
+                ctx_len = (
+                    inplen + (logits.shape[0] - padding_len_inp)
+                    if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+                    else None
+                )
+                logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
+                logits = logits.unsqueeze(0)  # [1, seq, vocab]
+
+                # Check if per-token argmax is exactly equal to continuation
+                greedy_tokens = logits.argmax(dim=-1)
+
+                # check for one-token continuation cache hits.
+                # noop in case group_by != "contexts" or no cache hit and returns the
+                # original args. Otherwise, expands the logits batch dimension and yields each
+                # batch along with matching continuation tokens and prompt strings.
+                # logits -> [1, seq, vocab]
+                for request_str, cont_toks, logits in re_ord.get_cache(
+                    req_str=request_str,
+                    cxt_toks=ctx_tokens,
+                    cont_toks=cont_toks,
+                    logits=logits,
+                ):
+                    cont_toks = torch.tensor(
+                        cont_toks, dtype=torch.long, device=self.device
+                    ).unsqueeze(0)  # [1, seq]
+                    max_equal = (greedy_tokens == cont_toks).all()
+
+                    # Obtain log-probs at the corresponding continuation token indices
+                    # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
+                    logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
+                        -1
+                    )  # [1, seq]
+
+                    # Answer: (log prob, is-exact-match)
+                    answer = (float(logits.sum()), bool(max_equal))
+
+                    res.append((answer, per_sample_time, 0, 0))
+
+                    self.cache_hook.add_partial("loglikelihood", request_str, answer)
+                    pbar.update(1)
+
+        pbar.close()
+
+        return re_ord.get_original(res)
+
     def _model_generate(self, context, max_length, stop, **generation_kwargs):
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
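The measurement hook in the new `_loglikelihood_tokens` is the `start`/`end` pair around `self._model_call` and the `(answer, per_sample_time, 0, 0)` tuples appended to `res`: loglikelihood requests report only an averaged end-to-end time, leaving the prefilling-time and decoding-throughput slots at 0 for the task-side decorators to aggregate. A minimal, self-contained sketch of that pattern (the `fake_model_call` and `score_batch` names are illustrative, not part of the repo):

from time import time
from typing import List, Tuple

def fake_model_call(batch: List[List[int]]) -> List[float]:
    # stand-in for self._model_call(batched_inps): one score per sample
    return [float(sum(toks)) for toks in batch]

def score_batch(batch: List[List[int]]) -> List[Tuple[Tuple[float, bool], float, int, int]]:
    start = time()
    scores = fake_model_call(batch)  # the timed forward pass
    end = time()
    per_sample_time = (end - start) / len(batch)
    # mirror the (answer, end_to_end_time, prefilling_time, decoding_throughput) tuple layout
    return [((s, True), per_sample_time, 0, 0) for s in scores]

print(score_batch([[1, 2, 3], [4, 5]]))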
src/backend/run_eval_suite.py
CHANGED
@@ -1,13 +1,57 @@
 from lm_eval import evaluator
 from lm_eval.tasks import TaskManager
+from lm_eval.api.metrics import mean
+from lm_eval.api.task import ConfigurableTask
 
 from src.backend.manage_requests import EvalRequest
 
-from src.backend.tasks.xsum.task import XSum
-from src.backend.tasks.xsum.task_v2 import XSumv2
 
-
-
+orig_process_results = ConfigurableTask.process_results
+orig_aggregation = ConfigurableTask.aggregation
+orig_higher_is_better = ConfigurableTask.higher_is_better
+
+def process_results_decorator(func):
+    def wrapper(self, doc, results, *args, **kwargs):
+        processed_results = [r[0] for r in results]
+
+        end_to_end_time = sum([r[1] for r in results]) / len(results)
+        prefilling_time = sum([r[2] for r in results]) / len(results)
+        decoding_throughput = sum([r[3] for r in results]) / len(results)
+        # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
+
+        result_dict = func(self, doc, processed_results, *args, **kwargs)
+        result_dict["end_to_end_time"] = end_to_end_time
+        result_dict["prefilling_time"] = prefilling_time
+        result_dict["decoding_throughput"] = decoding_throughput
+        return result_dict
+    return wrapper
+ConfigurableTask.process_results = process_results_decorator(orig_process_results)
+
+def aggregation_decorator(func):
+    def wrapper(self, *args, **kwargs):
+        aggregation_list = func(self, *args, **kwargs)
+        aggregation_list["end_to_end_time"] = mean
+        aggregation_list["prefilling_time"] = mean
+        aggregation_list["decoding_throughput"] = mean
+        return aggregation_list
+    return wrapper
+ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
+
+def higher_is_better_decorator(func):
+    def wrapper(self, *args, **kwargs):
+        higher_is_better_dict = func(self, *args, **kwargs)
+        higher_is_better_dict["end_to_end_time"] = False
+        higher_is_better_dict["prefilling_time"] = False
+        higher_is_better_dict["decoding_throughput"] = True
+        return higher_is_better_dict
+    return wrapper
+ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
+
+# from src.backend.tasks.xsum.task import XSum
+# from src.backend.tasks.xsum.task_v2 import XSumv2
+
+# from src.backend.tasks.cnndm.task import CNNDM
+# from src.backend.tasks.cnndm.task_v2 import CNNDMv2
 
 from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT
 
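The three decorators above monkey-patch `ConfigurableTask` once at import time, so every YAML-configured task (which is how the newly supported MMLU runs) gains the `end_to_end_time`, `prefilling_time`, and `decoding_throughput` metrics without per-task edits. A toy sketch of the same wrap-and-reassign pattern on a dummy class (all names here are made up for illustration):

class DummyTask:
    def process_results(self, doc, results):
        return {"acc": float(results[0] > 0)}

orig_process_results = DummyTask.process_results

def with_timing(func):
    def wrapper(self, doc, results):
        scores = [r[0] for r in results]                       # strip measurement fields
        end_to_end = sum(r[1] for r in results) / len(results)
        out = func(self, doc, scores)                          # original metric computation
        out["end_to_end_time"] = end_to_end
        return out
    return wrapper

DummyTask.process_results = with_timing(orig_process_results)

# each result is (score, end_to_end_time, prefilling_time, decoding_throughput)
print(DummyTask().process_results({}, [(1.0, 0.12, 0, 0)]))   # {'acc': 1.0, 'end_to_end_time': 0.12}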
src/backend/tasks/measurement_task_utils.py
CHANGED
@@ -12,7 +12,7 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
-        print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
+        # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         # Now call the original process_results with the processed results
         result_dict = func(self, doc, processed_results, *args, **kwargs)