Spaces:

sparse-generative-ai
/

open-moe-llm-leaderboard

Running

App Files Files Community

future-xy committed on Apr 7, 2024

Commit

1ae96c8

1 Parent(s): 581fdbd

fix tps

Browse files

Files changed (3) hide show

backend-cli.py +5 -4
src/backend/hflm_with_measurement.py +43 -4
src/backend/tasks/measurement_task_utils.py +17 -5

backend-cli.py CHANGED Viewed

@@ -12,7 +12,7 @@ from src.backend.run_eval_suite import run_evaluation
 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
 from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
 from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
@@ -124,7 +124,7 @@ def request_to_result_name(request: EvalRequest) -> str:
 def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
-    batch_size = 4
     try:
         results = run_evaluation(
             eval_request=eval_request,
@@ -404,7 +404,8 @@ if __name__ == "__main__":
     local_debug = args.debug
     # debug specific task by ping
     if local_debug:
-        debug_model_names = ["mistralai/Mixtral-8x7B-Instruct-v0.1"]
         # debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
         debug_task_name = 'selfcheckgpt'
         # debug_task_name = "mmlu"
@@ -415,7 +416,7 @@ if __name__ == "__main__":
                 if task_name != debug_task_name:
                     continue
                 eval_request = EvalRequest(
-                    model=debug_model_name, private=False, status="", json_filepath="", precision="float16"
                 )
                 results = process_evaluation(task, eval_request)
     else:

 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
 from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
+LIMIT=2
 from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
+    batch_size = 1
     try:
         results = run_evaluation(
             eval_request=eval_request,
     local_debug = args.debug
     # debug specific task by ping
     if local_debug:
+        # debug_model_names = ["mistralai/Mixtral-8x7B-Instruct-v0.1"]
+        debug_model_names = ["facebook/opt-1.3b"]
         # debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
         debug_task_name = 'selfcheckgpt'
         # debug_task_name = "mmlu"
                 if task_name != debug_task_name:
                     continue
                 eval_request = EvalRequest(
+                    model=debug_model_name, private=False, status="", json_filepath="", precision="float16", inference_framework="hf-chat"
                 )
                 results = process_evaluation(task, eval_request)
     else:

src/backend/hflm_with_measurement.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import copy
 import os
 from datetime import timedelta
-import random
 from pathlib import Path
 from typing import List, Literal, Optional, Tuple, Union
@@ -22,6 +22,7 @@ from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
     MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
 )
 from lm_eval import utils
 from lm_eval.api.instance import Instance
@@ -37,6 +38,31 @@ from lm_eval.models.utils import (
 from lm_eval.models.huggingface import HFLM
 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -59,14 +85,27 @@ class HFLMWithMeasurement(HFLM):
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
         )
-        return self.model.generate(
             input_ids=context,
             max_length=max_length,
             stopping_criteria=stopping_criteria,
             pad_token_id=self.tokenizer.pad_token_id,
             use_cache=True,
             **generation_kwargs,
         )
     def generate_until(
         self, requests: List[Instance], disable_tqdm: bool = False
@@ -174,7 +213,7 @@ class HFLMWithMeasurement(HFLM):
                 kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
             # perform batched generation
-            cont = self._model_generate(
                 context=context_enc,
                 attention_mask=attn_masks,
                 stop=until,
@@ -196,7 +235,7 @@ class HFLMWithMeasurement(HFLM):
                         # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
                         s = s.split(term)[0]
-                res.append((s, random.random()))
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                 pbar.update(1)

 import copy
 import os
 from datetime import timedelta
+from time import time
 from pathlib import Path
 from typing import List, Literal, Optional, Tuple, Union
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
     MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
 )
+from transformers import TextStreamer
 from lm_eval import utils
 from lm_eval.api.instance import Instance
 from lm_eval.models.huggingface import HFLM
+class StopWatch(TextStreamer):  # streamer that only records timing; put()/end() are overridden without calling super()
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.start_prefilling = None  # timestamp taken on the first put() call
+        self.prefilling_time = None  # seconds elapsed between the first and second put() calls
+        self.start_decoding = None  # timestamp taken on the second put() call
+        self.decoding_time = None  # seconds from start_decoding until end() is called
+        self.decoding_iterations = 0  # number of put() calls after the first one
+    def put(self, value):  # NOTE(review): presumably called once with the prompt, then once per generated step - confirm against the transformers streamer docs
+        if self.start_prefilling is None:
+            self.start_prefilling = time()
+            return  # the first call only starts the clock and is not counted as an iteration
+        elif self.prefilling_time is None:
+            self.prefilling_time = time() - self.start_prefilling
+            self.start_decoding = time()
+        self.decoding_iterations += 1  # reached on the second and every later call
+        return
+    def end(self):
+        if self.decoding_time is None and self.start_decoding is not None:  # decoding_time stays None if put() ran fewer than two times
+            self.decoding_time = time() - self.start_decoding
+        return
 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
         )
+        stop_watch = StopWatch(self.tokenizer)
+        start = time()
+        res = self.model.generate(
             input_ids=context,
             max_length=max_length,
             stopping_criteria=stopping_criteria,
             pad_token_id=self.tokenizer.pad_token_id,
             use_cache=True,
+            streamer=stop_watch,
             **generation_kwargs,
         )
+        end = time()
+        batch_size = context.shape[0]
+        output_length = stop_watch.decoding_iterations
+        end_to_end_time = (end - start) / batch_size
+        prefilling_time = stop_watch.prefilling_time / batch_size
+        decoding_time = stop_watch.decoding_time / batch_size
+        token_per_sec = output_length / decoding_time
+        return res, end_to_end_time, prefilling_time, token_per_sec
     def generate_until(
         self, requests: List[Instance], disable_tqdm: bool = False
                 kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
             # perform batched generation
+            cont, end_to_end_time, prefilling_time, token_per_sec = self._model_generate(
                 context=context_enc,
                 attention_mask=attn_masks,
                 stop=until,
                         # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
                         s = s.split(term)[0]
+                res.append((s, end_to_end_time, prefilling_time, token_per_sec))
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                 pbar.update(1)

src/backend/tasks/measurement_task_utils.py CHANGED Viewed

@@ -9,12 +9,20 @@ def process_results_decorator(func):
         # We process the results here
         processed_results = [r[0] for r in results]
-        latency = sum([r[1] for r in results]) / len(results)
-        print(f"Average latency: {latency}")
         # Now call the original process_results with the processed results
         result_dict = func(self, doc, processed_results, *args, **kwargs)
-        result_dict["latency"] = latency
         return result_dict
     return wrapper
@@ -23,7 +31,9 @@ def aggregation_decorator(func):
     @functools.wraps(func)
     def wrapper(self, *args, **kwargs):
         aggregation_list = func(self, *args, **kwargs)
-        aggregation_list["latency"] = mean
         return aggregation_list
     return wrapper
@@ -32,7 +42,9 @@ def higher_is_better_decorator(func):
     @functools.wraps(func)
     def wrapper(self, *args, **kwargs):
         higher_is_better_dict = func(self, *args, **kwargs)
-        higher_is_better_dict["latency"] = False
         return higher_is_better_dict
     return wrapper

         # We process the results here
         processed_results = [r[0] for r in results]
+        # end_to_end_time = end_to_end_time / batch_size
+        # prefilling_time = prefilling_time / batch_size
+        # token_per_sec = output_length / (decoding_time / batch_size)
+        end_to_end_time = sum([r[1] for r in results]) / len(results)
+        prefilling_time = sum([r[2] for r in results]) / len(results)
+        token_per_sec = sum([r[3] for r in results]) / len(results)
+        print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, token_per_sec: {token_per_sec}")
         # Now call the original process_results with the processed results
         result_dict = func(self, doc, processed_results, *args, **kwargs)
+        result_dict["end_to_end_time"] = end_to_end_time
+        result_dict["prefilling_time"] = prefilling_time
+        result_dict["token_per_sec"] = token_per_sec
         return result_dict
     return wrapper
     @functools.wraps(func)
     def wrapper(self, *args, **kwargs):
         aggregation_list = func(self, *args, **kwargs)
+        aggregation_list["end_to_end_time"] = mean
+        aggregation_list["prefilling_time"] = mean
+        aggregation_list["token_per_sec"] = mean
         return aggregation_list
     return wrapper
     @functools.wraps(func)
     def wrapper(self, *args, **kwargs):
         higher_is_better_dict = func(self, *args, **kwargs)
+        higher_is_better_dict["end_to_end_time"] = False
+        higher_is_better_dict["prefilling_time"] = False
+        higher_is_better_dict["token_per_sec"] = True
         return higher_is_better_dict
     return wrapper