import json
import time
from datasets import Dataset
from requests_futures.sessions import FuturesSession
from transformers import AutoTokenizer
from defaults import (ADDRESS_BETTERTRANSFORMER, ADDRESS_VANILLA, HEADERS,
                      MODEL_NAME)
RETURN_MESSAGE_SINGLE = """
Inference statistics:
* Response status: {0}
* Prediction: {1}
* Inference latency (preprocessing/forward/postprocessing): {2} ms
* Peak GPU memory usage: {3} MB
* End-to-end latency (communication + pre/forward/post): {4} ms
* Padding ratio: 0.0 %
"""
RETURN_MESSAGE_SPAM = """
Processing inputs sent asynchronously. Grab a coffee.
Inference statistics:
* Throughput: {0} samples/s
* Mean inference latency (preprocessing/forward/postprocessing): {1} ms
* Mean peak GPU memory: {2} MB
* Mean padding ratio: {3} %
* Mean sequence length: {4} tokens
* Effective mean batch size: {5}
"""
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def get_message_single(
status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
):
return RETURN_MESSAGE_SINGLE.format(
status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
)
def get_message_spam(
throughput,
mean_inference_latency,
mean_peak_gpu_memory,
mean_padding_ratio,
mean_sequence_length,
effective_batch_size,
**kwargs,
):
return RETURN_MESSAGE_SPAM.format(
throughput,
mean_inference_latency,
mean_peak_gpu_memory,
mean_padding_ratio,
mean_sequence_length,
effective_batch_size,
)
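# shared session: post() returns a future immediately instead of blocking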
SESSION = FuturesSession()
def send_single(input_model_vanilla: str, address: str):
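    """Send one input to `address` and return formatted single-inference statistics."""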
assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
    # a single request should complete well within 10 s; time out past that
inp = json.dumps({"text": input_model_vanilla, "pre_tokenized": False}).encode(
"utf-8"
)
start = time.time()
promise = SESSION.post(address, headers=HEADERS, data=inp, timeout=10)
try:
        response = promise.result()  # block until the response arrives
end = time.time()
except Exception as e:
return f"{e}"
status = response.status_code
response_text = json.loads(response.text)
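    # the endpoint is expected to return a positional payload:
    # [prediction, inference latency (ms), peak GPU memory (MB)]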
prediction = response_text[0]
inf_latency = response_text[1]
peak_gpu_memory = response_text[2]
end_to_end_latency = round((end - start) * 1e3, 2)
return get_message_single(
status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
)
def send_spam(inp: Dataset, address: str):
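    """Send all dataset rows to `address` concurrently and return aggregate statistics."""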
assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
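    # running sums, averaged over n_inputs once all responses are in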
mean_inference_latency = 0
mean_peak_gpu_memory = 0
n_pads = 0
n_elems = 0
sequence_length = 0
effective_batch_size = 0
promises = []
n_inputs = len(inp)
start = time.time()
for i in range(n_inputs):
input_data = inp[i]["sentence"].encode("utf-8")
        # a single request should complete well within 15 s; time out past that
promises.append(
SESSION.post(address, headers=HEADERS, data=input_data, timeout=15)
)
    # first pass: wait for every response and record the last arrival time,
    # so that throughput covers the whole batch
    end = 0.0
for promise in promises:
try:
            response = promise.result()  # block until the response arrives
except Exception as e:
return f"{e}"
end = max(time.time(), end)
    # second pass: the futures are already resolved, so .result() returns the
    # cached responses instantly
for promise in promises:
response = promise.result()
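        # expected payload: [prediction, inference latency (ms), peak GPU memory (MB),
        # n_pads, n_elems, sequence length, effective batch size]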
response_text = json.loads(response.text)
mean_inference_latency += response_text[1]
mean_peak_gpu_memory += response_text[2]
n_pads += response_text[3]
n_elems += response_text[4]
sequence_length += response_text[5]
effective_batch_size += response_text[6]
    throughput = round(n_inputs / (end - start), 2)
    mean_inference_latency = round(mean_inference_latency / n_inputs, 2)
    mean_peak_gpu_memory = round(mean_peak_gpu_memory / n_inputs, 2)
    mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
    mean_sequence_length = round(sequence_length / n_inputs, 2)
    effective_batch_size = round(effective_batch_size / n_inputs, 2)
return get_message_spam(
throughput,
mean_inference_latency,
mean_peak_gpu_memory,
mean_padding_ratio,
mean_sequence_length,
effective_batch_size,
)
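

# Hypothetical usage sketch (not part of the original Space): exercises the two
# helpers above against the vanilla endpoint. The sample sentences are made up;
# `ADDRESS_VANILLA` comes from defaults.py as imported at the top of this file.
if __name__ == "__main__":
    print(send_single("BetterTransformer makes inference faster.", ADDRESS_VANILLA))

    spam_dataset = Dataset.from_dict(
        {"sentence": ["Hello world!", "BetterTransformer makes inference faster."]}
    )
    print(send_spam(spam_dataset, ADDRESS_VANILLA))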