Alan Liu committed · Commit ed50ee5
Parent(s): 6aa1c8b

add generation arithmetic intensity

Browse files:
- app.py (+9 -6)
- calc_util.py (+43 -29)
app.py
CHANGED
@@ -57,7 +57,7 @@ subtotal_operations = [
 
 
 
-col1, col2, col3, col4, col5 = st.columns([1,1.5,2.
+col1, col2, col3, col4, col5 = st.columns([1,1.5,2.5,2.5,0.1])
 
 inference_config = {}
 parameter_count = {}
@@ -144,7 +144,7 @@ with col3: # Prefilling
 
     operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key not in subtotal_operations}
     subtotal_operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key in subtotal_operations}
-    prefilling_arithmetic_intensity = {key: "{:.3f}".format(prefilling_operation_count[key]/prefilling_activation_memory_count[key])
+    prefilling_arithmetic_intensity = {key: "{:.3f}".format(prefilling_operation_count[key]/prefilling_activation_memory_count[key] if prefilling_activation_memory_count[key]>0 else float('inf')) for key in prefilling_activation_memory_count}
     prefilling_activation_memory_count = {key: "{:,}".format(int(value)) for key, value in prefilling_activation_memory_count.items()}
 
 
@@ -182,15 +182,18 @@ with col4: # Generation
 
     operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key not in subtotal_operations}
     subtotal_operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key in subtotal_operations}
+    generation_arithmetic_intensity = {key: "{:.3f}".format(generation_operation_count[key]/generation_activation_memory_count[key] if generation_activation_memory_count[key]>0 else float('inf')) for key in generation_activation_memory_count}
     generation_activation_memory_count = {key: "{:,}".format(int(value)) for key, value in generation_activation_memory_count.items()}
 
     ## Convert dictionaries to pandas dataframes for table display
     df_operation_count = pd.DataFrame(list(operation_items.items()), columns=["Operation", "FLOPS"])
     df_subtotal_operation_count = pd.DataFrame(list(subtotal_operation_items.items()), columns=["Operation", "FLOPS"])
 
-
-
-
+    df_operation_count["Activation (Byte)"] = df_operation_count["Operation"].map(generation_activation_memory_count)
+    df_operation_count["Arithmetic Intensity"] = df_operation_count["Operation"].map(generation_arithmetic_intensity)
+    df_subtotal_operation_count["Activation (Byte)"] = df_subtotal_operation_count["Operation"].map(generation_activation_memory_count)
+    df_subtotal_operation_count["Arithmetic Intensity"] = df_subtotal_operation_count["Operation"].map(generation_arithmetic_intensity)
+
     header4("Inference Ops: Generation")
     st.markdown(create_table(df_operation_count))
 
@@ -199,7 +202,7 @@ with col4: # Generation
     st.write(f"Generation-only throughput (tokens/s): {inference_info['inference_generation_throughput']:.2f}")
     st.write(f"(Client) Generation throughput (tokens/s): {inference_info['inference_client_generation_throughput']:.2f}")
     st.write(f"FLOPS latency: {inference_info['inference_generation_time']}")
-
+    st.write(f"Memory latency: {inference_info['generation_memory_latency']}")
 
     if inference_config['KV_cache']:
         st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
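Note on the new columns: arithmetic intensity here is the ratio of an operation's FLOP count (the "FLOPS" column) to the activation bytes it touches (the "Activation (Byte)" column), the usual roofline-style figure of merit. The snippet below is a minimal standalone sketch of the same dict comprehension used above, with made-up operation names and counts standing in for the app's generation_operation_count and generation_activation_memory_count.

# Minimal sketch (not part of the commit): arithmetic intensity as FLOPs per byte
# of activation traffic, with the same zero-byte guard used in app.py.
# The operation names and numbers below are made-up placeholders.
generation_operation_count = {"attention_QK": 4.0e9, "mlp1": 1.6e10}
generation_activation_memory_count = {"attention_QK": 2.5e7, "mlp1": 6.7e7}

generation_arithmetic_intensity = {
    key: "{:.3f}".format(
        generation_operation_count[key] / generation_activation_memory_count[key]
        if generation_activation_memory_count[key] > 0 else float('inf')
    )
    for key in generation_activation_memory_count
}

for key, intensity in generation_arithmetic_intensity.items():
    print(f"{key}: {intensity} FLOPs/Byte")

Low values flag operations that are likely memory-bound (typical of single-token decoding); high values flag compute-bound ones.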
calc_util.py
CHANGED
@@ -184,8 +184,8 @@ def attention_softmax_activation_memory(model_config, inference_config, seq_length):
     per_head_per_layer = (2 * inference_config['batchsize'] * seq_length * seq_length)
     return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * per_head_per_layer
 
-def attention_multV_activation_memory(model_config, inference_config,
-    per_head_per_layer = inference_config['batchsize'] *
+def attention_multV_activation_memory(model_config, inference_config, seq_length_Q, seq_length_V):
+    per_head_per_layer = inference_config['batchsize'] * seq_length_Q * seq_length_V + inference_config['batchsize'] * seq_length_Q * model_config['hidden_size_per_head'] + inference_config['batchsize'] * seq_length_V * model_config['hidden_size_per_head']
     return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * per_head_per_layer
 
 def attention_out_activation_memory(model_config, inference_config, seq_length):
@@ -215,7 +215,7 @@ def prefilling_activation_memory(model_config, inference_config):
     activation_memory['attention_V'] = attention_V_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
     activation_memory['attention_QK'] = attention_QK_activation_memory(model_config, inference_config, inference_config['input_seq_length'], inference_config['input_seq_length'])
     activation_memory['attention_softmax'] = attention_softmax_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
-    activation_memory['attention_multV'] = attention_multV_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+    activation_memory['attention_multV'] = attention_multV_activation_memory(model_config, inference_config, inference_config['input_seq_length'], inference_config['input_seq_length'])
     activation_memory['attention_out'] = attention_out_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
 
     activation_memory['layernorm'] = layernorm_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
@@ -243,31 +243,50 @@ def prefilling_activation_memory(model_config, inference_config):
 
     return activation_memory
 
-
 def generation_activation_memory(model_config, inference_config):
-    # TODO Check how KV cache affects activation_memory
     activation_memory = {}
-
-    activation_memory['word_embedding'] =
-    activation_memory['positional_embedding'] =
-
-    activation_memory['
-    activation_memory['
-    activation_memory['
-    activation_memory['
-    activation_memory['
-    activation_memory['
-    activation_memory['
-
-    activation_memory['layernorm'] =
-
-
-
-
+
+    activation_memory['word_embedding'] = 0
+    activation_memory['positional_embedding'] = 0
+    activation_memory['attention_K'] = 0
+    activation_memory['attention_V'] = 0
+    activation_memory['attention_Q'] = 0
+    activation_memory['attention_QK'] = 0
+    activation_memory['attention_softmax'] = 0
+    activation_memory['attention_multV'] = 0
+    activation_memory['attention_out'] = 0
+    activation_memory['mlp1'] = 0
+    activation_memory['mlp2'] = 0
+    activation_memory['layernorm'] = 0
+
+    for t in range(inference_config['output_seq_length']):
+        if inference_config['KV_cache']:
+            activation_memory['attention_K'] += attention_K_activation_memory(model_config, inference_config, 1)
+            activation_memory['attention_V'] += attention_V_activation_memory(model_config, inference_config, 1)
+            activation_memory['attention_Q'] += attention_Q_activation_memory(model_config, inference_config, 1)
+            activation_memory['attention_QK'] += attention_QK_activation_memory(model_config, inference_config, seq_length_Q=1, seq_length_K=(t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_softmax'] += attention_softmax_activation_memory(model_config, inference_config, 1)
+            activation_memory['attention_multV'] += attention_multV_activation_memory(model_config, inference_config, seq_length_Q=1, seq_length_V=(t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_out'] += attention_out_activation_memory(model_config, inference_config, 1)
+            activation_memory['mlp1'] += mlp1_activation_memory(model_config, inference_config, 1)
+            activation_memory['mlp2'] += mlp2_activation_memory(model_config, inference_config, 1)
+        else:
+            activation_memory['attention_K'] += attention_K_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_V'] += attention_V_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_Q'] += attention_Q_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_QK'] += attention_QK_activation_memory(model_config, inference_config, seq_length_Q=(t+1)+inference_config['input_seq_length'], seq_length_K=(t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_softmax'] += attention_softmax_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_multV'] += attention_multV_activation_memory(model_config, inference_config, seq_length_Q=(t+1)+inference_config['input_seq_length'], seq_length_V=(t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_out'] += attention_out_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['mlp1'] += mlp1_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['mlp2'] += mlp2_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+
+        activation_memory['layernorm'] += layernorm_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+
     activation_memory['embeddings'] = activation_memory['word_embedding'] + activation_memory['positional_embedding']
     activation_memory['attention'] = (
-        activation_memory['
-        activation_memory['
+        activation_memory['attention_K'] + activation_memory['attention_V'] +
+        activation_memory['attention_Q'] + activation_memory['attention_QK'] +
         activation_memory['attention_softmax'] + activation_memory['attention_multV'] +
         activation_memory['attention_out']
     )
@@ -276,10 +295,5 @@ def generation_activation_memory(model_config, inference_config):
         activation_memory['embeddings'] + activation_memory['attention'] +
         activation_memory['mlp'] + activation_memory['layernorm']
     )
-
-    activation_memory['embeddings'] = activation_memory['word_embedding'] + activation_memory['positional_embedding']
-    activation_memory['attention'] = sum([v for k,v in activation_memory.items() if 'attention' in k])
-    activation_memory['mlp'] = activation_memory['mlp1'] + activation_memory['mlp2']
-    activation_memory['total'] = (activation_memory['attention'] + activation_memory['mlp'] + activation_memory['layernorm'])
 
     return activation_memory
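A note on the new generation-side accounting: generation_activation_memory now accumulates per-token activation traffic over output_seq_length decode steps. With KV_cache enabled, the Q/K/V projections, softmax, output projection, and MLP terms see a single query token per step, while attention_QK and attention_multV still grow with the visible context (t+1) + input_seq_length; without the cache, every term is recomputed over the full prefix. The standalone sketch below illustrates that accumulation pattern with simplified stand-in formulas (one layer, one head, element counts rather than bytes); it is not the calc_util.py implementation.

# Standalone sketch of the accumulation pattern in generation_activation_memory.
# attention_qk_elems / attention_multv_elems are simplified stand-ins, not the
# functions defined in calc_util.py.
def attention_qk_elems(batch, len_q, len_k, d_head):
    # Q and K operands read, plus the (len_q x len_k) score matrix written.
    return batch * (len_q * d_head + len_k * d_head + len_q * len_k)

def attention_multv_elems(batch, len_q, len_v, d_head):
    # Probability matrix and V operand read, plus the (len_q x d_head) output written,
    # mirroring the three terms in the new attention_multV_activation_memory.
    return batch * (len_q * len_v + len_v * d_head + len_q * d_head)

def generation_attention_traffic(input_len, output_len, batch=1, d_head=128, kv_cache=True):
    total = 0
    for t in range(output_len):
        context = (t + 1) + input_len           # tokens visible at decode step t
        q_len = 1 if kv_cache else context      # with a KV cache only the new token queries
        total += attention_qk_elems(batch, q_len, context, d_head)
        total += attention_multv_elems(batch, q_len, context, d_head)
    return total

if __name__ == "__main__":
    print(f"with KV cache:    {generation_attention_traffic(512, 128, kv_cache=True):,} elements")
    print(f"without KV cache: {generation_attention_traffic(512, 128, kv_cache=False):,} elements")

The gap widens with longer prompts and more generated tokens: the uncached path scales quadratically in the context length, while the cached path scales linearly.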