varun4 committed on
Commit
0606100
1 Parent(s): fba41a4

quantizing scripts added

Files changed (3)
  1. aggregate_data.py +5 -5
  2. quantize.py +267 -0
  3. quantize_onnx.py +175 -0
aggregate_data.py CHANGED
@@ -31,7 +31,7 @@ MODELS = [
 
 
 def get_model_size(model_name):
-    return os.path.getsize(f"models/{model_name}/pytorch_model.bin") / (1024.0 * 1024.0)
+    return os.path.getsize(f"models/{model_name}") / (1024.0 * 1024.0)
 
 
 def compute_model_score(model_name):
@@ -64,16 +64,16 @@ def compute_model_score(model_name):
 DATA = {
     "Model": MODELS,
     "Model Size (MB)": [
-        get_model_size(model) for model in MODELS
+        get_model_size(f"{model}/pytorch_model.bin") for model in MODELS
     ],
     "Score": [
-        5  # compute_model_score(model) for model in MODELS
+        compute_model_score(model) for model in MODELS
     ],
     "q8 Model Size (MB)": [
-        get_model_size(model + "-q8") for model in MODELS
+        get_model_size(f"optimum/{model}-self-optimum-q8/model.onnx") for model in MODELS
    ],
    "q8 Score": [
-        compute_model_score(model + "-q8") for model in MODELS
+        compute_model_score(f"optimum/{model}-q8") for model in MODELS
    ],
 }
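With this change, get_model_size takes a path relative to models/ instead of a bare model name, so each caller picks the exact file to measure (the PyTorch checkpoint or the quantized ONNX export). A minimal sketch of the new call pattern; the model name and directory layout below are assumptions for illustration only:

import os

def get_model_size(path_under_models):
    # Size in MB of any file under models/
    return os.path.getsize(f"models/{path_under_models}") / (1024.0 * 1024.0)

model = "all-MiniLM-L6-v2"  # hypothetical model name
fp32_mb = get_model_size(f"{model}/pytorch_model.bin")
q8_mb = get_model_size(f"optimum/{model}-self-optimum-q8/model.onnx")
print(f"fp32: {fp32_mb:.1f} MB, q8: {q8_mb:.1f} MB")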
quantize.py ADDED
@@ -0,0 +1,267 @@
+
+import json
+import os
+import shutil
+from dataclasses import dataclass, field
+from typing import Optional, Set
+from tqdm import tqdm
+
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    HfArgumentParser
+)
+
+import onnx
+from optimum.exporters.onnx import main_export, export_models
+from optimum.exporters.tasks import TasksManager
+from onnxruntime.quantization import (
+    quantize_dynamic,
+    QuantType
+)
+
+DEFAULT_QUANTIZE_PARAMS = {
+    'per_channel': True,
+    'reduce_range': True,
+}
+
+MODEL_SPECIFIC_QUANTIZE_PARAMS = {
+    'whisper': {
+        'per_channel': False,
+        'reduce_range': False,
+    }
+}
+
+MODELS_WITHOUT_TOKENIZERS = [
+    'wav2vec2'
+]
+
+
+@dataclass
+class ConversionArguments:
+    """
+    Arguments used for converting HuggingFace models to ONNX.
+    """
+
+    model_id: str = field(
+        metadata={
+            "help": "Model identifier"
+        }
+    )
+    quantize: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to quantize the model."
+        }
+    )
+    output_parent_dir: str = field(
+        default='./models/',
+        metadata={
+            "help": "Path where the converted model will be saved to."
+        }
+    )
+
+    task: Optional[str] = field(
+        default='auto',
+        metadata={
+            "help": (
+                "The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among:"
+                f" {str(list(TasksManager._TASKS_TO_AUTOMODELS.keys()))}. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder."
+            )
+        }
+    )
+
+    opset: int = field(
+        default=None,
+        metadata={
+            "help": (
+                "If specified, ONNX opset version to export the model with. Otherwise, the default opset will be used."
+            )
+        }
+    )
+
+    device: str = field(
+        default='cpu',
+        metadata={
+            "help": "The device to use to do the export."
+        }
+    )
+    skip_validation: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to skip validation of the converted model"
+        }
+    )
+
+    per_channel: bool = field(
+        default=None,
+        metadata={
+            "help": "Whether to quantize weights per channel"
+        }
+    )
+    reduce_range: bool = field(
+        default=None,
+        metadata={
+            "help": "Whether to quantize weights to 7 bits. This may improve accuracy for some models running on non-VNNI machines, especially in per-channel mode."
+        }
+    )
+
+    output_attentions: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to output attentions from the model. NOTE: This is only supported for whisper models right now."
+        }
+    )
+
+    split_modalities: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to split multimodal models. NOTE: This is only supported for CLIP models right now."
+        }
+    )
+
+
+def get_operators(model: onnx.ModelProto) -> Set[str]:
+    """Collect the set of operator types used anywhere in the model graph, including subgraphs."""
+    operators = set()
+
+    def traverse_graph(graph):
+        for node in graph.node:
+            operators.add(node.op_type)
+            for attr in node.attribute:
+                if attr.type == onnx.AttributeProto.GRAPH:
+                    subgraph = attr.g
+                    traverse_graph(subgraph)
+
+    traverse_graph(model.graph)
+    return operators
+
+
+def quantize(model_names_or_paths, **quantize_kwargs):
+    """
+    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPUs.
+
+    Uses unsigned ints for activation values and signed ints for weights; per
+    https://onnxruntime.ai/docs/performance/quantization.html#data-type-selection
+    this is faster on most CPU architectures.
+
+    Args:
+        model_names_or_paths: Paths to the exported ONNX models to quantize.
+    Returns:
+        None. Each quantized model is written next to its input as `<name>_quantized.onnx`.
+    """
+
+    quantize_config = dict(
+        **quantize_kwargs,
+        per_model_config={}
+    )
+
+    for model in tqdm(model_names_or_paths, desc='Quantizing'):
+        directory_path = os.path.dirname(model)
+        file_name_without_extension = os.path.splitext(
+            os.path.basename(model))[0]
+
+        # NOTE:
+        # As of 2023/04/20, the latest version of onnxruntime-web is 1.14.0, which does not support INT8 weights for Conv layers.
+        # For this reason, we choose the model weight type to ensure compatibility with onnxruntime-web.
+        #
+        # As per the docs, the signed weight type (QInt8) is faster on most CPUs, so we use that unless the model contains a Conv layer.
+        # For more information, see:
+        #  - https://github.com/microsoft/onnxruntime/issues/3130#issuecomment-1105200621
+        #  - https://github.com/microsoft/onnxruntime/issues/2339
+        loaded_model = onnx.load_model(model)
+        op_types = get_operators(loaded_model)
+        weight_type = QuantType.QUInt8 if 'Conv' in op_types else QuantType.QInt8
+
+        quantize_dynamic(
+            model_input=model,
+            model_output=os.path.join(
+                directory_path, f'{file_name_without_extension}_quantized.onnx'),
+            weight_type=weight_type,
+            optimize_model=False,
+
+            # TODO allow user to specify these
+            # op_types_to_quantize=['MatMul', 'Add', 'Conv'],
+            extra_options=dict(
+                EnableSubgraph=True
+            ),
+            **quantize_kwargs
+        )
+
+        quantize_config['per_model_config'][file_name_without_extension] = dict(
+            op_types=list(op_types),
+            weight_type=str(weight_type),
+        )
+
+    # Save the quantization config
+    with open(os.path.join(directory_path, 'quantize_config.json'), 'w') as fp:
+        json.dump(quantize_config, fp, indent=4)
+
+
+def main():
+    """
+    Example usage:
+    python quantize.py --model_id sentence-transformers/all-MiniLM-L6-v2-unquantized --quantize --task default
+    """
+    parser = HfArgumentParser(
+        (ConversionArguments, )
+    )
+    conv_args, = parser.parse_args_into_dataclasses()
+
+    model_id = conv_args.model_id
+
+    output_model_folder = os.path.join(conv_args.output_parent_dir, model_id)
+
+    # Create output folder
+    os.makedirs(output_model_folder, exist_ok=True)
+
+    # Load the model config
+    config = AutoConfig.from_pretrained(model_id)
+
+    tokenizer = None
+    try:
+        # Load tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    except KeyError:
+        pass  # No tokenizer
+
+    except Exception as e:
+        if config.model_type not in MODELS_WITHOUT_TOKENIZERS:
+            raise e
+
+    # model_name_or_path can be a local path or a Hugging Face model id
+    export_kwargs = dict(
+        model_name_or_path=model_id,
+        output=output_model_folder,
+        task=conv_args.task,
+        opset=conv_args.opset,
+        device=conv_args.device,
+        do_validation=not conv_args.skip_validation,
+    )
+
+    # Step 1. Convert the HuggingFace model to ONNX
+    main_export(**export_kwargs)
+
+    # Step 2. (optional, recommended) Quantize the converted model for fast inference and to reduce model size.
+    if conv_args.quantize:
+        # Select model-specific quantization params, falling back to the defaults
+        quantize_config = MODEL_SPECIFIC_QUANTIZE_PARAMS.get(
+            config.model_type, DEFAULT_QUANTIZE_PARAMS)
+
+        quantize([
+            os.path.join(output_model_folder, x)
+            for x in os.listdir(output_model_folder)
+            if x.endswith('.onnx') and not x.endswith('_quantized.onnx')
+        ], **quantize_config)
+
+    # Step 3. Move .onnx files to the 'onnx' subfolder
+    os.makedirs(os.path.join(output_model_folder, 'onnx'), exist_ok=True)
+    for file in os.listdir(output_model_folder):
+        if file.endswith(('.onnx', '.onnx_data')):
+            shutil.move(os.path.join(output_model_folder, file),
+                        os.path.join(output_model_folder, 'onnx', file))
+
+
+if __name__ == '__main__':
+    main()
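A quick way to sanity-check the output of quantize.py is to load the quantized graph with onnxruntime and inspect its inputs. This is a minimal sketch, not part of the commit; the path assumes the script's default layout, where quantized files end up under models/<model_id>/onnx/ with a _quantized.onnx suffix, and the model_id shown is hypothetical:

import onnxruntime as ort

# Hypothetical model_id; adjust to whatever was passed to quantize.py.
model_path = "models/sentence-transformers/all-MiniLM-L6-v2-unquantized/onnx/model_quantized.onnx"
session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])

# Confirm the expected graph inputs (e.g. input_ids, attention_mask) survived quantization.
for inp in session.get_inputs():
    print(inp.name, inp.shape, inp.type)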
quantize_onnx.py ADDED
@@ -0,0 +1,175 @@
+import os
+from dataclasses import dataclass, field
+from typing import Optional, Set
+
+import onnx
+from onnxruntime.quantization import (
+    quantize_dynamic,
+    QuantType
+)
+
+from optimum.exporters.tasks import TasksManager
+from transformers import (
+    AutoConfig,
+    HfArgumentParser
+)
+
+DEFAULT_QUANTIZE_PARAMS = {
+    'per_channel': True,
+    'reduce_range': True,
+}
+
+MODEL_SPECIFIC_QUANTIZE_PARAMS = {
+    'whisper': {
+        'per_channel': False,
+        'reduce_range': False,
+    }
+}
+
+MODELS_WITHOUT_TOKENIZERS = [
+    'wav2vec2'
+]
+
+
+@dataclass
+class ConversionArguments:
+    """
+    Arguments used for converting HuggingFace models to ONNX.
+    """
+
+    model_id: str = field(
+        metadata={
+            "help": "Model identifier"
+        }
+    )
+    quantize: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to quantize the model."
+        }
+    )
+    output_parent_dir: str = field(
+        default='./models/',
+        metadata={
+            "help": "Path where the converted model will be saved to."
+        }
+    )
+
+    task: Optional[str] = field(
+        default='auto',
+        metadata={
+            "help": (
+                "The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among:"
+                f" {str(list(TasksManager._TASKS_TO_AUTOMODELS.keys()))}. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder."
+            )
+        }
+    )
+
+    opset: int = field(
+        default=None,
+        metadata={
+            "help": (
+                "If specified, ONNX opset version to export the model with. Otherwise, the default opset will be used."
+            )
+        }
+    )
+
+    device: str = field(
+        default='cpu',
+        metadata={
+            "help": "The device to use to do the export."
+        }
+    )
+    skip_validation: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to skip validation of the converted model"
+        }
+    )
+
+    per_channel: bool = field(
+        default=None,
+        metadata={
+            "help": "Whether to quantize weights per channel"
+        }
+    )
+    reduce_range: bool = field(
+        default=None,
+        metadata={
+            "help": "Whether to quantize weights to 7 bits. This may improve accuracy for some models running on non-VNNI machines, especially in per-channel mode."
+        }
+    )
+
+    output_attentions: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to output attentions from the model. NOTE: This is only supported for whisper models right now."
+        }
+    )
+
+    split_modalities: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to split multimodal models. NOTE: This is only supported for CLIP models right now."
+        }
+    )
+
+
+def get_operators(model: onnx.ModelProto) -> Set[str]:
+    """Collect the set of operator types used anywhere in the model graph, including subgraphs."""
+    operators = set()
+
+    def traverse_graph(graph):
+        for node in graph.node:
+            operators.add(node.op_type)
+            for attr in node.attribute:
+                if attr.type == onnx.AttributeProto.GRAPH:
+                    subgraph = attr.g
+                    traverse_graph(subgraph)
+
+    traverse_graph(model.graph)
+    return operators
+
+
+def quantize(model_path):
+    """
+    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPUs.
+
+    Uses unsigned ints for activation values and signed ints for weights; per
+    https://onnxruntime.ai/docs/performance/quantization.html#data-type-selection
+    this is faster on most CPU architectures.
+
+    Args:
+        model_path: Path to the exported ONNX model.
+    Returns:
+        None. The quantized model is written as `model-q8.onnx` next to the input model.
+    """
+    directory_path = os.path.dirname(model_path)
+
+    loaded_model = onnx.load_model(model_path)
+    op_types = get_operators(loaded_model)
+    weight_type = QuantType.QUInt8 if 'Conv' in op_types else QuantType.QInt8
+    print("quantizing to", weight_type)
+
+    quantize_dynamic(
+        model_input=model_path,
+        model_output=os.path.join(directory_path, 'model-q8.onnx'),
+        weight_type=weight_type,
+        optimize_model=False,
+    )
+
+
+def main():
+    """
+    Example usage:
+    python quantize_onnx.py --model_id sentence-transformers/all-MiniLM-L6-v2-unquantized
+    """
+    parser = HfArgumentParser(
+        (ConversionArguments,)
+    )
+    conv_args, = parser.parse_args_into_dataclasses()
+
+    model_id = conv_args.model_id
+
+    quantize(os.path.join(model_id, "model.onnx"))
+
+
+if __name__ == '__main__':
+    main()
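To see the size reduction from quantize_onnx.py, the original and quantized files can be compared directly, mirroring the MB computation that aggregate_data.py's get_model_size performs. A minimal sketch; the model directory below is a hypothetical example of what --model_id might point to:

import os

model_dir = "models/optimum/all-MiniLM-L6-v2-self-optimum-q8"  # hypothetical local directory

for name in ("model.onnx", "model-q8.onnx"):
    path = os.path.join(model_dir, name)
    # quantize_onnx.py writes model-q8.onnx next to the input model.onnx
    print(name, round(os.path.getsize(path) / (1024.0 * 1024.0), 1), "MB")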