quantizing scripts added
Browse files- aggregate_data.py +5 -5
- quantize.py +267 -0
- quantize_onnx.py +175 -0
aggregate_data.py
CHANGED
@@ -31,7 +31,7 @@ MODELS = [
|
|
31 |
|
32 |
|
33 |
def get_model_size(model_name):
|
34 |
-
return os.path.getsize(f"models/{model_name}
|
35 |
|
36 |
|
37 |
def compute_model_score(model_name):
|
@@ -64,16 +64,16 @@ def compute_model_score(model_name):
|
|
64 |
DATA = {
|
65 |
"Model": MODELS,
|
66 |
"Model Size (MB)": [
|
67 |
-
get_model_size(model) for model in MODELS
|
68 |
],
|
69 |
"Score": [
|
70 |
-
|
71 |
],
|
72 |
"q8 Model Size (MB)": [
|
73 |
-
get_model_size(model
|
74 |
],
|
75 |
"q8 Score": [
|
76 |
-
compute_model_score(model
|
77 |
],
|
78 |
}
|
79 |
|
|
|
31 |
|
32 |
|
33 |
def get_model_size(model_name):
    """Return the size in MB (1024 * 1024 bytes) of the file ``models/<model_name>``."""
    bytes_per_mb = 1024.0 * 1024.0
    size_in_bytes = os.path.getsize(f"models/{model_name}")
    return size_in_bytes / bytes_per_mb
|
35 |
|
36 |
|
37 |
def compute_model_score(model_name):
|
|
|
64 |
# Column-oriented results table: every list holds one value per entry of
# MODELS, in the same order as MODELS itself.
DATA = {
    "Model": MODELS,
    # Size of the original PyTorch weights file of each model.
    "Model Size (MB)": [
        get_model_size(f"{model}/pytorch_model.bin") for model in MODELS
    ],
    "Score": [
        compute_model_score(model) for model in MODELS
    ],
    # Size of the "q8" ONNX file stored under the optimum/ subfolder.
    "q8 Model Size (MB)": [
        get_model_size(f"optimum/{model}-self-optimum-q8/model.onnx") for model in MODELS
    ],
    # NOTE(review): the score is looked up under "optimum/<model>-q8" while the
    # size above is read from "optimum/<model>-self-optimum-q8" -- confirm both
    # names refer to the same quantized model.
    "q8 Score": [
        compute_model_score(f"optimum/{model}-q8") for model in MODELS
    ],
}
|
79 |
|
quantize.py
ADDED
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import shutil
|
5 |
+
from dataclasses import dataclass, field
|
6 |
+
from typing import Optional, Set
|
7 |
+
from tqdm import tqdm
|
8 |
+
|
9 |
+
from transformers import (
|
10 |
+
AutoConfig,
|
11 |
+
AutoTokenizer,
|
12 |
+
HfArgumentParser
|
13 |
+
)
|
14 |
+
|
15 |
+
import onnx
|
16 |
+
from optimum.exporters.onnx import main_export, export_models
|
17 |
+
from optimum.exporters.tasks import TasksManager
|
18 |
+
from onnxruntime.quantization import (
|
19 |
+
quantize_dynamic,
|
20 |
+
QuantType
|
21 |
+
)
|
22 |
+
|
23 |
+
# Keyword arguments handed to quantize() for model types that have no entry in
# MODEL_SPECIFIC_QUANTIZE_PARAMS.
DEFAULT_QUANTIZE_PARAMS = {
    'per_channel': True,
    'reduce_range': True,
}

# Overrides keyed by `config.model_type`. main() looks a model type up here and
# falls back to DEFAULT_QUANTIZE_PARAMS, so an entry replaces the defaults as a
# whole instead of being merged into them.
MODEL_SPECIFIC_QUANTIZE_PARAMS = {
    'whisper': {
        'per_channel': False,
        'reduce_range': False,
    }
}

# Model types for which main() tolerates a failure to load a tokenizer.
MODELS_WITHOUT_TOKENIZERS = [
    'wav2vec2'
]
|
38 |
+
|
39 |
+
|
40 |
+
# Command-line options of this script. main() hands the class to
# HfArgumentParser and reads the parsed values back as attributes.
@dataclass
class ConversionArguments:
    """
    Arguments used for converting HuggingFace models to onnx.
    """

    # The only field without a default. main() uses it both as the identifier
    # passed to AutoConfig / AutoTokenizer / main_export and as the sub-path
    # appended to `output_parent_dir`.
    model_id: str = field(
        metadata={
            "help": "Model identifier"
        }
    )
    quantize: bool = field(
        default=False,
        metadata={
            "help": "Whether to quantize the model."
        }
    )
    output_parent_dir: str = field(
        default='./models/',
        metadata={
            "help": "Path where the converted model will be saved to."
        }
    )

    # The help text is built when the class body is executed, so defining this
    # class reads the private `TasksManager._TASKS_TO_AUTOMODELS` mapping.
    task: Optional[str] = field(
        default='auto',
        metadata={
            "help": (
                "The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among:"
                f" {str(list(TasksManager._TASKS_TO_AUTOMODELS.keys()))}. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder."
            )
        }
    )

    # Annotated as int but defaults to None, which stands for "not specified".
    opset: int = field(
        default=None,
        metadata={
            "help": (
                "If specified, ONNX opset version to export the model with. Otherwise, the default opset will be used."
            )
        }
    )

    device: str = field(
        default='cpu',
        metadata={
            "help": 'The device to use to do the export.'
        }
    )
    # main() passes the negation of this flag as `do_validation`.
    skip_validation: bool = field(
        default=False,
        metadata={
            "help": "Whether to skip validation of the converted model"
        }
    )

    # Annotated as bool but defaults to None, which stands for "not specified".
    per_channel: bool = field(
        default=None,
        metadata={
            "help": "Whether to quantize weights per channel"
        }
    )
    # Annotated as bool but defaults to None, which stands for "not specified".
    reduce_range: bool = field(
        default=None,
        metadata={
            "help": "Whether to quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine, especially for per-channel mode"
        }
    )

    # NOTE(review): not read anywhere in the visible part of this script.
    output_attentions: bool = field(
        default=False,
        metadata={
            "help": "Whether to output attentions from the model. NOTE: This is only supported for whisper models right now."
        }
    )

    # NOTE(review): not read anywhere in the visible part of this script.
    split_modalities: bool = field(
        default=False,
        metadata={
            "help": "Whether to split multimodal models. NOTE: This is only supported for CLIP models right now."
        }
    )
|
122 |
+
|
123 |
+
|
124 |
+
def get_operators(model: onnx.ModelProto) -> Set[str]:
    """Collect the distinct operator types used anywhere in ``model``.

    The top-level graph is scanned together with every nested graph stored in
    a GRAPH-typed node attribute.
    """
    seen_ops = set()
    pending_graphs = [model.graph]

    # Explicit work list instead of a recursive helper. The result is a set,
    # so the order in which graphs are visited does not matter.
    while pending_graphs:
        current = pending_graphs.pop()
        for node in current.node:
            seen_ops.add(node.op_type)
            pending_graphs.extend(
                attr.g
                for attr in node.attribute
                if attr.type == onnx.AttributeProto.GRAPH
            )

    return seen_ops
|
137 |
+
|
138 |
+
|
139 |
+
def quantize(model_names_or_paths, **quantize_kwargs):
    """
    Quantize the weights of each model from float32 to int8 to allow very efficient inference on modern CPU

    Uses unsigned ints for activation values, signed ints for weights, per
    https://onnxruntime.ai/docs/performance/quantization.html#data-type-selection
    it is faster on most CPU architectures

    For every input ``<dir>/<name>.onnx`` a sibling ``<dir>/<name>_quantized.onnx``
    is written. After all models are processed, a ``quantize_config.json``
    describing the run is saved in the directory of the last model.

    Args:
        model_names_or_paths: Iterable of paths to the exported ONNX models.
            When it is empty, nothing is written.
        **quantize_kwargs: Extra keyword arguments forwarded unchanged to
            `quantize_dynamic` and recorded in the saved config.
    Returns: None
    """
    model_paths = list(model_names_or_paths)
    if not model_paths:
        # Without at least one model there is no directory to save the config
        # to; return instead of failing on an unbound `directory_path`.
        return

    quantize_config = dict(
        **quantize_kwargs,
        per_model_config={}
    )

    for model in tqdm(model_paths, desc='Quantizing'):
        directory_path = os.path.dirname(model)
        file_name_without_extension = os.path.splitext(
            os.path.basename(model))[0]

        # NOTE:
        # As of 2023/04/20, the current latest version of onnxruntime-web is 1.14.0, and does not support INT8 weights for Conv layers.
        # For this reason, we choose model weight types to ensure compatibility with onnxruntime-web.
        #
        # As per docs, signed weight type (QInt8) is faster on most CPUs, so, we use that unless the model contains a Conv layer.
        # For more information, see:
        #  - https://github.com/microsoft/onnxruntime/issues/3130#issuecomment-1105200621
        #  - https://github.com/microsoft/onnxruntime/issues/2339

        loaded_model = onnx.load_model(model)
        op_types = get_operators(loaded_model)
        weight_type = QuantType.QUInt8 if 'Conv' in op_types else QuantType.QInt8

        quantize_dynamic(
            model_input=model,
            model_output=os.path.join(
                directory_path, f'{file_name_without_extension}_quantized.onnx'),

            weight_type=weight_type,
            optimize_model=False,

            # TODO allow user to specify these
            # op_types_to_quantize=['MatMul', 'Add', 'Conv'],
            extra_options=dict(
                EnableSubgraph=True
            ),
            **quantize_kwargs
        )

        quantize_config['per_model_config'][file_name_without_extension] = dict(
            # Sorted so that the saved JSON does not depend on set ordering.
            op_types=sorted(op_types),
            weight_type=str(weight_type),
        )

    # Save quantization config
    with open(os.path.join(directory_path, 'quantize_config.json'), 'w') as fp:
        json.dump(quantize_config, fp, indent=4)
|
198 |
+
|
199 |
+
|
200 |
+
def main():
    """
    Export a HuggingFace model to ONNX and optionally quantize it.

    Example usage:
    python quantize.py --model_id sentence-transformers/all-MiniLM-L6-v2-unquantized --quantize --task default

    The exported files are written to ``<output_parent_dir>/<model_id>`` and
    every ``.onnx`` / ``.onnx_data`` file is finally moved into an ``onnx``
    subfolder. ``--per_channel`` and ``--reduce_range`` override the
    per-model-type quantization defaults when given. ``--output_attentions``
    and ``--split_modalities`` are accepted but not acted upon by this script.
    """
    parser = HfArgumentParser(
        (ConversionArguments, )
    )
    conv_args, = parser.parse_args_into_dataclasses()

    model_id = conv_args.model_id

    output_model_folder = os.path.join(conv_args.output_parent_dir, model_id)

    # Create output folder
    os.makedirs(output_model_folder, exist_ok=True)

    # Load the model config (its `model_type` selects the quantization defaults)
    config = AutoConfig.from_pretrained(model_id)

    try:
        # Load the tokenizer only to check that one is available; the loaded
        # object itself is not used afterwards.
        AutoTokenizer.from_pretrained(model_id)

    except KeyError:
        pass  # No Tokenizer

    except Exception:
        if config.model_type not in MODELS_WITHOUT_TOKENIZERS:
            raise

    # model_name_or_path can be local path or huggingface id
    export_kwargs = dict(
        model_name_or_path=model_id,
        output=output_model_folder,
        task=conv_args.task,
        opset=conv_args.opset,
        device=conv_args.device,
        do_validation=not conv_args.skip_validation,
    )

    # Step 1. convert huggingface model to onnx
    main_export(**export_kwargs)

    # Step 2. (optional, recommended) quantize the converted model for fast inference and to reduce model size.
    if conv_args.quantize:
        # Start from the model specific defaults. Copy them so the module
        # level dictionaries are never mutated.
        quantize_config = dict(MODEL_SPECIFIC_QUANTIZE_PARAMS.get(
            config.model_type, DEFAULT_QUANTIZE_PARAMS))

        # Values given explicitly on the command line win over the defaults;
        # None means the option was not specified.
        for option in ('per_channel', 'reduce_range'):
            cli_value = getattr(conv_args, option)
            if cli_value is not None:
                quantize_config[option] = cli_value

        quantize([
            os.path.join(output_model_folder, x)
            for x in os.listdir(output_model_folder)
            if x.endswith('.onnx') and not x.endswith('_quantized.onnx')
        ], **quantize_config)

    # Step 3. Move .onnx files to the 'onnx' subfolder
    onnx_folder = os.path.join(output_model_folder, 'onnx')
    os.makedirs(onnx_folder, exist_ok=True)
    for file_name in os.listdir(output_model_folder):
        if file_name.endswith(('.onnx', '.onnx_data')):
            shutil.move(os.path.join(output_model_folder, file_name),
                        os.path.join(onnx_folder, file_name))


if __name__ == '__main__':
    main()
|
quantize_onnx.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from dataclasses import dataclass, field
|
3 |
+
from typing import Optional, Set
|
4 |
+
|
5 |
+
import onnx
|
6 |
+
from onnxruntime.quantization import (
|
7 |
+
quantize_dynamic,
|
8 |
+
QuantType
|
9 |
+
)
|
10 |
+
|
11 |
+
from optimum.exporters.tasks import TasksManager
|
12 |
+
from transformers import (
|
13 |
+
AutoConfig,
|
14 |
+
HfArgumentParser
|
15 |
+
)
|
16 |
+
|
17 |
+
# NOTE(review): none of the three constants below is referenced by the visible
# code of this script; quantize() calls quantize_dynamic without per_channel or
# reduce_range. They look like leftovers from quantize.py -- confirm before
# relying on them.
DEFAULT_QUANTIZE_PARAMS = {
    'per_channel': True,
    'reduce_range': True,
}

# Overrides keyed by model type (see the note above: currently unused here).
MODEL_SPECIFIC_QUANTIZE_PARAMS = {
    'whisper': {
        'per_channel': False,
        'reduce_range': False,
    }
}

# Model types without a tokenizer (see the note above: currently unused here).
MODELS_WITHOUT_TOKENIZERS = [
    'wav2vec2'
]
|
32 |
+
|
33 |
+
|
34 |
+
# Command-line options of this script, parsed by HfArgumentParser in main().
# NOTE(review): main() only reads `model_id`; every other field is accepted on
# the command line but not used by the visible code of this script.
@dataclass
class ConversionArguments:
    """
    Arguments used for converting HuggingFace models to onnx.
    """

    # The only field without a default. main() joins it with "model.onnx", so
    # here it acts as the directory that contains the exported model.
    model_id: str = field(
        metadata={
            "help": "Model identifier"
        }
    )
    quantize: bool = field(
        default=False,
        metadata={
            "help": "Whether to quantize the model."
        }
    )
    output_parent_dir: str = field(
        default='./models/',
        metadata={
            "help": "Path where the converted model will be saved to."
        }
    )

    # The help text is built when the class body is executed, so defining this
    # class reads the private `TasksManager._TASKS_TO_AUTOMODELS` mapping.
    task: Optional[str] = field(
        default='auto',
        metadata={
            "help": (
                "The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among:"
                f" {str(list(TasksManager._TASKS_TO_AUTOMODELS.keys()))}. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder."
            )
        }
    )

    # Annotated as int but defaults to None, which stands for "not specified".
    opset: int = field(
        default=None,
        metadata={
            "help": (
                "If specified, ONNX opset version to export the model with. Otherwise, the default opset will be used."
            )
        }
    )

    device: str = field(
        default='cpu',
        metadata={
            "help": 'The device to use to do the export.'
        }
    )
    skip_validation: bool = field(
        default=False,
        metadata={
            "help": "Whether to skip validation of the converted model"
        }
    )

    # Annotated as bool but defaults to None, which stands for "not specified".
    per_channel: bool = field(
        default=None,
        metadata={
            "help": "Whether to quantize weights per channel"
        }
    )
    # Annotated as bool but defaults to None, which stands for "not specified".
    reduce_range: bool = field(
        default=None,
        metadata={
            "help": "Whether to quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine, especially for per-channel mode"
        }
    )

    output_attentions: bool = field(
        default=False,
        metadata={
            "help": "Whether to output attentions from the model. NOTE: This is only supported for whisper models right now."
        }
    )

    split_modalities: bool = field(
        default=False,
        metadata={
            "help": "Whether to split multimodal models. NOTE: This is only supported for CLIP models right now."
        }
    )
|
116 |
+
|
117 |
+
|
118 |
+
def get_operators(model: onnx.ModelProto) -> Set[str]:
    """Return the set of operator types found in ``model``.

    Besides the top-level graph, every subgraph held in a GRAPH-typed node
    attribute is walked as well.
    """

    def iter_op_types(graph):
        # Yield the op type of each node, descending into nested subgraphs.
        for node in graph.node:
            yield node.op_type
            for attr in node.attribute:
                if attr.type == onnx.AttributeProto.GRAPH:
                    yield from iter_op_types(attr.g)

    return set(iter_op_types(model.graph))
|
131 |
+
|
132 |
+
|
133 |
+
def quantize(model_path):
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU

    Uses unsigned ints for activation values, signed ints for weights, per
    https://onnxruntime.ai/docs/performance/quantization.html#data-type-selection
    it is faster on most CPU architectures

    Args:
        model_path: Path to location the exported ONNX model is stored
    Returns: The path of the quantized model, i.e. ``model-q8.onnx`` in the
        same directory as `model_path`
    """
    directory_path = os.path.dirname(model_path)
    output_path = os.path.join(directory_path, 'model-q8.onnx')

    loaded_model = onnx.load_model(model_path)
    op_types = get_operators(loaded_model)
    # Signed weights (QInt8) are used unless the graph contains a Conv
    # operator, in which case unsigned weights (QUInt8) are selected.
    weight_type = QuantType.QUInt8 if 'Conv' in op_types else QuantType.QInt8
    print("quantizing to", weight_type)

    quantize_dynamic(
        model_input=model_path,
        model_output=output_path,
        weight_type=weight_type,
        optimize_model=False,
    )

    return output_path
|
157 |
+
|
158 |
+
|
159 |
+
def main():
    """
    Quantize the ``model.onnx`` file found in the directory given by --model_id.

    Example usage:
    python quantize_onnx.py --model_id sentence-transformers/all-MiniLM-L6-v2-unquantized
    """
    argument_parser = HfArgumentParser((ConversionArguments,))
    (conv_args,) = argument_parser.parse_args_into_dataclasses()

    # The model id doubles as the directory that holds the exported model.
    onnx_file = os.path.join(conv_args.model_id, "model.onnx")
    quantize(onnx_file)


if __name__ == '__main__':
    main()
|