varun4 committed on
Commit
0606100
1 Parent(s): fba41a4

quantizing scripts added

Files changed (3)
  1. aggregate_data.py +5 -5
  2. quantize.py +267 -0
  3. quantize_onnx.py +175 -0
aggregate_data.py CHANGED
@@ -31,7 +31,7 @@ MODELS = [
 
 
 def get_model_size(model_name):
-    return os.path.getsize(f"models/{model_name}/pytorch_model.bin") / (1024.0 * 1024.0)
+    return os.path.getsize(f"models/{model_name}") / (1024.0 * 1024.0)
 
 
 def compute_model_score(model_name):
@@ -64,16 +64,16 @@ def compute_model_score(model_name):
 DATA = {
     "Model": MODELS,
     "Model Size (MB)": [
-        get_model_size(model) for model in MODELS
+        get_model_size(f"{model}/pytorch_model.bin") for model in MODELS
     ],
     "Score": [
-        5  # compute_model_score(model) for model in MODELS
+        compute_model_score(model) for model in MODELS
     ],
     "q8 Model Size (MB)": [
-        get_model_size(model + "-q8") for model in MODELS
+        get_model_size(f"optimum/{model}-self-optimum-q8/model.onnx") for model in MODELS
    ],
    "q8 Score": [
-        compute_model_score(model + "-q8") for model in MODELS
+        compute_model_score(f"optimum/{model}-q8") for model in MODELS
    ],
 }
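With this change, get_model_size takes a path relative to models/ instead of a bare model name, so each caller picks the exact file to measure (the PyTorch checkpoint or the quantized ONNX export). A minimal sketch of the new call pattern; the model name and directory layout below are assumptions for illustration only:

import os

def get_model_size(path_under_models):
    # Size in MB of any file under models/
    return os.path.getsize(f"models/{path_under_models}") / (1024.0 * 1024.0)

model = "all-MiniLM-L6-v2"  # hypothetical model name
fp32_mb = get_model_size(f"{model}/pytorch_model.bin")
q8_mb = get_model_size(f"optimum/{model}-self-optimum-q8/model.onnx")
print(f"fp32: {fp32_mb:.1f} MB, q8: {q8_mb:.1f} MB")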
quantize.py ADDED
@@ -0,0 +1,267 @@
+
+import json
+import os
+import shutil
+from dataclasses import dataclass, field
+from typing import Optional, Set
+from tqdm import tqdm
+
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    HfArgumentParser
+)
+
+import onnx
+from optimum.exporters.onnx import main_export, export_models
+from optimum.exporters.tasks import TasksManager
+from onnxruntime.quantization import (
+    quantize_dynamic,
+    QuantType
+)
+
+DEFAULT_QUANTIZE_PARAMS = {
+    'per_channel': True,
+    'reduce_range': True,
+}
+
+MODEL_SPECIFIC_QUANTIZE_PARAMS = {
+    'whisper': {
+        'per_channel': False,
+        'reduce_range': False,
+    }
+}
+
+MODELS_WITHOUT_TOKENIZERS = [
+    'wav2vec2'
+]
+
+
+@dataclass
+class ConversionArguments:
+    """
+    Arguments used for converting HuggingFace models to ONNX.
+    """
+
+    model_id: str = field(
+        metadata={
+            "help": "Model identifier"
+        }
+    )
+    quantize: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to quantize the model."
+        }
+    )
+    output_parent_dir: str = field(
+        default='./models/',
+        metadata={
+            "help": "Path where the converted model will be saved to."
+        }
+    )
+
+    task: Optional[str] = field(
+        default='auto',
+        metadata={
+            "help": (
+                "The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among:"
+                f" {str(list(TasksManager._TASKS_TO_AUTOMODELS.keys()))}. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder."
+            )
+        }
+    )
+
+    opset: int = field(
+        default=None,
+        metadata={
+            "help": (
+                "If specified, ONNX opset version to export the model with. Otherwise, the default opset will be used."
+            )
+        }
+    )
+
+    device: str = field(
+        default='cpu',
+        metadata={
+            "help": "The device to use to do the export."
+        }
+    )
+    skip_validation: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to skip validation of the converted model"
+        }
+    )
+
+    per_channel: bool = field(
+        default=None,
+        metadata={
+            "help": "Whether to quantize weights per channel"
+        }
+    )
+    reduce_range: bool = field(
+        default=None,
+        metadata={
+            "help": "Whether to quantize weights to 7 bits. This may improve accuracy for some models running on non-VNNI machines, especially in per-channel mode."
+        }
+    )
+
+    output_attentions: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to output attentions from the model. NOTE: This is only supported for whisper models right now."
+        }
+    )
+
+    split_modalities: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to split multimodal models. NOTE: This is only supported for CLIP models right now."
+        }
+    )
+
+
+def get_operators(model: onnx.ModelProto) -> Set[str]:
+    """Collect the set of operator types used anywhere in the model graph, including subgraphs."""
+    operators = set()
+
+    def traverse_graph(graph):
+        for node in graph.node:
+            operators.add(node.op_type)
+            for attr in node.attribute:
+                if attr.type == onnx.AttributeProto.GRAPH:
+                    subgraph = attr.g
+                    traverse_graph(subgraph)
+
+    traverse_graph(model.graph)
+    return operators
+
+
+def quantize(model_names_or_paths, **quantize_kwargs):
+    """
+    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPUs.
+
+    Uses unsigned ints for activation values and signed ints for weights; per
+    https://onnxruntime.ai/docs/performance/quantization.html#data-type-selection
+    this is faster on most CPU architectures.
+
+    Args:
+        model_names_or_paths: Paths to the exported ONNX models to quantize.
+    Returns:
+        None. Each quantized model is written next to its input as `<name>_quantized.onnx`.
+    """
+
+    quantize_config = dict(
+        **quantize_kwargs,
+        per_model_config={}
+    )
+
+    for model in tqdm(model_names_or_paths, desc='Quantizing'):
+        directory_path = os.path.dirname(model)
+        file_name_without_extension = os.path.splitext(
+            os.path.basename(model))[0]
+
+        # NOTE:
+        # As of 2023/04/20, the latest version of onnxruntime-web is 1.14.0, which does not support INT8 weights for Conv layers.
+        # For this reason, we choose the model weight type to ensure compatibility with onnxruntime-web.
+        #
+        # As per the docs, the signed weight type (QInt8) is faster on most CPUs, so we use that unless the model contains a Conv layer.
+        # For more information, see:
+        #  - https://github.com/microsoft/onnxruntime/issues/3130#issuecomment-1105200621
+        #  - https://github.com/microsoft/onnxruntime/issues/2339
+        loaded_model = onnx.load_model(model)
+        op_types = get_operators(loaded_model)
+        weight_type = QuantType.QUInt8 if 'Conv' in op_types else QuantType.QInt8
+
+        quantize_dynamic(
+            model_input=model,
+            model_output=os.path.join(
+                directory_path, f'{file_name_without_extension}_quantized.onnx'),
+            weight_type=weight_type,
+            optimize_model=False,
+
+            # TODO allow user to specify these
+            # op_types_to_quantize=['MatMul', 'Add', 'Conv'],
+            extra_options=dict(
+                EnableSubgraph=True
+            ),
+            **quantize_kwargs
+        )
+
+        quantize_config['per_model_config'][file_name_without_extension] = dict(
+            op_types=list(op_types),
+            weight_type=str(weight_type),
+        )
+
+    # Save the quantization config
+    with open(os.path.join(directory_path, 'quantize_config.json'), 'w') as fp:
+        json.dump(quantize_config, fp, indent=4)
+
+
+def main():
+    """
+    Example usage:
+    python quantize.py --model_id sentence-transformers/all-MiniLM-L6-v2-unquantized --quantize --task default
+    """
+    parser = HfArgumentParser(
+        (ConversionArguments, )
+    )
+    conv_args, = parser.parse_args_into_dataclasses()
+
+    model_id = conv_args.model_id
+
+    output_model_folder = os.path.join(conv_args.output_parent_dir, model_id)
+
+    # Create output folder
+    os.makedirs(output_model_folder, exist_ok=True)
+
+    # Load the model config
+    config = AutoConfig.from_pretrained(model_id)
+
+    tokenizer = None
+    try:
+        # Load tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    except KeyError:
+        pass  # No tokenizer
+
+    except Exception as e:
+        if config.model_type not in MODELS_WITHOUT_TOKENIZERS:
+            raise e
+
+    # model_name_or_path can be a local path or a Hugging Face model id
+    export_kwargs = dict(
+        model_name_or_path=model_id,
+        output=output_model_folder,
+        task=conv_args.task,
+        opset=conv_args.opset,
+        device=conv_args.device,
+        do_validation=not conv_args.skip_validation,
+    )
+
+    # Step 1. Convert the HuggingFace model to ONNX
+    main_export(**export_kwargs)
+
+    # Step 2. (optional, recommended) Quantize the converted model for fast inference and to reduce model size.
+    if conv_args.quantize:
+        # Select model-specific quantization params, falling back to the defaults
+        quantize_config = MODEL_SPECIFIC_QUANTIZE_PARAMS.get(
+            config.model_type, DEFAULT_QUANTIZE_PARAMS)
+
+        quantize([
+            os.path.join(output_model_folder, x)
+            for x in os.listdir(output_model_folder)
+            if x.endswith('.onnx') and not x.endswith('_quantized.onnx')
+        ], **quantize_config)
+
+    # Step 3. Move .onnx files to the 'onnx' subfolder
+    os.makedirs(os.path.join(output_model_folder, 'onnx'), exist_ok=True)
+    for file in os.listdir(output_model_folder):
+        if file.endswith(('.onnx', '.onnx_data')):
+            shutil.move(os.path.join(output_model_folder, file),
+                        os.path.join(output_model_folder, 'onnx', file))
+
+
+if __name__ == '__main__':
+    main()
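A quick way to sanity-check the output of quantize.py is to load the quantized graph with onnxruntime and inspect its inputs. This is a minimal sketch, not part of the commit; the path assumes the script's default layout, where quantized files end up under models/<model_id>/onnx/ with a _quantized.onnx suffix, and the model_id shown is hypothetical:

import onnxruntime as ort

# Hypothetical model_id; adjust to whatever was passed to quantize.py.
model_path = "models/sentence-transformers/all-MiniLM-L6-v2-unquantized/onnx/model_quantized.onnx"
session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])

# Confirm the expected graph inputs (e.g. input_ids, attention_mask) survived quantization.
for inp in session.get_inputs():
    print(inp.name, inp.shape, inp.type)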
quantize_onnx.py ADDED
@@ -0,0 +1,175 @@
+import os
+from dataclasses import dataclass, field
+from typing import Optional, Set
+
+import onnx
+from onnxruntime.quantization import (
+    quantize_dynamic,
+    QuantType
+)
+
+from optimum.exporters.tasks import TasksManager
+from transformers import (
+    AutoConfig,
+    HfArgumentParser
+)
+
+DEFAULT_QUANTIZE_PARAMS = {
+    'per_channel': True,
+    'reduce_range': True,
+}
+
+MODEL_SPECIFIC_QUANTIZE_PARAMS = {
+    'whisper': {
+        'per_channel': False,
+        'reduce_range': False,
+    }
+}
+
+MODELS_WITHOUT_TOKENIZERS = [
+    'wav2vec2'
+]
+
+
+@dataclass
+class ConversionArguments:
+    """
+    Arguments used for converting HuggingFace models to ONNX.
+    """
+
+    model_id: str = field(
+        metadata={
+            "help": "Model identifier"
+        }
+    )
+    quantize: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to quantize the model."
+        }
+    )
+    output_parent_dir: str = field(
+        default='./models/',
+        metadata={
+            "help": "Path where the converted model will be saved to."
+        }
+    )
+
+    task: Optional[str] = field(
+        default='auto',
+        metadata={
+            "help": (
+                "The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among:"
+                f" {str(list(TasksManager._TASKS_TO_AUTOMODELS.keys()))}. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder."
+            )
+        }
+    )
+
+    opset: int = field(
+        default=None,
+        metadata={
+            "help": (
+                "If specified, ONNX opset version to export the model with. Otherwise, the default opset will be used."
+            )
+        }
+    )
+
+    device: str = field(
+        default='cpu',
+        metadata={
+            "help": "The device to use to do the export."
+        }
+    )
+    skip_validation: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to skip validation of the converted model"
+        }
+    )
+
+    per_channel: bool = field(
+        default=None,
+        metadata={
+            "help": "Whether to quantize weights per channel"
+        }
+    )
+    reduce_range: bool = field(
+        default=None,
+        metadata={
+            "help": "Whether to quantize weights to 7 bits. This may improve accuracy for some models running on non-VNNI machines, especially in per-channel mode."
+        }
+    )
+
+    output_attentions: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to output attentions from the model. NOTE: This is only supported for whisper models right now."
+        }
+    )
+
+    split_modalities: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to split multimodal models. NOTE: This is only supported for CLIP models right now."
+        }
+    )
+
+
+def get_operators(model: onnx.ModelProto) -> Set[str]:
+    """Collect the set of operator types used anywhere in the model graph, including subgraphs."""
+    operators = set()
+
+    def traverse_graph(graph):
+        for node in graph.node:
+            operators.add(node.op_type)
+            for attr in node.attribute:
+                if attr.type == onnx.AttributeProto.GRAPH:
+                    subgraph = attr.g
+                    traverse_graph(subgraph)
+
+    traverse_graph(model.graph)
+    return operators
+
+
+def quantize(model_path):
+    """
+    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPUs.
+
+    Uses unsigned ints for activation values and signed ints for weights; per
+    https://onnxruntime.ai/docs/performance/quantization.html#data-type-selection
+    this is faster on most CPU architectures.
+
+    Args:
+        model_path: Path to the exported ONNX model.
+    Returns:
+        None. The quantized model is written as `model-q8.onnx` next to the input model.
+    """
+    directory_path = os.path.dirname(model_path)
+
+    loaded_model = onnx.load_model(model_path)
+    op_types = get_operators(loaded_model)
+    weight_type = QuantType.QUInt8 if 'Conv' in op_types else QuantType.QInt8
+    print("quantizing to", weight_type)
+
+    quantize_dynamic(
+        model_input=model_path,
+        model_output=os.path.join(directory_path, 'model-q8.onnx'),
+        weight_type=weight_type,
+        optimize_model=False,
+    )
+
+
+def main():
+    """
+    Example usage:
+    python quantize_onnx.py --model_id sentence-transformers/all-MiniLM-L6-v2-unquantized
+    """
+    parser = HfArgumentParser(
+        (ConversionArguments,)
+    )
+    conv_args, = parser.parse_args_into_dataclasses()
+
+    model_id = conv_args.model_id
+
+    quantize(os.path.join(model_id, "model.onnx"))
+
+
+if __name__ == '__main__':
+    main()
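To see the size reduction from quantize_onnx.py, the original and quantized files can be compared directly, mirroring the MB computation that aggregate_data.py's get_model_size performs. A minimal sketch; the model directory below is a hypothetical example of what --model_id might point to:

import os

model_dir = "models/optimum/all-MiniLM-L6-v2-self-optimum-q8"  # hypothetical local directory

for name in ("model.onnx", "model-q8.onnx"):
    path = os.path.join(model_dir, name)
    # quantize_onnx.py writes model-q8.onnx next to the input model.onnx
    print(name, round(os.path.getsize(path) / (1024.0 * 1024.0), 1), "MB")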