"""Dynamically quantize the Linear layers of a local MiniLM checkpoint.

Loads the model stored under ``./models/all-MiniLM-L6-v2``, applies PyTorch
dynamic quantization to every ``torch.nn.Linear`` module using float16, and
writes the resulting state dict to
``./models/all-MiniLM-L6-v2-unquantized-q16/pytorch_model.bin``.
"""
import os

# The offline flag must be in the environment BEFORE transformers is
# imported: the library evaluates its offline settings at import time, so
# assigning it afterwards does not switch offline mode on.
os.environ["TRANSFORMERS_OFFLINE"] = "1"  # "1" = offline, never hit the Hub

import torch  # noqa: E402  (deliberately imported after the env setup)
from transformers import AutoModel  # noqa: E402

SOURCE_MODEL_DIR = "./models/all-MiniLM-L6-v2"
OUTPUT_PATH = "./models/all-MiniLM-L6-v2-unquantized-q16/pytorch_model.bin"

# Full-precision model read from the local directory.
model_fp32 = AutoModel.from_pretrained(SOURCE_MODEL_DIR)

# NOTE: the name ``model_int8`` is kept for backward compatibility, but the
# requested dtype is float16, so no int8 weights are produced here.
model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32,  # the original model
    {torch.nn.Linear},  # the set of layer types to dynamically quantize
    dtype=torch.float16,
)

# torch.save does not create missing parent directories; make sure the
# target directory exists so a fresh checkout does not fail at the very end.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# NOTE(review): the saved state dict comes from dynamically quantized Linear
# modules, so its layout presumably differs from the original checkpoint --
# confirm how downstream code is expected to load this file.
torch.save(model_int8.state_dict(), OUTPUT_PATH)