import torch
from transformers import AutoModel
import os
os.environ["TRANSFORMERS_OFFLINE"] = "1"  # 1 for offline

model_fp32 = AutoModel.from_pretrained("./models/all-MiniLM-L6-v2")

model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32,  # the original model
    {torch.nn.Linear},  # a set of layers to dynamically quantize
    dtype=torch.float16)

torch.save(model_int8.state_dict(), "./models/all-MiniLM-L6-v2-unquantized-q16/pytorch_model.bin")