# OLMoE-1B-7B-0924-Instruct-FP8

FP8 (W8A8) quantized version of [allenai/OLMoE-1B-7B-0924-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct), produced with [llm-compressor](https://github.com/vllm-project/llm-compressor). All `Linear` layers are quantized except `lm_head` and the MoE gate layers, which are kept at full precision.

## Evaluation

Evaluated on GSM8K (5-shot) with lm-evaluation-harness, running the model through vLLM:

```
lm_eval --model vllm --model_args pretrained=/home/mgoin/code/llm-compressor/examples/quantizing_moe/OLMoE-1B-7B-0924-Instruct-FP8,tensor_parallel_size=1,trust_remote_code=True --tasks gsm8k --num_fewshot 5 --batch_size auto

vllm (pretrained=/home/mgoin/code/llm-compressor/examples/quantizing_moe/OLMoE-1B-7B-0924-Instruct-FP8,tensor_parallel_size=1,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto
|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.3510|±  |0.0131|
|     |       |strict-match    |     5|exact_match|↑  |0.3389|±  |0.0130|
```
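
## Usage

The checkpoint can be used directly with vLLM's offline inference API. A minimal sketch; the hub id `mgoin/OLMoE-1B-7B-0924-Instruct-FP8` is an assumption based on this model card, and a local path to the quantized output directory works just as well:

```python
from vllm import LLM, SamplingParams

# assumed hub id; point model= at your local output directory if you
# quantized the model yourself with the script below
llm = LLM(model="mgoin/OLMoE-1B-7B-0924-Instruct-FP8", trust_remote_code=True)
params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["I love quantization because"], params)
print(outputs[0].outputs[0].text)
```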

## Creation

This checkpoint was produced with the following llm-compressor script:

```python
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot

# Select a Mixture-of-Experts model for quantization.
MODEL_ID = "allenai/OLMoE-1B-7B-0924-Instruct"

model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
# It's recommended to use more calibration samples for MoE models so that every expert is hit.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 2048
MAX_SEQUENCE_LENGTH = 2048


# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


# Render each conversation as text using the model's chat template.
def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Define a llm-compressor recipe for FP8 W8A8 quantization.
# Since the MoE gate layers are sensitive to quantization, we add them to the ignore
# list so they remain at full precision.
recipe = [
    QuantizationModifier(
        targets="Linear",
        scheme="FP8",
        ignore=["lm_head", "re:.*mlp.gate$"],
    ),
]
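
# (Added sanity check, not part of the original script.) Before calling oneshot,
# you can confirm the ignore pattern really matches the MoE router gates. The
# expected module names ("model.layers.N.mlp.gate") assume the OLMoE architecture.
import re

gate_names = [name for name, _ in model.named_modules() if re.match(r".*mlp.gate$", name)]
print(gate_names[:2])  # e.g. ['model.layers.0.mlp.gate', 'model.layers.1.mlp.gate']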

SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8"

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    save_compressed=True,
    output_dir=SAVE_DIR,
)


# Confirm the quantized model still generates coherent text.
print("========== SAMPLE GENERATION ==============")
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
output = model.generate(**inputs, max_length=50)
text_output = tokenizer.batch_decode(output)
print(text_output)
```
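
After the run, `oneshot` records the quantization scheme in the checkpoint's `config.json`. A quick check, assuming the script above wrote its output to the default `SAVE_DIR` (the exact layout of the config may vary across llm-compressor versions):

```python
import json
import os

# "OLMoE-1B-7B-0924-Instruct-FP8" is the SAVE_DIR produced by the script above
with open(os.path.join("OLMoE-1B-7B-0924-Instruct-FP8", "config.json")) as f:
    config = json.load(f)

# print whatever quantization metadata was saved with the checkpoint
print(json.dumps(config.get("quantization_config", {}), indent=2)[:500])
```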