GusPuffy committed on
Commit
a5e469f
·
verified ·
1 Parent(s): 73ade43

Upload compress.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. compress.py +93 -0
compress.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ import re
4
+
5
+ from llmcompressor.modifiers.quantization import GPTQModifier
6
+ from llmcompressor.transformers import oneshot
7
+
8
# Model to quantize.
MODEL_ID = "ArliAI/Llama-3.1-70B-ArliAI-RPMax-v1.3"

# Calibration dataset and the split to draw rows from.
DATASET_ID = "openerotica/erotiquant3"
DATASET_SPLIT = "train"

# Calibration budget. 512 samples is a reasonable starting point; raising
# the sample count can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 4096

# Tokenizer matching the model, used for chat templating and tokenization.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Raw calibration rows; they are preprocessed further down.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
24
+
25
# One conversation turn: a role tag followed by everything up to the next
# "<ROLE>:" tag. The alternation inside the lookahead is grouped so the colon
# applies to every role; a bare "SYSTEM" or "USER" inside a message
# (e.g. "USERNAME") no longer ends the turn early.
_CHAT_TURN_RE = re.compile(
    r'(SYSTEM|USER|ASSISTANT):\s*((?:(?!(?:SYSTEM|USER|ASSISTANT):).)+)',
    re.DOTALL,
)


def _parse_chat_turns(text):
    """Split "ROLE: content" formatted text into a list of chat messages."""
    return [
        {"role": role.lower(), "content": content.strip()}
        for role, content in _CHAT_TURN_RE.findall(text)
    ]


def preprocess(example):
    """Convert a raw dataset row into chat messages, templated text and tokens.

    Args:
        example: Dataset row whose ``'text'`` field holds a conversation
            written as ``SYSTEM:`` / ``USER:`` / ``ASSISTANT:`` tagged turns.

    Returns:
        A dict with three keys:
            ``"chat"``: list of ``{"role", "content"}`` messages, with the role
                lower-cased and the content stripped of surrounding whitespace.
            ``"text"``: the messages rendered by the tokenizer's chat template.
            ``"tokens"``: the same rendering, tokenized by the chat template.
    """
    result = _parse_chat_turns(example['text'])

    # Render once as a string (tokenized later for calibration) and once as
    # token ids (used to measure the row's length).
    text = tokenizer.apply_chat_template(result, tokenize=False, add_generation_prompt=False)
    tokens = tokenizer.apply_chat_template(result, tokenize=True, add_generation_prompt=False)

    return {
        "chat": result,
        "text": text,
        "tokens": tokens,
    }
41
+
42
# Add the parsed chat, templated text and token ids to every row.
ds = ds.map(function=preprocess)
43
+
44
def filter_short_rows(example):
    """Return True when a row has more than MAX_SEQUENCE_LENGTH tokens.

    Used as the predicate for ``ds.filter``. For rows that do not exceed the
    limit, the token count is printed before False is returned.

    Args:
        example: Dataset row with a ``'tokens'`` sequence.

    Returns:
        bool: True if ``len(example['tokens']) > MAX_SEQUENCE_LENGTH``.
    """
    num_tokens = len(example['tokens'])
    is_long_enough = num_tokens > MAX_SEQUENCE_LENGTH
    if not is_long_enough:
        print(f"length: {num_tokens}")
    return is_long_enough
49
+
50
ds = ds.filter(filter_short_rows)

# The length filter may leave fewer rows than requested, and select() raises
# on out-of-range indices, so clamp the sample count to what is available.
available_rows = len(ds)
if available_rows < NUM_CALIBRATION_SAMPLES:
    print(
        f"Only {available_rows} rows passed the length filter; "
        f"using all of them instead of {NUM_CALIBRATION_SAMPLES}"
    )
ds = ds.shuffle(seed=42).select(range(min(NUM_CALIBRATION_SAMPLES, available_rows)))
53
+
54
+
55
+ # Tokenize inputs.
56
def tokenize(sample):
    """Tokenize a row's templated text, truncated to MAX_SEQUENCE_LENGTH.

    No padding is applied and no special tokens are added by this call.

    Args:
        sample: Dataset row with a ``"text"`` field.

    Returns:
        The tokenizer's output for ``sample["text"]``.
    """
    templated_text = sample["text"]
    return tokenizer(
        templated_text,
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        padding=False,
    )
64
+
65
# Replace every existing column with the tokenizer's outputs.
ds = ds.map(tokenize, remove_columns=ds.column_names)

# Quantization recipe: GPTQ on all Linear layers except lm_head, with
# 4-bit weights (W4A16 scheme, group size 128).
recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head"],
)

# Load the model to be quantized.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")

# Run the one-shot calibration / quantization pass.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

print('SAVING')

# Write the compressed model and its tokenizer to a directory named after
# the model (the part of MODEL_ID after the slash) plus the scheme suffix.
SAVE_DIR = f'{MODEL_ID.split("/")[1]}-W4A16-G128'
model.save_pretrained(SAVE_DIR, save_compressed=True, skip_compression_stats=True)
tokenizer.save_pretrained(SAVE_DIR)

print('Saved')