jaymojnidar commited on
Commit
617c515
1 Parent(s): b349bb2

Loading the model in CPU mode.

Browse files
Files changed (1) hide show
  1. model.py +11 -7
model.py CHANGED
@@ -10,20 +10,24 @@ model_id = 'jaymojnidar/Llama-2-7b-chat-hf-sharded-bf16-5GBMAX'
10
 
11
  if not torch.cuda.is_available():
12
  tok = os.environ['HF_TOKEN']
 
13
  device_map = {
14
- "transformer.word_embeddings": 0,
15
- "transformer.word_embeddings_layernorm": 0,
16
- "lm_head": "cpu",
17
- "transformer.h": 0,
18
- "transformer.ln_f": 0,
19
- }
 
 
 
20
  login(new_session=True,
21
  write_permission=False,
22
  token=tok
23
 
24
  #, token="<REDACTED — live HF token leaked here; revoke it and rely on the HF_TOKEN env var read above>"
25
  )
26
- quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
27
 
28
  config = AutoConfig.from_pretrained(model_id,
29
  use_auth_token=True)
 
10
 
11
  if not torch.cuda.is_available():
12
  tok = os.environ['HF_TOKEN']
13
+
14
  device_map = {
15
+ "transformer.word_embeddings": "cpu",
16
+ "transformer.word_embeddings_layernorm": "cpu",
17
+ "lm_head": "cpu",
18
+ "transformer.h": "cpu",
19
+ "transformer.ln_f": "cpu",
20
+ "model.layers": "cpu",
21
+ "model.norm": "cpu",
22
+ }
23
+
24
  login(new_session=True,
25
  write_permission=False,
26
  token=tok
27
 
28
  #, token="<REDACTED — live HF token leaked here; revoke it and rely on the HF_TOKEN env var read above>"
29
  )
30
+ quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True, load_in_8bit=True,llm_int8_threshold=200.0)
31
 
32
  config = AutoConfig.from_pretrained(model_id,
33
  use_auth_token=True)