jaymojnidar commited on
Commit
617c515
1 Parent(s): b349bb2

Loading the model in CPU mode.

Browse files
Files changed (1) hide show
  1. model.py +11 -7
model.py CHANGED
@@ -10,20 +10,24 @@ model_id = 'jaymojnidar/Llama-2-7b-chat-hf-sharded-bf16-5GBMAX'
10
 
11
  if not torch.cuda.is_available():
12
  tok = os.environ['HF_TOKEN']
 
13
  device_map = {
14
- "transformer.word_embeddings": 0,
15
- "transformer.word_embeddings_layernorm": 0,
16
- "lm_head": "cpu",
17
- "transformer.h": 0,
18
- "transformer.ln_f": 0,
19
- }
 
 
 
20
  login(new_session=True,
21
  write_permission=False,
22
  token=tok
23
 
24
  #, token="<REDACTED — live HF token leaked here; revoke it and rely on the HF_TOKEN env var read above>"
25
  )
26
- quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
27
 
28
  config = AutoConfig.from_pretrained(model_id,
29
  use_auth_token=True)
 
10
 
11
  if not torch.cuda.is_available():
12
  tok = os.environ['HF_TOKEN']
13
+
14
  device_map = {
15
+ "transformer.word_embeddings": "cpu",
16
+ "transformer.word_embeddings_layernorm": "cpu",
17
+ "lm_head": "cpu",
18
+ "transformer.h": "cpu",
19
+ "transformer.ln_f": "cpu",
20
+ "model.layers": "cpu",
21
+ "model.norm": "cpu",
22
+ }
23
+
24
  login(new_session=True,
25
  write_permission=False,
26
  token=tok
27
 
28
  #, token="<REDACTED — live HF token leaked here; revoke it and rely on the HF_TOKEN env var read above>"
29
  )
30
+ quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True, load_in_8bit=True,llm_int8_threshold=200.0)
31
 
32
  config = AutoConfig.from_pretrained(model_id,
33
  use_auth_token=True)