jaymojnidar committed on
Commit
6297add
1 Parent(s): 617c515

loading the model back in GPU mode

Files changed (2)
  1. app.py +2 -2
  2. model.py +6 -18
app.py CHANGED
@@ -30,10 +30,10 @@ As a derivate work of [Llama-2-13b-chat](https://huggingface.co/meta-llama/Llama
 this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/USE_POLICY.md).
 """
 
-'''
+
 if not torch.cuda.is_available():
     DESCRIPTION += '\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>'
-'''
+
 
 def clear_and_save_textbox(message: str) -> tuple[str, str]:
     return '', message
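
For context, a minimal sketch of how this app.py block reads after the change, reconstructed from the diff: the surrounding ''' markers are removed, so the CPU warning is appended to the description again whenever no GPU is visible. The DESCRIPTION value below is a placeholder for the demo text built earlier in the real file.

import torch

DESCRIPTION = '...'  # placeholder: the actual description string is assembled earlier in app.py

# With the ''' markers removed, this check is live again.
if not torch.cuda.is_available():
    DESCRIPTION += '\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>'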
model.py CHANGED
@@ -3,41 +3,29 @@ from threading import Thread
 from typing import Iterator
 
 import torch
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from huggingface_hub import login
 
 model_id = 'jaymojnidar/Llama-2-7b-chat-hf-sharded-bf16-5GBMAX'
 
-if not torch.cuda.is_available():
+if torch.cuda.is_available():
     tok = os.environ['HF_TOKEN']
-
-    device_map = {
-        "transformer.word_embeddings": "cpu",
-        "transformer.word_embeddings_layernorm": "cpu",
-        "lm_head": "cpu",
-        "transformer.h": "cpu",
-        "transformer.ln_f": "cpu",
-        "model.layers": "cpu",
-        "model.norm": "cpu",
-    }
-
     login(new_session=True,
           write_permission=False,
           token=tok
 
           #, token="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"
           )
-    quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True, load_in_8bit=True,llm_int8_threshold=200.0)
-
+
     config = AutoConfig.from_pretrained(model_id,
                                         use_auth_token=True)
     config.pretraining_tp = 1
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         config=config,
-        quantization_config=quantization_config,
         torch_dtype=torch.float16,
+        load_in_8bit=True,
+        device_map='auto',
         use_auth_token=True
     )
 else:
@@ -73,7 +61,7 @@ def run(message: str,
         top_p: float = 0.95,
         top_k: int = 50) -> Iterator[str]:
     prompt = get_prompt(message, chat_history, system_prompt)
-    inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to(torch.device) #.to('cuda')
+    inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
 
     streamer = TextIteratorStreamer(tokenizer,
                                     timeout=10.,
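
For reference, a minimal sketch of the GPU loading path this commit switches to, reconstructed from the new side of the diff. It assumes a CUDA-capable machine, the bitsandbytes package for 8-bit weights, and an HF_TOKEN environment variable with read access to the model repository; the 'Hello' input at the end is only an illustrative prompt.

import os

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login

model_id = 'jaymojnidar/Llama-2-7b-chat-hf-sharded-bf16-5GBMAX'

# Authenticate so the model shards can be downloaded from the Hub.
login(new_session=True, write_permission=False, token=os.environ['HF_TOKEN'])

config = AutoConfig.from_pretrained(model_id, use_auth_token=True)
config.pretraining_tp = 1  # use the standard fused linears rather than the pretraining tensor-parallel path

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    torch_dtype=torch.float16,  # fp16 activations
    load_in_8bit=True,          # int8 weights via bitsandbytes
    device_map='auto',          # let accelerate place the layers on the available GPU(s)
    use_auth_token=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)

# Matching the change in run(): tokenized inputs now go straight to the GPU.
inputs = tokenizer(['Hello'], return_tensors='pt', add_special_tokens=False).to('cuda')

Note that more recent transformers releases express 8-bit loading as quantization_config=BitsAndBytesConfig(load_in_8bit=True) rather than the bare load_in_8bit keyword used here; the direct keyword is what this commit's code relies on.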