AlyxTeam committed
Commit 16c80da
1 Parent(s): 6ddacd8

feat: ZeroGPU does not support quantization

Files changed (3)
  1. README.md +8 -1
  2. app.py +29 -6
  3. requirements.txt +3 -1
README.md CHANGED
@@ -10,4 +10,11 @@ pinned: false
  license: mit
  ---

- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+ An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+
+
+
+ OSError: [Errno 28] No space left on device
+ ```bash
+ rm -rf /data-nvme/zerogpu-offload/*
+ ```
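The note added to the README documents a manual workaround for a full disk; the app.py change below runs the same cleanup automatically at startup. A minimal sketch of that startup step (assuming the same /data-nvme/zerogpu-offload/ path used in this commit):

```python
# Minimal sketch: clear the ZeroGPU offload cache before downloading the model,
# mirroring the manual fix documented in the README above.
import subprocess

# check=False: a missing or already-empty directory should not abort Space startup.
subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", shell=True, check=False)
```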
app.py CHANGED
@@ -1,14 +1,35 @@
  import spaces
  import gradio as gr
  from huggingface_hub import InferenceClient
- from transformers import AutoTokenizer, AutoModelForCausalLM
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
  import torch
  import subprocess

- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+ subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)
+ subprocess.run("pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True)
+
+ kwargs = {}
+
+ """
+ https://hugging-face.cn/docs/transformers/quantization/bitsandbytes
+ """
+
+ # quantization_config = BitsAndBytesConfig(
+ #     load_in_4bit=True,
+ #     bnb_4bit_quant_type="nf4",
+ #     bnb_4bit_use_double_quant=True,
+ #     bnb_4bit_compute_dtype=torch.bfloat16,
+ # )
+
+ # quantization_config = BitsAndBytesConfig(
+ #     load_in_8bit=True,
+ #     # llm_int8_enable_fp32_cpu_offload=True,
+ # )
+
+ # kwargs = { "quantization_config": quantization_config, "low_cpu_mem_usage": True }

  tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
+ model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True, torch_dtype=torch.bfloat16, **kwargs).cuda()

  @spaces.GPU(duration=120)
  def respond(
@@ -19,16 +40,18 @@ def respond(
      temperature,
      top_p,
  ):
+     modelx = model
+
      if len(message) < 1:
          message = "write a quick sort algorithm in python."

      messages = [
-         { 'role': 'user', 'content': message }
+         { "role": "user", "content": message }
      ]

-     inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
+     inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(modelx.device)

-     outputs = model.generate(inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature, top_k=50, top_p=top_p, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
+     outputs = modelx.generate(inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature, top_k=50, top_p=top_p, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)

      return tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

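For reference, the quantized load path that this commit leaves commented out (because ZeroGPU does not support it, per the commit message) would look roughly like the sketch below on hardware where bitsandbytes is available. All parameter values are taken from the commented-out code in app.py; nothing here is specific to ZeroGPU.

```python
# Sketch of the disabled 4-bit path from app.py, for hardware that supports
# bitsandbytes quantization (not ZeroGPU). Values match the commented-out config.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True
)

# No .cuda() here: bitsandbytes places the quantized weights on the GPU itself,
# and transformers rejects .cuda()/.to() on 4-bit and 8-bit models.
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
    trust_remote_code=True,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)
```

With this path enabled, the accelerate and bitsandbytes entries left commented out in requirements.txt below would also need to be uncommented.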
requirements.txt CHANGED
@@ -1,2 +1,4 @@
  huggingface_hub==0.22.2
- transformers
+ transformers
+ # accelerate
+ # bitsandbytes