vilarin committed (verified)
Commit 1e64d54 · Parent: 84e1807

Update app.py

Files changed (1):
  1. app.py  +3 -22
app.py CHANGED
@@ -1,15 +1,8 @@
- # import subprocess
- # subprocess.run(
- #     'pip install flash-attn --no-build-isolation',
- #     env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
- #     shell=True
- # )
-
  import os
  import time
  import spaces
  import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ from transformers import AutoModelForCausalLM, AutoTokenizer
  import gradio as gr
  from threading import Thread

@@ -40,24 +33,12 @@ h3 {

  device = "cuda" # for GPU usage or "cpu" for CPU usage

- quantization_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_compute_dtype=torch.bfloat16,
-     bnb_4bit_use_double_quant=True,
-     bnb_4bit_quant_type= "nf4")
-
  tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
  model = AutoModelForCausalLM.from_pretrained(
      MODEL,
-     torch_dtype=torch.float16,
+     torch_dtype=torch.bfloat16,
      device_map="auto",
-     trust_remote_code=True,
-     # attn_implementation="flash_attention_2",
-     quantization_config=quantization_config).eval().to(device)
-
- # Ensure `pad_token_id` is set
- # if tokenizer.pad_token_id is None:
- #     tokenizer.pad_token_id = tokenizer.eos_token_id
+     trust_remote_code=True).eval().to(device)

  @spaces.GPU()
  def stream_chat(
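
Net effect of the commit: the commented-out flash-attn install, the 4-bit BitsAndBytesConfig setup, and the pad_token_id fallback are removed, and the model is loaded directly in bfloat16. A minimal sketch of the resulting load path, assuming MODEL holds the repo id defined earlier in app.py (the placeholder value below is not from the Space):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL = "owner/model-id"   # placeholder; the real id is set earlier in app.py
device = "cuda"            # for GPU usage, or "cpu" for CPU usage

# Plain bfloat16 load; no BitsAndBytes quantization and no flash-attn as of this commit.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True).eval().to(device)

Unquantized bf16 weights take about 2 bytes per parameter, roughly four times the footprint of the removed nf4 4-bit path, so the change assumes the Space's GPU has room for the full-precision model.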
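
The body of stream_chat is not touched by this commit and is not shown above; the surviving imports (Thread, spaces, gradio) point at the usual TextIteratorStreamer pattern for streaming generation. A sketch under that assumption, with made-up parameter names (message, history, temperature, max_new_tokens) and reusing the model and tokenizer from the load sketch above:

import spaces
from threading import Thread
from transformers import TextIteratorStreamer

@spaces.GPU()
def stream_chat(message, history, temperature=0.8, max_new_tokens=1024):
    # Rebuild the conversation from Gradio's [[user, assistant], ...] history format.
    conversation = []
    for user_turn, assistant_turn in history:
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    # Run generation on a background thread so partial text can be yielded to the UI.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True,
    )
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial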