Artificium-llama-3.1-8B

Running on Zero

freeCS-dot-org committed on Sep 8

Commit

a5e2fed

•

1 Parent(s): 0244d86

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -33,18 +33,11 @@ h3 {
 device = "cuda" # for GPU usage or "cpu" for CPU usage
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type= "nf4")
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL,
     torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config)
 @spaces.GPU()
 def stream_chat(

 device = "cuda" # for GPU usage or "cpu" for CPU usage
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL,
     torch_dtype=torch.bfloat16,
+    device_map="auto")
 @spaces.GPU()
 def stream_chat(