Vitrous committed
Commit ace0225 · verified · 1 parent: 6ef9b66

Update app.py

Files changed (1)
  1. app.py +13 -24
app.py CHANGED
@@ -19,26 +19,6 @@ conversations = {}
 Device_Type = "cuda"
 
 
-def load_quantized_model(model_id, model_basename):
-    # The code supports all huggingface models that ends with GPTQ and have some variation
-    # of .no-act.order or .safetensors in their HF repo.
-    print("Using AutoGPTQForCausalLM for quantized models")
-
-    if ".safetensors" in model_basename:
-        # Remove the ".safetensors" ending if present
-        model_basename = model_basename.replace(".safetensors", "")
-
-    quantized_tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
-    print("Tokenizer loaded")
-
-    quantized_model = AutoGPTQForCausalLM.from_quantized(model_id, model_basename=model_basename, use_safetensors=True, trust_remote_code=True, device_map="auto", use_triton=False, quantize_config=None,)
-
-    return quantized_model, quantized_tokenizer
-
-
-# Making the code device-agnostic
-#model, tokenizer = load_quantized_model(model_name_or_path, "model.safetensors")
-
 def load_model_norm():
     if torch.cuda.is_available():
         print("CUDA is available. GPU will be used.")
@@ -50,7 +30,7 @@ def load_model_norm():
     # For example: revision="main"
     model = AutoModelForCausalLM.from_pretrained(model_name_or_path,device_map="auto", trust_remote_code=True,revision="gptq-4bit-128g-actorder_True")
     # Switch to CPU inference
-    model.to("cuda")
+    #model.to("cuda")
     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
 
     return model, tokenizer
@@ -91,9 +71,18 @@ def generate_response(prompt: str) -> str:
 
     prompt_template = f'{PERSONA_DESC}\n\nASSISTANT: {prompt}\n'
 
-    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
-    output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
-    generated_text = tokenizer.decode(output[0])
+    pipe = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        max_new_tokens=512,
+        do_sample=True,
+        temperature=0.7,
+        top_p=0.95,
+        top_k=40,
+        repetition_penalty=1.1
+    )
+    generated_text = (pipe(prompt_template)[0]['generated_text'])
 
 
     return generated_text
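
Note on the change: after this commit, generate_response builds a transformers text-generation pipeline from the already loaded GPTQ model and tokenizer instead of tokenizing and calling model.generate() by hand, and load_model_norm relies on device_map="auto" for weight placement rather than an explicit model.to("cuda"). The sketch below assembles that flow end to end as a minimal, hedged reconstruction, not the exact file contents: the model id and PERSONA_DESC value are placeholders, since the real values are defined elsewhere in app.py and do not appear in this diff.

    # Minimal sketch of the post-commit flow (placeholder values marked).
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

    model_name_or_path = "TheBloke/example-GPTQ"   # placeholder model id, not the repo's actual value
    PERSONA_DESC = "You are a helpful assistant."  # placeholder persona prompt


    def load_model_norm():
        if torch.cuda.is_available():
            print("CUDA is available. GPU will be used.")
        # device_map="auto" handles placement of the quantized weights,
        # so the explicit model.to("cuda") call is no longer needed.
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            device_map="auto",
            trust_remote_code=True,
            revision="gptq-4bit-128g-actorder_True",
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
        return model, tokenizer


    model, tokenizer = load_model_norm()


    def generate_response(prompt: str) -> str:
        prompt_template = f'{PERSONA_DESC}\n\nASSISTANT: {prompt}\n'
        # The pipeline wraps tokenization, generation, and decoding in a single call.
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            top_k=40,
            repetition_penalty=1.1,
        )
        return pipe(prompt_template)[0]["generated_text"]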