Daniel Marques committed on
Commit
8fa0233
·
1 Parent(s): 2ea73cf

fix: add streamer

Browse files
Files changed (2) hide show
  1. load_models.py +7 -3
  2. main.py +2 -3
load_models.py CHANGED
@@ -22,7 +22,7 @@ torch.set_grad_enabled(False)
22
  from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH
23
 
24
 
25
- def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
26
  """
27
  Load a GGUF/GGML quantized model using LlamaCpp.
28
 
@@ -56,13 +56,17 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, loggin
56
  "model_path": model_path,
57
  "n_ctx": CONTEXT_WINDOW_SIZE,
58
  "max_tokens": MAX_NEW_TOKENS,
59
- "n_batch": N_BATCH, # set this based on your GPU & CPU RAM
 
60
  }
61
  if device_type.lower() == "mps":
62
  kwargs["n_gpu_layers"] = 1
63
  if device_type.lower() == "cuda":
64
  kwargs["n_gpu_layers"] = N_GPU_LAYERS # set this based on your GPU
65
 
 
 
 
66
  return LlamaCpp(**kwargs)
67
  except:
68
  if "ggml" in model_basename:
@@ -185,7 +189,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging, stre
185
 
186
  if model_basename is not None:
187
  if ".gguf" in model_basename.lower():
188
- llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
189
  return llm
190
  elif ".ggml" in model_basename.lower():
191
  model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
 
22
  from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH
23
 
24
 
25
+ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging, stream = False):
26
  """
27
  Load a GGUF/GGML quantized model using LlamaCpp.
28
 
 
56
  "model_path": model_path,
57
  "n_ctx": CONTEXT_WINDOW_SIZE,
58
  "max_tokens": MAX_NEW_TOKENS,
59
+ "n_batch": N_BATCH,
60
+ # set this based on your GPU & CPU RAM
61
  }
62
  if device_type.lower() == "mps":
63
  kwargs["n_gpu_layers"] = 1
64
  if device_type.lower() == "cuda":
65
  kwargs["n_gpu_layers"] = N_GPU_LAYERS # set this based on your GPU
66
 
67
+ #add stream
68
+ kwargs["stream"] = stream
69
+
70
  return LlamaCpp(**kwargs)
71
  except:
72
  if "ggml" in model_basename:
 
189
 
190
  if model_basename is not None:
191
  if ".gguf" in model_basename.lower():
192
+ llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING, stream)
193
  return llm
194
  elif ".ggml" in model_basename.lower():
195
  model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
main.py CHANGED
@@ -42,7 +42,8 @@ DB = Chroma(
42
 
43
  RETRIEVER = DB.as_retriever()
44
 
45
- models = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME, stream=False)
 
46
  LLM = models[0]
47
  STREAMER = models[1]
48
 
@@ -164,8 +165,6 @@ async def predict(data: Predict):
164
  global QA
165
  user_prompt = data.prompt
166
  if user_prompt:
167
- # print(f'User Prompt: {user_prompt}')
168
- # Get the answer from the chain
169
  res = QA(user_prompt)
170
 
171
  print(res)
 
42
 
43
  RETRIEVER = DB.as_retriever()
44
 
45
+ models = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME, stream=True)
46
+
47
  LLM = models[0]
48
  STREAMER = models[1]
49
 
 
165
  global QA
166
  user_prompt = data.prompt
167
  if user_prompt:
 
 
168
  res = QA(user_prompt)
169
 
170
  print(res)