Spaces:
Paused
Paused
Daniel Marques
committed on
Commit
·
8fa0233
1
Parent(s):
2ea73cf
fix: add streamer
Browse files
- load_models.py +7 -3
- main.py +2 -3
load_models.py
CHANGED
@@ -22,7 +22,7 @@ torch.set_grad_enabled(False)
|
|
22 |
from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH
|
23 |
|
24 |
|
25 |
-
def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
|
26 |
"""
|
27 |
Load a GGUF/GGML quantized model using LlamaCpp.
|
28 |
|
@@ -56,13 +56,17 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, loggin
|
|
56 |
"model_path": model_path,
|
57 |
"n_ctx": CONTEXT_WINDOW_SIZE,
|
58 |
"max_tokens": MAX_NEW_TOKENS,
|
59 |
-
"n_batch": N_BATCH,
|
|
|
60 |
}
|
61 |
if device_type.lower() == "mps":
|
62 |
kwargs["n_gpu_layers"] = 1
|
63 |
if device_type.lower() == "cuda":
|
64 |
kwargs["n_gpu_layers"] = N_GPU_LAYERS # set this based on your GPU
|
65 |
|
|
|
|
|
|
|
66 |
return LlamaCpp(**kwargs)
|
67 |
except:
|
68 |
if "ggml" in model_basename:
|
@@ -185,7 +189,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging, stre
|
|
185 |
|
186 |
if model_basename is not None:
|
187 |
if ".gguf" in model_basename.lower():
|
188 |
-
llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
|
189 |
return llm
|
190 |
elif ".ggml" in model_basename.lower():
|
191 |
model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
|
|
|
22 |
from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH
|
23 |
|
24 |
|
25 |
+
def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging, stream = False):
|
26 |
"""
|
27 |
Load a GGUF/GGML quantized model using LlamaCpp.
|
28 |
|
|
|
56 |
"model_path": model_path,
|
57 |
"n_ctx": CONTEXT_WINDOW_SIZE,
|
58 |
"max_tokens": MAX_NEW_TOKENS,
|
59 |
+
"n_batch": N_BATCH,
|
60 |
+
# set this based on your GPU & CPU RAM
|
61 |
}
|
62 |
if device_type.lower() == "mps":
|
63 |
kwargs["n_gpu_layers"] = 1
|
64 |
if device_type.lower() == "cuda":
|
65 |
kwargs["n_gpu_layers"] = N_GPU_LAYERS # set this based on your GPU
|
66 |
|
67 |
+
#add stream
|
68 |
+
kwargs["stream"] = stream
|
69 |
+
|
70 |
return LlamaCpp(**kwargs)
|
71 |
except:
|
72 |
if "ggml" in model_basename:
|
|
|
189 |
|
190 |
if model_basename is not None:
|
191 |
if ".gguf" in model_basename.lower():
|
192 |
+
llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING, stream)
|
193 |
return llm
|
194 |
elif ".ggml" in model_basename.lower():
|
195 |
model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
|
main.py
CHANGED
@@ -42,7 +42,8 @@ DB = Chroma(
|
|
42 |
|
43 |
RETRIEVER = DB.as_retriever()
|
44 |
|
45 |
-
models = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME, stream=
|
|
|
46 |
LLM = models[0]
|
47 |
STREAMER = models[1]
|
48 |
|
@@ -164,8 +165,6 @@ async def predict(data: Predict):
|
|
164 |
global QA
|
165 |
user_prompt = data.prompt
|
166 |
if user_prompt:
|
167 |
-
# print(f'User Prompt: {user_prompt}')
|
168 |
-
# Get the answer from the chain
|
169 |
res = QA(user_prompt)
|
170 |
|
171 |
print(res)
|
|
|
42 |
|
43 |
RETRIEVER = DB.as_retriever()
|
44 |
|
45 |
+
models = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME, stream=True)
|
46 |
+
|
47 |
LLM = models[0]
|
48 |
STREAMER = models[1]
|
49 |
|
|
|
165 |
global QA
|
166 |
user_prompt = data.prompt
|
167 |
if user_prompt:
|
|
|
|
|
168 |
res = QA(user_prompt)
|
169 |
|
170 |
print(res)
|