Daniel Marques committed on
Commit
fe3defb
·
1 Parent(s): 416e7fd

feat: add websocket

Browse files
Files changed (4) hide show
  1. constants.py +2 -2
  2. load_models.py +0 -2
  3. main.py +1 -0
  4. prompt_template_utils.py +6 -1
constants.py CHANGED
@@ -37,8 +37,8 @@ MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)
37
 
38
  #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
39
 
40
- N_GPU_LAYERS = 40 # Llama-2-70B has 83 layers
41
- N_BATCH = 1024
42
 
43
  ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
44
  # N_GPU_LAYERS = 20
 
37
 
38
  #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
39
 
40
+ N_GPU_LAYERS = 100 # Llama-2-70B has 83 layers
41
+ N_BATCH = CONTEXT_WINDOW_SIZE
42
 
43
  ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
44
  # N_GPU_LAYERS = 20
load_models.py CHANGED
@@ -58,8 +58,6 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, loggin
58
  "model_path": model_path,
59
  "n_ctx": CONTEXT_WINDOW_SIZE,
60
  "max_tokens": MAX_NEW_TOKENS,
61
- "n_batch": MAX_NEW_TOKENS,
62
-
63
  # set this based on your GPU & CPU RAM
64
  }
65
  if device_type.lower() == "mps":
 
58
  "model_path": model_path,
59
  "n_ctx": CONTEXT_WINDOW_SIZE,
60
  "max_tokens": MAX_NEW_TOKENS,
 
 
61
  # set this based on your GPU & CPU RAM
62
  }
63
  if device_type.lower() == "mps":
main.py CHANGED
@@ -51,6 +51,7 @@ QA = RetrievalQA.from_chain_type(
51
  return_source_documents=SHOW_SOURCES,
52
  chain_type_kwargs={
53
  "prompt": prompt,
 
54
  },
55
  )
56
 
 
51
  return_source_documents=SHOW_SOURCES,
52
  chain_type_kwargs={
53
  "prompt": prompt,
54
+ "memory": memory
55
  },
56
  )
57
 
prompt_template_utils.py CHANGED
@@ -6,6 +6,11 @@ This seems to have significant impact on the output of the LLM.
6
 
7
  from langchain.memory import ConversationBufferMemory
8
  from langchain.prompts import PromptTemplate
 
 
 
 
 
9
 
10
  # this is specific to Llama-2.
11
 
@@ -84,7 +89,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
84
  )
85
  prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
86
 
87
- memory = ConversationBufferMemory(input_key="question", memory_key="history")
88
 
89
  return (
90
  prompt,
 
6
 
7
  from langchain.memory import ConversationBufferMemory
8
  from langchain.prompts import PromptTemplate
9
+ from langchain.memory.chat_message_histories import RedisChatMessageHistory
10
+
11
+ message_history = RedisChatMessageHistory(
12
+ url="redis://localhost:6379/1", ttl=600, session_id="my-session"
13
+ )
14
 
15
  # this is specific to Llama-2.
16
 
 
89
  )
90
  prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
91
 
92
+ memory = ConversationBufferMemory(input_key="question", memory_key="history", chat_memory=message_history)
93
 
94
  return (
95
  prompt,