Mattral committed on
Commit
4b3e764
1 Parent(s): d16e462

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -0
app.py CHANGED
@@ -15,6 +15,8 @@ print("Embedding model loaded...")
15
 
16
  # Load the LLM
17
  callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
 
 
18
  llm = AutoModelForCausalLM.from_pretrained(
19
  "TheBloke/Llama-2-7B-Chat-GGUF",
20
  model_file="llama-2-7b-chat.Q3_K_S.gguf",
@@ -23,6 +25,16 @@ llm = AutoModelForCausalLM.from_pretrained(
23
  repetition_penalty=1.5,
24
  max_new_tokens=300,
25
  )
 
 
 
 
 
 
 
 
 
 
26
  print("LLM loaded...")
27
 
28
  client = QdrantClient(path="./db")
 
15
 
16
  # Load the LLM
17
  callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
18
+
19
+ '''
20
  llm = AutoModelForCausalLM.from_pretrained(
21
  "TheBloke/Llama-2-7B-Chat-GGUF",
22
  model_file="llama-2-7b-chat.Q3_K_S.gguf",
 
25
  repetition_penalty=1.5,
26
  max_new_tokens=300,
27
  )
28
+ '''
29
+ llm = LlamaCpp(
30
+ model_path="./llama-2-7b-chat.Q3_K_S.gguf",
31
+ temperature = 0.2,
32
+ n_ctx=2048,
33
+ f16_kv=True, # MUST set to True, otherwise you will run into problem after a couple of calls
34
+ max_tokens = 500,
35
+ callback_manager=callback_manager,
36
+ verbose=True,
37
+ )
38
  print("LLM loaded...")
39
 
40
  client = QdrantClient(path="./db")