sasan committed
Commit 676fdbb
1 Parent(s): 411ba15

chore: Add local backend option for LLM

Files changed (2)
  1. kitt/core/model.py +57 -1
  2. main.py +5 -2
kitt/core/model.py CHANGED
@@ -345,6 +345,58 @@ def run_inference_ollama(prompt):
     return res
 
 
+def load_gpu_model():
+    import bitsandbytes
+    import flash_attn
+    from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        "NousResearch/Hermes-2-Pro-Llama-3-8B", trust_remote_code=True
+    )
+    model = LlamaForCausalLM.from_pretrained(
+        "NousResearch/Hermes-2-Pro-Llama-3-8B",
+        torch_dtype=torch.float16,
+        device_map="auto",
+        load_in_8bit=False,
+        load_in_4bit=True,
+        use_flash_attention_2=True,
+    )
+
+    return model, tokenizer
+
+
+try:
+    model, tokenizer = load_gpu_model()
+except Exception as e:
+    logger.error(f"Could not load model: {e}")
+    model, tokenizer = None, None
+
+
+def run_inference_local(prompt):
+    """Run inference on a local model using Hugging Face transformers."""
+
+    if not model:
+        logger.error("Model not loaded. Exiting.")
+        raise ValueError("Model not loaded. Exiting.")
+
+    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
+    generated_ids = model.generate(
+        input_ids,
+        max_new_tokens=1500,
+        temperature=TEMPERATURE,
+        repetition_penalty=REPEAT_PENALTY,
+        do_sample=True,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+    response = tokenizer.decode(
+        generated_ids[0][input_ids.shape[-1] :],
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=True,
+    )
+
+    return response
+
+
 def run_inference(prompt, backend="ollama"):
     prompt += AI_PREAMBLE
 
@@ -352,8 +404,12 @@ def run_inference(prompt, backend="ollama"):
 
     if backend == "ollama":
         output = run_inference_ollama(prompt)
-    else:
+    elif backend == "replicate":
         output = run_inference_replicate(prompt)
+    elif backend == "local":
+        output = run_inference_local(prompt)
+    else:
+        raise ValueError(f"Backend {backend} not supported")
 
     logger.debug(f"Response from model: {output}")
     return output
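
Note on the new load_gpu_model(): recent transformers releases deprecate passing load_in_4bit= and use_flash_attention_2= directly to from_pretrained() in favor of an explicit BitsAndBytesConfig and the attn_implementation argument. A minimal equivalent sketch, should this path need updating later (same model id as in the diff; the function name load_gpu_model_quantized is hypothetical):

import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, LlamaForCausalLM

def load_gpu_model_quantized():
    # Hypothetical variant of load_gpu_model() above: same 4-bit setup,
    # expressed through the non-deprecated quantization/attention APIs.
    quant_config = BitsAndBytesConfig(load_in_4bit=True)
    tokenizer = AutoTokenizer.from_pretrained(
        "NousResearch/Hermes-2-Pro-Llama-3-8B", trust_remote_code=True
    )
    model = LlamaForCausalLM.from_pretrained(
        "NousResearch/Hermes-2-Pro-Llama-3-8B",
        torch_dtype=torch.float16,
        device_map="auto",
        quantization_config=quant_config,  # replaces load_in_4bit=True
        attn_implementation="flash_attention_2",  # replaces use_flash_attention_2=True
    )
    return model, tokenizer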
main.py CHANGED
@@ -198,7 +198,8 @@ def set_tts_enabled(tts_enabled, state):
 
 
 def set_llm_backend(llm_backend, state):
-    new_llm_backend = "ollama" if llm_backend == "Ollama" else "replicate"
+    assert llm_backend in ["Ollama", "Replicate", "Local"], "Invalid LLM backend"
+    new_llm_backend = llm_backend.lower()
     logger.info(
         f"LLM backend was {state['llm_backend']} and changed to {new_llm_backend}"
     )
@@ -283,6 +284,8 @@ def create_demo(tts_server: bool = False, model="llama3"):
     global_context["map"] = plot
 
     with gr.Row():
+        # with gr.Row():
+        #     gr.Text("KITT", interactive=False)
         with gr.Column(scale=1, min_width=300):
             vehicle_status = gr.JSON(
                 value=vehicle.model_dump(), label="Vehicle status"
@@ -375,7 +378,7 @@ def create_demo(tts_server: bool = False, model="llama3"):
                 interactive=True,
             )
             llm_backend = gr.Radio(
-                choices=["Ollama", "Replicate"],
+                choices=["Ollama", "Replicate", "Local"],
                 label="LLM Backend",
                 value=DEFAULT_LLM_BACKEND.title(),
                 interactive=True,
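
Taken together, the two files round-trip the backend choice: the radio labels in main.py are lower-cased by set_llm_backend() into the keys that run_inference() dispatches on. A quick smoke test, as a sketch only, assuming kitt.core.model imports cleanly and, for the local path, a CUDA device is present (run_inference_local() moves inputs to "cuda"; the prompt string is arbitrary):

from kitt.core.model import run_inference

# "ollama", "replicate", and "local" are the supported keys after this commit.
print(run_inference("Where is the nearest charging station?", backend="local"))

# Unknown backends now fail fast instead of silently falling back to Replicate.
try:
    run_inference("hello", backend="openai")
except ValueError as err:
    print(err)  # Backend openai not supported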