desert committed on
Commit
eb2e235
1 Parent(s): 7b6b9cb

init inference

Files changed (1)
  1. app.py +45 -54
app.py CHANGED
@@ -1,75 +1,66 @@
 
 
  import gradio as gr
- from llama_cpp import Llama
  from huggingface_hub import hf_hub_download

- # Model identifier from Hugging Face
- adapter_repo = "Mat17892/llama_lora_gguf"  # Hugging Face model ID
-
- # Download the GGUF file from Hugging Face
- lora_adapter_path = hf_hub_download(repo_id=adapter_repo, filename="llama_lora_adapter.gguf")
-
- from huggingface_hub import hf_hub_download

  # Download the base model GGUF file
- base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
  base_model_path = hf_hub_download(repo_id=base_model_repo, filename="Llama-3.2-3B-Instruct-Q8_0.gguf")

- # Load the base model
- print("Loading base model...")
- llm = Llama(model_path=base_model_path, n_ctx=2048, n_threads=8)
-
- # Apply the LoRA adapter
- print("Applying LoRA adapter...")
- llm.load_adapter(adapter_path=lora_adapter_path)
-
- print("Model ready with LoRA adapter!")
-
- # Chat function
- def chat_with_model(user_input, chat_history):
-     """
-     Process user input and generate a response from the model.
-     :param user_input: User's input string
-     :param chat_history: List of [user_message, ai_response] pairs
-     :return: Updated chat history
-     """
-     # Construct the prompt from chat history
      prompt = ""
      for user, ai in chat_history:
          prompt += f"User: {user}\nAI: {ai}\n"
-     prompt += f"User: {user_input}\nAI:"  # Add the latest user input

-     # Generate response from the model
-     raw_response = llm(prompt)["choices"][0]["text"].strip()

-     # Clean the response (remove extra tags, if any)
-     response = raw_response.split("User:")[0].strip()
-
-     # Update chat history with the new turn
      chat_history.append((user_input, response))
      return chat_history, chat_history

-
- # Gradio UI
  with gr.Blocks() as demo:
-     gr.Markdown("# 🦙 LLaMA GGUF Chatbot")
-     chatbot = gr.Chatbot(label="Chat with the GGUF Model")

      with gr.Row():
          with gr.Column(scale=4):
              user_input = gr.Textbox(label="Your Message", placeholder="Type a message...")
-         with gr.Column(scale=1):
-             submit_btn = gr.Button("Send")
-
-     chat_history = gr.State([])
-
-     # Link components
-     submit_btn.click(
-         chat_with_model,
-         inputs=[user_input, chat_history],
-         outputs=[chatbot, chat_history],
-         show_progress=True,
-     )
-
- # Launch the app
- demo.launch()
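The `llm.load_adapter(...)` call removed here is likely the reason for this commit: recent llama-cpp-python releases expose no such method, and instead take a LoRA adapter via the `lora_path` argument of the `Llama` constructor. For reference, a minimal in-process sketch under that assumption (reusing the paths downloaded above) would look like the following, before turning to the `llama-cli` subprocess approach the commit actually adopts:

    # Sketch only (not part of the commit): load the adapter in-process via
    # llama-cpp-python's lora_path constructor argument.
    from llama_cpp import Llama

    llm = Llama(
        model_path=base_model_path,   # base Llama-3.2-3B-Instruct GGUF
        lora_path=lora_adapter_path,  # LoRA adapter GGUF
        n_ctx=2048,
        n_threads=8,
    )
    out = llm("User: Hello!\nAI:", max_tokens=128)["choices"][0]["text"]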
 
+ import os
+ import subprocess
  import gradio as gr
  from huggingface_hub import hf_hub_download

+ # Hugging Face repository IDs
+ base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
+ adapter_repo = "Mat17892/llama_lora_gguf"

  # Download the base model GGUF file
+ print("Downloading base model...")
  base_model_path = hf_hub_download(repo_id=base_model_repo, filename="Llama-3.2-3B-Instruct-Q8_0.gguf")

+ # Download the LoRA adapter GGUF file
+ print("Downloading LoRA adapter...")
+ lora_adapter_path = hf_hub_download(repo_id=adapter_repo, filename="llama_lora_adapter.gguf")

+ # Function to run `llama-cli` with base model and adapter
+ def run_llama_cli(prompt):
+     print("Running inference with llama-cli...")
+     cmd = [
+         "./llama-cli",
+         "-c", "2048",  # Context length
+         "-cnv",        # Enable conversational mode
+         "-m", base_model_path,
+         "--lora", lora_adapter_path,
+         "--prompt", prompt,
+     ]
+     try:
+         process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+         stdout, stderr = process.communicate()
+
+         if process.returncode != 0:
+             print("Error during inference:")
+             print(stderr.decode())
+             return "Error: Could not generate response."
+
+         return stdout.decode().strip()
+     except Exception as e:
+         print(f"Exception occurred: {e}")
+         return "Error: Could not generate response."
+
+ # Gradio interface
+ def chatbot_fn(user_input, chat_history):
+     # Build the full chat history as the prompt
      prompt = ""
      for user, ai in chat_history:
          prompt += f"User: {user}\nAI: {ai}\n"
+     prompt += f"User: {user_input}\nAI:"  # Add latest user input

+     # Generate response using llama-cli
+     response = run_llama_cli(prompt)

+     # Update chat history
      chat_history.append((user_input, response))
      return chat_history, chat_history

+ # Build the Gradio UI
  with gr.Blocks() as demo:
+     gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter")
+     chatbot = gr.Chatbot(label="Chat with the Model")

      with gr.Row():
          with gr.Column(scale=4):
              user_input = gr.Textbox(label="Your Message", placeholder="Type a message...")
+         with gr.Column(scale=1):
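The diff is truncated at this point; the rest of the new UI wiring is not shown. Judging from the previous version of the file, the remainder presumably mirrors the old layout with the click handler pointed at the new `chatbot_fn`, roughly like this (hypothetical reconstruction, not the committed code):

    # Hypothetical continuation of the truncated diff, mirroring the old wiring.
            submit_btn = gr.Button("Send")

        chat_history = gr.State([])

        submit_btn.click(
            chatbot_fn,
            inputs=[user_input, chat_history],
            outputs=[chatbot, chat_history],
        )

    demo.launch()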
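One caveat on the new `run_llama_cli` helper: `-cnv` asks `llama-cli` for interactive conversation mode, which reads from stdin, so a plain `Popen(...).communicate()` with no input stream may block rather than return. A hedged sketch of a one-shot, non-interactive call (standard `llama-cli` flags; the `-n` token cap and the timeout are illustrative values, not from the commit):

    # Sketch only: one-shot llama-cli call with a timeout so a stuck process
    # cannot hang the Gradio handler indefinitely.
    import subprocess

    def run_llama_cli_once(prompt: str) -> str:
        result = subprocess.run(
            [
                "./llama-cli",
                "-m", base_model_path,
                "--lora", lora_adapter_path,
                "-c", "2048",  # context length
                "-n", "256",   # illustrative cap on generated tokens
                "-p", prompt,  # plain prompt mode, no -cnv
            ],
            capture_output=True,
            text=True,
            timeout=300,  # illustrative timeout in seconds
        )
        if result.returncode != 0:
            return "Error: Could not generate response."
        return result.stdout.strip()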