TobDeBer committed on
Commit 2b8d5aa
1 parent: c64c2a3

cloud version

Files changed (1)
  1. app_cloud.py +180 -0
app_cloud.py ADDED
@@ -0,0 +1,180 @@
+ import llama_cpp
+ import os
+ import json
+ import subprocess
+ from llama_cpp import Llama
+ from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
+ from llama_cpp_agent.chat_history import BasicChatHistory
+ from llama_cpp_agent.chat_history.messages import Roles
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+
+ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
+
+ hf_hub_download(
+     repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
+     filename="qwen2-0_5b-instruct-q4_k_m.gguf",
+     local_dir="./models"
+ )
+
+ hf_hub_download(
+     repo_id="TobDeBer/Meta-Llama-3.1-8B-Instruct-Q4_K_M-GGUF",
+     filename="meta-llama-3.1-8b-instruct-q4_k_m.gguf",
+     local_dir="./models",
+     token=huggingface_token
+ )
+ # 5GB
+
+ # RichardErkhov/ibm-granite_-_granite-7b-base-gguf
+ # granite-7b-base.Q4_K_M.gguf
+ hf_hub_download(
+     repo_id="RichardErkhov/ibm-granite_-_granite-7b-base-gguf",
+     filename="granite-7b-base.Q4_K_M.gguf",
+     local_dir="./models",
+     token=huggingface_token
+ )  # 4GB
+
+ # TobDeBer/granite-8b-code-instruct-128k-Q4_K_M-GGUF
+ # granite-8b-code-instruct-128k-q4_k_m.gguf
+ hf_hub_download(
+     repo_id="TobDeBer/granite-8b-code-instruct-128k-Q4_K_M-GGUF",
+     filename="granite-8b-code-instruct-128k-q4_k_m.gguf",
+     local_dir="./models",
+     token=huggingface_token
+ )  # 5GB
+
+ # Dropdown for model selection
+ model_dropdown = gr.Dropdown(
+     [
+         'qwen2-0_5b-instruct-q4_k_m.gguf',
+         'meta-llama-3.1-8b-instruct-q4_k_m.gguf',
+         'granite-7b-base.Q4_K_M.gguf',
+         'granite-8b-code-instruct-128k-q4_k_m.gguf',
+     ],
+     value="qwen2-0_5b-instruct-q4_k_m.gguf",
+     label="Model"
+ )
+
+ llm = None
+ llm_model = None
+
+ def respond(
+     message,
+     history: list[tuple[str, str]],
+     system_message,
+     max_tokens,
+     temperature,
+     top_p,
+     top_k,
+     repeat_penalty,
+     selected_model,  # received from the model dropdown in the interface
+ ):
+     chat_template = MessagesFormatterType.GEMMA_2
+
+     global llm
+     global llm_model
+
+     # Reload the model only if the selection has changed
+     if llm is None or llm_model != selected_model:
+         llm = Llama(
+             model_path=f"models/{selected_model}",
+             flash_attn=True,
+             n_gpu_layers=81,
+             n_batch=1024,
+             n_ctx=8192,
+         )
+         llm_model = selected_model
+
+     provider = LlamaCppPythonProvider(llm)
+
+     agent = LlamaCppAgent(
+         provider,
+         system_prompt=f"{system_message}",
+         predefined_messages_formatter_type=chat_template,
+         debug_output=True
+     )
+
+     settings = provider.get_provider_default_settings()
+     settings.temperature = temperature
+     settings.top_k = top_k
+     settings.top_p = top_p
+     settings.max_tokens = max_tokens
+     settings.repeat_penalty = repeat_penalty
+     settings.stream = True
+
+     messages = BasicChatHistory()
+
+     for user_msg, assistant_msg in history:
+         user = {
+             'role': Roles.user,
+             'content': user_msg
+         }
+         assistant = {
+             'role': Roles.assistant,
+             'content': assistant_msg
+         }
+         messages.add_message(user)
+         messages.add_message(assistant)
+
+     stream = agent.get_chat_response(
+         message,
+         llm_sampling_settings=settings,
+         chat_history=messages,
+         returns_streaming_generator=True,
+         print_output=False
+     )
+
+     outputs = ""
+     for output in stream:
+         outputs += output
+         yield outputs
+
+ description = """<p align="center">Defaults to Qwen 500M</p>
+ """
+
+ # Create the Gradio interface
+ with gr.Blocks() as demo:
+
+     # Model selection dropdown above the chat
+     model_dropdown.render()
+
+     # Main chat interface
+     chat_interface = gr.ChatInterface(
+         respond,
+         additional_inputs=[
+             gr.Textbox(value="You are a helpful assistant.", label="System message"),
+             gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
+             gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+             gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.95,
+                 step=0.05,
+                 label="Top-p",
+             ),
+             gr.Slider(
+                 minimum=0,
+                 maximum=100,
+                 value=40,
+                 step=1,
+                 label="Top-k",
+             ),
+             gr.Slider(
+                 minimum=0.0,
+                 maximum=2.0,
+                 value=1.1,
+                 step=0.1,
+                 label="Repetition penalty",
+             ),
+             model_dropdown  # pass the dropdown directly as an additional input
+         ],
+         retry_btn="Retry",
+         undo_btn="Undo",
+         clear_btn="Clear",
+         submit_btn="Send",
+         title="Chat with Qwen 2 and friends using llama.cpp",
+         description=description,
+     )
+
+ demo.queue().launch()
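
For local testing, a minimal launch sketch is given below. It is not part of the commit: the pip package names (llama-cpp-python for the llama_cpp import, plus llama-cpp-agent, gradio and huggingface_hub) are inferred from the imports above, HUGGINGFACE_TOKEN is the variable the script reads via os.getenv and passes to hf_hub_download, and the disk-space figure comes from the size comments in the file.

# Hypothetical local run; assumes the inferred dependencies are installed
# (e.g. pip install llama-cpp-python llama-cpp-agent gradio huggingface_hub)
# and roughly 14 GB of free disk space for the GGUF downloads.
import os
import subprocess

os.environ["HUGGINGFACE_TOKEN"] = "hf_..."  # placeholder; the token app_cloud.py reads via os.getenv
subprocess.run(["python", "app_cloud.py"], check=True)  # downloads the models, then serves the Gradio UI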