torVik committed
Commit 8a3a2cd (verified) · 1 parent: 8af0d2d

Upload 3 files

Files changed (3)
  1. app.py +371 -0
  2. requirements.txt +9 -0
  3. style.css +17 -0
app.py ADDED
@@ -0,0 +1,371 @@
+ #!/usr/bin/env python
+
+ import os
+ from threading import Thread
+ from typing import Iterator
+
+ import gradio as gr
+ import spaces
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+ # Debugging: Start script
+ print("Starting script...")
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ if HF_TOKEN is None:
+     print("Warning: HF_TOKEN is not set!")
+
+ PASSWORD = os.getenv("APP_PASSWORD", "mysecretpassword")  # Set your desired password here or via environment variable
+
+ DESCRIPTION = "# FT of Llama"
+
+ if not torch.cuda.is_available():
+     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+     print("Warning: No GPU available. This model cannot run on CPU.")
+ else:
+     print("GPU is available!")
+
+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+ # Debugging: GPU check passed, loading model
+ if torch.cuda.is_available():
+     model_id = "BGLAW/llama-3-8b-Instruct-bglawinsv1UNS_merged"
+     try:
+         print("Loading model...")
+         model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", token=HF_TOKEN)
+         print("Model loaded successfully!")
+
+         print("Loading tokenizer...")
+         tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
+         print("Tokenizer loaded successfully!")
+     except Exception as e:
+         print(f"Error loading model or tokenizer: {e}")
+         raise e  # Re-raise the error after logging it
+
+
+ @spaces.GPU
+ def generate(
+     message: str,
+     chat_history: list[tuple[str, str]],
+     max_new_tokens: int = 1024,
+     temperature: float = 0.6,
+     top_p: float = 0.9,
+     top_k: int = 50,
+     repetition_penalty: float = 1.2,
+ ) -> Iterator[str]:
+     print(f"Received message: {message}")
+     print(f"Chat history: {chat_history}")
+
+     # Rebuild the full conversation in the chat-template format expected by the tokenizer.
+     conversation = []
+     for user, assistant in chat_history:
+         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+     conversation.append({"role": "user", "content": message})
+
+     try:
+         print("Tokenizing input...")
+         input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+         print(f"Input tokenized: {input_ids.shape}")
+
+         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+             gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+             print("Trimmed input tokens due to length.")
+
+         input_ids = input_ids.to(model.device)
+         print("Input moved to the model's device.")
+
+         streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+         generate_kwargs = dict(
+             {"input_ids": input_ids},
+             streamer=streamer,
+             max_new_tokens=max_new_tokens,
+             do_sample=True,
+             top_p=top_p,
+             top_k=top_k,
+             temperature=temperature,
+             num_beams=1,
+             repetition_penalty=repetition_penalty,
+         )
+
+         print("Starting generation...")
+         # Generation runs in a background thread; the streamer yields decoded text chunks as they are produced.
+         t = Thread(target=model.generate, kwargs=generate_kwargs)
+         t.start()
+         print("Thread started for model generation.")
+
+         outputs = []
+         for text in streamer:
+             outputs.append(text)
+             print(f"Generated text so far: {''.join(outputs)}")
+             yield "".join(outputs)
+
+     except Exception as e:
+         print(f"Error during generation: {e}")
+         raise e  # Re-raise the error after logging it
+
+
+ def password_auth(password):
+     # On success, show the chat area and hide the login form; on failure, keep the
+     # login form visible and show the error message.
+     if password == PASSWORD:
+         return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
+     else:
+         return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True, value="Incorrect password. Try again.")
+
+
+ chat_interface = gr.ChatInterface(
+     fn=generate,
+     additional_inputs=[
+         gr.Slider(
+             label="Max new tokens",
+             minimum=1,
+             maximum=MAX_MAX_NEW_TOKENS,
+             step=1,
+             value=DEFAULT_MAX_NEW_TOKENS,
+         ),
+         gr.Slider(
+             label="Temperature",
+             minimum=0.1,
+             maximum=4.0,
+             step=0.1,
+             value=0.6,
+         ),
+         gr.Slider(
+             label="Top-p (nucleus sampling)",
+             minimum=0.05,
+             maximum=1.0,
+             step=0.05,
+             value=0.9,
+         ),
+         gr.Slider(
+             label="Top-k",
+             minimum=1,
+             maximum=1000,
+             step=1,
+             value=50,
+         ),
+         gr.Slider(
+             label="Repetition penalty",
+             minimum=1.0,
+             maximum=2.0,
+             step=0.05,
+             value=1.2,
+         ),
+     ],
+     stop_btn=None,
+     examples=[
+         ["Hello there! How are you doing?"],
+         ["Can you briefly explain what the Python programming language is?"],
+         ["Explain the plot of Cinderella in a sentence."],
+         ["How many hours does it take a man to eat a Helicopter?"],
+         ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
+     ],
+ )
+
+ # Debugging: Interface setup
+ print("Setting up interface...")
+
+ with gr.Blocks(css="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+
+     # Create login components
+     with gr.Row(visible=True) as login_area:
+         password_input = gr.Textbox(
+             label="Enter Password", type="password", placeholder="Password", show_label=True
+         )
+         login_btn = gr.Button("Submit")
+         incorrect_password_msg = gr.Markdown("Incorrect password. Try again.", visible=False)
+
+     # Main chat interface (hidden until the password check succeeds)
+     with gr.Column(visible=False) as chat_area:
+         gr.Markdown(DESCRIPTION)
+         gr.DuplicateButton(
+             value="Duplicate Space for private use",
+             elem_id="duplicate-button",
+             visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+         )
+         chat_interface.render()
+
+     # Bind login button to check the password and toggle the two areas
+     login_btn.click(password_auth, inputs=password_input, outputs=[chat_area, login_area, incorrect_password_msg])
+
+ # Debugging: Starting queue and launching the demo
+ print("Launching demo...")
+
+ if __name__ == "__main__":
+     demo.queue(max_size=20).launch(share=True)
+
+
+ # (Lines 199-371: a commented-out "# WORKING" copy of an earlier version of this
+ # script, identical to the code above except that it used
+ # model_id = "mistralai/Mistral-7B-Instruct-v0.2", DESCRIPTION = "# Mistral-7B v0.2",
+ # and rendered the chat interface directly, with no password gate.)
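For reference, the snippet below is a minimal sketch (not part of the commit) of the streaming pattern app.py relies on: model.generate runs in a background thread while TextIteratorStreamer yields decoded text chunks to the consumer. The model id and the from_pretrained/apply_chat_template calls mirror app.py above; the prompt and the generation settings are illustrative assumptions.

#!/usr/bin/env python
# Minimal sketch: stream tokens from the same model outside Gradio.
import os
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "BGLAW/llama-3-8b-Instruct-bglawinsv1UNS_merged"  # model used in app.py
token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto", token=token
)

# A single-turn conversation; the prompt is an arbitrary example.
conversation = [{"role": "user", "content": "Explain the plot of Cinderella in a sentence."}]
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

# generate() blocks, so run it in a thread and consume the streamer in the main thread.
Thread(target=model.generate, kwargs=kwargs).start()
for chunk in streamer:
    print(chunk, end="", flush=True)
print()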
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ huggingface_hub
+ accelerate==0.31.0
+ bitsandbytes==0.43.1
+ gradio==4.36.1
+ scipy==1.13.0
+ sentencepiece==0.2.0
+ spaces==0.28.3
+ torch==2.0.1
+ transformers==4.41.2
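If the runtime environment drifts from these pins, a quick check like the sketch below can confirm what is actually installed. This is not part of the commit; the package names and versions are taken from requirements.txt above.

# Illustrative version check against the pins in requirements.txt.
from importlib.metadata import version

pins = {
    "gradio": "4.36.1",
    "torch": "2.0.1",
    "transformers": "4.41.2",
    "accelerate": "0.31.0",
}

for name, expected in pins.items():
    installed = version(name)
    flag = "" if installed == expected else f"  (expected {expected})"
    print(f"{name}=={installed}{flag}")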
style.css ADDED
@@ -0,0 +1,17 @@
+ h1 {
+   text-align: center;
+   display: block;
+ }
+
+ #duplicate-button {
+   margin: auto;
+   color: white;
+   background: #1565c0;
+   border-radius: 100vh;
+ }
+
+ .contain {
+   max-width: 900px;
+   margin: auto;
+   padding-top: 1.5rem;
+ }