BenevolenceMessiah committed
Commit 0b1e565
Parent: 18ff4e4

Upload 7 files

Files changed (8):
  1. .gitattributes +1 -0
  2. Dockerfile +64 -64
  3. README.md +15 -15
  4. app.py +458 -380
  5. error.png +0 -0
  6. groups_merged.txt +0 -0
  7. llama.png +2 -2
  8. start.sh +4 -4
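
The same summary can be reproduced from a local clone of the Space repository; a minimal sketch, assuming a hypothetical clone URL (plain git, not part of this page):

```bash
# Sketch: inspect commit 0b1e565 from a local clone (clone URL is a placeholder)
git clone https://huggingface.co/spaces/<namespace>/<space-name>
cd <space-name>
git show --stat 0b1e565      # list of changed files with insertion/deletion counts
git show 0b1e565 -- app.py   # full diff for a single file, e.g. app.py
```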
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  llama.png filter=lfs diff=lfs merge=lfs -text
  imatrix_calibration.txt filter=lfs diff=lfs merge=lfs -text
+ error.png filter=lfs diff=lfs merge=lfs -text
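
The new rule is the kind of line `git lfs track` writes; a minimal sketch of how such an entry is normally produced (not part of this commit):

```bash
# Sketch: track the new binary with Git LFS; this appends the rule to .gitattributes
git lfs install
git lfs track "error.png"
git add .gitattributes error.png
```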
Dockerfile CHANGED
@@ -1,64 +1,64 @@
(The removed and re-added lines are identical; the rewrite appears to touch only whitespace or line endings, so the file content is shown once.)

FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
    git \
    git-lfs \
    wget \
    curl \
    # python build dependencies \
    build-essential \
    libssl-dev \
    zlib1g-dev \
    libbz2-dev \
    libreadline-dev \
    libsqlite3-dev \
    libncursesw5-dev \
    xz-utils \
    tk-dev \
    libxml2-dev \
    libxmlsec1-dev \
    libffi-dev \
    liblzma-dev \
    ffmpeg \
    nvidia-driver-515

RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:${PATH}
WORKDIR ${HOME}/app

RUN curl https://pyenv.run | bash
ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
ARG PYTHON_VERSION=3.10.13
RUN pyenv install ${PYTHON_VERSION} && \
    pyenv global ${PYTHON_VERSION} && \
    pyenv rehash && \
    pip install --no-cache-dir -U pip setuptools wheel && \
    pip install "huggingface-hub" "hf-transfer" "gradio[oauth]>=4.28.0" "gradio_huggingfacehub_search==0.0.7" "APScheduler"

COPY --chown=1000 . ${HOME}/app
RUN git clone https://github.com/ggerganov/llama.cpp
RUN pip install -r llama.cpp/requirements.txt

COPY groups_merged.txt ${HOME}/app/llama.cpp/

ENV PYTHONPATH=${HOME}/app \
    PYTHONUNBUFFERED=1 \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    TQDM_POSITION=-1 \
    TQDM_MININTERVAL=1 \
    SYSTEM=spaces \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
    PATH=/usr/local/nvidia/bin:${PATH}

ENTRYPOINT /bin/sh start.sh
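
On Spaces the image is built and started automatically; for local testing it can presumably be driven with plain Docker along these lines (a sketch: port 7860 is Gradio's default and `HF_TOKEN` is the variable app.py reads, neither is declared in the Dockerfile itself):

```bash
# Sketch: build and run the Space image locally (assumes the NVIDIA Container Toolkit)
docker build -t gguf-my-repo .
docker run --gpus all -p 7860:7860 -e HF_TOKEN=<your_hf_token> gguf-my-repo
```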
README.md CHANGED
@@ -1,15 +1,15 @@
(The removed and re-added lines are identical; the rewrite appears to touch only whitespace or line endings, so the file content is shown once.)

---
title: GGUF My Repo
emoji: 🦙
colorFrom: gray
colorTo: pink
sdk: docker
hf_oauth: true
hf_oauth_scopes:
  - read-repos
  - write-repos
  - manage-repos
pinned: false
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,381 +1,459 @@
Apart from the hunks summarized below, the previous revision (381 lines) and the updated file (459 lines) are identical; the full updated app.py follows the summary.

Generated model card intro:
- This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
+ The same line prefixed with "Asalamu Alaikum!", followed by the new "Description", "About GGUF", "Compatibility", "Explanation of quantisation methods" and "Provided Files" sections.

Default quantization method (q_method dropdown):
- value="Q4_K_M",
+ value="Q8_0",

Private Repo checkbox default:
- value=False,
+ value=True,

Interface title and description:
- title="Create your own GGUF Quants, blazingly fast ⚡!",
- description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
+ title="Asalamu Alaikum! Create your own GGUF Quantizations, B̶L̶A̶Z̶I̶N̶G̶L̶Y̶ ̶F̶A̶S̶T̶ ⚡! (Hey it's free!)",
+ description="The space takes a HuggingFace repo as an input, quantizes it and creates a private repo containing the selected quant under your HF user namespace.",
The full updated app.py:

import os
import shutil
import subprocess
import signal
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
import gradio as gr

from huggingface_hub import create_repo, HfApi
from huggingface_hub import snapshot_download
from huggingface_hub import whoami
from huggingface_hub import ModelCard

from gradio_huggingfacehub_search import HuggingfaceHubSearch

from apscheduler.schedulers.background import BackgroundScheduler

from textwrap import dedent

HF_TOKEN = os.environ.get("HF_TOKEN")

def generate_importance_matrix(model_path, train_data_path):
    imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"

    os.chdir("llama.cpp")

    print(f"Current working directory: {os.getcwd()}")
    print(f"Files in the current directory: {os.listdir('.')}")

    if not os.path.isfile(f"../{model_path}"):
        raise Exception(f"Model file not found: {model_path}")

    print("Running imatrix command...")
    process = subprocess.Popen(imatrix_command, shell=True)

    try:
        process.wait(timeout=60)  # added wait
    except subprocess.TimeoutExpired:
        print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
        process.send_signal(signal.SIGINT)
        try:
            process.wait(timeout=5)  # grace period
        except subprocess.TimeoutExpired:
            print("Imatrix process still didn't terminate. Forcefully killing the process...")
            process.kill()

    os.chdir("..")

    print("Importance matrix generation completed.")

def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
    if oauth_token.token is None:
        raise ValueError("You have to be logged in.")

    split_cmd = f"llama.cpp/llama-gguf-split --split --split-max-tensors {split_max_tensors}"
    if split_max_size:
        split_cmd += f" --split-max-size {split_max_size}"
    split_cmd += f" {model_path} {model_path.split('.')[0]}"

    print(f"Split command: {split_cmd}")

    result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
    print(f"Split command stdout: {result.stdout}")
    print(f"Split command stderr: {result.stderr}")

    if result.returncode != 0:
        raise Exception(f"Error splitting the model: {result.stderr}")
    print("Model split successfully!")

    sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
    if sharded_model_files:
        print(f"Sharded model files: {sharded_model_files}")
        api = HfApi(token=oauth_token.token)
        for file in sharded_model_files:
            file_path = os.path.join('.', file)
            print(f"Uploading file: {file_path}")
            try:
                api.upload_file(
                    path_or_fileobj=file_path,
                    path_in_repo=file,
                    repo_id=repo_id,
                )
            except Exception as e:
                raise Exception(f"Error uploading file {file_path}: {e}")
    else:
        raise Exception("No sharded files found.")

    print("Sharded model has been uploaded successfully!")

def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
    if oauth_token.token is None:
        raise ValueError("You must be logged in to use GGUF-my-repo")
    model_name = model_id.split('/')[-1]
    fp16 = f"{model_name}.fp16.gguf"

    try:
        api = HfApi(token=oauth_token.token)

        dl_pattern = ["*.md", "*.json", "*.model"]

        pattern = (
            "*.safetensors"
            if any(
                file.path.endswith(".safetensors")
                for file in api.list_repo_tree(
                    repo_id=model_id,
                    recursive=True,
                )
            )
            else "*.bin"
        )

        dl_pattern += pattern

        api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
        print("Model downloaded successfully!")
        print(f"Current working directory: {os.getcwd()}")
        print(f"Model directory contents: {os.listdir(model_name)}")

        conversion_script = "convert_hf_to_gguf.py"
        fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
        result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
        print(result)
        if result.returncode != 0:
            raise Exception(f"Error converting to fp16: {result.stderr}")
        print("Model converted to fp16 successfully!")
        print(f"Converted model path: {fp16}")

        imatrix_path = "llama.cpp/imatrix.dat"

        if use_imatrix:
            if train_data_file:
                train_data_path = train_data_file.name
            else:
                train_data_path = "groups_merged.txt"  # fallback calibration dataset

            print(f"Training data file path: {train_data_path}")

            if not os.path.isfile(train_data_path):
                raise Exception(f"Training data file not found: {train_data_path}")

            generate_importance_matrix(fp16, train_data_path)
        else:
            print("Not using imatrix quantization.")
        username = whoami(oauth_token.token)["name"]
        quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
        quantized_gguf_path = quantized_gguf_name
        if use_imatrix:
            quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
        else:
            quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
        result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
        if result.returncode != 0:
            raise Exception(f"Error quantizing: {result.stderr}")
        print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
        print(f"Quantized model path: {quantized_gguf_path}")

        # Create empty repo
        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
        new_repo_id = new_repo_url.repo_id
        print("Repo created successfully!", new_repo_url)

        try:
            card = ModelCard.load(model_id, token=oauth_token.token)
        except:
            card = ModelCard("")
        if card.data.tags is None:
            card.data.tags = []
        card.data.tags.append("llama-cpp")
        card.data.tags.append("gguf-my-repo")
        card.data.base_model = model_id
        card.text = dedent(
            f"""
            # {new_repo_id}
            Asalamu Alaikum! This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
            Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.

            ## Description (per [TheBloke](https://huggingface.co/TheBloke))

            This repo contains GGUF format model files.

            These files were quantised using [ggml-org/gguf-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo).

            <!-- description end -->
            <!-- README_GGUF.md-about-gguf start -->
            ### About GGUF (per [TheBloke](https://huggingface.co/TheBloke))

            GGUF is a new format introduced by the llama.cpp team on August 21st 2023. It is a replacement for GGML, which is no longer supported by llama.cpp.

            Here is an incomplete list of clients and libraries that are known to support GGUF:

            * [llama.cpp](https://github.com/ggerganov/llama.cpp). The source project for GGUF. Offers a CLI and a server option.
            * [text-generation-webui](https://github.com/oobabooga/text-generation-webui), the most widely used web UI, with many features and powerful extensions. Supports GPU acceleration.
            * [KoboldCpp](https://github.com/LostRuins/koboldcpp), a fully featured web UI, with GPU accel across all platforms and GPU architectures. Especially good for storytelling.
            * [GPT4All](https://gpt4all.io/index.html), a free and open source local running GUI, supporting Windows, Linux and macOS with full GPU accel.
            * [LM Studio](https://lmstudio.ai/), an easy-to-use and powerful local GUI for Windows and macOS (Silicon), with GPU acceleration. Linux available, in beta as of 27/11/2023.
            * [LoLLMS Web UI](https://github.com/ParisNeo/lollms-webui), a great web UI with many interesting and unique features, including a full model library for easy model selection.
            * [Faraday.dev](https://faraday.dev/), an attractive and easy to use character-based chat GUI for Windows and macOS (both Silicon and Intel), with GPU acceleration.
            * [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), a Python library with GPU accel, LangChain support, and OpenAI-compatible API server.
            * [candle](https://github.com/huggingface/candle), a Rust ML framework with a focus on performance, including GPU support, and ease of use.
            * [ctransformers](https://github.com/marella/ctransformers), a Python library with GPU accel, LangChain support, and OpenAI-compatible AI server. Note: as of the time of writing (November 27th 2023), ctransformers has not been updated in a long time and does not support many recent models.

            <!-- README_GGUF.md-about-gguf end -->

            <!-- compatibility_gguf start -->
            ## Compatibility

            These quantised GGUFv2 files are compatible with llama.cpp from August 27th 2023 onwards, as of commit [d0cee0d](https://github.com/ggerganov/llama.cpp/commit/d0cee0d36d5be95a0d9088b674dbb27354107221).

            They are also compatible with many third party UIs and libraries - please see the list at the top of this README.

            ## Explanation of quantisation methods

            <details>
            <summary>Click to see details</summary>

            The new methods available are:

            * GGML_TYPE_Q2_K - "type-1" 2-bit quantization in super-blocks containing 16 blocks, each block having 16 weights. Block scales and mins are quantized with 4 bits. This ends up effectively using 2.5625 bits per weight (bpw).
            * GGML_TYPE_Q3_K - "type-0" 3-bit quantization in super-blocks containing 16 blocks, each block having 16 weights. Scales are quantized with 6 bits. This ends up using 3.4375 bpw.
            * GGML_TYPE_Q4_K - "type-1" 4-bit quantization in super-blocks containing 8 blocks, each block having 32 weights. Scales and mins are quantized with 6 bits. This ends up using 4.5 bpw.
            * GGML_TYPE_Q5_K - "type-1" 5-bit quantization. Same super-block structure as GGML_TYPE_Q4_K, resulting in 5.5 bpw.
            * GGML_TYPE_Q6_K - "type-0" 6-bit quantization. Super-blocks with 16 blocks, each block having 16 weights. Scales are quantized with 8 bits. This ends up using 6.5625 bpw.

            Refer to the Provided Files table below to see what files use which methods, and how.
            </details>
            <!-- compatibility_gguf end -->

            <!-- README_GGUF.md-provided-files start -->
            ## Provided Files (Not Including iMatrix Quantization)

            | Quant method | Bits | Example Size | Max RAM required | Use case |
            | ---- | ---- | ---- | ---- | ----- |
            | Q2_K | 2 | 2.72 GB | 5.22 GB | significant quality loss - not recommended for most purposes |
            | Q3_K_S | 3 | 3.16 GB | 5.66 GB | very small, high quality loss |
            | Q3_K_M | 3 | 3.52 GB | 6.02 GB | very small, high quality loss |
            | Q3_K_L | 3 | 3.82 GB | 6.32 GB | small, substantial quality loss |
            | Q4_0 | 4 | 4.11 GB | 6.61 GB | legacy; small, very high quality loss - prefer using Q3_K_M |
            | Q4_K_S | 4 | 4.14 GB | 6.64 GB | small, greater quality loss |
            | Q4_K_M | 4 | 4.37 GB | 6.87 GB | medium, balanced quality - recommended |
            | Q5_0 | 5 | 5.00 GB | 7.50 GB | legacy; medium, balanced quality - prefer using Q4_K_M |
            | Q5_K_S | 5 | 5.00 GB | 7.50 GB | large, low quality loss - recommended |
            | Q5_K_M | 5 | 5.13 GB | 7.63 GB | large, very low quality loss - recommended |
            | Q6_K | 6 | 5.94 GB | 8.44 GB | very large, extremely low quality loss |
            | Q8_0 | 8 | 7.70 GB | 10.20 GB | very large, extremely low quality loss - not recommended |

            **Note**: the above RAM figures assume no GPU offloading. If layers are offloaded to the GPU, this will reduce RAM usage and use VRAM instead.

            <!-- README_GGUF.md-provided-files end -->

            <!-- repositories-available start -->
            ---

            ## Use with llama.cpp
            Install llama.cpp through brew (works on Mac and Linux)

            ```bash
            brew install llama.cpp
            ```
            Invoke the llama.cpp server or the CLI.

            ### CLI:
            ```bash
            llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
            ```

            ### Server:
            ```bash
            llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
            ```

            Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the llama.cpp repo.

            Step 1: Clone llama.cpp from GitHub.
            ```
            git clone https://github.com/ggerganov/llama.cpp
            ```

            Step 2: Move into the llama.cpp folder and build it with the `LLAMA_CURL=1` flag along with other hardware-specific flags (for example, `LLAMA_CUDA=1` for Nvidia GPUs on Linux).
            ```
            cd llama.cpp && LLAMA_CURL=1 make
            ```

            Step 3: Run inference through the main binary.
            ```
            ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
            ```
            or
            ```
            ./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
            ```
            """
        )
        card.save(f"README.md")

        if split_model:
            split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
        else:
            try:
                print(f"Uploading quantized model: {quantized_gguf_path}")
                api.upload_file(
                    path_or_fileobj=quantized_gguf_path,
                    path_in_repo=quantized_gguf_name,
                    repo_id=new_repo_id,
                )
            except Exception as e:
                raise Exception(f"Error uploading quantized model: {e}")

        imatrix_path = "llama.cpp/imatrix.dat"
        if os.path.isfile(imatrix_path):
            try:
                print(f"Uploading imatrix.dat: {imatrix_path}")
                api.upload_file(
                    path_or_fileobj=imatrix_path,
                    path_in_repo="imatrix.dat",
                    repo_id=new_repo_id,
                )
            except Exception as e:
                raise Exception(f"Error uploading imatrix.dat: {e}")

        api.upload_file(
            path_or_fileobj=f"README.md",
            path_in_repo=f"README.md",
            repo_id=new_repo_id,
        )
        print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")

        return (
            f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
            "llama.png",
        )
    except Exception as e:
        return (f"Error: {e}", "error.png")
    finally:
        shutil.rmtree(model_name, ignore_errors=True)
        print("Folder cleaned up successfully!")

css = """/* Custom CSS to allow scrolling */
.gradio-container {overflow-y: auto;}
"""
# Create Gradio interface
with gr.Blocks(css=css) as demo:
    gr.Markdown("You must be logged in to use GGUF-my-repo.")
    gr.LoginButton(min_width=250)

    model_id = HuggingfaceHubSearch(
        label="Hub Model ID",
        placeholder="Search for model id on Huggingface",
        search_type="model",
    )

    q_method = gr.Dropdown(
        ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
        label="Quantization Method",
        info="GGML quantization type",
        value="Q8_0",
        filterable=False,
        visible=True
    )

    imatrix_q_method = gr.Dropdown(
        ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
        label="Imatrix Quantization Method",
        info="GGML imatrix quants type",
        value="IQ4_NL",
        filterable=False,
        visible=False
    )

    use_imatrix = gr.Checkbox(
        value=False,
        label="Use Imatrix Quantization",
        info="Use importance matrix for quantization."
    )

    private_repo = gr.Checkbox(
        value=True,
        label="Private Repo",
        info="Create a private repo under your username."
    )

    train_data_file = gr.File(
        label="Training Data File",
        file_types=["txt"],
        visible=False
    )

    split_model = gr.Checkbox(
        value=False,
        label="Split Model",
        info="Shard the model using gguf-split."
    )

    split_max_tensors = gr.Number(
        value=256,
        label="Max Tensors per File",
        info="Maximum number of tensors per file when splitting model.",
        visible=False
    )

    split_max_size = gr.Textbox(
        label="Max File Size",
        info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
        visible=False
    )

    def update_visibility(use_imatrix):
        return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)

    use_imatrix.change(
        fn=update_visibility,
        inputs=use_imatrix,
        outputs=[q_method, imatrix_q_method, train_data_file]
    )

    iface = gr.Interface(
        fn=process_model,
        inputs=[
            model_id,
            q_method,
            use_imatrix,
            imatrix_q_method,
            private_repo,
            train_data_file,
            split_model,
            split_max_tensors,
            split_max_size,
        ],
        outputs=[
            gr.Markdown(label="output"),
            gr.Image(show_label=False),
        ],
        title="Asalamu Alaikum! Create your own GGUF Quantizations, B̶L̶A̶Z̶I̶N̶G̶L̶Y̶ ̶F̶A̶S̶T̶ ⚡! (Hey it's free!)",
        description="The space takes a HuggingFace repo as an input, quantizes it and creates a private repo containing the selected quant under your HF user namespace.",
        api_name=False
    )

    def update_split_visibility(split_model):
        return gr.update(visible=split_model), gr.update(visible=split_model)

    split_model.change(
        fn=update_split_visibility,
        inputs=split_model,
        outputs=[split_max_tensors, split_max_size]
    )

def restart_space():
    HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=21600)
scheduler.start()

# Launch the interface
demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)
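
For reference, the shell commands that `process_model` assembles reduce to the following sequence; a sketch using a hypothetical `my-model` checkpoint directory (app.py changes into `llama.cpp/` for the imatrix step, which the paths below mirror):

```bash
# Sketch of the pipeline app.py drives via subprocess (model name is hypothetical)

# 1. Convert the downloaded HF checkpoint to an fp16 GGUF
python llama.cpp/convert_hf_to_gguf.py my-model --outtype f16 --outfile my-model.fp16.gguf

# 2. (optional) Build an importance matrix from a calibration file
#    (app.py falls back to groups_merged.txt and runs this from inside llama.cpp/)
cd llama.cpp && ./llama-imatrix -m ../my-model.fp16.gguf -f groups_merged.txt -ngl 99 --output-frequency 10 && cd ..

# 3. Quantize, with the imatrix for imatrix quants or without it otherwise
./llama.cpp/llama-quantize --imatrix llama.cpp/imatrix.dat my-model.fp16.gguf my-model-iq4_nl-imat.gguf IQ4_NL
./llama.cpp/llama-quantize my-model.fp16.gguf my-model-q8_0.gguf Q8_0

# 4. (optional) Shard a large GGUF before uploading
./llama.cpp/llama-gguf-split --split --split-max-tensors 256 my-model-q8_0.gguf my-model-q8_0
```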
error.png CHANGED

Git LFS Details

  • SHA256: 9fe4f4850a91331d01b51d0b371a7abaffeda0f680ff5dd7b3f02d28f131e8b7
  • Pointer size: 132 Bytes
  • Size of remote file: 7.64 MB
groups_merged.txt CHANGED
The diff for this file is too large to render. See raw diff
 
llama.png CHANGED

Git LFS Details (previous file)
  • SHA256: a287a47ae4c6f87a363471130be4c916948664792a7a8efbca1bdaaf8d016ebc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.8 MB

Git LFS Details (new file)
  • SHA256: 2c74a1f7c3ffec624c66212b1d92560e0566dc5688c23c444c2aab1008181a95
  • Pointer size: 132 Bytes
  • Size of remote file: 8.06 MB
start.sh CHANGED
@@ -1,5 +1,5 @@
(The removed and re-added lines are identical; the rewrite appears to touch only whitespace or line endings, so the script is shown once.)

cd llama.cpp
LLAMA_CUDA=1 make -j llama-quantize llama-gguf-split llama-imatrix

cd ..
python app.py
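
The script builds the three llama.cpp tools with CUDA enabled before launching the app. A minimal sketch of a CPU-only variant, assuming no CUDA toolchain is available (same Makefile targets, just without the flag; not part of this commit):

```bash
# Sketch: CPU-only build of the same llama.cpp targets, then launch the app
cd llama.cpp
make -j llama-quantize llama-gguf-split llama-imatrix
cd ..
python app.py
```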