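"""Gradio Space that converts a Hugging Face model to MLC format.

The user picks a model, a conversation template, and a quantization method;
the app runs `mlc_llm convert_weight` and `mlc_llm gen_config`, then uploads
the converted weights and a generated model card to a new private repo under
the user's Hugging Face account.
"""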
import os
import shutil
import subprocess

os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
import gradio as gr

from huggingface_hub import HfApi, ModelCard

from gradio_huggingfacehub_search import HuggingfaceHubSearch

from textwrap import dedent

HF_PATH = "https://huggingface.co/"

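# Conversation templates understood by `mlc_llm gen_config`; the template
# controls how chat prompts are formatted for the compiled model.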
CONV_TEMPLATES = [
    "llama-3",
    "llama-3_1",
    "chatml",
    "chatml_nosystem",
    "qwen2",
    "open_hermes_mistral",
    "neural_hermes_mistral",
    "llama_default",
    "llama-2",
    "mistral_default",
    "gpt2",
    "codellama_completion",
    "codellama_instruct",
    "vicuna_v1.1",
    "conv_one_shot",
    "redpajama_chat",
    "rwkv_world",
    "rwkv",
    "gorilla",
    "gorilla-openfunctions-v2",
    "guanaco",
    "dolly",
    "oasst",
    "stablelm",
    "stablecode_completion",
    "stablecode_instruct",
    "minigpt",
    "moss",
    "LM",
    "stablelm-3b",
    "gpt_bigcode",
    "wizardlm_7b",
    "wizard_coder_or_math",
    "glm",
    "custom",  # for web-llm only
    "phi-2",
    "phi-3",
    "phi-3-vision",
    "stablelm-2",
    "gemma_instruction",
    "orion",
    "llava",
    "hermes2_pro_llama3",
    "hermes3_llama-3_1",
    "tinyllama_v1_0",
    "aya-23",
]

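# MLC quantization codes: roughly, "qAfB" stores weights in A bits and
# computes in B-bit floats (so q0f16 is unquantized fp16), the "_1" suffix
# is a scheme variant, and "awq" denotes activation-aware quantization.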
QUANTIZATIONS = [
    "q0f16",
    "q0f32",
    "q3f16_1",
    "q4f16_1",
    "q4f32_1",
    "q4f16_awq",
]

def button_click(hf_model_id, conv_template, quantization, oauth_token: gr.OAuthToken | None):
    # Gradio passes oauth_token=None when the user is not logged in, so guard
    # the object itself before reading its token.
    if oauth_token is None or oauth_token.token is None:
        return "Log in to Hugging Face to use this"
    elif not hf_model_id:
        return "Enter a Hugging Face model ID"
    elif not conv_template:
        return "Select a conversation template"
    elif not quantization:
        return "Select a quantization method"
    
    api = HfApi(token=oauth_token.token)
    model_dir_name = hf_model_id.split("/")[1]
    mlc_model_name = f"{model_dir_name}-{quantization}-MLC"

    os.makedirs("dist/models", exist_ok=True)
    os.system("git lfs install")

    # Download the original weights into dist/models/.
    api.snapshot_download(repo_id=hf_model_id, local_dir=f"./dist/models/{model_dir_name}")

    # Convert the weights to the selected MLC quantization format.
    convert_weight_result = subprocess.run(
        ["mlc_llm", "convert_weight", f"./dist/models/{model_dir_name}",
         "--quantization", quantization,
         "-o", f"dist/{mlc_model_name}"],
        capture_output=True, text=True)
    if convert_weight_result.returncode != 0:
        return convert_weight_result.stderr

    # Generate the MLC chat config, including the conversation template.
    gen_config_result = subprocess.run(
        ["mlc_llm", "gen_config", f"./dist/models/{model_dir_name}",
         "--quantization", quantization,
         "--conv-template", conv_template,
         "-o", f"dist/{mlc_model_name}"],
        capture_output=True, text=True)
    if gen_config_result.returncode != 0:
        return gen_config_result.stderr
    
    # Create a private repo under the user's account and upload the weights.
    user_name = api.whoami()["name"]
    created_repo_url = api.create_repo(repo_id=f"{user_name}/{mlc_model_name}", private=True)
    created_repo_id = created_repo_url.repo_id

    api.upload_large_folder(folder_path=f"./dist/{mlc_model_name}",
                            repo_id=created_repo_id,
                            repo_type="model")
    
    # Build a model card for the converted repo, tagging it for discoverability
    # and linking back to the source model via base_model.
    card = ModelCard.load(hf_model_id, token=oauth_token.token)
    if not card.data.tags:
        card.data.tags = []
    card.data.tags.append("mlc-ai")
    card.data.tags.append("MLC-Weight-Conversion")
    card.data.base_model = hf_model_id

    card.text = dedent(
        f"""
        # {created_repo_id}
        This model was compiled using MLC-LLM with {quantization} quantization from [{hf_model_id}]({HF_PATH}{hf_model_id}).
        The conversion was done using the [MLC-Weight-Conversion](https://huggingface.co/spaces/mlc-ai/MLC-Weight-Conversion) space.

        To run this model, please first install [MLC-LLM](https://llm.mlc.ai/docs/install/mlc_llm.html#install-mlc-packages).

        To chat with the model on your terminal:
        ```bash
        mlc_llm chat HF://{created_repo_id}
        ```

        For more information on how to use MLC-LLM, please visit the MLC-LLM [documentation](https://llm.mlc.ai/docs/index.html).
        """
    )
    card.save("./dist/README.md")

    api.upload_file(path_or_fileobj="./dist/README.md",
                    path_in_repo="README.md",
                    repo_id=created_repo_id,
                    repo_type="model")

    # Clean up the working directory so the next conversion starts fresh.
    shutil.rmtree("dist/", ignore_errors=True)

    return "Success! Find your compiled model under your Hugging Face account."

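# Gradio UI: login button, model search box, template/quantization dropdowns,
# and a textbox that reports the conversion result.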
with gr.Blocks() as demo:
    gr.LoginButton()
    gr.Markdown(
    """
    # Compile your LLM with MLC-LLM and run it locally!
    ### This space takes a Hugging Face model ID and converts the model using your selected conversation template and quantization method.
    """)
    model_id = HuggingfaceHubSearch(
        label="HF Model ID",
        placeholder="Search for your model on Hugging Face",
        search_type="model",
    )
    conv = gr.Dropdown(CONV_TEMPLATES, label="Conversation Template")
    quant = gr.Dropdown(QUANTIZATIONS, label="Quantization Method")
    btn = gr.Button("Convert to MLC")
    out = gr.Textbox(label="Conversion Result")
    btn.click(fn=button_click, inputs=[model_id, conv, quant], outputs=out)

demo.launch()
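
# Once a conversion succeeds, the uploaded model can be run locally with
# MLC-LLM installed, e.g.:
#   mlc_llm chat HF://<username>/<model>-<quantization>-MLC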