wq2012 committed on
Commit
ce01037
1 Parent(s): c45bca0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -48
app.py CHANGED
@@ -1,63 +1,86 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
 
 
 
3
 
4
  """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
 
 
 
 
9
 
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
 
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
 
26
- messages.append({"role": "user", "content": message})
27
 
28
- response = ""
 
 
 
 
 
29
 
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
 
39
- response += token
40
- yield response
41
 
42
- """
43
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
44
- """
45
- demo = gr.ChatInterface(
46
- respond,
47
- additional_inputs=[
48
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
49
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
- gr.Slider(
52
- minimum=0.1,
53
- maximum=1.0,
54
- value=0.95,
55
- step=0.05,
56
- label="Top-p (nucleus sampling)",
57
- ),
58
- ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  )
60
 
 
 
 
61
 
62
  if __name__ == "__main__":
63
- demo.launch()
 
1
  import gradio as gr
2
+ from gpt4all import GPT4All
3
+ from huggingface_hub import hf_hub_download
4
+
5
+ title = "DiarizationLM GGUF inference on CPU"
6
+
7
+ description = """
8
 
9
  """
 
 
 
10
 
11
+ model_path = "models"
12
+ model_name = "model-unsloth.Q4_K_M.gguf"
13
+ hf_hub_download(repo_id="google/DiarizationLM-13b-Fisher-v1", filename=model_name, local_dir=model_path, local_dir_use_symlinks=False)
14
 
15
+ print("Start the model init process")
16
+ model = GPT4All(model_name, model_path, allow_download = False, device="cpu")
17
+ print("Finish the model init process")
 
 
 
 
 
 
18
 
19
+ model.config["promptTemplate"] = "{0} --> "
20
+ model.config["systemPrompt"] = ""
21
+ model._is_chat_session_activated = False
 
 
22
 
23
+ max_new_tokens = 2048
24
 
25
+ def generater(message, history, temperature, top_p, top_k):
26
+ prompt = model.config["promptTemplate"].format(message)
27
+ outputs = []
28
+ for token in model.generate(prompt=prompt, temp=temperature, top_k = top_k, top_p = top_p, max_tokens = max_new_tokens, streaming=True):
29
+ outputs.append(token)
30
+ yield "".join(outputs)
31
 
32
+ def vote(data: gr.LikeData):
33
+ if data.liked:
34
+ return
35
+ else:
36
+ return
 
 
 
37
 
38
+ chatbot = gr.Chatbot(avatar_images=('resourse/user-icon.png', 'resourse/chatbot-icon.png'),bubble_full_width = False)
 
39
 
40
+ additional_inputs=[
41
+ gr.Slider(
42
+ label="temperature",
43
+ value=0.5,
44
+ minimum=0.0,
45
+ maximum=2.0,
46
+ step=0.05,
47
+ interactive=True,
48
+ info="Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.",
49
+ ),
50
+ gr.Slider(
51
+ label="top_p",
52
+ value=1.0,
53
+ minimum=0.0,
54
+ maximum=1.0,
55
+ step=0.01,
56
+ interactive=True,
57
+ info="0.1 means only the tokens comprising the top 10% probability mass are considered. Suggest set to 1 and use temperature. 1 means 100% and will disable it",
58
+ ),
59
+ gr.Slider(
60
+ label="top_k",
61
+ value=40,
62
+ minimum=0,
63
+ maximum=1000,
64
+ step=1,
65
+ interactive=True,
66
+ info="limits candidate tokens to a fixed number after sorting by probability. Setting it higher than the vocabulary size deactivates this limit.",
67
+ )
68
+ ]
69
+
70
+ iface = gr.ChatInterface(
71
+ fn = generater,
72
+ title=title,
73
+ description = description,
74
+ chatbot=chatbot,
75
+ additional_inputs=additional_inputs,
76
+ examples=[
77
+ ["<speaker:1> Hello, how are you doing <speaker:2> today? I am doing well."],
78
+ ]
79
  )
80
 
81
+ with gr.Blocks(css="resourse/style/custom.css") as demo:
82
+ chatbot.like(vote, None, None)
83
+ iface.render()
84
 
85
  if __name__ == "__main__":
86
+ demo.queue(max_size=3).launch()