File size: 2,765 Bytes
10e1692
 
f956d70
10e1692
 
1e5f1ea
 
 
05bde1f
10e1692
5e1fced
 
10e1692
4ddd8b4
1e5f1ea
05bde1f
a6c3106
05bde1f
e35c2e7
11f229b
05bde1f
 
11f229b
 
05bde1f
10e1692
e35c2e7
05bde1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1a0824
 
05bde1f
 
 
 
 
 
d1a0824
 
 
 
 
 
 
05bde1f
 
 
a6c3106
 
 
 
 
 
 
 
05bde1f
 
33f1e81
11f229b
05bde1f
 
30c43ba
369961f
2ba3e9d
05bde1f
 
 
 
 
 
 
 
 
 
 
8dd48d6
2414024
2ba3e9d
 
 
11f229b
8dd48d6
30c43ba
05bde1f
 
aa2e4f9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""
cf https://huggingface.co/spaces/Nymbo/Qwen-2.5-72B-Instruct/blob/main/app.py
   https://huggingface.co/spaces/prithivMLmods/Llama-3.1-8B-Instruct/blob/main/app.py
https://github.com/huggingface/huggingface-llama-recipes/blob/main/api_inference/inference-api.ipynb
"""

import os 
import time
import gradio as gr

from openai import OpenAI
# from huggingface_hub import InferenceClient

# Pin the process timezone so log timestamps are in Shanghai time.
# NOTE(review): time.tzset() exists only on Unix — this module will raise
# AttributeError on Windows at import time.
os.environ.update(TZ='Asia/Shanghai')
time.tzset()

# ACCESS_TOKEN = os.getenv("HF_TOKEN")

# Alternative backend (huggingface_hub.InferenceClient) kept for reference.
# client = InferenceClient()
# _ = """
# OpenAI-compatible client pointed at the HF serverless Inference API.
# Falls back to the dummy key 'na' when HF_TOKEN is unset (anonymous,
# rate-limited access).
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    # api_key=ACCESS_TOKEN,
    api_key=os.getenv("HF_TOKEN", 'na')
)
# """


def respond(
    message,
    history: list,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream an assistant reply for the Gradio ChatInterface.

    Builds an OpenAI-style message list from the system prompt, the chat
    history, and the new user message, then streams tokens from the model,
    yielding the accumulated response text after each token.

    Args:
        message: The new user message (str).
        history: Prior turns. With ``gr.ChatInterface(type='messages')`` this
            is a list of ``{"role": ..., "content": ...}`` dicts; older Gradio
            versions pass ``(user, assistant)`` tuples. Both are accepted.
        system_message: System prompt text.
        max_tokens: Max new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cutoff.

    Yields:
        str: The response text accumulated so far, or the error text if the
        API call fails (so the error shows up in the chat UI).
    """
    messages = [{"role": "system", "content": system_message}]

    # ChatInterface(type='messages') delivers dicts; tuple pairs are the
    # legacy format. Handle both so the app works either way.
    for entry in history:
        if isinstance(entry, dict):
            if entry.get("content"):
                messages.append(
                    {"role": entry["role"], "content": entry["content"]}
                )
        else:
            user_turn, assistant_turn = entry
            if user_turn:
                messages.append({"role": "user", "content": user_turn})
            if assistant_turn:
                messages.append({"role": "assistant", "content": assistant_turn})

    messages.append({"role": "user", "content": message})

    response = ""
    try:
        stream = client.chat.completions.create(
            model="Qwen/Qwen2.5-72B-Instruct",
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            messages=messages,
        )
        for chunk in stream:
            token = chunk.choices[0].delta.content
            # Role-only / final chunks carry content=None; skip them
            # instead of crashing on `str + None`.
            if token:
                response += token
                yield response
    except Exception as e:  # boundary: surface the error text in the chat UI
        yield str(e)
        
# Chat display area; fixed height so the page doesn't jump while streaming.
chatbot = gr.Chatbot(height=600)

# Page-level CSS: cap the app width, center the title, hide the Gradio footer.
css = '''
.gradio-container{max-width: 1000px !important}
h1{text-align:center}
footer {
    visibility: hidden
}
'''

# Wire the streaming `respond` generator into a full chat UI.
# type='messages' makes Gradio pass history as openai-style dicts.
demo = gr.ChatInterface(
    respond,
    type='messages',
    # description='chatbox',
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        # gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        # Cap derived from the model's 32k context: reserve half for the
        # prompt, minus a 500-token safety margin.
        gr.Slider(minimum=1, maximum=32768 // 2 - 500, value=32768 // 2 - 500, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.3, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-P",
        ),
        
    ],
    fill_height=True,
    chatbot=chatbot,
    css=css,
    # examples=[{"role": "user", "content": "Define 'deep learning' in one sentence."}],
    # retry_btn="Retry",  # unexpected keyword argument 'retry_btn'
    # undo_btn="Undo",
    # clear_btn="Clear",
    # stop_btn='Cancel',
    # theme="allenai/gradio-theme",
    # theme="Nymbo/Alyx_Theme",
)
if __name__ == "__main__":
    demo.launch()  # ssr=False