Spaces: Running on Zero
Update app.py
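This commit replaces the previous Yi-Coder-9B-Chat demo (a Gradio Blocks app with a single code-generation form) with a streaming gr.ChatInterface that serves Qwen/Qwen2.5-Coder-7B-Instruct, loaded in 4-bit with flash attention.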
app.py CHANGED
@@ -1,71 +1,113 @@
- import gradio as gr
  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import spaces

-
- descripcion = """Yi-Coder-9B-Chat es un modelo de 9B parámetros ajustado para tareas de codificación. Esta demo muestra su capacidad para generar código basado en tus prompts. Yi-Coder es una serie de modelos de lenguaje de código abierto que ofrece un rendimiento de codificación de vanguardia con menos de 10 mil millones de parámetros. Sobresale en la comprensión de contextos largos con una longitud máxima de contexto de 128K tokens. - Soporta 52 lenguajes de programación principales."""

-
-
-

- # Cargar el tokenizador y el modelo
- tokenizador = AutoTokenizer.from_pretrained(ruta_modelo)
- modelo = AutoModelForCausalLM.from_pretrained(ruta_modelo, device_map="auto").eval()

- @spaces.GPU(
- def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
  )
-
-
- ]
-
-

- def interfaz_gradio():
-     with gr.Blocks() as interfaz:
-         gr.Markdown(titulo)
-         gr.Markdown(descripcion)
-
-         prompt_sistema = gr.Textbox(
-             label="☯️Instrucción Yi:",
-             value="Eres un asistente de codificación útil. Proporciona ejemplos de código claros y concisos.",
-             lines=2
-         )
-         prompt_usuario = gr.Code(
-             label="🤔Pregunta de Codificación",
-             value="Escribe un algoritmo de quicksort en Python.",
-             language="python",
-             lines=15
-         )
-         codigo_salida = gr.Code(label="☯️Yi-Coder-9B", language='python', lines=20, interactive=True)
-         max_longitud_slider = gr.Slider(minimum=1, maximum=1800, value=650, label="Longitud Máxima de Tokens")
-
-         boton_generar = gr.Button("Generar Código")
-         boton_generar.click(
-             generar_codigo,
-             inputs=[prompt_sistema, prompt_usuario, max_longitud_slider],
-             outputs=codigo_salida
-         )
-
-     return interfaz

-
-
-
-
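Much of the removed file does not survive in this view (the bare "-" markers above), including the definition and body of the generar_codigo function that the old boton_generar.click handler wired up. Purely as a hypothetical sketch of what a ZeroGPU generation callback for Yi-Coder-9B-Chat typically looks like, reusing the names that do appear in the diff (generar_codigo, tokenizador, modelo, ruta_modelo); the body and the duration value are assumptions, not the author's original code:

# Hypothetical reconstruction only -- the removed implementation is not recoverable from this diff.
@spaces.GPU(duration=120)  # duration is an assumed value
def generar_codigo(prompt_sistema, prompt_usuario, max_longitud):
    mensajes = [
        {"role": "system", "content": prompt_sistema},
        {"role": "user", "content": prompt_usuario},
    ]
    texto = tokenizador.apply_chat_template(mensajes, tokenize=False, add_generation_prompt=True)
    entradas = tokenizador(texto, return_tensors="pt").to(modelo.device)
    salidas = modelo.generate(**entradas, max_new_tokens=max_longitud)
    return tokenizador.decode(salidas[0][entradas.input_ids.shape[1]:], skip_special_tokens=True)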
+ import os
+ import json
+ import subprocess
+ from threading import Thread
+
  import torch
  import spaces
+ import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer

+ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

+ MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
+ CHAT_TEMPLATE = "ChatML"
+ MODEL_NAME = MODEL_ID.split("/")[-1]
+ CONTEXT_LENGTH = int(os.environ.get("CONTEXT_LENGTH"))
+ COLOR = os.environ.get("COLOR")
+ EMOJI = os.environ.get("EMOJI")
+ DESCRIPTION = os.environ.get("DESCRIPTION")

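CONTEXT_LENGTH, COLOR, EMOJI and DESCRIPTION are read from the Space's environment variables, and int(os.environ.get("CONTEXT_LENGTH")) raises a TypeError when that variable is unset. For running the file outside the Space, a minimal sketch with fallback defaults (the default values are assumptions, not the Space's actual settings):

# Hypothetical fallbacks for a local run; the Space defines these variables in its settings.
CONTEXT_LENGTH = int(os.environ.get("CONTEXT_LENGTH", "4096"))
COLOR = os.environ.get("COLOR", "blue")
EMOJI = os.environ.get("EMOJI", "💻")
DESCRIPTION = os.environ.get("DESCRIPTION", f"Chat demo for {MODEL_NAME}")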

+ @spaces.GPU()
+ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
+     # Format history with a given chat template
+     if CHAT_TEMPLATE == "Auto":
+         stop_tokens = [tokenizer.eos_token_id]
+         instruction = system_prompt + "\n\n"
+         for user, assistant in history:
+             instruction += f"User: {user}\nAssistant: {assistant}\n"
+         instruction += f"User: {message}\nAssistant:"
+     elif CHAT_TEMPLATE == "ChatML":
+         stop_tokens = ["<|endoftext|>", "<|im_end|>"]
+         instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
+         for user, assistant in history:
+             instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
+         instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
+     elif CHAT_TEMPLATE == "Mistral Instruct":
+         stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
+         instruction = f'<s>[INST] {system_prompt}\n'
+         for user, assistant in history:
+             instruction += f'{user} [/INST] {assistant}</s>[INST]'
+         instruction += f' {message} [/INST]'
+     else:
+         raise Exception("Incorrect chat template, select 'Auto', 'ChatML' or 'Mistral Instruct'")
+     print(instruction)
+
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+     enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
+     input_ids, attention_mask = enc.input_ids, enc.attention_mask
+
+     if input_ids.shape[1] > CONTEXT_LENGTH:
+         input_ids = input_ids[:, -CONTEXT_LENGTH:]
+         attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
+
+     generate_kwargs = dict(
+         input_ids=input_ids.to(device),
+         attention_mask=attention_mask.to(device),
+         streamer=streamer,
+         do_sample=True,
+         temperature=temperature,
+         max_new_tokens=max_new_tokens,
+         top_k=top_k,
+         repetition_penalty=repetition_penalty,
+         top_p=top_p
      )
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+     outputs = []
+     for new_token in streamer:
+         outputs.append(new_token)
+         if new_token in stop_tokens:
+             break
+     yield "".join(outputs)

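The hand-written ChatML/Mistral formatting above mirrors what the tokenizer's built-in chat template produces. A minimal alternative sketch using tokenizer.apply_chat_template (assuming history stays a list of (user, assistant) pairs; this is an equivalent formulation, not what the Space actually runs):

# Sketch: build the prompt from the model's own chat template instead of hand-written ChatML.
def build_prompt(message, history, system_prompt):
    messages = [{"role": "system", "content": system_prompt}]
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)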
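On ZeroGPU, @spaces.GPU() attaches a GPU only while predict is running; generations that outlast the default allocation can request more time through the decorator's duration argument. A hedged sketch (the 120-second figure is an arbitrary assumption, not a value from this Space):

# Assumption: request a longer ZeroGPU allocation for slow generations.
@spaces.GPU(duration=120)
def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    ...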
+ # Load model
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     device_map="auto",
+     quantization_config=quantization_config,
+     attn_implementation="flash_attention_2",
+ )
+
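attn_implementation="flash_attention_2" only works if the flash-attn wheel installed by the subprocess call at the top actually built. A common defensive pattern (an assumption, not part of this Space) is to fall back to PyTorch SDPA when the import fails:

# Sketch: pick flash attention when available, otherwise fall back to SDPA.
try:
    import flash_attn  # noqa: F401
    attn_impl = "flash_attention_2"
except ImportError:
    attn_impl = "sdpa"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
    attn_implementation=attn_impl,
)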
+ # Create Gradio interface
+ gr.ChatInterface(
+     predict,
+     title=EMOJI + " " + MODEL_NAME,
+     description=DESCRIPTION,
+     examples=[
+         ["¿Puedes resolver la ecuación 2x + 3 = 11 para x?"],
+         ["Escribe un poema épico sobre la Antigua Roma."],
+         ["¿Quién fue la primera persona en caminar sobre la Luna?"],
+         ["Usa una comprensión de listas para crear una lista de cuadrados de los números del 1 al 10."],
+         ["Recomienda algunos libros populares de ciencia ficción."],
+         ["¿Puedes escribir una historia corta sobre un detective que viaja en el tiempo?"]
+     ],
+     additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
+     additional_inputs=[
+         gr.Textbox("Eres un modelo que responde de manera precisa en español.", label="System prompt"),
+         gr.Slider(0, 1, 0.3, label="Temperature"),
+         gr.Slider(128, 4096, 1024, label="Max new tokens"),
+         gr.Slider(1, 80, 40, label="Top K sampling"),
+         gr.Slider(0, 2, 1.1, label="Repetition penalty"),
+         gr.Slider(0, 1, 0.95, label="Top P sampling"),
+     ],
+     theme=gr.themes.Soft(primary_hue=COLOR),
+ ).queue().launch()
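gr.ChatInterface forwards the additional_inputs values to predict as positional arguments after message and history, so the widget order above has to match the function signature; for reference:

# additional_inputs -> predict parameters, in order:
#   Textbox "System prompt"       -> system_prompt
#   Slider  "Temperature"         -> temperature
#   Slider  "Max new tokens"      -> max_new_tokens
#   Slider  "Top K sampling"      -> top_k
#   Slider  "Repetition penalty"  -> repetition_penalty
#   Slider  "Top P sampling"      -> top_p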