DeepMount00 committed
Commit a1130ae
1 Parent(s): d7cc17c

Update app.py

Files changed (1): app.py +118 -108
app.py CHANGED
@@ -5,127 +5,137 @@ import gradio as gr
 import torch
 import spaces

-model_id = "DeepMount00/Llama-3-COT-ITA"
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto").eval() # to("cuda:0")

 DESCRIPTION = '''
 <div>
-<h1 style="text-align: center;">Meta Llama3 8B ITA</h1>
-<p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/DeepMount00/Llama-3-8b-Ita"><b>Meta Llama3 8b Chat ITA</b></a>.</p>
+<h1 style="text-align: center;">Lexora-Medium-7B</h1>
+<p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/DeepMount00/Lexora-Medium-7B"><b>Lexora-Medium-7B Chat ITA</b></a>.</p>
 </div>
 <div>
-<p>This model, <strong>DeepMount00/Llama-3-8b-Ita</strong>, is currently the best open-source large language model for the Italian language. You can view its ranking and compare it with other models on the leaderboard at <a href="https://huggingface.co/spaces/FinancialSupport/open_ita_llm_leaderboard"><b>this site</b></a>.</p>
+<p>This model, <strong>DeepMount00/Lexora-Medium-7B</strong>, is currently the best open-source large language model for the Italian language. You can view its ranking and compare it with other models on the leaderboard at <a href="https://huggingface.co/spaces/FinancialSupport/open_ita_llm_leaderboard"><b>this site</b></a>.</p>
 </div>
 '''
-PLACEHOLDER = """
-<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-    <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/64f1bf6a8b550e875926a590/9IXg0qMUF0OV2cWPT8cZn.jpeg" style="width: 80%; max-width: 550px; height: auto; opacity: 0.50; ">
-    <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">DeepMount00 llama3</h1>
-    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Chiedimi qualsiasi cosa...</p>
-</div>
-"""
-
-
-css = """
-h1 {
-  text-align: center;
-  display: block;
-}
-"""
-
-prompt = """Sei un assistente virtuale avanzato, progettato per fornire risposte accurate, utili e tempestive. Segui queste linee guida:
-
-1. **Professionalità**: Rispondi sempre in modo educato e rispettoso.
-2. **Chiarezza**: Fornisci informazioni chiare e precise.
-3. **Empatia**: Mostra comprensione per le esigenze degli utenti.
-4. **Adattabilità**: Adattati agli stili di comunicazione degli utenti.
-5. **Privacy**: Non richiedere o raccogliere informazioni personali sensibili.
-6. **Supporto**: Assisti con domande generali, risoluzione di problemi tecnici e consigli."""
-
-@spaces.GPU(duration=120)
-def chat_llama3_8b(message: str, history: list, temperature: float, max_new_tokens: int) -> str:
-    # Initialize the conversation with a system prompt
-    conversation = [{"role": "system", "content": f"{prompt}"}]
-
-    flat_history = [item for sublist in history for item in sublist]
-
-    if len(flat_history) > 16:
-        flat_history = flat_history[-16:]
-
-    # Rebuild the conversation from the trimmed history
-    for i in range(0, len(flat_history), 2):
-        conversation.extend([
-            {"role": "user", "content": flat_history[i]},
-            {"role": "assistant", "content": flat_history[i + 1]}
-        ])
-
-    # Add the current user message to the conversation
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+model_id = "DeepMount00/Lexora-Medium-7B"
+tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True,)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    trust_remote_code=True,
+)
+model.eval()
+
+
+@spaces.GPU(duration=90)
+def generate(
+    message: str,
+    chat_history: list[tuple[str, str]],
+    system_message: str = "",
+    max_new_tokens: int = 2048,
+    temperature: float = 0.0001,
+    top_p: float = 1.0,
+    top_k: int = 50,
+    repetition_penalty: float = 1.0,
+) -> Iterator[str]:
+    conversation = [{"role": "system", "content": system_message}]
+    for user, assistant in chat_history:
+        conversation.extend(
+            [
+                {"role": "user", "content": user},
+                {"role": "assistant", "content": assistant},
+            ]
+        )
     conversation.append({"role": "user", "content": message})

-    # Prepare the input for the model
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
-
-    # Parameters for generating text
-    do_sample = True if temperature > 0 else False # Use sampling unless temperature is 0
-    real_temperature = max(temperature, 0.001) # Avoid zero temperature which disables sampling
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(model.device)

-    # Generate a response from the model
-    generated_ids = model.generate(
-        input_ids=input_ids,
+    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        {"input_ids": input_ids},
+        streamer=streamer,
         max_new_tokens=max_new_tokens,
-        do_sample=do_sample,
-        temperature=real_temperature,
-        eos_token_id=tokenizer.eos_token_id
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
     )
-    input_length = input_ids.size(1)
-    new_tokens = generated_ids[:, input_length:]
-    decoded = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
-
-    final_response = decoded.strip("assistant")
-    if final_response.startswith(':'):
-        final_response = final_response.lstrip(':').strip()
-
-    return final_response
-
-
-
-# Gradio block
-chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
-
-with gr.Blocks(fill_height=True, css=css) as demo:
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+
+
+chat_interface = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Textbox(
+            value="",
+            label="System message",
+            render=False,
+        ),
+        gr.Slider(
+            label="Max new tokens",
+            minimum=1,
+            maximum=MAX_MAX_NEW_TOKENS,
+            step=1,
+            value=DEFAULT_MAX_NEW_TOKENS,
+        ),
+        gr.Slider(
+            label="Temperature",
+            minimum=0,
+            maximum=4.0,
+            step=0.1,
+            value=0.001,
+        ),
+        gr.Slider(
+            label="Top-p (nucleus sampling)",
+            minimum=0.05,
+            maximum=1.0,
+            step=0.05,
+            value=1.0,
+        ),
+        gr.Slider(
+            label="Top-k",
+            minimum=1,
+            maximum=1000,
+            step=1,
+            value=50,
+        ),
+        gr.Slider(
+            label="Repetition penalty",
+            minimum=1.0,
+            maximum=2.0,
+            step=0.05,
+            value=1.0,
+        ),
+    ],
+    stop_btn=None,
+    examples=[
+        ["Ciao! Come stai?"],
+    ],
+    cache_examples=False,
+)
+
+with gr.Blocks(css="style.css", fill_height=True, theme="soft") as demo:
     gr.Markdown(DESCRIPTION)
     gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
-    gr.ChatInterface(
-        fn=chat_llama3_8b,
-        chatbot=chatbot,
-        fill_height=True,
-        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
-        additional_inputs=[
-            gr.Slider(minimum=0,
-                      maximum=1,
-                      step=0.1,
-                      value=0.001,
-                      label="Temperature",
-                      render=False),
-            gr.Slider(minimum=128,
-                      maximum=4096,
-                      step=1,
-                      value=512,
-                      label="Max new tokens",
-                      render=False),
-        ],
-        examples=[
-            ['Quanto è alta la torre di Pisa?'],
-            ["Se un mattone pesa 1kg più mezzo mattone, quanto pesa il mattone? rispondi impostando l'equazione"],
-            ['Quanto fa 2 * 9?'],
-            ['Scrivi una funzione python che calcola i primi n numeri di fibonacci'],
-            ['Inventa tre indovinelli tutti diversi con le relative risposte in formato json']
-        ],
-        cache_examples=False,
-    )
-
+    chat_interface.render()

 if __name__ == "__main__":
-    demo.launch()
+    demo.queue(max_size=20).launch()
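
The core change in this commit is that the new generate() no longer returns one finished string: it runs model.generate on a background thread and iterates a TextIteratorStreamer, yielding the accumulated text so gr.ChatInterface can render a live token stream. Below is a minimal, self-contained sketch of that threaded streaming pattern. It is illustrative only: the "gpt2" checkpoint and the plain-text prompt are placeholder assumptions, whereas the Space itself loads DeepMount00/Lexora-Medium-7B and builds its input with tokenizer.apply_chat_template.

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder checkpoint for illustration; the Space uses DeepMount00/Lexora-Medium-7B.
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Gradio is", return_tensors="pt")

# skip_prompt drops the input tokens from the stream; skip_special_tokens cleans the decoded text.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=40, do_sample=False)

# generate() blocks until decoding finishes, so it runs on a worker thread
# while the main thread consumes text chunks as they are produced.
thread = Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

partial = []
for chunk in streamer:  # each iteration yields newly generated text
    partial.append(chunk)
    print("".join(partial))  # in the Space, this accumulated string is yielded to gr.ChatInterface
thread.join()

In app.py the same loop yields "".join(outputs) from the generator function, which is what makes the Gradio chat window update token by token instead of waiting for the full reply.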