NickyNicky committed on
Commit
781a905
1 Parent(s): 041bf80

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +185 -2
README.md CHANGED
@@ -1,2 +1,185 @@
1
- - oasst2_chatML_Cluster_2: future experts Cluster_2
2
- - Epoch: 7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - OpenAssistant/oasst2
5
+ language:
6
+ - bg
7
+ - ca
8
+ - cs
9
+ - da
10
+ - de
11
+ - en
12
+ - es
13
+ - fr
14
+ - hr
15
+ - hu
16
+ - it
17
+ - nl
18
+ - pl
19
+ - pt
20
+ - ro
21
+ - ru
22
+ - sl
23
+ - sr
24
+ - sv
25
+ - uk
26
+
27
+ library_name: transformers
28
+
29
+ widget:
30
+ - text: |
31
+ <bos><start_of_turn>system
32
+ You are a helpful AI assistant.<end_of_turn>
33
+ <start_of_turn>user
34
+ What is the meaning of life in the current time?<end_of_turn>
35
+ <start_of_turn>model
36
+
37
+ ---
38
+
39
+
40
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/641b435ba5f876fe30c5ae0a/YXqUXFjX8uIJT-mdOnM1h.png)
41
+
42
+ ```
43
+ reference data model:
44
+
45
+ datasets:
46
+ - lang: "bg,ca,cs,da,de,en,es,fr,hr,hu,it,nl,pl,pt,ro,ru,sl,sr,sv,uk"
47
+ link: https://huggingface.co/datasets/NickyNicky/oasst2_clusters
48
+
49
+ model:
50
+ - google/gemma-2b-it
51
+ Link:
52
+ https://huggingface.co/google/gemma-2b-it
53
+
54
+ Epoch: 7
55
+
56
+ future experts: Cluster_2
57
+
58
+ Eval model:
59
+ - link:
60
+ soon
61
+
62
+ ```
63
+
64
+
65
+ ## Installation
66
+
67
+
68
+ ```Python
69
+ !python -m pip install --upgrade pip
70
+ !pip install "torch>=2.1.1" -U
71
+ !pip install torchaudio==2.2.0
72
+ !pip install -q datasets trl peft bitsandbytes sentencepiece wandb
73
+ !pip install -q accelerate safetensors deepspeed
74
+ !pip install -q scipy ninja -U
75
+ !pip install -q -U transformers==4.38.0
76
+ ```
77
+
78
+
79
+ ## Version
80
+ ```py
81
+ import torch
82
+ torch.__version__
83
+ #OUTPUTS: ('2.2.0+cu121' )
84
+ ```
85
+
86
+ ## How to use
87
+ ```py
88
+
89
+ from transformers import (
90
+ AutoModelForCausalLM,
91
+ AutoTokenizer,
92
+ BitsAndBytesConfig,
93
+ HfArgumentParser,
94
+ TrainingArguments,
95
+ pipeline,
96
+ logging,
97
+ GenerationConfig,
98
+ TextIteratorStreamer,
99
+ )
100
+
101
+ from transformers import StoppingCriteria, StoppingCriteriaList
102
+
103
+ import torch
104
+
105
+ model_id='NickyNicky/gemma-2b-it_oasst2_chatML_Cluster_2_V1'  # Hub repo id used for both the model and the tokenizer below
106
+
107
+ model = AutoModelForCausalLM.from_pretrained(model_id,
108
+ device_map="auto",
109
+ trust_remote_code=True,
110
+ torch_dtype=torch.bfloat16,
111
+ attn_implementation="flash_attention_2",  # NOTE(review): presumably requires the flash-attn package, which the install cell does not list - confirm
112
+ # load_in_4bit=True,
113
+ # low_cpu_mem_usage= True,
114
+
115
+ )
116
+
117
+ max_length=2055  # forwarded to the tokenizer's from_pretrained call below
118
+ print("max_length",max_length)
119
+
120
+
121
+ tokenizer = AutoTokenizer.from_pretrained(model_id,
122
+ # use_fast = False,
123
+ max_length=max_length,)
124
+
125
+
126
+ class ListOfTokensStoppingCriteria(StoppingCriteria):
127
+ """
128
+ Stopping criterion based on a list of specific stop tokens: generation stops once the sequence ends with the token IDs of any of them.
129
+ """
130
+ def __init__(self, tokenizer, stop_tokens):
131
+ self.tokenizer = tokenizer
132
+ # Encode each stop token (without special tokens) and store the resulting ID lists
133
+ self.stop_token_ids_list = [tokenizer.encode(stop_token, add_special_tokens=False) for stop_token in stop_tokens]
134
+
135
+ def __call__(self, input_ids, scores, **kwargs):
136
+ # Check whether the last tokens of the first sequence (input_ids[0]) match any of the stop-token ID lists
137
+ for stop_token_ids in self.stop_token_ids_list:
138
+ len_stop_tokens = len(stop_token_ids)
139
+ if len(input_ids[0]) >= len_stop_tokens:
140
+ if input_ids[0, -len_stop_tokens:].tolist() == stop_token_ids:
141
+ return True
142
+ return False
143
+
144
+ # Using the custom stopping criterion
144
+ stop_tokens = ["<end_of_turn>"] # List of stop tokens
145
+
146
+ # Initialize the stopping criterion with the tokenizer and the list of stop tokens
147
+ stopping_criteria = ListOfTokensStoppingCriteria(tokenizer, stop_tokens)
148
+
149
+ # Wrap the stopping criterion in a StoppingCriteriaList
150
+ stopping_criteria_list = StoppingCriteriaList([stopping_criteria])
152
+
153
+
154
+
155
+ #EXAMPLE #1 (Spanish prompt; overwritten by EXAMPLE #2 below if both assignments are run)
156
+ txt="""<bos><start_of_turn>system
157
+ You are a helpful AI assistant.<end_of_turn>
158
+ <start_of_turn>user
159
+ Me dices los diferentes tipos de reciclaje que suelen existir en las ciudades europeas<end_of_turn>
160
+ <start_of_turn>model
161
+ """
162
+
163
+ #EXAMPLE #2 (this assignment replaces the txt from EXAMPLE #1; keep only the one you want to run)
164
+ txt="""<bos><start_of_turn>system
165
+ You are a helpful AI assistant.<end_of_turn>
166
+ <start_of_turn>user
167
+ What is the meaning of life in the current time?<end_of_turn>
168
+ <start_of_turn>model
169
+ """
170
+
171
+ inputs = tokenizer.encode(txt, return_tensors="pt").to("cuda")
172
+
173
+ generation_config = GenerationConfig(
174
+ max_new_tokens=max_new_tokens,
175
+ temperature=0.55,
176
+ #top_p=0.9,
177
+ #top_k=len_tokens,
178
+ repetition_penalty=1.1,
179
+ do_sample=True,
180
+ )
181
+ outputs = model.generate(generation_config=generation_config,
182
+ input_ids=inputs,
183
+ stopping_criteria=stopping_criteria_list,)
184
+ tokenizer.decode(outputs[0], skip_special_tokens=False) #True
185
+ ```