Update README.md
#3 by kasper-piskorski - opened
- README.md +49 -75
- generation_config.json +0 -1
- special_tokens_map.json +7 -0
- tokenizer.json +2 -2
- tokenizer_config.json +7 -1
README.md
CHANGED
@@ -6,35 +6,32 @@ language:
 - pt
 tags:
 - falcon3
-license: other
-license_name: falcon-llm-license
+license: other
+license_name: falcon-llm-license
 license_link: https://falconllm.tii.ae/falcon-terms-and-conditions.html
-library_name: transformers
 ---

-<div align="center">
-<img src="https://huggingface.co/datasets/tiiuae/documentation-images/resolve/main/general/falco3-logo.png" alt="drawing" width="500"/>
-</div>

 # Falcon3-7B-Base

-**Falcon3** family of Open Foundation Models is a set of pretrained and instruct LLMs ranging from 1B to 10B.
+The **Falcon3** family of Open Foundation Models is a set of pretrained and instruct LLMs ranging from 1B to 10B parameters.

-This repository contains the **Falcon3-7B-Base**. It achieves state
-Falcon3-7B-Base supports 4 languages (
+This repository contains the **Falcon3-7B-Base**. It achieves state-of-the-art results (at the time of release) on reasoning, language understanding, instruction following, code and mathematics tasks.
+Falcon3-7B-Base supports 4 languages (English, French, Spanish, Portuguese) and a context length of up to 32K.

-⚠️ **This is a raw, pretrained model, which should be further finetuned for most
+⚠️ **This is a raw, pretrained model, which should be further finetuned using SFT, RLHF, continued pretraining, etc. for most use cases.**

 ## Model Details
 - Architecture
- -
+- Transformer-based causal decoder-only architecture
 - 28 decoder blocks
- -
- -
- -
- -
- -
- -
+- Grouped Query Attention (GQA) for faster inference: 12 query heads and 4 key-value heads
+- Wider head dimension: 256
+- High RoPE value to support long context understanding: 1000042
+- Uses SwiGLU and RMSNorm
+- 32K context length
+- 131K vocab size
+- Pretrained on 14 teratokens of data comprising web, code, STEM, high-quality and multilingual data, using 2048 H100 GPU chips
 - Supports EN, FR, ES, PT
 - Developed by [Technology Innovation Institute](https://www.tii.ae)
 - License: TII Falcon-LLM License 2.0
@@ -65,10 +62,7 @@ print(response[0]['generated_text'])
 <br>

 ## Benchmarks
-We report in the following table our internal pipeline benchmarks
-- We use [lm-evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness).
-- We report **raw scores**.
-- We use same batch-size across all models.
+We report in the following table our internal pipeline benchmarks:



@@ -79,7 +73,6 @@ We report in the following table our internal pipeline benchmarks.
 <col style="width: 7%;">
 <col style="width: 7%;">
 <col style="width: 7%;">
-<col style="width: 7%;">
 <col style="background-color: rgba(80, 15, 213, 0.5); width: 7%;">
 </colgroup>
 <thead>
@@ -87,7 +80,6 @@
 <th>Category</th>
 <th>Benchmark</th>
 <th>Llama3.1-8B</th>
-<th>Qwen2-7B</th>
 <th>Qwen2.5-7B</th>
 <th>gemma-2-9b</th>
 <th>Falcon3-7B-Base</th>
@@ -98,119 +90,101 @@
 <td rowspan="3">General</td>
 <td>MMLU (5-shot)</td>
 <td>65.2</td>
-<td>
-<td>
-<td>-</td>
+<td><b>74.2</b></td>
+<td>70.8</td>
 <td>67.5</td>
 </tr>
 <tr>
 <td>MMLU-PRO (5-shot)</td>
 <td>32.7</td>
-<td>
-<td>
-<td>-</td>
+<td><b>43.5</b></td>
+<td>41.4</td>
 <td>39.2</td>
 </tr>
 <tr>
 <td>IFEval</td>
 <td>12.0</td>
-<td>30.6</td>
 <td>33.9</td>
-<td
-<td>34.3</td>
+<td>21.2</td>
+<td><b>34.3</b></td>
 </tr>
 <tr>
 <td rowspan="2">Math</td>
 <td>GSM8K (5-shot)</td>
 <td>49.4</td>
-<td>
-<td>
-<td>-</td>
+<td><b>82.9</b></td>
+<td>69.1</td>
 <td>76.2</td>
 </tr>
 <tr>
-<td>MATH(4-shot)</td>
+<td>MATH Lvl-5 (4-shot)</td>
 <td>4.1</td>
-<td>17.5</td>
 <td>15.5</td>
-<td
-<td>18.0</td>
+<td>10.5</td>
+<td><b>18.0</b></td>
 </tr>
 <tr>
 <td rowspan="4">Reasoning</td>
 <td>Arc Challenge (25-shot)</td>
-<td>
-<td>
-<td>
-<td
-<td>59.6</td>
+<td>58.2</td>
+<td>63.2</td>
+<td><b>67.5</b></td>
+<td>63.1</td>
 </tr>
 <tr>
 <td>GPQA (0-shot)</td>
 <td>31.0</td>
-<td>31.9</td>
 <td>33.0</td>
-<td
-<td>35.5</td>
+<td>33.4</td>
+<td><b>35.5</b></td>
 </tr>
 <tr>
 <td>MUSR (0-shot)</td>
 <td>38.0</td>
-<td>44.1</td>
 <td>44.2</td>
-<td
-<td>47.3</td>
+<td>45.3</td>
+<td><b>47.3</b></td>
 </tr>
 <tr>
 <td>BBH (3-shot)</td>
 <td>46.5</td>
-<td>53.3</td>
 <td>54.0</td>
-<td
+<td><b>54.3</b></td>
 <td>51.0</td>
 </tr>
 <tr>
 <td rowspan="4">CommonSense Understanding</td>
 <td>PIQA (0-shot)</td>
-<td>
-<td>79.
-<td>
-<td
-<td>77.7</td>
+<td>81.2</td>
+<td>79.9</td>
+<td><b>82.9</b></td>
+<td>79.1</td>
 </tr>
 <tr>
 <td>SciQ (0-shot)</td>
-<td>
-<td>95.
-<td>
-<td
-<td>95.3</td>
+<td>94.6</td>
+<td>95.2</td>
+<td><b>97.1</b></td>
+<td>92.4</td>
 </tr>
 <tr>
 <td>Winogrande (0-shot)</td>
 <td>74.0</td>
-<td>72.1</td>
 <td>72.9</td>
-<td
+<td><b>74.2</b></td>
 <td>71.0</td>
 </tr>
 <tr>
 <td>OpenbookQA (0-shot)</td>
-<td>
-<td>
-<td>
-<td
-<td>31.4</td>
+<td>44.8</td>
+<td>47.0</td>
+<td><b>47.2</b></td>
+<td>43.8</td>
 </tr>
 </tbody>
 </table>

-## Useful links
-- View our [release blogpost](https://huggingface.co/blog/falcon3).
-- Feel free to join [our discord server](https://discord.gg/fwXpMyGc) if you have any questions or to interact with our researchers and developers.
-
 ## Technical Report
-
 Coming soon....

 ## Citation
@@ -218,7 +192,7 @@ If Falcon3 family were helpful to your work, feel free to give us a cite.

 ```
 @misc{Falcon3,
-title = {Falcon 3 family of Open
+title = {The Falcon 3 family of Open Models},
 author = {TII Team},
 month = {December},
 year = {2024}
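For reference, the second hunk header above shows that the README keeps a `pipeline`-based usage snippet ending in `print(response[0]['generated_text'])`, which is not itself part of the changed lines. Below is only a minimal sketch of such usage; the Hub id `tiiuae/Falcon3-7B-Base`, the dtype and the generation settings are assumptions.

```python
# Hypothetical usage sketch; model id and generation settings are assumptions.
import torch
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="tiiuae/Falcon3-7B-Base",  # assumed Hub id for this repository
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Base (non-instruct) model: plain text continuation, no chat template.
response = generator("The key architectural choices behind Falcon3 are", max_new_tokens=64)
print(response[0]["generated_text"])
```

As the README's own warning notes, this is a raw pretrained model, so it is prompted with plain continuations rather than a chat template.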
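The removed README bullets state that the table's scores come from [lm-evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness), reported as raw scores with the same batch size for every model. A sketch of reproducing a single row with that harness' Python API follows; the task name, few-shot setting and batch size are assumptions rather than the exact internal pipeline configuration.

```python
# Sketch only: task, few-shot count and batch size are assumptions.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=tiiuae/Falcon3-7B-Base,dtype=bfloat16",
    tasks=["mmlu"],     # corresponds to the MMLU (5-shot) row
    num_fewshot=5,
    batch_size=16,      # the removed bullet notes one batch size for all models
)
print(results["results"]["mmlu"])
```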
generation_config.json
CHANGED
@@ -1,6 +1,5 @@
 {
   "_from_model_config": true,
-  "bos_token_id": 11,
   "eos_token_id": 11,
   "transformers_version": "4.46.1"
 }
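This change drops the pinned `bos_token_id` from generation_config.json while keeping `eos_token_id: 11`. A quick way to see the effect after the update, assuming the published Hub id:

```python
# After this change, only eos_token_id remains pinned in generation_config.json.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("tiiuae/Falcon3-7B-Base")  # assumed Hub id
print(gen_cfg.eos_token_id)  # 11
print(gen_cfg.bos_token_id)  # None once bos_token_id is no longer set here
```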
special_tokens_map.json
CHANGED
@@ -30,5 +30,12 @@
     "normalized": false,
     "rstrip": false,
     "single_word": false
-  }
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
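Together with the tokenizer.json and tokenizer_config.json edits below, this registers `<|pad|>` as the pad token on the previously unused id 2023. A small sanity check of that mapping, assuming the published Hub id:

```python
# Sanity check of the new pad token mapping; Hub id is an assumption.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("tiiuae/Falcon3-7B-Base")
print(tok.pad_token)                         # "<|pad|>"
print(tok.convert_tokens_to_ids("<|pad|>"))  # 2023, the id remapped in tokenizer.json
```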
tokenizer.json
CHANGED
@@ -18212,7 +18212,7 @@
     },
     {
       "id": 2023,
-      "content": "
+      "content": "<|pad|>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -20280,7 +20280,7 @@
       ">>UNUSED_1894<<": 2020,
       ">>UNUSED_1895<<": 2021,
       ">>UNUSED_1896<<": 2022,
-      "
+      "<|pad|>": 2023,
       "!": 2024,
       "\"": 2025,
       "#": 2026,
tokenizer_config.json
CHANGED
@@ -16186,7 +16186,7 @@
       "special": true
     },
     "2023": {
-      "content": "
+      "content": "<|pad|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -16221,6 +16221,12 @@
   ],
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
   "model_max_length": 32768,
+  "pad_token": "<|pad|>",
   "tokenizer_class": "PreTrainedTokenizerFast"
 }
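With `pad_token` and `model_input_names` now present in tokenizer_config.json, batched encoding with padding should work without further setup. A minimal sketch, assuming the published Hub id:

```python
# Minimal batched-encoding sketch; Hub id is an assumption.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("tiiuae/Falcon3-7B-Base")
batch = tok(
    ["Falcon3-7B-Base supports English, French, Spanish and Portuguese.", "Short prompt"],
    padding=True,           # shorter sequences are padded with "<|pad|>" (id 2023)
    return_tensors="pt",
)
print(list(batch.keys()))   # ['input_ids', 'attention_mask'], matching model_input_names
```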