Kotokin committed on
Commit
305e994
1 Parent(s): 67e6edf

Upload 15 files

README.md ADDED
@@ -0,0 +1,73 @@
+ ---
+ license: other
+ license_name: tongyi-qianwen
+ license_link: https://huggingface.co/Qwen/Qwen2-72B-Instruct/blob/main/LICENSE
+ language:
+ - en
+ pipeline_tag: text-generation
+ tags:
+ - chat
+ ---
+ # Turbcat 72b
+ ![image/png](3.png)
+ ![image/png](4.png)
+ ![image/png](5.png)
+ ![image/png](6.png)
+ ![image/png](7.png)
+ ![image/png](8.png)
+ # Release notes
+ This is a direct upgrade over Cat 70B, with 2x the dataset size (2 GB -> 5 GB) and added Chinese support whose quality is on par with the original English dataset.
+ The medical CoT portion of the dataset was sponsored by steelskull, and the action-packed character-play portion was donated by Gryphe (aesir dataset). Note that the 8b is based on Llama 3 and has limited Chinese support due to the base model choice; its chat format is llama3. The 72b has more comprehensive Chinese support, and its format is chatml.
+
+ # Data Generation
+ Apart from the additions noted above, the data generation process is largely the same, except for the added Chinese Ph.D. entrance exam, Traditional Chinese, and Chinese storytelling data.
+
+ ## Special Highlights
+ * 20 postdocs (10 Chinese-speaking and 10 English-speaking doctors specialized in computational biology, biomedicine, biophysics, and biochemistry) participated in the annotation process.
+ * GRE and MCAT/Kaoyan questions were manually answered by the participants strictly using CoT, and BERT judges producing embeddings were trained on the provided annotations. For an example of BERT embedding visualization and scoring, please refer to https://huggingface.co/turboderp/Cat-Llama-3-70B-instruct
+ * Initial support for roleplay as API usage. When roleplaying as an API or function, the model does not produce irrelevant content that is not specified by the system prompt.
+
+ # Task coverage
+
+ ## Chinese tasks on par with English data
+ ![image/png](1.png)
+ For the Chinese portion of the dataset, we strictly kept its distribution and quality comparable to the English counterpart, as visualized by the close distance of the doublets. The overall QC is visualized by PCA on BERT embeddings.
+
+ ## Individual tasks quality-checked by doctors
+ For each cluster, we run QC using BERT embeddings on a UMAP:
+ ![image/png](2.png)
+ The outliers have been manually checked by doctors.
+
+ # Third-party datasets
+ Thanks to the following people for their tremendous support with dataset generation:
+ * steelskull for the medical CoT dataset generated with GPT-4o
+ * Gryphe for the wonderful action-packed dataset
+ * Turbca for being turbca
+
+ # Prompt format for 8b:
+ **llama3**
+ Example raw prompt:
+ ```
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+ CatGPT really likes its new cat ears and ends every message with Nyan_<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+ CatA: pats CatGPT cat ears<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+ CatGPT:
+ ```
+
+ # Prompt format for 72b:
+ **chatml**
+ Example raw prompt:
+ ```
+ <|im_start|>system
+ CatGPT really likes its new cat ears and ends every message with Nyan_<|im_end|>
+ <|im_start|>user
+ CatA: pats CatGPT cat ears<|im_end|>
+ <|im_start|>assistant
+ CatGPT:
+ ```
+
+ # Support
+ Please join https://discord.gg/DwGz54Mz for model support
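
A minimal sketch (assuming the `transformers` library and the tokenizer files in this repo; the repo id and branch come from huggingface-metadata.txt below) of producing the ChatML prompt above from the chat template shipped in tokenizer_config.json:

```python
# Illustrative sketch only: render the 72b ChatML prompt via the repo's chat template.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("turboderp/turbcat-instruct-72b-exl2", revision="4.0bpw")

messages = [
    {"role": "system", "content": "CatGPT really likes its new cat ears and ends every message with Nyan_"},
    {"role": "user", "content": "CatA: pats CatGPT cat ears"},
]
# add_generation_prompt=True appends the trailing "<|im_start|>assistant\n" turn.
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```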
added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "<|endoftext|>": 151643,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644
+ }
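
The token-to-id mapping above can be sanity-checked against the tokenizer; a small sketch, assuming `transformers`:

```python
# Sketch: confirm the ChatML control tokens resolve to the ids listed in added_tokens.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("turboderp/turbcat-instruct-72b-exl2", revision="4.0bpw")
assert tok.convert_tokens_to_ids("<|endoftext|>") == 151643
assert tok.convert_tokens_to_ids("<|im_start|>") == 151644
assert tok.convert_tokens_to_ids("<|im_end|>") == 151645
```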
config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "_name_or_path": "models/Qwen2-72B-Instruct",
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "eos_token_id": 151643,
+   "hidden_act": "silu",
+   "hidden_size": 8192,
+   "initializer_range": 0.02,
+   "intermediate_size": 29568,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 80,
+   "model_type": "qwen2",
+   "num_attention_heads": 64,
+   "num_hidden_layers": 80,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-06,
+   "rope_theta": 1000000.0,
+   "sliding_window": 131072,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float16",
+   "transformers_version": "4.41.1",
+   "use_cache": false,
+   "use_sliding_window": false,
+   "vocab_size": 152064,
+   "quantization_config": {
+     "quant_method": "exl2",
+     "version": "0.1.7",
+     "bits": 4.0,
+     "head_bits": 6,
+     "calibration": {
+       "rows": 115,
+       "length": 2048,
+       "dataset": "(default)"
+     }
+   }
+ }
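
The `quantization_config` block marks these weights as an EXL2 quant (4.0 bpw, 6-bit head), so they are meant for an EXL2-capable backend rather than a plain `transformers` load. A minimal loading sketch, assuming the `exllamav2` Python package (the same library whose version 0.1.7 appears above) and a hypothetical local download of the 4.0bpw branch:

```python
# Sketch only: load the EXL2-quantized 72b with exllamav2 and generate a short reply.
from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2DynamicGenerator

model_dir = "models/turbcat-instruct-72b-exl2-4.0bpw"  # hypothetical local path

config = ExLlamaV2Config(model_dir)        # reads the config.json shown above
model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy=True)   # allocate the cache while loading
model.load_autosplit(cache)                # split the 72b across available GPUs
tokenizer = ExLlamaV2Tokenizer(config)

generator = ExLlamaV2DynamicGenerator(model=model, cache=cache, tokenizer=tokenizer)
prompt = "<|im_start|>user\nHi there<|im_end|>\n<|im_start|>assistant\n"
print(generator.generate(prompt=prompt, max_new_tokens=64))
```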
generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "repetition_penalty": 1.05,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "transformers_version": "4.41.1"
+ }
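
These are only default sampling settings; whichever backend serves the weights can mirror them. As a hedged sketch, exllamav2's sampler exposes the same knobs:

```python
# Sketch: map the generation_config.json defaults onto exllamav2 sampler settings.
from exllamav2.generator import ExLlamaV2Sampler

settings = ExLlamaV2Sampler.Settings()
settings.temperature = 0.7
settings.top_k = 20
settings.top_p = 0.8
settings.token_repetition_penalty = 1.05
# e.g. generator.generate(prompt=..., max_new_tokens=..., gen_settings=settings)
```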
huggingface-metadata.txt ADDED
@@ -0,0 +1,9 @@
+ url: https://huggingface.co/turboderp/turbcat-instruct-72b-exl2
+ branch: 4.0bpw
+ download date: 2024-07-19 08:31:25
+ sha256sum:
+ 529c22edd9da94405cb0c725c105fcb342958e261ae24eedaea6f216d65943d2 output-00001-of-00005.safetensors
+ 575205e722e428c69287e5f1bc61c6d6582909e3f52660d585137cfea578e159 output-00002-of-00005.safetensors
+ b4d05d62aba6443da4ea0973dd33cc6a103e38b549c788f562fd497030403640 output-00003-of-00005.safetensors
+ e3c79448b65061313d97c8116d6ebde9b00921451781be4591b046ca609e099f output-00004-of-00005.safetensors
+ 046b0d1301bd056ea1ecfa2711b2f3cfa7c89b7567e35155a0716ad799772549 output-00005-of-00005.safetensors
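
The listed SHA-256 sums can be used to verify a download; a small sketch using only the Python standard library, assuming the shards sit in the current directory:

```python
# Sketch: verify downloaded shards against the sha256 sums listed above.
import hashlib

expected = {
    "output-00001-of-00005.safetensors": "529c22edd9da94405cb0c725c105fcb342958e261ae24eedaea6f216d65943d2",
    "output-00005-of-00005.safetensors": "046b0d1301bd056ea1ecfa2711b2f3cfa7c89b7567e35155a0716ad799772549",
    # ...remaining shards as listed in huggingface-metadata.txt
}

for name, digest in expected.items():
    h = hashlib.sha256()
    with open(name, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    print(name, "OK" if h.hexdigest() == digest else "MISMATCH")
```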
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
output-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:529c22edd9da94405cb0c725c105fcb342958e261ae24eedaea6f216d65943d2
+ size 8512656988
output-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:575205e722e428c69287e5f1bc61c6d6582909e3f52660d585137cfea578e159
+ size 8581376330
output-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4d05d62aba6443da4ea0973dd33cc6a103e38b549c788f562fd497030403640
+ size 8495051220
output-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e3c79448b65061313d97c8116d6ebde9b00921451781be4591b046ca609e099f
+ size 8566518504
output-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:046b0d1301bd056ea1ecfa2711b2f3cfa7c89b7567e35155a0716ad799772549
+ size 4417068700
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff