File size: 967 Bytes
6f4e44f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Registry of sub-tokenizers. Each entry carries:
#   name                - label of the sub-tokenizer
#   tokenizer_id        - numeric id (0..3 below, one per entry)
#   json_path           - path to the tokenizer JSON file
#   modular_json_path   - second path field; in every entry below it holds
#                         the same value as json_path
#   start_delimiter /
#   end_delimiter       - <start_NAME> / <end_NAME> pair matching the name
# NOTE(review): the "./" paths are presumably resolved relative to this
# file's directory - confirm against the loader.
tokenizers_info:
  - name: AA
    tokenizer_id: 0
    json_path: ./t5_tokenizer_AA_special.json
    modular_json_path: ./t5_tokenizer_AA_special.json
    start_delimiter: <start_AA>
    end_delimiter: <end_AA>

  - name: SMILES
    tokenizer_id: 1
    json_path: ./bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
    modular_json_path: ./bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
    start_delimiter: <start_SMILES>
    end_delimiter: <end_SMILES>

  - name: CELL_ATTRIBUTES
    tokenizer_id: 2
    json_path: ./cell_attributes_tokenizer.json
    modular_json_path: ./cell_attributes_tokenizer.json
    start_delimiter: <start_CELL_ATTRIBUTES>
    end_delimiter: <end_CELL_ATTRIBUTES>

  - name: GENE
    tokenizer_id: 3
    json_path: ./gene_tokenizer.json
    modular_json_path: ./gene_tokenizer.json
    start_delimiter: <start_GENE>
    end_delimiter: <end_GENE>
    # Only this entry sets minimal_token_id.
    # NOTE(review): presumably a lower bound for the ids assigned to this
    # tokenizer's tokens - confirm against the consumer of this config.
    minimal_token_id: 5000

# Top-level limits (siblings of tokenizers_info, not per-tokenizer).
# NOTE(review): presumably upper bounds on the overall token-id space and on
# the ids reserved for special tokens - confirm against the consumer.
max_possible_token_id: 100000
max_special_token_id: 500