webersni commited on
Commit
1e6ff46
1 Parent(s): 3880684

add tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "2021": 50347,
3
+ "2025": 50403,
4
+ "2030": 50339,
5
+ "2050": 50401,
6
+ "Agreement": 50344,
7
+ "BNP": 50370,
8
+ "BlackRock": 50480,
9
+ "CO2": 50365,
10
+ "Carbon": 50358,
11
+ "Committee": 50329,
12
+ "Corporate": 50408,
13
+ "ESG": 50360,
14
+ "Eni": 50476,
15
+ "Finance": 50493,
16
+ "GHG": 50322,
17
+ "Governance": 50308,
18
+ "Greenhouse": 50463,
19
+ "IFC": 50475,
20
+ "Investment": 50369,
21
+ "Operational": 50486,
22
+ "Paribas": 50385,
23
+ "Principles": 50494,
24
+ "Risk": 50319,
25
+ "Sustainability": 50362,
26
+ "Sustainable": 50341,
27
+ "achieve": 50379,
28
+ "achieving": 50498,
29
+ "across": 50299,
30
+ "activities": 50283,
31
+ "addition": 50287,
32
+ "additional": 50367,
33
+ "adverse": 50411,
34
+ "adversely": 50497,
35
+ "affect": 50320,
36
+ "already": 50496,
37
+ "annual": 50342,
38
+ "annually": 50488,
39
+ "approach": 50311,
40
+ "areas": 50318,
41
+ "assess": 50378,
42
+ "assessment": 50351,
43
+ "asset": 50325,
44
+ "basis": 50473,
45
+ "bond": 50418,
46
+ "bonds": 50452,
47
+ "buildings": 50404,
48
+ "businesses": 50309,
49
+ "caused": 50410,
50
+ "cement": 50492,
51
+ "challenges": 50400,
52
+ "claims": 50470,
53
+ "clients": 50289,
54
+ "commitment": 50337,
55
+ "commitments": 50465,
56
+ "committed": 50334,
57
+ "communities": 50389,
58
+ "companies": 50271,
59
+ "compared": 50467,
60
+ "conditions": 50376,
61
+ "considered": 50479,
62
+ "construction": 50462,
63
+ "consumption": 50314,
64
+ "continued": 50443,
65
+ "contribute": 50405,
66
+ "corporate": 50307,
67
+ "costs": 50275,
68
+ "countries": 50328,
69
+ "criteria": 50477,
70
+ "customer": 50395,
71
+ "customers": 50279,
72
+ "decision": 50481,
73
+ "decisions": 50412,
74
+ "deliver": 50437,
75
+ "developing": 50381,
76
+ "developments": 50474,
77
+ "dioxide": 50471,
78
+ "directly": 50427,
79
+ "disasters": 50432,
80
+ "distribution": 50429,
81
+ "economy": 50304,
82
+ "efforts": 50324,
83
+ "electricity": 50291,
84
+ "emerging": 50489,
85
+ "emission": 50388,
86
+ "emissions": 50266,
87
+ "employees": 50332,
88
+ "engagement": 50397,
89
+ "ensure": 50399,
90
+ "environmental": 50268,
91
+ "equity": 50434,
92
+ "equivalent": 50457,
93
+ "experience": 50448,
94
+ "exposed": 50386,
95
+ "exposure": 50343,
96
+ "facilities": 50336,
97
+ "factors": 50301,
98
+ "failure": 50499,
99
+ "finance": 50298,
100
+ "financing": 50294,
101
+ "fiscal": 50436,
102
+ "flooding": 50407,
103
+ "floods": 50449,
104
+ "following": 50371,
105
+ "footprint": 50338,
106
+ "fossil": 50374,
107
+ "fuels": 50375,
108
+ "funds": 50398,
109
+ "further": 50315,
110
+ "goals": 50348,
111
+ "governance": 50345,
112
+ "greater": 50444,
113
+ "greenhouse": 50323,
114
+ "identify": 50377,
115
+ "impacts": 50272,
116
+ "included": 50393,
117
+ "increase": 50280,
118
+ "increased": 50293,
119
+ "increases": 50446,
120
+ "indirect": 50445,
121
+ "industry": 50292,
122
+ "infrastructure": 50310,
123
+ "initiative": 50424,
124
+ "initiatives": 50316,
125
+ "innovative": 50491,
126
+ "institutions": 50450,
127
+ "insurance": 50296,
128
+ "invested": 50439,
129
+ "investing": 50392,
130
+ "investment": 50269,
131
+ "investments": 50282,
132
+ "investors": 50356,
133
+ "launched": 50440,
134
+ "least": 50487,
135
+ "legislation": 50485,
136
+ "lending": 50368,
137
+ "loan": 50396,
138
+ "loans": 50417,
139
+ "longer": 50431,
140
+ "losses": 50346,
141
+ "mainly": 50466,
142
+ "manage": 50326,
143
+ "managing": 50421,
144
+ "manufacturing": 50425,
145
+ "materials": 50357,
146
+ "methodology": 50483,
147
+ "metrics": 50442,
148
+ "mitigate": 50428,
149
+ "mortgage": 50459,
150
+ "objectives": 50387,
151
+ "operate": 50415,
152
+ "operating": 50372,
153
+ "operational": 50302,
154
+ "operations": 50270,
155
+ "opportunities": 50278,
156
+ "others": 50472,
157
+ "overall": 50366,
158
+ "particular": 50390,
159
+ "partners": 50482,
160
+ "patterns": 50438,
161
+ "planning": 50435,
162
+ "plans": 50383,
163
+ "plants": 50350,
164
+ "policies": 50353,
165
+ "portfolio": 50277,
166
+ "portfolios": 50363,
167
+ "possible": 50420,
168
+ "potential": 50281,
169
+ "practices": 50394,
170
+ "prices": 50484,
171
+ "processes": 50354,
172
+ "profitability": 50454,
173
+ "provide": 50313,
174
+ "provides": 50409,
175
+ "purchase": 50478,
176
+ "reduce": 50284,
177
+ "reduced": 50422,
178
+ "reducing": 50359,
179
+ "reduction": 50306,
180
+ "regarding": 50413,
181
+ "regulations": 50321,
182
+ "regulatory": 50295,
183
+ "relating": 50461,
184
+ "renewable": 50274,
185
+ "reputation": 50327,
186
+ "reputational": 50469,
187
+ "requirements": 50335,
188
+ "resilience": 50433,
189
+ "resulting": 50361,
190
+ "returns": 50495,
191
+ "risks": 50265,
192
+ "sands": 50441,
193
+ "scenario": 50290,
194
+ "scenarios": 50330,
195
+ "sectors": 50312,
196
+ "several": 50416,
197
+ "severity": 50468,
198
+ "significantly": 50458,
199
+ "society": 50364,
200
+ "solar": 50340,
201
+ "solutions": 50355,
202
+ "sources": 50380,
203
+ "stakeholders": 50384,
204
+ "standards": 50349,
205
+ "statements": 50453,
206
+ "strategic": 50300,
207
+ "strategies": 50391,
208
+ "strategy": 50286,
209
+ "suppliers": 50402,
210
+ "supply": 50297,
211
+ "supporting": 50490,
212
+ "sustainability": 50285,
213
+ "sustainable": 50288,
214
+ "systems": 50333,
215
+ "taken": 50460,
216
+ "targets": 50305,
217
+ "technologies": 50352,
218
+ "temperature": 50423,
219
+ "therefore": 50456,
220
+ "tonnes": 50447,
221
+ "towards": 50414,
222
+ "transactions": 50451,
223
+ "transition": 50273,
224
+ "transport": 50419,
225
+ "unable": 50464,
226
+ "uncertainty": 50406,
227
+ "various": 50382,
228
+ "vehicles": 50455,
229
+ "waste": 50331,
230
+ "–": 50317,
231
+ "—": 50430,
232
+ "‘": 50426,
233
+ "’": 50267,
234
+ "“": 50373,
235
+ "•": 50276,
236
+ "€": 50303
237
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": {
4
+ "__type": "AddedToken",
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "cls_token": {
12
+ "__type": "AddedToken",
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eos_token": {
20
+ "__type": "AddedToken",
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "errors": "replace",
28
+ "mask_token": {
29
+ "__type": "AddedToken",
30
+ "content": "<mask>",
31
+ "lstrip": true,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false
35
+ },
36
+ "model_max_length": 512,
37
+ "name_or_path": "./envclaim-climatebert",
38
+ "pad_token": {
39
+ "__type": "AddedToken",
40
+ "content": "<pad>",
41
+ "lstrip": false,
42
+ "normalized": true,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ },
46
+ "sep_token": {
47
+ "__type": "AddedToken",
48
+ "content": "</s>",
49
+ "lstrip": false,
50
+ "normalized": true,
51
+ "rstrip": false,
52
+ "single_word": false
53
+ },
54
+ "special_tokens_map_file": "pre_model/14102021_roberta/special_tokens_map.json",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": {
58
+ "__type": "AddedToken",
59
+ "content": "<unk>",
60
+ "lstrip": false,
61
+ "normalized": true,
62
+ "rstrip": false,
63
+ "single_word": false
64
+ }
65
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff