infgrad committed on
Commit
be3aa67
1 Parent(s): 6162316

add models

Browse files
Files changed (6) hide show
  1. README.md +1320 -1
  2. config.json +34 -0
  3. special_tokens_map.json +7 -0
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +13 -0
  6. vocab.txt +0 -0
README.md CHANGED
@@ -1,3 +1,1322 @@
1
  ---
2
- license: mit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ pipeline_tag: sentence-similarity
3
+ tags:
4
+ - sentence-transformers
5
+ - feature-extraction
6
+ - sentence-similarity
7
+ - mteb
8
+ model-index:
9
+ - name: stella-large-zh-v2
10
+ results:
11
+ - task:
12
+ type: STS
13
+ dataset:
14
+ type: C-MTEB/AFQMC
15
+ name: MTEB AFQMC
16
+ config: default
17
+ split: validation
18
+ revision: None
19
+ metrics:
20
+ - type: cos_sim_pearson
21
+ value: 47.34436411023816
22
+ - type: cos_sim_spearman
23
+ value: 49.947084806624545
24
+ - type: euclidean_pearson
25
+ value: 48.128834319004824
26
+ - type: euclidean_spearman
27
+ value: 49.947064694876815
28
+ - type: manhattan_pearson
29
+ value: 48.083561270166484
30
+ - type: manhattan_spearman
31
+ value: 49.90207128584442
32
+ - task:
33
+ type: STS
34
+ dataset:
35
+ type: C-MTEB/ATEC
36
+ name: MTEB ATEC
37
+ config: default
38
+ split: test
39
+ revision: None
40
+ metrics:
41
+ - type: cos_sim_pearson
42
+ value: 50.97998570817664
43
+ - type: cos_sim_spearman
44
+ value: 53.11852606980578
45
+ - type: euclidean_pearson
46
+ value: 55.12610520736481
47
+ - type: euclidean_spearman
48
+ value: 53.11852832108405
49
+ - type: manhattan_pearson
50
+ value: 55.10299116717361
51
+ - type: manhattan_spearman
52
+ value: 53.11304196536268
53
+ - task:
54
+ type: Classification
55
+ dataset:
56
+ type: mteb/amazon_reviews_multi
57
+ name: MTEB AmazonReviewsClassification (zh)
58
+ config: zh
59
+ split: test
60
+ revision: 1399c76144fd37290681b995c656ef9b2e06e26d
61
+ metrics:
62
+ - type: accuracy
63
+ value: 40.81799999999999
64
+ - type: f1
65
+ value: 39.022194031906444
66
+ - task:
67
+ type: STS
68
+ dataset:
69
+ type: C-MTEB/BQ
70
+ name: MTEB BQ
71
+ config: default
72
+ split: test
73
+ revision: None
74
+ metrics:
75
+ - type: cos_sim_pearson
76
+ value: 62.83544115057508
77
+ - type: cos_sim_spearman
78
+ value: 65.53509404838948
79
+ - type: euclidean_pearson
80
+ value: 64.08198144850084
81
+ - type: euclidean_spearman
82
+ value: 65.53509404760305
83
+ - type: manhattan_pearson
84
+ value: 64.08808420747272
85
+ - type: manhattan_spearman
86
+ value: 65.54907862648346
87
+ - task:
88
+ type: Clustering
89
+ dataset:
90
+ type: C-MTEB/CLSClusteringP2P
91
+ name: MTEB CLSClusteringP2P
92
+ config: default
93
+ split: test
94
+ revision: None
95
+ metrics:
96
+ - type: v_measure
97
+ value: 39.95428546140963
98
+ - task:
99
+ type: Clustering
100
+ dataset:
101
+ type: C-MTEB/CLSClusteringS2S
102
+ name: MTEB CLSClusteringS2S
103
+ config: default
104
+ split: test
105
+ revision: None
106
+ metrics:
107
+ - type: v_measure
108
+ value: 38.18454393512963
109
+ - task:
110
+ type: Reranking
111
+ dataset:
112
+ type: C-MTEB/CMedQAv1-reranking
113
+ name: MTEB CMedQAv1
114
+ config: default
115
+ split: test
116
+ revision: None
117
+ metrics:
118
+ - type: map
119
+ value: 85.4453602559479
120
+ - type: mrr
121
+ value: 88.1418253968254
122
+ - task:
123
+ type: Reranking
124
+ dataset:
125
+ type: C-MTEB/CMedQAv2-reranking
126
+ name: MTEB CMedQAv2
127
+ config: default
128
+ split: test
129
+ revision: None
130
+ metrics:
131
+ - type: map
132
+ value: 85.82731720256984
133
+ - type: mrr
134
+ value: 88.53230158730159
135
+ - task:
136
+ type: Retrieval
137
+ dataset:
138
+ type: C-MTEB/CmedqaRetrieval
139
+ name: MTEB CmedqaRetrieval
140
+ config: default
141
+ split: dev
142
+ revision: None
143
+ metrics:
144
+ - type: map_at_1
145
+ value: 24.459
146
+ - type: map_at_10
147
+ value: 36.274
148
+ - type: map_at_100
149
+ value: 38.168
150
+ - type: map_at_1000
151
+ value: 38.292
152
+ - type: map_at_3
153
+ value: 32.356
154
+ - type: map_at_5
155
+ value: 34.499
156
+ - type: mrr_at_1
157
+ value: 37.584
158
+ - type: mrr_at_10
159
+ value: 45.323
160
+ - type: mrr_at_100
161
+ value: 46.361999999999995
162
+ - type: mrr_at_1000
163
+ value: 46.412
164
+ - type: mrr_at_3
165
+ value: 42.919000000000004
166
+ - type: mrr_at_5
167
+ value: 44.283
168
+ - type: ndcg_at_1
169
+ value: 37.584
170
+ - type: ndcg_at_10
171
+ value: 42.63
172
+ - type: ndcg_at_100
173
+ value: 50.114000000000004
174
+ - type: ndcg_at_1000
175
+ value: 52.312000000000005
176
+ - type: ndcg_at_3
177
+ value: 37.808
178
+ - type: ndcg_at_5
179
+ value: 39.711999999999996
180
+ - type: precision_at_1
181
+ value: 37.584
182
+ - type: precision_at_10
183
+ value: 9.51
184
+ - type: precision_at_100
185
+ value: 1.554
186
+ - type: precision_at_1000
187
+ value: 0.183
188
+ - type: precision_at_3
189
+ value: 21.505
190
+ - type: precision_at_5
191
+ value: 15.514
192
+ - type: recall_at_1
193
+ value: 24.459
194
+ - type: recall_at_10
195
+ value: 52.32
196
+ - type: recall_at_100
197
+ value: 83.423
198
+ - type: recall_at_1000
199
+ value: 98.247
200
+ - type: recall_at_3
201
+ value: 37.553
202
+ - type: recall_at_5
203
+ value: 43.712
204
+ - task:
205
+ type: PairClassification
206
+ dataset:
207
+ type: C-MTEB/CMNLI
208
+ name: MTEB Cmnli
209
+ config: default
210
+ split: validation
211
+ revision: None
212
+ metrics:
213
+ - type: cos_sim_accuracy
214
+ value: 77.7269993986771
215
+ - type: cos_sim_ap
216
+ value: 86.8488070512359
217
+ - type: cos_sim_f1
218
+ value: 79.32095490716179
219
+ - type: cos_sim_precision
220
+ value: 72.6107226107226
221
+ - type: cos_sim_recall
222
+ value: 87.39770867430443
223
+ - type: dot_accuracy
224
+ value: 77.7269993986771
225
+ - type: dot_ap
226
+ value: 86.84218333157476
227
+ - type: dot_f1
228
+ value: 79.32095490716179
229
+ - type: dot_precision
230
+ value: 72.6107226107226
231
+ - type: dot_recall
232
+ value: 87.39770867430443
233
+ - type: euclidean_accuracy
234
+ value: 77.7269993986771
235
+ - type: euclidean_ap
236
+ value: 86.84880910178296
237
+ - type: euclidean_f1
238
+ value: 79.32095490716179
239
+ - type: euclidean_precision
240
+ value: 72.6107226107226
241
+ - type: euclidean_recall
242
+ value: 87.39770867430443
243
+ - type: manhattan_accuracy
244
+ value: 77.82321106434155
245
+ - type: manhattan_ap
246
+ value: 86.8152244713786
247
+ - type: manhattan_f1
248
+ value: 79.43262411347519
249
+ - type: manhattan_precision
250
+ value: 72.5725338491296
251
+ - type: manhattan_recall
252
+ value: 87.72504091653029
253
+ - type: max_accuracy
254
+ value: 77.82321106434155
255
+ - type: max_ap
256
+ value: 86.84880910178296
257
+ - type: max_f1
258
+ value: 79.43262411347519
259
+ - task:
260
+ type: Retrieval
261
+ dataset:
262
+ type: C-MTEB/CovidRetrieval
263
+ name: MTEB CovidRetrieval
264
+ config: default
265
+ split: dev
266
+ revision: None
267
+ metrics:
268
+ - type: map_at_1
269
+ value: 68.862
270
+ - type: map_at_10
271
+ value: 77.106
272
+ - type: map_at_100
273
+ value: 77.455
274
+ - type: map_at_1000
275
+ value: 77.459
276
+ - type: map_at_3
277
+ value: 75.457
278
+ - type: map_at_5
279
+ value: 76.254
280
+ - type: mrr_at_1
281
+ value: 69.125
282
+ - type: mrr_at_10
283
+ value: 77.13799999999999
284
+ - type: mrr_at_100
285
+ value: 77.488
286
+ - type: mrr_at_1000
287
+ value: 77.492
288
+ - type: mrr_at_3
289
+ value: 75.606
290
+ - type: mrr_at_5
291
+ value: 76.29599999999999
292
+ - type: ndcg_at_1
293
+ value: 69.02000000000001
294
+ - type: ndcg_at_10
295
+ value: 80.81099999999999
296
+ - type: ndcg_at_100
297
+ value: 82.298
298
+ - type: ndcg_at_1000
299
+ value: 82.403
300
+ - type: ndcg_at_3
301
+ value: 77.472
302
+ - type: ndcg_at_5
303
+ value: 78.892
304
+ - type: precision_at_1
305
+ value: 69.02000000000001
306
+ - type: precision_at_10
307
+ value: 9.336
308
+ - type: precision_at_100
309
+ value: 0.9990000000000001
310
+ - type: precision_at_1000
311
+ value: 0.101
312
+ - type: precision_at_3
313
+ value: 27.924
314
+ - type: precision_at_5
315
+ value: 17.492
316
+ - type: recall_at_1
317
+ value: 68.862
318
+ - type: recall_at_10
319
+ value: 92.308
320
+ - type: recall_at_100
321
+ value: 98.84100000000001
322
+ - type: recall_at_1000
323
+ value: 99.684
324
+ - type: recall_at_3
325
+ value: 83.193
326
+ - type: recall_at_5
327
+ value: 86.617
328
+ - task:
329
+ type: Retrieval
330
+ dataset:
331
+ type: C-MTEB/DuRetrieval
332
+ name: MTEB DuRetrieval
333
+ config: default
334
+ split: dev
335
+ revision: None
336
+ metrics:
337
+ - type: map_at_1
338
+ value: 25.063999999999997
339
+ - type: map_at_10
340
+ value: 78.02
341
+ - type: map_at_100
342
+ value: 81.022
343
+ - type: map_at_1000
344
+ value: 81.06
345
+ - type: map_at_3
346
+ value: 53.613
347
+ - type: map_at_5
348
+ value: 68.008
349
+ - type: mrr_at_1
350
+ value: 87.8
351
+ - type: mrr_at_10
352
+ value: 91.827
353
+ - type: mrr_at_100
354
+ value: 91.913
355
+ - type: mrr_at_1000
356
+ value: 91.915
357
+ - type: mrr_at_3
358
+ value: 91.508
359
+ - type: mrr_at_5
360
+ value: 91.758
361
+ - type: ndcg_at_1
362
+ value: 87.8
363
+ - type: ndcg_at_10
364
+ value: 85.753
365
+ - type: ndcg_at_100
366
+ value: 88.82900000000001
367
+ - type: ndcg_at_1000
368
+ value: 89.208
369
+ - type: ndcg_at_3
370
+ value: 84.191
371
+ - type: ndcg_at_5
372
+ value: 83.433
373
+ - type: precision_at_1
374
+ value: 87.8
375
+ - type: precision_at_10
376
+ value: 41.33
377
+ - type: precision_at_100
378
+ value: 4.8
379
+ - type: precision_at_1000
380
+ value: 0.48900000000000005
381
+ - type: precision_at_3
382
+ value: 75.767
383
+ - type: precision_at_5
384
+ value: 64.25999999999999
385
+ - type: recall_at_1
386
+ value: 25.063999999999997
387
+ - type: recall_at_10
388
+ value: 87.357
389
+ - type: recall_at_100
390
+ value: 97.261
391
+ - type: recall_at_1000
392
+ value: 99.309
393
+ - type: recall_at_3
394
+ value: 56.259
395
+ - type: recall_at_5
396
+ value: 73.505
397
+ - task:
398
+ type: Retrieval
399
+ dataset:
400
+ type: C-MTEB/EcomRetrieval
401
+ name: MTEB EcomRetrieval
402
+ config: default
403
+ split: dev
404
+ revision: None
405
+ metrics:
406
+ - type: map_at_1
407
+ value: 46.800000000000004
408
+ - type: map_at_10
409
+ value: 56.898
410
+ - type: map_at_100
411
+ value: 57.567
412
+ - type: map_at_1000
413
+ value: 57.593
414
+ - type: map_at_3
415
+ value: 54.167
416
+ - type: map_at_5
417
+ value: 55.822
418
+ - type: mrr_at_1
419
+ value: 46.800000000000004
420
+ - type: mrr_at_10
421
+ value: 56.898
422
+ - type: mrr_at_100
423
+ value: 57.567
424
+ - type: mrr_at_1000
425
+ value: 57.593
426
+ - type: mrr_at_3
427
+ value: 54.167
428
+ - type: mrr_at_5
429
+ value: 55.822
430
+ - type: ndcg_at_1
431
+ value: 46.800000000000004
432
+ - type: ndcg_at_10
433
+ value: 62.07
434
+ - type: ndcg_at_100
435
+ value: 65.049
436
+ - type: ndcg_at_1000
437
+ value: 65.666
438
+ - type: ndcg_at_3
439
+ value: 56.54
440
+ - type: ndcg_at_5
441
+ value: 59.492999999999995
442
+ - type: precision_at_1
443
+ value: 46.800000000000004
444
+ - type: precision_at_10
445
+ value: 7.84
446
+ - type: precision_at_100
447
+ value: 0.9169999999999999
448
+ - type: precision_at_1000
449
+ value: 0.096
450
+ - type: precision_at_3
451
+ value: 21.133
452
+ - type: precision_at_5
453
+ value: 14.099999999999998
454
+ - type: recall_at_1
455
+ value: 46.800000000000004
456
+ - type: recall_at_10
457
+ value: 78.4
458
+ - type: recall_at_100
459
+ value: 91.7
460
+ - type: recall_at_1000
461
+ value: 96.39999999999999
462
+ - type: recall_at_3
463
+ value: 63.4
464
+ - type: recall_at_5
465
+ value: 70.5
466
+ - task:
467
+ type: Classification
468
+ dataset:
469
+ type: C-MTEB/IFlyTek-classification
470
+ name: MTEB IFlyTek
471
+ config: default
472
+ split: validation
473
+ revision: None
474
+ metrics:
475
+ - type: accuracy
476
+ value: 47.98768757214313
477
+ - type: f1
478
+ value: 35.23884426992269
479
+ - task:
480
+ type: Classification
481
+ dataset:
482
+ type: C-MTEB/JDReview-classification
483
+ name: MTEB JDReview
484
+ config: default
485
+ split: test
486
+ revision: None
487
+ metrics:
488
+ - type: accuracy
489
+ value: 86.97936210131333
490
+ - type: ap
491
+ value: 56.292679530375736
492
+ - type: f1
493
+ value: 81.87001614762136
494
+ - task:
495
+ type: STS
496
+ dataset:
497
+ type: C-MTEB/LCQMC
498
+ name: MTEB LCQMC
499
+ config: default
500
+ split: test
501
+ revision: None
502
+ metrics:
503
+ - type: cos_sim_pearson
504
+ value: 71.17149643620844
505
+ - type: cos_sim_spearman
506
+ value: 77.48040046337948
507
+ - type: euclidean_pearson
508
+ value: 76.32337539923347
509
+ - type: euclidean_spearman
510
+ value: 77.4804004621894
511
+ - type: manhattan_pearson
512
+ value: 76.33275226275444
513
+ - type: manhattan_spearman
514
+ value: 77.48979843086128
515
+ - task:
516
+ type: Reranking
517
+ dataset:
518
+ type: C-MTEB/Mmarco-reranking
519
+ name: MTEB MMarcoReranking
520
+ config: default
521
+ split: dev
522
+ revision: None
523
+ metrics:
524
+ - type: map
525
+ value: 27.966807589556826
526
+ - type: mrr
527
+ value: 26.92023809523809
528
+ - task:
529
+ type: Retrieval
530
+ dataset:
531
+ type: C-MTEB/MMarcoRetrieval
532
+ name: MTEB MMarcoRetrieval
533
+ config: default
534
+ split: dev
535
+ revision: None
536
+ metrics:
537
+ - type: map_at_1
538
+ value: 66.15100000000001
539
+ - type: map_at_10
540
+ value: 75.048
541
+ - type: map_at_100
542
+ value: 75.374
543
+ - type: map_at_1000
544
+ value: 75.386
545
+ - type: map_at_3
546
+ value: 73.26700000000001
547
+ - type: map_at_5
548
+ value: 74.39
549
+ - type: mrr_at_1
550
+ value: 68.381
551
+ - type: mrr_at_10
552
+ value: 75.644
553
+ - type: mrr_at_100
554
+ value: 75.929
555
+ - type: mrr_at_1000
556
+ value: 75.93900000000001
557
+ - type: mrr_at_3
558
+ value: 74.1
559
+ - type: mrr_at_5
560
+ value: 75.053
561
+ - type: ndcg_at_1
562
+ value: 68.381
563
+ - type: ndcg_at_10
564
+ value: 78.669
565
+ - type: ndcg_at_100
566
+ value: 80.161
567
+ - type: ndcg_at_1000
568
+ value: 80.46799999999999
569
+ - type: ndcg_at_3
570
+ value: 75.3
571
+ - type: ndcg_at_5
572
+ value: 77.172
573
+ - type: precision_at_1
574
+ value: 68.381
575
+ - type: precision_at_10
576
+ value: 9.48
577
+ - type: precision_at_100
578
+ value: 1.023
579
+ - type: precision_at_1000
580
+ value: 0.105
581
+ - type: precision_at_3
582
+ value: 28.299999999999997
583
+ - type: precision_at_5
584
+ value: 17.98
585
+ - type: recall_at_1
586
+ value: 66.15100000000001
587
+ - type: recall_at_10
588
+ value: 89.238
589
+ - type: recall_at_100
590
+ value: 96.032
591
+ - type: recall_at_1000
592
+ value: 98.437
593
+ - type: recall_at_3
594
+ value: 80.318
595
+ - type: recall_at_5
596
+ value: 84.761
597
+ - task:
598
+ type: Classification
599
+ dataset:
600
+ type: mteb/amazon_massive_intent
601
+ name: MTEB MassiveIntentClassification (zh-CN)
602
+ config: zh-CN
603
+ split: test
604
+ revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
605
+ metrics:
606
+ - type: accuracy
607
+ value: 68.26160053799597
608
+ - type: f1
609
+ value: 65.96949453305112
610
+ - task:
611
+ type: Classification
612
+ dataset:
613
+ type: mteb/amazon_massive_scenario
614
+ name: MTEB MassiveScenarioClassification (zh-CN)
615
+ config: zh-CN
616
+ split: test
617
+ revision: 7d571f92784cd94a019292a1f45445077d0ef634
618
+ metrics:
619
+ - type: accuracy
620
+ value: 73.12037659717554
621
+ - type: f1
622
+ value: 72.69052407105445
623
+ - task:
624
+ type: Retrieval
625
+ dataset:
626
+ type: C-MTEB/MedicalRetrieval
627
+ name: MTEB MedicalRetrieval
628
+ config: default
629
+ split: dev
630
+ revision: None
631
+ metrics:
632
+ - type: map_at_1
633
+ value: 50.1
634
+ - type: map_at_10
635
+ value: 56.489999999999995
636
+ - type: map_at_100
637
+ value: 57.007
638
+ - type: map_at_1000
639
+ value: 57.06400000000001
640
+ - type: map_at_3
641
+ value: 55.25
642
+ - type: map_at_5
643
+ value: 55.93
644
+ - type: mrr_at_1
645
+ value: 50.3
646
+ - type: mrr_at_10
647
+ value: 56.591
648
+ - type: mrr_at_100
649
+ value: 57.108000000000004
650
+ - type: mrr_at_1000
651
+ value: 57.165
652
+ - type: mrr_at_3
653
+ value: 55.35
654
+ - type: mrr_at_5
655
+ value: 56.03
656
+ - type: ndcg_at_1
657
+ value: 50.1
658
+ - type: ndcg_at_10
659
+ value: 59.419999999999995
660
+ - type: ndcg_at_100
661
+ value: 62.28900000000001
662
+ - type: ndcg_at_1000
663
+ value: 63.9
664
+ - type: ndcg_at_3
665
+ value: 56.813
666
+ - type: ndcg_at_5
667
+ value: 58.044
668
+ - type: precision_at_1
669
+ value: 50.1
670
+ - type: precision_at_10
671
+ value: 6.859999999999999
672
+ - type: precision_at_100
673
+ value: 0.828
674
+ - type: precision_at_1000
675
+ value: 0.096
676
+ - type: precision_at_3
677
+ value: 20.433
678
+ - type: precision_at_5
679
+ value: 12.86
680
+ - type: recall_at_1
681
+ value: 50.1
682
+ - type: recall_at_10
683
+ value: 68.60000000000001
684
+ - type: recall_at_100
685
+ value: 82.8
686
+ - type: recall_at_1000
687
+ value: 95.7
688
+ - type: recall_at_3
689
+ value: 61.3
690
+ - type: recall_at_5
691
+ value: 64.3
692
+ - task:
693
+ type: Classification
694
+ dataset:
695
+ type: C-MTEB/MultilingualSentiment-classification
696
+ name: MTEB MultilingualSentiment
697
+ config: default
698
+ split: validation
699
+ revision: None
700
+ metrics:
701
+ - type: accuracy
702
+ value: 73.41000000000001
703
+ - type: f1
704
+ value: 72.87768282499509
705
+ - task:
706
+ type: PairClassification
707
+ dataset:
708
+ type: C-MTEB/OCNLI
709
+ name: MTEB Ocnli
710
+ config: default
711
+ split: validation
712
+ revision: None
713
+ metrics:
714
+ - type: cos_sim_accuracy
715
+ value: 73.4163508391987
716
+ - type: cos_sim_ap
717
+ value: 78.51058998215277
718
+ - type: cos_sim_f1
719
+ value: 75.3875968992248
720
+ - type: cos_sim_precision
721
+ value: 69.65085049239033
722
+ - type: cos_sim_recall
723
+ value: 82.15417106652588
724
+ - type: dot_accuracy
725
+ value: 73.4163508391987
726
+ - type: dot_ap
727
+ value: 78.51058998215277
728
+ - type: dot_f1
729
+ value: 75.3875968992248
730
+ - type: dot_precision
731
+ value: 69.65085049239033
732
+ - type: dot_recall
733
+ value: 82.15417106652588
734
+ - type: euclidean_accuracy
735
+ value: 73.4163508391987
736
+ - type: euclidean_ap
737
+ value: 78.51058998215277
738
+ - type: euclidean_f1
739
+ value: 75.3875968992248
740
+ - type: euclidean_precision
741
+ value: 69.65085049239033
742
+ - type: euclidean_recall
743
+ value: 82.15417106652588
744
+ - type: manhattan_accuracy
745
+ value: 73.03735787763942
746
+ - type: manhattan_ap
747
+ value: 78.4190891700083
748
+ - type: manhattan_f1
749
+ value: 75.32592950265573
750
+ - type: manhattan_precision
751
+ value: 69.3950177935943
752
+ - type: manhattan_recall
753
+ value: 82.36536430834214
754
+ - type: max_accuracy
755
+ value: 73.4163508391987
756
+ - type: max_ap
757
+ value: 78.51058998215277
758
+ - type: max_f1
759
+ value: 75.3875968992248
760
+ - task:
761
+ type: Classification
762
+ dataset:
763
+ type: C-MTEB/OnlineShopping-classification
764
+ name: MTEB OnlineShopping
765
+ config: default
766
+ split: test
767
+ revision: None
768
+ metrics:
769
+ - type: accuracy
770
+ value: 91.81000000000002
771
+ - type: ap
772
+ value: 89.35809579688139
773
+ - type: f1
774
+ value: 91.79220350456818
775
+ - task:
776
+ type: STS
777
+ dataset:
778
+ type: C-MTEB/PAWSX
779
+ name: MTEB PAWSX
780
+ config: default
781
+ split: test
782
+ revision: None
783
+ metrics:
784
+ - type: cos_sim_pearson
785
+ value: 30.10755999973859
786
+ - type: cos_sim_spearman
787
+ value: 36.221732138848864
788
+ - type: euclidean_pearson
789
+ value: 36.41120179336658
790
+ - type: euclidean_spearman
791
+ value: 36.221731188009436
792
+ - type: manhattan_pearson
793
+ value: 36.34865300346968
794
+ - type: manhattan_spearman
795
+ value: 36.17696483080459
796
+ - task:
797
+ type: STS
798
+ dataset:
799
+ type: C-MTEB/QBQTC
800
+ name: MTEB QBQTC
801
+ config: default
802
+ split: test
803
+ revision: None
804
+ metrics:
805
+ - type: cos_sim_pearson
806
+ value: 36.778975708100226
807
+ - type: cos_sim_spearman
808
+ value: 38.733929926753724
809
+ - type: euclidean_pearson
810
+ value: 37.13383498228113
811
+ - type: euclidean_spearman
812
+ value: 38.73374886550868
813
+ - type: manhattan_pearson
814
+ value: 37.175732896552404
815
+ - type: manhattan_spearman
816
+ value: 38.74120541657908
817
+ - task:
818
+ type: STS
819
+ dataset:
820
+ type: mteb/sts22-crosslingual-sts
821
+ name: MTEB STS22 (zh)
822
+ config: zh
823
+ split: test
824
+ revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
825
+ metrics:
826
+ - type: cos_sim_pearson
827
+ value: 65.97095922825076
828
+ - type: cos_sim_spearman
829
+ value: 68.87452938308421
830
+ - type: euclidean_pearson
831
+ value: 67.23101642424429
832
+ - type: euclidean_spearman
833
+ value: 68.87452938308421
834
+ - type: manhattan_pearson
835
+ value: 67.29909334410189
836
+ - type: manhattan_spearman
837
+ value: 68.89807985930508
838
+ - task:
839
+ type: STS
840
+ dataset:
841
+ type: C-MTEB/STSB
842
+ name: MTEB STSB
843
+ config: default
844
+ split: test
845
+ revision: None
846
+ metrics:
847
+ - type: cos_sim_pearson
848
+ value: 78.98860630733722
849
+ - type: cos_sim_spearman
850
+ value: 79.36601601355665
851
+ - type: euclidean_pearson
852
+ value: 78.77295944956447
853
+ - type: euclidean_spearman
854
+ value: 79.36585127278974
855
+ - type: manhattan_pearson
856
+ value: 78.82060736131619
857
+ - type: manhattan_spearman
858
+ value: 79.4395526421926
859
+ - task:
860
+ type: Reranking
861
+ dataset:
862
+ type: C-MTEB/T2Reranking
863
+ name: MTEB T2Reranking
864
+ config: default
865
+ split: dev
866
+ revision: None
867
+ metrics:
868
+ - type: map
869
+ value: 66.40501824507894
870
+ - type: mrr
871
+ value: 76.18463933756757
872
+ - task:
873
+ type: Retrieval
874
+ dataset:
875
+ type: C-MTEB/T2Retrieval
876
+ name: MTEB T2Retrieval
877
+ config: default
878
+ split: dev
879
+ revision: None
880
+ metrics:
881
+ - type: map_at_1
882
+ value: 27.095000000000002
883
+ - type: map_at_10
884
+ value: 76.228
885
+ - type: map_at_100
886
+ value: 79.865
887
+ - type: map_at_1000
888
+ value: 79.935
889
+ - type: map_at_3
890
+ value: 53.491
891
+ - type: map_at_5
892
+ value: 65.815
893
+ - type: mrr_at_1
894
+ value: 89.554
895
+ - type: mrr_at_10
896
+ value: 92.037
897
+ - type: mrr_at_100
898
+ value: 92.133
899
+ - type: mrr_at_1000
900
+ value: 92.137
901
+ - type: mrr_at_3
902
+ value: 91.605
903
+ - type: mrr_at_5
904
+ value: 91.88
905
+ - type: ndcg_at_1
906
+ value: 89.554
907
+ - type: ndcg_at_10
908
+ value: 83.866
909
+ - type: ndcg_at_100
910
+ value: 87.566
911
+ - type: ndcg_at_1000
912
+ value: 88.249
913
+ - type: ndcg_at_3
914
+ value: 85.396
915
+ - type: ndcg_at_5
916
+ value: 83.919
917
+ - type: precision_at_1
918
+ value: 89.554
919
+ - type: precision_at_10
920
+ value: 41.792
921
+ - type: precision_at_100
922
+ value: 4.997
923
+ - type: precision_at_1000
924
+ value: 0.515
925
+ - type: precision_at_3
926
+ value: 74.795
927
+ - type: precision_at_5
928
+ value: 62.675000000000004
929
+ - type: recall_at_1
930
+ value: 27.095000000000002
931
+ - type: recall_at_10
932
+ value: 82.694
933
+ - type: recall_at_100
934
+ value: 94.808
935
+ - type: recall_at_1000
936
+ value: 98.30600000000001
937
+ - type: recall_at_3
938
+ value: 55.156000000000006
939
+ - type: recall_at_5
940
+ value: 69.19
941
+ - task:
942
+ type: Classification
943
+ dataset:
944
+ type: C-MTEB/TNews-classification
945
+ name: MTEB TNews
946
+ config: default
947
+ split: validation
948
+ revision: None
949
+ metrics:
950
+ - type: accuracy
951
+ value: 51.929
952
+ - type: f1
953
+ value: 50.16876489927282
954
+ - task:
955
+ type: Clustering
956
+ dataset:
957
+ type: C-MTEB/ThuNewsClusteringP2P
958
+ name: MTEB ThuNewsClusteringP2P
959
+ config: default
960
+ split: test
961
+ revision: None
962
+ metrics:
963
+ - type: v_measure
964
+ value: 61.404157724658894
965
+ - task:
966
+ type: Clustering
967
+ dataset:
968
+ type: C-MTEB/ThuNewsClusteringS2S
969
+ name: MTEB ThuNewsClusteringS2S
970
+ config: default
971
+ split: test
972
+ revision: None
973
+ metrics:
974
+ - type: v_measure
975
+ value: 57.11418384351802
976
+ - task:
977
+ type: Retrieval
978
+ dataset:
979
+ type: C-MTEB/VideoRetrieval
980
+ name: MTEB VideoRetrieval
981
+ config: default
982
+ split: dev
983
+ revision: None
984
+ metrics:
985
+ - type: map_at_1
986
+ value: 52.1
987
+ - type: map_at_10
988
+ value: 62.956999999999994
989
+ - type: map_at_100
990
+ value: 63.502
991
+ - type: map_at_1000
992
+ value: 63.51599999999999
993
+ - type: map_at_3
994
+ value: 60.75000000000001
995
+ - type: map_at_5
996
+ value: 62.195
997
+ - type: mrr_at_1
998
+ value: 52.0
999
+ - type: mrr_at_10
1000
+ value: 62.907000000000004
1001
+ - type: mrr_at_100
1002
+ value: 63.452
1003
+ - type: mrr_at_1000
1004
+ value: 63.466
1005
+ - type: mrr_at_3
1006
+ value: 60.699999999999996
1007
+ - type: mrr_at_5
1008
+ value: 62.144999999999996
1009
+ - type: ndcg_at_1
1010
+ value: 52.1
1011
+ - type: ndcg_at_10
1012
+ value: 67.93299999999999
1013
+ - type: ndcg_at_100
1014
+ value: 70.541
1015
+ - type: ndcg_at_1000
1016
+ value: 70.91300000000001
1017
+ - type: ndcg_at_3
1018
+ value: 63.468
1019
+ - type: ndcg_at_5
1020
+ value: 66.08800000000001
1021
+ - type: precision_at_1
1022
+ value: 52.1
1023
+ - type: precision_at_10
1024
+ value: 8.34
1025
+ - type: precision_at_100
1026
+ value: 0.955
1027
+ - type: precision_at_1000
1028
+ value: 0.098
1029
+ - type: precision_at_3
1030
+ value: 23.767
1031
+ - type: precision_at_5
1032
+ value: 15.540000000000001
1033
+ - type: recall_at_1
1034
+ value: 52.1
1035
+ - type: recall_at_10
1036
+ value: 83.39999999999999
1037
+ - type: recall_at_100
1038
+ value: 95.5
1039
+ - type: recall_at_1000
1040
+ value: 98.4
1041
+ - type: recall_at_3
1042
+ value: 71.3
1043
+ - type: recall_at_5
1044
+ value: 77.7
1045
+ - task:
1046
+ type: Classification
1047
+ dataset:
1048
+ type: C-MTEB/waimai-classification
1049
+ name: MTEB Waimai
1050
+ config: default
1051
+ split: test
1052
+ revision: None
1053
+ metrics:
1054
+ - type: accuracy
1055
+ value: 87.12
1056
+ - type: ap
1057
+ value: 70.85284793227382
1058
+ - type: f1
1059
+ value: 85.55420883566512
1060
  ---
1061
+
1062
+ ## stella model
1063
+
1064
+ **新闻 | News**
1065
+
1066
+ **[2023-10-12]** 开源stella-base-zh-v2和stella-large-zh-v2, 效果更好且使用简单,**不需要任何前缀文本**。
1067
+ Released stella-base-zh-v2 and stella-large-zh-v2. The two models perform better, are simple to use,
1068
+ and **do not need any prefix text**.\
1069
+ **[2023-09-11]** 开源stella-base-zh和stella-large-zh
1070
+
1071
+ stella是一个通用的文本编码模型,主要有以下模型:
1072
+
1073
+ | Model Name | Model Size (GB) | Dimension | Sequence Length | Language | Need instruction for retrieval? |
1074
+ |:------------------:|:---------------:|:---------:|:---------------:|:--------:|:-------------------------------:|
1075
+ | stella-large-zh-v2 | 0.65 | 1024 | 1024 | Chinese | No |
1076
+ | stella-base-zh-v2 | 0.2 | 768 | 1024 | Chinese | No |
1077
+ | stella-large-zh | 0.65 | 1024 | 1024 | Chinese | Yes |
1078
+ | stella-base-zh | 0.2 | 768 | 1024 | Chinese | Yes |
1079
+
1080
+ 完整的训练思路和训练过程已记录在[博客](https://zhuanlan.zhihu.com/p/655322183),欢迎阅读讨论。
1081
+
1082
+ **训练数据:**
1083
+
1084
+ 1. 开源数据(wudao_base_200GB[1]、m3e[2]和simclue[3]),着重挑选了长度大于512的文本
1085
+ 2. 在通用语料库上使用LLM构造一批(question, paragraph)和(sentence, paragraph)数据
1086
+
1087
+ **训练方法:**
1088
+
1089
+ 1. 对比学习损失函数
1090
+ 2. 带有难负例的对比学习损失函数(分别基于bm25和vector构造了难负例)
1091
+ 3. EWC(Elastic Weight Consolidation)[4]
1092
+ 4. cosent loss[5]
1093
+ 5. 每一种类型的数据一个迭代器,分别计算loss进行更新
1094
+
1095
+ stella-v2在stella模型的基础上,使用了更多的训练数据,同时通过知识蒸馏等方法去除了前置的instruction(
1096
+ 比如piccolo的`查询:`, `结果:`, e5的`query:`和`passage:`)。
1097
+
1098
+ **初始权重:**\
1099
+ stella-base-zh和stella-large-zh分别以piccolo-base-zh[6]和piccolo-large-zh作为基础模型,512-1024的position
1100
+ embedding使用层次分解位置编码[7]进行初始化。\
1101
+ 感谢商汤科技研究院开源的[piccolo系列模型](https://huggingface.co/sensenova)。
1102
+
1103
+ stella is a general-purpose text encoder, which mainly includes the following models:
1104
+
1105
+ | Model Name | Model Size (GB) | Dimension | Sequence Length | Language | Need instruction for retrieval? |
1106
+ |:------------------:|:---------------:|:---------:|:---------------:|:--------:|:-------------------------------:|
1107
+ | stella-large-zh-v2 | 0.65 | 1024 | 1024 | Chinese | No |
1108
+ | stella-base-zh-v2 | 0.2 | 768 | 1024 | Chinese | No |
1109
+ | stella-large-zh | 0.65 | 1024 | 1024 | Chinese | Yes |
1110
+ | stella-base-zh | 0.2 | 768 | 1024 | Chinese | Yes |
1111
+
1112
+ The training data mainly includes:
1113
+
1114
+ 1. Open-source training data (wudao_base_200GB, m3e, and simclue), with a focus on selecting texts with lengths greater
1115
+ than 512.
1116
+ 2. A batch of (question, paragraph) and (sentence, paragraph) data constructed on a general corpus using LLM.
1117
+
1118
+ The loss functions mainly include:
1119
+
1120
+ 1. Contrastive learning loss function
1121
+ 2. Contrastive learning loss function with hard negative examples (hard negatives constructed from bm25 and from vectors, respectively)
1122
+ 3. EWC (Elastic Weight Consolidation)
1123
+ 4. cosent loss
1124
+
1125
+ Model weight initialization:\
1126
+ stella-base-zh and stella-large-zh use piccolo-base-zh and piccolo-large-zh as the base models, respectively, and the
1127
+ 512-1024 position embedding uses the initialization strategy of hierarchical decomposed position encoding.
1128
+
1129
+ Training strategy:\
1130
+ One iterator for each type of data, separately calculating the loss.
1131
+
1132
+ Building on the stella models, stella-v2 uses more training data and removes the need for instruction prefixes through knowledge distillation.
1133
+
1134
+ ## Metric
1135
+
1136
+ #### C-MTEB leaderboard (Chinese)
1137
+
1138
+ | Model Name | Model Size (GB) | Dimension | Sequence Length | Average (35) | Classification (9) | Clustering (4) | Pair Classification (2) | Reranking (4) | Retrieval (8) | STS (8) |
1139
+ |:------------------:|:---------------:|:---------:|:---------------:|:------------:|:------------------:|:--------------:|:-----------------------:|:-------------:|:-------------:|:-------:|
1140
+ | stella-large-zh-v2 | 0.65 | 1024 | 1024 | 65.13 | 69.05 | 49.16 | 82.68 | 66.41 | 70.14 | 58.66 |
1141
+ | stella-base-zh-v2 | 0.2 | 768 | 1024 | 64.36 | 68.29 | 49.4 | 79.95 | 66.1 | 70.08 | 56.92 |
1142
+ | stella-large-zh | 0.65 | 1024 | 1024 | 64.54 | 67.62 | 48.65 | 78.72 | 65.98 | 71.02 | 58.3 |
1143
+ | stella-base-zh | 0.2 | 768 | 1024 | 64.16 | 67.77 | 48.7 | 76.09 | 66.95 | 71.07 | 56.54 |
1144
+
1145
+ #### Reproduce our results
1146
+
1147
+ Code:
1148
+
1149
+ ```python
1150
+ import torch
1151
+ import numpy as np
1152
+ from typing import List
1153
+ from mteb import MTEB
1154
+ from sentence_transformers import SentenceTransformer
1155
+
1156
+
1157
class FastTextEncoder():
    """Thin wrapper around a SentenceTransformer model for MTEB evaluation.

    Duplicate input texts are encoded only once, and texts are fed to the
    model longest-first; the returned matrix is re-expanded so that row i
    corresponds to ``input_texts[i]``.
    """

    def __init__(self, model_name):
        """Load ``model_name`` onto the GPU in half precision, in eval mode."""
        encoder = SentenceTransformer(model_name)
        self.model = encoder.cuda().half().eval()
        # Inputs longer than this are truncated by the underlying model.
        self.model.max_seq_length = 512

    def encode(
            self,
            input_texts: List[str],
            *args,
            **kwargs
    ):
        """Return one L2-normalized float32 embedding row per input text.

        Extra positional and keyword arguments are accepted and ignored.
        """
        # De-duplicate, then order longest-first before batching.
        unique_texts = sorted(set(input_texts), key=len, reverse=True)
        embeddings = self.model.encode(
            unique_texts,
            normalize_embeddings=True,
            convert_to_numpy=True,
            batch_size=256,
        )
        embeddings = embeddings.astype(np.float32)
        # Map every text back to the row that holds its embedding.
        row_of = dict(zip(unique_texts, range(len(unique_texts))))
        original_order = [row_of[text] for text in input_texts]
        result = embeddings[original_order]
        torch.cuda.empty_cache()
        return result
1177
+
1178
+
1179
if __name__ == '__main__':
    # Model to evaluate and the folder the MTEB results are written to.
    model_name = "infgrad/stella-base-zh-v2"
    output_folder = "zh_mteb_results/stella-base-zh-v2"

    # Collect the names of all tasks MTEB selects for the Chinese language codes.
    zh_benchmark = MTEB(task_langs=['zh', 'zh-CN'])
    task_names = [zh_task.description["name"] for zh_task in zh_benchmark.tasks]

    model = FastTextEncoder(model_name)

    # Run the tasks one at a time, each in its own MTEB instance.
    for task in task_names:
        single_task_eval = MTEB(tasks=[task], task_langs=['zh', 'zh-CN'])
        single_task_eval.run(model, output_folder=output_folder)
1186
+
1187
+ ```
1188
+
1189
+ #### Evaluation for long text
1190
+
1191
+ 经过实际观察发现,C-MTEB的评测数据长度基本都是小于512的,
1192
+ 更致命的是那些长度大于512的文本,其重点都在前半部分。
1193
+ 这里以CMRC2018的数据为例说明这个问题:
1194
+
1195
+ ```
1196
+ question: 《无双大蛇z》是谁旗下ω-force开发的动作游戏?
1197
+
1198
+ passage:《无双大蛇z》是光荣旗下ω-force开发的动作游戏,于2009年3月12日登陆索尼playstation3,并于2009年11月27日推......
1199
+ ```
1200
+
1201
+ passage长度为800多,大于512,但是对于这个question而言只需要前面40个字就足以检索,多的内容对于模型而言是一种噪声,反而降低了效果。\
1202
+ 简言之,现有数据集的2个问题:\
1203
+ 1)长度大于512的过少\
1204
+ 2)即便大于512,对于检索而言也只需要前512的文本内容\
1205
+ 导致**无法准确评估模型的长文本编码能力。**
1206
+
1207
+ 为了解决这个问题,搜集了相关开源数据并使用规则进行过滤,最终整理了6份长文本测试集,它们分别是:
1208
+
1209
+ - CMRC2018,通用百科
1210
+ - CAIL,法律阅读理解
1211
+ - DRCD,繁体百科,已转简体
1212
+ - Military,军工问答
1213
+ - Squad,英文阅读理解,已转中文
1214
+ - Multifieldqa_zh,清华的大模型长文本理解能力评测数据[9]
1215
+
1216
+ 处理规则是选取答案在512长度之后的文本,短的测试数据会欠采样一下,长短文本占比约为1:2,所以模型既得理解短文本也得理解长文本。
1217
+ 除了Military数据集,我们提供了其他5个测试数据的下载地址:https://drive.google.com/file/d/1WC6EWaCbVgz-vPMDFH4TwAMkLyh5WNcN/view?usp=sharing
1218
+
1219
+ 评测指标为Recall@5, 结果如下:
1220
+
1221
+ | Dataset | piccolo-base-zh | piccolo-large-zh | bge-base-zh | bge-large-zh | stella-base-zh | stella-large-zh |
1222
+ |:---------------:|:---------------:|:----------------:|:-----------:|:------------:|:--------------:|:---------------:|
1223
+ | CMRC2018 | 94.34 | 93.82 | 91.56 | 93.12 | 96.08 | 95.56 |
1224
+ | CAIL | 28.04 | 33.64 | 31.22 | 33.94 | 34.62 | 37.18 |
1225
+ | DRCD | 78.25 | 77.9 | 78.34 | 80.26 | 86.14 | 84.58 |
1226
+ | Military | 76.61 | 73.06 | 75.65 | 75.81 | 83.71 | 80.48 |
1227
+ | Squad | 91.21 | 86.61 | 87.87 | 90.38 | 93.31 | 91.21 |
1228
+ | Multifieldqa_zh | 81.41 | 83.92 | 83.92 | 83.42 | 79.9 | 80.4 |
1229
+ | **Average** | 74.98 | 74.83 | 74.76 | 76.15 | **78.96** | **78.24** |
1230
+
1231
+ **注意:** 因为长文本评测数据数量稀少,所以构造时也使用了train部分,如果自行评测,请注意模型的训练数据以免数据泄露。
1232
+
1233
+ ## Usage
1234
+
1235
+ #### stella 中文系列模型
1236
+
1237
+ stella-base-zh 和 stella-large-zh: 本模型是在piccolo基础上训练的,因此**用法和piccolo完全一致**
1238
+ ,即在检索重排任务上给query和passage加上`查询: `和`结果: `。对于短对短匹配不需要做任何操作。
1239
+
1240
+ stella-base-zh-v2 和 stella-large-zh-v2: 本模型使用简单,**任何使用场景中都不需要加前缀文本**。
1241
+
1242
+ stella中文系列模型均使用mean pooling作为文本向量。
1243
+
1244
+ 在sentence-transformers库中的使用方法:
1245
+
1246
+ ```python
1247
# For short-to-short datasets, the following is the general usage.
from sentence_transformers import SentenceTransformer

sentences = ["数据1", "数据2"]
model = SentenceTransformer('infgrad/stella-base-zh-v2')
# Show the maximum sequence length the loaded model is configured with.
print(model.max_seq_length)

# Encode the same sentences twice with L2-normalized outputs, so the matrix
# product below is a cosine-similarity matrix.
embeddings_1, embeddings_2 = (
    model.encode(sentences, normalize_embeddings=True) for _ in range(2)
)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)
1257
+ ```
1258
+
1259
+ 直接使用transformers库:
1260
+
1261
+ ```python
1262
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import normalize

model = AutoModel.from_pretrained('infgrad/stella-base-zh-v2')
tokenizer = AutoTokenizer.from_pretrained('infgrad/stella-base-zh-v2')
sentences = ["数据1", "数据ABCDEFGH"]
# Pad to the longest sentence in the batch and truncate at 1024 tokens.
batch_data = tokenizer(
    batch_text_or_text_pairs=sentences,
    padding="longest",
    return_tensors="pt",
    max_length=1024,
    truncation=True,
)
attention_mask = batch_data["attention_mask"]
# Inference only: without no_grad() the outputs require grad, and sklearn's
# normalize() then fails with "Can't call numpy() on Tensor that requires grad".
with torch.no_grad():
    model_output = model(**batch_data)
# Mean pooling: zero out padded positions, then average over the real tokens.
last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
# sklearn operates on NumPy arrays, so convert explicitly before normalizing.
vectors = normalize(vectors.numpy(), norm="l2", axis=1)
print(vectors.shape)  # 2,768
1281
+ ```
1282
+
1283
+ #### stella models for English
1284
+
1285
+ Under development...
1286
+
1287
+ ## Training Detail
1288
+
1289
+ **硬件:** 单卡A100-80GB
1290
+
1291
+ **环境:** torch1.13.*; transformers-trainer + deepspeed + gradient-checkpointing
1292
+
1293
+ **学习率:** 1e-6
1294
+
1295
+ **batch_size:** base模型为1024,额外增加20%的难负例;large模型为768,额外增加20%的难负例
1296
+
1297
+ **数据量:** 第一版模型约100万,其中用LLM构造的数据约有200K,LLM模型大小为13b。v2系列模型的训练数据增加到了2000万。
1298
+
1299
+ ## ToDoList
1300
+
1301
+ **评测的稳定性:**
1302
+ 评测过程中发现Clustering任务会和官方的结果不一致,大约有±0.0x的小差距,原因是聚类代码没有设置random_seed,差距可以忽略不计,不影响评测结论。
1303
+
1304
+ **更高质量的长文本训练和测试数据:** 训练数据多是用13b模型构造的,肯定会存在噪声。
1305
+ 测试数据基本都是从mrc数据整理来的,所以问题都是factoid类型,不符合真实分布。
1306
+
1307
+ **OOD的性能:** 虽然近期出现了很多向量编码模型,但是对于不是那么通用的domain,这一众模型包括stella、openai和cohere,
1308
+ 它们的效果均比不上BM25。
1309
+
1310
+ ## Reference
1311
+
1312
+ 1. https://www.scidb.cn/en/detail?dataSetId=c6a3fe684227415a9db8e21bac4a15ab
1313
+ 2. https://github.com/wangyuxinwhy/uniem
1314
+ 3. https://github.com/CLUEbenchmark/SimCLUE
1315
+ 4. https://arxiv.org/abs/1612.00796
1316
+ 5. https://kexue.fm/archives/8847
1317
+ 6. https://huggingface.co/sensenova/piccolo-base-zh
1318
+ 7. https://kexue.fm/archives/7947
1319
+ 8. https://github.com/FlagOpen/FlagEmbedding
1320
+ 9. https://github.com/THUDM/LongBench
1321
+
1322
+
config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-12,
16
+ "max_position_embeddings": 1024,
17
+ "model_type": "bert",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "output_past": true,
21
+ "pad_token_id": 0,
22
+ "pooler_fc_size": 768,
23
+ "pooler_num_attention_heads": 12,
24
+ "pooler_num_fc_layers": 3,
25
+ "pooler_size_per_head": 128,
26
+ "pooler_type": "first_token_transform",
27
+ "position_embedding_type": "absolute",
28
+ "torch_dtype": "float16",
29
+ "transformers_version": "4.30.2",
30
+ "type_vocab_size": 2,
31
+ "uniem_pooling_strategy": "last_mean",
32
+ "use_cache": true,
33
+ "vocab_size": 21128
34
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "mask_token": "[MASK]",
6
+ "model_max_length": 1024,
7
+ "pad_token": "[PAD]",
8
+ "sep_token": "[SEP]",
9
+ "strip_accents": null,
10
+ "tokenize_chinese_chars": true,
11
+ "tokenizer_class": "BertTokenizer",
12
+ "unk_token": "[UNK]"
13
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff