Upload DatasetTransformer
Browse files- README.md +369 -369
- config.json +7 -1
- misc.py +514 -0
- model.py +607 -0
- model.safetensors +2 -2
README.md
CHANGED
@@ -4,12 +4,14 @@ tags:
|
|
4 |
model-index:
|
5 |
- name: cde-small-v1
|
6 |
results:
|
7 |
-
-
|
8 |
-
|
|
|
9 |
name: MTEB AmazonCounterfactualClassification (en)
|
10 |
-
revision: e8379541af4e31359cca9fbcf4b00f2671dba205
|
11 |
-
split: test
|
12 |
type: mteb/amazon_counterfactual
|
|
|
|
|
|
|
13 |
metrics:
|
14 |
- type: accuracy
|
15 |
value: 87.01492537313433
|
@@ -23,14 +25,14 @@ model-index:
|
|
23 |
value: 87.74802754480477
|
24 |
- type: main_score
|
25 |
value: 87.01492537313433
|
26 |
-
|
27 |
type: Classification
|
28 |
-
|
29 |
-
config: default
|
30 |
name: MTEB AmazonPolarityClassification (default)
|
31 |
-
revision: e2d317d38cd51312af73b3d32a06d1a08b442046
|
32 |
-
split: test
|
33 |
type: mteb/amazon_polarity
|
|
|
|
|
|
|
34 |
metrics:
|
35 |
- type: accuracy
|
36 |
value: 94.652275
|
@@ -44,14 +46,14 @@ model-index:
|
|
44 |
value: 94.64655930708355
|
45 |
- type: main_score
|
46 |
value: 94.652275
|
47 |
-
|
48 |
type: Classification
|
49 |
-
|
50 |
-
config: en
|
51 |
name: MTEB AmazonReviewsClassification (en)
|
52 |
-
revision: 1399c76144fd37290681b995c656ef9b2e06e26d
|
53 |
-
split: test
|
54 |
type: mteb/amazon_reviews_multi
|
|
|
|
|
|
|
55 |
metrics:
|
56 |
- type: accuracy
|
57 |
value: 55.75599999999999
|
@@ -61,14 +63,14 @@ model-index:
|
|
61 |
value: 55.07058630829347
|
62 |
- type: main_score
|
63 |
value: 55.75599999999999
|
64 |
-
|
65 |
-
type:
|
66 |
-
|
67 |
-
config: default
|
68 |
name: MTEB ArguAna (default)
|
69 |
-
revision: c22ab2a51041ffd869aaddef7af8d8215647e41a
|
70 |
-
split: test
|
71 |
type: mteb/arguana
|
|
|
|
|
|
|
72 |
metrics:
|
73 |
- type: main_score
|
74 |
value: 69.959
|
@@ -352,14 +354,14 @@ model-index:
|
|
352 |
value: 74.182
|
353 |
- type: recall_at_5
|
354 |
value: 84.495
|
355 |
-
|
356 |
-
type:
|
357 |
-
|
358 |
-
config: default
|
359 |
name: MTEB ArxivClusteringP2P (default)
|
360 |
-
revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d
|
361 |
-
split: test
|
362 |
type: mteb/arxiv-clustering-p2p
|
|
|
|
|
|
|
363 |
metrics:
|
364 |
- type: main_score
|
365 |
value: 48.54672141116669
|
@@ -367,14 +369,14 @@ model-index:
|
|
367 |
value: 48.54672141116669
|
368 |
- type: v_measure_std
|
369 |
value: 14.037498386768362
|
370 |
-
|
371 |
type: Clustering
|
372 |
-
|
373 |
-
config: default
|
374 |
name: MTEB ArxivClusteringS2S (default)
|
375 |
-
revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53
|
376 |
-
split: test
|
377 |
type: mteb/arxiv-clustering-s2s
|
|
|
|
|
|
|
378 |
metrics:
|
379 |
- type: main_score
|
380 |
value: 40.5914039166466
|
@@ -382,14 +384,14 @@ model-index:
|
|
382 |
value: 40.5914039166466
|
383 |
- type: v_measure_std
|
384 |
value: 14.385069818910331
|
385 |
-
|
386 |
-
type:
|
387 |
-
|
388 |
-
config: default
|
389 |
name: MTEB AskUbuntuDupQuestions (default)
|
390 |
-
revision: 2000358ca161889fa9c082cb41daa8dcfb161a54
|
391 |
-
split: test
|
392 |
type: mteb/askubuntudupquestions-reranking
|
|
|
|
|
|
|
393 |
metrics:
|
394 |
- type: main_score
|
395 |
value: 61.13621260261507
|
@@ -409,14 +411,14 @@ model-index:
|
|
409 |
value: 31.484257486448364
|
410 |
- type: nAUC_mrr_std
|
411 |
value: 21.252659250011632
|
412 |
-
|
413 |
-
type:
|
414 |
-
|
415 |
-
config: default
|
416 |
name: MTEB BIOSSES (default)
|
417 |
-
revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
|
418 |
-
split: test
|
419 |
type: mteb/biosses-sts
|
|
|
|
|
|
|
420 |
metrics:
|
421 |
- type: cosine_pearson
|
422 |
value: 89.07028016646942
|
@@ -436,14 +438,14 @@ model-index:
|
|
436 |
value: 89.07028016646942
|
437 |
- type: spearman
|
438 |
value: 86.69595132967805
|
439 |
-
|
440 |
-
type:
|
441 |
-
|
442 |
-
config: default
|
443 |
name: MTEB Banking77Classification (default)
|
444 |
-
revision: 0fd18e25b25c072e09e0d92ab615fda904d66300
|
445 |
-
split: test
|
446 |
type: mteb/banking77
|
|
|
|
|
|
|
447 |
metrics:
|
448 |
- type: accuracy
|
449 |
value: 88.6038961038961
|
@@ -453,14 +455,14 @@ model-index:
|
|
453 |
value: 88.56824205739822
|
454 |
- type: main_score
|
455 |
value: 88.6038961038961
|
456 |
-
|
457 |
-
type:
|
458 |
-
|
459 |
-
config: default
|
460 |
name: MTEB BiorxivClusteringP2P (default)
|
461 |
-
revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40
|
462 |
-
split: test
|
463 |
type: mteb/biorxiv-clustering-p2p
|
|
|
|
|
|
|
464 |
metrics:
|
465 |
- type: main_score
|
466 |
value: 44.77800814327256
|
@@ -468,14 +470,14 @@ model-index:
|
|
468 |
value: 44.77800814327256
|
469 |
- type: v_measure_std
|
470 |
value: 0.6462535527471919
|
471 |
-
|
472 |
type: Clustering
|
473 |
-
|
474 |
-
config: default
|
475 |
name: MTEB BiorxivClusteringS2S (default)
|
476 |
-
revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908
|
477 |
-
split: test
|
478 |
type: mteb/biorxiv-clustering-s2s
|
|
|
|
|
|
|
479 |
metrics:
|
480 |
- type: main_score
|
481 |
value: 38.16110272459102
|
@@ -483,14 +485,14 @@ model-index:
|
|
483 |
value: 38.16110272459102
|
484 |
- type: v_measure_std
|
485 |
value: 0.7456916212435019
|
486 |
-
|
487 |
-
type:
|
488 |
-
|
489 |
-
config: default
|
490 |
name: MTEB CQADupstackAndroidRetrieval (default)
|
491 |
-
revision: f46a197baaae43b4f621051089b82a364682dfeb
|
492 |
-
split: test
|
493 |
type: mteb/cqadupstack-android
|
|
|
|
|
|
|
494 |
metrics:
|
495 |
- type: main_score
|
496 |
value: 49.376
|
@@ -774,14 +776,14 @@ model-index:
|
|
774 |
value: 47.591
|
775 |
- type: recall_at_5
|
776 |
value: 54.245
|
777 |
-
|
778 |
type: Retrieval
|
779 |
-
|
780 |
-
config: default
|
781 |
name: MTEB CQADupstackEnglishRetrieval (default)
|
782 |
-
revision: ad9991cb51e31e31e430383c75ffb2885547b5f0
|
783 |
-
split: test
|
784 |
type: mteb/cqadupstack-english
|
|
|
|
|
|
|
785 |
metrics:
|
786 |
- type: main_score
|
787 |
value: 44.727
|
@@ -1065,14 +1067,14 @@ model-index:
|
|
1065 |
value: 42.085
|
1066 |
- type: recall_at_5
|
1067 |
value: 47.5
|
1068 |
-
|
1069 |
type: Retrieval
|
1070 |
-
|
1071 |
-
config: default
|
1072 |
name: MTEB CQADupstackGamingRetrieval (default)
|
1073 |
-
revision: 4885aa143210c98657558c04aaf3dc47cfb54340
|
1074 |
-
split: test
|
1075 |
type: mteb/cqadupstack-gaming
|
|
|
|
|
|
|
1076 |
metrics:
|
1077 |
- type: main_score
|
1078 |
value: 59.001999999999995
|
@@ -1356,14 +1358,14 @@ model-index:
|
|
1356 |
value: 57.916000000000004
|
1357 |
- type: recall_at_5
|
1358 |
value: 65.44
|
1359 |
-
|
1360 |
type: Retrieval
|
1361 |
-
|
1362 |
-
config: default
|
1363 |
name: MTEB CQADupstackGisRetrieval (default)
|
1364 |
-
revision: 5003b3064772da1887988e05400cf3806fe491f2
|
1365 |
-
split: test
|
1366 |
type: mteb/cqadupstack-gis
|
|
|
|
|
|
|
1367 |
metrics:
|
1368 |
- type: main_score
|
1369 |
value: 37.501
|
@@ -1647,14 +1649,14 @@ model-index:
|
|
1647 |
value: 37.218
|
1648 |
- type: recall_at_5
|
1649 |
value: 42.559000000000005
|
1650 |
-
|
1651 |
type: Retrieval
|
1652 |
-
|
1653 |
-
config: default
|
1654 |
name: MTEB CQADupstackMathematicaRetrieval (default)
|
1655 |
-
revision: 90fceea13679c63fe563ded68f3b6f06e50061de
|
1656 |
-
split: test
|
1657 |
type: mteb/cqadupstack-mathematica
|
|
|
|
|
|
|
1658 |
metrics:
|
1659 |
- type: main_score
|
1660 |
value: 27.653
|
@@ -1938,14 +1940,14 @@ model-index:
|
|
1938 |
value: 25.469
|
1939 |
- type: recall_at_5
|
1940 |
value: 31.316
|
1941 |
-
|
1942 |
type: Retrieval
|
1943 |
-
|
1944 |
-
config: default
|
1945 |
name: MTEB CQADupstackPhysicsRetrieval (default)
|
1946 |
-
revision: 79531abbd1fb92d06c6d6315a0cbbbf5bb247ea4
|
1947 |
-
split: test
|
1948 |
type: mteb/cqadupstack-physics
|
|
|
|
|
|
|
1949 |
metrics:
|
1950 |
- type: main_score
|
1951 |
value: 45.314
|
@@ -2229,14 +2231,14 @@ model-index:
|
|
2229 |
value: 43.679
|
2230 |
- type: recall_at_5
|
2231 |
value: 49.735
|
2232 |
-
|
2233 |
type: Retrieval
|
2234 |
-
|
2235 |
-
config: default
|
2236 |
name: MTEB CQADupstackProgrammersRetrieval (default)
|
2237 |
-
revision: 6184bc1440d2dbc7612be22b50686b8826d22b32
|
2238 |
-
split: test
|
2239 |
type: mteb/cqadupstack-programmers
|
|
|
|
|
|
|
2240 |
metrics:
|
2241 |
- type: main_score
|
2242 |
value: 41.972
|
@@ -2520,27 +2522,27 @@ model-index:
|
|
2520 |
value: 39.363
|
2521 |
- type: recall_at_5
|
2522 |
value: 44.665
|
2523 |
-
|
2524 |
type: Retrieval
|
2525 |
-
|
2526 |
-
config: default
|
2527 |
name: MTEB CQADupstackRetrieval (default)
|
2528 |
-
revision: CQADupstackRetrieval_is_a_combined_dataset
|
2529 |
-
split: test
|
2530 |
type: CQADupstackRetrieval_is_a_combined_dataset
|
|
|
|
|
|
|
2531 |
metrics:
|
2532 |
- type: main_score
|
2533 |
value: 39.823499999999996
|
2534 |
- type: ndcg_at_10
|
2535 |
value: 39.823499999999996
|
2536 |
-
|
2537 |
type: Retrieval
|
2538 |
-
|
2539 |
-
config: default
|
2540 |
name: MTEB CQADupstackStatsRetrieval (default)
|
2541 |
-
revision: 65ac3a16b8e91f9cee4c9828cc7c335575432a2a
|
2542 |
-
split: test
|
2543 |
type: mteb/cqadupstack-stats
|
|
|
|
|
|
|
2544 |
metrics:
|
2545 |
- type: main_score
|
2546 |
value: 34.943000000000005
|
@@ -2824,14 +2826,14 @@ model-index:
|
|
2824 |
value: 33.427
|
2825 |
- type: recall_at_5
|
2826 |
value: 37.643
|
2827 |
-
|
2828 |
type: Retrieval
|
2829 |
-
|
2830 |
-
config: default
|
2831 |
name: MTEB CQADupstackTexRetrieval (default)
|
2832 |
-
revision: 46989137a86843e03a6195de44b09deda022eec7
|
2833 |
-
split: test
|
2834 |
type: mteb/cqadupstack-tex
|
|
|
|
|
|
|
2835 |
metrics:
|
2836 |
- type: main_score
|
2837 |
value: 27.271
|
@@ -3115,14 +3117,14 @@ model-index:
|
|
3115 |
value: 25.592
|
3116 |
- type: recall_at_5
|
3117 |
value: 30.279
|
3118 |
-
|
3119 |
type: Retrieval
|
3120 |
-
|
3121 |
-
config: default
|
3122 |
name: MTEB CQADupstackUnixRetrieval (default)
|
3123 |
-
revision: 6c6430d3a6d36f8d2a829195bc5dc94d7e063e53
|
3124 |
-
split: test
|
3125 |
type: mteb/cqadupstack-unix
|
|
|
|
|
|
|
3126 |
metrics:
|
3127 |
- type: main_score
|
3128 |
value: 38.237
|
@@ -3406,14 +3408,14 @@ model-index:
|
|
3406 |
value: 36.275
|
3407 |
- type: recall_at_5
|
3408 |
value: 42.199
|
3409 |
-
|
3410 |
type: Retrieval
|
3411 |
-
|
3412 |
-
config: default
|
3413 |
name: MTEB CQADupstackWebmastersRetrieval (default)
|
3414 |
-
revision: 160c094312a0e1facb97e55eeddb698c0abe3571
|
3415 |
-
split: test
|
3416 |
type: mteb/cqadupstack-webmasters
|
|
|
|
|
|
|
3417 |
metrics:
|
3418 |
- type: main_score
|
3419 |
value: 38.702
|
@@ -3697,14 +3699,14 @@ model-index:
|
|
3697 |
value: 37.634
|
3698 |
- type: recall_at_5
|
3699 |
value: 42.021
|
3700 |
-
|
3701 |
type: Retrieval
|
3702 |
-
|
3703 |
-
config: default
|
3704 |
name: MTEB CQADupstackWordpressRetrieval (default)
|
3705 |
-
revision: 4ffe81d471b1924886b33c7567bfb200e9eec5c4
|
3706 |
-
split: test
|
3707 |
type: mteb/cqadupstack-wordpress
|
|
|
|
|
|
|
3708 |
metrics:
|
3709 |
- type: main_score
|
3710 |
value: 33.184000000000005
|
@@ -3988,14 +3990,14 @@ model-index:
|
|
3988 |
value: 32.683
|
3989 |
- type: recall_at_5
|
3990 |
value: 36.756
|
3991 |
-
|
3992 |
type: Retrieval
|
3993 |
-
|
3994 |
-
config: default
|
3995 |
name: MTEB ClimateFEVER (default)
|
3996 |
-
revision: 47f2ac6acb640fc46020b02a5b59fdda04d39380
|
3997 |
-
split: test
|
3998 |
type: mteb/climate-fever
|
|
|
|
|
|
|
3999 |
metrics:
|
4000 |
- type: main_score
|
4001 |
value: 25.068
|
@@ -4279,14 +4281,14 @@ model-index:
|
|
4279 |
value: 18.312
|
4280 |
- type: recall_at_5
|
4281 |
value: 22.776
|
4282 |
-
|
4283 |
type: Retrieval
|
4284 |
-
|
4285 |
-
config: default
|
4286 |
name: MTEB DBPedia (default)
|
4287 |
-
revision: c0f706b76e590d620bd6618b3ca8efdd34e2d659
|
4288 |
-
split: test
|
4289 |
type: mteb/dbpedia
|
|
|
|
|
|
|
4290 |
metrics:
|
4291 |
- type: main_score
|
4292 |
value: 40.128
|
@@ -4570,14 +4572,14 @@ model-index:
|
|
4570 |
value: 14.562
|
4571 |
- type: recall_at_5
|
4572 |
value: 18.779
|
4573 |
-
|
4574 |
-
type:
|
4575 |
-
|
4576 |
-
config: default
|
4577 |
name: MTEB EmotionClassification (default)
|
4578 |
-
revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37
|
4579 |
-
split: test
|
4580 |
type: mteb/emotion
|
|
|
|
|
|
|
4581 |
metrics:
|
4582 |
- type: accuracy
|
4583 |
value: 74.86
|
@@ -4587,14 +4589,14 @@ model-index:
|
|
4587 |
value: 75.96499621761998
|
4588 |
- type: main_score
|
4589 |
value: 74.86
|
4590 |
-
|
4591 |
-
type:
|
4592 |
-
|
4593 |
-
config: default
|
4594 |
name: MTEB FEVER (default)
|
4595 |
-
revision: bea83ef9e8fb933d90a2f1d5515737465d613e12
|
4596 |
-
split: test
|
4597 |
type: mteb/fever
|
|
|
|
|
|
|
4598 |
metrics:
|
4599 |
- type: main_score
|
4600 |
value: 86.029
|
@@ -4878,14 +4880,14 @@ model-index:
|
|
4878 |
value: 88.382
|
4879 |
- type: recall_at_5
|
4880 |
value: 90.908
|
4881 |
-
|
4882 |
type: Retrieval
|
4883 |
-
|
4884 |
-
config: default
|
4885 |
name: MTEB FiQA2018 (default)
|
4886 |
-
revision: 27a168819829fe9bcd655c2df245fb19452e8e06
|
4887 |
-
split: test
|
4888 |
type: mteb/fiqa
|
|
|
|
|
|
|
4889 |
metrics:
|
4890 |
- type: main_score
|
4891 |
value: 45.238
|
@@ -5169,14 +5171,14 @@ model-index:
|
|
5169 |
value: 37.656
|
5170 |
- type: recall_at_5
|
5171 |
value: 44.766
|
5172 |
-
|
5173 |
type: Retrieval
|
5174 |
-
|
5175 |
-
config: default
|
5176 |
name: MTEB HotpotQA (default)
|
5177 |
-
revision: ab518f4d6fcca38d87c25209f94beba119d02014
|
5178 |
-
split: test
|
5179 |
type: mteb/hotpotqa
|
|
|
|
|
|
|
5180 |
metrics:
|
5181 |
- type: main_score
|
5182 |
value: 66.672
|
@@ -5460,14 +5462,14 @@ model-index:
|
|
5460 |
value: 57.522
|
5461 |
- type: recall_at_5
|
5462 |
value: 62.134
|
5463 |
-
|
5464 |
-
type:
|
5465 |
-
|
5466 |
-
config: default
|
5467 |
name: MTEB ImdbClassification (default)
|
5468 |
-
revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7
|
5469 |
-
split: test
|
5470 |
type: mteb/imdb
|
|
|
|
|
|
|
5471 |
metrics:
|
5472 |
- type: accuracy
|
5473 |
value: 93.5944
|
@@ -5481,14 +5483,14 @@ model-index:
|
|
5481 |
value: 93.58945949328377
|
5482 |
- type: main_score
|
5483 |
value: 93.5944
|
5484 |
-
|
5485 |
-
type:
|
5486 |
-
|
5487 |
-
config: default
|
5488 |
name: MTEB MSMARCO (default)
|
5489 |
-
revision: c5a29a104738b98a9e76336939199e264163d4a0
|
5490 |
-
split: dev
|
5491 |
type: mteb/msmarco
|
|
|
|
|
|
|
5492 |
metrics:
|
5493 |
- type: main_score
|
5494 |
value: 41.448
|
@@ -5772,14 +5774,14 @@ model-index:
|
|
5772 |
value: 41.304
|
5773 |
- type: recall_at_5
|
5774 |
value: 51.076
|
5775 |
-
|
5776 |
-
type:
|
5777 |
-
|
5778 |
-
config: en
|
5779 |
name: MTEB MTOPDomainClassification (en)
|
5780 |
-
revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
|
5781 |
-
split: test
|
5782 |
type: mteb/mtop_domain
|
|
|
|
|
|
|
5783 |
metrics:
|
5784 |
- type: accuracy
|
5785 |
value: 96.03967168262655
|
@@ -5789,14 +5791,14 @@ model-index:
|
|
5789 |
value: 96.06623245823347
|
5790 |
- type: main_score
|
5791 |
value: 96.03967168262655
|
5792 |
-
|
5793 |
type: Classification
|
5794 |
-
|
5795 |
-
config: en
|
5796 |
name: MTEB MTOPIntentClassification (en)
|
5797 |
-
revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
|
5798 |
-
split: test
|
5799 |
type: mteb/mtop_intent
|
|
|
|
|
|
|
5800 |
metrics:
|
5801 |
- type: accuracy
|
5802 |
value: 89.12904696762428
|
@@ -5806,14 +5808,14 @@ model-index:
|
|
5806 |
value: 90.41290566743324
|
5807 |
- type: main_score
|
5808 |
value: 89.12904696762428
|
5809 |
-
|
5810 |
type: Classification
|
5811 |
-
|
5812 |
-
config: en
|
5813 |
name: MTEB MassiveIntentClassification (en)
|
5814 |
-
revision: 4672e20407010da34463acc759c162ca9734bca6
|
5815 |
-
split: test
|
5816 |
type: mteb/amazon_massive_intent
|
|
|
|
|
|
|
5817 |
metrics:
|
5818 |
- type: accuracy
|
5819 |
value: 76.49630127774041
|
@@ -5823,14 +5825,14 @@ model-index:
|
|
5823 |
value: 76.42436195016484
|
5824 |
- type: main_score
|
5825 |
value: 76.49630127774041
|
5826 |
-
|
5827 |
type: Classification
|
5828 |
-
|
5829 |
-
config: en
|
5830 |
name: MTEB MassiveScenarioClassification (en)
|
5831 |
-
revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8
|
5832 |
-
split: test
|
5833 |
type: mteb/amazon_massive_scenario
|
|
|
|
|
|
|
5834 |
metrics:
|
5835 |
- type: accuracy
|
5836 |
value: 78.9340954942838
|
@@ -5840,14 +5842,14 @@ model-index:
|
|
5840 |
value: 78.87787647838971
|
5841 |
- type: main_score
|
5842 |
value: 78.9340954942838
|
5843 |
-
|
5844 |
-
type:
|
5845 |
-
|
5846 |
-
config: default
|
5847 |
name: MTEB MedrxivClusteringP2P (default)
|
5848 |
-
revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73
|
5849 |
-
split: test
|
5850 |
type: mteb/medrxiv-clustering-p2p
|
|
|
|
|
|
|
5851 |
metrics:
|
5852 |
- type: main_score
|
5853 |
value: 37.50182848656019
|
@@ -5855,14 +5857,14 @@ model-index:
|
|
5855 |
value: 37.50182848656019
|
5856 |
- type: v_measure_std
|
5857 |
value: 1.1708518023877268
|
5858 |
-
|
5859 |
type: Clustering
|
5860 |
-
|
5861 |
-
config: default
|
5862 |
name: MTEB MedrxivClusteringS2S (default)
|
5863 |
-
revision: 35191c8c0dca72d8ff3efcd72aa802307d469663
|
5864 |
-
split: test
|
5865 |
type: mteb/medrxiv-clustering-s2s
|
|
|
|
|
|
|
5866 |
metrics:
|
5867 |
- type: main_score
|
5868 |
value: 35.72762609825363
|
@@ -5870,14 +5872,14 @@ model-index:
|
|
5870 |
value: 35.72762609825363
|
5871 |
- type: v_measure_std
|
5872 |
value: 1.4555014772914985
|
5873 |
-
|
5874 |
-
type:
|
5875 |
-
|
5876 |
-
config: default
|
5877 |
name: MTEB MindSmallReranking (default)
|
5878 |
-
revision: 59042f120c80e8afa9cdbb224f67076cec0fc9a7
|
5879 |
-
split: test
|
5880 |
type: mteb/mind_small
|
|
|
|
|
|
|
5881 |
metrics:
|
5882 |
- type: main_score
|
5883 |
value: 30.47716416454022
|
@@ -5897,14 +5899,14 @@ model-index:
|
|
5897 |
value: -15.78941850629242
|
5898 |
- type: nAUC_mrr_std
|
5899 |
value: -1.1330442292510805
|
5900 |
-
|
5901 |
-
type:
|
5902 |
-
|
5903 |
-
config: default
|
5904 |
name: MTEB NFCorpus (default)
|
5905 |
-
revision: ec0fa4fe99da2ff19ca1214b7966684033a58814
|
5906 |
-
split: test
|
5907 |
type: mteb/nfcorpus
|
|
|
|
|
|
|
5908 |
metrics:
|
5909 |
- type: main_score
|
5910 |
value: 34.648
|
@@ -6188,14 +6190,14 @@ model-index:
|
|
6188 |
value: 10.037
|
6189 |
- type: recall_at_5
|
6190 |
value: 12.717999999999998
|
6191 |
-
|
6192 |
type: Retrieval
|
6193 |
-
|
6194 |
-
config: default
|
6195 |
name: MTEB NQ (default)
|
6196 |
-
revision: b774495ed302d8c44a3a7ea25c90dbce03968f31
|
6197 |
-
split: test
|
6198 |
type: mteb/nq
|
|
|
|
|
|
|
6199 |
metrics:
|
6200 |
- type: main_score
|
6201 |
value: 60.06
|
@@ -6479,14 +6481,14 @@ model-index:
|
|
6479 |
value: 61.114000000000004
|
6480 |
- type: recall_at_5
|
6481 |
value: 69.812
|
6482 |
-
|
6483 |
type: Retrieval
|
6484 |
-
|
6485 |
-
config: default
|
6486 |
name: MTEB QuoraRetrieval (default)
|
6487 |
-
revision: e4e08e0b7dbe3c8700f0daef558ff32256715259
|
6488 |
-
split: test
|
6489 |
type: mteb/quora
|
|
|
|
|
|
|
6490 |
metrics:
|
6491 |
- type: main_score
|
6492 |
value: 89.821
|
@@ -6770,14 +6772,14 @@ model-index:
|
|
6770 |
value: 88.714
|
6771 |
- type: recall_at_5
|
6772 |
value: 92.96799999999999
|
6773 |
-
|
6774 |
-
type:
|
6775 |
-
|
6776 |
-
config: default
|
6777 |
name: MTEB RedditClustering (default)
|
6778 |
-
revision: 24640382cdbf8abc73003fb0fa6d111a705499eb
|
6779 |
-
split: test
|
6780 |
type: mteb/reddit-clustering
|
|
|
|
|
|
|
6781 |
metrics:
|
6782 |
- type: main_score
|
6783 |
value: 59.36038828851887
|
@@ -6785,14 +6787,14 @@ model-index:
|
|
6785 |
value: 59.36038828851887
|
6786 |
- type: v_measure_std
|
6787 |
value: 4.1958765965154425
|
6788 |
-
|
6789 |
type: Clustering
|
6790 |
-
|
6791 |
-
config: default
|
6792 |
name: MTEB RedditClusteringP2P (default)
|
6793 |
-
revision: 385e3cb46b4cfa89021f56c4380204149d0efe33
|
6794 |
-
split: test
|
6795 |
type: mteb/reddit-clustering-p2p
|
|
|
|
|
|
|
6796 |
metrics:
|
6797 |
- type: main_score
|
6798 |
value: 64.67522832408089
|
@@ -6800,14 +6802,14 @@ model-index:
|
|
6800 |
value: 64.67522832408089
|
6801 |
- type: v_measure_std
|
6802 |
value: 12.473765016158698
|
6803 |
-
|
6804 |
-
type:
|
6805 |
-
|
6806 |
-
config: default
|
6807 |
name: MTEB SCIDOCS (default)
|
6808 |
-
revision: f8c2fcf00f625baaa80f62ec5bd9e1fff3b8ae88
|
6809 |
-
split: test
|
6810 |
type: mteb/scidocs
|
|
|
|
|
|
|
6811 |
metrics:
|
6812 |
- type: main_score
|
6813 |
value: 21.751
|
@@ -7091,14 +7093,14 @@ model-index:
|
|
7091 |
value: 11.648
|
7092 |
- type: recall_at_5
|
7093 |
value: 15.883
|
7094 |
-
|
7095 |
-
type:
|
7096 |
-
|
7097 |
-
config: default
|
7098 |
name: MTEB SICK-R (default)
|
7099 |
-
revision: 20a6d6f312dd54037fe07a32d58e5e168867909d
|
7100 |
-
split: test
|
7101 |
type: mteb/sickr-sts
|
|
|
|
|
|
|
7102 |
metrics:
|
7103 |
- type: cosine_pearson
|
7104 |
value: 84.0161170579997
|
@@ -7118,14 +7120,14 @@ model-index:
|
|
7118 |
value: 84.0161170579997
|
7119 |
- type: spearman
|
7120 |
value: 77.52025923874551
|
7121 |
-
|
7122 |
type: STS
|
7123 |
-
|
7124 |
-
config: default
|
7125 |
name: MTEB STS12 (default)
|
7126 |
-
revision: a0d554a64d88156834ff5ae9920b964011b16384
|
7127 |
-
split: test
|
7128 |
type: mteb/sts12-sts
|
|
|
|
|
|
|
7129 |
metrics:
|
7130 |
- type: cosine_pearson
|
7131 |
value: 81.32328780209225
|
@@ -7145,14 +7147,14 @@ model-index:
|
|
7145 |
value: 81.32328780209225
|
7146 |
- type: spearman
|
7147 |
value: 74.17570679745272
|
7148 |
-
|
7149 |
type: STS
|
7150 |
-
|
7151 |
-
config: default
|
7152 |
name: MTEB STS13 (default)
|
7153 |
-
revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
|
7154 |
-
split: test
|
7155 |
type: mteb/sts13-sts
|
|
|
|
|
|
|
7156 |
metrics:
|
7157 |
- type: cosine_pearson
|
7158 |
value: 85.53224141249392
|
@@ -7172,14 +7174,14 @@ model-index:
|
|
7172 |
value: 85.53224141249392
|
7173 |
- type: spearman
|
7174 |
value: 86.16981525069227
|
7175 |
-
|
7176 |
type: STS
|
7177 |
-
|
7178 |
-
config: default
|
7179 |
name: MTEB STS14 (default)
|
7180 |
-
revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
|
7181 |
-
split: test
|
7182 |
type: mteb/sts14-sts
|
|
|
|
|
|
|
7183 |
metrics:
|
7184 |
- type: cosine_pearson
|
7185 |
value: 82.234064045301
|
@@ -7199,14 +7201,14 @@ model-index:
|
|
7199 |
value: 82.234064045301
|
7200 |
- type: spearman
|
7201 |
value: 78.86920830792957
|
7202 |
-
|
7203 |
type: STS
|
7204 |
-
|
7205 |
-
config: default
|
7206 |
name: MTEB STS15 (default)
|
7207 |
-
revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
|
7208 |
-
split: test
|
7209 |
type: mteb/sts15-sts
|
|
|
|
|
|
|
7210 |
metrics:
|
7211 |
- type: cosine_pearson
|
7212 |
value: 86.23114543080261
|
@@ -7226,14 +7228,14 @@ model-index:
|
|
7226 |
value: 86.23114543080261
|
7227 |
- type: spearman
|
7228 |
value: 87.481042891123
|
7229 |
-
|
7230 |
type: STS
|
7231 |
-
|
7232 |
-
config: default
|
7233 |
name: MTEB STS16 (default)
|
7234 |
-
revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
|
7235 |
-
split: test
|
7236 |
type: mteb/sts16-sts
|
|
|
|
|
|
|
7237 |
metrics:
|
7238 |
- type: cosine_pearson
|
7239 |
value: 82.9156629047782
|
@@ -7253,14 +7255,14 @@ model-index:
|
|
7253 |
value: 82.9156629047782
|
7254 |
- type: spearman
|
7255 |
value: 84.28381329207937
|
7256 |
-
|
7257 |
type: STS
|
7258 |
-
|
7259 |
-
config: en-en
|
7260 |
name: MTEB STS17 (en-en)
|
7261 |
-
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
|
7262 |
-
split: test
|
7263 |
type: mteb/sts17-crosslingual-sts
|
|
|
|
|
|
|
7264 |
metrics:
|
7265 |
- type: cosine_pearson
|
7266 |
value: 88.91985349746744
|
@@ -7280,14 +7282,14 @@ model-index:
|
|
7280 |
value: 88.91985349746744
|
7281 |
- type: spearman
|
7282 |
value: 89.69151633966257
|
7283 |
-
|
7284 |
type: STS
|
7285 |
-
|
7286 |
-
config: en
|
7287 |
name: MTEB STS22 (en)
|
7288 |
-
revision: de9d86b3b84231dc21f76c7b7af1f28e2f57f6e3
|
7289 |
-
split: test
|
7290 |
type: mteb/sts22-crosslingual-sts
|
|
|
|
|
|
|
7291 |
metrics:
|
7292 |
- type: cosine_pearson
|
7293 |
value: 65.0979772547511
|
@@ -7307,14 +7309,14 @@ model-index:
|
|
7307 |
value: 65.0979772547511
|
7308 |
- type: spearman
|
7309 |
value: 65.78126527764236
|
7310 |
-
|
7311 |
type: STS
|
7312 |
-
|
7313 |
-
config: default
|
7314 |
name: MTEB STSBenchmark (default)
|
7315 |
-
revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
|
7316 |
-
split: test
|
7317 |
type: mteb/stsbenchmark-sts
|
|
|
|
|
|
|
7318 |
metrics:
|
7319 |
- type: cosine_pearson
|
7320 |
value: 85.6426635049971
|
@@ -7334,14 +7336,14 @@ model-index:
|
|
7334 |
value: 85.6426635049971
|
7335 |
- type: spearman
|
7336 |
value: 85.609856578385
|
7337 |
-
|
7338 |
-
type:
|
7339 |
-
|
7340 |
-
config: default
|
7341 |
name: MTEB SciDocsRR (default)
|
7342 |
-
revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab
|
7343 |
-
split: test
|
7344 |
type: mteb/scidocs-reranking
|
|
|
|
|
|
|
7345 |
metrics:
|
7346 |
- type: main_score
|
7347 |
value: 82.85163332499799
|
@@ -7361,14 +7363,14 @@ model-index:
|
|
7361 |
value: 89.47202967481866
|
7362 |
- type: nAUC_mrr_std
|
7363 |
value: 85.40446996933892
|
7364 |
-
|
7365 |
-
type:
|
7366 |
-
|
7367 |
-
config: default
|
7368 |
name: MTEB SciFact (default)
|
7369 |
-
revision: 0228b52cf27578f30900b9e5271d331663a030d7
|
7370 |
-
split: test
|
7371 |
type: mteb/scifact
|
|
|
|
|
|
|
7372 |
metrics:
|
7373 |
- type: main_score
|
7374 |
value: 71.655
|
@@ -7652,14 +7654,14 @@ model-index:
|
|
7652 |
value: 71.61699999999999
|
7653 |
- type: recall_at_5
|
7654 |
value: 78.361
|
7655 |
-
|
7656 |
-
type:
|
7657 |
-
|
7658 |
-
config: default
|
7659 |
name: MTEB SprintDuplicateQuestions (default)
|
7660 |
-
revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46
|
7661 |
-
split: test
|
7662 |
type: mteb/sprintduplicatequestions-pairclassification
|
|
|
|
|
|
|
7663 |
metrics:
|
7664 |
- type: cosine_accuracy
|
7665 |
value: 99.8019801980198
|
@@ -7743,14 +7745,14 @@ model-index:
|
|
7743 |
value: 90.79754601226993
|
7744 |
- type: similarity_recall
|
7745 |
value: 88.8
|
7746 |
-
|
7747 |
-
type:
|
7748 |
-
|
7749 |
-
config: default
|
7750 |
name: MTEB StackExchangeClustering (default)
|
7751 |
-
revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259
|
7752 |
-
split: test
|
7753 |
type: mteb/stackexchange-clustering
|
|
|
|
|
|
|
7754 |
metrics:
|
7755 |
- type: main_score
|
7756 |
value: 66.63931197758824
|
@@ -7758,14 +7760,14 @@ model-index:
|
|
7758 |
value: 66.63931197758824
|
7759 |
- type: v_measure_std
|
7760 |
value: 3.896206781511776
|
7761 |
-
|
7762 |
type: Clustering
|
7763 |
-
|
7764 |
-
config: default
|
7765 |
name: MTEB StackExchangeClusteringP2P (default)
|
7766 |
-
revision: 815ca46b2622cec33ccafc3735d572c266efdb44
|
7767 |
-
split: test
|
7768 |
type: mteb/stackexchange-clustering-p2p
|
|
|
|
|
|
|
7769 |
metrics:
|
7770 |
- type: main_score
|
7771 |
value: 38.984892653301884
|
@@ -7773,14 +7775,14 @@ model-index:
|
|
7773 |
value: 38.984892653301884
|
7774 |
- type: v_measure_std
|
7775 |
value: 1.3308552162270453
|
7776 |
-
|
7777 |
-
type:
|
7778 |
-
|
7779 |
-
config: default
|
7780 |
name: MTEB StackOverflowDupQuestions (default)
|
7781 |
-
revision: e185fbe320c72810689fc5848eb6114e1ef5ec69
|
7782 |
-
split: test
|
7783 |
type: mteb/stackoverflowdupquestions-reranking
|
|
|
|
|
|
|
7784 |
metrics:
|
7785 |
- type: main_score
|
7786 |
value: 52.71499643455044
|
@@ -7800,14 +7802,14 @@ model-index:
|
|
7800 |
value: 13.931448578334379
|
7801 |
- type: nAUC_mrr_std
|
7802 |
value: 10.441860004959661
|
7803 |
-
|
7804 |
-
type:
|
7805 |
-
|
7806 |
-
config: default
|
7807 |
name: MTEB SummEval (default)
|
7808 |
-
revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c
|
7809 |
-
split: test
|
7810 |
type: mteb/summeval
|
|
|
|
|
|
|
7811 |
metrics:
|
7812 |
- type: cosine_pearson
|
7813 |
value: 31.5167525286909
|
@@ -7823,14 +7825,14 @@ model-index:
|
|
7823 |
value: 31.5167525286909
|
7824 |
- type: spearman
|
7825 |
value: 31.218862970706496
|
7826 |
-
|
7827 |
-
type:
|
7828 |
-
|
7829 |
-
config: default
|
7830 |
name: MTEB TRECCOVID (default)
|
7831 |
-
revision: bb9466bac8153a0349341eb1b22e06409e78ef4e
|
7832 |
-
split: test
|
7833 |
type: mteb/trec-covid
|
|
|
|
|
|
|
7834 |
metrics:
|
7835 |
- type: main_score
|
7836 |
value: 78.996
|
@@ -8114,14 +8116,14 @@ model-index:
|
|
8114 |
value: 0.705
|
8115 |
- type: recall_at_5
|
8116 |
value: 1.162
|
8117 |
-
|
8118 |
type: Retrieval
|
8119 |
-
|
8120 |
-
config: default
|
8121 |
name: MTEB Touche2020 (default)
|
8122 |
-
revision: a34f9a33db75fa0cbb21bb5cfc3dae8dc8bec93f
|
8123 |
-
split: test
|
8124 |
type: mteb/touche2020
|
|
|
|
|
|
|
8125 |
metrics:
|
8126 |
- type: main_score
|
8127 |
value: 24.234
|
@@ -8405,14 +8407,14 @@ model-index:
|
|
8405 |
value: 6.625
|
8406 |
- type: recall_at_5
|
8407 |
value: 9.094
|
8408 |
-
|
8409 |
-
type:
|
8410 |
-
|
8411 |
-
config: default
|
8412 |
name: MTEB ToxicConversationsClassification (default)
|
8413 |
-
revision: edfaf9da55d3dd50d43143d90c1ac476895ae6de
|
8414 |
-
split: test
|
8415 |
type: mteb/toxic_conversations_50k
|
|
|
|
|
|
|
8416 |
metrics:
|
8417 |
- type: accuracy
|
8418 |
value: 72.822265625
|
@@ -8426,14 +8428,14 @@ model-index:
|
|
8426 |
value: 78.7454393727821
|
8427 |
- type: main_score
|
8428 |
value: 72.822265625
|
8429 |
-
|
8430 |
type: Classification
|
8431 |
-
|
8432 |
-
config: default
|
8433 |
name: MTEB TweetSentimentExtractionClassification (default)
|
8434 |
-
revision: d604517c81ca91fe16a244d1248fc021f9ecee7a
|
8435 |
-
split: test
|
8436 |
type: mteb/tweet_sentiment_extraction
|
|
|
|
|
|
|
8437 |
metrics:
|
8438 |
- type: accuracy
|
8439 |
value: 72.54385964912281
|
@@ -8443,14 +8445,14 @@ model-index:
|
|
8443 |
value: 72.18022450339639
|
8444 |
- type: main_score
|
8445 |
value: 72.54385964912281
|
8446 |
-
|
8447 |
-
type:
|
8448 |
-
|
8449 |
-
config: default
|
8450 |
name: MTEB TwentyNewsgroupsClustering (default)
|
8451 |
-
revision: 6125ec4e24fa026cec8a478383ee943acfbd5449
|
8452 |
-
split: test
|
8453 |
type: mteb/twentynewsgroups-clustering
|
|
|
|
|
|
|
8454 |
metrics:
|
8455 |
- type: main_score
|
8456 |
value: 57.41861450414374
|
@@ -8458,14 +8460,14 @@ model-index:
|
|
8458 |
value: 57.41861450414374
|
8459 |
- type: v_measure_std
|
8460 |
value: 1.1732394227153524
|
8461 |
-
|
8462 |
-
type:
|
8463 |
-
|
8464 |
-
config: default
|
8465 |
name: MTEB TwitterSemEval2015 (default)
|
8466 |
-
revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1
|
8467 |
-
split: test
|
8468 |
type: mteb/twittersemeval2015-pairclassification
|
|
|
|
|
|
|
8469 |
metrics:
|
8470 |
- type: cosine_accuracy
|
8471 |
value: 85.65893783155511
|
@@ -8549,14 +8551,14 @@ model-index:
|
|
8549 |
value: 64.0855106888361
|
8550 |
- type: similarity_recall
|
8551 |
value: 71.18733509234828
|
8552 |
-
|
8553 |
type: PairClassification
|
8554 |
-
|
8555 |
-
config: default
|
8556 |
name: MTEB TwitterURLCorpus (default)
|
8557 |
-
revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf
|
8558 |
-
split: test
|
8559 |
type: mteb/twitterurlcorpus-pairclassification
|
|
|
|
|
|
|
8560 |
metrics:
|
8561 |
- type: cosine_accuracy
|
8562 |
value: 88.86754375751931
|
@@ -8640,8 +8642,6 @@ model-index:
|
|
8640 |
value: 74.19310344827586
|
8641 |
- type: similarity_recall
|
8642 |
value: 82.83030489682784
|
8643 |
-
task:
|
8644 |
-
type: PairClassification
|
8645 |
---
|
8646 |
# Contextual Document Embeddings (CDE)
|
8647 |
|
|
|
4 |
model-index:
|
5 |
- name: cde-small-v1
|
6 |
results:
|
7 |
+
- task:
|
8 |
+
type: Classification
|
9 |
+
dataset:
|
10 |
name: MTEB AmazonCounterfactualClassification (en)
|
|
|
|
|
11 |
type: mteb/amazon_counterfactual
|
12 |
+
config: en
|
13 |
+
split: test
|
14 |
+
revision: e8379541af4e31359cca9fbcf4b00f2671dba205
|
15 |
metrics:
|
16 |
- type: accuracy
|
17 |
value: 87.01492537313433
|
|
|
25 |
value: 87.74802754480477
|
26 |
- type: main_score
|
27 |
value: 87.01492537313433
|
28 |
+
- task:
|
29 |
type: Classification
|
30 |
+
dataset:
|
|
|
31 |
name: MTEB AmazonPolarityClassification (default)
|
|
|
|
|
32 |
type: mteb/amazon_polarity
|
33 |
+
config: default
|
34 |
+
split: test
|
35 |
+
revision: e2d317d38cd51312af73b3d32a06d1a08b442046
|
36 |
metrics:
|
37 |
- type: accuracy
|
38 |
value: 94.652275
|
|
|
46 |
value: 94.64655930708355
|
47 |
- type: main_score
|
48 |
value: 94.652275
|
49 |
+
- task:
|
50 |
type: Classification
|
51 |
+
dataset:
|
|
|
52 |
name: MTEB AmazonReviewsClassification (en)
|
|
|
|
|
53 |
type: mteb/amazon_reviews_multi
|
54 |
+
config: en
|
55 |
+
split: test
|
56 |
+
revision: 1399c76144fd37290681b995c656ef9b2e06e26d
|
57 |
metrics:
|
58 |
- type: accuracy
|
59 |
value: 55.75599999999999
|
|
|
63 |
value: 55.07058630829347
|
64 |
- type: main_score
|
65 |
value: 55.75599999999999
|
66 |
+
- task:
|
67 |
+
type: Retrieval
|
68 |
+
dataset:
|
|
|
69 |
name: MTEB ArguAna (default)
|
|
|
|
|
70 |
type: mteb/arguana
|
71 |
+
config: default
|
72 |
+
split: test
|
73 |
+
revision: c22ab2a51041ffd869aaddef7af8d8215647e41a
|
74 |
metrics:
|
75 |
- type: main_score
|
76 |
value: 69.959
|
|
|
354 |
value: 74.182
|
355 |
- type: recall_at_5
|
356 |
value: 84.495
|
357 |
+
- task:
|
358 |
+
type: Clustering
|
359 |
+
dataset:
|
|
|
360 |
name: MTEB ArxivClusteringP2P (default)
|
|
|
|
|
361 |
type: mteb/arxiv-clustering-p2p
|
362 |
+
config: default
|
363 |
+
split: test
|
364 |
+
revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d
|
365 |
metrics:
|
366 |
- type: main_score
|
367 |
value: 48.54672141116669
|
|
|
369 |
value: 48.54672141116669
|
370 |
- type: v_measure_std
|
371 |
value: 14.037498386768362
|
372 |
+
- task:
|
373 |
type: Clustering
|
374 |
+
dataset:
|
|
|
375 |
name: MTEB ArxivClusteringS2S (default)
|
|
|
|
|
376 |
type: mteb/arxiv-clustering-s2s
|
377 |
+
config: default
|
378 |
+
split: test
|
379 |
+
revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53
|
380 |
metrics:
|
381 |
- type: main_score
|
382 |
value: 40.5914039166466
|
|
|
384 |
value: 40.5914039166466
|
385 |
- type: v_measure_std
|
386 |
value: 14.385069818910331
|
387 |
+
- task:
|
388 |
+
type: Reranking
|
389 |
+
dataset:
|
|
|
390 |
name: MTEB AskUbuntuDupQuestions (default)
|
|
|
|
|
391 |
type: mteb/askubuntudupquestions-reranking
|
392 |
+
config: default
|
393 |
+
split: test
|
394 |
+
revision: 2000358ca161889fa9c082cb41daa8dcfb161a54
|
395 |
metrics:
|
396 |
- type: main_score
|
397 |
value: 61.13621260261507
|
|
|
411 |
value: 31.484257486448364
|
412 |
- type: nAUC_mrr_std
|
413 |
value: 21.252659250011632
|
414 |
+
- task:
|
415 |
+
type: STS
|
416 |
+
dataset:
|
|
|
417 |
name: MTEB BIOSSES (default)
|
|
|
|
|
418 |
type: mteb/biosses-sts
|
419 |
+
config: default
|
420 |
+
split: test
|
421 |
+
revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
|
422 |
metrics:
|
423 |
- type: cosine_pearson
|
424 |
value: 89.07028016646942
|
|
|
438 |
value: 89.07028016646942
|
439 |
- type: spearman
|
440 |
value: 86.69595132967805
|
441 |
+
- task:
|
442 |
+
type: Classification
|
443 |
+
dataset:
|
|
|
444 |
name: MTEB Banking77Classification (default)
|
|
|
|
|
445 |
type: mteb/banking77
|
446 |
+
config: default
|
447 |
+
split: test
|
448 |
+
revision: 0fd18e25b25c072e09e0d92ab615fda904d66300
|
449 |
metrics:
|
450 |
- type: accuracy
|
451 |
value: 88.6038961038961
|
|
|
455 |
value: 88.56824205739822
|
456 |
- type: main_score
|
457 |
value: 88.6038961038961
|
458 |
+
- task:
|
459 |
+
type: Clustering
|
460 |
+
dataset:
|
|
|
461 |
name: MTEB BiorxivClusteringP2P (default)
|
|
|
|
|
462 |
type: mteb/biorxiv-clustering-p2p
|
463 |
+
config: default
|
464 |
+
split: test
|
465 |
+
revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40
|
466 |
metrics:
|
467 |
- type: main_score
|
468 |
value: 44.77800814327256
|
|
|
470 |
value: 44.77800814327256
|
471 |
- type: v_measure_std
|
472 |
value: 0.6462535527471919
|
473 |
+
- task:
|
474 |
type: Clustering
|
475 |
+
dataset:
|
|
|
476 |
name: MTEB BiorxivClusteringS2S (default)
|
|
|
|
|
477 |
type: mteb/biorxiv-clustering-s2s
|
478 |
+
config: default
|
479 |
+
split: test
|
480 |
+
revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908
|
481 |
metrics:
|
482 |
- type: main_score
|
483 |
value: 38.16110272459102
|
|
|
485 |
value: 38.16110272459102
|
486 |
- type: v_measure_std
|
487 |
value: 0.7456916212435019
|
488 |
+
- task:
|
489 |
+
type: Retrieval
|
490 |
+
dataset:
|
|
|
491 |
name: MTEB CQADupstackAndroidRetrieval (default)
|
|
|
|
|
492 |
type: mteb/cqadupstack-android
|
493 |
+
config: default
|
494 |
+
split: test
|
495 |
+
revision: f46a197baaae43b4f621051089b82a364682dfeb
|
496 |
metrics:
|
497 |
- type: main_score
|
498 |
value: 49.376
|
|
|
776 |
value: 47.591
|
777 |
- type: recall_at_5
|
778 |
value: 54.245
|
779 |
+
- task:
|
780 |
type: Retrieval
|
781 |
+
dataset:
|
|
|
782 |
name: MTEB CQADupstackEnglishRetrieval (default)
|
|
|
|
|
783 |
type: mteb/cqadupstack-english
|
784 |
+
config: default
|
785 |
+
split: test
|
786 |
+
revision: ad9991cb51e31e31e430383c75ffb2885547b5f0
|
787 |
metrics:
|
788 |
- type: main_score
|
789 |
value: 44.727
|
|
|
1067 |
value: 42.085
|
1068 |
- type: recall_at_5
|
1069 |
value: 47.5
|
1070 |
+
- task:
|
1071 |
type: Retrieval
|
1072 |
+
dataset:
|
|
|
1073 |
name: MTEB CQADupstackGamingRetrieval (default)
|
|
|
|
|
1074 |
type: mteb/cqadupstack-gaming
|
1075 |
+
config: default
|
1076 |
+
split: test
|
1077 |
+
revision: 4885aa143210c98657558c04aaf3dc47cfb54340
|
1078 |
metrics:
|
1079 |
- type: main_score
|
1080 |
value: 59.001999999999995
|
|
|
1358 |
value: 57.916000000000004
|
1359 |
- type: recall_at_5
|
1360 |
value: 65.44
|
1361 |
+
- task:
|
1362 |
type: Retrieval
|
1363 |
+
dataset:
|
|
|
1364 |
name: MTEB CQADupstackGisRetrieval (default)
|
|
|
|
|
1365 |
type: mteb/cqadupstack-gis
|
1366 |
+
config: default
|
1367 |
+
split: test
|
1368 |
+
revision: 5003b3064772da1887988e05400cf3806fe491f2
|
1369 |
metrics:
|
1370 |
- type: main_score
|
1371 |
value: 37.501
|
|
|
1649 |
value: 37.218
|
1650 |
- type: recall_at_5
|
1651 |
value: 42.559000000000005
|
1652 |
+
- task:
|
1653 |
type: Retrieval
|
1654 |
+
dataset:
|
|
|
1655 |
name: MTEB CQADupstackMathematicaRetrieval (default)
|
|
|
|
|
1656 |
type: mteb/cqadupstack-mathematica
|
1657 |
+
config: default
|
1658 |
+
split: test
|
1659 |
+
revision: 90fceea13679c63fe563ded68f3b6f06e50061de
|
1660 |
metrics:
|
1661 |
- type: main_score
|
1662 |
value: 27.653
|
|
|
1940 |
value: 25.469
|
1941 |
- type: recall_at_5
|
1942 |
value: 31.316
|
1943 |
+
- task:
|
1944 |
type: Retrieval
|
1945 |
+
dataset:
|
|
|
1946 |
name: MTEB CQADupstackPhysicsRetrieval (default)
|
|
|
|
|
1947 |
type: mteb/cqadupstack-physics
|
1948 |
+
config: default
|
1949 |
+
split: test
|
1950 |
+
revision: 79531abbd1fb92d06c6d6315a0cbbbf5bb247ea4
|
1951 |
metrics:
|
1952 |
- type: main_score
|
1953 |
value: 45.314
|
|
|
2231 |
value: 43.679
|
2232 |
- type: recall_at_5
|
2233 |
value: 49.735
|
2234 |
+
- task:
|
2235 |
type: Retrieval
|
2236 |
+
dataset:
|
|
|
2237 |
name: MTEB CQADupstackProgrammersRetrieval (default)
|
|
|
|
|
2238 |
type: mteb/cqadupstack-programmers
|
2239 |
+
config: default
|
2240 |
+
split: test
|
2241 |
+
revision: 6184bc1440d2dbc7612be22b50686b8826d22b32
|
2242 |
metrics:
|
2243 |
- type: main_score
|
2244 |
value: 41.972
|
|
|
2522 |
value: 39.363
|
2523 |
- type: recall_at_5
|
2524 |
value: 44.665
|
2525 |
+
- task:
|
2526 |
type: Retrieval
|
2527 |
+
dataset:
|
|
|
2528 |
name: MTEB CQADupstackRetrieval (default)
|
|
|
|
|
2529 |
type: CQADupstackRetrieval_is_a_combined_dataset
|
2530 |
+
config: default
|
2531 |
+
split: test
|
2532 |
+
revision: CQADupstackRetrieval_is_a_combined_dataset
|
2533 |
metrics:
|
2534 |
- type: main_score
|
2535 |
value: 39.823499999999996
|
2536 |
- type: ndcg_at_10
|
2537 |
value: 39.823499999999996
|
2538 |
+
- task:
|
2539 |
type: Retrieval
|
2540 |
+
dataset:
|
|
|
2541 |
name: MTEB CQADupstackStatsRetrieval (default)
|
|
|
|
|
2542 |
type: mteb/cqadupstack-stats
|
2543 |
+
config: default
|
2544 |
+
split: test
|
2545 |
+
revision: 65ac3a16b8e91f9cee4c9828cc7c335575432a2a
|
2546 |
metrics:
|
2547 |
- type: main_score
|
2548 |
value: 34.943000000000005
|
|
|
2826 |
value: 33.427
|
2827 |
- type: recall_at_5
|
2828 |
value: 37.643
|
2829 |
+
- task:
|
2830 |
type: Retrieval
|
2831 |
+
dataset:
|
|
|
2832 |
name: MTEB CQADupstackTexRetrieval (default)
|
|
|
|
|
2833 |
type: mteb/cqadupstack-tex
|
2834 |
+
config: default
|
2835 |
+
split: test
|
2836 |
+
revision: 46989137a86843e03a6195de44b09deda022eec7
|
2837 |
metrics:
|
2838 |
- type: main_score
|
2839 |
value: 27.271
|
|
|
3117 |
value: 25.592
|
3118 |
- type: recall_at_5
|
3119 |
value: 30.279
|
3120 |
+
- task:
|
3121 |
type: Retrieval
|
3122 |
+
dataset:
|
|
|
3123 |
name: MTEB CQADupstackUnixRetrieval (default)
|
|
|
|
|
3124 |
type: mteb/cqadupstack-unix
|
3125 |
+
config: default
|
3126 |
+
split: test
|
3127 |
+
revision: 6c6430d3a6d36f8d2a829195bc5dc94d7e063e53
|
3128 |
metrics:
|
3129 |
- type: main_score
|
3130 |
value: 38.237
|
|
|
3408 |
value: 36.275
|
3409 |
- type: recall_at_5
|
3410 |
value: 42.199
|
3411 |
+
- task:
|
3412 |
type: Retrieval
|
3413 |
+
dataset:
|
|
|
3414 |
name: MTEB CQADupstackWebmastersRetrieval (default)
|
|
|
|
|
3415 |
type: mteb/cqadupstack-webmasters
|
3416 |
+
config: default
|
3417 |
+
split: test
|
3418 |
+
revision: 160c094312a0e1facb97e55eeddb698c0abe3571
|
3419 |
metrics:
|
3420 |
- type: main_score
|
3421 |
value: 38.702
|
|
|
3699 |
value: 37.634
|
3700 |
- type: recall_at_5
|
3701 |
value: 42.021
|
3702 |
+
- task:
|
3703 |
type: Retrieval
|
3704 |
+
dataset:
|
|
|
3705 |
name: MTEB CQADupstackWordpressRetrieval (default)
|
|
|
|
|
3706 |
type: mteb/cqadupstack-wordpress
|
3707 |
+
config: default
|
3708 |
+
split: test
|
3709 |
+
revision: 4ffe81d471b1924886b33c7567bfb200e9eec5c4
|
3710 |
metrics:
|
3711 |
- type: main_score
|
3712 |
value: 33.184000000000005
|
|
|
3990 |
value: 32.683
|
3991 |
- type: recall_at_5
|
3992 |
value: 36.756
|
3993 |
+
- task:
|
3994 |
type: Retrieval
|
3995 |
+
dataset:
|
|
|
3996 |
name: MTEB ClimateFEVER (default)
|
|
|
|
|
3997 |
type: mteb/climate-fever
|
3998 |
+
config: default
|
3999 |
+
split: test
|
4000 |
+
revision: 47f2ac6acb640fc46020b02a5b59fdda04d39380
|
4001 |
metrics:
|
4002 |
- type: main_score
|
4003 |
value: 25.068
|
|
|
4281 |
value: 18.312
|
4282 |
- type: recall_at_5
|
4283 |
value: 22.776
|
4284 |
+
- task:
|
4285 |
type: Retrieval
|
4286 |
+
dataset:
|
|
|
4287 |
name: MTEB DBPedia (default)
|
|
|
|
|
4288 |
type: mteb/dbpedia
|
4289 |
+
config: default
|
4290 |
+
split: test
|
4291 |
+
revision: c0f706b76e590d620bd6618b3ca8efdd34e2d659
|
4292 |
metrics:
|
4293 |
- type: main_score
|
4294 |
value: 40.128
|
|
|
4572 |
value: 14.562
|
4573 |
- type: recall_at_5
|
4574 |
value: 18.779
|
4575 |
+
- task:
|
4576 |
+
type: Classification
|
4577 |
+
dataset:
|
|
|
4578 |
name: MTEB EmotionClassification (default)
|
|
|
|
|
4579 |
type: mteb/emotion
|
4580 |
+
config: default
|
4581 |
+
split: test
|
4582 |
+
revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37
|
4583 |
metrics:
|
4584 |
- type: accuracy
|
4585 |
value: 74.86
|
|
|
4589 |
value: 75.96499621761998
|
4590 |
- type: main_score
|
4591 |
value: 74.86
|
4592 |
+
- task:
|
4593 |
+
type: Retrieval
|
4594 |
+
dataset:
|
|
|
4595 |
name: MTEB FEVER (default)
|
|
|
|
|
4596 |
type: mteb/fever
|
4597 |
+
config: default
|
4598 |
+
split: test
|
4599 |
+
revision: bea83ef9e8fb933d90a2f1d5515737465d613e12
|
4600 |
metrics:
|
4601 |
- type: main_score
|
4602 |
value: 86.029
|
|
|
4880 |
value: 88.382
|
4881 |
- type: recall_at_5
|
4882 |
value: 90.908
|
4883 |
+
- task:
|
4884 |
type: Retrieval
|
4885 |
+
dataset:
|
|
|
4886 |
name: MTEB FiQA2018 (default)
|
|
|
|
|
4887 |
type: mteb/fiqa
|
4888 |
+
config: default
|
4889 |
+
split: test
|
4890 |
+
revision: 27a168819829fe9bcd655c2df245fb19452e8e06
|
4891 |
metrics:
|
4892 |
- type: main_score
|
4893 |
value: 45.238
|
|
|
5171 |
value: 37.656
|
5172 |
- type: recall_at_5
|
5173 |
value: 44.766
|
5174 |
+
- task:
|
5175 |
type: Retrieval
|
5176 |
+
dataset:
|
|
|
5177 |
name: MTEB HotpotQA (default)
|
|
|
|
|
5178 |
type: mteb/hotpotqa
|
5179 |
+
config: default
|
5180 |
+
split: test
|
5181 |
+
revision: ab518f4d6fcca38d87c25209f94beba119d02014
|
5182 |
metrics:
|
5183 |
- type: main_score
|
5184 |
value: 66.672
|
|
|
5462 |
value: 57.522
|
5463 |
- type: recall_at_5
|
5464 |
value: 62.134
|
5465 |
+
- task:
|
5466 |
+
type: Classification
|
5467 |
+
dataset:
|
|
|
5468 |
name: MTEB ImdbClassification (default)
|
|
|
|
|
5469 |
type: mteb/imdb
|
5470 |
+
config: default
|
5471 |
+
split: test
|
5472 |
+
revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7
|
5473 |
metrics:
|
5474 |
- type: accuracy
|
5475 |
value: 93.5944
|
|
|
5483 |
value: 93.58945949328377
|
5484 |
- type: main_score
|
5485 |
value: 93.5944
|
5486 |
+
- task:
|
5487 |
+
type: Retrieval
|
5488 |
+
dataset:
|
|
|
5489 |
name: MTEB MSMARCO (default)
|
|
|
|
|
5490 |
type: mteb/msmarco
|
5491 |
+
config: default
|
5492 |
+
split: dev
|
5493 |
+
revision: c5a29a104738b98a9e76336939199e264163d4a0
|
5494 |
metrics:
|
5495 |
- type: main_score
|
5496 |
value: 41.448
|
|
|
5774 |
value: 41.304
|
5775 |
- type: recall_at_5
|
5776 |
value: 51.076
|
5777 |
+
- task:
|
5778 |
+
type: Classification
|
5779 |
+
dataset:
|
|
|
5780 |
name: MTEB MTOPDomainClassification (en)
|
|
|
|
|
5781 |
type: mteb/mtop_domain
|
5782 |
+
config: en
|
5783 |
+
split: test
|
5784 |
+
revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
|
5785 |
metrics:
|
5786 |
- type: accuracy
|
5787 |
value: 96.03967168262655
|
|
|
5791 |
value: 96.06623245823347
|
5792 |
- type: main_score
|
5793 |
value: 96.03967168262655
|
5794 |
+
- task:
|
5795 |
type: Classification
|
5796 |
+
dataset:
|
|
|
5797 |
name: MTEB MTOPIntentClassification (en)
|
|
|
|
|
5798 |
type: mteb/mtop_intent
|
5799 |
+
config: en
|
5800 |
+
split: test
|
5801 |
+
revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
|
5802 |
metrics:
|
5803 |
- type: accuracy
|
5804 |
value: 89.12904696762428
|
|
|
5808 |
value: 90.41290566743324
|
5809 |
- type: main_score
|
5810 |
value: 89.12904696762428
|
5811 |
+
- task:
|
5812 |
type: Classification
|
5813 |
+
dataset:
|
|
|
5814 |
name: MTEB MassiveIntentClassification (en)
|
|
|
|
|
5815 |
type: mteb/amazon_massive_intent
|
5816 |
+
config: en
|
5817 |
+
split: test
|
5818 |
+
revision: 4672e20407010da34463acc759c162ca9734bca6
|
5819 |
metrics:
|
5820 |
- type: accuracy
|
5821 |
value: 76.49630127774041
|
|
|
5825 |
value: 76.42436195016484
|
5826 |
- type: main_score
|
5827 |
value: 76.49630127774041
|
5828 |
+
- task:
|
5829 |
type: Classification
|
5830 |
+
dataset:
|
|
|
5831 |
name: MTEB MassiveScenarioClassification (en)
|
|
|
|
|
5832 |
type: mteb/amazon_massive_scenario
|
5833 |
+
config: en
|
5834 |
+
split: test
|
5835 |
+
revision: fad2c6e8459f9e1c45d9315f4953d921437d70f8
|
5836 |
metrics:
|
5837 |
- type: accuracy
|
5838 |
value: 78.9340954942838
|
|
|
5842 |
value: 78.87787647838971
|
5843 |
- type: main_score
|
5844 |
value: 78.9340954942838
|
5845 |
+
- task:
|
5846 |
+
type: Clustering
|
5847 |
+
dataset:
|
|
|
5848 |
name: MTEB MedrxivClusteringP2P (default)
|
|
|
|
|
5849 |
type: mteb/medrxiv-clustering-p2p
|
5850 |
+
config: default
|
5851 |
+
split: test
|
5852 |
+
revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73
|
5853 |
metrics:
|
5854 |
- type: main_score
|
5855 |
value: 37.50182848656019
|
|
|
5857 |
value: 37.50182848656019
|
5858 |
- type: v_measure_std
|
5859 |
value: 1.1708518023877268
|
5860 |
+
- task:
|
5861 |
type: Clustering
|
5862 |
+
dataset:
|
|
|
5863 |
name: MTEB MedrxivClusteringS2S (default)
|
|
|
|
|
5864 |
type: mteb/medrxiv-clustering-s2s
|
5865 |
+
config: default
|
5866 |
+
split: test
|
5867 |
+
revision: 35191c8c0dca72d8ff3efcd72aa802307d469663
|
5868 |
metrics:
|
5869 |
- type: main_score
|
5870 |
value: 35.72762609825363
|
|
|
5872 |
value: 35.72762609825363
|
5873 |
- type: v_measure_std
|
5874 |
value: 1.4555014772914985
|
5875 |
+
- task:
|
5876 |
+
type: Reranking
|
5877 |
+
dataset:
|
|
|
5878 |
name: MTEB MindSmallReranking (default)
|
|
|
|
|
5879 |
type: mteb/mind_small
|
5880 |
+
config: default
|
5881 |
+
split: test
|
5882 |
+
revision: 59042f120c80e8afa9cdbb224f67076cec0fc9a7
|
5883 |
metrics:
|
5884 |
- type: main_score
|
5885 |
value: 30.47716416454022
|
|
|
5899 |
value: -15.78941850629242
|
5900 |
- type: nAUC_mrr_std
|
5901 |
value: -1.1330442292510805
|
5902 |
+
- task:
|
5903 |
+
type: Retrieval
|
5904 |
+
dataset:
|
|
|
5905 |
name: MTEB NFCorpus (default)
|
|
|
|
|
5906 |
type: mteb/nfcorpus
|
5907 |
+
config: default
|
5908 |
+
split: test
|
5909 |
+
revision: ec0fa4fe99da2ff19ca1214b7966684033a58814
|
5910 |
metrics:
|
5911 |
- type: main_score
|
5912 |
value: 34.648
|
|
|
6190 |
value: 10.037
|
6191 |
- type: recall_at_5
|
6192 |
value: 12.717999999999998
|
6193 |
+
- task:
|
6194 |
type: Retrieval
|
6195 |
+
dataset:
|
|
|
6196 |
name: MTEB NQ (default)
|
|
|
|
|
6197 |
type: mteb/nq
|
6198 |
+
config: default
|
6199 |
+
split: test
|
6200 |
+
revision: b774495ed302d8c44a3a7ea25c90dbce03968f31
|
6201 |
metrics:
|
6202 |
- type: main_score
|
6203 |
value: 60.06
|
|
|
6481 |
value: 61.114000000000004
|
6482 |
- type: recall_at_5
|
6483 |
value: 69.812
|
6484 |
+
- task:
|
6485 |
type: Retrieval
|
6486 |
+
dataset:
|
|
|
6487 |
name: MTEB QuoraRetrieval (default)
|
|
|
|
|
6488 |
type: mteb/quora
|
6489 |
+
config: default
|
6490 |
+
split: test
|
6491 |
+
revision: e4e08e0b7dbe3c8700f0daef558ff32256715259
|
6492 |
metrics:
|
6493 |
- type: main_score
|
6494 |
value: 89.821
|
|
|
6772 |
value: 88.714
|
6773 |
- type: recall_at_5
|
6774 |
value: 92.96799999999999
|
6775 |
+
- task:
|
6776 |
+
type: Clustering
|
6777 |
+
dataset:
|
|
|
6778 |
name: MTEB RedditClustering (default)
|
|
|
|
|
6779 |
type: mteb/reddit-clustering
|
6780 |
+
config: default
|
6781 |
+
split: test
|
6782 |
+
revision: 24640382cdbf8abc73003fb0fa6d111a705499eb
|
6783 |
metrics:
|
6784 |
- type: main_score
|
6785 |
value: 59.36038828851887
|
|
|
6787 |
value: 59.36038828851887
|
6788 |
- type: v_measure_std
|
6789 |
value: 4.1958765965154425
|
6790 |
+
- task:
|
6791 |
type: Clustering
|
6792 |
+
dataset:
|
|
|
6793 |
name: MTEB RedditClusteringP2P (default)
|
|
|
|
|
6794 |
type: mteb/reddit-clustering-p2p
|
6795 |
+
config: default
|
6796 |
+
split: test
|
6797 |
+
revision: 385e3cb46b4cfa89021f56c4380204149d0efe33
|
6798 |
metrics:
|
6799 |
- type: main_score
|
6800 |
value: 64.67522832408089
|
|
|
6802 |
value: 64.67522832408089
|
6803 |
- type: v_measure_std
|
6804 |
value: 12.473765016158698
|
6805 |
+
- task:
|
6806 |
+
type: Retrieval
|
6807 |
+
dataset:
|
|
|
6808 |
name: MTEB SCIDOCS (default)
|
|
|
|
|
6809 |
type: mteb/scidocs
|
6810 |
+
config: default
|
6811 |
+
split: test
|
6812 |
+
revision: f8c2fcf00f625baaa80f62ec5bd9e1fff3b8ae88
|
6813 |
metrics:
|
6814 |
- type: main_score
|
6815 |
value: 21.751
|
|
|
7093 |
value: 11.648
|
7094 |
- type: recall_at_5
|
7095 |
value: 15.883
|
7096 |
+
- task:
|
7097 |
+
type: STS
|
7098 |
+
dataset:
|
|
|
7099 |
name: MTEB SICK-R (default)
|
|
|
|
|
7100 |
type: mteb/sickr-sts
|
7101 |
+
config: default
|
7102 |
+
split: test
|
7103 |
+
revision: 20a6d6f312dd54037fe07a32d58e5e168867909d
|
7104 |
metrics:
|
7105 |
- type: cosine_pearson
|
7106 |
value: 84.0161170579997
|
|
|
7120 |
value: 84.0161170579997
|
7121 |
- type: spearman
|
7122 |
value: 77.52025923874551
|
7123 |
+
- task:
|
7124 |
type: STS
|
7125 |
+
dataset:
|
|
|
7126 |
name: MTEB STS12 (default)
|
|
|
|
|
7127 |
type: mteb/sts12-sts
|
7128 |
+
config: default
|
7129 |
+
split: test
|
7130 |
+
revision: a0d554a64d88156834ff5ae9920b964011b16384
|
7131 |
metrics:
|
7132 |
- type: cosine_pearson
|
7133 |
value: 81.32328780209225
|
|
|
7147 |
value: 81.32328780209225
|
7148 |
- type: spearman
|
7149 |
value: 74.17570679745272
|
7150 |
+
- task:
|
7151 |
type: STS
|
7152 |
+
dataset:
|
|
|
7153 |
name: MTEB STS13 (default)
|
|
|
|
|
7154 |
type: mteb/sts13-sts
|
7155 |
+
config: default
|
7156 |
+
split: test
|
7157 |
+
revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
|
7158 |
metrics:
|
7159 |
- type: cosine_pearson
|
7160 |
value: 85.53224141249392
|
|
|
7174 |
value: 85.53224141249392
|
7175 |
- type: spearman
|
7176 |
value: 86.16981525069227
|
7177 |
+
- task:
|
7178 |
type: STS
|
7179 |
+
dataset:
|
|
|
7180 |
name: MTEB STS14 (default)
|
|
|
|
|
7181 |
type: mteb/sts14-sts
|
7182 |
+
config: default
|
7183 |
+
split: test
|
7184 |
+
revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
|
7185 |
metrics:
|
7186 |
- type: cosine_pearson
|
7187 |
value: 82.234064045301
|
|
|
7201 |
value: 82.234064045301
|
7202 |
- type: spearman
|
7203 |
value: 78.86920830792957
|
7204 |
+
- task:
|
7205 |
type: STS
|
7206 |
+
dataset:
|
|
|
7207 |
name: MTEB STS15 (default)
|
|
|
|
|
7208 |
type: mteb/sts15-sts
|
7209 |
+
config: default
|
7210 |
+
split: test
|
7211 |
+
revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
|
7212 |
metrics:
|
7213 |
- type: cosine_pearson
|
7214 |
value: 86.23114543080261
|
|
|
7228 |
value: 86.23114543080261
|
7229 |
- type: spearman
|
7230 |
value: 87.481042891123
|
7231 |
+
- task:
|
7232 |
type: STS
|
7233 |
+
dataset:
|
|
|
7234 |
name: MTEB STS16 (default)
|
|
|
|
|
7235 |
type: mteb/sts16-sts
|
7236 |
+
config: default
|
7237 |
+
split: test
|
7238 |
+
revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
|
7239 |
metrics:
|
7240 |
- type: cosine_pearson
|
7241 |
value: 82.9156629047782
|
|
|
7255 |
value: 82.9156629047782
|
7256 |
- type: spearman
|
7257 |
value: 84.28381329207937
|
7258 |
+
- task:
|
7259 |
type: STS
|
7260 |
+
dataset:
|
|
|
7261 |
name: MTEB STS17 (en-en)
|
|
|
|
|
7262 |
type: mteb/sts17-crosslingual-sts
|
7263 |
+
config: en-en
|
7264 |
+
split: test
|
7265 |
+
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
|
7266 |
metrics:
|
7267 |
- type: cosine_pearson
|
7268 |
value: 88.91985349746744
|
|
|
7282 |
value: 88.91985349746744
|
7283 |
- type: spearman
|
7284 |
value: 89.69151633966257
|
7285 |
+
- task:
|
7286 |
type: STS
|
7287 |
+
dataset:
|
|
|
7288 |
name: MTEB STS22 (en)
|
|
|
|
|
7289 |
type: mteb/sts22-crosslingual-sts
|
7290 |
+
config: en
|
7291 |
+
split: test
|
7292 |
+
revision: de9d86b3b84231dc21f76c7b7af1f28e2f57f6e3
|
7293 |
metrics:
|
7294 |
- type: cosine_pearson
|
7295 |
value: 65.0979772547511
|
|
|
7309 |
value: 65.0979772547511
|
7310 |
- type: spearman
|
7311 |
value: 65.78126527764236
|
7312 |
+
- task:
|
7313 |
type: STS
|
7314 |
+
dataset:
|
|
|
7315 |
name: MTEB STSBenchmark (default)
|
|
|
|
|
7316 |
type: mteb/stsbenchmark-sts
|
7317 |
+
config: default
|
7318 |
+
split: test
|
7319 |
+
revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
|
7320 |
metrics:
|
7321 |
- type: cosine_pearson
|
7322 |
value: 85.6426635049971
|
|
|
7336 |
value: 85.6426635049971
|
7337 |
- type: spearman
|
7338 |
value: 85.609856578385
|
7339 |
+
- task:
|
7340 |
+
type: Reranking
|
7341 |
+
dataset:
|
|
|
7342 |
name: MTEB SciDocsRR (default)
|
|
|
|
|
7343 |
type: mteb/scidocs-reranking
|
7344 |
+
config: default
|
7345 |
+
split: test
|
7346 |
+
revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab
|
7347 |
metrics:
|
7348 |
- type: main_score
|
7349 |
value: 82.85163332499799
|
|
|
7363 |
value: 89.47202967481866
|
7364 |
- type: nAUC_mrr_std
|
7365 |
value: 85.40446996933892
|
7366 |
+
- task:
|
7367 |
+
type: Retrieval
|
7368 |
+
dataset:
|
|
|
7369 |
name: MTEB SciFact (default)
|
|
|
|
|
7370 |
type: mteb/scifact
|
7371 |
+
config: default
|
7372 |
+
split: test
|
7373 |
+
revision: 0228b52cf27578f30900b9e5271d331663a030d7
|
7374 |
metrics:
|
7375 |
- type: main_score
|
7376 |
value: 71.655
|
|
|
7654 |
value: 71.61699999999999
|
7655 |
- type: recall_at_5
|
7656 |
value: 78.361
|
7657 |
+
- task:
|
7658 |
+
type: PairClassification
|
7659 |
+
dataset:
|
|
|
7660 |
name: MTEB SprintDuplicateQuestions (default)
|
|
|
|
|
7661 |
type: mteb/sprintduplicatequestions-pairclassification
|
7662 |
+
config: default
|
7663 |
+
split: test
|
7664 |
+
revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46
|
7665 |
metrics:
|
7666 |
- type: cosine_accuracy
|
7667 |
value: 99.8019801980198
|
|
|
7745 |
value: 90.79754601226993
|
7746 |
- type: similarity_recall
|
7747 |
value: 88.8
|
7748 |
+
- task:
|
7749 |
+
type: Clustering
|
7750 |
+
dataset:
|
|
|
7751 |
name: MTEB StackExchangeClustering (default)
|
|
|
|
|
7752 |
type: mteb/stackexchange-clustering
|
7753 |
+
config: default
|
7754 |
+
split: test
|
7755 |
+
revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259
|
7756 |
metrics:
|
7757 |
- type: main_score
|
7758 |
value: 66.63931197758824
|
|
|
7760 |
value: 66.63931197758824
|
7761 |
- type: v_measure_std
|
7762 |
value: 3.896206781511776
|
7763 |
+
- task:
|
7764 |
type: Clustering
|
7765 |
+
dataset:
|
|
|
7766 |
name: MTEB StackExchangeClusteringP2P (default)
|
|
|
|
|
7767 |
type: mteb/stackexchange-clustering-p2p
|
7768 |
+
config: default
|
7769 |
+
split: test
|
7770 |
+
revision: 815ca46b2622cec33ccafc3735d572c266efdb44
|
7771 |
metrics:
|
7772 |
- type: main_score
|
7773 |
value: 38.984892653301884
|
|
|
7775 |
value: 38.984892653301884
|
7776 |
- type: v_measure_std
|
7777 |
value: 1.3308552162270453
|
7778 |
+
- task:
|
7779 |
+
type: Reranking
|
7780 |
+
dataset:
|
|
|
7781 |
name: MTEB StackOverflowDupQuestions (default)
|
|
|
|
|
7782 |
type: mteb/stackoverflowdupquestions-reranking
|
7783 |
+
config: default
|
7784 |
+
split: test
|
7785 |
+
revision: e185fbe320c72810689fc5848eb6114e1ef5ec69
|
7786 |
metrics:
|
7787 |
- type: main_score
|
7788 |
value: 52.71499643455044
|
|
|
7802 |
value: 13.931448578334379
|
7803 |
- type: nAUC_mrr_std
|
7804 |
value: 10.441860004959661
|
7805 |
+
- task:
|
7806 |
+
type: Summarization
|
7807 |
+
dataset:
|
|
|
7808 |
name: MTEB SummEval (default)
|
|
|
|
|
7809 |
type: mteb/summeval
|
7810 |
+
config: default
|
7811 |
+
split: test
|
7812 |
+
revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c
|
7813 |
metrics:
|
7814 |
- type: cosine_pearson
|
7815 |
value: 31.5167525286909
|
|
|
7825 |
value: 31.5167525286909
|
7826 |
- type: spearman
|
7827 |
value: 31.218862970706496
|
7828 |
+
- task:
|
7829 |
+
type: Retrieval
|
7830 |
+
dataset:
|
|
|
7831 |
name: MTEB TRECCOVID (default)
|
|
|
|
|
7832 |
type: mteb/trec-covid
|
7833 |
+
config: default
|
7834 |
+
split: test
|
7835 |
+
revision: bb9466bac8153a0349341eb1b22e06409e78ef4e
|
7836 |
metrics:
|
7837 |
- type: main_score
|
7838 |
value: 78.996
|
|
|
8116 |
value: 0.705
|
8117 |
- type: recall_at_5
|
8118 |
value: 1.162
|
8119 |
+
- task:
|
8120 |
type: Retrieval
|
8121 |
+
dataset:
|
|
|
8122 |
name: MTEB Touche2020 (default)
|
|
|
|
|
8123 |
type: mteb/touche2020
|
8124 |
+
config: default
|
8125 |
+
split: test
|
8126 |
+
revision: a34f9a33db75fa0cbb21bb5cfc3dae8dc8bec93f
|
8127 |
metrics:
|
8128 |
- type: main_score
|
8129 |
value: 24.234
|
|
|
8407 |
value: 6.625
|
8408 |
- type: recall_at_5
|
8409 |
value: 9.094
|
8410 |
+
- task:
|
8411 |
+
type: Classification
|
8412 |
+
dataset:
|
|
|
8413 |
name: MTEB ToxicConversationsClassification (default)
|
|
|
|
|
8414 |
type: mteb/toxic_conversations_50k
|
8415 |
+
config: default
|
8416 |
+
split: test
|
8417 |
+
revision: edfaf9da55d3dd50d43143d90c1ac476895ae6de
|
8418 |
metrics:
|
8419 |
- type: accuracy
|
8420 |
value: 72.822265625
|
|
|
8428 |
value: 78.7454393727821
|
8429 |
- type: main_score
|
8430 |
value: 72.822265625
|
8431 |
+
- task:
|
8432 |
type: Classification
|
8433 |
+
dataset:
|
|
|
8434 |
name: MTEB TweetSentimentExtractionClassification (default)
|
|
|
|
|
8435 |
type: mteb/tweet_sentiment_extraction
|
8436 |
+
config: default
|
8437 |
+
split: test
|
8438 |
+
revision: d604517c81ca91fe16a244d1248fc021f9ecee7a
|
8439 |
metrics:
|
8440 |
- type: accuracy
|
8441 |
value: 72.54385964912281
|
|
|
8445 |
value: 72.18022450339639
|
8446 |
- type: main_score
|
8447 |
value: 72.54385964912281
|
8448 |
+
- task:
|
8449 |
+
type: Clustering
|
8450 |
+
dataset:
|
|
|
8451 |
name: MTEB TwentyNewsgroupsClustering (default)
|
|
|
|
|
8452 |
type: mteb/twentynewsgroups-clustering
|
8453 |
+
config: default
|
8454 |
+
split: test
|
8455 |
+
revision: 6125ec4e24fa026cec8a478383ee943acfbd5449
|
8456 |
metrics:
|
8457 |
- type: main_score
|
8458 |
value: 57.41861450414374
|
|
|
8460 |
value: 57.41861450414374
|
8461 |
- type: v_measure_std
|
8462 |
value: 1.1732394227153524
|
8463 |
+
- task:
|
8464 |
+
type: PairClassification
|
8465 |
+
dataset:
|
|
|
8466 |
name: MTEB TwitterSemEval2015 (default)
|
|
|
|
|
8467 |
type: mteb/twittersemeval2015-pairclassification
|
8468 |
+
config: default
|
8469 |
+
split: test
|
8470 |
+
revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1
|
8471 |
metrics:
|
8472 |
- type: cosine_accuracy
|
8473 |
value: 85.65893783155511
|
|
|
8551 |
value: 64.0855106888361
|
8552 |
- type: similarity_recall
|
8553 |
value: 71.18733509234828
|
8554 |
+
- task:
|
8555 |
type: PairClassification
|
8556 |
+
dataset:
|
|
|
8557 |
name: MTEB TwitterURLCorpus (default)
|
|
|
|
|
8558 |
type: mteb/twitterurlcorpus-pairclassification
|
8559 |
+
config: default
|
8560 |
+
split: test
|
8561 |
+
revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf
|
8562 |
metrics:
|
8563 |
- type: cosine_accuracy
|
8564 |
value: 88.86754375751931
|
|
|
8642 |
value: 74.19310344827586
|
8643 |
- type: similarity_recall
|
8644 |
value: 82.83030489682784
|
|
|
|
|
8645 |
---
|
8646 |
# Contextual Document Embeddings (CDE)
|
8647 |
|
config.json
CHANGED
@@ -1,8 +1,14 @@
|
|
1 |
{
|
|
|
2 |
"architecture": "transductive",
|
3 |
"architectures": [
|
4 |
-
"
|
5 |
],
|
|
|
|
|
|
|
|
|
|
|
6 |
"biencoder_pooling_strategy": "mean",
|
7 |
"cache_dir": null,
|
8 |
"config_name": null,
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "/fsx-checkpoints/jxm/cde/2024-09-18-supervised-final-bge--epoch-4/checkpoint-1820",
|
3 |
"architecture": "transductive",
|
4 |
"architectures": [
|
5 |
+
"DatasetTransformer"
|
6 |
],
|
7 |
+
"attn_implementation": null,
|
8 |
+
"auto_map": {
|
9 |
+
"AutoConfig": "misc.ContextualModelConfig",
|
10 |
+
"AutoModel": "model.DatasetTransformer"
|
11 |
+
},
|
12 |
"biencoder_pooling_strategy": "mean",
|
13 |
"cache_dir": null,
|
14 |
"config_name": null,
|
misc.py
ADDED
@@ -0,0 +1,514 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, Iterable, List, Tuple, Union
|
2 |
+
|
3 |
+
import collections
|
4 |
+
import functools
|
5 |
+
import glob
|
6 |
+
import json
|
7 |
+
import hashlib
|
8 |
+
import itertools
|
9 |
+
import logging
|
10 |
+
import multiprocessing
|
11 |
+
import os
|
12 |
+
import pickle
|
13 |
+
import random
|
14 |
+
import requests
|
15 |
+
import sys
|
16 |
+
import zipfile
|
17 |
+
|
18 |
+
import datasets
|
19 |
+
import numpy as np
|
20 |
+
import safetensors
|
21 |
+
import torch
|
22 |
+
import tqdm
|
23 |
+
import transformers
|
24 |
+
|
25 |
+
from cde.lib.dist import get_num_proc, get_rank
|
26 |
+
|
27 |
+
|
28 |
+
def get_cde_cache_dir() -> str:
    """Return the shared CDE cache directory.

    Resolved as ``<two levels above this file>/data``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.normpath(os.path.join(here, os.pardir, os.pardir))
    return os.path.join(project_root, "data")
|
36 |
+
|
37 |
+
|
38 |
+
def get_cache_location_from_kwargs(**kwargs):
    """Build a kwargs-keyed cache file path under ``<cache>/cluster``.

    Creates the directory if missing. The filename is an MD5 digest of
    the (JSON-serializable) kwargs, so identical kwargs always map to
    the same file.
    """
    cluster_dir = os.path.join(get_cde_cache_dir(), "cluster")
    os.makedirs(cluster_dir, exist_ok=True)
    digest = md5_hash_kwargs(**kwargs)
    return os.path.join(cluster_dir, digest)
|
44 |
+
|
45 |
+
|
46 |
+
def process_qrels_uncached(corpus: datasets.Dataset, qrels: datasets.Dataset) -> Tuple[Dict[str, List[float]], Dict[str, List[str]]]:
    """Map each query id to the corpus indices and scores of its relevant docs.

    A qrels row looks like:
        {'query-id': 1, 'corpus-id': 'b0680508-...-00002-000', 'score': 2}

    Rows whose corpus-id does not appear in ``corpus['_id']`` are skipped;
    a warning reports how many were dropped.
    """
    idxs_by_query = collections.defaultdict(list)
    scores_by_query = collections.defaultdict(list)
    corpus_ids = np.array(corpus['_id'])
    n_skipped = 0

    for row in tqdm.tqdm(qrels, desc='processing qrels', colour='#964B00', leave=False):
        query_id = str(row['query-id'])
        matches = (corpus_ids == str(row['corpus-id'])).nonzero()[0]
        # Corpus IDs are expected to be unique; more than one match means
        # corrupted input data.
        assert len(matches) <= 1, f"error - duplicate corpus ID? (found {len(matches)} matches)"
        if len(matches):
            idxs_by_query[query_id].append(matches[0])
            scores_by_query[query_id].append(row['score'])
        else:
            n_skipped += 1

    if n_skipped > 0:
        logging.warning(f'Warning: Skipped {n_skipped}/{len(qrels)} qrels.')

    return idxs_by_query, scores_by_query
|
77 |
+
|
78 |
+
|
79 |
+
def process_qrels(
    corpus: datasets.Dataset, qrels: datasets.Dataset,
    use_cache: bool = True
) -> Tuple[Dict[str, List[float]], Dict[str, List[str]]]:
    """Load (or compute and cache) qrels index/score mappings.

    The cache filename is derived from the HF cache files of both datasets,
    so it changes whenever either dataset changes.

    Args:
        corpus: corpus dataset with an ``_id`` column
        qrels: relevance judgments with query-id/corpus-id/score rows
        use_cache: read/write a pickle next to the dataset cache files
    Returns:
        (qrels_idxs, qrels_scores) as produced by process_qrels_uncached.
    """
    dataset_cache_file = '_'.join(
        (corpus.cache_files[0]['filename'], qrels.cache_files[0]['filename'])
    )
    cache_file = strip_extension(dataset_cache_file) + '_processed_qrels.p'
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)

    if use_cache and os.path.exists(cache_file):
        # NOTE: pickle is acceptable here because the cache is only ever
        # written locally by this function, never from untrusted sources.
        with open(cache_file, 'rb') as f:
            qrels_idxs, qrels_scores = pickle.load(f)
    else:
        qrels_idxs, qrels_scores = process_qrels_uncached(
            corpus=corpus, qrels=qrels
        )
        if use_cache:
            # Fix: use context managers so the file handles are closed
            # deterministically (the original left them dangling).
            with open(cache_file, 'wb') as f:
                pickle.dump((qrels_idxs, qrels_scores), f)

    return qrels_idxs, qrels_scores
|
99 |
+
|
100 |
+
|
101 |
+
def strip_extension(filename: str) -> str:
    """Drop the trailing extension from a path.

    Ex:
        >> strip_extension('/root/dir/sub/file.ext')
        '/root/dir/sub/file'
    """
    root, _ext = os.path.splitext(filename)
    return root
|
109 |
+
|
110 |
+
|
111 |
+
def md5_hash(t: Tuple[str]) -> str:
    """Deterministic hex digest of a tuple of strings, joined by '__'."""
    joined = '__'.join(t)
    return hashlib.md5(joined.encode()).hexdigest()
|
113 |
+
|
114 |
+
|
115 |
+
def md5_hash_kwargs(**kwargs) -> str:
    """Stable hex digest of keyword arguments.

    Keys starting with '_' (special HF args like '__cached__setup_devices')
    are ignored; remaining values are stringified and JSON-serialized with
    sorted keys so the digest is independent of argument order.
    """
    hashable = {
        key: str(value)
        for key, value in kwargs.items()
        if not key.startswith('_')
    }
    serialized = json.dumps(hashable, sort_keys=True)
    return hashlib.md5(serialized.encode()).hexdigest()
|
120 |
+
|
121 |
+
def download_url(url: str, save_path: str, chunk_size: int = 1024):
    """Download url with progress bar using tqdm
    https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads
    Args:
        url (str): downloadable url
        save_path (str): local path to save the downloaded file
        chunk_size (int, optional): chunking of files. Defaults to 1024.
    """
    # Fix: hold the streamed response in a context manager so the HTTP
    # connection is released deterministically (the original relied on GC).
    with requests.get(url, stream=True) as r:
        # Content-Length may be absent; tqdm then shows an unbounded bar.
        total = int(r.headers.get('Content-Length', 0))
        with open(save_path, 'wb') as fd, tqdm.tqdm(
            desc=save_path,
            total=total,
            unit='iB',
            unit_scale=True,
            unit_divisor=chunk_size,
        ) as bar:
            for data in r.iter_content(chunk_size=chunk_size):
                size = fd.write(data)
                bar.update(size)
|
141 |
+
|
142 |
+
|
143 |
+
def unzip(zip_file: str, out_dir: str):
    """Extract every member of `zip_file` into `out_dir`."""
    print("unzipping =>", zip_file)
    # Fix: context manager guarantees the archive handle is closed even if
    # extraction raises (the original only closed on success).
    with zipfile.ZipFile(zip_file, "r") as zip_:
        zip_.extractall(path=out_dir)
|
148 |
+
|
149 |
+
|
150 |
+
def download_url_and_unzip(url: str, out_dir: str, chunk_size: int = 1024) -> str:
    """Fetch a .zip archive from `url` into `out_dir` and extract it.

    Both the download and the extraction are skipped when their outputs
    already exist, so repeated calls are cheap. Returns the path of the
    extracted directory.
    """
    os.makedirs(out_dir, exist_ok=True)
    archive_name = url.split("/")[-1]
    archive_path = os.path.join(out_dir, archive_name)

    if not os.path.isfile(archive_path):
        logging.info("Downloading {} ...".format(archive_name))
        download_url(url, archive_path, chunk_size)

    if not os.path.isdir(archive_path.replace(".zip", "")):
        logging.info("Unzipping {} ...".format(archive_name))
        unzip(archive_path, out_dir)

    return os.path.join(out_dir, archive_name.replace(".zip", ""))
|
164 |
+
|
165 |
+
|
166 |
+
def tqdm_if_main_worker(iterable: Iterable, **kwargs) -> Iterable:
    """Wrap `iterable` in a tqdm bar on rank 0; pass it through on other ranks."""
    if get_rank() != 0:
        return iterable
    return tqdm.tqdm(iterable, **kwargs)
|
171 |
+
|
172 |
+
|
173 |
+
class ContextualModelConfig(transformers.configuration_utils.PretrainedConfig):
    """We create a dummy configuration class that will just set properties
    based on whatever kwargs we pass in.

    When this class is initialized (see experiments.py) we pass in the
    union of all data, model, and training args, all of which should
    get saved to the config json.
    """

    def __init__(self, **kwargs):
        # Keep only values that survive JSON serialization; anything else
        # (tensors, callables, ...) is silently dropped so the config can
        # always be written to disk.
        for name, value in kwargs.items():
            try:
                json.dumps(value)
            except TypeError:
                # value was not JSON-serializable, skip
                continue
            setattr(self, name, value)
        super().__init__()
|
191 |
+
|
192 |
+
|
193 |
+
def independent_crop(
    input_ids: torch.Tensor, pad_token_id: int,
    l1: int = 256, l2: int = 256) -> Tuple[torch.Tensor, torch.Tensor]:
    """Returns two independent crops from input_ids.

    Assumes input_ids has a beginning and end token, like
    [101, ..., 102, 0, 0, 0].

    Args:
        input_ids: tensor of IDs
        pad_token_id: ID of pad tokens in input_ids
        l1: length of span 1, cropped
        l2: length of span 2, cropped
    Returns:
        span1: first crop (of length l1)
        span2: second crop (of length l2)
    """
    # Count tokens until pad.
    if (input_ids == pad_token_id).sum() == 0:
        N = len(input_ids)
    else:
        N = (input_ids == pad_token_id).int().argmax().item()

    # Crop sizing follows prior work:
    #   Contriever: "We use the random cropping data augmentation, with
    #   documents of 256 tokens and span sizes sampled between 5% and 50%
    #   of the document length."
    #   LaPraDor: "The maximum lengths set for queries and documents are
    #   64 and 350..."
    # Cap each span at half the real sequence length so the two crops can
    # never each be the identical full sequence.
    # TODO is this divide-by-two a good idea? (Don't want s1=s2 ever..)
    nl1 = min(N//2, l1)
    nl2 = min(N//2, l2)

    # Interior start positions begin at 1, keeping index 0 (the sequence-
    # initial token) reserved for the crop's first element below.
    s1_start = random.randint(1, N-nl1)
    s2_start = random.randint(1, N-nl2)

    # Each crop is [token 0] + contiguous interior span + [token N-1], so
    # the returned length is actually nl1+2 / nl2+2.
    s1_idxs = itertools.chain(
        [0], range(s1_start, s1_start+nl1), [N-1]
    )
    s1 = input_ids[torch.tensor(list(s1_idxs))]
    s2_idxs = itertools.chain(
        [0], range(s2_start, s2_start+nl2), [N-1]
    )
    s2 = input_ids[torch.tensor(list(s2_idxs))]
    return (s1, s2)
|
245 |
+
|
246 |
+
|
247 |
+
def load_dataset_tables(
    files: Iterable[str], num_workers: int = 16
) -> Iterable[datasets.table.MemoryMappedTable]:
    """Memory-map a collection of .arrow files in parallel.

    Args:
        files: paths of the arrow shards to open
        num_workers: upper bound on parallel workers (further capped at 32
            and at the number of files)
    Returns:
        list of MemoryMappedTable, in the same order as `files`.
    """
    # Fix: `concurrent.futures` is a submodule; a bare `import concurrent`
    # does not guarantee it is loaded.
    import concurrent.futures
    from multiprocessing import Pool

    files = list(files)
    if not files:
        # Fix: ThreadPoolExecutor rejects max_workers=0, so short-circuit.
        return []

    # Fix: honor the `num_workers` argument (the original ignored it and
    # always used min(32, len(files))).
    num_workers = min(num_workers, 32, len(files))

    use_threads = True
    if use_threads:
        pool_cls = concurrent.futures.ThreadPoolExecutor
        pool_kwargs = {"max_workers": num_workers}
    else:
        pool_cls = Pool
        pool_kwargs = {"processes": num_workers}

    with pool_cls(**pool_kwargs) as pool:
        if len(files) > 10:
            files = tqdm_if_main_worker(
                files,
                desc=f"Loading {len(files)} files with {num_workers} workers",
                total=len(files),
                colour="#ffbd88"
            )
        result = list(
            pool.map(datasets.table.MemoryMappedTable.from_file, files)
        )
    return result
|
277 |
+
|
278 |
+
|
279 |
+
def datasets_fast_load_from_disk(cache_path: str) -> datasets.Dataset:
    """Reload a saved HF dataset by memory-mapping its arrow shards in parallel.

    Faster than ``datasets.load_from_disk`` for large multi-shard datasets:
    the shards are opened concurrently and then concatenated.
    """
    # Fix: the original passed `cache_path` as a stray positional argument to
    # an f-string with no placeholder, so the path never appeared in the log.
    logging.info("fast_load_from_disk called with path: %s", cache_path)
    dataset_info_path = os.path.join(cache_path, "dataset_info.json")
    with open(dataset_info_path, encoding="utf-8") as dataset_info_file:
        dataset_info = datasets.DatasetInfo.from_dict(json.load(dataset_info_file))

    dataset_state_path = os.path.join(cache_path, "state.json")
    with open(dataset_state_path, encoding="utf-8") as state_file:
        state = json.load(state_file)

    # Sort so shard order (and therefore row order) is deterministic.
    files = sorted(glob.glob(os.path.join(cache_path, "data-*.arrow")))
    num_workers = get_num_proc()
    ds_tables = load_dataset_tables(
        files=files,
        num_workers=num_workers
    )
    arrow_table = datasets.table.concat_tables(ds_tables)

    split = state["_split"]
    split = datasets.splits.Split(split) if split is not None else split

    return datasets.Dataset(
        arrow_table=arrow_table,
        info=dataset_info,
        split=split,
        fingerprint=state["_fingerprint"],
    )
|
308 |
+
|
309 |
+
|
310 |
+
def tokenize_dataset(
    dataset: datasets.Dataset,
    tokenizer: transformers.PreTrainedTokenizer,
    max_length: int,
    text_key: str,
    padding_strategy: str
) -> datasets.Dataset:
    """Tokenize ``dataset[text_key]`` and store results as new columns.

    Adds ``{text_key}_input_ids``, ``{text_key}_attention_mask`` etc., plus a
    ``length`` column holding each example's token count. The map() cache
    fingerprint incorporates the tokenizer vocab and all tokenization
    settings, so the cache is invalidated when any of them changes.
    """
    def tokenize_text(ex: Dict) -> Dict:
        # Batched map: `ex[text_key]` is a list of strings.
        tt = tokenizer(
            ex[text_key],
            max_length=max_length,
            truncation=True,
            padding=padding_strategy,
        )
        for k,v in tt.items():
            ex[f"{text_key}_{k}"] = v
        # NOTE: the comprehension variable shadows the outer `tt`; each
        # element here is one example's input_ids list.
        ex["length"] = [len(tt) for tt in ex[f"{text_key}_input_ids"]]
        return ex

    # generate unique hash for tokenizer
    vocab = tokenizer.vocab
    vocab_words = tuple(sorted(vocab.keys(), key=lambda word: vocab[word]))
    vocab_hash = md5_hash(vocab_words)

    data_fingerprint = '__'.join((
        dataset._fingerprint, str(vocab_hash), str(max_length),
        text_key, padding_strategy
    ))
    # NOTE(review): md5_hash expects a tuple of strings; passing a single
    # string joins its *characters* with '__'. Still deterministic, so it
    # works as a fingerprint, but presumably unintended — confirm before
    # changing (changing it would invalidate existing caches).
    data_fingerprint = md5_hash(data_fingerprint)
    dataset = dataset.map(
        tokenize_text,
        new_fingerprint=data_fingerprint,
        batched=True,
        load_from_cache_file=True,
    )
    return dataset
|
346 |
+
|
347 |
+
|
348 |
+
class TensorRunningAverages:
    """Tracks running means of scalar metrics, keyed by name.

    Sums and counts are stored as CPU tensors; `get` returns the running
    mean as a plain float.
    """
    # Running sum per metric name.
    _store_sum: Dict[str, torch.Tensor]
    # Number of updates per metric name.
    _store_total: Dict[str, torch.Tensor]

    def __init__(self):
        self._store_sum = {}
        self._store_total = {}

    def __iter__(self) -> Iterable[str]:
        return iter(self._store_sum.keys())

    def update(self, key: str, val: Union[int, float, torch.Tensor]) -> None:
        """Record one observation of `val` for metric `key`."""
        if key not in self._store_sum:
            self.clear(key)
        if isinstance(val, torch.Tensor):
            val = val.item()  # tensor -> num
        self._store_sum[key] += val
        self._store_total[key] += 1

    def get(self, key: str) -> float:
        """Mean of all observations of `key`; 0.0 if never updated."""
        if key not in self._store_total:
            # Fix: the original called `.item()` on `dict.get()`'s None for
            # unseen keys, raising AttributeError.
            return 0.0
        total = max(self._store_total[key].item(), 1.0)
        return (self._store_sum[key] / float(total)).item() or 0.0

    def clear(self, key: str) -> None:
        """Reset the accumulator for `key` to zero."""
        self._store_sum[key] = torch.tensor(0.0, dtype=torch.float32)
        self._store_total[key] = torch.tensor(0, dtype=torch.int32)

    def clear_all(self) -> None:
        for key in self._store_sum:
            self.clear(key)

    def get_and_clear_all(self) -> Dict[str, float]:
        """Snapshot every metric's mean, then reset all accumulators."""
        metrics = {}
        for key in self:
            metrics[key] = self.get(key)
            self.clear(key)
        return metrics
|
385 |
+
|
386 |
+
def load_embedder_and_tokenizer(name: str) -> Tuple[
    transformers.PreTrainedModel,
    transformers.PreTrainedTokenizer
]:
    """Resolve a model `name` into a (backbone, tokenizer) pair.

    Supported name families:
      * "nomic*" / "bert-base-uncased": NomicBertModel; a "--from-scratch"
        suffix loads only the architecture (random weights).
      * "gtr-base"/"gtr_base": encoder half of sentence-transformers GTR-T5.
      * "pile-t5-base-encoder"/"-decoder": the respective half of Pile-T5.
      * "gpt2*", "meta-llama*", or any name containing "Llama": causal LM.
      * anything else: transformers.AutoModel with trust_remote_code.
    """
    if name.startswith("nomic") or (name == "bert-base-uncased"):
        from cde.lib.nomic_bert import NomicBertModel
        if name.endswith("--from-scratch"):
            # Architecture only; weights are randomly initialized.
            name = name.replace("--from-scratch", "")
            config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
            model = NomicBertModel._from_config(config)
        else:
            model = NomicBertModel.from_pretrained(
                name, add_pooling_layer=False
            )
        tokenizer = transformers.AutoTokenizer.from_pretrained(name)
    elif name in ["gtr-base", "gtr_base"]:
        model = transformers.AutoModel.from_pretrained(
            "sentence-transformers/gtr-t5-base"
        ).encoder
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            "sentence-transformers/gtr-t5-base"
        )
    elif name == "pile-t5-base-encoder":
        model = transformers.AutoModel.from_pretrained(
            "EleutherAI/pile-t5-base"
        ).encoder
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            "EleutherAI/pile-t5-base"
        )
        # Pile-T5 ships without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token
    elif name == "pile-t5-base-decoder":
        model = transformers.AutoModel.from_pretrained(
            "EleutherAI/pile-t5-base"
        ).decoder
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            "EleutherAI/pile-t5-base"
        )
        tokenizer.pad_token = tokenizer.eos_token
    elif name.startswith("gpt2") or name.startswith("meta-llama") or ("Llama" in name):
        model = transformers.AutoModelForCausalLM.from_pretrained(
            name,
            # torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            low_cpu_mem_usage=True,
            # device_map="auto",
        )
        # NOTE(review): `padding_side` is normally a tokenizer attribute;
        # setting it on the model looks like a no-op — confirm intent.
        model.padding_side = "right"
        tokenizer = transformers.AutoTokenizer.from_pretrained(name)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.add_eos_token = True
    else:
        model = transformers.AutoModel.from_pretrained(name, trust_remote_code=True)
        tokenizer = transformers.AutoTokenizer.from_pretrained(name)

    # if use_bettertransformer:
    #     from optimum.bettertransformer import BetterTransformer
    #     model = BetterTransformer.transform(model)
    return model, tokenizer
|
444 |
+
|
445 |
+
|
446 |
+
def inputs_for_key(inputs: Dict[str, torch.Tensor], key: str):
    """Select entries whose name starts with ``key + "_"`` and strip that prefix.

    Note: the prefix is removed with str.replace, so a later re-occurrence of
    the same substring inside a name is removed too (matches original
    behavior).
    """
    prefix = key + "_"
    return {
        name.replace(prefix, ""): value
        for name, value in inputs.items()
        if name.startswith(prefix)
    }
|
449 |
+
|
450 |
+
|
451 |
+
def load_model_state_dict_from_path(folder: str) -> Dict:
    """Load the safetensors state dict from the latest checkpoint in `folder`.

    Raises:
        FileNotFoundError: if no checkpoint directory or no
            model.safetensors file exists.
    """
    # Fix: `safetensors.torch` is a submodule and is not guaranteed to be
    # bound by the file's bare `import safetensors`; import it explicitly.
    import safetensors.torch

    checkpoint_folder = transformers.trainer_utils.get_last_checkpoint(folder)
    if checkpoint_folder is None:
        raise FileNotFoundError(f"no checkpoint found in {folder}")
    WEIGHTS_NAME = "model.safetensors"
    weights_path = os.path.join(checkpoint_folder, WEIGHTS_NAME)
    if not os.path.exists(weights_path):
        raise FileNotFoundError(f"no model weights found at {weights_path}")
    return safetensors.torch.load_file(weights_path, device="cpu")
|
460 |
+
|
461 |
+
def count_cpus() -> int:
    """Number of CPUs usable by this process.

    Prefers the scheduler affinity mask (Linux); falls back to the machine's
    total CPU count on platforms without ``os.sched_getaffinity``.
    """
    try:
        affinity = os.sched_getaffinity(0)
    except AttributeError:
        return multiprocessing.cpu_count()
    return len(affinity)
|
466 |
+
|
467 |
+
|
468 |
+
def shuffle_batches(g: torch.Generator, list_of_tensors: List[torch.Tensor]) -> List[int]:
    """Shuffle indices *within* each batch tensor; batch order is preserved.

    Args:
        g: torch RNG that drives every per-batch permutation
        list_of_tensors: one 1-D index tensor per batch
    Returns:
        flat list of indices, batches concatenated in their original order.
    """
    shuffled: List[int] = []
    progress = tqdm_if_main_worker(
        list_of_tensors, colour="green", desc="Sampler shuffling per-batch"
    )
    for batch in progress:
        order = torch.randperm(len(batch), generator=g)
        shuffled.extend(batch[order].tolist())
    return shuffled
|
475 |
+
|
476 |
+
|
477 |
+
# def shuffle_batches_multiproc(g: torch.Generator, list_of_tensors: List[torch.Tensor], num_processes: int = 8) -> List[int]:
|
478 |
+
# all_indices = []
|
479 |
+
# print(f"Shuffling {len(list_of_tensors)} tensors with {num_processes} workers.")
|
480 |
+
# pbar = tqdm_if_main_worker(list_of_tensors, colour="orange", desc=f"Sampler shuffling per-batch (nproc={num_processes})")
|
481 |
+
# pool = multiprocessing.Pool(processes=num_processes)
|
482 |
+
# chunk_size = len(list_of_tensors) // num_processes
|
483 |
+
# chunks = [list_of_tensors[i:i + chunk_size] for i in range(0, len(list_of_tensors), chunk_size)]
|
484 |
+
# worker_func = functools.partial(shuffle_batches, g=g)
|
485 |
+
# results = pool.map(worker_func, chunks)
|
486 |
+
# all_indices = []
|
487 |
+
# for result in results:
|
488 |
+
# all_indices.extend(result)
|
489 |
+
# pbar.update()
|
490 |
+
# return all_indices
|
491 |
+
|
492 |
+
|
493 |
+
def exit_if_running_or_finished_wandb(
    project_name: str,
    exp_group: str, exp_name: str
) -> None:
    """Exit the process (code 0) if a matching W&B run is Running/Finished.

    Used to make experiment launches idempotent: relaunching a job that
    already ran (or is running) becomes a no-op.

    Args:
        project_name: W&B project name (currently only used in the log line
            — see the NOTE below)
        exp_group: value matched against each run's `config.exp_group`
        exp_name: matched against each run's display name
    """
    print("Checking if experiment is already running...")
    import wandb

    api = wandb.Api()
    # NOTE(review): the query path is hardcoded to "tti-nomic-7" and ignores
    # `project_name` — confirm whether these should be the same value.
    running_runs = api.runs(
        path="tti-nomic-7",
        filters={
            "display_name": exp_name,
            "state": {"$regex": "Running|Finished"},
            "config.exp_group": exp_group,
        }
    )
    print("Found", len(running_runs), f"runs with name {exp_name} and group {exp_group} in {project_name}.")

    if len(running_runs) > 0:
        print("Exiting because experiment is already running or completed.")
        sys.exit(0)
|
514 |
+
|
model.py
ADDED
@@ -0,0 +1,607 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, Optional, Union
|
2 |
+
|
3 |
+
import copy
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
import transformers
|
7 |
+
|
8 |
+
from cde.lib.dist import print0
|
9 |
+
from cde.lib.tensor import mean_pool, mean_pool_3d, mean_pool_weighted, last_token_pool
|
10 |
+
|
11 |
+
from cde.lib import load_embedder_and_tokenizer, ContextualModelConfig
|
12 |
+
|
13 |
+
|
14 |
+
def limit_layers(model: transformers.PreTrainedModel, n_layers: int) -> None:
    """Truncate `model` in place to its first `n_layers` transformer layers.

    Handles the attribute layouts seen across backbones: GPT-2 style
    (`transformer.h`), DistilBERT style (`transformer.layer`), and encoder
    models (`encoder.layers` / `encoder.layer`).

    Raises:
        RuntimeError: if the model's layer container cannot be located.
    """
    if hasattr(model, 'transformer'):
        container = model.transformer
        if hasattr(container, 'h'):
            # gpt2
            container.h = container.h[:n_layers]
        else:
            container.layer = container.layer[:n_layers]
    elif hasattr(model, 'encoder'):
        container = model.encoder
        if hasattr(container, 'layers'):
            container.layers = container.layers[:n_layers]
        else:
            container.layer = container.layer[:n_layers]
    else:
        raise RuntimeError(f"unknown how to limit layers of model {type(model)}")
|
28 |
+
|
29 |
+
|
30 |
+
def disable_dropout(model: torch.nn.Module):
    """Zero the drop probability of every nn.Dropout submodule (in place)."""
    n_disabled = 0
    for module in model.modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = 0.0
            n_disabled += 1
    print0(
        f"Disabled {n_disabled} dropout modules from model type {type(model)}"
    )
|
37 |
+
|
38 |
+
|
39 |
+
def disable_causality(model: torch.nn.Module):
    """Set `is_causal = False` on every submodule exposing that attribute."""
    touched = [m for m in model.modules() if hasattr(m, "is_causal")]
    for module in touched:
        module.is_causal = False
    print0(
        f"Set is_causal=False in {len(touched)} modules from model type {type(model)}"
    )
|
48 |
+
|
49 |
+
class ContextualModelMixin(nn.Module):
    """Mixin adding contextual ("dataset") token handling to an embedder.

    Provides the machinery to prepend a learned soft prompt plus a set of
    dataset embeddings to a model's input sequence. Classes using this mixin
    must define `self.hidden_size` and `self.config` and call
    `contextual_init()` during construction.
    """

    @property
    def num_corpus_tokens(self) -> int:
        # Total number of context tokens fed alongside each input sequence.
        return self.transductive_corpus_size * self.transductive_tokens_per_document

    def contextual_init(self):
        """Create the soft-prompt / projection modules and read config knobs."""
        # Number of learned soft-prompt positions appended after the dataset
        # embeddings.
        self.n_soft_prompt = 8
        # Expands a single ones-vector into n_soft_prompt hidden states.
        self.prompt_projection = torch.nn.Sequential(
            torch.nn.Linear(self.hidden_size, self.hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_size, self.hidden_size * self.n_soft_prompt)
        )
        self.transductive_corpus_size = vars(self.config).get("transductive_corpus_size", 1)
        self.transductive_tokens_per_document = vars(self.config).get("transductive_tokens_per_document", 1)
        self.randomize_dataset_sequence_order = True
        # Probability of replacing a context embedding with the learned null
        # embedding during training (a form of context dropout).
        self.sequence_dropout_prob = vars(self.config).get("transductive_sequence_dropout_prob", 0.0)
        if self.sequence_dropout_prob > 0.0:
            self.sequence_dropout_null_embedding = torch.nn.Parameter(
                torch.randn(self.hidden_size) * 0.01,
                requires_grad = True
            )
        self.output_projection = torch.nn.Sequential(
            torch.nn.Linear(self.hidden_size, self.hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_size, self.hidden_size)
        )

    def _prepare_dataset_embeddings(
            self,
            input_ids: torch.Tensor, dataset_embeddings: torch.Tensor,
            null_dataset_embedding: bool = False,
        ) -> torch.Tensor:
        """Assemble the (batch, num_corpus_tokens + n_soft_prompt, hidden) prefix.

        Args:
            input_ids: token ids of the batch; only its batch size and device
                are used here.
            dataset_embeddings: context embeddings, either (corpus, hidden)
                or already batched as (batch, corpus, hidden).
            null_dataset_embedding: if True (and not training with sequence
                dropout), replace all context embeddings with the learned
                null embedding.
        """
        if not isinstance(dataset_embeddings, torch.Tensor):
            dataset_embeddings = torch.tensor(dataset_embeddings)

        if len(dataset_embeddings.shape) == 2:
            # Auto-expand for a batch.
            dataset_embeddings = dataset_embeddings[None, :, :] # (b, d) -> (1, b, d)
        dataset_embeddings = dataset_embeddings.to(input_ids.device)

        batch_size = input_ids.shape[0]
        if (self.transductive_tokens_per_document > 1):
            if self.training:
                # Choose N random documents to fill our context window with.
                # This logic is a little confusing but allows us to sample a
                # different batch *per-document*
                assert dataset_embeddings.shape[1] == self.transductive_tokens_per_document
                R = torch.randint(
                    low=0,
                    high=len(dataset_embeddings),
                    size=(batch_size, self.config.transductive_corpus_size),
                    device=dataset_embeddings.device
                )
                # TODO make this deterministic somehow for evaluation?
                dataset_embeddings = dataset_embeddings[R].reshape((batch_size, self.num_corpus_tokens, self.hidden_size))
            else:
                dataset_embeddings = dataset_embeddings.reshape((1, self.num_corpus_tokens, self.hidden_size))
                # print("reshaped to dataset_embeddings.shape =", dataset_embeddings.shape)

        if dataset_embeddings.shape[1] > self.num_corpus_tokens:
            # If too many dataset embeddings are passed in, just take the first N until
            # we have the proper number.
            dataset_embeddings = dataset_embeddings[:, :self.num_corpus_tokens, :]

        # NOTE: `_` intentionally binds the leading (batch) dim here and is
        # then tested below — unusual use of the underscore name.
        _, corpus_size, _hidden_size = dataset_embeddings.shape
        if _ == 1:
            # Auto-expand for a batch.
            dataset_embeddings = dataset_embeddings.expand((batch_size, -1, -1))

        if self.training and self.sequence_dropout_prob > 0.0:
            # Randomly null out context embeddings during training.
            sequence_dropout_mask = (
                torch.rand((batch_size, corpus_size), device=dataset_embeddings.device) < self.sequence_dropout_prob
            )
            null_embeddings = self.sequence_dropout_null_embedding[None, None].expand(batch_size, corpus_size, -1)
            dataset_embeddings = torch.where(
                sequence_dropout_mask[..., None], null_embeddings, dataset_embeddings
            )
        elif null_dataset_embedding:
            # Replace the entire context with the null embedding (eval-time
            # "no context" mode).
            null_embeddings = self.sequence_dropout_null_embedding[None, None].expand(batch_size, corpus_size, -1)
            dataset_embeddings = null_embeddings

        # print(f"[ContextualModelMixin] dataset_embeddings.shape = {dataset_embeddings.shape}")

        # backbone_max_seq_length = self.backbone.config.max_trained_positions
        # assert batch_size + (2 * self.n_soft_prompt + corpus_size) <= backbone_max_seq_length, "too many hard negatives for backbone model"
        soft_prompt = torch.ones((1, self.hidden_size), device=dataset_embeddings.device, dtype=dataset_embeddings.dtype)
        soft_prompt = self.prompt_projection(soft_prompt).reshape((1, self.n_soft_prompt, self.hidden_size))
        soft_prompt = soft_prompt.expand((len(dataset_embeddings), -1, -1)) # -> (b, 4+b, d)  # soft_prompt.repeat((len(input_ids), 1, 1))
        soft_prompt = torch.cat((dataset_embeddings, soft_prompt), dim=1)

        # print(f"[ContextualModelMixin] soft_prompt.shape = {soft_prompt.shape}")

        if self.training and self.randomize_dataset_sequence_order:
            # Shuffle the dataset embeddings per example, keeping the
            # n_soft_prompt positions fixed at the end.
            randomized_order = torch.stack(
                [
                    torch.cat(
                        (
                            torch.randperm(corpus_size, device=soft_prompt.device),
                            torch.arange(self.n_soft_prompt, device=soft_prompt.device) + corpus_size
                        ), dim=0)
                    for _ in range(batch_size)])
            randomized_order = randomized_order.to(soft_prompt.device)
            soft_prompt = soft_prompt.gather(1, randomized_order[..., None].expand_as(soft_prompt))

        return soft_prompt
|
154 |
+
|
155 |
+
class BiEncoder(transformers.PreTrainedModel):
    """Single-stage text encoder: backbone transformer + pooling + MLP projection.

    Used both as a stand-alone embedder and as the first stage of the
    two-stage DatasetTransformer (where it embeds corpus documents).
    """
    embedder: transformers.PreTrainedModel

    def __init__(
        self,
        config,  #: transformers.PretrainedConfig,
    ):
        super().__init__(config=config)
        embedder, _ = load_embedder_and_tokenizer(
            config.embedder,
        )

        if config.limit_layers:
            print0(f"Limiting layers to {config.limit_layers}")
            limit_layers(embedder, config.limit_layers)

        self.embedder = embedder
        self.hidden_size = self.embedder.config.hidden_size
        # Allow pooling to multiple tokens per document.
        self.transductive_tokens_per_document = vars(self.config).get("transductive_tokens_per_document", 1)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(self.hidden_size, self.hidden_size),
            torch.nn.GELU(),
            torch.nn.Linear(self.hidden_size, self.config.embedding_output_dim or self.hidden_size),
        )
        self.temp = config.logit_scale

        if config.disable_dropout:
            disable_dropout(self)
        self.pooling_strategy = vars(config).get("pooling_strategy", "mean")

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        dataset_input_ids: Optional[torch.Tensor] = None,
        dataset_attention_mask: Optional[torch.Tensor] = None,
        token_type_ids = None,
        output_hidden_states: bool = False,
    ) -> torch.Tensor:
        """
        query_embedding (float torch.Tensor) - shape (batch_size, embedding_dim)
        document_embeddings (float torch.Tensor) - shape (corpus_size, embedding_dim)
            where the corpus_size >= batch_size and is structured like this:
                [d1, d2, d3, hn1_1, hn1_2, hn2_1, hn2_2, hn3_1, hn3_2]
            for a corpus with three documents and two hard negatives per document
        """
        # dataset_* inputs are accepted for interface compatibility but unused here.
        del token_type_ids

        outputs = (
            self.embedder(
                input_ids=input_ids,
                attention_mask=attention_mask,
            ).last_hidden_state
        )

        if self.transductive_tokens_per_document > 1:
            batch_size, seq_length, output_dim = outputs.shape

            if seq_length % self.transductive_tokens_per_document != 0:
                # Pad to nearest multiple so the sequence splits evenly below.
                n_extra_embeds = self.transductive_tokens_per_document - (seq_length % self.transductive_tokens_per_document)
                # FIX: match the dtype of the tensors being padded
                # (torch.zeros defaults to float32, which breaks torch.cat
                # under fp16/bf16 execution).
                outputs = torch.cat(
                    (outputs, torch.zeros((batch_size, n_extra_embeds, output_dim), dtype=outputs.dtype, device=outputs.device)),
                    dim=1
                )
                attention_mask = torch.cat(
                    (attention_mask, torch.zeros((batch_size, n_extra_embeds), dtype=attention_mask.dtype, device=attention_mask.device)),
                    dim=1
                )
                seq_length += n_extra_embeds
                print(f"Added {n_extra_embeds} padding tokens to input_ids and attention_mask")

            outputs = outputs.reshape(
                (batch_size, self.transductive_tokens_per_document, seq_length // self.transductive_tokens_per_document, output_dim)
            )

            attention_mask = attention_mask.reshape((batch_size, self.transductive_tokens_per_document, -1))
            document_embeddings = mean_pool_3d(outputs, attention_mask)

            document_embeddings = document_embeddings.reshape((batch_size, self.transductive_tokens_per_document, output_dim))
        else:
            if self.pooling_strategy == "mean":
                document_embeddings = mean_pool(outputs, attention_mask)
            else:
                # BUG FIX: this branch previously read `document_embeddings`
                # before it was ever assigned (NameError), ignored the
                # attention mask, and `.max(dim=1)` returns a (values,
                # indices) tuple. Max-pool over valid positions instead.
                masked_outputs = outputs.masked_fill(
                    attention_mask[..., None] == 0, float("-inf")
                )
                document_embeddings = masked_outputs.max(dim=1).values
        output = self.mlp(document_embeddings)

        if output_hidden_states:
            return {
                "hidden_states": outputs,
                "pooled": output,
            }
        else:
            return output
class DatasetConditionedAutoregressive(transformers.PreTrainedModel, ContextualModelMixin):
    """Second-stage model built on an autoregressive (decoder) backbone.

    First-stage dataset embeddings are flattened, padded up to a multiple of
    the backbone hidden size, reshaped into soft-prompt token vectors, and
    prepended to the input token embeddings before running the backbone
    (whose causal masking is disabled in __init__).
    """
    def __init__(
        self,
        config,
        dataset_backbone: transformers.PreTrainedModel,
        first_stage_hidden_size: int,
    ):
        # config: contextual model configuration.
        # dataset_backbone: autoregressive transformer used as the second stage.
        # first_stage_hidden_size: hidden size of the first-stage embedder,
        #   i.e. the per-token width of incoming dataset embeddings.
        super().__init__(config=config)
        self.backbone = dataset_backbone
        self.backbone_hidden_size = self.backbone.config.hidden_size
        self.hidden_size = first_stage_hidden_size # Input token size
        self.contextual_init()
        # The backbone is used bidirectionally here, so remove its causal mask.
        disable_causality(self.backbone)

        self.input_ln = torch.nn.LayerNorm(
            self.backbone_hidden_size,
            eps=1e-5
        )

        # Override the projection head created by contextual_init so it maps
        # backbone-sized vectors to backbone-sized vectors.
        self.output_projection = torch.nn.Sequential(
            torch.nn.Linear(self.backbone_hidden_size, self.backbone_hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(self.backbone_hidden_size, self.backbone_hidden_size)
        )
        self._shift_rotary_embedding()

    @property
    def num_corpus_tokens(self) -> int:
        # Total soft-prompt length: corpus size x tokens pooled per document.
        return self.config.transductive_corpus_size * self.transductive_tokens_per_document

    @property
    def corpus_token_ratio(self) -> float:
        # How many tokens from the first stage make one token in the second
        # stage?
        return self.backbone_hidden_size / self.hidden_size

    def corpus_token_pad_size(self, n_tokens: int) -> int:
        # NOTE(review): `n_tokens` is unused and the formula (first-stage size
        # mod backbone size) looks inverted relative to the padding computed in
        # forward() — confirm intent before relying on this helper.
        return self.hidden_size % self.backbone_hidden_size

    def _shift_rotary_embedding(self) -> None:
        # NOTE(review): the flag is read but never acted on — rotary-embedding
        # shifting is not implemented for this backbone type yet.
        disable_transductive_rotary_embedding = vars(self.config).get("disable_transductive_rotary_embedding", True)
        # TODO: Can we do this for LLAMA?
        print("Warning: Positional embedding disabling not implemented for LLAMA.")

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        dataset_embeddings: torch.Tensor,
        output_hidden_states: bool = False,
        null_dataset_embedding: bool = False,
    ) -> torch.Tensor:
        """Embed `input_ids`, conditioned on first-stage `dataset_embeddings`.

        Returns the pooled projection, or a dict with "hidden_states" and
        "pooled" when `output_hidden_states` is set.
        """
        soft_prompt = self._prepare_dataset_embeddings(
            input_ids=input_ids,
            dataset_embeddings=dataset_embeddings,
            null_dataset_embedding=null_dataset_embedding,
        )

        # Reshape for this model: flatten the soft prompt per example, pad it
        # up to a multiple of the backbone hidden size, then view it as a
        # sequence of backbone-width prompt tokens.
        num_soft_elements = torch.prod(torch.tensor(soft_prompt.shape[1:])).item()
        soft_prompt = soft_prompt.reshape((soft_prompt.shape[0], num_soft_elements))
        # NOTE(review): when num_soft_elements is already an exact multiple this
        # still appends a full extra row, and the padding is ones rather than
        # zeros — confirm both are intended.
        num_padding_elements = self.backbone_hidden_size - (num_soft_elements % self.backbone_hidden_size)
        padding = torch.ones((soft_prompt.shape[0], num_padding_elements), device=soft_prompt.device)
        soft_prompt = torch.cat((soft_prompt, padding), dim=1)
        soft_prompt = soft_prompt.reshape(
            (soft_prompt.shape[0], -1, self.backbone_hidden_size)
        )
        soft_prompt = self.input_ln(soft_prompt)

        # Soft-prompt positions are always attended to.
        backbone_attention_mask = torch.ones(
            soft_prompt.shape[0:2],
            dtype=torch.long,
            device=soft_prompt.device,
        )
        token_embeddings = self.backbone.get_input_embeddings()
        inputs_embeds = token_embeddings(input_ids) # (b, s) -> (b, s, d)
        inputs_embeds = torch.cat((soft_prompt, inputs_embeds), dim=1) # prepend prompt tokens
        input_attention_mask = torch.cat((backbone_attention_mask, attention_mask), dim=1)

        output = self.backbone(
            inputs_embeds=inputs_embeds,
            attention_mask=input_attention_mask,
            output_hidden_states=True,
        )
        # Trim soft-prompt positions: keep only the text-token states.
        last_hidden_state = output.hidden_states[-1]
        n_soft_prompt_tokens = soft_prompt.shape[1]

        output_vectors = last_hidden_state[:, n_soft_prompt_tokens:, :]
        output_attention_mask = input_attention_mask[:, n_soft_prompt_tokens:]

        # Pool per configured strategy (default: weighted mean).
        if vars(self.config).get("pooling_strategy") == "last_token":
            output_pooled = last_token_pool(output_vectors, output_attention_mask)
        elif vars(self.config).get("pooling_strategy") == "mean":
            output_pooled = mean_pool(output_vectors, output_attention_mask)
        else:
            output_pooled = mean_pool_weighted(output_vectors, output_attention_mask)

        # TODO: Argparse for pooling strategy.
        output = self.output_projection(output_pooled)

        if output_hidden_states:
            return {
                "hidden_states": output_vectors,
                "pooled": output,
            }
        else:
            return output
class DatasetConditionedBiencoder(transformers.PreTrainedModel, ContextualModelMixin):
    """Second-stage biencoder conditioned on dataset (corpus) embeddings.

    First-stage dataset embeddings are turned into a soft prompt and
    prepended to the token embeddings before running the backbone; pooling
    is then applied over the non-prompt positions only.
    """

    def __init__(
        self,
        config,
        dataset_backbone: transformers.PreTrainedModel,
    ):
        super().__init__(config=config)
        self.backbone = dataset_backbone
        # FIX: `self.hidden_size` was previously assigned twice with the same
        # value (via `self.backbone` and `dataset_backbone`, which are the
        # same object); keep a single assignment.
        self.hidden_size = self.backbone.config.hidden_size
        self.contextual_init()
        self._shift_rotary_embedding()

    @property
    def num_corpus_tokens(self) -> int:
        """Total soft-prompt length: corpus size x tokens pooled per document."""
        return self.config.transductive_corpus_size * self.transductive_tokens_per_document

    def _shift_rotary_embedding(self) -> None:
        """Shift rotary position embeddings so positional information starts
        after the soft-prompt positions (nomic-style backbones only)."""
        disable_transductive_rotary_embedding = vars(self.config).get("disable_transductive_rotary_embedding", True)
        if self.backbone.config.model_type.startswith("nomic") and disable_transductive_rotary_embedding:
            # We only want to apply positional embeddings to the
            # *text* portion of the backbone network.
            self.backbone.config.rotary_start_pos = 0.0
            rotary_disabled = 0

            rotary_start_pos = self.num_corpus_tokens
            for module in self.backbone.modules():
                if hasattr(module, "rotary_emb_dim"):
                    module.rotary_start_pos = rotary_start_pos
                    rotary_disabled += 1
            print0(f"modified {rotary_disabled} rotary modules – set rotary_start_pos to {rotary_start_pos}")

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        dataset_embeddings: torch.Tensor,
        output_hidden_states: bool = False,
        null_dataset_embedding: bool = False,
    ) -> torch.Tensor:
        """Embed `input_ids`, conditioned on first-stage `dataset_embeddings`.

        Returns the pooled projection, or a dict with "hidden_states" and
        "pooled" when `output_hidden_states` is set.
        """
        soft_prompt = self._prepare_dataset_embeddings(
            input_ids=input_ids,
            dataset_embeddings=dataset_embeddings,
            null_dataset_embedding=null_dataset_embedding,
        )
        # Soft-prompt positions are always attended to.
        backbone_attention_mask = torch.ones(
            soft_prompt.shape[0:2],
            dtype=torch.long,
            device=soft_prompt.device,
        )
        inputs_embeds = self.backbone.embeddings(input_ids)  # (b, s) -> (b, s, d)
        inputs_embeds = torch.cat((soft_prompt, inputs_embeds), dim=1)
        attention_mask = torch.cat((backbone_attention_mask, attention_mask), dim=1)
        output = self.backbone(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
        )
        # Trim the soft prompt: pool only over the original text tokens.
        n_soft_prompt_tokens = soft_prompt.shape[1]
        output_vectors = output.last_hidden_state[:, n_soft_prompt_tokens:, :]
        output_attention_mask = attention_mask[:, n_soft_prompt_tokens:]

        # TODO: Argparse for pooling strategy.
        output_pooled = mean_pool(output_vectors, output_attention_mask)
        output = self.output_projection(output_pooled)

        if output_hidden_states:
            return {
                "hidden_states": output_vectors,
                "pooled": output,
            }
        else:
            return output
class DatasetPrefixBiencoder(transformers.PreTrainedModel, ContextualModelMixin):
    """Biencoder that conditions on dataset context by prefixing raw dataset
    token ids to each input sequence (instead of first-stage soft prompts).
    """
    def __init__(
        self,
        config, #: transformers.PreTrainedConfig,
        embedder: transformers.PreTrainedModel,
    ):
        super().__init__(config=config)
        self.embedder = embedder
        self.hidden_size = self.embedder.config.hidden_size
        self.contextual_init()

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        dataset_input_ids: torch.Tensor,
        dataset_attention_mask: torch.Tensor,
        output_hidden_states: bool = False,
    ) -> torch.Tensor:
        """Embed `input_ids`, each prefixed with one randomly sampled row of
        `dataset_input_ids`.

        Returns the pooled projection, or a dict with "hidden_states" and
        "pooled" when `output_hidden_states` is set.
        """
        # Sample one random dataset row per input example.
        R = torch.randint(low=0, high=len(dataset_input_ids), size=(len(input_ids),), device=dataset_input_ids.device)

        dataset_input_ids = dataset_input_ids[R]
        input_ids = torch.cat((dataset_input_ids, input_ids), dim=1)

        # NOTE(review): this replaces the real dataset mask with all-ones, so
        # padding inside the dataset prefix is attended to; also the mask is
        # not re-indexed by R, so its batch dimension matches only when the
        # dataset batch equals the input batch — confirm both are intended.
        dataset_attention_mask = torch.ones_like(dataset_attention_mask, device=dataset_attention_mask.device)
        input_attention_mask = torch.cat((dataset_attention_mask, attention_mask), dim=1)
        # Zero out dataset-prefix positions so pooling below covers only the
        # original text tokens.
        output_attention_mask = torch.cat(
            (torch.zeros_like(dataset_input_ids), attention_mask), dim=1
        )

        output = self.embedder(
            input_ids=input_ids,
            attention_mask=input_attention_mask,
        )

        output_vectors = output.last_hidden_state
        output_pooled = mean_pool(output_vectors, output_attention_mask)
        output = self.output_projection(output_pooled) # (b, 2d) -> (b, d)

        if output_hidden_states:
            # Strip the dataset-prefix positions from the returned states.
            S_d = dataset_attention_mask.shape[1]
            output_vectors = output_vectors[:, S_d:, :]
            return {
                "hidden_states": output_vectors,
                "pooled": output,
            }
        else:
            return output
class DatasetTransformer(transformers.PreTrainedModel):
    """Two-stage contextual embedding model.

    Stage one (a BiEncoder) embeds a sample of corpus documents; stage two
    (a dataset-conditioned encoder) embeds the actual inputs conditioned on
    those corpus embeddings.
    """
    config_class = ContextualModelConfig
    embedder: transformers.PreTrainedModel
    dataset_backbone: transformers.PreTrainedModel

    def __init__(
        self,
        config,
    ):
        super().__init__(config=config)

        # Second-stage backbone defaults to the first-stage embedder model.
        backbone, _ = load_embedder_and_tokenizer(
            vars(config).get("dataset_backbone", config.embedder)
        )
        if config.limit_layers:
            print0(f"Limiting layers to {config.limit_layers}")
            limit_layers(backbone, config.limit_layers)

        # The first stage gets its own config copy: full-width outputs, and an
        # optional separate layer limit.
        first_stage_config = copy.deepcopy(config)
        first_stage_config.embedding_output_dim = None
        first_stage_config.limit_layers = vars(self.config).get("limit_layers_first_stage", None)
        self.first_stage_model = BiEncoder(config=first_stage_config)

        use_autoregressive = vars(config).get("autoregressive_backbone", False)
        if use_autoregressive:
            self.second_stage_model = DatasetConditionedAutoregressive(
                config=config,
                dataset_backbone=backbone,
                first_stage_hidden_size=self.first_stage_model.hidden_size,
            )
        else:
            self.second_stage_model = DatasetConditionedBiencoder(
                config=config,
                dataset_backbone=backbone,
            )

        self.temp = config.logit_scale
        if config.disable_dropout:
            disable_dropout(self)

        # Optionally tie token embeddings between the two stages.
        if vars(self.config).get("transductive_tie_token_embeddings", False):
            self.second_stage_model.backbone.embeddings.word_embeddings.weight = (
                self.first_stage_model.embedder.embeddings.word_embeddings.weight
            )

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        dataset_input_ids: Optional[torch.Tensor],
        dataset_attention_mask: Optional[torch.Tensor],
        output_hidden_states: bool = False,
    ) -> torch.Tensor:
        """
        input_ids (long torch.Tensor) – ids of input tokens
        attention_mask (bool torch.Tensor)
        """
        # Stage one: embed the sampled corpus documents.
        corpus_embeddings = self.first_stage_model(
            input_ids=dataset_input_ids,
            attention_mask=dataset_attention_mask
        )
        # Stage two: embed the inputs, conditioned on the corpus embeddings.
        return self.second_stage_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            dataset_embeddings=corpus_embeddings,
            output_hidden_states=output_hidden_states,
        )
def get_model_class(name: str):
    """Map a model-type name to its model class.

    Args:
        name: one of "transductive", "biencoder", or "dataset_prefix_biencoder".

    Returns:
        The corresponding model class.

    Raises:
        ValueError: if `name` is not a recognized model type.
    """
    # BUG FIX: was `if name in 'transductive'` — a substring test that also
    # matched e.g. "t", "duct", and "" — use equality instead.
    if name == 'transductive':
        return DatasetTransformer
    elif name == 'biencoder':
        return BiEncoder
    elif name == "dataset_prefix_biencoder":
        return DatasetPrefixBiencoder
    else:
        raise ValueError(f'unknown model cls {name}')
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6ec79407ada665817aebe929bdabbe83eecd816b75f7f26e3bdd8b4c092efb2a
|
3 |
+
size 1124594680
|