update readme

Browse files

Files changed (4) hide show

README.md +28 -0
configs/fine-tune.yaml +62 -0
configs/index.yaml +25 -0
configs/search.yaml +25 -0

README.md CHANGED Viewed

@@ -1,3 +1,31 @@
 ---
 license: apache-2.0
 ---

 ---
 license: apache-2.0
 ---
+# Lightning IR ColBERT
+This model is a ColBERT[^1] model fine-tuned using [Lightning IR](https://github.com/webis-de/lightning-ir).
+See the [Lightning IR Model Zoo](https://webis-de.github.io/lightning-ir/models.html) for a comparison with other models.
+## Reproduction
+To reproduce the model training, install Lightning IR and run the following command using the [fine-tune.yaml](./configs/fine-tune.yaml) configuration file:
+```bash
+lightning-ir fit --config fine-tune.yaml
+```
+To index MS MARCO passages, use the following command and the [index.yaml](./configs/index.yaml) configuration file:
+```bash
+lightning-ir index --config index.yaml
+```
+After indexing, to evaluate the model on TREC Deep Learning 2019 and 2020, use the following command and the [search.yaml](./configs/search.yaml) configuration file:
+```bash
+lightning-ir search --config search.yaml
+```
+[^1]: Khattab and Zaharia, [ColBERT: Efficient and Effective Passage Search via Contextualized Late Interaction over BERT](https://dl.acm.org/doi/abs/10.1145/3397271.3401075)

configs/fine-tune.yaml ADDED Viewed

	@@ -0,0 +1,62 @@

+# Fine-tuning configuration, used as `lightning-ir fit --config fine-tune.yaml`.
+# lightning.pytorch==2.3.3
+# Fixed seed so that repeated runs start from the same random state.
+seed_everything: 0
+# Trainer settings: bf16 mixed precision, training stops after 50000 steps.
+trainer:
+  precision: bf16-mixed
+  max_steps: 50000
+# Training data: a run-file dataset with batches of 64, shuffled.
+data:
+  class_path: lightning_ir.LightningIRDataModule
+  init_args:
+    num_workers: 1
+    train_batch_size: 64
+    shuffle_train: true
+    train_dataset:
+      class_path: lightning_ir.RunDataset
+      init_args:
+        run_path_or_id: msmarco-passage/train/rank-distillm/set-encoder
+        # NOTE(review): presumably 8 documents are sampled per query from the
+        # top 100 of the run, using the run scores as (unnormalized) targets;
+        # confirm against the RunDataset documentation.
+        depth: 100
+        sample_size: 8
+        sampling_strategy: log_random
+        targets: score
+        normalize_targets: false
+# Model: a bi-encoder initialized from bert-base-uncased with a ColConfig
+# (ColBERT-style) configuration.
+model:
+  class_path: lightning_ir.BiEncoderModule
+  init_args:
+    model_name_or_path: bert-base-uncased
+    config:
+      class_path: lightning_ir.ColConfig
+      init_args:
+        similarity_function: dot
+        query_expansion: true
+        attend_to_query_expanded_tokens: true
+        query_mask_scoring_tokens: null
+        doc_mask_scoring_tokens: punctuation
+        query_aggregation_function: mean
+        normalize: false
+        add_marker_tokens: false
+        embedding_dim: 128
+        projection: linear
+        query_pooling_strategy: mean
+        doc_expansion: false
+        attend_to_doc_expanded_tokens: false
+        doc_pooling_strategy: mean
+        sparsification: null
+        # Length limits for queries and documents (unit presumably tokens).
+        query_length: 32
+        doc_length: 256
+    # Three loss functions are configured; this file sets no explicit weights.
+    loss_functions:
+    - class_path: lightning_ir.SupervisedMarginMSE
+    - class_path: lightning_ir.KLDivergence
+    - class_path: lightning_ir.InBatchCrossEntropy
+      init_args:
+        pos_sampling_technique: first
+        neg_sampling_technique: first
+        max_num_neg_samples: 8
+# Optimizer: AdamW with a learning rate of 2e-5.
+optimizer:
+  class_path: torch.optim.AdamW
+  init_args:
+    lr: 2.0e-05
+# Learning-rate schedule: linear warm-up over the first 5000 steps.
+# NOTE(review): final_value presumably is the fraction of the peak learning
+# rate reached at the end of training; confirm against the scheduler docs.
+lr_scheduler:
+  class_path: lightning_ir.LinearLRSchedulerWithLinearWarmup
+  init_args:
+    num_warmup_steps: 5000
+    final_value: 0.02
+    num_delay_steps: 0

configs/index.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+# Indexing configuration, used as `lightning-ir index --config index.yaml`.
+# Registers an IndexCallback that stores its index under ./index; the search
+# configuration must point its index_dir at the same directory.
+trainer:
+  logger: false
+  callbacks:
+    - class_path: lightning_ir.IndexCallback
+      init_args:
+        index_dir: ./index
+        index_config:
+          class_path: FaissIVFPQIndexConfig
+          init_args:
+            num_centroids: 262144
+            num_subquantizers: 16
+            n_bits: 8
+model:
+  class_path: lightning_ir.BiEncoderModule
+  init_args:
+    # Index with the ColBERT checkpoint this model card describes, not the
+    # single-vector bi-encoder the config was copied from.
+    # NOTE(review): hub id assumed to be webis/colbert; confirm before merging.
+    model_name_or_path: webis/colbert
+data:
+  class_path: lightning_ir.LightningIRDataModule
+  init_args:
+    num_workers: 1
+    inference_batch_size: 256
+    # Document collection to encode.
+    inference_datasets:
+      - class_path: DocDataset
+        init_args:
+          doc_dataset: msmarco-passage

configs/search.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+# Search/evaluation configuration, used as
+# `lightning-ir search --config search.yaml` after indexing.
+# Reads the index from ./index (the index_dir used in index.yaml).
+trainer:
+  logger: false
+  callbacks:
+    - class_path: SearchCallback
+      init_args:
+        index_dir: ./index
+        use_gpu: false
+        search_config:
+          class_path: FaissSearchConfig
+          init_args:
+            # Retrieval depth; matches the nDCG@10 cutoff below.
+            k: 10
+model:
+  class_path: lightning_ir.BiEncoderModule
+  init_args:
+    # Must be the same checkpoint that built the index.
+    # NOTE(review): hub id assumed to be webis/colbert; confirm before merging.
+    model_name_or_path: webis/colbert
+    evaluation_metrics:
+      - nDCG@10
+data:
+  class_path: lightning_ir.LightningIRDataModule
+  init_args:
+    # Query sets to evaluate on: TREC Deep Learning 2019 and 2020, as stated
+    # in the README. QueryDataset takes `query_dataset`, not `doc_dataset`.
+    inference_datasets:
+      - class_path: QueryDataset
+        init_args:
+          query_dataset: msmarco-passage/trec-dl-2019/judged
+      - class_path: QueryDataset
+        init_args:
+          query_dataset: msmarco-passage/trec-dl-2020/judged