Spaces:

castorini
/

ONNX-Demo

Build error

App Files Files Community

ArthurChen189 committed on Aug 3, 2023

Commit

62977bb

1 Parent(s): 30ac9ed

upload pyserini

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

pyserini/2cr/_base.py +95 -0
pyserini/2cr/miracl.py +447 -0
pyserini/2cr/miracl.yaml +1180 -0
pyserini/2cr/miracl_html.template +256 -0
pyserini/2cr/miracl_html_table.template +35 -0
pyserini/2cr/miracl_html_table_row.template +336 -0
pyserini/2cr/mrtydi.py +330 -0
pyserini/2cr/mrtydi.yaml +890 -0
pyserini/2cr/mrtydi_html.template +256 -0
pyserini/2cr/mrtydi_html_table.template +28 -0
pyserini/2cr/mrtydi_html_table_row.template +212 -0
pyserini/2cr/msmarco-v1-doc.yaml +539 -0
pyserini/2cr/msmarco-v1-passage.yaml +764 -0
pyserini/2cr/msmarco-v2-doc.yaml +287 -0
pyserini/2cr/msmarco-v2-passage.yaml +287 -0
pyserini/2cr/msmarco.py +600 -0
pyserini/2cr/msmarco_html_row_v1.template +81 -0
pyserini/2cr/msmarco_html_row_v2.template +82 -0
pyserini/2cr/msmarco_html_v1_doc.template +296 -0
pyserini/2cr/msmarco_html_v1_passage.template +325 -0
pyserini/2cr/msmarco_html_v2_doc.template +292 -0
pyserini/2cr/msmarco_html_v2_passage.template +292 -0
pyserini/__init__.py +1 -0
pyserini/__pycache__/__init__.cpython-310.pyc +0 -0
pyserini/__pycache__/encoded_corpus_info.cpython-310.pyc +0 -0
pyserini/__pycache__/encoded_query_info.cpython-310.pyc +0 -0
pyserini/__pycache__/evaluate_script_info.cpython-310.pyc +0 -0
pyserini/__pycache__/prebuilt_index_info.cpython-310.pyc +0 -0
pyserini/__pycache__/pyclass.cpython-310.pyc +0 -0
pyserini/__pycache__/setup.cpython-310.pyc +0 -0
pyserini/__pycache__/util.cpython-310.pyc +0 -0
pyserini/analysis/__init__.py +19 -0
pyserini/analysis/__pycache__/__init__.cpython-310.pyc +0 -0
pyserini/analysis/__pycache__/_base.cpython-310.pyc +0 -0
pyserini/analysis/_base.py +166 -0
pyserini/collection/__init__.py +20 -0
pyserini/collection/_base.py +153 -0
pyserini/collection/_collection_support.py +78 -0
pyserini/demo/acl.py +124 -0
pyserini/demo/dpr.py +105 -0
pyserini/demo/miracl.py +149 -0
pyserini/demo/msmarco.py +118 -0
pyserini/demo/templates/acl.html +74 -0
pyserini/demo/templates/assets/acl-logo.svg +10 -0
pyserini/demo/templates/miracl.html +127 -0
pyserini/dsearch.py +46 -0
pyserini/encode/__init__.py +28 -0
pyserini/encode/__main__.py +147 -0
pyserini/encode/__pycache__/__init__.cpython-310.pyc +0 -0
pyserini/encode/__pycache__/_aggretriever.cpython-310.pyc +0 -0

pyserini/2cr/_base.py ADDED Viewed

	@@ -0,0 +1,95 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import subprocess
+# Status tags used when reporting results. The FAIL and OKish tags are wrapped in ANSI
+# escape sequences (codes 91 and 94, each reset by code 0); the OK tag is plain text.
+fail_str = '\033[91m[FAIL]\033[0m'
+ok_str = '[OK]'
+okish_str = '\033[94m[OKish]\033[0m'
+def run_command(cmd):
+    process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdout, stderr = process.communicate()
+    stdout = stdout.decode('utf-8')
+    stderr = stderr.decode('utf-8')
+    return stdout, stderr
+def run_eval_and_return_metric(metric, eval_key, defs, runfile):
+    eval_cmd = f'python -m pyserini.eval.trec_eval {defs} {eval_key} {runfile}'
+    eval_stdout, eval_stderr = run_command(eval_cmd)
+    for line in eval_stdout.split('\n'):
+        parts = line.split('\t')
+        if len(parts) == 3 and parts[1] == 'all':
+            return round(float(parts[2]), 4)
+    return 0.0
+def run_dpr_retrieval_eval_and_return_metric(defs, json_file):
+    """Generate dpr retrieval evaluation scores
+    Args:
+        defs: topk definitions (e.g., '--topk 5 20')
+        json_file: dpr retrieval json file
+    Returns:
+        topk: a dictionary of topk scores (e.g., {"Top5": <score>})
+    """
+    eval_cmd = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {json_file} {defs} '
+    eval_stdout, eval_stderr = run_command(eval_cmd)
+    topk = {}
+    for line in eval_stdout.split('\n'):
+        parts = line.split('\t')
+        if len(parts) == 2 and 'accuracy' in parts[1]:
+            topk.update({parts[0]:round(float(parts[1][10:])*100, 4)})
+    return topk
+def convert_trec_run_to_dpr_retrieval_json(topics,index,runfile,output):
+    """Convert trec runfile to dpr retrieval json file
+    Args:
+        topics: topics field
+        index: index field
+        runfile: input runfile
+        output: output jsonfile
+    Returns:
+        exit status: exit status
+    """
+    cmd = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics {topics} --index {index} --input {runfile} --output {output}'
+    return os.system(cmd)
+def run_fusion(run_ls, output, k):
+    """run fusion command and return status code
+    Args:
+        run_ls: a list of runfile paths
+        output: output path
+        k: topk value
+    Returns:
+        status code: status code
+    """
+    run_files = ' '.join(run_ls)
+    cmd = f'python -m pyserini.fusion --runs {run_files} --output {output} --k {k}'
+    return os.system(cmd)

pyserini/2cr/miracl.py ADDED Viewed

	@@ -0,0 +1,447 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import math
+import os
+import sys
+import time
+import subprocess
+import pkg_resources
+from collections import defaultdict, OrderedDict
+from string import Template
+import yaml
+from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str
+languages = [
+    ['ar', 'arabic'],
+    ['bn', 'bengali'],
+    ['en', 'english'],
+    ['es', 'spanish'],
+    ['fa', 'persian'],
+    ['fi', 'finnish'],
+    ['fr', 'french'],
+    ['hi', 'hindi'],
+    ['id', 'indonesian'],
+    ['ja', 'japanese'],
+    ['ko', 'korean'],
+    ['ru', 'russian'],
+    ['sw', 'swahili'],
+    ['te', 'telugu'],
+    ['th', 'thai'],
+    ['zh', 'chinese'],
+    ['de', 'german'],
+    ['yo', 'yoruba']
+]
+html_display = OrderedDict()
+html_display['bm25'] = 'BM25'
+html_display['mdpr-tied-pft-msmarco'] = 'mDPR (tied encoders), pre-FT w/ MS MARCO'
+html_display['mdpr-tied-pft-msmarco-ft-all'] = 'mDPR (tied encoders), pre-FT w/ MS MARCO then FT w/ all Mr. TyDi'
+html_display['bm25-mdpr-tied-pft-msmarco-hybrid'] = 'Hybrid of `bm25` and `mdpr-tied-pft-msmarco`'
+html_display['mdpr-tied-pft-msmarco-ft-miracl'] = 'mDPR (tied encoders), pre-FT w/ MS MARCO then in-lang FT w/ MIRACL'
+html_display['mcontriever-tied-pft-msmarco'] = 'mContriever (tied encoders), pre-FT w/ MS MARCO'
+models = list(html_display)
+trec_eval_metric_definitions = {
+    'nDCG@10': '-c -M 100 -m ndcg_cut.10',
+    'R@100': '-c -m recall.100',
+}
+def format_run_command(raw):
+    return raw.replace('--lang', '\\\n  --lang') \
+        .replace('--encoder', '\\\n  --encoder') \
+        .replace('--topics', '\\\n  --topics') \
+        .replace('--index', '\\\n  --index') \
+        .replace('--output ', '\\\n  --output ') \
+        .replace('--runs', '\\\n  --runs ') \
+        .replace('--batch ', '\\\n  --batch ') \
+        .replace('--threads 12', '--threads 12 \\\n ')
+def format_eval_command(raw):
+    return raw.replace('-c ', '\\\n  -c ') \
+        .replace(raw.split()[-1], f'\\\n  {raw.split()[-1]}')
+def read_file(f):
+    fin = open(f, 'r')
+    text = fin.read()
+    fin.close()
+    return text
+def list_conditions():
+    print('Conditions:\n-----------')
+    for condition, _ in html_display.items():
+        print(condition)
+    print('\nLanguages\n---------')
+    for language in languages:
+        print(language[0])
+def generate_table_rows(table, row_template, commands, eval_commands, table_id, split, metric):
+    row_cnt = 1
+    html_rows = []
+    for model in models:
+        s = Template(row_template)
+        keys = {}
+        used_langs = 0
+        for lang in languages:
+            keys[lang[0]] = f'{model}.{lang[0]}'
+            used_langs += 1 if table[keys[lang[0]]][split][metric] != 0 else 0
+        sum = table[keys["ar"]][split][metric] + \
+              table[keys["bn"]][split][metric] + \
+              table[keys["en"]][split][metric] + \
+              table[keys["es"]][split][metric] + \
+              table[keys["fa"]][split][metric] + \
+              table[keys["fi"]][split][metric] + \
+              table[keys["fr"]][split][metric] + \
+              table[keys["hi"]][split][metric] + \
+              table[keys["id"]][split][metric] + \
+              table[keys["ja"]][split][metric] + \
+              table[keys["ko"]][split][metric] + \
+              table[keys["ru"]][split][metric] + \
+              table[keys["sw"]][split][metric] + \
+              table[keys["te"]][split][metric] + \
+              table[keys["th"]][split][metric] + \
+              table[keys["zh"]][split][metric] + \
+              table[keys["de"]][split][metric] + \
+              table[keys["yo"]][split][metric]
+        avg = sum / used_langs
+        s = s.substitute(table_cnt=table_id,
+                         row_cnt=row_cnt,
+                         model=html_display[model],
+                         ar=f'{table[keys["ar"]][split][metric]:.3f}',
+                         bn=f'{table[keys["bn"]][split][metric]:.3f}',
+                         en=f'{table[keys["en"]][split][metric]:.3f}',
+                         es=f'{table[keys["es"]][split][metric]:.3f}',
+                         fa=f'{table[keys["fa"]][split][metric]:.3f}',
+                         fi=f'{table[keys["fi"]][split][metric]:.3f}',
+                         fr=f'{table[keys["fr"]][split][metric]:.3f}',
+                         hi=f'{table[keys["hi"]][split][metric]:.3f}',
+                         id=f'{table[keys["id"]][split][metric]:.3f}',
+                         ja=f'{table[keys["ja"]][split][metric]:.3f}',
+                         ko=f'{table[keys["ko"]][split][metric]:.3f}',
+                         ru=f'{table[keys["ru"]][split][metric]:.3f}',
+                         sw=f'{table[keys["sw"]][split][metric]:.3f}',
+                         te=f'{table[keys["te"]][split][metric]:.3f}',
+                         th=f'{table[keys["th"]][split][metric]:.3f}',
+                         zh=f'{table[keys["zh"]][split][metric]:.3f}',
+                         de=f'{table[keys["de"]][split][metric]:.3f}',
+                         yo=f'{table[keys["yo"]][split][metric]:.3f}',
+                         avg=f'{avg:.3f}',
+                         cmd1=f'{commands[keys["ar"]]}',
+                         cmd2=f'{commands[keys["bn"]]}',
+                         cmd3=f'{commands[keys["en"]]}',
+                         cmd4=f'{commands[keys["es"]]}',
+                         cmd5=f'{commands[keys["fa"]]}',
+                         cmd6=f'{commands[keys["fi"]]}',
+                         cmd7=f'{commands[keys["fr"]]}',
+                         cmd8=f'{commands[keys["hi"]]}',
+                         cmd9=f'{commands[keys["id"]]}',
+                         cmd10=f'{commands[keys["ja"]]}',
+                         cmd11=f'{commands[keys["ko"]]}',
+                         cmd12=f'{commands[keys["ru"]]}',
+                         cmd13=f'{commands[keys["sw"]]}',
+                         cmd14=f'{commands[keys["te"]]}',
+                         cmd15=f'{commands[keys["th"]]}',
+                         cmd16=f'{commands[keys["zh"]]}',
+                         cmd17=f'{commands[keys["de"]]}',
+                         cmd18=f'{commands[keys["yo"]]}',
+                         eval_cmd1=f'{eval_commands[keys["ar"]][metric]}',
+                         eval_cmd2=f'{eval_commands[keys["bn"]][metric]}',
+                         eval_cmd3=f'{eval_commands[keys["en"]][metric]}',
+                         eval_cmd4=f'{eval_commands[keys["es"]][metric]}',
+                         eval_cmd5=f'{eval_commands[keys["fa"]][metric]}',
+                         eval_cmd6=f'{eval_commands[keys["fi"]][metric]}',
+                         eval_cmd7=f'{eval_commands[keys["fr"]][metric]}',
+                         eval_cmd8=f'{eval_commands[keys["hi"]][metric]}',
+                         eval_cmd9=f'{eval_commands[keys["id"]][metric]}',
+                         eval_cmd10=f'{eval_commands[keys["ja"]][metric]}',
+                         eval_cmd11=f'{eval_commands[keys["ko"]][metric]}',
+                         eval_cmd12=f'{eval_commands[keys["ru"]][metric]}',
+                         eval_cmd13=f'{eval_commands[keys["sw"]][metric]}',
+                         eval_cmd14=f'{eval_commands[keys["te"]][metric]}',
+                         eval_cmd15=f'{eval_commands[keys["th"]][metric]}',
+                         eval_cmd16=f'{eval_commands[keys["zh"]][metric]}',
+                         eval_cmd17=f'{eval_commands[keys["de"]][metric]}',
+                         eval_cmd18=f'{eval_commands[keys["yo"]][metric]}'
+                         )
+        s = s.replace("0.000", "--")
+        html_rows.append(s)
+        row_cnt += 1
+    return html_rows
+def print_results(table, metric, split):
+    print(f'Metric = {metric}, Split = {split}')
+    print(' ' * 35, end='')
+    for lang in languages:
+        print(f'{lang[0]:3}    ', end='')
+    print('')
+    for model in models:
+        print(f'{model:33}', end='')
+        for lang in languages:
+            key = f'{model}.{lang[0]}'
+            print(f'{table[key][split][metric]:7.3f}', end='')
+        print('')
+    print('')
+def extract_topic_fn_from_cmd(cmd):
+    cmd = cmd.split()
+    topic_idx = cmd.index('--topics')
+    return cmd[topic_idx + 1]
+def generate_report(args):
+    """Write an HTML report built from the expected scores listed in miracl.yaml.
+    Nothing is retrieved or evaluated here: scores are copied from the yaml file, and the
+    run and evaluation commands are only formatted for display.
+    Args:
+        args: parsed command-line arguments; args.directory and args.output are read
+    Raises:
+        ValueError: if a command template lacks a placeholder that is substituted below
+    """
+    # table[name][split][metric] -> score; missing entries read as 0.0.
+    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+    commands = defaultdict(lambda: '')
+    eval_commands = defaultdict(lambda: defaultdict(lambda: ''))
+    html_template = read_file(pkg_resources.resource_filename(__name__, 'miracl_html.template'))
+    table_template = read_file(pkg_resources.resource_filename(__name__, 'miracl_html_table.template'))
+    row_template = read_file(pkg_resources.resource_filename(__name__, 'miracl_html_table_row.template'))
+    with open(pkg_resources.resource_filename(__name__, 'miracl.yaml')) as f:
+        yaml_data = yaml.safe_load(f)
+        for condition in yaml_data['conditions']:
+            name = condition['name']
+            eval_key = condition['eval_key']
+            cmd_template = condition['command']
+            cmd_lst = cmd_template.split()
+            # Condition names have the form '<model>.<lang>'.
+            lang = name.split('.')[-1]
+            is_hybrid_run = 'hybrid' in name
+            for splits in condition['splits']:
+                split = splits['split']
+                # Hybrid (fusion) commands carry their depth in --k, the others in --hits.
+                if is_hybrid_run:
+                    hits = int(cmd_lst[cmd_lst.index('--k') + 1])
+                else:
+                    hits = int(cmd_lst[cmd_lst.index('--hits') + 1])
+                # NOTE(review): run_conditions names its run files with a '.top{hits}' suffix
+                # while this name has none -- confirm whether the difference is intended.
+                runfile = os.path.join(args.directory, f'run.miracl.{name}.{split}.txt')
+                if is_hybrid_run:
+                    bm25_output = os.path.join(args.directory,
+                                               f'run.miracl.bm25.{lang}.{split}.top{hits}.txt')
+                    mdpr_output = os.path.join(args.directory,
+                                               f'run.miracl.mdpr-tied-pft-msmarco.{lang}.{split}.top{hits}.txt')
+                    expected_args = dict(output=runfile, bm25_output=bm25_output, mdpr_output=mdpr_output)
+                else:
+                    expected_args = dict(split=split, output=runfile)
+                # Every placeholder must appear in the template, as either $name or ${name}.
+                if not all([f"${k}" in cmd_template or f"${{{k}}}" in cmd_template for k in expected_args]):
+                    raise ValueError(f"Not all arguements {list(expected_args)} detected from inputs: {cmd_template}.")
+                cmd = Template(cmd_template).substitute(**expected_args)
+                commands[name] = format_run_command(cmd)
+                for expected in splits['scores']:
+                    for metric in expected:
+                        if str(expected[metric])[-1] == "5":
+                            # without adding epsilon, there is a chance that f-string would round 0.5 to 0 rather than 1
+                            # e.g., 0.8885 -> 0.888 rather than 0.889
+                            # add an epsilon to the expected score to avoid rounding error
+                            expected[metric] += 1e-5
+                        table[name][split][metric] = expected[metric]
+                        eval_cmd = f'python -m pyserini.eval.trec_eval ' + \
+                                   f'{trec_eval_metric_definitions[metric]} {eval_key}-{split} {runfile}'
+                        eval_commands[name][metric] = format_eval_command(eval_cmd)
+        tables_html = []
+        # Only the dev split is rendered in the report.
+        split = 'dev'
+        # Build the table for nDCG@10, dev queries
+        html_rows = generate_table_rows(table, row_template, commands, eval_commands, 1, split, 'nDCG@10')
+        all_rows = '\n'.join(html_rows)
+        tables_html.append(Template(table_template).substitute(desc=f'nDCG@10, {split} queries', rows=all_rows))
+        # Build the table for R@100, dev queries
+        html_rows = generate_table_rows(table, row_template, commands, eval_commands, 2, split, 'R@100')
+        all_rows = '\n'.join(html_rows)
+        tables_html.append(Template(table_template).substitute(desc=f'Recall@100, {split} queries', rows=all_rows))
+    with open(args.output, 'w') as out:
+        out.write(Template(html_template).substitute(title='MIRACL', tables=' '.join(tables_html)))
+def run_conditions(args):
+    """Run the selected MIRACL conditions and check their scores against miracl.yaml.
+    For each selected condition and split, the run command is executed when its run file
+    does not exist yet, then every expected metric is evaluated and compared with the
+    expected value. Summary grids are printed at the end.
+    Args:
+        args: parsed command-line arguments; reads condition, language, all, directory,
+            display_commands, dry_run and skip_eval
+    """
+    if args.condition == 'mdpr-tied-pft-msmarco-ft-miracl' and args.language in ['de', 'yo']:
+        print('MIRACL de and yo datasets do not have train splits to finetune with')
+        return
+    start = time.time()
+    # table[name][split][metric] -> score; missing entries read as 0.0.
+    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+    with open(pkg_resources.resource_filename(__name__, 'miracl.yaml')) as f:
+        yaml_data = yaml.safe_load(f)
+        for condition in yaml_data['conditions']:
+            name = condition['name']
+            # Condition names have the form '<model>.<lang>'.
+            encoder = name.split('.')[0]
+            lang = name.split('.')[-1]
+            # --all selects everything; otherwise filter by condition, then by language.
+            if args.all:
+                pass
+            elif args.condition != encoder:
+                continue
+            elif args.language and args.language != lang:
+                continue
+            eval_key = condition['eval_key']
+            cmd_template = condition['command']
+            cmd_lst = cmd_template.split()
+            print(f'condition {name}:')
+            is_hybrid_run = 'hybrid' in name
+            for splits in condition['splits']:
+                split = splits['split']
+                # Hybrid (fusion) commands carry their depth in --k, the others in --hits.
+                if is_hybrid_run:
+                    hits = int(cmd_lst[cmd_lst.index('--k') + 1])
+                else:
+                    hits = int(cmd_lst[cmd_lst.index('--hits') + 1])
+                print(f'  - split: {split}')
+                runfile = os.path.join(args.directory, f'run.miracl.{name}.{split}.top{hits}.txt')
+                if is_hybrid_run:
+                    # A hybrid run fuses the BM25 and mDPR run files, so both must already exist.
+                    bm25_output = os.path.join(args.directory,
+                                               f'run.miracl.bm25.{lang}.{split}.top{hits}.txt')
+                    mdpr_output = os.path.join(args.directory,
+                                               f'run.miracl.mdpr-tied-pft-msmarco.{lang}.{split}.top{hits}.txt')
+                    if not os.path.exists(bm25_output):
+                        print(f'Missing BM25 file: {bm25_output}')
+                        continue
+                    if not os.path.exists(mdpr_output):
+                        print(f'Missing mDPR file: {mdpr_output}')
+                        continue
+                    cmd = Template(cmd_template).substitute(split=split, output=runfile, bm25_output=bm25_output,
+                                                            mdpr_output=mdpr_output)
+                else:
+                    cmd = Template(cmd_template).substitute(split=split, output=runfile)
+                # In the yaml file, the topics are written as something like '--topics miracl-v1.0-ar-${split}'
+                # This works for the dev split because the topics are directly included in Anserini/Pyserini.
+                # For the training split, we have to map the symbol into a file in tools/topics-and-qrels/
+                # Here, we assume that the developer has cloned the miracl repo and placed the topics there.
+                if split == 'train':
+                    cmd = cmd.replace(f'--topics miracl-v1.0-{lang}-{split}',
+                                      f'--topics tools/topics-and-qrels/topics.miracl-v1.0-{lang}-{split}.tsv')
+                if args.display_commands:
+                    print(f'\n```bash\n{format_run_command(cmd)}\n```\n')
+                # An existing run file is reused; the command only runs when it is missing.
+                if not os.path.exists(runfile):
+                    if not args.dry_run:
+                        rtn = subprocess.run(cmd.split(), capture_output=True)
+                        stderr = rtn.stderr.decode()
+                        if '--topics' in cmd:
+                            topic_fn = extract_topic_fn_from_cmd(cmd)
+                            if f'ValueError: Topic {topic_fn} Not Found' in stderr:
+                                print(f'Skipping {topic_fn}: file not found.')
+                                # Moves on to the next split; evaluation is skipped for this one.
+                                continue
+                for expected in splits['scores']:
+                    for metric in expected:
+                        if not args.skip_eval:
+                            # We have to translate the training qrels into a file located in tools/topics-and-qrels/
+                            # because they are not included with Anserini/Pyserini by default.
+                            # Here, we assume that the developer has cloned the miracl repo and placed the qrels there.
+                            if split == 'train':
+                                qrels = f'tools/topics-and-qrels/qrels.{eval_key}-train.tsv'
+                            else:
+                                qrels = f'{eval_key}-{split}'
+                            score = float(run_eval_and_return_metric(metric, qrels,
+                                                                     trec_eval_metric_definitions[metric], runfile))
+                            if math.isclose(score, float(expected[metric])):
+                                result_str = ok_str
+                            # Flaky tests: these specific condition/split/metric combinations are
+                            # accepted within an absolute tolerance of 2e-4 and reported as OKish.
+                            elif (name == 'mdpr-tied-pft-msmarco.hi' and split == 'train'
+                                  and math.isclose(score, float(expected[metric]), abs_tol=2e-4)) or \
+                                 (name == 'mdpr-tied-pft-msmarco-ft-all.ru'
+                                  and split == 'dev' and metric == 'nDCG@10'
+                                  and math.isclose(score, float(expected[metric]), abs_tol=2e-4)) or \
+                                 (name == 'bm25-mdpr-tied-pft-msmarco-hybrid.te'
+                                  and split == 'train' and metric == 'nDCG@10'
+                                  and math.isclose(score, float(expected[metric]), abs_tol=2e-4)) or \
+                                 (name == 'bm25-mdpr-tied-pft-msmarco-hybrid.zh'
+                                  and split == 'dev' and metric == 'nDCG@10'
+                                  and math.isclose(score, float(expected[metric]), abs_tol=2e-4)):
+                                result_str = okish_str
+                            else:
+                                result_str = fail_str + f' expected {expected[metric]:.4f}'
+                            print(f'      {metric:7}: {score:.4f} {result_str}')
+                            table[name][split][metric] = score
+                        else:
+                            # Without evaluation the expected score stands in for the measured one.
+                            table[name][split][metric] = expected[metric]
+            print('')
+    for metric in ['nDCG@10', 'R@100']:
+        for split in ['dev', 'train']:
+            print_results(table, metric, split)
+    end = time.time()
+    print(f'Total elapsed time: {end - start:.0f}s')
+# Command-line entry point. Exactly one mode is carried out, checked in this order:
+# list conditions, generate the HTML report, or run the regression conditions.
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Generate regression matrix for MIRACL.')
+    parser.add_argument('--condition', type=str,
+                        help='Condition to run', required=False)
+    # To list all conditions
+    parser.add_argument('--list-conditions', action='store_true', default=False, help='List available conditions.')
+    # For generating reports
+    parser.add_argument('--generate-report', action='store_true', default=False, help='Generate report.')
+    parser.add_argument('--output', type=str, help='File to store report.', required=False)
+    # For actually running the experimental conditions
+    parser.add_argument('--all', action='store_true', default=False, help='Run using all languages.')
+    parser.add_argument('--language', type=str, help='Language to run.', required=False)
+    parser.add_argument('--directory', type=str, help='Base directory.', default='', required=False)
+    parser.add_argument('--dry-run', action='store_true', default=False, help='Print out commands but do not execute.')
+    parser.add_argument('--skip-eval', action='store_true', default=False, help='Skip running trec_eval.')
+    parser.add_argument('--display-commands', action='store_true', default=False, help='Display command.')
+    args = parser.parse_args()
+    if args.list_conditions:
+        list_conditions()
+        sys.exit()
+    if args.generate_report:
+        # The report needs a destination file; sys.exit() is called without a status here.
+        if not args.output:
+            print(f'Must specify report filename with --output.')
+            sys.exit()
+        generate_report(args)
+        sys.exit()
+    # --all cannot be combined with an explicit condition or language.
+    if args.all and (args.condition or args.language):
+        print('Specifying --all will run all conditions and languages')
+        sys.exit()
+    run_conditions(args)

pyserini/2cr/miracl.yaml ADDED Viewed

	@@ -0,0 +1,1180 @@

+conditions:
+  # BM25
+  - name: bm25.ar
+    eval_key: miracl-v1.0-ar
+    command: python -m pyserini.search.lucene --language ar --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4434
+            R@100: 0.8562
+      - split: dev
+        scores:
+          - nDCG@10: 0.4809
+            R@100: 0.8885
+  - name: bm25.bn
+    eval_key: miracl-v1.0-bn
+    command: python -m pyserini.search.lucene --language bn --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5122
+            R@100: 0.8934
+      - split: dev
+        scores:
+          - nDCG@10: 0.5079
+            R@100: 0.9088
+  - name: bm25.en
+    eval_key: miracl-v1.0-en
+    command: python -m pyserini.search.lucene --language en --topics miracl-v1.0-en-${split} --index miracl-v1.0-en --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3415
+            R@100: 0.7928
+      - split: dev
+        scores:
+          - nDCG@10: 0.3506
+            R@100: 0.8190
+  - name: bm25.es
+    eval_key: miracl-v1.0-es
+    command: python -m pyserini.search.lucene --language es --topics miracl-v1.0-es-${split} --index miracl-v1.0-es --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3030
+            R@100: 0.7020
+      - split: dev
+        scores:
+          - nDCG@10: 0.3193
+            R@100: 0.7018
+  - name: bm25.fa
+    eval_key: miracl-v1.0-fa
+    command: python -m pyserini.search.lucene --language fa --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3270
+            R@100: 0.7139
+      - split: dev
+        scores:
+          - nDCG@10: 0.3334
+            R@100: 0.7306
+  - name: bm25.fi
+    eval_key: miracl-v1.0-fi
+    command: python -m pyserini.search.lucene --language fi --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5106
+            R@100: 0.8471
+      - split: dev
+        scores:
+          - nDCG@10: 0.5513
+            R@100: 0.8910
+  - name: bm25.fr
+    eval_key: miracl-v1.0-fr
+    command: python -m pyserini.search.lucene --language fr --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.2152
+            R@100: 0.6601
+      - split: dev
+        scores:
+          - nDCG@10: 0.1832
+            R@100: 0.6528
+  - name: bm25.hi
+    eval_key: miracl-v1.0-hi
+    command: python -m pyserini.search.lucene --language hi --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4745
+            R@100: 0.9016
+      - split: dev
+        scores:
+          - nDCG@10: 0.4578
+            R@100: 0.8679
+  - name: bm25.id
+    eval_key: miracl-v1.0-id
+    command: python -m pyserini.search.lucene --language id --topics miracl-v1.0-id-${split} --index miracl-v1.0-id --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4844
+            R@100: 0.9234
+      - split: dev
+        scores:
+          - nDCG@10: 0.4486
+            R@100: 0.9041
+  - name: bm25.ja
+    eval_key: miracl-v1.0-ja
+    command: python -m pyserini.search.lucene --language ja --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3796
+            R@100: 0.8225
+      - split: dev
+        scores:
+          - nDCG@10: 0.3689
+            R@100: 0.8048
+  - name: bm25.ko
+    eval_key: miracl-v1.0-ko
+    command: python -m pyserini.search.lucene --language ko --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4279
+            R@100: 0.7572
+      - split: dev
+        scores:
+          - nDCG@10: 0.4190
+            R@100: 0.7831
+  - name: bm25.ru
+    eval_key: miracl-v1.0-ru
+    command: python -m pyserini.search.lucene --language ru --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3153
+            R@100: 0.6464
+      - split: dev
+        scores:
+          - nDCG@10: 0.3342
+            R@100: 0.6614
+  - name: bm25.sw
+    eval_key: miracl-v1.0-sw
+    command: python -m pyserini.search.lucene --language sw --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3356
+            R@100: 0.6499
+      - split: dev
+        scores:
+          - nDCG@10: 0.3826
+            R@100: 0.7008
+  - name: bm25.te
+    eval_key: miracl-v1.0-te
+    command: python -m pyserini.search.lucene --language te --topics miracl-v1.0-te-${split} --index miracl-v1.0-te --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4814
+            R@100: 0.8077
+      - split: dev
+        scores:
+          - nDCG@10: 0.4942
+            R@100: 0.8307
+  - name: bm25.th
+    eval_key: miracl-v1.0-th
+    command: python -m pyserini.search.lucene --language th --topics miracl-v1.0-th-${split} --index miracl-v1.0-th --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4629
+            R@100: 0.8768
+      - split: dev
+        scores:
+          - nDCG@10: 0.4838
+            R@100: 0.8874
+  - name: bm25.zh
+    eval_key: miracl-v1.0-zh
+    command: python -m pyserini.search.lucene --language zh --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.2018
+            R@100: 0.5541
+      - split: dev
+        scores:
+          - nDCG@10: 0.1801
+            R@100: 0.5599
+  - name: bm25.de
+    eval_key: miracl-v1.0-de
+    command: python -m pyserini.search.lucene --language de --topics miracl-v1.0-de-${split} --index miracl-v1.0-de --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.2262
+            R@100: 0.5724
+  - name: bm25.yo
+    eval_key: miracl-v1.0-yo
+    command: python -m pyserini.search.lucene --pretokenized --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo --output $output --batch 128 --threads 16 --bm25 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.4059
+            R@100: 0.7325
+  # mdpr-tied-pft-msmarco
+  - name: mdpr-tied-pft-msmarco.ar
+    eval_key: miracl-v1.0-ar
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4653
+            R@100: 0.8293
+      - split: dev
+        scores:
+          - nDCG@10: 0.4993
+            R@100: 0.8407
+  - name: mdpr-tied-pft-msmarco.bn
+    eval_key: miracl-v1.0-bn
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4362
+            R@100: 0.8045
+      - split: dev
+        scores:
+          - nDCG@10: 0.4427
+            R@100: 0.8193
+  - name: mdpr-tied-pft-msmarco.en
+    eval_key: miracl-v1.0-en
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3986
+            R@100: 0.7779
+      - split: dev
+        scores:
+          - nDCG@10: 0.3938
+            R@100: 0.7675
+  - name: mdpr-tied-pft-msmarco.es
+    eval_key: miracl-v1.0-es
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4637
+            R@100: 0.8654
+      - split: dev
+        scores:
+          - nDCG@10: 0.4777
+            R@100: 0.8643
+  - name: mdpr-tied-pft-msmarco.fa
+    eval_key: miracl-v1.0-fa
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4882
+            R@100: 0.9092
+      - split: dev
+        scores:
+          - nDCG@10: 0.4800
+            R@100: 0.8980
+  - name: mdpr-tied-pft-msmarco.fi
+    eval_key: miracl-v1.0-fi
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4426
+            R@100: 0.7611
+      - split: dev
+        scores:
+          - nDCG@10: 0.4721
+            R@100: 0.7877
+  - name: mdpr-tied-pft-msmarco.fr
+    eval_key: miracl-v1.0-fr
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4372
+            R@100: 0.9268
+      - split: dev
+        scores:
+          - nDCG@10: 0.4352
+            R@100: 0.9154
+  - name: mdpr-tied-pft-msmarco.hi
+    eval_key: miracl-v1.0-hi
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3685
+            R@100: 0.7780
+      - split: dev
+        scores:
+          - nDCG@10: 0.3830
+            R@100: 0.7755
+  - name: mdpr-tied-pft-msmarco.id
+    eval_key: miracl-v1.0-id
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.2549
+            R@100: 0.5610
+      - split: dev
+        scores:
+          - nDCG@10: 0.2719
+            R@100: 0.5734
+  - name: mdpr-tied-pft-msmarco.ja
+    eval_key: miracl-v1.0-ja
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4342
+            R@100: 0.8211
+      - split: dev
+        scores:
+          - nDCG@10: 0.4390
+            R@100: 0.8254
+  - name: mdpr-tied-pft-msmarco.ko
+    eval_key: miracl-v1.0-ko
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4147
+            R@100: 0.7699
+      - split: dev
+        scores:
+          - nDCG@10: 0.4189
+            R@100: 0.7369
+  - name: mdpr-tied-pft-msmarco.ru
+    eval_key: miracl-v1.0-ru
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3812
+            R@100: 0.7854
+      - split: dev
+        scores:
+          - nDCG@10: 0.4073
+            R@100: 0.7972
+  - name: mdpr-tied-pft-msmarco.sw
+    eval_key: miracl-v1.0-sw
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.2973
+            R@100: 0.5761
+      - split: dev
+        scores:
+          - nDCG@10: 0.2990
+            R@100: 0.6158
+  - name: mdpr-tied-pft-msmarco.te
+    eval_key: miracl-v1.0-te
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3723
+            R@100: 0.7698
+      - split: dev
+        scores:
+          - nDCG@10: 0.3557
+            R@100: 0.7619
+  - name: mdpr-tied-pft-msmarco.th
+    eval_key: miracl-v1.0-th
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3451
+            R@100: 0.6728
+      - split: dev
+        scores:
+          - nDCG@10: 0.3578
+            R@100: 0.6783
+  - name: mdpr-tied-pft-msmarco.zh
+    eval_key: miracl-v1.0-zh
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5040
+            R@100: 0.9355
+      - split: dev
+        scores:
+          - nDCG@10: 0.5116
+            R@100: 0.9436
+  - name: mdpr-tied-pft-msmarco.de
+    eval_key: miracl-v1.0-de
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.4895
+            R@100: 0.8983
+  - name: mdpr-tied-pft-msmarco.yo
+    eval_key: miracl-v1.0-yo
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.4439
+            R@100: 0.8403
+  # mdpr-tied-pft-msmarco-ft-all
+  - name: mdpr-tied-pft-msmarco-ft-all.ar
+    eval_key: miracl-v1.0-ar
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.6954
+            R@100: 0.8542
+      - split: dev
+        scores:
+          - nDCG@10: 0.5782
+            R@100: 0.7953
+  - name: mdpr-tied-pft-msmarco-ft-all.bn
+    eval_key: miracl-v1.0-bn
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.6823
+            R@100: 0.8646
+      - split: dev
+        scores:
+          - nDCG@10: 0.5804
+            R@100: 0.8480
+  - name: mdpr-tied-pft-msmarco-ft-all.en
+    eval_key: miracl-v1.0-en
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3491
+            R@100: 0.5678
+      - split: dev
+        scores:
+          - nDCG@10: 0.2813
+            R@100: 0.5083
+  - name: mdpr-tied-pft-msmarco-ft-all.es
+    eval_key: miracl-v1.0-es
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.2488
+            R@100: 0.4799
+      - split: dev
+        scores:
+          - nDCG@10: 0.2509
+            R@100: 0.4706
+  - name: mdpr-tied-pft-msmarco-ft-all.fa
+    eval_key: miracl-v1.0-fa
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3809
+            R@100: 0.6899
+      - split: dev
+        scores:
+          - nDCG@10: 0.3836
+            R@100: 0.6863
+  - name: mdpr-tied-pft-msmarco-ft-all.fi
+    eval_key: miracl-v1.0-fi
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.7738
+            R@100: 0.9081
+      - split: dev
+        scores:
+          - nDCG@10: 0.5694
+            R@100: 0.7984
+  - name: mdpr-tied-pft-msmarco-ft-all.fr
+    eval_key: miracl-v1.0-fr
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.2989
+            R@100: 0.6197
+      - split: dev
+        scores:
+          - nDCG@10: 0.3010
+            R@100: 0.6005
+  - name: mdpr-tied-pft-msmarco-ft-all.hi
+    eval_key: miracl-v1.0-hi
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3336
+            R@100: 0.6388
+      - split: dev
+        scores:
+          - nDCG@10: 0.3286
+            R@100: 0.6371
+  - name: mdpr-tied-pft-msmarco-ft-all.id
+    eval_key: miracl-v1.0-id
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3321
+            R@100: 0.5492
+      - split: dev
+        scores:
+          - nDCG@10: 0.3462
+            R@100: 0.5841
+  - name: mdpr-tied-pft-msmarco-ft-all.ja
+    eval_key: miracl-v1.0-ja
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.6378
+            R@100: 0.7950
+      - split: dev
+        scores:
+          - nDCG@10: 0.4999
+            R@100: 0.7451
+  - name: mdpr-tied-pft-msmarco-ft-all.ko
+    eval_key: miracl-v1.0-ko
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5795
+            R@100: 0.7850
+      - split: dev
+        scores:
+          - nDCG@10: 0.4864
+            R@100: 0.7183
+  - name: mdpr-tied-pft-msmarco-ft-all.ru
+    eval_key: miracl-v1.0-ru
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.6011
+            R@100: 0.8188
+      - split: dev
+        scores:
+          - nDCG@10: 0.3933
+            R@100: 0.6707
+  - name: mdpr-tied-pft-msmarco-ft-all.sw
+    eval_key: miracl-v1.0-sw
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.8882
+            R@100: 0.9710
+      - split: dev
+        scores:
+          - nDCG@10: 0.6575
+            R@100: 0.8883
+  - name: mdpr-tied-pft-msmarco-ft-all.te
+    eval_key: miracl-v1.0-te
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.8757
+            R@100: 0.9725
+      - split: dev
+        scores:
+          - nDCG@10: 0.7783
+            R@100: 0.9513
+  - name: mdpr-tied-pft-msmarco-ft-all.th
+    eval_key: miracl-v1.0-th
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.7761
+            R@100: 0.9241
+      - split: dev
+        scores:
+          - nDCG@10: 0.5975
+            R@100: 0.8360
+  - name: mdpr-tied-pft-msmarco-ft-all.zh
+    eval_key: miracl-v1.0-zh
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3446
+            R@100: 0.6608
+      - split: dev
+        scores:
+          - nDCG@10: 0.3575
+            R@100: 0.6725
+  - name: mdpr-tied-pft-msmarco-ft-all.de
+    eval_key: miracl-v1.0-de
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.3219
+            R@100: 0.5990
+  - name: mdpr-tied-pft-msmarco-ft-all.yo
+    eval_key: miracl-v1.0-yo
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.5983
+            R@100: 0.8908
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.ar
+    eval_key: miracl-v1.0-ar
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.6259
+            R@100: 0.9173
+      - split: dev
+        scores:
+          - nDCG@10: 0.6729
+            R@100: 0.9405
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.bn
+    eval_key: miracl-v1.0-bn
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.6587
+            R@100: 0.9297
+      - split: dev
+        scores:
+          - nDCG@10: 0.6540
+            R@100: 0.9321
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.en
+    eval_key: miracl-v1.0-en
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5347
+            R@100: 0.8772
+      - split: dev
+        scores:
+          - nDCG@10: 0.5488
+            R@100: 0.8815
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.es
+    eval_key: miracl-v1.0-es
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.6234
+            R@100: 0.9425
+      - split: dev
+        scores:
+          - nDCG@10: 0.6413
+            R@100: 0.9479
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.fa
+    eval_key: miracl-v1.0-fa
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5890
+            R@100: 0.9433
+      - split: dev
+        scores:
+          - nDCG@10: 0.5935
+            R@100: 0.9374
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.fi
+    eval_key: miracl-v1.0-fi
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.6164
+            R@100: 0.8506
+      - split: dev
+        scores:
+          - nDCG@10: 0.6716
+            R@100: 0.8949
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.fr
+    eval_key: miracl-v1.0-fr
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5299
+            R@100: 0.9709
+      - split: dev
+        scores:
+          - nDCG@10: 0.5233
+            R@100: 0.9647
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.hi
+    eval_key: miracl-v1.0-hi
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.6217
+            R@100: 0.9059
+      - split: dev
+        scores:
+          - nDCG@10: 0.6157
+            R@100: 0.9115
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.id
+    eval_key: miracl-v1.0-id
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4442
+            R@100: 0.7595
+      - split: dev
+        scores:
+          - nDCG@10: 0.4433
+            R@100: 0.7683
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.ja
+    eval_key: miracl-v1.0-ja
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5795
+            R@100: 0.9082
+      - split: dev
+        scores:
+          - nDCG@10: 0.5757
+            R@100: 0.9036
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.ko
+    eval_key: miracl-v1.0-ko
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5758
+            R@100: 0.8744
+      - split: dev
+        scores:
+          - nDCG@10: 0.6086
+            R@100: 0.8997
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.ru
+    eval_key: miracl-v1.0-ru
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4921
+            R@100: 0.8494
+      - split: dev
+        scores:
+          - nDCG@10: 0.5323
+            R@100: 0.8738
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.sw
+    eval_key: miracl-v1.0-sw
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4100
+            R@100: 0.6987
+      - split: dev
+        scores:
+          - nDCG@10: 0.4457
+            R@100: 0.7254
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.te
+    eval_key: miracl-v1.0-te
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.6000
+            R@100: 0.8717
+      - split: dev
+        scores:
+          - nDCG@10: 0.6021
+            R@100: 0.8569
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.th
+    eval_key: miracl-v1.0-th
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5669
+            R@100: 0.8195
+      - split: dev
+        scores:
+          - nDCG@10: 0.5990
+            R@100: 0.8228
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.zh
+    eval_key: miracl-v1.0-zh
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5209
+            R@100: 0.9576
+      - split: dev
+        scores:
+          - nDCG@10: 0.5254
+            R@100: 0.9587
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.de
+    eval_key: miracl-v1.0-de
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.5643
+            R@100: 0.9482
+  - name: bm25-mdpr-tied-pft-msmarco-hybrid.yo
+    eval_key: miracl-v1.0-yo
+    command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.6114
+            R@100: 0.9496
+  # mdpr-tied-pft-msmarco-ft-miracl
+  - name: mdpr-tied-pft-msmarco-ft-miracl.ar
+    eval_key: miracl-v1.0-ar
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ar --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mdpr-tied-pft-msmarco-ft-miracl-ar --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.7252
+            R@100: 0.9489
+  - name: mdpr-tied-pft-msmarco-ft-miracl.bn
+    eval_key: miracl-v1.0-bn
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-bn --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco-ft-miracl-bn --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.6842
+            R@100: 0.9547
+  - name: mdpr-tied-pft-msmarco-ft-miracl.en
+    eval_key: miracl-v1.0-en
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-en --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mdpr-tied-pft-msmarco-ft-miracl-en --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.4878
+            R@100: 0.8341
+  - name: mdpr-tied-pft-msmarco-ft-miracl.es
+    eval_key: miracl-v1.0-es
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-es --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco-ft-miracl-es --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.5648
+            R@100: 0.9109
+  - name: mdpr-tied-pft-msmarco-ft-miracl.fa
+    eval_key: miracl-v1.0-fa
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fa --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco-ft-miracl-fa --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.5934
+            R@100: 0.9133
+  - name: mdpr-tied-pft-msmarco-ft-miracl.fi
+    eval_key: miracl-v1.0-fi
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fi --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco-ft-miracl-fi --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.7139
+            R@100: 0.9479
+  - name: mdpr-tied-pft-msmarco-ft-miracl.fr
+    eval_key: miracl-v1.0-fr
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fr --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco-ft-miracl-fr --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.5893
+            R@100: 0.9537
+  - name: mdpr-tied-pft-msmarco-ft-miracl.hi
+    eval_key: miracl-v1.0-hi
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-hi --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco-ft-miracl-hi --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.5164
+            R@100: 0.8862
+  - name: mdpr-tied-pft-msmarco-ft-miracl.id
+    eval_key: miracl-v1.0-id
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-id --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco-ft-miracl-id --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.4959
+            R@100: 0.8642
+  - name: mdpr-tied-pft-msmarco-ft-miracl.ja
+    eval_key: miracl-v1.0-ja
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ja --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco-ft-miracl-ja --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.6416
+            R@100: 0.9225
+  - name: mdpr-tied-pft-msmarco-ft-miracl.ko
+    eval_key: miracl-v1.0-ko
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ko --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco-ft-miracl-ko --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.5901
+            R@100: 0.8857
+  - name: mdpr-tied-pft-msmarco-ft-miracl.ru
+    eval_key: miracl-v1.0-ru
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ru --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco-ft-miracl-ru --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.5974
+            R@100: 0.9099
+  - name: mdpr-tied-pft-msmarco-ft-miracl.sw
+    eval_key: miracl-v1.0-sw
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-sw --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco-ft-miracl-sw --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.6853
+            R@100: 0.9367
+  - name: mdpr-tied-pft-msmarco-ft-miracl.te
+    eval_key: miracl-v1.0-te
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-te --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco-ft-miracl-te --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.8037
+            R@100: 0.9616
+  - name: mdpr-tied-pft-msmarco-ft-miracl.th
+    eval_key: miracl-v1.0-th
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-th --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco-ft-miracl-th --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.6951
+            R@100: 0.9311
+  - name: mdpr-tied-pft-msmarco-ft-miracl.zh
+    eval_key: miracl-v1.0-zh
+    command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-zh --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco-ft-miracl-zh --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.6500
+            R@100: 0.9631
+  # mcontriever
+  - name: mcontriever-tied-pft-msmarco.ar
+    eval_key: miracl-v1.0-ar
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5027
+            R@100: 0.9166
+      - split: dev
+        scores:
+          - nDCG@10: 0.5248
+            R@100: 0.9253
+  - name: mcontriever-tied-pft-msmarco.bn
+    eval_key: miracl-v1.0-bn
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5138
+            R@100: 0.9313
+      - split: dev
+        scores:
+          - nDCG@10: 0.5011
+            R@100: 0.9205
+  - name: mcontriever-tied-pft-msmarco.en
+    eval_key: miracl-v1.0-en
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3579
+            R@100: 0.7990
+      - split: dev
+        scores:
+          - nDCG@10: 0.3637
+            R@100: 0.7967
+  - name: mcontriever-tied-pft-msmarco.es
+    eval_key: miracl-v1.0-es
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4081
+            R@100: 0.8339
+      - split: dev
+        scores:
+          - nDCG@10: 0.4184
+            R@100: 0.8411
+  - name: mcontriever-tied-pft-msmarco.fa
+    eval_key: miracl-v1.0-fa
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.2263
+            R@100: 0.6374
+      - split: dev
+        scores:
+          - nDCG@10: 0.2152
+            R@100: 0.6540
+  - name: mcontriever-tied-pft-msmarco.fi
+    eval_key: miracl-v1.0-fi
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5680
+            R@100: 0.9369
+      - split: dev
+        scores:
+          - nDCG@10: 0.6019
+            R@100: 0.9527
+  - name: mcontriever-tied-pft-msmarco.fr
+    eval_key: miracl-v1.0-fr
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3332
+            R@100: 0.8341
+      - split: dev
+        scores:
+          - nDCG@10: 0.3140
+            R@100: 0.8243
+  - name: mcontriever-tied-pft-msmarco.hi
+    eval_key: miracl-v1.0-hi
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.2886
+            R@100: 0.6664
+      - split: dev
+        scores:
+          - nDCG@10: 0.2864
+            R@100: 0.6461
+  - name: mcontriever-tied-pft-msmarco.id
+    eval_key: miracl-v1.0-id
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3748
+            R@100: 0.7955
+      - split: dev
+        scores:
+          - nDCG@10: 0.3915
+            R@100: 0.8015
+  - name: mcontriever-tied-pft-msmarco.ja
+    eval_key: miracl-v1.0-ja
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4402
+            R@100: 0.8813
+      - split: dev
+        scores:
+          - nDCG@10: 0.4240
+            R@100: 0.8783
+  - name: mcontriever-tied-pft-msmarco.ko
+    eval_key: miracl-v1.0-ko
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4799
+            R@100: 0.8672
+      - split: dev
+        scores:
+          - nDCG@10: 0.4829
+            R@100: 0.8753
+  - name: mcontriever-tied-pft-msmarco.ru
+    eval_key: miracl-v1.0-ru
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.3811
+            R@100: 0.8369
+      - split: dev
+        scores:
+          - nDCG@10: 0.3913
+            R@100: 0.8500
+  - name: mcontriever-tied-pft-msmarco.sw
+    eval_key: miracl-v1.0-sw
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5568
+            R@100: 0.9130
+      - split: dev
+        scores:
+          - nDCG@10: 0.5600
+            R@100: 0.9108
+  - name: mcontriever-tied-pft-msmarco.te
+    eval_key: miracl-v1.0-te
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5260
+            R@100: 0.9457
+      - split: dev
+        scores:
+          - nDCG@10: 0.5283
+            R@100: 0.9612
+  - name: mcontriever-tied-pft-msmarco.th
+    eval_key: miracl-v1.0-th
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.5299
+            R@100: 0.9361
+      - split: dev
+        scores:
+          - nDCG@10: 0.5173
+            R@100: 0.9361
+  - name: mcontriever-tied-pft-msmarco.zh
+    eval_key: miracl-v1.0-zh
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: train
+        scores:
+          - nDCG@10: 0.4283
+            R@100: 0.8745
+      - split: dev
+        scores:
+          - nDCG@10: 0.4097
+            R@100: 0.9026
+  - name: mcontriever-tied-pft-msmarco.de
+    eval_key: miracl-v1.0-de
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.4079
+            R@100: 0.8407
+  - name: mcontriever-tied-pft-msmarco.yo
+    eval_key: miracl-v1.0-yo
+    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+    splits:
+      - split: dev
+        scores:
+          - nDCG@10: 0.4150
+            R@100: 0.7703

pyserini/2cr/miracl_html.template ADDED Viewed

	@@ -0,0 +1,256 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+    <meta http-equiv="x-ua-compatible" content="ie=edge" />
+    <title>Pyserini Reproductions</title>
+    <!-- Font Awesome -->
+    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+    <!-- Google Fonts Roboto -->
+    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+    <!-- MDB -->
+   <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+    <style>
+tr.hide-table-padding td {
+  padding: 0;
+}
+.expand-button {
+  position: relative;
+}
+.accordion-toggle .expand-button:after {
+  position: absolute;
+  left:.75rem;
+  top: 50%;
+  transform: translate(0, -50%);
+  content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+  content: '+';
+}
+blockquote.mycode {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  margin-top: 15px;
+  padding-left: 15px;
+}
+blockquote.mycode2 {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  padding-top: 10px;
+  padding-bottom: 10px;
+  padding-left: 15px;
+}
+tr th.headertop {
+  border-bottom: none;
+  padding-bottom: 0rem
+}
+tr th.headerbottom {
+  padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+	border-radius: 0;
+	min-width: 55px;
+	background: none repeat scroll 0 0 transparent;
+	background-color: grey;
+	color: #F1F2F3 !important;
+	cursor: pointer;
+	border-style: none;
+	font-family: 'HELVETICA',sans-serif;
+	font-size: 0.8em;
+	font-weight: normal;
+	text-align: center;
+	text-decoration: none;
+	text-indent: 0;
+	text-transform: uppercase;
+	font-weight: 500;
+	line-height: 1.42rem;
+	margin: 0;
+	padding: 3px 8px;
+	position: absolute !important;
+	top: 0 !important;
+	right: 0 !important;
+}
+.copy-code-button > span {
+	color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+	box-sizing: inherit;
+}
+.copy-code-button::before {
+	content: '';
+	display: inline-block;
+	width: 16px;
+	height: 16px;
+	margin-right: 3px;
+	background-size: contain;
+	background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+	background-repeat: no-repeat;
+	position: relative;
+	top: 3px;
+}
+.copy-code-button:focus {
+    /* Avoid an ugly focus outline on click in Chrome,
+       but darken the button for accessibility.
+       See https://stackoverflow.com/a/25298082/1481479 */
+    /* background-color: #E6E6E6; */
+	outline: 0;
+}
+pre[class*="prettyprint"] {
+	position: relative;
+	overflow: hidden;
+}
+    </style>
+</head>
+<body>
+    <!-- Background image -->
+    <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+      <div class="mask" style="
+            background: linear-gradient(
+              45deg,
+              rgba(29, 236, 197, 0.7),
+              rgba(91, 14, 214, 0.7) 100%
+            );
+          ">
+        <div class="container d-flex align-items-center justify-content-center text-center h-100"  style="max-height: 150px">
+          <div class="text-white">
+            <h1 class="mb-3">$title</h1>
+          </div>
+        </div>
+      </div>
+    </div>
+    <!-- Background image -->
+    <div class="container my-4">
+    $tables
+<div style="padding-top: 20px"></div>
+<h4>Programmatic Execution</h4>
+<p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+To list all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --list-conditions
+</tt></blockquote>
+<p>Run all languages for a specific condition and show commands:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --condition bm25 --display-commands
+</tt></blockquote>
+<p>Run a particular language for a specific condition and show commands:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --condition bm25 --language ko --display-commands
+</tt></blockquote>
+<p>Run all languages for all conditions and show commands:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --all --display-commands
+</tt></blockquote>
+<p>With the above commands, run files will be placed in the current directory. Use the option <tt>--directory runs</tt> to place the runs in a sub-directory.</p>
+<p>For a specific condition, just show the commands and do not run:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --condition bm25 --display-commands --dry-run
+</tt></blockquote>
+<p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+<p>For a specific condition and language, just show the commands and do not run:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --condition bm25 --language ko --display-commands --dry-run
+</tt></blockquote>
+<p>For all conditions, just show the commands, without running them and without evaluation:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --all --display-commands --dry-run --skip-eval
+</tt></blockquote>
+<p>Finally, to generate this page:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --generate-report --output docs/2cr/miracl.html
+</tt></blockquote>
+<p>The output file <tt>miracl.html</tt> should be identical to this page.</p>
+<div style="padding-top: 50px"></div>
+    </div>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+    <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
+document.querySelectorAll('pre').forEach(function (codeBlock) { // add a "Copy" button to every <pre> element on the page
+    var button = document.createElement('button');
+    button.className = 'copy-code-button';
+    button.type = 'button';
+    var s = codeBlock.innerText; // read before the button is appended, so the button label is not part of the copied text
+    button.setAttribute('data-clipboard-text',s); // attribute that clipboard.js reads to obtain the text to copy
+    button.innerText = 'Copy';
+    // var pre = codeBlock.parentNode;
+    codeBlock.classList.add('prettyprint'); // class matched by the pre[class*="prettyprint"] style rule (position: relative) so the absolutely positioned button sits inside the block
+    // pre.parentNode.insertBefore(button, pre);
+    codeBlock.appendChild(button);
+});
+var clipboard = new ClipboardJS('.copy-code-button'); // bind clipboard.js to all buttons created above
+clipboard.on('success', function(e) {
+  console.info('Action:', e.action);
+  console.info('Text:', e.text);
+  console.info('Trigger:', e.trigger);
+  e.trigger.textContent = 'Copied'; // temporary feedback on the clicked button
+  window.setTimeout(function() {
+    e.trigger.textContent = 'Copy'; // restore the original label after 2 seconds
+  }, 2000);
+  e.clearSelection();
+});
+clipboard.on('error', function(e) {
+  console.error('Action:', e.action);
+  console.error('Trigger:', e.trigger);
+  e.trigger.textContent = 'Error Copying'; // temporary failure feedback on the clicked button
+  window.setTimeout(function() {
+    e.trigger.textContent = 'Copy'; // restore the original label after 2 seconds
+  }, 2000);
+  e.clearSelection();
+});
+</script>
+</body>
+</html>

pyserini/2cr/miracl_html_table.template ADDED Viewed

	@@ -0,0 +1,35 @@

+<div class="table-responsive">
+  <table class="table">
+    <thead>
+      <tr>
+        <th scope="col"></th>
+        <th scope="col">$desc</th>
+        <th scope="col">ar</th>
+        <th scope="col">bn</th>
+        <th scope="col">en</th>
+        <th scope="col">es</th>
+        <th scope="col">fa</th>
+        <th scope="col">fi</th>
+        <th scope="col">fr</th>
+        <th scope="col">hi</th>
+        <th scope="col">id</th>
+        <th scope="col">ja</th>
+        <th scope="col">ko</th>
+        <th scope="col">ru</th>
+        <th scope="col">sw</th>
+        <th scope="col">te</th>
+        <th scope="col">th</th>
+        <th scope="col">zh</th>
+        <th scope="col">de</th>
+        <th scope="col">yo</th>
+        <th scope="col"></th>
+        <th scope="col">avg</th>
+      </tr>
+    </thead>
+    <tbody>
+$rows
+    </tbody>
+  </table>
+</div>

pyserini/2cr/miracl_html_table_row.template ADDED Viewed

	@@ -0,0 +1,336 @@

+<!-- Condition: $model -->
+<tr class="accordion-toggle collapsed" id="table${table_cnt}-row${row_cnt}" data-toggle="collapse" data-parent="#table${table_cnt}-row${row_cnt}" href="#table${table_cnt}-collapse${row_cnt}">
+<td class="expand-button"></td>
+<td>$model</td>
+<td>$ar</td>
+<td>$bn</td>
+<td>$en</td>
+<td>$es</td>
+<td>$fa</td>
+<td>$fi</td>
+<td>$fr</td>
+<td>$hi</td>
+<td>$id</td>
+<td>$ja</td>
+<td>$ko</td>
+<td>$ru</td>
+<td>$sw</td>
+<td>$te</td>
+<td>$th</td>
+<td>$zh</td>
+<td>$de</td>
+<td>$yo</td>
+<td></td>
+<td>$avg</td>
+</tr>
+<tr class="hide-table-padding">
+<td colspan="22">
+<div id="table${table_cnt}-collapse${row_cnt}" class="collapse in p-3">
+<!-- Tabs navs -->
+<ul class="nav nav-tabs mb-3" id="table${table_cnt}-row${row_cnt}-tabs" role="tablist">
+  <li class="nav-item" role="presentation">
+    <a class="nav-link active" id="table${table_cnt}-row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab1" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">ar</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab2" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">bn</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab3" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">en</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab4-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab4" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab4" aria-selected="false" style="text-transform:none">es</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab5-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab5" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab5" aria-selected="false" style="text-transform:none">fa</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab6-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab6" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab6" aria-selected="false" style="text-transform:none">fi</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab7-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab7" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab7" aria-selected="false" style="text-transform:none">fr</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab8-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab8" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab8" aria-selected="false" style="text-transform:none">hi</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab9-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab9" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab9" aria-selected="false" style="text-transform:none">id</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab10-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab10" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab10" aria-selected="false" style="text-transform:none">ja</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab11-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab11" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab11" aria-selected="false" style="text-transform:none">ko</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab12-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab12" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab12" aria-selected="false" style="text-transform:none">ru</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab13-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab13" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab13" aria-selected="false" style="text-transform:none">sw</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab14-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab14" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab14" aria-selected="false" style="text-transform:none">te</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab15-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab15" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab15" aria-selected="false" style="text-transform:none">th</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab16-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab16" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab16" aria-selected="false" style="text-transform:none">zh</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab17-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab17" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab17" aria-selected="false" style="text-transform:none">de</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab18-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab18" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab18" aria-selected="false" style="text-transform:none">yo</a>
+  </li>
+</ul>
+<!-- Tabs navs -->
+<!-- Tabs content -->
+<div class="tab-content" id="table${table_cnt}-row${row_cnt}-content">
+  <div class="tab-pane fade show active" id="table${table_cnt}-row${row_cnt}-tab1" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab1-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd1
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd1}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab2" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab2-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd2
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd2}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab3" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab3-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd3
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd3}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab4" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab4-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd4
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd4}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab5" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab5-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd5
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd5}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab6" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab6-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd6
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd6}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab7" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab7-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd7
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd7}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab8" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab8-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd8
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd8}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab9" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab9-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd9
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd9}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab10" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab10-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd10
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd10}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab11" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab11-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd11
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd11}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab12" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab12-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd12
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd12}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab13" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab13-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd13
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd13}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab14" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab14-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd14
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd14}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab15" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab15-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd15
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd15}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab16" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab16-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd16
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd16}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab17" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab17-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd17
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd17}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab18" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab18-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd18
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd18}</code></pre>
+  </blockquote>
+  </div>
+</div>
+<!-- Tabs content -->
+</div></td>
+</tr>

pyserini/2cr/mrtydi.py ADDED Viewed

	@@ -0,0 +1,330 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from collections import defaultdict
+from string import Template
+import argparse
+import math
+import os
+import pkg_resources
+import sys
+import time
+import yaml
+from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str
# Languages covered by this script, as [two-letter code, language name] pairs.
# The two-letter code is what gets appended to a model name to form a
# condition key such as 'bm25.ar'.
languages = [
    ['ar', 'arabic'], ['bn', 'bengali'], ['en', 'english'], ['fi', 'finnish'],
    ['id', 'indonesian'], ['ja', 'japanese'], ['ko', 'korean'], ['ru', 'russian'],
    ['sw', 'swahili'], ['te', 'telugu'], ['th', 'thai'],
]

# Model identifiers, in the order their rows are printed and rendered.
models = [
    'bm25',
    'mdpr-split-pft-nq',
    'mdpr-tied-pft-nq',
    'mdpr-tied-pft-msmarco',
    'mdpr-tied-pft-msmarco-ft-all',
]

# Human-readable label shown in the HTML report for each model identifier.
html_display = {
    'bm25':
        'BM25',
    'mdpr-split-pft-nq':
        'mDPR (split encoders), pre-FT w/ NQ',
    'mdpr-tied-pft-nq':
        'mDPR (tied encoders), pre-FT w/ NQ',
    'mdpr-tied-pft-msmarco':
        'mDPR (tied encoders), pre-FT w/ MS MARCO',
    'mdpr-tied-pft-msmarco-ft-all':
        'mDPR (tied encoders), pre-FT w/ MS MARCO, FT w/ all',
}

# Command-line options passed to pyserini.eval.trec_eval for each metric name.
trec_eval_metric_definitions = {
    'MRR@100': '-c -M 100 -m recip_rank',
    'R@100': '-c -m recall.100',
}
def format_run_command(raw):
    """Reflow a one-line run command into a multi-line shell command.

    A backslash-newline continuation followed by a two-space indent is
    inserted in front of each recognised option so that every option starts
    on its own line.

    Args:
        raw: The command as a single-line string.

    Returns:
        The command with line continuations inserted.
    """
    continuation = '\\\n  '
    # Options that get a line break placed in front of them. The trailing
    # space on some entries is part of the text that is matched.
    break_before = ('--lang', '--encoder', '--topics', '--index', '--output ', '--batch ')

    formatted = raw
    for option in break_before:
        formatted = formatted.replace(option, continuation + option)

    # This literal is the one case where the break is appended after the
    # match instead of being placed before it.
    return formatted.replace('--threads 12', '--threads 12 \\\n ')
def format_eval_command(raw):
    """Reflow a one-line evaluation command into a multi-line shell command.

    A backslash-newline continuation followed by a two-space indent is
    inserted in front of every ``-c `` and in front of the final
    whitespace-separated token of the command.

    Only the last occurrence of the final token is moved onto its own line,
    so a token that also appears earlier in the command (for example as part
    of a module path) is left untouched there.

    Args:
        raw: The command as a single-line string.

    Returns:
        The command with line continuations inserted. An empty or
        whitespace-only command is returned unchanged.
    """
    tokens = raw.split()
    if not tokens:
        # Nothing to reflow; avoids indexing into an empty token list.
        return raw

    last_token = tokens[-1]
    formatted = raw.replace('-c ', '\\\n  -c ')

    # Split at the right-most occurrence only, so earlier occurrences of the
    # same text are not rewritten.
    head, found, tail = formatted.rpartition(last_token)
    if not found:
        return formatted
    return f'{head}\\\n  {last_token}{tail}'
def read_file(f):
    """Return the entire contents of a text file.

    Args:
        f: Path of the file to read.

    Returns:
        The file contents as a single string.
    """
    # The context manager closes the handle even if read() raises.
    with open(f, 'r') as fin:
        return fin.read()
def list_conditions():
    """Print the available model conditions and language codes to stdout."""
    sections = (
        ('Conditions:\n-----------', models),
        ('\nLanguages\n---------', [entry[0] for entry in languages]),
    )
    for heading, items in sections:
        print(heading)
        for item in items:
            print(item)
def print_results(table, metric, split):
    """Print a model-by-language grid of scores for one metric and split.

    Args:
        table: Nested mapping indexed as table['<model>.<lang>'][split][metric].
        metric: Name of the metric to show.
        split: Name of the split to show.
    """
    codes = [entry[0] for entry in languages]

    print(f'Metric = {metric}, Split = {split}')

    # Header line: left padding followed by one fixed-width cell per language.
    print(' ' * 32 + ''.join(f'{code:3}    ' for code in codes))

    # One line per model: the model name followed by one score per language.
    for model in models:
        cells = []
        for code in codes:
            key = f'{model}.{code}'
            cells.append(f'{table[key][split][metric]:7.3f}')
        print(f'{model:30}' + ''.join(cells))

    print('')
def generate_table_rows(table, row_template, commands, eval_commands, table_id, split, metric):
    """Render one HTML table row per model for the given split and metric.

    The row template is filled with these placeholders:
      * ``table_cnt`` / ``row_cnt``: the table id and the 1-based row number;
      * ``model``: the display label of the model;
      * one placeholder per language code holding the score to three decimals;
      * ``avg``: the mean score over all languages, to three decimals;
      * ``cmd<i>`` / ``eval_cmd<i>``: the run and evaluation commands of the
        i-th language (1-based, in the order of ``languages``).

    Args:
        table: Nested mapping indexed as table['<model>.<lang>'][split][metric].
        row_template: ``string.Template`` source text for a single row.
        commands: Mapping from '<model>.<lang>' to the run command.
        eval_commands: Mapping indexed as eval_commands['<model>.<lang>'][metric].
        table_id: Identifier substituted for ``table_cnt``.
        split: Split whose scores are rendered.
        metric: Metric whose scores are rendered.

    Returns:
        A list of rendered row strings, one per entry of ``models``.
    """
    template = Template(row_template)
    html_rows = []
    for row_cnt, model in enumerate(models, start=1):
        fields = {
            'table_cnt': table_id,
            'row_cnt': row_cnt,
            'model': html_display[model],
        }

        # Accumulate in language order with plain addition so the average is
        # numerically the same as summing the scores left to right.
        total = 0
        for position, lang in enumerate(languages, start=1):
            code = lang[0]
            key = f'{model}.{code}'
            score = table[key][split][metric]
            total += score
            fields[code] = f'{score:.3f}'
            fields[f'cmd{position}'] = f'{commands[key]}'
            fields[f'eval_cmd{position}'] = f'{eval_commands[key][metric]}'

        # Divide by the actual number of languages, not a hard-coded count.
        fields['avg'] = f'{total / len(languages):.3f}'

        html_rows.append(template.substitute(fields))
    return html_rows
def generate_report(args):
    """Write an HTML report of the expected test-split scores.

    Expected scores and command templates are read from the packaged
    'mrtydi.yaml'; no retrieval or evaluation is run. Two tables are produced
    (MRR@100 and R@100 on the test split) and written to ``args.output``.

    Args:
        args: Parsed command-line arguments. ``args.directory`` is used to
            build run-file paths and ``args.output`` is the report path.
    """
    def packaged(filename):
        # Resolve a resource shipped next to this module.
        return pkg_resources.resource_filename(__name__, filename)

    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
    commands = defaultdict(lambda: '')
    eval_commands = defaultdict(lambda: defaultdict(lambda: ''))

    html_template = read_file(packaged('mrtydi_html.template'))
    table_template = read_file(packaged('mrtydi_html_table.template'))
    row_template = read_file(packaged('mrtydi_html_table_row.template'))

    with open(packaged('mrtydi.yaml')) as f:
        yaml_data = yaml.safe_load(f)

        for condition in yaml_data['conditions']:
            name = condition['name']
            eval_key = condition['eval_key']
            run_template = Template(condition['command'])

            for split_entry in condition['splits']:
                split = split_entry['split']
                runfile = os.path.join(args.directory, f'run.mrtydi.{name}.{split}.txt')

                # Commands are keyed by condition name only, so the entry for
                # a later split overwrites the one for an earlier split.
                commands[name] = format_run_command(
                    run_template.substitute(split=split, output=runfile))

                for expected in split_entry['scores']:
                    for metric, value in expected.items():
                        table[name][split][metric] = value
                        eval_cmd = (f'python -m pyserini.eval.trec_eval '
                                    f'{trec_eval_metric_definitions[metric]} {eval_key}-{split} {runfile}')
                        eval_commands[name][metric] = format_eval_command(eval_cmd)

        # (table id, metric, description) for each table of the report.
        table_specs = [
            (1, 'MRR@100', 'MRR@100, test queries'),
            (2, 'R@100', 'Recall@100, test queries'),
        ]
        tables_html = []
        for table_id, metric, desc in table_specs:
            rows = generate_table_rows(table, row_template, commands, eval_commands,
                                       table_id, 'test', metric)
            tables_html.append(
                Template(table_template).substitute(desc=desc, rows='\n'.join(rows)))

    with open(args.output, 'w') as out:
        out.write(Template(html_template).substitute(title='Mr.TyDi', tables=' '.join(tables_html)))
def run_conditions(args):
    """Run the selected conditions, check their scores, and print summaries.

    Conditions are read from the packaged 'mrtydi.yaml'. Unless ``args.all``
    is set, only conditions whose model part equals ``args.condition`` (and,
    when ``args.language`` is given, whose language part equals it) are
    processed. For each split the run command is executed if its run file
    does not exist yet and ``args.dry_run`` is not set. Unless
    ``args.skip_eval`` is set, each metric is obtained through
    ``run_eval_and_return_metric`` and compared with the expected score;
    otherwise the expected score is recorded as-is.

    Args:
        args: Parsed command-line arguments (all, condition, language,
            directory, dry_run, skip_eval, display_commands).
    """
    start = time.time()

    # Known flaky (condition, split) pairs and the absolute tolerance within
    # which a score is still reported as "okish" rather than a failure.
    flaky_tolerances = {
        ('mdpr-tied-pft-nq.te', 'dev'): 2e-4,                # small difference on orca
        ('mdpr-tied-pft-msmarco-ft-all.ko', 'train'): 4e-4,  # small difference on orca
        ('mdpr-tied-pft-msmarco.th', 'train'): 3e-4,         # small difference on Mac Studio (M1)
    }

    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))

    with open(pkg_resources.resource_filename(__name__, 'mrtydi.yaml')) as f:
        yaml_data = yaml.safe_load(f)

        for condition in yaml_data['conditions']:
            name = condition['name']
            name_parts = name.split('.')
            encoder, lang = name_parts[0], name_parts[-1]

            # Condition/language filtering only applies when --all is absent.
            if not args.all:
                if args.condition != encoder:
                    continue
                if args.language and args.language != lang:
                    continue

            eval_key = condition['eval_key']
            cmd_template = condition['command']

            print(f'condition {name}:')

            for split_entry in condition['splits']:
                split = split_entry['split']
                print(f'  - split: {split}')

                runfile = os.path.join(args.directory, f'run.mrtydi.{name}.{split}.txt')
                cmd = Template(cmd_template).substitute(split=split, output=runfile)

                if args.display_commands:
                    print(f'\n```bash\n{format_run_command(cmd)}\n```\n')

                # An existing run file is reused instead of being regenerated.
                if not os.path.exists(runfile) and not args.dry_run:
                    os.system(cmd)

                for expected in split_entry['scores']:
                    for metric, expected_score in expected.items():
                        if args.skip_eval:
                            table[name][split][metric] = expected_score
                            continue

                        score = float(run_eval_and_return_metric(metric, f'{eval_key}-{split}',
                                                                 trec_eval_metric_definitions[metric], runfile))
                        target = float(expected_score)
                        tolerance = flaky_tolerances.get((name, split))

                        if math.isclose(score, target):
                            result_str = ok_str
                        elif tolerance is not None and math.isclose(score, target, abs_tol=tolerance):
                            result_str = okish_str
                        else:
                            result_str = fail_str + f' expected {expected_score:.4f}'

                        print(f'      {metric:7}: {score:.4f} {result_str}')
                        table[name][split][metric] = score

            print('')

    for metric in ['MRR@100', 'R@100']:
        for split in ['test', 'dev', 'train']:
            print_results(table, metric, split)

    end = time.time()
    print(f'Total elapsed time: {end - start:.0f}s')
if __name__ == '__main__':
    # Command-line entry point. Exactly one of three modes runs:
    #   --list-conditions  print the known conditions and languages, then exit;
    #   --generate-report  write the HTML report to --output, then exit;
    #   otherwise          run the selected conditions.
    parser = argparse.ArgumentParser(description='Generate regression matrix for Mr.TyDi.')
    parser.add_argument('--condition', type=str,
                        help='Condition to run', required=False)
    # To list all conditions
    parser.add_argument('--list-conditions', action='store_true', default=False, help='List available conditions.')
    # For generating reports
    parser.add_argument('--generate-report', action='store_true', default=False, help='Generate report.')
    parser.add_argument('--output', type=str, help='File to store report.', required=False)
    # For actually running the experimental conditions
    parser.add_argument('--all', action='store_true', default=False, help='Run using all languages.')
    parser.add_argument('--language', type=str, help='Language to run.', required=False)
    parser.add_argument('--directory', type=str, help='Base directory.', default='', required=False)
    parser.add_argument('--dry-run', action='store_true', default=False, help='Print out commands but do not execute.')
    parser.add_argument('--skip-eval', action='store_true', default=False, help='Skip running trec_eval.')
    parser.add_argument('--display-commands', action='store_true', default=False, help='Display command.')
    args = parser.parse_args()

    if args.list_conditions:
        list_conditions()
        sys.exit()

    if args.generate_report:
        if not args.output:
            print('Must specify report filename with --output.')
            sys.exit()
        generate_report(args)
        sys.exit()

    # --all is mutually exclusive with selecting a single condition/language.
    if args.all and (args.condition or args.language):
        print('Specifying --all will run all conditions and languages')
        sys.exit()

    run_conditions(args)

pyserini/2cr/mrtydi.yaml ADDED Viewed

	@@ -0,0 +1,890 @@

+conditions:
+  # mDPR, tied encoders, pFT w/ MS MARCO, FT all
+  - name: mdpr-tied-pft-msmarco-ft-all.ar
+    eval_key: mrtydi-v1.1-arabic
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.9505
+            R@100: 1.0000
+      - split: dev
+        scores:
+          - MRR@100: 0.6913
+            R@100: 0.9165
+      - split: test
+        scores:
+          - MRR@100: 0.6949
+            R@100: 0.9004
+  - name: mdpr-tied-pft-msmarco-ft-all.bn
+    eval_key: mrtydi-v1.1-bengali
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.9620
+            R@100: 1.0000
+      - split: dev
+        scores:
+          - MRR@100: 0.5897
+            R@100: 0.8977
+      - split: test
+        scores:
+          - MRR@100: 0.6228
+            R@100: 0.9550
+  - name: mdpr-tied-pft-msmarco-ft-all.en
+    eval_key: mrtydi-v1.1-english
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.8278
+            R@100: 1.0000
+      - split: dev
+        scores:
+          - MRR@100: 0.5357
+            R@100: 0.8884
+      - split: test
+        scores:
+          - MRR@100: 0.4916
+            R@100: 0.8414
+  - name: mdpr-tied-pft-msmarco-ft-all.fi
+    eval_key: mrtydi-v1.1-finnish
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.9577
+            R@100: 0.9997
+      - split: dev
+        scores:
+          - MRR@100: 0.6626
+            R@100: 0.9171
+      - split: test
+        scores:
+          - MRR@100: 0.5595
+            R@100: 0.8563
+  - name: mdpr-tied-pft-msmarco-ft-all.id
+    eval_key: mrtydi-v1.1-indonesian
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.9469
+            R@100: 1.0000
+      - split: dev
+        scores:
+          - MRR@100: 0.6294
+            R@100: 0.9150
+      - split: test
+        scores:
+          - MRR@100: 0.5783
+            R@100: 0.8609
+  - name: mdpr-tied-pft-msmarco-ft-all.ja
+    eval_key: mrtydi-v1.1-japanese
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.8802
+            R@100: 0.9997
+      - split: dev
+        scores:
+          - MRR@100: 0.5505
+            R@100: 0.8696
+      - split: test
+        scores:
+          - MRR@100: 0.5007
+            R@100: 0.8130
+  - name: mdpr-tied-pft-msmarco-ft-all.ko
+    eval_key: mrtydi-v1.1-korean
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.9195
+            R@100: 1.0000
+      - split: dev
+        scores:
+          - MRR@100: 0.5645
+            R@100: 0.8663
+      - split: test
+        scores:
+          - MRR@100: 0.4861
+            R@100: 0.7854
+  - name: mdpr-tied-pft-msmarco-ft-all.ru
+    eval_key: mrtydi-v1.1-russian
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.8473
+            R@100: 0.9994
+      - split: dev
+        scores:
+          - MRR@100: 0.5104
+            R@100: 0.8720
+      - split: test
+        scores:
+          - MRR@100: 0.5161
+            R@100: 0.8432
+  - name: mdpr-tied-pft-msmarco-ft-all.sw
+    eval_key: mrtydi-v1.1-swahili
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.9515
+            R@100: 1.0000
+      - split: dev
+        scores:
+          - MRR@100: 0.6404
+            R@100: 0.9018
+      - split: test
+        scores:
+          - MRR@100: 0.6438
+            R@100: 0.8756
+  - name: mdpr-tied-pft-msmarco-ft-all.te
+    eval_key: mrtydi-v1.1-telugu
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.9679
+            R@100: 1.0000
+      - split: dev
+        scores:
+          - MRR@100: 0.7962
+            R@100: 0.9593
+      - split: test
+        scores:
+          - MRR@100: 0.8908
+            R@100: 0.9659
+  - name: mdpr-tied-pft-msmarco-ft-all.th
+    eval_key: mrtydi-v1.1-thai
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.9504
+            R@100: 1.0000
+      - split: dev
+        scores:
+          - MRR@100: 0.6670
+            R@100: 0.9114
+      - split: test
+        scores:
+          - MRR@100: 0.6175
+            R@100: 0.8826
+  # mDPR, tied encoders, pFT w/ MS MARCO
+  - name: mdpr-tied-pft-msmarco.ar
+    eval_key: mrtydi-v1.1-arabic
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-msmarco --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.3957
+            R@100: 0.7818
+      - split: dev
+        scores:
+          - MRR@100: 0.3978
+            R@100: 0.7778
+      - split: test
+        scores:
+          - MRR@100: 0.4414
+            R@100: 0.7971
+  - name: mdpr-tied-pft-msmarco.bn
+    eval_key: mrtydi-v1.1-bengali
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-msmarco --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2920
+            R@100: 0.7323
+      - split: dev
+        scores:
+          - MRR@100: 0.2993
+            R@100: 0.7318
+      - split: test
+        scores:
+          - MRR@100: 0.3969
+            R@100: 0.7838
+  - name: mdpr-tied-pft-msmarco.en
+    eval_key: mrtydi-v1.1-english
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-msmarco --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.3374
+            R@100: 0.8111
+      - split: dev
+        scores:
+          - MRR@100: 0.3451
+            R@100: 0.7995
+      - split: test
+        scores:
+          - MRR@100: 0.3270
+            R@100: 0.7536
+  - name: mdpr-tied-pft-msmarco.fi
+    eval_key: mrtydi-v1.1-finnish
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-msmarco --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.3668
+            R@100: 0.7337
+      - split: dev
+        scores:
+          - MRR@100: 0.3636
+            R@100: 0.7371
+      - split: test
+        scores:
+          - MRR@100: 0.2750
+            R@100: 0.6471
+  - name: mdpr-tied-pft-msmarco.id
+    eval_key: mrtydi-v1.1-indonesian
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-msmarco --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2794
+            R@100: 0.7044
+      - split: dev
+        scores:
+          - MRR@100: 0.2853
+            R@100: 0.7198
+      - split: test
+        scores:
+          - MRR@100: 0.3520
+            R@100: 0.7356
+  - name: mdpr-tied-pft-msmarco.ja
+    eval_key: mrtydi-v1.1-japanese
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-msmarco --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.3089
+            R@100: 0.7603
+      - split: dev
+        scores:
+          - MRR@100: 0.3108
+            R@100: 0.7597
+      - split: test
+        scores:
+          - MRR@100: 0.3107
+            R@100: 0.7317
+  - name: mdpr-tied-pft-msmarco.ko
+    eval_key: mrtydi-v1.1-korean
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-msmarco --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.3003
+            R@100: 0.6907
+      - split: dev
+        scores:
+          - MRR@100: 0.3017
+            R@100: 0.7046
+      - split: test
+        scores:
+          - MRR@100: 0.2820
+            R@100: 0.6172
+  - name: mdpr-tied-pft-msmarco.ru
+    eval_key: mrtydi-v1.1-russian
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-msmarco --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2856
+            R@100: 0.7305
+      - split: dev
+        scores:
+          - MRR@100: 0.2943
+            R@100: 0.7404
+      - split: test
+        scores:
+          - MRR@100: 0.3561
+            R@100: 0.7432
+  - name: mdpr-tied-pft-msmarco.sw
+    eval_key: mrtydi-v1.1-swahili
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-msmarco --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2491
+            R@100: 0.5195
+      - split: dev
+        scores:
+          - MRR@100: 0.2447
+            R@100: 0.5266
+      - split: test
+        scores:
+          - MRR@100: 0.3418
+            R@100: 0.6343
+  - name: mdpr-tied-pft-msmarco.te
+    eval_key: mrtydi-v1.1-telugu
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-msmarco --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.3059
+            R@100: 0.7510
+      - split: dev
+        scores:
+          - MRR@100: 0.2995
+            R@100: 0.7355
+      - split: test
+        scores:
+          - MRR@100: 0.3102
+            R@100: 0.7817
+  - name: mdpr-tied-pft-msmarco.th
+    eval_key: mrtydi-v1.1-thai
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-msmarco --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2334
+            R@100: 0.5851
+      - split: dev
+        scores:
+          - MRR@100: 0.2407
+            R@100: 0.5795
+      - split: test
+        scores:
+          - MRR@100: 0.2693
+            R@100: 0.5945
+  # mDPR, tied encoders, pFT w/ NQ
+  - name: mdpr-tied-pft-nq.ar
+    eval_key: mrtydi-v1.1-arabic
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2087
+            R@100: 0.5854
+      - split: dev
+        scores:
+          - MRR@100: 0.2132
+            R@100: 0.5868
+      - split: test
+        scores:
+          - MRR@100: 0.2214
+            R@100: 0.6001
+  - name: mdpr-tied-pft-nq.bn
+    eval_key: mrtydi-v1.1-bengali
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2371
+            R@100: 0.6281
+      - split: dev
+        scores:
+          - MRR@100: 0.2414
+            R@100: 0.6409
+      - split: test
+        scores:
+          - MRR@100: 0.2535
+            R@100: 0.7072
+  - name: mdpr-tied-pft-nq.en
+    eval_key: mrtydi-v1.1-english
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2441
+            R@100: 0.7217
+      - split: dev
+        scores:
+          - MRR@100: 0.2359
+            R@100: 0.7187
+      - split: test
+        scores:
+          - MRR@100: 0.2433
+            R@100: 0.6893
+  - name: mdpr-tied-pft-nq.fi
+    eval_key: mrtydi-v1.1-finnish
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2996
+            R@100: 0.6787
+      - split: dev
+        scores:
+          - MRR@100: 0.3252
+            R@100: 0.7037
+      - split: test
+        scores:
+          - MRR@100: 0.2444
+            R@100: 0.6401
+  - name: mdpr-tied-pft-nq.id
+    eval_key: mrtydi-v1.1-indonesian
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2706
+            R@100: 0.7322
+      - split: dev
+        scores:
+          - MRR@100: 0.2719
+            R@100: 0.7394
+      - split: test
+        scores:
+          - MRR@100: 0.2815
+            R@100: 0.6914
+  - name: mdpr-tied-pft-nq.ja
+    eval_key: mrtydi-v1.1-japanese
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2165
+            R@100: 0.6043
+      - split: dev
+        scores:
+          - MRR@100: 0.2299
+            R@100: 0.6239
+      - split: test
+        scores:
+          - MRR@100: 0.2058
+            R@100: 0.5734
+  - name: mdpr-tied-pft-nq.ko
+    eval_key: mrtydi-v1.1-korean
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2527
+            R@100: 0.6556
+      - split: dev
+        scores:
+          - MRR@100: 0.2680
+            R@100: 0.6271
+      - split: test
+        scores:
+          - MRR@100: 0.2234
+            R@100: 0.5499
+  - name: mdpr-tied-pft-nq.ru
+    eval_key: mrtydi-v1.1-russian
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2160
+            R@100: 0.6262
+      - split: dev
+        scores:
+          - MRR@100: 0.2263
+            R@100: 0.6444
+      - split: test
+        scores:
+          - MRR@100: 0.2501
+            R@100: 0.6181
+  - name: mdpr-tied-pft-nq.sw
+    eval_key: mrtydi-v1.1-swahili
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2383
+            R@100: 0.5707
+      - split: dev
+        scores:
+          - MRR@100: 0.2543
+            R@100: 0.6138
+      - split: test
+        scores:
+          - MRR@100: 0.2621
+            R@100: 0.5965
+  - name: mdpr-tied-pft-nq.te
+    eval_key: mrtydi-v1.1-telugu
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.1483
+            R@100: 0.4162
+      - split: dev
+        scores:
+          - MRR@100: 0.1494
+            R@100: 0.3967
+      - split: test
+        scores:
+          - MRR@100: 0.0970
+            R@100: 0.2454
+  - name: mdpr-tied-pft-nq.th
+    eval_key: mrtydi-v1.1-thai
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.1426
+            R@100: 0.4717
+      - split: dev
+        scores:
+          - MRR@100: 0.1618
+            R@100: 0.4637
+      - split: test
+        scores:
+          - MRR@100: 0.1575
+            R@100: 0.4550
+  # mDPR, split encoders, pFT w/ NQ
+  - name: mdpr-split-pft-nq.ar
+    eval_key: mrtydi-v1.1-arabic
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2510
+            R@100: 0.6384
+      - split: dev
+        scores:
+          - MRR@100: 0.2449
+            R@100: 0.6334
+      - split: test
+        scores:
+          - MRR@100: 0.2907
+            R@100: 0.6502
+  - name: mdpr-split-pft-nq.bn
+    eval_key: mrtydi-v1.1-bengali
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2293
+            R@100: 0.6454
+      - split: dev
+        scores:
+          - MRR@100: 0.2367
+            R@100: 0.6511
+      - split: test
+        scores:
+          - MRR@100: 0.2911
+            R@100: 0.7793
+  - name: mdpr-split-pft-nq.en
+    eval_key: mrtydi-v1.1-english
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2862
+            R@100: 0.7372
+      - split: dev
+        scores:
+          - MRR@100: 0.2821
+            R@100: 0.7437
+      - split: test
+        scores:
+          - MRR@100: 0.2907
+            R@100: 0.6779
+  - name: mdpr-split-pft-nq.fi
+    eval_key: mrtydi-v1.1-finnish
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2473
+            R@100: 0.6289
+      - split: dev
+        scores:
+          - MRR@100: 0.2466
+            R@100: 0.6283
+      - split: test
+        scores:
+          - MRR@100: 0.2050
+            R@100: 0.5680
+  - name: mdpr-split-pft-nq.id
+    eval_key: mrtydi-v1.1-indonesian
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2351
+            R@100: 0.6952
+      - split: dev
+        scores:
+          - MRR@100: 0.2475
+            R@100: 0.7181
+      - split: test
+        scores:
+          - MRR@100: 0.2705
+            R@100: 0.6848
+  - name: mdpr-split-pft-nq.ja
+    eval_key: mrtydi-v1.1-japanese
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.1967
+            R@100: 0.5983
+      - split: dev
+        scores:
+          - MRR@100: 0.2055
+            R@100: 0.6142
+      - split: test
+        scores:
+          - MRR@100: 0.2119
+            R@100: 0.5840
+  - name: mdpr-split-pft-nq.ko
+    eval_key: mrtydi-v1.1-korean
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2383
+            R@100: 0.6180
+      - split: dev
+        scores:
+          - MRR@100: 0.2343
+            R@100: 0.6238
+      - split: test
+        scores:
+          - MRR@100: 0.2345
+            R@100: 0.5325
+  - name: mdpr-split-pft-nq.ru
+    eval_key: mrtydi-v1.1-russian
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2317
+            R@100: 0.6534
+      - split: dev
+        scores:
+          - MRR@100: 0.2490
+            R@100: 0.6553
+      - split: test
+        scores:
+          - MRR@100: 0.2820
+            R@100: 0.6474
+  - name: mdpr-split-pft-nq.sw
+    eval_key: mrtydi-v1.1-swahili
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.1457
+            R@100: 0.4481
+      - split: dev
+        scores:
+          - MRR@100: 0.1547
+            R@100: 0.4724
+      - split: test
+        scores:
+          - MRR@100: 0.1883
+            R@100: 0.5281
+  - name: mdpr-split-pft-nq.te
+    eval_key: mrtydi-v1.1-telugu
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.1489
+            R@100: 0.4905
+      - split: dev
+        scores:
+          - MRR@100: 0.1503
+            R@100: 0.4934
+      - split: test
+        scores:
+          - MRR@100: 0.1099
+            R@100: 0.3661
+  - name: mdpr-split-pft-nq.th
+    eval_key: mrtydi-v1.1-thai
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-nq --output $output --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.1603
+            R@100: 0.4983
+      - split: dev
+        scores:
+          - MRR@100: 0.1584
+            R@100: 0.5083
+      - split: test
+        scores:
+          - MRR@100: 0.1709
+            R@100: 0.5146
+  # BM25
+  - name: bm25.ar
+    eval_key: mrtydi-v1.1-arabic
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ar --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic --output $output --bm25 --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.3356
+            R@100: 0.7944
+      - split: dev
+        scores:
+          - MRR@100: 0.3462
+            R@100: 0.7872
+      - split: test
+        scores:
+          - MRR@100: 0.3682
+            R@100: 0.7928
+  - name: bm25.bn
+    eval_key: mrtydi-v1.1-bengali
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language bn --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali --output $output --bm25 --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.3566
+            R@100: 0.8336
+      - split: dev
+        scores:
+          - MRR@100: 0.3385
+            R@100: 0.8432
+      - split: test
+        scores:
+          - MRR@100: 0.4182
+            R@100: 0.8694
+  - name: bm25.en
+    eval_key: mrtydi-v1.1-english
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language en --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english --output $output --bm25 --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.1592
+            R@100: 0.5785
+      - split: dev
+        scores:
+          - MRR@100: 0.1685
+            R@100: 0.6196
+      - split: test
+        scores:
+          - MRR@100: 0.1404
+            R@100: 0.5365
+  - name: bm25.fi
+    eval_key: mrtydi-v1.1-finnish
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language fi --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish --output $output --bm25 --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.4101
+            R@100: 0.8198
+      - split: dev
+        scores:
+          - MRR@100: 0.4136
+            R@100: 0.8285
+      - split: test
+        scores:
+          - MRR@100: 0.2836
+            R@100: 0.7196
+  - name: bm25.id
+    eval_key: mrtydi-v1.1-indonesian
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language id --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian --output $output --bm25 --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2972
+            R@100: 0.7948
+      - split: dev
+        scores:
+          - MRR@100: 0.2937
+            R@100: 0.7827
+      - split: test
+        scores:
+          - MRR@100: 0.3762
+            R@100: 0.8426
+  - name: bm25.ja
+    eval_key: mrtydi-v1.1-japanese
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ja --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese --output $output --bm25 --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2262
+            R@100: 0.7290
+      - split: dev
+        scores:
+          - MRR@100: 0.2250
+            R@100: 0.7252
+      - split: test
+        scores:
+          - MRR@100: 0.2125
+            R@100: 0.6431
+  - name: bm25.ko
+    eval_key: mrtydi-v1.1-korean
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ko --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean --output $output --bm25 --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2596
+            R@100: 0.6178
+      - split: dev
+        scores:
+          - MRR@100: 0.2888
+            R@100: 0.6733
+      - split: test
+        scores:
+          - MRR@100: 0.2848
+            R@100: 0.6188
+  - name: bm25.ru
+    eval_key: mrtydi-v1.1-russian
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ru --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian --output $output --bm25 --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2229
+            R@100: 0.5779
+      - split: dev
+        scores:
+          - MRR@100: 0.2202
+            R@100: 0.5760
+      - split: test
+        scores:
+          - MRR@100: 0.3163
+            R@100: 0.6541
+  - name: bm25.sw
+    eval_key: mrtydi-v1.1-swahili
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language sw --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili --output $output --bm25 --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.2610
+            R@100: 0.5903
+      - split: dev
+        scores:
+          - MRR@100: 0.2693
+            R@100: 0.5789
+      - split: test
+        scores:
+          - MRR@100: 0.3893
+            R@100: 0.7642
+  - name: bm25.te
+    eval_key: mrtydi-v1.1-telugu
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language te --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu --output $output --bm25 --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.4204
+            R@100: 0.8229
+      - split: dev
+        scores:
+          - MRR@100: 0.4269
+            R@100: 0.8362
+      - split: test
+        scores:
+          - MRR@100: 0.5283
+            R@100: 0.8971
+  - name: bm25.th
+    eval_key: mrtydi-v1.1-thai
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language th --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai --output $output --bm25 --hits 100
+    splits:
+      - split: train
+        scores:
+          - MRR@100: 0.3543
+            R@100: 0.8349
+      - split: dev
+        scores:
+          - MRR@100: 0.3586
+            R@100: 0.8536
+      - split: test
+        scores:
+          - MRR@100: 0.4012
+            R@100: 0.8529

pyserini/2cr/mrtydi_html.template ADDED Viewed

	@@ -0,0 +1,256 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+    <meta http-equiv="x-ua-compatible" content="ie=edge" />
+    <title>Pyserini Reproductions</title>
+    <!-- Font Awesome -->
+    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+    <!-- Google Fonts Roboto -->
+    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+    <!-- MDB -->
+   <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+    <style>
+tr.hide-table-padding td {
+  padding: 0;
+}
+.expand-button {
+  position: relative;
+}
+.accordion-toggle .expand-button:after {
+  position: absolute;
+  left:.75rem;
+  top: 50%;
+  transform: translate(0, -50%);
+  content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+  content: '+';
+}
+blockquote.mycode {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  margin-top: 15px;
+  padding-left: 15px;
+}
+blockquote.mycode2 {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  padding-top: 10px;
+  padding-bottom: 10px;
+  padding-left: 15px;
+}
+tr th.headertop {
+  border-bottom: none;
+  padding-bottom: 0rem
+}
+tr th.headerbottom {
+  padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+	border-radius: 0;
+	min-width: 55px;
+	background: none repeat scroll 0 0 transparent;
+	background-color: grey;
+	color: #F1F2F3 !important;
+	cursor: pointer;
+	border-style: none;
+	font-family: 'HELVETICA',sans-serif;
+	font-size: 0.8em;
+	font-weight: normal;
+	text-align: center;
+	text-decoration: none;
+	text-indent: 0;
+	text-transform: uppercase;
+	font-weight: 500;
+	line-height: 1.42rem;
+	margin: 0;
+	padding: 3px 8px;
+	position: absolute !important;
+	top: 0 !important;
+	right: 0 !important;
+}
+.copy-code-button > span {
+	color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+	box-sizing: inherit;
+}
+.copy-code-button::before {
+	content: '';
+	display: inline-block;
+	width: 16px;
+	height: 16px;
+	margin-right: 3px;
+	background-size: contain;
+	background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+	background-repeat: no-repeat;
+	position: relative;
+	top: 3px;
+}
+.copy-code-button:focus {
+    /* Avoid an ugly focus outline on click in Chrome,
+       but darken the button for accessibility.
+       See https://stackoverflow.com/a/25298082/1481479 */
+    /* background-color: #E6E6E6; */
+	outline: 0;
+}
+pre[class*="prettyprint"] {
+	position: relative;
+	overflow: hidden;
+}
+    </style>
+</head>
+<body>
+    <!-- Background image -->
+    <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+      <div class="mask" style="
+            background: linear-gradient(
+              45deg,
+              rgba(29, 236, 197, 0.7),
+              rgba(91, 14, 214, 0.7) 100%
+            );
+          ">
+        <div class="container d-flex align-items-center justify-content-center text-center h-100"  style="max-height: 150px">
+          <div class="text-white">
+            <h1 class="mb-3">$title</h1>
+          </div>
+        </div>
+      </div>
+    </div>
+    <!-- Background image -->
+    <div class="container my-4">
+    $tables
+  </ul>
+  <div style="padding-top: 20px"></div>
+  <h4>Programmatic Execution</h4>
+  <p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+  To list all the experimental conditions:</p>
+  <blockquote class="mycode2"><tt>
+  python -m pyserini.2cr.mrtydi --list-conditions
+  </tt></blockquote>
+  <p>Run all languages for a specific condition and show commands:</p>
+  <blockquote class="mycode2"><tt>
+  python -m pyserini.2cr.mrtydi --condition bm25 --display-commands
+  </tt></blockquote>
+  <p>Run a particular language for a specific condition and show commands:</p>
+  <blockquote class="mycode2"><tt>
+  python -m pyserini.2cr.mrtydi --condition bm25 --language ko --display-commands
+  </tt></blockquote>
+  <p>Run all languages for all conditions and show commands:</p>
+  <blockquote class="mycode2"><tt>
+  python -m pyserini.2cr.mrtydi --all --display-commands
+  </tt></blockquote>
+  <p>With the above commands, run files will be placed in the current directory. Use the option <tt>--directory runs</tt> to place the runs in a sub-directory.</p>
+  <p>For a specific condition, just show the commands and do not run:</p>
+  <blockquote class="mycode2"><tt>
+  python -m pyserini.2cr.mrtydi --condition bm25 --display-commands --dry-run
+  </tt></blockquote>
+  <p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+  <p>For a specific condition and language, just show the commands and do not run:</p>
+  <blockquote class="mycode2"><tt>
+  python -m pyserini.2cr.mrtydi --condition bm25 --language ko --display-commands --dry-run
+  </tt></blockquote>
+  <p>For all conditions, just show the commands, without running them or evaluating the results:</p>
+  <blockquote class="mycode2"><tt>
+  python -m pyserini.2cr.mrtydi --all --display-commands --dry-run --skip-eval
+  </tt></blockquote>
+  <p>Finally, to generate this page:</p>
+  <blockquote class="mycode2"><tt>
+  python -m pyserini.2cr.mrtydi --generate-report --output docs/2cr/mrtydi.html
+  </tt></blockquote>
+  <p>The output file <tt>mrtydi.html</tt> should be identical to this page.</p>
+  <div style="padding-top: 50px"></div>
+    </div>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+    <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
+document.querySelectorAll('pre').forEach(function (codeBlock) { // Attach a Copy button to every pre element on the page.
+    var button = document.createElement('button');
+    button.className = 'copy-code-button'; // Same class the ClipboardJS selector below is bound to.
+    button.type = 'button';
+    var s = codeBlock.innerText; // Read before the button is appended, so the button label is not part of the copied text.
+    button.setAttribute('data-clipboard-text',s); // NOTE(review): presumably the attribute ClipboardJS takes the copy text from; confirm against the clipboard.js docs.
+    button.innerText = 'Copy';
+    // var pre = codeBlock.parentNode;
+    codeBlock.classList.add('prettyprint'); // Matches the prettyprint CSS rule in this template (position: relative), which anchors the absolutely positioned button.
+    // pre.parentNode.insertBefore(button, pre);
+    codeBlock.appendChild(button);
+});
+var clipboard = new ClipboardJS('.copy-code-button'); // Bound by selector, so it covers all buttons created above.
+clipboard.on('success', function(e) { // Copy succeeded: log details and flash a confirmation on the clicked button.
+  console.info('Action:', e.action);
+  console.info('Text:', e.text);
+  console.info('Trigger:', e.trigger);
+  e.trigger.textContent = 'Copied';
+  window.setTimeout(function() { // Restore the original label after 2000 ms.
+    e.trigger.textContent = 'Copy';
+  }, 2000);
+  e.clearSelection();
+});
+clipboard.on('error', function(e) { // Copy failed: log details and show a temporary error label on the clicked button.
+  console.error('Action:', e.action);
+  console.error('Trigger:', e.trigger);
+  e.trigger.textContent = 'Error Copying';
+  window.setTimeout(function() { // Restore the original label after 2000 ms.
+    e.trigger.textContent = 'Copy';
+  }, 2000);
+  e.clearSelection();
+});
+</script>
+</body>
+</html>

pyserini/2cr/mrtydi_html_table.template ADDED Viewed

	@@ -0,0 +1,28 @@

+<div class="table-responsive">
+  <table class="table">
+    <thead>
+      <tr>
+        <th scope="col"></th>
+        <th scope="col">$desc</th>
+        <th scope="col">ar</th>
+        <th scope="col">bn</th>
+        <th scope="col">en</th>
+        <th scope="col">fi</th>
+        <th scope="col">id</th>
+        <th scope="col">ja</th>
+        <th scope="col">ko</th>
+        <th scope="col">ru</th>
+        <th scope="col">sw</th>
+        <th scope="col">te</th>
+        <th scope="col">th</th>
+        <th scope="col"></th>
+        <th scope="col">avg</th>
+      </tr>
+    </thead>
+    <tbody>
+$rows
+    </tbody>
+  </table>
+</div>

pyserini/2cr/mrtydi_html_table_row.template ADDED Viewed

	@@ -0,0 +1,212 @@

+<!-- Condition: $model -->
+<tr class="accordion-toggle collapsed" id="table${table_cnt}-row${row_cnt}" data-toggle="collapse" data-parent="#table${table_cnt}-row${row_cnt}" href="#table${table_cnt}-collapse${row_cnt}">
+<td class="expand-button"></td>
+<td>$model</td>
+<td>$ar</td>
+<td>$bn</td>
+<td>$en</td>
+<td>$fi</td>
+<td>$id</td>
+<td>$ja</td>
+<td>$ko</td>
+<td>$ru</td>
+<td>$sw</td>
+<td>$te</td>
+<td>$th</td>
+<td></td>
+<td>$avg</td>
+</tr>
+<tr class="hide-table-padding">
+<td></td>
+<td></td>
+<td colspan="13" style="max-width: 600px">
+<div id="table${table_cnt}-collapse${row_cnt}" class="collapse in p-3">
+<!-- Tabs navs -->
+<ul class="nav nav-tabs mb-3" id="table${table_cnt}-row${row_cnt}-tabs" role="tablist">
+  <li class="nav-item" role="presentation">
+    <a class="nav-link active" id="table${table_cnt}-row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab1" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">ar</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab2" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">bn</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab3" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">en</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab4-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab4" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab4" aria-selected="false" style="text-transform:none">fi</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab5-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab5" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab5" aria-selected="false" style="text-transform:none">id</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab6-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab6" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab6" aria-selected="false" style="text-transform:none">ja</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab7-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab7" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab7" aria-selected="false" style="text-transform:none">ko</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab8-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab8" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab8" aria-selected="false" style="text-transform:none">ru</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab9-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab9" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab9" aria-selected="false" style="text-transform:none">sw</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab10-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab10" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab10" aria-selected="false" style="text-transform:none">te</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab11-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab11" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab11" aria-selected="false" style="text-transform:none">th</a>
+  </li>
+</ul>
+<!-- Tabs navs -->
+<!-- Tabs content -->
+<div class="tab-content" id="table${table_cnt}-row${row_cnt}-content">
+  <div class="tab-pane fade show active" id="table${table_cnt}-row${row_cnt}-tab1" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab1-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd1
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd1}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab2" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab2-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd2
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd2}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab3" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab3-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd3
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd3}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab4" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab4-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd4
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd4}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab5" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab5-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd5
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd5}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab6" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab6-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd6
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd6}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab7" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab7-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd7
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd7}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab8" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab8-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd8
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd8}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab9" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab9-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd9
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd9}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab10" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab10-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd10
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd10}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab11" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab11-header">
+Command to generate run:
+  <blockquote class="mycode">
+<pre><code>$cmd11
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd11}</code></pre>
+  </blockquote>
+  </div>
+</div>
+<!-- Tabs content -->
+</div></td>
+</tr>

pyserini/2cr/msmarco-v1-doc.yaml ADDED Viewed

	@@ -0,0 +1,539 @@

+conditions:
+  - name: bm25-doc-tuned
+    display: BM25 doc (k1=4.46, b=0.82)
+    display-html: BM25 doc (<i>k<sub><small>1</small></sub></i>=4.46, <i>b</i>=0.82)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2767
+            R@1K: 0.9357
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2336
+            nDCG@10: 0.5233
+            R@1K: 0.6757
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3581
+            nDCG@10: 0.5061
+            R@1K: 0.7776
+  - name: bm25-doc-default
+    display: BM25 doc (k1=0.9, b=0.4)
+    display-html: BM25 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1a)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2299
+            R@1K: 0.8856
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2434
+            nDCG@10: 0.5176
+            R@1K: 0.6966
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3793
+            nDCG@10: 0.5286
+            R@1K: 0.8085
+  - name: bm25-doc-segmented-tuned
+    display: BM25 doc segmented (k1=2.16, b=0.61)
+    display-html: BM25 doc segmented (<i>k<sub><small>1</small></sub></i>=2.16, <i>b</i>=0.61)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2756
+            R@1K: 0.9311
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2398
+            nDCG@10: 0.5389
+            R@1K: 0.6565
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3458
+            nDCG@10: 0.5213
+            R@1K: 0.7725
+  - name: bm25-doc-segmented-default
+    display: BM25 doc segmented (k1=0.9, b=0.4)
+    display-html: BM25 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1b)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2684
+            R@1K: 0.9178
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2449
+            nDCG@10: 0.5302
+            R@1K: 0.6871
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3586
+            nDCG@10: 0.5281
+            R@1K: 0.7755
+  - name: bm25-rm3-doc-tuned
+    display: BM25+RM3 doc (k1=4.46, b=0.82)
+    display-html: BM25+RM3 doc (<i>k<sub><small>1</small></sub></i>=4.46, <i>b</i>=0.82)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rm3
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2227
+            R@1K: 0.9303
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2638
+            nDCG@10: 0.5526
+            R@1K: 0.7188
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3610
+            nDCG@10: 0.5195
+            R@1K: 0.8180
+  - name: bm25-rm3-doc-default
+    display: BM25+RM3 doc (k1=0.9, b=0.4)
+    display-html: BM25+RM3 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1c)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.1618
+            R@1K: 0.8783
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2773
+            nDCG@10: 0.5174
+            R@1K: 0.7507
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.4015
+            nDCG@10: 0.5254
+            R@1K: 0.8259
+  - name: bm25-rm3-doc-segmented-tuned
+    display: BM25+RM3 doc segmented (k1=2.16, b=0.61)
+    display-html: BM25+RM3 doc segmented (<i>k<sub><small>1</small></sub></i>=2.16, <i>b</i>=0.61)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2448
+            R@1K: 0.9359
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2655
+            nDCG@10: 0.5392
+            R@1K: 0.7037
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3471
+            nDCG@10: 0.5030
+            R@1K: 0.8056
+  - name: bm25-rm3-doc-segmented-default
+    display: BM25+RM3 doc segmented (k1=0.9, b=0.4)
+    display-html: BM25+RM3 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1d)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2413
+            R@1K: 0.9351
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2892
+            nDCG@10: 0.5684
+            R@1K: 0.7368
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3792
+            nDCG@10: 0.5202
+            R@1K: 0.8023
+  - name: bm25-rocchio-doc-tuned
+    display: BM25+Rocchio doc (k1=4.46, b=0.82)
+    display-html: BM25+Rocchio doc (<i>k<sub><small>1</small></sub></i>=4.46, <i>b</i>=0.82)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rocchio
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2242
+            R@1K: 0.9314
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2657
+            nDCG@10: 0.5584
+            R@1K: 0.7299
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3628
+            nDCG@10: 0.5199
+            R@1K: 0.8217
+  - name: bm25-rocchio-doc-default
+    display: BM25+Rocchio doc (k1=0.9, b=0.4)
+    display-html: BM25+Rocchio doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.1624
+            R@1K: 0.8789
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2811
+            nDCG@10: 0.5256
+            R@1K: 0.7546
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.4089
+            nDCG@10: 0.5192
+            R@1K: 0.8273
+  - name: bm25-rocchio-doc-segmented-tuned
+    display: BM25+Rocchio doc segmented (k1=2.16, b=0.61)
+    display-html: BM25+Rocchio doc segmented (<i>k<sub><small>1</small></sub></i>=2.16, <i>b</i>=0.61)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rocchio --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2475
+            R@1K: 0.9395
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2672
+            nDCG@10: 0.5421
+            R@1K: 0.7115
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3521
+            nDCG@10: 0.4997
+            R@1K: 0.8042
+  - name: bm25-rocchio-doc-segmented-default
+    display: BM25+Rocchio doc segmented (k1=0.9, b=0.4)
+    display-html: BM25+Rocchio doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2447
+            R@1K: 0.9351
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2889
+            nDCG@10: 0.5570
+            R@1K: 0.7423
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3830
+            nDCG@10: 0.5226
+            R@1K: 0.8102
+  - name: bm25-d2q-t5-doc-tuned
+    display: BM25 w/ doc2query-T5 doc (k1=4.68, b=0.87)
+    display-html: BM25 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=4.68, <i>b</i>=0.87)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5 --topics $topics --output $output --bm25
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.3269
+            R@1K: 0.9553
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2620
+            nDCG@10: 0.5972
+            R@1K: 0.6867
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.4099
+            nDCG@10: 0.5852
+            R@1K: 0.8105
+  - name: bm25-d2q-t5-doc-default
+    display: BM25 w/ doc2query-T5 doc (k1=0.9, b=0.4)
+    display-html: BM25 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2a)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2880
+            R@1K: 0.9259
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2700
+            nDCG@10: 0.5968
+            R@1K: 0.7190
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.4230
+            nDCG@10: 0.5885
+            R@1K: 0.8403
+  - name: bm25-d2q-t5-doc-segmented-tuned
+    display: BM25 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59)
+    display-html: BM25 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=2.56, <i>b</i>=0.59)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.3209
+            R@1K: 0.9530
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2658
+            nDCG@10: 0.6273
+            R@1K: 0.6707
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.4047
+            nDCG@10: 0.5943
+            R@1K: 0.7968
+  - name: bm25-d2q-t5-doc-segmented-default
+    display: BM25 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
+    display-html: BM25 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2b)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.3179
+            R@1K: 0.9490
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2798
+            nDCG@10: 0.6119
+            R@1K: 0.7165
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.4150
+            nDCG@10: 0.5957
+            R@1K: 0.8046
+  - name: bm25-rm3-d2q-t5-doc-tuned
+    display: BM25+RM3 w/ doc2query-T5 doc (k1=4.68, b=0.87)
+    display-html: BM25+RM3 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=4.68, <i>b</i>=0.87)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2623
+            R@1K: 0.9522
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2813
+            nDCG@10: 0.6091
+            R@1K: 0.7184
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.4100
+            nDCG@10: 0.5745
+            R@1K: 0.8238
+  - name: bm25-rm3-d2q-t5-doc-default
+    display: BM25+RM3 w/ doc2query-T5 doc (k1=0.9, b=0.4)
+    display-html: BM25+RM3 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2c)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.1834
+            R@1K: 0.9126
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.3045
+            nDCG@10: 0.5904
+            R@1K: 0.7737
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.4230
+            nDCG@10: 0.5427
+            R@1K: 0.8631
+  - name: bm25-rm3-d2q-t5-doc-segmented-tuned
+    display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59)
+    display-html: BM25+RM3 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=2.56, <i>b</i>=0.59)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2973
+            R@1K: 0.9563
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2892
+            nDCG@10: 0.6247
+            R@1K: 0.7069
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.4016
+            nDCG@10: 0.5711
+            R@1K: 0.8156
+  - name: bm25-rm3-d2q-t5-doc-segmented-default
+    display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
+    display-html: BM25+RM3 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2d)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.2803
+            R@1K: 0.9551
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.3030
+            nDCG@10: 0.6290
+            R@1K: 0.7483
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.4271
+            nDCG@10: 0.5851
+            R@1K: 0.8266
+  - name: unicoil-noexp-pytorch
+    display: "uniCOIL (noexp): query inference with PyTorch"
+    display-html: "uniCOIL (noexp): query inference with PyTorch"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil-noexp --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.3410
+            R@1K: 0.9420
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2661
+            nDCG@10: 0.6347
+            R@1K: 0.6385
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3698
+            nDCG@10: 0.5906
+            R@1K: 0.7621
+  - name: unicoil-noexp
+    display: "uniCOIL (noexp): pre-encoded"
+    display-html: "uniCOIL (noexp): pre-encoded queries"
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3a)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil-noexp --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev-unicoil-noexp
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.3409
+            R@1K: 0.9420
+      - topic_key: dl19-doc-unicoil-noexp
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2665
+            nDCG@10: 0.6349
+            R@1K: 0.6391
+      - topic_key: dl20-unicoil-noexp
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3698
+            nDCG@10: 0.5893
+            R@1K: 0.7623
+  - name: unicoil-pytorch
+    display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+    display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.3532
+            R@1K: 0.9546
+      - topic_key: dl19-doc
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2789
+            nDCG@10: 0.6396
+            R@1K: 0.6654
+      - topic_key: dl20
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3881
+            nDCG@10: 0.6030
+            R@1K: 0.7866
+  - name: unicoil
+    display: "uniCOIL (w/ doc2query-T5): pre-encoded"
+    display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3b)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-doc-dev-unicoil
+        eval_key: msmarco-doc-dev
+        scores:
+          - MRR@10: 0.3531
+            R@1K: 0.9546
+      - topic_key: dl19-doc-unicoil
+        eval_key: dl19-doc
+        scores:
+          - MAP: 0.2789
+            nDCG@10: 0.6396
+            R@1K: 0.6652
+      - topic_key: dl20-unicoil
+        eval_key: dl20-doc
+        scores:
+          - MAP: 0.3882
+            nDCG@10: 0.6033
+            R@1K: 0.7869

pyserini/2cr/msmarco-v1-passage.yaml ADDED Viewed

	@@ -0,0 +1,764 @@

+conditions:
+  - name: bm25-rocchio-d2q-t5-tuned
+    display: BM25+Rocchio w/ doc2query-T5 (k1=2.18, b=0.86)
+    display-html: BM25+Rocchio w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=2.18, <i>b</i>=0.86)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rocchio
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.2395
+            R@1K: 0.9535
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4339
+            nDCG@10: 0.6559
+            R@1K: 0.8465
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4376
+            nDCG@10: 0.6224
+            R@1K: 0.8641
+  - name: bm25-rocchio-d2q-t5-default
+    display: BM25+Rocchio w/ doc2query-T5 (k1=0.9, b=0.4)
+    display-html: BM25+Rocchio w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.2158
+            R@1K: 0.9467
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4469
+            nDCG@10: 0.6538
+            R@1K: 0.8855
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4246
+            nDCG@10: 0.6102
+            R@1K: 0.8675
+  - name: bm25-rocchio-default
+    display: BM25+Rocchio (k1=0.9, b=0.4)
+    display-html: BM25+Rocchio (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --rocchio
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.1595
+            R@1K: 0.8620
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.3474
+            nDCG@10: 0.5275
+            R@1K: 0.8007
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.3115
+            nDCG@10: 0.4910
+            R@1K: 0.8156
+  - name: bm25-rocchio-tuned
+    display: BM25+Rocchio (k1=0.82, b=0.68)
+    display-html: BM25+Rocchio (<i>k<sub><small>1</small></sub></i>=0.82, <i>b</i>=0.68)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --rocchio
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.1684
+            R@1K: 0.8726
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.3396
+            nDCG@10: 0.5275
+            R@1K: 0.7948
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.3120
+            nDCG@10: 0.4908
+            R@1K: 0.8327
+  - name: distilbert-kd-tasb-pytorch
+    display: "DistilBERT KD TASB: query inference with PyTorch"
+    display-html: "DistilBERT KD TASB: query inference with PyTorch"
+    display-row: "[5]"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-tas_b-b256 --topics $topics --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco --output $output
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3444
+            R@1K: 0.9771
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4590
+            nDCG@10: 0.7210
+            R@1K: 0.8406
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4698
+            nDCG@10: 0.6854
+            R@1K: 0.8727
+  - name: distilbert-kd-tasb
+    display: "DistilBERT KD TASB: pre-encoded"
+    display-html: "DistilBERT KD TASB: pre-encoded queries"
+    display-row: "[5]"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-tas_b-b256 --topics $topics --encoded-queries distilbert_tas_b-$topics --output $output
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3444
+            R@1K: 0.9771
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4590
+            nDCG@10: 0.7210
+            R@1K: 0.8406
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4698
+            nDCG@10: 0.6854
+            R@1K: 0.8727
+  - name: distilbert-kd-pytorch
+    display: "DistilBERT KD: query inference with PyTorch"
+    display-html: "DistilBERT KD: query inference with PyTorch"
+    display-row: "[4]"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-margin-mse-t2 --topics $topics --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco --output $output
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3251
+            R@1K: 0.9553
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4053
+            nDCG@10: 0.6994
+            R@1K: 0.7653
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4159
+            nDCG@10: 0.6447
+            R@1K: 0.7953
+  - name: distilbert-kd
+    display: "DistilBERT KD: pre-encoded"
+    display-html: "DistilBERT KD: pre-encoded queries"
+    display-row: "[4]"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-margin-mse-t2 --topics $topics --encoded-queries distilbert_kd-$topics --output $output
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3251
+            R@1K: 0.9553
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4053
+            nDCG@10: 0.6994
+            R@1K: 0.7653
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4159
+            nDCG@10: 0.6447
+            R@1K: 0.7953
+  - name: ance-pytorch
+    display: "ANCE: query inference with PyTorch"
+    display-html: "ANCE: query inference with PyTorch"
+    display-row: "[3]"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.ance --topics $topics --encoder castorini/ance-msmarco-passage --output $output
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3302
+            R@1K: 0.9587
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.3710
+            nDCG@10: 0.6452
+            R@1K: 0.7554
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4076
+            nDCG@10: 0.6458
+            R@1K: 0.7764
+  - name: ance
+    display: "ANCE: pre-encoded"
+    display-html: "ANCE: pre-encoded queries"
+    display-row: "[3]"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.ance --topics $topics --encoded-queries ance-$topics --output $output
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3302
+            R@1K: 0.9584
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.3710
+            nDCG@10: 0.6452
+            R@1K: 0.7554
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4076
+            nDCG@10: 0.6458
+            R@1K: 0.7764
+  - name: bm25-tuned
+    display: BM25 (k1=0.82, b=0.68)
+    display-html: BM25 (<i>k<sub><small>1</small></sub></i>=0.82, <i>b</i>=0.68)
+    command: python -m pyserini.search.lucene --topics $topics --index msmarco-v1-passage --output $output --bm25
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.1875
+            R@1K: 0.8573
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.2903
+            nDCG@10: 0.4973
+            R@1K: 0.7450
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.2876
+            nDCG@10: 0.4876
+            R@1K: 0.8031
+  - name: bm25-rm3-tuned
+    display: BM25+RM3 (k1=0.82, b=0.68)
+    display-html: BM25+RM3 (<i>k<sub><small>1</small></sub></i>=0.82, <i>b</i>=0.68)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --rm3
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.1646
+            R@1K: 0.8704
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.3339
+            nDCG@10: 0.5147
+            R@1K: 0.7950
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.3017
+            nDCG@10: 0.4924
+            R@1K: 0.8292
+  - name: bm25-default
+    display: BM25 (k1=0.9, b=0.4)
+    display-html: BM25 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1a)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.1840
+            R@1K: 0.8526
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.3013
+            nDCG@10: 0.5058
+            R@1K: 0.7501
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.2856
+            nDCG@10: 0.4796
+            R@1K: 0.7863
+  - name: bm25-rm3-default
+    display: BM25+RM3 (k1=0.9, b=0.4)
+    display-html: BM25+RM3 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1b)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --rm3
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.1566
+            R@1K: 0.8606
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.3416
+            nDCG@10: 0.5216
+            R@1K: 0.8136
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.3006
+            nDCG@10: 0.4896
+            R@1K: 0.8236
+  - name: bm25-d2q-t5-tuned
+    display: BM25 w/ doc2query-T5 (k1=2.18, b=0.86)
+    display-html: BM25 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=2.18, <i>b</i>=0.86)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5 --topics $topics --output $output --bm25
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.2816
+            R@1K: 0.9506
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4046
+            nDCG@10: 0.6336
+            R@1K: 0.8134
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4171
+            nDCG@10: 0.6265
+            R@1K: 0.8393
+  - name: bm25-d2q-t5-default
+    display: BM25 w/ doc2query-T5 (k1=0.9, b=0.4)
+    display-html: BM25 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2a)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.2723
+            R@1K: 0.9470
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4034
+            nDCG@10: 0.6417
+            R@1K: 0.8310
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4074
+            nDCG@10: 0.6187
+            R@1K: 0.8452
+  - name: bm25-rm3-d2q-t5-tuned
+    display: BM25+RM3 w/ doc2query-T5 (k1=2.18, b=0.86)
+    display-html: BM25+RM3 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=2.18, <i>b</i>=0.86)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.2382
+            R@1K: 0.9528
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4377
+            nDCG@10: 0.6537
+            R@1K: 0.8443
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4348
+            nDCG@10: 0.6235
+            R@1K: 0.8605
+  - name: bm25-rm3-d2q-t5-default
+    display: BM25+RM3 w/ doc2query-T5 (k1=0.9, b=0.4)
+    display-html: BM25+RM3 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2b)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.2139
+            R@1K: 0.9460
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4483
+            nDCG@10: 0.6586
+            R@1K: 0.8863
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4286
+            nDCG@10: 0.6131
+            R@1K: 0.8700
+  - name: unicoil-pytorch
+    display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+    display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3509
+            R@1K: 0.9581
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4617
+            nDCG@10: 0.7027
+            R@1K: 0.8291
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4429
+            nDCG@10: 0.6745
+            R@1K: 0.8433
+  - name: unicoil-onnx
+    display: "uniCOIL (w/ doc2query-T5): query inference with ONNX"
+    display-html: "uniCOIL (w/ doc2query-T5): query inference with ONNX"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --onnx-encoder UniCoil --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3509
+            R@1K: 0.9581
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4617
+            nDCG@10: 0.7027
+            R@1K: 0.8291
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4429
+            nDCG@10: 0.6745
+            R@1K: 0.8433
+  - name: unicoil
+    display: "uniCOIL (w/ doc2query-T5): pre-encoded"
+    display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3b)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-passage-dev-subset-unicoil
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3516
+            R@1K: 0.9582
+      - topic_key: dl19-passage-unicoil
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4612
+            nDCG@10: 0.7024
+            R@1K: 0.8292
+      - topic_key: dl20-unicoil
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4430
+            nDCG@10: 0.6745
+            R@1K: 0.8430
+  - name: unicoil-noexp-pytorch
+    display: "uniCOIL (noexp): query inference with PyTorch"
+    display-html: "uniCOIL (noexp): query inference with PyTorch"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3153
+            R@1K: 0.9239
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4033
+            nDCG@10: 0.6434
+            R@1K: 0.7752
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4022
+            nDCG@10: 0.6524
+            R@1K: 0.7861
+  - name: unicoil-noexp-onnx
+    display: "uniCOIL (noexp): query inference with ONNX"
+    display-html: "uniCOIL (noexp): query inference with ONNX"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --onnx-encoder UniCoil --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3119
+            R@1K: 0.9239
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4061
+            nDCG@10: 0.6531
+            R@1K: 0.7809
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.3909
+            nDCG@10: 0.6388
+            R@1K: 0.7915
+  - name: unicoil-noexp
+    display: "uniCOIL (noexp): pre-encoded"
+    display-html: "uniCOIL (noexp): pre-encoded queries"
+    display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3a)"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-passage-dev-subset-unicoil-noexp
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3153
+            R@1K: 0.9239
+      - topic_key: dl19-passage-unicoil-noexp
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4033
+            nDCG@10: 0.6433
+            R@1K: 0.7752
+      - topic_key: dl20-unicoil-noexp
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4021
+            nDCG@10: 0.6523
+            R@1K: 0.7861
+  - name: splade-pp-ed-onnx
+    display: "SPLADE++ EnsembleDistil: query inference with ONNX"
+    display-html: "SPLADE++ EnsembleDistil: query inference with ONNX"
+    display-row: "[2]"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-splade-pp-ed --topics $topics --onnx-encoder SpladePlusPlusEnsembleDistil --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3830
+            R@1K: 0.9831
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.5054
+            nDCG@10: 0.7320
+            R@1K: 0.8724
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.5002
+            nDCG@10: 0.7198
+            R@1K: 0.8995
+  - name: splade-pp-sd-onnx
+    display: "SPLADE++ SelfDistil: query inference with ONNX"
+    display-html: "SPLADE++ SelfDistil: query inference with ONNX"
+    display-row: "[2]"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-splade-pp-sd --topics $topics --onnx-encoder SpladePlusPlusSelfDistil --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3778
+            R@1K: 0.9846
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4997
+            nDCG@10: 0.7356
+            R@1K: 0.8758
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.5140
+            nDCG@10: 0.7285
+            R@1K: 0.9023
+  - name: tct_colbert-v2-hnp-pytorch
+    display: "TCT_ColBERT-V2-HN+: query inference with PyTorch"
+    display-html: "TCT_ColBERT-V2-HN+: query inference with PyTorch"
+    display-row: "[6]"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.tct_colbert-v2-hnp --topics $topics --encoder castorini/tct_colbert-v2-hnp-msmarco --output $output
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3584
+            R@1K: 0.9695
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4469
+            nDCG@10: 0.7204
+            R@1K: 0.8261
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4754
+            nDCG@10: 0.6882
+            R@1K: 0.8429
+  - name: tct_colbert-v2-hnp
+    display: "TCT_ColBERT-V2-HN+: pre-encoded"
+    display-html: "TCT_ColBERT-V2-HN+: pre-encoded queries"
+    display-row: "[6]"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.tct_colbert-v2-hnp --topics $topics --encoded-queries tct_colbert-v2-hnp-$topics --output $output
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3584
+            R@1K: 0.9695
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4469
+            nDCG@10: 0.7204
+            R@1K: 0.8261
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4754
+            nDCG@10: 0.6882
+            R@1K: 0.8429
+  - name: slimr
+    display: "SLIM: query inference with PyTorch"
+    display-html: "SLIM: query inference with PyTorch"
+    display-row: "[7]"
+    command: python -m pyserini.search.lucene --threads 16 --batch 128 --index msmarco-v1-passage-slimr --topics $topics --encoder castorini/slimr-msmarco-passage --encoded-corpus scipy-sparse-vectors.msmarco-v1-passage-slimr --output $output --output-format msmarco --hits 1000 --impact --min-idf 3
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3581
+            R@1K: 0.9620
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4509
+            nDCG@10: 0.7010
+            R@1K: 0.8241
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4419
+            nDCG@10: 0.6403
+            R@1K: 0.8543
+  - name: slimr-pp
+    display: "SLIM++: query inference with PyTorch"
+    display-html: "SLIM++: query inference with PyTorch"
+    display-row: "[7]"
+    command: python -m pyserini.search.lucene --threads 16 --batch 128 --index msmarco-v1-passage-slimr-pp --topics $topics --encoder castorini/slimr-pp-msmarco-passage --encoded-corpus scipy-sparse-vectors.msmarco-v1-passage-slimr-pp --output $output --output-format msmarco --hits 1000 --impact --min-idf 3
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.4032
+            R@1K: 0.9680
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4687
+            nDCG@10: 0.7140
+            R@1K: 0.8415
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4906
+            nDCG@10: 0.7021
+            R@1K: 0.8551
+  - name: aggretriever-distilbert-pytorch
+    display: "Aggretriever-DistilBERT: query inference with PyTorch"
+    display-html: "Aggretriever-DistilBERT: query inference with PyTorch"
+    display-row: "[8]"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.aggretriever-distilbert --topics $topics --encoder castorini/aggretriever-distilbert --output $output
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3412
+            R@1K: 0.9604
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4301
+            nDCG@10: 0.6816
+            R@1K: 0.8023
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4329
+            nDCG@10: 0.6726
+            R@1K: 0.8351
+  - name: aggretriever-cocondenser-pytorch
+    display: "Aggretriever-coCondenser: query inference with PyTorch"
+    display-html: "Aggretriever-coCondenser: query inference with PyTorch"
+    display-row: "[8]"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.aggretriever-cocondenser --topics $topics --encoder castorini/aggretriever-cocondenser --output $output
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3619
+            R@1K: 0.9735
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4350
+            nDCG@10: 0.6837
+            R@1K: 0.8078
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4710
+            nDCG@10: 0.6972
+            R@1K: 0.8555
+  - name: openai-ada2
+    display: "OpenAI ada2: pre-encoded queries"
+    display-html: "OpenAI ada2: pre-encoded queries"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 128 --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics --output $output
+    topics:
+      - topic_key: msmarco-passage-dev-subset
+        eval_key: msmarco-passage-dev-subset
+        scores:
+          - MRR@10: 0.3435
+            R@1K: 0.9858
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.4788
+            nDCG@10: 0.7035
+            R@1K: 0.8629
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4771
+            nDCG@10: 0.6759
+            R@1K: 0.8705
+  - name: openai-ada2-hyde
+    display: "HyDE-OpenAI ada2: pre-encoded queries"
+    display-html: "HyDE-OpenAI ada2: pre-encoded queries"
+    command: python -m pyserini.search.faiss --threads 16 --batch-size 128 --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics-hyde --output $output
+    topics:
+      - topic_key: dl19-passage
+        eval_key: dl19-passage
+        scores:
+          - MAP: 0.5125
+            nDCG@10: 0.7163
+            R@1K: 0.9002
+      - topic_key: dl20
+        eval_key: dl20-passage
+        scores:
+          - MAP: 0.4938
+            nDCG@10: 0.6666
+            R@1K: 0.8919

pyserini/2cr/msmarco-v2-doc.yaml ADDED Viewed

	@@ -0,0 +1,287 @@

+conditions:
+  - name: bm25-doc-default
+    display: BM25 doc (k1=0.9, b=0.4)
+    display-html: BM25 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (1a)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc --topics $topics --output $output --bm25
+    topics:
+      - topic_key: msmarco-v2-doc-dev
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.1572
+            R@1K: 0.8054
+      - topic_key: msmarco-v2-doc-dev2
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.1659
+            R@1K: 0.8029
+      - topic_key: dl21
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.2126
+            nDCG@10: 0.5116
+            MRR@100: 0.8367
+            R@100: 0.3195
+            R@1K: 0.6739
+  - name: bm25-doc-segmented-default
+    display: BM25 doc segmented (k1=0.9, b=0.4)
+    display-html: BM25 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (1b)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-v2-doc-dev
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.1896
+            R@1K: 0.8542
+      - topic_key: msmarco-v2-doc-dev2
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.1930
+            R@1K: 0.8549
+      - topic_key: dl21
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.2436
+            nDCG@10: 0.5776
+            MRR@100: 0.8937
+            R@100: 0.3478
+            R@1K: 0.6930
+  - name: bm25-rm3-doc-default
+    display: BM25+RM3 doc (k1=0.9, b=0.4)
+    display-html: BM25+RM3 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (1c)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc --topics $topics --output $output --bm25 --rm3
+    topics:
+      - topic_key: msmarco-v2-doc-dev
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.0974
+            R@1K: 0.7699
+      - topic_key: msmarco-v2-doc-dev2
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.1033
+            R@1K: 0.7736
+      - topic_key: dl21
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.2452
+            nDCG@10: 0.5304
+            MRR@100: 0.7914
+            R@100: 0.3376
+            R@1K: 0.7341
+  - name: bm25-rm3-doc-segmented-default
+    display: BM25+RM3 doc segmented (k1=0.9, b=0.4)
+    display-html: BM25+RM3 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (1d)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-v2-doc-dev
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.1660
+            R@1K: 0.8608
+      - topic_key: msmarco-v2-doc-dev2
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.1702
+            R@1K: 0.8639
+      - topic_key: dl21
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.2936
+            nDCG@10: 0.6189
+            MRR@100: 0.9076
+            R@100: 0.3890
+            R@1K: 0.7678
+  - name: bm25-d2q-t5-doc-default
+    display: BM25 w/ doc2query-T5 doc (k1=0.9, b=0.4)
+    display-html: BM25 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (2a)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-d2q-t5 --topics $topics --output $output --bm25
+    topics:
+      - topic_key: msmarco-v2-doc-dev
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.2011
+            R@1K: 0.8614
+      - topic_key: msmarco-v2-doc-dev2
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.2012
+            R@1K: 0.8568
+      - topic_key: dl21
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.2387
+            nDCG@10: 0.5792
+            MRR@100: 0.8866
+            R@100: 0.3443
+            R@1K: 0.7066
+  - name: bm25-d2q-t5-doc-segmented-default
+    display: BM25 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
+    display-html: BM25 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (2b)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-v2-doc-dev
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.2226
+            R@1K: 0.8982
+      - topic_key: msmarco-v2-doc-dev2
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.2234
+            R@1K: 0.8952
+      - topic_key: dl21
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.2683
+            nDCG@10: 0.6289
+            MRR@100: 0.9454
+            R@100: 0.3656
+            R@1K: 0.7202
+  - name: bm25-rm3-d2q-t5-doc-default
+    display: BM25+RM3 w/ doc2query-T5 doc (k1=0.9, b=0.4)
+    display-html: BM25+RM3 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (2c)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
+    topics:
+      - topic_key: msmarco-v2-doc-dev
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.1141
+            R@1K: 0.8191
+      - topic_key: msmarco-v2-doc-dev2
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.1170
+            R@1K: 0.8247
+      - topic_key: dl21
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.2611
+            nDCG@10: 0.5375
+            MRR@100: 0.8255
+            R@100: 0.3580
+            R@1K: 0.7574
+  - name: bm25-rm3-d2q-t5-doc-segmented-default
+    display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
+    display-html: BM25+RM3 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (2d)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-v2-doc-dev
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.1975
+            R@1K: 0.9002
+      - topic_key: msmarco-v2-doc-dev2
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.1978
+            R@1K: 0.8972
+      - topic_key: dl21
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.3191
+            nDCG@10: 0.6559
+            MRR@100: 0.8989
+            R@100: 0.4131
+            R@1K: 0.7948
+  - name: unicoil-noexp
+    display: "uniCOIL (noexp): pre-encoded"
+    display-html: "uniCOIL (noexp): pre-encoded queries"
+    display-row: (3a)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-noexp-0shot --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-v2-doc-dev-unicoil-noexp
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.2231
+            R@1K: 0.8987
+      - topic_key: msmarco-v2-doc-dev2-unicoil-noexp
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.2314
+            R@1K: 0.8995
+      - topic_key: dl21-unicoil-noexp
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.2587
+            nDCG@10: 0.6495
+            MRR@100: 0.9282
+            R@100: 0.3563
+            R@1K: 0.6787
+  - name: unicoil-noexp-otf
+    display: "uniCOIL (noexp): query inference with PyTorch"
+    display-html: "uniCOIL (noexp): query inference with PyTorch"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-noexp-0shot --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-v2-doc-dev
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.2232
+            R@1K: 0.8987
+      - topic_key: msmarco-v2-doc-dev2
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.2314
+            R@1K: 0.8993
+      - topic_key: dl21
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.2589
+            nDCG@10: 0.6501
+            MRR@100: 0.9282
+            R@100: 0.3574
+            R@1K: 0.6782
+  - name: unicoil
+    display: "uniCOIL (w/ doc2query-T5): pre-encoded"
+    display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
+    display-row: (3b)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-0shot --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-v2-doc-dev-unicoil
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.2419
+            R@1K: 0.9122
+      - topic_key: msmarco-v2-doc-dev2-unicoil
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.2445
+            R@1K: 0.9172
+      - topic_key: dl21-unicoil
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.2718
+            nDCG@10: 0.6783
+            MRR@100: 0.9684
+            R@100: 0.3700
+            R@1K: 0.7069
+  - name: unicoil-otf
+    display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+    display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-0shot --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+    topics:
+      - topic_key: msmarco-v2-doc-dev
+        eval_key: msmarco-v2-doc-dev
+        scores:
+          - MRR@100: 0.2419
+            R@1K: 0.9120
+      - topic_key: msmarco-v2-doc-dev2
+        eval_key: msmarco-v2-doc-dev2
+        scores:
+          - MRR@100: 0.2447
+            R@1K: 0.9174
+      - topic_key: dl21
+        eval_key: dl21-doc
+        scores:
+          - MAP@100: 0.2720
+            nDCG@10: 0.6782
+            MRR@100: 0.9684
+            R@100: 0.3702
+            R@1K: 0.7071

pyserini/2cr/msmarco-v2-passage.yaml ADDED Viewed

	@@ -0,0 +1,287 @@

+conditions:
+  - name: bm25-default
+    display: BM25 original passage (k1=0.9, b=0.4)
+    display-html: BM25 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (1a)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage --topics $topics --output $output --bm25
+    topics:
+      - topic_key: msmarco-v2-passage-dev
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.0719
+            R@1K: 0.5733
+      - topic_key: msmarco-v2-passage-dev2
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.0802
+            R@1K: 0.5839
+      - topic_key: dl21
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.1357
+            nDCG@10: 0.4458
+            MRR@100: 0.5060
+            R@100: 0.3261
+            R@1K: 0.6149
+  - name: bm25-augmented-default
+    display: BM25 augmented passage (k1=0.9, b=0.4)
+    display-html: BM25 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (1b)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented --topics $topics --output $output --bm25
+    topics:
+      - topic_key: msmarco-v2-passage-dev
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.0872
+            R@1K: 0.6925
+      - topic_key: msmarco-v2-passage-dev2
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.0917
+            R@1K: 0.6933
+      - topic_key: dl21
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.0977
+            nDCG@10: 0.3977
+            MRR@100: 0.5303
+            R@100: 0.2709
+            R@1K: 0.5835
+  - name: bm25-rm3-default
+    display: BM25+RM3 original passage (k1=0.9, b=0.4)
+    display-html: BM25+RM3 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (1c)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage --topics $topics --output $output --bm25 --rm3
+    topics:
+      - topic_key: msmarco-v2-passage-dev
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.0630
+            R@1K: 0.5947
+      - topic_key: msmarco-v2-passage-dev2
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.0659
+            R@1K: 0.6062
+      - topic_key: dl21
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.1666
+            nDCG@10: 0.4455
+            MRR@100: 0.5202
+            R@100: 0.3499
+            R@1K: 0.6616
+  - name: bm25-rm3-augmented-default
+    display: BM25+RM3 augmented passage (k1=0.9, b=0.4)
+    display-html: BM25+RM3 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (1d)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented --topics $topics --output $output --bm25 --rm3
+    topics:
+      - topic_key: msmarco-v2-passage-dev
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.0667
+            R@1K: 0.6857
+      - topic_key: msmarco-v2-passage-dev2
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.0700
+            R@1K: 0.6826
+      - topic_key: dl21
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.1050
+            nDCG@10: 0.3869
+            MRR@100: 0.4915
+            R@100: 0.2807
+            R@1K: 0.6298
+  - name: bm25-d2q-t5-default
+    display: BM25 w/ doc2query-T5 original passage (k1=0.9, b=0.4)
+    display-html: BM25 w/ doc2query-T5 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (2a)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-d2q-t5 --topics $topics --output $output --bm25
+    topics:
+      - topic_key: msmarco-v2-passage-dev
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.1072
+            R@1K: 0.7083
+      - topic_key: msmarco-v2-passage-dev2
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.1123
+            R@1K: 0.7151
+      - topic_key: dl21
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.1874
+            nDCG@10: 0.4816
+            MRR@100: 0.6848
+            R@100: 0.4076
+            R@1K: 0.7078
+  - name: bm25-d2q-t5-augmented-default
+    display: BM25 w/ doc2query-T5 augmented passage (k1=0.9, b=0.4)
+    display-html: BM25 w/ doc2query-T5 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (2b)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented-d2q-t5 --topics $topics --output $output --bm25
+    topics:
+      - topic_key: msmarco-v2-passage-dev
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.1172
+            R@1K: 0.7647
+      - topic_key: msmarco-v2-passage-dev2
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.1170
+            R@1K: 0.7659
+      - topic_key: dl21
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.1649
+            nDCG@10: 0.4702
+            MRR@100: 0.6391
+            R@100: 0.3883
+            R@1K: 0.6962
+  - name: bm25-rm3-d2q-t5-default
+    display: BM25+RM3 w/ doc2query-T5 original passage (k1=0.9, b=0.4)
+    display-html: BM25+RM3 w/ doc2query-T5 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (2c)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
+    topics:
+      - topic_key: msmarco-v2-passage-dev
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.0947
+            R@1K: 0.7181
+      - topic_key: msmarco-v2-passage-dev2
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.0984
+            R@1K: 0.7222
+      - topic_key: dl21
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.2285
+            nDCG@10: 0.5098
+            MRR@100: 0.6548
+            R@100: 0.4499
+            R@1K: 0.7537
+  - name: bm25-rm3-d2q-t5-augmented-default
+    display: BM25+RM3 w/ doc2query-T5 augmented passage (k1=0.9, b=0.4)
+    display-html: BM25+RM3 w/ doc2query-T5 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+    display-row: (2d)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
+    topics:
+      - topic_key: msmarco-v2-passage-dev
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.0883
+            R@1K: 0.7607
+      - topic_key: msmarco-v2-passage-dev2
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.0904
+            R@1K: 0.7649
+      - topic_key: dl21
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.1930
+            nDCG@10: 0.4812
+            MRR@100: 0.5958
+            R@100: 0.4321
+            R@1K: 0.7672
+  - name: unicoil
+    display: "uniCOIL (w/ doc2query-T5): pre-encoded"
+    display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
+    display-row: (3b)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-0shot --topics $topics --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-v2-passage-dev-unicoil
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.1499
+            R@1K: 0.7616
+      - topic_key: msmarco-v2-passage-dev2-unicoil
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.1577
+            R@1K: 0.7671
+      - topic_key: dl21-unicoil
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.2538
+            nDCG@10: 0.6159
+            MRR@100: 0.7311
+            R@100: 0.4731
+            R@1K: 0.7551
+  - name: unicoil-otf
+    display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+    display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-0shot --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-v2-passage-dev
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.1501
+            R@1K: 0.7613
+      - topic_key: msmarco-v2-passage-dev2
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.1576
+            R@1K: 0.7676
+      - topic_key: dl21
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.2539
+            nDCG@10: 0.6160
+            MRR@100: 0.7311
+            R@100: 0.4723
+            R@1K: 0.7560
+  - name: unicoil-noexp
+    display: "uniCOIL (noexp): pre-encoded"
+    display-html: "uniCOIL (noexp): pre-encoded queries"
+    display-row: (3a)
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-noexp-0shot --topics $topics --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-v2-passage-dev-unicoil-noexp
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.1342
+            R@1K: 0.7010
+      - topic_key: msmarco-v2-passage-dev2-unicoil-noexp
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.1385
+            R@1K: 0.7114
+      - topic_key: dl21-unicoil-noexp
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.2193
+            nDCG@10: 0.5756
+            MRR@100: 0.6991
+            R@100: 0.4246
+            R@1K: 0.6897
+  - name: unicoil-noexp-otf
+    display: "uniCOIL (noexp): query inference with PyTorch"
+    display-html: "uniCOIL (noexp): query inference with PyTorch"
+    command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-noexp-0shot --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --hits 1000 --impact
+    topics:
+      - topic_key: msmarco-v2-passage-dev
+        eval_key: msmarco-v2-passage-dev
+        scores:
+          - MRR@100: 0.1343
+            R@1K: 0.7010
+      - topic_key: msmarco-v2-passage-dev2
+        eval_key: msmarco-v2-passage-dev2
+        scores:
+          - MRR@100: 0.1385
+            R@1K: 0.7114
+      - topic_key: dl21
+        eval_key: dl21-passage
+        scores:
+          - MAP@100: 0.2194
+            nDCG@10: 0.5759
+            MRR@100: 0.6991
+            R@100: 0.4247
+            R@1K: 0.6893

pyserini/2cr/msmarco.py ADDED Viewed

	@@ -0,0 +1,600 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import math
+import os
+import re
+import sys
+import time
+from collections import defaultdict
+from string import Template
+import pkg_resources
+import yaml
+from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str
# Row order of the generated results table, keyed by collection name.
# Each list is walked top to bottom when the report is rendered; an empty
# string is not a condition name but a spacer: the report code emits a blank
# table row for it and the condition listing skips it.
models = {
    # MS MARCO v1 passage
    'msmarco-v1-passage': [
        # bm25 / rm3 / rocchio, "-default" conditions
        'bm25-default', 'bm25-rm3-default', 'bm25-rocchio-default',
        '',
        # bm25 / rm3 / rocchio, "-tuned" conditions
        'bm25-tuned', 'bm25-rm3-tuned', 'bm25-rocchio-tuned',
        '',
        # d2q-t5, "-default" conditions
        'bm25-d2q-t5-default', 'bm25-rm3-d2q-t5-default', 'bm25-rocchio-d2q-t5-default',
        '',
        # d2q-t5, "-tuned" conditions
        'bm25-d2q-t5-tuned', 'bm25-rm3-d2q-t5-tuned', 'bm25-rocchio-d2q-t5-tuned',
        '',
        # uniCOIL conditions
        'unicoil', 'unicoil-pytorch', 'unicoil-onnx',
        'unicoil-noexp', 'unicoil-noexp-pytorch', 'unicoil-noexp-onnx',
        '',
        # SPLADE++ conditions
        'splade-pp-ed-onnx', 'splade-pp-sd-onnx',
        '',
        # ANCE conditions
        'ance', 'ance-pytorch',
        '',
        # DistilBERT KD conditions
        'distilbert-kd', 'distilbert-kd-pytorch',
        'distilbert-kd-tasb', 'distilbert-kd-tasb-pytorch',
        '',
        # TCT-ColBERT v2 conditions
        'tct_colbert-v2-hnp', 'tct_colbert-v2-hnp-pytorch',
        '',
        # SLIM conditions
        'slimr', 'slimr-pp',
        '',
        # Aggretriever conditions
        'aggretriever-distilbert-pytorch', 'aggretriever-cocondenser-pytorch',
        '',
        # OpenAI ada2 conditions
        'openai-ada2', 'openai-ada2-hyde',
    ],
    # MS MARCO v1 doc
    'msmarco-v1-doc': [
        # "-default" conditions
        'bm25-doc-default', 'bm25-doc-segmented-default',
        'bm25-rm3-doc-default', 'bm25-rm3-doc-segmented-default',
        'bm25-rocchio-doc-default', 'bm25-rocchio-doc-segmented-default',
        '',
        # "-tuned" conditions
        'bm25-doc-tuned', 'bm25-doc-segmented-tuned',
        'bm25-rm3-doc-tuned', 'bm25-rm3-doc-segmented-tuned',
        'bm25-rocchio-doc-tuned', 'bm25-rocchio-doc-segmented-tuned',
        '',
        # d2q-t5, "-default" conditions
        'bm25-d2q-t5-doc-default', 'bm25-d2q-t5-doc-segmented-default',
        'bm25-rm3-d2q-t5-doc-default', 'bm25-rm3-d2q-t5-doc-segmented-default',
        '',
        # d2q-t5, "-tuned" conditions
        'bm25-d2q-t5-doc-tuned', 'bm25-d2q-t5-doc-segmented-tuned',
        'bm25-rm3-d2q-t5-doc-tuned', 'bm25-rm3-d2q-t5-doc-segmented-tuned',
        '',
        # uniCOIL (noexp) conditions
        'unicoil-noexp', 'unicoil-noexp-pytorch',
        '',
        # uniCOIL conditions
        'unicoil', 'unicoil-pytorch',
    ],
    # MS MARCO v2 passage
    'msmarco-v2-passage': [
        'bm25-default', 'bm25-augmented-default',
        'bm25-rm3-default', 'bm25-rm3-augmented-default',
        '',
        'bm25-d2q-t5-default', 'bm25-d2q-t5-augmented-default',
        'bm25-rm3-d2q-t5-default', 'bm25-rm3-d2q-t5-augmented-default',
        '',
        'unicoil-noexp', 'unicoil',
        '',
        'unicoil-noexp-otf', 'unicoil-otf',
    ],
    # MS MARCO v2 doc
    'msmarco-v2-doc': [
        'bm25-doc-default', 'bm25-doc-segmented-default',
        'bm25-rm3-doc-default', 'bm25-rm3-doc-segmented-default',
        '',
        'bm25-d2q-t5-doc-default', 'bm25-d2q-t5-doc-segmented-default',
        'bm25-rm3-d2q-t5-doc-default', 'bm25-rm3-d2q-t5-doc-segmented-default',
        '',
        'unicoil-noexp', 'unicoil',
        '',
        'unicoil-noexp-otf', 'unicoil-otf',
    ],
}
# Option strings handed to `python -m pyserini.eval.trec_eval`, looked up as
# [collection][eval_key][metric label]. The metric labels must match the keys
# used under `scores:` in the per-collection YAML files, because the report
# code indexes this table with them directly.
# NOTE(review): per trec_eval's usage text, -c averages over all judged
# queries, -l sets the minimum relevance level and -M caps the ranked list
# depth -- confirm against the trec_eval version in use.
trec_eval_metric_definitions = {
    'msmarco-v1-passage': {
        'msmarco-passage-dev-subset': {
            'MRR@10': '-c -M 10 -m recip_rank',
            'R@1K': '-c -m recall.1000',
        },
        'dl19-passage': {
            'MAP': '-c -l 2 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -l 2 -m recall.1000',
        },
        'dl20-passage': {
            'MAP': '-c -l 2 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -l 2 -m recall.1000',
        },
    },
    'msmarco-v1-doc': {
        'msmarco-doc-dev': {
            # NOTE(review): the label reads MRR@10 while the options cut at
            # -M 100; the label is a lookup key, so it is kept as-is here.
            # Confirm which one is intended.
            'MRR@10': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000',
        },
        'dl19-doc': {
            'MAP': '-c -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -m recall.1000',
        },
        'dl20-doc': {
            'MAP': '-c -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -m recall.1000',
        },
    },
    'msmarco-v2-passage': {
        'msmarco-v2-passage-dev': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000',
        },
        'msmarco-v2-passage-dev2': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000',
        },
        'dl21-passage': {
            'MAP@100': '-c -l 2 -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'MRR@100': '-c -l 2 -M 100 -m recip_rank',
            'R@100': '-c -l 2 -m recall.100',
            'R@1K': '-c -l 2 -m recall.1000',
        },
    },
    'msmarco-v2-doc': {
        'msmarco-v2-doc-dev': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000',
        },
        'msmarco-v2-doc-dev2': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000',
        },
        'dl21-doc': {
            'MAP@100': '-c -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@100': '-c -m recall.100',
            'R@1K': '-c -m recall.1000',
        },
    },
}
def find_msmarco_table_topic_set_key_v1(topic_key):
    """Collapse an MS MARCO v1 topic key into its short table key.

    Variants such as 'dl19-passage-unicoil' and 'dl19-passage' both map to
    'dl19'; anything starting with 'msmarco' maps to 'dev'.

    Args:
        topic_key: topic set identifier as written in the YAML file.

    Returns:
        'dl19', 'dl20' or 'dev'; the empty string when no prefix matches.
    """
    # Checked in order; the first matching prefix wins.
    prefix_to_short_key = (
        ('dl19', 'dl19'),
        ('dl20', 'dl20'),
        ('msmarco', 'dev'),
    )
    for prefix, short_key in prefix_to_short_key:
        if topic_key.startswith(prefix):
            return short_key
    return ''
def find_msmarco_table_topic_set_key_v2(topic_key):
    """Collapse an MS MARCO v2 topic key into its short table key.

    Args:
        topic_key: topic set identifier as written in the YAML file, e.g.
            'msmarco-v2-doc-dev2-unicoil' or 'dl21-unicoil-noexp'.

    Returns:
        'dev', 'dev2' or 'dl21'; the empty string when nothing matches.
    """
    dev_suffixes = ('dev', 'dev-unicoil', 'dev-unicoil-noexp')
    dev2_suffixes = ('dev2', 'dev2-unicoil', 'dev2-unicoil-noexp')

    # The suffix checks run before the 'dl21' prefix check, so order matters.
    if topic_key.endswith(dev_suffixes):
        return 'dev'
    if topic_key.endswith(dev2_suffixes):
        return 'dev2'
    if topic_key.startswith('dl21'):
        return 'dl21'
    return ''
def format_command(raw):
    """Break a one-line search command into shell-continued lines.

    A backslash, a newline and two spaces are inserted in front of each
    recognised option so the command reads one option per line.

    Args:
        raw: the command as a single-line string.

    Returns:
        The command with line continuations inserted.
    """
    line_break = '\\\n  '
    # '--output ' deliberately carries its trailing space, as in the match
    # strings used for the other options it is replaced verbatim.
    options = (
        '--topics',
        '--threads',
        '--index',
        '--output ',
        '--encoder',
        '--onnx-encoder',
        '--encoded-corpus',
    )

    formatted = raw
    for option in options:
        formatted = formatted.replace(option, line_break + option)

    # After "--output foo.txt" there may be additional options like
    # "--hits 1000 --impact", which should start on their own line. Matching
    # on '.txt ' (with the space) means no extra break is added when the
    # run file is the last token of the command.
    return formatted.replace('.txt ', '.txt ' + line_break)
def read_file(f):
    """Return the entire contents of a text file.

    Args:
        f: path of the file to read.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises; the
    # previous open/read/close sequence leaked it on that path.
    with open(f, 'r') as fin:
        return fin.read()
def list_conditions(args):
    """Print the condition names of one collection, one per line.

    Args:
        args: object whose `collection` attribute is a key of `models`.
    """
    # Empty strings in `models` are spacer entries, not conditions.
    named_conditions = (entry for entry in models[args.collection] if entry != '')
    for entry in named_conditions:
        print(entry)
+def generate_report(args):
+    yaml_file = pkg_resources.resource_filename(__name__, f'{args.collection}.yaml')
+    if args.collection == 'msmarco-v1-passage':
+        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v1_passage.template'))
+        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v1.template'))
+    elif args.collection == 'msmarco-v1-doc':
+        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v1_doc.template'))
+        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v1.template'))
+    elif args.collection == 'msmarco-v2-passage':
+        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v2_passage.template'))
+        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v2.template'))
+    elif args.collection == 'msmarco-v2-doc':
+        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v2_doc.template'))
+        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v2.template'))
+    else:
+        raise ValueError(f'Unknown corpus: {args.collection}')
+    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+    commands = defaultdict(lambda: defaultdict(lambda: ''))
+    eval_commands = defaultdict(lambda: defaultdict(lambda: ''))
+    table_keys = {}
+    row_ids = {}
+    with open(yaml_file) as f:
+        yaml_data = yaml.safe_load(f)
+        for condition in yaml_data['conditions']:
+            name = condition['name']
+            display = condition['display-html']
+            row_id = condition['display-row'] if 'display-row' in condition else ''
+            cmd_template = condition['command']
+            row_ids[name] =row_id
+            table_keys[name] = display
+            for topic_set in condition['topics']:
+                topic_key = topic_set['topic_key']
+                eval_key = topic_set['eval_key']
+                if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc':
+                    short_topic_key = find_msmarco_table_topic_set_key_v1(topic_key)
+                else:
+                    short_topic_key = find_msmarco_table_topic_set_key_v2(topic_key)
+                runfile = f'run.{args.collection}.{name}.{short_topic_key}.txt'
+                cmd = Template(cmd_template).substitute(topics=topic_key, output=runfile)
+                commands[name][short_topic_key] = cmd
+                for expected in topic_set['scores']:
+                    for metric in expected:
+                        eval_cmd = f'python -m pyserini.eval.trec_eval ' + \
+                                   f'{trec_eval_metric_definitions[args.collection][eval_key][metric]} {eval_key} {runfile}'
+                        eval_commands[name][short_topic_key] += eval_cmd + '\n'
+                        table[name][short_topic_key][metric] = expected[metric]
+    if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc':
+        row_cnt = 1
+        html_rows = []
+        for name in models[args.collection]:
+            if not name:
+                # Add blank row for spacing
+                html_rows.append('<tr><td style="border-bottom: 0"></td></tr>')
+                continue
+            s = Template(row_template)
+            s = s.substitute(row_cnt=row_cnt,
+                             condition_name=table_keys[name],
+                             row=row_ids[name],
+                             s1=f'{table[name]["dl19"]["MAP"]:.4f}' if table[name]['dl19']['MAP'] != 0 else '-',
+                             s2=f'{table[name]["dl19"]["nDCG@10"]:.4f}' if table[name]['dl19']['nDCG@10'] != 0 else '-',
+                             s3=f'{table[name]["dl19"]["R@1K"]:.4f}' if table[name]['dl19']['R@1K'] != 0 else '-',
+                             s4=f'{table[name]["dl20"]["MAP"]:.4f}' if table[name]['dl20']['MAP'] != 0 else '-',
+                             s5=f'{table[name]["dl20"]["nDCG@10"]:.4f}' if table[name]['dl20']['nDCG@10'] != 0 else '-',
+                             s6=f'{table[name]["dl20"]["R@1K"]:.4f}' if table[name]['dl20']['R@1K'] != 0 else '-',
+                             s7=f'{table[name]["dev"]["MRR@10"]:.4f}' if table[name]['dev']['MRR@10'] != 0 else '-',
+                             s8=f'{table[name]["dev"]["R@1K"]:.4f}' if table[name]['dev']['R@1K'] != 0 else '-',
+                             cmd1=format_command(commands[name]['dl19']),
+                             cmd2=format_command(commands[name]['dl20']),
+                             cmd3=format_command(commands[name]['dev']),
+                             eval_cmd1=eval_commands[name]['dl19'],
+                             eval_cmd2=eval_commands[name]['dl20'],
+                             eval_cmd3=eval_commands[name]['dev']
+                             )
+            # If we don't have scores, we want to remove the commands also. Use simple regexp substitution.
+            if table[name]['dl19']['MAP'] == 0:
+                s = re.sub(re.compile('Command to generate run on TREC 2019 queries:.*?</div>',
+                                      re.MULTILINE | re.DOTALL),
+                           'Not available.</div>', s)
+            if table[name]['dl20']['MAP'] == 0:
+                s = re.sub(re.compile('Command to generate run on TREC 2020 queries:.*?</div>',
+                                      re.MULTILINE | re.DOTALL),
+                           'Not available.</div>', s)
+            if table[name]['dev']['MRR@10'] == 0:
+                s = re.sub(re.compile('Command to generate run on dev queries:.*?</div>',
+                                      re.MULTILINE | re.DOTALL),
+                           'Not available.</div>', s)
+            html_rows.append(s)
+            row_cnt += 1
+        all_rows = '\n'.join(html_rows)
+        if args.collection == 'msmarco-v1-passage':
+            full_name = 'MS MARCO V1 Passage'
+        else:
+            full_name = 'MS MARCO V1 Document'
+        with open(args.output, 'w') as out:
+            out.write(Template(html_template).substitute(title=full_name, rows=all_rows))
+    else:
+        row_cnt = 1
+        html_rows = []
+        for name in models[args.collection]:
+            if not name:
+                # Add blank row for spacing
+                html_rows.append('<tr><td style="border-bottom: 0"></td></tr>')
+                continue
+            s = Template(row_template)
+            s = s.substitute(row_cnt=row_cnt,
+                             condition_name=table_keys[name],
+                             row=row_ids[name],
+                             s1=f'{table[name]["dl21"]["MAP@100"]:.4f}',
+                             s2=f'{table[name]["dl21"]["nDCG@10"]:.4f}',
+                             s3=f'{table[name]["dl21"]["MRR@100"]:.4f}',
+                             s4=f'{table[name]["dl21"]["R@100"]:.4f}',
+                             s5=f'{table[name]["dl21"]["R@1K"]:.4f}',
+                             s6=f'{table[name]["dev"]["MRR@100"]:.4f}',
+                             s7=f'{table[name]["dev"]["R@1K"]:.4f}',
+                             s8=f'{table[name]["dev2"]["MRR@100"]:.4f}',
+                             s9=f'{table[name]["dev2"]["R@1K"]:.4f}',
+                             cmd1=format_command(commands[name]['dl21']),
+                             cmd2=format_command(commands[name]['dev']),
+                             cmd3=format_command(commands[name]['dev2']),
+                             eval_cmd1=eval_commands[name]['dl21'],
+                             eval_cmd2=eval_commands[name]['dev'],
+                             eval_cmd3=eval_commands[name]['dev2']
+                             )
+            html_rows.append(s)
+            row_cnt += 1
+        all_rows = '\n'.join(html_rows)
+        if args.collection == 'msmarco-v2-passage':
+            full_name = 'MS MARCO V2 Passage'
+        else:
+            full_name = 'MS MARCO V2 Document'
+        with open(args.output, 'w') as out:
+            out.write(Template(html_template).substitute(title=full_name, rows=all_rows))
+def run_conditions(args):
+    start = time.time()
+    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+    table_keys = {}
+    yaml_file = pkg_resources.resource_filename(__name__, f'{args.collection}.yaml')
+    with open(yaml_file) as f:
+        yaml_data = yaml.safe_load(f)
+        for condition in yaml_data['conditions']:
+            # Either we're running all conditions, or running only the condition specified in --condition
+            if not args.all:
+                if not condition['name'] == args.condition:
+                    continue
+            name = condition['name']
+            display = condition['display']
+            cmd_template = condition['command']
+            print(f'# Running condition "{name}": {display}\n')
+            for topic_set in condition['topics']:
+                topic_key = topic_set['topic_key']
+                eval_key = topic_set['eval_key']
+                short_topic_key = ''
+                if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc':
+                    short_topic_key = find_msmarco_table_topic_set_key_v1(topic_key)
+                else:
+                    short_topic_key = find_msmarco_table_topic_set_key_v2(topic_key)
+                print(f'  - topic_key: {topic_key}')
+                runfile = os.path.join(args.directory, f'run.{args.collection}.{name}.{short_topic_key}.txt')
+                cmd = Template(cmd_template).substitute(topics=topic_key, output=runfile)
+                if args.display_commands:
+                    print(f'\n```bash\n{format_command(cmd)}\n```\n')
+                if not os.path.exists(runfile):
+                    if not args.dry_run:
+                        os.system(cmd)
+                for expected in topic_set['scores']:
+                    for metric in expected:
+                        table_keys[name] = display
+                        if not args.skip_eval:
+                            # If the runfile doesn't exist, we can't evaluate.
+                            # This would be the case if --dry-run were set.
+                            if not os.path.exists(runfile):
+                                continue
+                            score = float(
+                                run_eval_and_return_metric(
+                                    metric,
+                                    eval_key,
+                                    trec_eval_metric_definitions[args.collection][eval_key][metric],
+                                    runfile))
+                            if math.isclose(score, float(expected[metric])):
+                                result_str = ok_str
+                            # Flaky tests
+                            elif args.collection == 'msmarco-v1-passage' \
+                                    and topic_key == 'msmarco-passage-dev-subset' and name == 'ance-pytorch' \
+                                    and metric == 'MRR@10' and abs(score-float(expected[metric])) <= 0.0001:
+                                result_str = okish_str
+                            else:
+                                result_str = fail_str + f' expected {expected[metric]:.4f}'
+                            print(f'    {metric:7}: {score:.4f} {result_str}')
+                            table[name][short_topic_key][metric] = score
+                        else:
+                            table[name][short_topic_key][metric] = expected[metric]
+                if not args.skip_eval:
+                    print('')
+    if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc':
+        print(' ' * 69 + 'TREC 2019' + ' ' * 16 + 'TREC 2020' + ' ' * 12 + 'MS MARCO dev')
+        print(' ' * 62 + 'MAP    nDCG@10    R@1K       MAP nDCG@10    R@1K    MRR@10    R@1K')
+        print(' ' * 62 + '-' * 22 + '    ' + '-' * 22 + '    ' + '-' * 14)
+        if args.condition:
+            # If we've used --condition to specify a specific condition, print out only that row.
+            name = args.condition
+            print(f'{table_keys[name]:60}' +
+                  f'{table[name]["dl19"]["MAP"]:8.4f}{table[name]["dl19"]["nDCG@10"]:8.4f}{table[name]["dl19"]["R@1K"]:8.4f}  ' +
+                  f'{table[name]["dl20"]["MAP"]:8.4f}{table[name]["dl20"]["nDCG@10"]:8.4f}{table[name]["dl20"]["R@1K"]:8.4f}  ' +
+                  f'{table[name]["dev"]["MRR@10"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f}')
+        else:
+            # Otherwise, print out all rows
+            for name in models[args.collection]:
+                if not name:
+                    print('')
+                    continue
+                print(f'{table_keys[name]:60}' +
+                      f'{table[name]["dl19"]["MAP"]:8.4f}{table[name]["dl19"]["nDCG@10"]:8.4f}{table[name]["dl19"]["R@1K"]:8.4f}  ' +
+                      f'{table[name]["dl20"]["MAP"]:8.4f}{table[name]["dl20"]["nDCG@10"]:8.4f}{table[name]["dl20"]["R@1K"]:8.4f}  ' +
+                      f'{table[name]["dev"]["MRR@10"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f}')
+    else:
+        print(' ' * 77 + 'TREC 2021' + ' ' * 18 + 'MS MARCO dev' + ' ' * 6 + 'MS MARCO dev2')
+        print(' ' * 62 + 'MAP@100 nDCG@10 MRR@100 R@100   R@1K     MRR@100   R@1K    MRR@100   R@1K')
+        print(' ' * 62 + '-' * 38 + '    ' + '-' * 14 + '    ' + '-' * 14)
+        if args.condition:
+            # If we've used --condition to specify a specific condition, print out only that row.
+            name = args.condition
+            print(f'{table_keys[name]:60}' +
+                  f'{table[name]["dl21"]["MAP@100"]:8.4f}{table[name]["dl21"]["nDCG@10"]:8.4f}' +
+                  f'{table[name]["dl21"]["MRR@100"]:8.4f}{table[name]["dl21"]["R@100"]:8.4f}{table[name]["dl21"]["R@1K"]:8.4f}  ' +
+                  f'{table[name]["dev"]["MRR@100"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f}  ' +
+                  f'{table[name]["dev2"]["MRR@100"]:8.4f}{table[name]["dev2"]["R@1K"]:8.4f}')
+        else:
+            # Otherwise, print out all rows
+            for name in models[args.collection]:
+                if not name:
+                    print('')
+                    continue
+                print(f'{table_keys[name]:60}' +
+                      f'{table[name]["dl21"]["MAP@100"]:8.4f}{table[name]["dl21"]["nDCG@10"]:8.4f}' +
+                      f'{table[name]["dl21"]["MRR@100"]:8.4f}{table[name]["dl21"]["R@100"]:8.4f}{table[name]["dl21"]["R@1K"]:8.4f}  ' +
+                      f'{table[name]["dev"]["MRR@100"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f}  ' +
+                      f'{table[name]["dev2"]["MRR@100"]:8.4f}{table[name]["dev2"]["R@1K"]:8.4f}')
+    end = time.time()
+    print('\n')
+    print(f'Total elapsed time: {end - start:.0f}s')
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Generate regression matrix for MS MARCO corpora.')
+    parser.add_argument('--collection', type=str,
+                        help='Collection = {v1-passage, v1-doc, v2-passage, v2-doc}.', required=True)
+    # To list all conditions
+    parser.add_argument('--list-conditions', action='store_true', default=False, help='List available conditions.')
+    # For generating reports
+    parser.add_argument('--generate-report', action='store_true', default=False, help='Generate report.')
+    parser.add_argument('--output', type=str, help='File to store report.', required=False)
+    # For actually running the experimental conditions
+    parser.add_argument('--all', action='store_true', default=False, help='Run all conditions.')
+    parser.add_argument('--condition', type=str, help='Condition to run.', required=False)
+    parser.add_argument('--directory', type=str, help='Base directory.', default='', required=False)
+    parser.add_argument('--dry-run', action='store_true', default=False, help='Print out commands but do not execute.')
+    parser.add_argument('--skip-eval', action='store_true', default=False, help='Skip running trec_eval.')
+    parser.add_argument('--display-commands', action='store_true', default=False, help='Display command.')
+    args = parser.parse_args()
+    if args.collection == 'v1-passage':
+        args.collection = 'msmarco-v1-passage'
+    elif args.collection == 'v1-doc':
+        args.collection = 'msmarco-v1-doc'
+    elif args.collection == 'v2-passage':
+        args.collection = 'msmarco-v2-passage'
+    elif args.collection == 'v2-doc':
+        args.collection = 'msmarco-v2-doc'
+    else:
+        raise ValueError(f'Unknown corpus: {args.collection}')
+    if args.list_conditions:
+        list_conditions(args)
+        sys.exit()
+    if args.generate_report:
+        if not args.output:
+            print(f'Must specify report filename with --output.')
+            sys.exit()
+        generate_report(args)
+        sys.exit()
+    if not args.all and not args.condition:
+        print(f'Must specify a specific condition using --condition or use --all to run all conditions.')
+        sys.exit()
+    run_conditions(args)

pyserini/2cr/msmarco_html_row_v1.template ADDED Viewed

	@@ -0,0 +1,81 @@

+<!-- Condition: ${condition_name} -->
+<tr class="accordion-toggle collapsed" id="row${row_cnt}" data-toggle="collapse" data-parent="#row${row_cnt}" href="#collapse${row_cnt}">
+<td class="expand-button"></td>
+<td style="min-width: 85px">$row</td>
+<td style="min-width: 400px">${condition_name}</td>
+<td>$s1</td>
+<td>$s2</td>
+<td>$s3</td>
+<td></td>
+<td>$s4</td>
+<td>$s5</td>
+<td>$s6</td>
+<td></td>
+<td>$s7</td>
+<td>$s8</td>
+</tr>
+<tr class="hide-table-padding">
+<td></td>
+<td colspan="11">
+<div id="collapse${row_cnt}" class="collapse in p-3">
+<!-- Tabs navs -->
+<ul class="nav nav-tabs mb-3" id="row${row_cnt}-tabs" role="tablist">
+  <li class="nav-item" role="presentation">
+    <a class="nav-link active" id="row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab1" role="tab" aria-controls="row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">TREC 2019</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab2" role="tab" aria-controls="row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">TREC 2020</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab3" role="tab" aria-controls="row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">dev</a>
+  </li>
+</ul>
+<!-- Tabs navs -->
+<!-- Tabs content -->
+<div class="tab-content" id="row${row_cnt}-content">
+  <div class="tab-pane fade show active" id="row${row_cnt}-tab1" role="tabpanel" aria-labelledby="row${row_cnt}-tab1-header">
+Command to generate run on TREC 2019 queries:
+  <blockquote class="mycode">
+<pre><code>$cmd1
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd1}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="row${row_cnt}-tab2" role="tabpanel" aria-labelledby="row${row_cnt}-tab2-header">
+    Command to generate run on TREC 2020 queries:
+  <blockquote class="mycode">
+<pre><code>$cmd2
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd2}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="row${row_cnt}-tab3" role="tabpanel" aria-labelledby="row${row_cnt}-tab3-header">
+    Command to generate run on dev queries:
+  <blockquote class="mycode">
+<pre><code>$cmd3
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd3}</code></pre>
+  </blockquote>
+  </div>
+</div>
+<!-- Tabs content -->
+</div></td>
+</tr>

pyserini/2cr/msmarco_html_row_v2.template ADDED Viewed

	@@ -0,0 +1,82 @@

+<!-- Condition: ${condition_name} -->
+<tr class="accordion-toggle collapsed" id="row${row_cnt}" data-toggle="collapse" data-parent="#row${row_cnt}" href="#collapse${row_cnt}">
+<td class="expand-button"></td>
+<td>$row</td>
+<td style="min-width: 400px">${condition_name}</td>
+<td>$s1</td>
+<td>$s2</td>
+<td>$s3</td>
+<td>$s4</td>
+<td>$s5</td>
+<td></td>
+<td>$s6</td>
+<td>$s7</td>
+<td></td>
+<td>$s8</td>
+<td>$s9</td>
+</tr>
+<tr class="hide-table-padding">
+<td></td>
+<td colspan="12">
+<div id="collapse${row_cnt}" class="collapse in p-3">
+<!-- Tabs navs -->
+<ul class="nav nav-tabs mb-3" id="row${row_cnt}-tabs" role="tablist">
+  <li class="nav-item" role="presentation">
+    <a class="nav-link active" id="row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab1" role="tab" aria-controls="row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">TREC 2021</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab2" role="tab" aria-controls="row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">dev</a>
+  </li>
+  <li class="nav-item" role="presentation">
+    <a class="nav-link" id="row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab3" role="tab" aria-controls="row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">dev2</a>
+  </li>
+</ul>
+<!-- Tabs navs -->
+<!-- Tabs content -->
+<div class="tab-content" id="row${row_cnt}-content">
+  <div class="tab-pane fade show active" id="row${row_cnt}-tab1" role="tabpanel" aria-labelledby="row${row_cnt}-tab1-header">
+Command to generate run on TREC 2021 queries:
+  <blockquote class="mycode">
+<pre><code>$cmd1
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd1}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="row${row_cnt}-tab2" role="tabpanel" aria-labelledby="row${row_cnt}-tab2-header">
+    Command to generate run on dev queries:
+  <blockquote class="mycode">
+<pre><code>$cmd2
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd2}</code></pre>
+  </blockquote>
+  </div>
+  <div class="tab-pane fade" id="row${row_cnt}-tab3" role="tabpanel" aria-labelledby="row${row_cnt}-tab3-header">
+    Command to generate run on dev2 queries:
+  <blockquote class="mycode">
+<pre><code>$cmd3
+</code></pre></blockquote>
+Evaluation commands:
+  <blockquote class="mycode">
+<pre><code>${eval_cmd3}</code></pre>
+  </blockquote>
+  </div>
+</div>
+<!-- Tabs content -->
+</div></td>
+</tr>

pyserini/2cr/msmarco_html_v1_doc.template ADDED Viewed

	@@ -0,0 +1,296 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+    <meta http-equiv="x-ua-compatible" content="ie=edge" />
+    <title>Pyserini Reproductions: MS MARCO V1 Document</title>
+    <!-- Font Awesome -->
+    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+    <!-- Google Fonts Roboto -->
+    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+    <!-- MDB -->
+   <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+    <style>
+tr.hide-table-padding td {
+  padding: 0;
+}
+.expand-button {
+  position: relative;
+}
+.accordion-toggle .expand-button:after {
+  position: absolute;
+  left:.75rem;
+  top: 50%;
+  transform: translate(0, -50%);
+  content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+  content: '+';
+}
+blockquote.mycode {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  margin-top: 15px;
+  padding-left: 15px;
+}
+blockquote.mycode2 {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  padding-top: 10px;
+  padding-bottom: 10px;
+  padding-left: 15px;
+}
+tr th.headertop {
+  border-bottom: none;
+  padding-bottom: 0rem
+}
+tr th.headerbottom {
+  padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+	border-radius: 0;
+	min-width: 55px;
+	background: none repeat scroll 0 0 transparent;
+	background-color: grey;
+	color: #F1F2F3 !important;
+	cursor: pointer;
+	border-style: none;
+	font-family: 'HELVETICA',sans-serif;
+	font-size: 0.8em;
+	font-weight: normal;
+	text-align: center;
+	text-decoration: none;
+	text-indent: 0;
+	text-transform: uppercase;
+	font-weight: 500;
+	line-height: 1.42rem;
+	margin: 0;
+	padding: 3px 8px;
+	position: absolute !important;
+	top: 0 !important;
+	right: 0 !important;
+}
+.copy-code-button > span {
+	color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+	box-sizing: inherit;
+}
+.copy-code-button::before {
+	content: '';
+	display: inline-block;
+	width: 16px;
+	height: 16px;
+	margin-right: 3px;
+	background-size: contain;
+	background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+	background-repeat: no-repeat;
+	position: relative;
+	top: 3px;
+}
+.copy-code-button:focus {
+    /* Avoid an ugly focus outline on click in Chrome,
+       but darken the button for accessibility.
+       See https://stackoverflow.com/a/25298082/1481479 */
+    /* background-color: #E6E6E6; */
+	outline: 0;
+}
+pre[class*="prettyprint"] {
+	position: relative;
+	overflow: hidden;
+}
+    </style>
+</head>
+<body>
+    <!-- Background image -->
+    <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+      <div class="mask" style="
+            background: linear-gradient(
+              45deg,
+              rgba(29, 236, 197, 0.7),
+              rgba(91, 14, 214, 0.7) 100%
+            );
+          ">
+        <div class="container d-flex align-items-center justify-content-center text-center h-100"  style="max-height: 150px">
+          <div class="text-white">
+            <h1 class="mb-3">$title</h1>
+          </div>
+        </div>
+      </div>
+    </div>
+    <!-- Background image -->
+<div class="container my-4">
+<p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in a number of papers, denoted by the references in square brackets.
+Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
+<div class="table-responsive">
+  <table class="table">
+    <thead>
+      <tr>
+        <th class="headertop"></th>
+        <th class="headertop"></th>
+        <th class="headertop"></th>
+        <th class="headertop" colspan="4"><b>TREC 2019</b></th>
+        <th class="headertop" colspan="4"><b>TREC 2020</b></th>
+        <th class="headertop" colspan="3"><b>dev</b></th>
+      </tr>
+      <tr>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"><br/>AP@100</th>
+        <th class="headerbottom" scope="col">nDCG@10</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"><br/>AP@100</th>
+        <th class="headerbottom" scope="col">nDCG@10</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col">RR@100</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+      </tr>
+    </thead>
+    <tbody>
+$rows
+    </tbody>
+  </table>
+</div>
+<ul style="list-style-type:none; padding-top: 25px">
+<li><p>[1] Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin.
+<a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
+<i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
+<p>&nbsp; &nbsp; &nbsp; &nbsp;Numbers in parentheses correspond to rows in Table 2 of the paper.</p></li>
+</ul>
+<div style="padding-top: 20px"></div>
+<h4>Programmatic Execution</h4>
+<p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+To list all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --list-conditions
+</tt></blockquote>
+<p>These conditions correspond to the table rows above.</p>
+<p>For all conditions, just show the commands in a "dry run":</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --all --display-commands --dry-run
+</tt></blockquote>
+<p>To actually run all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --all --display-commands
+</tt></blockquote>
+<p>With the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>To show the commands for a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --condition bm25-doc-default --display-commands --dry-run
+</tt></blockquote>
+<p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+<p>To actually run a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --condition bm25-doc-default --display-commands
+</tt></blockquote>
+<p>Again, with the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>Finally, to generate this page:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --generate-report --output msmarco-v1-doc.html
+</tt></blockquote>
+<p>The output file <tt>msmarco-v1-doc.html</tt> should be identical to this page.</p>
+<div style="padding-top: 50px"></div>
+      </div>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+    <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
+document.querySelectorAll('pre').forEach(function (codeBlock) {
+    var button = document.createElement('button');
+    button.className = 'copy-code-button';
+    button.type = 'button';
+    var s = codeBlock.innerText;
+    button.setAttribute('data-clipboard-text',s);
+    button.innerText = 'Copy';
+    // var pre = codeBlock.parentNode;
+    codeBlock.classList.add('prettyprint');
+    // pre.parentNode.insertBefore(button, pre);
+    codeBlock.appendChild(button);
+});
+var clipboard = new ClipboardJS('.copy-code-button');
+clipboard.on('success', function(e) {
+  console.info('Action:', e.action);
+  console.info('Text:', e.text);
+  console.info('Trigger:', e.trigger);
+  e.trigger.textContent = 'Copied';
+  window.setTimeout(function() {
+    e.trigger.textContent = 'Copy';
+  }, 2000);
+  e.clearSelection();
+});
+clipboard.on('error', function(e) {
+  console.error('Action:', e.action);
+  console.error('Trigger:', e.trigger);
+  e.trigger.textContent = 'Error Copying';
+  window.setTimeout(function() {
+    e.trigger.textContent = 'Copy';
+  }, 2000);
+  e.clearSelection();
+});
+</script>
+</body>
+</html>

pyserini/2cr/msmarco_html_v1_passage.template ADDED Viewed

	@@ -0,0 +1,325 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+    <meta http-equiv="x-ua-compatible" content="ie=edge" />
+    <title>Pyserini Reproductions: MS MARCO V1 Passage</title>
+    <!-- Font Awesome -->
+    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+    <!-- Google Fonts Roboto -->
+    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+    <!-- MDB -->
+   <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+    <style>
+tr.hide-table-padding td {
+  padding: 0;
+}
+.expand-button {
+  position: relative;
+}
+.accordion-toggle .expand-button:after {
+  position: absolute;
+  left:.75rem;
+  top: 50%;
+  transform: translate(0, -50%);
+  content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+  content: '+';
+}
+blockquote.mycode {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  margin-top: 15px;
+  padding-left: 15px;
+}
+blockquote.mycode2 {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  padding-top: 10px;
+  padding-bottom: 10px;
+  padding-left: 15px;
+}
+tr th.headertop {
+  border-bottom: none;
+  padding-bottom: 0rem
+}
+tr th.headerbottom {
+  padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+	border-radius: 0;
+	min-width: 55px;
+	background: none repeat scroll 0 0 transparent;
+	background-color: grey;
+	color: #F1F2F3 !important;
+	cursor: pointer;
+	border-style: none;
+	font-family: 'HELVETICA',sans-serif;
+	font-size: 0.8em;
+	font-weight: normal;
+	text-align: center;
+	text-decoration: none;
+	text-indent: 0;
+	text-transform: uppercase;
+	font-weight: 500;
+	line-height: 1.42rem;
+	margin: 0;
+	padding: 3px 8px;
+	position: absolute !important;
+	top: 0 !important;
+	right: 0 !important;
+}
+.copy-code-button > span {
+	color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+	box-sizing: inherit;
+}
+.copy-code-button::before {
+	content: '';
+	display: inline-block;
+	width: 16px;
+	height: 16px;
+	margin-right: 3px;
+	background-size: contain;
+	background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+	background-repeat: no-repeat;
+	position: relative;
+	top: 3px;
+}
+.copy-code-button:focus {
+    /* Avoid an ugly focus outline on click in Chrome,
+       but darken the button for accessibility.
+       See https://stackoverflow.com/a/25298082/1481479 */
+    /* background-color: #E6E6E6; */
+	outline: 0;
+}
+pre[class*="prettyprint"] {
+	position: relative;
+	overflow: hidden;
+}
+    </style>
+</head>
+<body>
+    <!-- Background image -->
+    <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+      <div class="mask" style="
+            background: linear-gradient(
+              45deg,
+              rgba(29, 236, 197, 0.7),
+              rgba(91, 14, 214, 0.7) 100%
+            );
+          ">
+        <div class="container d-flex align-items-center justify-content-center text-center h-100"  style="max-height: 150px">
+          <div class="text-white">
+            <h1 class="mb-3">$title</h1>
+          </div>
+        </div>
+      </div>
+    </div>
+    <!-- Background image -->
+<div class="container my-4">
+<p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in a number of papers, denoted by the references in square brackets.
+Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
+<div class="table-responsive">
+  <table class="table">
+    <thead>
+      <tr>
+        <th class="headertop"></th>
+        <th class="headertop"></th>
+        <th class="headertop"></th>
+        <th class="headertop" colspan="4"><b>TREC 2019</b></th>
+        <th class="headertop" colspan="4"><b>TREC 2020</b></th>
+        <th class="headertop" colspan="3"><b>dev</b></th>
+      </tr>
+      <tr>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"><br/>AP</th>
+        <th class="headerbottom" scope="col">nDCG@10</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"><br/>AP</th>
+        <th class="headerbottom" scope="col">nDCG@10</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col">RR@10</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+      </tr>
+    </thead>
+    <tbody>
+$rows
+    </tbody>
+  </table>
+</div>
+<ul style="list-style-type:none; padding-top: 25px">
+<li><p>[1] Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin.
+<a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
+<i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
+<p>&nbsp; &nbsp; &nbsp; &nbsp;Numbers in parentheses correspond to rows in Table 1 of the paper.</p></li>
+<li><p>[2] Thibault Formal, Carlos Lassance, Benjamin Piwowarski, and Stéphane Clinchant.
+<a href="https://arxiv.org/abs/2205.04733">From Distillation to Hard Negative Sampling: Making Sparse Neural IR Models More Effective.</a>
+<i>Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p></li>
+<li><p>[3] Lee Xiong, Chenyan Xiong, Ye Li, Kwok-Fung Tang, Jialin Liu, Paul N. Bennett, Junaid Ahmed, and Arnold Overwijk.
+<a href="https://openreview.net/forum?id=zeFrfgyZln">Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval.</a>
+<i>Proceedings of the 9th International Conference on Learning Representations (ICLR 2021)</i>, May 2021.</p></li>
+<li><p>[4] Sebastian Hofstätter, Sophia Althammer, Michael Schröder, Mete Sertkan, and Allan Hanbury.
+<a href="https://arxiv.org/abs/2010.02666">Improving Efficient Neural Ranking Models with Cross-Architecture Knowledge Distillation.</a>
+<i>arXiv:2010.02666</i>, October 2020.</p></li>
+<li><p>[5] Sebastian Hofstätter, Sheng-Chieh Lin, Jheng-Hong Yang, Jimmy Lin, and Allan Hanbury.
+<a href="https://dl.acm.org/doi/10.1145/3404835.3462891">Efficiently Teaching an Effective Dense Retriever with Balanced Topic Aware Sampling.</a>
+<i>Proceedings of the 44th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2021)</i>, pages 113-122, July 2021.</p></li>
+<li><p>[6] Sheng-Chieh Lin, Jheng-Hong Yang, and Jimmy Lin.
+<a href="https://aclanthology.org/2021.repl4nlp-1.17/">In-Batch Negatives for Knowledge Distillation with Tightly-Coupled Teachers for Dense Retrieval.</a>
+<i>Proceedings of the 6th Workshop on Representation Learning for NLP (RepL4NLP-2021)</i>, pages 163-173, August 2021.</p></li>
+<li><p>[7] Minghan Li, Sheng-Chieh Lin, Xueguang Ma, and Jimmy Lin.
+<a href="https://arxiv.org/abs/2302.06587">SLIM: Sparsified Late Interaction for Multi-Vector Retrieval with Inverted Indexes.</a>
+<i>arXiv:2302.06587</i>, February 2023.</p></li>
+<li><p>[8] Sheng-Chieh Lin, Minghan Li, and Jimmy Lin.
+<a href="https://arxiv.org/abs/2208.00511">Aggretriever: A Simple Approach to Aggregate Textual Representation for Robust Dense Passage Retrieval.</a>
+<i>arXiv:2208.00511</i>, July 2022.</p></li>
+</ul>
+<div style="padding-top: 20px"></div>
+<h4>Programmatic Execution</h4>
+<p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+To list all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --list-conditions
+</tt></blockquote>
+<p>These conditions correspond to the table rows above.</p>
+<p>For all conditions, just show the commands in a "dry run":</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --all --display-commands --dry-run
+</tt></blockquote>
+<p>To actually run all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --all --display-commands
+</tt></blockquote>
+<p>With the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>To show the commands for a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --condition bm25-default --display-commands --dry-run
+</tt></blockquote>
+<p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+<p>To actually run a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --condition bm25-default --display-commands
+</tt></blockquote>
+<p>Again, with the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>Finally, to generate this page:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --generate-report --output msmarco-v1-passage.html
+</tt></blockquote>
+<p>The output file <tt>msmarco-v1-passage.html</tt> should be identical to this page.</p>
+<div style="padding-top: 50px"></div>
+      </div>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+    <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
+document.querySelectorAll('pre').forEach(function (codeBlock) { // add a "Copy" button to every <pre> element present when this script runs
+    var button = document.createElement('button');
+    button.className = 'copy-code-button'; // same class the stylesheet rules and the ClipboardJS selector below target
+    button.type = 'button';
+    var s = codeBlock.innerText; // captured before the button is appended, so the 'Copy' label is not part of the copied text
+    button.setAttribute('data-clipboard-text',s); // clipboard.js reads the text to copy from this attribute (per its documentation)
+    button.innerText = 'Copy';
+    // var pre = codeBlock.parentNode;
+    codeBlock.classList.add('prettyprint'); // matches the pre[class*="prettyprint"] rule (position: relative; overflow: hidden)
+    // pre.parentNode.insertBefore(button, pre);
+    codeBlock.appendChild(button); // the button is placed inside the <pre> itself
+});
+var clipboard = new ClipboardJS('.copy-code-button'); // a single ClipboardJS instance bound to all copy buttons via the class selector
+clipboard.on('success', function(e) {
+  console.info('Action:', e.action);
+  console.info('Text:', e.text);
+  console.info('Trigger:', e.trigger);
+  e.trigger.textContent = 'Copied'; // e.trigger: the element that triggered the copy (the button); used for visual feedback
+  window.setTimeout(function() {
+    e.trigger.textContent = 'Copy'; // restore the label after 2 seconds
+  }, 2000);
+  e.clearSelection();
+});
+clipboard.on('error', function(e) {
+  console.error('Action:', e.action);
+  console.error('Trigger:', e.trigger);
+  e.trigger.textContent = 'Error Copying'; // surface the failure on the button itself
+  window.setTimeout(function() {
+    e.trigger.textContent = 'Copy'; // restore the label after 2 seconds
+  }, 2000);
+  e.clearSelection();
+});
+</script>
+</body>
+</html>

pyserini/2cr/msmarco_html_v2_doc.template ADDED Viewed

	@@ -0,0 +1,292 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+    <meta http-equiv="x-ua-compatible" content="ie=edge" />
+    <title>Pyserini Reproductions: MS MARCO V2 Document</title>
+    <!-- Font Awesome -->
+    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+    <!-- Google Fonts Roboto -->
+    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+    <!-- MDB -->
+   <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+    <style>
+tr.hide-table-padding td {
+  padding: 0;
+}
+.expand-button {
+  position: relative;
+}
+.accordion-toggle .expand-button:after {
+  position: absolute;
+  left:.75rem;
+  top: 50%;
+  transform: translate(0, -50%);
+  content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+  content: '+';
+}
+blockquote.mycode {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  margin-top: 15px;
+  padding-left: 15px;
+}
+blockquote.mycode2 {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  padding-top: 10px;
+  padding-bottom: 10px;
+  padding-left: 15px;
+}
+tr th.headertop {
+  border-bottom: none;
+  padding-bottom: 0rem
+}
+tr th.headerbottom {
+  padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+	border-radius: 0;
+	min-width: 55px;
+	background: none repeat scroll 0 0 transparent;
+	background-color: grey;
+	color: #F1F2F3 !important;
+	cursor: pointer;
+	border-style: none;
+	font-family: 'HELVETICA',sans-serif;
+	font-size: 0.8em;
+	font-weight: normal;
+	text-align: center;
+	text-decoration: none;
+	text-indent: 0;
+	text-transform: uppercase;
+	font-weight: 500;
+	line-height: 1.42rem;
+	margin: 0;
+	padding: 3px 8px;
+	position: absolute !important;
+	top: 0 !important;
+	right: 0 !important;
+}
+.copy-code-button > span {
+	color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+	box-sizing: inherit;
+}
+.copy-code-button::before {
+	content: '';
+	display: inline-block;
+	width: 16px;
+	height: 16px;
+	margin-right: 3px;
+	background-size: contain;
+	background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+	background-repeat: no-repeat;
+	position: relative;
+	top: 3px;
+}
+.copy-code-button:focus {
+    /* Avoid an ugly focus outline on click in Chrome,
+       but darken the button for accessibility.
+       See https://stackoverflow.com/a/25298082/1481479 */
+    /* background-color: #E6E6E6; */
+	outline: 0;
+}
+pre[class*="prettyprint"] {
+	position: relative;
+	overflow: hidden;
+}
+    </style>
+</head>
+<body>
+    <!-- Background image -->
+    <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+      <div class="mask" style="
+            background: linear-gradient(
+              45deg,
+              rgba(29, 236, 197, 0.7),
+              rgba(91, 14, 214, 0.7) 100%
+            );
+          ">
+        <div class="container d-flex align-items-center justify-content-center text-center h-100"  style="max-height: 150px">
+          <div class="text-white">
+            <h1 class="mb-3">$title</h1>
+          </div>
+        </div>
+      </div>
+    </div>
+    <!-- Background image -->
+<div class="container my-4">
+<p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in the following paper.
+Numbered rows correspond to tables in the paper; additional conditions are provided for comparison purposes.</p>
+<p class="note note-light">Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin. <a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
+<i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
+<p>Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
+<div class="table-responsive">
+  <table class="table">
+    <thead>
+      <tr>
+        <th class="headertop"></th>
+        <th class="headertop"></th>
+        <th class="headertop"></th>
+        <th class="headertop" colspan="6"><b>TREC 2021</b></th>
+        <th class="headertop" colspan="3"><b>dev</b></th>
+        <th class="headertop" colspan="3"><b>dev2</b></th>
+      </tr>
+      <tr>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"><br/>AP</th>
+        <th class="headerbottom" scope="col">nDCG@10</th>
+        <th class="headerbottom" scope="col">RR@100</th>
+        <th class="headerbottom" scope="col">R@100</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col">RR@100</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col">RR@100</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+      </tr>
+    </thead>
+    <tbody>
+$rows
+    </tbody>
+  </table>
+</div>
+<div style="padding-top: 20px"></div>
+<h4>Programmatic Execution</h4>
+<p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+To list all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --list-conditions
+</tt></blockquote>
+<p>These conditions correspond to the table rows above.</p>
+<p>For all conditions, just show the commands in a "dry run":</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --all --display-commands --dry-run
+</tt></blockquote>
+<p>To actually run all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --all --display-commands
+</tt></blockquote>
+<p>With the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>To show the commands for a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --condition bm25-doc-default --display-commands --dry-run
+</tt></blockquote>
+<p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+<p>To actually run a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --condition bm25-doc-default --display-commands
+</tt></blockquote>
+<p>Again, with the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>Finally, to generate this page:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --generate-report --output msmarco-v2-doc.html
+</tt></blockquote>
+<p>The output file <tt>msmarco-v2-doc.html</tt> should be identical to this page.</p>
+<div style="padding-top: 50px"></div>
+      </div>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+    <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
+document.querySelectorAll('pre').forEach(function (codeBlock) { // add a "Copy" button to every <pre> element present when this script runs
+    var button = document.createElement('button');
+    button.className = 'copy-code-button'; // same class the stylesheet rules and the ClipboardJS selector below target
+    button.type = 'button';
+    var s = codeBlock.innerText; // captured before the button is appended, so the 'Copy' label is not part of the copied text
+    button.setAttribute('data-clipboard-text',s); // clipboard.js reads the text to copy from this attribute (per its documentation)
+    button.innerText = 'Copy';
+    // var pre = codeBlock.parentNode;
+    codeBlock.classList.add('prettyprint'); // matches the pre[class*="prettyprint"] rule (position: relative; overflow: hidden)
+    // pre.parentNode.insertBefore(button, pre);
+    codeBlock.appendChild(button); // the button sits inside the <pre>; .copy-code-button pins it absolutely to the top right
+});
+var clipboard = new ClipboardJS('.copy-code-button'); // a single ClipboardJS instance bound to all copy buttons via the class selector
+clipboard.on('success', function(e) {
+  console.info('Action:', e.action);
+  console.info('Text:', e.text);
+  console.info('Trigger:', e.trigger);
+  e.trigger.textContent = 'Copied'; // e.trigger: the element that triggered the copy (the button); used for visual feedback
+  window.setTimeout(function() {
+    e.trigger.textContent = 'Copy'; // restore the label after 2 seconds
+  }, 2000);
+  e.clearSelection();
+});
+clipboard.on('error', function(e) {
+  console.error('Action:', e.action);
+  console.error('Trigger:', e.trigger);
+  e.trigger.textContent = 'Error Copying'; // surface the failure on the button itself
+  window.setTimeout(function() {
+    e.trigger.textContent = 'Copy'; // restore the label after 2 seconds
+  }, 2000);
+  e.clearSelection();
+});
+</script>
+</body>
+</html>

pyserini/2cr/msmarco_html_v2_passage.template ADDED Viewed

	@@ -0,0 +1,292 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+    <meta http-equiv="x-ua-compatible" content="ie=edge" />
+    <title>Pyserini Reproductions: MS MARCO V2 Passage</title>
+    <!-- Font Awesome -->
+    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+    <!-- Google Fonts Roboto -->
+    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+    <!-- MDB -->
+   <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+    <style>
+tr.hide-table-padding td {
+  padding: 0;
+}
+.expand-button {
+  position: relative;
+}
+.accordion-toggle .expand-button:after {
+  position: absolute;
+  left:.75rem;
+  top: 50%;
+  transform: translate(0, -50%);
+  content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+  content: '+';
+}
+blockquote.mycode {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  margin-top: 15px;
+  padding-left: 15px;
+}
+blockquote.mycode2 {
+  border-left: 3px solid #ccc;
+  margin-left: 25px;
+  padding-top: 10px;
+  padding-bottom: 10px;
+  padding-left: 15px;
+}
+tr th.headertop {
+  border-bottom: none;
+  padding-bottom: 0rem
+}
+tr th.headerbottom {
+  padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+	border-radius: 0;
+	min-width: 55px;
+	background: none repeat scroll 0 0 transparent;
+	background-color: grey;
+	color: #F1F2F3 !important;
+	cursor: pointer;
+	border-style: none;
+	font-family: 'HELVETICA',sans-serif;
+	font-size: 0.8em;
+	font-weight: normal;
+	text-align: center;
+	text-decoration: none;
+	text-indent: 0;
+	text-transform: uppercase;
+	font-weight: 500;
+	line-height: 1.42rem;
+	margin: 0;
+	padding: 3px 8px;
+	position: absolute !important;
+	top: 0 !important;
+	right: 0 !important;
+}
+.copy-code-button > span {
+	color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+	box-sizing: inherit;
+}
+.copy-code-button::before {
+	content: '';
+	display: inline-block;
+	width: 16px;
+	height: 16px;
+	margin-right: 3px;
+	background-size: contain;
+	background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+	background-repeat: no-repeat;
+	position: relative;
+	top: 3px;
+}
+.copy-code-button:focus {
+    /* Avoid an ugly focus outline on click in Chrome,
+       but darken the button for accessibility.
+       See https://stackoverflow.com/a/25298082/1481479 */
+    /* background-color: #E6E6E6; */
+	outline: 0;
+}
+pre[class*="prettyprint"] {
+	position: relative;
+	overflow: hidden;
+}
+    </style>
+</head>
+<body>
+    <!-- Background image -->
+    <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+      <div class="mask" style="
+            background: linear-gradient(
+              45deg,
+              rgba(29, 236, 197, 0.7),
+              rgba(91, 14, 214, 0.7) 100%
+            );
+          ">
+        <div class="container d-flex align-items-center justify-content-center text-center h-100"  style="max-height: 150px">
+          <div class="text-white">
+            <h1 class="mb-3">$title</h1>
+          </div>
+        </div>
+      </div>
+    </div>
+    <!-- Background image -->
+<div class="container my-4">
+<p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in the following paper.
+Numbered rows correspond to tables in the paper; additional conditions are provided for comparison purposes.</p>
+<p class="note note-light">Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin. <a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
+<i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
+<p>Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
+<div class="table-responsive">
+  <table class="table">
+    <thead>
+      <tr>
+        <th class="headertop"></th>
+        <th class="headertop"></th>
+        <th class="headertop"></th>
+        <th class="headertop" colspan="6"><b>TREC 2021</b></th>
+        <th class="headertop" colspan="3"><b>dev</b></th>
+        <th class="headertop" colspan="3"><b>dev2</b></th>
+      </tr>
+      <tr>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col"><br/>AP</th>
+        <th class="headerbottom" scope="col">nDCG@10</th>
+        <th class="headerbottom" scope="col">RR@100</th>
+        <th class="headerbottom" scope="col">R@100</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col">RR@100</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+        <th class="headerbottom" scope="col"></th>
+        <th class="headerbottom" scope="col">RR@100</th>
+        <th class="headerbottom" scope="col">R@1K</th>
+      </tr>
+    </thead>
+    <tbody>
+$rows
+    </tbody>
+  </table>
+</div>
+<div style="padding-top: 20px"></div>
+<h4>Programmatic Execution</h4>
+<p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+To list all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --list-conditions
+</tt></blockquote>
+<p>These conditions correspond to the table rows above.</p>
+<p>For all conditions, just show the commands in a "dry run":</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --all --display-commands --dry-run
+</tt></blockquote>
+<p>To actually run all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --all --display-commands
+</tt></blockquote>
+<p>With the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>To show the commands for a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --condition bm25-default --display-commands --dry-run
+</tt></blockquote>
+<p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+<p>To actually run a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --condition bm25-default --display-commands
+</tt></blockquote>
+<p>Again, with the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>Finally, to generate this page:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --generate-report --output msmarco-v2-passage.html
+</tt></blockquote>
+<p>The output file <tt>msmarco-v2-passage.html</tt> should be identical to this page.</p>
+<div style="padding-top: 50px"/>
+      </div>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+    <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
+// Give every <pre> element on the page its own "Copy" button.
+document.querySelectorAll('pre').forEach(function (codeBlock) {
+    var button = document.createElement('button');
+    button.className = 'copy-code-button';
+    button.type = 'button';
+    // Snapshot the block's text before the button is appended, so the button label is not part of it.
+    var s = codeBlock.innerText;
+    button.setAttribute('data-clipboard-text',s);
+    button.innerText = 'Copy';
+    // var pre = codeBlock.parentNode;
+    codeBlock.classList.add('prettyprint');
+    // pre.parentNode.insertBefore(button, pre);
+    codeBlock.appendChild(button);
+});
+// Bind ClipboardJS to all buttons created above (selected by class name).
+var clipboard = new ClipboardJS('.copy-code-button');
+// On success: log the event, show "Copied" on the button, then restore the label after 2 seconds.
+clipboard.on('success', function(e) {
+  console.info('Action:', e.action);
+  console.info('Text:', e.text);
+  console.info('Trigger:', e.trigger);
+  e.trigger.textContent = 'Copied';
+  window.setTimeout(function() {
+    e.trigger.textContent = 'Copy';
+  }, 2000);
+  e.clearSelection();
+});
+// On failure: log the event, show "Error Copying" on the button, then restore the label after 2 seconds.
+clipboard.on('error', function(e) {
+  console.error('Action:', e.action);
+  console.error('Trigger:', e.trigger);
+  e.trigger.textContent = 'Error Copying';
+  window.setTimeout(function() {
+    e.trigger.textContent = 'Copy';
+  }, 2000);
+  e.clearSelection();
+});
+</script>
+</body>
+</html>

pyserini/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

pyserini/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (165 Bytes). View file

pyserini/__pycache__/encoded_corpus_info.cpython-310.pyc ADDED Viewed

Binary file (971 Bytes). View file

pyserini/__pycache__/encoded_query_info.cpython-310.pyc ADDED Viewed

Binary file (15.2 kB). View file

pyserini/__pycache__/evaluate_script_info.cpython-310.pyc ADDED Viewed

Binary file (749 Bytes). View file

pyserini/__pycache__/prebuilt_index_info.cpython-310.pyc ADDED Viewed

Binary file (179 kB). View file

pyserini/__pycache__/pyclass.cpython-310.pyc ADDED Viewed

Binary file (736 Bytes). View file

pyserini/__pycache__/setup.cpython-310.pyc ADDED Viewed

Binary file (780 Bytes). View file

pyserini/__pycache__/util.cpython-310.pyc ADDED Viewed

Binary file (8.03 kB). View file

pyserini/analysis/__init__.py ADDED Viewed

	@@ -0,0 +1,19 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from ._base import get_lucene_analyzer, Analyzer, JAnalyzer, JAnalyzerUtils, JDefaultEnglishAnalyzer, JWhiteSpaceAnalyzer
+__all__ = ['get_lucene_analyzer', 'Analyzer', 'JAnalyzer', 'JAnalyzerUtils', 'JDefaultEnglishAnalyzer', 'JWhiteSpaceAnalyzer']

pyserini/analysis/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (361 Bytes). View file

pyserini/analysis/__pycache__/_base.cpython-310.pyc ADDED Viewed

Binary file (5 kB). View file

pyserini/analysis/_base.py ADDED Viewed

	@@ -0,0 +1,166 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from typing import List
+from ..pyclass import autoclass
+# Wrappers around Lucene classes
+JAnalyzer = autoclass('org.apache.lucene.analysis.Analyzer')
+JArabicAnalyzer = autoclass('org.apache.lucene.analysis.ar.ArabicAnalyzer')
+JBengaliAnalyzer = autoclass('org.apache.lucene.analysis.bn.BengaliAnalyzer')
+JCJKAnalyzer = autoclass('org.apache.lucene.analysis.cjk.CJKAnalyzer')
+JDanishAnalyzer = autoclass('org.apache.lucene.analysis.da.DanishAnalyzer')
+JDutchAnalyzer = autoclass('org.apache.lucene.analysis.nl.DutchAnalyzer')
+JFinnishAnalyzer = autoclass('org.apache.lucene.analysis.fi.FinnishAnalyzer')
+JFrenchAnalyzer = autoclass('org.apache.lucene.analysis.fr.FrenchAnalyzer')
+JGermanAnalyzer = autoclass('org.apache.lucene.analysis.de.GermanAnalyzer')
+JHindiAnalyzer = autoclass('org.apache.lucene.analysis.hi.HindiAnalyzer')
+JHungarianAnalyzer = autoclass('org.apache.lucene.analysis.hu.HungarianAnalyzer')
+JIndonesianAnalyzer = autoclass('org.apache.lucene.analysis.id.IndonesianAnalyzer')
+JItalianAnalyzer = autoclass('org.apache.lucene.analysis.it.ItalianAnalyzer')
+JJapaneseAnalyzer = autoclass('org.apache.lucene.analysis.ja.JapaneseAnalyzer')
+JNorwegianAnalyzer = autoclass('org.apache.lucene.analysis.no.NorwegianAnalyzer')
+JPortugueseAnalyzer = autoclass('org.apache.lucene.analysis.pt.PortugueseAnalyzer')
+JRussianAnalyzer = autoclass('org.apache.lucene.analysis.ru.RussianAnalyzer')
+JSpanishAnalyzer = autoclass('org.apache.lucene.analysis.es.SpanishAnalyzer')
+JSwedishAnalyzer = autoclass('org.apache.lucene.analysis.sv.SwedishAnalyzer')
+JTeluguAnalyzer = autoclass('org.apache.lucene.analysis.te.TeluguAnalyzer')
+JThaiAnalyzer = autoclass('org.apache.lucene.analysis.th.ThaiAnalyzer')
+JTurkishAnalyzer = autoclass('org.apache.lucene.analysis.tr.TurkishAnalyzer')
+JWhiteSpaceAnalyzer = autoclass('org.apache.lucene.analysis.core.WhitespaceAnalyzer')
+JCharArraySet = autoclass('org.apache.lucene.analysis.CharArraySet')
+# Wrappers around Anserini classes
+# (JDefaultEnglishAnalyzer is an io.anserini class, so it is bound once here rather than also in the Lucene list.)
+JAnalyzerUtils = autoclass('io.anserini.analysis.AnalyzerUtils')
+JDefaultEnglishAnalyzer = autoclass('io.anserini.analysis.DefaultEnglishAnalyzer')
+JTweetAnalyzer = autoclass('io.anserini.analysis.TweetAnalyzer')
+JHuggingFaceTokenizerAnalyzer = autoclass('io.anserini.analysis.HuggingFaceTokenizerAnalyzer')
+def get_lucene_analyzer(language: str='en', stemming: bool=True, stemmer: str='porter', stopwords: bool=True, huggingFaceTokenizer: str=None) -> JAnalyzer:
+    """Build the Lucene ``Analyzer`` that matches the requested settings.
+    Parameters
+    ----------
+    language : str
+        Analyzer selector, compared case-insensitively: a language code such as ``'fr'``, or one of the
+        special values ``'tweet'`` and ``'hgf_tokenizer'``.
+    stemming : bool
+        Whether to stem. Only consulted when ``language`` is ``'en'``.
+    stemmer : str
+        Stemmer name handed to the English analyzer. Only consulted when ``language`` is ``'en'``
+        and ``stemming`` is true.
+    stopwords : bool
+        Whether to filter stopwords. Only consulted when ``language`` is ``'en'``.
+    huggingFaceTokenizer: str
+        A huggingface model id or path to a tokenizer.json file. Only consulted when ``language`` is
+        ``'hgf_tokenizer'``.
+    Returns
+    -------
+    JAnalyzer
+        Java ``Analyzer`` with specified settings.
+    Raises
+    ------
+    ValueError
+        If ``language`` is not one of the recognised selectors.
+    """
+    code = language.lower()
+    # Selectors whose analyzer is constructed without any arguments.
+    no_arg_analyzers = {
+        'ar': JArabicAnalyzer,
+        'bn': JBengaliAnalyzer,
+        'zh': JCJKAnalyzer,
+        'ko': JCJKAnalyzer,
+        'da': JDanishAnalyzer,
+        'nl': JDutchAnalyzer,
+        'fi': JFinnishAnalyzer,
+        'fr': JFrenchAnalyzer,
+        'de': JGermanAnalyzer,
+        'hi': JHindiAnalyzer,
+        'hu': JHungarianAnalyzer,
+        'id': JIndonesianAnalyzer,
+        'it': JItalianAnalyzer,
+        'ja': JJapaneseAnalyzer,
+        'no': JNorwegianAnalyzer,
+        'pt': JPortugueseAnalyzer,
+        'ru': JRussianAnalyzer,
+        'es': JSpanishAnalyzer,
+        'te': JTeluguAnalyzer,
+        'th': JThaiAnalyzer,
+        'tr': JTurkishAnalyzer,
+        'tweet': JTweetAnalyzer,
+    }
+    if code in no_arg_analyzers:
+        return no_arg_analyzers[code]()
+    if code == 'hgf_tokenizer':
+        return JHuggingFaceTokenizerAnalyzer(huggingFaceTokenizer)
+    if code != 'en':
+        raise ValueError('Invalid configuration.')
+    # English: an empty stopword set switches stopword filtering off.
+    if stemming:
+        if stopwords:
+            return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer)
+        return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer, JCharArraySet.EMPTY_SET)
+    if stopwords:
+        return JDefaultEnglishAnalyzer.newNonStemmingInstance()
+    return JDefaultEnglishAnalyzer.newNonStemmingInstance(JCharArraySet.EMPTY_SET)
+class Analyzer:
+    """Thin Python front-end for running text through a Lucene ``Analyzer``.
+    Parameters
+    ----------
+    analyzer : JAnalyzer
+        Lucene ``Analyzer``.
+    Raises
+    ------
+    TypeError
+        If ``analyzer`` is not a ``JAnalyzer`` instance.
+    """
+    def __init__(self, analyzer):
+        if isinstance(analyzer, JAnalyzer):
+            self.analyzer = analyzer
+        else:
+            raise TypeError('Invalid JAnalyzer!')
+    def analyze(self, text: str) -> List[str]:
+        """Run the wrapped analyzer over a piece of text.
+        Parameters
+        ----------
+        text : str
+            Text to analyze.
+        Returns
+        -------
+        List[str]
+            Tokens produced by the analyzer, in the order they were returned.
+        """
+        java_tokens = JAnalyzerUtils.analyze(self.analyzer, text)
+        # Copy the Java array element by element into a plain Python list.
+        return [token for token in java_tokens.toArray()]

pyserini/collection/__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from ._base import Collection, FileSegment, SourceDocument
+from ._collection_support import Cord19Article
+__all__ = ['Collection', 'FileSegment', 'SourceDocument', 'Cord19Article']

pyserini/collection/_base.py ADDED Viewed

	@@ -0,0 +1,153 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import logging
+import re
+from enum import Enum
+from ..multithreading import Counters
+from ..pyclass import autoclass, cast, JPaths
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# Anserini Java classes, resolved through autoclass; used below for casting and isinstance checks.
+JFileSegment = autoclass('io.anserini.collection.FileSegment')
+JSourceDocument = autoclass('io.anserini.collection.SourceDocument')
+class JCollections(Enum):
+    # Each member name is a string accepted as ``collection_class`` by ``Collection``;
+    # the member value is the Anserini collection class that gets instantiated for it.
+    AclAnthology = autoclass('io.anserini.collection.AclAnthology')
+    CarCollection = autoclass('io.anserini.collection.CarCollection')
+    Cord19AbstractCollection = autoclass('io.anserini.collection.Cord19AbstractCollection')
+    ClueWeb09Collection = autoclass('io.anserini.collection.ClueWeb09Collection')
+    ClueWeb12Collection = autoclass('io.anserini.collection.ClueWeb12Collection')
+    HtmlCollection = autoclass('io.anserini.collection.HtmlCollection')
+    JsonCollection = autoclass('io.anserini.collection.JsonCollection')
+    NewYorkTimesCollection = autoclass('io.anserini.collection.NewYorkTimesCollection')
+    TrecCollection = autoclass('io.anserini.collection.TrecCollection')
+    TrecwebCollection = autoclass('io.anserini.collection.TrecwebCollection')
+    TweetCollection = autoclass('io.anserini.collection.TweetCollection')
+    WashingtonPostCollection = autoclass('io.anserini.collection.WashingtonPostCollection')
+    WikipediaCollection = autoclass('io.anserini.collection.WikipediaCollection')
+class Collection:
+    """
+    Iterable wrapper class for Anserini's DocumentCollection.
+    Iterating over a ``Collection`` yields one ``FileSegment`` wrapper per segment of the underlying
+    Java collection.
+    Parameters
+    ----------
+    collection_class : str
+        Name of collection class to instantiate (must be a member name of ``JCollections``)
+    collection_path : str
+        Path to directory containing collection
+    Raises
+    ------
+    ValueError
+        If the collection cannot be instantiated; the argument is ``collection_class`` and the
+        underlying error is attached as ``__cause__``.
+    """
+    def __init__(self, collection_class, collection_path):
+        # Error/skip tallies; FileSegment updates these when a segment is exhausted.
+        self.counters = Counters()
+        self.collection_class = collection_class
+        self.collection_path = JPaths.get(collection_path)
+        self.object = self._get_collection()
+        self.collection_iterator = self.object.iterator()
+    def _get_collection(self):
+        """Instantiate the Java collection named by ``self.collection_class`` on ``self.collection_path``."""
+        try:
+            return JCollections[self.collection_class].value(self.collection_path)
+        except Exception as e:
+            # Catch Exception rather than everything so KeyboardInterrupt/SystemExit still propagate,
+            # and chain the cause so an unknown name can be told apart from a Java-side failure.
+            raise ValueError(self.collection_class) from e
+    def __iter__(self):
+        return self
+    def __next__(self):
+        if self.collection_iterator.hasNext():
+            fs = self.collection_iterator.next()
+            return FileSegment(self, fs, fs.getSegmentPath())
+        else:
+            raise StopIteration
+class FileSegment:
+    """
+    Iterable wrapper class for Anserini's FileSegment.
+    Iterating yields ``SourceDocument`` wrappers. When the underlying segment reports no more documents,
+    its error status and skipped-document count are recorded on the parent collection's counters, the
+    segment is closed, and iteration stops.
+    Parameters
+    ----------
+    collection : Collection
+        Parent collection of the file segment
+    segment : JFileSegment
+        FileSegment object to create wrapper from
+    segment_path : str
+        Path to file backing the file segment
+    """
+    def __init__(self, collection, segment, segment_path):
+        self.collection = collection
+        try:
+            # Try the nested ``Segment`` class of the concrete collection class first.
+            self.object = cast(collection.object.getClass().getName() +
+                               '$Segment', segment)
+        except Exception:
+            # Best-effort fallback to the generic FileSegment type. Exception (not a bare except) is
+            # caught so that KeyboardInterrupt/SystemExit are not swallowed here.
+            logger.exception('Exception from casting FileSegment type...')
+            self.object = cast('io.anserini.collection.FileSegment', segment)
+        self.segment_iterator = self.object.iterator()
+        self.segment_path = segment_path
+        # Path relative to the collection root, with both kinds of path separator replaced by '-'.
+        self.segment_name = re.sub(r'\\|\/', '-', collection.collection_path.relativize(segment_path).toString())
+    def __iter__(self):
+        return self
+    def __next__(self):
+        # NOTE(review): this asks the Java object for iterator() on every call instead of reusing
+        # self.segment_iterator; presumably the Java iterators share state -- confirm before changing.
+        if self.object.iterator().hasNext():
+            d = self.object.iterator().next()
+            return SourceDocument(self, d)
+        else:
+            # log if iteration stopped by error
+            if self.object.getErrorStatus():
+                logger.error(self.segment_name + ': Error from segment iteration, stopping...')
+                self.collection.counters.errors.increment()
+            # stop iteration and log skipped documents
+            skipped = self.object.getSkippedCount()
+            if skipped > 0:
+                self.collection.counters.skips.increment(skipped)
+                logger.warning(self.segment_name + ': ' + str(skipped) + ' documents skipped')
+            self.object.close()
+            raise StopIteration
+class SourceDocument:
+    """
+    Python-side view of one Anserini SourceDocument.
+    The document's id, indexable flag, contents and raw form are read from the Java object once, at
+    construction time, and stored as plain attributes.
+    Parameters
+    ----------
+    segment : FileSegment
+        Parent segment of the source document
+    document : io.anserini.collection.SourceDocument
+        SourceDocument object to create wrapper from
+    Raises
+    ------
+    TypeError
+        If ``document`` is not a ``JSourceDocument`` instance.
+    """
+    def __init__(self, segment, document):
+        if not isinstance(document, JSourceDocument):
+            raise TypeError('Invalid JSourceDocument!')
+        self.segment, self.object = segment, document
+        # Pull the four accessors eagerly, in the same order as before.
+        self.id = document.id()
+        self.indexable = document.indexable()
+        self.contents = document.contents()
+        self.raw = document.raw()

pyserini/collection/_collection_support.py ADDED Viewed

	@@ -0,0 +1,78 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Implementations of support for specific collections.
+import json
+class Cord19Article:
+    """Wrapper class for a raw JSON article from AI2's COVID-19 Open Research Dataset (CORD-19).
+    Parameters
+    ----------
+    doc : str
+        A JSON string of a CORD-19 article.
+    Raises
+    ------
+    TypeError
+        If the parsed JSON contains neither a ``cord_uid`` nor a ``paper_id`` key.
+    """
+    def __init__(self, doc):
+        self.json = json.loads(doc)
+        # Performs some basic error checking, throws an exception if user tries to instantiate with something
+        # that isn't from CORD-19.
+        if 'cord_uid' in self.json:
+            self.full_text = False
+        elif 'paper_id' in self.json:
+            self.full_text = True
+        else:
+            raise TypeError('Not a CORD-19 article: expected a "cord_uid" or "paper_id" key.')
+    def is_full_text(self):
+        """Return the article's ``has_full_text`` field (raises ``KeyError`` if it is absent)."""
+        # NOTE: this reads the JSON field, not the ``self.full_text`` flag computed in __init__.
+        return self.json['has_full_text']
+    def cord_uid(self):
+        """Return the ``cord_uid`` field (raises ``KeyError`` if it is absent)."""
+        return self.json['cord_uid']
+    def bib_entries(self):
+        """Return the ``bib_entries`` field (raises ``KeyError`` if it is absent)."""
+        return self.json['bib_entries']
+    def title(self):
+        """Return the title, or ``''`` when any of the required keys is missing."""
+        try:
+            if self.is_full_text():
+                return self.json['metadata']['title']
+            else:
+                return self.json['csv_metadata']['title']
+        except KeyError:
+            return ''
+    def abstract(self):
+        """Return the abstract from the CSV metadata, or ``''`` when it is missing."""
+        try:
+            # For a full-text article, we can grab the abstract from two independent sources, the metadata or the
+            # actual full text. Here, we make the decision to use the metadata, even for full text.
+            return self.json['csv_metadata']['abstract']
+        except KeyError:
+            return ''
+    def metadata(self):
+        """Return the ``csv_metadata`` field (raises ``KeyError`` if it is absent)."""
+        return self.json['csv_metadata']
+    def body(self):
+        """Return the body paragraphs as a list of strings.
+        The list is empty when the article has no full text or when a required key is missing, so the
+        return type is a list on every path.
+        """
+        try:
+            if self.is_full_text():
+                return [entry['text'] for entry in self.json['body_text']]
+        except KeyError:
+            # Missing 'has_full_text', 'body_text' or 'text': treat as "no body".
+            pass
+        return []

pyserini/demo/acl.py ADDED Viewed

	@@ -0,0 +1,124 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+This script provides an interactive web interface demo for retrieval on the ACL dataset.
+It requires `flask` (`pip install flask~=2.2.0`).
+An example command looks like `python -m pyserini.demo.acl` that starts up a server on port 8080.
+The demo can be accessed via "http://localhost:8080" in a web browser.
+Additional arguments include:
+    --port [PORT] --hits [Number of hits]
+    --k1 [BM25 k1] --b [BM25 b] --device [cpu, cuda]
+"""
+import json
+import logging
+from argparse import ArgumentParser
+from functools import partial
+from typing import Callable, Optional, Tuple, Union
+from flask import Flask, render_template, request, flash, jsonify
+from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder
+# Configure logging for the whole process: timestamped records at INFO level and above.
+logging.basicConfig(
+    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    level=logging.INFO,
+)
+logger = logging.getLogger('acl-demo')
+VERSION = '1.0'
+# Type alias: a searcher is either a FAISS-backed or a Lucene-backed searcher.
+Searcher = Union[FaissSearcher, LuceneSearcher]
+def create_app(k: int, load_searcher_fn: Callable[[str], Tuple[Searcher, str]]):
+    """Build the Flask application for the ACL search demo.
+    Parameters
+    ----------
+    k : int
+        Number of hits requested from the searcher for each query.
+    load_searcher_fn : Callable[[str], Tuple[Searcher, str]]
+        Called once with the language code ``'en'``; returns the searcher and a display name for it.
+    Returns
+    -------
+    Flask
+        Application serving ``/`` (search page) and ``/search`` (results), both rendered from ``acl.html``.
+    """
+    import secrets
+    app = Flask(__name__)
+    # flash() stores its messages in the session, and Flask raises if no secret key is configured.
+    # A per-process random key is sufficient for this demo.
+    app.secret_key = secrets.token_hex(16)
+    lang = 'en'
+    searcher, retriever = load_searcher_fn(lang)
+    @app.route('/')
+    def index():
+        return render_template('acl.html', lang=lang, retriever=retriever)
+    @app.route('/search', methods=['GET', 'POST'])
+    def search():
+        # request.values covers both the query string (GET) and form data (POST); a missing 'q'
+        # is treated like an empty query instead of aborting with 400.
+        query = request.values.get('q', '')
+        if not query:
+            search_results = []
+            flash('Question is required')
+        else:
+            hits = searcher.search(query, k=k)
+            search_results = [
+                {
+                    'rank': rank,
+                    'docid': hit.docid,
+                    'doc': searcher.doc(hit.docid).contents(),
+                    'score': hit.score,
+                }
+                for rank, hit in enumerate(hits, start=1)
+            ]
+        return render_template(
+            'acl.html', search_results=search_results, query=query, lang=lang, retriever=retriever
+        )
+    return app
+def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> Tuple[Searcher, str]:
+    """Open the local ACL paragraph index and return it with a display name.
+    Parameters
+    ----------
+    language : str
+        Language code passed to ``searcher.set_language``.
+    k1 : Optional[float]
+        BM25 k1 parameter. Applied only when ``b`` is also given.
+    b : Optional[float]
+        BM25 b parameter. Applied only when ``k1`` is also given.
+    Returns
+    -------
+    Tuple[Searcher, str]
+        The searcher and the retriever name to show in the UI.
+    """
+    searcher = LuceneSearcher('indexes/lucene-index-acl-paragraph')
+    searcher.set_language(language)
+    # BM25 parameters are set only as a pair; if just one is supplied the searcher keeps its defaults.
+    if k1 is not None and b is not None:
+        searcher.set_bm25(k1, b)
+        retriever_name = f'BM25 (k1={k1}, b={b})'
+    else:
+        retriever_name = 'BM25'
+    return searcher, retriever_name
+def main():
+    """Parse command-line options, build the ACL demo app and serve it.
+    The server listens on all interfaces (``0.0.0.0``) on the chosen port. ``--device`` is accepted
+    but not used by this function.
+    """
+    arg_parser = ArgumentParser()
+    arg_parser.add_argument('--k1', type=float, help='BM25 k1 parameter.')
+    arg_parser.add_argument('--b', type=float, help='BM25 b parameter.')
+    arg_parser.add_argument('--hits', type=int, default=10, help='Number of hits returned by the retriever')
+    arg_parser.add_argument('--device', type=str, default='cpu', help='Device to run query encoder, cpu or [cuda:0, cuda:1, ...] (used only when index is based on FAISS)')
+    arg_parser.add_argument('--port', default=8080, type=int, help='Web server port')
+    options = arg_parser.parse_args()
+    # Bind the BM25 settings now so the app only has to supply the language.
+    searcher_loader = partial(_load_sparse_searcher, k1=options.k1, b=options.b)
+    demo_app = create_app(options.hits, searcher_loader)
+    demo_app.run(host='0.0.0.0', port=options.port)
+if __name__ == '__main__':
+    main()

pyserini/demo/dpr.py ADDED Viewed

	@@ -0,0 +1,105 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import cmd
+import json
+import random
+from pyserini.search.lucene import LuceneSearcher
+from pyserini.search.faiss import FaissSearcher, DprQueryEncoder
+from pyserini.search.hybrid import HybridSearcher
+from pyserini import search
+class DPRDemo(cmd.Cmd):
+    """Interactive command-line demo for sparse, dense and hybrid retrieval over the DPR Wikipedia index.
+    Any input that is not a recognised command is treated as a question and searched with the currently
+    selected retriever. Commands may be written with or without a leading ``/``.
+    """
+    # Everything below is created once, when the class body is executed.
+    nq_dev_topics = list(search.get_topics('dpr-nq-dev').values())
+    trivia_dev_topics = list(search.get_topics('dpr-trivia-dev').values())
+    ssearcher = LuceneSearcher.from_prebuilt_index('wikipedia-dpr')
+    # The sparse searcher is the default retriever.
+    searcher = ssearcher
+    encoder = DprQueryEncoder("facebook/dpr-question_encoder-multiset-base")
+    index = 'wikipedia-dpr-multi-bf'
+    dsearcher = FaissSearcher.from_prebuilt_index(
+        index,
+        encoder
+    )
+    hsearcher = HybridSearcher(dsearcher, ssearcher)
+    k = 10
+    prompt = '>>> '
+    def precmd(self, line):
+        """Strip one optional leading ``/`` so that ``/k 5`` and ``k 5`` are the same command."""
+        # startswith() is safe on an empty line, where indexing line[0] would raise IndexError.
+        if line.startswith('/'):
+            line = line[1:]
+        return line
+    def emptyline(self):
+        """Ignore empty input (cmd.Cmd would otherwise repeat the last command)."""
+        pass
+    def do_help(self, arg):
+        print(f'/help    : returns this message')
+        print(f'/k [NUM] : sets k (number of hits to return) to [NUM]')
+        print(f'/mode [MODE] : sets retriever type to [MODE] (one of sparse, dense, hybrid)')
+        print(f'/random [COLLECTION]: returns results for a random question from the dev subset [COLLECTION] (one of nq, trivia).')
+    def do_k(self, arg):
+        """Set the number of hits to return; non-integer input is reported and leaves k unchanged."""
+        try:
+            new_k = int(arg)
+        except ValueError:
+            print(f'"{arg}" is not a valid integer; k is unchanged.')
+            return
+        print(f'setting k = {new_k}')
+        self.k = new_k
+    def do_mode(self, arg):
+        """Select the retriever used for subsequent questions (sparse, dense or hybrid)."""
+        if arg == "sparse":
+            self.searcher = self.ssearcher
+        elif arg == "dense":
+            self.searcher = self.dsearcher
+        elif arg == "hybrid":
+            self.searcher = self.hsearcher
+        else:
+            print(
+                f'Mode "{arg}" is invalid. Mode should be one of [sparse, dense, hybrid].')
+            return
+        print(f'setting retriver = {arg}')
+    def do_random(self, arg):
+        """Pick a random question from the given dev subset (nq or trivia) and search for it."""
+        if arg == "nq":
+            topics = self.nq_dev_topics
+        elif arg == "trivia":
+            topics = self.trivia_dev_topics
+        else:
+            print(
+                f'Collection "{arg}" is invalid. Collection should be one of [nq, trivia].')
+            return
+        q = random.choice(topics)['title']
+        print(f'question: {q}')
+        self.default(q)
+    def do_EOF(self, line):
+        # Returning True tells cmd.Cmd to leave the command loop.
+        return True
+    def default(self, q):
+        """Search for ``q`` with the current retriever and print rank, score and contents of each hit."""
+        hits = self.searcher.search(q, self.k)
+        for rank, hit in enumerate(hits, start=1):
+            raw_doc = None
+            if isinstance(self.searcher, LuceneSearcher):
+                raw_doc = hit.raw
+            else:
+                doc = self.searcher.doc(hit.docid)
+                if doc:
+                    raw_doc = doc.raw()
+            if raw_doc is None:
+                # No stored document for this hit: report it instead of failing in json.loads(None).
+                print(f'{rank:2} {hit.score:.5f} [document {hit.docid} not found]')
+                continue
+            jsondoc = json.loads(raw_doc)
+            print(f'{rank:2} {hit.score:.5f} {jsondoc["contents"]}')
+if __name__ == '__main__':
+    DPRDemo().cmdloop()

pyserini/demo/miracl.py ADDED Viewed

	@@ -0,0 +1,149 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+This script provides an interactive web interface demo for retrieval on the MIRACL dataset.
+It requires `flask` (`pip install flask~=2.2.0`).
+An example command looks like `python -m pyserini.demo.miracl` that starts up a server on port 8080.
+The demo can be accessed via "http://localhost:8080" in a web browser.
+Additional arguments include:
+    --port [PORT] --hits [Number of hits] --index [BM25 or mdpr-tied-pft-msmarco]
+    --k1 [BM25 k1] --b [BM25 b] --device [cpu, cuda]
+"""
+import json
+import logging
+from argparse import ArgumentParser
+from functools import partial
+from typing import Callable, Optional, Tuple, Union
+from flask import Flask, render_template, request, flash, jsonify
+from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder
+# Configure logging for the whole process: timestamped records at INFO level and above.
+logging.basicConfig(
+    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    level=logging.INFO,
+)
+logger = logging.getLogger('miracl-demo')
+# Interpolated into the prebuilt index names by the loader functions below.
+VERSION = '1.0'
+# Language codes the demo accepts; the first entry is the language selected at start-up.
+LANGUAGES = ('ar', 'bn', 'en', 'es', 'fa', 'fi', 'fr', 'hi', 'id', 'ja', 'ko', 'ru', 'sw', 'te', 'th', 'zh')
+# Type alias: a searcher is either a FAISS-backed or a Lucene-backed searcher.
+Searcher = Union[FaissSearcher, LuceneSearcher]
+def create_app(k: int, load_searcher_fn: Callable[[str], Tuple[Searcher, str]]):
+    """Build the Flask application for the MIRACL search demo.
+    The current language, searcher and retriever name are shared by all requests and are replaced
+    whenever ``/lang`` is called with a supported language.
+    Parameters
+    ----------
+    k : int
+        Number of hits requested from the searcher for each query.
+    load_searcher_fn : Callable[[str], Tuple[Searcher, str]]
+        Given a language code, returns the searcher and a display name for it. Called once at start-up
+        with ``LANGUAGES[0]`` and again on every successful language change.
+    Returns
+    -------
+    Flask
+        Application serving ``/``, ``/search`` and ``/lang``.
+    """
+    import secrets
+    app = Flask(__name__)
+    # flash() stores its messages in the session, and Flask raises if no secret key is configured.
+    # A per-process random key is sufficient for this demo.
+    app.secret_key = secrets.token_hex(16)
+    lang = LANGUAGES[0]
+    searcher, retriever = load_searcher_fn(lang)
+    @app.route('/')
+    def index():
+        return render_template('miracl.html', lang=lang, retriever=retriever)
+    @app.route('/search', methods=['GET', 'POST'])
+    def search():
+        # request.values covers both the query string (GET) and form data (POST); a missing 'q'
+        # is treated like an empty query instead of aborting with 400.
+        query = request.values.get('q', '')
+        if not query:
+            search_results = []
+            flash('Question is required')
+        else:
+            hits = searcher.search(query, k=k)
+            search_results = []
+            for rank, hit in enumerate(hits, start=1):
+                doc = json.loads(searcher.doc(hit.docid).raw())
+                search_results.append({
+                    'rank': rank,
+                    'docid': hit.docid,
+                    'doc': doc['text'],
+                    'title': doc['title'],
+                    'score': hit.score,
+                })
+        return render_template(
+            'miracl.html', search_results=search_results, query=query, lang=lang, retriever=retriever
+        )
+    @app.route('/lang', methods=['GET'])
+    def change_language():
+        # nonlocal is required here because this view rebinds the shared state.
+        nonlocal lang, searcher, retriever
+        new_lang = request.args.get('new_lang', '', type=str)
+        if new_lang not in LANGUAGES:
+            # A Flask view must return a response; returning None would surface as a 500 error.
+            return jsonify(error=f'Unsupported language: {new_lang!r}'), 400
+        lang = new_lang
+        searcher, retriever = load_searcher_fn(lang)
+        return jsonify(lang=lang)
+    return app
+def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> Tuple[Searcher, str]:
+    """Load the prebuilt MIRACL Lucene index for a language and return it with a display name.
+    Parameters
+    ----------
+    language : str
+        Language code; used in the prebuilt index name and passed to ``searcher.set_language``.
+    k1 : Optional[float]
+        BM25 k1 parameter. Applied only when ``b`` is also given.
+    b : Optional[float]
+        BM25 b parameter. Applied only when ``k1`` is also given.
+    Returns
+    -------
+    Tuple[Searcher, str]
+        The searcher and the retriever name to show in the UI.
+    """
+    searcher = LuceneSearcher.from_prebuilt_index(f'miracl-v{VERSION}-{language}')
+    searcher.set_language(language)
+    # BM25 parameters are set only as a pair; if just one is supplied the searcher keeps its defaults.
+    if k1 is not None and b is not None:
+        searcher.set_bm25(k1, b)
+        retriever_name = f'BM25 (k1={k1}, b={b})'
+    else:
+        retriever_name = 'BM25'
+    return searcher, retriever_name
+def _load_faiss_searcher(language: str, device:  str) -> Tuple[Searcher, str]:
+    """Load the prebuilt MIRACL mDPR FAISS index for a language and return it with a display name.
+    Parameters
+    ----------
+    language : str
+        Language code; used in the prebuilt index name.
+    device : str
+        Device passed to the query encoder.
+    Returns
+    -------
+    Tuple[Searcher, str]
+        The searcher and the retriever name to show in the UI.
+    """
+    query_encoder = AutoQueryEncoder(encoder_dir='castorini/mdpr-tied-pft-msmarco', device=device)
+    searcher = FaissSearcher.from_prebuilt_index(
+        f'miracl-v{VERSION}-{language}-mdpr-tied-pft-msmarco', query_encoder
+    )
+    retriever_name = 'mDPR-pFT-MSMARCO'
+    return searcher, retriever_name
+def main():
+    """Parse command-line options, build the MIRACL demo app and serve it.
+    ``--index`` selects between the BM25 loader and the mDPR FAISS loader. The server listens on all
+    interfaces (``0.0.0.0``) on the chosen port.
+    """
+    arg_parser = ArgumentParser()
+    arg_parser.add_argument('--index', default='BM25', choices=('BM25', 'mdpr-tied-pft-msmarco'), help='Index type.')
+    arg_parser.add_argument('--k1', type=float, help='BM25 k1 parameter.')
+    arg_parser.add_argument('--b', type=float, help='BM25 b parameter.')
+    arg_parser.add_argument('--hits', type=int, default=10, help='Number of hits returned by the retriever')
+    arg_parser.add_argument('--device', type=str, default='cpu', help='Device to run query encoder, cpu or [cuda:0, cuda:1, ...] (used only when index is based on FAISS)')
+    arg_parser.add_argument('--port', default=8080, type=int, help='Web server port')
+    options = arg_parser.parse_args()
+    # Bind the retriever-specific settings now so the app only has to supply the language.
+    if options.index != 'mdpr-tied-pft-msmarco':
+        searcher_loader = partial(_load_sparse_searcher, k1=options.k1, b=options.b)
+    else:
+        searcher_loader = partial(_load_faiss_searcher, device=options.device)
+    demo_app = create_app(options.hits, searcher_loader)
+    demo_app.run(host='0.0.0.0', port=options.port)
+# Start the demo web server only when this module is executed as a script.
+if __name__ == '__main__':
+    main()

pyserini/demo/msmarco.py ADDED Viewed

	@@ -0,0 +1,118 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import cmd
+import json
+import os
+import random
+from pyserini.search.lucene import LuceneSearcher
+from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder, AnceQueryEncoder
+from pyserini.search.hybrid import HybridSearcher
+from pyserini import search
+class MsMarcoDemo(cmd.Cmd):
+    dev_topics = list(search.get_topics('msmarco-passage-dev-subset').values())
+    ssearcher = LuceneSearcher.from_prebuilt_index('msmarco-passage')
+    dsearcher = None
+    hsearcher = None
+    searcher = ssearcher
+    k = 10
+    prompt = '>>> '
+    # https://stackoverflow.com/questions/35213134/command-prefixes-in-python-cli-using-cmd-in-pythons-standard-library
+    def precmd(self, line):
+        if line[0] == '/':
+            line = line[1:]
+        return line
+    def do_help(self, arg):
+        print(f'/help    : returns this message')
+        print(f'/k [NUM] : sets k (number of hits to return) to [NUM]')
+        print(f'/model [MODEL] : sets encoder to use the model [MODEL] (one of tct, ance)')
+        print(f'/mode [MODE] : sets retriever type to [MODE] (one of sparse, dense, hybrid)')
+        print(f'/random : returns results for a random question from dev subset')
+    def do_k(self, arg):
+        print(f'setting k = {int(arg)}')
+        self.k = int(arg)
+    def do_mode(self, arg):
+        if arg == "sparse":
+            self.searcher = self.ssearcher
+        elif arg == "dense":
+            if self.dsearcher is None:
+                print(f'Specify model through /model before using dense retrieval.')
+                return
+            self.searcher = self.dsearcher
+        elif arg == "hybrid":
+            if self.hsearcher is None:
+                print(f'Specify model through /model before using hybrid retrieval.')
+                return
+            self.searcher = self.hsearcher
+        else:
+            print(
+                f'Mode "{arg}" is invalid. Mode should be one of [sparse, dense, hybrid].')
+            return
+        print(f'setting retriver = {arg}')
+    def do_model(self, arg):
+        if arg == "tct":
+            encoder = TctColBertQueryEncoder("castorini/tct_colbert-msmarco")
+            index = "msmarco-passage-tct_colbert-hnsw"
+        elif arg == "ance":
+            encoder = AnceQueryEncoder("castorini/ance-msmarco-passage")
+            index = "msmarco-passage-ance-bf"
+        else:
+            print(
+                f'Model "{arg}" is invalid. Model should be one of [tct, ance].')
+            return
+        self.dsearcher = FaissSearcher.from_prebuilt_index(
+            index,
+            encoder
+        )
+        self.hsearcher = HybridSearcher(self.dsearcher, self.ssearcher)
+        print(f'setting model = {arg}')
+    def do_random(self, arg):
+        q = random.choice(self.dev_topics)['title']
+        print(f'question: {q}')
+        self.default(q)
+    def do_EOF(self, line):
+        return True
+    def default(self, q):
+        hits = self.searcher.search(q, self.k)
+        for i in range(0, len(hits)):
+            raw_doc = None
+            if isinstance(self.searcher, LuceneSearcher):
+                raw_doc = hits[i].raw
+            else:
+                doc = self.searcher.doc(hits[i].docid)
+                if doc:
+                    raw_doc = doc.raw()
+            jsondoc = json.loads(raw_doc)
+            print(f'{i + 1:2} {hits[i].score:.5f} {jsondoc["contents"]}')
+# Enter the interactive prompt only when this module is executed as a script.
+if __name__ == '__main__':
+    MsMarcoDemo().cmdloop()

pyserini/demo/templates/acl.html ADDED Viewed

	@@ -0,0 +1,74 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta property="og:title" content="ACL 🌍🙌🌏">
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/css/bootstrap.min.css" rel="stylesheet"
+          integrity="sha384-Zenh87qX5JnK2Jl0vWa8Ck2rdkQ2Bzep5IDxbcnCeuOxjzrPF/et3URy9Bv1WTRi" crossorigin="anonymous">
+    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.9.1/font/bootstrap-icons.css">
+    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/js/bootstrap.bundle.min.js"
+            integrity="sha384-OERcA2EqjJCMA+/3y+gxIOqMEjwtxJY7qPCqsdltbNJuaOe923+mo//f6V8Qbsw3"
+            crossorigin="anonymous"></script>
+    <script src="https://cdn.jsdelivr.net/npm/jquery@3.6.1/dist/jquery.min.js"></script>
+    <script>
+        $SCRIPT_ROOT = {{ request.script_root|tojson }};
+    </script>
+    <title>ACL 🌍🙌🌏 Demo</title>
+</head>
+<body>
+<div style="display: flex; align-items: center; gap: 10px;">
+    <h2>ACL</h2>
+    <img src="https://aclanthology.org/images/acl-logo.svg" alt="acl logo" width="50px">
+    <h2>Demo</h2>
+</div>
+<br/>
+<div class="container text-center">
+    {% for message in get_flashed_messages() %}
+        <div class="alert">{{ message }}</div>
+    {% endfor %}
+    <form action="/search" method="post">
+        <div class="row-cols-3">
+            <div class="input-group mb-3">
+                <input type="text" class="form-control" placeholder="Enter a Question" aria-label="Question" name="q"
+                       aria-describedby="button-addon2" value="{{ query if query else '' }}">
+                <button class="btn btn-outline-secondary" type="submit" id="button-addon2"><i class="bi bi-search"></i>
+                </button>
+            </div>
+        </div>
+    </form>
+    {% if search_results %}
+        <div class="row">
+            <table class="table">
+                <thead>
+                <tr>
+                    <th scope="col">#</th>
+                    <th scope="col">Score</th>
+                    <th scope="col">Passage ID</th>
+                    <th scope="col">Content</th>
+                </tr>
+                </thead>
+                <tbody class="table-group-divider">
+                {% for res in search_results %}
+                    <tr class="{{ 'table-secondary' if res['rank'] % 2 else 'table-light' }}">
+                        <th scope="row">{{ res["rank"] }}</th>
+                        <td>{{ "%.2f"|format(res["score"]) }}</td>
+                        <td>{{ res["docid"] }}</td>
+                        <td style="word-wrap: break-word;min-width: 600px;max-width: 600px;"
+                            class="text-{{ 'end' if lang in ('ar', 'fa') else 'start' }}">
+                            <small>{{ res["doc"] }}</small>
+                        </td>
+                    </tr>
+                {% endfor %}
+                </tbody>
+            </table>
+        </div>
+    {% endif %}
+</div>
+</body>
+</html>

pyserini/demo/templates/assets/acl-logo.svg ADDED Viewed

pyserini/demo/templates/miracl.html ADDED Viewed

	@@ -0,0 +1,127 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta property="og:title" content="MIRACL 🌍🙌🌏">
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Zenh87qX5JnK2Jl0vWa8Ck2rdkQ2Bzep5IDxbcnCeuOxjzrPF/et3URy9Bv1WTRi" crossorigin="anonymous">
+    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.9.1/font/bootstrap-icons.css">
+    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/js/bootstrap.bundle.min.js" integrity="sha384-OERcA2EqjJCMA+/3y+gxIOqMEjwtxJY7qPCqsdltbNJuaOe923+mo//f6V8Qbsw3" crossorigin="anonymous"></script>
+    <script src="https://cdn.jsdelivr.net/npm/jquery@3.6.1/dist/jquery.min.js"></script>
+    <script>
+      $SCRIPT_ROOT = {{ request.script_root|tojson }};
+      $( document ).ready(function() {
+        $("#loading").hide();
+        $('#language').val("{{lang}}");
+      });
+      $(function() {
+        $('#language').on('change', function() {
+          $.getJSON($SCRIPT_ROOT + '/lang', {
+            new_lang: this.value,
+          }, function(data) {
+            $("#language").removeAttr('disabled');
+            $("#loading").hide();
+          });
+          $(this).attr('disabled','disabled');
+          $("#loading").show();
+          return false;
+        });
+      });
+    </script>
+    <title>MIRACL 🌍🙌🌏 Demo</title>
+</head>
+<body>
+    <h2>MIRACL 🌍🙌🌏 Demo</h2>
+    <h4>Multilingual Information Retrieval Across a Continuum of Languages</h4>
+    <br/>
+    <p class="lead">
+        <a href="http://miracl.ai/">MIRACL</a> is a multilingual dataset for ad hoc retrieval that consists of 18 different languages, collectively encompassing over three billion native speakers around the world.
+    </p>
+    <div class="row g-3 align-items-center">
+        <label class="col-auto" for="language">This demo running on the language</label>
+        <div class="col-auto">
+            <select class="form-select form-select-sm" aria-label=".form-select-sm" id="language">
+                <option value="ar">Arabic</option>
+                <option value="bn">Bengali</option>
+                <option value="en">English</option>
+                <option value="es">Spanish</option>
+                <option value="fa">Persian</option>
+                <option value="fi">Finnish</option>
+                <option value="fr">French</option>
+                <option value="hi">Hindi</option>
+                <option value="id">Indonesian</option>
+                <option value="ja">Japanese</option>
+                <option value="ko">Korean</option>
+                <option value="ru">Russian</option>
+                <option value="sw">Swahili</option>
+                <option value="te">Telugu</option>
+                <option value="th">Thai</option>
+                <option value="zh">Chinese</option>
+            </select>
+        </div>
+        <div class="col-auto">
+            <div class="spinner-border text-secondary" role="status" id="loading">
+                <span class="visually-hidden">Loading...</span>
+            </div>
+        </div>
+        <div class="col-auto">
+            <span>
+            retrieves passages using <em>{{retriever}}</em>.
+            </span>
+        </div>
+    </div>
+    <br/>
+    <div class="container text-center">
+        {% for message in get_flashed_messages() %}
+            <div class="alert">{{ message }}</div>
+        {% endfor %}
+        <form action="/search" method="post">
+            <div class="row-cols-3">
+                <div class="input-group mb-3">
+                    <input type="text" class="form-control" placeholder="Enter a Question" aria-label="Question" name="q" aria-describedby="button-addon2" value="{{query if query else ''}}">
+                    <button class="btn btn-outline-secondary" type="submit" id="button-addon2"><i class="bi bi-search"></i></button>
+                </div>
+            </div>
+        </form>
+        {% if search_results %}
+        <div class="row">
+            <table class="table">
+              <thead>
+                <tr>
+                  <th scope="col">#</th>
+                  <th scope="col">Score</th>
+                  <th scope="col">Passage ID</th>
+                  <th scope="col">Title</th>
+                  <th scope="col">Content</th>
+                </tr>
+              </thead>
+              <tbody class="table-group-divider">
+              {% for res in search_results %}
+                <tr class="{{'table-secondary' if res['rank'] % 2 else 'table-light'}}">
+                  <th scope="row">{{res["rank"]}}</th>
+                  <td>{{"%.2f"|format(res["score"])}}</td>
+                  <td>{{res["docid"]}}</td>
+                  <td>{{res["title"]}}</td>
+                    <td style="word-wrap: break-word;min-width: 600px;max-width: 600px;" class="text-{{'end' if lang in ('ar', 'fa') else 'start'}}">
+                        <small>{{res["doc"]}}</small>
+                    </td>
+                </tr>
+              {% endfor %}
+              </tbody>
+            </table>
+        </div>
+        {% endif %}
+    </div>
+</body>
+</html>

pyserini/dsearch.py ADDED Viewed

	@@ -0,0 +1,46 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Deprecated. The package `pyserini.dsearch` has been renamed `pyserini.search.faiss`. Stubs are retained here for
+redirection purposes to ensure that code in existing published papers remains functional (with warnings)."""
+import os
+import sys
+import pyserini.search.faiss
+from pyserini.search.faiss import TctColBertQueryEncoder
+__all__ = ['SimpleDenseSearcher', 'BinaryDenseSearcher', 'TctColBertQueryEncoder']
+class SimpleDenseSearcher(pyserini.search.faiss.FaissSearcher):
+    def __new__(cls, *args, **kwargs):
+        print('pyserini.dsearch.SimpleDenseSearcher class has been deprecated, '
+              'please use FaissSearcher from pyserini.search.faiss instead')
+        return super().__new__(cls)
+class BinaryDenseSearcher(pyserini.search.faiss.BinaryDenseSearcher):
+    def __new__(cls, *args, **kwargs):
+        print('pyserini.dsearch.BinaryDenseSearcher class has been deprecated, '
+              'please use BinaryDenseSearcher from pyserini.search.faiss instead')
+        return super().__new__(cls)
+if __name__ == "__main__":
+    print('WARNING: pyserini.dsearch is deprecated, please use pyserini.search.faiss instead!')
+    args = " ".join(sys.argv[1:])
+    os.system(f'python -m pyserini.search.faiss {args}')

pyserini/encode/__init__.py ADDED Viewed

	@@ -0,0 +1,28 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from ._base import DocumentEncoder, QueryEncoder, JsonlCollectionIterator,\
+    RepresentationWriter, FaissRepresentationWriter, JsonlRepresentationWriter, PcaEncoder
+from ._ance import AnceEncoder, AnceDocumentEncoder, AnceQueryEncoder
+from ._auto import AutoQueryEncoder, AutoDocumentEncoder
+from ._dpr import DprDocumentEncoder, DprQueryEncoder
+from ._tct_colbert import TctColBertDocumentEncoder, TctColBertQueryEncoder
+from ._aggretriever import AggretrieverDocumentEncoder, AggretrieverQueryEncoder
+from ._unicoil import UniCoilEncoder, UniCoilDocumentEncoder, UniCoilQueryEncoder
+from ._cached_data import CachedDataQueryEncoder
+from ._tok_freq import TokFreqQueryEncoder
+from ._splade import SpladeQueryEncoder
+from ._slim import SlimQueryEncoder

pyserini/encode/__main__.py ADDED Viewed

	@@ -0,0 +1,147 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import sys
+from pyserini.encode import JsonlRepresentationWriter, FaissRepresentationWriter, JsonlCollectionIterator
+from pyserini.encode import DprDocumentEncoder, TctColBertDocumentEncoder, AnceDocumentEncoder, AggretrieverDocumentEncoder, AutoDocumentEncoder
+from pyserini.encode import UniCoilDocumentEncoder
+# Maps an encoder-class keyword to the document encoder class to instantiate.
+# init_encoder looks a requested class up here directly; when no class is
+# requested it tests these keywords, in this order, against the encoder name.
+encoder_class_map = {
+    "dpr": DprDocumentEncoder,
+    "tct_colbert": TctColBertDocumentEncoder,
+    "aggretriever": AggretrieverDocumentEncoder,
+    "ance": AnceDocumentEncoder,
+    "sentence-transformers": AutoDocumentEncoder,
+    "unicoil": UniCoilDocumentEncoder,
+    "auto": AutoDocumentEncoder,
+}
+# Pooling strategies the script accepts for --pooling on an AutoDocumentEncoder.
+ALLOWED_POOLING_OPTS = ["cls","mean"]
+def init_encoder(encoder, encoder_class, device):
+    _encoder_class = encoder_class
+    # determine encoder_class
+    if encoder_class is not None:
+        encoder_class = encoder_class_map[encoder_class]
+    else:
+        # if any class keyword was matched in the given encoder name,
+        # use that encoder class
+        for class_keyword in encoder_class_map:
+            if class_keyword in encoder.lower():
+                encoder_class = encoder_class_map[class_keyword]
+                break
+        # if none of the class keyword was matched,
+        # use the AutoDocumentEncoder
+        if encoder_class is None:
+            encoder_class = AutoDocumentEncoder
+    # prepare arguments to encoder class
+    kwargs = dict(model_name=encoder, device=device)
+    if (_encoder_class == "sentence-transformers") or ("sentence-transformers" in encoder):
+        kwargs.update(dict(pooling='mean', l2_norm=True))
+    if (_encoder_class == "contriever") or ("contriever" in encoder):
+        kwargs.update(dict(pooling='mean', l2_norm=False))
+    return encoder_class(**kwargs)
+def parse_args(parser, commands):
+    # Divide argv by commands
+    split_argv = [[]]
+    for c in sys.argv[1:]:
+        if c in commands.choices:
+            split_argv.append([c])
+        else:
+            split_argv[-1].append(c)
+    # Initialize namespace
+    args = argparse.Namespace()
+    for c in commands.choices:
+        setattr(args, c, None)
+    # Parse each command
+    parser.parse_args(split_argv[0], namespace=args)  # Without command
+    for argv in split_argv[1:]:  # Commands
+        n = argparse.Namespace()
+        setattr(args, argv[0], n)
+        parser.parse_args(argv, namespace=n)
+    return args
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    commands = parser.add_subparsers(title='sub-commands')
+    input_parser = commands.add_parser('input')
+    input_parser.add_argument('--corpus', type=str,
+                              help='directory that contains corpus files to be encoded, in jsonl format.',
+                              required=True)
+    input_parser.add_argument('--fields', help='fields that contents in jsonl has (in order)',
+                              nargs='+', default=['text'], required=False)
+    input_parser.add_argument('--docid-field',
+                              help='name of document id field name. If you have a custom id with a name other than "id", "_id" or "docid", then use this argument',
+                              default=None, required=False)
+    input_parser.add_argument('--delimiter', help='delimiter for the fields', default='\n', required=False)
+    input_parser.add_argument('--shard-id', type=int, help='shard-id 0-based', default=0, required=False)
+    input_parser.add_argument('--shard-num', type=int, help='number of shards', default=1, required=False)
+    output_parser = commands.add_parser('output')
+    output_parser.add_argument('--embeddings', type=str, help='directory to store encoded corpus', required=True)
+    output_parser.add_argument('--to-faiss', action='store_true', default=False)
+    encoder_parser = commands.add_parser('encoder')
+    encoder_parser.add_argument('--encoder', type=str, help='encoder name or path', required=True)
+    encoder_parser.add_argument('--encoder-class', type=str, required=False, default=None,
+                                choices=["dpr", "bpr", "tct_colbert", "ance", "sentence-transformers", "auto"],
+                                help='which query encoder class to use. `default` would infer from the args.encoder')
+    encoder_parser.add_argument('--fields', help='fields to encode', nargs='+', default=['text'], required=False)
+    encoder_parser.add_argument('--batch-size', type=int, help='batch size', default=64, required=False)
+    encoder_parser.add_argument('--max-length', type=int, help='max length', default=256, required=False)
+    encoder_parser.add_argument('--dimension', type=int, help='dimension', default=768, required=False)
+    encoder_parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]',
+                                default='cuda:0', required=False)
+    encoder_parser.add_argument('--fp16', action='store_true', default=False)
+    encoder_parser.add_argument('--add-sep', action='store_true', default=False)
+    encoder_parser.add_argument('--pooling', type=str, default='cls', help='for auto classes, allow the ability to dictate pooling strategy', required=False)
+    args = parse_args(parser, commands)
+    delimiter = args.input.delimiter.replace("\\n", "\n")  # argparse would add \ prior to the passed '\n\n'
+    encoder = init_encoder(args.encoder.encoder, args.encoder.encoder_class, device=args.encoder.device)
+    if type(encoder).__name__ == "AutoDocumentEncoder":
+        if args.encoder.pooling in ALLOWED_POOLING_OPTS:
+            encoder.pooling = args.encoder.pooling
+        else:
+            raise ValueError(f"Only allowed to use pooling types {ALLOWED_POOLING_OPTS}. You entered {args.encoder.pooling}")
+    if args.output.to_faiss:
+        embedding_writer = FaissRepresentationWriter(args.output.embeddings, dimension=args.encoder.dimension)
+    else:
+        embedding_writer = JsonlRepresentationWriter(args.output.embeddings)
+    collection_iterator = JsonlCollectionIterator(args.input.corpus, args.input.fields, args.input.docid_field, delimiter)
+    with embedding_writer:
+        for batch_info in collection_iterator(args.encoder.batch_size, args.input.shard_id, args.input.shard_num):
+            kwargs = {
+                'texts': batch_info['text'],
+                'titles': batch_info['title'] if 'title' in args.encoder.fields else None,
+                'expands': batch_info['expand'] if 'expand' in args.encoder.fields else None,
+                'fp16': args.encoder.fp16,
+                'max_length': args.encoder.max_length,
+                'add_sep': args.encoder.add_sep,
+            }
+            embeddings = encoder.encode(**kwargs)
+            batch_info['vector'] = embeddings
+            embedding_writer.write(batch_info, args.input.fields)

pyserini/encode/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (1.15 kB). View file

pyserini/encode/__pycache__/_aggretriever.cpython-310.pyc ADDED Viewed

Binary file (6.24 kB). View file