diff --git a/pyserini/2cr/_base.py b/pyserini/2cr/_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..8225e9c0c3d4026fc19f048b27aafae37bc4b277
--- /dev/null
+++ b/pyserini/2cr/_base.py
@@ -0,0 +1,95 @@
+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import subprocess
+
+fail_str = '\033[91m[FAIL]\033[0m'
+ok_str = '[OK]'
+okish_str = '\033[94m[OKish]\033[0m'
+
+
+def run_command(cmd):
+    process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdout, stderr = process.communicate()
+    stdout = stdout.decode('utf-8')
+    stderr = stderr.decode('utf-8')
+
+    return stdout, stderr
+
+
+def run_eval_and_return_metric(metric, eval_key, defs, runfile):
+    eval_cmd = f'python -m pyserini.eval.trec_eval {defs} {eval_key} {runfile}'
+    eval_stdout, eval_stderr = run_command(eval_cmd)
+
+    for line in eval_stdout.split('\n'):
+        parts = line.split('\t')
+        if len(parts) == 3 and parts[1] == 'all':
+            return round(float(parts[2]), 4)
+
+    return 0.0
+
+
+def run_dpr_retrieval_eval_and_return_metric(defs, json_file):
+    """Generate DPR retrieval evaluation scores.
+
+    Args:
+        defs: top-k definitions (e.g., '--topk 5 20')
+        json_file: DPR retrieval JSON file
+
+    Returns:
+        topk: a dictionary of top-k accuracy scores, keyed by cutoff (e.g., 'Top5')
+    """
+    eval_cmd = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {json_file} {defs} '
+    eval_stdout, eval_stderr = run_command(eval_cmd)
+    topk = {}
+    for line in eval_stdout.split('\n'):
+        parts = line.split('\t')
+        if len(parts) == 2 and 'accuracy' in parts[1]:
+            topk.update({parts[0]: round(float(parts[1][10:]) * 100, 4)})
+    return topk
+
+
+def convert_trec_run_to_dpr_retrieval_json(topics, index, runfile, output):
+    """Convert a TREC runfile into a DPR retrieval JSON file.
+
+    Args:
+        topics: topics field
+        index: index field
+        runfile: input runfile
+        output: output JSON file
+
+    Returns:
+        The exit status of the conversion command.
+    """
+    cmd = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics {topics} --index {index} --input {runfile} --output {output}'
+    return os.system(cmd)
+
+
+def run_fusion(run_ls, output, k):
+    """Run the fusion command and return its status code.
+
+    Args:
+        run_ls: a list of runfile paths
+        output: output path
+        k: top-k value
+
+    Returns:
+        The exit status of the fusion command.
+    """
+    run_files = ' '.join(run_ls)
+    cmd = f'python -m pyserini.fusion --runs {run_files} --output {output} --k {k}'
+    return os.system(cmd)
diff --git a/pyserini/2cr/miracl.py b/pyserini/2cr/miracl.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3c0bb3223f62bfcc8649e553c0b11b05eaeee0b
--- /dev/null
+++ b/pyserini/2cr/miracl.py
@@ -0,0 +1,447 @@
+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import math +import os +import sys +import time +import subprocess +import pkg_resources +from collections import defaultdict, OrderedDict +from string import Template + +import yaml + +from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str + +languages = [ + ['ar', 'arabic'], + ['bn', 'bengali'], + ['en', 'english'], + ['es', 'spanish'], + ['fa', 'persian'], + ['fi', 'finnish'], + ['fr', 'french'], + ['hi', 'hindi'], + ['id', 'indonesian'], + ['ja', 'japanese'], + ['ko', 'korean'], + ['ru', 'russian'], + ['sw', 'swahili'], + ['te', 'telugu'], + ['th', 'thai'], + ['zh', 'chinese'], + ['de', 'german'], + ['yo', 'yoruba'] +] + +html_display = OrderedDict() +html_display['bm25'] = 'BM25' +html_display['mdpr-tied-pft-msmarco'] = 'mDPR (tied encoders), pre-FT w/ MS MARCO' +html_display['mdpr-tied-pft-msmarco-ft-all'] = 'mDPR (tied encoders), pre-FT w/ MS MARCO then FT w/ all Mr. TyDi' +html_display['bm25-mdpr-tied-pft-msmarco-hybrid'] = 'Hybrid of `bm25` and `mdpr-tied-pft-msmarco`' +html_display['mdpr-tied-pft-msmarco-ft-miracl'] = 'mDPR (tied encoders), pre-FT w/ MS MARCO then in-lang FT w/ MIRACL' +html_display['mcontriever-tied-pft-msmarco'] = 'mContriever (tied encoders), pre-FT w/ MS MARCO' + +models = list(html_display) + +trec_eval_metric_definitions = { + 'nDCG@10': '-c -M 100 -m ndcg_cut.10', + 'R@100': '-c -m recall.100', +} + + +def format_run_command(raw): + return raw.replace('--lang', '\\\n --lang') \ + .replace('--encoder', '\\\n --encoder') \ + .replace('--topics', '\\\n --topics') \ + .replace('--index', '\\\n --index') \ + .replace('--output ', '\\\n --output ') \ + .replace('--runs', '\\\n --runs ') \ + .replace('--batch ', '\\\n --batch ') \ + .replace('--threads 12', '--threads 12 \\\n ') + + +def format_eval_command(raw): + return raw.replace('-c ', '\\\n -c ') \ + .replace(raw.split()[-1], f'\\\n {raw.split()[-1]}') + + +def read_file(f): + fin = open(f, 'r') + text = fin.read() + fin.close() + + return text + + +def list_conditions(): + print('Conditions:\n-----------') + for condition, _ in html_display.items(): + print(condition) + print('\nLanguages\n---------') + for language in languages: + print(language[0]) + + +def generate_table_rows(table, row_template, commands, eval_commands, table_id, split, metric): + row_cnt = 1 + html_rows = [] + + for model in models: + s = Template(row_template) + + keys = {} + used_langs = 0 + for lang in languages: + keys[lang[0]] = f'{model}.{lang[0]}' + used_langs += 1 if table[keys[lang[0]]][split][metric] != 0 else 0 + + sum = table[keys["ar"]][split][metric] + \ + table[keys["bn"]][split][metric] + \ + table[keys["en"]][split][metric] + \ + table[keys["es"]][split][metric] + \ + table[keys["fa"]][split][metric] + \ + table[keys["fi"]][split][metric] + \ + table[keys["fr"]][split][metric] + \ + table[keys["hi"]][split][metric] + \ + table[keys["id"]][split][metric] + \ + table[keys["ja"]][split][metric] + \ + table[keys["ko"]][split][metric] + \ + table[keys["ru"]][split][metric] + \ + table[keys["sw"]][split][metric] + \ + table[keys["te"]][split][metric] + \ + 
table[keys["th"]][split][metric] + \ + table[keys["zh"]][split][metric] + \ + table[keys["de"]][split][metric] + \ + table[keys["yo"]][split][metric] + avg = sum / used_langs + + s = s.substitute(table_cnt=table_id, + row_cnt=row_cnt, + model=html_display[model], + ar=f'{table[keys["ar"]][split][metric]:.3f}', + bn=f'{table[keys["bn"]][split][metric]:.3f}', + en=f'{table[keys["en"]][split][metric]:.3f}', + es=f'{table[keys["es"]][split][metric]:.3f}', + fa=f'{table[keys["fa"]][split][metric]:.3f}', + fi=f'{table[keys["fi"]][split][metric]:.3f}', + fr=f'{table[keys["fr"]][split][metric]:.3f}', + hi=f'{table[keys["hi"]][split][metric]:.3f}', + id=f'{table[keys["id"]][split][metric]:.3f}', + ja=f'{table[keys["ja"]][split][metric]:.3f}', + ko=f'{table[keys["ko"]][split][metric]:.3f}', + ru=f'{table[keys["ru"]][split][metric]:.3f}', + sw=f'{table[keys["sw"]][split][metric]:.3f}', + te=f'{table[keys["te"]][split][metric]:.3f}', + th=f'{table[keys["th"]][split][metric]:.3f}', + zh=f'{table[keys["zh"]][split][metric]:.3f}', + de=f'{table[keys["de"]][split][metric]:.3f}', + yo=f'{table[keys["yo"]][split][metric]:.3f}', + avg=f'{avg:.3f}', + cmd1=f'{commands[keys["ar"]]}', + cmd2=f'{commands[keys["bn"]]}', + cmd3=f'{commands[keys["en"]]}', + cmd4=f'{commands[keys["es"]]}', + cmd5=f'{commands[keys["fa"]]}', + cmd6=f'{commands[keys["fi"]]}', + cmd7=f'{commands[keys["fr"]]}', + cmd8=f'{commands[keys["hi"]]}', + cmd9=f'{commands[keys["id"]]}', + cmd10=f'{commands[keys["ja"]]}', + cmd11=f'{commands[keys["ko"]]}', + cmd12=f'{commands[keys["ru"]]}', + cmd13=f'{commands[keys["sw"]]}', + cmd14=f'{commands[keys["te"]]}', + cmd15=f'{commands[keys["th"]]}', + cmd16=f'{commands[keys["zh"]]}', + cmd17=f'{commands[keys["de"]]}', + cmd18=f'{commands[keys["yo"]]}', + eval_cmd1=f'{eval_commands[keys["ar"]][metric]}', + eval_cmd2=f'{eval_commands[keys["bn"]][metric]}', + eval_cmd3=f'{eval_commands[keys["en"]][metric]}', + eval_cmd4=f'{eval_commands[keys["es"]][metric]}', + eval_cmd5=f'{eval_commands[keys["fa"]][metric]}', + eval_cmd6=f'{eval_commands[keys["fi"]][metric]}', + eval_cmd7=f'{eval_commands[keys["fr"]][metric]}', + eval_cmd8=f'{eval_commands[keys["hi"]][metric]}', + eval_cmd9=f'{eval_commands[keys["id"]][metric]}', + eval_cmd10=f'{eval_commands[keys["ja"]][metric]}', + eval_cmd11=f'{eval_commands[keys["ko"]][metric]}', + eval_cmd12=f'{eval_commands[keys["ru"]][metric]}', + eval_cmd13=f'{eval_commands[keys["sw"]][metric]}', + eval_cmd14=f'{eval_commands[keys["te"]][metric]}', + eval_cmd15=f'{eval_commands[keys["th"]][metric]}', + eval_cmd16=f'{eval_commands[keys["zh"]][metric]}', + eval_cmd17=f'{eval_commands[keys["de"]][metric]}', + eval_cmd18=f'{eval_commands[keys["yo"]][metric]}' + ) + + s = s.replace("0.000", "--") + html_rows.append(s) + row_cnt += 1 + + return html_rows + + +def print_results(table, metric, split): + print(f'Metric = {metric}, Split = {split}') + print(' ' * 35, end='') + for lang in languages: + print(f'{lang[0]:3} ', end='') + print('') + for model in models: + print(f'{model:33}', end='') + for lang in languages: + key = f'{model}.{lang[0]}' + print(f'{table[key][split][metric]:7.3f}', end='') + print('') + print('') + + +def extract_topic_fn_from_cmd(cmd): + cmd = cmd.split() + topic_idx = cmd.index('--topics') + return cmd[topic_idx + 1] + + +def generate_report(args): + table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) + commands = defaultdict(lambda: '') + eval_commands = defaultdict(lambda: defaultdict(lambda: '')) + + html_template = 
read_file(pkg_resources.resource_filename(__name__, 'miracl_html.template'))
+    table_template = read_file(pkg_resources.resource_filename(__name__, 'miracl_html_table.template'))
+    row_template = read_file(pkg_resources.resource_filename(__name__, 'miracl_html_table_row.template'))
+
+    with open(pkg_resources.resource_filename(__name__, 'miracl.yaml')) as f:
+        yaml_data = yaml.safe_load(f)
+        for condition in yaml_data['conditions']:
+            name = condition['name']
+            eval_key = condition['eval_key']
+            cmd_template = condition['command']
+            cmd_lst = cmd_template.split()
+            lang = name.split('.')[-1]
+            is_hybrid_run = 'hybrid' in name
+
+            for splits in condition['splits']:
+                split = splits['split']
+                if is_hybrid_run:
+                    hits = int(cmd_lst[cmd_lst.index('--k') + 1])
+                else:
+                    hits = int(cmd_lst[cmd_lst.index('--hits') + 1])
+
+                runfile = os.path.join(args.directory, f'run.miracl.{name}.{split}.txt')
+                if is_hybrid_run:
+                    bm25_output = os.path.join(args.directory,
+                                               f'run.miracl.bm25.{lang}.{split}.top{hits}.txt')
+                    mdpr_output = os.path.join(args.directory,
+                                               f'run.miracl.mdpr-tied-pft-msmarco.{lang}.{split}.top{hits}.txt')
+                    expected_args = dict(output=runfile, bm25_output=bm25_output, mdpr_output=mdpr_output)
+                else:
+                    expected_args = dict(split=split, output=runfile)
+
+                if not all([f"${k}" in cmd_template or f"${{{k}}}" in cmd_template for k in expected_args]):
+                    raise ValueError(f"Not all arguments {list(expected_args)} detected in the command template: {cmd_template}.")
+                cmd = Template(cmd_template).substitute(**expected_args)
+                commands[name] = format_run_command(cmd)
+
+                for expected in splits['scores']:
+                    for metric in expected:
+                        if str(expected[metric])[-1] == "5":
+                            # without adding an epsilon, there is a chance that the f-string rounds a trailing 5 down rather than up
+                            # e.g., 0.8885 -> 0.888 rather than 0.889
+                            # add an epsilon to the expected score to avoid this rounding error
+                            expected[metric] += 1e-5
+                        table[name][split][metric] = expected[metric]
+
+                        eval_cmd = f'python -m pyserini.eval.trec_eval ' + \
+                                   f'{trec_eval_metric_definitions[metric]} {eval_key}-{split} {runfile}'
+                        eval_commands[name][metric] = format_eval_command(eval_cmd)
+
+    tables_html = []
+
+    split = 'dev'
+
+    # Build the table for nDCG@10, dev queries
+    html_rows = generate_table_rows(table, row_template, commands, eval_commands, 1, split, 'nDCG@10')
+    all_rows = '\n'.join(html_rows)
+    tables_html.append(Template(table_template).substitute(desc=f'nDCG@10, {split} queries', rows=all_rows))
+
+    # Build the table for Recall@100, dev queries
+    html_rows = generate_table_rows(table, row_template, commands, eval_commands, 2, split, 'R@100')
+    all_rows = '\n'.join(html_rows)
+    tables_html.append(Template(table_template).substitute(desc=f'Recall@100, {split} queries', rows=all_rows))
+
+    with open(args.output, 'w') as out:
+        out.write(Template(html_template).substitute(title='MIRACL', tables=' '.join(tables_html)))
+
+
+def run_conditions(args):
+    if args.condition == 'mdpr-tied-pft-msmarco-ft-miracl' and args.language in ['de', 'yo']:
+        print('MIRACL de and yo datasets do not have train splits to finetune with')
+        return
+
+    start = time.time()
+
+    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+
+    with open(pkg_resources.resource_filename(__name__, 'miracl.yaml')) as f:
+        yaml_data = yaml.safe_load(f)
+        for condition in yaml_data['conditions']:
+            name = condition['name']
+            encoder = name.split('.')[0]
+            lang = name.split('.')[-1]
+            if args.all:
+                pass
+            elif args.condition != encoder:
+                continue
+            elif args.language and args.language != lang:
+                continue
+            eval_key = condition['eval_key']
+            cmd_template = condition['command']
+            cmd_lst = cmd_template.split()
+
+            print(f'condition {name}:')
+            is_hybrid_run = 'hybrid' in name
+
+            for splits in condition['splits']:
+                split = splits['split']
+                if is_hybrid_run:
+                    hits = int(cmd_lst[cmd_lst.index('--k') + 1])
+                else:
+                    hits = int(cmd_lst[cmd_lst.index('--hits') + 1])
+
+                print(f' - split: {split}')
+
+                runfile = os.path.join(args.directory, f'run.miracl.{name}.{split}.top{hits}.txt')
+                if is_hybrid_run:
+                    bm25_output = os.path.join(args.directory,
+                                               f'run.miracl.bm25.{lang}.{split}.top{hits}.txt')
+                    mdpr_output = os.path.join(args.directory,
+                                               f'run.miracl.mdpr-tied-pft-msmarco.{lang}.{split}.top{hits}.txt')
+                    if not os.path.exists(bm25_output):
+                        print(f'Missing BM25 file: {bm25_output}')
+                        continue
+                    if not os.path.exists(mdpr_output):
+                        print(f'Missing mDPR file: {mdpr_output}')
+                        continue
+                    cmd = Template(cmd_template).substitute(split=split, output=runfile, bm25_output=bm25_output,
+                                                            mdpr_output=mdpr_output)
+                else:
+                    cmd = Template(cmd_template).substitute(split=split, output=runfile)
+
+                # In the yaml file, the topics are written as something like '--topics miracl-v1.0-ar-${split}'
+                # This works for the dev split because the topics are directly included in Anserini/Pyserini.
+                # For the train split, we have to map the symbol to a file in tools/topics-and-qrels/
+                # Here, we assume that the developer has cloned the miracl repo and placed the topics there.
+                if split == 'train':
+                    cmd = cmd.replace(f'--topics miracl-v1.0-{lang}-{split}',
+                                      f'--topics tools/topics-and-qrels/topics.miracl-v1.0-{lang}-{split}.tsv')
+
+                if args.display_commands:
+                    print(f'\n```bash\n{format_run_command(cmd)}\n```\n')
+
+                if not os.path.exists(runfile):
+                    if not args.dry_run:
+                        rtn = subprocess.run(cmd.split(), capture_output=True)
+                        stderr = rtn.stderr.decode()
+                        if '--topics' in cmd:
+                            topic_fn = extract_topic_fn_from_cmd(cmd)
+                            if f'ValueError: Topic {topic_fn} Not Found' in stderr:
+                                print(f'Skipping {topic_fn}: file not found.')
+                                continue
+
+                for expected in splits['scores']:
+                    for metric in expected:
+                        if not args.skip_eval:
+                            # We have to translate the training qrels into a file located in tools/topics-and-qrels/
+                            # because they are not included with Anserini/Pyserini by default.
+                            # Here, we assume that the developer has cloned the miracl repo and placed the qrels there.
+ if split == 'train': + qrels = f'tools/topics-and-qrels/qrels.{eval_key}-train.tsv' + else: + qrels = f'{eval_key}-{split}' + score = float(run_eval_and_return_metric(metric, qrels, + trec_eval_metric_definitions[metric], runfile)) + if math.isclose(score, float(expected[metric])): + result_str = ok_str + # Flaky tests + elif (name == 'mdpr-tied-pft-msmarco.hi' and split == 'train' + and math.isclose(score, float(expected[metric]), abs_tol=2e-4)) or \ + (name == 'mdpr-tied-pft-msmarco-ft-all.ru' + and split == 'dev' and metric == 'nDCG@10' + and math.isclose(score, float(expected[metric]), abs_tol=2e-4)) or \ + (name == 'bm25-mdpr-tied-pft-msmarco-hybrid.te' + and split == 'train' and metric == 'nDCG@10' + and math.isclose(score, float(expected[metric]), abs_tol=2e-4)) or \ + (name == 'bm25-mdpr-tied-pft-msmarco-hybrid.zh' + and split == 'dev' and metric == 'nDCG@10' + and math.isclose(score, float(expected[metric]), abs_tol=2e-4)): + result_str = okish_str + else: + result_str = fail_str + f' expected {expected[metric]:.4f}' + print(f' {metric:7}: {score:.4f} {result_str}') + table[name][split][metric] = score + else: + table[name][split][metric] = expected[metric] + + print('') + + for metric in ['nDCG@10', 'R@100']: + for split in ['dev', 'train']: + print_results(table, metric, split) + + end = time.time() + print(f'Total elapsed time: {end - start:.0f}s') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Generate regression matrix for MIRACL.') + parser.add_argument('--condition', type=str, + help='Condition to run', required=False) + # To list all conditions + parser.add_argument('--list-conditions', action='store_true', default=False, help='List available conditions.') + # For generating reports + parser.add_argument('--generate-report', action='store_true', default=False, help='Generate report.') + parser.add_argument('--output', type=str, help='File to store report.', required=False) + # For actually running the experimental conditions + parser.add_argument('--all', action='store_true', default=False, help='Run using all languages.') + parser.add_argument('--language', type=str, help='Language to run.', required=False) + parser.add_argument('--directory', type=str, help='Base directory.', default='', required=False) + parser.add_argument('--dry-run', action='store_true', default=False, help='Print out commands but do not execute.') + parser.add_argument('--skip-eval', action='store_true', default=False, help='Skip running trec_eval.') + parser.add_argument('--display-commands', action='store_true', default=False, help='Display command.') + args = parser.parse_args() + + if args.list_conditions: + list_conditions() + sys.exit() + + if args.generate_report: + if not args.output: + print(f'Must specify report filename with --output.') + sys.exit() + + generate_report(args) + sys.exit() + + if args.all and (args.condition or args.language): + print('Specifying --all will run all conditions and languages') + sys.exit() + + run_conditions(args) diff --git a/pyserini/2cr/miracl.yaml b/pyserini/2cr/miracl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9659442317dada43eb6ec3a20fd2a862b840e09e --- /dev/null +++ b/pyserini/2cr/miracl.yaml @@ -0,0 +1,1180 @@ +conditions: + # BM25 + - name: bm25.ar + eval_key: miracl-v1.0-ar + command: python -m pyserini.search.lucene --language ar --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + 
- nDCG@10: 0.4434 + R@100: 0.8562 + - split: dev + scores: + - nDCG@10: 0.4809 + R@100: 0.8885 + - name: bm25.bn + eval_key: miracl-v1.0-bn + command: python -m pyserini.search.lucene --language bn --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5122 + R@100: 0.8934 + - split: dev + scores: + - nDCG@10: 0.5079 + R@100: 0.9088 + - name: bm25.en + eval_key: miracl-v1.0-en + command: python -m pyserini.search.lucene --language en --topics miracl-v1.0-en-${split} --index miracl-v1.0-en --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3415 + R@100: 0.7928 + - split: dev + scores: + - nDCG@10: 0.3506 + R@100: 0.8190 + - name: bm25.es + eval_key: miracl-v1.0-es + command: python -m pyserini.search.lucene --language es --topics miracl-v1.0-es-${split} --index miracl-v1.0-es --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3030 + R@100: 0.7020 + - split: dev + scores: + - nDCG@10: 0.3193 + R@100: 0.7018 + - name: bm25.fa + eval_key: miracl-v1.0-fa + command: python -m pyserini.search.lucene --language fa --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3270 + R@100: 0.7139 + - split: dev + scores: + - nDCG@10: 0.3334 + R@100: 0.7306 + - name: bm25.fi + eval_key: miracl-v1.0-fi + command: python -m pyserini.search.lucene --language fi --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5106 + R@100: 0.8471 + - split: dev + scores: + - nDCG@10: 0.5513 + R@100: 0.8910 + - name: bm25.fr + eval_key: miracl-v1.0-fr + command: python -m pyserini.search.lucene --language fr --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.2152 + R@100: 0.6601 + - split: dev + scores: + - nDCG@10: 0.1832 + R@100: 0.6528 + - name: bm25.hi + eval_key: miracl-v1.0-hi + command: python -m pyserini.search.lucene --language hi --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4745 + R@100: 0.9016 + - split: dev + scores: + - nDCG@10: 0.4578 + R@100: 0.8679 + - name: bm25.id + eval_key: miracl-v1.0-id + command: python -m pyserini.search.lucene --language id --topics miracl-v1.0-id-${split} --index miracl-v1.0-id --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4844 + R@100: 0.9234 + - split: dev + scores: + - nDCG@10: 0.4486 + R@100: 0.9041 + - name: bm25.ja + eval_key: miracl-v1.0-ja + command: python -m pyserini.search.lucene --language ja --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3796 + R@100: 0.8225 + - split: dev + scores: + - nDCG@10: 0.3689 + R@100: 0.8048 + - name: bm25.ko + eval_key: miracl-v1.0-ko + command: python -m pyserini.search.lucene --language ko --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 
0.4279 + R@100: 0.7572 + - split: dev + scores: + - nDCG@10: 0.4190 + R@100: 0.7831 + - name: bm25.ru + eval_key: miracl-v1.0-ru + command: python -m pyserini.search.lucene --language ru --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3153 + R@100: 0.6464 + - split: dev + scores: + - nDCG@10: 0.3342 + R@100: 0.6614 + - name: bm25.sw + eval_key: miracl-v1.0-sw + command: python -m pyserini.search.lucene --language sw --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3356 + R@100: 0.6499 + - split: dev + scores: + - nDCG@10: 0.3826 + R@100: 0.7008 + - name: bm25.te + eval_key: miracl-v1.0-te + command: python -m pyserini.search.lucene --language te --topics miracl-v1.0-te-${split} --index miracl-v1.0-te --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4814 + R@100: 0.8077 + - split: dev + scores: + - nDCG@10: 0.4942 + R@100: 0.8307 + - name: bm25.th + eval_key: miracl-v1.0-th + command: python -m pyserini.search.lucene --language th --topics miracl-v1.0-th-${split} --index miracl-v1.0-th --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4629 + R@100: 0.8768 + - split: dev + scores: + - nDCG@10: 0.4838 + R@100: 0.8874 + - name: bm25.zh + eval_key: miracl-v1.0-zh + command: python -m pyserini.search.lucene --language zh --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.2018 + R@100: 0.5541 + - split: dev + scores: + - nDCG@10: 0.1801 + R@100: 0.5599 + - name: bm25.de + eval_key: miracl-v1.0-de + command: python -m pyserini.search.lucene --language de --topics miracl-v1.0-de-${split} --index miracl-v1.0-de --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.2262 + R@100: 0.5724 + - name: bm25.yo + eval_key: miracl-v1.0-yo + command: python -m pyserini.search.lucene --pretokenized --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo --output $output --batch 128 --threads 16 --bm25 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.4059 + R@100: 0.7325 + + # mdpr-tied-pft-msmarco + - name: mdpr-tied-pft-msmarco.ar + eval_key: miracl-v1.0-ar + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4653 + R@100: 0.8293 + - split: dev + scores: + - nDCG@10: 0.4993 + R@100: 0.8407 + - name: mdpr-tied-pft-msmarco.bn + eval_key: miracl-v1.0-bn + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4362 + R@100: 0.8045 + - split: dev + scores: + - nDCG@10: 0.4427 + R@100: 0.8193 + - name: mdpr-tied-pft-msmarco.en + eval_key: miracl-v1.0-en + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-en-${split} --index 
miracl-v1.0-en-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3986 + R@100: 0.7779 + - split: dev + scores: + - nDCG@10: 0.3938 + R@100: 0.7675 + - name: mdpr-tied-pft-msmarco.es + eval_key: miracl-v1.0-es + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4637 + R@100: 0.8654 + - split: dev + scores: + - nDCG@10: 0.4777 + R@100: 0.8643 + - name: mdpr-tied-pft-msmarco.fa + eval_key: miracl-v1.0-fa + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4882 + R@100: 0.9092 + - split: dev + scores: + - nDCG@10: 0.4800 + R@100: 0.8980 + - name: mdpr-tied-pft-msmarco.fi + eval_key: miracl-v1.0-fi + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4426 + R@100: 0.7611 + - split: dev + scores: + - nDCG@10: 0.4721 + R@100: 0.7877 + - name: mdpr-tied-pft-msmarco.fr + eval_key: miracl-v1.0-fr + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4372 + R@100: 0.9268 + - split: dev + scores: + - nDCG@10: 0.4352 + R@100: 0.9154 + - name: mdpr-tied-pft-msmarco.hi + eval_key: miracl-v1.0-hi + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3685 + R@100: 0.7780 + - split: dev + scores: + - nDCG@10: 0.3830 + R@100: 0.7755 + - name: mdpr-tied-pft-msmarco.id + eval_key: miracl-v1.0-id + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.2549 + R@100: 0.5610 + - split: dev + scores: + - nDCG@10: 0.2719 + R@100: 0.5734 + - name: mdpr-tied-pft-msmarco.ja + eval_key: miracl-v1.0-ja + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4342 + R@100: 0.8211 + - split: dev + scores: + - nDCG@10: 0.4390 + R@100: 0.8254 + - name: mdpr-tied-pft-msmarco.ko + eval_key: miracl-v1.0-ko + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 
--hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4147 + R@100: 0.7699 + - split: dev + scores: + - nDCG@10: 0.4189 + R@100: 0.7369 + - name: mdpr-tied-pft-msmarco.ru + eval_key: miracl-v1.0-ru + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3812 + R@100: 0.7854 + - split: dev + scores: + - nDCG@10: 0.4073 + R@100: 0.7972 + - name: mdpr-tied-pft-msmarco.sw + eval_key: miracl-v1.0-sw + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.2973 + R@100: 0.5761 + - split: dev + scores: + - nDCG@10: 0.2990 + R@100: 0.6158 + - name: mdpr-tied-pft-msmarco.te + eval_key: miracl-v1.0-te + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3723 + R@100: 0.7698 + - split: dev + scores: + - nDCG@10: 0.3557 + R@100: 0.7619 + - name: mdpr-tied-pft-msmarco.th + eval_key: miracl-v1.0-th + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3451 + R@100: 0.6728 + - split: dev + scores: + - nDCG@10: 0.3578 + R@100: 0.6783 + - name: mdpr-tied-pft-msmarco.zh + eval_key: miracl-v1.0-zh + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5040 + R@100: 0.9355 + - split: dev + scores: + - nDCG@10: 0.5116 + R@100: 0.9436 + - name: mdpr-tied-pft-msmarco.de + eval_key: miracl-v1.0-de + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.4895 + R@100: 0.8983 + - name: mdpr-tied-pft-msmarco.yo + eval_key: miracl-v1.0-yo + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.4439 + R@100: 0.8403 + + # mdpr-tied-pft-msmarco-ft-all + - name: mdpr-tied-pft-msmarco-ft-all.ar + eval_key: miracl-v1.0-ar + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.6954 + R@100: 0.8542 + - split: dev + scores: + - nDCG@10: 0.5782 + R@100: 0.7953 + - 
name: mdpr-tied-pft-msmarco-ft-all.bn + eval_key: miracl-v1.0-bn + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.6823 + R@100: 0.8646 + - split: dev + scores: + - nDCG@10: 0.5804 + R@100: 0.8480 + - name: mdpr-tied-pft-msmarco-ft-all.en + eval_key: miracl-v1.0-en + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3491 + R@100: 0.5678 + - split: dev + scores: + - nDCG@10: 0.2813 + R@100: 0.5083 + - name: mdpr-tied-pft-msmarco-ft-all.es + eval_key: miracl-v1.0-es + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.2488 + R@100: 0.4799 + - split: dev + scores: + - nDCG@10: 0.2509 + R@100: 0.4706 + - name: mdpr-tied-pft-msmarco-ft-all.fa + eval_key: miracl-v1.0-fa + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3809 + R@100: 0.6899 + - split: dev + scores: + - nDCG@10: 0.3836 + R@100: 0.6863 + - name: mdpr-tied-pft-msmarco-ft-all.fi + eval_key: miracl-v1.0-fi + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.7738 + R@100: 0.9081 + - split: dev + scores: + - nDCG@10: 0.5694 + R@100: 0.7984 + - name: mdpr-tied-pft-msmarco-ft-all.fr + eval_key: miracl-v1.0-fr + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.2989 + R@100: 0.6197 + - split: dev + scores: + - nDCG@10: 0.3010 + R@100: 0.6005 + - name: mdpr-tied-pft-msmarco-ft-all.hi + eval_key: miracl-v1.0-hi + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3336 + R@100: 0.6388 + - split: dev + scores: + - nDCG@10: 0.3286 + R@100: 0.6371 + - name: mdpr-tied-pft-msmarco-ft-all.id + eval_key: miracl-v1.0-id + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 
0.3321 + R@100: 0.5492 + - split: dev + scores: + - nDCG@10: 0.3462 + R@100: 0.5841 + - name: mdpr-tied-pft-msmarco-ft-all.ja + eval_key: miracl-v1.0-ja + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.6378 + R@100: 0.7950 + - split: dev + scores: + - nDCG@10: 0.4999 + R@100: 0.7451 + - name: mdpr-tied-pft-msmarco-ft-all.ko + eval_key: miracl-v1.0-ko + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5795 + R@100: 0.7850 + - split: dev + scores: + - nDCG@10: 0.4864 + R@100: 0.7183 + - name: mdpr-tied-pft-msmarco-ft-all.ru + eval_key: miracl-v1.0-ru + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.6011 + R@100: 0.8188 + - split: dev + scores: + - nDCG@10: 0.3933 + R@100: 0.6707 + - name: mdpr-tied-pft-msmarco-ft-all.sw + eval_key: miracl-v1.0-sw + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.8882 + R@100: 0.9710 + - split: dev + scores: + - nDCG@10: 0.6575 + R@100: 0.8883 + - name: mdpr-tied-pft-msmarco-ft-all.te + eval_key: miracl-v1.0-te + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.8757 + R@100: 0.9725 + - split: dev + scores: + - nDCG@10: 0.7783 + R@100: 0.9513 + - name: mdpr-tied-pft-msmarco-ft-all.th + eval_key: miracl-v1.0-th + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.7761 + R@100: 0.9241 + - split: dev + scores: + - nDCG@10: 0.5975 + R@100: 0.8360 + - name: mdpr-tied-pft-msmarco-ft-all.zh + eval_key: miracl-v1.0-zh + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3446 + R@100: 0.6608 + - split: dev + scores: + - nDCG@10: 0.3575 + R@100: 0.6725 + - name: mdpr-tied-pft-msmarco-ft-all.de + eval_key: miracl-v1.0-de + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mdpr-tied-pft-msmarco-ft-all --output $output 
--batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.3219 + R@100: 0.5990 + - name: mdpr-tied-pft-msmarco-ft-all.yo + eval_key: miracl-v1.0-yo + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.5983 + R@100: 0.8908 + + - name: bm25-mdpr-tied-pft-msmarco-hybrid.ar + eval_key: miracl-v1.0-ar + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.6259 + R@100: 0.9173 + - split: dev + scores: + - nDCG@10: 0.6729 + R@100: 0.9405 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.bn + eval_key: miracl-v1.0-bn + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.6587 + R@100: 0.9297 + - split: dev + scores: + - nDCG@10: 0.6540 + R@100: 0.9321 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.en + eval_key: miracl-v1.0-en + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5347 + R@100: 0.8772 + - split: dev + scores: + - nDCG@10: 0.5488 + R@100: 0.8815 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.es + eval_key: miracl-v1.0-es + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.6234 + R@100: 0.9425 + - split: dev + scores: + - nDCG@10: 0.6413 + R@100: 0.9479 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.fa + eval_key: miracl-v1.0-fa + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5890 + R@100: 0.9433 + - split: dev + scores: + - nDCG@10: 0.5935 + R@100: 0.9374 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.fi + eval_key: miracl-v1.0-fi + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + scores: + - nDCG@10: 0.6164 + R@100: 0.8506 + - split: dev + scores: + - nDCG@10: 0.6716 + R@100: 0.8949 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.fr + eval_key: miracl-v1.0-fr + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5299 + R@100: 0.9709 + - split: dev + scores: + - nDCG@10: 0.5233 + R@100: 0.9647 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.hi + eval_key: miracl-v1.0-hi + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.6217 + R@100: 0.9059 + - split: dev + scores: + - nDCG@10: 0.6157 + R@100: 0.9115 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.id + eval_key: miracl-v1.0-id + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method 
interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4442 + R@100: 0.7595 + - split: dev + scores: + - nDCG@10: 0.4433 + R@100: 0.7683 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.ja + eval_key: miracl-v1.0-ja + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5795 + R@100: 0.9082 + - split: dev + scores: + - nDCG@10: 0.5757 + R@100: 0.9036 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.ko + eval_key: miracl-v1.0-ko + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5758 + R@100: 0.8744 + - split: dev + scores: + - nDCG@10: 0.6086 + R@100: 0.8997 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.ru + eval_key: miracl-v1.0-ru + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4921 + R@100: 0.8494 + - split: dev + scores: + - nDCG@10: 0.5323 + R@100: 0.8738 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.sw + eval_key: miracl-v1.0-sw + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4100 + R@100: 0.6987 + - split: dev + scores: + - nDCG@10: 0.4457 + R@100: 0.7254 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.te + eval_key: miracl-v1.0-te + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.6000 + R@100: 0.8717 + - split: dev + scores: + - nDCG@10: 0.6021 + R@100: 0.8569 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.th + eval_key: miracl-v1.0-th + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5669 + R@100: 0.8195 + - split: dev + scores: + - nDCG@10: 0.5990 + R@100: 0.8228 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.zh + eval_key: miracl-v1.0-zh + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5209 + R@100: 0.9576 + - split: dev + scores: + - nDCG@10: 0.5254 + R@100: 0.9587 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.de + eval_key: miracl-v1.0-de + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.5643 + R@100: 0.9482 + - name: bm25-mdpr-tied-pft-msmarco-hybrid.yo + eval_key: miracl-v1.0-yo + command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.6114 + R@100: 0.9496 + + # mdpr-tied-pft-msmarco-ft-miracl-ft-miracl + - name: mdpr-tied-pft-msmarco-ft-miracl.ar + eval_key: miracl-v1.0-ar + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ar --topics miracl-v1.0-ar-${split} 
--index miracl-v1.0-ar-mdpr-tied-pft-msmarco-ft-miracl-ar --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.7252 + R@100: 0.9489 + - name: mdpr-tied-pft-msmarco-ft-miracl.bn + eval_key: miracl-v1.0-bn + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-bn --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco-ft-miracl-bn --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.6842 + R@100: 0.9547 + - name: mdpr-tied-pft-msmarco-ft-miracl.en + eval_key: miracl-v1.0-en + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-en --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mdpr-tied-pft-msmarco-ft-miracl-en --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.4878 + R@100: 0.8341 + - name: mdpr-tied-pft-msmarco-ft-miracl.es + eval_key: miracl-v1.0-es + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-es --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco-ft-miracl-es --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.5648 + R@100: 0.9109 + - name: mdpr-tied-pft-msmarco-ft-miracl.fa + eval_key: miracl-v1.0-fa + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fa --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco-ft-miracl-fa --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.5934 + R@100: 0.9133 + - name: mdpr-tied-pft-msmarco-ft-miracl.fi + eval_key: miracl-v1.0-fi + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fi --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco-ft-miracl-fi --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.7139 + R@100: 0.9479 + - name: mdpr-tied-pft-msmarco-ft-miracl.fr + eval_key: miracl-v1.0-fr + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fr --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco-ft-miracl-fr --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.5893 + R@100: 0.9537 + - name: mdpr-tied-pft-msmarco-ft-miracl.hi + eval_key: miracl-v1.0-hi + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-hi --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco-ft-miracl-hi --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.5164 + R@100: 0.8862 + - name: mdpr-tied-pft-msmarco-ft-miracl.id + eval_key: miracl-v1.0-id + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-id --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco-ft-miracl-id --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.4959 + R@100: 0.8642 + - name: mdpr-tied-pft-msmarco-ft-miracl.ja + eval_key: miracl-v1.0-ja + command: python -m pyserini.search.faiss 
--encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ja --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco-ft-miracl-ja --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.6416 + R@100: 0.9225 + - name: mdpr-tied-pft-msmarco-ft-miracl.ko + eval_key: miracl-v1.0-ko + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ko --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco-ft-miracl-ko --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.5901 + R@100: 0.8857 + - name: mdpr-tied-pft-msmarco-ft-miracl.ru + eval_key: miracl-v1.0-ru + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ru --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco-ft-miracl-ru --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.5974 + R@100: 0.9099 + - name: mdpr-tied-pft-msmarco-ft-miracl.sw + eval_key: miracl-v1.0-sw + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-sw --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco-ft-miracl-sw --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.6853 + R@100: 0.9367 + - name: mdpr-tied-pft-msmarco-ft-miracl.te + eval_key: miracl-v1.0-te + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-te --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco-ft-miracl-te --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.8037 + R@100: 0.9616 + - name: mdpr-tied-pft-msmarco-ft-miracl.th + eval_key: miracl-v1.0-th + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-th --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco-ft-miracl-th --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.6951 + R@100: 0.9311 + - name: mdpr-tied-pft-msmarco-ft-miracl.zh + eval_key: miracl-v1.0-zh + command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-zh --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco-ft-miracl-zh --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.6500 + R@100: 0.9631 + + # mcontriever + - name: mcontriever-tied-pft-msmarco.ar + eval_key: miracl-v1.0-ar + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5027 + R@100: 0.9166 + - split: dev + scores: + - nDCG@10: 0.5248 + R@100: 0.9253 + - name: mcontriever-tied-pft-msmarco.bn + eval_key: miracl-v1.0-bn + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - 
nDCG@10: 0.5138 + R@100: 0.9313 + - split: dev + scores: + - nDCG@10: 0.5011 + R@100: 0.9205 + - name: mcontriever-tied-pft-msmarco.en + eval_key: miracl-v1.0-en + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3579 + R@100: 0.7990 + - split: dev + scores: + - nDCG@10: 0.3637 + R@100: 0.7967 + - name: mcontriever-tied-pft-msmarco.es + eval_key: miracl-v1.0-es + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4081 + R@100: 0.8339 + - split: dev + scores: + - nDCG@10: 0.4184 + R@100: 0.8411 + - name: mcontriever-tied-pft-msmarco.fa + eval_key: miracl-v1.0-fa + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.2263 + R@100: 0.6374 + - split: dev + scores: + - nDCG@10: 0.2152 + R@100: 0.6540 + - name: mcontriever-tied-pft-msmarco.fi + eval_key: miracl-v1.0-fi + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5680 + R@100: 0.9369 + - split: dev + scores: + - nDCG@10: 0.6019 + R@100: 0.9527 + - name: mcontriever-tied-pft-msmarco.fr + eval_key: miracl-v1.0-fr + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3332 + R@100: 0.8341 + - split: dev + scores: + - nDCG@10: 0.3140 + R@100: 0.8243 + - name: mcontriever-tied-pft-msmarco.hi + eval_key: miracl-v1.0-hi + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.2886 + R@100: 0.6664 + - split: dev + scores: + - nDCG@10: 0.2864 + R@100: 0.6461 + - name: mcontriever-tied-pft-msmarco.id + eval_key: miracl-v1.0-id + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3748 + R@100: 0.7955 + - split: dev + scores: + - nDCG@10: 0.3915 + R@100: 0.8015 + - name: mcontriever-tied-pft-msmarco.ja + eval_key: miracl-v1.0-ja + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train 
+ scores: + - nDCG@10: 0.4402 + R@100: 0.8813 + - split: dev + scores: + - nDCG@10: 0.4240 + R@100: 0.8783 + - name: mcontriever-tied-pft-msmarco.ko + eval_key: miracl-v1.0-ko + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4799 + R@100: 0.8672 + - split: dev + scores: + - nDCG@10: 0.4829 + R@100: 0.8753 + - name: mcontriever-tied-pft-msmarco.ru + eval_key: miracl-v1.0-ru + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.3811 + R@100: 0.8369 + - split: dev + scores: + - nDCG@10: 0.3913 + R@100: 0.8500 + - name: mcontriever-tied-pft-msmarco.sw + eval_key: miracl-v1.0-sw + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5568 + R@100: 0.9130 + - split: dev + scores: + - nDCG@10: 0.5600 + R@100: 0.9108 + - name: mcontriever-tied-pft-msmarco.te + eval_key: miracl-v1.0-te + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5260 + R@100: 0.9457 + - split: dev + scores: + - nDCG@10: 0.5283 + R@100: 0.9612 + - name: mcontriever-tied-pft-msmarco.th + eval_key: miracl-v1.0-th + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.5299 + R@100: 0.9361 + - split: dev + scores: + - nDCG@10: 0.5173 + R@100: 0.9361 + - name: mcontriever-tied-pft-msmarco.zh + eval_key: miracl-v1.0-zh + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: train + scores: + - nDCG@10: 0.4283 + R@100: 0.8745 + - split: dev + scores: + - nDCG@10: 0.4097 + R@100: 0.9026 + - name: mcontriever-tied-pft-msmarco.de + eval_key: miracl-v1.0-de + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.4079 + R@100: 0.8407 + - name: mcontriever-tied-pft-msmarco.yo + eval_key: miracl-v1.0-yo + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000 + splits: + - split: dev + scores: + - nDCG@10: 0.4150 + R@100: 0.7703 diff 
--git a/pyserini/2cr/miracl_html.template b/pyserini/2cr/miracl_html.template new file mode 100644 index 0000000000000000000000000000000000000000..2c16881851e9c6e87d475acac1309a227f8c139d --- /dev/null +++ b/pyserini/2cr/miracl_html.template @@ -0,0 +1,256 @@ + + + + + + + Pyserini Reproductions + + + + + + + + + + + + +
+
+
+
+

$title

+
+
+
+
+ + +
+ + $tables + + + +
+ +

Programmatic Execution

+ +

All experimental runs shown in the above table can be programmatically executed based on the instructions below. +To list all the experimental conditions:

+ +
+python -m pyserini.2cr.miracl --list-conditions +
+ +
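As a rough illustration, --list-conditions boils down to enumerating the condition entries in the packaged YAML. A minimal sketch in Python, assuming the MIRACL conditions shown in this diff ship as miracl.yaml inside the pyserini.2cr package (mirroring mrtydi.yaml later in this diff) and that pyserini is installed:

import pkg_resources
import yaml

# Locate the packaged YAML; the filename miracl.yaml is assumed here.
with open(pkg_resources.resource_filename('pyserini.2cr', 'miracl.yaml')) as f:
    data = yaml.safe_load(f)

# Condition names follow the '<model>.<language>' pattern,
# e.g., 'mcontriever-tied-pft-msmarco.ja'.
for condition in data['conditions']:
    print(condition['name'])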

Run all languages for a specific condition and show commands:

+ +
+python -m pyserini.2cr.miracl --condition bm25 --display-commands +
+ +

Run a particular language for a specific condition and show commands:

+ +
+python -m pyserini.2cr.miracl --condition bm25 --language ko --display-commands +
+ +

Run all languages for all conditions and show commands:

+ +
+python -m pyserini.2cr.miracl --all --display-commands +
+ +
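For a sense of what --display-commands prints, each condition's command field in the YAML is a string template with ${split} and $output placeholders. A minimal sketch of the expansion, using a hypothetical output filename; the substitution mirrors the string.Template usage in mrtydi.py later in this diff:

from string import Template

# Command template copied from the mcontriever-tied-pft-msmarco.ja condition in this diff.
cmd_template = ('python -m pyserini.search.faiss --encoder-class contriever '
                '--encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ja-${split} '
                '--index miracl-v1.0-ja-mcontriever-pft-msmarco --output $output '
                '--batch 128 --threads 16 --hits 1000')

# 'run.miracl.mcontriever-tied-pft-msmarco.ja.dev.txt' is a hypothetical run-file name.
cmd = Template(cmd_template).substitute(
    split='dev', output='run.miracl.mcontriever-tied-pft-msmarco.ja.dev.txt')
print(cmd)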

With the above commands, run files will be placed in the current directory. Use the option --directory runs to place the runs in a sub-directory.

+ +

For a specific condition, just show the commands and do not run:

+ +
+python -m pyserini.2cr.miracl --condition bm25 --display-commands --dry-run +
+ +

This will generate exactly the commands for a specific condition above (corresponding to a row in the table), without executing them.

+ +

For a specific condition and language, just show the commands and do not run:

+ +
+python -m pyserini.2cr.miracl --condition bm25 --language ko --display-commands --dry-run +
+ +

For all conditions, just show the commands, do not run them, and skip evaluation:

+ +
+python -m pyserini.2cr.miracl --all --display-commands --dry-run --skip-eval +
+ +
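The three flags interact in a simple way. A hedged sketch of the gating, with illustrative names, following the run_conditions logic in mrtydi.py later in this diff (the MIRACL driver presumably behaves the same): the command is printed if requested, executed only when the run file is missing and --dry-run is not set, and scored only when --skip-eval is not set.

import os

def process_split(cmd, runfile, display_commands, dry_run, skip_eval, evaluate):
    # evaluate is a callable that scores runfile (e.g., by invoking trec_eval).
    if display_commands:
        print(cmd)
    if not os.path.exists(runfile) and not dry_run:
        os.system(cmd)          # generate the run
    if skip_eval:
        return None             # keep the expected scores recorded in the YAML instead
    return evaluate(runfile)    # score the run that was produced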

Finally, to generate this page:

+ +
+python -m pyserini.2cr.miracl --generate-report --output docs/2cr/miracl.html +
+ +

The output file miracl.html should be identical to this page.

+ +
+ +
+ + + + + + + + + + diff --git a/pyserini/2cr/miracl_html_table.template b/pyserini/2cr/miracl_html_table.template new file mode 100644 index 0000000000000000000000000000000000000000..c008b40149c0fe1fd71d0d20e941374917824a64 --- /dev/null +++ b/pyserini/2cr/miracl_html_table.template @@ -0,0 +1,35 @@ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +$rows + + +
$desc ar bn en es fa fi fr hi id ja ko ru sw te th zh de yo avg
+
diff --git a/pyserini/2cr/miracl_html_table_row.template b/pyserini/2cr/miracl_html_table_row.template new file mode 100644 index 0000000000000000000000000000000000000000..24a78c846692b22c275bbfd9f5124dd7b2ae4c56 --- /dev/null +++ b/pyserini/2cr/miracl_html_table_row.template @@ -0,0 +1,336 @@ + + + +$model +$ar +$bn +$en +$es +$fa +$fi +$fr +$hi +$id +$ja +$ko +$ru +$sw +$te +$th +$zh +$de +$yo + +$avg + + + +
+ + + + + + +
+
+Command to generate run: + +
+
$cmd1
+
+Evaluation commands: + +
+
${eval_cmd1}
+
+ +
+
+Command to generate run: + +
+
$cmd2
+
+Evaluation commands: + +
+
${eval_cmd2}
+
+ +
+
+Command to generate run: + +
+
$cmd3
+
+Evaluation commands: + +
+
${eval_cmd3}
+
+ +
+
+Command to generate run: + +
+
$cmd4
+
+Evaluation commands: + +
+
${eval_cmd4}
+
+ +
+
+Command to generate run: + +
+
$cmd5
+
+Evaluation commands: + +
+
${eval_cmd5}
+
+ +
+
+Command to generate run: + +
+
$cmd6
+
+Evaluation commands: + +
+
${eval_cmd6}
+
+ +
+
+Command to generate run: + +
+
$cmd7
+
+Evaluation commands: + +
+
${eval_cmd7}
+
+ +
+
+Command to generate run: + +
+
$cmd8
+
+Evaluation commands: + +
+
${eval_cmd8}
+
+ +
+
+Command to generate run: + +
+
$cmd9
+
+Evaluation commands: + +
+
${eval_cmd9}
+
+ +
+
+Command to generate run: + +
+
$cmd10
+
+Evaluation commands: + +
+
${eval_cmd10}
+
+ +
+
+Command to generate run: + +
+
$cmd11
+
+Evaluation commands: + +
+
${eval_cmd11}
+
+ +
+
+Command to generate run: + +
+
$cmd12
+
+Evaluation commands: + +
+
${eval_cmd12}
+
+ +
+
+Command to generate run: + +
+
$cmd13
+
+Evaluation commands: + +
+
${eval_cmd13}
+
+ +
+
+ +Command to generate run: + +
+
$cmd14
+
+Evaluation commands: + +
+
${eval_cmd14}
+
+ +
+
+ +Command to generate run: + +
+
$cmd15
+
+Evaluation commands: + +
+
${eval_cmd15}
+
+ +
+ +
+Command to generate run: + +
+
$cmd16
+
+Evaluation commands: + +
+
${eval_cmd16}
+
+ +
+ +
+Command to generate run: + +
+
$cmd17
+
+Evaluation commands: + +
+
${eval_cmd17}
+
+ +
+ +
+Command to generate run: + +
+
$cmd18
+
+Evaluation commands: + +
+
${eval_cmd18}
+
+ +
+ +
+ + +
+ diff --git a/pyserini/2cr/mrtydi.py b/pyserini/2cr/mrtydi.py new file mode 100644 index 0000000000000000000000000000000000000000..9ef2c78ef589b394b4918a3401582ea7e864ee01 --- /dev/null +++ b/pyserini/2cr/mrtydi.py @@ -0,0 +1,330 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from collections import defaultdict +from string import Template + +import argparse +import math +import os +import pkg_resources +import sys +import time +import yaml + +from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str + +languages = [ + ['ar', 'arabic'], + ['bn', 'bengali'], + ['en', 'english'], + ['fi', 'finnish'], + ['id', 'indonesian'], + ['ja', 'japanese'], + ['ko', 'korean'], + ['ru', 'russian'], + ['sw', 'swahili'], + ['te', 'telugu'], + ['th', 'thai'] +] + +models = ['bm25', 'mdpr-split-pft-nq', 'mdpr-tied-pft-nq', 'mdpr-tied-pft-msmarco', 'mdpr-tied-pft-msmarco-ft-all'] + +html_display = { + 'bm25': 'BM25', + 'mdpr-split-pft-nq': 'mDPR (split encoders), pre-FT w/ NQ', + 'mdpr-tied-pft-nq': 'mDPR (tied encoders), pre-FT w/ NQ', + 'mdpr-tied-pft-msmarco': 'mDPR (tied encoders), pre-FT w/ MS MARCO', + 'mdpr-tied-pft-msmarco-ft-all': 'mDPR (tied encoders), pre-FT w/ MS MARCO, FT w/ all' +} + +trec_eval_metric_definitions = { + 'MRR@100': '-c -M 100 -m recip_rank', + 'R@100': '-c -m recall.100', +} + + +def format_run_command(raw): + return raw.replace('--lang', '\\\n --lang')\ + .replace('--encoder', '\\\n --encoder')\ + .replace('--topics', '\\\n --topics')\ + .replace('--index', '\\\n --index')\ + .replace('--output ', '\\\n --output ')\ + .replace('--batch ', '\\\n --batch ') \ + .replace('--threads 12', '--threads 12 \\\n ') + + +def format_eval_command(raw): + return raw.replace('-c ', '\\\n -c ')\ + .replace(raw.split()[-1], f'\\\n {raw.split()[-1]}') + + +def read_file(f): + fin = open(f, 'r') + text = fin.read() + fin.close() + + return text + + +def list_conditions(): + print('Conditions:\n-----------') + for condition in models: + print(condition) + print('\nLanguages\n---------') + for language in languages: + print(language[0]) + + +def print_results(table, metric, split): + print(f'Metric = {metric}, Split = {split}') + print(' ' * 32, end='') + for lang in languages: + print(f'{lang[0]:3} ', end='') + print('') + for model in models: + print(f'{model:30}', end='') + for lang in languages: + key = f'{model}.{lang[0]}' + print(f'{table[key][split][metric]:7.3f}', end='') + print('') + print('') + + +def generate_table_rows(table, row_template, commands, eval_commands, table_id, split, metric): + row_cnt = 1 + html_rows = [] + + for model in models: + s = Template(row_template) + + keys = {} + for lang in languages: + keys[lang[0]] = f'{model}.{lang[0]}' + + sum = table[keys["ar"]][split][metric] + \ + table[keys["bn"]][split][metric] + \ + table[keys["en"]][split][metric] + \ + table[keys["fi"]][split][metric] + \ + table[keys["id"]][split][metric] + \ + table[keys["ja"]][split][metric] + \ 
+ table[keys["ko"]][split][metric] + \ + table[keys["ru"]][split][metric] + \ + table[keys["sw"]][split][metric] + \ + table[keys["te"]][split][metric] + \ + table[keys["th"]][split][metric] + avg = sum / 11 + + s = s.substitute(table_cnt=table_id, + row_cnt=row_cnt, + model=html_display[model], + ar=f'{table[keys["ar"]][split][metric]:.3f}', + bn=f'{table[keys["bn"]][split][metric]:.3f}', + en=f'{table[keys["en"]][split][metric]:.3f}', + fi=f'{table[keys["fi"]][split][metric]:.3f}', + id=f'{table[keys["id"]][split][metric]:.3f}', + ja=f'{table[keys["ja"]][split][metric]:.3f}', + ko=f'{table[keys["ko"]][split][metric]:.3f}', + ru=f'{table[keys["ru"]][split][metric]:.3f}', + sw=f'{table[keys["sw"]][split][metric]:.3f}', + te=f'{table[keys["te"]][split][metric]:.3f}', + th=f'{table[keys["th"]][split][metric]:.3f}', + avg=f'{avg:.3f}', + cmd1=f'{commands[keys["ar"]]}', + cmd2=f'{commands[keys["bn"]]}', + cmd3=f'{commands[keys["en"]]}', + cmd4=f'{commands[keys["fi"]]}', + cmd5=f'{commands[keys["id"]]}', + cmd6=f'{commands[keys["ja"]]}', + cmd7=f'{commands[keys["ko"]]}', + cmd8=f'{commands[keys["ru"]]}', + cmd9=f'{commands[keys["sw"]]}', + cmd10=f'{commands[keys["te"]]}', + cmd11=f'{commands[keys["th"]]}', + eval_cmd1=f'{eval_commands[keys["ar"]][metric]}', + eval_cmd2=f'{eval_commands[keys["bn"]][metric]}', + eval_cmd3=f'{eval_commands[keys["en"]][metric]}', + eval_cmd4=f'{eval_commands[keys["fi"]][metric]}', + eval_cmd5=f'{eval_commands[keys["id"]][metric]}', + eval_cmd6=f'{eval_commands[keys["ja"]][metric]}', + eval_cmd7=f'{eval_commands[keys["ko"]][metric]}', + eval_cmd8=f'{eval_commands[keys["ru"]][metric]}', + eval_cmd9=f'{eval_commands[keys["sw"]][metric]}', + eval_cmd10=f'{eval_commands[keys["te"]][metric]}', + eval_cmd11=f'{eval_commands[keys["th"]][metric]}' + ) + + html_rows.append(s) + row_cnt += 1 + + return html_rows + + +def generate_report(args): + table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) + commands = defaultdict(lambda: '') + eval_commands = defaultdict(lambda: defaultdict(lambda: '')) + + html_template = read_file(pkg_resources.resource_filename(__name__, 'mrtydi_html.template')) + table_template = read_file(pkg_resources.resource_filename(__name__, 'mrtydi_html_table.template')) + row_template = read_file(pkg_resources.resource_filename(__name__, 'mrtydi_html_table_row.template')) + + with open(pkg_resources.resource_filename(__name__, 'mrtydi.yaml')) as f: + yaml_data = yaml.safe_load(f) + for condition in yaml_data['conditions']: + name = condition['name'] + eval_key = condition['eval_key'] + cmd_template = condition['command'] + + for splits in condition['splits']: + split = splits['split'] + + runfile = os.path.join(args.directory, f'run.mrtydi.{name}.{split}.txt') + cmd = Template(cmd_template).substitute(split=split, output=runfile) + commands[name] = format_run_command(cmd) + + for expected in splits['scores']: + for metric in expected: + table[name][split][metric] = expected[metric] + + eval_cmd = f'python -m pyserini.eval.trec_eval ' + \ + f'{trec_eval_metric_definitions[metric]} {eval_key}-{split} {runfile}' + eval_commands[name][metric] = format_eval_command(eval_cmd) + + tables_html = [] + + # Build the table for MRR@100, test queries + html_rows = generate_table_rows(table, row_template, commands, eval_commands, 1, 'test', 'MRR@100') + all_rows = '\n'.join(html_rows) + tables_html.append(Template(table_template).substitute(desc='MRR@100, test queries', rows=all_rows)) + + # Build the table for R@100, test queries + html_rows = 
generate_table_rows(table, row_template, commands, eval_commands, 2, 'test', 'R@100') + all_rows = '\n'.join(html_rows) + tables_html.append(Template(table_template).substitute(desc='Recall@100, test queries', rows=all_rows)) + + with open(args.output, 'w') as out: + out.write(Template(html_template).substitute(title='Mr.TyDi', tables=' '.join(tables_html))) + + +def run_conditions(args): + start = time.time() + + table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) + + with open(pkg_resources.resource_filename(__name__, 'mrtydi.yaml')) as f: + yaml_data = yaml.safe_load(f) + for condition in yaml_data['conditions']: + name = condition['name'] + encoder = name.split('.')[0] + lang = name.split('.')[-1] + if args.all: + pass + elif args.condition != encoder: + continue + elif args.language and args.language != lang: + continue + eval_key = condition['eval_key'] + cmd_template = condition['command'] + + print(f'condition {name}:') + + for splits in condition['splits']: + split = splits['split'] + + print(f' - split: {split}') + + runfile = os.path.join(args.directory, f'run.mrtydi.{name}.{split}.txt') + cmd = Template(cmd_template).substitute(split=split, output=runfile) + + if args.display_commands: + print(f'\n```bash\n{format_run_command(cmd)}\n```\n') + + if not os.path.exists(runfile): + if not args.dry_run: + os.system(cmd) + + for expected in splits['scores']: + for metric in expected: + if not args.skip_eval: + score = float(run_eval_and_return_metric(metric, f'{eval_key}-{split}', + trec_eval_metric_definitions[metric], runfile)) + if math.isclose(score, float(expected[metric])): + result_str = ok_str + # Flaky test: small difference on orca + elif name == 'mdpr-tied-pft-nq.te' and split == 'dev' \ + and math.isclose(score, float(expected[metric]), abs_tol=2e-4): + result_str = okish_str + # Flaky test: small difference on orca + elif name == 'mdpr-tied-pft-msmarco-ft-all.ko' and split == 'train' \ + and math.isclose(score, float(expected[metric]), abs_tol=4e-4): + result_str = okish_str + # Flaky test: small difference on Mac Studio (M1) + elif name == 'mdpr-tied-pft-msmarco.th' and split == 'train' \ + and math.isclose(score, float(expected[metric]), abs_tol=3e-4): + result_str = okish_str + else: + result_str = fail_str + f' expected {expected[metric]:.4f}' + print(f' {metric:7}: {score:.4f} {result_str}') + table[name][split][metric] = score + else: + table[name][split][metric] = expected[metric] + + print('') + + for metric in ['MRR@100', 'R@100']: + for split in ['test', 'dev', 'train']: + print_results(table, metric, split) + + end = time.time() + print(f'Total elapsed time: {end - start:.0f}s') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Generate regression matrix for Mr.TyDi.') + parser.add_argument('--condition', type=str, + help='Condition to run', required=False) + # To list all conditions + parser.add_argument('--list-conditions', action='store_true', default=False, help='List available conditions.') + # For generating reports + parser.add_argument('--generate-report', action='store_true', default=False, help='Generate report.') + parser.add_argument('--output', type=str, help='File to store report.', required=False) + # For actually running the experimental conditions + parser.add_argument('--all', action='store_true', default=False, help='Run using all languages.') + parser.add_argument('--language', type=str, help='Language to run.', required=False) + parser.add_argument('--directory', type=str, help='Base 
directory.', default='', required=False) + parser.add_argument('--dry-run', action='store_true', default=False, help='Print out commands but do not execute.') + parser.add_argument('--skip-eval', action='store_true', default=False, help='Skip running trec_eval.') + parser.add_argument('--display-commands', action='store_true', default=False, help='Display command.') + args = parser.parse_args() + + if args.list_conditions: + list_conditions() + sys.exit() + + if args.generate_report: + if not args.output: + print(f'Must specify report filename with --output.') + sys.exit() + + generate_report(args) + sys.exit() + + if args.all and (args.condition or args.language): + print('Specifying --all will run all conditions and languages') + sys.exit() + + run_conditions(args) diff --git a/pyserini/2cr/mrtydi.yaml b/pyserini/2cr/mrtydi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..333e9509e46c8265a0b72747ffa76742e49e3fdf --- /dev/null +++ b/pyserini/2cr/mrtydi.yaml @@ -0,0 +1,890 @@ +conditions: + # mDPR, tied encoders, pFT w/ MS MARCO, FT all + - name: mdpr-tied-pft-msmarco-ft-all.ar + eval_key: mrtydi-v1.1-arabic + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.9505 + R@100: 1.0000 + - split: dev + scores: + - MRR@100: 0.6913 + R@100: 0.9165 + - split: test + scores: + - MRR@100: 0.6949 + R@100: 0.9004 + - name: mdpr-tied-pft-msmarco-ft-all.bn + eval_key: mrtydi-v1.1-bengali + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.9620 + R@100: 1.0000 + - split: dev + scores: + - MRR@100: 0.5897 + R@100: 0.8977 + - split: test + scores: + - MRR@100: 0.6228 + R@100: 0.9550 + - name: mdpr-tied-pft-msmarco-ft-all.en + eval_key: mrtydi-v1.1-english + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.8278 + R@100: 1.0000 + - split: dev + scores: + - MRR@100: 0.5357 + R@100: 0.8884 + - split: test + scores: + - MRR@100: 0.4916 + R@100: 0.8414 + - name: mdpr-tied-pft-msmarco-ft-all.fi + eval_key: mrtydi-v1.1-finnish + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.9577 + R@100: 0.9997 + - split: dev + scores: + - MRR@100: 0.6626 + R@100: 0.9171 + - split: test + scores: + - MRR@100: 0.5595 + R@100: 0.8563 + - name: mdpr-tied-pft-msmarco-ft-all.id + eval_key: mrtydi-v1.1-indonesian + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-msmarco-ft-all --output 
$output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.9469 + R@100: 1.0000 + - split: dev + scores: + - MRR@100: 0.6294 + R@100: 0.9150 + - split: test + scores: + - MRR@100: 0.5783 + R@100: 0.8609 + - name: mdpr-tied-pft-msmarco-ft-all.ja + eval_key: mrtydi-v1.1-japanese + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.8802 + R@100: 0.9997 + - split: dev + scores: + - MRR@100: 0.5505 + R@100: 0.8696 + - split: test + scores: + - MRR@100: 0.5007 + R@100: 0.8130 + - name: mdpr-tied-pft-msmarco-ft-all.ko + eval_key: mrtydi-v1.1-korean + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.9195 + R@100: 1.0000 + - split: dev + scores: + - MRR@100: 0.5645 + R@100: 0.8663 + - split: test + scores: + - MRR@100: 0.4861 + R@100: 0.7854 + - name: mdpr-tied-pft-msmarco-ft-all.ru + eval_key: mrtydi-v1.1-russian + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.8473 + R@100: 0.9994 + - split: dev + scores: + - MRR@100: 0.5104 + R@100: 0.8720 + - split: test + scores: + - MRR@100: 0.5161 + R@100: 0.8432 + - name: mdpr-tied-pft-msmarco-ft-all.sw + eval_key: mrtydi-v1.1-swahili + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.9515 + R@100: 1.0000 + - split: dev + scores: + - MRR@100: 0.6404 + R@100: 0.9018 + - split: test + scores: + - MRR@100: 0.6438 + R@100: 0.8756 + - name: mdpr-tied-pft-msmarco-ft-all.te + eval_key: mrtydi-v1.1-telugu + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.9679 + R@100: 1.0000 + - split: dev + scores: + - MRR@100: 0.7962 + R@100: 0.9593 + - split: test + scores: + - MRR@100: 0.8908 + R@100: 0.9659 + - name: mdpr-tied-pft-msmarco-ft-all.th + eval_key: mrtydi-v1.1-thai + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.9504 + R@100: 1.0000 + - split: dev + scores: + - MRR@100: 0.6670 + R@100: 0.9114 + - split: test + scores: + - MRR@100: 0.6175 + R@100: 0.8826 + + # mDPR, tied encoders, pFT w/ MS MARCO + - name: mdpr-tied-pft-msmarco.ar + eval_key: mrtydi-v1.1-arabic + command: python -m 
pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-msmarco --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.3957 + R@100: 0.7818 + - split: dev + scores: + - MRR@100: 0.3978 + R@100: 0.7778 + - split: test + scores: + - MRR@100: 0.4414 + R@100: 0.7971 + - name: mdpr-tied-pft-msmarco.bn + eval_key: mrtydi-v1.1-bengali + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-msmarco --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2920 + R@100: 0.7323 + - split: dev + scores: + - MRR@100: 0.2993 + R@100: 0.7318 + - split: test + scores: + - MRR@100: 0.3969 + R@100: 0.7838 + - name: mdpr-tied-pft-msmarco.en + eval_key: mrtydi-v1.1-english + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-msmarco --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.3374 + R@100: 0.8111 + - split: dev + scores: + - MRR@100: 0.3451 + R@100: 0.7995 + - split: test + scores: + - MRR@100: 0.3270 + R@100: 0.7536 + - name: mdpr-tied-pft-msmarco.fi + eval_key: mrtydi-v1.1-finnish + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-msmarco --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.3668 + R@100: 0.7337 + - split: dev + scores: + - MRR@100: 0.3636 + R@100: 0.7371 + - split: test + scores: + - MRR@100: 0.2750 + R@100: 0.6471 + - name: mdpr-tied-pft-msmarco.id + eval_key: mrtydi-v1.1-indonesian + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-msmarco --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2794 + R@100: 0.7044 + - split: dev + scores: + - MRR@100: 0.2853 + R@100: 0.7198 + - split: test + scores: + - MRR@100: 0.3520 + R@100: 0.7356 + - name: mdpr-tied-pft-msmarco.ja + eval_key: mrtydi-v1.1-japanese + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-msmarco --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.3089 + R@100: 0.7603 + - split: dev + scores: + - MRR@100: 0.3108 + R@100: 0.7597 + - split: test + scores: + - MRR@100: 0.3107 + R@100: 0.7317 + - name: mdpr-tied-pft-msmarco.ko + eval_key: mrtydi-v1.1-korean + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-msmarco --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.3003 + R@100: 0.6907 + - split: dev + scores: + - MRR@100: 0.3017 + R@100: 0.7046 + - split: test + scores: + - MRR@100: 0.2820 + R@100: 0.6172 + - name: 
mdpr-tied-pft-msmarco.ru + eval_key: mrtydi-v1.1-russian + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-msmarco --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2856 + R@100: 0.7305 + - split: dev + scores: + - MRR@100: 0.2943 + R@100: 0.7404 + - split: test + scores: + - MRR@100: 0.3561 + R@100: 0.7432 + - name: mdpr-tied-pft-msmarco.sw + eval_key: mrtydi-v1.1-swahili + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-msmarco --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2491 + R@100: 0.5195 + - split: dev + scores: + - MRR@100: 0.2447 + R@100: 0.5266 + - split: test + scores: + - MRR@100: 0.3418 + R@100: 0.6343 + - name: mdpr-tied-pft-msmarco.te + eval_key: mrtydi-v1.1-telugu + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-msmarco --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.3059 + R@100: 0.7510 + - split: dev + scores: + - MRR@100: 0.2995 + R@100: 0.7355 + - split: test + scores: + - MRR@100: 0.3102 + R@100: 0.7817 + - name: mdpr-tied-pft-msmarco.th + eval_key: mrtydi-v1.1-thai + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-msmarco --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2334 + R@100: 0.5851 + - split: dev + scores: + - MRR@100: 0.2407 + R@100: 0.5795 + - split: test + scores: + - MRR@100: 0.2693 + R@100: 0.5945 + + # mDPR, tied encoders, pFT w/ NQ + - name: mdpr-tied-pft-nq.ar + eval_key: mrtydi-v1.1-arabic + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2087 + R@100: 0.5854 + - split: dev + scores: + - MRR@100: 0.2132 + R@100: 0.5868 + - split: test + scores: + - MRR@100: 0.2214 + R@100: 0.6001 + - name: mdpr-tied-pft-nq.bn + eval_key: mrtydi-v1.1-bengali + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2371 + R@100: 0.6281 + - split: dev + scores: + - MRR@100: 0.2414 + R@100: 0.6409 + - split: test + scores: + - MRR@100: 0.2535 + R@100: 0.7072 + - name: mdpr-tied-pft-nq.en + eval_key: mrtydi-v1.1-english + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2441 + R@100: 0.7217 + - split: dev + scores: + - MRR@100: 0.2359 + R@100: 0.7187 + - split: test + scores: + - MRR@100: 
0.2433 + R@100: 0.6893 + - name: mdpr-tied-pft-nq.fi + eval_key: mrtydi-v1.1-finnish + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2996 + R@100: 0.6787 + - split: dev + scores: + - MRR@100: 0.3252 + R@100: 0.7037 + - split: test + scores: + - MRR@100: 0.2444 + R@100: 0.6401 + - name: mdpr-tied-pft-nq.id + eval_key: mrtydi-v1.1-indonesian + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2706 + R@100: 0.7322 + - split: dev + scores: + - MRR@100: 0.2719 + R@100: 0.7394 + - split: test + scores: + - MRR@100: 0.2815 + R@100: 0.6914 + - name: mdpr-tied-pft-nq.ja + eval_key: mrtydi-v1.1-japanese + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2165 + R@100: 0.6043 + - split: dev + scores: + - MRR@100: 0.2299 + R@100: 0.6239 + - split: test + scores: + - MRR@100: 0.2058 + R@100: 0.5734 + - name: mdpr-tied-pft-nq.ko + eval_key: mrtydi-v1.1-korean + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2527 + R@100: 0.6556 + - split: dev + scores: + - MRR@100: 0.2680 + R@100: 0.6271 + - split: test + scores: + - MRR@100: 0.2234 + R@100: 0.5499 + - name: mdpr-tied-pft-nq.ru + eval_key: mrtydi-v1.1-russian + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2160 + R@100: 0.6262 + - split: dev + scores: + - MRR@100: 0.2263 + R@100: 0.6444 + - split: test + scores: + - MRR@100: 0.2501 + R@100: 0.6181 + - name: mdpr-tied-pft-nq.sw + eval_key: mrtydi-v1.1-swahili + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2383 + R@100: 0.5707 + - split: dev + scores: + - MRR@100: 0.2543 + R@100: 0.6138 + - split: test + scores: + - MRR@100: 0.2621 + R@100: 0.5965 + - name: mdpr-tied-pft-nq.te + eval_key: mrtydi-v1.1-telugu + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.1483 + R@100: 0.4162 + - split: dev + scores: + - MRR@100: 0.1494 + R@100: 0.3967 + - split: test + scores: + - MRR@100: 0.0970 + R@100: 0.2454 + - name: 
mdpr-tied-pft-nq.th + eval_key: mrtydi-v1.1-thai + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.1426 + R@100: 0.4717 + - split: dev + scores: + - MRR@100: 0.1618 + R@100: 0.4637 + - split: test + scores: + - MRR@100: 0.1575 + R@100: 0.4550 + + # mDPR, split encoders, pFT w/ NQ + - name: mdpr-split-pft-nq.ar + eval_key: mrtydi-v1.1-arabic + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2510 + R@100: 0.6384 + - split: dev + scores: + - MRR@100: 0.2449 + R@100: 0.6334 + - split: test + scores: + - MRR@100: 0.2907 + R@100: 0.6502 + - name: mdpr-split-pft-nq.bn + eval_key: mrtydi-v1.1-bengali + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2293 + R@100: 0.6454 + - split: dev + scores: + - MRR@100: 0.2367 + R@100: 0.6511 + - split: test + scores: + - MRR@100: 0.2911 + R@100: 0.7793 + - name: mdpr-split-pft-nq.en + eval_key: mrtydi-v1.1-english + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2862 + R@100: 0.7372 + - split: dev + scores: + - MRR@100: 0.2821 + R@100: 0.7437 + - split: test + scores: + - MRR@100: 0.2907 + R@100: 0.6779 + - name: mdpr-split-pft-nq.fi + eval_key: mrtydi-v1.1-finnish + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2473 + R@100: 0.6289 + - split: dev + scores: + - MRR@100: 0.2466 + R@100: 0.6283 + - split: test + scores: + - MRR@100: 0.2050 + R@100: 0.5680 + - name: mdpr-split-pft-nq.id + eval_key: mrtydi-v1.1-indonesian + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2351 + R@100: 0.6952 + - split: dev + scores: + - MRR@100: 0.2475 + R@100: 0.7181 + - split: test + scores: + - MRR@100: 0.2705 + R@100: 0.6848 + - name: mdpr-split-pft-nq.ja + eval_key: mrtydi-v1.1-japanese + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.1967 + R@100: 0.5983 + - split: dev + scores: + - MRR@100: 0.2055 + R@100: 0.6142 + - split: test + scores: + - MRR@100: 0.2119 + R@100: 0.5840 + - name: mdpr-split-pft-nq.ko + eval_key: mrtydi-v1.1-korean + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics 
mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2383 + R@100: 0.6180 + - split: dev + scores: + - MRR@100: 0.2343 + R@100: 0.6238 + - split: test + scores: + - MRR@100: 0.2345 + R@100: 0.5325 + - name: mdpr-split-pft-nq.ru + eval_key: mrtydi-v1.1-russian + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2317 + R@100: 0.6534 + - split: dev + scores: + - MRR@100: 0.2490 + R@100: 0.6553 + - split: test + scores: + - MRR@100: 0.2820 + R@100: 0.6474 + - name: mdpr-split-pft-nq.sw + eval_key: mrtydi-v1.1-swahili + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.1457 + R@100: 0.4481 + - split: dev + scores: + - MRR@100: 0.1547 + R@100: 0.4724 + - split: test + scores: + - MRR@100: 0.1883 + R@100: 0.5281 + - name: mdpr-split-pft-nq.te + eval_key: mrtydi-v1.1-telugu + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.1489 + R@100: 0.4905 + - split: dev + scores: + - MRR@100: 0.1503 + R@100: 0.4934 + - split: test + scores: + - MRR@100: 0.1099 + R@100: 0.3661 + - name: mdpr-split-pft-nq.th + eval_key: mrtydi-v1.1-thai + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-nq --output $output --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.1603 + R@100: 0.4983 + - split: dev + scores: + - MRR@100: 0.1584 + R@100: 0.5083 + - split: test + scores: + - MRR@100: 0.1709 + R@100: 0.5146 + + # BM25 + - name: bm25.ar + eval_key: mrtydi-v1.1-arabic + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ar --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic --output $output --bm25 --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.3356 + R@100: 0.7944 + - split: dev + scores: + - MRR@100: 0.3462 + R@100: 0.7872 + - split: test + scores: + - MRR@100: 0.3682 + R@100: 0.7928 + - name: bm25.bn + eval_key: mrtydi-v1.1-bengali + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language bn --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali --output $output --bm25 --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.3566 + - R@100: 0.8336 + - split: dev + scores: + - MRR@100: 0.3385 + - R@100: 0.8432 + - split: test + scores: + - MRR@100: 0.4182 + - R@100: 0.8694 + - name: bm25.en + eval_key: mrtydi-v1.1-english + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language en --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english --output $output --bm25 --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.1592 + - R@100: 0.5785 + - split: dev + scores: + - MRR@100: 0.1685 + - R@100: 0.6196 + - split: test + scores: + - MRR@100: 0.1404 + - R@100: 0.5365 + - name: bm25.fi + eval_key: mrtydi-v1.1-finnish + command: python -m 
pyserini.search.lucene --threads 16 --batch-size 128 --language fi --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish --output $output --bm25 --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.4101 + - R@100: 0.8198 + - split: dev + scores: + - MRR@100: 0.4136 + - R@100: 0.8285 + - split: test + scores: + - MRR@100: 0.2836 + - R@100: 0.7196 + - name: bm25.id + eval_key: mrtydi-v1.1-indonesian + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language id --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian --output $output --bm25 --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2972 + - R@100: 0.7948 + - split: dev + scores: + - MRR@100: 0.2937 + - R@100: 0.7827 + - split: test + scores: + - MRR@100: 0.3762 + - R@100: 0.8426 + - name: bm25.ja + eval_key: mrtydi-v1.1-japanese + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ja --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese --output $output --bm25 --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2262 + - R@100: 0.7290 + - split: dev + scores: + - MRR@100: 0.2250 + - R@100: 0.7252 + - split: test + scores: + - MRR@100: 0.2125 + - R@100: 0.6431 + - name: bm25.ko + eval_key: mrtydi-v1.1-korean + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ko --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean --output $output --bm25 --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2596 + - R@100: 0.6178 + - split: dev + scores: + - MRR@100: 0.2888 + - R@100: 0.6733 + - split: test + scores: + - MRR@100: 0.2848 + - R@100: 0.6188 + - name: bm25.ru + eval_key: mrtydi-v1.1-russian + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ru --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian --output $output --bm25 --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2229 + - R@100: 0.5779 + - split: dev + scores: + - MRR@100: 0.2202 + - R@100: 0.5760 + - split: test + scores: + - MRR@100: 0.3163 + - R@100: 0.6541 + - name: bm25.sw + eval_key: mrtydi-v1.1-swahili + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language sw --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili --output $output --bm25 --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.2610 + - R@100: 0.5903 + - split: dev + scores: + - MRR@100: 0.2693 + - R@100: 0.5789 + - split: test + scores: + - MRR@100: 0.3893 + - R@100: 0.7642 + - name: bm25.te + eval_key: mrtydi-v1.1-telugu + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language te --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu --output $output --bm25 --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.4204 + - R@100: 0.8229 + - split: dev + scores: + - MRR@100: 0.4269 + - R@100: 0.8362 + - split: test + scores: + - MRR@100: 0.5283 + - R@100: 0.8971 + - name: bm25.th + eval_key: mrtydi-v1.1-thai + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language th --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai --output $output --bm25 --hits 100 + splits: + - split: train + scores: + - MRR@100: 0.3543 + - R@100: 0.8349 + - split: dev + scores: + - MRR@100: 0.3586 + - R@100: 0.8536 + - split: test + scores: + - MRR@100: 0.4012 + - R@100: 0.8529 diff --git a/pyserini/2cr/mrtydi_html.template b/pyserini/2cr/mrtydi_html.template new file mode 
100644 index 0000000000000000000000000000000000000000..21ec6fcd0d79b631e56f418f78ba0bf9dd0feb3e --- /dev/null +++ b/pyserini/2cr/mrtydi_html.template @@ -0,0 +1,256 @@ + + + + + + + Pyserini Reproductions + + + + + + + + + + + + +
+
+
+
+

$title

+
+
+
+
+ + +
+ + $tables + + + +
+ +

Programmatic Execution

+ +

All experimental runs shown in the above table can be programmatically executed based on the instructions below. + To list all the experimental conditions:

+ +
+ python -m pyserini.2cr.mrtydi --list-conditions +
+ +

Run all languages for a specific condition and show commands:

+ +
+ python -m pyserini.2cr.mrtydi --condition bm25 --display-commands +
+ +

Run a particular language for a specific condition and show commands:

+ +
+ python -m pyserini.2cr.mrtydi --condition bm25 --language ko --display-commands +
+ +

Run all languages for all conditions and show commands:

+ +
+ python -m pyserini.2cr.mrtydi --all --display-commands +
+ +

With the above commands, run files will be placed in the current directory. Use the option --directory runs to place the runs in a sub-directory.

+ +
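Concretely, following run_conditions in mrtydi.py from this diff, the value of --directory is simply joined onto a fixed run-file naming scheme:

import os

directory = 'runs'               # as passed via --directory runs; the default is ''
name, split = 'bm25.ko', 'test'  # condition name and split from mrtydi.yaml

runfile = os.path.join(directory, f'run.mrtydi.{name}.{split}.txt')
print(runfile)                   # runs/run.mrtydi.bm25.ko.test.txt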

For a specific condition, just show the commands and do not run:

+ +
+ python -m pyserini.2cr.mrtydi --condition bm25 --display-commands --dry-run +
+ +

This will generate exactly the commands for a specific condition above (corresponding to a row in the table), without executing them.

+ +

For a specific condition and language, just show the commands and do not run:

+ +
+ python -m pyserini.2cr.mrtydi --condition bm25 --language ko --display-commands --dry-run +
+ +

For all conditions, just show the commands, do not run them, and skip evaluation:

+ +
+ python -m pyserini.2cr.mrtydi --all --display-commands --dry-run --skip-eval +
+ +
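When evaluation is not skipped, each run is scored with trec_eval and checked against the expected value recorded in mrtydi.yaml. A sketch of that step using the helper imported in mrtydi.py; the run-file path is the hypothetical example from above, and importlib is used because 2cr is not a valid identifier in an import statement:

import importlib
import math

_base = importlib.import_module('pyserini.2cr._base')

runfile = 'runs/run.mrtydi.bm25.ko.test.txt'  # hypothetical path, see above
defs = '-c -M 100 -m recip_rank'              # MRR@100 definition in mrtydi.py
score = float(_base.run_eval_and_return_metric(
    'MRR@100', 'mrtydi-v1.1-korean-test', defs, runfile))

expected = 0.2848                             # bm25.ko, test split, MRR@100 in mrtydi.yaml
print('[OK]' if math.isclose(score, expected) else f'[FAIL] expected {expected:.4f}')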

Finally, to generate this page:

+ +
+ python -m pyserini.2cr.mrtydi --generate-report --output docs/2cr/mrtydi.html +
+ +

The output file mrtydi.html should be identical to this page.

+ +
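Under the hood, report generation is ordinary string templating. A compressed sketch of what --generate-report stitches together, following generate_report in mrtydi.py from this diff, with the per-model rows elided:

import pkg_resources
from string import Template

def read_file(path):
    with open(path) as f:
        return f.read()

page_tmpl = read_file(pkg_resources.resource_filename('pyserini.2cr', 'mrtydi_html.template'))
table_tmpl = read_file(pkg_resources.resource_filename('pyserini.2cr', 'mrtydi_html_table.template'))

# Each table fills $desc and $rows; rows normally come from mrtydi_html_table_row.template.
table_html = Template(table_tmpl).substitute(desc='MRR@100, test queries',
                                             rows='<!-- one row per model, elided -->')

# The page fills $title and $tables.
with open('docs/2cr/mrtydi.html', 'w') as out:
    out.write(Template(page_tmpl).substitute(title='Mr.TyDi', tables=table_html))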
+ +
+ + + + + + + + + + diff --git a/pyserini/2cr/mrtydi_html_table.template b/pyserini/2cr/mrtydi_html_table.template new file mode 100644 index 0000000000000000000000000000000000000000..f61a3d2102360cfa860c75ecd91f0dd13d4fe049 --- /dev/null +++ b/pyserini/2cr/mrtydi_html_table.template @@ -0,0 +1,28 @@ +
+ + + + + + + + + + + + + + + + + + + + + + +$rows + + +
$desc ar bn en fi id ja ko ru sw te th avg
+
diff --git a/pyserini/2cr/mrtydi_html_table_row.template b/pyserini/2cr/mrtydi_html_table_row.template new file mode 100644 index 0000000000000000000000000000000000000000..913cbc5b7cfc6a38cd92f14321e12ca6b48b2aa1 --- /dev/null +++ b/pyserini/2cr/mrtydi_html_table_row.template @@ -0,0 +1,212 @@ + + + +$model +$ar +$bn +$en +$fi +$id +$ja +$ko +$ru +$sw +$te +$th + +$avg + + + + + +
+ + + + + + +
+
+Command to generate run: + +
+
$cmd1
+
+Evaluation commands: + +
+
${eval_cmd1}
+
+ +
+
+Command to generate run: + +
+
$cmd2
+
+Evaluation commands: + +
+
${eval_cmd2}
+
+ +
+
+Command to generate run: + +
+
$cmd3
+
+Evaluation commands: + +
+
${eval_cmd3}
+
+ +
+
+Command to generate run: + +
+
$cmd4
+
+Evaluation commands: + +
+
${eval_cmd4}
+
+ +
+
+Command to generate run: + +
+
$cmd5
+
+Evaluation commands: + +
+
${eval_cmd5}
+
+ +
+
+Command to generate run: + +
+
$cmd6
+
+Evaluation commands: + +
+
${eval_cmd6}
+
+ +
+
+Command to generate run: + +
+
$cmd7
+
+Evaluation commands: + +
+
${eval_cmd7}
+
+ +
+
+Command to generate run: + +
+
$cmd8
+
+Evaluation commands: + +
+
${eval_cmd8}
+
+ +
+
+Command to generate run: + +
+
$cmd9
+
+Evaluation commands: + +
+
${eval_cmd9}
+
+ +
+
+Command to generate run: + +
+
$cmd10
+
+Evaluation commands: + +
+
${eval_cmd10}
+
+ +
+
+Command to generate run: + +
+
$cmd11
+
+Evaluation commands: + +
+
${eval_cmd11}
+
+ +
+
+ + +
+ \ No newline at end of file diff --git a/pyserini/2cr/msmarco-v1-doc.yaml b/pyserini/2cr/msmarco-v1-doc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca015d38c50fb0c0bec6f5f42aa2884ef3a59081 --- /dev/null +++ b/pyserini/2cr/msmarco-v1-doc.yaml @@ -0,0 +1,539 @@ +conditions: + - name: bm25-doc-tuned + display: BM25 doc (k1=4.46, b=0.82) + display-html: BM25 doc (k1=4.46, b=0.82) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2767 + R@1K: 0.9357 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2336 + nDCG@10: 0.5233 + R@1K: 0.6757 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3581 + nDCG@10: 0.5061 + R@1K: 0.7776 + - name: bm25-doc-default + display: BM25 doc (k1=0.9, b=0.4) + display-html: BM25 doc (k1=0.9, b=0.4) + display-row: "[1] — (1a)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2299 + R@1K: 0.8856 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2434 + nDCG@10: 0.5176 + R@1K: 0.6966 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3793 + nDCG@10: 0.5286 + R@1K: 0.8085 + - name: bm25-doc-segmented-tuned + display: BM25 doc segmented (k1=2.16, b=0.61) + display-html: BM25 doc segmented (k1=2.16, b=0.61) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2756 + R@1K: 0.9311 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2398 + nDCG@10: 0.5389 + R@1K: 0.6565 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3458 + nDCG@10: 0.5213 + R@1K: 0.7725 + - name: bm25-doc-segmented-default + display: BM25 doc segmented (k1=0.9, b=0.4) + display-html: BM25 doc segmented (k1=0.9, b=0.4) + display-row: "[1] — (1b)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2684 + R@1K: 0.9178 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2449 + nDCG@10: 0.5302 + R@1K: 0.6871 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3586 + nDCG@10: 0.5281 + R@1K: 0.7755 + - name: bm25-rm3-doc-tuned + display: BM25+RM3 doc (k1=4.46, b=0.82) + display-html: BM25+RM3 doc (k1=4.46, b=0.82) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rm3 + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2227 + R@1K: 0.9303 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2638 + nDCG@10: 0.5526 + R@1K: 0.7188 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3610 + nDCG@10: 0.5195 + R@1K: 0.8180 + - name: bm25-rm3-doc-default + display: BM25+RM3 doc (k1=0.9, b=0.4) + display-html: BM25+RM3 doc (k1=0.9, b=0.4) + display-row: "[1] — (1c)" + command: 
python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4 + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.1618 + R@1K: 0.8783 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2773 + nDCG@10: 0.5174 + R@1K: 0.7507 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.4015 + nDCG@10: 0.5254 + R@1K: 0.8259 + - name: bm25-rm3-doc-segmented-tuned + display: BM25+RM3 doc segmented (k1=2.16, b=0.61) + display-html: BM25+RM3 doc segmented (k1=2.16, b=0.61) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2448 + R@1K: 0.9359 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2655 + nDCG@10: 0.5392 + R@1K: 0.7037 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3471 + nDCG@10: 0.5030 + R@1K: 0.8056 + - name: bm25-rm3-doc-segmented-default + display: BM25+RM3 doc segmented (k1=0.9, b=0.4) + display-html: BM25+RM3 doc segmented (k1=0.9, b=0.4) + display-row: "[1] — (1d)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2413 + R@1K: 0.9351 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2892 + nDCG@10: 0.5684 + R@1K: 0.7368 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3792 + nDCG@10: 0.5202 + R@1K: 0.8023 + - name: bm25-rocchio-doc-tuned + display: BM25+Rocchio doc (k1=4.46, b=0.82) + display-html: BM25+Rocchio doc (k1=4.46, b=0.82) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rocchio + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2242 + R@1K: 0.9314 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2657 + nDCG@10: 0.5584 + R@1K: 0.7299 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3628 + nDCG@10: 0.5199 + R@1K: 0.8217 + - name: bm25-rocchio-doc-default + display: BM25+Rocchio doc (k1=0.9, b=0.4) + display-html: BM25+Rocchio doc (k1=0.9, b=0.4) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4 + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.1624 + R@1K: 0.8789 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2811 + nDCG@10: 0.5256 + R@1K: 0.7546 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.4089 + nDCG@10: 0.5192 + R@1K: 0.8273 + - name: bm25-rocchio-doc-segmented-tuned + display: BM25+Rocchio doc segmented (k1=2.16, b=0.61) + display-html: BM25+Rocchio doc segmented (k1=2.16, b=0.61) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rocchio --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2475 + R@1K: 0.9395 + - topic_key: 
dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2672 + nDCG@10: 0.5421 + R@1K: 0.7115 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3521 + nDCG@10: 0.4997 + R@1K: 0.8042 + - name: bm25-rocchio-doc-segmented-default + display: BM25+Rocchio doc segmented (k1=0.9, b=0.4) + display-html: BM25+Rocchio doc segmented (k1=0.9, b=0.4) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2447 + R@1K: 0.9351 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2889 + nDCG@10: 0.5570 + R@1K: 0.7423 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3830 + nDCG@10: 0.5226 + R@1K: 0.8102 + - name: bm25-d2q-t5-doc-tuned + display: BM25 w/ doc2query-T5 doc (k1=4.68, b=0.87) + display-html: BM25 w/ doc2query-T5 doc (k1=4.68, b=0.87) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5 --topics $topics --output $output --bm25 + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.3269 + R@1K: 0.9553 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2620 + nDCG@10: 0.5972 + R@1K: 0.6867 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.4099 + nDCG@10: 0.5852 + R@1K: 0.8105 + - name: bm25-d2q-t5-doc-default + display: BM25 w/ doc2query-T5 doc (k1=0.9, b=0.4) + display-html: BM25 w/ doc2query-T5 doc (k1=0.9, b=0.4) + display-row: "[1] — (2a)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2880 + R@1K: 0.9259 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2700 + nDCG@10: 0.5968 + R@1K: 0.7190 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.4230 + nDCG@10: 0.5885 + R@1K: 0.8403 + - name: bm25-d2q-t5-doc-segmented-tuned + display: BM25 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59) + display-html: BM25 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.3209 + R@1K: 0.9530 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2658 + nDCG@10: 0.6273 + R@1K: 0.6707 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.4047 + nDCG@10: 0.5943 + R@1K: 0.7968 + - name: bm25-d2q-t5-doc-segmented-default + display: BM25 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4) + display-html: BM25 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4) + display-row: "[1] — (2b)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.3179 + R@1K: 0.9490 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2798 + nDCG@10: 0.6119 + R@1K: 0.7165 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.4150 + 
nDCG@10: 0.5957 + R@1K: 0.8046 + - name: bm25-rm3-d2q-t5-doc-tuned + display: BM25+RM3 w/ doc2query-T5 doc (k1=4.68, b=0.87) + display-html: BM25+RM3 w/ doc2query-T5 doc (k1=4.68, b=0.87) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2623 + R@1K: 0.9522 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2813 + nDCG@10: 0.6091 + R@1K: 0.7184 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.4100 + nDCG@10: 0.5745 + R@1K: 0.8238 + - name: bm25-rm3-d2q-t5-doc-default + display: BM25+RM3 w/ doc2query-T5 doc (k1=0.9, b=0.4) + display-html: BM25+RM3 w/ doc2query-T5 doc (k1=0.9, b=0.4) + display-row: "[1] — (2c)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4 + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.1834 + R@1K: 0.9126 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.3045 + nDCG@10: 0.5904 + R@1K: 0.7737 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.4230 + nDCG@10: 0.5427 + R@1K: 0.8631 + - name: bm25-rm3-d2q-t5-doc-segmented-tuned + display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59) + display-html: BM25+RM3 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2973 + R@1K: 0.9563 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2892 + nDCG@10: 0.6247 + R@1K: 0.7069 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.4016 + nDCG@10: 0.5711 + R@1K: 0.8156 + - name: bm25-rm3-d2q-t5-doc-segmented-default + display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4) + display-html: BM25+RM3 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4) + display-row: "[1] — (2d)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.2803 + R@1K: 0.9551 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.3030 + nDCG@10: 0.6290 + R@1K: 0.7483 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.4271 + nDCG@10: 0.5851 + R@1K: 0.8266 + - name: unicoil-noexp-pytorch + display: "uniCOIL (noexp): query inference with PyTorch" + display-html: "uniCOIL (noexp): query inference with PyTorch" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil-noexp --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.3410 + R@1K: 0.9420 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2661 + nDCG@10: 0.6347 + R@1K: 0.6385 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3698 + 
nDCG@10: 0.5906 + R@1K: 0.7621 + - name: unicoil-noexp + display: "uniCOIL (noexp): pre-encoded" + display-html: "uniCOIL (noexp): pre-encoded queries" + display-row: "[1] — (3a)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil-noexp --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev-unicoil-noexp + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.3409 + R@1K: 0.9420 + - topic_key: dl19-doc-unicoil-noexp + eval_key: dl19-doc + scores: + - MAP: 0.2665 + nDCG@10: 0.6349 + R@1K: 0.6391 + - topic_key: dl20-unicoil-noexp + eval_key: dl20-doc + scores: + - MAP: 0.3698 + nDCG@10: 0.5893 + R@1K: 0.7623 + - name: unicoil-pytorch + display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch" + display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.3532 + R@1K: 0.9546 + - topic_key: dl19-doc + eval_key: dl19-doc + scores: + - MAP: 0.2789 + nDCG@10: 0.6396 + R@1K: 0.6654 + - topic_key: dl20 + eval_key: dl20-doc + scores: + - MAP: 0.3881 + nDCG@10: 0.6030 + R@1K: 0.7866 + - name: unicoil + display: "uniCOIL (w/ doc2query-T5): pre-encoded" + display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries" + display-row: "[1] — (3b)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-doc-dev-unicoil + eval_key: msmarco-doc-dev + scores: + - MRR@10: 0.3531 + R@1K: 0.9546 + - topic_key: dl19-doc-unicoil + eval_key: dl19-doc + scores: + - MAP: 0.2789 + nDCG@10: 0.6396 + R@1K: 0.6652 + - topic_key: dl20-unicoil + eval_key: dl20-doc + scores: + - MAP: 0.3882 + nDCG@10: 0.6033 + R@1K: 0.7869 diff --git a/pyserini/2cr/msmarco-v1-passage.yaml b/pyserini/2cr/msmarco-v1-passage.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ef3932cb394c7af153b6cafb75f870e97887ca64 --- /dev/null +++ b/pyserini/2cr/msmarco-v1-passage.yaml @@ -0,0 +1,764 @@ +conditions: + - name: bm25-rocchio-d2q-t5-tuned + display: BM25+Rocchio w/ doc2query-T5 (k1=2.18, b=0.86) + display-html: BM25+Rocchio w/ doc2query-T5 (k1=2.18, b=0.86) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rocchio + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.2395 + R@1K: 0.9535 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4339 + nDCG@10: 0.6559 + R@1K: 0.8465 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4376 + nDCG@10: 0.6224 + R@1K: 0.8641 + - name: bm25-rocchio-d2q-t5-default + display: BM25+Rocchio w/ doc2query-T5 (k1=0.9, b=0.4) + display-html: BM25+Rocchio w/ doc2query-T5 (k1=0.9, b=0.4) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4 + topics: + - topic_key: 
msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.2158 + R@1K: 0.9467 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4469 + nDCG@10: 0.6538 + R@1K: 0.8855 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4246 + nDCG@10: 0.6102 + R@1K: 0.8675 + - name: bm25-rocchio-default + display: BM25+Rocchio (k1=0.9, b=0.4) + display-html: BM25+Rocchio (k1=0.9, b=0.4) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --rocchio + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.1595 + R@1K: 0.8620 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.3474 + nDCG@10: 0.5275 + R@1K: 0.8007 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.3115 + nDCG@10: 0.4910 + R@1K: 0.8156 + - name: bm25-rocchio-tuned + display: BM25+Rocchio (k1=0.82, b=0.68) + display-html: BM25+Rocchio (k1=0.82, b=0.68) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --rocchio + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.1684 + R@1K: 0.8726 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.3396 + nDCG@10: 0.5275 + R@1K: 0.7948 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.3120 + nDCG@10: 0.4908 + R@1K: 0.8327 + - name: distilbert-kd-tasb-pytorch + display: "DistilBERT KD TASB: query inference with PyTorch" + display-html: "DistilBERT KD TASB: query inference with PyTorch" + display-row: "[5]" + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-tas_b-b256 --topics $topics --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3444 + R@1K: 0.9771 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4590 + nDCG@10: 0.7210 + R@1K: 0.8406 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4698 + nDCG@10: 0.6854 + R@1K: 0.8727 + - name: distilbert-kd-tasb + display: "DistilBERT KD TASB: pre-encoded" + display-html: "DistilBERT KD TASB: pre-encoded queries" + display-row: "[5]" + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-tas_b-b256 --topics $topics --encoded-queries distilbert_tas_b-$topics --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3444 + R@1K: 0.9771 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4590 + nDCG@10: 0.7210 + R@1K: 0.8406 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4698 + nDCG@10: 0.6854 + R@1K: 0.8727 + - name: distilbert-kd-pytorch + display: "DistilBERT KD: query inference with PyTorch" + display-html: "DistilBERT KD: query inference with PyTorch" + display-row: "[4]" + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-margin-mse-t2 --topics $topics --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + 
scores: + - MRR@10: 0.3251 + R@1K: 0.9553 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4053 + nDCG@10: 0.6994 + R@1K: 0.7653 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4159 + nDCG@10: 0.6447 + R@1K: 0.7953 + - name: distilbert-kd + display: "DistilBERT KD: pre-encoded" + display-html: "DistilBERT KD: pre-encoded queries" + display-row: "[4]" + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-margin-mse-t2 --topics $topics --encoded-queries distilbert_kd-$topics --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3251 + R@1K: 0.9553 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4053 + nDCG@10: 0.6994 + R@1K: 0.7653 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4159 + nDCG@10: 0.6447 + R@1K: 0.7953 + - name: ance-pytorch + display: "ANCE: query inference with PyTorch" + display-html: "ANCE: query inference with PyTorch" + display-row: "[3]" + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.ance --topics $topics --encoder castorini/ance-msmarco-passage --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3302 + R@1K: 0.9587 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.3710 + nDCG@10: 0.6452 + R@1K: 0.7554 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4076 + nDCG@10: 0.6458 + R@1K: 0.7764 + - name: ance + display: "ANCE: pre-encoded" + display-html: "ANCE: pre-encoded queries" + display-row: "[3]" + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.ance --topics $topics --encoded-queries ance-$topics --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3302 + R@1K: 0.9584 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.3710 + nDCG@10: 0.6452 + R@1K: 0.7554 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4076 + nDCG@10: 0.6458 + R@1K: 0.7764 + - name: bm25-tuned + display: BM25 (k1=0.82, b=0.68) + display-html: BM25 (k1=0.82, b=0.68) + command: python -m pyserini.search.lucene --topics $topics --index msmarco-v1-passage --output $output --bm25 + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.1875 + R@1K: 0.8573 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.2903 + nDCG@10: 0.4973 + R@1K: 0.7450 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.2876 + nDCG@10: 0.4876 + R@1K: 0.8031 + - name: bm25-rm3-tuned + display: BM25+RM3 (k1=0.82, b=0.68) + display-html: BM25+RM3 (k1=0.82, b=0.68) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --rm3 + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.1646 + R@1K: 0.8704 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.3339 + nDCG@10: 0.5147 + R@1K: 0.7950 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.3017 + nDCG@10: 0.4924 + R@1K: 0.8292 + - name: bm25-default + display: BM25 (k1=0.9, b=0.4) + display-html: BM25 (k1=0.9, b=0.4) + display-row: "[1] — (1a)" + command: 
python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.1840 + R@1K: 0.8526 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.3013 + nDCG@10: 0.5058 + R@1K: 0.7501 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.2856 + nDCG@10: 0.4796 + R@1K: 0.7863 + - name: bm25-rm3-default + display: BM25+RM3 (k1=0.9, b=0.4) + display-html: BM25+RM3 (k1=0.9, b=0.4) + display-row: "[1] — (1b)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --rm3 + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.1566 + R@1K: 0.8606 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.3416 + nDCG@10: 0.5216 + R@1K: 0.8136 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.3006 + nDCG@10: 0.4896 + R@1K: 0.8236 + - name: bm25-d2q-t5-tuned + display: BM25 w/ doc2query-T5 (k1=2.18, b=0.86) + display-html: BM25 w/ doc2query-T5 (k1=2.18, b=0.86) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5 --topics $topics --output $output --bm25 + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.2816 + R@1K: 0.9506 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4046 + nDCG@10: 0.6336 + R@1K: 0.8134 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4171 + nDCG@10: 0.6265 + R@1K: 0.8393 + - name: bm25-d2q-t5-default + display: BM25 w/ doc2query-T5 (k1=0.9, b=0.4) + display-html: BM25 w/ doc2query-T5 (k1=0.9, b=0.4) + display-row: "[1] — (2a)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.2723 + R@1K: 0.9470 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4034 + nDCG@10: 0.6417 + R@1K: 0.8310 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4074 + nDCG@10: 0.6187 + R@1K: 0.8452 + - name: bm25-rm3-d2q-t5-tuned + display: BM25+RM3 w/ doc2query-T5 (k1=2.18, b=0.86) + display-html: BM25+RM3 w/ doc2query-T5 (k1=2.18, b=0.86) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.2382 + R@1K: 0.9528 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4377 + nDCG@10: 0.6537 + R@1K: 0.8443 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4348 + nDCG@10: 0.6235 + R@1K: 0.8605 + - name: bm25-rm3-d2q-t5-default + display: BM25+RM3 w/ doc2query-T5 (k1=0.9, b=0.4) + display-html: BM25+RM3 w/ doc2query-T5 (k1=0.9, b=0.4) + display-row: "[1] — (2b)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4 + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: 
msmarco-passage-dev-subset + scores: + - MRR@10: 0.2139 + R@1K: 0.9460 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4483 + nDCG@10: 0.6586 + R@1K: 0.8863 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4286 + nDCG@10: 0.6131 + R@1K: 0.8700 + - name: unicoil-pytorch + display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch" + display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3509 + R@1K: 0.9581 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4617 + nDCG@10: 0.7027 + R@1K: 0.8291 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4429 + nDCG@10: 0.6745 + R@1K: 0.8433 + - name: unicoil-onnx + display: "uniCOIL (w/ doc2query-T5): query inference with ONNX" + display-html: "uniCOIL (w/ doc2query-T5): query inference with ONNX" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --onnx-encoder UniCoil --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3509 + R@1K: 0.9581 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4617 + nDCG@10: 0.7027 + R@1K: 0.8291 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4429 + nDCG@10: 0.6745 + R@1K: 0.8433 + - name: unicoil + display: "uniCOIL (w/ doc2query-T5): pre-encoded" + display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries" + display-row: "[1] — (3b)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-passage-dev-subset-unicoil + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3516 + R@1K: 0.9582 + - topic_key: dl19-passage-unicoil + eval_key: dl19-passage + scores: + - MAP: 0.4612 + nDCG@10: 0.7024 + R@1K: 0.8292 + - topic_key: dl20-unicoil + eval_key: dl20-passage + scores: + - MAP: 0.4430 + nDCG@10: 0.6745 + R@1K: 0.8430 + - name: unicoil-noexp-pytorch + display: "uniCOIL (noexp): query inference with PyTorch" + display-html: "uniCOIL (noexp): query inference with PyTorch" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3153 + R@1K: 0.9239 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4033 + nDCG@10: 0.6434 + R@1K: 0.7752 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4022 + nDCG@10: 0.6524 + R@1K: 0.7861 + - name: unicoil-noexp-onnx + display: "uniCOIL (noexp): query inference with ONNX" + display-html: "uniCOIL (noexp): query inference with ONNX" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --onnx-encoder UniCoil --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: 
msmarco-passage-dev-subset + scores: + - MRR@10: 0.3119 + R@1K: 0.9239 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4061 + nDCG@10: 0.6531 + R@1K: 0.7809 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.3909 + nDCG@10: 0.6388 + R@1K: 0.7915 + - name: unicoil-noexp + display: "uniCOIL (noexp): pre-encoded" + display-html: "uniCOIL (noexp): pre-encoded queries" + display-row: "[1] — (3a)" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-passage-dev-subset-unicoil-noexp + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3153 + R@1K: 0.9239 + - topic_key: dl19-passage-unicoil-noexp + eval_key: dl19-passage + scores: + - MAP: 0.4033 + nDCG@10: 0.6433 + R@1K: 0.7752 + - topic_key: dl20-unicoil-noexp + eval_key: dl20-passage + scores: + - MAP: 0.4021 + nDCG@10: 0.6523 + R@1K: 0.7861 + - name: splade-pp-ed-onnx + display: "SPLADE++ EnsembleDistil: query inference with ONNX" + display-html: "SPLADE++ EnsembleDistil: query inference with ONNX" + display-row: "[2]" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-splade-pp-ed --topics $topics --onnx-encoder SpladePlusPlusEnsembleDistil --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3830 + R@1K: 0.9831 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.5054 + nDCG@10: 0.7320 + R@1K: 0.8724 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.5002 + nDCG@10: 0.7198 + R@1K: 0.8995 + - name: splade-pp-sd-onnx + display: "SPLADE++ SelfDistil: query inference with ONNX" + display-html: "SPLADE++ SelfDistil: query inference with ONNX" + display-row: "[2]" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-splade-pp-sd --topics $topics --onnx-encoder SpladePlusPlusSelfDistil --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3778 + R@1K: 0.9846 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4997 + nDCG@10: 0.7356 + R@1K: 0.8758 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.5140 + nDCG@10: 0.7285 + R@1K: 0.9023 + - name: tct_colbert-v2-hnp-pytorch + display: "TCT_ColBERT-V2-HN+: query inference with PyTorch" + display-html: "TCT_ColBERT-V2-HN+: query inference with PyTorch" + display-row: "[6]" + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.tct_colbert-v2-hnp --topics $topics --encoder castorini/tct_colbert-v2-hnp-msmarco --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3584 + R@1K: 0.9695 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4469 + nDCG@10: 0.7204 + R@1K: 0.8261 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4754 + nDCG@10: 0.6882 + R@1K: 0.8429 + - name: tct_colbert-v2-hnp + display: "TCT_ColBERT-V2-HN+: pre-encoded" + display-html: "TCT_ColBERT-V2-HN+: pre-encoded queries" + display-row: "[6]" + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.tct_colbert-v2-hnp --topics $topics --encoded-queries tct_colbert-v2-hnp-$topics 
--output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3584 + R@1K: 0.9695 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4469 + nDCG@10: 0.7204 + R@1K: 0.8261 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4754 + nDCG@10: 0.6882 + R@1K: 0.8429 + - name: slimr + display: "SLIM: query inference with PyTorch" + display-html: "SLIM: query inference with PyTorch" + display-row: "[7]" + command: python -m pyserini.search.lucene --threads 16 --batch 128 --index msmarco-v1-passage-slimr --topics $topics --encoder castorini/slimr-msmarco-passage --encoded-corpus scipy-sparse-vectors.msmarco-v1-passage-slimr --output $output --output-format msmarco --hits 1000 --impact --min-idf 3 + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3581 + R@1K: 0.9620 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4509 + nDCG@10: 0.7010 + R@1K: 0.8241 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4419 + nDCG@10: 0.6403 + R@1K: 0.8543 + - name: slimr-pp + display: "SLIM++: query inference with PyTorch" + display-html: "SLIM++: query inference with PyTorch" + display-row: "[7]" + command: python -m pyserini.search.lucene --threads 16 --batch 128 --index msmarco-v1-passage-slimr-pp --topics $topics --encoder castorini/slimr-pp-msmarco-passage --encoded-corpus scipy-sparse-vectors.msmarco-v1-passage-slimr-pp --output $output --output-format msmarco --hits 1000 --impact --min-idf 3 + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.4032 + R@1K: 0.9680 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4687 + nDCG@10: 0.7140 + R@1K: 0.8415 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4906 + nDCG@10: 0.7021 + R@1K: 0.8551 + - name: aggretriever-distilbert-pytorch + display: "Aggretriever-DistilBERT: query inference with PyTorch" + display-html: "Aggretriever-DistilBERT: query inference with PyTorch" + display-row: "[8]" + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.aggretriever-distilbert --topics $topics --encoder castorini/aggretriever-distilbert --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3412 + R@1K: 0.9604 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4301 + nDCG@10: 0.6816 + R@1K: 0.8023 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4329 + nDCG@10: 0.6726 + R@1K: 0.8351 + - name: aggretriever-cocondenser-pytorch + display: "Aggretriever-coCondenser: query inference with PyTorch" + display-html: "Aggretriever-coCondenser: query inference with PyTorch" + display-row: "[8]" + command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.aggretriever-cocondenser --topics $topics --encoder castorini/aggretriever-cocondenser --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3619 + R@1K: 0.9735 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4350 + nDCG@10: 0.6837 + R@1K: 0.8078 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4710 + nDCG@10: 0.6972 + R@1K: 0.8555 + - name: openai-ada2 + display: "OpenAI ada2: pre-encoded queries" + 
display-html: "OpenAI ada2: pre-encoded queries" + command: python -m pyserini.search.faiss --threads 16 --batch-size 128 --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics --output $output + topics: + - topic_key: msmarco-passage-dev-subset + eval_key: msmarco-passage-dev-subset + scores: + - MRR@10: 0.3435 + R@1K: 0.9858 + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.4788 + nDCG@10: 0.7035 + R@1K: 0.8629 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4771 + nDCG@10: 0.6759 + R@1K: 0.8705 + - name: openai-ada2-hyde + display: "HyDE-OpenAI ada2: pre-encoded queries" + display-html: "HyDE-OpenAI ada2: pre-encoded queries" + command: python -m pyserini.search.faiss --threads 16 --batch-size 128 --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics-hyde --output $output + topics: + - topic_key: dl19-passage + eval_key: dl19-passage + scores: + - MAP: 0.5125 + nDCG@10: 0.7163 + R@1K: 0.9002 + - topic_key: dl20 + eval_key: dl20-passage + scores: + - MAP: 0.4938 + nDCG@10: 0.6666 + R@1K: 0.8919 \ No newline at end of file diff --git a/pyserini/2cr/msmarco-v2-doc.yaml b/pyserini/2cr/msmarco-v2-doc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6287f8a3be2102db1e55eaea5df5242fd5a15ff7 --- /dev/null +++ b/pyserini/2cr/msmarco-v2-doc.yaml @@ -0,0 +1,287 @@ +conditions: + - name: bm25-doc-default + display: BM25 doc (k1=0.9, b=0.4) + display-html: BM25 doc (k1=0.9, b=0.4) + display-row: (1a) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc --topics $topics --output $output --bm25 + topics: + - topic_key: msmarco-v2-doc-dev + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.1572 + R@1K: 0.8054 + - topic_key: msmarco-v2-doc-dev2 + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.1659 + R@1K: 0.8029 + - topic_key: dl21 + eval_key: dl21-doc + scores: + - MAP@100: 0.2126 + nDCG@10: 0.5116 + MRR@100: 0.8367 + R@100: 0.3195 + R@1K: 0.6739 + - name: bm25-doc-segmented-default + display: BM25 doc segmented (k1=0.9, b=0.4) + display-html: BM25 doc segmented (k1=0.9, b=0.4) + display-row: (1b) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-v2-doc-dev + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.1896 + R@1K: 0.8542 + - topic_key: msmarco-v2-doc-dev2 + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.1930 + R@1K: 0.8549 + - topic_key: dl21 + eval_key: dl21-doc + scores: + - MAP@100: 0.2436 + nDCG@10: 0.5776 + MRR@100: 0.8937 + R@100: 0.3478 + R@1K: 0.6930 + - name: bm25-rm3-doc-default + display: BM25+RM3 doc (k1=0.9, b=0.4) + display-html: BM25+RM3 doc (k1=0.9, b=0.4) + display-row: (1c) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc --topics $topics --output $output --bm25 --rm3 + topics: + - topic_key: msmarco-v2-doc-dev + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.0974 + R@1K: 0.7699 + - topic_key: msmarco-v2-doc-dev2 + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.1033 + R@1K: 0.7736 + - topic_key: dl21 + eval_key: dl21-doc + scores: + - MAP@100: 0.2452 + nDCG@10: 0.5304 + MRR@100: 0.7914 + R@100: 0.3376 + R@1K: 0.7341 + - name: bm25-rm3-doc-segmented-default + display: BM25+RM3 doc segmented (k1=0.9, b=0.4) + display-html: 
BM25+RM3 doc segmented (k1=0.9, b=0.4) + display-row: (1d) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-v2-doc-dev + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.1660 + R@1K: 0.8608 + - topic_key: msmarco-v2-doc-dev2 + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.1702 + R@1K: 0.8639 + - topic_key: dl21 + eval_key: dl21-doc + scores: + - MAP@100: 0.2936 + nDCG@10: 0.6189 + MRR@100: 0.9076 + R@100: 0.3890 + R@1K: 0.7678 + - name: bm25-d2q-t5-doc-default + display: BM25 w/ doc2query-T5 doc (k1=0.9, b=0.4) + display-html: BM25 w/ doc2query-T5 doc (k1=0.9, b=0.4) + display-row: (2a) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-d2q-t5 --topics $topics --output $output --bm25 + topics: + - topic_key: msmarco-v2-doc-dev + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.2011 + R@1K: 0.8614 + - topic_key: msmarco-v2-doc-dev2 + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.2012 + R@1K: 0.8568 + - topic_key: dl21 + eval_key: dl21-doc + scores: + - MAP@100: 0.2387 + nDCG@10: 0.5792 + MRR@100: 0.8866 + R@100: 0.3443 + R@1K: 0.7066 + - name: bm25-d2q-t5-doc-segmented-default + display: BM25 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4) + display-html: BM25 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4) + display-row: (2b) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-v2-doc-dev + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.2226 + R@1K: 0.8982 + - topic_key: msmarco-v2-doc-dev2 + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.2234 + R@1K: 0.8952 + - topic_key: dl21 + eval_key: dl21-doc + scores: + - MAP@100: 0.2683 + nDCG@10: 0.6289 + MRR@100: 0.9454 + R@100: 0.3656 + R@1K: 0.7202 + - name: bm25-rm3-d2q-t5-doc-default + display: BM25+RM3 w/ doc2query-T5 doc (k1=0.9, b=0.4) + display-html: BM25+RM3 w/ doc2query-T5 doc (k1=0.9, b=0.4) + display-row: (2c) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 + topics: + - topic_key: msmarco-v2-doc-dev + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.1141 + R@1K: 0.8191 + - topic_key: msmarco-v2-doc-dev2 + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.1170 + R@1K: 0.8247 + - topic_key: dl21 + eval_key: dl21-doc + scores: + - MAP@100: 0.2611 + nDCG@10: 0.5375 + MRR@100: 0.8255 + R@100: 0.3580 + R@1K: 0.7574 + - name: bm25-rm3-d2q-t5-doc-segmented-default + display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4) + display-html: BM25+RM3 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4) + display-row: (2d) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-v2-doc-dev + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.1975 + R@1K: 0.9002 + - topic_key: msmarco-v2-doc-dev2 + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.1978 + R@1K: 0.8972 + - topic_key: dl21 + eval_key: dl21-doc + scores: + - MAP@100: 0.3191 + nDCG@10: 0.6559 + MRR@100: 0.8989 
+ R@100: 0.4131 + R@1K: 0.7948 + - name: unicoil-noexp + display: "uniCOIL (noexp): pre-encoded" + display-html: "uniCOIL (noexp): pre-encoded queries" + display-row: (3a) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-noexp-0shot --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-v2-doc-dev-unicoil-noexp + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.2231 + R@1K: 0.8987 + - topic_key: msmarco-v2-doc-dev2-unicoil-noexp + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.2314 + R@1K: 0.8995 + - topic_key: dl21-unicoil-noexp + eval_key: dl21-doc + scores: + - MAP@100: 0.2587 + nDCG@10: 0.6495 + MRR@100: 0.9282 + R@100: 0.3563 + R@1K: 0.6787 + - name: unicoil-noexp-otf + display: "uniCOIL (noexp): query inference with PyTorch" + display-html: "uniCOIL (noexp): query inference with PyTorch" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-noexp-0shot --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-v2-doc-dev + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.2232 + R@1K: 0.8987 + - topic_key: msmarco-v2-doc-dev2 + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.2314 + R@1K: 0.8993 + - topic_key: dl21 + eval_key: dl21-doc + scores: + - MAP@100: 0.2589 + nDCG@10: 0.6501 + MRR@100: 0.9282 + R@100: 0.3574 + R@1K: 0.6782 + - name: unicoil + display: "uniCOIL (w/ doc2query-T5): pre-encoded" + display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries" + display-row: (3b) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-0shot --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-v2-doc-dev-unicoil + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.2419 + R@1K: 0.9122 + - topic_key: msmarco-v2-doc-dev2-unicoil + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.2445 + R@1K: 0.9172 + - topic_key: dl21-unicoil + eval_key: dl21-doc + scores: + - MAP@100: 0.2718 + nDCG@10: 0.6783 + MRR@100: 0.9684 + R@100: 0.3700 + R@1K: 0.7069 + - name: unicoil-otf + display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch" + display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-0shot --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage + topics: + - topic_key: msmarco-v2-doc-dev + eval_key: msmarco-v2-doc-dev + scores: + - MRR@100: 0.2419 + R@1K: 0.9120 + - topic_key: msmarco-v2-doc-dev2 + eval_key: msmarco-v2-doc-dev2 + scores: + - MRR@100: 0.2447 + R@1K: 0.9174 + - topic_key: dl21 + eval_key: dl21-doc + scores: + - MAP@100: 0.2720 + nDCG@10: 0.6782 + MRR@100: 0.9684 + R@100: 0.3702 + R@1K: 0.7071 diff --git a/pyserini/2cr/msmarco-v2-passage.yaml b/pyserini/2cr/msmarco-v2-passage.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06383d3657b9d926d96e914b9597c02ea5c396d6 --- /dev/null +++ b/pyserini/2cr/msmarco-v2-passage.yaml @@ -0,0 +1,287 @@ +conditions: + - name: bm25-default + display: BM25 original passage (k1=0.9, b=0.4) + display-html: BM25 original passage 
(k1=0.9, b=0.4) + display-row: (1a) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage --topics $topics --output $output --bm25 + topics: + - topic_key: msmarco-v2-passage-dev + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.0719 + R@1K: 0.5733 + - topic_key: msmarco-v2-passage-dev2 + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.0802 + R@1K: 0.5839 + - topic_key: dl21 + eval_key: dl21-passage + scores: + - MAP@100: 0.1357 + nDCG@10: 0.4458 + MRR@100: 0.5060 + R@100: 0.3261 + R@1K: 0.6149 + - name: bm25-augmented-default + display: BM25 augmented passage (k1=0.9, b=0.4) + display-html: BM25 augmented passage (k1=0.9, b=0.4) + display-row: (1b) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented --topics $topics --output $output --bm25 + topics: + - topic_key: msmarco-v2-passage-dev + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.0872 + R@1K: 0.6925 + - topic_key: msmarco-v2-passage-dev2 + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.0917 + R@1K: 0.6933 + - topic_key: dl21 + eval_key: dl21-passage + scores: + - MAP@100: 0.0977 + nDCG@10: 0.3977 + MRR@100: 0.5303 + R@100: 0.2709 + R@1K: 0.5835 + - name: bm25-rm3-default + display: BM25+RM3 original passage (k1=0.9, b=0.4) + display-html: BM25+RM3 original passage (k1=0.9, b=0.4) + display-row: (1c) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage --topics $topics --output $output --bm25 --rm3 + topics: + - topic_key: msmarco-v2-passage-dev + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.0630 + R@1K: 0.5947 + - topic_key: msmarco-v2-passage-dev2 + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.0659 + R@1K: 0.6062 + - topic_key: dl21 + eval_key: dl21-passage + scores: + - MAP@100: 0.1666 + nDCG@10: 0.4455 + MRR@100: 0.5202 + R@100: 0.3499 + R@1K: 0.6616 + - name: bm25-rm3-augmented-default + display: BM25+RM3 augmented passage (k1=0.9, b=0.4) + display-html: BM25+RM3 augmented passage (k1=0.9, b=0.4) + display-row: (1d) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented --topics $topics --output $output --bm25 --rm3 + topics: + - topic_key: msmarco-v2-passage-dev + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.0667 + R@1K: 0.6857 + - topic_key: msmarco-v2-passage-dev2 + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.0700 + R@1K: 0.6826 + - topic_key: dl21 + eval_key: dl21-passage + scores: + - MAP@100: 0.1050 + nDCG@10: 0.3869 + MRR@100: 0.4915 + R@100: 0.2807 + R@1K: 0.6298 + - name: bm25-d2q-t5-default + display: BM25 w/ doc2query-T5 original passage (k1=0.9, b=0.4) + display-html: BM25 w/ doc2query-T5 original passage (k1=0.9, b=0.4) + display-row: (2a) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-d2q-t5 --topics $topics --output $output --bm25 + topics: + - topic_key: msmarco-v2-passage-dev + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.1072 + R@1K: 0.7083 + - topic_key: msmarco-v2-passage-dev2 + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.1123 + R@1K: 0.7151 + - topic_key: dl21 + eval_key: dl21-passage + scores: + - MAP@100: 0.1874 + nDCG@10: 0.4816 + MRR@100: 0.6848 + R@100: 0.4076 + R@1K: 0.7078 + - name: bm25-d2q-t5-augmented-default + display: BM25 w/ doc2query-T5 augmented passage (k1=0.9, b=0.4) + display-html: BM25 w/ doc2query-T5 
augmented passage (k1=0.9, b=0.4) + display-row: (2b) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented-d2q-t5 --topics $topics --output $output --bm25 + topics: + - topic_key: msmarco-v2-passage-dev + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.1172 + R@1K: 0.7647 + - topic_key: msmarco-v2-passage-dev2 + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.1170 + R@1K: 0.7659 + - topic_key: dl21 + eval_key: dl21-passage + scores: + - MAP@100: 0.1649 + nDCG@10: 0.4702 + MRR@100: 0.6391 + R@100: 0.3883 + R@1K: 0.6962 + - name: bm25-rm3-d2q-t5-default + display: BM25+RM3 w/ doc2query-T5 original passage (k1=0.9, b=0.4) + display-html: BM25+RM3 w/ doc2query-T5 original passage (k1=0.9, b=0.4) + display-row: (2c) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 + topics: + - topic_key: msmarco-v2-passage-dev + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.0947 + R@1K: 0.7181 + - topic_key: msmarco-v2-passage-dev2 + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.0984 + R@1K: 0.7222 + - topic_key: dl21 + eval_key: dl21-passage + scores: + - MAP@100: 0.2285 + nDCG@10: 0.5098 + MRR@100: 0.6548 + R@100: 0.4499 + R@1K: 0.7537 + - name: bm25-rm3-d2q-t5-augmented-default + display: BM25+RM3 w/ doc2query-T5 augmented passage (k1=0.9, b=0.4) + display-html: BM25+RM3 w/ doc2query-T5 augmented passage (k1=0.9, b=0.4) + display-row: (2d) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 + topics: + - topic_key: msmarco-v2-passage-dev + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.0883 + R@1K: 0.7607 + - topic_key: msmarco-v2-passage-dev2 + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.0904 + R@1K: 0.7649 + - topic_key: dl21 + eval_key: dl21-passage + scores: + - MAP@100: 0.1930 + nDCG@10: 0.4812 + MRR@100: 0.5958 + R@100: 0.4321 + R@1K: 0.7672 + - name: unicoil + display: "uniCOIL (w/ doc2query-T5): pre-encoded" + display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries" + display-row: (3b) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-0shot --topics $topics --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-v2-passage-dev-unicoil + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.1499 + R@1K: 0.7616 + - topic_key: msmarco-v2-passage-dev2-unicoil + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.1577 + R@1K: 0.7671 + - topic_key: dl21-unicoil + eval_key: dl21-passage + scores: + - MAP@100: 0.2538 + nDCG@10: 0.6159 + MRR@100: 0.7311 + R@100: 0.4731 + R@1K: 0.7551 + - name: unicoil-otf + display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch" + display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-0shot --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-v2-passage-dev + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.1501 + R@1K: 0.7613 + - topic_key: msmarco-v2-passage-dev2 + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.1576 + R@1K: 0.7676 + - topic_key: dl21 + eval_key: dl21-passage + scores: + 
- MAP@100: 0.2539 + nDCG@10: 0.6160 + MRR@100: 0.7311 + R@100: 0.4723 + R@1K: 0.7560 + - name: unicoil-noexp + display: "uniCOIL (noexp): pre-encoded" + display-html: "uniCOIL (noexp): pre-encoded queries" + display-row: (3a) + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-noexp-0shot --topics $topics --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-v2-passage-dev-unicoil-noexp + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.1342 + R@1K: 0.7010 + - topic_key: msmarco-v2-passage-dev2-unicoil-noexp + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.1385 + R@1K: 0.7114 + - topic_key: dl21-unicoil-noexp + eval_key: dl21-passage + scores: + - MAP@100: 0.2193 + nDCG@10: 0.5756 + MRR@100: 0.6991 + R@100: 0.4246 + R@1K: 0.6897 + - name: unicoil-noexp-otf + display: "uniCOIL (noexp): query inference with PyTorch" + display-html: "uniCOIL (noexp): query inference with PyTorch" + command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-noexp-0shot --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --hits 1000 --impact + topics: + - topic_key: msmarco-v2-passage-dev + eval_key: msmarco-v2-passage-dev + scores: + - MRR@100: 0.1343 + R@1K: 0.7010 + - topic_key: msmarco-v2-passage-dev2 + eval_key: msmarco-v2-passage-dev2 + scores: + - MRR@100: 0.1385 + R@1K: 0.7114 + - topic_key: dl21 + eval_key: dl21-passage + scores: + - MAP@100: 0.2194 + nDCG@10: 0.5759 + MRR@100: 0.6991 + R@100: 0.4247 + R@1K: 0.6893 diff --git a/pyserini/2cr/msmarco.py b/pyserini/2cr/msmarco.py new file mode 100644 index 0000000000000000000000000000000000000000..acc0e5e31a3197a9425735af1f344919ca5abd12 --- /dev/null +++ b/pyserini/2cr/msmarco.py @@ -0,0 +1,600 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import math +import os +import re +import sys +import time +from collections import defaultdict +from string import Template + +import pkg_resources +import yaml + +from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str + +# The models: the rows of the results table will be ordered this way. 
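The `models` dict that follows fixes the row order of the generated report; the conditions themselves live in the YAML files above, which all share one schema: a top-level `conditions` list whose entries carry a `name`, display strings, a `command` template with `$topics` and `$output` placeholders, and a `topics` list pairing a `topic_key`/`eval_key` with expected `scores`. As a minimal sketch of how such an entry is consumed (the file path, condition name, and output filename below are illustrative, not part of this patch):

```python
from string import Template

import yaml

# Load one of the condition files added by this patch (path is illustrative).
with open('pyserini/2cr/msmarco-v1-passage.yaml') as f:
    conditions = yaml.safe_load(f)['conditions']

# Pick a condition by name and materialize its command template.
condition = next(c for c in conditions if c['name'] == 'bm25-default')
topic_key = condition['topics'][0]['topic_key']          # 'msmarco-passage-dev-subset'
runfile = 'run.msmarco-v1-passage.bm25-default.dev.txt'  # illustrative output name

cmd = Template(condition['command']).substitute(topics=topic_key, output=runfile)
print(cmd)
# python -m pyserini.search.lucene --threads 16 --batch-size 128
#   --index msmarco-v1-passage --topics msmarco-passage-dev-subset
#   --output run.msmarco-v1-passage.bm25-default.dev.txt --bm25 --k1 0.9 --b 0.4
```

The same `Template(cmd_template).substitute(topics=..., output=...)` call appears in `generate_report` and `run_conditions` below.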
+models = { + # MS MARCO v1 passage + 'msmarco-v1-passage': + ['bm25-default', + 'bm25-rm3-default', + 'bm25-rocchio-default', + '', + 'bm25-tuned', + 'bm25-rm3-tuned', + 'bm25-rocchio-tuned', + '', + 'bm25-d2q-t5-default', + 'bm25-rm3-d2q-t5-default', + 'bm25-rocchio-d2q-t5-default', + '', + 'bm25-d2q-t5-tuned', + 'bm25-rm3-d2q-t5-tuned', + 'bm25-rocchio-d2q-t5-tuned', + '', + 'unicoil', + 'unicoil-pytorch', + 'unicoil-onnx', + 'unicoil-noexp', + 'unicoil-noexp-pytorch', + 'unicoil-noexp-onnx', + '', + 'splade-pp-ed-onnx', + 'splade-pp-sd-onnx', + '', + 'ance', + 'ance-pytorch', + '', + 'distilbert-kd', + 'distilbert-kd-pytorch', + 'distilbert-kd-tasb', + 'distilbert-kd-tasb-pytorch', + '', + 'tct_colbert-v2-hnp', + 'tct_colbert-v2-hnp-pytorch', + '', + 'slimr', + 'slimr-pp', + '', + 'aggretriever-distilbert-pytorch', + 'aggretriever-cocondenser-pytorch', + '', + 'openai-ada2', + 'openai-ada2-hyde'], + + # MS MARCO v1 doc + 'msmarco-v1-doc': + ['bm25-doc-default', + 'bm25-doc-segmented-default', + 'bm25-rm3-doc-default', + 'bm25-rm3-doc-segmented-default', + 'bm25-rocchio-doc-default', + 'bm25-rocchio-doc-segmented-default', + '', + 'bm25-doc-tuned', + 'bm25-doc-segmented-tuned', + 'bm25-rm3-doc-tuned', + 'bm25-rm3-doc-segmented-tuned', + 'bm25-rocchio-doc-tuned', + 'bm25-rocchio-doc-segmented-tuned', + '', + 'bm25-d2q-t5-doc-default', + 'bm25-d2q-t5-doc-segmented-default', + 'bm25-rm3-d2q-t5-doc-default', + 'bm25-rm3-d2q-t5-doc-segmented-default', + '', + 'bm25-d2q-t5-doc-tuned', + 'bm25-d2q-t5-doc-segmented-tuned', + 'bm25-rm3-d2q-t5-doc-tuned', + 'bm25-rm3-d2q-t5-doc-segmented-tuned', + '', + 'unicoil-noexp', + 'unicoil-noexp-pytorch', + '', + 'unicoil', + 'unicoil-pytorch'], + + # MS MARCO v2 passage + 'msmarco-v2-passage': + ['bm25-default', + 'bm25-augmented-default', + 'bm25-rm3-default', + 'bm25-rm3-augmented-default', + '', + 'bm25-d2q-t5-default', + 'bm25-d2q-t5-augmented-default', + 'bm25-rm3-d2q-t5-default', + 'bm25-rm3-d2q-t5-augmented-default', + '', + 'unicoil-noexp', + 'unicoil', + '', + 'unicoil-noexp-otf', + 'unicoil-otf'], + + # MS MARCO v2 doc + 'msmarco-v2-doc': + ['bm25-doc-default', + 'bm25-doc-segmented-default', + 'bm25-rm3-doc-default', + 'bm25-rm3-doc-segmented-default', + '', + 'bm25-d2q-t5-doc-default', + 'bm25-d2q-t5-doc-segmented-default', + 'bm25-rm3-d2q-t5-doc-default', + 'bm25-rm3-d2q-t5-doc-segmented-default', + '', + 'unicoil-noexp', + 'unicoil', + '', + 'unicoil-noexp-otf', + 'unicoil-otf' + ] +} + +trec_eval_metric_definitions = { + 'msmarco-v1-passage': { + 'msmarco-passage-dev-subset': { + 'MRR@10': '-c -M 10 -m recip_rank', + 'R@1K': '-c -m recall.1000' + }, + 'dl19-passage': { + 'MAP': '-c -l 2 -m map', + 'nDCG@10': '-c -m ndcg_cut.10', + 'R@1K': '-c -l 2 -m recall.1000' + }, + 'dl20-passage': { + 'MAP': '-c -l 2 -m map', + 'nDCG@10': '-c -m ndcg_cut.10', + 'R@1K': '-c -l 2 -m recall.1000' + } + }, + 'msmarco-v1-doc': { + 'msmarco-doc-dev': { + 'MRR@10': '-c -M 100 -m recip_rank', + 'R@1K': '-c -m recall.1000' + }, + 'dl19-doc': { + 'MAP': '-c -M 100 -m map', + 'nDCG@10': '-c -m ndcg_cut.10', + 'R@1K': '-c -m recall.1000' + }, + 'dl20-doc': { + 'MAP': '-c -M 100 -m map', + 'nDCG@10': '-c -m ndcg_cut.10', + 'R@1K': '-c -m recall.1000' + } + }, + 'msmarco-v2-passage': { + 'msmarco-v2-passage-dev': { + 'MRR@100': '-c -M 100 -m recip_rank', + 'R@1K': '-c -m recall.1000' + }, + 'msmarco-v2-passage-dev2': { + 'MRR@100': '-c -M 100 -m recip_rank', + 'R@1K': '-c -m recall.1000' + }, + 'dl21-passage': { + 'MAP@100': '-c -l 2 -M 100 -m map', + 'nDCG@10': 
'-c -m ndcg_cut.10', + 'MRR@100': '-c -l 2 -M 100 -m recip_rank', + 'R@100': '-c -l 2 -m recall.100', + 'R@1K': '-c -l 2 -m recall.1000' + } + }, + 'msmarco-v2-doc': { + 'msmarco-v2-doc-dev': { + 'MRR@100': '-c -M 100 -m recip_rank', + 'R@1K': '-c -m recall.1000' + }, + 'msmarco-v2-doc-dev2': { + 'MRR@100': '-c -M 100 -m recip_rank', + 'R@1K': '-c -m recall.1000' + }, + 'dl21-doc': { + 'MAP@100': '-c -M 100 -m map', + 'nDCG@10': '-c -m ndcg_cut.10', + 'MRR@100': '-c -M 100 -m recip_rank', + 'R@100': '-c -m recall.100', + 'R@1K': '-c -m recall.1000' + } + } +} + + +def find_msmarco_table_topic_set_key_v1(topic_key): + # E.g., we want to map variants like 'dl19-passage-unicoil' and 'dl19-passage' both into 'dl19' + key = '' + if topic_key.startswith('dl19'): + key = 'dl19' + elif topic_key.startswith('dl20'): + key = 'dl20' + elif topic_key.startswith('msmarco'): + key = 'dev' + + return key + + +def find_msmarco_table_topic_set_key_v2(topic_key): + key = '' + if topic_key.endswith('dev') or topic_key.endswith('dev-unicoil') or topic_key.endswith('dev-unicoil-noexp'): + key = 'dev' + elif topic_key.endswith('dev2') or topic_key.endswith('dev2-unicoil') or topic_key.endswith('dev2-unicoil-noexp'): + key = 'dev2' + elif topic_key.startswith('dl21'): + key = 'dl21' + + return key + + +def format_command(raw): + # After "--output foo.txt" are additional options like "--hits 1000 --impact". + # We want these on a separate line for better readability, but note that sometimes that might + # be the end of the command, in which case we don't want to add an extra line break. + return raw.replace('--topics', '\\\n --topics') \ + .replace('--threads', '\\\n --threads')\ + .replace('--index', '\\\n --index')\ + .replace('--output ', '\\\n --output ')\ + .replace('--encoder', '\\\n --encoder')\ + .replace('--onnx-encoder', '\\\n --onnx-encoder')\ + .replace('--encoded-corpus', '\\\n --encoded-corpus')\ + .replace('.txt ', '.txt \\\n ') + + +def read_file(f): + fin = open(f, 'r') + text = fin.read() + fin.close() + + return text + + +def list_conditions(args): + for condition in models[args.collection]: + if condition == '': + continue + print(condition) + + +def generate_report(args): + yaml_file = pkg_resources.resource_filename(__name__, f'{args.collection}.yaml') + + if args.collection == 'msmarco-v1-passage': + html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v1_passage.template')) + row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v1.template')) + elif args.collection == 'msmarco-v1-doc': + html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v1_doc.template')) + row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v1.template')) + elif args.collection == 'msmarco-v2-passage': + html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v2_passage.template')) + row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v2.template')) + elif args.collection == 'msmarco-v2-doc': + html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v2_doc.template')) + row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v2.template')) + else: + raise ValueError(f'Unknown corpus: {args.collection}') + + table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) + commands = defaultdict(lambda: defaultdict(lambda: '')) + eval_commands = defaultdict(lambda: 
defaultdict(lambda: '')) + + table_keys = {} + row_ids = {} + + with open(yaml_file) as f: + yaml_data = yaml.safe_load(f) + for condition in yaml_data['conditions']: + name = condition['name'] + display = condition['display-html'] + row_id = condition['display-row'] if 'display-row' in condition else '' + cmd_template = condition['command'] + + row_ids[name] =row_id + table_keys[name] = display + + for topic_set in condition['topics']: + topic_key = topic_set['topic_key'] + eval_key = topic_set['eval_key'] + + if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc': + short_topic_key = find_msmarco_table_topic_set_key_v1(topic_key) + else: + short_topic_key = find_msmarco_table_topic_set_key_v2(topic_key) + + runfile = f'run.{args.collection}.{name}.{short_topic_key}.txt' + cmd = Template(cmd_template).substitute(topics=topic_key, output=runfile) + commands[name][short_topic_key] = cmd + + for expected in topic_set['scores']: + for metric in expected: + eval_cmd = f'python -m pyserini.eval.trec_eval ' + \ + f'{trec_eval_metric_definitions[args.collection][eval_key][metric]} {eval_key} {runfile}' + eval_commands[name][short_topic_key] += eval_cmd + '\n' + table[name][short_topic_key][metric] = expected[metric] + + if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc': + row_cnt = 1 + + html_rows = [] + for name in models[args.collection]: + if not name: + # Add blank row for spacing + html_rows.append('') + continue + s = Template(row_template) + s = s.substitute(row_cnt=row_cnt, + condition_name=table_keys[name], + row=row_ids[name], + s1=f'{table[name]["dl19"]["MAP"]:.4f}' if table[name]['dl19']['MAP'] != 0 else '-', + s2=f'{table[name]["dl19"]["nDCG@10"]:.4f}' if table[name]['dl19']['nDCG@10'] != 0 else '-', + s3=f'{table[name]["dl19"]["R@1K"]:.4f}' if table[name]['dl19']['R@1K'] != 0 else '-', + s4=f'{table[name]["dl20"]["MAP"]:.4f}' if table[name]['dl20']['MAP'] != 0 else '-', + s5=f'{table[name]["dl20"]["nDCG@10"]:.4f}' if table[name]['dl20']['nDCG@10'] != 0 else '-', + s6=f'{table[name]["dl20"]["R@1K"]:.4f}' if table[name]['dl20']['R@1K'] != 0 else '-', + s7=f'{table[name]["dev"]["MRR@10"]:.4f}' if table[name]['dev']['MRR@10'] != 0 else '-', + s8=f'{table[name]["dev"]["R@1K"]:.4f}' if table[name]['dev']['R@1K'] != 0 else '-', + cmd1=format_command(commands[name]['dl19']), + cmd2=format_command(commands[name]['dl20']), + cmd3=format_command(commands[name]['dev']), + eval_cmd1=eval_commands[name]['dl19'], + eval_cmd2=eval_commands[name]['dl20'], + eval_cmd3=eval_commands[name]['dev'] + ) + + # If we don't have scores, we want to remove the commands also. Use simple regexp substitution. + if table[name]['dl19']['MAP'] == 0: + s = re.sub(re.compile('Command to generate run on TREC 2019 queries:.*?
', + re.MULTILINE | re.DOTALL), + 'Not available.
', s) + if table[name]['dl20']['MAP'] == 0: + s = re.sub(re.compile('Command to generate run on TREC 2020 queries:.*?
', + re.MULTILINE | re.DOTALL), + 'Not available.
', s) + if table[name]['dev']['MRR@10'] == 0: + s = re.sub(re.compile('Command to generate run on dev queries:.*?', + re.MULTILINE | re.DOTALL), + 'Not available.', s) + + html_rows.append(s) + row_cnt += 1 + + all_rows = '\n'.join(html_rows) + if args.collection == 'msmarco-v1-passage': + full_name = 'MS MARCO V1 Passage' + else: + full_name = 'MS MARCO V1 Document' + + with open(args.output, 'w') as out: + out.write(Template(html_template).substitute(title=full_name, rows=all_rows)) + else: + row_cnt = 1 + + html_rows = [] + for name in models[args.collection]: + if not name: + # Add blank row for spacing + html_rows.append('') + continue + s = Template(row_template) + s = s.substitute(row_cnt=row_cnt, + condition_name=table_keys[name], + row=row_ids[name], + s1=f'{table[name]["dl21"]["MAP@100"]:.4f}', + s2=f'{table[name]["dl21"]["nDCG@10"]:.4f}', + s3=f'{table[name]["dl21"]["MRR@100"]:.4f}', + s4=f'{table[name]["dl21"]["R@100"]:.4f}', + s5=f'{table[name]["dl21"]["R@1K"]:.4f}', + s6=f'{table[name]["dev"]["MRR@100"]:.4f}', + s7=f'{table[name]["dev"]["R@1K"]:.4f}', + s8=f'{table[name]["dev2"]["MRR@100"]:.4f}', + s9=f'{table[name]["dev2"]["R@1K"]:.4f}', + cmd1=format_command(commands[name]['dl21']), + cmd2=format_command(commands[name]['dev']), + cmd3=format_command(commands[name]['dev2']), + eval_cmd1=eval_commands[name]['dl21'], + eval_cmd2=eval_commands[name]['dev'], + eval_cmd3=eval_commands[name]['dev2'] + ) + html_rows.append(s) + row_cnt += 1 + + all_rows = '\n'.join(html_rows) + if args.collection == 'msmarco-v2-passage': + full_name = 'MS MARCO V2 Passage' + else: + full_name = 'MS MARCO V2 Document' + + with open(args.output, 'w') as out: + out.write(Template(html_template).substitute(title=full_name, rows=all_rows)) + + +def run_conditions(args): + start = time.time() + + table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) + table_keys = {} + + yaml_file = pkg_resources.resource_filename(__name__, f'{args.collection}.yaml') + + with open(yaml_file) as f: + yaml_data = yaml.safe_load(f) + for condition in yaml_data['conditions']: + # Either we're running all conditions, or running only the condition specified in --condition + if not args.all: + if not condition['name'] == args.condition: + continue + + name = condition['name'] + display = condition['display'] + cmd_template = condition['command'] + + print(f'# Running condition "{name}": {display}\n') + for topic_set in condition['topics']: + topic_key = topic_set['topic_key'] + eval_key = topic_set['eval_key'] + + short_topic_key = '' + if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc': + short_topic_key = find_msmarco_table_topic_set_key_v1(topic_key) + else: + short_topic_key = find_msmarco_table_topic_set_key_v2(topic_key) + + print(f' - topic_key: {topic_key}') + + runfile = os.path.join(args.directory, f'run.{args.collection}.{name}.{short_topic_key}.txt') + cmd = Template(cmd_template).substitute(topics=topic_key, output=runfile) + + if args.display_commands: + print(f'\n```bash\n{format_command(cmd)}\n```\n') + + if not os.path.exists(runfile): + if not args.dry_run: + os.system(cmd) + + for expected in topic_set['scores']: + for metric in expected: + table_keys[name] = display + if not args.skip_eval: + # If the runfile doesn't exist, we can't evaluate. + # This would be the case if --dry-run were set. 
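+ # Otherwise, run trec_eval and classify the result: [OK] when the measured score matches
+ # the expected score recorded in the YAML (math.isclose), [OKish] for the small set of
+ # known-flaky conditions (e.g., ance-pytorch MRR@10 on the dev subset, allowed to deviate
+ # by up to 0.0001), and [FAIL] otherwise, with the expected value printed for comparison.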
+ if not os.path.exists(runfile): + continue + + score = float( + run_eval_and_return_metric( + metric, + eval_key, + trec_eval_metric_definitions[args.collection][eval_key][metric], + runfile)) + if math.isclose(score, float(expected[metric])): + result_str = ok_str + # Flaky tests + elif args.collection == 'msmarco-v1-passage' \ + and topic_key == 'msmarco-passage-dev-subset' and name == 'ance-pytorch' \ + and metric == 'MRR@10' and abs(score-float(expected[metric])) <= 0.0001: + result_str = okish_str + else: + result_str = fail_str + f' expected {expected[metric]:.4f}' + print(f' {metric:7}: {score:.4f} {result_str}') + table[name][short_topic_key][metric] = score + else: + table[name][short_topic_key][metric] = expected[metric] + + if not args.skip_eval: + print('') + + if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc': + print(' ' * 69 + 'TREC 2019' + ' ' * 16 + 'TREC 2020' + ' ' * 12 + 'MS MARCO dev') + print(' ' * 62 + 'MAP nDCG@10 R@1K MAP nDCG@10 R@1K MRR@10 R@1K') + print(' ' * 62 + '-' * 22 + ' ' + '-' * 22 + ' ' + '-' * 14) + + if args.condition: + # If we've used --condition to specify a specific condition, print out only that row. + name = args.condition + print(f'{table_keys[name]:60}' + + f'{table[name]["dl19"]["MAP"]:8.4f}{table[name]["dl19"]["nDCG@10"]:8.4f}{table[name]["dl19"]["R@1K"]:8.4f} ' + + f'{table[name]["dl20"]["MAP"]:8.4f}{table[name]["dl20"]["nDCG@10"]:8.4f}{table[name]["dl20"]["R@1K"]:8.4f} ' + + f'{table[name]["dev"]["MRR@10"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f}') + else: + # Otherwise, print out all rows + for name in models[args.collection]: + if not name: + print('') + continue + print(f'{table_keys[name]:60}' + + f'{table[name]["dl19"]["MAP"]:8.4f}{table[name]["dl19"]["nDCG@10"]:8.4f}{table[name]["dl19"]["R@1K"]:8.4f} ' + + f'{table[name]["dl20"]["MAP"]:8.4f}{table[name]["dl20"]["nDCG@10"]:8.4f}{table[name]["dl20"]["R@1K"]:8.4f} ' + + f'{table[name]["dev"]["MRR@10"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f}') + else: + print(' ' * 77 + 'TREC 2021' + ' ' * 18 + 'MS MARCO dev' + ' ' * 6 + 'MS MARCO dev2') + print(' ' * 62 + 'MAP@100 nDCG@10 MRR@100 R@100 R@1K MRR@100 R@1K MRR@100 R@1K') + print(' ' * 62 + '-' * 38 + ' ' + '-' * 14 + ' ' + '-' * 14) + + if args.condition: + # If we've used --condition to specify a specific condition, print out only that row. 
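+ # Whether we print a single row or all rows below, each metric is right-aligned in a fixed
+ # 8-character column so the values line up under the headers printed above.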
+ name = args.condition + print(f'{table_keys[name]:60}' + + f'{table[name]["dl21"]["MAP@100"]:8.4f}{table[name]["dl21"]["nDCG@10"]:8.4f}' + + f'{table[name]["dl21"]["MRR@100"]:8.4f}{table[name]["dl21"]["R@100"]:8.4f}{table[name]["dl21"]["R@1K"]:8.4f} ' + + f'{table[name]["dev"]["MRR@100"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f} ' + + f'{table[name]["dev2"]["MRR@100"]:8.4f}{table[name]["dev2"]["R@1K"]:8.4f}') + else: + # Otherwise, print out all rows + for name in models[args.collection]: + if not name: + print('') + continue + print(f'{table_keys[name]:60}' + + f'{table[name]["dl21"]["MAP@100"]:8.4f}{table[name]["dl21"]["nDCG@10"]:8.4f}' + + f'{table[name]["dl21"]["MRR@100"]:8.4f}{table[name]["dl21"]["R@100"]:8.4f}{table[name]["dl21"]["R@1K"]:8.4f} ' + + f'{table[name]["dev"]["MRR@100"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f} ' + + f'{table[name]["dev2"]["MRR@100"]:8.4f}{table[name]["dev2"]["R@1K"]:8.4f}') + + end = time.time() + + print('\n') + print(f'Total elapsed time: {end - start:.0f}s') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Generate regression matrix for MS MARCO corpora.') + parser.add_argument('--collection', type=str, + help='Collection = {v1-passage, v1-doc, v2-passage, v2-doc}.', required=True) + # To list all conditions + parser.add_argument('--list-conditions', action='store_true', default=False, help='List available conditions.') + # For generating reports + parser.add_argument('--generate-report', action='store_true', default=False, help='Generate report.') + parser.add_argument('--output', type=str, help='File to store report.', required=False) + # For actually running the experimental conditions + parser.add_argument('--all', action='store_true', default=False, help='Run all conditions.') + parser.add_argument('--condition', type=str, help='Condition to run.', required=False) + parser.add_argument('--directory', type=str, help='Base directory.', default='', required=False) + parser.add_argument('--dry-run', action='store_true', default=False, help='Print out commands but do not execute.') + parser.add_argument('--skip-eval', action='store_true', default=False, help='Skip running trec_eval.') + parser.add_argument('--display-commands', action='store_true', default=False, help='Display command.') + args = parser.parse_args() + + if args.collection == 'v1-passage': + args.collection = 'msmarco-v1-passage' + elif args.collection == 'v1-doc': + args.collection = 'msmarco-v1-doc' + elif args.collection == 'v2-passage': + args.collection = 'msmarco-v2-passage' + elif args.collection == 'v2-doc': + args.collection = 'msmarco-v2-doc' + else: + raise ValueError(f'Unknown corpus: {args.collection}') + + if args.list_conditions: + list_conditions(args) + sys.exit() + + if args.generate_report: + if not args.output: + print(f'Must specify report filename with --output.') + sys.exit() + + generate_report(args) + sys.exit() + + if not args.all and not args.condition: + print(f'Must specify a specific condition using --condition or use --all to run all conditions.') + sys.exit() + + run_conditions(args) diff --git a/pyserini/2cr/msmarco_html_row_v1.template b/pyserini/2cr/msmarco_html_row_v1.template new file mode 100644 index 0000000000000000000000000000000000000000..c032b14e4643b2d9b2a7a109d55d0c99611e129d --- /dev/null +++ b/pyserini/2cr/msmarco_html_row_v1.template @@ -0,0 +1,81 @@ + + + +$row +${condition_name} +$s1 +$s2 +$s3 + +$s4 +$s5 +$s6 + +$s7 +$s8 + + + + +
+ + + + + + +
+
+Command to generate run on TREC 2019 queries: + +
+
$cmd1
+
+Evaluation commands: + +
+
${eval_cmd1}
+
+ +
+
+ Command to generate run on TREC 2020 queries: + +
+
$cmd2
+
+Evaluation commands: + +
+
${eval_cmd2}
+
+ +
+
+ Command to generate run on dev queries: + +
+
$cmd3
+
+Evaluation commands: + +
+
${eval_cmd3}
+
+ +
+
+ + +
+ \ No newline at end of file diff --git a/pyserini/2cr/msmarco_html_row_v2.template b/pyserini/2cr/msmarco_html_row_v2.template new file mode 100644 index 0000000000000000000000000000000000000000..a1442c96f7d105002ead16bf7808a58b3d6ed5ee --- /dev/null +++ b/pyserini/2cr/msmarco_html_row_v2.template @@ -0,0 +1,82 @@ + + + +$row +${condition_name} +$s1 +$s2 +$s3 +$s4 +$s5 + +$s6 +$s7 + +$s8 +$s9 + + + + +
+ + + + + + +
+
+Command to generate run on TREC 2021 queries: + +
+
$cmd1
+
+Evaluation commands: + +
+
${eval_cmd1}
+
+ +
+
+ Command to generate run on dev queries: + +
+
$cmd2
+
+Evaluation commands: + +
+
${eval_cmd2}
+
+ +
+
+ Command to generate run on dev2 queries: + +
+
$cmd3
+
+Evaluation commands: + +
+
${eval_cmd3}
+
+ +
+
+ + +
+ \ No newline at end of file diff --git a/pyserini/2cr/msmarco_html_v1_doc.template b/pyserini/2cr/msmarco_html_v1_doc.template new file mode 100644 index 0000000000000000000000000000000000000000..50a41306828c81177a05ded8eed864a33775f155 --- /dev/null +++ b/pyserini/2cr/msmarco_html_v1_doc.template @@ -0,0 +1,296 @@ + + + + + + + Pyserini Reproductions: MS MARCO V1 Document + + + + + + + + + + + + +
+
+
+
+

$title

+
+
+
+
+ + +
+ +

The two-click* reproduction matrix below provides commands for reproducing experimental results reported in a number of papers, denoted by the references in square brackets. +Instructions for programmatic execution are shown at the bottom of this page (scroll down).

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +$rows + + +
TREC 2019TREC 2020dev

AP@100
nDCG@10R@1K
AP@100
nDCG@10R@1KRR@100R@1K
+
+ + + +
+ +

Programmatic Execution

+ +

All experimental runs shown in the above table can be programmatically executed based on the instructions below. +To list all the experimental conditions:

+ +
+python -m pyserini.2cr.msmarco --collection v1-doc --list-conditions +
+ +

These conditions correspond to the table rows above.

+ +

For all conditions, just show the commands in a "dry run":

+ +
+python -m pyserini.2cr.msmarco --collection v1-doc --all --display-commands --dry-run +
+ +

To actually run all the experimental conditions:

+ +
+python -m pyserini.2cr.msmarco --collection v1-doc --all --display-commands +
+ +

With the above command, run files will be placed in the current directory. +Use the option --directory runs/ to place the runs in a sub-directory.

+ +

To show the commands for a specific condition:

+ +
+python -m pyserini.2cr.msmarco --collection v1-doc --condition bm25-doc-default --display-commands --dry-run +
+ +

This will generate exactly the commands for a specific condition above (corresponding to a row in the table).

+ +

To actually run a specific condition:

+ +
+python -m pyserini.2cr.msmarco --collection v1-doc --condition bm25-doc-default --display-commands +
+ +

Again, with the above command, run files will be placed in the current directory. +Use the option --directory runs/ to place the runs in a sub-directory.

+ +

Finally, to generate this page:

+ +
+python -m pyserini.2cr.msmarco --collection v1-doc --generate-report --output msmarco-v1-doc.html +
+ +

The output file msmarco-v1-doc.html should be identical to this page.

+ +
+ +
+ + + + + + + + + + + + diff --git a/pyserini/2cr/msmarco_html_v1_passage.template b/pyserini/2cr/msmarco_html_v1_passage.template new file mode 100644 index 0000000000000000000000000000000000000000..b3cd36bc52e2b556693bb2f2243c3049e98d3a33 --- /dev/null +++ b/pyserini/2cr/msmarco_html_v1_passage.template @@ -0,0 +1,325 @@ + + + + + + + Pyserini Reproductions: MS MARCO V1 Passage + + + + + + + + + + + + +
+
+
+
+

$title

+
+
+
+
+ + +
+ +

The two-click* reproduction matrix below provides commands for reproducing experimental results reported in a number of papers, denoted by the references in square brackets. +Instructions for programmatic execution are shown at the bottom of this page (scroll down).

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +$rows + + +
TREC 2019TREC 2020dev

AP
nDCG@10R@1K
AP
nDCG@10R@1KRR@10R@1K
+
+ + + +
+ +

Programmatic Execution

+ +

All experimental runs shown in the above table can be programmatically executed based on the instructions below. +To list all the experimental conditions:

+ +
+python -m pyserini.2cr.msmarco --collection v1-passage --list-conditions +
+ +

These conditions correspond to the table rows above.

+ +

For all conditions, just show the commands in a "dry run":

+ +
+python -m pyserini.2cr.msmarco --collection v1-passage --all --display-commands --dry-run +
+ +

To actually run all the experimental conditions:

+ +
+python -m pyserini.2cr.msmarco --collection v1-passage --all --display-commands +
+ +

With the above command, run files will be placed in the current directory. +Use the option --directory runs/ to place the runs in a sub-directory.

+ +

To show the commands for a specific condition:

+ +
+python -m pyserini.2cr.msmarco --collection v1-passage --condition bm25-default --display-commands --dry-run +
+ +

This will generate exactly the commands for a specific condition above (corresponding to a row in the table).

+ +

To actually run a specific condition:

+ +
+python -m pyserini.2cr.msmarco --collection v1-passage --condition bm25-default --display-commands +
+ +

Again, with the above command, run files will be placed in the current directory. +Use the option --directory runs/ to place the runs in a sub-directory.

+ +

Finally, to generate this page:

+ +
+python -m pyserini.2cr.msmarco --collection v1-passage --generate-report --output msmarco-v1-passage.html +
+ +

The output file msmarco-v1-passage.html should be identical to this page.

+ +
+ +
+ + + + + + + + + + + + diff --git a/pyserini/2cr/msmarco_html_v2_doc.template b/pyserini/2cr/msmarco_html_v2_doc.template new file mode 100644 index 0000000000000000000000000000000000000000..a3379e14af7046a33e4e9f81b70cf9c3dca9de32 --- /dev/null +++ b/pyserini/2cr/msmarco_html_v2_doc.template @@ -0,0 +1,292 @@ + + + + + + + Pyserini Reproductions: MS MARCO V2 Document + + + + + + + + + + + + +
+
+
+
+

$title

+
+
+
+
+ + +
+ +

The two-click* reproduction matrix below provides commands for reproducing experimental results reported in the following paper. +Numbered rows correspond to tables in the paper; additional conditions are provided for comparison purposes.

+ +

Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin. Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2. +Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022), July 2022.

+ +

Instructions for programmatic execution are shown at the bottom of this page (scroll down).

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +$rows + + +
TREC 2021devdev2

AP
nDCG@10RR@100R@100R@1KRR@100R@1KRR@100R@1K
+
+ +
+ +

Programmatic Execution

+ +

All experimental runs shown in the above table can be programmatically executed based on the instructions below. +To list all the experimental conditions:

+ +
+python -m pyserini.2cr.msmarco --collection v2-doc --list-conditions +
+ +

These conditions correspond to the table rows above.

+ +

For all conditions, just show the commands in a "dry run":

+ +
+python -m pyserini.2cr.msmarco --collection v2-doc --all --display-commands --dry-run +
+ +

To actually run all the experimental conditions:

+ +
+python -m pyserini.2cr.msmarco --collection v2-doc --all --display-commands +
+ +

With the above command, run files will be placed in the current directory. +Use the option --directory runs/ to place the runs in a sub-directory.

+ +

To show the commands for a specific condition:

+ +
+python -m pyserini.2cr.msmarco --collection v2-doc --condition bm25-doc-default --display-commands --dry-run +
+ +

This will generate exactly the commands for a specific condition above (corresponding to a row in the table).

+ +

To actually run a specific condition:

+ +
+python -m pyserini.2cr.msmarco --collection v2-doc --condition bm25-doc-default --display-commands +
+ +

Again, with the above command, run files will be placed in the current directory. +Use the option --directory runs/ to place the runs in a sub-directory.

+ +

Finally, to generate this page:

+ +
+python -m pyserini.2cr.msmarco --collection v2-doc --generate-report --output msmarco-v2-doc.html +
+ +

The output file msmarco-v2-doc.html should be identical to this page.

+ +
+ +
+ + + + + + + + + + + + diff --git a/pyserini/2cr/msmarco_html_v2_passage.template b/pyserini/2cr/msmarco_html_v2_passage.template new file mode 100644 index 0000000000000000000000000000000000000000..79d0d012ba1790413305e4587de4663da741a012 --- /dev/null +++ b/pyserini/2cr/msmarco_html_v2_passage.template @@ -0,0 +1,292 @@ + + + + + + + Pyserini Reproductions: MS MARCO V2 Passage + + + + + + + + + + + + +
+
+
+
+

$title

+
+
+
+
+ + +
+ +

The two-click* reproduction matrix below provides commands for reproducing experimental results reported in the following paper. +Numbered rows correspond to tables in the paper; additional conditions are provided for comparison purposes.

+ +

Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin. Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2. +Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022), July 2022.

+ +

Instructions for programmatic execution are shown at the bottom of this page (scroll down).

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +$rows + + +
TREC 2021devdev2

AP
nDCG@10RR@100R@100R@1KRR@100R@1KRR@100R@1K
+
+ +
+ +

Programmatic Execution

+ +

All experimental runs shown in the above table can be programmatically executed based on the instructions below. +To list all the experimental conditions:

+ +
+python -m pyserini.2cr.msmarco --collection v2-passage --list-conditions +
+ +

These conditions correspond to the table rows above.

+ +

For all conditions, just show the commands in a "dry run":

+ +
+python -m pyserini.2cr.msmarco --collection v2-passage --all --display-commands --dry-run +
+ +

To actually run all the experimental conditions:

+ +
+python -m pyserini.2cr.msmarco --collection v2-passage --all --display-commands +
+ +

With the above command, run files will be placed in the current directory. +Use the option --directory runs/ to place the runs in a sub-directory.

+ +

To show the commands for a specific condition:

+ +
+python -m pyserini.2cr.msmarco --collection v2-passage --condition bm25-default --display-commands --dry-run +
+ +

This will generate exactly the commands for a specific condition above (corresponding to a row in the table).

+ +

To actually run a specific condition:

+ +
+python -m pyserini.2cr.msmarco --collection v2-passage --condition bm25-default --display-commands +
+ +

Again, with the above command, run files will be placed in the current directory. +Use the option --directory runs/ to place the runs in a sub-directory.

+ +

Finally, to generate this page:

+ +
+python -m pyserini.2cr.msmarco --collection v2-passage --generate-report --output msmarco-v2-passage.html +
+ +

The output file msmarco-v2-passage.html should be identical to this page.

+ +
+ +
+ + + + + + + + + + + + diff --git a/pyserini/__init__.py b/pyserini/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/pyserini/__init__.py @@ -0,0 +1 @@ + diff --git a/pyserini/__pycache__/__init__.cpython-310.pyc b/pyserini/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4106b0596fcf5ca13ed77ff9719e88f4a2ea3681 Binary files /dev/null and b/pyserini/__pycache__/__init__.cpython-310.pyc differ diff --git a/pyserini/__pycache__/encoded_corpus_info.cpython-310.pyc b/pyserini/__pycache__/encoded_corpus_info.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c00f81eac20bff80743eb61392866798292d821 Binary files /dev/null and b/pyserini/__pycache__/encoded_corpus_info.cpython-310.pyc differ diff --git a/pyserini/__pycache__/encoded_query_info.cpython-310.pyc b/pyserini/__pycache__/encoded_query_info.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..055e4a201eda222f064e11768c0a2c72469beef5 Binary files /dev/null and b/pyserini/__pycache__/encoded_query_info.cpython-310.pyc differ diff --git a/pyserini/__pycache__/evaluate_script_info.cpython-310.pyc b/pyserini/__pycache__/evaluate_script_info.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d01bc2745f1a182bedd2be20587ded37518e9bd9 Binary files /dev/null and b/pyserini/__pycache__/evaluate_script_info.cpython-310.pyc differ diff --git a/pyserini/__pycache__/prebuilt_index_info.cpython-310.pyc b/pyserini/__pycache__/prebuilt_index_info.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70c9d8d071d996dccdb1769f0d48c6bfb8265c66 Binary files /dev/null and b/pyserini/__pycache__/prebuilt_index_info.cpython-310.pyc differ diff --git a/pyserini/__pycache__/pyclass.cpython-310.pyc b/pyserini/__pycache__/pyclass.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..033ca3d542385209f04449741f017aa3969fd36d Binary files /dev/null and b/pyserini/__pycache__/pyclass.cpython-310.pyc differ diff --git a/pyserini/__pycache__/setup.cpython-310.pyc b/pyserini/__pycache__/setup.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66eb51a4e075d39e7922d6e73fba6f7fe00a6621 Binary files /dev/null and b/pyserini/__pycache__/setup.cpython-310.pyc differ diff --git a/pyserini/__pycache__/util.cpython-310.pyc b/pyserini/__pycache__/util.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98c8aaebd3e23b48e7eb9e681b3245b6e18403d9 Binary files /dev/null and b/pyserini/__pycache__/util.cpython-310.pyc differ diff --git a/pyserini/analysis/__init__.py b/pyserini/analysis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d3eed751bfa2cfda94a83221474328732d7c7d0f --- /dev/null +++ b/pyserini/analysis/__init__.py @@ -0,0 +1,19 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from ._base import get_lucene_analyzer, Analyzer, JAnalyzer, JAnalyzerUtils, JDefaultEnglishAnalyzer, JWhiteSpaceAnalyzer + +__all__ = ['get_lucene_analyzer', 'Analyzer', 'JAnalyzer', 'JAnalyzerUtils', 'JDefaultEnglishAnalyzer', 'JWhiteSpaceAnalyzer'] diff --git a/pyserini/analysis/__pycache__/__init__.cpython-310.pyc b/pyserini/analysis/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42f55a464d39b79106e2351f322d120fc8bac499 Binary files /dev/null and b/pyserini/analysis/__pycache__/__init__.cpython-310.pyc differ diff --git a/pyserini/analysis/__pycache__/_base.cpython-310.pyc b/pyserini/analysis/__pycache__/_base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df59a778b125d30289ba4241adadedbc2c3884bf Binary files /dev/null and b/pyserini/analysis/__pycache__/_base.cpython-310.pyc differ diff --git a/pyserini/analysis/_base.py b/pyserini/analysis/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..7ca17c5ec6cafb9171a01cbdcac6303b5fc37526 --- /dev/null +++ b/pyserini/analysis/_base.py @@ -0,0 +1,166 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import List + +from ..pyclass import autoclass + +# Wrappers around Lucene classes +JAnalyzer = autoclass('org.apache.lucene.analysis.Analyzer') +JArabicAnalyzer = autoclass('org.apache.lucene.analysis.ar.ArabicAnalyzer') +JBengaliAnalyzer = autoclass('org.apache.lucene.analysis.bn.BengaliAnalyzer') +JCJKAnalyzer = autoclass('org.apache.lucene.analysis.cjk.CJKAnalyzer') +JDanishAnalyzer = autoclass('org.apache.lucene.analysis.da.DanishAnalyzer') +JDefaultEnglishAnalyzer = autoclass('io.anserini.analysis.DefaultEnglishAnalyzer') +JDutchAnalyzer = autoclass('org.apache.lucene.analysis.nl.DutchAnalyzer') +JFinnishAnalyzer = autoclass('org.apache.lucene.analysis.fi.FinnishAnalyzer') +JFrenchAnalyzer = autoclass('org.apache.lucene.analysis.fr.FrenchAnalyzer') +JGermanAnalyzer = autoclass('org.apache.lucene.analysis.de.GermanAnalyzer') +JHindiAnalyzer = autoclass('org.apache.lucene.analysis.hi.HindiAnalyzer') +JHungarianAnalyzer = autoclass('org.apache.lucene.analysis.hu.HungarianAnalyzer') +JIndonesianAnalyzer = autoclass('org.apache.lucene.analysis.id.IndonesianAnalyzer') +JItalianAnalyzer = autoclass('org.apache.lucene.analysis.it.ItalianAnalyzer') +JJapaneseAnalyzer = autoclass('org.apache.lucene.analysis.ja.JapaneseAnalyzer') +JNorwegianAnalyzer = autoclass('org.apache.lucene.analysis.no.NorwegianAnalyzer') +JPortugueseAnalyzer = autoclass('org.apache.lucene.analysis.pt.PortugueseAnalyzer') +JRussianAnalyzer = autoclass('org.apache.lucene.analysis.ru.RussianAnalyzer') +JSpanishAnalyzer = autoclass('org.apache.lucene.analysis.es.SpanishAnalyzer') +JSwedishAnalyzer = autoclass('org.apache.lucene.analysis.sv.SwedishAnalyzer') +JTeluguAnalyzer = autoclass('org.apache.lucene.analysis.te.TeluguAnalyzer') +JThaiAnalyzer = autoclass('org.apache.lucene.analysis.th.ThaiAnalyzer') +JTurkishAnalyzer = autoclass('org.apache.lucene.analysis.tr.TurkishAnalyzer') +JWhiteSpaceAnalyzer = autoclass('org.apache.lucene.analysis.core.WhitespaceAnalyzer') +JCharArraySet = autoclass('org.apache.lucene.analysis.CharArraySet') + +# Wrappers around Anserini classes +JAnalyzerUtils = autoclass('io.anserini.analysis.AnalyzerUtils') +JDefaultEnglishAnalyzer = autoclass('io.anserini.analysis.DefaultEnglishAnalyzer') +JTweetAnalyzer = autoclass('io.anserini.analysis.TweetAnalyzer') +JHuggingFaceTokenizerAnalyzer = autoclass('io.anserini.analysis.HuggingFaceTokenizerAnalyzer') + + +def get_lucene_analyzer(language: str='en', stemming: bool=True, stemmer: str='porter', stopwords: bool=True, huggingFaceTokenizer: str=None) -> JAnalyzer: + """Create a Lucene ``Analyzer`` with specific settings. + + Parameters + ---------- + language : str + Name of analyzer. + stemming : bool + Set to stem. + stemmer : str + Stemmer to use. + stopwords : bool + Set to filter stopwords. + huggingFaceTokenizer: str + a huggingface model id or path to a tokenizer.json file + + Returns + ------- + JAnalyzer + Java ``Analyzer`` with specified settings. 
+ """ + if language.lower() == 'ar': + return JArabicAnalyzer() + elif language.lower() == 'bn': + return JBengaliAnalyzer() + elif language.lower() in ['zh', 'ko']: + return JCJKAnalyzer() + elif language.lower() == 'da': + return JDanishAnalyzer() + elif language.lower() == 'nl': + return JDutchAnalyzer() + elif language.lower() == 'fi': + return JFinnishAnalyzer() + elif language.lower() == 'fr': + return JFrenchAnalyzer() + elif language.lower() == 'de': + return JGermanAnalyzer() + elif language.lower() == 'hi': + return JHindiAnalyzer() + elif language.lower() == 'hu': + return JHungarianAnalyzer() + elif language.lower() == 'id': + return JIndonesianAnalyzer() + elif language.lower() == 'it': + return JItalianAnalyzer() + elif language.lower() == 'ja': + return JJapaneseAnalyzer() + elif language.lower() == 'no': + return JNorwegianAnalyzer() + elif language.lower() == 'pt': + return JPortugueseAnalyzer() + elif language.lower() == 'ru': + return JRussianAnalyzer() + elif language.lower() == 'es': + return JSpanishAnalyzer() + elif language.lower() == 'te': + return JTeluguAnalyzer() + elif language.lower() == 'th': + return JThaiAnalyzer() + elif language.lower() == 'tr': + return JTurkishAnalyzer() + elif language.lower() == 'tweet': + return JTweetAnalyzer() + elif language.lower() == 'hgf_tokenizer': + return JHuggingFaceTokenizerAnalyzer(huggingFaceTokenizer) + elif language.lower() == 'en': + if stemming: + if stopwords: + return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer) + else: + return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer, JCharArraySet.EMPTY_SET) + else: + if stopwords: + return JDefaultEnglishAnalyzer.newNonStemmingInstance() + else: + return JDefaultEnglishAnalyzer.newNonStemmingInstance(JCharArraySet.EMPTY_SET) + else: + raise ValueError('Invalid configuration.') + + +class Analyzer: + """Python wrapper around a Lucene ``Analyzer`` to simplify analysis. + + Parameters + ---------- + analyzer : JAnalyzer + Lucene ``Analyzer``. + """ + + def __init__(self, analyzer): + if not isinstance(analyzer, JAnalyzer): + raise TypeError('Invalid JAnalyzer!') + self.analyzer = analyzer + + def analyze(self, text: str) -> List[str]: + """Analyze a piece of text. + + Parameters + ---------- + text : str + Text to analyze. + + Returns + ------- + List[str] + List of tokens corresponding to the output of the analyzer. + """ + results = JAnalyzerUtils.analyze(self.analyzer, text) + tokens = [] + for token in results.toArray(): + tokens.append(token) + return tokens diff --git a/pyserini/collection/__init__.py b/pyserini/collection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..464516a633a1fe9c4f8943f2717b3671cba3ba7a --- /dev/null +++ b/pyserini/collection/__init__.py @@ -0,0 +1,20 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from ._base import Collection, FileSegment, SourceDocument +from ._collection_support import Cord19Article + +__all__ = ['Collection', 'FileSegment', 'SourceDocument', 'Cord19Article'] diff --git a/pyserini/collection/_base.py b/pyserini/collection/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..ad6084c2232896671c75ac6a297f76a9c30a0c00 --- /dev/null +++ b/pyserini/collection/_base.py @@ -0,0 +1,153 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import logging +import re +from enum import Enum + +from ..multithreading import Counters +from ..pyclass import autoclass, cast, JPaths + +logger = logging.getLogger(__name__) + + +JFileSegment = autoclass('io.anserini.collection.FileSegment') +JSourceDocument = autoclass('io.anserini.collection.SourceDocument') + + +class JCollections(Enum): + AclAnthology = autoclass('io.anserini.collection.AclAnthology') + CarCollection = autoclass('io.anserini.collection.CarCollection') + Cord19AbstractCollection = autoclass('io.anserini.collection.Cord19AbstractCollection') + ClueWeb09Collection = autoclass('io.anserini.collection.ClueWeb09Collection') + ClueWeb12Collection = autoclass('io.anserini.collection.ClueWeb12Collection') + HtmlCollection = autoclass('io.anserini.collection.HtmlCollection') + JsonCollection = autoclass('io.anserini.collection.JsonCollection') + NewYorkTimesCollection = autoclass('io.anserini.collection.NewYorkTimesCollection') + TrecCollection = autoclass('io.anserini.collection.TrecCollection') + TrecwebCollection = autoclass('io.anserini.collection.TrecwebCollection') + TweetCollection = autoclass('io.anserini.collection.TweetCollection') + WashingtonPostCollection = autoclass('io.anserini.collection.WashingtonPostCollection') + WikipediaCollection = autoclass('io.anserini.collection.WikipediaCollection') + + +class Collection: + """ + Iterable wrapper class for Anserini's DocumentCollection. + + Parameters + ---------- + collection_class : str + Name of collection class to instantiate + collection_path : str + Path to directory containing collection + """ + + def __init__(self, collection_class, collection_path): + self.counters = Counters() + self.collection_class = collection_class + self.collection_path = JPaths.get(collection_path) + self.object = self._get_collection() + self.collection_iterator = self.object.iterator() + + def _get_collection(self): + try: + return JCollections[self.collection_class].value(self.collection_path) + except: + raise ValueError(self.collection_class) + + def __iter__(self): + return self + + def __next__(self): + if self.collection_iterator.hasNext(): + fs = self.collection_iterator.next() + return FileSegment(self, fs, fs.getSegmentPath()) + else: + raise StopIteration + + +class FileSegment: + """ + Iterable wrapper class for Anserini's FileSegment. 
+ + Parameters + ---------- + collection : Collection + Parent collection of the file segment + segment : JFileSegment + FileSegment object to create wrapper from + segment_path : str + Path to file backing the file segment + """ + + def __init__(self, collection, segment, segment_path): + self.collection = collection + try: + self.object = cast(collection.object.getClass().getName() + + '$Segment', segment) + except: + logger.exception('Exception from casting FileSegment type...') + self.object = cast('io.anserini.collection.FileSegment', segment) + + self.segment_iterator = self.object.iterator() + self.segment_path = segment_path + self.segment_name = re.sub(r'\\|\/', '-', collection.collection_path.relativize(segment_path).toString()) + + def __iter__(self): + return self + + def __next__(self): + if self.object.iterator().hasNext(): + d = self.object.iterator().next() + return SourceDocument(self, d) + else: + # log if iteration stopped by error + if self.object.getErrorStatus(): + logger.error(self.segment_name + ': Error from segment iteration, stopping...') + self.collection.counters.errors.increment() + + # stop iteration and log skipped documents + skipped = self.object.getSkippedCount() + if skipped > 0: + self.collection.counters.skips.increment(skipped) + logger.warning(self.segment_name + ': ' + str(skipped) + ' documents skipped') + self.object.close() + raise StopIteration + + +class SourceDocument: + """ + Wrapper class for Anserini's SourceDocument. + + Parameters + ---------- + + segment : FileSegment + Parent segment of the source document + document : io.anserini.collection.SourceDocument + SourceDocument object to create wrapper from + """ + + def __init__(self, segment, document): + if not isinstance(document, JSourceDocument): + raise TypeError('Invalid JSourceDocument!') + self.segment = segment + self.object = document + self.id = self.object.id() + self.indexable = self.object.indexable() + self.contents = self.object.contents() + self.raw = self.object.raw() diff --git a/pyserini/collection/_collection_support.py b/pyserini/collection/_collection_support.py new file mode 100644 index 0000000000000000000000000000000000000000..843bc0a74d9b54067b8aa446f9904bfb7dbe780c --- /dev/null +++ b/pyserini/collection/_collection_support.py @@ -0,0 +1,78 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Implementations of support for specific collections. + +import json + + +class Cord19Article: + """Wrapper class for a raw JSON article from AI2's COVID-19 Open Research Dataset (CORD-19). + + Parameters + ---------- + doc : str + A JSON string of a CORD-19 article. + """ + + def __init__(self, doc): + self.json = json.loads(doc) + # Performs some basic error checking, throws an exception if user tries to instantiate with something + # that isn't from CORD-19. 
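+ # Records carrying a 'cord_uid' field are treated as metadata-only entries (from the CORD-19
+ # metadata CSV), while parsed full-text JSON documents carry a 'paper_id' field and are
+ # treated as full text; anything else raises a TypeError.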
+ if 'cord_uid' in self.json: + self.full_text = False + elif 'paper_id' in self.json: + self.full_text = True + else: + raise TypeError + + def is_full_text(self): + return self.json['has_full_text'] + + def cord_uid(self): + return self.json['cord_uid'] + + def bib_entries(self): + return self.json['bib_entries'] + + def title(self): + try: + if self.is_full_text(): + return self.json['metadata']['title'] + else: + return self.json['csv_metadata']['title'] + except KeyError: + return '' + + def abstract(self): + try: + # For a full-text article, we can grab the abstract from two independent sources, the metadata or the + # actual full text. Here, we make the decision to use the metadata, even for full text. + return self.json['csv_metadata']['abstract'] + except KeyError: + return '' + + def metadata(self): + return self.json['csv_metadata'] + + def body(self): + try: + if self.is_full_text(): + return [entry['text'] for entry in self.json['body_text']] + else: + return [] + except KeyError: + return '' diff --git a/pyserini/demo/acl.py b/pyserini/demo/acl.py new file mode 100644 index 0000000000000000000000000000000000000000..57d0b6f7d9d434ce9b4d1e6596f332c0343e4c95 --- /dev/null +++ b/pyserini/demo/acl.py @@ -0,0 +1,124 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This script provides an interactive web interface demo for retrieval on the ACL dataset. +It requires `flask` (`pip install flask~=2.2.0`). +An example command looks like `python -m pyserini.demo.acl` that starts up a server on port 8080. +The demo can be accessed via "http://localhost:8080" in a web browser. 
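+The demo loads a Lucene index from the local path indexes/lucene-index-acl-paragraph
+(see _load_sparse_searcher below), so that index must already be present.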
+Additional arguments include: + --port [PORT] --hits [Number of hits] + --k1 [BM25 k1] --b [BM25 b] --device [cpu, cuda] +""" +import json +import logging +from argparse import ArgumentParser +from functools import partial +from typing import Callable, Optional, Tuple, Union + +from flask import Flask, render_template, request, flash, jsonify +from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder + +logging.basicConfig( + format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logger = logging.getLogger('acl-demo') + +VERSION = '1.0' +Searcher = Union[FaissSearcher, LuceneSearcher] + + +def create_app(k: int, load_searcher_fn: Callable[[str], Tuple[Searcher, str]]): + app = Flask(__name__) + + lang = 'en' + searcher, retriever = load_searcher_fn(lang) + + @app.route('/') + def index(): + nonlocal lang, searcher, retriever + return render_template('acl.html', lang=lang, retriever=retriever) + + @app.route('/search', methods=['GET', 'POST']) + def search(): + nonlocal lang, searcher, retriever + query = request.form['q'] + if not query: + search_results = [] + flash('Question is required') + else: + hits = searcher.search(query, k=k) + docs = [searcher.doc(hit.docid) for hit in hits] + search_results = [ + { + 'rank': r + 1, + 'docid': hit.docid, + 'doc': docs[r].contents(), + 'score': hit.score, + } + for r, hit in enumerate(hits) + ] + return render_template( + 'acl.html', search_results=search_results, query=query, lang=lang, retriever=retriever + ) + + + return app + + +def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> (Searcher, str): + searcher = LuceneSearcher('indexes/lucene-index-acl-paragraph') + searcher.set_language(language) + if k1 is not None and b is not None: + searcher.set_bm25(k1, b) + retriever_name = f'BM25 (k1={k1}, b={b})' + else: + retriever_name = 'BM25' + + return searcher, retriever_name + + +def main(): + parser = ArgumentParser() + + parser.add_argument('--k1', type=float, help='BM25 k1 parameter.') + parser.add_argument('--b', type=float, help='BM25 b parameter.') + parser.add_argument('--hits', type=int, default=10, help='Number of hits returned by the retriever') + parser.add_argument( + '--device', + type=str, + default='cpu', + help='Device to run query encoder, cpu or [cuda:0, cuda:1, ...] (used only when index is based on FAISS)', + ) + parser.add_argument( + '--port', + default=8080, + type=int, + help='Web server port', + ) + + args = parser.parse_args() + + load_fn = partial(_load_sparse_searcher, k1=args.k1, b=args.b) + + app = create_app(args.hits, load_fn) + app.run(host='0.0.0.0', port=args.port) + + +if __name__ == '__main__': + main() diff --git a/pyserini/demo/dpr.py b/pyserini/demo/dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..02e9aca428be06bd1a54505fa4f6542e4f9d6ad3 --- /dev/null +++ b/pyserini/demo/dpr.py @@ -0,0 +1,105 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import cmd +import json +import random + +from pyserini.search.lucene import LuceneSearcher +from pyserini.search.faiss import FaissSearcher, DprQueryEncoder +from pyserini.search.hybrid import HybridSearcher +from pyserini import search + + +class DPRDemo(cmd.Cmd): + nq_dev_topics = list(search.get_topics('dpr-nq-dev').values()) + trivia_dev_topics = list(search.get_topics('dpr-trivia-dev').values()) + + ssearcher = LuceneSearcher.from_prebuilt_index('wikipedia-dpr') + searcher = ssearcher + + encoder = DprQueryEncoder("facebook/dpr-question_encoder-multiset-base") + index = 'wikipedia-dpr-multi-bf' + dsearcher = FaissSearcher.from_prebuilt_index( + index, + encoder + ) + hsearcher = HybridSearcher(dsearcher, ssearcher) + + k = 10 + prompt = '>>> ' + + def precmd(self, line): + if line[0] == '/': + line = line[1:] + return line + + def do_help(self, arg): + print(f'/help : returns this message') + print(f'/k [NUM] : sets k (number of hits to return) to [NUM]') + print(f'/mode [MODE] : sets retriever type to [MODE] (one of sparse, dense, hybrid)') + print(f'/random [COLLECTION]: returns results for a random question from the dev subset [COLLECTION] (one of nq, trivia).') + + def do_k(self, arg): + print(f'setting k = {int(arg)}') + self.k = int(arg) + + def do_mode(self, arg): + if arg == "sparse": + self.searcher = self.ssearcher + elif arg == "dense": + self.searcher = self.dsearcher + elif arg == "hybrid": + self.searcher = self.hsearcher + else: + print( + f'Mode "{arg}" is invalid. Mode should be one of [sparse, dense, hybrid].') + return + print(f'setting retriver = {arg}') + + def do_random(self, arg): + if arg == "nq": + topics = self.nq_dev_topics + elif arg == "trivia": + topics = self.trivia_dev_topics + else: + print( + f'Collection "{arg}" is invalid. Collection should be one of [nq, trivia].') + return + q = random.choice(topics)['title'] + print(f'question: {q}') + self.default(q) + + def do_EOF(self, line): + return True + + def default(self, q): + hits = self.searcher.search(q, self.k) + + for i in range(0, len(hits)): + raw_doc = None + if isinstance(self.searcher, LuceneSearcher): + raw_doc = hits[i].raw + else: + doc = self.searcher.doc(hits[i].docid) + if doc: + raw_doc = doc.raw() + jsondoc = json.loads(raw_doc) + print(f'{i + 1:2} {hits[i].score:.5f} {jsondoc["contents"]}') + + +if __name__ == '__main__': + DPRDemo().cmdloop() diff --git a/pyserini/demo/miracl.py b/pyserini/demo/miracl.py new file mode 100644 index 0000000000000000000000000000000000000000..ffecb93f2557658f8b5a3bd3546fb9be295f9240 --- /dev/null +++ b/pyserini/demo/miracl.py @@ -0,0 +1,149 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This script provides an interactive web interface demo for retrieval on the MIRACL dataset. +It requires `flask` (`pip install flask~=2.2.0`). 
+An example command looks like `python -m pyserini.demo.miracl` that starts up a server on port 8080. +The demo can be accessed via "http://localhost:8080" in a web browser. +Additional arguments include: + --port [PORT] --hits [Number of hits] --index [BM25 or mdpr-tied-pft-msmarco] + --k1 [BM25 k1] --b [BM25 b] --device [cpu, cuda] +""" +import json +import logging +from argparse import ArgumentParser +from functools import partial +from typing import Callable, Optional, Tuple, Union + +from flask import Flask, render_template, request, flash, jsonify +from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder + +logging.basicConfig( + format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logger = logging.getLogger('miracl-demo') + +VERSION = '1.0' +LANGUAGES = ('ar', 'bn', 'en', 'es', 'fa', 'fi', 'fr', 'hi', 'id', 'ja', 'ko', 'ru', 'sw', 'te', 'th', 'zh') +Searcher = Union[FaissSearcher, LuceneSearcher] + + +def create_app(k: int, load_searcher_fn: Callable[[str], Tuple[Searcher, str]]): + app = Flask(__name__) + + lang = LANGUAGES[0] + searcher, retriever = load_searcher_fn(lang) + + @app.route('/') + def index(): + nonlocal lang, searcher, retriever + return render_template('miracl.html', lang=lang, retriever=retriever) + + @app.route('/search', methods=['GET', 'POST']) + def search(): + nonlocal lang, searcher, retriever + query = request.form['q'] + if not query: + search_results = [] + flash('Question is required') + else: + hits = searcher.search(query, k=k) + docs = [json.loads(searcher.doc(hit.docid).raw()) for hit in hits] + search_results = [ + { + 'rank': r + 1, + 'docid': hit.docid, + 'doc': docs[r]['text'], + 'title': docs[r]['title'], + 'score': hit.score, + } + for r, hit in enumerate(hits) + ] + return render_template( + 'miracl.html', search_results=search_results, query=query, lang=lang, retriever=retriever + ) + + @app.route('/lang', methods=['GET']) + def change_language(): + nonlocal lang, searcher, retriever + new_lang = request.args.get('new_lang', '', type=str) + if not new_lang or new_lang not in LANGUAGES: + return + + lang = new_lang + searcher, retriever = load_searcher_fn(lang) + return jsonify(lang=lang) + + return app + + +def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> (Searcher, str): + searcher = LuceneSearcher.from_prebuilt_index(f'miracl-v{VERSION}-{language}') + searcher.set_language(language) + if k1 is not None and b is not None: + searcher.set_bm25(k1, b) + retriever_name = f'BM25 (k1={k1}, b={b})' + else: + retriever_name = 'BM25' + + return searcher, retriever_name + + +def _load_faiss_searcher(language: str, device: str) -> (Searcher, str): + query_encoder = AutoQueryEncoder(encoder_dir='castorini/mdpr-tied-pft-msmarco', device=device) + searcher = FaissSearcher.from_prebuilt_index( + f'miracl-v{VERSION}-{language}-mdpr-tied-pft-msmarco', query_encoder + ) + retriever_name = 'mDPR-pFT-MSMARCO' + return searcher, retriever_name + + +def main(): + parser = ArgumentParser() + + parser.add_argument('--index', default='BM25', choices=('BM25', 'mdpr-tied-pft-msmarco'), help='Index type.') + parser.add_argument('--k1', type=float, help='BM25 k1 parameter.') + parser.add_argument('--b', type=float, help='BM25 b parameter.') + parser.add_argument('--hits', type=int, default=10, help='Number of hits returned by the retriever') + parser.add_argument( + '--device', + type=str, + default='cpu', + help='Device to run query 
encoder, cpu or [cuda:0, cuda:1, ...] (used only when index is based on FAISS)', + ) + parser.add_argument( + '--port', + default=8080, + type=int, + help='Web server port', + ) + + args = parser.parse_args() + + if args.index == 'mdpr-tied-pft-msmarco': + load_fn = partial(_load_faiss_searcher, device=args.device) + else: + load_fn = partial(_load_sparse_searcher, k1=args.k1, b=args.b) + + app = create_app(args.hits, load_fn) + app.run(host='0.0.0.0', port=args.port) + + +if __name__ == '__main__': + main() diff --git a/pyserini/demo/msmarco.py b/pyserini/demo/msmarco.py new file mode 100644 index 0000000000000000000000000000000000000000..b73276d1c4b20ea88aea10155a4d0f25d2764b56 --- /dev/null +++ b/pyserini/demo/msmarco.py @@ -0,0 +1,118 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import cmd +import json +import os +import random + +from pyserini.search.lucene import LuceneSearcher +from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder, AnceQueryEncoder +from pyserini.search.hybrid import HybridSearcher +from pyserini import search + + +class MsMarcoDemo(cmd.Cmd): + dev_topics = list(search.get_topics('msmarco-passage-dev-subset').values()) + + ssearcher = LuceneSearcher.from_prebuilt_index('msmarco-passage') + dsearcher = None + hsearcher = None + searcher = ssearcher + + k = 10 + prompt = '>>> ' + + # https://stackoverflow.com/questions/35213134/command-prefixes-in-python-cli-using-cmd-in-pythons-standard-library + def precmd(self, line): + if line[0] == '/': + line = line[1:] + return line + + def do_help(self, arg): + print(f'/help : returns this message') + print(f'/k [NUM] : sets k (number of hits to return) to [NUM]') + print(f'/model [MODEL] : sets encoder to use the model [MODEL] (one of tct, ance)') + print(f'/mode [MODE] : sets retriever type to [MODE] (one of sparse, dense, hybrid)') + print(f'/random : returns results for a random question from dev subset') + + def do_k(self, arg): + print(f'setting k = {int(arg)}') + self.k = int(arg) + + def do_mode(self, arg): + if arg == "sparse": + self.searcher = self.ssearcher + elif arg == "dense": + if self.dsearcher is None: + print(f'Specify model through /model before using dense retrieval.') + return + self.searcher = self.dsearcher + elif arg == "hybrid": + if self.hsearcher is None: + print(f'Specify model through /model before using hybrid retrieval.') + return + self.searcher = self.hsearcher + else: + print( + f'Mode "{arg}" is invalid. Mode should be one of [sparse, dense, hybrid].') + return + print(f'setting retriver = {arg}') + + def do_model(self, arg): + if arg == "tct": + encoder = TctColBertQueryEncoder("castorini/tct_colbert-msmarco") + index = "msmarco-passage-tct_colbert-hnsw" + elif arg == "ance": + encoder = AnceQueryEncoder("castorini/ance-msmarco-passage") + index = "msmarco-passage-ance-bf" + else: + print( + f'Model "{arg}" is invalid. 
Model should be one of [tct, ance].') + return + + self.dsearcher = FaissSearcher.from_prebuilt_index( + index, + encoder + ) + self.hsearcher = HybridSearcher(self.dsearcher, self.ssearcher) + print(f'setting model = {arg}') + + def do_random(self, arg): + q = random.choice(self.dev_topics)['title'] + print(f'question: {q}') + self.default(q) + + def do_EOF(self, line): + return True + + def default(self, q): + hits = self.searcher.search(q, self.k) + + for i in range(0, len(hits)): + raw_doc = None + if isinstance(self.searcher, LuceneSearcher): + raw_doc = hits[i].raw + else: + doc = self.searcher.doc(hits[i].docid) + if doc: + raw_doc = doc.raw() + jsondoc = json.loads(raw_doc) + print(f'{i + 1:2} {hits[i].score:.5f} {jsondoc["contents"]}') + + +if __name__ == '__main__': + MsMarcoDemo().cmdloop() diff --git a/pyserini/demo/templates/acl.html b/pyserini/demo/templates/acl.html new file mode 100644 index 0000000000000000000000000000000000000000..f3e3e881b069cfc45bf26fc764390f2e1b8b7d38 --- /dev/null +++ b/pyserini/demo/templates/acl.html @@ -0,0 +1,74 @@ + + + + + + + + + + + + ACL 🌍🙌🌏 Demo + + + +
+

ACL

+ acl logo +

Demo

+
+
+ +
+ {% for message in get_flashed_messages() %} +
{{ message }}
+ {% endfor %} + +
+
+
+ + +
+
+
+      {% if search_results %}
+      <table>
+        <thead>
+          <tr>
+            <th>#</th>
+            <th>Score</th>
+            <th>Passage ID</th>
+            <th>Content</th>
+          </tr>
+        </thead>
+        <tbody>
+          {% for res in search_results %}
+          <tr>
+            <td>{{ res["rank"] }}</td>
+            <td>{{ "%.2f"|format(res["score"]) }}</td>
+            <td>{{ res["docid"] }}</td>
+            <td>{{ res["doc"] }}</td>
+          </tr>
+          {% endfor %}
+        </tbody>
+      </table>
+      {% endif %}
+ + \ No newline at end of file diff --git a/pyserini/demo/templates/assets/acl-logo.svg b/pyserini/demo/templates/assets/acl-logo.svg new file mode 100644 index 0000000000000000000000000000000000000000..8b2d548dff356aaa98a0e3dbcee669e4af754f43 --- /dev/null +++ b/pyserini/demo/templates/assets/acl-logo.svg @@ -0,0 +1,10 @@ + + + +Created with Fabric.js 5.3.0 + + + + + + \ No newline at end of file diff --git a/pyserini/demo/templates/miracl.html b/pyserini/demo/templates/miracl.html new file mode 100644 index 0000000000000000000000000000000000000000..28e49fdee803768af81603c0b92583253c37b50e --- /dev/null +++ b/pyserini/demo/templates/miracl.html @@ -0,0 +1,127 @@ + + + + + + + + + + + + MIRACL 🌍🙌🌏 Demo + + +

MIRACL 🌍🙌🌏 Demo

+

Multilingual Information Retrieval Across a Continuum of Languages

+ +
+ +

+          MIRACL is a multilingual dataset for ad hoc retrieval that spans 18 different languages, collectively encompassing over three billion native speakers around the world.

+ +
+ +
+ +
+
+
+ Loading... +
+
+
+ + retrieves passages using {{retriever}}. + +
+
+ +
+ +
+ {% for message in get_flashed_messages() %} +
{{ message }}
+ {% endfor %} + +
+
+
+ + +
+
+
+      {% if search_results %}
+      <table>
+        <thead>
+          <tr>
+            <th>#</th>
+            <th>Score</th>
+            <th>Passage ID</th>
+            <th>Title</th>
+            <th>Content</th>
+          </tr>
+        </thead>
+        <tbody>
+          {% for res in search_results %}
+          <tr>
+            <td>{{res["rank"]}}</td>
+            <td>{{"%.2f"|format(res["score"])}}</td>
+            <td>{{res["docid"]}}</td>
+            <td>{{res["title"]}}</td>
+            <td>{{res["doc"]}}</td>
+          </tr>
+          {% endfor %}
+        </tbody>
+      </table>
+      {% endif %}
+ + \ No newline at end of file diff --git a/pyserini/dsearch.py b/pyserini/dsearch.py new file mode 100644 index 0000000000000000000000000000000000000000..72947e9e5662f2b293a966576192abb053a9c3e6 --- /dev/null +++ b/pyserini/dsearch.py @@ -0,0 +1,46 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Deprecated. The package ``pyserini.dsearch` has been renamed `pyserini.search.faiss`. Stubs are retained here for +redirection purpose to ensure that code in existing published papers remain function (with warnings).""" + +import os +import sys + +import pyserini.search.faiss +from pyserini.search.faiss import TctColBertQueryEncoder + +__all__ = ['SimpleDenseSearcher', 'BinaryDenseSearcher', 'TctColBertQueryEncoder'] + + +class SimpleDenseSearcher(pyserini.search.faiss.FaissSearcher): + def __new__(cls, *args, **kwargs): + print('pyserini.dsearch.SimpleDenseSearcher class has been deprecated, ' + 'please use FaissSearcher from pyserini.search.faiss instead') + return super().__new__(cls) + + +class BinaryDenseSearcher(pyserini.search.faiss.BinaryDenseSearcher): + def __new__(cls, *args, **kwargs): + print('pyserini.dsearch.BinaryDenseSearcher class has been deprecated, ' + 'please use BinaryDenseSearcher from pyserini.search.faiss instead') + return super().__new__(cls) + + +if __name__ == "__main__": + print('WARNING: pyserini.dsearch is deprecated, please use pyserini.search.faiss instead!') + args = " ".join(sys.argv[1:]) + os.system(f'python -m pyserini.search.faiss {args}') diff --git a/pyserini/encode/__init__.py b/pyserini/encode/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ba0224d417c96744da02ad8b0f73f651670b5f25 --- /dev/null +++ b/pyserini/encode/__init__.py @@ -0,0 +1,28 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from ._base import DocumentEncoder, QueryEncoder, JsonlCollectionIterator,\ + RepresentationWriter, FaissRepresentationWriter, JsonlRepresentationWriter, PcaEncoder +from ._ance import AnceEncoder, AnceDocumentEncoder, AnceQueryEncoder +from ._auto import AutoQueryEncoder, AutoDocumentEncoder +from ._dpr import DprDocumentEncoder, DprQueryEncoder +from ._tct_colbert import TctColBertDocumentEncoder, TctColBertQueryEncoder +from ._aggretriever import AggretrieverDocumentEncoder, AggretrieverQueryEncoder +from ._unicoil import UniCoilEncoder, UniCoilDocumentEncoder, UniCoilQueryEncoder +from ._cached_data import CachedDataQueryEncoder +from ._tok_freq import TokFreqQueryEncoder +from ._splade import SpladeQueryEncoder +from ._slim import SlimQueryEncoder \ No newline at end of file diff --git a/pyserini/encode/__main__.py b/pyserini/encode/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..64572c9fa7ce0bb7f24aa34531e96300ce8efe54 --- /dev/null +++ b/pyserini/encode/__main__.py @@ -0,0 +1,147 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import sys + +from pyserini.encode import JsonlRepresentationWriter, FaissRepresentationWriter, JsonlCollectionIterator +from pyserini.encode import DprDocumentEncoder, TctColBertDocumentEncoder, AnceDocumentEncoder, AggretrieverDocumentEncoder, AutoDocumentEncoder +from pyserini.encode import UniCoilDocumentEncoder + + +encoder_class_map = { + "dpr": DprDocumentEncoder, + "tct_colbert": TctColBertDocumentEncoder, + "aggretriever": AggretrieverDocumentEncoder, + "ance": AnceDocumentEncoder, + "sentence-transformers": AutoDocumentEncoder, + "unicoil": UniCoilDocumentEncoder, + "auto": AutoDocumentEncoder, +} +ALLOWED_POOLING_OPTS = ["cls","mean"] + +def init_encoder(encoder, encoder_class, device): + _encoder_class = encoder_class + + # determine encoder_class + if encoder_class is not None: + encoder_class = encoder_class_map[encoder_class] + else: + # if any class keyword was matched in the given encoder name, + # use that encoder class + for class_keyword in encoder_class_map: + if class_keyword in encoder.lower(): + encoder_class = encoder_class_map[class_keyword] + break + + # if none of the class keyword was matched, + # use the AutoDocumentEncoder + if encoder_class is None: + encoder_class = AutoDocumentEncoder + + # prepare arguments to encoder class + kwargs = dict(model_name=encoder, device=device) + if (_encoder_class == "sentence-transformers") or ("sentence-transformers" in encoder): + kwargs.update(dict(pooling='mean', l2_norm=True)) + if (_encoder_class == "contriever") or ("contriever" in encoder): + kwargs.update(dict(pooling='mean', l2_norm=False)) + return encoder_class(**kwargs) + + +def parse_args(parser, commands): + # Divide argv by commands + split_argv = [[]] + for c in sys.argv[1:]: + if c in commands.choices: + split_argv.append([c]) + else: + split_argv[-1].append(c) + # Initialize namespace + 
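+    # Illustrative note (not part of the original patch): the argv split above groups
+    # flags by the sub-command they follow ('input', 'output', 'encoder', as registered
+    # in __main__ below), so a typical invocation looks roughly like:
+    #   python -m pyserini.encode \
+    #     input --corpus corpus.jsonl \
+    #     output --embeddings emb_dir --to-faiss \
+    #     encoder --encoder castorini/tct_colbert-msmarco
+    # where 'corpus.jsonl' and 'emb_dir' are placeholder paths. Each group is parsed
+    # into its own nested namespace (args.input, args.output, args.encoder).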
args = argparse.Namespace() + for c in commands.choices: + setattr(args, c, None) + # Parse each command + parser.parse_args(split_argv[0], namespace=args) # Without command + for argv in split_argv[1:]: # Commands + n = argparse.Namespace() + setattr(args, argv[0], n) + parser.parse_args(argv, namespace=n) + return args + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + commands = parser.add_subparsers(title='sub-commands') + input_parser = commands.add_parser('input') + input_parser.add_argument('--corpus', type=str, + help='directory that contains corpus files to be encoded, in jsonl format.', + required=True) + input_parser.add_argument('--fields', help='fields that contents in jsonl has (in order)', + nargs='+', default=['text'], required=False) + input_parser.add_argument('--docid-field', + help='name of document id field name. If you have a custom id with a name other than "id", "_id" or "docid", then use this argument', + default=None, required=False) + input_parser.add_argument('--delimiter', help='delimiter for the fields', default='\n', required=False) + input_parser.add_argument('--shard-id', type=int, help='shard-id 0-based', default=0, required=False) + input_parser.add_argument('--shard-num', type=int, help='number of shards', default=1, required=False) + + output_parser = commands.add_parser('output') + output_parser.add_argument('--embeddings', type=str, help='directory to store encoded corpus', required=True) + output_parser.add_argument('--to-faiss', action='store_true', default=False) + + encoder_parser = commands.add_parser('encoder') + encoder_parser.add_argument('--encoder', type=str, help='encoder name or path', required=True) + encoder_parser.add_argument('--encoder-class', type=str, required=False, default=None, + choices=["dpr", "bpr", "tct_colbert", "ance", "sentence-transformers", "auto"], + help='which query encoder class to use. `default` would infer from the args.encoder') + encoder_parser.add_argument('--fields', help='fields to encode', nargs='+', default=['text'], required=False) + encoder_parser.add_argument('--batch-size', type=int, help='batch size', default=64, required=False) + encoder_parser.add_argument('--max-length', type=int, help='max length', default=256, required=False) + encoder_parser.add_argument('--dimension', type=int, help='dimension', default=768, required=False) + encoder_parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]', + default='cuda:0', required=False) + encoder_parser.add_argument('--fp16', action='store_true', default=False) + encoder_parser.add_argument('--add-sep', action='store_true', default=False) + encoder_parser.add_argument('--pooling', type=str, default='cls', help='for auto classes, allow the ability to dictate pooling strategy', required=False) + + args = parse_args(parser, commands) + delimiter = args.input.delimiter.replace("\\n", "\n") # argparse would add \ prior to the passed '\n\n' + + encoder = init_encoder(args.encoder.encoder, args.encoder.encoder_class, device=args.encoder.device) + if type(encoder).__name__ == "AutoDocumentEncoder": + if args.encoder.pooling in ALLOWED_POOLING_OPTS: + encoder.pooling = args.encoder.pooling + else: + raise ValueError(f"Only allowed to use pooling types {ALLOWED_POOLING_OPTS}. 
You entered {args.encoder.pooling}") + if args.output.to_faiss: + embedding_writer = FaissRepresentationWriter(args.output.embeddings, dimension=args.encoder.dimension) + else: + embedding_writer = JsonlRepresentationWriter(args.output.embeddings) + collection_iterator = JsonlCollectionIterator(args.input.corpus, args.input.fields, args.input.docid_field, delimiter) + + with embedding_writer: + for batch_info in collection_iterator(args.encoder.batch_size, args.input.shard_id, args.input.shard_num): + kwargs = { + 'texts': batch_info['text'], + 'titles': batch_info['title'] if 'title' in args.encoder.fields else None, + 'expands': batch_info['expand'] if 'expand' in args.encoder.fields else None, + 'fp16': args.encoder.fp16, + 'max_length': args.encoder.max_length, + 'add_sep': args.encoder.add_sep, + } + embeddings = encoder.encode(**kwargs) + batch_info['vector'] = embeddings + embedding_writer.write(batch_info, args.input.fields) diff --git a/pyserini/encode/__pycache__/__init__.cpython-310.pyc b/pyserini/encode/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2910416445593e340c14fcd8871f29ab34d13c3c Binary files /dev/null and b/pyserini/encode/__pycache__/__init__.cpython-310.pyc differ diff --git a/pyserini/encode/__pycache__/_aggretriever.cpython-310.pyc b/pyserini/encode/__pycache__/_aggretriever.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4735d5cb6a422069ee41b20baf670ab7816e9cf1 Binary files /dev/null and b/pyserini/encode/__pycache__/_aggretriever.cpython-310.pyc differ diff --git a/pyserini/encode/__pycache__/_ance.cpython-310.pyc b/pyserini/encode/__pycache__/_ance.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e33e1abc57ac3a728756f989719cca1719726e4 Binary files /dev/null and b/pyserini/encode/__pycache__/_ance.cpython-310.pyc differ diff --git a/pyserini/encode/__pycache__/_auto.cpython-310.pyc b/pyserini/encode/__pycache__/_auto.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67890b7336ea5c894c98e15a1d79d1ecf49fa308 Binary files /dev/null and b/pyserini/encode/__pycache__/_auto.cpython-310.pyc differ diff --git a/pyserini/encode/__pycache__/_base.cpython-310.pyc b/pyserini/encode/__pycache__/_base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2679bfe25ecd0008dea2cb35057971817285e72d Binary files /dev/null and b/pyserini/encode/__pycache__/_base.cpython-310.pyc differ diff --git a/pyserini/encode/__pycache__/_cached_data.cpython-310.pyc b/pyserini/encode/__pycache__/_cached_data.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2ab4c5c0ca3194304da58b88bed92b08ba0d4c4 Binary files /dev/null and b/pyserini/encode/__pycache__/_cached_data.cpython-310.pyc differ diff --git a/pyserini/encode/__pycache__/_dpr.cpython-310.pyc b/pyserini/encode/__pycache__/_dpr.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bcb35e9cb6539e11a5964d29d828ee9902a3749f Binary files /dev/null and b/pyserini/encode/__pycache__/_dpr.cpython-310.pyc differ diff --git a/pyserini/encode/__pycache__/_slim.cpython-310.pyc b/pyserini/encode/__pycache__/_slim.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18c5a21734359836fa7d9b86029571f148c8fdf3 Binary files /dev/null and b/pyserini/encode/__pycache__/_slim.cpython-310.pyc differ diff --git a/pyserini/encode/__pycache__/_splade.cpython-310.pyc 
b/pyserini/encode/__pycache__/_splade.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e4f84d77a9399a3d7dd50b3d48b71931e2594c7 Binary files /dev/null and b/pyserini/encode/__pycache__/_splade.cpython-310.pyc differ diff --git a/pyserini/encode/__pycache__/_tct_colbert.cpython-310.pyc b/pyserini/encode/__pycache__/_tct_colbert.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62034a0efa0f792f03b4cbf289e9afe4463da0d8 Binary files /dev/null and b/pyserini/encode/__pycache__/_tct_colbert.cpython-310.pyc differ diff --git a/pyserini/encode/__pycache__/_tok_freq.cpython-310.pyc b/pyserini/encode/__pycache__/_tok_freq.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..155571c7dd0acde6711100105bceb2aa199f2942 Binary files /dev/null and b/pyserini/encode/__pycache__/_tok_freq.cpython-310.pyc differ diff --git a/pyserini/encode/__pycache__/_unicoil.cpython-310.pyc b/pyserini/encode/__pycache__/_unicoil.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ace03161a21e595861e8c2ca7a22a02642c00b6 Binary files /dev/null and b/pyserini/encode/__pycache__/_unicoil.cpython-310.pyc differ diff --git a/pyserini/encode/_aggretriever.py b/pyserini/encode/_aggretriever.py new file mode 100644 index 0000000000000000000000000000000000000000..224eb2b05c826d61d7933b8fa81995985cb29ceb --- /dev/null +++ b/pyserini/encode/_aggretriever.py @@ -0,0 +1,188 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import Optional +import numpy as np +import torch +from torch import Tensor +import torch.nn as nn +if torch.cuda.is_available(): + from torch.cuda.amp import autocast + +from transformers import DistilBertConfig, BertConfig +from transformers import AutoModelForMaskedLM, AutoTokenizer, PreTrainedModel +from pyserini.encode import DocumentEncoder, QueryEncoder + +class BERTAggretrieverEncoder(PreTrainedModel): + config_class = BertConfig + base_model_prefix = 'encoder' + load_tf_weights = None + + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + self.softmax = nn.Softmax(dim=-1) + self.encoder = AutoModelForMaskedLM.from_config(config) + self.tok_proj = torch.nn.Linear(config.hidden_size, 1) + self.cls_proj = torch.nn.Linear(config.hidden_size, 128) + self.init_weights() + + # Copied from https://github.com/castorini/dhr/blob/main/tevatron/Aggretriever/utils.py + def cal_remove_dim(self, dims, vocab_size=30522): + remove_dims = vocab_size % dims + if remove_dims > 1000: # the first 1000 tokens in BERT are useless + remove_dims -= dims + return remove_dims + + # Copied from https://github.com/castorini/dhr/blob/main/tevatron/Aggretriever/utils.py + def aggregate(self, + lexical_reps: Tensor, + dims: int = 640, + remove_dims: int = -198, + full: bool = True + ): + if full: + remove_dims = self.cal_remove_dim(dims*2) + batch_size = lexical_reps.shape[0] + if remove_dims >= 0: + lexical_reps = lexical_reps[:, remove_dims:].view(batch_size, -1, dims*2) + else: + lexical_reps = torch.nn.functional.pad(lexical_reps, (0, -remove_dims), "constant", 0).view(batch_size, -1, dims*2) + tok_reps, _ = lexical_reps.max(1) + positive_tok_reps = tok_reps[:, 0:2*dims:2] + negative_tok_reps = tok_reps[:, 1:2*dims:2] + positive_mask = positive_tok_reps > negative_tok_reps + negative_mask = positive_tok_reps <= negative_tok_reps + tok_reps = positive_tok_reps * positive_mask - negative_tok_reps * negative_mask + else: + remove_dims = self.cal_remove_dim(dims) + batch_size = lexical_reps.shape[0] + lexical_reps = lexical_reps[:, remove_dims:].view(batch_size, -1, dims) + tok_reps, index_reps = lexical_reps.max(1) + return tok_reps + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, torch.nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, torch.nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def init_weights(self): + self.encoder.init_weights() + self.tok_proj.apply(self._init_weights) + self.cls_proj.apply(self._init_weights) + + def forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: torch.Tensor = None, + skip_mlm: bool = False + ): + seq_out = self.encoder(input_ids=input_ids, attention_mask=attention_mask, return_dict=True) + seq_hidden = seq_out.hidden_states[-1] + cls_hidden = seq_hidden[:,0] # get [CLS] embeddings + term_weights = self.tok_proj(seq_hidden[:,1:]) # batch, seq, 1 + if not skip_mlm: + logits = seq_out.logits[:,1:] # batch, seq-1, vocab + logits = self.softmax(logits) + attention_mask = 
attention_mask[:,1:].unsqueeze(-1) + lexical_reps = torch.max((logits * term_weights) * attention_mask, dim=-2).values + else: + # w/o MLM + lexical_reps = torch.zeros(seq_hidden.shape[0], seq_hidden.shape[1], 30522, dtype=seq_hidden.dtype, device=seq_hidden.device) # (batch, len, vocab) + lexical_reps = torch.scatter(lexical_reps, dim=-1, index=input_ids[:,1:,None], src=term_weights) + lexical_reps = lexical_reps.max(-2).values + + lexical_reps = self.aggregate(lexical_reps, 640) + semantic_reps = self.cls_proj(cls_hidden) + return torch.cat((semantic_reps, lexical_reps), -1) + + +class DistlBERTAggretrieverEncoder(BERTAggretrieverEncoder): + config_class = DistilBertConfig + base_model_prefix = 'encoder' + load_tf_weights = None + + +class AggretrieverDocumentEncoder(DocumentEncoder): + def __init__(self, model_name: str, tokenizer_name=None, device='cuda:0'): + self.device = device + if 'distilbert' in model_name.lower(): + self.model = DistlBERTAggretrieverEncoder.from_pretrained(model_name) + else: + self.model = BERTAggretrieverEncoder.from_pretrained(model_name) + self.model.to(self.device) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name) + + def encode(self, texts, titles=None, fp16=False, max_length=512, **kwargs): + if titles is not None: + texts = [f'{title} {text}' for title, text in zip(titles, texts)] + else: + texts = [text for text in texts] + inputs = self.tokenizer( + texts, + max_length=max_length, + padding="longest", + truncation=True, + add_special_tokens=True, + return_tensors='pt' + ) + inputs.to(self.device) + if fp16: + with autocast(): + with torch.no_grad(): + outputs = self.model(**inputs) + else: + outputs = self.model(**inputs) + return outputs.detach().cpu().numpy() + + +class AggretrieverQueryEncoder(QueryEncoder): + def __init__(self, model_name: str, tokenizer_name=None, device='cuda:0'): + self.device = device + if 'distilbert' in model_name.lower(): + self.model = DistlBERTAggretrieverEncoder.from_pretrained(model_name) + else: + self.model = BERTAggretrieverEncoder.from_pretrained(model_name) + self.model.to(self.device) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name) + + def encode(self, texts, fp16=False, max_length=32, **kwargs): + texts = [text for text in texts] + inputs = self.tokenizer( + texts, + max_length=max_length, + padding="longest", + truncation=True, + add_special_tokens=True, + return_tensors='pt' + ) + inputs.to(self.device) + if fp16: + with autocast(): + with torch.no_grad(): + outputs = self.model(**inputs) + else: + outputs = self.model(**inputs) + return outputs.detach().cpu().numpy() \ No newline at end of file diff --git a/pyserini/encode/_ance.py b/pyserini/encode/_ance.py new file mode 100644 index 0000000000000000000000000000000000000000..10225c2b59a6c406dddbae46a5979703e7b5f750 --- /dev/null +++ b/pyserini/encode/_ance.py @@ -0,0 +1,119 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import Optional + +import torch +from transformers import PreTrainedModel, RobertaConfig, RobertaModel, RobertaTokenizer + +from pyserini.encode import DocumentEncoder, QueryEncoder + + +class AnceEncoder(PreTrainedModel): + config_class = RobertaConfig + base_model_prefix = 'ance_encoder' + load_tf_weights = None + _keys_to_ignore_on_load_missing = [r'position_ids'] + _keys_to_ignore_on_load_unexpected = [r'pooler', r'classifier'] + + def __init__(self, config: RobertaConfig): + super().__init__(config) + self.config = config + self.roberta = RobertaModel(config) + self.embeddingHead = torch.nn.Linear(config.hidden_size, 768) + self.norm = torch.nn.LayerNorm(768) + self.init_weights() + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, torch.nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, torch.nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def init_weights(self): + self.roberta.init_weights() + self.embeddingHead.apply(self._init_weights) + self.norm.apply(self._init_weights) + + def forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ): + input_shape = input_ids.size() + device = input_ids.device + if attention_mask is None: + attention_mask = ( + torch.ones(input_shape, device=device) + if input_ids is None + else (input_ids != self.roberta.config.pad_token_id) + ) + outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask) + sequence_output = outputs.last_hidden_state + pooled_output = sequence_output[:, 0, :] + pooled_output = self.norm(self.embeddingHead(pooled_output)) + return pooled_output + + +class AnceDocumentEncoder(DocumentEncoder): + def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): + self.device = device + self.model = AnceEncoder.from_pretrained(model_name) + self.model.to(self.device) + self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name or model_name) + + def encode(self, texts, titles=None, max_length=256, **kwargs): + if titles is not None: + texts = [f'{title} {text}' for title, text in zip(titles, texts)] + inputs = self.tokenizer( + texts, + max_length=max_length, + padding='longest', + truncation=True, + add_special_tokens=True, + return_tensors='pt' + ) + inputs.to(self.device) + return self.model(inputs["input_ids"]).detach().cpu().numpy() + + +class AnceQueryEncoder(QueryEncoder): + + def __init__(self, model_name: str, tokenizer_name: str = None, device: str = 'cpu'): + self.device = device + self.model = AnceEncoder.from_pretrained(model_name) + self.model.to(self.device) + self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name or tokenizer_name) + + def encode(self, query: str, **kwargs): + inputs = self.tokenizer( + [query], + max_length=64, + padding='longest', + truncation=True, + add_special_tokens=True, + return_tensors='pt' + ) + inputs.to(self.device) + embeddings = self.model(inputs["input_ids"]).detach().cpu().numpy() + return embeddings.flatten() diff --git a/pyserini/encode/_auto.py b/pyserini/encode/_auto.py new file mode 100644 index 
0000000000000000000000000000000000000000..5e8cf6cd1b778c8e2874ff482407c2174af1bdc2 --- /dev/null +++ b/pyserini/encode/_auto.py @@ -0,0 +1,99 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +from sklearn.preprocessing import normalize +from transformers import AutoModel, AutoTokenizer + +from pyserini.encode import DocumentEncoder, QueryEncoder + + +class AutoDocumentEncoder(DocumentEncoder): + def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cls', l2_norm=False): + self.device = device + self.model = AutoModel.from_pretrained(model_name) + self.model.to(self.device) + try: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name) + except: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name, use_fast=False) + self.has_model = True + self.pooling = pooling + self.l2_norm = l2_norm + + def encode(self, texts, titles=None, max_length=256, add_sep=False, **kwargs): + shared_tokenizer_kwargs = dict( + max_length=max_length, + truncation=True, + padding='longest', + return_attention_mask=True, + return_token_type_ids=False, + return_tensors='pt', + add_special_tokens=True, + ) + input_kwargs = {} + if not add_sep: + input_kwargs["text"] = [f'{title} {text}' for title, text in zip(titles, texts)] if titles is not None else texts + else: + if titles is not None: + input_kwargs["text"] = titles + input_kwargs["text_pair"] = texts + else: + input_kwargs["text"] = texts + + inputs = self.tokenizer(**input_kwargs, **shared_tokenizer_kwargs) + inputs.to(self.device) + outputs = self.model(**inputs) + if self.pooling == "mean": + embeddings = self._mean_pooling(outputs[0], inputs['attention_mask']).detach().cpu().numpy() + else: + embeddings = outputs[0][:, 0, :].detach().cpu().numpy() + if self.l2_norm: + embeddings = normalize(embeddings, axis=1) + return embeddings + + +class AutoQueryEncoder(QueryEncoder): + def __init__(self, model_name: str, tokenizer_name: str = None, device: str = 'cpu', + pooling: str = 'cls', l2_norm: bool = False, prefix=None): + self.device = device + self.model = AutoModel.from_pretrained(model_name) + self.model.to(self.device) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name) + self.pooling = pooling + self.l2_norm = l2_norm + self.prefix = prefix + + def encode(self, query: str, **kwargs): + if self.prefix: + query = f'{self.prefix} {query}' + inputs = self.tokenizer( + query, + add_special_tokens=True, + return_tensors='pt', + truncation='only_first', + padding='longest', + return_token_type_ids=False, + ) + inputs.to(self.device) + outputs = self.model(**inputs)[0].detach().cpu().numpy() + if self.pooling == "mean": + embeddings = np.average(outputs, axis=-2) + else: + embeddings = outputs[:, 0, :] + if self.l2_norm: + embeddings = normalize(outputs, norm='l2') + return embeddings.flatten() diff --git a/pyserini/encode/_base.py b/pyserini/encode/_base.py new file 
mode 100644 index 0000000000000000000000000000000000000000..09c4e282001feedfdb3468c695b0d18eccb595b8 --- /dev/null +++ b/pyserini/encode/_base.py @@ -0,0 +1,207 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import json +import os + +import faiss +import torch +import numpy as np +from tqdm import tqdm + + +class DocumentEncoder: + def encode(self, texts, **kwargs): + pass + + @staticmethod + def _mean_pooling(last_hidden_state, attention_mask): + token_embeddings = last_hidden_state + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) + sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) + return sum_embeddings / sum_mask + + +class QueryEncoder: + def encode(self, text, **kwargs): + pass + + +class PcaEncoder: + def __init__(self, encoder, pca_model_path): + self.encoder = encoder + self.pca_mat = faiss.read_VectorTransform(pca_model_path) + + def encode(self, text, **kwargs): + if isinstance(text, str): + embeddings = self.encoder.encode(text, **kwargs) + embeddings = self.pca_mat.apply_py(np.array([embeddings])) + embeddings = embeddings[0] + else: + embeddings = self.encoder.encode(text, **kwargs) + embeddings = self.pca_mat.apply_py(embeddings) + return embeddings + + +class JsonlCollectionIterator: + def __init__(self, collection_path: str, fields=None, docid_field=None, delimiter="\n"): + if fields: + self.fields = fields + else: + self.fields = ['text'] + self.docid_field = docid_field + self.delimiter = delimiter + self.all_info = self._load(collection_path) + self.size = len(self.all_info['id']) + self.batch_size = 1 + self.shard_id = 0 + self.shard_num = 1 + + def __call__(self, batch_size=1, shard_id=0, shard_num=1): + self.batch_size = batch_size + self.shard_id = shard_id + self.shard_num = shard_num + return self + + def __iter__(self): + total_len = self.size + shard_size = int(total_len / self.shard_num) + start_idx = self.shard_id * shard_size + end_idx = min(start_idx + shard_size, total_len) + if self.shard_id == self.shard_num - 1: + end_idx = total_len + to_yield = {} + for idx in tqdm(range(start_idx, end_idx, self.batch_size)): + for key in self.all_info: + to_yield[key] = self.all_info[key][idx: min(idx + self.batch_size, end_idx)] + yield to_yield + + def _parse_fields_from_info(self, info): + """ + :params info: dict, containing all fields as speicifed in self.fields either under + the key of the field name or under the key of 'contents'. 
If under `contents`, this + function will parse the input contents into each fields based the self.delimiter + return: List, each corresponds to the value of self.fields + """ + n_fields = len(self.fields) + + # if all fields are under the key of info, read these rather than 'contents' + if all([field in info for field in self.fields]): + return [info[field].strip() for field in self.fields] + + assert "contents" in info, f"contents not found in info: {info}" + contents = info['contents'] + # whether to remove the final self.delimiter (especially \n) + # in CACM, a \n is always there at the end of contents, which we want to remove; + # but in SciFact, Fiqa, and more, there are documents that only have title but not text (e.g. "This is title\n") + # where the trailing \n indicates empty fields + if contents.count(self.delimiter) == n_fields: + # the user appends one more delimiter to the end, we remove it + if contents.endswith(self.delimiter): + # not using .rstrip() as there might be more than one delimiters at the end + contents = contents[:-len(self.delimiter)] + return [field.strip(" ") for field in contents.split(self.delimiter)] + + def _load(self, collection_path): + filenames = [] + if os.path.isfile(collection_path): + filenames.append(collection_path) + else: + for filename in os.listdir(collection_path): + filenames.append(os.path.join(collection_path, filename)) + all_info = {field: [] for field in self.fields} + all_info['id'] = [] + for filename in filenames: + with open(filename) as f: + for line_i, line in tqdm(enumerate(f)): + info = json.loads(line) + if self.docid_field: + _id = info.get(self.docid_field, None) + else: + _id = info.get('id', info.get('_id', info.get('docid', None))) + if _id is None: + raise ValueError(f"Cannot find f'`{self.docid_field if self.docid_field else '`id` or `_id` or `docid'}`' from {filename}.") + all_info['id'].append(str(_id)) + fields_info = self._parse_fields_from_info(info) + if len(fields_info) != len(self.fields): + raise ValueError( + f"{len(fields_info)} fields are found at Line#{line_i} in file {filename}." \ + f"{len(self.fields)} fields expected." 
\ + f"Line content: {info['contents']}" + ) + + for i in range(len(fields_info)): + all_info[self.fields[i]].append(fields_info[i]) + return all_info + + +class RepresentationWriter: + def __enter__(self): + pass + + def __exit__(self, exc_type, exc_val, exc_tb): + pass + + def write(self, batch_info, fields=None): + pass + + +class JsonlRepresentationWriter(RepresentationWriter): + def __init__(self, dir_path): + self.dir_path = dir_path + self.filename = 'embeddings.jsonl' + self.file = None + + def __enter__(self): + if not os.path.exists(self.dir_path): + os.makedirs(self.dir_path) + self.file = open(os.path.join(self.dir_path, self.filename), 'w') + + def __exit__(self, exc_type, exc_val, exc_tb): + self.file.close() + + def write(self, batch_info, fields=None): + for i in range(len(batch_info['id'])): + contents = "\n".join([batch_info[key][i] for key in fields]) + vector = batch_info['vector'][i] + vector = vector.tolist() if isinstance(vector, np.ndarray) else vector + self.file.write(json.dumps({'id': batch_info['id'][i], + 'contents': contents, + 'vector': vector}) + '\n') + + +class FaissRepresentationWriter(RepresentationWriter): + def __init__(self, dir_path, dimension=768): + self.dir_path = dir_path + self.index_name = 'index' + self.id_file_name = 'docid' + self.dimension = dimension + self.index = faiss.IndexFlatIP(self.dimension) + self.id_file = None + + def __enter__(self): + if not os.path.exists(self.dir_path): + os.makedirs(self.dir_path) + self.id_file = open(os.path.join(self.dir_path, self.id_file_name), 'w') + + def __exit__(self, exc_type, exc_val, exc_tb): + self.id_file.close() + faiss.write_index(self.index, os.path.join(self.dir_path, self.index_name)) + + def write(self, batch_info, fields=None): + for id_ in batch_info['id']: + self.id_file.write(f'{id_}\n') + self.index.add(np.ascontiguousarray(batch_info['vector'])) diff --git a/pyserini/encode/_cached_data.py b/pyserini/encode/_cached_data.py new file mode 100644 index 0000000000000000000000000000000000000000..87182a607e56121765db6690f51eaba3491f5c37 --- /dev/null +++ b/pyserini/encode/_cached_data.py @@ -0,0 +1,38 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import json + +from pyserini.encode import QueryEncoder + + +class CachedDataQueryEncoder(QueryEncoder): + def __init__(self, model_name_or_path): + self.vectors = self._load_from_jsonl(model_name_or_path) + + @staticmethod + def _load_from_jsonl(path): + vectors = {} + with open(path) as f: + for line in f: + info = json.loads(line) + text = info['contents'].strip() + vec = info['vector'] + vectors[text] = vec + return vectors + + def encode(self, text, **kwargs): + return self.vectors[text.strip()] diff --git a/pyserini/encode/_dpr.py b/pyserini/encode/_dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..9e19a387cad4b9692a81c69c55141ce55130f1e5 --- /dev/null +++ b/pyserini/encode/_dpr.py @@ -0,0 +1,64 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer + +from pyserini.encode import DocumentEncoder, QueryEncoder + + +class DprDocumentEncoder(DocumentEncoder): + def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): + self.device = device + self.model = DPRContextEncoder.from_pretrained(model_name) + self.model.to(self.device) + self.tokenizer = DPRContextEncoderTokenizer.from_pretrained(tokenizer_name or model_name) + + def encode(self, texts, titles=None, max_length=256, **kwargs): + if titles: + inputs = self.tokenizer( + titles, + text_pair=texts, + max_length=max_length, + padding='longest', + truncation=True, + add_special_tokens=True, + return_tensors='pt' + ) + else: + inputs = self.tokenizer( + texts, + max_length=max_length, + padding='longest', + truncation=True, + add_special_tokens=True, + return_tensors='pt' + ) + inputs.to(self.device) + return self.model(inputs["input_ids"]).pooler_output.detach().cpu().numpy() + + +class DprQueryEncoder(QueryEncoder): + def __init__(self, model_name: str, tokenizer_name: str = None, device: str = 'cpu'): + self.device = device + self.model = DPRQuestionEncoder.from_pretrained(model_name) + self.model.to(self.device) + self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(tokenizer_name or model_name) + + def encode(self, query: str, **kwargs): + input_ids = self.tokenizer(query, return_tensors='pt') + input_ids.to(self.device) + embeddings = self.model(input_ids["input_ids"]).pooler_output.detach().cpu().numpy() + return embeddings.flatten() diff --git a/pyserini/encode/_slim.py b/pyserini/encode/_slim.py new file mode 100644 index 0000000000000000000000000000000000000000..ea994631af8796c290c137ecf3a9ddd4e420a116 --- /dev/null +++ b/pyserini/encode/_slim.py @@ -0,0 +1,62 @@ +import torch +from transformers import AutoModelForMaskedLM, AutoTokenizer +import numpy as np +import scipy + +from pyserini.encode import QueryEncoder + + +class SlimQueryEncoder(QueryEncoder): + def __init__(self, model_name_or_path, tokenizer_name=None, fusion_weight=.99, device='cpu'): + self.device = device + 
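+        # fusion_weight controls the per-term linear interpolation performed in
+        # _output_to_weight_dicts below:
+        #   final = fusion_weight * upper_weight + (1 - fusion_weight) * lower_weight
+        # so the default of 0.99 leans almost entirely on the aggregated (upper) weights.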
self.fusion_weight = fusion_weight + self.model = AutoModelForMaskedLM.from_pretrained(model_name_or_path) + self.model.to(self.device) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name_or_path) + self.reverse_vocab = {v: k for k, v in self.tokenizer.vocab.items()} + + def encode(self, text, max_length=256, topk=20, return_sparse=False, **kwargs): + inputs = self.tokenizer( + [text], + return_tensors="pt", + padding=True, + truncation=True, + max_length=max_length, + add_special_tokens=True, + ) + outputs = self.model(**inputs, return_dict=True) + attention_mask = inputs["attention_mask"][:, 1:] # remove the cls token + logits = outputs.logits[:, 1:, :] # remove the cls token prediction + # routing, assign every token to top-k expert + full_router_repr = torch.log(1 + torch.relu(logits)) * attention_mask.unsqueeze(-1) + expert_weights, expert_ids = torch.topk(full_router_repr, dim=2, k=topk) # B x T x topk + min_expert_weight = torch.min(expert_weights, -1, True)[0] + sparse_expert_weights = torch.where(full_router_repr >= min_expert_weight, full_router_repr, 0) + return self._output_to_weight_dicts(expert_weights.cpu(), expert_ids.cpu(), sparse_expert_weights.cpu(), attention_mask.cpu(), return_sparse)[0] + + def _output_to_weight_dicts(self, batch_expert_weights, batch_expert_ids, batch_sparse_expert_weights, batch_attention, return_sparse): + to_return = [] + for batch_id, sparse_expert_weights in enumerate(batch_sparse_expert_weights): + tok_vector = scipy.sparse.csr_matrix(sparse_expert_weights.detach().numpy()) + upper_vector, lower_vector = {}, {} + max_term, max_weight = None, 0 + for position, (expert_topk_ids, expert_topk_weights, attention_score) in enumerate(zip(batch_expert_ids[batch_id], + batch_expert_weights[batch_id], + batch_attention[batch_id])): + if attention_score > 0: + for expert_id, expert_weight in zip(expert_topk_ids, expert_topk_weights): + if expert_weight > 0: + term, weight = self.reverse_vocab[expert_id.item()], expert_weight.item() + upper_vector[term] = upper_vector.get(term, 0) + weight + if weight > max_weight: + max_term, max_weight = term, weight + if max_term is not None: + lower_vector[term] = lower_vector.get(term, 0) + weight + fusion_vector = {} + for term, weight in upper_vector.items(): + fusion_vector[term] = self.fusion_weight * weight + (1 - self.fusion_weight) * lower_vector.get(term, 0) + if return_sparse: + to_return.append((fusion_vector, tok_vector)) + else: + to_return.append(fusion_vector) + return to_return \ No newline at end of file diff --git a/pyserini/encode/_splade.py b/pyserini/encode/_splade.py new file mode 100644 index 0000000000000000000000000000000000000000..4c6d97e341b1349c3c4ef6251886db6a6e5fbf99 --- /dev/null +++ b/pyserini/encode/_splade.py @@ -0,0 +1,35 @@ +import torch +from transformers import AutoModelForMaskedLM, AutoTokenizer +import numpy as np + +from pyserini.encode import QueryEncoder + + +class SpladeQueryEncoder(QueryEncoder): + def __init__(self, model_name_or_path, tokenizer_name=None, device='cpu'): + self.device = device + self.model = AutoModelForMaskedLM.from_pretrained(model_name_or_path) + self.model.to(self.device) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name_or_path) + self.reverse_voc = {v: k for k, v in self.tokenizer.vocab.items()} + + def encode(self, text, max_length=256, **kwargs): + inputs = self.tokenizer([text], max_length=max_length, padding='longest', + truncation=True, add_special_tokens=True, + 
return_tensors='pt').to(self.device) + input_ids = inputs['input_ids'] + input_attention = inputs['attention_mask'] + batch_logits = self.model(input_ids)['logits'] + batch_aggregated_logits, _ = torch.max(torch.log(1 + torch.relu(batch_logits)) + * input_attention.unsqueeze(-1), dim=1) + batch_aggregated_logits = batch_aggregated_logits.cpu().detach().numpy() + return self._output_to_weight_dicts(batch_aggregated_logits)[0] + + def _output_to_weight_dicts(self, batch_aggregated_logits): + to_return = [] + for aggregated_logits in batch_aggregated_logits: + col = np.nonzero(aggregated_logits)[0] + weights = aggregated_logits[col] + d = {self.reverse_voc[k]: float(v) for k, v in zip(list(col), list(weights))} + to_return.append(d) + return to_return diff --git a/pyserini/encode/_tct_colbert.py b/pyserini/encode/_tct_colbert.py new file mode 100644 index 0000000000000000000000000000000000000000..e68dc7d426a746545e3b010b6caf5d3767c0167f --- /dev/null +++ b/pyserini/encode/_tct_colbert.py @@ -0,0 +1,91 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import torch +if torch.cuda.is_available(): + from torch.cuda.amp import autocast +from transformers import BertModel, BertTokenizer, BertTokenizerFast + +from pyserini.encode import DocumentEncoder, QueryEncoder +from onnxruntime import ExecutionMode, SessionOptions, InferenceSession + + +class TctColBertDocumentEncoder(DocumentEncoder): + def __init__(self, model_name: str, tokenizer_name=None, device='cuda:0'): + self.device = device + self.onnx = False + if model_name.endswith('onnx'): + options = SessionOptions() + self.session = InferenceSession(model_name, options) + self.onnx = True + self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name or model_name[:-5]) + else: + self.model = BertModel.from_pretrained(model_name) + self.model.to(self.device) + self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name or model_name) + + def encode(self, texts, titles=None, fp16=False, max_length=512, **kwargs): + if titles is not None: + texts = [f'[CLS] [D] {title} {text}' for title, text in zip(titles, texts)] + else: + texts = ['[CLS] [D] ' + text for text in texts] + inputs = self.tokenizer( + texts, + max_length=max_length, + padding="longest", + truncation=True, + add_special_tokens=False, + return_tensors='pt' + ) + if self.onnx: + inputs_onnx = {name: np.atleast_2d(value) for name, value in inputs.items()} + inputs.to(self.device) + outputs, _ = self.session.run(None, inputs_onnx) + outputs = torch.from_numpy(outputs).to(self.device) + embeddings = self._mean_pooling(outputs[:, 4:, :], inputs['attention_mask'][:, 4:]) + else: + inputs.to(self.device) + if fp16: + with autocast(): + with torch.no_grad(): + outputs = self.model(**inputs) + else: + outputs = self.model(**inputs) + embeddings = self._mean_pooling(outputs["last_hidden_state"][:, 4:, :], inputs['attention_mask'][:, 4:]) + return embeddings.detach().cpu().numpy() + 
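+
+# Minimal usage sketch (illustration only, not part of the original patch), assuming the
+# 'castorini/tct_colbert-msmarco' checkpoint referenced elsewhere in this change is available:
+#
+#   encoder = TctColBertDocumentEncoder('castorini/tct_colbert-msmarco', device='cpu')
+#   vectors = encoder.encode(['first passage text', 'second passage text'])
+#   # -> numpy array of shape (2, hidden_size); embeddings are mean-pooled over the
+#   #    tokens that follow the '[CLS] [D]' prefix.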
+ +class TctColBertQueryEncoder(QueryEncoder): + def __init__(self, model_name: str, tokenizer_name: str = None, device: str = 'cpu'): + self.device = device + self.model = BertModel.from_pretrained(model_name) + self.model.to(self.device) + self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name or model_name) + + def encode(self, query: str, **kwargs): + max_length = 36 # hardcode for now + inputs = self.tokenizer( + '[CLS] [Q] ' + query + '[MASK]' * max_length, + max_length=max_length, + truncation=True, + add_special_tokens=False, + return_tensors='pt' + ) + inputs.to(self.device) + outputs = self.model(**inputs) + embeddings = outputs.last_hidden_state.detach().cpu().numpy() + return np.average(embeddings[:, 4:, :], axis=-2).flatten() diff --git a/pyserini/encode/_tok_freq.py b/pyserini/encode/_tok_freq.py new file mode 100644 index 0000000000000000000000000000000000000000..3574c34123fc38d687b73bb0e258f4be4a0af02e --- /dev/null +++ b/pyserini/encode/_tok_freq.py @@ -0,0 +1,37 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from transformers import AutoTokenizer + +from pyserini.encode import QueryEncoder + + +class TokFreqQueryEncoder(QueryEncoder): + def __init__(self, model_name_or_path=None): + self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) if model_name_or_path else None + + def encode(self, text, **kwargs): + vector = {} + if self.tokenizer is not None: + tok_list = self.tokenizer.tokenize(text) + else: + tok_list = text.strip().split() + for tok in tok_list: + if tok not in vector: + vector[tok] = 1 + else: + vector[tok] += 1 + return vector diff --git a/pyserini/encode/_unicoil.py b/pyserini/encode/_unicoil.py new file mode 100644 index 0000000000000000000000000000000000000000..e0f48b131a371e9066d8de5cb2da60e5e98f1c90 --- /dev/null +++ b/pyserini/encode/_unicoil.py @@ -0,0 +1,175 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import Optional + +import torch +if torch.cuda.is_available(): + from torch.cuda.amp import autocast +from transformers import BertConfig, BertModel, BertTokenizer, PreTrainedModel + +from pyserini.encode import DocumentEncoder, QueryEncoder + + +class UniCoilEncoder(PreTrainedModel): + config_class = BertConfig + base_model_prefix = 'coil_encoder' + load_tf_weights = None + + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + self.bert = BertModel(config) + self.tok_proj = torch.nn.Linear(config.hidden_size, 1) + self.init_weights() + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, torch.nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, torch.nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def init_weights(self): + self.bert.init_weights() + self.tok_proj.apply(self._init_weights) + + def forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ): + input_shape = input_ids.size() + device = input_ids.device + if attention_mask is None: + attention_mask = ( + torch.ones(input_shape, device=device) + if input_ids is None + else (input_ids != self.bert.config.pad_token_id) + ) + outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask) + sequence_output = outputs.last_hidden_state + tok_weights = self.tok_proj(sequence_output) + tok_weights = torch.relu(tok_weights) + return tok_weights + + +class UniCoilDocumentEncoder(DocumentEncoder): + def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): + self.device = device + self.model = UniCoilEncoder.from_pretrained(model_name) + self.model.to(self.device) + self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name or model_name) + + def encode(self, texts, titles=None, expands=None, fp16=False, max_length=512, **kwargs): + if titles: + texts = [f'{title} {text}' for title, text in zip(titles, texts)] + if expands: + input_ids = self._tokenize_with_injects(texts, expands) + else: + input_ids = self.tokenizer(texts, max_length=max_length, padding='longest', + truncation=True, add_special_tokens=True, + return_tensors='pt').to(self.device)["input_ids"] + if fp16: + with autocast(): + with torch.no_grad(): + batch_weights = self.model(input_ids).cpu().detach().numpy() + else: + batch_weights = self.model(input_ids).cpu().detach().numpy() + batch_token_ids = input_ids.cpu().detach().numpy() + return self._output_to_weight_dicts(batch_token_ids, batch_weights) + + def _output_to_weight_dicts(self, batch_token_ids, batch_weights): + to_return = [] + for i in range(len(batch_token_ids)): + weights = batch_weights[i].flatten() + tokens = self.tokenizer.convert_ids_to_tokens(batch_token_ids[i]) + tok_weights = {} + for j in range(len(tokens)): + tok = str(tokens[j]) + weight = float(weights[j]) + if tok == '[CLS]': + continue + if tok == '[PAD]': + break + if tok not in tok_weights: + tok_weights[tok] = weight + elif weight > tok_weights[tok]: + tok_weights[tok] = weight + to_return.append(tok_weights) + return to_return + + def 
_tokenize_with_injects(self, texts, expands): + tokenized = [] + max_len = 0 + for text, expand in zip(texts, expands): + text_ids = self.tokenizer.encode(text, add_special_tokens=False, max_length=400, truncation=True) + expand_ids = self.tokenizer.encode(expand, add_special_tokens=False, max_length=100, truncation=True) + injects = set() + for tok_id in expand_ids: + if tok_id not in text_ids: + injects.add(tok_id) + all_tok_ids = [101] + text_ids + [102] + list(injects) + [102] # 101: CLS, 102: SEP + tokenized.append(all_tok_ids) + cur_len = len(all_tok_ids) + if cur_len > max_len: + max_len = cur_len + for i in range(len(tokenized)): + tokenized[i] += [0] * (max_len - len(tokenized[i])) + return torch.tensor(tokenized, device=self.device) + + +class UniCoilQueryEncoder(QueryEncoder): + def __init__(self, model_name_or_path, tokenizer_name=None, device='cpu'): + self.device = device + self.model = UniCoilEncoder.from_pretrained(model_name_or_path) + self.model.to(self.device) + self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name or model_name_or_path) + + def encode(self, text, **kwargs): + max_length = 128 # hardcode for now + input_ids = self.tokenizer([text], max_length=max_length, padding='longest', + truncation=True, add_special_tokens=True, + return_tensors='pt').to(self.device)["input_ids"] + batch_weights = self.model(input_ids).cpu().detach().numpy() + batch_token_ids = input_ids.cpu().detach().numpy() + return self._output_to_weight_dicts(batch_token_ids, batch_weights)[0] + + def _output_to_weight_dicts(self, batch_token_ids, batch_weights): + to_return = [] + for i in range(len(batch_token_ids)): + weights = batch_weights[i].flatten() + tokens = self.tokenizer.convert_ids_to_tokens(batch_token_ids[i]) + tok_weights = {} + for j in range(len(tokens)): + tok = str(tokens[j]) + weight = float(weights[j]) + if tok == '[CLS]': + continue + if tok == '[PAD]': + break + if tok not in tok_weights: + tok_weights[tok] = weight + else: + tok_weights[tok] += weight + to_return.append(tok_weights) + return to_return diff --git a/pyserini/encode/merge_faiss_index.py b/pyserini/encode/merge_faiss_index.py new file mode 100644 index 0000000000000000000000000000000000000000..bc0a929cd7e552237c75f955db1c18d2757ba7a4 --- /dev/null +++ b/pyserini/encode/merge_faiss_index.py @@ -0,0 +1,48 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
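# Usage sketch (illustrative): both uniCOIL encoders above emit {token: weight} dicts,
# so scoring a (query, passage) pair reduces to a dot product over their shared tokens.
# The checkpoint name is an assumption for illustration.
from pyserini.encode._unicoil import UniCoilQueryEncoder, UniCoilDocumentEncoder

query_encoder = UniCoilQueryEncoder('castorini/unicoil-msmarco-passage', device='cpu')
doc_encoder = UniCoilDocumentEncoder('castorini/unicoil-msmarco-passage', device='cpu')

q_weights = query_encoder.encode('what is a lobster roll')
d_weights = doc_encoder.encode(['A lobster roll is a sandwich filled with lobster meat.'])[0]

# Impact-style score: sum of query-weight * document-weight over overlapping tokens.
score = sum(w * d_weights.get(tok, 0.0) for tok, w in q_weights.items())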
+# + +import os +import glob +import argparse + +import faiss +from tqdm import tqdm + + +parser = argparse.ArgumentParser() +parser.add_argument('--dimension', type=int, help='dimension of passage embeddings', required=False, default=768) +parser.add_argument('--input', type=str, help='wildcard directory to input indexes', required=True) +parser.add_argument('--output', type=str, help='directory to output full indexes', required=True) +args = parser.parse_args() +os.makedirs(args.output, exist_ok=True) + +# merge index +new_index = faiss.IndexFlatIP(args.dimension) +docid_files = [] +for index_dir in tqdm(sorted(glob.glob(args.input)), desc="Merging Faiss Index"): + index = faiss.read_index(os.path.join(index_dir, 'index')) + docid_files.append(os.path.join(index_dir, 'docid')) + vectors = index.reconstruct_n(0, index.ntotal) + new_index.add(vectors) + +faiss.write_index(new_index, os.path.join(args.output, 'index')) + +# merge docid +with open(os.path.join(args.output, 'docid'), 'w') as wfd: + for f in docid_files: + with open(f, 'r') as f1: + for line in f1: + wfd.write(line) diff --git a/pyserini/encode/query.py b/pyserini/encode/query.py new file mode 100644 index 0000000000000000000000000000000000000000..4cf53a0c69904d0872529c85107687277944956a --- /dev/null +++ b/pyserini/encode/query.py @@ -0,0 +1,83 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
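# Usage sketch (illustrative): the merge script above is driven entirely by its CLI.
# The shard layout is hypothetical; --input is a glob (quoted so the shell does not
# expand it) matching one directory per shard, each holding an 'index' and a 'docid'
# file, and a single flat inner-product index is written to --output.
import os

os.system("python -m pyserini.encode.merge_faiss_index "
          "--input 'indexes/msmarco-passage-shard-*' "
          "--output indexes/msmarco-passage-full "
          "--dimension 768")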
+# + +import argparse + +from tqdm import tqdm +import numpy as np +import pandas as pd +from pyserini.query_iterator import DefaultQueryIterator +from pyserini.encode import DprQueryEncoder, TctColBertQueryEncoder, AnceQueryEncoder, AutoQueryEncoder +from pyserini.encode import UniCoilQueryEncoder, SpladeQueryEncoder + + +def init_encoder(encoder, device): + if 'dpr' in encoder.lower(): + return DprQueryEncoder(encoder, device=device) + elif 'tct' in encoder.lower(): + return TctColBertQueryEncoder(encoder, device=device) + elif 'ance' in encoder.lower(): + return AnceQueryEncoder(encoder, device=device, tokenizer_name='roberta-base') + elif 'sentence-transformers' in encoder.lower(): + return AutoQueryEncoder(encoder, device=device, pooling='mean', l2_norm=True) + elif 'unicoil' in encoder.lower(): + return UniCoilQueryEncoder(encoder, device=device) + elif 'splade' in encoder.lower(): + return SpladeQueryEncoder(encoder, device=device) + else: + return AutoQueryEncoder(encoder, device=device) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--topics', type=str, + help='path to topics file in tsv format or self-contained topics name', required=True) + parser.add_argument('--encoder', type=str, help='encoder model name or path', required=True) + parser.add_argument('--weight-range', type=int, help='range of weights for sparse embedding', required=False) + parser.add_argument('--quant-range', type=int, help='range of quantization for sparse embedding', required=False) + parser.add_argument('--output', type=str, help='path to stored encoded queries', required=True) + parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]', + default='cpu', required=False) + args = parser.parse_args() + + encoder = init_encoder(args.encoder, device=args.device) + query_iterator = DefaultQueryIterator.from_topics(args.topics) + + is_sparse = False + query_ids = [] + query_texts = [] + query_embeddings = [] + for topic_id, text in tqdm(query_iterator): + embedding = encoder.encode(text) + if isinstance(embedding, dict): + is_sparse = True + pseudo_str = [] + for tok, weight in embedding.items(): + weight_quanted = int(np.round(weight/args.weight_range*args.quant_range)) + pseudo_str += [tok] * weight_quanted + pseudo_str = " ".join(pseudo_str) + embedding = pseudo_str + query_ids.append(topic_id) + query_texts.append(text) + query_embeddings.append(embedding) + if is_sparse: + with open(args.output, 'w') as f: + for i in range(len(query_ids)): + f.write(f"{query_ids[i]}\t{query_embeddings[i]}\n") + else: + embeddings = {'id': query_ids, 'text': query_texts, 'embedding': query_embeddings} + embeddings = pd.DataFrame(embeddings) + embeddings.to_pickle(args.output) diff --git a/pyserini/encoded_corpus_info.py b/pyserini/encoded_corpus_info.py new file mode 100644 index 0000000000000000000000000000000000000000..c5c47459ef8f82e2cb5d2e37f9388cba3a2ecc77 --- /dev/null +++ b/pyserini/encoded_corpus_info.py @@ -0,0 +1,40 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
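# Usage sketch (illustrative): the script above picks an encoder class from the model
# name and writes either a pickled DataFrame (dense output) or a quantized impact-style
# tsv (sparse output). The topic name, checkpoints, and quantization values below are
# assumptions for illustration.
import os

# Dense: TCT-ColBERT query embeddings -> pickle.
os.system("python -m pyserini.encode.query "
          "--topics msmarco-passage-dev-subset "
          "--encoder castorini/tct_colbert-v2-hnp-msmarco "
          "--output queries.tct_colbert-v2-hnp.pkl --device cpu")

# Sparse: uniCOIL token weights, quantized via --weight-range / --quant-range.
os.system("python -m pyserini.encode.query "
          "--topics msmarco-passage-dev-subset "
          "--encoder castorini/unicoil-msmarco-passage "
          "--weight-range 5 --quant-range 256 "
          "--output queries.unicoil.tsv --device cpu")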
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +CORPUS_INFO = { + "scipy-sparse-vectors.msmarco-v1-passage-slimr": { + "description": "MS MARCO passages-v1 corpus encoded by SLIM trained with BM25 negatives. (Scipy)", + "filename": "scipy-sparse-vectors.msmarco-v1-passage-slimr.20230220.tar.gz", + "urls": [ + "https://vault.cs.uwaterloo.ca/s/4MRXSmiDqNH4mgF/download", + ], + "md5": "7ec96c74dced272712fcbb091bb671a8", + "size (bytes)": 16533697862, + "documents": 8841823, + "downloaded": False + }, + "scipy-sparse-vectors.msmarco-v1-passage-slimr-pp": { + "description": "MS MARCO passages-v1 corpus encoded by SLIM trained with cross-encoder distillation and hardnegative mining (Scipy)", + "filename": "scipy-sparse-vectors.msmarco-v1-passage-slimr-pp.20230220.tar.gz", + "urls": [ + "https://vault.cs.uwaterloo.ca/s/gDJnrYGKsq6ir4w/download", + ], + "md5": "05ce2ce5f64b668a487909ab538ef2a5", + "size (bytes)": 15785241481, + "documents": 8841823, + "downloaded": False + }, +} \ No newline at end of file diff --git a/pyserini/encoded_query_info.py b/pyserini/encoded_query_info.py new file mode 100644 index 0000000000000000000000000000000000000000..620b921c93b6b1010c028e5b6ff72d75b75a19f2 --- /dev/null +++ b/pyserini/encoded_query_info.py @@ -0,0 +1,488 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
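# Usage sketch (illustrative): CORPUS_INFO is a plain registry keyed by corpus name,
# so listing the available pre-encoded corpora and their download sizes is a short
# loop over the dict.
from pyserini.encoded_corpus_info import CORPUS_INFO

for name, info in CORPUS_INFO.items():
    print(f"{name}: {info['documents']:,} docs, "
          f"{info['size (bytes)'] / 2**30:.1f} GiB, md5 {info['md5']}")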
+# + +QUERY_INFO = { + "aggretriever-cocondenser-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by aggretriever-cocondenser", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-aggretriever-cocondenser-msmarco-passage-dev-subset-20230407-f627ef.tar.gz" + ], + "md5": "c30ad20c7b101e3034f41597f0fc1f67", + "size (bytes)": 20859862, + "total_queries": 6980, + "downloaded": False + }, + "aggretriever-distilbert-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by aggretriever-distilbert", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-aggretriever-distilbert-msmarco-passage-dev-subset-20230407-f627ef.tar.gz" + ], + "md5": "a6ee094bd681b08e5657ce69185eee82", + "size (bytes)": 20771767, + "total_queries": 6980, + "downloaded": False + }, + "tct_colbert-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by TCT-ColBERT", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-tct_colbert-msmarco-passage-dev-subset-20210419-9323ec.tar.gz", + ], + "md5": "b2fe6494241639153f26cc61acf3b39d", + "size (bytes)": 20078757, + "total_queries": 6980, + "downloaded": False + }, + "tct_colbert-v2-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by TCT-ColBERT V2", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-tct_colbert-v2-msmarco-passage-dev-subset-20210608-5f341b.tar.gz", + ], + "md5": "ee8d76e596aef02c5027a2ffd0ff66f8", + "size (bytes)": 20072992, + "total_queries": 6980, + "downloaded": False + }, + "tct_colbert-v2-hn-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by TCT-ColBERT V2 HN", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-tct_colbert-v2-hn-msmarco-passage-dev-subset-20210608-5f341b.tar.gz", + ], + "md5": "f7e39cf2cd3ee53f7f8f2e0a1821431c", + "size (bytes)": 20074411, + "total_queries": 6980, + "downloaded": False + }, + "tct_colbert-v2-hnp-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by TCT-ColBERT V2 HN+", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-tct_colbert-v2-hnp-msmarco-passage-dev-subset-20210608-5f341b.tar.gz", + ], + "md5": "bed8036475774d12915c8af2a44612f4", + "size (bytes)": 20078958, + "total_queries": 6980, + "downloaded": False + }, + "tct_colbert-v2-hnp-dl19-passage": { + "description": "TREC DL19-passage queries encoded by TCT-ColBERT V2 HN+", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-tct_colbert-v2-hnp-dl19-passage-20230124-99b795.tar.gz", + ], + "md5": "ee945fb0a5b17cba4e2e5d51318fbe05", + "size (bytes)": 125193, + "total_queries": 43, + "downloaded": False + }, + "tct_colbert-v2-hnp-dl20": { + "description": "TREC DL20 queries encoded by TCT-ColBERT V2 HN+", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-tct_colbert-v2-hnp-dl20-passage-20230124-99b795.tar.gz", + ], + "md5": "b940d3d38cf5a50a9467a4aa7a59d226", + "size (bytes)": 577645, + "total_queries": 200, + "downloaded": False + }, + "ance-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by ANCE", + "urls": [ + 
"https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-ance-msmarco-passage-dev-subset-20210419-9323ec.tar.gz", + ], + "md5": "adad81bb1495eff2f0463e809ecc01b8", + "size (bytes)": 19965095, + "total_queries": 6980, + "downloaded": False + }, + "ance-dl19-passage": { + "description": "TREC DL19 passage queries encoded by ANCE", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-ance-dl19-passage-20230124-99b79.tar.gz", + ], + "md5": "828714ef5481dc49686e14b61881ba06", + "size (bytes)": 124468, + "total_queries": 43, + "downloaded": False + }, + "ance-dl20": { + "description": "TREC DL20 queries encoded by ANCE", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-ance-dl20-passage-20230124-99b79.tar.gz", + ], + "md5": "79acea9812a5c20d0d0817b07b348d15", + "size (bytes)": 574183, + "total_queries": 200, + "downloaded": False + }, + "tct_colbert-msmarco-doc-dev": { + "description": "MS MARCO Document dev set queries encoded by TCT-ColBERT zero-shot", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-tct_colbert-msmarco-doc-dev-20210419-9323ec.tar.gz", + ], + "md5": "565fe57f92b229643b68fa3263f089a9", + "size (bytes)": 14940124, + "total_queries": 6980, + "downloaded": False + }, + "ance_maxp-msmarco-doc-dev": { + "description": "MS MARCO Document dev set queries encoded by ANCE maxp", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-ance_maxp-msmarco-doc-dev-20210419-9323ec.tar.gz", + ], + "md5": "3d41ae797cb97e42649c4f4fa7b97d56", + "size (bytes)": 14854155, + "total_queries": 6980, + "downloaded": False + }, + "sbert-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by SBERT", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-sbert-msmarco-passage-dev-subset-20210419-9323ec.tar.gz", + ], + "md5": "dc0d09a0f5803824c1ad46a39417aa1e", + "size (bytes)": 20058701, + "total_queries": 6980, + "downloaded": False + }, + "distilbert_kd-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by SBERT", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-distilbert_kd-msmarco-passage-dev-subset-20210419-9323ec.tar.gz", + ], + "md5": "4706ec91183eefa9771e9311fe4799e0", + "size (bytes)": 20013009, + "total_queries": 6980, + "downloaded": False + }, + "distilbert_kd-dl19-passage": { + "description": "TREC DL19 passage queries encoded by SBERT", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-distilbert_kd-dl19-passage-20230124-99b79.tar.gz", + ], + "md5": "c9fe8c8112a7d4fcda1aa606af77e66a", + "size (bytes)": 124760, + "total_queries": 43, + "downloaded": False + }, + "distilbert_kd-dl20": { + "description": "TREC DL20 queries encoded by SBERT", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-distilbert_kd-dl20-passage-20230124-99b79.tar.gz", + ], + "md5": "09fe19984515145a78183a98e44bd699", + "size (bytes)": 575682, + "total_queries": 200, + "downloaded": False + }, + "distilbert_tas_b-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by TAS-B", + "urls": [ + 
"https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-distilbert_dot_tas_b_b256-msmarco-passage-dev-subset-20210527-63276f.tar.gz", + ], + "md5": "17a3f81de7ba497728050b83733b1c46", + "size (bytes)": 20016799, + "total_queries": 6980, + "downloaded": False + }, + "distilbert_tas_b-dl19-passage": { + "description": "TREC DL19 passage queries encoded by TAS-B", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-distilbert_dot_tas_b_b256-dl19-passage-20230124-99b795.tar.gz", + ], + "md5": "a0a23a1be77e6e9e5dfacf32dfcd5e9b", + "size (bytes)": 124809, + "total_queries": 43, + "downloaded": False + }, + "distilbert_tas_b-dl20": { + "description": "TREC DL20 queries encoded by TAS-B", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-distilbert_dot_tas_b_b256-dl20-passage-20230124-99b795.tar.gz", + ], + "md5": "8ffb4d5a17a2c028fb5065ef8a394ab3", + "size (bytes)": 575875, + "total_queries": 200, + "downloaded": False + }, + "dpr_multi-nq-dev": { + "description": "Natural Question dev set questions encoded by DPR question encoder trained on multiset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dpr_multi-nq-dev-20210419-9323ec.tar.gz", + ], + "md5": "c2fd32438129e4994ce2ce71e08de875", + "size (bytes)": 25129398, + "total_queries": 8757, + "downloaded": False + }, + "dpr_multi-nq-test": { + "description": "Natural Question test set questions encoded by DPR question encoder trained on multiset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dpr_multi-nq-test-20210419-9323ec.tar.gz", + ], + "md5": "1791f1ed078beb3a00847f75023eb020", + "size (bytes)": 10365005, + "total_queries": 3610, + "downloaded": False + }, + "ance_multi-nq-dev": { + "description": "Natural Question dev set questions encoded by ANCE question encoder trained on multiset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-ance_multi-nq-dev-20210419-9323ec.tar.gz", + ], + "md5": "a3ed32ec8d5a474f61e3c3a9968b26fd", + "size (bytes)": 25163934, + "total_queries": 8757, + "downloaded": False + }, + "ance_multi-nq-test": { + "description": "Natural Question test set questions encoded by ANCE question encoder trained on multiset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-ance_multi-nq-test-20210419-9323ec.tar.gz", + ], + "md5": "a356202b7c8f73758732c893a76a8005", + "size (bytes)": 10379384, + "total_queries": 3610, + "downloaded": False + }, + "dpr_multi-trivia-dev": { + "description": "TriviaQA dev set questions encoded by DPR question encoder trained on multiset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dpr_multi-trivia-dev-20210419-9323ec.tar.gz", + ], + "md5": "efac7b71ef52ca073331e896089456a4", + "size (bytes)": 25517034, + "total_queries": 8837, + "downloaded": False + }, + "dpr_multi-trivia-test": { + "description": "TriviaQA test set questions encoded by DPR question encoder trained on multiset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dpr_multi-trivia-test-20210419-9323ec.tar.gz", + ], + "md5": "01e95455d55d0495d806549f04a02c24", + "size (bytes)": 32664437, + "total_queries": 11313, + "downloaded": False + }, + "ance_multi-trivia-dev": { + "description": "TriviaQA dev set 
questions encoded by ANCE question encoder trained on multiset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-ance_multi-trivia-dev-20210419-9323ec.tar.gz", + ], + "md5": "bd88499a5785b15ba702173cc0e91417", + "size (bytes)": 25559775, + "total_queries": 8837, + "downloaded": False + }, + "ance_multi-trivia-test": { + "description": "TriviaQA test set questions encoded by ANCE question encoder trained on multiset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-ance_multi-trivia-test-20210419-9323ec.tar.gz", + ], + "md5": "3844dfb7f8feb6b064fa48775a35c6ee", + "size (bytes)": 32717910, + "total_queries": 11313, + "downloaded": False + }, + "dpr_multi-wq-test": { + "description": "Web Questions test set questions encoded by DPR question encoder trained on multiset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dpr_multi-wq-test-20210419-9323ec.tar.gz", + ], + "md5": "19aa721632d05afe031cc2da83a9a5a5", + "size (bytes)": 5826854, + "total_queries": 2032, + "downloaded": False + }, + "dpr_multi-squad-test": { + "description": "SQUAD dev set questions encoded by DPR question encoder trained on multiset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dpr_multi-squad-test-20210419-9323ec.tar.gz", + ], + "md5": "d11e0f801a488d51ad2a63b0748f4ae0", + "size (bytes)": 30328268, + "total_queries": 10570, + "downloaded": False + }, + "dpr_multi-curated-test": { + "description": "CuratedTREC test set questions encoded by DPR question encoder trained on multiset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dpr_multi-curated-test-20210419-9323ec.tar.gz", + ], + "md5": "d1737d3ec5a080d93350ae76b02c7fd1", + "size (bytes)": 1995280, + "total_queries": 694, + "downloaded": False + }, + "dpr_single_nq-nq-dev": { + "description": "NQ dev set questions encoded by DPR question encoder trained on NQ dataset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dpr_single_nq-nq-dev-20210419-9323ec.tar.gz", + ], + "md5": "1a992f8d5336dc8654bba5ab7e375ebe", + "size (bytes)": 25123288, + "total_queries": 8757, + "downloaded": False + }, + "dpr_single_nq-nq-test": { + "description": "NQ test set questions encoded by DPR question encoder trained on NQ dataset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dpr_single_nq-nq-test-20210419-9323ec.tar.gz", + ], + "md5": "e64bb009b6ba8bfe40d4b9967fd69240", + "size (bytes)": 10362252, + "total_queries": 3610, + "downloaded": False + }, + "bpr_single_nq-nq-test": { + "description": "NQ test set questions encoded by BPR question encoder trained on NQ dataset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-bpr_single_nq-nq-test-20210827-8a8f75.tar.gz", + ], + "md5": "b139d5a096ad52d2abc66fb54ec66158", + "size (bytes)": 11094680, + "total_queries": 3610, + "downloaded": False + }, + "dkrr-dpr-nq-retriever-dpr-nq-dev": { + "description": "DPR-NQ dev set questions encoded by castorini/dkrr-dpr-nq-retriever trained on NQ dataset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dkrr-dpr-nq-retriever-dpr-nq-dev-20220304-7ffa54.tar.gz", + ], + "md5": "fe1276ae841bd5be6f3e0daac144273a", + "size (bytes)": 25146740, + 
"total_queries": 8757, + "downloaded": False + }, + "dkrr-dpr-nq-retriever-dpr-nq-test": { + "description": "DPR-NQ test set questions encoded by castorini/dkrr-dpr-nq-retriever trained on NQ dataset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dkrr-dpr-nq-retriever-dpr-nq-test-20220304-7ffa54.tar.gz", + ], + "md5": "6c7793a0a89e7d10309a6973c52de326", + "size (bytes)": 10370414, + "total_queries": 3610, + "downloaded": False + }, + "dkrr-dpr-nq-retriever-nq-dev": { + "description": "NQ dev set questions encoded by castorini/dkrr-dpr-nq-retriever trained on NQ dataset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dkrr-dpr-nq-retriever-nq-dev-20220304-7ffa54.tar.gz", + ], + "md5": "3c84c7fb6569d7690d5c38be61d3a5a4", + "size (bytes)": 25146526, + "total_queries": 8757, + "downloaded": False + }, + "dkrr-dpr-nq-retriever-nq-test": { + "description": "NQ test set questions encoded by castorini/dkrr-dpr-nq-retriever trained on NQ dataset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dkrr-dpr-nq-retriever-nq-test-20220304-7ffa54.tar.gz", + ], + "md5": "cd3c30fc6dfde160983167b59acb17a3", + "size (bytes)": 10370264, + "total_queries": 3610, + "downloaded": False + }, + "dkrr-dpr-tqa-retriever-dpr-tqa-dev": { + "description": "TriviaQA dev set questions encoded by castorini/dkrr-dpr-tqa-retriever trained on TriviaQA dataset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dkrr-dpr-tqa-retriever-tqa-dev-20220304-7ffa54.tar.gz", + ], + "md5": "f9ca5060cf7794b681cd4fe3d3708c4d", + "size (bytes)": 25540932, + "total_queries": 8837, + "downloaded": False + }, + "dkrr-dpr-tqa-retriever-dpr-tqa-test": { + "description": "TriviaQA test set questions encoded by castorini/dkrr-dpr-tqa-retriever trained on TriviaQA dataset", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-dkrr-dpr-tqa-retriever-tqa-test-20220304-7ffa54.tar.gz", + ], + "md5": "9cbd030c3a4478b7eb8356844bacc45b", + "size (bytes)": 32688909, + "total_queries": 11313, + "downloaded": False + }, + "wiki-6-3-all-dpr2-multi-nq-test": { + "description": "NQ test set questions encoded by castorini/wiki-all-6-3-multi-dpr2-query-encoder.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-wiki-all-6-3-dpr2-multi-retriever-nq-test-20230103-186fa7.tar.gz", + ], + "md5": "2632ca1392a33e975d505acd5090250a", + "size (bytes)": 10354577, + "total_queries": 3610, + "downloaded": False + }, + "wiki-6-3-all-dpr2-multi-dpr-trivia-test": { + "description": "TriviaQA test set questions encoded by castorini/wiki-all-6-3-multi-dpr2-query-encoder.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-wiki-all-6-3-dpr2-multi-retriever-dpr-trivia-test-20230103-186fa7.tar.gz", + ], + "md5": "d0abf8ff598daaec35acd972a465b0e2", + "size (bytes)": 32620950, + "total_queries": 11313, + "downloaded": False + }, + "openai-ada2-dl19-passage": { + "description": "TREC DL19 passage queries encoded by OpenAI ada2.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-openai-ada2-dl19-passage-20230530-e3a58f.tar.gz", + ], + "md5": "ab57dab62c5b43508c661b78d6f7b6b9", + "size (bytes)": 418940, + "total_queries": 43, + "downloaded": False + }, + "openai-ada2-dl20": { + "description": 
"TREC DL20 passage queries encoded by OpenAI ada2.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-openai-ada2-dl20-passage-20230530-e3a58f.tar.gz", + ], + "md5": "fe711c1e146647396fd06f125882d01c", + "size (bytes)": 1939404, + "total_queries": 200, + "downloaded": False + }, + "openai-ada2-dl19-passage-hyde": { + "description": "TREC DL19 passage queries encoded by HyDE-OpenAI ada2.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-openai-ada2-hyde-dl19-passage-20230530-e3a58f.tar.gz", + ], + "md5": "bc981187dc18f3fbf21698605e2349b5", + "size (bytes)": 508400, + "total_queries": 43, + "downloaded": False + }, + "openai-ada2-dl20-hyde": { + "description": "TREC DL20 passage queries encoded by HyDE-OpenAI ada2.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-openai-ada2-hyde-dl20-passage-20230530-e3a58f.tar.gz", + ], + "md5": "12389d6affdab9231996834f7022beab", + "size (bytes)": 645105, + "total_queries": 200, + "downloaded": False + }, + "openai-ada2-msmarco-passage-dev-subset": { + "description": "MS MARCO passage dev set queries encoded by OpenAI ada2.", + "urls": [ + "https://github.com/castorini/pyserini-data/raw/main/encoded-queries/query-embedding-openai-ada2-msmarco-passage-dev-subset-20230530-e3a58f.tar.gz", + ], + "md5": "0d9c7311e2e3819183d7ae2b4889e4ba", + "size (bytes)": 67615770, + "total_queries": 6980, + "downloaded": False + }, +} diff --git a/pyserini/eval/__init__.py b/pyserini/eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pyserini/eval/convert_msmarco_run_to_trec_run.py b/pyserini/eval/convert_msmarco_run_to_trec_run.py new file mode 100644 index 0000000000000000000000000000000000000000..757ab62abde488a3493c89aff99c96299ad1d13b --- /dev/null +++ b/pyserini/eval/convert_msmarco_run_to_trec_run.py @@ -0,0 +1,34 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Convert an MS MARCO run file to a TREC run file.') + parser.add_argument('--input', required=True, default='', help='Input MS MARCO run file.') + parser.add_argument('--output', required=True, default='', help='Output TREC run file.') + + args = parser.parse_args() + + with open(args.output, 'w') as fout: + for line in open(args.input): + query_id, doc_id, rank = line.strip().split('\t') + score = 1.0 / int(rank) + fout.write('{} Q0 {} {} {} anserini\n'.format( + query_id, doc_id, rank, score)) + + print('Done!') diff --git a/pyserini/eval/convert_trec_run_to_dpr_retrieval_run.py b/pyserini/eval/convert_trec_run_to_dpr_retrieval_run.py new file mode 100644 index 0000000000000000000000000000000000000000..7ef59efd2bda3691edb018871b2725ff2ee02b07 --- /dev/null +++ b/pyserini/eval/convert_trec_run_to_dpr_retrieval_run.py @@ -0,0 +1,85 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import json +import os +from tqdm import tqdm + +from pyserini.search import get_topics, get_topics_with_reader +from pyserini.search.lucene import LuceneSearcher +from pyserini.eval.evaluate_dpr_retrieval import has_answers, SimpleTokenizer + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Convert an TREC run to DPR retrieval result json.') + parser.add_argument('--topics', help='topic name') + parser.add_argument('--topics-file', help='path to a topics file') + parser.add_argument('--topics-reader', help='anserini TopicReader class') + parser.add_argument('--index', required=True, help='Anserini Index that contains raw') + parser.add_argument('--input', required=True, help='Input TREC run file.') + parser.add_argument('--store-raw', action='store_true', help='Store raw text of passage') + parser.add_argument('--regex', action='store_true', default=False, help="regex match") + parser.add_argument('--combine-title-text', action='store_true', help="Make context the concatenation of title and text.") + parser.add_argument('--output', required=True, help='Output DPR Retrieval json file.') + args = parser.parse_args() + + if args.topics_file: + qas = get_topics_with_reader(args.topics_reader, args.topics_file) + elif args.topics: + qas = get_topics(args.topics) + else: + print("No topics file or topics name was provided") + + if os.path.exists(args.index): + searcher = LuceneSearcher(args.index) + else: + searcher = LuceneSearcher.from_prebuilt_index(args.index) + if not searcher: + exit() + + retrieval = {} + tokenizer = SimpleTokenizer() + with open(args.input) as f_in: + for line in tqdm(f_in.readlines()): + question_id, _, doc_id, _, score, _ = line.strip().split() + question_id = int(question_id) + question = qas[question_id]['title'] + answers = qas[question_id]['answers'] + if answers[0] == '"': + answers = answers[1:-1].replace('""', '"') + answers = eval(answers) + if 
args.combine_title_text: + passage = json.loads(searcher.doc(doc_id).raw()) + ctx = passage['title'] + "\n" + passage['text'] + else: + ctx = json.loads(searcher.doc(doc_id).raw())['contents'] + if question_id not in retrieval: + retrieval[question_id] = {'question': question, 'answers': answers, 'contexts': []} + title, text = ctx.split('\n') + answer_exist = has_answers(text, answers, tokenizer, args.regex) + if args.store_raw: + retrieval[question_id]['contexts'].append( + {'docid': doc_id, + 'score': score, + 'text': ctx, + 'has_answer': answer_exist} + ) + else: + retrieval[question_id]['contexts'].append( + {'docid': doc_id, 'score': score, 'has_answer': answer_exist} + ) + + json.dump(retrieval, open(args.output, 'w'), indent=4, ensure_ascii=False) diff --git a/pyserini/eval/evaluate_dpr_retrieval.py b/pyserini/eval/evaluate_dpr_retrieval.py new file mode 100644 index 0000000000000000000000000000000000000000..e494cd7bcfd746931f7cba8096a52a5f164f4041 --- /dev/null +++ b/pyserini/eval/evaluate_dpr_retrieval.py @@ -0,0 +1,280 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Most of the tokenization code here is copied from Facebook/DPR & DrQA codebase to avoid adding an extra dependency +""" + +import argparse +import copy +import json +import logging +import re +import unicodedata +from tqdm import tqdm +import numpy as np + +import regex + +logger = logging.getLogger(__name__) + + +class Tokens(object): + """A class to represent a list of tokenized text.""" + TEXT = 0 + TEXT_WS = 1 + SPAN = 2 + POS = 3 + LEMMA = 4 + NER = 5 + + def __init__(self, data, annotators, opts=None): + self.data = data + self.annotators = annotators + self.opts = opts or {} + + def __len__(self): + """The number of tokens.""" + return len(self.data) + + def slice(self, i=None, j=None): + """Return a view of the list of tokens from [i, j).""" + new_tokens = copy.copy(self) + new_tokens.data = self.data[i: j] + return new_tokens + + def untokenize(self): + """Returns the original text (with whitespace reinserted).""" + return ''.join([t[self.TEXT_WS] for t in self.data]).strip() + + def words(self, uncased=False): + """Returns a list of the text of each token + Args: + uncased: lower cases text + """ + if uncased: + return [t[self.TEXT].lower() for t in self.data] + else: + return [t[self.TEXT] for t in self.data] + + def offsets(self): + """Returns a list of [start, end) character offsets of each token.""" + return [t[self.SPAN] for t in self.data] + + def pos(self): + """Returns a list of part-of-speech tags of each token. + Returns None if this annotation was not included. + """ + if 'pos' not in self.annotators: + return None + return [t[self.POS] for t in self.data] + + def lemmas(self): + """Returns a list of the lemmatized text of each token. + Returns None if this annotation was not included. 
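# Usage sketch (illustrative): the converter above and the evaluator below form a
# two-step pipeline; the topic set, index, and file names here are assumptions. A TREC
# run is first joined against the index's raw passages to produce a DPR-style json,
# which is then scored as top-k answer recall.
import os

os.system("python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run "
          "--topics dpr-nq-test --index wikipedia-dpr "
          "--input runs/run.nq-test.bm25.trec "
          "--output runs/run.nq-test.bm25.json")

os.system("python -m pyserini.eval.evaluate_dpr_retrieval "
          "--retrieval runs/run.nq-test.bm25.json --topk 5 20 100")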
+ """ + if 'lemma' not in self.annotators: + return None + return [t[self.LEMMA] for t in self.data] + + def entities(self): + """Returns a list of named-entity-recognition tags of each token. + Returns None if this annotation was not included. + """ + if 'ner' not in self.annotators: + return None + return [t[self.NER] for t in self.data] + + def ngrams(self, n=1, uncased=False, filter_fn=None, as_strings=True): + """Returns a list of all ngrams from length 1 to n. + Args: + n: upper limit of ngram length + uncased: lower cases text + filter_fn: user function that takes in an ngram list and returns + True or False to keep or not keep the ngram + as_string: return the ngram as a string vs list + """ + + def _skip(gram): + if not filter_fn: + return False + return filter_fn(gram) + + words = self.words(uncased) + ngrams = [(s, e + 1) + for s in range(len(words)) + for e in range(s, min(s + n, len(words))) + if not _skip(words[s:e + 1])] + + # Concatenate into strings + if as_strings: + ngrams = ['{}'.format(' '.join(words[s:e])) for (s, e) in ngrams] + + return ngrams + + def entity_groups(self): + """Group consecutive entity tokens with the same NER tag.""" + entities = self.entities() + if not entities: + return None + non_ent = self.opts.get('non_ent', 'O') + groups = [] + idx = 0 + while idx < len(entities): + ner_tag = entities[idx] + # Check for entity tag + if ner_tag != non_ent: + # Chomp the sequence + start = idx + while (idx < len(entities) and entities[idx] == ner_tag): + idx += 1 + groups.append((self.slice(start, idx).untokenize(), ner_tag)) + else: + idx += 1 + return groups + + +class Tokenizer(object): + """Base tokenizer class. + Tokenizers implement tokenize, which should return a Tokens class. + """ + + def tokenize(self, text): + raise NotImplementedError + + def shutdown(self): + pass + + def __del__(self): + self.shutdown() + + +class SimpleTokenizer(Tokenizer): + ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+' + NON_WS = r'[^\p{Z}\p{C}]' + + def __init__(self, **kwargs): + """ + Args: + annotators: None or empty set (only tokenizes). + """ + self._regexp = regex.compile( + '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS), + flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE + ) + if len(kwargs.get('annotators', {})) > 0: + logger.warning('%s only tokenizes! 
Skipping annotators: %s' % + (type(self).__name__, kwargs.get('annotators'))) + self.annotators = set() + + def tokenize(self, text): + data = [] + matches = [m for m in self._regexp.finditer(text)] + for i in range(len(matches)): + # Get text + token = matches[i].group() + + # Get whitespace + span = matches[i].span() + start_ws = span[0] + if i + 1 < len(matches): + end_ws = matches[i + 1].span()[0] + else: + end_ws = span[1] + + # Format data + data.append(( + token, + text[start_ws: end_ws], + span, + )) + return Tokens(data, self.annotators) + + +def regex_match(text, pattern): + """Test if a regex pattern is contained within a text.""" + try: + pattern = re.compile( + pattern, + flags=re.IGNORECASE + re.UNICODE + re.MULTILINE, + ) + except BaseException: + return False + return pattern.search(text) is not None + + +def _normalize(text): + return unicodedata.normalize('NFD', text) + + +def has_answers(text, answers, tokenizer, regex=False): + text = _normalize(text) + if regex: + for ans in answers: + ans = _normalize(ans) + if regex_match(text, ans): + return True + else: + text = tokenizer.tokenize(text).words(uncased=True) + for ans in answers: + ans = _normalize(ans) + ans = tokenizer.tokenize(ans).words(uncased=True) + for i in range(0, len(text) - len(ans) + 1): + if ans == text[i: i + len(ans)]: + return True + return False + + +def evaluate_retrieval(retrieval_file, topk, regex=False): + tokenizer = SimpleTokenizer() + retrieval = json.load(open(retrieval_file)) + accuracy = { k : [] for k in topk } + max_k = max(topk) + + for qid in tqdm(list(retrieval.keys())): + answers = retrieval[qid]['answers'] + contexts = retrieval[qid]['contexts'] + has_ans_idx = max_k # first index in contexts that has answers + + for idx, ctx in enumerate(contexts): + if idx >= max_k: + break + if 'has_answer' in ctx: + if ctx['has_answer']: + has_ans_idx = idx + break + else: + text = ctx['text'].split('\n')[1] # [0] is title, [1] is text + if has_answers(text, answers, tokenizer, regex): + has_ans_idx = idx + break + + for k in topk: + accuracy[k].append(0 if has_ans_idx >= k else 1) + + for k in topk: + print(f'Top{k}\taccuracy: {np.mean(accuracy[k]):.4f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--retrieval', type=str, metavar='path', + help="Path to retrieval output file.") + parser.add_argument('--topk', type=int, nargs='+', help="topk to evaluate") + parser.add_argument('--regex', action='store_true', default=False, help="regex match") + args = parser.parse_args() + + evaluate_retrieval(args.retrieval, args.topk, args.regex) diff --git a/pyserini/eval/evaluate_kilt_retrieval.py b/pyserini/eval/evaluate_kilt_retrieval.py new file mode 100644 index 0000000000000000000000000000000000000000..6de8ac2ece2718cdc5f33e25e1b950770d04ad6f --- /dev/null +++ b/pyserini/eval/evaluate_kilt_retrieval.py @@ -0,0 +1,377 @@ +# NOTE: This code is taken from the original KILT library's retrieval evaluation script +# https://github.com/facebookresearch/KILT/blob/9bcb119a7ed5fda88826058b062d0e45c726c676/kilt/eval_retrieval.py + +import argparse +import pprint +import json +from collections import defaultdict, OrderedDict + +import os +from pyserini.query_iterator import KiltQueryIterator + + +########################################################################################## +# Replaced: +# from kilt import kilt_utils +# With the following directly imported code: + +def load_data(filename): + data = [] + with open(filename, "r") as fin: + lines = 
fin.readlines() + for line in lines: + data.append(json.loads(line)) + return data + + +########################################################################################## +# Replaced: +# from kilt import eval_downstream +# With the following directly imported code: + +def validate_input(gold_records, guess_records): + + if len(gold_records) != len(guess_records): + print( + "WARNING: DIFFERENT SIZE gold: {} guess: {}".format( + len(gold_records), len(guess_records) + ) + ) + + # align order + gold_ids = [] + for gold in gold_records: + assert str(gold["id"]).strip() not in gold_ids, "Gold IDs should be unique" + gold_ids.append(str(gold["id"]).strip()) + + id2guess_record = {} + for guess in guess_records: + assert ( + str(guess["id"]).strip() not in id2guess_record + ), "Prediction IDs should be unique" + id2guess_record[str(guess["id"]).strip()] = guess + + guess_records = [] + for id in gold_ids: + if id in id2guess_record: + guess_records.append(id2guess_record[id]) + else: + raise ValueError("ERROR: no prediction provided for id: {}".format(id)) + + return gold_records, guess_records + +########################################################################################## + + +def _remove_duplicates(obj): + obj_tmp = [] + for o in obj: + if o not in obj_tmp: + obj_tmp.append(o) + return obj_tmp + + +def _get_ids_list(datapoint, rank_keys, verbose=False): + # collect all gold ids + ids_list = [] + for output in datapoint["output"]: + current_ids_list = [] + if "provenance" in output: + for provenance in output["provenance"]: + if any(rank_key not in provenance for rank_key in rank_keys): + missing = set(rank_keys) - set( + list(provenance.keys()) + ).intersection(set(rank_keys)) + if verbose: + print( + f"WARNING: missing key(s) {missing} in provenance, unable to compute retrieval for those." + ) + else: + current_ids_list.append( + "+".join( + [ + str(provenance[rank_key]).strip() + for rank_key in rank_keys + ] + ) + ) + ids_list.append(_remove_duplicates(current_ids_list)) # remove duplicates + + # consider only unique ids + return ids_list + + +def get_rank(guess_item, gold_item, k, rank_keys, verbose=False): + """ + The main idea is to consider each evidence set as a single point in the rank. + The score in the rank for an evidence set is given by the lowest scored evidence in the set. + """ + + assert k > 0, "k must be a positive integer grater than 0." + + rank = [] + num_distinct_evidence_sets = 0 + + guess_ids = _get_ids_list(guess_item, rank_keys)[0] + + if guess_ids and len(guess_ids) > 0: + + # 1. collect evidence sets and their sizes + evidence_sets = [] + e_size = defaultdict(int) + for output in gold_item["output"]: + if "provenance" in output: + e_set = { + "+".join( + [str(provenance[rank_key]).strip() for rank_key in rank_keys] + ) + for provenance in output["provenance"] + } + if e_set not in evidence_sets: # no duplicate evidence set + evidence_sets.append(e_set) + e_size[len(e_set)] += 1 + num_distinct_evidence_sets = len(evidence_sets) + + # 2. 
check what's the minimum number of predicted pages needed to get a robust P/R@k + min_prediction_size = 0 + c = 0 + for size, freq in sorted(e_size.items(), reverse=True): + for _ in range(freq): + min_prediction_size += size + c += 1 + if c == k: + break + if c == k: + break + # if the number of evidence sets is smaller than k + min_prediction_size += k - c + + if verbose and len(guess_ids) < min_prediction_size: + print( + f"WARNING: you should provide at least {min_prediction_size} provenance items for a robust recall@{k} computation (you provided {len(guess_ids)} item(s))." + ) + + # 3. rank by gruping pages in each evidence set (each evidence set count as 1), + # the position in the rank of each evidence set is given by the last page in guess_ids + # non evidence pages counts as 1 + rank = [] + for guess_id in guess_ids: + guess_id = str(guess_id).strip() + found = False + for idx, e_set in enumerate(evidence_sets): + + e_set_id = f"evidence_set:{idx}" + + if guess_id in e_set: + found = True + + # remove from the rank previous points referring to this evidence set + if e_set_id in rank: + rank.remove(e_set_id) + + # remove the guess_id from the evidence set + e_set.remove(guess_id) + + if len(e_set) == 0: + # it was the last evidence, it counts as true in the rank + rank.append(True) + else: + # add a point for this partial evidence set + rank.append(e_set_id) + + if not found: + rank.append(False) + + return rank, num_distinct_evidence_sets + + +# 1. Precision computation +def _precision_at_k(rank, k): + + # precision @ k + p = rank[:k].count(True) / k + + return p + + +# 2. Recall computation +def _recall_at_k(rank, num_distinct_evidence_sets, k): + + r = rank[:k].count(True) / num_distinct_evidence_sets + + return r + + +# 3. Success rate computation +def _success_rate_at_k(rank, k): + + # success rate @ k + p = int(True in rank[:k]) + + return p + + +def _computeRprec(guess_ids, gold_ids): + + R = len(gold_ids) + num = 0 + + for prediction in guess_ids[:R]: + if str(prediction).strip() in gold_ids: + num += 1 + + Rprec = num / R if R > 0 else 0 + return Rprec + + +# R-precision https://link.springer.com/referenceworkentry/10.1007%2F978-0-387-39940-9_486 +def rprecision(guess_item, gold_item, rank_keys): + gold_ids_list = _get_ids_list(gold_item, rank_keys) + guess_ids = _get_ids_list(guess_item, rank_keys)[0] + Rprec_vector = [] + for gold_ids in gold_ids_list: + Rprec = _computeRprec(guess_ids, gold_ids) + Rprec_vector.append(Rprec) + return max(Rprec_vector) + + +def get_ranking_metrics(guess_item, gold_item, ks, rank_keys): + + Rprec = 0 + P_at_k = {"precision@{}".format(k): 0 for k in sorted(ks) if k > 0} + R_at_k = {"recall@{}".format(k): 0 for k in sorted(ks) if k > 1} + S_at_k = {"success_rate@{}".format(k): 0 for k in sorted(ks) if k > 1} + + assert ( + "output" in guess_item and len(guess_item["output"]) == 1 + ), f"guess should provide exactly one output for {guess_item['id']}" + + Rprec = rprecision(guess_item, gold_item, rank_keys=rank_keys) + for k in ks: + + # 0. get rank + rank, num_distinct_evidence_sets = get_rank( + guess_item, gold_item, k, rank_keys=rank_keys + ) + + if num_distinct_evidence_sets > 0: + + # 1. precision + P_at_k["precision@{}".format(k)] = _precision_at_k(rank, k) + + # 2. recall + R_at_k["recall@{}".format(k)] = _recall_at_k( + rank, num_distinct_evidence_sets, k + ) + + # 3. 
success rate + S_at_k["success_rate@{}".format(k)] = _success_rate_at_k(rank, k) + + # else: + # print( + # "WARNING: the number of distinct evidence sets is 0 for {}".format( + # gold_item + # ) + # ) + + return {"Rprec": Rprec, **P_at_k, **R_at_k, **S_at_k} + + +def compute(gold_dataset, guess_dataset, ks, rank_keys): + + ks = sorted([int(x) for x in ks]) + + result = OrderedDict() + result["Rprec"] = 0.0 + for k in ks: + if k > 0: + result["precision@{}".format(k)] = 0.0 + if k > 1: + result["recall@{}".format(k)] = 0.0 + result["success_rate@{}".format(k)] = 0.0 + + assert len(guess_dataset) == len( + gold_dataset + ), "different size gold: {} guess: {}".format(len(guess_dataset), len(gold_dataset)) + + for gold, guess in zip(guess_dataset, gold_dataset): + assert ( + str(gold["id"]).strip() == str(guess["id"]).strip() + ), "Items must have same order with same IDs" + + for guess_item, gold_item in zip(guess_dataset, gold_dataset): + ranking_metrics = get_ranking_metrics(guess_item, gold_item, ks, rank_keys) + result["Rprec"] += ranking_metrics["Rprec"] + for k in ks: + if k > 0: + result["precision@{}".format(k)] += ranking_metrics[ + "precision@{}".format(k) + ] + if k > 1: + result["recall@{}".format(k)] += ranking_metrics["recall@{}".format(k)] + result["success_rate@{}".format(k)] += ranking_metrics[ + "success_rate@{}".format(k) + ] + + if len(guess_dataset) > 0: + result["Rprec"] /= len(guess_dataset) + for k in ks: + if k > 0: + result["precision@{}".format(k)] /= len(guess_dataset) + if k > 1: + result["recall@{}".format(k)] /= len(guess_dataset) + result["success_rate@{}".format(k)] /= len(guess_dataset) + + return result + + +def evaluate(gold, guess, ks, rank_keys): + pp = pprint.PrettyPrinter(indent=4) + + gold_dataset = load_data(gold) + guess_dataset = load_data(guess) + + # 0. validate input + gold_dataset, guess_dataset = validate_input( + gold_dataset, guess_dataset + ) + + # 1. 
get retrieval metrics + result = compute(gold_dataset, guess_dataset, ks, rank_keys) + + pp.pprint(result) + return result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("guess", help="Guess KILT file") + parser.add_argument("gold", help="Gold KILT file") + parser.add_argument( + "--ks", + type=str, + required=False, + default="1,5,10,20", + help="Comma separated list of positive integers for recall@k and precision@k", + ) + parser.add_argument( + "--rank_keys", + type=str, + required=False, + default="wikipedia_id", + help="Comma separated list of rank keys for recall@k and precision@k", + ) + + args = parser.parse_args() + args.ks = [int(k) for k in args.ks.split(",")] + args.rank_keys = [rank_key for rank_key in args.rank_keys.split(",")] + + ########################################################################################## + # Pyserini change: + # Download gold file if necessary + gold = args.gold + if not os.path.exists(args.gold): + gold = KiltQueryIterator.download_kilt_topics(gold) + ########################################################################################## + + evaluate(gold, args.guess, args.ks, args.rank_keys) diff --git a/pyserini/eval/evaluate_qa_overlap_retrieval.py b/pyserini/eval/evaluate_qa_overlap_retrieval.py new file mode 100644 index 0000000000000000000000000000000000000000..dd5163922e4ab895fd989f2d2fee4555cdf8475a --- /dev/null +++ b/pyserini/eval/evaluate_qa_overlap_retrieval.py @@ -0,0 +1,326 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
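# Usage sketch (illustrative): the KILT evaluation above takes a guess file and a gold
# file as positional arguments; the file names here are assumptions, and the gold file
# is fetched through KiltQueryIterator when the given path does not exist locally.
import os

os.system("python -m pyserini.eval.evaluate_kilt_retrieval "
          "runs/run.nq-dev-kilt.jsonl nq-dev-kilt.jsonl "
          "--ks 1,5,10,20 --rank_keys wikipedia_id")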
+# + +""" +Most of the tokenization code here is copied from Facebook/DPR & DrQA codebase to avoid adding an extra dependency +""" + +import argparse +import copy +import json +import logging +import re +import unicodedata +from tqdm import tqdm +import numpy as np +import os +import regex +import collections + +logger = logging.getLogger(__name__) + + +DIRNAME = os.path.dirname(os.path.abspath(__file__)) +# download dependencies +if not os.path.exists('data/nq-annotations.jsonl'): + ANNOTATIONS_TO_DOWNLOAD = [ + ('https://dl.fbaipublicfiles.com/qaoverlap/data/nq-annotations.jsonl','nq-annotations.jsonl'), + ('https://dl.fbaipublicfiles.com/qaoverlap/data/triviaqa-annotations.jsonl', 'triviaqa-annotations.jsonl'), + ('https://dl.fbaipublicfiles.com/qaoverlap/data/webquestions-annotations.jsonl','webquestions-annotations.jsonl') + ] + + for link, dest in ANNOTATIONS_TO_DOWNLOAD: + os.system(f'wget {link} -P data/') + +ANNOTATION_PATHS = { + 'tqa': os.path.join(DIRNAME, '../../data/triviaqa-annotations.jsonl'), + 'nq': os.path.join(DIRNAME, '../../data/nq-annotations.jsonl'), + 'webquestions': os.path.join(DIRNAME, '../../data/webquestions-annotations.jsonl'), +} + +class Tokens(object): + """A class to represent a list of tokenized text.""" + TEXT = 0 + TEXT_WS = 1 + SPAN = 2 + POS = 3 + LEMMA = 4 + NER = 5 + + def __init__(self, data, annotators, opts=None): + self.data = data + self.annotators = annotators + self.opts = opts or {} + + def __len__(self): + """The number of tokens.""" + return len(self.data) + + def slice(self, i=None, j=None): + """Return a view of the list of tokens from [i, j).""" + new_tokens = copy.copy(self) + new_tokens.data = self.data[i: j] + return new_tokens + + def untokenize(self): + """Returns the original text (with whitespace reinserted).""" + return ''.join([t[self.TEXT_WS] for t in self.data]).strip() + + def words(self, uncased=False): + """Returns a list of the text of each token + Args: + uncased: lower cases text + """ + if uncased: + return [t[self.TEXT].lower() for t in self.data] + else: + return [t[self.TEXT] for t in self.data] + + def offsets(self): + """Returns a list of [start, end) character offsets of each token.""" + return [t[self.SPAN] for t in self.data] + + def pos(self): + """Returns a list of part-of-speech tags of each token. + Returns None if this annotation was not included. + """ + if 'pos' not in self.annotators: + return None + return [t[self.POS] for t in self.data] + + def lemmas(self): + """Returns a list of the lemmatized text of each token. + Returns None if this annotation was not included. + """ + if 'lemma' not in self.annotators: + return None + return [t[self.LEMMA] for t in self.data] + + def entities(self): + """Returns a list of named-entity-recognition tags of each token. + Returns None if this annotation was not included. + """ + if 'ner' not in self.annotators: + return None + return [t[self.NER] for t in self.data] + + def ngrams(self, n=1, uncased=False, filter_fn=None, as_strings=True): + """Returns a list of all ngrams from length 1 to n. 
+ Args: + n: upper limit of ngram length + uncased: lower cases text + filter_fn: user function that takes in an ngram list and returns + True or False to keep or not keep the ngram + as_string: return the ngram as a string vs list + """ + + def _skip(gram): + if not filter_fn: + return False + return filter_fn(gram) + + words = self.words(uncased) + ngrams = [(s, e + 1) + for s in range(len(words)) + for e in range(s, min(s + n, len(words))) + if not _skip(words[s:e + 1])] + + # Concatenate into strings + if as_strings: + ngrams = ['{}'.format(' '.join(words[s:e])) for (s, e) in ngrams] + + return ngrams + + def entity_groups(self): + """Group consecutive entity tokens with the same NER tag.""" + entities = self.entities() + if not entities: + return None + non_ent = self.opts.get('non_ent', 'O') + groups = [] + idx = 0 + while idx < len(entities): + ner_tag = entities[idx] + # Check for entity tag + if ner_tag != non_ent: + # Chomp the sequence + start = idx + while (idx < len(entities) and entities[idx] == ner_tag): + idx += 1 + groups.append((self.slice(start, idx).untokenize(), ner_tag)) + else: + idx += 1 + return groups + + +class Tokenizer(object): + """Base tokenizer class. + Tokenizers implement tokenize, which should return a Tokens class. + """ + + def tokenize(self, text): + raise NotImplementedError + + def shutdown(self): + pass + + def __del__(self): + self.shutdown() + + +class SimpleTokenizer(Tokenizer): + ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+' + NON_WS = r'[^\p{Z}\p{C}]' + + def __init__(self, **kwargs): + """ + Args: + annotators: None or empty set (only tokenizes). + """ + self._regexp = regex.compile( + '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS), + flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE + ) + if len(kwargs.get('annotators', {})) > 0: + logger.warning('%s only tokenizes! 
Skipping annotators: %s' % + (type(self).__name__, kwargs.get('annotators'))) + self.annotators = set() + + def tokenize(self, text): + data = [] + matches = [m for m in self._regexp.finditer(text)] + for i in range(len(matches)): + # Get text + token = matches[i].group() + + # Get whitespace + span = matches[i].span() + start_ws = span[0] + if i + 1 < len(matches): + end_ws = matches[i + 1].span()[0] + else: + end_ws = span[1] + + # Format data + data.append(( + token, + text[start_ws: end_ws], + span, + )) + return Tokens(data, self.annotators) + + +def regex_match(text, pattern): + """Test if a regex pattern is contained within a text.""" + try: + pattern = re.compile( + pattern, + flags=re.IGNORECASE + re.UNICODE + re.MULTILINE, + ) + except BaseException: + return False + return pattern.search(text) is not None + + +def _normalize(text): + return unicodedata.normalize('NFD', text) + + +def read_jsonl(path): + with open(path) as f: + return [json.loads(l) for l in f] + + +def read_annotations(annotations_data_path): + return read_jsonl(annotations_data_path) + + +def has_answers(text, answers, tokenizer, regex=False): + text = _normalize(text) + if regex: + for ans in answers: + ans = _normalize(ans) + if regex_match(text, ans): + return True + else: + text = tokenizer.tokenize(text).words(uncased=True) + for ans in answers: + ans = _normalize(ans) + ans = tokenizer.tokenize(ans).words(uncased=True) + for i in range(0, len(text) - len(ans) + 1): + if ans == text[i: i + len(ans)]: + return True + return False + + +def evaluate_retrieval(retrieval_file, topk, annotation_file, regex=False): + tokenizer = SimpleTokenizer() + retrieval = json.load(open(retrieval_file)) + annotations = read_annotations(annotation_file) + annotation_ids = {int(a['id']): a['labels'] for a in annotations} + accuracy = { k : collections.defaultdict(list) for k in topk } + max_k = max(topk) + annotation_labels = [ + 'total', + 'no_overlap', + 'question_overlap', + 'no_question_overlap', + 'answer_overlap', + 'no_answer_overlap', + 'answer_overlap_only' + ] + + + for qid in retrieval.keys(): + answers = retrieval[qid]['answers'] + contexts = retrieval[qid]['contexts'] + has_ans_idx = max_k # first index in contexts that has answers + + for idx, ctx in enumerate(contexts): + if idx >= max_k: + break + if 'has_answer' in ctx: + if ctx['has_answer']: + has_ans_idx = idx + break + else: + text = ctx['text'].split('\n')[1] # [0] is title, [1] is text + if has_answers(text, answers, tokenizer, regex): + has_ans_idx = idx + break + + for annotation_label in annotation_labels: + if annotation_label in annotation_ids[int(qid)] or annotation_label == 'total' or \ + (annotation_label == 'no_overlap' and ('no_question_overlap' in annotation_ids[int(qid)]) and ('no_answer_overlap' in annotation_ids[int(qid)])): + for k in topk: + accuracy[k][annotation_label].append(0 if has_ans_idx >= k else 1) + + for k in topk: + for annotation_label in annotation_labels: + print(f'Top{k}\taccuracy: {np.mean(accuracy[k][annotation_label])} \t {annotation_label}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--retrieval', type=str, metavar='path', + help="Path to retrieval output file.") + parser.add_argument('--topk', type=int, nargs='+', help="topk to evaluate") + parser.add_argument('--regex', action='store_true', default=False, help="regex match") + parser.add_argument('--dataset_name', choices=['nq', 'tqa', 'webquestions'], type=str, + help='name of datset to evaluate on') + args = 
parser.parse_args() + + evaluate_retrieval(args.retrieval, args.topk, ANNOTATION_PATHS[args.dataset_name], args.regex) diff --git a/pyserini/eval/msmarco_doc_eval.py b/pyserini/eval/msmarco_doc_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..4b818cde3c924d231112eb9f7ec9948af0c08d4e --- /dev/null +++ b/pyserini/eval/msmarco_doc_eval.py @@ -0,0 +1,46 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import subprocess +import sys +import platform + +from pyserini.search import get_qrels_file +from pyserini.util import download_evaluation_script + +script_path = download_evaluation_script('msmarco_doc_eval') +cmd_prefix = ['python', script_path] +args = sys.argv +if len(args) > 1: + cmd = cmd_prefix + args[1:] + for i in range(len(cmd)-1): + if cmd[i] == '--judgments': + if not os.path.exists(cmd[i+1]): + cmd[i+1] = get_qrels_file(cmd[i + 1]) +else: + cmd = cmd_prefix +print(f'Running command: {cmd}') +shell = platform.system() == "Windows" +process = subprocess.Popen(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=shell) +stdout, stderr = process.communicate() +if stderr: + print(stderr.decode("utf-8")) +print('Results:') +print(stdout.decode("utf-8")) diff --git a/pyserini/eval/msmarco_passage_eval.py b/pyserini/eval/msmarco_passage_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..c5a07f950c9f0eaf00dbf60d8bcd16a69eadeafb --- /dev/null +++ b/pyserini/eval/msmarco_passage_eval.py @@ -0,0 +1,44 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
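A minimal usage sketch for the QA-overlap evaluation defined in `evaluate_qa_overlap_retrieval.py` above, assuming a retrieval JSON in the `{qid: {'answers': [...], 'contexts': [...]}}` layout the script reads (the run path is a hypothetical placeholder):

# Score a hypothetical DPR-style retrieval file against the NQ overlap annotations.
evaluate_retrieval('runs/run.dpr.nq-test.json',
                   topk=[20, 100],
                   annotation_file=ANNOTATION_PATHS['nq'],
                   regex=False)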
+# + +import os +import subprocess +import sys +import platform + +from pyserini.search import get_qrels_file +from pyserini.util import download_evaluation_script + +script_path = download_evaluation_script('msmarco_passage_eval') +cmd_prefix = ['python', script_path] +args = sys.argv +if len(args) > 1: + cmd = cmd_prefix + args[1:] + if not os.path.exists(cmd[-2]): + cmd[-2] = get_qrels_file(cmd[-2]) +else: + cmd = cmd_prefix +print(f'Running command: {cmd}') +shell = platform.system() == "Windows" +process = subprocess.Popen(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=shell) +stdout, stderr = process.communicate() +if stderr: + print(stderr.decode("utf-8")) +print('Results:') +print(stdout.decode("utf-8")) diff --git a/pyserini/eval/trec_eval.py b/pyserini/eval/trec_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..b72f7c9f81e0539fcd19d43cfa7664c6471b185e --- /dev/null +++ b/pyserini/eval/trec_eval.py @@ -0,0 +1,112 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Example usage +# python -m pyserini.eval.trec_eval -m ndcg_cut.10,20 -m all_trec qrels.dev.small.tsv runs/run.Colbert.txt -remove-unjudged -cutoffs.20,50 + + +import os +import re +import subprocess +import sys +import platform +import pandas as pd +import tempfile + +from pyserini.search import get_qrels_file +from pyserini.util import download_evaluation_script + +script_path = download_evaluation_script('trec_eval') +cmd_prefix = ['java', '-jar', script_path] +args = sys.argv + +# Option to discard non-judged hits in run file +judged_docs_only = '' +judged_result = [] +cutoffs = [] + +if '-remove-unjudged' in args: + judged_docs_only = args.pop(args.index('-remove-unjudged')) + +if any([i.startswith('judged.') for i in args]): + # Find what position the arg is in. + idx = [i.startswith('judged.') for i in args].index(True) + cutoffs = args.pop(idx) + cutoffs = list(map(int, cutoffs[7:].split(','))) + # Get rid of the '-m' before the 'judged.xxx' option + args.pop(idx-1) + +temp_file = '' + +if len(args) > 1: + if not os.path.exists(args[-2]): + args[-2] = get_qrels_file(args[-2]) + if os.path.exists(args[-1]): + # Convert run to trec if it's on msmarco + with open(args[-1]) as f: + first_line = f.readline() + if 'Q0' not in first_line: + temp_file = tempfile.NamedTemporaryFile(delete=False).name + print('msmarco run detected. 
Converting to trec...') + run = pd.read_csv(args[-1], delim_whitespace=True, header=None, names=['query_id', 'doc_id', 'rank']) + run['score'] = 1 / run['rank'] + run.insert(1, 'Q0', 'Q0') + run['name'] = 'TEMPRUN' + run.to_csv(temp_file, sep='\t', header=None, index=None) + args[-1] = temp_file + + run = pd.read_csv(args[-1], delim_whitespace=True, header=None) + qrels = pd.read_csv(args[-2], delim_whitespace=True, header=None) + + # cast doc_id column as string + run[0] = run[0].astype(str) + qrels[0] = qrels[0].astype(str) + + # Discard non-judged hits + if judged_docs_only: + if not temp_file: + temp_file = tempfile.NamedTemporaryFile(delete=False).name + judged_indexes = pd.merge(run[[0,2]].reset_index(), qrels[[0,2]], on = [0,2])['index'] + run = run.loc[judged_indexes] + run.to_csv(temp_file, sep='\t', header=None, index=None) + args[-1] = temp_file + # Measure judged@cutoffs + for cutoff in cutoffs: + run_cutoff = run.groupby(0).head(cutoff) + judged = len(pd.merge(run_cutoff[[0,2]], qrels[[0,2]], on = [0,2])) / len(run_cutoff) + metric_name = f'judged_{cutoff}' + judged_result.append(f'{metric_name:22}\tall\t{judged:.4f}') + cmd = cmd_prefix + args[1:] +else: + cmd = cmd_prefix + +print(f'Running command: {cmd}') +shell = platform.system() == "Windows" +process = subprocess.Popen(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=shell) +stdout, stderr = process.communicate() +if stderr: + print(stderr.decode("utf-8")) + +print('Results:') +print(stdout.decode("utf-8").rstrip()) + +for judged in judged_result: + print(judged) + +if temp_file: + os.remove(temp_file) diff --git a/pyserini/evaluate_script_info.py b/pyserini/evaluate_script_info.py new file mode 100644 index 0000000000000000000000000000000000000000..4e583578bda5cad2c3fce388bce6afe233669545 --- /dev/null +++ b/pyserini/evaluate_script_info.py @@ -0,0 +1,37 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
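A minimal sketch of how the wrapper's extra options above combine, under the assumption that `msmarco-passage-dev-subset` is a qrels name resolvable by `get_qrels_file` (the run path is a hypothetical placeholder):

import subprocess

# judged.10,20 reports the fraction of judged documents at cutoffs 10 and 20;
# -remove-unjudged drops unjudged hits before computing the other metrics.
cmd = ('python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 -m judged.10,20 '
       '-remove-unjudged msmarco-passage-dev-subset runs/run.bm25.msmarco-passage.txt')
print(subprocess.run(cmd.split(), capture_output=True, text=True).stdout)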
+# + +EVALUATION_INFO = { + "trec_eval": { + "description": "TREC evaluation script", + "urls": [ + "https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar", + ], + }, + "msmarco_passage_eval": { + "description": "MSMARCO-passage evaluation script", + "urls": [ + "https://raw.githubusercontent.com/castorini/anserini-tools/master/scripts/msmarco/msmarco_passage_eval.py", + ], + }, + "msmarco_doc_eval": { + "description": "MSMARCO-doc evaluation script", + "urls": [ + "https://raw.githubusercontent.com/castorini/anserini-tools/master/scripts/msmarco/msmarco_doc_eval.py", + ], + } + +} diff --git a/pyserini/external_query_info.py b/pyserini/external_query_info.py new file mode 100644 index 0000000000000000000000000000000000000000..ab953f8ba920f413042608df55a39045d0751bfb --- /dev/null +++ b/pyserini/external_query_info.py @@ -0,0 +1,95 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +KILT_QUERY_INFO = { + "fever-dev-kilt": { + "description": "KILT FEVER dev set", + "urls": ["http://dl.fbaipublicfiles.com/KILT/fever-dev-kilt.jsonl"], + "md5": "ae9a27503d177ba82cdb1e968b1aeac1", + "size (bytes)": 6174139, + "total_queries": 10444 + }, + "aidayago2-dev-kilt": { + "description": "KILT AIDA CoNLL-YAGO dev set", + "urls": ["http://dl.fbaipublicfiles.com/KILT/aidayago2-dev-kilt.jsonl"], + "md5": "262c2350c0a331b26cdcc70590f068f2", + "size (bytes)": 21061554, + "total_queries": 4784 + }, + "wned-dev-kilt": { + "description": "KILT WNED-WIKI dev set", + "urls": ["http://dl.fbaipublicfiles.com/KILT/wned-dev-kilt.jsonl"], + "md5": "b04e18e85c7f87030f5118c21f1297dc", + "size (bytes)": 12868348, + "total_queries": 3396 + }, + "cweb-dev-kilt": { + "description": "KILT WNED-CWEB dev set", + "urls": ["http://dl.fbaipublicfiles.com/KILT/cweb-dev-kilt.jsonl"], + "md5": "bb62b9471cdec028abbe91b19030e9ad", + "size (bytes)": 90228527, + "total_queries": 5599 + }, + "trex-dev-kilt": { + "description": "KILT T-REx dev set", + "urls": ["http://dl.fbaipublicfiles.com/KILT/trex-dev-kilt.jsonl"], + "md5": "ccd3c43891f08b2d5d9adf3e6885c8f9", + "size (bytes)": 3803558, + "total_queries": 5000 + }, + "structured_zeroshot-dev-kilt": { + "description": "KILT Zero-Shot RE dev set", + "urls": ["http://dl.fbaipublicfiles.com/KILT/structured_zeroshot-dev-kilt.jsonl"], + "md5": "b2cb14cb4b00a90352c9ad8317829cfd", + "size (bytes)": 2266707, + "total_queries": 3724 + }, + "nq-dev-kilt": { + "description": "KILT Natural Questions dev set", + "urls": ["http://dl.fbaipublicfiles.com/KILT/nq-dev-kilt.jsonl"], + "md5": "0bb57ca0b4676ed66005b8788d3a3050", + "size (bytes)": 7936566, + "total_queries": 2837 + }, + "hotpotqa-dev-kilt": { + "description": "KILT HotpotQA dev set", + "urls": ["http://dl.fbaipublicfiles.com/KILT/hotpotqa-dev-kilt.jsonl"], + "md5": "3ebc5eeaa5572ec29451eb4b66c29333", + "size (bytes)": 3971321, + "total_queries": 5600 + }, + "triviaqa-dev-kilt": { + 
"description": "KILT TriviaQA dev set, generated using KILT's scripts/get_triviaqa_input.py", + "urls": ["https://github.com/castorini/pyserini-data/raw/main/queries/triviaqa-dev-kilt.jsonl"], + "md5": "0eda82a7a3e24271d623710fa2a2ff64", + "size (bytes)": 10314686, + "total_queries": 5359 + }, + "eli5-dev-kilt": { + "description": "KILT ELI5 dev set", + "urls": ["http://dl.fbaipublicfiles.com/KILT/eli5-dev-kilt.jsonl"], + "md5": "7abac8b2495581d513b0542916178893", + "size (bytes)": 14149811, + "total_queries": 1507 + }, + "wow-dev-kilt": { + "description": "KILT Wizard of Wikipedia dev set", + "urls": ["http://dl.fbaipublicfiles.com/KILT/wow-dev-kilt.jsonl"], + "md5": "bf4000198be9d8acbab11a57745a6a8b", + "size (bytes)": 2418241, + "total_queries": 3058 + } +} diff --git a/pyserini/fusion/__init__.py b/pyserini/fusion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6eff3bfce97142fca2c7e138ca16afdbdfd1e82e --- /dev/null +++ b/pyserini/fusion/__init__.py @@ -0,0 +1,19 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from ._base import average, FusionMethod, interpolation, reciprocal_rank_fusion + +__all__ = ['FusionMethod', 'average', 'interpolation', 'reciprocal_rank_fusion'] diff --git a/pyserini/fusion/__main__.py b/pyserini/fusion/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..640754ec95ecf5a34bcb580e1e35f48f60f359af --- /dev/null +++ b/pyserini/fusion/__main__.py @@ -0,0 +1,49 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse +from ._base import FusionMethod +from pyserini.fusion import average, interpolation, reciprocal_rank_fusion +from ..trectools import TrecRun + + +parser = argparse.ArgumentParser(description='Perform various ways of fusion given a list of trec run files.') +parser.add_argument('--runs', type=str, nargs='+', default=[], required=True, + help='List of run files separated by space.') +parser.add_argument('--output', type=str, required=True, help="Path to resulting fused txt.") +parser.add_argument('--runtag', type=str, default="pyserini.fusion", help="Tag name of fused run.") +parser.add_argument('--method', type=FusionMethod, default=FusionMethod.RRF, help="The fusion method to be used.") +parser.add_argument('--rrf.k', dest='rrf_k', type=int, default=60, + help="Parameter k needed for reciprocal rank fusion.") +parser.add_argument('--alpha', type=float, default=0.5, required=False, help='Alpha value used for interpolation.') +parser.add_argument('--depth', type=int, default=1000, required=False, help='Pool depth per topic.') +parser.add_argument('--k', type=int, default=1000, required=False, help='Number of documents to output per topic.') +parser.add_argument('--resort', action='store_true', help='We resort the Trec run files or not') +args = parser.parse_args() + +trec_runs = [TrecRun(filepath=path,resort=args.resort) for path in args.runs] + +fused_run = None +if args.method == FusionMethod.RRF: + fused_run = reciprocal_rank_fusion(trec_runs, rrf_k=args.rrf_k, depth=args.depth, k=args.k) +elif args.method == FusionMethod.INTERPOLATION: + fused_run = interpolation(trec_runs, alpha=args.alpha, depth=args.depth, k=args.k) +elif args.method == FusionMethod.AVERAGE: + fused_run = average(trec_runs, depth=args.depth, k=args.k) +else: + raise NotImplementedError(f'Fusion method {args.method} not implemented.') + +fused_run.save_to_txt(args.output, tag=args.runtag) diff --git a/pyserini/fusion/__pycache__/__init__.cpython-310.pyc b/pyserini/fusion/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6815f48ab837a01da9d9e485fe91954abd21c0df Binary files /dev/null and b/pyserini/fusion/__pycache__/__init__.cpython-310.pyc differ diff --git a/pyserini/fusion/__pycache__/_base.cpython-310.pyc b/pyserini/fusion/__pycache__/_base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a275259efcd250b246d31d8970eb85abd16eab1 Binary files /dev/null and b/pyserini/fusion/__pycache__/_base.cpython-310.pyc differ diff --git a/pyserini/fusion/_base.py b/pyserini/fusion/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..674fc07e9ab98940e80992abdd4e360e5e6aa888 --- /dev/null +++ b/pyserini/fusion/_base.py @@ -0,0 +1,111 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
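The fusion CLI above has a direct programmatic counterpart; a minimal sketch, with hypothetical run paths:

from pyserini.fusion import reciprocal_rank_fusion
from pyserini.trectools import TrecRun

# Load two hypothetical runs and fuse them with reciprocal rank fusion.
runs = [TrecRun(filepath='runs/run.bm25.txt'), TrecRun(filepath='runs/run.dense.txt')]
fused = reciprocal_rank_fusion(runs, rrf_k=60, depth=1000, k=1000)
fused.save_to_txt('runs/run.rrf.txt', tag='pyserini.fusion')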
+# + +from enum import Enum +from pyserini.trectools import AggregationMethod, RescoreMethod, TrecRun +from typing import List + + +class FusionMethod(Enum): + RRF = 'rrf' + INTERPOLATION = 'interpolation' + AVERAGE = 'average' + + +def reciprocal_rank_fusion(runs: List[TrecRun], rrf_k: int = 60, depth: int = None, k: int = None): + """Perform reciprocal rank fusion on a list of ``TrecRun`` objects. Implementation follows Cormack et al. + (SIGIR 2009) paper titled "Reciprocal Rank Fusion Outperforms Condorcet and Individual Rank Learning Methods." + + Parameters + ---------- + runs : List[TrecRun] + List of ``TrecRun`` objects. + rrf_k : int + Parameter to avoid vanishing importance of lower-ranked documents. Note that this is different from the *k* in + top *k* retrieval; set to 60 by default, per Cormack et al. + depth : int + Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates that + the complete list of results is considered. + k : int + Length of final results list. Set to ``None`` by default, which indicates that the union of all input documents + are ranked. + + Returns + ------- + TrecRun + Output ``TrecRun`` that combines input runs via reciprocal rank fusion. + """ + + # TODO: Add option to *not* clone runs, thus making the method destructive, but also more efficient. + rrf_runs = [run.clone().rescore(method=RescoreMethod.RRF, rrf_k=rrf_k) for run in runs] + return TrecRun.merge(rrf_runs, AggregationMethod.SUM, depth=depth, k=k) + + +def interpolation(runs: List[TrecRun], alpha: int = 0.5, depth: int = None, k: int = None): + """Perform fusion by interpolation on a list of exactly two ``TrecRun`` objects. + new_score = first_run_score * alpha + (1 - alpha) * second_run_score. + + Parameters + ---------- + runs : List[TrecRun] + List of ``TrecRun`` objects. Exactly two runs. + alpha : int + Parameter alpha will be applied on the first run and (1 - alpha) will be applied on the second run. + depth : int + Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates that + the complete list of results is considered. + k : int + Length of final results list. Set to ``None`` by default, which indicates that the union of all input documents + are ranked. + + Returns + ------- + TrecRun + Output ``TrecRun`` that combines input runs via interpolation. + """ + + if len(runs) != 2: + raise Exception('Interpolation must be performed on exactly two runs.') + + scaled_runs = [] + scaled_runs.append(runs[0].clone().rescore(method=RescoreMethod.SCALE, scale=alpha)) + scaled_runs.append(runs[1].clone().rescore(method=RescoreMethod.SCALE, scale=(1-alpha))) + + return TrecRun.merge(scaled_runs, AggregationMethod.SUM, depth=depth, k=k) + + +def average(runs: List[TrecRun], depth: int = None, k: int = None): + """Perform fusion by averaging on a list of ``TrecRun`` objects. + + Parameters + ---------- + runs : List[TrecRun] + List of ``TrecRun`` objects. + depth : int + Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates that + the complete list of results is considered. + k : int + Length of final results list. Set to ``None`` by default, which indicates that the union of all input documents + are ranked. + + Returns + ------- + TrecRun + Output ``TrecRun`` that combines input runs via averaging. 
+ """ + + scaled_runs = [run.clone().rescore(method=RescoreMethod.SCALE, scale=(1/len(runs))) for run in runs] + return TrecRun.merge(scaled_runs, AggregationMethod.SUM, depth=depth, k=k) diff --git a/pyserini/hsearch.py b/pyserini/hsearch.py new file mode 100644 index 0000000000000000000000000000000000000000..7fd7605712ed496e7ac659fc38a50f6cb70b5551 --- /dev/null +++ b/pyserini/hsearch.py @@ -0,0 +1,38 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Deprecated. The package ``pyserini.hsearch` has been renamed `pyserini.search.hybrid`. Stubs are retained here for +redirection purpose to ensure that code in existing published papers remain function (with warnings).""" + +import os +import sys + +from pyserini.search.hybrid import HybridSearcher as NewHybridSearcher + +__all__ = ['HybridSearcher'] + + +class HybridSearcher(NewHybridSearcher): + def __new__(cls, *args, **kwargs): + print('pyserini.hsearch.HybridSearcher class has been deprecated, ' + 'please use HybridSearcher from pyserini.search.hybrid instead') + return super().__new__(cls) + + +if __name__ == "__main__": + print('WARNING: pyserini.hsearch is deprecated, please use pyserini.search.hybrid instead') + args = " ".join(sys.argv[1:]) + os.system(f'python -m pyserini.search.hybrid {args}') diff --git a/pyserini/index/__init__.py b/pyserini/index/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4120db77a55cf527c747e44a510b48f4b36cda44 --- /dev/null +++ b/pyserini/index/__init__.py @@ -0,0 +1,23 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Classes here have been moved to pyserini.index.lucene, e.g., the pyserini.index.Indexer is now +# pyserini.index.lucene.IndexReader. We're importing symbols here and then re-exporting to preserve +# backward compatability to code snippets published in Lin et al. (SIGIR 2021). 
+ +from .lucene._base import Document, Generator, IndexTerm, Posting, IndexReader + +__all__ = ['Document', 'Generator', 'IndexTerm', 'Posting', 'IndexReader'] diff --git a/pyserini/index/__main__.py b/pyserini/index/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..ecce089074c39e5f6e2475cf56a01dcb3fc400eb --- /dev/null +++ b/pyserini/index/__main__.py @@ -0,0 +1,34 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from jnius import autoclass +import sys +import os + +print('pyserini.index is deprecated, please use pyserini.index.lucene.') +args = sys.argv[1:] +# argument check +for i in range(len(args)): + # Convert double hyphen args into single hyphen args for Java: e.g., --input becomes -input + if args[i].startswith('--'): + args[i] = args[i][1:] + if args[i] == '-input': + collection_dir = args[i+1] + if os.path.isfile(collection_dir): + raise ValueError('Argument -input should be a directory.') + +JIndexCollection = autoclass('io.anserini.index.IndexCollection') +JIndexCollection.main(args) diff --git a/pyserini/index/__pycache__/__init__.cpython-310.pyc b/pyserini/index/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d825c5969ec882ae10eec45428783bf040006b39 Binary files /dev/null and b/pyserini/index/__pycache__/__init__.cpython-310.pyc differ diff --git a/pyserini/index/faiss.py b/pyserini/index/faiss.py new file mode 100644 index 0000000000000000000000000000000000000000..50712d378a3a6395373c1549c144a2c5f00a035c --- /dev/null +++ b/pyserini/index/faiss.py @@ -0,0 +1,82 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
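A small sketch of what the re-export in `pyserini/index/__init__.py` above guarantees, namely that the legacy and new import paths resolve to the same class:

from pyserini.index import IndexReader
from pyserini.index.lucene import IndexReader as LuceneIndexReader

# Both names are bound to the same class object by the re-export.
assert IndexReader is LuceneIndexReader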
+# + +import json +import os +import argparse +import shutil +import numpy as np + +import faiss +from tqdm import tqdm + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--input', type=str, help='path to embeddings directory', required=True) + parser.add_argument('--output', type=str, help='path to output index dir', required=True) + parser.add_argument('--dim', type=int, default=768, required=False) + parser.add_argument('--hnsw', action="store_true", required=False) + parser.add_argument('--M', type=int, default=256, required=False) + parser.add_argument('--efC', type=int, default=256, required=False) + parser.add_argument('--pq', action="store_true", required=False) + parser.add_argument('--pq-m', type=int, default=192, required=False) + parser.add_argument('--pq-nbits', type=int, default=8, required=False) + parser.add_argument('--threads', type=int, default=12, required=False) + args = parser.parse_args() + + faiss.omp_set_num_threads(args.threads) + + if not os.path.exists(args.output): + os.mkdir(args.output) + + if 'index' in os.listdir(args.input): + shutil.copy(os.path.join(args.input, 'docid'), os.path.join(args.output, 'docid')) + bf_index = faiss.read_index(os.path.join(args.input, 'index')) + vectors = bf_index.reconstruct_n(0, bf_index.ntotal) + else: + vectors = [] + with open(os.path.join(args.output, 'docid'), 'w') as f_out: + for filename in tqdm(os.listdir(args.input)): + path = os.path.join(args.input, filename) + with open(path) as f_in: + for line in f_in: + info = json.loads(line) + docid = info['id'] + vector = info['vector'] + f_out.write(f'{docid}\n') + vectors.append(vector) + vectors = np.array(vectors, dtype='float32') + print(vectors.shape) + + if args.hnsw and args.pq: + index = faiss.IndexHNSWPQ(args.dim, args.pq_m, args.M) + index.hnsw.efConstruction = args.efC + index.metric_type = faiss.METRIC_INNER_PRODUCT + elif args.hnsw: + index = faiss.IndexHNSWFlat(args.dim, args.M, faiss.METRIC_INNER_PRODUCT) + index.hnsw.efConstruction = args.efC + elif args.pq: + index = faiss.IndexPQ(args.dim, args.pq_m, args.pq_nbits, faiss.METRIC_INNER_PRODUCT) + else: + index = faiss.IndexFlatIP(args.dim) + index.verbose = True + + if args.pq: + index.train(vectors) + + index.add(vectors) + print(index.ntotal) + faiss.write_index(index, os.path.join(args.output, 'index')) diff --git a/pyserini/index/lucene/__init__.py b/pyserini/index/lucene/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7926753011dbc9e876747f8048c1e73a56e69394 --- /dev/null +++ b/pyserini/index/lucene/__init__.py @@ -0,0 +1,21 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
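A minimal sketch of driving the FAISS index builder above over a directory of embedding shards; all paths are hypothetical and the flags mirror the argparse options defined in the script:

import subprocess

subprocess.run([
    'python', '-m', 'pyserini.index.faiss',
    '--input', 'embeddings/msmarco-passage',     # directory of JSONL files with {'id': ..., 'vector': [...]}
    '--output', 'indexes/msmarco-passage-hnsw',  # receives the 'index' and 'docid' files
    '--hnsw', '--M', '256', '--efC', '256', '--threads', '12',
], check=True)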
+# + +from ._base import Document, Generator, IndexTerm, Posting, IndexReader +from ._indexer import LuceneIndexer, JacksonObjectMapper, JacksonJsonNode + +__all__ = ['Document', 'Generator', 'IndexTerm', 'Posting', 'IndexReader', 'LuceneIndexer', + 'JacksonObjectMapper', 'JacksonJsonNode'] \ No newline at end of file diff --git a/pyserini/index/lucene/__main__.py b/pyserini/index/lucene/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..01d5a05cb1c0268dfe7299aa1c7987c6d90bcc58 --- /dev/null +++ b/pyserini/index/lucene/__main__.py @@ -0,0 +1,36 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from jnius import autoclass +import sys +import os + + +if __name__ == '__main__': + args = sys.argv[1:] + for i in range(len(args)): + if args[i].startswith('--'): + args[i] = args[i][1:] + + # argument check + for i in range(len(args)): + if args[i] == '-input': + collection_dir = args[i+1] + if os.path.isfile(collection_dir): + raise ValueError('Argument -input should be a directory.') + + JIndexCollection = autoclass('io.anserini.index.IndexCollection') + JIndexCollection.main(args) diff --git a/pyserini/index/lucene/__pycache__/__init__.cpython-310.pyc b/pyserini/index/lucene/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6433203dfa737562b8778ec13e82768ef68fd8b Binary files /dev/null and b/pyserini/index/lucene/__pycache__/__init__.cpython-310.pyc differ diff --git a/pyserini/index/lucene/__pycache__/_base.cpython-310.pyc b/pyserini/index/lucene/__pycache__/_base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c14bf8ed2db38c8d7e585056ce433905b183659c Binary files /dev/null and b/pyserini/index/lucene/__pycache__/_base.cpython-310.pyc differ diff --git a/pyserini/index/lucene/__pycache__/_indexer.cpython-310.pyc b/pyserini/index/lucene/__pycache__/_indexer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e7a601224d83c34442c2daf552471970c46b85c Binary files /dev/null and b/pyserini/index/lucene/__pycache__/_indexer.cpython-310.pyc differ diff --git a/pyserini/index/lucene/_base.py b/pyserini/index/lucene/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..9cf5cee1d50d3fc71369ea95bc0c8e1733839d85 --- /dev/null +++ b/pyserini/index/lucene/_base.py @@ -0,0 +1,623 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This module provides Pyserini's Python interface for raw access to Lucene indexes built by Anserini. The main entry +point is the ``IndexReaderUtils`` class, which wraps the Java class with the same name in Anserini. Many of the classes +and methods provided are meant only to provide tools for examining an index and are not optimized for computing over. +""" + +import logging +from enum import Enum +from typing import Dict, Iterator, List, Optional, Tuple +from tqdm import tqdm +import json +import math + +from pyserini.analysis import get_lucene_analyzer, JAnalyzer, JAnalyzerUtils +from pyserini.pyclass import autoclass +from pyserini.util import download_prebuilt_index, get_sparse_indexes_info +from pyserini.prebuilt_index_info import TF_INDEX_INFO, IMPACT_INDEX_INFO + +logger = logging.getLogger(__name__) + + +# Wrappers around Anserini classes +JDocument = autoclass('org.apache.lucene.document.Document') +JIndexReader = autoclass('io.anserini.index.IndexReaderUtils') + + +class JIndexHelpers: + @staticmethod + def JArgs(): + args = autoclass('io.anserini.index.IndexCollection$Args')() + args.storeContents = True + args.storeRaw = True + args.dryRun = True ## So that indexing will be skipped + + return args + + @staticmethod + def JCounters(): + IndexCollection = autoclass('io.anserini.index.IndexCollection') + Counters = autoclass('io.anserini.index.IndexCollection$Counters') + + return Counters(IndexCollection) + + +class Document: + """Wrapper class for a Lucene ``Document``. + + Parameters + ---------- + document : JDocument + Underlying Lucene ``Document``. + """ + + def __init__(self, document): + if document is None: + raise ValueError('Cannot create a Document with None.') + self.object = document + + def docid(self: JDocument) -> str: + return self.object.getField('id').stringValue() + + def id(self: JDocument) -> str: + # Convenient alias for docid() + return self.object.getField('id').stringValue() + + def lucene_document(self: JDocument) -> JDocument: + return self.object + + def contents(self: JDocument) -> str: + return self.object.get('contents') + + def raw(self: JDocument) -> str: + return self.object.get('raw') + + def get(self: JDocument, field: str) -> str: + return self.object.get(field) + + +class JGenerators(Enum): + AclAnthologyGenerator = autoclass('io.anserini.index.generator.AclAnthologyGenerator') + DefaultLuceneDocumentGenerator = autoclass('io.anserini.index.generator.DefaultLuceneDocumentGenerator') + TweetGenerator = autoclass('io.anserini.index.generator.TweetGenerator') + WashingtonPostGenerator = autoclass('io.anserini.index.generator.WashingtonPostGenerator') + + +class Generator: + """Wrapper class for Anserini's generators. 
+ + Parameters + ---------- + generator_class : str + Name of generator class to instantiate + """ + + def __init__(self, generator_class): + self.counters = JIndexHelpers.JCounters() + self.args = JIndexHelpers.JArgs() + self.generator_class = generator_class + self.object = self._get_generator() + + def _get_generator(self): + try: + return JGenerators[self.generator_class].value(self.args) + except: + raise ValueError(self.generator_class) + + def create_document(self, document): + """ + Parameters + ---------- + document : pyserini.collection.pycollection.Document + Collection document to create Lucene document from + + Returns + ------- + result : org.apache.lucene.document.Document + Lucene document generated + """ + return self.object.createDocument(document.object) + + +class IndexTerm: + """Class representing an analyzed term in an index with associated statistics. + + Parameters + ---------- + term : str + Analyzed term. + df : int + Document frequency, the number of documents in the collection that contains the term. + cf : int + Collection frequency, the number of times the term occurs in the entire collection. This value is equal to the + sum of all the term frequencies of the term across all documents in the collection. + """ + + def __init__(self, term, df, cf): + self.term = term + self.df = df + self.cf = cf + + +class Posting: + """Class representing a posting in a postings list. + + Parameters + ---------- + docid : int + Collection ``docid``. + tf : int + Term frequency. + positions : List[int] + List of positions. + """ + + def __init__(self, docid, tf, positions): + self.docid = docid + self.tf = tf + self.positions = positions + + def __repr__(self): + repr = '(' + str(self.docid) + ', ' + str(self.tf) + ')' + if self.positions: + repr += ' [' + ','.join([str(p) for p in self.positions]) + ']' + return repr + + +class IndexReader: + """Wrapper class for ``IndexReaderUtils`` in Anserini. + + Parameters + ---------- + index_dir : str + Path to Lucene index directory. + """ + + def __init__(self, index_dir): + self.object = JIndexReader() + self.reader = self.object.getReader(index_dir) + + @classmethod + def from_prebuilt_index(cls, prebuilt_index_name: str, verbose=False): + """Build an index reader from a prebuilt index; download the index if necessary. + + Parameters + ---------- + prebuilt_index_name : str + Prebuilt index name. + verbose : bool + Print status information. + + Returns + ------- + IndexReader + Index reader built from the prebuilt index. + """ + if verbose: + print(f'Attempting to initialize pre-built index {prebuilt_index_name}.') + + try: + index_dir = download_prebuilt_index(prebuilt_index_name, verbose=verbose) + except ValueError as e: + print(str(e)) + return None + + if verbose: + print(f'Initializing {prebuilt_index_name}...') + + index_reader = cls(index_dir) + # Validate index stats; will throw exception there are any issues. + index_reader.validate(prebuilt_index_name, verbose=verbose) + + return index_reader + + @staticmethod + def list_prebuilt_indexes(): + """Display information about available prebuilt indexes.""" + get_sparse_indexes_info() + + def analyze(self, text: str, analyzer=None) -> List[str]: + """Analyze a piece of text. Applies Anserini's default Lucene analyzer if analyzer not specified. + + Parameters + ---------- + text : str + Text to analyze. + analyzer : analyzer + Analyzer to apply. + Returns + ------- + List[str] + List of tokens corresponding to the output of the analyzer. 
+ """ + if analyzer is None: + results = JAnalyzerUtils.analyze(text) + else: + results = JAnalyzerUtils.analyze(analyzer, text) + tokens = [] + for token in results.toArray(): + tokens.append(token) + return tokens + + def validate(self, prebuilt_index_name: str, verbose=False): + """Validate this index against stored stats for a pre-built index.""" + stats = self.stats() + + if prebuilt_index_name in TF_INDEX_INFO: + if stats['documents'] != TF_INDEX_INFO[prebuilt_index_name]['documents']: + raise ValueError('Pre-built index fails consistency check: "documents" does not match!') + if stats['unique_terms'] != TF_INDEX_INFO[prebuilt_index_name]['unique_terms']: + raise ValueError('Pre-built index fails consistency check: "unique_terms" does not match!') + if stats['total_terms'] != TF_INDEX_INFO[prebuilt_index_name]['total_terms']: + raise ValueError('Pre-built index fails consistency check: "total_terms" does not match!') + elif prebuilt_index_name in IMPACT_INDEX_INFO: + if stats['documents'] != IMPACT_INDEX_INFO[prebuilt_index_name]['documents']: + raise ValueError('Pre-built index fails consistency check: "documents" does not match!') + if stats['unique_terms'] != IMPACT_INDEX_INFO[prebuilt_index_name]['unique_terms']: + raise ValueError('Pre-built index fails consistency check: "unique_terms" does not match!') + if stats['total_terms'] != IMPACT_INDEX_INFO[prebuilt_index_name]['total_terms']: + raise ValueError('Pre-built index fails consistency check: "total_terms" does not match!') + else: + print(f'Unknown pre-built index \'{prebuilt_index_name}\'!') + return False + + if verbose: + print(stats) + print(f'Index passes consistency checks against pre-built index \'{prebuilt_index_name}\'!') + + return True + + def terms(self) -> Iterator[IndexTerm]: + """Return an iterator over analyzed terms in the index. + + Returns + ------- + Iterator[IndexTerm] + Iterator over :class:`IndexTerm` objects corresponding to (analyzed) terms in the index. + """ + term_iterator = self.object.getTerms(self.reader) + while term_iterator.hasNext(): + cur_term = term_iterator.next() + yield IndexTerm(cur_term.getTerm(), cur_term.getDF(), cur_term.getTotalTF()) + + def get_term_counts(self, term: str, analyzer: Optional[JAnalyzer] = get_lucene_analyzer()) -> Tuple[int, int]: + """Return the document frequency and collection frequency of a term. Applies Anserini's default Lucene + ``Analyzer`` if analyzer is not specified. + + Parameters + ---------- + term : str + Unanalyzed term. + analyzer : analyzer + Analyzer to apply. + + Returns + ------- + Tuple[int, int] + Document frequency and collection frequency. + """ + if analyzer is None: + analyzer = get_lucene_analyzer(stemming=False, stopwords=False) + + term_map = self.object.getTermCountsWithAnalyzer(self.reader, term, analyzer) + + return term_map.get('docFreq'), term_map.get('collectionFreq') + + def get_postings_list(self, term: str, analyzer=get_lucene_analyzer()) -> List[Posting]: + """Return the postings list for a term. + + Parameters + ---------- + term : str + Raw term. + analyzer : analyzer + Analyzer to apply. Defaults to Anserini's default. + + Returns + ------- + List[Posting] + List of :class:`Posting` objects corresponding to the postings list for the term. 
+ """ + if analyzer is None: + postings_list = self.object.getPostingsListForAnalyzedTerm(self.reader, term) + else: + postings_list = self.object.getPostingsListWithAnalyzer(self.reader, term, + analyzer) + + if postings_list is None: + return None + + result = [] + for posting in postings_list.toArray(): + result.append(Posting(posting.getDocid(), posting.getTF(), posting.getPositions())) + return result + + def get_document_vector(self, docid: str) -> Optional[Dict[str, int]]: + """Return the document vector for a ``docid``. Note that requesting the document vector of a ``docid`` that + does not exist in the index will return ``None`` (as opposed to an empty dictionary); this forces the caller + to handle ``None`` explicitly and guards against silent errors. + + Parameters + ---------- + docid : str + Collection ``docid``. + + Returns + ------- + Optional[Dict[str, int]] + A dictionary with analyzed terms as keys and their term frequencies as values. + """ + doc_vector_map = self.object.getDocumentVector(self.reader, docid) + if doc_vector_map is None: + return None + doc_vector_dict = {} + for term in doc_vector_map.keySet().toArray(): + doc_vector_dict[term] = doc_vector_map.get(term) + return doc_vector_dict + + def get_term_positions(self, docid: str) -> Optional[Dict[str, int]]: + """Return the term position mapping of the document with ``docid``. Note that the term in the document is + stemmed and stop words may be removed according to your index settings. Also, requesting the document vector of + a ``docid`` that does not exist in the index will return ``None`` (as opposed to an empty dictionary); this + forces the caller to handle ``None`` explicitly and guards against silent errors. + + Parameters + ---------- + docid : str + Collection ``docid``. + + Returns + ------- + Optional[Dict[str, int]] + A tuple contains a dictionary with analyzed terms as keys and corresponding posting list as values + """ + java_term_position_map = self.object.getTermPositions(self.reader, docid) + if java_term_position_map is None: + return None + term_position_map = {} + for term in java_term_position_map.keySet().toArray(): + term_position_map[term] = java_term_position_map.get(term).toArray() + return term_position_map + + def doc(self, docid: str) -> Optional[Document]: + """Return the :class:`Document` corresponding to ``docid``. Returns ``None`` if the ``docid`` does not exist + in the index. + + Parameters + ---------- + docid : str + The collection ``docid``. + + Returns + ------- + Optional[Document] + :class:`Document` corresponding to the ``docid``. + """ + lucene_document = self.object.document(self.reader, docid) + if lucene_document is None: + return None + return Document(lucene_document) + + def doc_by_field(self, field: str, q: str) -> Optional[Document]: + """Return the :class:`Document` based on a ``field`` with ``id``. For example, this method can be used to fetch + document based on alternative primary keys that have been indexed, such as an article's DOI. + + Parameters + ---------- + field : str + The field to look up. + q : str + The document's unique id. + + Returns + ------- + Optional[Document] + :class:`Document` whose ``field`` is ``id``. + """ + lucene_document = self.object.documentByField(self.reader, field, q) + if lucene_document is None: + return None + return Document(lucene_document) + + def doc_raw(self, docid: str) -> Optional[str]: + """Return the raw document contents for a collection ``docid``. 
+ + Parameters + ---------- + docid : str + Collection ``docid``. + + Returns + ------- + Optional[str] + Raw document contents. + """ + return self.object.documentRaw(self.reader, docid) + + def doc_contents(self, docid: str) -> Optional[str]: + """Return the indexed document contents for a collection ``docid``. + + Parameters + ---------- + docid : str + The collection ``docid``. + + Returns + ------- + Optional[str] + Index document contents. + """ + return self.object.documentContents(self.reader, docid) + + def compute_bm25_term_weight(self, docid: str, term: str, analyzer=get_lucene_analyzer(), k1=0.9, b=0.4) -> float: + """Compute the BM25 weight of a term in a document. Specify ``analyzer=None`` for an already analyzed term, + e.g., from the output of :func:`get_document_vector`. + + Parameters + ---------- + docid : str + Collection ``docid``. + term : str + Term. + analyzer : analyzer + Lucene analyzer to use, ``None`` if term is already analyzed. + k1 : float + BM25 k1 parameter. + b : float + BM25 b parameter. + + Returns + ------- + float + BM25 weight of the term in the document, or 0 if the term does not exist in the document. + """ + if analyzer is None: + return self.object.getBM25AnalyzedTermWeightWithParameters(self.reader, docid, + term, + float(k1), float(b)) + else: + return self.object.getBM25UnanalyzedTermWeightWithParameters(self.reader, docid, + term, analyzer, + float(k1), float(b)) + + def compute_query_document_score(self, docid: str, query: str, similarity=None): + if similarity is None: + return self.object.computeQueryDocumentScore(self.reader, docid, query) + else: + return self.object.computeQueryDocumentScoreWithSimilarity(self.reader, docid, query, similarity) + + def convert_internal_docid_to_collection_docid(self, docid: int) -> str: + """Convert Lucene's internal ``docid`` to its external collection ``docid``. + + Parameters + ---------- + docid : int + Lucene internal ``docid``. + + Returns + ------- + str + External collection ``docid`` corresponding to Lucene's internal ``docid``. + """ + return self.object.convertLuceneDocidToDocid(self.reader, docid) + + def convert_collection_docid_to_internal_docid(self, docid: str) -> int: + """Convert external collection ``docid`` to its Lucene's internal ``docid``. + + Parameters + ---------- + docid : str + External collection ``docid``. + + Returns + ------- + str + Lucene internal ``docid`` corresponding to the external collection ``docid``. + """ + return self.object.convertDocidToLuceneDocid(self.reader, docid) + + def stats(self) -> Dict[str, int]: + """Return dictionary with index statistics. + + Returns + ------- + Dict[str, int] + Index statistics as a dictionary of statistic's name to statistic. + - documents: number of documents + - non_empty_documents: number of non-empty documents + - unique_terms: number of unique terms + - total_terms: number of total terms + """ + index_stats_map = self.object.getIndexStats(self.reader) + + if index_stats_map is None: + return None + + index_stats_dict = {} + for term in index_stats_map.keySet().toArray(): + index_stats_dict[term] = index_stats_map.get(term) + + return index_stats_dict + + def dump_documents_BM25(self, file_path, k1=0.9, b=0.4): + """Dumps out all the document vectors with BM25 weights in Pyserini's JSONL vector format. + + Parameters + ---------- + file_path : str + File path to dump JSONL file. + k1 : float + BM25 k1 parameter. + b : float + BM25 b parameter. 
+ """ + + f = open(file_path, 'w') + + assert 'documents' in self.stats() + for i in tqdm(range(self.stats()['documents'])): + docid = self.convert_internal_docid_to_collection_docid(i) + bm25_vector = {} + for term in self.get_document_vector(docid): + bm25_vector[term] = self.compute_bm25_term_weight(docid, term, analyzer=None, k1=k1, b=b) + + # vectors are written line by line to avoid running out of memory + f.write(json.dumps({'id': docid, 'vector': bm25_vector}) + "\n") + + f.close() + + def quantize_weights(self, input_file_path, output_file_path, bits = 8): + """Takes vectors of weights in Pyserini's JSONL vector format and quantizes them. + + Parameters + ---------- + input_file_path : str + File path of vectors of weights in Pyserini's JSONL vector format. + output_file_path : str + File path to output JSONL file of quantized weight vectors. + bits : int + Number of bits to use to represent quantized scores. + """ + + min_weight = float('inf') + max_weight = float('-inf') + + input_file = open(input_file_path, 'r') + + # vectors are read line by line to avoid running out of memory + for line in input_file: + doc = json.loads(line) + for weight in doc['vector'].values(): + if weight > max_weight: + max_weight = weight + if weight < min_weight: + min_weight = weight + input_file.seek(0) + + output_file = open(output_file_path, 'w') + + smallest_impact = 1 + for line in input_file: + doc = json.loads(line) + for element in doc['vector']: + doc['vector'][element] = math.floor((2 ** bits - smallest_impact) * (doc['vector'][element] - min_weight) / (max_weight - min_weight)) + smallest_impact + output_file.write(json.dumps(doc) + "\n") + + input_file.close() + output_file.close() diff --git a/pyserini/index/lucene/_indexer.py b/pyserini/index/lucene/_indexer.py new file mode 100644 index 0000000000000000000000000000000000000000..8546ee31572eea5a5d1fa8d133417573285124aa --- /dev/null +++ b/pyserini/index/lucene/_indexer.py @@ -0,0 +1,121 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import logging +from typing import List, Dict + +from pyserini.pyclass import autoclass + +logger = logging.getLogger(__name__) + +JLuceneIndexer = autoclass('io.anserini.index.SimpleIndexer') +JsonCollectionDocument = autoclass('io.anserini.collection.JsonCollection$Document') +JacksonObjectMapper = autoclass('com.fasterxml.jackson.databind.ObjectMapper') +JacksonJsonNode = autoclass('com.fasterxml.jackson.databind.JsonNode') + + +class LuceneIndexer: + """Wrapper class for ``SimpleIndexer`` in Anserini. Provides basic functionality for on-the-fly indexing via a + programmatic API, i.e., indexing in-process objects as opposed to on-file documents. + + Parameters + ---------- + index_dir : str + Path to Lucene index directory. + args : List[str] + List of arguments to pass to ``SimpleIndexer``. + append : bool + Append to existing index. + threads : int + Number of indexing threads. 
+ """ + def __init__(self, index_dir: str = None, args: List[str] = None, append: bool = False, threads: int = 8): + self.index_dir = index_dir + self.args = args + if args: + args.extend(['-input', '', '-collection', 'JsonCollection', '-threads', str(threads)]) + if append: + args.extend(['-append']) + self.object = JLuceneIndexer(args) + else: + self.object = JLuceneIndexer(index_dir, append, int(threads)) + + self.mapper = JacksonObjectMapper() + + def add_doc_raw(self, doc: str): + """Add a raw document (in the form of a JSON string) to the index. + + Parameters + ---------- + doc : str + Document to add. + """ + self.object.addRawDocument(doc) + + def add_doc_dict(self, doc: Dict[str, str]): + """Add a document (in the form of a Python dictionary) to the index. + + Parameters + ---------- + doc : Dict[str, str] + Document to add. + """ + self.object.addJsonDocument(JsonCollectionDocument.fromFields(doc['id'], doc['contents'])) + + def add_doc_json(self, node: JacksonJsonNode): + """Add a document (in the form of a Jackson JSON node object) to the index. + + Parameters + ---------- + node : JacksonJsonNode + Document to add. + """ + self.object.addJsonNode(node) + + def add_batch_raw(self, docs: List[str]): + """Add a batch of raw documents (in the form of JSON strings) to the index. + + Parameters + ---------- + docs : List[str] + Documents to add. + """ + self.object.addRawDocuments(docs) + + def add_batch_dict(self, docs: List[Dict[str, str]]): + """Add a batch of documents (in the form of Python dictionaries) to the index. + + Parameters + ---------- + docs : List[Dict[str, str]] + Documents to add. + """ + docs = list(map(lambda d: JsonCollectionDocument.fromFields(d['id'], d['contents']), docs)) + self.object.addJsonDocuments(docs) + + def add_batch_json(self, nodes: List[JacksonJsonNode]): + """Add a batch of documents (in the form of Jackson JSON node objects) to the index. + + Parameters + ---------- + nodes : List[JacksonJsonNode] + Documents to add. + """ + self.object.addJsonNodes(nodes) + + def close(self): + """Close this indexer, committing all in-memory data to disk.""" + self.object.close() diff --git a/pyserini/index/merge_faiss_indexes.py b/pyserini/index/merge_faiss_indexes.py new file mode 100644 index 0000000000000000000000000000000000000000..5662aae9fe4ec844f89e85b7bf992341bd73321d --- /dev/null +++ b/pyserini/index/merge_faiss_indexes.py @@ -0,0 +1,46 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
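As a quick aside on the LuceneIndexer wrapper introduced above, the sketch below shows the intended on-the-fly indexing flow: construct the indexer, add in-process documents, and commit with close(). It assumes LuceneIndexer is re-exported from pyserini.index.lucene (consistent with the module layout in this diff) and that LuceneSearcher is available from pyserini.search.lucene; the directory path and documents are purely illustrative.

from pyserini.index.lucene import LuceneIndexer
from pyserini.search.lucene import LuceneSearcher

# Build a tiny index from in-process dictionaries (hypothetical path and toy documents).
indexer = LuceneIndexer('indexes/demo', threads=1)
indexer.add_batch_dict([
    {'id': 'doc1', 'contents': 'The quick brown fox jumps over the lazy dog.'},
    {'id': 'doc2', 'contents': 'Pyserini supports on-the-fly indexing via SimpleIndexer.'},
])
indexer.close()  # commits all in-memory data to disk

# The resulting index can then be searched like any other Lucene index.
searcher = LuceneSearcher('indexes/demo')
hits = searcher.search('fox', k=10)
print(hits[0].docid if hits else 'no hits')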
+# + +import argparse + +import faiss +import os + + +parser = argparse.ArgumentParser() +parser.add_argument('--dimension', type=int, help='dimension of passage embeddings', required=False, default=768) +parser.add_argument('--prefix', type=str, help='directory to store brute force index of corpus', required=True) +parser.add_argument('--shard-num', type=int, help='number of shards', default=1) +args = parser.parse_args() + +new_index = faiss.IndexFlatIP(args.dimension) +docid_files = [] +for i in range(args.shard_num): + index = faiss.read_index(os.path.join(args.prefix + str(i), 'index')) + docid_files.append(os.path.join(args.prefix + str(i), 'docid')) + vectors = index.reconstruct_n(0, index.ntotal) + new_index.add(vectors) + +if not os.path.exists(args.prefix + 'full'): + os.mkdir(args.prefix + 'full') + +faiss.write_index(new_index, os.path.join(args.prefix + 'full', 'index')) + +with open(os.path.join(args.prefix + 'full', 'docid'), 'w') as wfd: + for f in docid_files: + with open(f, 'r') as f1: + for line in f1: + wfd.write(line) diff --git a/pyserini/index/nmslib.py b/pyserini/index/nmslib.py new file mode 100644 index 0000000000000000000000000000000000000000..be658b4de82b15fa81c5fd573720a5008a88afad --- /dev/null +++ b/pyserini/index/nmslib.py @@ -0,0 +1,102 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
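To make the shard-merging logic of merge_faiss_indexes.py above concrete, here is a minimal self-contained sketch that mirrors its reconstruct-and-add loop on two toy in-memory shards (the 4-dimensional random vectors are illustrative only). In practice the module itself would be invoked, e.g. python -m pyserini.index.merge_faiss_indexes --prefix <shard-dir-prefix> --shard-num 2, which additionally concatenates the per-shard docid files.

import faiss
import numpy as np

dim = 4
# Two toy "shards", standing in for the per-shard flat indexes read from disk by the script.
shard0, shard1 = faiss.IndexFlatIP(dim), faiss.IndexFlatIP(dim)
shard0.add(np.random.rand(3, dim).astype('float32'))
shard1.add(np.random.rand(2, dim).astype('float32'))

# Merge by reconstructing every stored vector and adding it to a fresh flat index,
# exactly as the script does after faiss.read_index(...) on each shard.
merged = faiss.IndexFlatIP(dim)
for shard in (shard0, shard1):
    merged.add(shard.reconstruct_n(0, shard.ntotal))

assert merged.ntotal == shard0.ntotal + shard1.ntotal  # 5 vectors in total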
+# + +import argparse +import copy +import json +import os +import shutil +import time + +import faiss +import nmslib +from scipy.sparse import csr_matrix + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--input', type=str, help='path to embeddings directory', required=True) + parser.add_argument('--output', type=str, help='path to output index dir', required=True) + parser.add_argument('--M', type=int, default=256, required=False) + parser.add_argument('--efC', type=int, default=256, required=False) + parser.add_argument('--threads', type=int, default=12, required=False) + args = parser.parse_args() + + if not os.path.exists(args.output): + os.mkdir(args.output) + + is_sparse = False + + if 'index' in os.listdir(args.input): + shutil.copy(os.path.join(args.input, 'docid'), os.path.join(args.output, 'docid')) + bf_index = faiss.read_index(os.path.join(args.input, 'index')) + vectors = bf_index.reconstruct_n(0, bf_index.ntotal) + else: + vectors = [] + for filename in os.listdir(args.input): + path = os.path.join(args.input, filename) + with open(path) as f_in, open(os.path.join(args.output, 'docid'), 'w') as f_out: + for line in f_in: + info = json.loads(line) + docid = info['id'] + vector = info['vector'] + f_out.write(f'{docid}\n') + vectors.append(vector) + + tokens = set() + if isinstance(vectors[0], dict): + is_sparse = True + for vec in vectors: + for key in vec: + tokens.add(key) + token2id = {} + with open(os.path.join(args.output, 'tokens'), 'w') as f: + for idx, tok in enumerate(tokens): + token2id[tok] = idx + f.write(f'{tok}\n') + + if is_sparse: + matrix_row, matrix_col, matrix_data = [], [], [] + for i, vec in enumerate(vectors): + weight_dict = vec + tokens = weight_dict.keys() + col = [token2id[tok] for tok in tokens] + data = weight_dict.values() + matrix_row.extend([i] * len(weight_dict)) + matrix_col.extend(col) + matrix_data.extend(data) + vectors = csr_matrix((matrix_data, (matrix_row, matrix_col)), shape=(len(vectors), len(token2id))) + + M = args.M + efC = args.efC + num_threads = args.threads + index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0} + if is_sparse: + index = nmslib.init(method='hnsw', space='negdotprod_sparse', data_type=nmslib.DataType.SPARSE_VECTOR) + else: + index = nmslib.init(method='hnsw', space='negdotprod', data_type=nmslib.DataType.DENSE_VECTOR) + index.addDataPointBatch(vectors) + start = time.time() + index.createIndex(index_time_params, print_progress=True) + end = time.time() + index_time = end - start + print('Index-time parameters', index_time_params) + print('Indexing time = %f' % index_time) + index.saveIndex(os.path.join(args.output, 'index.bin'), save_data=True) + + metadata = copy.deepcopy(index_time_params) + metadata['index-time'] = index_time + metadata['type'] = 'sparse' if is_sparse else 'dense' + json.dump(metadata, open(os.path.join(args.output, 'meta'), 'w'), indent=4) diff --git a/pyserini/multithreading.py b/pyserini/multithreading.py new file mode 100644 index 0000000000000000000000000000000000000000..ab36c535f24f8056f0d8d3e14d99ccca4f03b514 --- /dev/null +++ b/pyserini/multithreading.py @@ -0,0 +1,39 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
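As a rough sanity check of the HNSW construction in nmslib.py above, the following sketch exercises the same nmslib calls on toy dense vectors, with the index-time parameters shrunk for speed; the query-time efSearch value is an illustrative assumption, not something set by the script itself.

import nmslib
import numpy as np

# Toy dense vectors standing in for the embeddings the script loads from disk.
data = np.random.rand(100, 16).astype('float32')

# Same initialization and construction calls as the dense branch of the script above.
index = nmslib.init(method='hnsw', space='negdotprod', data_type=nmslib.DataType.DENSE_VECTOR)
index.addDataPointBatch(data)
index.createIndex({'M': 16, 'indexThreadQty': 4, 'efConstruction': 64, 'post': 0}, print_progress=False)

# Retrieve the 5 nearest neighbors of the first vector under negative dot product.
index.setQueryTimeParams({'efSearch': 64})
ids, distances = index.knnQuery(data[0], k=5)
print(ids, distances)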
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import threading + + +class ThreadSafeCount: + + def __init__(self): + self.value = 0 + self.lock = threading.Lock() + + def increment(self, inc=1): + with self.lock: + self.value += inc + return self.value + + +class Counters: + + def __init__(self): + self.indexable = ThreadSafeCount() + self.unindexable = ThreadSafeCount() + self.skipped = ThreadSafeCount() + self.errors = ThreadSafeCount() + diff --git a/pyserini/output_writer.py b/pyserini/output_writer.py new file mode 100644 index 0000000000000000000000000000000000000000..2222552244f55d96fdc7ab0801199e28a22d8000 --- /dev/null +++ b/pyserini/output_writer.py @@ -0,0 +1,116 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import json +import os + +from abc import ABC, abstractmethod +from enum import Enum, unique +from typing import List + +from pyserini.search import JLuceneSearcherResult + + +@unique +class OutputFormat(Enum): + TREC = 'trec' + MSMARCO = "msmarco" + KILT = 'kilt' + + +class OutputWriter(ABC): + + def __init__(self, file_path: str, mode: str = 'w', + max_hits: int = 1000, tag: str = None, topics: dict = None, + use_max_passage: bool = False, max_passage_delimiter: str = None, max_passage_hits: int = 100): + self.file_path = file_path + self.mode = mode + self.tag = tag + self.topics = topics + self.use_max_passage = use_max_passage + self.max_passage_delimiter = max_passage_delimiter if use_max_passage else None + self.max_hits = max_passage_hits if use_max_passage else max_hits + self._file = None + + def __enter__(self): + dirname = os.path.dirname(self.file_path) + if dirname: + os.makedirs(dirname, exist_ok=True) + self._file = open(self.file_path, self.mode) + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + self._file.close() + + def hits_iterator(self, hits: List[JLuceneSearcherResult]): + unique_docs = set() + rank = 1 + for hit in hits: + if self.use_max_passage and self.max_passage_delimiter: + docid = hit.docid.split(self.max_passage_delimiter)[0] + else: + docid = hit.docid.strip() + + if self.use_max_passage: + if docid in unique_docs: + continue + unique_docs.add(docid) + + yield docid, rank, hit.score, hit + + rank = rank + 1 + if rank > self.max_hits: + break + + @abstractmethod + def write(self, topic: str, hits: List[JLuceneSearcherResult]): + raise NotImplementedError() + + +class TrecWriter(OutputWriter): + def write(self, topic: str, hits: List[JLuceneSearcherResult]): + for docid, rank, score, _ in self.hits_iterator(hits): + self._file.write(f'{topic} Q0 {docid} {rank} 
{score:.6f} {self.tag}\n') + + +class MsMarcoWriter(OutputWriter): + def write(self, topic: str, hits: List[JLuceneSearcherResult]): + for docid, rank, score, _ in self.hits_iterator(hits): + self._file.write(f'{topic}\t{docid}\t{rank}\n') + + +class KiltWriter(OutputWriter): + def write(self, topic: str, hits: List[JLuceneSearcherResult]): + datapoint = self.topics[topic] + provenance = [] + for docid, rank, score, _ in self.hits_iterator(hits): + provenance.append({"wikipedia_id": docid}) + datapoint["output"] = [{"provenance": provenance}] + json.dump(datapoint, self._file) + self._file.write('\n') + + +def get_output_writer(file_path: str, output_format: OutputFormat, *args, **kwargs) -> OutputWriter: + mapping = { + OutputFormat.TREC: TrecWriter, + OutputFormat.MSMARCO: MsMarcoWriter, + OutputFormat.KILT: KiltWriter, + } + return mapping[output_format](file_path, *args, **kwargs) + + +def tie_breaker(hits): + return sorted(hits, key=lambda x: (-x.score, x.docid)) diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py new file mode 100644 index 0000000000000000000000000000000000000000..9bde039b3ce943bf0b1f3d6f2c335450ae936359 --- /dev/null +++ b/pyserini/prebuilt_index_info.py @@ -0,0 +1,5679 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +TF_INDEX_INFO_MSMARCO = { + # MS MARCO V1 document corpus, three indexes with different amounts of information (and sizes). + "msmarco-v1-doc": { + "description": "Lucene index of the MS MARCO V1 document corpus. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc.20221004.252b5e.tar.gz", + ], + "md5": "b2b1841c93255f9902150128d5e27e41", + "size compressed (bytes)": 13736982438, + "total_terms": 2742219865, + "documents": 3213835, + "unique_terms": 29823777, + "downloaded": False + }, + "msmarco-v1-doc-slim": { + "description": "Lucene index of the MS MARCO V1 document corpus ('slim' version). (Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc-slim.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc-slim.20221004.252b5e.tar.gz", + ], + "md5": "400fe94ec97a20cf775596085c5ad79d", + "size compressed (bytes)": 1791498133, + "total_terms": 2742219865, + "documents": 3213835, + "unique_terms": 29823777, + "downloaded": False + }, + "msmarco-v1-doc-full": { + "description": "Lucene index of the MS MARCO V1 document corpus ('full' version). 
(Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc-full.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc-full.20221004.252b5e.tar.gz", + ], + "md5": "75735da0dd35e3631d22bf682ebed8a0", + "size compressed (bytes)": 25525615599, + "total_terms": 2742219865, + "documents": 3213835, + "unique_terms": 29823777, + "downloaded": False + }, + + # MS MARCO V1 document corpus, doc2query-T5 expansions. + "msmarco-v1-doc-d2q-t5": { + "description": "Lucene index of the MS MARCO V1 document corpus with doc2query-T5 expansions. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc-d2q-t5.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc-d2q-t5.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc-d2q-t5.20221004.252b5e.tar.gz", + ], + "md5": "87530b64e55080fcfb90ec9e598be23e", + "size compressed (bytes)": 1885596544, + "total_terms": 3748343494, + "documents": 3213835, + "unique_terms": 30631009, + "downloaded": False + }, + "msmarco-v1-doc-d2q-t5-docvectors": { + "description": "Lucene index (+docvectors) of the MS MARCO V1 document corpus with doc2query-T5 expansions. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc-d2q-t5-docvectors.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc-d2q-t5.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc-d2q-t5-docvectors.20221004.252b5e.tar.gz", + ], + "md5": "a081b866b78e0f604ddb9e3103ee6cc5", + "size compressed (bytes)": 11152231182, + "total_terms": 3748343494, + "documents": 3213835, + "unique_terms": 30631009, + "downloaded": False + }, + + # MS MARCO V1 segmented document corpus, three indexes with different amounts of information (and sizes). + "msmarco-v1-doc-segmented": { + "description": "Lucene index of the MS MARCO V1 segmented document corpus. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc-segmented.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc-segmented.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc-segmented.20221004.252b5e.tar.gz", + ], + "md5": "59fdf88f360d0a72d1b94b9729c2198e", + "size compressed (bytes)": 15924438098, + "total_terms": 3200522554, + "documents": 20545677, + "unique_terms": 21191748, + "downloaded": False + }, + "msmarco-v1-doc-segmented-slim": { + "description": "Lucene index of the MS MARCO V1 segmented document corpus ('slim' version). (Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc-segmented-slim.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc-segmented.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc-segmented-slim.20221004.252b5e.tar.gz", + ], + "md5": "c277161780d501ab832e16e6396f9cae", + "size compressed (bytes)": 3306727108, + "total_terms": 3200522554, + "documents": 20545677, + "unique_terms": 21191748, + "downloaded": False + }, + "msmarco-v1-doc-segmented-full": { + "description": "Lucene index of the MS MARCO V1 segmented document corpus ('full' version). 
(Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc-segmented-full.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc-segmented.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc-segmented-full.20221004.252b5e.tar.gz", + ], + "md5": "c1af97d16c552a99a23382639c4a668c", + "size compressed (bytes)": 29470600011, + "total_terms": 3200522554, + "documents": 20545677, + "unique_terms": 21191748, + "downloaded": False + }, + + # MS MARCO V1 segmented document corpus, doc2query-T5 expansions. + "msmarco-v1-doc-segmented-d2q-t5": { + "description": "Lucene index of the MS MARCO V1 segmented document corpus with doc2query-T5 expansions. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc-segmented-d2q-t5.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc-segmented-d2q-t5.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc-segmented-d2q-t5.20221004.252b5e.tar.gz", + ], + "md5": "b242fd9cb0982e87d0c667439cb6d59c", + "size compressed (bytes)": 3554554620, + "total_terms": 4206646183, + "documents": 20545677, + "unique_terms": 22055268, + "downloaded": False + }, + "msmarco-v1-doc-segmented-d2q-t5-docvectors": { + "description": "Lucene index (+docvectors) of the MS MARCO V1 segmented document corpus with doc2query-T5 expansions. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc-segmented-d2q-t5-docvectors.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc-segmented-d2q-t5.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc-segmented-d2q-t5-docvectors.20221004.252b5e.tar.gz", + ], + "md5": "40341fc2cf151b8c447a8e77f5e9f100", + "size compressed (bytes)": 16349673687, + "total_terms": 4206646183, + "documents": 20545677, + "unique_terms": 22055268, + "downloaded": False + }, + + # MS MARCO V1 passage corpus, three indexes with different amounts of information (and sizes). + "msmarco-v1-passage": { + "description": "Lucene index of the MS MARCO V1 passage corpus. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-passage.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz", + ], + "md5": "c697b18c9a0686ca760583e615dbe450", + "size compressed (bytes)": 2170758938, + "total_terms": 352316036, + "documents": 8841823, + "unique_terms": 2660824, + "downloaded": False + }, + "msmarco-v1-passage-slim": { + "description": "Lucene index of the MS MARCO V1 passage corpus ('slim' version). (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-slim.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-passage.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-slim.20221004.252b5e.tar.gz", + ], + "md5": "9f952db731ed7c3f2ec14010664ddcec", + "size compressed (bytes)": 491451085, + "total_terms": 352316036, + "documents": 8841823, + "unique_terms": 2660824, + "downloaded": False + }, + "msmarco-v1-passage-full": { + "description": "Lucene index of the MS MARCO V1 passage corpus ('full' version). 
(Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-full.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-passage.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-full.20221004.252b5e.tar.gz", + ], + "md5": "0ff5ceaae32333d3580ae594d460385c", + "size compressed (bytes)": 3720616158, + "total_terms": 352316036, + "documents": 8841823, + "unique_terms": 2660824, + "downloaded": False + }, + + # MS MARCO V1 passage corpus, doc2query-T5 expansions. + "msmarco-v1-passage-d2q-t5": { + "description": "Lucene index of the MS MARCO V1 passage corpus with doc2query-T5 expansions. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-d2q-t5.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-d2q-t5.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-d2q-t5.20221004.252b5e.tar.gz", + ], + "md5": "0a62959d300634aa0eb37e910aa4f4a7", + "size compressed (bytes)": 807866125, + "total_terms": 1986612263, + "documents": 8841823, + "unique_terms": 3929111, + "downloaded": False + }, + "msmarco-v1-passage-d2q-t5-docvectors": { + "description": "Lucene index (+docvectors) of the MS MARCO V1 passage corpus with doc2query-T5 expansions. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-d2q-t5-docvectors.20221004.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-d2q-t5.20221004.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-d2q-t5-docvectors.20221004.252b5e.tar.gz", + ], + "md5": "2530b20771c6f441073ff49a56ea9004", + "size compressed (bytes)": 4409861543, + "total_terms": 1986612263, + "documents": 8841823, + "unique_terms": 3929111, + "downloaded": False + }, + + # MS MARCO V1 indexes for LTR experiments. + "msmarco-passage-ltr": { + "description": "Lucene index of the MS MARCO passage corpus with four extra preprocessed fields for LTR. (Lucene 8)", + "filename": "index-msmarco-passage-ltr-20210519-e25e33f.tar.gz", + "readme": "index-msmarco-passage-ltr-20210519-e25e33f-readme.txt", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/index-msmarco-passage-ltr-20210519-e25e33f.tar.gz", + "https://vault.cs.uwaterloo.ca/s/8qFCaCtwabRfYQD/download" + ], + "md5": "a5de642c268ac1ed5892c069bdc29ae3", + "size compressed (bytes)": 14073966046, + "total_terms": 352316036, + "documents": 8841823, + "unique_terms": 2660824, + "downloaded": False + }, + "msmarco-doc-per-passage-ltr": { + "description": "Lucene index of the MS MARCO document per-passage corpus with four extra preprocessed fields for LTR. (Lucene 8)", + "filename": "index-msmarco-doc-per-passage-ltr-20211031-33e4151.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/index-msmarco-doc-per-passage-ltr-20211031-33e4151.tar.gz", + "https://vault.cs.uwaterloo.ca/s/kNdXMWXEsTt3fT8/download" + ], + "md5": "bd60e89041b4ebbabc4bf0cfac608a87", + "size compressed (bytes)": 45835520960, + "total_terms": 1232004740, + "documents": 20545628, + "unique_terms": 10123678, + "downloaded": False + }, + "msmarco-document-segment-ltr": { + "description": "Lucene index of the MS MARCO document segmented corpus with four extra preprocessed fields for LTR. 
(Lucene 8)", + "filename": "lucene-index.msmarco-doc-segmented.ibm.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-doc-segmented.ibm.tar.gz" + ], + "md5": "13064bdaf8e8a79222634d67ecd3ddb5", + "size compressed (bytes)": 98984853515, + "total_terms": 3197500226, + "documents": 20532330, + "unique_terms": -1, + "downloaded": False + }, + + # MS MARCO V2 document corpus, three indexes with different amounts of information (and sizes). + "msmarco-v2-doc": { + "description": "Lucene index of the MS MARCO V2 document corpus. (Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc.20220808.4d6d2a.tar.gz", + ], + "md5": "0599bd6ed5ee28390b279eb398ef0267", + "size compressed (bytes)": 63431299815, + "total_terms": 14165667143, + "documents": 11959635, + "unique_terms": 44860768, + "downloaded": False + }, + "msmarco-v2-doc-slim": { + "description": "Lucene index of the MS MARCO V2 document corpus ('slim' version). (Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc-slim.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-slim.20220808.4d6d2a.tar.gz", + ], + "md5": "4dfc5549e3c15abec4b9694542a376d1", + "size compressed (bytes)": 7172175394, + "total_terms": 14165667143, + "documents": 11959635, + "unique_terms": 44860768, + "downloaded": False + }, + "msmarco-v2-doc-full": { + "description": "Lucene index of the MS MARCO V2 document corpus ('full' version). (Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc-full.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-full.20220808.4d6d2a.tar.gz", + ], + "md5": "fc6f546898725617eb5ca7a144bef531", + "size compressed (bytes)": 119537276117, + "total_terms": 14165667143, + "documents": 11959635, + "unique_terms": 44860768, + "downloaded": False + }, + + # MS MARCO V2 document corpus, doc2query-T5 expansions. + "msmarco-v2-doc-d2q-t5": { + "description": "Lucene index of the MS MARCO V2 document corpus with doc2query-T5 expansions. (Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc-d2q-t5.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc-d2q-t5.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-d2q-t5.20220808.4d6d2a.tar.gz", + ], + "md5": "25514f77600a6be87aeb1c66c9107b89", + "size compressed (bytes)": 8155218407, + "total_terms": 19760783236, + "documents": 11959635, + "unique_terms": 54148271, + "downloaded": False + }, + "msmarco-v2-doc-d2q-t5-docvectors": { + "description": "Lucene index (+docvectors) of the MS MARCO V2 document corpus with doc2query-T5 expansions. 
(Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc-d2q-t5-docvectors.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc-d2q-t5.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-d2q-t5-docvectors.20220808.4d6d2a.tar.gz", + ], + "md5": "a3ce9b1146857a332825825623ab89e7", + "size compressed (bytes)": 54415612794, + "total_terms": 19760783236, + "documents": 11959635, + "unique_terms": 54148271, + "downloaded": False + }, + + # MS MARCO V2 segmented document corpus, three indexes with different amounts of information (and sizes). + "msmarco-v2-doc-segmented": { + "description": "Lucene index of the MS MARCO V2 segmented document corpus. (Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc-segmented.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc-segmented.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-segmented.20220808.4d6d2a.tar.gz" + ], + "md5": "8a5f444fa5a63cc5d4ddc3e6dd15faa0", + "size compressed (bytes)": 109269078191, + "total_terms": 24780918039, + "documents": 124131414, + "unique_terms": 29265408, + "downloaded": False + }, + "msmarco-v2-doc-segmented-slim": { + "description": "Lucene index of the MS MARCO V2 segmented document corpus ('slim' version). (Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc-segmented-slim.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc-segmented.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-segmented-slim.20220808.4d6d2a.tar.gz" + ], + "md5": "f50c591aa9a0a0126ebc4dc53c6306d7", + "size compressed (bytes)": 20852487058, + "total_terms": 24780918039, + "documents": 124131414, + "unique_terms": 29265408, + "downloaded": False + }, + "msmarco-v2-doc-segmented-full": { + "description": "Lucene index of the MS MARCO V2 segmented document corpus ('full' version). (Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc-segmented-full.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc-segmented.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-segmented-full.20220808.4d6d2a.tar.gz" + ], + "md5": "259b936d3591e48770da9dde153d1617", + "size compressed (bytes)": 201358944352, + "total_terms": 24780918039, + "documents": 124131414, + "unique_terms": 29265408, + "downloaded": False + }, + + # MS MARCO V2 segmented document corpus, doc2query-T5 expansions. + "msmarco-v2-doc-segmented-d2q-t5": { + "description": "Lucene index of the MS MARCO V2 segmented document corpus with doc2query-T5 expansions. (Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220808.4d6d2a.tar.gz" + ], + "md5": "1e9fa18f082aaadfef02ba9eea32fcc2", + "size compressed (bytes)": 24242738999, + "total_terms": 30376034132, + "documents": 124131414, + "unique_terms": 38932296, + "downloaded": False + }, + "msmarco-v2-doc-segmented-d2q-t5-docvectors": { + "description": "Lucene index (+docvectors) of the MS MARCO V2 segmented document corpus with doc2query-T5 expansions. 
(Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc-segmented-d2q-t5-docvectors.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-segmented-d2q-t5-docvectors.20220808.4d6d2a.tar.gz", + ], + "md5": "eff6fe5b61936491c8985ad7efa46b20", + "size compressed (bytes)": 114315186555, + "total_terms": 30376034132, + "documents": 124131414, + "unique_terms": 38932296, + "downloaded": False + }, + + # MS MARCO V2 passage corpus, three indexes with different amounts of information (and sizes). + "msmarco-v2-passage": { + "description": "Lucene index of the MS MARCO V2 passage corpus. (Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage.20220808.4d6d2a.tar.gz" + ], + "md5": "eacd8556dd416ccad517b5e7dc97bceb", + "size compressed (bytes)": 38808092190, + "total_terms": 4673266800, + "documents": 138364198, + "unique_terms": 11885838, + "downloaded": False + }, + "msmarco-v2-passage-slim": { + "description": "Lucene index of the MS MARCO V2 passage corpus ('slim' version). (Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage-slim.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage-slim.20220808.4d6d2a.tar.gz" + ], + "md5": "d7e644c048669aa72314dd358b475765", + "size compressed (bytes)": 8170344330, + "total_terms": 4673266800, + "documents": 138364198, + "unique_terms": 11885838, + "downloaded": False + }, + "msmarco-v2-passage-full": { + "description": "Lucene index of the MS MARCO V2 passage corpus ('full' version). (Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage-full.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage-full.20220808.4d6d2a.tar.gz" + ], + "md5": "ef5c22c865094c386b9ec600165bb061", + "size compressed (bytes)": 60413585958, + "total_terms": 4673266800, + "documents": 138364198, + "unique_terms": 11885838, + "downloaded": False + }, + + # MS MARCO V2 passage corpus, doc2query-T5 expansions. + "msmarco-v2-passage-d2q-t5": { + "description": "Lucene index of the MS MARCO V2 passage corpus with doc2query-T5 expansions. (Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage-d2q-t5.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage-d2q-t5.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage-d2q-t5.20220808.4d6d2a.tar.gz", + ], + "md5": "3c357f9c219e4c3d980bc663e1f5a5f4", + "size compressed (bytes)": 14404903785, + "total_terms": 16961479264, + "documents": 138364198, + "unique_terms": 36651533, + "downloaded": False + }, + "msmarco-v2-passage-d2q-t5-docvectors": { + "description": "Lucene index (+docvectors) of the MS MARCO V2 passage corpus with doc2query-T5 expansions. 
(Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage-d2q-t5-docvectors.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage-d2q-t5.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage-d2q-t5-docvectors.20220808.4d6d2a.tar.gz", + ], + "md5": "01e369b644e5a8b7413e04140780cf94", + "size compressed (bytes)": 59206472740, + "total_terms": 16961479264, + "documents": 138364198, + "unique_terms": 36651533, + "downloaded": False + }, + + # MS MARCO V2 augmented passage corpus, three indexes with different amounts of information (and sizes). + "msmarco-v2-passage-augmented": { + "description": "Lucene index of the MS MARCO V2 augmented passage corpus. (Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage-augmented.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage-augmented.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage-augmented.20220808.4d6d2a.tar.gz" + ], + "md5": "69675971a0172eb5e37668ea42761d43", + "size compressed (bytes)": 75036026507, + "total_terms": 15272965252, + "documents": 138364198, + "unique_terms": 16579899, + "downloaded": False + }, + "msmarco-v2-passage-augmented-slim": { + "description": "Lucene index of the MS MARCO V2 augmented passage corpus ('slim' version). (Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage-augmented-slim.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage-augmented.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage-augmented-slim.20220808.4d6d2a.tar.gz" + ], + "md5": "3524b5b28117ac1a5365cd664c6871f1", + "size compressed (bytes)": 14757394934, + "total_terms": 15272965252, + "documents": 138364198, + "unique_terms": 16579899, + "downloaded": False + }, + "msmarco-v2-passage-augmented-full": { + "description": "Lucene index of the MS MARCO V2 augmented passage corpus ('full' version). (Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage-augmented-full.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage-augmented.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage-augmented-full.20220808.4d6d2a.tar.gz" + ], + "md5": "c3e18c02e749c0416e1acc653899c6b0", + "size compressed (bytes)": 130622740320, + "total_terms": 15272965252, + "documents": 138364198, + "unique_terms": 16579899, + "downloaded": False + }, + + # MS MARCO V2 augmented passage corpus, doc2query-T5 expansions. + "msmarco-v2-passage-augmented-d2q-t5": { + "description": "Lucene index of the MS MARCO V2 augmented passage corpus with doc2query-T5 expansions. (Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220808.4d6d2a.tar.gz" + ], + "md5": "2b683a3a64692b95375ddbdcb9590f25", + "size compressed (bytes)": 14404903785, + "total_terms": 27561177716, + "documents": 138364198, + "unique_terms": 41177061, + "downloaded": False + }, + "msmarco-v2-passage-augmented-d2q-t5-docvectors": { + "description": "Lucene index (+docvectors) of the MS MARCO V2 augmented passage corpus with doc2query-T5 expansions. 
(Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage-augmented-d2q-t5-docvectors.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage-augmented-d2q-t5-docvectors.20220808.4d6d2a.tar.gz", + ], + "md5": "fe6eaeceabaa06cb09fdf8432f65f9d8", + "size compressed (bytes)": 59206472740, + "total_terms": 27561177716, + "documents": 138364198, + "unique_terms": 41177061, + "downloaded": False + } +} + +TF_INDEX_INFO_BEIR = { + # BEIR (v1.0.0) flat indexes + "beir-v1.0.0-trec-covid.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): TREC-COVID", + "filename": "lucene-index.beir-v1.0.0-trec-covid.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-trec-covid.flat.20221116.505594.tar.gz" + ], + "md5": "57b812594b11d064a23123137ae7dade", + "size compressed (bytes)": 226268665, + "total_terms": 20822821, + "documents": 171331, + "unique_terms": 202648, + "downloaded": False + }, + "beir-v1.0.0-bioasq.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): BioASQ", + "filename": "lucene-index.beir-v1.0.0-bioasq.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-bioasq.flat.20221116.505594.tar.gz" + ], + "md5": "cf8d4804b06bb8678d30b1375b46a0b3", + "size compressed (bytes)": 24821933356, + "total_terms": 2257541758, + "documents": 14914603, + "unique_terms": 4960004, + "downloaded": False + }, + "beir-v1.0.0-nfcorpus.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): NFCorpus", + "filename": "lucene-index.beir-v1.0.0-nfcorpus.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-nfcorpus.flat.20221116.505594.tar.gz" + ], + "md5": "34c0b11ad13a4715a78d025902061d37", + "size compressed (bytes)": 6509700, + "total_terms": 637485, + "documents": 3633, + "unique_terms": 22111, + "downloaded": False + }, + "beir-v1.0.0-nq.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): NQ", + "filename": "lucene-index.beir-v1.0.0-nq.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-nq.flat.20221116.505594.tar.gz" + ], + "md5": "a2c5db4dd3780fff3c7c6bfea1dd08e8", + "size compressed (bytes)": 1645453748, + "total_terms": 151249294, + "documents": 2681468, + "unique_terms": 997027, + "downloaded": False + }, + "beir-v1.0.0-hotpotqa.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): HotpotQA", + "filename": "lucene-index.beir-v1.0.0-hotpotqa.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-hotpotqa.flat.20221116.505594.tar.gz" + ], + "md5": "3be2875f93537369641287dcdf25add9", + "size compressed (bytes)": 2019081888, + "total_terms": 172477066, + "documents": 5233329, + "unique_terms": 2644892, + "downloaded": False + }, + "beir-v1.0.0-fiqa.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): FiQA-2018", + "filename": 
"lucene-index.beir-v1.0.0-fiqa.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-fiqa.flat.20221116.505594.tar.gz" + ], + "md5": "409b779e8a39813d2fbdfd1ea2f009e9", + "size compressed (bytes)": 55982536, + "total_terms": 5288635, + "documents": 57600, + "unique_terms": 66977, + "downloaded": False + }, + "beir-v1.0.0-signal1m.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): Signal-1M", + "filename": "lucene-index.beir-v1.0.0-signal1m.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-signal1m.flat.20221116.505594.tar.gz" + ], + "md5": "d0828b92a3df814bfa4b73bddeb25da7", + "size compressed (bytes)": 496596576, + "total_terms": 32240069, + "documents": 2866315, + "unique_terms": 796647, + "downloaded": False + }, + "beir-v1.0.0-trec-news.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): TREC-NEWS", + "filename": "lucene-index.beir-v1.0.0-trec-news.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-trec-news.flat.20221116.505594.tar.gz" + ], + "md5": "98df3de34b4b76a4390520c606817ec4", + "size compressed (bytes)": 2623576957, + "total_terms": 275651967, + "documents": 594589, + "unique_terms": 729872, + "downloaded": False + }, + "beir-v1.0.0-robust04.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): Robust04", + "filename": "lucene-index.beir-v1.0.0-robust04.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-robust04.flat.20221116.505594.tar.gz" + ], + "md5": "89dfcb7297c12a772d1bfd7917df908d", + "size compressed (bytes)": 1728446730, + "total_terms": 174384263, + "documents": 528036, + "unique_terms": 923466, + "downloaded": False + }, + "beir-v1.0.0-arguana.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): ArguAna", + "filename": "lucene-index.beir-v1.0.0-arguana.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-arguana.flat.20221116.505594.tar.gz" + ], + "md5": "d6c005689a9e7e91f3b1a7fbc74063e1", + "size compressed (bytes)": 10563485, + "total_terms": 969528, + "documents": 8674, + "unique_terms": 23895, + "downloaded": False + }, + "beir-v1.0.0-webis-touche2020.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): Webis-Touche2020", + "filename": "lucene-index.beir-v1.0.0-webis-touche2020.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-webis-touche2020.flat.20221116.505594.tar.gz" + ], + "md5": "20c6e9f29461eea1a520cd1abead709a", + "size compressed (bytes)": 750400932, + "total_terms": 76082209, + "documents": 382545, + "unique_terms": 525540, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-android.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-android", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-android.flat.20221116.505594.tar.gz", + "readme": 
"lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-android.flat.20221116.505594.tar.gz" + ], + "md5": "9f9f35e34f76336bc6e516599cbaf75b", + "size compressed (bytes)": 17423320, + "total_terms": 1760762, + "documents": 22998, + "unique_terms": 41456, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-english.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-english", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-english.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-english.flat.20221116.505594.tar.gz" + ], + "md5": "7d887497d32eedd92c314c93feaca28e", + "size compressed (bytes)": 24949578, + "total_terms": 2236655, + "documents": 40221, + "unique_terms": 62517, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-gaming.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-gaming", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-gaming.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-gaming.flat.20221116.505594.tar.gz" + ], + "md5": "140e16ee86a69c8fd4d16a83a6d51591", + "size compressed (bytes)": 29156970, + "total_terms": 2827717, + "documents": 45301, + "unique_terms": 60070, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-gis.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-gis", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-gis.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-gis.flat.20221116.505594.tar.gz" + ], + "md5": "4bd93695f28af0a11172f387ef41fee6", + "size compressed (bytes)": 43396154, + "total_terms": 4048584, + "documents": 37637, + "unique_terms": 184133, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-mathematica.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-mathematica", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-mathematica.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-mathematica.flat.20221116.505594.tar.gz" + ], + "md5": "5b5b7ab3d0437428e29a5a1431de1ca5", + "size compressed (bytes)": 21589909, + "total_terms": 2332642, + "documents": 16705, + "unique_terms": 111611, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-physics.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-physics", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-physics.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-physics.flat.20221116.505594.tar.gz" + ], + "md5": "6864144bca1bb169a452321e14ef12e0", + "size compressed (bytes)": 37956215, + "total_terms": 3785483, + "documents": 38316, + "unique_terms": 55950, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-programmers.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-programmers", + "filename": 
"lucene-index.beir-v1.0.0-cqadupstack-programmers.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-programmers.flat.20221116.505594.tar.gz" + ], + "md5": "7b7d2bbf7cc5d53924d09c3b781dba8a", + "size compressed (bytes)": 40297069, + "total_terms": 3905694, + "documents": 32176, + "unique_terms": 74195, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-stats.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-stats", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-stats.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-stats.flat.20221116.505594.tar.gz" + ], + "md5": "0b09b7bee2b60df0ff73710a93a79218", + "size compressed (bytes)": 52212599, + "total_terms": 5356042, + "documents": 42269, + "unique_terms": 183358, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-tex.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-tex", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-tex.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-tex.flat.20221116.505594.tar.gz" + ], + "md5": "48a2541bd7d1adec06f053486655e815", + "size compressed (bytes)": 91819025, + "total_terms": 9556423, + "documents": 68184, + "unique_terms": 288088, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-unix.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-unix", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-unix.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-unix.flat.20221116.505594.tar.gz" + ], + "md5": "a6cc0a867f6210ad44755c0a36fd682a", + "size compressed (bytes)": 53802808, + "total_terms": 5767374, + "documents": 47382, + "unique_terms": 206323, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-webmasters.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-webmasters", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-webmasters.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-webmasters.flat.20221116.505594.tar.gz" + ], + "md5": "a04f65d575b4233a151c4960b82815b9", + "size compressed (bytes)": 15174811, + "total_terms": 1482585, + "documents": 17405, + "unique_terms": 40547, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-wordpress.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): CQADupStack-wordpress", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-wordpress.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-wordpress.flat.20221116.505594.tar.gz" + ], + "md5": "4ab079b9f7d0463955ce073b5d53e64d", + "size compressed (bytes)": 54807597, + "total_terms": 5463472, + "documents": 48605, + "unique_terms": 125727, + "downloaded": False + }, + "beir-v1.0.0-quora.flat": { + "description": "Lucene flat index of 
BEIR (v1.0.0): Quora", + "filename": "lucene-index.beir-v1.0.0-quora.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-quora.flat.20221116.505594.tar.gz" + ], + "md5": "53fa2bd0667d23a50f95adaf169b87a1", + "size compressed (bytes)": 52698691, + "total_terms": 4390852, + "documents": 522931, + "unique_terms": 69597, + "downloaded": False + }, + "beir-v1.0.0-dbpedia-entity.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): DBPedia", + "filename": "lucene-index.beir-v1.0.0-dbpedia-entity.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-dbpedia-entity.flat.20221116.505594.tar.gz" + ], + "md5": "6bc15a920e262d12ec3842401755e934", + "size compressed (bytes)": 2085473498, + "total_terms": 164794982, + "documents": 4635922, + "unique_terms": 3351459, + "downloaded": False + }, + "beir-v1.0.0-scidocs.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): SCIDOCS", + "filename": "lucene-index.beir-v1.0.0-scidocs.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-scidocs.flat.20221116.505594.tar.gz" + ], + "md5": "f1fba96a71a62bc567ecbd167de3794b", + "size compressed (bytes)": 186572809, + "total_terms": 3266767, + "documents": 25657, + "unique_terms": 63604, + "downloaded": False + }, + "beir-v1.0.0-fever.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): FEVER", + "filename": "lucene-index.beir-v1.0.0-fever.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-fever.flat.20221116.505594.tar.gz" + ], + "md5": "1b06f43ea36e2ed450d1b1d90099ae67", + "size compressed (bytes)": 3880155553, + "total_terms": 325179165, + "documents": 5416568, + "unique_terms": 3293639, + "downloaded": False + }, + "beir-v1.0.0-climate-fever.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): Climate-FEVER", + "filename": "lucene-index.beir-v1.0.0-climate-fever.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-climate-fever.flat.20221116.505594.tar.gz" + ], + "md5": "68811e2614b3bac9e1b879c883fc722e", + "size compressed (bytes)": 3880208200, + "total_terms": 325185072, + "documents": 5416593, + "unique_terms": 3293621, + "downloaded": False + }, + "beir-v1.0.0-scifact.flat": { + "description": "Lucene flat index of BEIR (v1.0.0): SciFact", + "filename": "lucene-index.beir-v1.0.0-scifact.flat.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-flat.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-scifact.flat.20221116.505594.tar.gz" + ], + "md5": "6f6e55f1cf80c362f86bee65529b71de", + "size compressed (bytes)": 8851173, + "total_terms": 838128, + "documents": 5183, + "unique_terms": 28865, + "downloaded": False + }, + + # BEIR (v1.0.0) multifield indexes + "beir-v1.0.0-trec-covid.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): TREC-COVID", + "filename": 
"lucene-index.beir-v1.0.0-trec-covid.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-trec-covid.multifield.20221116.505594.tar.gz" + ], + "md5": "7501a330a0c9246e6350413c3f6ced7c", + "size compressed (bytes)": 222831983, + "total_terms": 19060122, + "documents": 129192, + "unique_terms": 193851, + "downloaded": False + }, + "beir-v1.0.0-bioasq.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): BioASQ", + "filename": "lucene-index.beir-v1.0.0-bioasq.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-bioasq.multifield.20221116.505594.tar.gz" + ], + "md5": "cc01ab450cac0b8865bd1e70e2a58596", + "size compressed (bytes)": 25346354679, + "total_terms": 2099554307, + "documents": 14914602, + "unique_terms": 4889053, + "downloaded": False + }, + "beir-v1.0.0-nfcorpus.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): NFCorpus", + "filename": "lucene-index.beir-v1.0.0-nfcorpus.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-nfcorpus.multifield.20221116.505594.tar.gz" + ], + "md5": "904e53b80fe04b3844b97847bc77a772", + "size compressed (bytes)": 6645576, + "total_terms": 601950, + "documents": 3633, + "unique_terms": 21819, + "downloaded": False + }, + "beir-v1.0.0-nq.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): NQ", + "filename": "lucene-index.beir-v1.0.0-nq.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-nq.multifield.20221116.505594.tar.gz" + ], + "md5": "693ca315de9fbbbf7f664be313a03847", + "size compressed (bytes)": 1642708204, + "total_terms": 144050891, + "documents": 2680961, + "unique_terms": 996653, + "downloaded": False + }, + "beir-v1.0.0-hotpotqa.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): HotpotQA", + "filename": "lucene-index.beir-v1.0.0-hotpotqa.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-hotpotqa.multifield.20221116.505594.tar.gz" + ], + "md5": "ef8c2f40097e652eec99e6bf25e151cd", + "size compressed (bytes)": 2083441492, + "total_terms": 158180692, + "documents": 5233235, + "unique_terms": 2627639, + "downloaded": False + }, + "beir-v1.0.0-fiqa.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): FiQA-2018", + "filename": "lucene-index.beir-v1.0.0-fiqa.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-fiqa.multifield.20221116.505594.tar.gz" + ], + "md5": "073f3f19a94689e5fac511af49316fe1", + "size compressed (bytes)": 55984419, + "total_terms": 5288635, + "documents": 57600, + "unique_terms": 66977, + "downloaded": False + }, + "beir-v1.0.0-signal1m.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): Signal-1M", + "filename": 
"lucene-index.beir-v1.0.0-signal1m.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-signal1m.multifield.20221116.505594.tar.gz" + ], + "md5": "4482ae02f18e8336c0a95ea33b5b6ede", + "size compressed (bytes)": 496603092, + "total_terms": 32240069, + "documents": 2866315, + "unique_terms": 796647, + "downloaded": False + }, + "beir-v1.0.0-trec-news.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): TREC-NEWS", + "filename": "lucene-index.beir-v1.0.0-trec-news.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-trec-news.multifield.20221116.505594.tar.gz" + ], + "md5": "3151122da3cf081a0c8894af7b75be43", + "size compressed (bytes)": 2633899363, + "total_terms": 270886723, + "documents": 578605, + "unique_terms": 727856, + "downloaded": False + }, + "beir-v1.0.0-robust04.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): Robust04", + "filename": "lucene-index.beir-v1.0.0-robust04.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-robust04.multifield.20221116.505594.tar.gz" + ], + "md5": "fdf741a75efe089d0451de5720b52c3a", + "size compressed (bytes)": 1728446303, + "total_terms": 174384263, + "documents": 528036, + "unique_terms": 923466, + "downloaded": False + }, + "beir-v1.0.0-arguana.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): ArguAna", + "filename": "lucene-index.beir-v1.0.0-arguana.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-arguana.multifield.20221116.505594.tar.gz" + ], + "md5": "a8201952860d31c56ea8a54c31e88b51", + "size compressed (bytes)": 10524118, + "total_terms": 944123, + "documents": 8674, + "unique_terms": 23867, + "downloaded": False + }, + "beir-v1.0.0-webis-touche2020.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): Webis-Touche2020", + "filename": "lucene-index.beir-v1.0.0-webis-touche2020.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-webis-touche2020.multifield.20221116.505594.tar.gz" + ], + "md5": "e160ea813990cff4dbdb9f50d509f8ea", + "size compressed (bytes)": 750724439, + "total_terms": 74066724, + "documents": 382545, + "unique_terms": 524665, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-android.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-android", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-android.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-android.multifield.20221116.505594.tar.gz" + ], + "md5": "de85f92a018d83a7ea496d9ef955b8c5", + "size compressed (bytes)": 17887736, + "total_terms": 1591285, + "documents": 22998, + "unique_terms": 40824, + "downloaded": False + }, + 
"beir-v1.0.0-cqadupstack-english.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-english", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-english.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-english.multifield.20221116.505594.tar.gz" + ], + "md5": "71c5d3db04586283772f6069668f5bfa", + "size compressed (bytes)": 25558901, + "total_terms": 2006983, + "documents": 40221, + "unique_terms": 61530, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-gaming.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-gaming", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-gaming.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-gaming.multifield.20221116.505594.tar.gz" + ], + "md5": "ff7c628b568f916c3bc3f7bf2af831eb", + "size compressed (bytes)": 29992453, + "total_terms": 2510477, + "documents": 45300, + "unique_terms": 59113, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-gis.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-gis", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-gis.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-gis.multifield.20221116.505594.tar.gz" + ], + "md5": "4083830da4922d1294b3fb38873ba5a2", + "size compressed (bytes)": 44188661, + "total_terms": 3789161, + "documents": 37637, + "unique_terms": 183298, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-mathematica.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-mathematica", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-mathematica.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-mathematica.multifield.20221116.505594.tar.gz" + ], + "md5": "baa9414c385db88eaafffa95d5ec7d48", + "size compressed (bytes)": 21911919, + "total_terms": 2234369, + "documents": 16705, + "unique_terms": 111306, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-physics.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-physics", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-physics.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-physics.multifield.20221116.505594.tar.gz" + ], + "md5": "342b105462067b87e78730921dd7288d", + "size compressed (bytes)": 38736492, + "total_terms": 3542078, + "documents": 38316, + "unique_terms": 55229, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-programmers.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-programmers", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-programmers.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-programmers.multifield.20221116.505594.tar.gz" + ], + "md5": "2e95b82caf156d0f0b109c62e0011eab", + "size compressed (bytes)": 40982052, + "total_terms": 3682227, + "documents": 32176, + "unique_terms": 73765, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-stats.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-stats", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-stats.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-stats.multifield.20221116.505594.tar.gz" + ], + "md5": "87c53df624baed7921672286beb94f9c", + "size compressed (bytes)": 53094508, + "total_terms": 5073873, + "documents": 42269, + "unique_terms": 182933, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-tex.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-tex", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-tex.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-tex.multifield.20221116.505594.tar.gz" + ], + "md5": "86407171e4ff305ecb173afdd49eef7c", + "size compressed (bytes)": 93081190, + "total_terms": 9155405, + "documents": 68184, + "unique_terms": 287393, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-unix.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-unix", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-unix.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-unix.multifield.20221116.505594.tar.gz" + ], + "md5": "acb0cc50cccb9e8dfca0ed599df0cfaa", + "size compressed (bytes)": 54758816, + "total_terms": 5449726, + "documents": 47382, + "unique_terms": 205471, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-webmasters.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-webmasters", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-webmasters.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-webmasters.multifield.20221116.505594.tar.gz" + ], + "md5": "7701f016b6fc643c30630742f7712bbd", + "size compressed (bytes)": 15524400, + "total_terms": 1358292, + "documents": 17405, + "unique_terms": 40073, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-wordpress.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): CQADupStack-wordpress", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-wordpress.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-wordpress.multifield.20221116.505594.tar.gz" + ], + "md5": "d791cf8449a18ebe698d404f526375ee", + "size compressed (bytes)": 55738636, + "total_terms": 5151575, + "documents": 48605, + "unique_terms": 125110, + "downloaded": False + }, + "beir-v1.0.0-quora.multifield": { + "description": "Lucene multifield index 
of BEIR (v1.0.0): Quora", + "filename": "lucene-index.beir-v1.0.0-quora.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-quora.multifield.20221116.505594.tar.gz" + ], + "md5": "2d92b46f715df08ce146167ed1b12079", + "size compressed (bytes)": 52703122, + "total_terms": 4390852, + "documents": 522931, + "unique_terms": 69597, + "downloaded": False + }, + "beir-v1.0.0-dbpedia-entity.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): DBPedia", + "filename": "lucene-index.beir-v1.0.0-dbpedia-entity.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-dbpedia-entity.multifield.20221116.505594.tar.gz" + ], + "md5": "b3f6b64bfd7903ff25ca2fa01a288392", + "size compressed (bytes)": 2144410289, + "total_terms": 152205479, + "documents": 4635922, + "unique_terms": 3338476, + "downloaded": False + }, + "beir-v1.0.0-scidocs.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): SCIDOCS", + "filename": "lucene-index.beir-v1.0.0-scidocs.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-scidocs.multifield.20221116.505594.tar.gz" + ], + "md5": "04c1e9aad3751dc552027d8bc3491323", + "size compressed (bytes)": 175887267, + "total_terms": 3065828, + "documents": 25313, + "unique_terms": 62562, + "downloaded": False + }, + "beir-v1.0.0-fever.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): FEVER", + "filename": "lucene-index.beir-v1.0.0-fever.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-fever.multifield.20221116.505594.tar.gz" + ], + "md5": "28ea09308760235ea2ec72d6f9b2f432", + "size compressed (bytes)": 3947213444, + "total_terms": 310655699, + "documents": 5396138, + "unique_terms": 3275057, + "downloaded": False + }, + "beir-v1.0.0-climate-fever.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): Climate-FEVER", + "filename": "lucene-index.beir-v1.0.0-climate-fever.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-climate-fever.multifield.20221116.505594.tar.gz" + ], + "md5": "827f2759cdfc45c47bbb67835cfcb1f2", + "size compressed (bytes)": 3947277939, + "total_terms": 310661477, + "documents": 5396163, + "unique_terms": 3275068, + "downloaded": False + }, + "beir-v1.0.0-scifact.multifield": { + "description": "Lucene multifield index of BEIR (v1.0.0): SciFact", + "filename": "lucene-index.beir-v1.0.0-scifact.multifield.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-scifact.multifield.20221116.505594.tar.gz" + ], + "md5": "efbafbc3e4909a026fe80bf8b1444b08", + "size compressed (bytes)": 9078032, + "total_terms": 784591, + "documents": 5183, + "unique_terms": 28581, + "downloaded": False + } +} + +TF_INDEX_INFO_MRTYDI = { + "mrtydi-v1.1-arabic": { + 
"description": "Lucene index for Mr.TyDi v1.1 (Arabic).", + "filename": "lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.tar.gz", + "readme": "lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.tar.gz", + ], + "md5": "efff40a2548f759eb8b0e47e0622685b", + "size compressed (bytes)": 1420441600, + "total_terms": 92529032, + "documents": 2106586, + "unique_terms": 1284748, + "downloaded": False + }, + "mrtydi-v1.1-bengali": { + "description": "Lucene index for Mr.TyDi v1.1 (Bengali).", + "filename": "lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.tar.gz", + "readme": "lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.tar.gz" + ], + "md5": "6ed844c8f17b2f041fba7c5676d3fb42", + "size compressed (bytes)": 294942720, + "total_terms": 15236599, + "documents": 304059, + "unique_terms": 520699, + "downloaded": False + }, + "mrtydi-v1.1-english": { + "description": "Lucene index for Mr.TyDi v1.1 (English).", + "filename": "lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.tar.gz", + "readme": "lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.tar.gz" + ], + "md5": "e6b0a2531d958c3d1a65634dc315b0ab", + "size compressed (bytes)": 20566118400, + "total_terms": 1507060932, + "documents": 32907100, + "unique_terms": -1, + "downloaded": False + }, + "mrtydi-v1.1-finnish": { + "description": "Lucene index for Mr.TyDi v1.1 (Finnish).", + "filename": "lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.tar.gz", + "readme": "lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.tar.gz" + ], + "md5": "0f464c022447eed5431157f0b2feb0b3", + "size compressed (bytes)": 1116272640, + "total_terms": 69416543, + "documents": 1908757, + "unique_terms": 1715076, + "downloaded": False + }, + "mrtydi-v1.1-indonesian": { + "description": "Lucene index for Mr.TyDi v1.1 (Indonesian).", + "filename": "lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.tar.gz", + "readme": "lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.tar.gz" + ], + "md5": "345d43a2443786a3394a93a6f7ef77b7", + "size compressed (bytes)": 698388480, + "total_terms": 52493134, + "documents": 1469399, + "unique_terms": 942552, + "downloaded": False + }, + "mrtydi-v1.1-japanese": { + "description": "Lucene index for Mr.TyDi v1.1 (Japanese).", + "filename": "lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.tar.gz", + "readme": "lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.tar.gz" + ], + "md5": "5f0802c1257c325a3e25c58523dba841", + "size compressed (bytes)": 4333844480, + "total_terms": 300761975, + "documents": 7000027, + "unique_terms": 1588879, + "downloaded": False + }, + "mrtydi-v1.1-korean": { + "description": "Lucene index for Mr.TyDi v1.1 (Korean).", + "filename": "lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.tar.gz", + "readme": "lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.README.md", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.tar.gz" + ], + "md5": "4277f406b138c46edf7c17e4248f3b2e", + "size compressed (bytes)": 1349109760, + "total_terms": 122217295, + "documents": 1496126, + "unique_terms": 1517179, + "downloaded": False + }, + "mrtydi-v1.1-russian": { + "description": "Lucene index for Mr.TyDi v1.1 (Russian).", + "filename": "lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.tar.gz", + "readme": "lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.tar.gz" + ], + "md5": "d5837fee29c60c7a3a24cfd598056038", + "size compressed (bytes)": 6864660480, + "total_terms": 346329117, + "documents": 9597504, + "unique_terms": 3034240, + "downloaded": False + }, + "mrtydi-v1.1-swahili": { + "description": "Lucene index for Mr.TyDi v1.1 (Swahili).", + "filename": "lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.tar.gz", + "readme": "lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.tar.gz" + ], + "md5": "bebff76ec6dfe76c904604f8ed1bcd3e", + "size compressed (bytes)": 59607040, + "total_terms": 4937051, + "documents": 136689, + "unique_terms": 385711, + "downloaded": False + }, + "mrtydi-v1.1-telugu": { + "description": "Lucene index for Mr.TyDi v1.1 (Telugu).", + "filename": "lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.tar.gz", + "readme": "lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.tar.gz" + ], + "md5": "89f8b280cacbdc27e90bb1ea40029c21", + "size compressed (bytes)": 519157760, + "total_terms": 26812052, + "documents": 548224, + "unique_terms": 1157217, + "downloaded": False + }, + "mrtydi-v1.1-thai": { + "description": "Lucene index for Mr.TyDi v1.1 (Thai).", + "filename": "lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.tar.gz", + "readme": "lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.tar.gz" + ], + "md5": "047152fc6bc1b5c5d945f38b23de971e", + "size compressed (bytes)": 546201600, + "total_terms": 31550936, + "documents": 568855, + "unique_terms": 663628, + "downloaded": False + } +} + +TF_INDEX_INFO_MIRACL = { + "miracl-v1.0-ar": { + "description": "Lucene index for MIRACL v1.0 (Arabic).", + "filename": "lucene-index.miracl-v1.0-ar.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-ar.20221004.2b2856.tar.gz" + ], + "md5": "503d3b49a557222d8074ac831a2f047a", + "size compressed (bytes)": 1193292491, + "total_terms": 90223450, + "documents": 2061414, + "unique_terms": 1246254, + "downloaded": False + }, + "miracl-v1.0-bn": { + "description": "Lucene index for MIRACL v1.0 (Bengali).", + "filename": "lucene-index.miracl-v1.0-bn.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-bn.20221004.2b2856.tar.gz" + ], + "md5": "7a20210328f0b83f44e041f0c94d30e2", + "size compressed (bytes)": 236113202, + "total_terms": 14963235, + "documents": 297265, + "unique_terms": 506812, + "downloaded": False + }, + 
"miracl-v1.0-en": { + "description": "Lucene index for MIRACL v1.0 (English).", + "filename": "lucene-index.miracl-v1.0-en.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-en.20221004.2b2856.tar.gz" + ], + "md5": "4fbd652deb76bcc05daa35392d4aa9f3", + "size compressed (bytes)": 17823436054, + "total_terms": 1505029955, + "documents": 32893221, + "unique_terms": 6152316, + "downloaded": False + }, + "miracl-v1.0-es": { + "description": "Lucene index for MIRACL v1.0 (Spanish).", + "filename": "lucene-index.miracl-v1.0-es.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-es.20221004.2b2856.tar.gz" + ], + "md5": "b4c9993ee3a131871d4f07dd96e80531", + "size compressed (bytes)": 5474245249, + "total_terms": 389319806, + "documents": 10373953, + "unique_terms": 2907509, + "downloaded": False + }, + "miracl-v1.0-fa": { + "description": "Lucene index for MIRACL v1.0 (Persian).", + "filename": "lucene-index.miracl-v1.0-fa.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-fa.20221004.2b2856.tar.gz" + ], + "md5": "bfc824aa37633e3d45bcfd5c5e0e1701", + "size compressed (bytes)": 1023090577, + "total_terms": 67968038, + "documents": 2207172, + "unique_terms": 1208930, + "downloaded": False + }, + "miracl-v1.0-fi": { + "description": "Lucene index for MIRACL v1.0 (Finnish).", + "filename": "lucene-index.miracl-v1.0-fi.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-fi.20221004.2b2856.tar.gz" + ], + "md5": "4197c90efd781c6153acaf15452c5479", + "size compressed (bytes)": 925422988, + "total_terms": 68295087, + "documents": 1883509, + "unique_terms": 1669817, + "downloaded": False + }, + "miracl-v1.0-fr": { + "description": "Lucene index for MIRACL v1.0 (French).", + "filename": "lucene-index.miracl-v1.0-fr.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-fr.20221004.2b2856.tar.gz" + ], + "md5": "e68b10d90be71b702888a3d00a8aa39c", + "size compressed (bytes)": 6747612709, + "total_terms": 508723988, + "documents": 14636953, + "unique_terms": 2811342, + "downloaded": False + }, + "miracl-v1.0-hi": { + "description": "Lucene index for MIRACL v1.0 (Hindi).", + "filename": "lucene-index.miracl-v1.0-hi.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-hi.20221004.2b2856.tar.gz" + ], + "md5": "d81f4e2b7ec5df8f9741168c23c977e2", + "size compressed (bytes)": 340997734, + "total_terms": 21080143, + "documents": 506264, + "unique_terms": 597558, + "downloaded": False + }, + "miracl-v1.0-id": { + "description": "Lucene index for MIRACL v1.0 (Indonesian).", + "filename": "lucene-index.miracl-v1.0-id.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-id.20221004.2b2856.tar.gz" + ], + "md5": "b1092e732991029fae7c542e5e129255", + "size compressed (bytes)": 
577263718, + "total_terms": 51469219, + "documents": 1446315, + "unique_terms": 911944, + "downloaded": False + }, + "miracl-v1.0-ja": { + "description": "Lucene index for MIRACL v1.0 (Japanese).", + "filename": "lucene-index.miracl-v1.0-ja.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-ja.20221004.2b2856.tar.gz" + ], + "md5": "4db9550d0af63736a0fd2b486b3b7273", + "size compressed (bytes)": 3745158372, + "total_terms": 296659169, + "documents": 6953614, + "unique_terms": 1558643, + "downloaded": False + }, + "miracl-v1.0-ko": { + "description": "Lucene index for MIRACL v1.0 (Korean).", + "filename": "lucene-index.miracl-v1.0-ko.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-ko.20221004.2b2856.tar.gz" + ], + "md5": "c82f5c7641fd78b8dadfcb279a1c0340", + "size compressed (bytes)": 1150899287, + "total_terms": 121464424, + "documents": 1486752, + "unique_terms": 1504782, + "downloaded": False + }, + "miracl-v1.0-ru": { + "description": "Lucene index for MIRACL v1.0 (Russian).", + "filename": "lucene-index.miracl-v1.0-ru.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-ru.20221004.2b2856.tar.gz" + ], + "md5": "c1b974e298d9e1deeccae8b84a5bcd64", + "size compressed (bytes)": 6003987738, + "total_terms": 343106870, + "documents": 9543918, + "unique_terms": 2955627, + "downloaded": False + }, + "miracl-v1.0-sw": { + "description": "Lucene index for MIRACL v1.0 (Swahili).", + "filename": "lucene-index.miracl-v1.0-sw.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-sw.20221004.2b2856.tar.gz" + ], + "md5": "64b77bcc11e04575d0723ad81ac7c135", + "size compressed (bytes)": 45410264, + "total_terms": 4752278, + "documents": 131924, + "unique_terms": 361306, + "downloaded": False + }, + "miracl-v1.0-te": { + "description": "Lucene index for MIRACL v1.0 (Telugu).", + "filename": "lucene-index.miracl-v1.0-te.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-te.20221004.2b2856.tar.gz" + ], + "md5": "1f78c68678f439a3143a6fb0d25bfe27", + "size compressed (bytes)": 402045711, + "total_terms": 26105595, + "documents": 518079, + "unique_terms": 1120047, + "downloaded": False + }, + "miracl-v1.0-th": { + "description": "Lucene index for MIRACL v1.0 (Thai).", + "filename": "lucene-index.miracl-v1.0-th.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-th.20221004.2b2856.tar.gz" + ], + "md5": "eeef93c23b76fdc66b9e1ee01576765e", + "size compressed (bytes)": 431498349, + "total_terms": 29922100, + "documents": 542166, + "unique_terms": 626084, + "downloaded": False + }, + "miracl-v1.0-zh": { + "description": "Lucene index for MIRACL v1.0 (Chinese).", + "filename": "lucene-index.miracl-v1.0-zh.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-zh.20221004.2b2856.tar.gz" + ], + "md5": "dc7880da333b7c56d3a4ff0bf018febd", + "size compressed (bytes)": 4212198217, + "total_terms": 423635495, + "documents": 4934368, + "unique_terms": 6517412, + "downloaded": False + }, + "miracl-v1.0-de": { + "description": "Lucene index for MIRACL v1.0 (German).", + "filename": "lucene-index.miracl-v1.0-de.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-de.20221004.2b2856.tar.gz" + ], + "md5": "a40d1b9429c450b2e476d1e4ba22784d", + "size compressed (bytes)": 8708219012, + "total_terms": 581583743, + "documents": 15866222, + "unique_terms": 6288858, + "downloaded": False + }, + "miracl-v1.0-yo": { + "description": "Lucene index for MIRACL v1.0 (Yoruba).", + "filename": "lucene-index.miracl-v1.0-yo.20221004.2b2856.tar.gz", + "readme": "lucene-index.miracl-v1.0.20221004.2b2856.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.miracl-v1.0-yo.20221004.2b2856.tar.gz" + ], + "md5": "7fa283d1af4a7c4ea8791eab9e386807", + "size compressed (bytes)": 13211664, + "total_terms": 1387088, + "documents": 49043, + "unique_terms": 174539, + "downloaded": False + } +} + +TF_INDEX_INFO_CIRAL = { + "ciral-v1.0-ha": { + "description": "Lucene index for CIRAL v1.0 (Hausa).", + "filename": "lucene-index.ciral-v1.0-ha.20230721.e850ea.tar.gz", + "readme": "lucene-index.ciral-v1.0.20230721.e850ea.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.ciral-v1.0-ha.20230721.e850ea.tar.gz" + ], + "md5": "9bef13f2b528d3a5712ce412c3c264f7", + "size compressed (bytes)": 671653035, + 'total_terms': 93696543, + 'documents': 715355, + 'unique_terms': 817967, + "downloaded": False + }, + + "ciral-v1.0-so": { + "description": "Lucene index for CIRAL v1.0 (Somali).", + "filename": "lucene-index.ciral-v1.0-so.20230721.e850ea.tar.gz", + "readme": "lucene-index.ciral-v1.0.20230721.e850ea.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.ciral-v1.0-so.20230721.e850ea.tar.gz" + ], + "md5": "4bb9d3ae1a6d65fbb2a4e7e57a71397d", + "size compressed (bytes)": 916229181, + "total_terms": 103736362, + "documents": 827552, + "unique_terms": 1636109, + "downloaded": False + }, + + "ciral-v1.0-sw": { + "description": "Lucene index for CIRAL v1.0 (Swahili).", + "filename": "lucene-index.ciral-v1.0-sw.20230721.e850ea.tar.gz", + "readme": "lucene-index.ciral-v1.0.20230721.e850ea.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.ciral-v1.0-sw.20230721.e850ea.tar.gz" + ], + "md5": "1236a1a4c87268d98ec6534cd99aaada", + "size compressed (bytes)": 896921754, + "total_terms": 115140711, + "documents": 949013, + "unique_terms": 1655554, + "downloaded": False + }, + + "ciral-v1.0-yo": { + "description": "Lucene index for CIRAL v1.0 (Yoruba).", + "filename": "lucene-index.ciral-v1.0-yo.20230721.e850ea.tar.gz", + "readme": "lucene-index.ciral-v1.0.20230721.e850ea.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.ciral-v1.0-yo.20230721.e850ea.tar.gz" + ], + "md5": "655e571314ed85cbfe637246c3d18110", + "size compressed (bytes)": 94610259, + "total_terms": 13693080, + "documents": 82095, + "unique_terms": 236638, + "downloaded": False + } + +} + + +TF_INDEX_INFO_OTHER = { + "cacm": { + "description": "Lucene index of the CACM corpus. 
(Lucene 9)", + "filename": "lucene-index.cacm.tar.gz", + "urls": [ + "https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.20221005.252b5e.tar.gz", + ], + "md5": "cfe14d543c6a27f4d742fb2d0099b8e0", + "size compressed (bytes)": 2347197, + "total_terms": 320968, + "documents": 3204, + "unique_terms": 14363, + }, + "robust04": { + "description": "Lucene index of TREC Disks 4 & 5 (minus Congressional Records), used in the TREC 2004 Robust Track. (Lucene 9)", + "filename": "lucene-index.robust04.20221005.252b5e.tar.gz", + "readme": "lucene-index.robust04.20221005.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.robust04.20221005.252b5e.tar.gz", + ], + "md5": "a1abd5437394956b7ec8bea4699b5e46", + "size compressed (bytes)": 1806776535, + "total_terms": 174540872, + "documents": 528030, + "unique_terms": 923436, + }, + + "enwiki-paragraphs": { + "description": "Lucene index of English Wikipedia for BERTserini", + "filename": "lucene-index.enwiki-20180701-paragraphs.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.enwiki-20180701-paragraphs.tar.gz", + "https://vault.cs.uwaterloo.ca/s/WHKMSCbwQfDXyHt/download" + ], + "md5": "77d1cd530579905dad2ee3c2bda1b73d", + "size compressed (bytes)": 17725958785, + "total_terms": 1498980668, + "documents": 39880064, + "unique_terms": -1, + "downloaded": False + }, + "zhwiki-paragraphs": { + "description": "Lucene index of Chinese Wikipedia for BERTserini", + "filename": "lucene-index.zhwiki-20181201-paragraphs.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.zhwiki-20181201-paragraphs.tar.gz", + "https://vault.cs.uwaterloo.ca/s/6kEjQZaRYtnb8A6/download" + ], + "md5": "c005af4036296972831288c894918a92", + "size compressed (bytes)": 3284531213, + "total_terms": 320776789, + "documents": 4170312, + "unique_terms": -1, + "downloaded": False + }, + + "trec-covid-r5-abstract": { + "description": "Lucene index for TREC-COVID Round 5: abstract index", + "filename": "lucene-index-cord19-abstract-2020-07-16.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-07-16/lucene-index-cord19-abstract-2020-07-16.tar.gz", + "https://vault.cs.uwaterloo.ca/s/c37JxKYQ7Hogs72/download" + ], + "md5": "c883571ccc78b4c2ce05b41eb07f5405", + "size compressed (bytes)": 2796524, + "total_terms": 22100404, + "documents": 192459, + "unique_terms": 195875, + "downloaded": False + }, + "trec-covid-r5-full-text": { + "description": "Lucene index for TREC-COVID Round 5: full-text index", + "filename": "lucene-index-cord19-full-text-2020-07-16.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-07-16/lucene-index-cord19-full-text-2020-07-16.tar.gz", + "https://vault.cs.uwaterloo.ca/s/c7CcxRbFWfiFnFq/download" + ], + "md5": "23cfad89b4c206d66125f5736f60248f", + "size compressed (bytes)": 5351744, + "total_terms": 275238847, + "documents": 192460, + "unique_terms": 1843368, + "downloaded": False + }, + "trec-covid-r5-paragraph": { + "description": "Lucene index for TREC-COVID Round 5: paragraph index", + "filename": "lucene-index-cord19-paragraph-2020-07-16.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-07-16/lucene-index-cord19-paragraph-2020-07-16.tar.gz", + "https://vault.cs.uwaterloo.ca/s/HXigraF5KJL3xS8/download" + ], + "md5": "c2c6ac832f8a1fcb767d2356d2b1e1df", + "size compressed (bytes)": 11352968, + "total_terms": 627083574, + "documents": 3010497, 
+ "unique_terms": 1843368, + "downloaded": False + }, + "trec-covid-r4-abstract": { + "description": "Lucene index for TREC-COVID Round 4: abstract index", + "filename": "lucene-index-cord19-abstract-2020-06-19.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-06-19/lucene-index-cord19-abstract-2020-06-19.tar.gz", + "https://vault.cs.uwaterloo.ca/s/fBta6sAt4MdaHQX/download" + ], + "md5": "029bd55daba8800fbae2be9e5fcd7b33", + "size compressed (bytes)": 2584264, + "total_terms": 18724353, + "documents": 158226, + "unique_terms": 179937, + "downloaded": False + }, + "trec-covid-r4-full-text": { + "description": "Lucene index for TREC-COVID Round 4: full-text index", + "filename": "lucene-index-cord19-full-text-2020-06-19.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-06-19/lucene-index-cord19-full-text-2020-06-19.tar.gz", + "https://vault.cs.uwaterloo.ca/s/yErSHZHD38jcDSY/download" + ], + "md5": "3d0eb12094a24cff9bcacd1f17c3ea1c", + "size compressed (bytes)": 4983900, + "total_terms": 254810123, + "documents": 158227, + "unique_terms": 1783089, + "downloaded": False + }, + "trec-covid-r4-paragraph": { + "description": "Lucene index for TREC-COVID Round 4: paragraph index", + "filename": "lucene-index-cord19-paragraph-2020-06-19.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-06-19/lucene-index-cord19-paragraph-2020-06-19.tar.gz", + "https://vault.cs.uwaterloo.ca/s/7md4kwNNgy3oxiH/download" + ], + "md5": "5cd8cd6998177bed7a3e0057ef8b3595", + "size compressed (bytes)": 10382704, + "total_terms": 567579834, + "documents": 2781172, + "unique_terms": 1783089, + "downloaded": False + }, + "trec-covid-r3-abstract": { + "description": "Lucene index for TREC-COVID Round 3: abstract index", + "filename": "lucene-index-cord19-abstract-2020-05-19.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-19/lucene-index-cord19-abstract-2020-05-19.tar.gz", + "https://vault.cs.uwaterloo.ca/s/Zg9p2D5tJgiTGx2/download" + ], + "md5": "37bb97d0c41d650ba8e135fd75ae8fd8", + "size compressed (bytes)": 2190328, + "total_terms": 16278419, + "documents": 128465, + "unique_terms": 168291, + "downloaded": False + }, + "trec-covid-r3-full-text": { + "description": "Lucene index for TREC-COVID Round 3: full-text index", + "filename": "lucene-index-cord19-full-text-2020-05-19.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-19/lucene-index-cord19-full-text-2020-05-19.tar.gz", + "https://vault.cs.uwaterloo.ca/s/BTzaQgZ55898dXM/download" + ], + "md5": "f5711915a66cd2b511e0fb8d03e4c325", + "size compressed (bytes)": 4233300, + "total_terms": 215806519, + "documents": 128465, + "unique_terms": 1620335, + "downloaded": False + }, + "trec-covid-r3-paragraph": { + "description": "Lucene index for TREC-COVID Round 3: paragraph index", + "filename": "lucene-index-cord19-paragraph-2020-05-19.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-19/lucene-index-cord19-paragraph-2020-05-19.tar.gz", + "https://vault.cs.uwaterloo.ca/s/nPyMYTys6NkmEdN/download" + ], + "md5": "012ab1f804382b2275c433a74d7d31f2", + "size compressed (bytes)": 9053524, + "total_terms": 485309568, + "documents": 2297201, + "unique_terms": 1620335, + "downloaded": False + }, + "trec-covid-r2-abstract": { + "description": "Lucene index for TREC-COVID Round 2: abstract index", + "filename": 
"lucene-index-cord19-abstract-2020-05-01.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-01/lucene-index-cord19-abstract-2020-05-01.tar.gz", + "https://vault.cs.uwaterloo.ca/s/3YZE65FSypwfnQQ/download" + ], + "md5": "a06e71a98a68d31148cb0e97e70a2ee1", + "size compressed (bytes)": 1575804, + "total_terms": 7651125, + "documents": 59873, + "unique_terms": 109750, + "downloaded": False + }, + "trec-covid-r2-full-text": { + "description": "Lucene index for TREC-COVID Round 2: full-text index", + "filename": "lucene-index-cord19-full-text-2020-05-01.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-01/lucene-index-cord19-full-text-2020-05-01.tar.gz", + "https://vault.cs.uwaterloo.ca/s/NdPEB7swXeZnq3o/download" + ], + "md5": "e7eca1b976cdf2cd80e908c9ac2263cb", + "size compressed (bytes)": 3088540, + "total_terms": 154736295, + "documents": 59876, + "unique_terms": 1214374, + "downloaded": False + }, + "trec-covid-r2-paragraph": { + "description": "Lucene index for TREC-COVID Round 2: paragraph index", + "filename": "lucene-index-cord19-paragraph-2020-05-01.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-01/lucene-index-cord19-paragraph-2020-05-01.tar.gz", + "https://vault.cs.uwaterloo.ca/s/Mz7n5FAt7rmnYCY/download" + ], + "md5": "8f9321757a03985ac1c1952b2fff2c7d", + "size compressed (bytes)": 6881696, + "total_terms": 360119048, + "documents": 1758168, + "unique_terms": 1214374, + "downloaded": False + }, + "trec-covid-r1-abstract": { + "description": "Lucene index for TREC-COVID Round 1: abstract index", + "filename": "lucene-index-covid-2020-04-10.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-04-10/lucene-index-covid-2020-04-10.tar.gz", + "https://vault.cs.uwaterloo.ca/s/Rz8AEmsFo9NWGP6/download" + ], + "md5": "ec239d56498c0e7b74e3b41e1ce5d42a", + "size compressed (bytes)": 1621440, + "total_terms": 6672525, + "documents": 51069, + "unique_terms": 104595, + "downloaded": False + }, + "trec-covid-r1-full-text": { + "description": "Lucene index for TREC-COVID Round 1: full-text index", + "filename": "lucene-index-covid-full-text-2020-04-10.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-04-10/lucene-index-covid-full-text-2020-04-10.tar.gz", + "https://vault.cs.uwaterloo.ca/s/oQzSoxrT3grGmBe/download" + ], + "md5": "401a6f5583b0f05340c73fbbeb3279c8", + "size compressed (bytes)": 4471820, + "total_terms": 315624154, + "documents": 51071, + "unique_terms": 1812522, + "downloaded": False + }, + "trec-covid-r1-paragraph": { + "description": "Lucene index for TREC-COVID Round 1: paragraph index", + "filename": "lucene-index-covid-paragraph-2020-04-10.tar.gz", + "urls": [ + "https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-04-10/lucene-index-covid-paragraph-2020-04-10.tar.gz", + "https://vault.cs.uwaterloo.ca/s/HDtb5Ys7MYBkePC/download" + ], + "md5": "8b87a2c55bc0a15b87f11e796860216a", + "size compressed (bytes)": 5994192, + "total_terms": 330715243, + "documents": 1412648, + "unique_terms": 944574, + "downloaded": False + }, + + "cast2019": { + "description": "Lucene index for TREC 2019 CaST", + "filename": "index-cast2019.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/index-cast2019.tar.gz", + "https://vault.cs.uwaterloo.ca/s/56LcDcRPopdQc4d/download" + ], + "md5": "36e604d7f5a4e08ade54e446be2f6345", + "size compressed (bytes)": 21266884884, + 
"total_terms": 1593628213, + "documents": 38429835, + "unique_terms": -1, + "downloaded": False + }, + + "wikipedia-dpr-100w": { + "description": "Lucene index of Wikipedia with DPR 100-word splits", + "filename": "lucene-index.wikipedia-dpr-100w.20210120.d1b9e6.tar.gz", + "readme": "index-wikipedia-dpr-20210120-d1b9e6-readme.txt", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.wikipedia-dpr-100w.20210120.d1b9e6.tar.gz" + ], + "md5": "7b58c08da992b2ea7e96667f0b176651", + "size compressed (bytes)": 9177917732, + "total_terms": 1512973270, + "documents": 21015324, + "unique_terms": 5345463, + "downloaded": False + }, + "wikipedia-dpr-100w-slim": { + "description": "Lucene index of Wikipedia with DPR 100-word splits (slim version, document text not stored)", + "filename": "lucene-index.wikipedia-dpr-100w-slim.20210120.d1b9e6.tar.gz", + "readme": "index-wikipedia-dpr-slim-20210120-d1b9e6-readme.txt", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.wikipedia-dpr-100w-slim.20210120.d1b9e6.tar.gz" + ], + "md5": "5d24352f0de6ae75b60e11a9cf622251", + "size compressed (bytes)": 1810337190, + "total_terms": 1512973270, + "documents": 21015324, + "unique_terms": 5345463, + "downloaded": False + }, + "wikipedia-kilt-doc": { + "description": "Lucene index of Wikipedia snapshot used as KILT's knowledge source.", + "filename": "lucene-index.wikipedia-kilt-doc.20210421.f29307.tar.gz", + "readme": "index-wikipedia-kilt-doc-20210421-f29307-readme.txt", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.wikipedia-kilt-doc.20210421.f29307.tar.gz" + ], + "md5": "d4a1e7628f6f68c51dd2d764e62b7f8d", + "size compressed (bytes)": 10901145611, + "total_terms": 1915061164, + "documents": 5903530, + "unique_terms": 8722502, + "downloaded": False + }, + "wiki-all-6-3-tamber": { + "description": "Lucene index of wiki-all-6-3-tamber from castorini/odqa-wiki-corpora", + "filename": "lucene-index.wiki-all-6-3-tamber.20230111.40277a.tar.gz", + "readme": "lucene-index-wiki-all-6-3-tamber-20230111-40277a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.wiki-all-6-3-tamber.20230111.40277a.tar.gz", + ], + "md5": "018b45ee8c6278a879caa3145b2dc05d", + "size compressed (bytes)": 26240661946, + "total_terms": 5064706668, + "documents": 76680040, + "unique_terms": 14604922, + "downloaded": False + }, + + "hc4-v1.0-fa": { + "description": "Lucene index for HC4 v1.0 (Persian). (Lucene 9)", + "filename": "lucene-index.hc4-v1.0-fa.20221025.c4a8d0.tar.gz", + "readme": "lucene-index.hc4-v1.0.20221025.c4a8d0.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.hc4-v1.0-fa.20221025.c4a8d0.tar.gz" + ], + "md5": "80735c01b2f2cf82288381370adf1d66", + "size compressed (bytes)": 1652960750, + "total_terms": 112225896, + "documents": 486486, + "unique_terms": 617109, + "downloaded": False + }, + "hc4-v1.0-ru": { + "description": "Lucene index for HC4 v1.0 (Russian). (Lucene 9)", + "filename": "lucene-index.hc4-v1.0-ru.20221025.c4a8d0.tar.gz", + "readme": "lucene-index.hc4-v1.0.20221025.c4a8d0.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.hc4-v1.0-ru.20221025.c4a8d0.tar.gz" + ], + "md5": "40259ba9ca993f850c960a172debe33e", + "size compressed (bytes)": 13292705599, + "total_terms": 764996714, + "documents": 4721064, + "unique_terms": 2625222, + "downloaded": False + }, + "hc4-v1.0-zh": { + "description": "Lucene index for HC4 v1.0 (Chinese). 
(Lucene 9)", + "filename": "lucene-index.hc4-v1.0-zh.20221025.c4a8d0.tar.gz", + "readme": "lucene-index.hc4-v1.0.20221025.c4a8d0.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.hc4-v1.0-zh.20221025.c4a8d0.tar.gz" + ], + "md5": "2ea8885b8ec6c637971c8df0706b623e", + "size compressed (bytes)": 2899033342, + "total_terms": 304468580, + "documents": 646302, + "unique_terms": 4380932, + "downloaded": False + }, + "neuclir22-fa": { + "description": "Lucene index for NeuCLIR 2022 corpus (Persian). (Lucene 9)", + "filename": "lucene-index.neuclir22-fa.20221025.c4a8d0.tar.gz", + "readme": "lucene-index.neuclir22.20221025.c4a8d0.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.neuclir22-fa.20221025.c4a8d0.tar.gz" + ], + "md5": "d423fb72bcd5bf2dea6e4a19743dcb95", + "size compressed (bytes)": 7565790180, + "total_terms": 514262091, + "documents": 2232016, + "unique_terms": 1479443, + "downloaded": False + }, + "neuclir22-ru": { + "description": "Lucene index for NeuCLIR 2022 corpus (Russian). (Lucene 9)", + "filename": "lucene-index.neuclir22-ru.20221025.c4a8d0.tar.gz", + "readme": "lucene-index.neuclir22.20221025.c4a8d0.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.neuclir22-ru.20221025.c4a8d0.tar.gz" + ], + "md5": "2d04bbc880d535c1c4ab172c2c2d8ffe", + "size compressed (bytes)": 14202967387, + "total_terms": 830006658, + "documents": 4627541, + "unique_terms": 3396095, + "downloaded": False + }, + "neuclir22-zh": { + "description": "Lucene index for NeuCLIR 2022 corpus (Chinese). (Lucene 9)", + "filename": "lucene-index.neuclir22-zh.20221025.c4a8d0.tar.gz", + "readme": "lucene-index.neuclir22.20221025.c4a8d0.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.neuclir22-zh.20221025.c4a8d0.tar.gz" + ], + "md5": "46fe989676ff510b997af24f6398199f", + "size compressed (bytes)": 15733809682, + "total_terms": 1654090507, + "documents": 3179206, + "unique_terms": 8213058, + "downloaded": False + }, + "neuclir22-fa-en": { + "description": "Lucene index for NeuCLIR 2022 corpus (official English translation from Persian). (Lucene 9)", + "filename": "lucene-index.neuclir22-fa-en.20221025.c4a8d0.tar.gz", + "readme": "lucene-index.neuclir22-en.20221025.c4a8d0.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.neuclir22-fa-en.20221025.c4a8d0.tar.gz" + ], + "md5": "35363339b7f0527f27403b848fe01b04", + "size compressed (bytes)": 6172239242, + "total_terms": 554848215, + "documents": 2232016, + "unique_terms": 1033260, + "downloaded": False + }, + "neuclir22-ru-en": { + "description": "Lucene index for NeuCLIR 2022 corpus (official English translation from Russian). (Lucene 9)", + "filename": "lucene-index.neuclir22-ru-en.20221025.c4a8d0.tar.gz", + "readme": "lucene-index.neuclir22-en.20221025.c4a8d0.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.neuclir22-ru-en.20221025.c4a8d0.tar.gz" + ], + "md5": "b0b98803260665eeae97163d2361838e", + "size compressed (bytes)": 10513242212, + "total_terms": 911886830, + "documents": 4627541, + "unique_terms": 2794257, + "downloaded": False + }, + "neuclir22-zh-en": { + "description": "Lucene index for NeuCLIR 2022 corpus (official English translation from Chinese). 
(Lucene 9)", + "filename": "lucene-index.neuclir22-zh-en.20221025.c4a8d0.tar.gz", + "readme": "lucene-index.neuclir22-en.20221025.c4a8d0.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.neuclir22-zh-en.20221025.c4a8d0.tar.gz" + ], + "md5": "d44ca9c7b634cf56e8cfd5892a3d3427", + "size compressed (bytes)": 8470981318, + "total_terms": 803227160, + "documents": 3179206, + "unique_terms": 1616532, + "downloaded": False + }, # TODO: need to update links to these files to rgw.cs.uwaterloo.ca/... + "atomic_text_v0.2.1_small_validation": { + "description": "Lucene index for AToMiC Text v0.2.1 small setting on validation set (Lucene 9)", + "filename": "lucene-index.atomic.image.flat.small.validation.tar.gz", + "readme": "lucene-index.atomic.20230525.a7df7f.README.md", + "urls": [ + "https://huggingface.co/spaces/dlrudwo1269/AToMiC_bm25_files/resolve/main/prebuilt_indexes/lucene-index.atomic.text.flat.small.validation.tar.gz" + ], + "md5": "377f3e4ae48e1afbe05650e339322050", + "size compressed (bytes)": 32900945, + "total_terms": 2999824, + "documents": 17173, + "unique_terms": 118071, + "downloaded": False + }, + "atomic_text_v0.2.1_base": { + "description": "Lucene index for AToMiC Text v0.2.1 base setting on validation set (Lucene 9)", + "filename": "lucene-index.atomic.image.flat.base.tar.gz", + "readme": "lucene-index.atomic.20230525.a7df7f.README.md", + "urls": [ + "https://huggingface.co/spaces/dlrudwo1269/AToMiC_bm25_files/resolve/main/prebuilt_indexes/lucene-index.atomic.text.flat.base.tar.gz" + ], + "md5": "41ca80241e77ed3515dd48bfc047a923", + "size compressed (bytes)": 5532178004, + "total_terms": 520954965, + "documents": 3029504, + "unique_terms": -1, + "downloaded": False + }, + "atomic_text_v0.2.1_large": { + "description": "Lucene index for AToMiC Text v0.2.1 large setting on validation set (Lucene 9)", + "filename": "lucene-index.atomic.image.flat.large.tar.gz", + "readme": "lucene-index.atomic.20230525.a7df7f.README.md", + "urls": [ + "https://huggingface.co/spaces/dlrudwo1269/AToMiC_bm25_files/resolve/main/prebuilt_indexes/lucene-index.atomic.text.flat.large.tar.gz" + ], + "md5": "0dd1975d82fa7c57a471e4e6b1882177", + "size compressed (bytes)": 18224101285, + "total_terms": 1727597393, + "documents": 10134744, + "unique_terms": -1, + "downloaded": False + }, + "atomic_image_v0.2_small_validation": { + "description": "Lucene index for AToMiC Images v0.2 small setting on validation set (Lucene 9)", + "filename": "lucene-index.atomic.image.flat.small.validation.tar.gz", + "readme": "lucene-index.atomic.20230525.a7df7f.README.md", + "urls": [ + "https://huggingface.co/spaces/dlrudwo1269/AToMiC_bm25_files/resolve/main/prebuilt_indexes/lucene-index.atomic.image.flat.small.validation.tar.gz" + ], + "md5": "b5363a9a7ecd0f071fb8e0319168ccf8", + "size compressed (bytes)": 4902534, + "total_terms": 308646, + "documents": 16126, + "unique_terms": 48666, + "downloaded": False + }, + "atomic_image_v0.2_base": { + "description": "Lucene index for AToMiC Images v0.2 base setting on validation set (Lucene 9)", + "filename": "lucene-index.atomic.image.flat.base.tar.gz", + "readme": "lucene-index.atomic.20230525.a7df7f.README.md", + "urls": [ + "https://huggingface.co/spaces/dlrudwo1269/AToMiC_bm25_files/resolve/main/prebuilt_indexes/lucene-index.atomic.image.flat.base.tar.gz" + ], + "md5": "55e88e334165b7147092ee67dfa74955", + "size compressed (bytes)": 1218292466, + "total_terms": 100743397, + "documents": 3410779, + "unique_terms": -1, + "downloaded": False + }, 
+ "atomic_image_v0.2_large": { + "description": "Lucene index for AToMiC Images v0.2 large setting on validation set (Lucene 9)", + "filename": "lucene-index.atomic.image.flat.large.tar.gz", + "readme": "lucene-index.atomic.20230525.a7df7f.README.md", + "urls": [ + "https://huggingface.co/spaces/dlrudwo1269/AToMiC_bm25_files/resolve/main/prebuilt_indexes/lucene-index.atomic.image.flat.large.tar.gz" + ], + "md5": "919c3f870968ffbe24f30407ad1385f8", + "size compressed (bytes)": 1341866370, + "total_terms": 108550562, + "documents": 3803656, + "unique_terms": -1, + "downloaded": False + }, +} + +TF_INDEX_INFO = {**TF_INDEX_INFO_MSMARCO, + **TF_INDEX_INFO_BEIR, + **TF_INDEX_INFO_MRTYDI, + **TF_INDEX_INFO_MIRACL, + **TF_INDEX_INFO_CIRAL, + **TF_INDEX_INFO_OTHER} + +IMPACT_INDEX_INFO_MSMARCO = { + "msmarco-v1-passage-slimr": { + "description": "Lucene impact index of the MS MARCO V1 passage corpus enoded by SLIM trained with BM25 negatives. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-slimr.20230220.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-slimr.20230220.md", + "urls": [ + "https://vault.cs.uwaterloo.ca/s/EptAojzmCxz7mYM/download", + ], + "md5": "79e566fee4f376096e12a33cf67c8012", + "size compressed (bytes)": 1942207690, + "total_terms": 100694232684, + "documents": 8841823, + "unique_terms": 28121, + "downloaded": False + }, + "msmarco-v1-passage-slimr-pp": { + "description": "Lucene impact index of the MS MARCO V1 passage corpus enoded by SLIM trained with cross-encoder distillation and hardnegative mining. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-slimr-pp.20230220.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-slimr-pp.20230220.md", + "urls": [ + "https://vault.cs.uwaterloo.ca/s/22Gjmnp5EP2HpqR/download", + ], + "md5": "17b2edd909bcda4980a93fb0ab87e72b", + "size compressed (bytes)": 2164253966, + "total_terms": 104421954301, + "documents": 8841823, + "unique_terms": 27766, + "downloaded": False + }, + "msmarco-v1-passage-unicoil": { + "description": "Lucene impact index of the MS MARCO V1 passage corpus for uniCOIL. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-unicoil.20221005.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-unicoil.20221005.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-unicoil.20221005.252b5e.tar.gz", + ], + "md5": "29521fa94165e87caaaddcb5b0d37b13", + "size compressed (bytes)": 1161034003, + "total_terms": 44495093768, + "documents": 8841823, + "unique_terms": 27678, + "downloaded": False + }, + "msmarco-v1-passage-unicoil-noexp": { + "description": "Lucene impact index of the MS MARCO V1 passage corpus for uniCOIL (noexp). (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-unicoil-noexp.20221005.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-unicoil-noexp.20221005.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-unicoil-noexp.20221005.252b5e.tar.gz", + ], + "md5": "dcb6506e0b8bb1d41863ea9cbaa057cf", + "size compressed (bytes)": 873512626, + "total_terms": 26468530021, + "documents": 8841823, + "unique_terms": 27647, + "downloaded": False + }, + "msmarco-v1-passage-deepimpact": { + "description": "Lucene impact index of the MS MARCO passage corpus encoded by DeepImpact. 
(Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-deepimpact.20221005.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-deepimpact.20221005.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-deepimpact.20221005.252b5e.tar.gz", + ], + "md5": "e1cd5bd86ae5b35912991a6c8c448bb0", + "size compressed (bytes)": 1242661484, + "total_terms": 35455908214, + "documents": 8841823, + "unique_terms": 3514102, + "downloaded": False + }, + "msmarco-v1-passage-unicoil-tilde": { + "description": "Lucene impact index of the MS MARCO passage corpus encoded by uniCOIL-TILDE. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-unicoil-tilde.20221005.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-unicoil-tilde.20221005.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-unicoil-tilde.20221005.252b5e.tar.gz", + ], + "md5": "b732c58113ec39b197083dee3e702932", + "size compressed (bytes)": 1871922326, + "total_terms": 73040108576, + "documents": 8841823, + "unique_terms": 27646, + "downloaded": False + }, + "msmarco-v1-passage-distill-splade-max": { + "description": "Lucene impact index of the MS MARCO passage corpus encoded by distill-splade-max. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-distill-splade-max.20221005.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-distill-splade-max.20221005.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-distill-splade-max.20221005.252b5e.tar.gz" + ], + "md5": "7d8b56b348685b9c3e29e306803c61eb", + "size compressed (bytes)": 3822892457, + "total_terms": 95445422483, + "documents": 8841823, + "unique_terms": 28131, + "downloaded": False + }, + + "msmarco-v1-passage-splade-pp-ed": { + "description": "Lucene impact index of the MS MARCO passage corpus encoded by SPLADE++ CoCondenser-EnsembleDistil. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-splade-pp-ed.20230524.a59610.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-splade-pp.20230524.a59610.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-splade-pp-ed.20230524.a59610.tar.gz" + ], + "md5": "4b3c969033cbd017306df42ce134c395", + "size compressed (bytes)": 2102229906, + "total_terms": 52376261130, + "documents": 8841823, + "unique_terms": 28679, + "downloaded": False + }, + "msmarco-v1-passage-splade-pp-ed-docvectors": { + "description": "Lucene impact index (with docvectors) of the MS MARCO passage corpus encoded by SPLADE++ CoCondenser-EnsembleDistil. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-splade-pp-ed-docvectors.20230524.a59610.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-splade-pp.20230524.a59610.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-splade-pp-ed-docvectors.20230524.a59610.tar.gz" + ], + "md5": "270301ea1413c38cc83cf682c7787b49", + "size compressed (bytes)": 13052697908, + "total_terms": 52376261130, + "documents": 8841823, + "unique_terms": 28679, + "downloaded": False + }, + "msmarco-v1-passage-splade-pp-ed-text": { + "description": "Lucene impact index (with text) of the MS MARCO passage corpus encoded by SPLADE++ CoCondenser-EnsembleDistil. 
(Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-splade-pp-ed-text.20230524.a59610.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-splade-pp.20230524.a59610.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-splade-pp-ed-text.20230524.a59610.tar.gz" + ], + "md5": "151e9b1b345197cd4a0edbf7127f3deb", + "size compressed (bytes)": 9983469862, + "total_terms": 52376261130, + "documents": 8841823, + "unique_terms": 28679, + "downloaded": False + }, + "msmarco-v1-passage-splade-pp-sd": { + "description": "Lucene impact index of the MS MARCO passage corpus encoded by SPLADE++ CoCondenser-SelfDistil. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-splade-pp-sd.20230524.a59610.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-splade-pp.20230524.a59610.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-splade-pp-sd.20230524.a59610.tar.gz" + ], + "md5": "4e4a3969c1e9e7262b2783ad192086ae", + "size compressed (bytes)": 2367261002, + "total_terms": 55456660129, + "documents": 8841823, + "unique_terms": 28662, + "downloaded": False + }, + "msmarco-v1-passage-splade-pp-sd-docvectors": { + "description": "Lucene impact index (with docvectors) of the MS MARCO passage corpus encoded by SPLADE++ CoCondenser-SelfDistil. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-splade-pp-sd-docvectors.20230524.a59610.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-splade-pp.20230524.a59610.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-splade-pp-sd-docvectors.20230524.a59610.tar.gz" + ], + "md5": "8d75aecc95e63853c832916da62e97f9", + "size compressed (bytes)": 14829233820, + "total_terms": 55456660129, + "documents": 8841823, + "unique_terms": 28662, + "downloaded": False + }, + "msmarco-v1-passage-splade-pp-sd-text": { + "description": "Lucene impact index (with text) of the MS MARCO passage corpus encoded by SPLADE++ CoCondenser-SelfDistil. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-passage-splade-pp-sd-text.20230524.a59610.tar.gz", + "readme": "lucene-index.msmarco-v1-passage-splade-pp.20230524.a59610.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-splade-pp-sd-text.20230524.a59610.tar.gz" + ], + "md5": "1d90dc2803a6fea55a4d16da7623e2ed", + "size compressed (bytes)": 11473065718, + "total_terms": 55456660129, + "documents": 8841823, + "unique_terms": 28662, + "downloaded": False + }, + + "msmarco-v1-doc-segmented-unicoil": { + "description": "Lucene impact index of the MS MARCO V1 segmented document corpus for uniCOIL, with title/segment encoding. (Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc-segmented-unicoil.20221005.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc-segmented-unicoil.20221005.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc-segmented-unicoil.20221005.252b5e.tar.gz", + ], + "md5": "06e087b8575f3d49177abfcfaf4bba1c", + "size compressed (bytes)": 5765257637, + "total_terms": 214505277898, + "documents": 20545677, + "unique_terms": 29142, + "downloaded": False + }, + "msmarco-v1-doc-segmented-unicoil-noexp": { + "description": "Lucene impact index of the MS MARCO V1 segmented document corpus for uniCOIL (noexp), with title/segment encoding. 
(Lucene 9)", + "filename": "lucene-index.msmarco-v1-doc-segmented-unicoil-noexp.20221005.252b5e.tar.gz", + "readme": "lucene-index.msmarco-v1-doc-segmented-unicoil-noexp.20221005.252b5e.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-doc-segmented-unicoil-noexp.20221005.252b5e.tar.gz", + ], + "md5": "f2bb0e6e9e0ea4baa6072f6f842623d8", + "size compressed (bytes)": 5323380960, + "total_terms": 152323732876, + "documents": 20545677, + "unique_terms": 29142, + "downloaded": False + }, + + "msmarco-v2-passage-unicoil-0shot": { + "description": "Lucene impact index of the MS MARCO V2 passage corpus for uniCOIL. (Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage-unicoil-0shot.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage-unicoil-0shot.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage-unicoil-0shot.20220808.4d6d2a.tar.gz", + ], + "md5": "9da229088995a3abfea57dd8681d16d5", + "size compressed (bytes)": 21736933361, + "total_terms": 775253560148, + "documents": 138364198, + "unique_terms": 29149, + "downloaded": False + }, + "msmarco-v2-passage-unicoil-noexp-0shot": { + "description": "Lucene impact index of the MS MARCO V2 passage corpus for uniCOIL (noexp). (Lucene 9)", + "filename": "lucene-index.msmarco-v2-passage-unicoil-noexp-0shot.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-passage-unicoil-noexp-0shot.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot.20220808.4d6d2a.tar.gz", + ], + "md5": "dda9de84072d2162e8649a040153942e", + "size compressed (bytes)": 14347302774, + "total_terms": 411330032512, + "documents": 138364198, + "unique_terms": 29148, + "downloaded": False + }, + + "msmarco-v2-doc-segmented-unicoil-0shot": { + "description": "Lucene impact index of the MS MARCO V2 segmented document corpus for uniCOIL, with title prepended. (Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc-segmented-unicoil-0shot.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc-segmented-unicoil-0shot.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot.20220808.4d6d2a.tar.gz" + ], + "md5": "cc98b13869c78ad3ef069d3a1c4ebaf4", + "size compressed (bytes)": 33573641204, + "total_terms": 1204542769110, + "documents": 124131414, + "unique_terms": 29168, + "downloaded": False + }, + "msmarco-v2-doc-segmented-unicoil-noexp-0shot": { + "description": "Lucene impact index of the MS MARCO V2 segmented document corpus for uniCOIL (noexp) with title prepended. 
(Lucene 9)", + "filename": "lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220808.4d6d2a.tar.gz", + "readme": "lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220808.4d6d2a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220808.4d6d2a.tar.gz" + ], + "md5": "e70c3bf0016407bf20cfe35fb0d277e0", + "size compressed (bytes)": 29059155839, + "total_terms": 820664704261, + "documents": 124131404, + "unique_terms": 29172, + "downloaded": False + } +} + +IMPACT_INDEX_INFO_BEIR = { + # BEIR (v1.0.0) impact indexes encoded by SPLADE-distill CoCodenser-medium + "beir-v1.0.0-trec-covid-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): TREC-COVID encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-trec-covid-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-trec-covid-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "0f91fb01fec4b1c590fe683ad2383339", + "size compressed (bytes)": 55889585, + "total_terms": 1697942549, + "documents": 171332, + "unique_terms": 26611, + "downloaded": False + }, + "beir-v1.0.0-bioasq-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): BioASQ encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-bioasq-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-bioasq-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "a0317f26b1fab3bca71b46e0a4eff816", + "size compressed (bytes)": 5396189427, + "total_terms": 181960155708, + "documents": 14914603, + "unique_terms": 27703, + "downloaded": False + }, + "beir-v1.0.0-nfcorpus-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): NFCorpus encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-nfcorpus-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-nfcorpus-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "9c6f3ecfa6186c3ab5125f5c3d4eb962", + "size compressed (bytes)": 1439110, + "total_terms": 41582222, + "documents": 3633, + "unique_terms": 16295, + "downloaded": False + }, + "beir-v1.0.0-nq-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): NQ encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-nq-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-nq-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "9d418f806b8304a075945afa80bfcc22", + "size compressed (bytes)": 833470407, + "total_terms": 21901570532, + "documents": 2681468, + "unique_terms": 28747, + "downloaded": False + }, + 
"beir-v1.0.0-hotpotqa-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): HotpotQA encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-hotpotqa-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-hotpotqa-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "e96767f3d97cba5104dfd76eafdb35b7", + "size compressed (bytes)": 1173403732, + "total_terms": 32565190895, + "documents": 5233329, + "unique_terms": 28724, + "downloaded": False + }, + "beir-v1.0.0-fiqa-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): FiQA-2018 encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-fiqa-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-fiqa-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "937f0112a77a81879d6e42431d7fd522", + "size compressed (bytes)": 19624314, + "total_terms": 487502241, + "documents": 57638, + "unique_terms": 26244, + "downloaded": False + }, + "beir-v1.0.0-signal1m-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): Signal-1M encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-signal1m-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-signal1m-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "ac79812f60bcd597d351174a58fb085c", + "size compressed (bytes)": 602427178, + "total_terms": 13103073741, + "documents": 2866316, + "unique_terms": 28130, + "downloaded": False + }, + "beir-v1.0.0-trec-news-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): TREC-NEWS encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-trec-news-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-trec-news-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "d24ca30cb52510d193f9361e7f6996b7", + "size compressed (bytes)": 270800660, + "total_terms": 7519025445, + "documents": 594977, + "unique_terms": 27745, + "downloaded": False + }, + "beir-v1.0.0-robust04-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): Robust04 encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-robust04-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-robust04-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "24e6310cd04a73604a8b467e582d153f", + "size compressed (bytes)": 213476457, + "total_terms": 6718533167, + "documents": 528155, + "unique_terms": 
27623, + "downloaded": False + }, + "beir-v1.0.0-arguana-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): ArguAna encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-arguana-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-arguana-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "d008e420e5be96ab7e9d40bafc3183ce", + "size compressed (bytes)": 3816904, + "total_terms": 96421121, + "documents": 8674, + "unique_terms": 22536, + "downloaded": False + }, + "beir-v1.0.0-webis-touche2020-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): Webis-Touche2020 encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-webis-touche2020-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-webis-touche2020-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "e05433f5cd3113b50b5fe166e18975d4", + "size compressed (bytes)": 124322238, + "total_terms": 3229042324, + "documents": 382545, + "unique_terms": 27742, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-android-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-android encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-android-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-android-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "753c02411a6391e5d45ba39fdc30a535", + "size compressed (bytes)": 5995405, + "total_terms": 157949889, + "documents": 22998, + "unique_terms": 18891, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-english-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-english encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-english-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-english-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "f377274f93d9f6426034fdd78457f5ee", + "size compressed (bytes)": 9857825, + "total_terms": 218761119, + "documents": 40221, + "unique_terms": 26613, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-gaming-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-gaming encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-gaming-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-gaming-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "a8549ec6b7af25fe4a60fd7f4827afbd", + "size compressed (bytes)": 12976249, + "total_terms": 296073202, + "documents": 45301, + "unique_terms": 24564, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-gis-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-gis encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-gis-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-gis-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "26341f18a352668986bc8cf82006dc38", + "size compressed (bytes)": 10250646, + "total_terms": 296967034, + "documents": 37637, + "unique_terms": 22034, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-mathematica-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-mathematica encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-mathematica-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-mathematica-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "799a7c044cc774b29e55de4a8c0a813b", + "size compressed (bytes)": 4771584, + "total_terms": 132796971, + "documents": 16705, + "unique_terms": 19765, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-physics-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-physics encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-physics-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-physics-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "75ed5bb0217ba4f1c957bc25109f2823", + "size compressed (bytes)": 10887180, + "total_terms": 284896455, + "documents": 38316, + "unique_terms": 22985, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-programmers-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-programmers encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-programmers-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-programmers-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "42e2da2036a3e1d5780c90cda8c2193e", + "size compressed (bytes)": 10036425, + "total_terms": 258856106, + "documents": 32176, + "unique_terms": 22560, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-stats-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-stats encoded by 
SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-stats-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-stats-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "50043a036497ea6533fd2ce62f151370", + "size compressed (bytes)": 11867711, + "total_terms": 333590386, + "documents": 42269, + "unique_terms": 23322, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-tex-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-tex encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-tex-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-tex-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "83026f984c1007c656f15d7c01cf5da0", + "size compressed (bytes)": 19613041, + "total_terms": 604604076, + "documents": 68184, + "unique_terms": 24669, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-unix-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-unix encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-unix-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-unix-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "5bb2b4999e8769aca00c7dff2baaf297", + "size compressed (bytes)": 12705584, + "total_terms": 369576280, + "documents": 47382, + "unique_terms": 21712, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-webmasters-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-webmasters encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-webmasters-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-webmasters-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "bb2b4227659f8f32e1fcd4d7dee6065c", + "size compressed (bytes)": 4987493, + "total_terms": 127823828, + "documents": 17405, + "unique_terms": 20286, + "downloaded": False + }, + "beir-v1.0.0-cqadupstack-wordpress-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): CQADupStack-wordpress encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-cqadupstack-wordpress-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-cqadupstack-wordpress-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "2acbaa7b2a0f8699e54fdee2efb2d376", + "size compressed (bytes)": 12583602, + "total_terms": 362488001, 
+ "documents": 48605, + "unique_terms": 21867, + "downloaded": False + }, + "beir-v1.0.0-quora-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): Quora encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-quora-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-quora-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "6358d683527284ecf4c1dbb6ad008a0f", + "size compressed (bytes)": 51880975, + "total_terms": 1322737004, + "documents": 522931, + "unique_terms": 27042, + "downloaded": False + }, + "beir-v1.0.0-dbpedia-entity-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): DBPedia encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-dbpedia-entity-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-dbpedia-entity-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "9cb05766611bea863a96818219657c78", + "size compressed (bytes)": 1225612002, + "total_terms": 30490098411, + "documents": 4635922, + "unique_terms": 28709, + "downloaded": False + }, + "beir-v1.0.0-scidocs-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): SCIDOCS encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-scidocs-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-scidocs-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "38d2a4bbabf9b6b1cd627ce81660e07d", + "size compressed (bytes)": 11252695, + "total_terms": 273175826, + "documents": 25657, + "unique_terms": 24241, + "downloaded": False + }, + "beir-v1.0.0-fever-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): FEVER encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-fever-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-fever-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "cc71baa5838edd4e7cd288ca26488532", + "size compressed (bytes)": 1497554696, + "total_terms": 38844967407, + "documents": 5416568, + "unique_terms": 28670, + "downloaded": False + }, + "beir-v1.0.0-climate-fever-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): Climate-FEVER encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-climate-fever-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-climate-fever-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "1479d75cd9496a7d57064b86f6ee67ef", + "size compressed 
(bytes)": 1497450545, + "total_terms": 38845226073, + "documents": 5416593, + "unique_terms": 28670, + "downloaded": False + }, + "beir-v1.0.0-scifact-splade_distil_cocodenser_medium": { + "description": "Lucene impact index of BEIR (v1.0.0): SciFact encoded by SPLADE-distill CoCodenser-medium", + "filename": "lucene-index.beir-v1.0.0-scifact-splade_distil_cocodenser_medium.20221116.505594.tar.gz", + "readme": "lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-scifact-splade_distil_cocodenser_medium.20221116.505594.tar.gz" + ], + "md5": "367db6c4a466d442ba089a38dad9fc6e", + "size compressed (bytes)": 2173167, + "total_terms": 65836037, + "documents": 5183, + "unique_terms": 17486, + "downloaded": False + } +} + +IMPACT_INDEX_INFO = {**IMPACT_INDEX_INFO_MSMARCO, + **IMPACT_INDEX_INFO_BEIR} + +FAISS_INDEX_INFO_MSMARCO = { + # Aggretriever indexes + "msmarco-v1-passage.aggretriever-cocondenser": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by aggretriever-cocondenser encoder.", + "filename": "faiss.msmarco-v1-passage.aggretriever-cocondenser.20230407.f627ef.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.aggretriever-cocondenser.20230407.f627ef.tar.gz" + ], + "md5": "58da608d5b31b28001b3aa1cf33479f6", + "size compressed (bytes)": 26053474943, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + "msmarco-v1-passage.aggretriever-distilbert": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by aggretriever-distilbert encoder.", + "filename": "faiss.msmarco-v1-passage.aggretriever-distilbert.20230407.f627ef.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.aggretriever-distilbert.20230407.f627ef.tar.gz" + ], + "md5": "ed1492be0ce7539aacd5db5028404989", + "size compressed (bytes)": 25963140666, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + + "msmarco-v1-passage.ance": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by the ANCE MS MARCO passage encoder", + "filename": "faiss.msmarco-v1-passage.ance.20210224.060cef.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.ance.20210224.060cef.tar.gz" + ], + "md5": "c4e485efd0802031783d6fe487125446", + "size compressed (bytes)": 25102344836, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + "msmarco-v1-passage.distilbert-dot-margin-mse-t2": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by the distilbert-dot-margin_mse-T2-msmarco encoder", + "filename": "faiss.msmarco-v1-passage.distilbert-dot-margin_mse-t2.20210316.d44c3a.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.distilbert-dot-margin_mse-t2.20210316.d44c3a.tar.gz" + ], + "md5": "66dcbf3331f270673e3b9702a6ad3540", + "size compressed (bytes)": 25162771693, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + "msmarco-v1-passage.distilbert-dot-tas_b-b256": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by distilbert-dot-tas_b-b256-msmarco encoder", + "filename": "faiss.msmarco-v1-passage.distilbert-dot-tas_b-b256.20210527.63276f.tar.gz", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.distilbert-dot-tas_b-b256.20210527.63276f.tar.gz" + ], + "md5": "4e64a643fc051bc9506a3a554e9394e7", + "size compressed (bytes)": 25162329414, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + "msmarco-v1-passage.sbert": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by the SBERT MS MARCO passage encoder", + "filename": "faiss.msmarco-v1-passage.sbert.20210313.a0fbb3.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.sbert.20210313.a0fbb3.tar.gz" + ], + "md5": "d5d9eb67fa9da8b77a219ac77a5a3d3e", + "size compressed (bytes)": 25214193092, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + "msmarco-v1-passage.tct_colbert": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by TCT-ColBERT", + "filename": "faiss.msmarco-v1-passage.tct_colbert.20210112.be7119.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.tct_colbert.20210112.be7119.tar.gz" + ], + "md5": "2dea6e8697b220719139027c7ee2aff0", + "size compressed (bytes)": 25204501822, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + "msmarco-v1-passage.tct_colbert.hnsw": { + "description": "Faiss HNSW index of the MS MARCO passage corpus encoded by TCT-ColBERT", + "filename": "hnsw-faiss.msmarco-v1-passage.tct_colbert.20210112.be7119.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/hnsw-faiss.msmarco-v1-passage.tct_colbert.20210112.be7119.tar.gz" + ], + "md5": "24acb6e6ba0ac1f5c6b73bd3e6d6477f", + "size compressed (bytes)": 33359120779, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + "msmarco-v1-passage.tct_colbert-v2": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by the tct_colbert-v2 passage encoder", + "filename": "faiss.msmarco-v1-passage.tct_colbert-v2.20210608.5f341b.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.tct_colbert-v2.20210608.5f341b.tar.gz" + ], + "md5": "805bb253828a59af1899a8cc42e0f766", + "size compressed (bytes)": 25211079468, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + "msmarco-v1-passage.tct_colbert-v2-hn": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by the tct_colbert-v2-hn passage encoder", + "filename": "faiss.msmarco-v1-passage.tct_colbert-v2-hn.20210608.5f341b.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.tct_colbert-v2-hn.20210608.5f341b.tar.gz" + ], + "md5": "569f0ee9d45586b547d84fcd240e5cee", + "size compressed (bytes)": 25205730053, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + "msmarco-v1-passage.tct_colbert-v2-hnp": { + "description": "Faiss FlatIP index of the MS MARCO passage corpus encoded by the tct_colbert-v2-hnp passage encoder", + "filename": "faiss.msmarco-v1-passage.tct_colbert-v2-hnp.20210608.5f341b.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.tct_colbert-v2-hnp.20210608.5f341b.tar.gz" + ], + "md5": "53bcaa78ab0ca629f3379b8aa00eb3ae", + "size compressed (bytes)": 25225526436, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + "msmarco-v1-passage.openai-ada2": { + "description": "Faiss FlatIP index of the MS 
MARCO document corpus encoded by TCT-ColBERT-V2-HNP", + "filename": "faiss.msmarco-v1-passage.openai-ada2.20230530.e3a58f.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-passage.openai-ada2.20230530.e3a58f.tar.gz" + ], + "md5": "14725ced21bdcd0c9866aab1cfe8f2e0", + "size compressed (bytes)": 45649935573, + "documents": 8841823, + "downloaded": False, + "texts": "msmarco-v1-passage" + }, + + "msmarco-v1-doc.ance-maxp": { + "description": "Faiss FlatIP index of the MS MARCO document corpus encoded by the ANCE MaxP encoder", + "filename": "faiss.msmarco-v1-doc.ance_maxp.20210304.b2a1b0.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-doc.ance_maxp.20210304.b2a1b0.tar.gz" + ], + "md5": "6b484b0c04872f22ae903ed7d7ba1327", + "size compressed (bytes)": 58312805253, + "documents": 20544550, + "downloaded": False, + "texts": "msmarco-v1-doc" + }, + "msmarco-v1-doc.tct_colbert": { + "description": "Faiss FlatIP index of the MS MARCO document corpus encoded by TCT-ColBERT", + "filename": "faiss.msmarco-v1-doc.tct_colbert.20210112.be7119.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-doc.tct_colbert.20210112.be7119.tar.gz" + ], + "md5": "4e97e1d6990ba5d4b93b7798c3036edc", + "size compressed (bytes)": 58514325945, + "documents": 20544550, + "downloaded": False, + "texts": "smarco-v1-doc" + }, + "msmarco-v1-doc-segmented.tct_colbert-v2-hnp": { + "description": "Faiss FlatIP index of the MS MARCO document corpus encoded by TCT-ColBERT-V2-HNP", + "filename": "faiss.msmarco-v1-doc-segmented.tct_colbert-v2-hnp.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.msmarco-v1-doc-segmented.tct_colbert-v2-hnp.tar.gz" + ], + "md5": "1dab64255822d2fd4dff8c0807319d0a", + "size compressed (bytes)": 58586765413, + "documents": 20544550, + "downloaded": False, + "texts": "msmarco-v1-doc-segmented" + } +} + +FAISS_INDEX_INFO_BEIR = { + # BEIR (v1.0.0) contriever indexes + "beir-v1.0.0-trec-covid.contriever": { + "description": "Faiss index for BEIR v1.0.0 (TREC-COVID) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-trec-covid.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-trec-covid.contriever.20230124.tar.gz" + ], + "md5": "5b5baf557979e30e943180627fe31340", + "size compressed (bytes)": 488100317, + "documents": 171332, + "downloaded": False, + "texts": "beir-v1.0.0-trec-covid.flat" + }, + "beir-v1.0.0-bioasq.contriever": { + "description": "Faiss index for BEIR v1.0.0 (BioASQ) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-bioasq.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-bioasq.contriever.20230124.tar.gz" + ], + "md5": "c0cbca535d38c1f1f78ff1bd6d91af5d", + "size compressed (bytes)": 42417202460, + "documents": 14914603, + "downloaded": False, + "texts": "beir-v1.0.0-bioasq.flat" + }, + "beir-v1.0.0-nfcorpus.contriever": { + "description": "Faiss index for BEIR v1.0.0 (NFCorpus) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-nfcorpus.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-nfcorpus.contriever.20230124.tar.gz" + ], + "md5": 
"5eff0107f7953ebe7658c3a6400e7027", + "size compressed (bytes)": 10322409, + "documents": 3633, + "downloaded": False, + "texts": "beir-v1.0.0-nfcorpus.flat" + }, + "beir-v1.0.0-nq.contriever": { + "description": "Faiss index for BEIR v1.0.0 (NQ) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-nq.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-nq.contriever.20230124.tar.gz" + ], + "md5": "e1825fe0ce5c8000b63b1499374adb0e", + "size compressed (bytes)": 7617697503, + "documents": 2681468, + "downloaded": False, + "texts": "beir-v1.0.0-nq.flat" + }, + "beir-v1.0.0-hotpotqa.contriever": { + "description": "Faiss index for BEIR v1.0.0 (HotpotQA) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-hotpotqa.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-hotpotqa.contriever.20230124.tar.gz" + ], + "md5": "51445960e00a18264ae3947b3af2bc80", + "size compressed (bytes)": 14874721901, + "documents": 5233329, + "downloaded": False, + "texts": "beir-v1.0.0-hotpotqa.flat" + }, + "beir-v1.0.0-fiqa.contriever": { + "description": "Faiss index for BEIR v1.0.0 (FiQA-2018) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-fiqa.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-fiqa.contriever.20230124.tar.gz" + ], + "md5": "a03cc30459b1a1928b93ad1aa51a7849", + "size compressed (bytes)": 164024764, + "documents": 57638, + "downloaded": False, + "texts": "beir-v1.0.0-fiqa.flat" + }, + "beir-v1.0.0-signal1m.contriever": { + "description": "Faiss index for BEIR v1.0.0 (Signal-1M) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-signal1m.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-signal1m.contriever.20230124.tar.gz" + ], + "md5": "19e3e324b7b87e55fb9f6b6b1e72c464", + "size compressed (bytes)": 8142533760, + "documents": 2866316, + "downloaded": False, + "texts": "beir-v1.0.0-signal1m.flat" + }, + "beir-v1.0.0-trec-news.contriever": { + "description": "Faiss index for BEIR v1.0.0 (TREC-NEWS) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-trec-news.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-trec-news.contriever.20230124.tar.gz" + ], + "md5": "20db6299b57b3e78ea2f8b7a2b649770", + "size compressed (bytes)": 1629958623, + "documents": 594977, + "downloaded": False, + "texts": "beir-v1.0.0-trec-news.flat" + }, + "beir-v1.0.0-robust04.contriever": { + "description": "Faiss index for BEIR v1.0.0 (Robust04) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-robust04.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-robust04.contriever.20230124.tar.gz" + ], + "md5": "81c730b68e066baf18d5b46918b8c830", + "size compressed (bytes)": 1501110333, + "documents": 528155, + "downloaded": False, + "texts": "beir-v1.0.0-robust04.flat" + }, + 
"beir-v1.0.0-arguana.contriever": { + "description": "Faiss index for BEIR v1.0.0 (ArguAna) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-arguana.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-arguana.contriever.20230124.tar.gz" + ], + "md5": "03f701916d49dd86b9c8989796d2dcc4", + "size compressed (bytes)": 24710561, + "documents": 8674, + "downloaded": False, + "texts": "beir-v1.0.0-arguana.flat" + }, + "beir-v1.0.0-webis-touche2020.contriever": { + "description": "Faiss index for BEIR v1.0.0 (Webis-Touche2020) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-webis-touche2020.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-webis-touche2020.contriever.20230124.tar.gz" + ], + "md5": "dfff9bc58521f09542f0affa3069f9a7", + "size compressed (bytes)": 1091320704, + "documents": 382545, + "downloaded": False, + "texts": "beir-v1.0.0-webis-touche2020.flat" + }, + "beir-v1.0.0-cqadupstack-android.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-android) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-android.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-android.contriever.20230124.tar.gz" + ], + "md5": "4f03c0238f0e8f77e6365b61108042ed", + "size compressed (bytes)": 65447231, + "documents": 22998, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-android.flat" + }, + "beir-v1.0.0-cqadupstack-english.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-english) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-english.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-english.contriever.20230124.tar.gz" + ], + "md5": "319e3cba8f5f5d5175aad92c99c4b0fd", + "size compressed (bytes)": 114460495, + "documents": 40221, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-english.flat" + }, + "beir-v1.0.0-cqadupstack-gaming.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-gaming) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-gaming.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-gaming.contriever.20230124.tar.gz" + ], + "md5": "049f2cb22adfb5803a5f7f762f578bce", + "size compressed (bytes)": 128906099, + "documents": 45301, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-gaming.flat" + }, + "beir-v1.0.0-cqadupstack-gis.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-gis) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-gis.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-gis.contriever.20230124.tar.gz" + ], + "md5": "13fdfa5a13634c10c1e7e6179bb4c376", + "size compressed (bytes)": 107128974, + "documents": 37637, + "downloaded": 
False, + "texts": "beir-v1.0.0-cqadupstack-gis.flat" + }, + "beir-v1.0.0-cqadupstack-mathematica.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-mathematica) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-mathematica.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-mathematica.contriever.20230124.tar.gz" + ], + "md5": "e4f756eede3ae5f9228d32096c1bd5b4", + "size compressed (bytes)": 47544559, + "documents": 16705, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-mathematica.flat" + }, + "beir-v1.0.0-cqadupstack-physics.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-physics) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-physics.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-physics.contriever.20230124.tar.gz" + ], + "md5": "b92ec0c233a1112d6f8782fb0f2bc9c1", + "size compressed (bytes)": 109048286, + "documents": 38316, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-physics.flat" + }, + "beir-v1.0.0-cqadupstack-programmers.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-programmers) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-programmers.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-programmers.contriever.20230124.tar.gz" + ], + "md5": "f180240f35e2a3c27d39361a20533205", + "size compressed (bytes)": 91583135, + "documents": 32176, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-programmers.flat" + }, + "beir-v1.0.0-cqadupstack-stats.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-stats) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-stats.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-stats.contriever.20230124.tar.gz" + ], + "md5": "64737df62b4e03b93356ba234cefe0e6", + "size compressed (bytes)": 120288620, + "documents": 42269, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-stats.flat" + }, + "beir-v1.0.0-cqadupstack-tex.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-tex) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-tex.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-tex.contriever.20230124.tar.gz" + ], + "md5": "ef087faff49e5bae0799e8576e387c0d", + "size compressed (bytes)": 194080724, + "documents": 68184, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-tex.flat" + }, + "beir-v1.0.0-cqadupstack-unix.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-unix) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-unix.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-unix.contriever.20230124.tar.gz" + ], + "md5": "9279884bfc3a14c2896276b679a58dbf", + "size compressed (bytes)": 134860159, + "documents": 47382, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-unix.flat" + }, + "beir-v1.0.0-cqadupstack-webmasters.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-webmasters) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-webmasters.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-webmasters.contriever.20230124.tar.gz" + ], + "md5": "f1a46fc6f6586c716d2a6239753c9573", + "size compressed (bytes)": 49531545, + "documents": 17405, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-webmasters.flat" + }, + "beir-v1.0.0-cqadupstack-wordpress.contriever": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-wordpress) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-cqadupstack-wordpress.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-wordpress.contriever.20230124.tar.gz" + ], + "md5": "27480c7a4c8d437af30618bf98b10969", + "size compressed (bytes)": 138348184, + "documents": 48605, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-wordpress.flat" + }, + "beir-v1.0.0-quora.contriever": { + "description": "Faiss index for BEIR v1.0.0 (Quora) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-quora.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-quora.contriever.20230124.tar.gz" + ], + "md5": "4876145908b7af946593df6dbb8af600", + "size compressed (bytes)": 1485866217, + "documents": 522931, + "downloaded": False, + "texts": "beir-v1.0.0-quora.flat" + }, + "beir-v1.0.0-dbpedia-entity.contriever": { + "description": "Faiss index for BEIR v1.0.0 (DBPedia) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-dbpedia-entity.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-dbpedia-entity.contriever.20230124.tar.gz" + ], + "md5": "ee88a23de31d3faf403673c08ea0c844", + "size compressed (bytes)": 13214316305, + "documents": 4635922, + "downloaded": False, + "texts": "beir-v1.0.0-dbpedia-entity.flat" + }, + "beir-v1.0.0-scidocs.contriever": { + "description": "Faiss index for BEIR v1.0.0 (SCIDOCS) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-scidocs.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-scidocs.contriever.20230124.tar.gz" + ], + "md5": "dd1555b714c482a22cbb74d8c72599c9", + "size compressed (bytes)": 73532556, + "documents": 25657, + "downloaded": False, + "texts": "beir-v1.0.0-scidocs.flat" + }, + "beir-v1.0.0-fever.contriever": { + "description": "Faiss index for BEIR v1.0.0 (FEVER) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-fever.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-fever.contriever.20230124.tar.gz" + ], + "md5": "d5b738dc38e56857a987bdb1eb4ce5c1", + "size compressed (bytes)": 15437918827, + "documents": 5416568, + "downloaded": False, + "texts": "beir-v1.0.0-fever.flat" + }, + "beir-v1.0.0-climate-fever.contriever": { + "description": "Faiss index for BEIR v1.0.0 (Climate-FEVER) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-climate-fever.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-climate-fever.contriever.20230124.tar.gz" + ], + "md5": "1e169cf6a8baaa4909f6823e3c23a80f", + "size compressed (bytes)": 15437988868, + "documents": 5416593, + "downloaded": False, + "texts": "beir-v1.0.0-climate-fever.flat" + }, + "beir-v1.0.0-scifact.contriever": { + "description": "Faiss index for BEIR v1.0.0 (SciFact) corpus encoded by Contriever encoder.", + "filename": "faiss.beir-v1.0.0-scifact.contriever.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-scifact.contriever.20230124.tar.gz" + ], + "md5": "61eb253aa08c9c97fa2f82ef2a96ca7b", + "size compressed (bytes)": 14753553, + "documents": 5183, + "downloaded": False, + "texts": "beir-v1.0.0-scifact.flat" + }, + + # BEIR (v1.0.0) contriever ft MSMARCO indexes + "beir-v1.0.0-trec-covid.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (TREC-COVID) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-trec-covid.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-trec-covid.contriever-msmarco.20230124.tar.gz" + ], + "md5": "7dd33fbd77deba89174b6d1b2c34866c", + "size compressed (bytes)": 487986935, + "documents": 171332, + "downloaded": False, + "texts": "beir-v1.0.0-trec-covid.flat", + }, + "beir-v1.0.0-bioasq.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (BioASQ) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-bioasq.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-bioasq.contriever-msmarco.20230124.tar.gz" + ], + "md5": "e51924bb78555942f0a9465959a6f6f2", + "size compressed (bytes)": 42438279267, + "documents": 14914603, + "downloaded": False, + "texts": "beir-v1.0.0-bioasq.flat", + }, + "beir-v1.0.0-nfcorpus.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (NFCorpus) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-nfcorpus.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-nfcorpus.contriever-msmarco.20230124.tar.gz" + ], + "md5": "657649d19fafd06cb031c6b11868d7f9", + "size compressed (bytes)": 10327231, + "documents": 3633, + "downloaded": False, + "texts": "beir-v1.0.0-nfcorpus.flat", + }, + "beir-v1.0.0-nq.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (NQ) corpus encoded by Contriever encoder that has 
been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-nq.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-nq.contriever-msmarco.20230124.tar.gz" + ], + "md5": "8d7ff2e5e285b1549bb8af27a7cf6e30", + "size compressed (bytes)": 7619790303, + "documents": 2681468, + "downloaded": False, + "texts": "beir-v1.0.0-nq.flat", + }, + "beir-v1.0.0-hotpotqa.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (HotpotQA) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-hotpotqa.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-hotpotqa.contriever-msmarco.20230124.tar.gz" + ], + "md5": "bef5b2fba77859c778f121ae2f17c9f1", + "size compressed (bytes)": 14889518902, + "documents": 5233329, + "downloaded": False, + "texts": "beir-v1.0.0-hotpotqa.flat", + }, + "beir-v1.0.0-fiqa.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (FiQA-2018) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-fiqa.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-fiqa.contriever-msmarco.20230124.tar.gz" + ], + "md5": "3dd16db861dbef4da545ccbea127198a", + "size compressed (bytes)": 163998627, + "documents": 57638, + "downloaded": False, + "texts": "beir-v1.0.0-fiqa.flat", + }, + "beir-v1.0.0-signal1m.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (Signal-1M) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-signal1m.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-signal1m.contriever-msmarco.20230124.tar.gz" + ], + "md5": "c4e25dc99c27a9d1931ad129d4091da0", + "size compressed (bytes)": 8146484698, + "documents": 2866316, + "downloaded": False, + "texts": "beir-v1.0.0-signal1m.flat", + }, + "beir-v1.0.0-trec-news.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (TREC-NEWS) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-trec-news.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-trec-news.contriever-msmarco.20230124.tar.gz" + ], + "md5": "22272011f0e0dea7f66b624de196b6b3", + "size compressed (bytes)": 1629437319, + "documents": 594977, + "downloaded": False, + "texts": "beir-v1.0.0-trec-news.flat", + }, + "beir-v1.0.0-robust04.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (Robust04) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-robust04.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-robust04.contriever-msmarco.20230124.tar.gz" + ], + "md5": 
"a2a0603fae866e1e92abcdfc46de6fe5", + "size compressed (bytes)": 1501089289, + "documents": 528155, + "downloaded": False, + "texts": "beir-v1.0.0-robust04.flat", + }, + "beir-v1.0.0-arguana.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (ArguAna) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-arguana.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-arguana.contriever-msmarco.20230124.tar.gz" + ], + "md5": "dcc0408ab033433d47363f5902fbde3d", + "size compressed (bytes)": 24705859, + "documents": 8674, + "downloaded": False, + "texts": "beir-v1.0.0-arguana.flat", + }, + "beir-v1.0.0-webis-touche2020.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (Webis-Touche2020) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-webis-touche2020.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-webis-touche2020.contriever-msmarco.20230124.tar.gz" + ], + "md5": "60072a3b32855067fea0f8e21ce0d905", + "size compressed (bytes)": 1090748271, + "documents": 382545, + "downloaded": False, + "texts": "beir-v1.0.0-webis-touche2020.flat", + }, + "beir-v1.0.0-cqadupstack-android.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-android) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-android.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-android.contriever-msmarco.20230124.tar.gz" + ], + "md5": "f9b02c2410fc8ddf63e96ea6ebbd8447", + "size compressed (bytes)": 65438882, + "documents": 22998, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-android.flat", + }, + "beir-v1.0.0-cqadupstack-english.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-english) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-english.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-english.contriever-msmarco.20230124.tar.gz" + ], + "md5": "7c50f04a61a08f16dfb1d28010b4e222", + "size compressed (bytes)": 114462161, + "documents": 40221, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-english.flat", + }, + "beir-v1.0.0-cqadupstack-gaming.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-gaming) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-gaming.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-gaming.contriever-msmarco.20230124.tar.gz" + ], + "md5": "d97fafe933ae40fc12a9df0afc6a8e78", + "size compressed (bytes)": 128896840, + "documents": 45301, + "downloaded": False, + "texts": 
"beir-v1.0.0-cqadupstack-gaming.flat", + }, + "beir-v1.0.0-cqadupstack-gis.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-gis) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-gis.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-gis.contriever-msmarco.20230124.tar.gz" + ], + "md5": "f536d8feda0069a1769ad71010fab0e3", + "size compressed (bytes)": 107086862, + "documents": 37637, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-gis.flat", + }, + "beir-v1.0.0-cqadupstack-mathematica.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-mathematica) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-mathematica.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-mathematica.contriever-msmarco.20230124.tar.gz" + ], + "md5": "987fb7ac275baf344828cdda0013703d", + "size compressed (bytes)": 47526982, + "documents": 16705, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-mathematica.flat", + }, + "beir-v1.0.0-cqadupstack-physics.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-physics) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-physics.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-physics.contriever-msmarco.20230124.tar.gz" + ], + "md5": "e252b1c4dcb06d2183109dc4bc820176", + "size compressed (bytes)": 109024692, + "documents": 38316, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-physics.flat", + }, + "beir-v1.0.0-cqadupstack-programmers.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-programmers) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-programmers.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-programmers.contriever-msmarco.20230124.tar.gz" + ], + "md5": "09bd10b2b06c7b0c7611e7811958f4b3", + "size compressed (bytes)": 91567840, + "documents": 32176, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-programmers.flat", + }, + "beir-v1.0.0-cqadupstack-stats.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-stats) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-stats.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-stats.contriever-msmarco.20230124.tar.gz" + ], + "md5": "c4586c11a2bc90f9ea5a3355fc6e6c53", + "size compressed (bytes)": 120271253, + "documents": 42269, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-stats.flat", + }, + 
"beir-v1.0.0-cqadupstack-tex.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-tex) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-tex.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-tex.contriever-msmarco.20230124.tar.gz" + ], + "md5": "c3c5ec87aeb33a7320c0d61146c03fc0", + "size compressed (bytes)": 194009234, + "documents": 68184, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-tex.flat", + }, + "beir-v1.0.0-cqadupstack-unix.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-unix) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-unix.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-unix.contriever-msmarco.20230124.tar.gz" + ], + "md5": "3220f3eb0e9f0095cf13dcc8eb3ae1e0", + "size compressed (bytes)": 134821535, + "documents": 47382, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-unix.flat", + }, + "beir-v1.0.0-cqadupstack-webmasters.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-webmasters) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-webmasters.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-webmasters.contriever-msmarco.20230124.tar.gz" + ], + "md5": "f696855c02090833a6ca695f8efa3006", + "size compressed (bytes)": 49530869, + "documents": 17405, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-webmasters.flat", + }, + "beir-v1.0.0-cqadupstack-wordpress.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (CQADupStack-wordpress) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-cqadupstack-wordpress.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-cqadupstack-wordpress.contriever-msmarco.20230124.tar.gz" + ], + "md5": "e92063c046803a76010b57e0ef1ace9e", + "size compressed (bytes)": 138328541, + "documents": 48605, + "downloaded": False, + "texts": "beir-v1.0.0-cqadupstack-wordpress.flat", + }, + "beir-v1.0.0-quora.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (Quora) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-quora.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-quora.contriever-msmarco.20230124.tar.gz" + ], + "md5": "82481f11087ebf63156da1f3dda00d5e", + "size compressed (bytes)": 1487402659, + "documents": 522931, + "downloaded": False, + "texts": "beir-v1.0.0-quora.flat", + }, + "beir-v1.0.0-dbpedia-entity.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (DBPedia) corpus encoded by Contriever 
encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-dbpedia-entity.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-dbpedia-entity.contriever-msmarco.20230124.tar.gz" + ], + "md5": "5b9249745aa548776a8f22269bd55dbe", + "size compressed (bytes)": 13226846024, + "documents": 4635922, + "downloaded": False, + "texts": "beir-v1.0.0-dbpedia-entity.flat", + }, + "beir-v1.0.0-scidocs.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (SCIDOCS) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-scidocs.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-scidocs.contriever-msmarco.20230124.tar.gz" + ], + "md5": "091d751629ae22d843ce741f05f00b81", + "size compressed (bytes)": 73530332, + "documents": 25657, + "downloaded": False, + "texts": "beir-v1.0.0-scidocs.flat", + }, + "beir-v1.0.0-fever.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (FEVER) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-fever.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-fever.contriever-msmarco.20230124.tar.gz" + ], + "md5": "c1e9851e23c9f46e7210aedd613e4a1b", + "size compressed (bytes)": 15444001312, + "documents": 5416568, + "downloaded": False, + "texts": "beir-v1.0.0-fever.flat", + }, + "beir-v1.0.0-climate-fever.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (Climate-FEVER) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-climate-fever.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-climate-fever.contriever-msmarco.20230124.tar.gz" + ], + "md5": "1ec289569b80edb25d885100feba83aa", + "size compressed (bytes)": 15444073223, + "documents": 5416593, + "downloaded": False, + "texts": "beir-v1.0.0-climate-fever.flat", + }, + "beir-v1.0.0-scifact.contriever-msmarco": { + "description": "Faiss index for BEIR v1.0.0 (SciFact) corpus encoded by Contriever encoder that has been fine-tuned with MS MARCO passage.", + "filename": "faiss.beir-v1.0.0-scifact.contriever-msmarco.20230124.tar.gz", + "readme": "faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.beir-v1.0.0-scifact.contriever-msmarco.20230124.tar.gz" + ], + "md5": "e560d5de0ccb65f66853540cb6917369", + "size compressed (bytes)": 14758747, + "documents": 5183, + "downloaded": False, + "texts": "beir-v1.0.0-scifact.flat", + } +} + +FAISS_INDEX_INFO_MRTYDI = { + "mrtydi-v1.1-arabic-mdpr-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Arabic) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-arabic.20220207.5df364.tar.gz", + "readme": "faiss.mrtydi-v1.1-arabic.20220207.5df364.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-arabic.20220207.5df364.tar.gz", + 
"https://vault.cs.uwaterloo.ca/s/Jgj3rYjbyRrmJs8/download" # Note, this is Crystina's account. + ], + "md5": "de86c1ce43854bbeea4e3af5d95d6ffb", + "size compressed (bytes)": 5997943791, + "documents": 2106586, + "downloaded": False, + "texts": "mrtydi-v1.1-arabic" + }, + "mrtydi-v1.1-bengali-mdpr-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Bengali) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-bengali.20220207.5df364.tar.gz", + "readme": "faiss.mrtydi-v1.1-bengali.20220207.5df364.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-bengali.20220207.5df364.tar.gz", + "https://vault.cs.uwaterloo.ca/s/4PpkzXAQtXFFJHR/download" # Note, this is Crystina's account. + ], + "md5": "e60cb6f1f7139cf0551f0ba4e4e83bf6", + "size compressed (bytes)": 865716848, + "documents": 304059, + "downloaded": False, + "texts": "mrtydi-v1.1-bengali" + }, + "mrtydi-v1.1-english-mdpr-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (English) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-english.20220207.5df364.tar.gz", + "readme": "faiss.mrtydi-v1.1-english.20220207.5df364.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-english.20220207.5df364.tar.gz", + "https://vault.cs.uwaterloo.ca/s/A7pjbwYeoT4Krnj/download" # Note, this is Crystina's account. + ], + "md5": "a0a8cc39e8af782ec82188a18c4c97c3", + "size compressed (bytes)": 93585951488, + "documents": 32907100, + "downloaded": False, + "texts": "mrtydi-v1.1-english" + }, + "mrtydi-v1.1-finnish-mdpr-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Finnish) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-finnish.20220207.5df364.tar.gz", + "readme": "faiss.mrtydi-v1.1-finnish.20220207.5df364.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-finnish.20220207.5df364.tar.gz", + "https://vault.cs.uwaterloo.ca/s/erNYkrYzRZxpecz/download" # Note, this is Crystina's account. + ], + "md5": "3e4e18aacf07ca551b474315f267ead6", + "size compressed (bytes)": 5435516778, + "documents": 1908757, + "downloaded": False, + "texts": "mrtydi-v1.1-finnish" + }, + "mrtydi-v1.1-indonesian-mdpr-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Indonesian) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-indonesian.20220207.5df364.tar.gz", + "readme": "faiss.mrtydi-v1.1-indonesian.20220207.5df364.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-indonesian.20220207.5df364.tar.gz", + "https://vault.cs.uwaterloo.ca/s/BpR3MzT7KJ6edx7/download" # Note, this is Crystina's account. + ], + "md5": "0bf693e4046d9a565ae18b9f5939d193", + "size compressed (bytes)": 865716848, + "documents": 4179177829, + "downloaded": False, + "texts": "mrtydi-v1.1-indonesian" + }, + "mrtydi-v1.1-japanese-mdpr-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Japanese) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-japanese.20220207.5df364.tar.gz", + "readme": "faiss.mrtydi-v1.1-japanese.20220207.5df364.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-japanese.20220207.5df364.tar.gz", + "https://vault.cs.uwaterloo.ca/s/k7bptHT8GwMJpnF/download" # Note, this is Crystina's account. 
+ ], + "md5": "4ba566e27bc0158108259b18a153e2fc", + "size compressed (bytes)": 19920816424, + "documents": 7000027, + "downloaded": False, + "texts": "mrtydi-v1.1-japanese" + }, + "mrtydi-v1.1-korean-mdpr-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Korean) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-korean.20220207.5df364.tar.gz", + "readme": "faiss.mrtydi-v1.1-korean.20220207.5df364.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-korean.20220207.5df364.tar.gz", + "https://vault.cs.uwaterloo.ca/s/TigfYMde94YWAoE/download" # Note, this is Crystina's account. + ], + "md5": "44212e5722632d5bcb14f0680741638c", + "size compressed (bytes)": 4257414237, + "documents": 1496126, + "downloaded": False, + "texts": "mrtydi-v1.1-korean" + }, + "mrtydi-v1.1-russian-mdpr-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Russian) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-russian.20220207.5df364.tar.gz", + "readme": "faiss.mrtydi-v1.1-russian.20220207.5df364.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-russian.20220207.5df364.tar.gz", + "https://vault.cs.uwaterloo.ca/s/eN7demnmnspqxjk/download" # Note, this is Crystina's account. + ], + "md5": "e7634093f2a3362928e9699441ce8a3b", + "size compressed (bytes)": 27317759143, + "documents": 9597504, + "downloaded": False, + "texts": "mrtydi-v1.1-russian" + }, + "mrtydi-v1.1-swahili-mdpr-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Swahili) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-swahili.20220207.5df364.tar.gz", + "readme": "faiss.mrtydi-v1.1-swahili.20220207.5df364.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-swahili.20220207.5df364.tar.gz", + "https://vault.cs.uwaterloo.ca/s/JgiX8PRftnqcPwy/download" # Note, this is Crystina's account. + ], + "md5": "5061bdd1d81bc32490bbb3682096acdd", + "size compressed (bytes)": 389658394, + "documents": 136689, + "downloaded": False, + "texts": "mrtydi-v1.1-swahili" + }, + "mrtydi-v1.1-telugu-mdpr-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Telugu) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-telugu.20220207.5df364.tar.gz", + "readme": "faiss.mrtydi-v1.1-telugu.20220207.5df364.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-telugu.20220207.5df364.tar.gz", + "https://vault.cs.uwaterloo.ca/s/dkm6RGdgRbnwiX2/download" # Note, this is Crystina's account. + ], + "md5": "4952dacaeae89185d3757f9f26af4e88", + "size compressed (bytes)": 1561173721, + "documents": 548224, + "downloaded": False, + "texts": "mrtydi-v1.1-telugu" + }, + "mrtydi-v1.1-thai-mdpr-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Thai) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-thai.20220207.5df364.tar.gz", + "readme": "faiss.mrtydi-v1.1-thai.20220207.5df364.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-thai.20220207.5df364.tar.gz", + "https://vault.cs.uwaterloo.ca/s/fFrRYefd3nWFR3J/download" # Note, this is Crystina's account. 
+ ], + "md5": "2458f704b277fa8ffe2509b6296892a0", + "size compressed (bytes)": 1616059846, + "documents": 568855, + "downloaded": False, + "texts": "mrtydi-v1.1-thai" + }, + + "mrtydi-v1.1-arabic-mdpr-tied-pft-msmarco": { + "description": "Faiss index for Mr.TyDi v1.1 (Arabic) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.mrtydi-v1.1-arabic.20220413.aa1c0e9.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-arabic.20220413.aa1c0e9.tar.gz", + ], + "md5": "bafb6fb2c530567dec26aa4597c6ee25", + "size compressed (bytes)": 5997943791, + "documents": 2106586, + "downloaded": False, + "texts": "mrtydi-v1.1-arabic", + }, + "mrtydi-v1.1-bengali-mdpr-tied-pft-msmarco": { + "description": "Faiss index for Mr.TyDi v1.1 (Bengali) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.mrtydi-v1.1-bengali.20220413.aa1c0e9.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-bengali.20220413.aa1c0e9.tar.gz", + ], + "md5": "d04bb6e634fb4f7df23dbff7481a8f9b", + "size compressed (bytes)": 865733058, + "documents": 304059, + "downloaded": False, + "texts": "mrtydi-v1.1-bengali", + }, + "mrtydi-v1.1-english-mdpr-tied-pft-msmarco": { + "description": "Faiss index for Mr.TyDi v1.1 (English) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.mrtydi-v1.1-english.20220413.aa1c0e9.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-english.20220413.aa1c0e9.tar.gz", + ], + "md5": "4a93a2211199f7359cc99486a9f93d02", + "size compressed (bytes)": 93594561391, + "documents": 32907100, + "downloaded": False, + "texts": "mrtydi-v1.1-english" + }, + "mrtydi-v1.1-finnish-mdpr-tied-pft-msmarco": { + "description": "Faiss index for Mr.TyDi v1.1 (Finnish) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.mrtydi-v1.1-finnish.20220413.aa1c0e9.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-finnish.20220413.aa1c0e9.tar.gz", + ], + "md5": "6cbe2d52225fb15a494857b9df593113", + "size compressed (bytes)": 5436419399, + "documents": 1908757, + "downloaded": False, + "texts": "mrtydi-v1.1-finnish" + }, + "mrtydi-v1.1-indonesian-mdpr-tied-pft-msmarco": { + "description": "Faiss index for Mr.TyDi v1.1 (Indonesian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.mrtydi-v1.1-indonesian.20220413.aa1c0e9.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-indonesian.20220413.aa1c0e9.tar.gz", + ], + "md5": "26108a7ee1fc5ac15e0b7fcecf4d39ad", + "size compressed (bytes)": 4178791300, + "documents": 1469399, + "downloaded": False, + "texts": "mrtydi-v1.1-indonesian" + }, + "mrtydi-v1.1-japanese-mdpr-tied-pft-msmarco": { + "description": "Faiss index for Mr.TyDi v1.1 (Japanese) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.mrtydi-v1.1-japanese.20220413.aa1c0e9.tar.gz", + "readme": 
"faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-japanese.20220413.aa1c0e9.tar.gz", + ], + "md5": "2ef2b5e3f5778d99e65aafc48450508a", + "size compressed (bytes)": 19918319452, + "documents": 7000027, + "downloaded": False, + "texts": "mrtydi-v1.1-japanese" + }, + "mrtydi-v1.1-korean-mdpr-tied-pft-msmarco": { + "description": "Faiss index for Mr.TyDi v1.1 (Korean) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.mrtydi-v1.1-korean.20220413.aa1c0e9.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-korean.20220413.aa1c0e9.tar.gz", + ], + "md5": "26ed9be031603019304b66f985ce154c", + "size compressed (bytes)": 4256863335, + "documents": 1496126, + "downloaded": False, + "texts": "mrtydi-v1.1-korean" + }, + "mrtydi-v1.1-russian-mdpr-tied-pft-msmarco": { + "description": "Faiss index for Mr.TyDi v1.1 (Russian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.mrtydi-v1.1-russian.20220413.aa1c0e9.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-russian.20220413.aa1c0e9.tar.gz", + ], + "md5": "b1be7a45a702be4021f38425c0001f39", + "size compressed (bytes)": 27318555548, + "documents": 9597504, + "downloaded": False, + "texts": "mrtydi-v1.1-russian" + }, + "mrtydi-v1.1-swahili-mdpr-tied-pft-msmarco": { + "description": "Faiss index for Mr.TyDi v1.1 (Swahili) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.mrtydi-v1.1-swahili.20220413.aa1c0e9.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-swahili.20220413.aa1c0e9.tar.gz", + ], + "md5": "14edb5f677820b5a5a3858555e900591", + "size compressed (bytes)": 389600527, + "documents": 136689, + "downloaded": False, + "texts": "mrtydi-v1.1-swahili" + }, + "mrtydi-v1.1-telugu-mdpr-tied-pft-msmarco": { + "description": "Faiss index for Mr.TyDi v1.1 (Telugu) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.mrtydi-v1.1-telugu.20220413.aa1c0e9.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-telugu.20220413.aa1c0e9.tar.gz", + ], + "md5": "25b37f5d7a035a17b447f1732e241b85", + "size compressed (bytes)": 1561419958, + "documents": 548224, + "downloaded": False, + "texts": "mrtydi-v1.1-telugu" + }, + "mrtydi-v1.1-thai-mdpr-tied-pft-msmarco": { + "description": "Faiss index for Mr.TyDi v1.1 (Thai) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.mrtydi-v1.1-thai.20220413.aa1c0e9.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-thai.20220413.aa1c0e9.tar.gz", + ], + "md5": "0544ce677fa31b633a29a079c0cdfc82", + "size compressed (bytes)": 1616716166, + "documents": 568855, + "downloaded": False, + "texts": "mrtydi-v1.1-thai" + }, + "mrtydi-v1.1-arabic-mdpr-tied-pft-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Arabic) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", 
+ "filename": "faiss.mrtydi-v1.1-arabic.20220523.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-arabic.20220523.7b099d5.tar.gz", + ], + "md5": "3d764e7936bb6beb5308ccfd6717b38e", + "size compressed (bytes)": 5988743258, + "documents": 2106586, + "downloaded": False, + "texts": "mrtydi-v1.1-arabic" + }, + "mrtydi-v1.1-bengali-mdpr-tied-pft-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Bengali) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-bengali.20220523.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-bengali.20220523.7b099d5.tar.gz", + ], + "md5": "2ee8e550245f7eb5184c27fe3369d818", + "size compressed (bytes)": 864358280, + "documents": 304059, + "downloaded": False, + "texts": "mrtydi-v1.1-bengali" + }, + "mrtydi-v1.1-english-mdpr-tied-pft-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (English) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-english.20220523.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-english.20220523.7b099d5.tar.gz", + ], + "md5": "a1be61486c209bf2545d63f950274a99", + "size compressed (bytes)": 93435965796, + "documents": 32907100, + "downloaded": False, + "texts": "mrtydi-v1.1-english" + }, + "mrtydi-v1.1-finnish-mdpr-tied-pft-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Finnish) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-finnish.20220523.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-finnish.20220523.7b099d5.tar.gz", + ], + "md5": "0dbd873fa8bf8c87052940bdf4097ba2", + "size compressed (bytes)": 5427976705, + "documents": 1908757, + "downloaded": False, + "texts": "mrtydi-v1.1-finnish" + }, + "mrtydi-v1.1-indonesian-mdpr-tied-pft-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Indonesian) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-indonesian.20220523.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-indonesian.20220523.7b099d5.tar.gz", + ], + "md5": "937f7c03e2386166e34ef81b25d7959f", + "size compressed (bytes)": 4172976570, + "documents": 4179177829, + "downloaded": False, + "texts": "mrtydi-v1.1-indonesian" + }, + "mrtydi-v1.1-japanese-mdpr-tied-pft-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Japanese) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-japanese.20220523.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-japanese.20220523.7b099d5.tar.gz", + ], + "md5": "21a64d1a012a854d4bf42fa24c8712fd", + "size compressed (bytes)": 19890571158, + "documents": 7000027, + "downloaded": False, + "texts": "mrtydi-v1.1-japanese" + }, + "mrtydi-v1.1-korean-mdpr-tied-pft-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Korean) corpus encoded by mDPR 
passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-korean.20220523.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-korean.20220523.7b099d5.tar.gz", + ], + "md5": "ed3216fb5bc431ac52931b58cc4c4d0f", + "size compressed (bytes)": 4250320804, + "documents": 1496126, + "downloaded": False, + "texts": "mrtydi-v1.1-korean" + }, + "mrtydi-v1.1-russian-mdpr-tied-pft-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Russian) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-russian.20220523.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-russian.20220523.7b099d5.tar.gz", + ], + "md5": "c3c4db1397c7125f8e411cf637054148", + "size compressed (bytes)": 27278520787, + "documents": 9597504, + "downloaded": False, + "texts": "mrtydi-v1.1-russian" + }, + "mrtydi-v1.1-swahili-mdpr-tied-pft-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Swahili) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-swahili.20220523.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-swahili.20220523.7b099d5.tar.gz", + ], + "md5": "20235115c0a877e11c91cb662d5a6fdb", + "size compressed (bytes)": 389244265, + "documents": 136689, + "downloaded": False, + "texts": "mrtydi-v1.1-swahili" + }, + "mrtydi-v1.1-telugu-mdpr-tied-pft-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Telugu) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-telugu.20220523.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-telugu.20220523.7b099d5.tar.gz", + ], + "md5": "86cae6fe8f8c08489e49b6e6c28a09b0", + "size compressed (bytes)": 1558691592, + "documents": 548224, + "downloaded": False, + "texts": "mrtydi-v1.1-telugu" + }, + "mrtydi-v1.1-thai-mdpr-tied-pft-nq": { + "description": "Faiss index for Mr.TyDi v1.1 (Thai) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-thai.20220523.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-thai.20220523.7b099d5.tar.gz", + ], + "md5": "3ba9c64a9f7479bd2e3a84a816ee0f6f", + "size compressed (bytes)": 1613563144, + "documents": 568855, + "downloaded": False, + "texts": "mrtydi-v1.1-thai" + }, + + "mrtydi-v1.1-arabic-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for Mr.TyDi v1.1 (Arabic) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-arabic.20220524.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-arabic.20220524.7b099d5.tar.gz", + ], + "md5": "9ea47ae7425fd3376f015ca7c6ba5134", + "size compressed (bytes)": 5988743258, + "documents": 2106586, + "downloaded": False, + "texts": "mrtydi-v1.1-arabic" + }, + "mrtydi-v1.1-bengali-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for Mr.TyDi v1.1 
(Bengali) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned on all Mr. TyDi data.", + "filename": "faiss.mrtydi-v1.1-bengali.20220524.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-bengali.20220524.7b099d5.tar.gz", + ], + "md5": "d1e75f4960a723b068bb778a972ffb54", + "size compressed (bytes)": 864358280, + "documents": 304059, + "downloaded": False, + "texts": "mrtydi-v1.1-bengali" + }, + "mrtydi-v1.1-english-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for Mr.TyDi v1.1 (English) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned on all Mr. TyDi data.", + "filename": "faiss.mrtydi-v1.1-english.20220524.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-english.20220524.7b099d5.tar.gz", + ], + "md5": "1fce43e549ff57bbac432a579961f34b", + "size compressed (bytes)": 93435965796, + "documents": 32907100, + "downloaded": False, + "texts": "mrtydi-v1.1-english" + }, + "mrtydi-v1.1-finnish-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for Mr.TyDi v1.1 (Finnish) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned on all Mr. TyDi data.", + "filename": "faiss.mrtydi-v1.1-finnish.20220524.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-finnish.20220524.7b099d5.tar.gz", + ], + "md5": "6faa7b2fe8ad4b9ca284bd7e8f69b727", + "size compressed (bytes)": 5427976705, + "documents": 1908757, + "downloaded": False, + "texts": "mrtydi-v1.1-finnish" + }, + "mrtydi-v1.1-indonesian-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for Mr.TyDi v1.1 (Indonesian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned on all Mr. TyDi data.", + "filename": "faiss.mrtydi-v1.1-indonesian.20220524.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-indonesian.20220524.7b099d5.tar.gz", + ], + "md5": "659b1e0a1bea46f62a842b55385085b7", + "size compressed (bytes)": 4172976570, + "documents": 1469399, + "downloaded": False, + "texts": "mrtydi-v1.1-indonesian" + }, + "mrtydi-v1.1-japanese-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for Mr.TyDi v1.1 (Japanese) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned on all Mr. TyDi data.", + "filename": "faiss.mrtydi-v1.1-japanese.20220524.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-japanese.20220524.7b099d5.tar.gz", + ], + "md5": "126c82da9e0e0e1fd290cf62d7fe4dfa", + "size compressed (bytes)": 19890571158, + "documents": 7000027, + "downloaded": False, + "texts": "mrtydi-v1.1-japanese" + }, + "mrtydi-v1.1-korean-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for Mr.TyDi v1.1 (Korean) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned on all Mr. TyDi data.", + "filename": "faiss.mrtydi-v1.1-korean.20220524.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-korean.20220524.7b099d5.tar.gz", + ], + "md5": "cf07b71aaefba58bbe150265f6696503", + "size compressed (bytes)": 4250320804, + 
"documents": 1496126, + "downloaded": False, + "texts": "mrtydi-v1.1-korean" + }, + "mrtydi-v1.1-russian-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for Mr.TyDi v1.1 (Russian) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-russian.20220524.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-russian.20220524.7b099d5.tar.gz", + ], + "md5": "c0a53fa6428cb9b1399a90e3a9a805d5", + "size compressed (bytes)": 27278520787, + "documents": 9597504, + "downloaded": False, + "texts": "mrtydi-v1.1-russian" + }, + "mrtydi-v1.1-swahili-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for Mr.TyDi v1.1 (Swahili) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-swahili.20220524.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-swahili.20220524.7b099d5.tar.gz", + ], + "md5": "93dc3f3453815c92f3bccf4f41c5f2d4", + "size compressed (bytes)": 389244265, + "documents": 136689, + "downloaded": False, + "texts": "mrtydi-v1.1-swahili" + }, + "mrtydi-v1.1-telugu-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for Mr.TyDi v1.1 (Telugu) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-telugu.20220524.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-telugu.20220524.7b099d5.tar.gz", + ], + "md5": "7aba1b7ee36e572bd982b3f62f41c380", + "size compressed (bytes)": 1558691592, + "documents": 548224, + "downloaded": False, + "texts": "mrtydi-v1.1-telugu" + }, + "mrtydi-v1.1-thai-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for Mr.TyDi v1.1 (Thai) corpus encoded by mDPR passage encoder pre-fine-tuned on NQ.", + "filename": "faiss.mrtydi-v1.1-thai.20220524.7b099d5.tar.gz", + "readme": "faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.mrtydi-v1.1-thai.20220524.7b099d5.tar.gz", + ], + "md5": "57151073a4c0d90b64242e4536a3af75", + "size compressed (bytes)": 1613563144, + "documents": 568855, + "downloaded": False, + "texts": "mrtydi-v1.1-thai" + } +} + +FAISS_INDEX_INFO_MIRACL = { + "miracl-v1.0-ar-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Arabic) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-ar.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ar.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "177d47e9a802c87abca52380ad1ce83b", + "size compressed (bytes)": 5997943791, + "documents": 2061414, + "downloaded": False, + "texts": "miracl-v1.0-ar", + }, + "miracl-v1.0-bn-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Bengali) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-bn.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-bn.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "156e8ba8cd369b1c4a606e28ea025b2e", + "size compressed (bytes)": 846825710, + "documents": 297265, + "downloaded": False, + "texts": "miracl-v1.0-bn", + }, + "miracl-v1.0-en-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (English) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-en.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-en.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "ce00518f54b130a157112c2a1b2d0980", + "size compressed (bytes)": 93554329467, + "documents": 32893221, + "downloaded": False, + "texts": "miracl-v1.0-en" + }, + "miracl-v1.0-es-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Spanish) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-es.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-es.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "d7a9627bb60e901386f455ba6c9063ac", + "size compressed (bytes)": 29553300598, + "documents": 10373953, + "downloaded": False, + "texts": "miracl-v1.0-es" + }, + "miracl-v1.0-fa-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Persian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-fa.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fa.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "e8b59e3eb2e08f61f81569c6d4c85350", + "size compressed (bytes)": 6286832343, + "documents": 2207172, + "downloaded": False, + "texts": "miracl-v1.0-fa" + }, + "miracl-v1.0-fi-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Finnish) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-fi.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fi.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "a82d6e6cf964d2e4cfac99cf14cbcc35", + "size compressed (bytes)": 5366190875, + "documents": 1883509, + "downloaded": False, + "texts": "miracl-v1.0-fi" + }, + "miracl-v1.0-fr-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (French) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-fr.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fr.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "a952d944aa63dcee604c8357f1be18db", + "size compressed (bytes)": 41648462587, + "documents": 14636953, + "downloaded": False, + "texts": "miracl-v1.0-fr" + }, + "miracl-v1.0-hi-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Hindi) corpus 
encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-hi.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-hi.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "9d1dc4b948edf3df263977d82c9fcc3f", + "size compressed (bytes)": 1440625097, + "documents": 506264, + "downloaded": False, + "texts": "miracl-v1.0-hi" + }, + "miracl-v1.0-id-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Indonesian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-id.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-id.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "19815233f5cc3a198b88cdb990459637", + "size compressed (bytes)": 4115281873, + "documents": 1446315, + "downloaded": False, + "texts": "miracl-v1.0-id" + }, + "miracl-v1.0-ja-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Japanese) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-ja.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ja.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "6e9b6e304b2b1a192a3d81e55880f971", + "size compressed (bytes)": 19791965448, + "documents": 6953614, + "downloaded": False, + "texts": "miracl-v1.0-ja" + }, + "miracl-v1.0-ko-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Korean) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-ko.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ko.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "ea1fa34341fc5d5ea88e5b633025d2d5", + "size compressed (bytes)": 4231563116, + "documents": 1486752, + "downloaded": False, + "texts": "miracl-v1.0-korean" + }, + "miracl-v1.0-ru-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Russian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-ru.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ru.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "4325e716ee6af5ea2b73d4b25f1ad76c", + "size compressed (bytes)": 27173379698, + "documents": 9543918, + "downloaded": False, + "texts": "miracl-v1.0-ru" + }, + "miracl-v1.0-sw-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Swahili) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-sw.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-sw.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": 
"2b879dac6823077ae497ba8ebfce523b", + "size compressed (bytes)": 376181791, + "documents": 131924, + "downloaded": False, + "texts": "miracl-v1.0-sw" + }, + "miracl-v1.0-te-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Telugu) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-te.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-te.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "a3dfb8ba31f316c93d1fd147f88fbbfd", + "size compressed (bytes)": 1476021181, + "documents": 518079, + "downloaded": False, + "texts": "miracl-v1.0-te" + }, + "miracl-v1.0-th-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Thai) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-th.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-th.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "cb0c9b84a80ff338372b32857c58368d", + "size compressed (bytes)": 1541590044, + "documents": 542166, + "downloaded": False, + "texts": "miracl-v1.0-th" + }, + "miracl-v1.0-zh-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Chinese) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-zh.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-zh.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "2743dfaa794b7abbef1d3c912c5cc4b5", + "size compressed (bytes)": 14046912361, + "documents": 4934368, + "downloaded": False, + "texts": "miracl-v1.0-zh", + }, + "miracl-v1.0-de-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (German) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-de.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-de.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "1abcf3aac78e30ebe7a75163412f1c84", + "size compressed (bytes)": 45154018897, + "documents": 15866222, + "downloaded": False, + "texts": "miracl-v1.0-de", + }, + "miracl-v1.0-yo-mdpr-tied-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Yoruba) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-yo.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-yo.mdpr-tied-pft-msmarco.20221004.2b2856.tar.gz" + ], + "md5": "2ad15ea0576ae3284082ae661e001faa", + "size compressed (bytes)": 139412730, + "documents": 49043, + "downloaded": False, + "texts": "miracl-v1.0-yo", + }, + + "miracl-v1.0-ar-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Arabic) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": 
"faiss.miracl-v1.0-ar.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ar.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "428fbde84d2c18e48f0821298947a9d1", + "size compressed (bytes)": 5866199790, + "documents": 2061414, + "downloaded": False, + "texts": "miracl-v1.0-ar", + }, + "miracl-v1.0-bn-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Bengali) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-bn.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-bn.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "4394a09e043be9be5b820814a82fc8ac", + "size compressed (bytes)": 846476050, + "documents": 297265, + "downloaded": False, + "texts": "miracl-v1.0-bn", + }, + "miracl-v1.0-en-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (English) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-en.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-en.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "5bd57f5e4daf93294fd2cbd969c05bb3", + "size compressed (bytes)": 93527497283, + "documents": 32893221, + "downloaded": False, + "texts": "miracl-v1.0-en" + }, + "miracl-v1.0-es-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Spanish) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-es.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-es.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "b6db16c1ab0ae95fec0465299c660d2a", + "size compressed (bytes)": 29544413180, + "documents": 10373953, + "downloaded": False, + "texts": "miracl-v1.0-es" + }, + "miracl-v1.0-fa-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Persian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-fa.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fa.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "2a2825706211eb96bd3dbb616463c661", + "size compressed (bytes)": 6283957262, + "documents": 2207172, + "downloaded": False, + "texts": "miracl-v1.0-fa" + }, + "miracl-v1.0-fi-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Finnish) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-fi.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fi.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "65719de730cda3fa5f6a8a75611db6eb", + "size compressed (bytes)": 5363289277, + "documents": 1883509, + "downloaded": False, + "texts": "miracl-v1.0-fi" + }, + "miracl-v1.0-fr-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (French) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-fr.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fr.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "24eb2f63f78aa1e39b1ea61e20661424", + "size compressed (bytes)": 41635104326, + "documents": 14636953, + "downloaded": False, + "texts": "miracl-v1.0-fr" + }, + "miracl-v1.0-hi-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Hindi) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-hi.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-hi.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "d08aad08a8592aa40355fb7d50afd170", + "size compressed (bytes)": 1439798033, + "documents": 506264, + "downloaded": False, + "texts": "miracl-v1.0-hi" + }, + "miracl-v1.0-id-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Indonesian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-id.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-id.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "b02c20d4fc27e390ec5b1e9ca732dc5a", + "size compressed (bytes)": 4113737773, + "documents": 1446315, + "downloaded": False, + "texts": "miracl-v1.0-id" + }, + "miracl-v1.0-ja-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Japanese) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-ja.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ja.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "a5f219c7f46a36c5c7a2555fbdaa0479", + "size compressed (bytes)": 19790154560, + "documents": 6953614, + "downloaded": False, + "texts": "miracl-v1.0-ja" + }, + "miracl-v1.0-ko-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Korean) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-ko.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ko.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "67b2a803eab3491a057d4ac6b81974f1", + "size compressed (bytes)": 4230830690, + "documents": 1486752, + "downloaded": False, 
+ "texts": "miracl-v1.0-korean" + }, + "miracl-v1.0-ru-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Russian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-ru.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ru.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "edad6d5cb508de61ba84173d0ad2aa31", + "size compressed (bytes)": 27169921407, + "documents": 9543918, + "downloaded": False, + "texts": "miracl-v1.0-ru" + }, + "miracl-v1.0-sw-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Swahili) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-sw.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-sw.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "0b039d766b55f678102a59a6e050d0bc", + "size compressed (bytes)": 375865677, + "documents": 131924, + "downloaded": False, + "texts": "miracl-v1.0-sw" + }, + "miracl-v1.0-te-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Telugu) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-te.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-te.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "ea21915c69f70f41acadee4b6b83d129", + "size compressed (bytes)": 1474866678, + "documents": 518079, + "downloaded": False, + "texts": "miracl-v1.0-te" + }, + "miracl-v1.0-th-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Thai) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-th.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-th.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "a5875b473109310789710e2f3df91b0f", + "size compressed (bytes)": 1540180247, + "documents": 542166, + "downloaded": False, + "texts": "miracl-v1.0-th" + }, + "miracl-v1.0-zh-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Chinese) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-zh.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-zh.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "a2d233e792d46c20c912d10afff033f5", + "size compressed (bytes)": 14043150097, + "documents": 4934368, + "downloaded": False, + "texts": "miracl-v1.0-zh", + }, + "miracl-v1.0-de-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Chinese) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": 
"faiss.miracl-v1.0-de.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-de.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "d53da12ae6119ed54ef968e968f8520a", + "size compressed (bytes)": 45139752128, + "documents": 15866222, + "downloaded": False, + "texts": "miracl-v1.0-de", + }, + "miracl-v1.0-yo-mdpr-tied-pft-msmarco-ft-all": { + "description": "Faiss index for MIRACL v1.0 (Chinese) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-yo.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz", + "readme": "faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-yo.mdpr-tied-pft-msmarco-ft-all.20221004.2b2856.tar.gz" + ], + "md5": "0a1b0f48108508724a3892dfc04eb756", + "size compressed (bytes)": 139286213, + "documents": 49043, + "downloaded": False, + "texts": "miracl-v1.0-yo", + }, + + "miracl-v1.0-ar-mdpr-tied-pft-msmarco-ft-miracl-ar": { + "description": "Faiss index for MIRACL v1.0 (Arabic) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-ar.mdpr-tied-pft-msmarco-ft-miracl-ar.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ar.mdpr-tied-pft-msmarco-ft-miracl-ar.20230329.e40d4a.tar.gz", + ], + "md5": "29cdb7fa7cc52cabc32791d57be3bd42", + "size compressed (bytes)": 5871030506, + "documents": 2061414, + "downloaded": False, + "texts": "miracl-v1.0-ar" + }, + "miracl-v1.0-bn-mdpr-tied-pft-msmarco-ft-miracl-bn": { + "description": "Faiss index for MIRACL v1.0 (Bengali) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-bn.mdpr-tied-pft-msmarco-ft-miracl-bn.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-bn.mdpr-tied-pft-msmarco-ft-miracl-bn.20230329.e40d4a.tar.gz", + ], + "md5": "8972166564a9c13e102ae83ea062c166", + "size compressed (bytes)": 846236944, + "documents": 297265, + "downloaded": False, + "texts": "miracl-v1.0-bn" + }, + "miracl-v1.0-en-mdpr-tied-pft-msmarco-ft-miracl-en": { + "description": "Faiss index for MIRACL v1.0 (English) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-en.mdpr-tied-pft-msmarco-ft-miracl-en.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-en.mdpr-tied-pft-msmarco-ft-miracl-en.20230329.e40d4a.tar.gz", + ], + "md5": "cd43e6c93879a107b94396a42aa7c987", + "size compressed (bytes)": 93502848095, + "documents": 32893221, + "downloaded": False, + "texts": "miracl-v1.0-en" + }, + "miracl-v1.0-es-mdpr-tied-pft-msmarco-ft-miracl-es": { + "description": "Faiss index for MIRACL v1.0 (Spanish) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": 
"faiss.miracl-v1.0-es.mdpr-tied-pft-msmarco-ft-miracl-es.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-es.mdpr-tied-pft-msmarco-ft-miracl-es.20230329.e40d4a.tar.gz", + ], + "md5": "4f45c3171690dd691afcfc9e45b89494", + "size compressed (bytes)": 29552466540, + "documents": 10373953, + "downloaded": False, + "texts": "miracl-v1.0-es" + }, + "miracl-v1.0-fa-mdpr-tied-pft-msmarco-ft-miracl-fa": { + "description": "Faiss index for MIRACL v1.0 (Persian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-fa.mdpr-tied-pft-msmarco-ft-miracl-fa.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fa.mdpr-tied-pft-msmarco-ft-miracl-fa.20230329.e40d4a.tar.gz", + ], + "md5": "ae262fea849f6903c93e1f3269e07804", + "size compressed (bytes)": 6287728719, + "documents": 2207172, + "downloaded": False, + "texts": "miracl-v1.0-fa" + }, + "miracl-v1.0-fi-mdpr-tied-pft-msmarco-ft-miracl-fi": { + "description": "Faiss index for MIRACL v1.0 (Finnish) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-fi.mdpr-tied-pft-msmarco-ft-miracl-fi.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fi.mdpr-tied-pft-msmarco-ft-miracl-fi.20230329.e40d4a.tar.gz", + ], + "md5": "12c5c5c4dd8df37ad8ae90039851fbec", + "size compressed (bytes)": 5367069541, + "documents": 1883509, + "downloaded": False, + "texts": "miracl-v1.0-fi" + }, + "miracl-v1.0-fr-mdpr-tied-pft-msmarco-ft-miracl-fr": { + "description": "Faiss index for MIRACL v1.0 (French) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-fr.mdpr-tied-pft-msmarco-ft-miracl-fr.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fr.mdpr-tied-pft-msmarco-ft-miracl-fr.20230329.e40d4a.tar.gz", + ], + "md5": "8cf28f8df0805a848cb5c54d5f5d8bfb", + "size compressed (bytes)": 41654288474, + "documents": 14636953, + "downloaded": False, + "texts": "miracl-v1.0-fr" + }, + "miracl-v1.0-hi-mdpr-tied-pft-msmarco-ft-miracl-hi": { + "description": "Faiss index for MIRACL v1.0 (Hindi) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-hi.mdpr-tied-pft-msmarco-ft-miracl-hi.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-hi.mdpr-tied-pft-msmarco-ft-miracl-hi.20230329.e40d4a.tar.gz", + ], + "md5": "f579dfa45a5f14c48f97ba9980f7dec8", + "size compressed (bytes)": 1440859085, + "documents": 506264, + "downloaded": False, + "texts": "miracl-v1.0-hi" + }, + "miracl-v1.0-id-mdpr-tied-pft-msmarco-ft-miracl-id": { + "description": "Faiss index for MIRACL v1.0 (Indonesian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS 
MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-id.mdpr-tied-pft-msmarco-ft-miracl-id.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-id.mdpr-tied-pft-msmarco-ft-miracl-id.20230329.e40d4a.tar.gz", + ], + "md5": "d5b540fb82fe21c1fd2b56e248184af6", + "size compressed (bytes)": 4111428848, + "documents": 1446315, + "downloaded": False, + "texts": "miracl-v1.0-id" + }, + "miracl-v1.0-ja-mdpr-tied-pft-msmarco-ft-miracl-ja": { + "description": "Faiss index for MIRACL v1.0 (Japanese) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-ja.mdpr-tied-pft-msmarco-ft-miracl-ja.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ja.mdpr-tied-pft-msmarco-ft-miracl-ja.20230329.e40d4a.tar.gz", + ], + "md5": "e7ad21b12a7d5e937c55d49184d68814", + "size compressed (bytes)": 19790420501, + "documents": 6953614, + "downloaded": False, + "texts": "miracl-v1.0-ja" + }, + "miracl-v1.0-ko-mdpr-tied-pft-msmarco-ft-miracl-ko": { + "description": "Faiss index for MIRACL v1.0 (Korean) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-ko.mdpr-tied-pft-msmarco-ft-miracl-ko.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ko.mdpr-tied-pft-msmarco-ft-miracl-ko.20230329.e40d4a.tar.gz", + ], + "md5": "c31290dfae5429549500759279af3a8d", + "size compressed (bytes)": 4230154713, + "documents": 1486752, + "downloaded": False, + "texts": "miracl-v1.0-ko" + }, + "miracl-v1.0-ru-mdpr-tied-pft-msmarco-ft-miracl-ru": { + "description": "Faiss index for MIRACL v1.0 (Russian) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-ru.mdpr-tied-pft-msmarco-ft-miracl-ru.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ru.mdpr-tied-pft-msmarco-ft-miracl-ru.20230329.e40d4a.tar.gz", + ], + "md5": "b9460efd096292a1012ab1d27082498e", + "size compressed (bytes)": 27177739148, + "documents": 9543918, + "downloaded": False, + "texts": "miracl-v1.0-ru" + }, + "miracl-v1.0-sw-mdpr-tied-pft-msmarco-ft-miracl-sw": { + "description": "Faiss index for MIRACL v1.0 (Swahili) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-sw.mdpr-tied-pft-msmarco-ft-miracl-sw.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-sw.mdpr-tied-pft-msmarco-ft-miracl-sw.20230329.e40d4a.tar.gz", + ], + "md5": "526a930a27353462e11cc7e1b794dcc7", + "size compressed (bytes)": 375865597, + "documents": 131924, + "downloaded": False, + "texts": "miracl-v1.0-sw" + }, + "miracl-v1.0-te-mdpr-tied-pft-msmarco-ft-miracl-te": { + "description": "Faiss index for MIRACL v1.0 (Telugu) 
corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-te.mdpr-tied-pft-msmarco-ft-miracl-te.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-te.mdpr-tied-pft-msmarco-ft-miracl-te.20230329.e40d4a.tar.gz", + ], + "md5": "f64b28542afdd15b2fe3831972bcd91e", + "size compressed (bytes)": 1475895517, + "documents": 518079, + "downloaded": False, + "texts": "miracl-v1.0-te" + }, + "miracl-v1.0-th-mdpr-tied-pft-msmarco-ft-miracl-th": { + "description": "Faiss index for MIRACL v1.0 (Thai) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-th.mdpr-tied-pft-msmarco-ft-miracl-th.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-th.mdpr-tied-pft-msmarco-ft-miracl-th.20230329.e40d4a.tar.gz", + ], + "md5": "b6ba6d5363bf07a5dc8e1cd35fe11e93", + "size compressed (bytes)": 1540581013, + "documents": 542166, + "downloaded": False, + "texts": "miracl-v1.0-th" + }, + "miracl-v1.0-zh-mdpr-tied-pft-msmarco-ft-miracl-zh": { + "description": "Faiss index for MIRACL v1.0 (Chinese) corpus encoded by mDPR passage encoder pre-fine-tuned on MS MARCO, then fine-tuned in-language with MIRACL.", + "filename": "faiss.miracl-v1.0-zh.mdpr-tied-pft-msmarco-ft-miracl-zh.20230329.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-zh.mdpr-tied-pft-msmarco-ft-miracl-zh.20230329.e40d4a.tar.gz", + ], + "md5": "feba34e41cb8234988f7fb99bd8998f3", + "size compressed (bytes)": 14049243202, + "documents": 4934368, + "downloaded": False, + "texts": "miracl-v1.0-zh" + }, + + "miracl-v1.0-ar-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Arabic) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-ar.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ar.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "80c18ac84922ae27bfbee881485816c6", + "size compressed (bytes)": 5861079368, + "documents": 2061414, + "downloaded": False, + "texts": "miracl-v1.0-ar", + }, + "miracl-v1.0-bn-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Bengali) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-bn.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-bn.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "08191b7749151a7bc70e54b92988dd25", + "size compressed (bytes)": 845828394, + "documents": 297265, + "downloaded": False, + "texts": "miracl-v1.0-bn", + }, + "miracl-v1.0-en-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (English) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + 
"filename": "faiss.miracl-v1.0-en.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-en.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "a460d0eb95cf8a278686531e13141d00", + "size compressed (bytes)": 93426889457, + "documents": 32893221, + "downloaded": False, + "texts": "miracl-v1.0-en" + }, + "miracl-v1.0-es-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Spanish) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-es.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-es.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "936e9188c4dcf57f8f116b9e25790372", + "size compressed (bytes)": 29499200527, + "documents": 10373953, + "downloaded": False, + "texts": "miracl-v1.0-es" + }, + "miracl-v1.0-fa-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Persian) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-fa.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fa.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "58f83135ecafae6993e49f5f08e471ff", + "size compressed (bytes)": 6278766617, + "documents": 2207172, + "downloaded": False, + "texts": "miracl-v1.0-fa" + }, + "miracl-v1.0-fi-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Finnish) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-fi.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fi.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "b10bc504213199fe0c0972678ab4fdd6", + "size compressed (bytes)": 5358004166, + "documents": 1883509, + "downloaded": False, + "texts": "miracl-v1.0-fi" + }, + "miracl-v1.0-fr-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (French) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-fr.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-fr.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "b0d5543824b456d9008d05d7dcef5272", + "size compressed (bytes)": 41578767020, + "documents": 14636953, + "downloaded": False, + "texts": "miracl-v1.0-fr" + }, + "miracl-v1.0-hi-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Hindi) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-hi.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-hi.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "ba66e98169b22244c7a7a89ae9bfe549", + "size compressed (bytes)": 1439122724, + "documents": 506264, + "downloaded": False, + "texts": "miracl-v1.0-hi" + }, + "miracl-v1.0-id-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Indonesian) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-id.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-id.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "700466ab62bfd4b0ceddff7aa9b7a5f8", + "size compressed (bytes)": 4113610061, + "documents": 1446315, + "downloaded": False, + "texts": "miracl-v1.0-id" + }, + "miracl-v1.0-ja-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Japanese) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-ja.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ja.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "f0358ae58b32456c3cef5f71e83a0143", + "size compressed (bytes)": 19772957772, + "documents": 6953614, + "downloaded": False, + "texts": "miracl-v1.0-ja" + }, + "miracl-v1.0-ko-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Korean) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-ko.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ko.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "fa00afb61fa4332c408069cb6eb2e8f2", + "size compressed (bytes)": 4229330667, + "documents": 1486752, + "downloaded": False, + "texts": "miracl-v1.0-korean" + }, + "miracl-v1.0-ru-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Russian) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-ru.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-ru.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "118835c214f7b24997ab9f1744b3f5ee", + "size compressed (bytes)": 27155045095, + "documents": 9543918, + "downloaded": False, + "texts": "miracl-v1.0-ru" + }, + "miracl-v1.0-sw-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Swahili) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-sw.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-sw.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "ae45812eadb685c672f7b19c084ae3bc", + "size compressed (bytes)": 375416284, + "documents": 131924, + 
"downloaded": False, + "texts": "miracl-v1.0-sw" + }, + "miracl-v1.0-te-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Telugu) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-te.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-te.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "8cbea3c141002dd477a15b387350ea37", + "size compressed (bytes)": 1474250608, + "documents": 518079, + "downloaded": False, + "texts": "miracl-v1.0-te" + }, + "miracl-v1.0-th-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Thai) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-th.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-th.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "58cd7d862f202ece45dbd4cb6b6d12f4", + "size compressed (bytes)": 1540980581, + "documents": 542166, + "downloaded": False, + "texts": "miracl-v1.0-th" + }, + "miracl-v1.0-zh-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Chinese) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-zh.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-zh.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "d8800abe1ac22b4161704f2b6d4fe575", + "size compressed (bytes)": 14034991692, + "documents": 4934368, + "downloaded": False, + "texts": "miracl-v1.0-zh", + }, + "miracl-v1.0-de-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (German) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-de.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-de.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "218cb42441af355285fbf219e9d2d7c7", + "size compressed (bytes)": 45085913144, + "documents": 15866222, + "downloaded": False, + "texts": "miracl-v1.0-de", + }, + "miracl-v1.0-yo-mcontriever-pft-msmarco": { + "description": "Faiss index for MIRACL v1.0 (Yoruba) corpus encoded by mContriever passage encoder pre-fine-tuned on MS MARCO.", + "filename": "faiss.miracl-v1.0-yo.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz", + "readme": "faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.miracl-v1.0-yo.mcontriever-tied-pft-msmarco.20230313.e40d4a.tar.gz" + ], + "md5": "f8aee10055a31914c4c214819a7c1890", + "size compressed (bytes)": 139276690, + "documents": 49043, + "downloaded": False, + "texts": "miracl-v1.0-yo", + } + +} + +FAISS_INDEX_INFO_WIKIPEDIA = { + "wikipedia-dpr-100w.dpr-multi": { + "description": "Faiss FlatIP index of Wikipedia encoded by the DPR doc encoder trained on multiple QA datasets", + 
"filename": "faiss.wikipedia-dpr-100w.dpr_multi.20200127.f403c3.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.wikipedia-dpr-100w.dpr_multi.20200127.f403c3.tar.gz" + ], + "md5": "fe307ef2e60ab6e6f3ad66e24a4144ae", + "size compressed (bytes)": 59836766732, + "documents": 21015320, + "downloaded": False, + "texts": "wikipedia-dpr-100w" + }, + "wikipedia-dpr-100w.dpr-single-nq": { + "description": "Faiss FlatIP index of Wikipedia encoded by the DPR doc encoder trained on NQ", + "filename": "faiss.wikipedia-dpr-100w.dpr_single-nq.20200115.cd5034.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.wikipedia-dpr-100w.dpr_single-nq.20200115.cd5034.tar.gz" + ], + "md5": "01fb6bcaa047df254663d0a3d854b7cc", + "size compressed (bytes)": 59836863979, + "documents": 21015320, + "downloaded": False, + "texts": "wikipedia-dpr-100w" + }, + "wikipedia-dpr-100w.bpr-single-nq": { + "description": "Faiss binary index of Wikipedia encoded by the BPR doc encoder trained on NQ", + "filename": "faiss.wikipedia-dpr-100w.bpr_single-nq.20210827.8a8f75.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.wikipedia-dpr-100w.bpr_single-nq.20210827.8a8f75.tar.gz" + ], + "md5": "b022580ab2fc66f6eaa54af241dba690", + "size compressed (bytes)": 1886380629, + "documents": 21015320, + "downloaded": False, + "texts": "wikipedia-dpr-100w" + }, + "wikipedia-dpr-100w.ance-multi": { + "description": "Faiss FlatIP index of Wikipedia encoded by the ANCE-multi encoder", + "filename": "faiss.wikipedia-dpr-100w.ance_multi.20210224.060cef.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.wikipedia-dpr-100w.ance_multi.20210224.060cef.tar.gz" + ], + "md5": "eb00e096460c8e6296a39732f1676dd7", + "size compressed (bytes)": 59890491335, + "documents": 21015320, + "downloaded": False, + "texts": "wikipedia-dpr-100w" + }, + "wikipedia-dpr-100w.dkrr-nq": { + "description": "Faiss FlatIP index of Wikipedia DPR encoded by the retriever model from 'Distilling Knowledge from Reader to Retriever for Question Answering' trained on NQ", + "filename": "faiss.wikipedia-dpr-100w.dkrr-dpr-nq-retriever.20220217.25ed1f.cc91b2.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.wikipedia-dpr-100w.dkrr-dpr-nq-retriever.20220217.25ed1f.cc91b2.tar.gz", + ], + "md5": "36a658e08dafb3e3313b05f88e001557", + "size compressed (bytes)": 37812137732, + "documents": 21015324, + "downloaded": False, + "texts": "wwikipedia-dpr-100w" + }, + "wikipedia-dpr-100w.dkrr-tqa": { + "description": "Faiss FlatIP index of Wikipedia DPR encoded by the retriever model from 'Distilling Knowledge from Reader to Retriever for Question Answering' trained on TriviaQA", + "filename": "faiss.wikipedia-dpr-100w.dkrr-dpr-tqa-retriever.20220217.25ed1f.cc91b2.tar.gz", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.wikipedia-dpr-100w.dkrr-dpr-tqa-retriever.20220217.25ed1f.cc91b2.tar.gz", + ], + "md5": "072a514ca3ff7717339038d024019e3d", + "size compressed (bytes)": 37802648577, + "documents": 21015324, + "downloaded": False, + "texts": "wikipedia-dpr-100w" + }, + "wiki-all-6-3.dpr2-multi-retriever": { + "description": "Faiss FlatIP index of wiki-all-6-3-tamber encoded by a 2nd iteration DPR model trained on multiple QA datasets", + "filename": "faiss.wiki-all-6-3.dpr2-multi-retriever.20230103.186fa7.tar.gz", + "readme": "faiss-flat.wiki-all-6-3.dpr2-multi-retriever.20230103.186fa7.README.md", + "urls": [ + 
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss.wiki-all-6-3.dpr2-multi-retriever.20230103.186fa7.tar.gz", + ], + "md5": "823b6297d6fd8011598e7618742ac7f8", + "size compressed (bytes)": 218257913366, + "documents": 76680040, + "downloaded": False, + "texts": "wiki-all-6-3-tamber" + } +} + +FAISS_INDEX_INFO_OTHER = { + "cast2019-tct_colbert-v2.hnsw": { + "description": "Faiss HNSW index of the CAsT2019 passage corpus encoded by the tct_colbert-v2 passage encoder", + "filename": "faiss-hnsw.cast2019.tct_colbert-v2.tar.gz", + "readme": "faiss-hnsw.cast2019.tct_colbert-v2-readme.txt", + "urls": [ + "https://rgw.cs.uwaterloo.ca/pyserini/indexes/hnsw-faiss.cast2019.tct_colbert-v2.tar.gz" + ], + "md5": "2ce7ce8064ed235a9b6aad08571340d4", + "size compressed (bytes)": 112121368296, + "documents": 38429835, + "downloaded": False, + "texts": "cast2019" + } +} + +FAISS_INDEX_INFO = {**FAISS_INDEX_INFO_MSMARCO, + **FAISS_INDEX_INFO_BEIR, + **FAISS_INDEX_INFO_MRTYDI, + **FAISS_INDEX_INFO_MIRACL, + **FAISS_INDEX_INFO_WIKIPEDIA, + **FAISS_INDEX_INFO_OTHER} diff --git a/pyserini/pyclass.py b/pyserini/pyclass.py new file mode 100644 index 0000000000000000000000000000000000000000..599f3cfc0ac9da42464caa4b9cb9b1cc148525e6 --- /dev/null +++ b/pyserini/pyclass.py @@ -0,0 +1,36 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Module for hiding Python-Java calls via Pyjnius +""" + +from .setup import configure_classpath, os + +# If the environment variable isn't defined, look in the current directory. +configure_classpath(os.environ['ANSERINI_CLASSPATH'] if 'ANSERINI_CLASSPATH' in os.environ else + os.path.join(os.path.split(__file__)[0], 'resources/jars/')) + +from jnius import autoclass, cast + +# Base Java classes +JString = autoclass('java.lang.String') +JFloat = autoclass('java.lang.Float') +JPath = autoclass('java.nio.file.Path') +JPaths = autoclass('java.nio.file.Paths') +JList = autoclass('java.util.List') +JArrayList = autoclass('java.util.ArrayList') +JHashMap = autoclass('java.util.HashMap') diff --git a/pyserini/query_iterator.py b/pyserini/query_iterator.py new file mode 100644 index 0000000000000000000000000000000000000000..f48a1d7d1a851a9df1f31485abf3f379665061fa --- /dev/null +++ b/pyserini/query_iterator.py @@ -0,0 +1,161 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import json +from abc import ABC, abstractmethod +from enum import Enum, unique +from pathlib import Path + +from pyserini.search import get_topics, get_topics_with_reader +from pyserini.util import download_url, get_cache_home +from pyserini.external_query_info import KILT_QUERY_INFO +from urllib.error import HTTPError, URLError + + +@unique +class TopicsFormat(Enum): + DEFAULT = 'default' + KILT = 'kilt' + + +class QueryIterator(ABC): + + PREDEFINED_ORDER = {'msmarco-doc-dev', + 'msmarco-doc-test', + 'msmarco-passage-dev-subset', + 'msmarco-passage-test-subset'} + + def __init__(self, topics: dict, order: list = None): + self.order = order if order else sorted(topics.keys()) + self.topics = topics + + @abstractmethod + def get_query(self, id_): + raise NotImplementedError() + + @classmethod + @abstractmethod + def from_topics(cls, topics_path: str): + raise NotImplementedError() + + def __iter__(self): + for id_ in self.order: + yield id_, self.get_query(id_) + + def __len__(self): + return len(self.topics.keys()) + + @staticmethod + def get_predefined_order(topics_path: str): + order = None + normalized_path = Path(topics_path).stem # get filename w/o extension + normalized_path = normalized_path.replace('_', '-') + + if normalized_path in QueryIterator.PREDEFINED_ORDER: + print(f'Using pre-defined topic order for {normalized_path}') + # Lazy import; look up the order by the normalized name, which is what PREDEFINED_ORDER and QUERY_IDS are keyed on: + from pyserini.query_iterator_order_info import QUERY_IDS + order = QUERY_IDS[normalized_path] + return order + + +class DefaultQueryIterator(QueryIterator): + + def get_query(self, id_): + return self.topics[id_].get('title') + + @classmethod + def from_topics(cls, topics_path: str): + if os.path.exists(topics_path): + if topics_path.endswith('.json'): + with open(topics_path, 'r') as f: + topics = json.load(f) + elif 'beir' in topics_path: + topics = get_topics_with_reader('io.anserini.search.topicreader.TsvStringTopicReader', topics_path) + elif topics_path.endswith('.tsv') or topics_path.endswith('.tsv.gz'): + try: + topics = get_topics_with_reader('io.anserini.search.topicreader.TsvIntTopicReader', topics_path) + except ValueError as e: + topics = get_topics_with_reader('io.anserini.search.topicreader.TsvStringTopicReader', topics_path) + elif topics_path.endswith('.trec'): + topics = get_topics_with_reader('io.anserini.search.topicreader.TrecTopicReader', topics_path) + elif 'cacm' in topics_path: + topics = get_topics_with_reader('io.anserini.search.topicreader.CacmTopicReader', topics_path) + elif topics_path.endswith('.jsonl'): + topics = get_topics_with_reader('io.anserini.search.topicreader.JsonStringTopicReader', topics_path) + else: + raise NotImplementedError(f"Not sure how to parse {topics_path}. 
Please specify the file extension.") + else: + topics = get_topics(topics_path) + if not topics: + raise FileNotFoundError(f'Topic {topics_path} Not Found') + order = QueryIterator.get_predefined_order(topics_path) + return cls(topics, order) + + +class KiltQueryIterator(QueryIterator): + + ENT_START_TOKEN = "[START_ENT]" + ENT_END_TOKEN = "[END_ENT]" + + def get_query(self, id_): + datapoint = self.topics[id_] + query = ( + datapoint["input"] + .replace(KiltQueryIterator.ENT_START_TOKEN, "") + .replace(KiltQueryIterator.ENT_END_TOKEN, "") + .strip() + ) + return query + + @classmethod + def from_topics(cls, topics_path: str): + topics = {} + order = [] + if not os.path.exists(topics_path): + # Download if necessary: + topics_path = cls.download_kilt_topics(topics_path) + with open(topics_path, 'r') as f: + for line in f: + datapoint = json.loads(line) + topics[datapoint["id"]] = datapoint + order.append(datapoint["id"]) + return cls(topics, order) + + @classmethod + def download_kilt_topics(cls, task: str, force=False): + if task not in KILT_QUERY_INFO: + raise ValueError(f'Unrecognized query name {task}') + task = KILT_QUERY_INFO[task] + md5 = task['md5'] + save_dir = os.path.join(get_cache_home(), 'queries') + if not os.path.exists(save_dir): + os.makedirs(save_dir) + for url in task['urls']: + try: + return download_url(url, save_dir, force=force, md5=md5) + except (HTTPError, URLError) as e: + print(f'Unable to download encoded query at {url}, trying next URL...') + raise ValueError(f'Unable to download encoded query at any known URLs.') + + +def get_query_iterator(topics_path: str, topics_format: TopicsFormat): + mapping = { + TopicsFormat.DEFAULT: DefaultQueryIterator, + TopicsFormat.KILT: KiltQueryIterator, + } + return mapping[topics_format].from_topics(topics_path) diff --git a/pyserini/query_iterator_order_info.py b/pyserini/query_iterator_order_info.py new file mode 100644 index 0000000000000000000000000000000000000000..8ddffdc8235b952e084e270a13da297765e7f497 --- /dev/null +++ b/pyserini/query_iterator_order_info.py @@ -0,0 +1,22 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +QUERY_IDS = { + 'msmarco-doc-dev': [174249, 320792, 1090270, 1101279, 201376, 54544, 118457, 178627, 1101278, 68095, 87892, 257309, 1090242, 211691, 165002, 1101276, 264827, 342285, 372586, 89786, 118448, 92542, 206117, 141472, 196232, 352818, 208145, 79891, 208494, 319564, 155234, 14151, 67802, 1090184, 323382, 323998, 289812, 333486, 1090171, 73257, 1090170, 127876, 1090165, 259417, 1101271, 205107, 307118, 335710, 127984, 1090151, 1090146, 1090132, 1090115, 1090110, 1090107, 1090086, 1090077, 1090072, 1090054, 1101259, 1089983, 1089966, 1089964, 1089940, 1089925, 1089896, 1101236, 1089868, 1089846, 1089832, 1089810, 1101228, 1089804, 1089787, 1089776, 1089763, 1089760, 1089719, 1089706, 1089693, 1089691, 1089688, 1089683, 1089674, 1101214, 1089645, 1101211, 1089619, 1089597, 1089576, 1089560, 1089541, 1089511, 1089501, 1089469, 1089443, 1089438, 1089434, 1089414, 1089408, 1089401, 1089355, 1089325, 1089312, 1089293, 1089286, 1089273, 1101173, 1101172, 1101171, 1089177, 1089167, 1089158, 1089156, 1089143, 1089121, 1089093, 1089085, 1089071, 1089051, 1089044, 1089043, 1089036, 1089027, 1089026, 1089022, 1089021, 1089002, 1089001, 1088993, 1088987, 1088973, 1088960, 1088958, 1088947, 1088938, 1088928, 1088903, 1088889, 1088884, 1088869, 1088856, 1088845, 1088832, 1088800, 1101131, 1102300, 1088758, 1088734, 1088718, 1088693, 1101121, 1088685, 1088628, 1088606, 1088541, 1088539, 1088475, 1088453, 1101090, 1088437, 1101088, 1088379, 1088358, 1088349, 1088302, 1088164, 1088153, 1088138, 1088043, 1087999, 1087959, 1101048, 1087915, 1087911, 1087869, 1087858, 1087848, 1087803, 1087795, 1087774, 1087766, 1087736, 1087729, 1087727, 1087722, 1087687, 1087680, 1087675, 1087634, 1101018, 1087589, 1087581, 1087566, 1087556, 1087532, 1087514, 1087492, 1087487, 1087486, 1087484, 1087425, 1087375, 1087361, 1087351, 1087327, 1087317, 1087309, 1087238, 1087226, 1087215, 1100986, 1087204, 1087185, 1087173, 1087171, 1100980, 1087129, 1087122, 1087114, 1087105, 1087077, 1087076, 1087074, 1087066, 1087061, 1087047, 1087046, 1087042, 1087018, 1087014, 1087001, 1086974, 1086933, 1086928, 1086927, 1086917, 1086915, 1086893, 1086886, 1086883, 1086874, 1086860, 1086855, 1086836, 1086834, 1086765, 1086760, 1086715, 1086713, 1100937, 1086708, 1086701, 1086681, 1086679, 1100933, 1086628, 1100930, 1086595, 1086581, 1086565, 1086555, 1086532, 1086477, 1100919, 1086468, 1086439, 1086430, 1086424, 1086391, 1086385, 1086384, 1086326, 1086309, 1086288, 1086281, 1086271, 1086266, 1086248, 1086241, 1086224, 1086200, 1086174, 1086120, 1086075, 1086046, 1086022, 1086014, 1085980, 1085967, 1100875, 1085943, 1085918, 1085889, 1085845, 1085842, 1085812, 1085804, 1085796, 1085780, 1085779, 1085764, 1085762, 1085760, 1085733, 1085697, 1085674, 1100852, 1085658, 1085630, 1085613, 1085586, 1085584, 1085572, 1085550, 1085533, 1085532, 1085521, 1085517, 1100839, 1085510, 1085457, 1085456, 1085454, 1085441, 1085422, 1085421, 1085393, 1085356, 1085348, 1085341, 1085339, 1085327, 1085319, 1085288, 1085279, 1085245, 1085229, 1100816, 1085197, 1085141, 1085048, 1085035, 1085013, 1085008, 1084986, 1084982, 1084971, 1084910, 1084906, 1084898, 1084889, 1084887, 1100783, 1084848, 1084838, 1084814, 1084769, 1084755, 1084722, 1084713, 1084712, 1084686, 1084603, 1084582, 1084518, 1084516, 1084512, 1084478, 1084475, 1084469, 1084408, 1084403, 1084389, 1084383, 1084354, 1084336, 1084330, 1084326, 1084324, 1084308, 1084301, 1100732, 1084276, 1084273, 1084233, 1084230, 1084197, 1100724, 1084192, 1084086, 1084076, 1084075, 1084038, 1083997, 
1083948, 1083945, 1083933, 1083926, 1083909, 1083832, 1083831, 1083822, 1083800, 1083797, 1100687, 1083783, 1083727, 1083721, 1083704, 1083690, 1083686, 1083641, 1083627, 1083597, 1083584, 1083535, 1083517, 1083502, 1083500, 1083499, 1083472, 1083443, 1083430, 1083428, 1083410, 1083362, 1083345, 1083341, 1083340, 1083332, 1083307, 1083293, 1100639, 1083285, 1083278, 1083268, 1083267, 1100634, 1083161, 1083158, 1083152, 1083127, 1083125, 1083108, 1083095, 1083092, 1083085, 1083017, 1083010, 1083000, 1082948, 1082947, 1082924, 1082893, 1082870, 1082840, 1082835, 1082807, 1082792, 1082779, 1082759, 1082751, 1082750, 1082730, 1082668, 1082653, 1082622, 1082607, 1100581, 1082536, 1082531, 1082502, 1082455, 1082445, 1082427, 1082384, 1082377, 1082351, 1082341, 1082339, 1082332, 1082281, 1082265, 1082263, 1082242, 1082117, 1100544, 1081946, 1100541, 1081730, 1081609, 1081595, 1081338, 1100537, 1081091, 1081086, 1080970, 1080950, 1100533, 1080555, 1080406, 1080253, 1080031, 1080010, 1102240, 1079535, 1079340, 1079086, 1079050, 1078906, 1078765, 1077844, 1077019, 1077006, 1077002, 1100499, 1076269, 1075980, 1100492, 1075919, 1075741, 1075656, 1100488, 1075608, 1075591, 1075348, 1075313, 1075262, 1074997, 1074995, 1074989, 1074949, 1074883, 1074807, 1074804, 1074001, 1073640, 1073569, 1073365, 1073358, 1072874, 1072750, 1072603, 1072513, 1100458, 1072500, 1100457, 1100455, 1072188, 1100454, 1071992, 1071598, 1071545, 1071534, 1071389, 1071198, 1070546, 1070452, 1070324, 1070131, 1100438, 1069981, 1069717, 1069521, 1069405, 1069344, 1069222, 1069128, 1068952, 1068715, 1068584, 1068290, 1068276, 1067826, 1067772, 1067764, 1067724, 1067659, 1100415, 1067640, 1067587, 1067284, 1067276, 1066971, 1066966, 1066958, 1066916, 1066709, 1100403, 1066161, 1065971, 1065712, 1065650, 1065558, 1065551, 1065494, 1065160, 1064961, 1064808, 1064687, 1064206, 1064195, 1063974, 1063892, 1063777, 1063758, 1063702, 1063644, 1063607, 1063478, 1063349, 1100370, 1062961, 1062928, 1062784, 1062744, 1062687, 1062589, 1062511, 1062350, 1062332, 1062223, 1061762, 1061472, 1061324, 1061237, 1061210, 1061167, 1060881, 1060868, 1060795, 1060623, 1060566, 1060496, 1060462, 1060391, 1060040, 1060039, 1059698, 1059601, 1059421, 1059420, 1059287, 1059077, 1059045, 1100319, 1058952, 1058885, 1058604, 1100308, 1058601, 1058515, 1058470, 1058442, 1058325, 1058271, 1058182, 1058141, 1100299, 1058036, 1057996, 1057937, 1057708, 1057656, 1057539, 1057334, 1057139, 1057112, 1057015, 1056758, 1056742, 1056726, 1056644, 1056580, 1056437, 1056211, 1056163, 1056159, 1056060, 1056057, 1055940, 1055889, 1055717, 1055505, 1055197, 1055125, 1054999, 1054969, 1054958, 1054923, 1054707, 1054468, 1054451, 1054450, 1054440, 1054438, 1054189, 1054023, 1053931, 1053901, 1053896, 1053716, 1100229, 1102206, 1100224, 1053253, 1053219, 1052985, 1052965, 1100218, 1052640, 1052115, 1052089, 1051942, 1051902, 1051808, 1051571, 1051530, 1051520, 1051475, 1051422, 1051372, 1051352, 1051339, 1051307, 1100190, 1051285, 1051229, 1051223, 1051214, 1051211, 1100188, 1051112, 1100187, 1051108, 1050923, 1050778, 1050695, 1050231, 1049955, 1100173, 1049791, 1049774, 1049456, 1100168, 1049329, 1100167, 1049085, 1048917, 1048642, 1048381, 1048359, 1048303, 1048282, 1048281, 1100151, 1048185, 1047917, 1047913, 1047854, 1047843, 1047833, 1047794, 1047738, 1047708, 1047702, 1047700, 1047662, 1047642, 1047629, 1047599, 1100137, 1047548, 1047386, 1047365, 1047160, 1047138, 1047010, 1046969, 1046931, 1046736, 1100119, 1046569, 1046520, 1046475, 1046463, 1046161, 1100106, 1046047, 
1046042, 1100105, 1045855, 1045826, 1045717, 1045709, 1045567, 1100094, 1045554, 1045540, 1045527, 1045494, 1045374, 1045347, 1045229, 1045227, 1045208, 1045135, 1045071, 1100077, 1100070, 1043914, 1043815, 1043658, 1043568, 1043545, 1043337, 1043064, 1042978, 1042800, 1042752, 1042626, 1042426, 1042364, 1042158, 1042099, 1041951, 1041948, 1041924, 1100035, 1041520, 1041226, 1041146, 1041043, 1040959, 1040848, 1040703, 1040532, 1040409, 1040312, 1040099, 1040088, 1040082, 1040064, 1100010, 1040038, 1040030, 1039728, 1039521, 1039298, 1099998, 1039002, 1038879, 1038859, 1038830, 1038724, 1038527, 1099985, 1038184, 1099981, 1099980, 1037872, 1037817, 1037689, 1037250, 1036784, 1036782, 1036627, 1102177, 1036385, 1036380, 1036244, 1036214, 1036005, 1035931, 1035874, 1035805, 1035719, 1035535, 1035383, 1035379, 1035367, 1035321, 1035278, 1035247, 1035006, 1034845, 1034761, 1034703, 1034680, 1034666, 1034595, 1034587, 1034446, 1034409, 1034204, 1034136, 1099914, 1033962, 1033927, 1099911, 1033725, 1033718, 1033652, 1033534, 1033398, 1033250, 1033249, 1033205, 1033092, 1032822, 1032758, 1032341, 1032281, 1099888, 1032198, 1032182, 1032019, 1031976, 1031861, 1031684, 1031682, 1031054, 1031033, 1031032, 1030924, 1030823, 1030722, 1030623, 1099859, 1030381, 1099855, 1030378, 1030324, 1030176, 1029909, 1029908, 1029772, 1029617, 1029552, 1029544, 1029492, 1099836, 1099834, 1029124, 1029058, 1029031, 1029030, 1029003, 1028796, 1028755, 1028753, 1099823, 1028711, 1028608, 1028598, 1028538, 1099816, 1028179, 1028098, 1027919, 1027817, 1027812, 1102163, 1027373, 1099806, 1099805, 1027178, 1026991, 1026799, 1026768, 1026148, 1025991, 1025801, 1025624, 1025483, 1025290, 1025270, 1025259, 1024904, 1024893, 1024727, 1024672, 1024667, 1024592, 1024591, 1024528, 1024288, 1024221, 1024166, 1024069, 1099756, 1023025, 1022907, 1022832, 1022782, 1022712, 1022621, 1022442, 1022410, 1022370, 1022359, 1022198, 1022178, 1022124, 1021971, 1099729, 1021931, 1021900, 1099726, 1021797, 1021695, 1021605, 1021327, 1021324, 1021170, 1021065, 1021053, 1020907, 1020724, 1020500, 1099706, 1020244, 1099700, 1019783, 1019724, 1019649, 1019414, 1019262, 1018918, 1018807, 1018658, 1018359, 1099670, 1018056, 1018032, 1017971, 1017952, 1017773, 1017706, 1017692, 1017687, 1017605, 1017529, 1017524, 1017476, 1017276, 1017204, 1016915, 1016879, 1099653, 1016676, 1016611, 1016583, 1016281, 1016154, 1016015, 1016013, 1015641, 1099636, 1015347, 1015307, 1099632, 1014885, 1014264, 1099626, 1014210, 1014132, 1013965, 1013797, 1013615, 1013592, 1013579, 1013570, 1013492, 1013424, 1013304, 1013267, 1013229, 1013114, 1012866, 1012865, 1099595, 1012547, 1012431, 1012026, 1011925, 1011713, 1011618, 1011512, 1011381, 1011328, 1011248, 1011166, 1011140, 1011120, 1011044, 1011021, 1011018, 1010615, 1010527, 1010524, 1010287, 1010057, 1010048, 1009994, 1009961, 1009749, 1009742, 1009724, 1009695, 1009610, 1009527, 1009388, 1009237, 1009183, 1009023, 1008979, 1008977, 1008968, 1008951, 1008947, 1008911, 1099495, 1008515, 1099482, 1007972, 1007959, 1007934, 1007696, 1007691, 1007673, 1007628, 1007606, 1007550, 1007473, 1007242, 1006911, 1006791, 1006751, 1006578, 1006459, 1006199, 1005798, 1099452, 1099451, 1005586, 1005131, 1005113, 1004949, 1004921, 1004258, 1004254, 1004243, 1004240, 1004233, 1004228, 1004167, 1003849, 1003831, 1003590, 1003482, 1003351, 1003319, 1003277, 1003239, 1003003, 1002940, 1002938, 1002889, 1002716, 1099391, 1002585, 1002584, 1002554, 1002482, 1002426, 1002330, 1002274, 1002252, 1002238, 1002197, 1002148, 1002058, 1001999, 
1099368, 1001926, 1001903, 1001810, 1001454, 1000864, 1000798, 1000681, 1000574, 1000519, 1000459, 1000319, 1000272, 1102121, 1000083, 1000030, 1000017, 1000006, 1000004, 1000000, 999942, 999836, 999791, 999691, 999685, 999567, 999552, 999550, 999517, 999469, 999439, 999416, 999385, 999356, 1099321, 999192, 999110, 999089, 999086, 998965, 998905, 998891, 998802, 998735, 998681, 998680, 998675, 998658, 998646, 998641, 998591, 998493, 998417, 998309, 998247, 998246, 998174, 998101, 1099290, 998013, 997935, 997932, 1099288, 997878, 997860, 1099284, 997744, 997713, 997649, 997648, 997542, 997481, 997449, 997351, 997086, 997044, 996922, 996835, 996825, 996805, 996623, 996414, 996328, 996272, 996181, 996119, 995806, 995805, 995787, 995756, 995380, 995280, 995221, 995141, 994867, 994830, 1099226, 994792, 994688, 994533, 994479, 994397, 1099219, 994338, 1099217, 994228, 994133, 994085, 994005, 993996, 993987, 993834, 993821, 993795, 993748, 993627, 993320, 993255, 993178, 993153, 993041, 992950, 992949, 992946, 992840, 992839, 992677, 992660, 992659, 992605, 992559, 992535, 992531, 1099178, 992433, 992407, 992367, 992365, 992363, 992257, 992224, 992193, 992191, 992132, 992120, 991854, 991832, 991782, 991762, 991685, 991471, 991342, 991324, 991210, 991207, 991171, 991111, 991044, 991032, 990995, 990938, 990852, 990841, 990763, 990649, 990414, 990375, 990345, 990026, 989894, 989870, 989831, 989573, 989530, 1099105, 989296, 989213, 989042, 988954, 988915, 1102400, 988787, 988745, 988743, 988710, 988653, 988636, 988540, 988504, 1099072, 988253, 988124, 988122, 988121, 988119, 1099065, 987845, 987823, 987822, 987809, 987671, 987573, 987567, 987502, 987309, 987237, 987230, 1099050, 987192, 987183, 987066, 986936, 986852, 986793, 986791, 986733, 986494, 986484, 986427, 986411, 986316, 986210, 986162, 986068, 985461, 985433, 985431, 985372, 985360, 985275, 985259, 985173, 985167, 985165, 985158, 984992, 984948, 984856, 984774, 129837, 241405, 61452, 173001, 197024, 81993, 186446, 86624, 98817, 246626, 373121, 240504, 112035, 141353, 11006, 235832, 96379, 1098967, 298565, 86094, 141694, 320117, 281002, 353623, 1098953, 60357, 58583, 262974, 334754, 36214, 96749, 181394, 296993, 75608, 83448, 270603, 1098927, 183046, 362845, 164528, 244821, 95409, 293401, 1098909, 176015, 323798, 10157, 137919, 8854, 1098905, 89777, 97895, 149447, 239516, 299350, 323535, 1098895, 1098874, 253678, 160562, 282530, 166043, 357162, 328629, 1098860, 122440, 53813, 10312, 88577, 1098846, 109276, 15382, 29097, 185009, 98682, 230082, 194531, 168069, 1098809, 1098806, 1098804, 168238, 242219, 127315, 203688, 176994, 160255, 47864, 292676, 222954, 36965, 272500, 2962, 125545, 1098765, 11133, 1098763, 118365, 172981, 96310, 276338, 80590, 131665, 125996, 27618, 210690, 334916, 136209, 92437, 24979, 277785, 227591, 249321, 136098, 307521, 1098698, 264594, 169778, 158887, 135516, 15607, 237945, 164912, 125627, 20597, 339888, 276298, 234651, 129565, 12903, 145821, 180592, 1098646, 176677, 9926, 1098641, 265960, 172787, 94865, 135386, 234998, 100616, 1098608, 305361, 61882, 338713, 1098600, 102506, 128113, 44072, 128200, 334433, 329901, 153027, 90941, 197964, 49802, 184452, 229325, 231292, 273481, 30188, 244808, 101451, 191971, 108622, 150087, 182393, 181222, 144491, 258485, 292094, 1098523, 277632, 1098520, 97295, 188908, 1098510, 107812, 310853, 208339, 1098497, 149790, 132263, 106508, 314907, 371695, 1098481, 12741, 305333, 28216, 20671, 320320, 86264, 220151, 316803, 70340, 223468, 59217, 276329, 236580, 130932, 139239, 206549, 
234821, 93308, 174273, 278863, 199572, 285656, 31432, 347491, 207251, 54531, 56033, 300312, 107077, 160885, 209651, 1098355, 143464, 1098354, 183201, 1098338, 222158, 159667, 61180, 1098322, 99183, 85954, 153794, 239189, 195693, 209764, 190307, 343976, 29169, 86701, 24115, 123975, 167436, 160339, 267644, 119534, 10276, 21765, 119975, 165807, 195582, 114037, 282397, 1102028, 165480, 279718, 1098222, 103125, 181144, 55691, 212236, 359499, 119168, 19457, 162351, 371204, 190212, 183874, 357664, 259128, 1098180, 249792, 232703, 1098169, 133037, 226461, 318073, 277701, 183723, 228474, 62648, 72613, 53814, 17848, 139897, 328611, 154633, 259239, 137411, 22882, 309402, 114638, 324645, 280927, 311067, 1098111, 118702, 1098110, 76770, 201366, 195440, 1098102, 142411, 234165, 1098090, 337073, 167156, 18101, 75342, 267012, 193742, 36473, 15039, 264410, 161224, 286160, 132359, 191632, 1098057, 170788, 1098048, 136700, 242713, 186390, 1102390, 324159, 196111, 78730, 40056, 9454, 295406, 184436, 78076, 265729, 212195, 152519, 126491, 57402, 139929, 131873, 185276, 27932, 160787, 323154, 377805, 1098013, 57882, 1098010, 334904, 20356, 303777, 367290, 284072, 1097999, 236949, 288884, 58801, 200296, 142039, 260762, 253965, 46579, 85904, 92260, 167566, 146244, 338040, 123710, 306105, 299094, 178468, 337209, 1097939, 74759, 202797, 1097937, 47741, 277799, 308687, 58571, 1102001, 323555, 259885, 1097909, 1097906, 255027, 1097905, 132104, 126525, 1097894, 142382, 1097885, 185299, 264150, 39577, 289556, 290499, 107283, 57614, 74356, 310948, 211621, 1101995, 294518, 329958, 157149, 149853, 204924, 144857, 227637, 207595, 337190, 113664, 181301, 237561, 62439, 362076, 1097796, 88284, 139767, 13397, 239511, 214040, 226509, 1097786, 202073, 250367, 313940, 326509, 236427, 32642, 267187, 51276, 145877, 256052, 10205, 112718, 342115, 244902, 212634, 323096, 46095, 1097723, 65583, 1097721, 64179, 139090, 299939, 184235, 196596, 154301, 82293, 267341, 1097674, 17635, 262232, 93649, 285537, 51090, 34039, 1215, 335711, 66154, 276208, 144694, 21861, 250636, 299110, 302337, 163602, 31595, 146598, 70852, 271038, 30956, 193866, 249802, 116939, 69506, 328474, 189466, 1097602, 1101977, 140804, 325292, 203390, 298550, 30860, 143293, 192894, 283154, 551309, 65038, 448630, 459280, 757275, 116820, 128772, 432653, 550565, 573954, 37952, 706950, 492853, 451406, 571103, 699510, 418353, 441409, 438286, 533105, 689851, 694561, 405660, 163860, 559507, 464860, 583234, 392393, 431602, 298940, 560673, 709342, 409854, 398447, 633399, 193581, 559198, 613852, 390484, 656250, 407131, 742822, 606944, 1097508, 433691, 701335, 217246, 455776, 723144, 409143, 444790, 408765, 515335, 758074, 483795, 591898, 468762, 462301, 580411, 1097469, 450921, 632394, 1097461, 557157, 406576, 543849, 60677, 1097448, 663006, 129229, 539957, 615383, 152598, 1097438, 448976, 756949, 719411, 562827, 70709, 575616, 387848, 539601, 463373, 724872, 483521, 257885, 566335, 523621, 466640, 704236, 619087, 1097386, 184105, 1097373, 180902, 675320, 643572, 709560, 1097359, 593732, 453220, 696404, 463443, 591940, 577813, 558263, 442525, 610940, 167229, 433220, 704072, 1097317, 732631, 340712, 755040, 454018, 1097314, 569674, 172062, 587524, 372070, 419692, 1097304, 1097298, 471705, 503381, 400696, 731759, 666694, 456016, 701390, 559959, 699873, 704398, 743046, 717845, 499904, 463133, 455659, 610056, 398258, 669979, 487279, 736347, 653041, 459948, 417040, 400692, 740762, 738165, 556489, 571954, 167994, 648877, 592495, 735387, 1097242, 703268, 480932, 1097236, 6217, 741392, 
602957, 528760, 1097223, 412352, 709936, 616045, 437914, 732618, 715189, 189115, 1097213, 446834, 392936, 515317, 710297, 426442, 718444, 565696, 1097195, 587674, 495680, 510867, 679390, 588829, 432874, 470611, 424092, 753479, 412597, 406181, 632825, 417404, 674702, 15063, 496276, 1097154, 687632, 147542, 511101, 523952, 489513, 685091, 565868, 594930, 1097135, 735895, 548254, 748997, 627085, 695238, 1097119, 422624, 1097118, 519145, 395038, 592192, 1097100, 608557, 1097093, 611152, 742667, 1097087, 423608, 448975, 514767, 727551, 1097066, 678176, 432602, 606117, 27743, 694063, 1097040, 485287, 610898, 637208, 517245, 648119, 1097027, 39360, 537526, 1097023, 707670, 1097014, 261098, 517117, 604673, 19940, 1096998, 479284, 646354, 691141, 73788, 601624, 330560, 568841, 594831, 1096964, 442673, 703211, 138640, 457622, 1096958, 750487, 459481, 558448, 189312, 755465, 733422, 404713, 663820, 1096947, 1096945, 418063, 507381, 1096944, 688739, 436844, 605169, 428113, 48417, 745559, 716641, 469873, 509907, 263889, 727837, 54843, 407102, 420304, 634583, 1096911, 416846, 655057, 512405, 568649, 478359, 653187, 445494, 559018, 91345, 690010, 754191, 476807, 112318, 708904, 1096887, 1096886, 467597, 592601, 585344, 679360, 757644, 415500, 635058, 474873, 622100, 685177, 299023, 333700, 427086, 1096870, 580313, 256783, 733186, 747345, 1096866, 684977, 387864, 488676, 575268, 613318, 570068, 147166, 1096850, 747720, 711682, 643561, 1096840, 497132, 1096830, 290779, 1096827, 256192, 635237, 691507, 1096823, 402417, 636853, 469535, 639163, 581521, 751778, 386653, 424509, 454258, 554511, 453270, 586916, 478827, 1096787, 656371, 477648, 649640, 1096776, 365044, 585680, 524722, 685591, 497360, 489374, 568895, 682105, 476483, 634126, 413040, 735343, 433549, 1288, 494730, 659929, 1096742, 696217, 417362, 739599, 710755, 669444, 117683, 633350, 131768, 337864, 495082, 1096712, 686290, 590861, 477380, 415165, 521402, 541135, 389258, 711803, 589564, 497596, 759503, 480064, 461491, 596088, 466774, 406974, 714672, 1096667, 680514, 635125, 1096658, 1096656, 589777, 424449, 645024, 456305, 399364, 1096644, 724947, 1096641, 494086, 626232, 277977, 372378, 1096628, 550331, 303045, 288702, 645604, 539648, 1096620, 503674, 1096610, 1096607, 641156, 505107, 505810, 626462, 397090, 464484, 484551, 617611, 587326, 670142, 243244, 208265, 455862, 666792, 665972, 670022, 660534, 753168, 517386, 470982, 745944, 590945, 422501, 1096557, 642352, 477639, 1096551, 669046, 412982, 399617, 534941, 497470, 1096543, 576822, 745746, 635647, 1096533, 1096532, 647949, 417080, 582146, 1096527, 613233, 524332, 615457, 637234, 567159, 576851, 547301, 456551, 156215, 148761, 168787, 693152, 142782, 1096509, 245921, 75717, 573157, 560245, 718112, 2, 649763, 555850, 18840, 627513, 1101871, 607374, 753517, 54040, 1096479, 1101870, 1096476, 1096475, 84778, 440098, 42361, 567714, 435794, 453869, 482808, 473492, 1101868, 1096457, 464440, 1096454, 453705, 65584, 620992, 556976, 694560, 407869, 484454, 1096431, 1096429, 667373, 527568, 595577, 461601, 693736, 231482, 443081, 282214, 747937, 467683, 551119, 450093, 696918, 540906, 575492, 588888, 687375, 479570, 708781, 578100, 198581, 441128, 748054, 329515, 560059, 1101861, 686469, 731886, 227968, 405867, 1096376, 1096375, 406205, 507086, 97766, 539278, 511367, 1096368, 490505, 458885, 1096360, 273014, 482412, 613694, 660999, 342450, 413079, 420934, 682025, 627323, 82100, 371420, 592672, 493508, 626701, 640232, 638503, 724579, 437165, 352236, 719488, 274981, 547018, 433680, 731902, 412340, 358150, 
510858, 505171, 544060, 453451, 632625, 405090, 409207, 477309, 749955, 504306, 436249, 1096268, 448035, 530572, 1096262, 573899, 97972, 185397, 506985, 1096258, 1096257, 1096256, 610716, 19552, 436475, 1096252, 648049, 662436, 652556, 536480, 414714, 1101845, 687615, 505541, 651187, 759062, 754786, 443489, 249821, 506438, 1096207, 499568, 663679, 428773, 421145, 600350, 690606, 657204, 697780, 556248, 427323, 574944, 220761, 408563, 526331, 555558, 440362, 497757, 391481, 1096180, 601684, 585165, 586268, 368229, 734979, 466335, 667535, 504335, 677460, 509730, 115365, 418926, 693469, 427730, 375291, 649200, 614598, 478295, 674914, 659230, 88160, 722413, 261521, 1096126, 738525, 142579, 393881, 574317, 408739, 481961, 397592, 614409, 525660, 463635, 171776, 632536, 630905, 573452, 473319, 605363, 746055, 697983, 403361, 406525, 510444, 755459, 1096087, 669800, 739671, 534305, 695993, 496175, 722981, 288200, 1096065, 744764, 680951, 548099, 588122, 409887, 433685, 593541, 464663, 569939, 434369, 1096049, 758901, 669288, 59204, 1096045, 650076, 1096044, 513061, 66161, 116517, 663950, 384985, 658498, 1096025, 596716, 231109, 561448, 649110, 745402, 743696, 76283, 570979, 544123, 646179, 434462, 160735, 735384, 442593, 225499, 722515, 656859, 566946, 413905, 724121, 1101822, 682365, 1095994, 77424, 714678, 221664, 93234, 1095988, 625022, 652912, 97612, 609104, 1095982, 478220, 575096, 417902, 458110, 1095971, 692955, 422600, 583766, 639560, 624876, 425688, 523062, 1095955, 718782, 456734, 597384, 578783, 460162, 750946, 308032, 671692, 1095942, 556476, 409557, 273449, 358240, 418423, 485558, 392501, 486370, 75698, 738162, 586740, 1095928, 676454, 641284, 123859, 601629, 646623, 1095921, 758720, 548673, 641618, 29416, 528841, 277556, 467612, 437671, 739913, 530611, 503401, 290830, 749267, 1095881, 151547, 547089, 192502, 1095876, 1095874, 540432, 607338, 739743, 1095868, 699872, 599550, 687245, 438455, 1095864, 472448, 543813, 570905, 619159, 282411, 595568, 1095857, 445094, 582557, 595236, 148777, 681264, 1095845, 1101806, 242107, 289276, 732448, 423178, 508316, 483178, 481297, 156889, 525534, 40337, 159078, 153981, 425505, 481387, 266920, 1095807, 1095806, 173391, 451609, 574547, 1095798, 392195, 544319, 742988, 552868, 435412, 163038, 447340, 585378, 272815, 172608, 740624, 129205, 436602, 733510, 507087, 508855, 1095749, 299732, 108037, 662524, 730278, 405036, 59392, 717563, 117113, 1095725, 649893, 522076, 420400, 1095716, 393268, 1095711, 426214, 1095704, 418165, 662016, 1095699, 510229, 575146, 758909, 249618, 645252, 1095687, 249866, 408427, 470001, 77323, 624176, 472359, 75266, 633916, 556166, 551819, 754509, 679658, 625205, 574051, 586790, 1095654, 505152, 537410, 636949, 1095650, 524447, 546956, 444350, 593135, 630318, 688644, 1095641, 581975, 681791, 682626, 1095631, 436924, 683045, 639412, 164282, 149801, 70720, 510158, 108507, 744109, 754166, 389385, 458771, 192579, 583916, 665009, 440802, 422955, 608197, 461078, 393462, 47716, 506579, 576312, 431481, 748771, 711710, 1095571, 1095566, 449244, 426504, 420980, 641293, 1095560, 94953, 1095558, 436100, 727707, 1095555, 148424, 321239, 664138, 647876, 280223, 1095542, 658372, 1095537, 717751, 4947, 399527, 605467, 555590, 581801, 237936, 1095495, 535142, 448123, 634113, 593275, 119761, 177221, 547820, 420867, 1095478, 404202, 415962, 1095469, 384845, 458774, 663131, 459707, 569689, 412532, 437752, 744092, 660479, 8714, 663890, 736713, 153037, 414799, 1095437, 451070, 760367, 556307, 275997, 453175, 22670, 415815, 632106, 138793, 443027, 
660803, 124787, 407662, 536995, 670476, 32176, 755907, 474234, 706342, 438058, 511330, 583369, 508104, 682190, 638928, 199776, 403793, 663388, 423878, 413404, 662282, 652961, 451484, 414155, 275528, 734198, 263670, 635079, 604153, 667136, 1095377, 743868, 567759, 1095371, 470385, 728060, 577511, 476977, 568585, 1095360, 1095357, 1095354, 625458, 138127, 507434, 637459, 607599, 758519, 698719, 635626, 450854, 1095335, 704223, 1095332, 428819, 436847, 685717, 682205, 644023, 496717, 686260, 754113, 750421, 647687, 630391, 14963, 436091, 275137, 594793, 1101761, 673984, 1095306, 83621, 659182, 401287, 466738, 1095278, 115930, 455782, 562821, 741977, 660957, 701345, 450851, 635044, 611199, 588627, 302435, 488825, 278542, 476947, 741274, 530602, 737940, 584500, 1095233, 591993, 520627, 43781, 437324, 540306, 700835, 653054, 584727, 146812, 619675, 670829, 604628, 701663, 478054, 490883, 327750, 203458, 700641, 661398, 571237, 592220, 680490, 497536, 2235, 703270, 741970, 37685, 1095165, 194750, 521801, 392488, 749752, 1095155, 584905, 388950, 274175, 730229, 334558, 752473, 746065, 532142, 578607, 522953, 422609, 130825, 471007, 469819, 711811, 449235, 607855, 733591, 302878, 1095126, 1095121, 510018, 570725, 129517, 1095108, 624644, 563652, 677212, 729697, 155086, 466162, 691055, 127098, 77878, 425330, 710914, 690801, 562594, 1101739, 1095066, 418552, 478981, 728460, 430142, 1095055, 604229, 117036, 450681, 731723, 572517, 677672, 321363, 432680, 733692, 667932, 418977, 672429, 726076, 1095012, 278429, 438324, 188134, 387662, 570070, 136157, 1094999, 461281, 1094996, 402427, 391125, 589903, 703383, 633986, 708739, 61623, 1094982, 549738, 587853, 703765, 516029, 759038, 686541, 705681, 1094962, 392350, 329369, 450788, 640103, 430229, 245120, 563347, 597395, 617795, 727224, 748672, 752700, 707513, 672109, 624503, 734426, 612471, 657264, 526984, 543251, 459291, 563943, 728823, 753299, 433579, 583798, 449442, 647503, 622725, 503580, 743675, 580450, 745469, 656376, 589586, 388588, 525868, 174592, 526671, 614047, 393203, 1101723, 1094869, 672433, 276979, 693101, 738484, 535599, 507934, 592235, 466252, 420365, 514851, 742022, 691004, 760512, 212796, 1094840, 748321, 577131, 689885, 617968, 490802, 1094825, 455743, 632726, 603031, 541425, 453856, 661076, 604113, 702792, 722352, 547139, 704080, 694678, 603773, 138266, 582641, 1101714, 584569, 455456, 559009, 711840, 565231, 528117, 729672, 129792, 684459, 626005, 689700, 632923, 1094759, 560419, 574569, 112477, 549135, 1094755, 208610, 421813, 688218, 466202, 714709, 345350, 458235, 724571, 608323, 1094724, 633635, 430985, 559709, 398335, 574002, 712832, 750821, 681514, 607292, 467274, 1101706, 639545, 1094691, 521018, 659247, 1094689, 632055, 516413, 338696, 334867, 518940, 495018, 472024, 261683, 559607, 422893, 622658, 178859, 320051, 743708, 496244, 740876, 421437, 211468, 503390, 612846, 153048, 124128, 1094634, 455273, 92509, 612670, 478691, 576195, 93823, 737512, 243712, 453851, 439375, 728150, 1094612, 73853, 611271, 1094605, 624143, 726098, 567452, 541948, 698445, 671219, 740416, 94782, 250228, 1094578, 210442, 601128, 462979, 411953, 6791, 471983, 1094566, 475402, 231717, 642032, 657091, 679167, 102695, 645343, 1094536, 175251, 498478, 677936, 537825, 549235, 396391, 1094519, 711759, 488345, 549219, 1094501, 721885, 555750, 456443, 568526, 1094477, 708517, 25603, 709559, 191792, 1094469, 391101, 1576, 170982, 753214, 1094460, 341317, 1094453, 693636, 481341, 504044, 489858, 556217, 406923, 430989, 413858, 402318, 633153, 231298, 538333, 
618408, 249118, 567630, 393954, 628056, 645892, 1102351, 647260, 1094395, 1094394, 1094389, 537761, 331352, 419326, 425375, 598802, 506181, 248086, 559771, 638849, 744891, 560357, 1094370, 445714, 1094369, 571696, 425072, 473935, 1094364, 602352, 1094361, 635497, 563995, 21741, 419333, 693162, 730626, 593792, 482666, 1101674, 96250, 733892, 409071, 1094316, 223165, 387603, 26207, 680373, 452200, 644658, 384406, 468907, 137440, 540983, 494346, 412319, 171527, 635150, 438316, 623857, 402075, 614069, 285729, 634412, 1101670, 669427, 616447, 65000, 609799, 266760, 146212, 161418, 1101668, 1094271, 454872, 714636, 99556, 631724, 21948, 515813, 409694, 88375, 563359, 602652, 574730, 281704, 543951, 242019, 743693, 445908, 584592, 621419, 24441, 403388, 551860, 124534, 486274, 138223, 524166, 673143, 1094220, 715508, 1094215, 522151, 404051, 394021, 416228, 393420, 563771, 670600, 1094197, 1101661, 1094191, 281702, 649451, 583611, 473394, 408945, 470459, 1094175, 414276, 705279, 623281, 489931, 545450, 694845, 531142, 564707, 753480, 628085, 1094141, 448183, 568709, 614186, 230891, 725047, 700224, 521851, 707721, 715588, 524699, 517516, 543644, 1094110, 702790, 447551, 647872, 760070, 525467, 473886, 1094085, 1094081, 674595, 649294, 591026, 537301, 261650, 509111, 405985, 1094062, 1094056, 427340, 457809, 34015, 639084, 455853, 680250, 544308, 424898, 572286, 426347, 199442, 144254, 82161, 565915, 1094027, 449750, 683193, 728110, 576452, 525779, 610425, 605651, 720013, 582848, 503607, 690508, 1093971, 155056, 483241, 64528, 541969, 1093966, 536791, 86203, 1093962, 527769, 1093959, 609956, 549342, 577167, 731736, 700618, 1093941, 284313, 22479, 507221, 405310, 1093927, 1093926, 571474, 100013, 609628, 690705, 713134, 712545, 604954, 499413, 541274, 479525, 106125, 480504, 400311, 50891, 610190, 1093881, 599524, 457714, 605648, 535421, 487569, 591310, 555458, 4696, 525047, 1093855, 744261, 54199, 536654, 653092, 558548, 961921, 978802, 982481, 775343, 138629, 841165, 841020, 846513, 786021, 1066043, 1093791, 939473, 1058978, 858421, 1093786, 345453, 330450, 821372, 855968, 160671, 1093781, 1036002, 783963, 1093773, 1003445, 315131, 841961, 912898, 771694, 961950, 772928, 776465, 991240, 772055, 917334, 1093750, 853057, 787784, 1041473, 1093732, 783277, 792900, 1093723, 65488, 148564, 792742, 29612, 1093717, 212977, 989855, 936273, 987486, 1019470, 884870, 1005191, 998941, 952378, 95286, 258337, 783433, 1020915, 931905, 808716, 1053111, 1093698, 900924, 1057251, 875787, 907997, 976829, 234114, 888100, 865616, 58130, 789439, 1093682, 763878, 790059, 338637, 810242, 808362, 925571, 944700, 170770, 1011860, 815015, 25025, 930326, 1028131, 1093650, 778890, 978057, 1005520, 900731, 1093637, 960397, 862742, 25534, 1093621, 831601, 900076, 981400, 995176, 852037, 1036542, 971233, 914321, 921173, 942221, 944181, 983708, 804197, 988269, 791629, 804905, 831560, 849561, 913509, 1093570, 230725, 831315, 1037407, 837181, 1011811, 1076078, 303934, 959228, 1093556, 869827, 1093552, 783602, 849337, 1011721, 976941, 950799, 863623, 1031240, 1024034, 812734, 1093540, 1078198, 1093534, 773155, 898714, 1023850, 227317, 825954, 1093507, 956993, 202081, 49435, 884878, 1078731, 780336, 893681, 868598, 1038755, 357340, 1093487, 1093481, 968560, 934235, 1051886, 376537, 906901, 860266, 778139, 831474, 853344, 1093443, 241246, 1093438, 822859, 53897, 884722, 140161, 66908, 992618, 762111, 991138, 64960, 981006, 830040, 944231, 948397, 925951, 835478, 1073972, 147337, 1093419, 800792, 790536, 1093410, 800318, 899869, 
1093406, 1081569, 244092, 1093399, 831030, 877810, 798284, 837375, 166748, 1016406, 270140, 893271, 148851, 171370, 897476, 1027669, 848478, 329114, 1093359, 959083, 865426, 947678, 1003114, 856171, 779553, 773998, 1093349, 988988, 939104, 927553, 881695, 1035228, 954455, 778948, 881582, 1093322, 1003329, 904295, 373209, 971633, 132639, 1093312, 952388, 1093305, 767745, 845888, 869035, 1004940, 1017734, 931147, 989994, 792463, 789332, 850919, 950139, 1101576, 942651, 779475, 1004199, 934223, 1093255, 1011529, 1033703, 804103, 827791, 903811, 813675, 1009109, 1093238, 776122, 1093235, 176744, 1101566, 1093231, 894610, 873250, 1026098, 948452, 224314, 1039195, 891082, 917489, 321918, 859274, 972699, 944245, 983438, 109647, 1093202, 1046952, 1093200, 1093196, 793475, 844390, 1093181, 1093179, 1057476, 129228, 1093172, 960265, 254652, 80712, 809933, 971213, 803306, 1048565, 839878, 946428, 766769, 910818, 339934, 1093142, 1022577, 1093128, 831962, 788851, 872347, 796056, 130034, 1093112, 971653, 788702, 803599, 1093104, 827801, 786857, 777297, 206806, 989108, 1093096, 74328, 1101552, 1093094, 42568, 866101, 820267, 362016, 262280, 1046648, 155041, 841521, 1093064, 840061, 965578, 1093042, 1061994, 983499, 810680, 1093038, 879869, 1093031, 839528, 861169, 818819, 914637, 975775, 1101535, 958311, 957607, 1093006, 28352, 1070412, 870348, 338917, 800987, 895263, 1092996, 977770, 820973, 61531, 134239, 855546, 892454, 57270, 890532, 875796, 979133, 1010537, 994112, 1092984, 778857, 1101531, 941865, 1029016, 1092978, 824000, 22231, 1037104, 924895, 368728, 1092952, 908154, 762652, 862345, 920717, 943190, 862856, 10264, 978605, 242863, 1092930, 872081, 853882, 1010277, 1092919, 778095, 1092911, 822937, 91722, 837681, 161828, 987660, 837202, 810324, 283344, 889104, 25294, 839128, 782549, 1056548, 794469, 779674, 903479, 934964, 1092865, 1005163, 919913, 1092863, 761096, 1018525, 991894, 1021907, 944194, 1092858, 1037826, 109819, 993234, 1059820, 819618, 1024312, 1092832, 1092822, 2798, 166403, 893275, 766272, 809556, 359040, 801907, 348136, 1092796, 776080, 878959, 1092792, 991590, 1092791, 874914, 993107, 839137, 809798, 904007, 948797, 795991, 999555, 817349, 96602, 979054, 70504, 980633, 960437, 783687, 782696, 809909, 1092759, 1092757, 786009, 1092756, 341736, 930483, 783098, 1040684, 855050, 956403, 881723, 856568, 1033912, 994918, 1092738, 823203, 1072506, 1075156, 929046, 953351, 1092724, 1003006, 69871, 111377, 968608, 791223, 865518, 1092715, 789037, 866251, 1079868, 897789, 922398, 844211, 846082, 952452, 922335, 869348, 992652, 967106, 926019, 985644, 902657, 880527, 1092665, 766804, 950355, 1063177, 1101503, 190601, 924398, 1078920, 1092643, 300246, 762558, 999028, 885153, 924567, 837467, 850957, 913579, 272605, 891565, 935358, 63548, 884533, 1092605, 144285, 278606, 1078187, 260853, 1060616, 1038592, 780613, 1026372, 941219, 969750, 913568, 1052717, 887395, 1023363, 1092557, 1092551, 354222, 320025, 807880, 869759, 1092543, 960302, 1043702, 1092528, 943170, 72485, 888988, 1092522, 1092517, 944451, 1043413, 908069, 1040353, 320970, 823549, 1003875, 885081, 796812, 1092484, 937427, 984499, 196720, 915769, 1092482, 974808, 998381, 1050007, 792688, 1102335, 333579, 1092474, 885308, 842223, 1092470, 322345, 999637, 865476, 135464, 1031456, 896931, 914707, 915762, 1044041, 1092450, 167371, 820161, 1028742, 1078491, 1092441, 766238, 894161, 885986, 859669, 1069556, 862640, 962731, 370316, 897981, 1050747, 1092422, 914406, 1092417, 860655, 1092416, 970152, 1005678, 339501, 882002, 1028652, 
1092394, 891498, 909048, 1092391, 255633, 1017348, 782426, 782079, 865971, 160808, 767248, 910150, 875986, 999261, 945535, 990197, 1092348, 1039361, 952658, 1050253, 78418, 991064, 914368, 1038685, 900450, 990481, 1025348, 914771, 814282, 850820, 905707, 1092327, 840532, 212303, 823596, 762761, 1058165, 999610, 1092311, 1040507, 877453, 998093, 979787, 872632, 975997, 789292, 1039346, 1079785, 886332, 116431, 303790, 984434, 1071722, 928753, 796383, 860942, 940386, 1064518, 1092263, 170581, 1092258, 267566, 1092257, 73106, 909547, 1000097, 952445, 854085, 995825, 951820, 882141, 1028555, 1092238, 297019, 69789, 1092237, 1092236, 1024599, 1065448, 1057270, 953274, 801059, 814699, 1025188, 1033580, 1080419, 128178, 760817, 1101466, 995443, 863187, 902919, 877845, 1092203, 770167, 1049200, 879747, 912961, 808528, 991241, 935952, 1092176, 916901, 930293, 83458, 1092168, 1073721, 878840, 1092165, 776609, 1092161, 855725, 798945, 1058425, 1092143, 1031999, 1062457, 761388, 846806, 72809, 922389, 948351, 1056950, 1039495, 1003334, 990010, 908316, 889289, 47270, 777519, 1070728, 1092120, 917536, 1057446, 849596, 997533, 805900, 822649, 840445, 996653, 1071270, 931940, 828596, 1092095, 889046, 1092093, 937578, 1014911, 975495, 813953, 828779, 908237, 156723, 887392, 1101448, 926980, 947974, 868487, 911605, 900696, 1007875, 900062, 831784, 259763, 801478, 54307, 783781, 1040461, 1092042, 760908, 838453, 977952, 1052948, 849245, 820899, 1019830, 1101443, 887398, 1042399, 1092029, 788431, 992340, 839488, 1092023, 26079, 845790, 73119, 187818, 224548, 1102330, 1092007, 786937, 907127, 930621, 776517, 847726, 17586, 1021554, 792847, 776700, 907538, 806688, 798883, 786375, 1091983, 1001108, 899212, 1091973, 143849, 942354, 842272, 935707, 1001381, 813899, 874876, 257018, 1043587, 989647, 157580, 155700, 1091941, 848432, 893789, 890890, 119089, 775355, 114573, 880766, 924047, 860573, 102627, 825147, 989099, 1011003, 1075713, 782381, 824542, 960566, 990784, 1076030, 860078, 874299, 810660, 1057488, 1059646, 1091850, 1014115, 794625, 780993, 1059619, 854862, 1091833, 931772, 1066116, 200062, 922024, 974670, 885433, 825583, 962443, 870544, 769630, 800652, 900599, 803237, 846291, 815243, 244011, 1079141, 829425, 936182, 98847, 927989, 189174, 785176, 1065118, 841919, 906126, 775457, 1091765, 1006922, 991419, 1046750, 1048876, 351820, 947466, 287912, 963788, 830551, 766301, 986325, 1091749, 1006987, 771314, 842070, 1052421, 994941, 842596, 837740, 115833, 963564, 982348, 830531, 1062603, 1056482, 1005500, 773858, 1021241, 59084, 1091719, 822585, 765147, 802634, 987657, 1042676, 1091692, 868410, 1091690, 214771, 1091688, 810210, 849142, 1091681, 803861, 874691, 888777, 1048995, 372674, 842221, 1091667, 909273, 1091665, 913286, 761032, 1056850, 1091661, 1091659, 1091654, 898686, 1065032, 938066, 985653, 896383, 1091643, 928567, 837372, 1091633, 824920, 1091630, 824938, 1031118, 873914, 987720, 935364, 1037341, 1044755, 1004493, 800243, 1091595, 859229, 771170, 1091576, 770604, 1091569, 897240, 58234, 891719, 955117, 918324, 1024950, 905479, 196963, 864507, 933946, 964577, 852179, 128633, 842333, 989644, 1014697, 161117, 834848, 1091529, 7968, 1015766, 257335, 1029681, 145569, 1091522, 818842, 1091520, 831302, 1091513, 900164, 1010700, 900077, 245416, 1032011, 908489, 850555, 810270, 948532, 832508, 868184, 813605, 939020, 964152, 911056, 251172, 920885, 995029, 991598, 988149, 1051095, 1049368, 1074499, 1091471, 764139, 970242, 831871, 1091467, 770233, 1091461, 873886, 1047088, 1063709, 955093, 784549, 902855, 
1091450, 970824, 940916, 912899, 780297, 991383, 988294, 792977, 998482, 860071, 984930, 77391, 75335, 866428, 1091421, 853646, 1015556, 772129, 47588, 1021446, 932878, 859376, 980789, 1102325, 937947, 143025, 872869, 1080968, 968004, 1091384, 765583, 1042488, 806574, 886382, 921348, 836832, 808235, 882982, 1073980, 853471, 180693, 1091360, 1038871, 960003, 1049221, 1033759, 903097, 1010607, 1091340, 785721, 1091337, 957688, 1040694, 194724, 1009959, 1091330, 788035, 46081, 1058100, 301061, 973917, 1038849, 969264, 953355, 860542, 175625, 802794, 194870, 888911, 1057168, 1101374, 995526, 1016869, 1047592, 298444, 1051990, 985905, 1091269, 830649, 850450, 870875, 357519, 1091264, 1023838, 855031, 958993, 1091255, 795951, 939744, 954711, 200600, 851490, 1063659, 932495, 994582, 1091246, 236708, 1091234, 238886, 804687, 149670, 897401, 812190, 842108, 1045203, 868919, 1041714, 1060305, 968310, 1009668, 1036800, 860462, 1091206, 164946, 769085, 904727, 844128, 1059442, 1091194, 155119, 864905, 1091189, 97964, 798469, 761963, 326410, 1016703, 1059801, 1091177, 970830, 786674, 16860, 1091173, 1091163, 888796, 909506, 1091158, 991938, 905766, 794319, 795540, 829025, 952520, 909176, 872655, 783843, 1003997, 1050670, 1016790, 1080495, 956624, 1017892, 815320, 1013367, 1091115, 1091112, 811852, 1091108, 913137, 1057757, 831380, 929473, 921812, 1079434, 1021679, 822642, 248385, 946747, 1091082, 51054, 909886, 996042, 869891, 879150, 1091068, 883861, 925059, 1058822, 790178, 781877, 1091059, 1081321, 1049867, 993876, 91790, 131925, 855029, 907173, 1058717, 828093, 1091048, 898631, 829050, 59190, 760930, 865660, 979713, 36388, 1068408, 1036999, 984075, 1022762, 918424, 969974, 843140, 768133, 941749, 879155, 798253, 980726, 832188, 1091015, 999897, 878367, 1032074, 783822, 845719, 764691, 904389, 926064, 910375, 70787, 1090987, 792187, 863817, 1070867, 980168, 909221, 203274, 885184, 763084, 994311, 25036, 1101341, 1090965, 84520, 952047, 854785, 863738, 808200, 818612, 935362, 980811, 919712, 1027785, 999641, 846438, 772864, 948829, 989543, 1090924, 888559, 804996, 995654, 993544, 969066, 220495, 1090910, 992729, 787255, 1015055, 938773, 286915, 942915, 1067990, 1090887, 1090886, 917022, 826518, 847415, 874455, 193422, 804523, 1090877, 780850, 958142, 953332, 988960, 885505, 969023, 1053997, 1068924, 1090869, 946825, 993419, 788484, 1090861, 972064, 829087, 1080537, 786477, 1065985, 773924, 1090842, 1090841, 1077000, 1090838, 1090833, 1001981, 865384, 810394, 355458, 311540, 867947, 93311, 858391, 1090808, 917015, 1025895, 885301, 791140, 1071061, 1090796, 822218, 820027, 1090791, 833507, 995789, 1090789, 153739, 156052, 1049767, 1090758, 789997, 792789, 857943, 49943, 1027865, 905604, 931726, 278658, 1090742, 784961, 993501, 818421, 851813, 1090730, 135079, 775297, 1090727, 995212, 888934, 815891, 961048, 818798, 97652, 1078752, 878817, 833268, 1090701, 1090700, 867490, 898318, 323592, 938359, 843409, 875417, 1064155, 996317, 816483, 348994, 823421, 798967, 149767, 240489, 28442, 205741, 240584, 62411, 264284, 138492, 186727, 38608, 144028, 275534, 54235, 135633, 236582, 348594, 58409, 1090624, 358455, 42555, 1090613, 26485, 55848, 113826, 126821, 60339, 375891, 1090596, 184916, 247717, 50833, 168175, 340006, 326190, 99461, 84473, 166625, 38098, 128158, 197542, 108287, 153588, 156688, 1090558, 227992, 75801, 1090550, 326637, 194430, 1090542, 1090540, 1090537, 1090530, 59426, 1101300, 165335, 176065, 166784, 53109, 49234, 1090513, 174344, 370979, 1101298, 166111, 72435, 260172, 59030, 243139, 79763, 
156251, 291396, 186265, 169305, 307504, 355484, 137508, 186063, 1101296, 289586, 277737, 249176, 344955, 1090458, 81649, 203039, 56188, 35996, 36703, 44686, 25344, 121017, 72398, 129491, 30039, 148016, 1090413, 57411, 54819, 57258, 199837, 300306, 1090400, 1090399, 149161, 215603, 165135, 59654, 296441, 160312, 149221, 1090377, 100661, 196250, 206762, 283141, 111995, 55682, 1090358, 1090352, 1090350, 81945, 188714, 291248, 205251, 305650, 100250, 1090329, 82842, 288139, 180887, 88831, 171906, 1090311, 290091, 1101282, 326719, 147073, 243761, 162662, 247194, 195199], + 'msmarco-doc-test': [355339, 1035339, 943613, 1051868, 876108, 770613, 84901, 928755, 895787, 920435, 1009016, 923221, 1126106, 988661, 975821, 154441, 802079, 184355, 938754, 1008125, 794144, 51577, 1126090, 1073795, 767703, 937603, 876154, 853437, 275951, 773040, 813841, 1135894, 117487, 927093, 1126069, 804764, 832389, 1126064, 808540, 963267, 909560, 334918, 1010426, 955087, 886048, 767404, 898402, 849376, 970577, 7869, 1126035, 889718, 82412, 809339, 68610, 891083, 346202, 835783, 1024923, 973416, 67937, 1019366, 857232, 989398, 787957, 889757, 907576, 1037969, 957990, 1037871, 1073093, 848496, 147746, 960571, 932412, 993677, 997024, 800980, 927899, 1053885, 1018475, 943566, 929863, 1079280, 943913, 839899, 128604, 1061763, 809683, 1036759, 798642, 57068, 977156, 1125956, 913435, 1125954, 60634, 276099, 913041, 993117, 789981, 941780, 1125930, 904849, 1125924, 1024250, 128322, 1125920, 1125919, 845751, 943412, 1056425, 978017, 796563, 53330, 1054994, 794701, 990049, 305251, 1074350, 773878, 1135875, 855288, 952866, 792115, 848431, 996851, 1056576, 1031850, 1035354, 1125864, 271481, 936261, 789305, 1416, 842609, 900690, 793245, 983196, 816456, 851318, 899008, 811001, 903976, 1125841, 902410, 931576, 959564, 955763, 1125829, 1013640, 65752, 828649, 992308, 105367, 1054386, 1125820, 58374, 763534, 957181, 823034, 858790, 28453, 984702, 8234, 871720, 778062, 1079501, 1008523, 188803, 761705, 1006025, 991361, 803911, 1125782, 982127, 812148, 34925, 965627, 1001465, 168906, 1125763, 1070541, 203578, 788201, 1032694, 903643, 822527, 810947, 323665, 909052, 912272, 902935, 1125745, 980205, 880930, 950222, 1011337, 803862, 1046316, 1056710, 816915, 996237, 968847, 1135859, 912165, 1125706, 1074076, 944345, 301180, 966925, 784039, 1125694, 971998, 76945, 1125690, 1125688, 779295, 905638, 875806, 1125684, 54659, 1125680, 1008285, 1135856, 898753, 20530, 1003074, 875793, 341529, 841979, 1056910, 857280, 792006, 23822, 842753, 1043433, 128291, 1125651, 1026391, 869721, 1074859, 997654, 1125645, 972896, 839707, 318781, 1050274, 1019841, 1125633, 855922, 886682, 1033989, 1125628, 275173, 765070, 1125626, 230103, 796808, 280245, 851807, 790280, 1077110, 1019506, 1048410, 1029806, 1125599, 124943, 824765, 906203, 1125596, 198905, 166606, 1125592, 1125591, 1125590, 1010376, 1125587, 965313, 1125583, 803296, 1000865, 1135848, 1019720, 928128, 991278, 991583, 128365, 788278, 863720, 769008, 1018290, 1125559, 760825, 122724, 261830, 1125556, 1125555, 842923, 775366, 900867, 989196, 1030770, 767991, 882803, 1005440, 841671, 943638, 1060768, 864153, 806779, 819755, 799323, 1003747, 253837, 893530, 859732, 921193, 1066463, 1125519, 902014, 939096, 946406, 1125510, 24480, 798003, 1135841, 1125495, 368106, 853522, 204701, 928453, 924975, 854766, 814340, 886243, 1059906, 1125481, 1078827, 989869, 1073638, 837433, 962187, 1036844, 983787, 861435, 18793, 311494, 1125459, 1125455, 1064450, 947416, 809209, 972396, 306216, 1125443, 931357, 790199, 
132469, 1044869, 836655, 881324, 885159, 949501, 1058138, 340540, 1076490, 909115, 919673, 1125409, 1125406, 1023476, 971528, 1076183, 840845, 994762, 1125395, 980406, 1125394, 204957, 354123, 775487, 762865, 808343, 788151, 964554, 1067743, 836007, 840770, 825151, 871767, 222861, 299781, 153374, 1125352, 219844, 208394, 261661, 355519, 146170, 1125347, 180979, 220352, 153239, 323874, 175228, 61240, 49429, 121109, 165037, 282280, 336011, 121113, 171824, 116659, 9904, 9975, 282050, 233856, 78181, 1135818, 199508, 57774, 224261, 206738, 228769, 1125306, 21227, 141078, 1125292, 190377, 184333, 297682, 130610, 1125273, 1125272, 68896, 96597, 202664, 128757, 307758, 278239, 1125260, 334263, 303070, 1125251, 145104, 208344, 236824, 1125245, 77398, 84713, 24841, 110736, 1125238, 148515, 319235, 50800, 1125227, 300375, 310642, 281106, 307403, 190070, 161474, 136578, 189365, 20440, 14448, 249267, 28862, 1125194, 60902, 204851, 17077, 336236, 161434, 112638, 282352, 21075, 341207, 156479, 1125153, 329704, 261295, 114048, 213365, 374724, 340145, 122049, 92622, 314064, 247025, 168854, 11304, 233178, 76591, 157744, 31548, 1125111, 150926, 92713, 276665, 150029, 178677, 194563, 237689, 378218, 1125086, 324242, 235280, 105183, 1125079, 1135796, 24280, 1125075, 79457, 88200, 174034, 130306, 139285, 111573, 102366, 198015, 158054, 205433, 91055, 343439, 275968, 254923, 23367, 12166, 1125041, 115254, 28653, 300384, 184640, 219898, 50782, 84257, 1136966, 281922, 319757, 313747, 81842, 37122, 23986, 1125015, 231134, 1125013, 179395, 158569, 176276, 275413, 120398, 55454, 253834, 118372, 1124998, 323294, 305205, 85798, 143889, 230878, 1124990, 1124989, 121025, 49984, 118151, 1124982, 276525, 180091, 307344, 186484, 82578, 246327, 53422, 96443, 255889, 1124958, 87592, 1124957, 144952, 1124953, 144498, 1135780, 52199, 285049, 198444, 302038, 122795, 122298, 235309, 360650, 60301, 59722, 25398, 202245, 1124926, 130951, 1124915, 1124882, 1124872, 1124863, 1124803, 1124767, 1124753, 1124703, 1124699, 1124663, 1124621, 1124573, 1124569, 1124549, 1124542, 1124534, 1136837, 1135738, 1124531, 1124522, 1124504, 1124501, 1124480, 1135727, 1124472, 1124469, 1124462, 1124451, 1135722, 1124391, 1124388, 1124373, 1124369, 1124335, 1124324, 1124306, 1124300, 1124276, 1124251, 1124226, 1124221, 1124198, 1124194, 1124171, 1124170, 1124160, 1124159, 1124122, 1124114, 1124093, 1124090, 1124087, 1124067, 1124059, 1123997, 1136830, 1123971, 1123969, 1123968, 1123953, 1123930, 1123917, 1123915, 1123888, 1123840, 1123837, 1123822, 1123776, 1123765, 1123761, 1123721, 1123709, 1123636, 1123626, 1135625, 1123603, 1123584, 1123544, 1123492, 1123488, 1123469, 1123465, 1135606, 1135605, 1123435, 1123397, 1123383, 1123337, 1123298, 1123211, 1123209, 1123191, 1123168, 1123112, 1123103, 1123090, 1123074, 1123057, 1123055, 1123052, 1135570, 1123034, 1123028, 1135568, 1135563, 1122957, 1122936, 1122915, 1122908, 1122892, 1122859, 1135553, 1122853, 1136818, 1122792, 1122785, 1122776, 1122772, 1122760, 1122745, 1122706, 1122695, 1122690, 1122686, 1122662, 1122658, 1122652, 1122648, 1122643, 1135533, 1122610, 1122606, 1122601, 1122594, 1122593, 1122591, 1122586, 1122584, 1122569, 1135525, 1122504, 1135522, 1122501, 1122488, 1122476, 1122471, 1122446, 1122442, 1122409, 1122352, 1122348, 1122343, 1122342, 1122341, 1122336, 1122334, 1135498, 1122316, 1122306, 1122305, 1122283, 1122271, 1122267, 1122255, 1122247, 1122242, 1122237, 1122234, 1122233, 1122222, 1122220, 1122212, 1122168, 1122155, 1122087, 1122082, 1122064, 1136811, 1121993, 1121967, 
1121963, 1121941, 1121931, 1121922, 1135448, 1121892, 1121875, 1121861, 1121860, 1135438, 1121830, 1121817, 1121814, 1121799, 1121794, 1121759, 1121748, 1121673, 1121667, 1121642, 1121641, 1121631, 1121618, 1121576, 1121566, 1121532, 1121523, 1121474, 1121466, 1121459, 1135397, 1135395, 1121426, 1121424, 1121412, 1121380, 1121374, 1121369, 1121333, 1121327, 1121309, 1121268, 1121260, 1121251, 1121249, 1121191, 1121167, 1121162, 1121156, 1121118, 1135362, 1121083, 1121082, 1121068, 1121050, 1121044, 1121022, 1121000, 1120994, 1120986, 1120982, 1120963, 1120945, 1120926, 1120919, 1120904, 1120891, 1120887, 1120867, 1120842, 1120835, 1120834, 1120776, 1120775, 1120773, 1120744, 1120726, 1120706, 1120689, 1120685, 1120678, 1120676, 1120672, 1120668, 1135301, 1120633, 1120621, 1120619, 1120606, 1120599, 1120576, 1120574, 1120564, 1120563, 1120559, 1120541, 1120537, 1120519, 1120515, 1120466, 1120462, 1120453, 1135280, 1120399, 1120395, 1135274, 1120391, 1120375, 1120361, 1120348, 1120328, 1120316, 1120268, 1120261, 1135262, 1120253, 1120248, 1120236, 1120189, 1120187, 1120180, 1120167, 1120089, 1120084, 1120049, 1120041, 1120019, 1120006, 1135238, 1135234, 1119953, 1119943, 1119904, 1119884, 1119872, 1119862, 1119828, 1119764, 1119744, 1119740, 1119695, 1119627, 1119620, 1119603, 1119597, 1119593, 1119531, 1119529, 1119514, 1119501, 1135190, 1119444, 1119384, 1119374, 1119355, 1119347, 1119338, 1119316, 1119307, 1119305, 1119280, 1119271, 1119259, 1119230, 1135165, 1119189, 1119179, 1119169, 1119168, 1119167, 1119132, 1119128, 1119112, 1119110, 1119097, 1119076, 1135150, 1119040, 1119038, 1119021, 1119015, 1119013, 1119006, 1135142, 1118976, 1118974, 1118954, 1118953, 1118941, 1118927, 1118926, 1118921, 1118889, 1118884, 1118879, 1118871, 1118869, 1118868, 1118820, 1118806, 1118799, 1118797, 1118793, 1118792, 1135121, 1118768, 1118759, 1118734, 1118677, 1118676, 1118671, 1118659, 1118651, 1118641, 1118627, 1135106, 1118595, 1118585, 1136771, 1135094, 1118456, 1118455, 1118435, 1118434, 1118429, 1118416, 1118388, 1135081, 1118310, 1118294, 1118293, 1118286, 1118259, 1118230, 1118229, 1118227, 1118226, 1118209, 1118199, 1135052, 1118176, 1118172, 1118169, 1118145, 1118140, 1135042, 1135039, 1118042, 1118014, 1118012, 1135028, 1117935, 1117925, 1117901, 1117875, 1117872, 1117858, 1117826, 1117798, 1117787, 1117771, 1117767, 1117765, 1136763, 1134998, 1117740, 1117725, 1117709, 1117708, 1117700, 1117691, 1117689, 1117672, 1134987, 1117650, 1117623, 1117616, 1117589, 1117584, 1117581, 1117579, 1134978, 1117566, 1117542, 1117505, 1117495, 1134967, 1117451, 1117450, 1117446, 1117405, 1117402, 1117398, 1117394, 1117375, 1117361, 1117357, 1117350, 1117343, 1117337, 1117331, 1117313, 1117307, 1117299, 1117295, 1117294, 1134949, 1117271, 1117263, 1117261, 1134945, 1117235, 1117206, 1117183, 1117182, 1117178, 1117154, 1117150, 1117148, 1134931, 1117066, 1117062, 1117055, 1117033, 1134926, 1116996, 1136756, 1116903, 1116896, 1116877, 1116871, 1116867, 1116864, 1116862, 1116846, 1116845, 1116821, 1116816, 1116776, 1116775, 1116763, 1116728, 1116706, 1116702, 1116695, 1116694, 1116663, 1116657, 1116643, 1116633, 1116612, 1116606, 1116592, 1116554, 1116553, 1116537, 1116531, 1116467, 1116452, 1116433, 1116429, 1116419, 1116402, 1134871, 1116369, 1116368, 1116353, 1116324, 1116304, 1134862, 1116301, 1116273, 1116265, 1116264, 1116260, 1116242, 1116234, 1116228, 1116221, 1134853, 1116211, 1116201, 1134850, 1116180, 1116169, 1116168, 1116164, 1116162, 1116161, 1134846, 1116139, 1116134, 1116121, 1116112, 1116103, 
1116096, 1116092, 1116090, 1134839, 1134838, 1116037, 1116028, 1116025, 1116021, 1116019, 1116016, 1116015, 1134835, 1115983, 1115970, 1115961, 1115949, 1115933, 1115929, 1115881, 1115870, 1115819, 1115784, 1115783, 1115760, 1115748, 1115716, 1134807, 1134806, 1115693, 1115677, 1115660, 1115656, 1115651, 1115650, 1115649, 1115617, 1115599, 1115595, 1115586, 1115585, 1115584, 1115539, 1115526, 1115511, 1115485, 1134784, 1115462, 1115432, 1115425, 1115423, 1115388, 1115372, 1115339, 1115334, 1115332, 1115325, 1134769, 1115281, 1115255, 1115248, 1115206, 1115197, 1115191, 1115172, 1134752, 1115154, 1115118, 1115109, 1115106, 1115100, 1115097, 1115086, 1115072, 1115030, 1115021, 1115004, 1114979, 1114974, 1114962, 1114947, 1114905, 1114901, 1114882, 1114870, 1134723, 1114838, 1114828, 1114805, 1114782, 1114757, 1114753, 1114743, 1114739, 1114700, 1114690, 1114669, 1114660, 1114655, 1114654, 1114650, 1114634, 1114589, 1114588, 1114585, 1114584, 1114542, 1114524, 1114510, 1114502, 1114498, 1114495, 1114488, 1114476, 1114471, 1114460, 1134676, 1114428, 1114423, 1114420, 1114402, 1114383, 1114358, 1134666, 1114290, 1114275, 1134656, 1114236, 1114206, 1114200, 1114198, 1114188, 1114185, 1114164, 1114149, 1114131, 1114125, 1114108, 1114099, 1114093, 1114092, 1134639, 1114078, 1114066, 1136728, 1114055, 1114047, 1114044, 1113970, 1113959, 1113944, 1113877, 1113870, 1113861, 1113847, 1113840, 1136726, 1134614, 1113808, 1113802, 1113796, 1113792, 1113767, 1113756, 1113751, 1113724, 1113709, 1113699, 1113683, 1113654, 1113622, 1113608, 1136724, 1113597, 1113545, 1113528, 1113526, 1113520, 1134583, 1113506, 1113498, 1113496, 1113461, 1113439, 1113429, 1113425, 1113416, 1113398, 1134572, 1113381, 1113380, 1113353, 1113352, 1113347, 1113318, 1113307, 1113304, 1113269, 1113258, 1134560, 1134558, 1113231, 1134557, 1113201, 1113175, 1113170, 1113163, 1113158, 1113148, 1113147, 1134552, 1113125, 1113092, 1113090, 1134539, 1113073, 1113041, 1112954, 1112947, 1112944, 1112939, 1112928, 1112908, 1112897, 1112847, 1112838, 1112827, 1112819, 1134500, 1134499, 1112770, 1112709, 1112705, 1112663, 1112658, 1112656, 1112614, 1112606, 1112596, 1112568, 1112514, 1112506, 1112487, 1112486, 1112442, 1112396, 1112390, 1112384, 1112382, 1112375, 1112366, 1112327, 1112324, 1112313, 1134449, 1112302, 1112297, 1112291, 1134444, 1112250, 1112240, 1112234, 1112210, 1134436, 1112203, 1112154, 1112152, 1112141, 1112107, 1112105, 1112100, 1112089, 1112061, 1134422, 1134420, 1112044, 1112018, 1112014, 1111987, 1111969, 1111957, 1111908, 1111902, 1111898, 1111892, 1111890, 1134405, 1111874, 1111813, 1111802, 1111791, 1111790, 1134394, 1111760, 1111710, 1111705, 1111702, 1111678, 1111668, 1111662, 1111650, 1111605, 1111581, 1111580, 1111576, 1111564, 1111504, 1111502, 1111472, 1111470, 1111460, 1111439, 1111417, 1111400, 1111396, 1111392, 1111377, 1111345, 1111338, 1111316, 1111313, 1111306, 1134343, 1111275, 1111241, 1111214, 1111188, 1111156, 1111132, 1111119, 1111099, 1111071, 1111049, 1111030, 1111024, 1111023, 1110997, 1134309, 1110964, 1134306, 1110929, 1110927, 1110905, 1110903, 1110874, 1110868, 1110849, 1110836, 1110794, 1110776, 1110730, 1134281, 1110698, 1134277, 1110643, 1110605, 1134272, 1110576, 1134266, 1110531, 1110512, 1134263, 1110498, 1110470, 1110468, 1110426, 1110423, 1110410, 1110401, 1134251, 1110392, 1110391, 1110357, 1110353, 1110344, 1110337, 1110326, 1110322, 1110321, 1110314, 1110295, 1110284, 1110275, 1110264, 1110246, 1110234, 1110217, 1110215, 1110213, 1110196, 1110190, 1134221, 1110189, 1110163, 1110129, 
1134212, 1110081, 1134203, 1110001, 1109974, 1109969, 1109917, 1134188, 1109872, 1109853, 1134184, 1109822, 1109805, 1109794, 1109788, 1109784, 1109768, 1109722, 1109701, 1109694, 1109658, 1109657, 1109628, 1109615, 1109599, 1109579, 1109571, 1134157, 1109546, 1109542, 1109540, 1109537, 1109525, 1109496, 1109487, 1109477, 1109474, 1109473, 1109471, 1109464, 1109463, 1109462, 1134140, 1109436, 1109413, 1109408, 1109407, 1109397, 1109396, 1134135, 1109381, 1109379, 1109365, 1109319, 1109311, 1109288, 1109238, 1109215, 1109201, 1109190, 1109171, 1109110, 1134109, 1109050, 1109048, 1109040, 1109022, 1109002, 1108993, 1108985, 1108975, 1108961, 1108959, 1108953, 1108935, 1108922, 1108914, 1108911, 1108875, 1108874, 1108867, 1108847, 1108821, 1108811, 1108809, 1108799, 1108789, 1136676, 1108763, 1108735, 1108658, 1108645, 1108637, 1108636, 1108632, 1108629, 1108607, 1134057, 1108564, 1108526, 1108523, 1108516, 1108510, 1134049, 1108492, 1108487, 1108481, 1108478, 1108472, 1108462, 1108406, 1108400, 1108332, 1134030, 1108268, 1134028, 1108241, 1108227, 1134024, 1108216, 1108203, 1108199, 1108147, 1108131, 1108121, 1108099, 1108075, 1108071, 1134001, 1108011, 1108009, 1107991, 1107982, 1107970, 1107953, 1107919, 1107915, 1107898, 1107885, 1133988, 1107845, 1133986, 1107834, 1133983, 1107749, 1107748, 1107745, 1107702, 1107677, 1107646, 1107618, 1107602, 1107593, 1107568, 1107563, 1107450, 1107401, 1107399, 1107381, 1107364, 1107344, 1107336, 1107308, 1107299, 1133931, 1107245, 1107210, 1107207, 1107193, 1107192, 1107189, 1107171, 1107141, 1107132, 1107123, 1107117, 1107112, 1107108, 1107092, 1107091, 1107085, 1107057, 1133907, 1107033, 1107015, 1133902, 1106978, 1106920, 1106912, 1106873, 1106858, 1106850, 1106840, 1106834, 1106797, 1106764, 1106756, 1106686, 1106680, 1106676, 1106672, 1106658, 1106652, 1106642, 1106607, 1106589, 1133854, 1106543, 1106539, 1106537, 1106535, 1106533, 1106521, 1106516, 1106508, 1106502, 1106450, 1106421, 1106414, 1106408, 1106389, 1106381, 1106377, 1106348, 1106343, 1106335, 1106306, 1106291, 1106290, 1133827, 1106234, 1106230, 1106216, 1106200, 1106196, 1106159, 1106126, 1106125, 1106099, 1106089, 1133812, 1106079, 1133809, 1106027, 1106011, 1133799, 1133798, 1105989, 1105982, 1105978, 1133792, 1105897, 1105882, 1105853, 1105850, 1105831, 1105817, 1105816, 1133780, 1105805, 1105798, 1105797, 1105766, 1105761, 1105753, 1105700, 1105689, 1105666, 1105625, 1105617, 1105614, 1105594, 1105593, 1105582, 1105571, 1105565, 1133757, 1105526, 1105506, 1105498, 1105489, 1105485, 1105441, 1105432, 1105431, 1105427, 1105422, 1133744, 1105381, 1105364, 1105358, 1105337, 1105298, 1105287, 1105276, 1105275, 1105253, 1105248, 1105239, 1105202, 1105190, 1105169, 1105163, 1133721, 1105148, 1105146, 1105144, 1105142, 1105110, 1105108, 1105100, 1105086, 1105073, 1105046, 1105042, 1133710, 1105021, 1105017, 1105013, 1104984, 1136634, 1104957, 1104950, 1104949, 1104942, 1104915, 1104781, 1104773, 1104763, 1104725, 1104720, 1104712, 1104704, 1104699, 1104698, 1104685, 1104640, 1104633, 1104630, 1104557, 1104513, 1104509, 1104506, 1104497, 1104468, 1104458, 1104454, 1133658, 1104406, 1104403, 1104339, 1133644, 1104279, 1104252, 1104250, 1104235, 1104223, 1104221, 1104198, 1104175, 1104124, 1104118, 1104105, 1104099, 1104087, 1133620, 1104071, 1104064, 1104036, 1104022, 1104020, 1104005, 1133611, 1103987, 1103974, 1103969, 1103966, 1103921, 1103911, 1103910, 1103888, 1103879, 1103828, 1103826, 1103816, 1103798, 1103793, 1103787, 1103776, 1103766, 1103759, 1103690, 1103687, 1103684, 1103679, 
1103651, 1103601, 1103579, 1103561, 1103555, 1103553, 1103537, 1103535, 1133558, 1103511, 1133557, 1103468, 1103467, 1103446, 1103416, 1103387, 1103355, 1103322, 1103314, 1103303, 1103290, 1103289, 1133535, 1103260, 1103257, 1133533, 1103250, 1103182, 1103136, 1103121, 1103093, 1103091, 1103089, 1103084, 1103076, 1103019, 1103009, 1102998, 1102989, 1102979, 1102895, 1102892, 1102869, 1102862, 1102854, 1102849, 1102839, 1102827, 1102811, 1102803, 1102768, 1133474, 1102714, 1102704, 1102693, 1102667, 1102617, 1102590, 1102589, 1102579, 1102578, 1102498, 1102488, 1102477, 1102474, 1102456, 1133442, 138157, 2610, 1133431, 149979, 377304, 1133428, 216736, 359286, 62525, 2663, 272951, 306421, 42476, 139000, 201194, 204951, 209797, 1937, 1133376, 285032, 1133374, 236763, 121746, 246076, 176953, 147537, 1133366, 360488, 175123, 160276, 159922, 131617, 318841, 80372, 63246, 146783, 349622, 288566, 1133349, 326787, 65809, 356260, 100154, 132133, 74274, 228670, 134903, 260302, 108391, 103402, 103595, 49439, 242042, 339286, 233529, 40228, 19684, 59230, 262686, 90139, 213353, 32202, 56678, 144051, 1133288, 48170, 137662, 49381, 346023, 180370, 132575, 49810, 24093, 1133258, 131405, 102330, 332859, 1133254, 266752, 252103, 1133252, 92176, 54747, 369873, 78332, 76140, 170498, 236254, 160010, 285797, 1133231, 370985, 201444, 239147, 95448, 48846, 122011, 274306, 209497, 173661, 1133202, 1136591, 132938, 1133194, 295928, 1133190, 212146, 318302, 107701, 1133187, 178610, 21792, 23963, 271748, 291553, 137712, 127150, 152627, 1133171, 187371, 130467, 148898, 103328, 347583, 120219, 345861, 126866, 237441, 162657, 323393, 62845, 54246, 333489, 150595, 180956, 1133122, 119400, 183919, 1136584, 100777, 198536, 142148, 1133113, 134628, 117174, 137468, 125791, 38087, 334251, 1133092, 272071, 57674, 28684, 78497, 129183, 31825, 1425, 284067, 243874, 161766, 278827, 266488, 89633, 94173, 194893, 257783, 354466, 187317, 139175, 67222, 1133057, 236776, 195958, 165977, 347294, 262636, 128874, 1133036, 308617, 331343, 30163, 43707, 30649, 27310, 2045, 330504, 83320, 271835, 315683, 178575, 135634, 200918, 1132996, 34366, 1132991, 33137, 20892, 135821, 131850, 1132977, 207703, 340377, 185119, 83959, 378632, 328464, 32278, 1132965, 1132959, 271881, 1132952, 287159, 275737, 274797, 1132945, 309926, 268574, 289499, 1132925, 248362, 1132921, 100932, 204904, 56894, 91565, 280019, 1132913, 106320, 1132903, 182350, 355540, 132495, 323018, 62577, 1132890, 37185, 1136559, 65052, 205954, 225986, 161346, 159992, 1132847, 239971, 113269, 43476, 134905, 326921, 158752, 68626, 334222, 75286, 1132834, 96857, 225703, 134469, 232008, 140770, 50797, 59725, 172940, 130168, 1132815, 183696, 121488, 245833, 87730, 216731, 224688, 1132796, 68618, 38122, 64384, 1132790, 190054, 191625, 202250, 310488, 222133, 247819, 1136550, 247506, 54958, 229260, 200144, 163053, 95594, 175258, 80718, 219809, 320340, 1132754, 85348, 293041, 72577, 334716, 41184, 186086, 50626, 29725, 1132735, 239245, 1132734, 75799, 124895, 1132717, 261101, 91157, 44340, 67422, 92143, 31192, 276343, 49482, 124291, 227104, 57710, 143012, 74057, 94039, 272269, 299381, 143955, 268235, 261207, 124798, 260080, 242796, 117115, 299709, 165393, 1214, 157565, 1132651, 245620, 55727, 313766, 25026, 57218, 344937, 67147, 313491, 114725, 50189, 70108, 303874, 160801, 1132592, 24041, 88882, 289943, 1132583, 22364, 112541, 226572, 200782, 1132564, 63290, 156302, 63795, 184833, 156548, 268010, 227516, 90708, 1132549, 208493, 82973, 147064, 56808, 1132529, 357336, 160313, 278403, 176124, 
188784, 37547, 30359, 347113, 293516, 233185, 64430, 127812, 364094, 12761, 72904, 271435, 277093, 166683, 167974, 160574, 301352, 207754, 181479, 117965, 169257, 1132444, 259070, 135347, 153663, 296378, 60870, 184223, 23531, 27528, 98675, 125929, 1132409, 326797, 225419, 359463, 112864, 259437, 340815, 71908, 88808, 1132399, 43167, 42055, 187763, 115594, 95449, 40124, 196233, 202384, 81184, 213758, 138933, 1132360, 343640, 293069, 289801, 273773, 176499, 333375, 1132352, 1132347, 309040, 297010, 295730, 320086, 166508, 1132312, 148209, 13101, 25465, 129457, 109587, 277868, 199407, 36299, 13912, 294614, 160309, 65904, 59911, 200228, 267705, 191894, 76102, 174722, 71138, 321703, 11863, 64535, 123648, 1132255, 188166, 92670, 22836, 137674, 62078, 83401, 373795, 76154, 166325, 274555, 36951, 164940, 266150, 153592, 200695, 239250, 207572, 181305, 303706, 97834, 195677, 201381, 153123, 159867, 189529, 174157, 203720, 320700, 39325, 1136491, 134127, 256066, 39660, 335114, 209769, 1132162, 312826, 455425, 538026, 67225, 162696, 1132147, 689120, 508870, 394040, 404889, 490071, 495618, 549190, 49387, 673689, 735360, 208702, 487934, 131247, 563898, 513591, 502104, 752441, 150443, 485594, 586761, 57, 4776, 214625, 1136482, 493900, 609252, 1132087, 732756, 654897, 179955, 461950, 660426, 685004, 734466, 368900, 156776, 628564, 344155, 473182, 386934, 692151, 706780, 482382, 666238, 720868, 146574, 162946, 728829, 466657, 580605, 129695, 1132047, 398127, 473495, 675920, 506768, 415155, 736117, 525069, 466400, 671117, 653909, 643328, 383847, 517763, 713301, 220290, 181644, 389541, 1132006, 699218, 537744, 745784, 369105, 723486, 1131983, 642800, 452336, 756790, 734758, 334219, 255251, 702598, 450543, 613422, 626761, 89634, 596136, 167533, 727181, 429906, 608124, 713357, 573701, 655939, 684502, 538143, 662108, 265494, 120593, 722220, 445502, 597686, 544220, 537188, 718364, 653077, 624210, 710887, 691798, 169584, 705174, 1131909, 209531, 452924, 451826, 647597, 463021, 709056, 490982, 643870, 144842, 466338, 1131892, 91778, 203783, 756681, 433786, 756829, 366342, 728735, 1131884, 538878, 508254, 575461, 504751, 229045, 628808, 108500, 703554, 599137, 671829, 720949, 696201, 554585, 389908, 745278, 1131840, 464930, 674566, 700756, 486173, 1131830, 21838, 353333, 1131821, 731129, 1131818, 680613, 1131813, 730149, 556637, 430755, 727943, 20616, 599673, 455256, 548054, 539566, 501894, 585888, 443964, 644356, 611442, 521254, 713278, 1136443, 547374, 741173, 1131777, 606672, 483253, 588308, 725726, 244472, 238804, 490000, 478255, 708144, 236269, 650476, 1131754, 465990, 647887, 435864, 714849, 660983, 415438, 543638, 1131738, 1131735, 534684, 733956, 643361, 463180, 671027, 1131724, 618695, 396098, 190164, 84797, 477552, 1131713, 399730, 1131703, 1131699, 642647, 192397, 370635, 466536, 549327, 658273, 522054, 707645, 386213, 463137, 562352, 440144, 169166, 595808, 508476, 450426, 588762, 680190, 636417, 618349, 629913, 422890, 277177, 642252, 447697, 515005, 503164, 259924, 584017, 558978, 694851, 672262, 78640, 537267, 581447, 1131613, 636093, 417895, 1131609, 180593, 739636, 625256, 731251, 188053, 449539, 475394, 738829, 419729, 1131593, 121843, 4383, 582098, 489238, 722550, 704182, 431159, 473020, 615000, 65692, 535699, 533613, 589875, 635379, 1131559, 1136425, 1131557, 464864, 746963, 563962, 1131554, 409031, 743777, 582128, 496927, 617223, 1131543, 677292, 540318, 243941, 747597, 578356, 1131533, 576964, 734678, 707689, 501442, 561499, 692815, 100940, 1131512, 473028, 1131510, 497632, 1131507, 383831, 
566937, 290290, 49850, 637080, 614540, 689461, 747285, 614620, 694270, 633073, 391077, 603796, 676426, 1131478, 415661, 402595, 515273, 1131467, 737266, 526352, 460855, 485891, 726929, 96565, 582339, 1131446, 586754, 505992, 251774, 588712, 677133, 455371, 710756, 342150, 604332, 679482, 17199, 451643, 755461, 259312, 1131415, 1131411, 725828, 564310, 608244, 327855, 675569, 706373, 724657, 283924, 702722, 220398, 598348, 150347, 1131396, 413054, 516429, 730062, 710329, 479687, 1131383, 737913, 469566, 591326, 429876, 405298, 21744, 551851, 507424, 630264, 499666, 561538, 316436, 623112, 494786, 515775, 641164, 1131343, 574337, 614338, 562030, 455513, 562904, 753220, 562697, 738248, 230808, 1131320, 484350, 198610, 718133, 685661, 402991, 607582, 500775, 134875, 747566, 429205, 707853, 1131307, 1136401, 1131301, 521329, 392124, 393611, 1131295, 622262, 527745, 227967, 399503, 441204, 1131278, 580227, 592329, 693494, 618024, 664540, 697374, 1131260, 1136397, 43548, 632020, 750167, 410387, 187330, 685998, 743021, 613923, 446160, 528174, 1131240, 707577, 567878, 240102, 619013, 458674, 1131227, 1131222, 412136, 515123, 571215, 429182, 1131216, 674956, 1131209, 438344, 403520, 632935, 112928, 651821, 696173, 1131192, 674691, 633998, 1131182, 565366, 533428, 273443, 1131173, 464548, 681173, 451150, 737112, 389501, 579601, 387851, 593455, 534021, 452761, 643223, 456807, 566216, 83666, 1131155, 454824, 634650, 242583, 488073, 47269, 522358, 430258, 711802, 455957, 538373, 577234, 714881, 535009, 716082, 521367, 486431, 490752, 1131104, 442307, 709522, 694739, 703736, 1131092, 692238, 708094, 405684, 1131075, 736703, 632755, 144050, 463660, 534279, 493020, 692201, 723457, 527398, 557952, 1131049, 1131048, 699817, 241937, 1131046, 390313, 540655, 494111, 457426, 479475, 709726, 701898, 430704, 613827, 754589, 688208, 352420, 508510, 150505, 418883, 595641, 153809, 427372, 1131013, 589844, 1131008, 517135, 1131005, 1131004, 540951, 661717, 671630, 28661, 1130996, 406838, 629420, 1130994, 719438, 1130988, 671720, 702952, 644204, 563542, 437910, 716995, 673666, 604500, 575378, 573622, 484886, 676791, 515531, 708693, 370750, 644678, 1130937, 753089, 724623, 219723, 1130935, 590560, 1130933, 571087, 515785, 555353, 684994, 603617, 503963, 397564, 226012, 262542, 405974, 603714, 177610, 389739, 682567, 1130897, 674513, 567881, 531311, 544191, 455561, 426367, 380561, 245295, 663156, 431674, 493829, 129347, 686392, 718295, 74000, 514264, 678466, 503613, 127437, 739166, 583772, 722441, 188318, 719371, 626218, 258390, 717111, 1130849, 418725, 595910, 513838, 674571, 1130837, 533312, 29811, 466878, 459503, 623603, 1130830, 694106, 593611, 569902, 725715, 668648, 402832, 530079, 534250, 327062, 1136350, 204088, 457951, 416646, 511466, 576357, 72015, 1130806, 568405, 1130805, 742695, 538393, 412073, 637960, 598934, 561834, 747004, 200289, 562002, 524438, 682859, 360822, 394208, 1130772, 78501, 628136, 445026, 689657, 486716, 576292, 682902, 435130, 721273, 563938, 446290, 128543, 699279, 529769, 721661, 717849, 626517, 475408, 493845, 587923, 639288, 1130732, 674504, 599504, 1130728, 543290, 1130726, 444598, 439766, 621550, 417717, 552319, 706900, 468021, 412410, 455359, 161162, 414393, 1130684, 527633, 720395, 576305, 484467, 619408, 515064, 452385, 686422, 567899, 1130672, 715765, 663755, 709494, 55079, 727291, 669130, 1130667, 428479, 706985, 570789, 262878, 616705, 83712, 612831, 319218, 619655, 396122, 522212, 729058, 745317, 438891, 1130640, 1130635, 740366, 452155, 734746, 409853, 624662, 403035, 593489, 
420872, 436325, 557806, 1130608, 488887, 596699, 344368, 416672, 614567, 554792, 575621, 456383, 547426, 547711, 521791, 672626, 148977, 514360, 25802, 456029, 510645, 506278, 642473, 73619, 552459, 1130575, 433683, 391829, 642144, 509654, 408986, 755381, 1130558, 546404, 561461, 665022, 1130548, 493826, 406237, 1130536, 680480, 1130524, 662687, 731691, 748843, 1130511, 414899, 491017, 569409, 560815, 412865, 712140, 388319, 491200, 297146, 695196, 499188, 115952, 524469, 14244, 130858, 593986, 489013, 541229, 29667, 126582, 458638, 497813, 418389, 123525, 675245, 592333, 640857, 1130449, 464240, 502221, 409003, 700345, 695697, 661986, 1130435, 578560, 1130431, 382254, 586049, 554435, 461052, 410169, 502453, 554031, 159535, 695737, 426684, 473062, 646245, 402135, 573471, 557944, 691188, 523197, 481998, 94642, 135936, 397550, 511215, 234583, 643749, 490936, 499385, 1130383, 644600, 1130378, 625691, 675823, 232508, 512536, 642301, 574114, 717873, 235560, 485567, 739828, 416823, 567870, 521342, 271620, 1130345, 636141, 30677, 444485, 621989, 1130340, 558003, 1130335, 594086, 405780, 1130327, 323085, 487275, 747961, 59330, 399595, 510004, 1130312, 696381, 435223, 517928, 1130307, 684324, 729023, 179219, 638898, 1130296, 585585, 706401, 326176, 730832, 546217, 531470, 514029, 599538, 1130277, 663182, 453350, 358771, 413075, 197375, 542015, 179066, 323565, 439146, 749326, 1130240, 50863, 1130232, 556923, 70752, 537610, 571009, 750029, 616483, 639157, 760171, 552670, 392181, 579092, 497356, 744538, 1130193, 498576, 413335, 444318, 558963, 644392, 584499, 474468, 61526, 246118, 46683, 712006, 406640, 494285, 587145, 1130156, 216656, 695643, 647777, 529272, 591718, 151408, 148503, 663916, 429843, 526266, 696148, 408157, 234277, 645349, 402799, 1130123, 703298, 464465, 6113, 727572, 495159, 68164, 596533, 1130115, 648174, 529043, 278691, 354076, 415184, 396312, 493439, 617033, 703240, 427221, 631852, 147550, 144138, 1130095, 479495, 501645, 79438, 412750, 30282, 1130084, 375206, 407457, 582138, 581229, 649335, 498021, 728112, 656101, 403914, 633411, 110375, 159767, 557592, 622238, 630123, 513303, 739171, 631665, 1130023, 749976, 1130015, 1130013, 611400, 600638, 1130006, 486312, 538718, 323959, 699896, 466456, 696519, 489166, 509832, 683410, 443766, 582390, 558809, 242115, 437191, 1129981, 416692, 427311, 520202, 278034, 727154, 670360, 1129973, 620882, 705905, 549722, 709492, 744835, 1129959, 659901, 473029, 668190, 480320, 671862, 396967, 566618, 629337, 444688, 497659, 693353, 662982, 655097, 715661, 430114, 428503, 596837, 394148, 506577, 1129896, 418615, 677304, 547770, 1129888, 6055, 407007, 1129886, 724767, 634856, 581844, 1129878, 596613, 721708, 169442, 749813, 664034, 611049, 547771, 729755, 626823, 397579, 62517, 500646, 721882, 490612, 284910, 594476, 241399, 585220, 394980, 732794, 1129841, 474961, 1129838, 745872, 582183, 410391, 1129835, 414970, 389506, 412503, 559629, 511806, 504057, 601649, 12553, 1129822, 418501, 535288, 407383, 743489, 500680, 393238, 712468, 399701, 736364, 536101, 467932, 738788, 469356, 193676, 668999, 400681, 516185, 532352, 691956, 177167, 424318, 422938, 423807, 572976, 530295, 369311, 448042, 680004, 536822, 748434, 451642, 1129769, 177775, 614001, 477793, 513836, 408908, 482580, 463421, 108813, 466454, 540572, 436582, 601783, 470680, 670004, 497483, 410329, 484646, 1129733, 221987, 1129731, 535067, 516941, 725044, 656422, 1136245, 555674, 700590, 667607, 334662, 703134, 412395, 143721, 565173, 236793, 1129700, 691711, 641617, 567820, 651708, 637695, 
664605, 625633, 658427, 62521, 61519, 636208, 593007, 722260, 89360, 488839, 325057, 684536, 553161, 600524, 411822, 516087, 542582, 1129650, 636814, 495608, 577746, 553571, 1129642, 662028, 431988, 650966, 500574, 230013, 1129630, 515217, 725969, 737654, 714335, 678724, 606791, 427033, 466865, 466968, 500832, 674396, 661623, 693880, 462794, 181329, 751571, 645149, 570875, 439879, 1136233, 460506, 514421, 602653, 544978, 594835, 686436, 111077, 534202, 631789, 583527, 706624, 675316, 404221, 549110, 388265, 549237, 633137, 716717, 661990, 132345, 600959, 726834, 642841, 249364, 645818, 752488, 705551, 368124, 458832, 418832, 1129518, 486839, 61075, 1129514, 463679, 680225, 739996, 1129508, 512264, 595689, 686229, 435526, 1129499, 734836, 237465, 617192, 89149, 596659, 620810, 188273, 547627, 548020, 402132, 426403, 204520, 508162, 536931, 745373, 624304, 1129483, 477474, 249814, 663878, 621190, 383220, 255025, 432162, 564558, 617318, 450640, 547661, 426435, 726971, 569053, 718490, 412407, 659825, 490358, 723295, 1136215, 1129452, 521026, 1129448, 448305, 733585, 1129444, 637187, 661763, 754567, 715756, 706501, 579403, 400491, 1129436, 581075, 416561, 1129433, 479358, 434008, 658865, 1136212, 630092, 524266, 533379, 35150, 552364, 430536, 1129422, 488571, 759514, 754609, 409736, 680230, 688646, 465156, 269425, 571432, 553530, 632671, 469121, 552458, 610414, 724606, 759101, 465920, 1129357, 1129356, 407302, 538850, 425694, 425438, 648925, 439929, 497477, 423254, 129008, 530174, 511841, 625685, 397803, 1129315, 756934, 603085, 645080, 512985, 552563, 628657, 388981, 18075, 321226, 758918, 213726, 453857, 567505, 598973, 684899, 412357, 643181, 617733, 92974, 686139, 657594, 653662, 571431, 510514, 626866, 591784, 595422, 594352, 613576, 1136198, 702508, 731662, 199831, 590445, 679625, 412182, 677485, 395326, 519027, 1129241, 723897, 599724, 442810, 1129232, 701050, 606110, 60900, 721942, 460002, 689019, 518796, 81017, 687671, 628109, 457609, 717538, 590726, 535449, 418157, 497919, 390770, 681047, 456674, 653450, 642760, 1129167, 541708, 738451, 599178, 36033, 452422, 703281, 733738, 687792, 202310, 752388, 432012, 1129145, 149491, 535668, 582756, 612251, 541429, 642328, 588848, 662167, 742808, 558303, 718489, 750114, 499364, 653517, 1129103, 400435, 533677, 648975, 1136890, 415479, 632456, 154904, 659458, 1136183, 580040, 1129085, 1129084, 521869, 516532, 691028, 755688, 726379, 713127, 432503, 593386, 1129067, 437986, 458616, 688815, 410946, 612158, 98415, 630845, 583249, 697861, 1129055, 611027, 1136180, 156707, 722996, 469976, 386091, 1129042, 396974, 747939, 675811, 172986, 749215, 468437, 173834, 264602, 566732, 643303, 556581, 664751, 57292, 461190, 461659, 124607, 89100, 734136, 92742, 635081, 14571, 518785, 196450, 675430, 613222, 1128995, 581535, 412, 598286, 403328, 653560, 474875, 602263, 416161, 1128981, 589171, 1128976, 742238, 404407, 750926, 544890, 511401, 448000, 608870, 637004, 1128958, 1128956, 503949, 208417, 1128954, 333182, 1128949, 419783, 591543, 637698, 402300, 1128939, 668300, 566407, 553988, 486939, 1128927, 537223, 176781, 748579, 1128917, 532494, 472232, 1128914, 647637, 1128903, 1128900, 79203, 271544, 96933, 1128888, 545791, 748018, 68360, 383029, 395660, 493218, 423863, 518578, 669798, 72298, 424280, 655607, 1128872, 561064, 1128860, 378866, 569527, 462111, 702919, 596645, 626166, 252887, 730156, 389192, 584757, 611785, 588563, 696261, 535610, 730541, 1128840, 399884, 389808, 470372, 294538, 411600, 487606, 428836, 609922, 64259, 1128821, 1128818, 427505, 
495243, 576792, 631266, 202726, 582484, 395665, 693097, 714863, 567976, 392882, 450290, 357777, 583824, 573221, 335374, 525433, 604884, 622467, 524369, 609071, 613079, 622187, 569862, 269960, 187574, 460663, 1128760, 418801, 604102, 1128752, 142656, 69841, 572432, 581303, 744230, 637909, 1128738, 450498, 640885, 505647, 1136152, 1128726, 154164, 493723, 477176, 327812, 651679, 453305, 697773, 121431, 415021, 400908, 581377, 236105, 735502, 459153, 1128691, 1128689, 322610, 646814, 449717, 680854, 250927, 541155, 1128681, 197174, 403824, 512569, 526597, 1128676, 738859, 569047, 1128668, 97873, 549663, 596007, 678053, 688140, 653413, 620231, 197875, 734529, 416738, 509003, 164798, 567024, 413801, 366118, 520273, 557046, 424856, 566615, 122690, 572446, 1128624, 543631, 1128620, 634302, 610923, 755062, 608491, 561885, 575606, 555772, 1128567, 480646, 706291, 472957, 564382, 44537, 1128562, 740168, 274605, 653579, 407816, 634055, 651969, 280704, 1128549, 659297, 447912, 631766, 408905, 173181, 522821, 624426, 1128539, 407575, 409057, 409708, 644637, 395821, 21003, 413172, 404156, 1128524, 412817, 440973, 576361, 626536, 345102, 655332, 1128511, 552956, 1128508, 708745, 409157, 1128494, 1128492, 22256, 406582, 1128484, 156181, 700430, 481345, 690869, 759007, 587999, 677421, 514241, 408686, 561991, 572170, 604126, 741514, 1128450, 610645, 725355, 751255, 636603, 1128432, 417905, 1057246, 812967, 1128427, 771994, 337952, 64882, 779025, 1128417, 818265, 771555, 845740, 320499, 820021, 946751, 783583, 1013322, 975809, 70472, 1037917, 968788, 1022698, 373224, 875528, 1128385, 1128380, 964482, 245463, 873986, 1029871, 1032729, 1037279, 149975, 1128346, 881533, 795754, 901007, 970549, 1128337, 959256, 1068313, 289517, 89877, 836062, 777839, 961525, 1078498, 242603, 986960, 1128319, 985840, 1030163, 985900, 857963, 797562, 1012478, 1128297, 1037088, 796223, 977852, 832897, 1128291, 1128287, 936422, 881142, 876885, 918446, 769310, 165237, 1128280, 795757, 947066, 840136, 1128276, 95167, 875518, 868055, 893658, 1136113, 854924, 1128264, 995898, 892353, 1045190, 1053031, 327873, 1011812, 838440, 132007, 1041905, 248407, 878415, 1018792, 874523, 774158, 1008502, 997671, 1076615, 227230, 1020019, 827488, 810830, 165287, 866615, 899741, 1000993, 937626, 1057030, 1136108, 1057241, 1014933, 1008191, 1055448, 22817, 809594, 136476, 899268, 933594, 999612, 1019246, 1128198, 887883, 947119, 917298, 860972, 1050241, 968631, 229107, 812644, 1034759, 857520, 929366, 982144, 843768, 42541, 229677, 1128166, 835940, 1128164, 1128160, 984952, 1032680, 894139, 926094, 992867, 936255, 1074449, 186071, 90368, 72476, 802667, 795872, 859387, 188445, 820387, 1033642, 1128119, 761430, 236359, 800142, 1005146, 998479, 1034491, 800703, 1049419, 167873, 772409, 1046115, 828068, 1054959, 805998, 978459, 1128087, 980023, 1128080, 916768, 1044449, 925041, 962160, 1128071, 191143, 1056313, 826354, 781539, 304380, 203790, 1128064, 797200, 22372, 803115, 957479, 768114, 991332, 897222, 809570, 295077, 786761, 243076, 796050, 1064020, 1128040, 999644, 239143, 1013556, 1128028, 1032719, 956231, 887242, 879329, 907807, 1020489, 967278, 893358, 1078015, 861865, 81693, 784092, 205959, 768939, 65305, 1127990, 1057936, 1040752, 935962, 107205, 825453, 949738, 952722, 924092, 1012829, 943014, 258617, 774027, 912931, 1127969, 893657, 829295, 1064659, 1127959, 772836, 932294, 71557, 1037376, 802776, 1127938, 856861, 1036776, 1016732, 1127932, 815939, 1003514, 1031609, 161027, 254740, 1024432, 903268, 1127914, 1127912, 136726, 1058856, 1043229, 
121484, 1013679, 859955, 987306, 864853, 893756, 1028701, 1127897, 1127896, 845232, 839841, 899014, 916107, 1063739, 1010069, 842886, 767589, 1064344, 905050, 811245, 880110, 1054797, 1070930, 252441, 841870, 763443, 849720, 1026733, 996146, 883176, 937486, 791397, 158817, 999522, 8718, 895721, 1034305, 824384, 877676, 1127822, 322709, 907310, 1127810, 24636, 833302, 811447, 858395, 1127802, 978096, 331648, 1034015, 27705, 56067, 148159, 925169, 834934, 1046166, 1078080, 771730, 1055227, 871301, 1030994, 67359, 958846, 1033381, 885018, 1127754, 892490, 788960, 1127753, 891987, 1127752, 174305, 78352, 1127741, 230824, 919556, 174039, 1127730, 112175, 885095, 71390, 839912, 990480, 901678, 1127718, 838709, 890075, 917100, 314262, 951320, 1034261, 1037159, 941866, 1001968, 673041, 869918, 978121, 1127697, 1127695, 20734, 779540, 830234, 960998, 916453, 931401, 972092, 854884, 233900, 226741, 253406, 1071255, 856393, 1066186, 171431, 809913, 1127674, 870172, 1127668, 859431, 763641, 1003630, 1032978, 767490, 766142, 1072559, 811974, 1036830, 252314, 821619, 168786, 770534, 67379, 844464, 8452, 954096, 815308, 1127621, 856978, 844433, 1010270, 761225, 803948, 867262, 968186, 988542, 1059674, 1063049, 792925, 1064595, 143062, 159842, 987894, 853267, 150207, 1127588, 994761, 975140, 352949, 1070417, 865909, 86290, 804916, 1127570, 1127567, 980996, 1127562, 1127561, 1065678, 1068306, 1127554, 935643, 883929, 1040730, 1127547, 946839, 941232, 968238, 836003, 860643, 1048716, 1073975, 847831, 870184, 797815, 890953, 920289, 925119, 979571, 841302, 8136, 1031152, 976102, 118484, 865206, 765659, 770356, 72956, 1136042, 793699, 864818, 932299, 949516, 1061852, 1068587, 981837, 827381, 859101, 979007, 902790, 871016, 779201, 918750, 1127498, 279176, 838421, 885663, 60764, 1127493, 1061326, 1127489, 1067801, 791513, 20010, 865218, 874196, 939698, 934193, 1127468, 790118, 923070, 1005149, 1021302, 1002690, 952938, 1059177, 1068025, 1127462, 864864, 911889, 1025104, 24462, 828518, 1127448, 1061590, 998480, 1127444, 54818, 903790, 988595, 180298, 894466, 1127434, 993838, 1127425, 1051206, 1015668, 764766, 890643, 919771, 1127411, 797565, 908897, 795460, 880092, 907635, 979086, 914186, 187585, 1040212, 1127398, 1013228, 192284, 991798, 916050, 907301, 872978, 1136028, 1080183, 831024, 789014, 1127378, 150873, 1127364, 925292, 1054749, 766379, 129435, 858696, 798239, 960142, 1127351, 1005502, 919760, 804755, 813370, 1049877, 919707, 1070083, 995599, 772833, 888413, 1127325, 947430, 843139, 761907, 1055761, 264434, 814908, 929714, 761014, 775430, 932859, 824644, 37706, 842042, 115718, 864762, 762072, 101478, 903661, 854655, 72228, 875937, 937168, 1050808, 869292, 783335, 976771, 761313, 979890, 868913, 981240, 1015949, 1037830, 1018202, 982019, 35370, 816794, 1008453, 982696, 129700, 838235, 1127195, 865638, 917436, 881246, 762434, 806450, 1030502, 1127188, 1127184, 65383, 904565, 1136013, 967706, 1053723, 1023767, 31169, 1043151, 976293, 1127177, 1014055, 297058, 1049092, 1127162, 834856, 858461, 1025444, 968552, 902969, 844140, 982810, 77565, 881067, 340541, 319123, 781808, 1136008, 977828, 266611, 899428, 800348, 1043976, 1127110, 316262, 295361, 791170, 995778, 1040530, 357297, 56962, 982229, 1072522, 1025072, 168655, 1127084, 1127081, 884436, 989526, 864694, 187675, 799293, 292227, 892584, 1127044, 1030954, 1011348, 942728, 861724, 966614, 966679, 18164, 1071370, 976827, 1127025, 226190, 782125, 937753, 971564, 955228, 955028, 1052076, 1009291, 960734, 952306, 1126994, 1017930, 896746, 1126981, 145391, 
1049202, 902586, 1126971, 878401, 1049390, 1126963, 855667, 985913, 994564, 913285, 1126958, 888689, 891634, 1013904, 916214, 975875, 1126948, 981828, 1126945, 1014189, 283348, 1126935, 881738, 1046757, 1126932, 996236, 1126931, 1080843, 1126925, 1081591, 1075711, 860145, 830462, 1126914, 240792, 1126910, 266390, 986693, 1001492, 73882, 1126880, 941093, 979044, 810631, 1126875, 1042543, 1007481, 87762, 1065739, 292284, 980185, 796451, 909549, 812006, 304449, 116653, 905706, 788034, 1002287, 1126817, 982967, 1126815, 896446, 870582, 1126807, 1019607, 1060142, 917813, 95381, 1073430, 1041628, 197487, 903975, 1049177, 233881, 1005869, 1052835, 877076, 1068315, 1077039, 999829, 1060669, 931621, 1126761, 974727, 1135966, 869887, 888762, 1126750, 1051205, 1035340, 819279, 1126742, 136473, 1126736, 858242, 776547, 1126711, 800274, 852966, 1051326, 803633, 984476, 12954, 911480, 855901, 981948, 929693, 761941, 1078365, 770648, 767499, 253693, 203321, 836888, 1126691, 278684, 931678, 1078766, 12048, 1126673, 899891, 123919, 904461, 798979, 798354, 1126658, 1065534, 873607, 1126654, 825961, 1126651, 955453, 331284, 92008, 908665, 1030949, 316302, 777578, 1033997, 1027534, 956670, 897892, 999910, 1000893, 1046684, 1126609, 821676, 793432, 325310, 1126587, 1076289, 935870, 980263, 799784, 858159, 892768, 856417, 992595, 795622, 1065206, 276928, 1053061, 833797, 881070, 1126545, 772968, 949129, 41048, 1049736, 1126533, 855243, 773025, 1016486, 1126531, 992162, 830306, 963943, 990969, 761883, 952768, 865754, 826731, 833544, 794347, 1126517, 853533, 811650, 1049180, 158469, 1126507, 303585, 271672, 1126499, 910699, 1135935, 1126491, 1002572, 939521, 1136859, 1135933, 122010, 927216, 868111, 250673, 823415, 1004948, 993055, 814791, 1078745, 966542, 915433, 8356, 1037981, 836498, 803596, 931027, 837140, 1065923, 1126425, 1061433, 852842, 1126416, 1126414, 802817, 910246, 1126403, 772885, 968995, 789140, 905574, 1070533, 25179, 869486, 867346, 930833, 981606, 833432, 367519, 1126380, 51514, 1126377, 1126374, 1081455, 971331, 903073, 921267, 8008, 995529, 879076, 1004774, 1126361, 908077, 845321, 960803, 981207, 883184, 935437, 917606, 65416, 906238, 232919, 963471, 858085, 888024, 1009408, 1056764, 9082, 131573, 1126331, 1081930, 762035, 811758, 975774, 810958, 1126317, 887806, 976678, 849869, 304430, 893117, 831794, 783751, 67545, 88116, 834181, 836636, 1032658, 932447, 767549, 959723, 866276, 881767, 1041743, 1035658, 1043138, 1016027, 232609, 958435, 998270, 1043346, 1126267, 845725, 840782, 807974, 1028772, 11258, 876934, 831217, 870157, 906391, 1052164, 831882, 1068326, 299461, 781689, 1126245, 1126244, 972647, 983987, 968667, 783989, 119263, 847301, 856149, 272863, 939453, 1050794, 954363, 1126223, 996876, 814920, 1126215, 974485, 908101, 922237, 961305, 1126209, 992904, 971415, 786171, 1022554, 896672, 807223, 899876, 964608, 792143, 59381, 1126178, 13823, 904780, 1041217, 842703, 1126146, 952165, 332824, 9129, 1049881, 877556, 1108939, 1112389, 792752, 1119729, 1105095, 1105103, 1128373, 1127622, 1124979, 885490, 1119827, 190044, 500575, 883785, 264403, 1108100, 421756, 1108307, 966413, 1111546, 156493, 1124145, 1110199, 1056204, 199143, 835929, 1063750, 1104031, 398483, 432930, 478605, 1044797, 1124464, 1107988, 130510, 1127893, 1135377, 1126206, 645693, 1133328, 646207, 1012021, 489204, 1119075, 573724, 600573, 1120447, 574575, 1055865, 494835, 1126814, 168216, 100983, 194013, 1119092, 1133167, 1133418, 427578, 324211, 11096, 1134787, 89928, 499920, 527433, 40578, 694342, 1125225, 1136427, 
1128856, 719381, 53175, 131651, 1037798, 915593, 264014, 1121402, 962179, 1117099, 744366, 277780, 1114563, 1014126, 1117346, 148538, 451602, 474735, 359349, 903469, 1115776, 1104492, 315637, 1112341, 588587, 706080, 117831, 1120868, 1111906, 523270, 133358, 67262, 1121166, 805321, 1129828, 131843, 104861, 833860, 207786, 691330, 1103528, 1132213, 335594, 1134138, 138632, 671071, 705609, 1114819, 855434, 1134463, 747511, 502261, 183378, 654723, 1117387, 479871, 541571, 1106007, 60235, 180442, 710347, 1124210, 287683, 490595, 291865, 794725, 1103812, 436600, 1047259, 964223, 564054, 87181, 1116052, 554515, 443396, 1123581, 714453, 972007, 929033, 433234, 1121709, 88073, 87452, 1005165, 1133249, 953067, 101169, 855410, 1121276, 1114646, 19335, 789700, 47923, 301524, 405717, 165633, 952774, 766511, 1106293, 452431, 1109818, 1047902, 306076, 551040, 1059231, 182539, 1115569, 351697, 904965, 292906, 662372, 364142, 20455, 1119058, 203318, 1126813, 240053, 1115392, 1113437, 1122461, 1116341, 1129237, 912070, 278813, 423273, 507445, 25129, 146187, 634428, 1121986, 321441, 532603, 1030303, 1037496, 1043135, 1045109, 1049519, 1051399, 1056416, 1064670, 1065636, 1071750, 1103153, 1103791, 1104501, 1105792, 1105860, 1106928, 1106979, 1107315, 1107440, 1108450, 1108466, 1108473, 1108651, 1108729, 1109699, 1109707, 1109850, 1110678, 1112142, 1113042, 1113256, 1114166, 1114286, 1114993, 1115210, 1116380, 1117817, 1117886, 1118370, 1118426, 1119118, 1119543, 1120588, 1121353, 1121879, 1122138, 1122767, 1122843, 1123657, 1124552, 1125632, 1125755, 1126523, 1126738, 1127004, 1127233, 1127540, 1128456, 1129081, 1130705, 1130734, 1130847, 1131069, 1132044, 1132247, 1132532, 1132842, 1132943, 1132950, 1133485, 1133579, 1134094, 1134207, 1134431, 1134680, 1134939, 1134988, 1135268, 1135283, 1135413, 1135626, 1136043, 1136047, 1136769, 1136962, 118440, 119821, 121171, 125659, 135802, 141630, 144862, 156498, 166046, 169208, 174463, 175920, 177604, 181626, 197312, 206106, 227873, 23849, 240158, 245052, 246883, 253749, 256942, 257119, 258062, 26703, 273695, 302846, 318362, 324585, 330501, 330975, 332593, 336901, 3505, 360721, 384356, 390360, 405163, 42255, 425632, 426175, 42752, 435548, 436707, 444389, 449367, 452915, 463271, 469589, 47210, 482726, 48792, 50122, 514096, 519025, 53233, 537060, 537817, 543273, 545355, 555530, 583468, 586148, 590019, 605127, 610265, 611953, 640502, 64647, 653399, 655526, 655914, 660198, 67316, 673670, 701453, 703782, 708979, 716113, 730539, 735482, 735922, 75198, 768208, 779302, 792635, 794223, 794429, 801118, 804066, 808400, 809525, 814183, 819983, 849550, 85020, 850358, 86606, 877809, 883915, 88495, 911232, 914916, 91576, 918162, 938400, 940547, 945835, 978031, 985594, 99005, 997622, 999466, 132622], + 'msmarco-passage-dev-subset': [1048585, 2, 524332, 1048642, 524447, 786674, 1048876, 1048917, 786786, 524699, 1048995, 786857, 524722, 873886, 524733, 786918, 786937, 1049085, 262232, 524835, 524848, 1049200, 1049221, 1049329, 1049368, 787255, 262974, 1049456, 1049774, 1049791, 525534, 1288, 1049894, 787784, 1049955, 1050007, 525779, 263670, 811852, 1576, 525868, 306105, 1050231, 1050253, 1050275, 526013, 263889, 788431, 264150, 526331, 788484, 1050670, 1050695, 264284, 1050747, 1050778, 2235, 264410, 788702, 1050857, 437291, 1050923, 526671, 788851, 264594, 830531, 1051095, 1051108, 1051112, 789037, 1051211, 1051214, 1051223, 1051229, 1051257, 264827, 526984, 1051279, 1051285, 1051307, 1051339, 1051352, 1051372, 2798, 1051422, 789292, 1051475, 789332, 1051520, 1051530, 2962, 
1051571, 961705, 1038859, 1091234, 1051723, 1051755, 1051808, 527568, 1051886, 1051902, 527625, 1051942, 1051943, 527769, 1052115, 527853, 265729, 790059, 1052274, 790178, 265960, 528117, 1052421, 1052427, 838116, 1052563, 1052585, 1091264, 1052615, 1052640, 830812, 1052717, 1005595, 1052948, 568709, 1052965, 1052985, 528760, 1053111, 528841, 525047, 266760, 1053219, 1053253, 4696, 791140, 266920, 791223, 529090, 267012, 529230, 4947, 1053611, 830973, 1053716, 568841, 267341, 1053896, 1053901, 1053931, 1053992, 1053997, 267566, 791862, 1054023, 791916, 568895, 267644, 918424, 1054186, 1054189, 529918, 1054328, 792187, 1054339, 1093231, 1054438, 1054450, 1054451, 1054468, 5925, 44686, 1054593, 1054595, 1054610, 1091163, 1054707, 792595, 6217, 1091337, 792688, 530572, 792742, 530601, 1054923, 1054958, 1054969, 792847, 1054999, 1055125, 1055176, 1055197, 1055351, 531142, 1091360, 1055505, 793475, 831302, 242713, 1055717, 531490, 1215, 1055889, 1093487, 1055940, 531676, 1056057, 1056060, 831380, 1056163, 1056211, 1056265, 307118, 1056303, 166111, 1056420, 1056437, 1056446, 1056482, 7968, 1056548, 270140, 1056580, 794469, 1056644, 1056726, 1056758, 794625, 525660, 1100134, 1056850, 270422, 1056950, 270520, 270521, 1057015, 270603, 794893, 270642, 1057098, 1057112, 1057139, 1057168, 1057251, 1057270, 88831, 8714, 1057334, 1057367, 533105, 8854, 1057446, 1057476, 1057488, 1057539, 569473, 1057631, 1057656, 9083, 533398, 1057708, 569507, 1057757, 1057937, 1057996, 9454, 1058036, 1058100, 795991, 1058140, 1058141, 1058142, 1058165, 1058182, 794665, 1058271, 1058325, 1058415, 1058442, 1058470, 272075, 1058515, 796383, 831784, 534305, 1058601, 1058604, 569674, 220151, 10205, 1058792, 1058822, 10276, 1058885, 10312, 534617, 272500, 1058952, 1058978, 1059045, 1059077, 1094039, 534941, 272815, 1059253, 1059287, 1059420, 1059421, 535142, 1059442, 273014, 1059496, 1059504, 11006, 1059601, 1059619, 11050, 1059646, 1059698, 11133, 1059801, 273449, 535599, 273481, 535627, 273522, 1059970, 535743, 1060039, 1060040, 176677, 1094191, 812190, 1060305, 1060342, 1060391, 798253, 798284, 1060462, 1094249, 11913, 1060496, 274067, 1060566, 274175, 1060616, 1060623, 536480, 1060795, 1006922, 570023, 262280, 1060868, 1060881, 613727, 536654, 45757, 482666, 798883, 1091545, 536791, 798945, 570068, 1004258, 1061167, 1061210, 1061237, 536995, 1094389, 12741, 1061324, 1061382, 274981, 1061472, 12903, 275049, 275137, 537301, 537410, 308032, 1061762, 275355, 537505, 537526, 275528, 275534, 13397, 537706, 537761, 275629, 537825, 1062190, 1062223, 1062233, 537995, 1062332, 1062334, 1062350, 275997, 1062457, 800318, 1062511, 838453, 1062589, 538309, 1062603, 1094605, 1062609, 538333, 538340, 276208, 46040, 1062687, 14151, 276298, 1062744, 276329, 276338, 276348, 1062784, 832508, 1062928, 1062961, 471850, 576601, 800987, 1063177, 818798, 1063349, 1063371, 276979, 1063461, 1063478, 14947, 14963, 539278, 15039, 801478, 15063, 1063644, 1063659, 1063702, 1063758, 1063765, 1063777, 539601, 277459, 1063892, 539648, 15382, 1063974, 277556, 15441, 801907, 277623, 277632, 1007473, 277701, 1064140, 277737, 1019649, 15607, 1064195, 1064206, 277785, 277799, 539957, 277977, 1064473, 278074, 802372, 1064518, 1064529, 832790, 540306, 253678, 1064687, 540432, 540456, 802634, 1064808, 1064852, 278429, 1094996, 1064961, 278542, 1065032, 278606, 278658, 1065118, 16559, 1065160, 540906, 1065227, 540983, 278863, 1095059, 803237, 541135, 16860, 803306, 1065494, 1065551, 1065558, 541272, 541274, 1065650, 279229, 17110, 1095121, 1065712, 541425, 
1065728, 1095126, 803599, 541557, 46579, 1065971, 1065985, 17430, 1066043, 1066116, 279718, 17635, 541948, 804103, 541969, 804197, 279987, 17848, 17884, 789439, 658372, 280223, 18101, 1066709, 1066716, 1066792, 804687, 1066916, 1066958, 1066966, 1066971, 542806, 323592, 804996, 1067276, 1067284, 18759, 280927, 543251, 134239, 1067587, 1067640, 1067659, 281270, 1067724, 1067764, 1067772, 1067826, 543644, 19457, 805900, 543813, 19552, 281702, 281704, 543849, 1095542, 543951, 1068276, 1068290, 544060, 281930, 1068408, 544123, 19940, 544277, 544308, 544319, 282214, 1068715, 806574, 1068726, 282397, 806688, 282411, 833507, 995787, 1068924, 20356, 1068952, 305650, 282530, 20432, 1069028, 544745, 1051990, 544811, 1069108, 1069128, 20597, 1069222, 20671, 544974, 1069313, 833579, 1069327, 1069344, 1069405, 1069474, 1069521, 1069556, 283141, 545359, 1052089, 1069717, 807585, 545450, 21185, 576822, 283344, 545575, 178325, 283548, 1069981, 1069983, 807880, 1070131, 545847, 808019, 21603, 1100581, 21741, 21765, 1095899, 1070412, 21861, 1070452, 284072, 808362, 21948, 1070546, 463373, 790110, 1095952, 284313, 546459, 22231, 808716, 1070867, 47419, 284565, 22479, 1071061, 546825, 1071198, 546956, 1071270, 547018, 547089, 1071389, 547139, 22882, 1071485, 1071534, 1071545, 547301, 285158, 1071598, 809556, 484454, 1071722, 166625, 285375, 831030, 23285, 809798, 47588, 285537, 1071992, 309745, 809933, 285656, 285729, 1072188, 548036, 810210, 810242, 548099, 1057007, 810270, 178741, 810324, 47674, 1072479, 1072513, 810394, 548254, 286160, 1072603, 1072750, 548475, 1052610, 810680, 1072874, 47741, 548673, 1072988, 24441, 790536, 1073358, 1073365, 24807, 549135, 24979, 1073569, 25025, 25036, 47864, 549342, 1073640, 1073721, 1073801, 1073805, 25294, 821372, 703268, 1073943, 1073972, 1073980, 1074001, 549731, 549738, 559507, 25534, 25603, 1096557, 287912, 1074499, 812387, 288139, 550331, 288200, 26079, 26207, 1074804, 1074807, 397592, 1096644, 550565, 812734, 1074883, 550609, 26334, 1074949, 1096667, 1074989, 1074995, 1074997, 934889, 288702, 26664, 1075244, 1075262, 1075313, 288884, 1075336, 1075348, 551119, 26847, 1075588, 1075591, 1075608, 1075636, 1075656, 551413, 289276, 1075713, 1075741, 786520, 572517, 1075919, 289556, 289586, 1076030, 813899, 1076078, 551819, 551860, 27618, 289812, 1076269, 27743, 1100783, 814282, 27932, 1074603, 28216, 814699, 290488, 834848, 290499, 48417, 1077000, 1077002, 1077006, 290585, 1077019, 290632, 814964, 814987, 552868, 815015, 290779, 1097040, 290830, 1077356, 1097066, 815243, 105709, 815420, 815421, 29097, 291248, 815580, 29169, 291396, 1077844, 922398, 291516, 29416, 1078187, 29612, 1078198, 1078222, 1097236, 816289, 1078446, 1078491, 292094, 1097259, 572978, 816483, 292225, 1078731, 1078752, 30188, 1078765, 554511, 1078906, 1078920, 554738, 816893, 1079050, 1079086, 292676, 1079231, 292813, 398335, 1079340, 1079434, 555179, 817349, 1079535, 227992, 817597, 555458, 791629, 31222, 1079815, 1079817, 1079831, 293401, 555558, 293421, 555590, 1079888, 1079959, 1079987, 31432, 1080010, 1080031, 555750, 555850, 31595, 952520, 1080253, 1080406, 1080419, 293992, 556144, 556166, 1080495, 556217, 556248, 1080537, 1080555, 556307, 879155, 1097619, 32176, 818612, 556476, 556587, 1080937, 1080939, 1080948, 1080950, 818819, 1080970, 818842, 1081086, 1081091, 32642, 556952, 556976, 1097723, 1081338, 557157, 1054071, 295135, 1081569, 1081595, 1081609, 557401, 557417, 1081730, 819618, 557492, 1081946, 1082002, 1082091, 1082117, 1082242, 1082263, 1082265, 1082281, 1082332, 1082336, 1082339, 
1082341, 1082351, 1082384, 1100986, 1082427, 1082445, 1082448, 1082455, 1082501, 1082502, 1082531, 1082536, 1082547, 558263, 1082576, 34015, 1082603, 1082607, 34039, 1082622, 1082653, 1082668, 1082730, 558448, 1082750, 1082751, 1082759, 1082779, 1082792, 1082807, 1082835, 1082840, 1082870, 1082872, 1082877, 1082893, 1082924, 1082947, 1082948, 1082966, 1082978, 1083000, 1083010, 1083017, 1083021, 1083052, 1083085, 839137, 1083092, 1083095, 1083108, 820973, 1083125, 1083127, 1083150, 1083152, 1083157, 1083158, 1083161, 1083243, 1083267, 1083268, 1083278, 1083285, 1083296, 559009, 559018, 1083307, 1098057, 1083332, 1083340, 1083341, 1083342, 1083345, 1083361, 1083362, 1083401, 1083410, 296993, 1083428, 1083430, 1083443, 1083472, 559198, 1083493, 1083499, 1083500, 1083502, 1083508, 1083517, 1083535, 1083537, 1083584, 1083597, 1083598, 1083611, 1083627, 1083641, 1083663, 1083675, 1083690, 1083713, 1083721, 1083722, 1083727, 1083743, 1054440, 1083783, 1083791, 1083797, 1083800, 1083819, 1083822, 1083831, 1083832, 1083846, 1083852, 1083865, 559607, 1083909, 1083933, 1083945, 1083948, 1083967, 559709, 1084038, 1084041, 1084075, 1084076, 1084086, 297672, 1084192, 1084197, 1084230, 1084233, 559959, 1084273, 1084276, 1084289, 1084301, 1084308, 1084324, 1084326, 1084330, 1084336, 1084354, 1084383, 1084403, 1084408, 1084435, 1084441, 1084469, 1084475, 1084478, 1084512, 1084516, 1084518, 560245, 1084555, 1084582, 1084599, 1084602, 1084603, 1084624, 1084686, 36133, 1084712, 1084713, 1084722, 822585, 1084755, 1084769, 822649, 1084814, 1084838, 1084848, 298444, 1084887, 1084889, 1084898, 1084906, 1084910, 1084930, 1084942, 1084971, 1084982, 1084986, 822859, 1085008, 1085013, 1085048, 1085139, 1085141, 1085192, 1085197, 1085229, 1085245, 1085279, 1085288, 1085303, 1085319, 1085327, 1085339, 1085341, 823203, 1085348, 1085351, 1085356, 298940, 1085386, 1085393, 1085421, 1085422, 1085434, 1085441, 1085454, 299023, 1085456, 1085457, 1085510, 1085517, 1085521, 299094, 1085532, 1085533, 1085535, 299110, 1085545, 1085550, 823421, 1085572, 1085584, 1085586, 1085613, 1085630, 1085658, 1085674, 576360, 1085733, 1085741, 1085760, 1085762, 1085764, 1085775, 1085777, 1085779, 1085780, 299350, 1085796, 1085804, 1085812, 1085842, 1085845, 1085862, 1085889, 1085918, 1085924, 1085930, 1085936, 1085943, 1085967, 1085980, 1086008, 1086014, 1086022, 1086046, 1086075, 1086085, 1086120, 1086145, 299732, 1086174, 1086186, 1086200, 1086224, 1086241, 1086266, 1086271, 1086281, 1086288, 1086296, 1086305, 1086309, 1086326, 1086354, 299939, 1086384, 1086385, 1086424, 1086430, 1086439, 530602, 1086468, 1086491, 1086498, 530611, 1086532, 1086555, 1086565, 1086575, 1086581, 1086594, 1086595, 1086628, 1086637, 1086679, 1086681, 1086693, 1086701, 1086708, 1086713, 1086715, 300312, 1011248, 1086760, 1086765, 1086834, 1086836, 1086855, 1086860, 1086874, 1086883, 1086886, 1086893, 1086915, 1086917, 1086927, 1086928, 1086942, 1086974, 1086976, 1087014, 1087018, 1087042, 1087046, 1087047, 1087050, 1087061, 1087066, 1087074, 1087077, 1087105, 300674, 1087114, 562827, 1087122, 1087126, 1087129, 1087171, 1087173, 1087185, 1087186, 1087204, 1087215, 1087226, 1087238, 1087269, 1087309, 1087317, 1087327, 1087351, 1087361, 300933, 1087375, 792900, 563119, 1087425, 1087435, 1087455, 1087484, 1087486, 1087487, 1087492, 301061, 1087514, 1087544, 1087556, 1087566, 1087581, 1087603, 1087604, 1087609, 1087634, 1087675, 1087680, 1087687, 1087690, 1087722, 1087727, 1087728, 1087729, 1087736, 1087764, 1087766, 1087774, 1087795, 1087803, 1087835, 1087848, 
1087858, 1087869, 1087870, 1087904, 1087911, 563652, 1087959, 1087967, 1087969, 1087999, 825948, 825954, 1088138, 39577, 1088164, 1088209, 1088210, 1088211, 1088221, 563943, 1088252, 1088254, 563995, 1088302, 1088309, 1088311, 1088332, 1088349, 1088358, 1088379, 1088434, 1088437, 1088444, 1088453, 1088475, 1088502, 1088510, 1088512, 1088515, 1088539, 1088541, 1088606, 1088628, 40056, 1088648, 1088653, 826513, 1088658, 1088685, 1088693, 1088715, 1088734, 1088742, 1088758, 302337, 564509, 1088800, 1088816, 1088832, 1088845, 1088856, 1088869, 1088875, 1088884, 1088889, 1088903, 40337, 1088915, 1088928, 1088938, 1088958, 1088960, 1088973, 1088987, 1088993, 1089001, 1089002, 1089021, 1089022, 1089026, 1089027, 1089036, 1089043, 1089044, 1089051, 1089071, 1089085, 1089093, 1089121, 1089143, 1089156, 1089158, 1089164, 1089167, 1089177, 1089214, 1089246, 1089273, 1089277, 1089286, 1089293, 1089312, 1089325, 1089355, 1089376, 1089401, 1089406, 1089408, 1089414, 1089434, 1089438, 1089443, 1089469, 560673, 1089498, 1089501, 1089511, 565231, 1089521, 1089541, 1011713, 1089558, 1089560, 1089576, 1089597, 1089619, 1089639, 1089645, 1089656, 1089674, 1089678, 1089683, 1089688, 1089691, 1089693, 1089696, 1089706, 1089719, 1089727, 1089750, 1089760, 1089763, 1089776, 1089787, 1089804, 1089805, 1089810, 1089832, 1089846, 1089868, 1089896, 1089906, 1089925, 1089940, 1089945, 1089964, 1089966, 1089983, 565696, 1090029, 1090043, 1090054, 1090063, 1090072, 1090077, 1090086, 1090100, 1090102, 1090107, 1090110, 1090115, 1090132, 1090146, 1090151, 565868, 1090165, 1090170, 1090171, 1090184, 303777, 303790, 1090242, 1090270, 1090350, 1090364, 303934, 1090374, 1090388, 813193, 1090395, 1090400, 1090456, 1090458, 566174, 1090472, 1090530, 1090540, 41969, 1090613, 566335, 1090624, 909221, 1090700, 1090701, 828596, 1090742, 1090758, 1090791, 1090808, 1011925, 1090833, 1090838, 1090839, 1090841, 1090842, 1090861, 1090869, 1090877, 1090886, 1090910, 828779, 1090924, 837181, 42361, 1090945, 1090965, 1090987, 1091015, 1091048, 1091059, 1091068, 1091108, 1091112, 1091115, 1091116, 42568, 400311, 1091153, 1091164, 829025, 1091173, 1091177, 1091194, 566946, 1091246, 829103, 1091255, 1091330, 1091340, 1091421, 567159, 1091450, 1091467, 1091471, 1091473, 1091479, 1091513, 1091520, 1091522, 1091529, 1091535, 829425, 1099433, 1091576, 1091595, 1091630, 1091643, 1091654, 1091659, 1091661, 1091665, 1091667, 1091681, 1091688, 1091715, 1091719, 567443, 567452, 1091749, 305333, 1091767, 1091786, 305361, 1091807, 1091811, 1091833, 1091865, 567630, 1091923, 1091941, 1091955, 1091983, 1092005, 1092007, 1092010, 1092013, 1092023, 1092042, 837375, 1092093, 1092095, 1092105, 1092108, 1092120, 1092143, 1092159, 1092161, 1092162, 1092165, 1092168, 1092176, 1092180, 567895, 1092203, 1092236, 1092237, 1092238, 1092257, 1092258, 1092259, 1092263, 1092297, 1092311, 1092327, 1092330, 1092342, 1092348, 43781, 1092391, 1092394, 1092416, 1092417, 1092470, 1092474, 1092482, 1092484, 1092522, 1092528, 1092543, 1092551, 1092557, 1092605, 924844, 1092643, 44072, 1092665, 830551, 1092715, 1092724, 1092738, 1092751, 1092756, 1092757, 1092759, 1092791, 1092792, 1092796, 568526, 1092822, 1092832, 1092858, 1092863, 1092865, 1092870, 568585, 1092910, 1092911, 1092919, 568649, 1092942, 1092952, 1092972, 1092978, 1092984, 568703, 1092996, 1093006, 1093031, 1093038, 1093042, 1093064, 1093094, 1093096, 1093104, 1093107, 1093112, 1093128, 1093172, 1093179, 1093181, 831052, 1093200, 1093202, 1093235, 1093238, 1093255, 1093256, 1093305, 1093312, 1093359, 1093399, 
1093405, 1093406, 1093407, 1093410, 1093419, 307005, 1093438, 1093443, 1093444, 831315, 1093462, 1093481, 206762, 1093534, 1093540, 1093552, 1093556, 1093561, 1093564, 1093570, 1099767, 1093621, 1093637, 1093650, 1093682, 1093698, 45125, 831560, 1093717, 1093723, 1093732, 831601, 1093750, 1093757, 1093773, 1093781, 1093786, 1093791, 1093795, 569555, 1093845, 1012431, 1093855, 1093866, 1093875, 1093881, 1093901, 1093915, 1093920, 307492, 1093926, 1093927, 1093941, 307521, 1093959, 1093962, 1093966, 1093971, 569689, 1093998, 94953, 831871, 1094027, 1094056, 1094062, 1056159, 1094081, 1094085, 51276, 831962, 1094110, 1094141, 1094145, 45590, 1094175, 1094197, 1094204, 1094215, 1094220, 569939, 1094232, 1094241, 1094271, 1094275, 1094316, 832188, 570070, 1094364, 1094369, 1094370, 1094394, 1094406, 1094440, 1094451, 1094453, 1094460, 1094469, 1094477, 45924, 1094501, 1094509, 1094519, 1094536, 1094566, 1094578, 1094579, 1094612, 1094634, 46095, 1094689, 1094691, 1094693, 1094699, 1094724, 1094755, 1094759, 1094825, 1094840, 1094869, 1094943, 1094959, 1094962, 1094982, 1094991, 1094999, 1095012, 570725, 1095055, 1095058, 1095066, 1095085, 1095092, 1095108, 308687, 1095155, 570905, 1095233, 1095238, 570979, 1095278, 46711, 1095306, 1095308, 1095332, 1095335, 1095354, 1095357, 1095360, 1095371, 1095377, 571103, 1095437, 1095469, 1095478, 1095490, 1095495, 1095523, 1095537, 1056405, 1095555, 1095557, 1095558, 1095560, 1095566, 1095571, 1095631, 1095633, 1095641, 1095650, 1095654, 1095687, 1095699, 1095704, 1095705, 1095711, 1095716, 1095723, 1095725, 1095747, 1095749, 571474, 1095787, 1095798, 1095806, 1095807, 1095816, 1095845, 47270, 1095856, 1095857, 1095864, 1095868, 1095874, 1095876, 1095881, 1095921, 1095922, 1095928, 1095942, 1095955, 1095966, 1095971, 1095982, 571696, 1095988, 1095994, 95286, 1096006, 1096021, 1096025, 1096044, 1096045, 1096049, 1096065, 1096087, 1096118, 1096126, 1096180, 1096207, 1096211, 1096227, 571954, 1096252, 1096256, 1096258, 1096262, 1096268, 47716, 1096311, 1096347, 1096357, 1096360, 1096368, 1096371, 1096375, 1096376, 1096401, 1096425, 1096429, 1096431, 1096454, 1096457, 1096463, 1096475, 1096476, 1096479, 1096493, 1096498, 1096509, 1096516, 1096527, 1096532, 1096533, 1096541, 1096543, 1096551, 572286, 1096605, 1096607, 1096610, 1096619, 1096620, 1096628, 1096641, 1096656, 1096658, 1096694, 1096712, 1096739, 1096776, 1096787, 1096788, 1096794, 1096823, 1096827, 1096830, 1096840, 1096850, 1096855, 1096866, 1096870, 1096886, 1096887, 1096911, 1096932, 1096943, 1096944, 1096945, 1096947, 1096958, 1096964, 1096983, 1096998, 1097014, 1097023, 1097027, 1097069, 1097087, 1097093, 1097100, 1097118, 1097119, 1097135, 1097153, 1097154, 1097195, 1097198, 313940, 1097213, 1097223, 1097242, 310853, 1097294, 1097298, 1097304, 1097314, 1097317, 835206, 1097359, 1097373, 310948, 1097386, 1097438, 573157, 1097448, 1097449, 1097461, 1097469, 1097492, 311067, 1097508, 1097523, 1097537, 1056742, 48998, 1097585, 1097602, 835478, 1097674, 1097721, 573452, 1097746, 1097786, 1097796, 49234, 1005907, 1097894, 1097905, 1097906, 1097909, 1097937, 1097979, 1097995, 1097999, 1098010, 49435, 1098013, 1098044, 1098048, 1098071, 1098090, 1098101, 1098102, 1098110, 1098111, 1098169, 1098180, 1098182, 1098222, 1098226, 1098236, 573954, 1098249, 1098276, 1098284, 574002, 1098322, 1098338, 1098354, 1098355, 1098422, 1098440, 1098452, 1098481, 1098497, 1098510, 1098520, 1098523, 1098536, 1098556, 1098561, 1098570, 1098600, 1098608, 1098609, 1098641, 1098646, 1098698, 1098719, 1098725, 1098763, 
1098765, 836640, 1098787, 312368, 1098802, 1098804, 1098809, 574547, 1098846, 574569, 1098860, 1098874, 1098895, 1098905, 1098909, 1098927, 1098967, 836832, 1099050, 1099065, 1099072, 1099084, 1099099, 1099108, 1099178, 1099189, 1099217, 1099219, 1099226, 574944, 1099244, 1099284, 1099288, 1099290, 1099321, 1099340, 1099342, 837202, 1099351, 1099368, 575096, 1099391, 575146, 1099451, 1099452, 1099456, 50891, 1099482, 1099488, 1099530, 575268, 1099595, 837476, 1099626, 1099632, 1099636, 1099653, 1099656, 1099670, 575407, 1099700, 1099706, 1099729, 1099733, 1099739, 1099756, 1099761, 575492, 1099805, 1099806, 1099816, 1099834, 1099836, 1099855, 1099859, 1099865, 1099880, 1099888, 1099903, 1099911, 1099914, 1099943, 1099947, 1099955, 1099980, 1099981, 1099985, 1099998, 1100010, 1100035, 1100051, 1100064, 1100070, 1100077, 1100094, 1100105, 1100106, 1100119, 1100137, 1100138, 1100143, 1100151, 1100167, 1100168, 1100173, 1100187, 1100188, 1100190, 1100192, 1100218, 1100224, 1100226, 1100229, 838101, 1100299, 1100308, 1100319, 1100357, 1100370, 1100403, 1100415, 1100438, 1100454, 1100455, 1100457, 1100458, 576195, 1100486, 1100488, 1100492, 1100496, 1100499, 1100505, 1100533, 1100537, 1100541, 1100544, 1100580, 576312, 1100634, 1100640, 1100661, 1100687, 1100724, 1100732, 314307, 576452, 1100765, 1100772, 1100816, 1100839, 1100852, 1100855, 1100875, 1100919, 1100930, 1100933, 1100980, 838845, 1101018, 1101044, 1101048, 1101055, 1101088, 1101090, 1101121, 1101171, 1101172, 1101173, 1101211, 1101214, 1101228, 1101236, 1101259, 1101276, 1101278, 1101279, 1101296, 1101300, 1101336, 1101341, 1101347, 8798, 1101374, 1101394, 577131, 445714, 1101434, 1101448, 577167, 1101466, 1101467, 1101503, 1101535, 1101552, 315131, 1101566, 1101567, 1101568, 1101576, 1101603, 1101661, 1101665, 1101668, 1101670, 1101674, 1101706, 1101714, 315291, 1101739, 1101761, 1101131, 53191, 1101784, 1101806, 576851, 1101822, 1101861, 1101869, 1101870, 1101902, 1101906, 1101961, 1101977, 1101995, 1102001, 839878, 1102028, 1102088, 577813, 1102121, 1102163, 840053, 1102206, 577930, 1102235, 1102240, 1102262, 1102300, 315884, 1102325, 1102330, 1102335, 53814, 1102393, 1102400, 53897, 708038, 53991, 840445, 1101271, 839128, 54040, 1092930, 578362, 1101282, 840532, 1101298, 54199, 1101303, 54235, 571237, 54307, 314907, 578735, 578783, 574730, 54531, 54544, 841020, 1101365, 54648, 795540, 316803, 841165, 54819, 54843, 579133, 882982, 1101443, 841521, 227637, 579479, 55223, 841665, 1014131, 841919, 841961, 841980, 489858, 842070, 842108, 55682, 55691, 140367, 842221, 842223, 842272, 55848, 842333, 318073, 580313, 56033, 580411, 580450, 842596, 56188, 580483, 988253, 839528, 53109, 1101698, 843140, 1101721, 1101723, 56740, 953355, 36388, 843409, 56993, 795951, 577511, 581521, 57258, 581552, 581666, 319564, 1101845, 319652, 581801, 1101868, 57614, 1101871, 581975, 844128, 796056, 970830, 604229, 582146, 792977, 320025, 57882, 320051, 844390, 320117, 58074, 58130, 844594, 320320, 1058284, 582557, 582641, 752473, 58409, 582705, 58551, 582848, 58571, 58583, 1093082, 320792, 58801, 320970, 1014697, 845304, 583234, 59030, 583325, 583369, 59084, 321239, 845529, 1086279, 59190, 59204, 59217, 845719, 583611, 845790, 59392, 583686, 59426, 845888, 845892, 1102177, 583766, 583798, 9926, 583916, 840061, 846082, 59654, 307504, 1084389, 321918, 321951, 846291, 1014885, 846438, 1049484, 322211, 846513, 322345, 584500, 584569, 584592, 60339, 60357, 846806, 584695, 1093142, 1102351, 584905, 60677, 141185, 578100, 53813, 1058717, 585165, 10157, 323096, 
323154, 585344, 585378, 61180, 323382, 61277, 847722, 847726, 323535, 585680, 323555, 61452, 585743, 585806, 1005888, 61531, 61623, 1058853, 848100, 323815, 61836, 323998, 61882, 586268, 848432, 324159, 848478, 1093196, 62055, 62136, 62411, 62439, 586740, 586785, 324645, 586790, 62554, 586916, 62648, 849142, 272605, 849337, 927989, 587326, 849561, 325292, 63152, 587524, 199442, 587674, 403793, 97972, 63548, 587853, 176701, 578607, 325929, 850236, 185397, 588122, 51054, 850450, 326190, 850555, 850557, 64179, 971904, 326410, 326417, 588627, 326509, 850820, 753479, 447648, 326629, 588829, 851004, 326719, 64711, 64960, 851425, 65000, 65038, 851490, 589423, 489257, 65267, 589564, 851813, 65488, 589777, 327640, 65583, 65584, 852037, 327750, 589903, 65627, 852179, 65770, 1093322, 328072, 65957, 271038, 66161, 66281, 328474, 328527, 66389, 1058425, 328611, 328629, 328704, 590861, 1093349, 590945, 328814, 591026, 66771, 66908, 853344, 329114, 1091569, 591310, 853471, 203003, 67200, 853646, 329369, 853699, 1102099, 329515, 853882, 1059820, 853995, 591898, 591940, 854085, 591993, 28442, 329901, 67802, 329958, 592192, 592220, 592235, 672429, 68095, 220495, 592495, 330419, 330450, 592601, 854785, 1016281, 592672, 330560, 854862, 330640, 1049767, 987720, 855031, 613318, 330792, 68832, 593135, 593275, 331141, 855546, 331352, 593541, 855725, 855727, 593732, 593792, 69506, 855968, 856171, 230082, 69789, 69871, 594295, 856568, 70340, 332600, 70504, 594793, 594831, 70595, 594930, 332797, 70709, 70720, 70787, 70852, 1093491, 333327, 71238, 988754, 595568, 595577, 333434, 831474, 333486, 1093507, 333579, 1049867, 333700, 596088, 596130, 1016869, 878615, 596468, 334433, 72398, 334558, 596716, 72435, 99461, 72485, 334754, 72613, 788035, 334867, 334904, 334916, 859229, 859274, 859376, 73094, 597384, 73106, 597395, 73119, 46081, 859669, 73257, 859870, 335710, 335711, 597870, 860071, 335910, 99676, 73788, 860266, 73853, 860462, 860542, 796812, 860655, 74328, 74356, 860942, 598802, 74637, 861169, 74759, 337073, 861403, 857943, 861433, 337190, 337209, 798967, 449442, 599524, 599550, 75266, 709559, 75335, 75342, 337509, 599720, 667932, 875417, 75608, 75698, 75717, 337864, 56323, 75801, 338040, 405867, 862345, 600231, 1102390, 862448, 600350, 862640, 76283, 862856, 338637, 1050033, 338696, 338713, 143849, 863112, 76770, 338917, 1017687, 601128, 339009, 863387, 77034, 863623, 863738, 77323, 601624, 601629, 339501, 77391, 601684, 77424, 77491, 339888, 339934, 339981, 340006, 77878, 602352, 864507, 602413, 602652, 78418, 1017966, 340712, 602957, 78730, 603031, 603050, 755878, 341039, 865426, 865476, 865518, 341317, 603773, 865971, 341736, 866101, 79698, 866139, 866251, 604113, 604153, 79891, 362845, 342115, 866428, 342156, 342285, 342450, 604619, 604628, 604673, 1061994, 80590, 604954, 1018359, 80876, 605169, 867490, 81075, 81137, 605467, 605648, 867947, 821068, 81649, 974808, 868184, 606117, 343976, 868410, 81993, 868487, 868525, 82100, 82161, 868598, 344400, 57402, 82293, 57411, 82379, 868919, 995654, 869035, 606944, 712832, 82842, 869308, 869348, 82949, 607292, 607338, 259885, 975040, 607374, 869519, 345350, 345453, 607599, 869759, 869827, 83448, 83458, 869891, 83506, 607855, 83621, 450851, 188714, 1016611, 870348, 608323, 870544, 916306, 870693, 608557, 870861, 870875, 84473, 84520, 844211, 997932, 609024, 84778, 609104, 871376, 85053, 85095, 1091690, 609469, 800652, 609628, 954307, 347491, 609799, 872081, 609956, 610056, 975495, 610128, 610132, 610190, 85904, 872347, 482412, 85954, 348136, 86094, 348242, 1019236, 
610425, 800792, 872632, 86203, 872655, 1019262, 86264, 872777, 872823, 931905, 872855, 610716, 872869, 348594, 872946, 610898, 86624, 669800, 610940, 348869, 873250, 348994, 611199, 611271, 87019, 611366, 611468, 58234, 611747, 873914, 975775, 189355, 87701, 874299, 87892, 87926, 874455, 88160, 612471, 874691, 1090542, 88284, 88375, 612670, 874827, 874876, 858391, 874914, 612846, 88577, 535421, 613179, 613214, 613233, 89143, 622734, 875787, 875796, 1019783, 613694, 757644, 89418, 613755, 875986, 613852, 613870, 89610, 351820, 614047, 89777, 614069, 888777, 89786, 614121, 614186, 614286, 352236, 614409, 90169, 90209, 614598, 876924, 352818, 877161, 1063709, 932639, 615219, 90941, 877453, 615383, 615457, 615624, 91345, 877810, 877845, 91422, 353623, 877938, 91711, 91722, 616045, 91790, 91881, 878367, 616331, 354222, 823596, 616447, 92260, 980726, 354515, 878817, 878840, 92437, 627085, 878959, 92542, 471983, 617167, 617246, 1085497, 321363, 879657, 93234, 93308, 93311, 355458, 879747, 617611, 355484, 998223, 1064155, 617795, 93649, 1020500, 93823, 618223, 452572, 880527, 618408, 618486, 880766, 618818, 356916, 94782, 619087, 94865, 619159, 540109, 357162, 881582, 357340, 881695, 881723, 619675, 95409, 619805, 38098, 882002, 95651, 882141, 358150, 358240, 147090, 96250, 96310, 358455, 96379, 96420, 1091765, 96602, 1020999, 620992, 96749, 883282, 359040, 1094361, 621419, 97295, 359499, 883861, 453175, 97612, 97652, 97766, 759038, 622100, 97895, 97964, 989644, 884533, 98151, 802794, 884722, 693736, 622658, 622725, 884870, 884878, 893642, 42555, 622893, 885081, 98682, 885153, 885184, 98817, 98847, 885301, 885308, 623281, 885433, 885505, 1024599, 99183, 247717, 99267, 234998, 361594, 361620, 885932, 885986, 99556, 623857, 99805, 624143, 362016, 624176, 886332, 624199, 362076, 886382, 100013, 100020, 100046, 822937, 624503, 100250, 100307, 624644, 100364, 584727, 624790, 605651, 497360, 624876, 100616, 100661, 625022, 1065388, 143025, 887392, 887395, 887398, 919913, 625458, 363332, 1065448, 101451, 887906, 363637, 625782, 888100, 626005, 626232, 626318, 888559, 626462, 888796, 978259, 626701, 888911, 888934, 102506, 191853, 888988, 889046, 102627, 626918, 889104, 102695, 483521, 889289, 365044, 627323, 103125, 627513, 1094575, 847415, 628056, 628085, 1022198, 890532, 824000, 628532, 104290, 890890, 978605, 803861, 891082, 891498, 891565, 367290, 517245, 891719, 1049926, 1066161, 17586, 105549, 300306, 760367, 892224, 892329, 892454, 368229, 630391, 106125, 106508, 368728, 630905, 893275, 107077, 893681, 1022762, 107283, 164946, 893789, 631724, 236362, 894161, 632055, 323798, 107812, 632106, 369981, 632192, 370068, 1094727, 108037, 632394, 1094731, 632455, 370316, 894610, 632536, 632625, 108507, 632825, 804523, 370734, 108622, 632923, 370979, 542431, 633153, 149221, 371204, 633350, 760930, 633617, 633635, 1023111, 1042399, 895932, 371695, 633916, 109647, 633986, 633994, 634038, 109819, 634113, 634126, 109841, 634174, 372070, 896383, 518675, 372137, 326637, 634412, 634489, 372378, 634583, 896931, 372792, 635044, 635058, 635079, 897240, 635125, 804905, 635150, 635237, 897401, 373121, 463133, 897476, 373209, 635497, 635626, 897789, 635647, 897910, 897953, 635823, 897981, 111723, 630318, 636188, 280796, 111995, 112035, 280825, 761425, 1094863, 636434, 636437, 898686, 898714, 112318, 149801, 112477, 374690, 636853, 636929, 374799, 636949, 112718, 717845, 899212, 637208, 637234, 899423, 18840, 375291, 929046, 637459, 899800, 899869, 900062, 900076, 900077, 113664, 113732, 375891, 900450, 900599, 638503, 
900696, 900731, 376537, 638723, 900924, 638795, 114573, 114633, 114638, 638928, 849245, 639084, 313262, 639163, 901355, 639412, 639545, 639560, 115365, 674595, 1086248, 377805, 115704, 640103, 115833, 412532, 115930, 640232, 47513, 902657, 79763, 902855, 116431, 116455, 1067990, 902919, 902931, 116517, 1024312, 309402, 893271, 903235, 116820, 849596, 641156, 955359, 116939, 237945, 641284, 641293, 117036, 903479, 117113, 379337, 641583, 641618, 903811, 980789, 904007, 117683, 980811, 117728, 642032, 194430, 642352, 904542, 1100639, 904727, 118365, 118457, 380755, 905057, 905479, 119089, 119168, 905604, 643561, 905707, 643572, 905766, 119534, 119683, 906126, 644023, 119761, 194724, 544260, 119975, 1068584, 906692, 644658, 833268, 644746, 824542, 907046, 907127, 645024, 645252, 121017, 121023, 38946, 645343, 907538, 1095165, 645472, 645604, 850361, 907997, 645892, 908069, 908154, 646071, 908237, 908316, 646179, 908489, 646354, 384406, 646623, 564707, 122440, 831815, 122582, 719488, 122639, 909111, 384845, 909176, 122807, 384985, 647260, 909547, 20520, 647503, 647687, 680490, 909886, 385652, 123529, 647872, 647876, 647949, 123710, 910150, 648049, 648119, 123859, 798469, 910375, 123975, 124128, 910622, 910777, 910818, 910870, 386653, 124534, 545052, 648877, 911032, 911056, 588775, 124787, 649110, 649200, 492988, 649294, 562821, 64528, 257309, 649451, 911605, 108287, 588888, 387603, 649763, 387662, 125545, 649893, 125627, 865384, 387848, 125705, 387864, 650076, 912234, 125842, 125898, 125996, 650378, 650462, 850919, 388465, 388588, 912879, 912898, 126491, 126525, 912961, 912992, 913098, 913137, 913286, 651187, 913374, 913509, 127098, 389258, 913568, 913579, 1098953, 632726, 389385, 127315, 850957, 938773, 127682, 977467, 914321, 914368, 914406, 127984, 914545, 128158, 128166, 128178, 128200, 914637, 1026258, 652556, 914707, 914771, 390484, 1086675, 807845, 178859, 652912, 128633, 390813, 652961, 895263, 653041, 938963, 653054, 653092, 915305, 653187, 391101, 391125, 915544, 129205, 129228, 129229, 915762, 915769, 129491, 129517, 391662, 129641, 129684, 916186, 129792, 129837, 130034, 392195, 654459, 392350, 633375, 392393, 392488, 654633, 392501, 633399, 786477, 916901, 1070324, 917015, 917022, 808200, 1070361, 655046, 21793, 392905, 655057, 392936, 130825, 917283, 917334, 130932, 917489, 393203, 917536, 393268, 393420, 393462, 917789, 371420, 917825, 1026789, 393696, 131597, 131665, 153027, 393881, 131768, 393954, 824920, 131873, 394021, 918324, 131925, 394095, 656250, 656345, 656371, 656376, 132104, 132151, 240489, 132263, 656602, 132317, 132359, 918800, 132473, 656859, 808528, 983299, 132639, 657091, 919310, 395038, 1070728, 657204, 896479, 657264, 133037, 395382, 919712, 395538, 939744, 395786, 657974, 763878, 920218, 808746, 134014, 920458, 658498, 396391, 920717, 920753, 658667, 920885, 789997, 921173, 66154, 134861, 659182, 921348, 659230, 397090, 659247, 135079, 921621, 397417, 921812, 135386, 135464, 135465, 135516, 922024, 135633, 135635, 659929, 660046, 922335, 660220, 922389, 109276, 22670, 136098, 398258, 136157, 922593, 660479, 136209, 660534, 398447, 660672, 660803, 660957, 136700, 503381, 660999, 661028, 661076, 459707, 661398, 399364, 399414, 399527, 137411, 137440, 399617, 137508, 853057, 924047, 661945, 662016, 399970, 137889, 137919, 662282, 662334, 138127, 924567, 662436, 138223, 662524, 138266, 372586, 924895, 400631, 138492, 924978, 400692, 126821, 925059, 138629, 138640, 400803, 663006, 138793, 663131, 139090, 663388, 925571, 401287, 139239, 925766, 23223, 663679, 
110614, 139405, 663771, 401640, 925951, 663820, 926019, 663890, 926064, 663950, 401878, 1028179, 139767, 664138, 139897, 664194, 139929, 402075, 926436, 140161, 402318, 140216, 140238, 926700, 402417, 402427, 926980, 140696, 1090961, 665009, 927196, 140804, 809909, 198246, 140921, 665231, 403095, 927553, 766272, 547820, 403361, 403388, 403454, 161828, 141353, 766301, 722615, 141472, 984774, 665972, 141694, 488021, 403954, 404051, 142039, 928478, 404202, 928567, 928572, 142153, 928753, 589586, 142382, 666694, 142411, 666792, 404713, 142579, 142782, 142831, 667136, 405036, 929372, 405090, 1090329, 929473, 667373, 405238, 405310, 405330, 1028752, 667535, 143293, 143424, 143464, 405660, 1072500, 1072506, 405737, 930124, 155041, 405985, 930293, 930326, 111377, 406140, 679360, 144028, 406181, 930483, 406205, 941515, 930534, 930549, 930621, 406351, 406386, 144254, 144285, 930721, 406525, 406576, 144491, 1034839, 144528, 406718, 746785, 144682, 24115, 144694, 931147, 240504, 669046, 406923, 406974, 144857, 407102, 407131, 669288, 407274, 669427, 669444, 931726, 931772, 810660, 931940, 407662, 669979, 407869, 670022, 145821, 670142, 145877, 408134, 408149, 932495, 417570, 408275, 670437, 670476, 146212, 146244, 146269, 408419, 408427, 932735, 670600, 1090352, 408563, 932878, 670829, 408696, 408739, 146598, 408765, 933132, 408945, 933236, 146812, 409071, 671219, 409143, 898318, 57270, 409207, 147073, 933551, 147166, 933652, 671579, 933742, 147337, 671692, 409557, 933861, 933946, 147542, 409694, 934134, 409854, 409887, 934223, 934235, 672109, 148016, 1052414, 672352, 672433, 792789, 942221, 934795, 148424, 672753, 934964, 148564, 286915, 148633, 243244, 148761, 148777, 148851, 673143, 898631, 935358, 935362, 935364, 811266, 149161, 935707, 680250, 149447, 800243, 411660, 935952, 935973, 855050, 549219, 149670, 549235, 673984, 936182, 149767, 149790, 411953, 936273, 301777, 149853, 936501, 150087, 412319, 412340, 412352, 674702, 412597, 674914, 986316, 412982, 413040, 413079, 820027, 937427, 151011, 156251, 937578, 413404, 675719, 937947, 151547, 938066, 938140, 413858, 413905, 200062, 938359, 676275, 414155, 25344, 414276, 676454, 768133, 152519, 414714, 939020, 414733, 152598, 414757, 414799, 939104, 1029291, 677212, 415165, 153037, 939473, 153048, 677460, 418752, 677519, 415474, 415500, 677672, 939866, 506181, 637254, 563771, 153588, 677936, 415815, 153739, 153794, 415962, 678176, 940386, 1096257, 153981, 768411, 416228, 154301, 416457, 154372, 940916, 940940, 678913, 154633, 416846, 165135, 154785, 941219, 679167, 417040, 417080, 200600, 155056, 808235, 155086, 679390, 155119, 417362, 155234, 417404, 941749, 679658, 941865, 637576, 417664, 679878, 155700, 417902, 417946, 680102, 418032, 418063, 942354, 418165, 680324, 418195, 156052, 680373, 418353, 156215, 942651, 680514, 418423, 156379, 418552, 755907, 942915, 418633, 680951, 92509, 943170, 943190, 594105, 418926, 418977, 156889, 681514, 681264, 157149, 419326, 419333, 681791, 419692, 157580, 681944, 1031173, 682025, 944181, 944194, 944245, 682105, 682205, 944451, 769085, 682365, 900164, 244808, 682425, 420304, 420365, 420400, 944700, 682626, 214771, 944949, 420673, 682910, 113826, 987644, 420867, 683045, 987657, 420934, 987660, 420980, 158887, 26485, 507086, 421145, 159078, 421437, 1075156, 267187, 159667, 421813, 1031502, 507221, 114037, 946428, 422152, 422268, 1031580, 684459, 422398, 160255, 160312, 946747, 160339, 422501, 944231, 946825, 422609, 422624, 684780, 160562, 422827, 684977, 160694, 160735, 685091, 160808, 422955, 990938, 685177, 
682190, 423178, 565856, 161117, 947678, 685591, 947785, 161418, 423608, 423646, 947974, 616415, 423878, 424045, 424092, 948397, 686260, 686290, 948452, 948532, 551309, 1031910, 686469, 424408, 424449, 162351, 424509, 948829, 686739, 686746, 424753, 813536, 162662, 424898, 425072, 1100937, 687245, 163038, 813605, 638849, 687375, 1032074, 425330, 425375, 949686, 988412, 687615, 687632, 425505, 813675, 625205, 163570, 163602, 950139, 249866, 163860, 163912, 950355, 426214, 426347, 1079141, 164282, 426442, 688644, 901206, 426504, 950799, 688711, 1075980, 688739, 426622, 164528, 164912, 689223, 427086, 165002, 165007, 813953, 427340, 202306, 951820, 689700, 817309, 689851, 689885, 952047, 690010, 165807, 907173, 952378, 952388, 428113, 952445, 952452, 166043, 690508, 952658, 690565, 428424, 690606, 166403, 690705, 166468, 952926, 690801, 464663, 36025, 953020, 428773, 690956, 428819, 166680, 428847, 691004, 166748, 691055, 166784, 428941, 953274, 691141, 953332, 953351, 683193, 857956, 953445, 167156, 167204, 691507, 167229, 691709, 167436, 429664, 429675, 167566, 167620, 290091, 464860, 590433, 167994, 430142, 168000, 954455, 168069, 430229, 168175, 945535, 692494, 168238, 954711, 692577, 66707, 1096742, 989296, 955093, 692955, 955117, 168787, 955220, 693101, 814568, 430985, 430989, 693152, 693162, 693297, 693447, 693469, 431481, 693636, 693642, 169390, 955911, 508855, 302435, 956060, 694063, 169778, 858421, 596282, 956403, 432161, 28352, 1033249, 956624, 694560, 694561, 694678, 432602, 837372, 432653, 432680, 694845, 956993, 1090513, 170581, 432811, 432874, 170770, 170788, 695238, 695240, 170982, 433220, 957607, 957688, 433415, 171370, 433549, 433579, 171527, 433680, 433685, 433691, 171691, 695993, 958142, 171776, 241405, 958311, 1033534, 171906, 696217, 696242, 696312, 172062, 696404, 434369, 434462, 696738, 958993, 959034, 172608, 696918, 959083, 434835, 727837, 172787, 959228, 172981, 173001, 435412, 815320, 173391, 435541, 959854, 697780, 960003, 435794, 697972, 697983, 960265, 960302, 436091, 436100, 960397, 960437, 436249, 960566, 698445, 174273, 436475, 174344, 698719, 436586, 436602, 29089, 698828, 174592, 961048, 961097, 436844, 436847, 72809, 946631, 436924, 961255, 699243, 1010700, 437165, 815618, 961579, 437324, 175251, 990414, 961921, 961950, 437671, 699837, 699872, 699873, 437752, 175625, 903097, 437914, 509907, 438058, 700224, 962443, 176015, 176065, 438286, 438316, 438324, 962731, 422600, 438455, 772129, 700618, 700641, 700835, 700871, 176744, 815891, 439061, 176994, 439176, 701335, 701345, 701390, 963564, 177221, 439375, 177238, 145569, 963788, 1034446, 701663, 128113, 439731, 160671, 964054, 964152, 248086, 440098, 422893, 440269, 964577, 1034587, 440362, 1034595, 160787, 178468, 702790, 702792, 702855, 178612, 178627, 440802, 1034679, 1087076, 178825, 160885, 1034703, 699510, 703211, 703270, 441128, 703383, 965578, 292021, 441409, 703765, 510513, 29921, 441734, 860078, 996301, 259128, 947466, 704072, 704080, 826518, 704223, 704236, 991210, 704398, 442377, 30039, 991241, 442455, 442491, 442525, 442593, 564668, 442673, 180592, 967106, 180693, 772864, 443027, 180887, 311540, 180902, 161224, 443081, 204924, 705279, 181144, 181222, 1035098, 181301, 443489, 705681, 181394, 705687, 181476, 181531, 968071, 968206, 372674, 968310, 706167, 706215, 205086, 948797, 706342, 182081, 968560, 968608, 444350, 1035278, 182393, 969023, 969066, 444790, 706950, 760638, 860573, 729508, 969264, 445094, 183046, 538570, 85018, 183201, 707513, 969750, 445494, 707670, 445573, 707721, 904389, 969974, 
707835, 117977, 970152, 183723, 729672, 445908, 970242, 183874, 183880, 1035535, 183988, 184105, 708438, 970605, 708517, 184235, 184249, 643359, 970824, 184436, 708739, 184452, 708781, 708904, 184621, 467683, 446834, 971213, 971233, 336648, 49802, 184916, 971378, 185009, 447169, 709342, 948351, 30860, 447340, 971633, 249321, 971653, 709560, 185276, 971729, 185299, 447540, 447551, 709802, 992132, 972064, 709936, 447797, 30956, 185879, 448035, 448123, 710297, 448183, 186063, 186265, 972699, 205809, 186390, 186446, 710755, 448630, 448745, 710914, 773858, 186727, 448975, 448976, 948653, 973362, 686541, 449235, 449244, 187186, 1079785, 481341, 992407, 973731, 711682, 711710, 711759, 973917, 711803, 711811, 711840, 879150, 449750, 1079868, 974201, 974220, 187818, 118702, 450093, 1036214, 249802, 188134, 974670, 712545, 450681, 992618, 450788, 712944, 450854, 450921, 713134, 992677, 188908, 451070, 1101110, 713360, 189115, 49943, 713448, 189174, 975688, 451406, 189312, 451484, 451609, 189466, 975997, 1088718, 452200, 190078, 190212, 190307, 714636, 714672, 714678, 976829, 714709, 976941, 190601, 381321, 715189, 453220, 453270, 715508, 715588, 453451, 977770, 977952, 191536, 453705, 978057, 191632, 453851, 453856, 453869, 191792, 454018, 191971, 454258, 716641, 978802, 192502, 192579, 979054, 979133, 454872, 192894, 993353, 455273, 556489, 717563, 979713, 455456, 993419, 979787, 512825, 762455, 193422, 717751, 717763, 455659, 455743, 455776, 455782, 455793, 455853, 455862, 980168, 193742, 718112, 193866, 456016, 306806, 193968, 718444, 456305, 980633, 294518, 456443, 456551, 718782, 194531, 981006, 456734, 133977, 194750, 194870, 981400, 1092422, 719411, 195199, 195440, 272047, 719749, 457622, 573899, 382119, 457714, 195582, 457809, 195693, 457842, 862742, 720013, 982348, 458064, 458110, 720261, 982481, 458235, 196111, 1091384, 196232, 196250, 775457, 196453, 1081321, 196596, 458771, 458774, 196720, 458885, 513397, 993996, 1080229, 196949, 196963, 983438, 983451, 197024, 983499, 983543, 721409, 459280, 459291, 128772, 983708, 459481, 197542, 721885, 984075, 1037826, 984178, 459948, 197945, 197964, 984434, 460162, 722352, 984499, 722413, 722515, 460403, 984770, 984856, 984930, 984948, 906901, 984992, 198581, 722981, 985158, 985165, 985167, 985173, 985207, 198807, 985259, 985275, 723144, 985304, 985360, 461078, 985371, 985431, 985433, 985461, 1036656, 461281, 199177, 985644, 985653, 985736, 985752, 461491, 298113, 461601, 985905, 723781, 199572, 295406, 986068, 986162, 986197, 199776, 986210, 724121, 199837, 986325, 986411, 724275, 986427, 986472, 200042, 986484, 986494, 724410, 462301, 724571, 776122, 724579, 200296, 986733, 844658, 994582, 986791, 986793, 724680, 855029, 986852, 724733, 986932, 986935, 986936, 986972, 724872, 724887, 462765, 987066, 724947, 987100, 1010524, 987183, 725047, 987192, 987230, 987237, 462979, 987309, 425688, 987486, 987502, 463230, 907334, 987567, 987573, 201154, 987671, 463443, 987791, 201366, 201376, 987809, 987822, 987823, 987845, 987914, 732631, 463635, 987978, 725867, 725951, 988119, 988121, 988122, 988124, 801059, 988142, 988149, 988169, 988211, 726076, 726098, 988269, 1038527, 988294, 988306, 988416, 202006, 988504, 202073, 988512, 202081, 988540, 988636, 988653, 988710, 464440, 156723, 988742, 988743, 988745, 726614, 464484, 988787, 820161, 988911, 988915, 988954, 988960, 988988, 989042, 989099, 989108, 989213, 202797, 1097885, 727224, 202954, 989396, 995029, 776576, 203039, 1005500, 252295, 989530, 989543, 989573, 776609, 989647, 989676, 727551, 203274, 
203317, 203390, 989831, 727699, 727707, 989855, 989866, 989870, 203458, 989894, 727765, 989912, 727779, 989963, 989994, 990010, 990026, 203646, 990093, 203688, 990176, 990197, 728060, 990223, 728110, 728150, 990307, 990345, 1038871, 990375, 1038879, 466162, 990459, 990481, 466202, 990526, 466252, 1045554, 728460, 466335, 990649, 990763, 990784, 990841, 990852, 1097939, 466640, 728823, 728836, 990995, 466738, 991032, 991044, 466774, 991064, 991079, 991111, 991138, 991171, 1082701, 991207, 991240, 729173, 991324, 991342, 991364, 685717, 991383, 991419, 991471, 205107, 467274, 991590, 991598, 991662, 205251, 991685, 991748, 991761, 991762, 991782, 991832, 729697, 991854, 467597, 991894, 467612, 991938, 514851, 165335, 992120, 792463, 205741, 992184, 992191, 992193, 992224, 50498, 992257, 992340, 992363, 992365, 992367, 730229, 992383, 730278, 992433, 992531, 992535, 206117, 992559, 992605, 992652, 992659, 992660, 992729, 78076, 992757, 730626, 992802, 992839, 992840, 992869, 165480, 992946, 992949, 992950, 206549, 993041, 468762, 993107, 993153, 993174, 993178, 468907, 993234, 820899, 206806, 206819, 993255, 993320, 993492, 993501, 427730, 993544, 993606, 993627, 993651, 1010173, 207251, 731545, 993748, 993795, 993821, 469535, 993834, 731723, 993876, 731736, 993883, 993987, 994005, 994012, 207595, 731886, 731902, 994070, 994085, 994087, 469819, 994112, 994133, 994228, 470001, 994311, 994338, 994397, 732288, 994449, 994478, 994479, 994533, 574317, 208145, 732448, 208198, 470385, 994688, 208265, 1083293, 470459, 515317, 732618, 208339, 994792, 994830, 208411, 818421, 470611, 994918, 208494, 994947, 932223, 208610, 995125, 864905, 995141, 995176, 995212, 995221, 1045709, 208822, 470982, 995280, 471007, 733186, 995380, 995443, 995526, 733422, 995576, 995595, 995598, 879869, 733510, 297019, 733591, 995756, 995789, 995805, 995806, 995825, 733692, 471705, 996011, 733892, 996042, 996054, 209651, 996119, 775138, 996181, 209764, 209730, 996272, 472024, 996328, 734198, 996414, 1092450, 734426, 559318, 996634, 472359, 472448, 1083642, 996835, 210442, 296441, 996922, 997044, 997086, 1083686, 997122, 1083704, 997227, 997351, 914845, 997449, 997481, 735343, 735384, 735387, 997533, 997542, 473319, 251445, 997648, 997649, 473394, 997713, 997744, 473492, 909048, 997860, 997872, 997878, 211468, 997913, 997935, 1083839, 998013, 735895, 998062, 998101, 211691, 473886, 998192, 473935, 998246, 998247, 998248, 736125, 998309, 998381, 998417, 998482, 998493, 474234, 1083926, 1040238, 998569, 998591, 998609, 212195, 998646, 998658, 212236, 998675, 998680, 998681, 1083952, 474419, 998735, 998834, 736713, 212435, 998891, 998903, 212477, 998941, 474659, 996623, 999028, 999086, 999089, 999110, 696677, 474873, 996653, 909273, 999192, 212796, 999261, 559771, 999356, 999385, 999391, 865616, 999416, 999439, 999517, 999518, 999550, 999552, 999555, 999567, 999610, 999637, 737512, 865660, 999685, 475402, 999756, 999791, 999836, 999897, 999921, 999942, 1000000, 1000004, 1000006, 1000017, 1000030, 1000083, 737940, 996805, 1000097, 1000170, 996825, 1000232, 1000272, 738162, 738165, 1000459, 214040, 1000509, 1000585, 909506, 1000619, 738484, 1000678, 1000681, 476483, 1000798, 1000864, 1000906, 1000951, 560059, 738931, 476807, 1001108, 574051, 822218, 476947, 476977, 1001279, 1040703, 997017, 1001381, 477100, 1001397, 1001454, 477286, 477309, 477380, 739599, 1001810, 739671, 739743, 1001903, 1001926, 477639, 1001981, 210690, 1001999, 215603, 1002058, 487279, 1002145, 1002148, 1002197, 1002238, 1002252, 1002274, 50833, 1002330, 
478054, 478063, 740263, 35996, 1002426, 1002482, 1002554, 740416, 1002584, 1002585, 1002596, 478359, 1002716, 1002737, 740624, 1002887, 1002889, 740762, 1002938, 1002940, 478691, 740852, 1002997, 1003003, 1003006, 1003015, 675320, 478827, 1003210, 1003213, 560419, 1003239, 478981, 1003277, 1003299, 1003329, 1003334, 1003351, 1003359, 1041043, 741274, 1003445, 1003481, 1003482, 1003507, 741392, 1003557, 1003561, 479284, 1003590, 1003603, 479379, 217246, 469873, 822642, 36214, 479525, 1003831, 1003849, 479570, 1003875, 1003880, 1003884, 1003973, 1003997, 741970, 741977, 731759, 742022, 1004191, 1004199, 1004228, 1004233, 1004240, 1004243, 1004254, 862701, 1004322, 480064, 565915, 1084905, 218000, 1004493, 827791, 742446, 480504, 742667, 480536, 298550, 1004921, 1004940, 1004949, 298565, 742822, 1005113, 1005131, 742988, 473361, 1005163, 1085035, 743046, 1005191, 480932, 36473, 1046047, 1005475, 1005476, 1005520, 1080968, 517117, 481297, 1005586, 1005653, 481387, 1005678, 1005798, 743668, 743675, 743693, 743696, 1005949, 1006000, 743868, 997808, 1006199, 744092, 481961, 744109, 954144, 744261, 1006459, 1006489, 1006509, 220087, 1006578, 1006580, 36703, 1006751, 482496, 1006791, 1006852, 744764, 1006911, 1006987, 744891, 482808, 517386, 220761, 1007242, 688218, 483028, 1007382, 129565, 483178, 483241, 745402, 1007550, 211621, 1007606, 745469, 1007628, 1007673, 1007691, 1007696, 745559, 998093, 1007875, 745746, 1007934, 745794, 1007959, 1007972, 745830, 483795, 745944, 221664, 746055, 1008208, 746065, 998174, 1063607, 1008515, 1008516, 80712, 746438, 222158, 118448, 1041951, 1008830, 484551, 1008911, 1008947, 1008951, 1008968, 1008977, 1008979, 1009023, 1009109, 823549, 1085697, 1009183, 1009237, 222954, 1009388, 561448, 747345, 1009527, 485287, 223165, 1009610, 1009668, 1009695, 1009724, 1009742, 1009749, 485558, 747720, 223468, 1009959, 1009961, 1009994, 51090, 1010048, 1010057, 1010059, 1042158, 747937, 747985, 1010151, 998485, 748054, 736347, 1010277, 1010287, 1085888, 748321, 1010527, 1010537, 486274, 1010607, 1010615, 486370, 1010670, 996317, 307008, 605363, 224314, 486512, 748672, 486623, 748771, 224548, 1011003, 1011018, 1011021, 1011044, 224626, 748935, 998641, 1011120, 1011140, 748997, 1011166, 1011328, 212251, 1011381, 1011382, 749244, 749267, 1011512, 1011529, 749399, 1011618, 1011663, 212303, 1011721, 487569, 1011860, 749752, 94798, 225499, 1012026, 998802, 749955, 225752, 837740, 750111, 824080, 1012329, 1012464, 488198, 839488, 37685, 1012547, 226132, 750421, 750487, 998905, 488416, 226335, 1012780, 1012865, 1012866, 226461, 488676, 750821, 488711, 998965, 750946, 488825, 1013114, 1013229, 1013267, 1013304, 1086391, 1013367, 1013424, 1013492, 824282, 1013579, 1013592, 1013615, 212634, 489374, 227317, 1013797, 489513, 1086477, 751778, 751797, 1013965, 227591, 824371, 1014115, 1014132, 37952, 1014210, 489931, 1014242, 649640, 1014264, 36965, 227968, 167371, 490505, 752700, 1014884, 228474, 1014911, 300246, 1015055, 780850, 490802, 228738, 490883, 753040, 490903, 753071, 1015307, 753168, 1015347, 753214, 753299, 1015556, 753480, 1094395, 1015641, 261650, 753517, 212977, 229325, 1015766, 169305, 491585, 81945, 1016013, 1016015, 999469, 999481, 1016154, 1016254, 754113, 562594, 754166, 754191, 1016406, 1092441, 1016460, 1016547, 1016565, 1086933, 1016583, 230179, 754509, 1016676, 1016703, 431602, 1016790, 1016879, 1016915, 754786, 1087589, 1016943, 10264, 1098806, 492681, 1087001, 999641, 492853, 1043337, 230725, 755040, 1017204, 755093, 1017276, 230891, 1017348, 1090915, 999691, 
755275, 824938, 1017476, 1017498, 1017524, 1017529, 1017537, 231109, 755459, 1017605, 755465, 1017692, 1017706, 231292, 231298, 1017734, 1017773, 1017775, 1045527, 493508, 1017830, 493543, 1017892, 231482, 1017952, 1017971, 1018032, 1018056, 38608, 231717, 193581, 985372, 231877, 494086, 1043545, 1018525, 494346, 1018658, 825147, 1018807, 863187, 1018918, 494730, 1013570, 756949, 232703, 1019179, 1019200, 495018, 1019356, 495082, 1019405, 1019414, 868953, 757275, 1019433, 1019470, 1019602, 757511, 836044, 1019705, 1019724, 6791, 495483, 1019787, 1019830, 495680, 1087532, 1020198, 758074, 1020244, 233904, 1020376, 496175, 496244, 234114, 496276, 234165, 758519, 1020710, 1020724, 563347, 234388, 563359, 758720, 1020907, 1020915, 912899, 968004, 496717, 758901, 758909, 1021065, 234644, 234651, 1040312, 344955, 759021, 1021170, 1000319, 759062, 1021241, 234821, 1021277, 1021318, 1021324, 1021327, 825583, 1087735, 497107, 497132, 1021446, 235027, 235089, 1021532, 1021554, 1021605, 1021639, 759503, 759515, 1021679, 1021682, 1021695, 497470, 1021797, 497536, 497596, 1021900, 1021907, 1021931, 235534, 1021971, 1022022, 497757, 1000472, 1022124, 1022132, 924398, 1022178, 760070, 235832, 257772, 1022359, 1022370, 1000519, 1022410, 1022442, 1087915, 1022577, 39360, 1044249, 1022620, 1022621, 694726, 1022630, 760512, 738422, 498398, 1000574, 1022712, 1022735, 498478, 1022769, 1022782, 1022832, 236427, 1022907, 1022911, 760817, 388950, 236580, 236582, 1023025, 760908, 236708, 761032, 236801, 1088043, 761096, 738525, 499068, 400696, 1023363, 236949, 499126, 761388, 499413, 761627, 1023782, 237370, 237373, 1023838, 1023850, 499568, 1088153, 237561, 1024034, 1024069, 499818, 761963, 1024166, 1024176, 499904, 762059, 1024221, 1055921, 762111, 1024288, 1024300, 1024305, 313438, 237936, 762296, 870422, 1024528, 1024591, 1024592, 1024667, 1024669, 1024672, 762558, 1024727, 826153, 762652, 1024835, 1024893, 1024904, 762761, 1024950, 1000959, 1088347, 618979, 1099077, 1025188, 763084, 1025259, 1025270, 1025290, 238886, 476724, 1025348, 1090358, 1025483, 239189, 1025624, 1090558, 1044755, 1025714, 763619, 1025801, 1011811, 39908, 1025895, 1099105, 239511, 239516, 1025991, 239648, 1026098, 1026148, 239830, 764139, 1026372, 165116, 258485, 1026711, 1026768, 1026775, 1003114, 1026799, 764691, 1026991, 240584, 1083997, 1027178, 1027209, 765147, 1027373, 741267, 1027650, 765512, 1027669, 503390, 503401, 8701, 765583, 608197, 1027785, 1027812, 1027817, 1027865, 503580, 503607, 1027919, 503674, 1028098, 503833, 1028131, 863499, 504044, 766202, 766238, 242019, 242061, 242103, 1028538, 242107, 1028555, 504306, 1028598, 1028608, 504335, 242219, 1028652, 1088947, 1028670, 1028711, 1028742, 1028753, 1028755, 1028796, 766769, 84106, 766804, 766808, 1029003, 1029016, 1029030, 1029031, 1029058, 1029124, 1029181, 1045347, 242863, 767248, 505107, 127876, 1029402, 505152, 505171, 1029492, 1029544, 1029552, 243139, 1029617, 1029681, 1029694, 1029772, 1029791, 767671, 505541, 243416, 767745, 1029908, 1029909, 1045494, 505810, 243712, 1030176, 558046, 243761, 1030215, 1030230, 1030271, 506025, 1030324, 1030378, 1030381, 1030388, 1030446, 1030451, 244092, 1030617, 1030623, 1030722, 506438, 1030823, 302878, 506579, 477648, 1030924, 1031032, 1031033, 1031047, 1031054, 1031118, 1031240, 244821, 506985, 244902, 241246, 244929, 507087, 1031456, 245120, 739913, 507381, 1031679, 1031682, 1031684, 507434, 769630, 303045, 245416, 1031861, 156566, 1031909, 1031976, 1031999, 1032011, 1032019, 1032156, 1032182, 507901, 1032198, 507934, 1032281, 
770167, 1032341, 245921, 770233, 508104, 508316, 783781, 1099746, 770604, 1032758, 1032822, 837467, 1089670, 1033007, 770894, 246626, 1033092, 1033205, 1033250, 1033296, 771170, 771239, 1033398, 509111, 509114, 1033443, 771314, 575616, 1033580, 247194, 1033652, 1046093, 1033703, 1033718, 1033725, 1033759, 776392, 771694, 771734, 1033912, 1033927, 1033962, 509730, 1034039, 1034050, 488345, 1034136, 1034172, 772055, 1034204, 510018, 478220, 1034409, 510152, 510158, 510229, 827801, 1034666, 1034680, 510444, 1034761, 478295, 248385, 1034845, 1035006, 772928, 1035078, 510858, 510867, 510893, 226509, 1035228, 1035247, 773155, 1035321, 1035367, 1035379, 1035383, 511101, 1035410, 249118, 249176, 511330, 511367, 511417, 1035719, 1035805, 1035861, 1035874, 1035931, 1036002, 1036005, 249618, 828036, 773924, 511837, 773998, 511861, 249792, 774087, 1046520, 1036244, 249821, 512087, 1036380, 1036385, 828093, 595236, 244011, 1036542, 512278, 427323, 1036627, 250228, 1036675, 512405, 1090291, 1036782, 1036784, 250367, 1036800, 1090311, 512564, 512685, 1036999, 774866, 1037033, 250636, 512807, 1037104, 1037116, 1037188, 1090377, 1037250, 863817, 1037302, 1037341, 513061, 1090399, 1037373, 740876, 1037407, 357519, 775297, 1090413, 775343, 775355, 776517, 251172, 1099495, 1037662, 1037686, 1037689, 1037722, 1037781, 1037817, 1037872, 1037881, 281002, 283154, 513779, 1038161, 391481, 156688, 1038184, 1090537, 776080, 1090550, 1090789, 1090796, 1090596, 1101531, 1038592, 776465, 174249, 1038678, 1038685, 532142, 523413, 1038719, 1038724, 1038755, 1038830, 776700, 1038849, 1046969, 959589, 1039002, 1039052, 514767, 252632, 1003319, 1039195, 1039298, 1090727, 1039346, 1090730, 1039361, 837681, 828588, 777235, 777297, 515185, 1039495, 1039521, 1039586, 515335, 777519, 1039728, 1039746, 515573, 777792, 1040022, 1040030, 1047152, 1040038, 1040064, 1040082, 1040088, 1040099, 515813, 778095, 778139, 1090887, 516029, 1040353, 253965, 253966, 1040409, 1040461, 1040507, 1040532, 1040684, 820267, 1040694, 516413, 1040793, 1040848, 1040959, 778857, 778890, 1041050, 254652, 778948, 1041146, 1041159, 1041226, 1021053, 994867, 1012328, 517085, 1003695, 1091080, 255027, 1091082, 1041473, 1041520, 779475, 779553, 1041703, 1041714, 1041753, 357664, 517516, 779674, 255469, 1041924, 1091158, 1041948, 617968, 560357, 255633, 1042099, 1091189, 829050, 1091206, 780215, 1042364, 1042426, 780297, 780336, 256052, 1042488, 1042507, 1091269, 256192, 1042626, 1042676, 1042752, 780613, 1042800, 427532, 1042978, 1043064, 794160, 780993, 256783, 781074, 518940, 1043413, 829087, 519145, 471197, 257018, 994941, 1043568, 1043587, 1043658, 1043702, 1091461, 257335, 1043815, 1043914, 1043955, 1043969, 1043995, 781877, 1044041, 782079, 1044244, 1004167, 257885, 782253, 904295, 520184, 86701, 782381, 782417, 782426, 1099726, 782549, 258337, 1091633, 1044809, 782696, 520627, 520636, 1047365, 1045071, 1045072, 520816, 1091692, 1045135, 1045203, 1045208, 1091706, 1045227, 1045229, 783098, 558548, 521018, 1045374, 783277, 611152, 1045540, 1045567, 783433, 783843, 259239, 521402, 1045717, 783602, 1045826, 783687, 259417, 1045853, 1045855, 525467, 783822, 1046042, 1091850, 521801, 783963, 783981, 521851, 1046161, 259763, 259781, 1048185, 522076, 1046384, 1046387, 522151, 1046463, 1046475, 1046567, 1046569, 260172, 1046648, 784549, 1046736, 1046750, 1092517, 1048282, 1091973, 784700, 1099803, 1046931, 784805, 1046952, 567714, 1047010, 1047012, 1047088, 784961, 1047138, 1092029, 1047160, 1047162, 260762, 522953, 567759, 1047269, 260853, 785176, 523062, 
1047386, 1099823, 261098, 524116, 1047548, 1047556, 1047592, 1047599, 1047625, 1047629, 1047642, 1047662, 743708, 1047700, 1047702, 1047708, 1099831, 1047738, 1047794, 1047833, 1047843, 1047854, 1082377, 785721, 794319, 523621, 830649, 1047913, 785772, 1047917, 261521, 1047987, 830040, 261652, 261683, 786009, 786021, 523952, 1048281, 786157, 1048303, 43649, 1048359, 1048361, 1048363, 1048377, 1048381, 734979, 524166, 968921, 786375, 1048565], + 'msmarco-passage-test-subset': [57, 524369, 43707, 1048716, 524438, 524469, 786761, 262542, 412, 262636, 524790, 1049088, 1049092, 1049096, 262686, 349622, 1049177, 1049180, 1049202, 611785, 262878, 262883, 525069, 1049390, 1049419, 1049537, 787526, 525433, 1214, 1049881, 1416, 1425, 787957, 1005146, 788034, 1050225, 1050241, 1050274, 788151, 263874, 1050405, 1937, 526266, 2045, 526352, 175123, 1050794, 1050808, 87762, 264434, 526597, 264602, 2610, 612105, 1051206, 2663, 789140, 1051326, 527126, 789305, 1136443, 1051570, 265198, 527398, 340540, 3243, 1051868, 3297, 1136507, 265494, 527647, 527745, 655939, 1052076, 306421, 789981, 1052164, 790111, 790118, 1073795, 1005502, 1136591, 265976, 790280, 3867, 528174, 266150, 1052743, 1052835, 266488, 4383, 656101, 1053031, 266611, 1053061, 481345, 266752, 791170, 656150, 529043, 4776, 838148, 1053411, 1126209, 88200, 791397, 529272, 5000, 1136818, 1053809, 1053885, 1126223, 529769, 267705, 792006, 792143, 1126077, 530079, 1054386, 268010, 530174, 1005869, 962187, 1054576, 530295, 6055, 268235, 6113, 1054733, 1054749, 1054797, 1054959, 268574, 792925, 793003, 1055227, 1055343, 793245, 1006025, 1055448, 1049736, 793432, 531296, 531311, 531334, 531470, 1055761, 793699, 269425, 576361, 525514, 831384, 794144, 908077, 1056313, 1113870, 1049877, 269960, 1056425, 7869, 794347, 1056576, 532352, 229260, 8136, 270297, 532494, 270378, 794701, 1056910, 8452, 1057030, 1057038, 1057241, 1057246, 8718, 533268, 533312, 795460, 307344, 87592, 9082, 533379, 533428, 795622, 271435, 533613, 271481, 1057936, 533677, 271544, 795872, 271620, 271672, 1058138, 271748, 533953, 271835, 534021, 271881, 796223, 534202, 272071, 534250, 534279, 796451, 831794, 272269, 796563, 10141, 1058856, 796808, 534684, 1059138, 1059177, 1059186, 272863, 535009, 797200, 535067, 272951, 10941, 1108953, 1059643, 797562, 797565, 535431, 535449, 11172, 11258, 927093, 273443, 11304, 535610, 1059906, 535668, 797815, 535699, 797862, 307758, 1060142, 798003, 273773, 798239, 536101, 11863, 798354, 875806, 9129, 536330, 12048, 1060669, 274306, 12166, 1060768, 798642, 1060897, 274522, 274555, 274605, 1050658, 536822, 798979, 12553, 274731, 274771, 536931, 274873, 799171, 1061326, 12761, 190377, 1061434, 799293, 799323, 537188, 537223, 12954, 537267, 1061590, 275173, 13101, 1061763, 537545, 275413, 1061852, 537610, 799784, 537744, 799932, 537821, 275722, 275737, 800142, 538026, 538078, 275951, 13823, 800274, 538143, 13912, 800348, 276099, 614001, 538373, 538393, 857280, 276343, 14244, 800703, 855922, 538654, 276525, 14448, 1063049, 657776, 276665, 800980, 538850, 14571, 538878, 538943, 276928, 277093, 733738, 277177, 795754, 1063665, 1063739, 788960, 745278, 539535, 539566, 539595, 395821, 802079, 277868, 1064344, 1051205, 278034, 1064595, 540318, 1064659, 278239, 540436, 802584, 802667, 278403, 540572, 802776, 540655, 802817, 278684, 278691, 655097, 1065206, 540951, 803115, 803296, 541155, 541229, 541238, 1065534, 279176, 17077, 803596, 17199, 803633, 1065923, 541708, 803911, 803948, 1117901, 1066186, 804098, 585434, 1066264, 542015, 280019, 1066463, 804329, 
1066501, 18075, 920524, 18164, 542582, 804764, 804916, 280704, 18711, 18793, 543290, 805501, 1067743, 1067801, 543631, 543638, 1008125, 543772, 805998, 19684, 1068313, 1068315, 281922, 806258, 833432, 544191, 282055, 544220, 864839, 1126711, 282125, 20010, 1068587, 806450, 1119189, 527633, 282280, 806602, 544540, 544560, 943412, 456029, 544623, 806779, 544722, 20440, 20530, 20616, 20734, 323665, 20892, 1069557, 21003, 21075, 545410, 283348, 21227, 807811, 807866, 21444, 545791, 1070083, 807974, 283924, 21792, 546081, 1008523, 21838, 1070417, 808343, 284067, 546217, 1070511, 1070533, 808540, 546404, 22197, 22256, 546597, 1070941, 22372, 808898, 546758, 921267, 790199, 1071255, 284910, 1071370, 22817, 547140, 285032, 285049, 809339, 547374, 547405, 809570, 809594, 547515, 809683, 341207, 547627, 23367, 547661, 547711, 809913, 547770, 23531, 285797, 548020, 548054, 23822, 23910, 1072522, 548237, 23963, 1072559, 23986, 810496, 24093, 810631, 834181, 810830, 286573, 24462, 1073093, 810958, 811001, 24636, 965313, 811245, 549110, 286980, 24841, 549190, 943566, 549213, 549237, 811447, 811457, 25026, 549327, 1073638, 91565, 572170, 811650, 811758, 549663, 703281, 549722, 25465, 266390, 811974, 812006, 1074152, 812148, 484886, 1074350, 25802, 1074449, 397579, 812644, 288566, 1124469, 222861, 812967, 572446, 48170, 813370, 551271, 1075711, 27310, 813770, 289499, 289517, 551683, 813841, 922089, 27528, 551851, 1076183, 289801, 814107, 27705, 1076289, 289943, 814248, 814340, 1076490, 552319, 1076615, 552364, 290290, 552458, 552459, 552563, 290429, 814791, 552670, 814827, 28453, 814908, 814920, 1077110, 552834, 28661, 552956, 552959, 28684, 28862, 553161, 553530, 553571, 291553, 1078015, 1078080, 815939, 354466, 29667, 553988, 29725, 29785, 1078365, 29811, 1078498, 816456, 292227, 292284, 554435, 30163, 1078745, 1078766, 554529, 1078827, 30282, 554585, 30359, 816794, 227967, 816915, 554792, 816994, 30649, 30677, 1079280, 1053723, 293041, 293069, 555353, 31169, 31192, 136287, 747961, 293516, 555674, 31504, 1080183, 743021, 31825, 818265, 583532, 32202, 1080843, 32278, 556581, 32330, 1080912, 556637, 294538, 294614, 556923, 556935, 557046, 1081348, 1081455, 966679, 295077, 1081591, 33137, 1081774, 295361, 557592, 819755, 1081930, 923070, 557806, 295730, 295732, 820021, 620882, 557944, 557952, 558003, 996237, 617318, 820387, 311494, 558303, 792115, 296378, 34322, 341578, 558809, 558963, 558978, 297010, 297058, 34925, 297146, 35150, 821619, 821676, 9904, 559629, 35370, 442810, 297682, 822224, 560231, 36033, 822527, 560709, 560815, 823034, 561064, 70230, 823415, 823541, 561398, 37122, 561461, 37185, 561499, 299381, 561538, 299461, 9975, 561834, 37547, 299709, 299781, 561991, 37706, 562002, 562030, 824384, 562352, 562373, 38087, 38122, 574337, 824644, 300375, 300384, 312219, 824765, 300537, 562697, 1054994, 562904, 825151, 874196, 825453, 301352, 563542, 39325, 603617, 825961, 563898, 39660, 563962, 302038, 826354, 564283, 399884, 564310, 564382, 40124, 564558, 826731, 40648, 827155, 94173, 40892, 827381, 41048, 827488, 565366, 41184, 793327, 827847, 402915, 303637, 303706, 828068, 303874, 566216, 487606, 42055, 828518, 566407, 618695, 304358, 304380, 304430, 828723, 304449, 828744, 566615, 566618, 1107117, 566732, 42476, 42541, 50782, 566937, 829107, 567024, 829295, 305205, 567416, 43167, 567505, 43476, 567820, 567870, 567878, 567881, 567899, 830234, 830306, 830462, 306216, 568405, 568511, 830709, 44340, 1019841, 44537, 831024, 968631, 44621, 44659, 44678, 306873, 569047, 569053, 831217, 569196, 569409, 
569527, 307403, 45301, 542712, 831882, 1012478, 569862, 569902, 307826, 45895, 832389, 925169, 832897, 308617, 570789, 570875, 46683, 571009, 571014, 571087, 833302, 309040, 1118879, 571215, 46941, 309117, 833544, 47125, 571431, 571432, 47269, 833797, 571671, 1110643, 182697, 47714, 47729, 309926, 619655, 572364, 572432, 310488, 834856, 310642, 834934, 1136837, 572884, 572957, 572976, 1136401, 48846, 573221, 1056764, 1056770, 573471, 573622, 835783, 49381, 49387, 8234, 573701, 49429, 49439, 49482, 835940, 836003, 836007, 836062, 574068, 49810, 574114, 49850, 49984, 848382, 836498, 857520, 50189, 836636, 836655, 312378, 836888, 50626, 312826, 837140, 1119006, 50797, 50800, 708745, 837287, 50863, 837383, 575259, 837433, 575378, 663878, 575461, 796050, 575606, 575621, 313491, 51360, 51514, 838000, 445502, 51577, 313747, 313766, 838235, 314064, 838377, 838421, 576292, 838440, 576305, 576357, 314262, 52199, 838709, 576792, 52604, 576964, 980518, 577234, 839463, 577546, 839707, 53330, 839841, 53422, 1119097, 577746, 839899, 839912, 315683, 857232, 840136, 1102456, 1102474, 1102477, 1102488, 1102498, 1102578, 1102579, 1102589, 1102590, 1102617, 1102645, 1102657, 1102667, 1102682, 316262, 1102704, 1102714, 316302, 1102744, 1102755, 1102768, 1102803, 1102811, 1102827, 1102839, 1102854, 1102862, 1102869, 1102892, 1102895, 1102970, 1102979, 1102989, 1103007, 1103009, 1103019, 1103076, 1103084, 1103089, 1103091, 1103093, 1103121, 1103136, 1103156, 1103166, 1103182, 1103192, 1103198, 1103250, 1103257, 1103260, 1103289, 1103290, 1103314, 1103322, 54747, 1103350, 1103355, 579092, 1103387, 1103416, 1103418, 841302, 1103467, 1103468, 1103502, 1103506, 1103511, 54958, 1103535, 1103537, 1103553, 1103555, 1103561, 1103579, 1103581, 1103601, 1103630, 1103645, 1103651, 1103654, 55079, 1103669, 1103679, 1103684, 1103687, 1103690, 1103759, 1103787, 1103793, 1103798, 1103816, 1103826, 1103828, 1103879, 1103888, 1103906, 1103910, 1103911, 1103921, 1103966, 1103969, 1103974, 1104005, 1104020, 1104022, 55454, 1104036, 1104064, 1104071, 1104077, 1104087, 1104099, 1104105, 1104118, 1104124, 1104175, 842042, 1104198, 1104215, 1104221, 1104223, 1104235, 1104252, 1104258, 1104279, 1104339, 1104403, 1104406, 580130, 1104454, 1104458, 1104468, 1104497, 1104506, 1104509, 1104513, 795757, 1104537, 1104557, 1104624, 1104630, 1104633, 1104640, 1104685, 1104698, 1104699, 1104704, 1104712, 1104720, 1104725, 1104740, 1104745, 1104763, 1104773, 1104781, 1104805, 580605, 842753, 1104915, 1104920, 1104942, 1104949, 1104950, 1104957, 1104984, 1105013, 1105017, 842886, 1105042, 1105073, 1105086, 1105087, 1105100, 1105108, 1105110, 1105142, 1105144, 1105146, 1105147, 1105148, 1105163, 1105169, 1105183, 1105190, 1105202, 318781, 1105216, 1105239, 1105248, 1105253, 1105263, 1105275, 1105280, 1105285, 1105287, 1105298, 1105337, 1105358, 1105364, 1105381, 1105425, 1105427, 1105429, 1105431, 1105432, 1105435, 1105441, 1105445, 56894, 1105485, 1105489, 1105498, 1105506, 1105515, 1105526, 1105551, 1105565, 1105567, 1105571, 1105585, 1105593, 1105594, 1105614, 1105617, 1105625, 581377, 1105666, 1105686, 1105689, 1105700, 581447, 1105753, 1105761, 1105766, 57218, 1105798, 1105805, 1105816, 1105817, 581535, 1105831, 1105853, 57292, 1105870, 1105871, 1105886, 1105900, 581663, 1105953, 1105978, 1105989, 1106011, 1106027, 1106029, 1106088, 1106089, 1106099, 1106125, 1106126, 581844, 1106141, 1106144, 1106159, 1106162, 319742, 1106180, 319757, 1106200, 1106201, 1106212, 1106216, 1106231, 1106234, 1106241, 57674, 844113, 1106273, 844140, 57710, 
1106290, 1106291, 1106306, 1106327, 1106343, 1106348, 57774, 1106377, 1106381, 1106389, 1106408, 1106414, 582128, 1106421, 708693, 1106440, 1106450, 1106482, 1106491, 1106502, 1106508, 1106516, 1106521, 1106533, 1106535, 1106539, 1106543, 844433, 1106589, 1106597, 1106607, 1106617, 582339, 1106642, 1106652, 1106658, 1106672, 1106676, 582390, 1106680, 1106686, 1106709, 1106756, 1106764, 582484, 1106797, 1106821, 1106829, 1106834, 1106840, 1106850, 1106858, 1106862, 1106873, 1106912, 1106920, 320499, 1106988, 1107000, 1107015, 1107023, 1107033, 1107046, 1107082, 1107085, 1107091, 1107103, 1107108, 1107112, 320700, 1107141, 1107147, 1107149, 1107151, 1107153, 1107171, 582884, 1107189, 1107192, 1107193, 1107207, 1107210, 1107245, 1107299, 1107308, 1107336, 1107344, 1107353, 1107362, 1107364, 1107373, 845232, 1107381, 1107399, 1107401, 1107450, 1107455, 845321, 1107494, 1107525, 583249, 255251, 1107563, 1107565, 1107593, 1107602, 1107618, 1107640, 1107646, 1107677, 1107685, 1107702, 583443, 1107748, 1107749, 1107794, 59230, 583527, 1107834, 1107845, 845740, 1107885, 845751, 1107898, 1107915, 321504, 1107953, 1107960, 1107970, 1107982, 1107991, 1108009, 1108011, 845894, 1108071, 1108099, 1108121, 1108131, 321703, 1108147, 1108202, 1108203, 1108216, 1108227, 1108241, 1108268, 1108271, 59722, 59725, 584017, 1108332, 1108400, 1108406, 846308, 1108462, 1108472, 1108478, 1108481, 59911, 1108492, 1108510, 1108516, 1108521, 1108523, 1108526, 1108564, 1108607, 1108626, 1108629, 1108636, 1108637, 1108645, 1108658, 1108699, 1108718, 1108735, 1108738, 1108754, 1108763, 1108789, 1108799, 1108833, 1108847, 1108874, 1108875, 60301, 1108897, 1108911, 1108914, 1108922, 1108935, 1108950, 1108959, 1108975, 1108983, 1108984, 1108985, 1109002, 1109022, 1109040, 322609, 322610, 1109048, 1109110, 1109114, 1109188, 1109190, 1109195, 1109201, 1109214, 1109215, 1109238, 403328, 1109261, 1109311, 1109319, 60764, 1109379, 1109397, 60831, 1109408, 1109413, 1109436, 847301, 60870, 323018, 1109462, 1109463, 1109464, 1109471, 1109473, 1109474, 1109477, 1109487, 1109496, 585220, 1109525, 1109537, 1109540, 1109542, 1109546, 1109563, 1109571, 1109579, 1109599, 1109615, 1109616, 1109628, 1109655, 1109657, 1109658, 1109694, 1109701, 323294, 1109768, 1109784, 1109788, 1109794, 1058779, 1109805, 1109822, 323393, 1109853, 1109872, 585585, 323461, 1109901, 1109917, 1109974, 323565, 1110081, 1110087, 61526, 1110163, 1110169, 585888, 1110189, 1110196, 1110203, 1110213, 1110215, 1110217, 1110234, 1110246, 1110264, 1110275, 1110284, 1110294, 1110295, 1110314, 1110321, 1110322, 1110326, 1110337, 1110344, 1110353, 1110357, 1110388, 323959, 1110392, 1110401, 1110410, 1110423, 1110426, 1110468, 1110470, 1110498, 1110512, 1110523, 1110576, 1110605, 62078, 62097, 1110687, 1110698, 1110730, 848623, 1110776, 1110836, 578356, 1110849, 1110858, 1110868, 1110874, 1110903, 1110905, 1110929, 1110936, 1110964, 1110997, 1111023, 1111024, 586754, 586761, 1111057, 1111071, 1111099, 62525, 1111119, 1111132, 1102693, 62577, 1111156, 1111168, 1111205, 1111241, 1111292, 1111306, 1111313, 1111316, 1111338, 1111345, 1111377, 1111392, 1111396, 1111400, 62845, 587145, 1111470, 1111472, 1111502, 1111504, 1111564, 1111566, 1111576, 1111580, 1111581, 1111605, 1111650, 1111662, 1111668, 1111678, 1111702, 1111705, 1111710, 1111749, 1111760, 1111790, 1111791, 1111802, 1111810, 1111813, 63246, 849720, 63290, 1111890, 1111892, 1111898, 1111902, 1111908, 54246, 1111957, 1111969, 1111987, 1112014, 1112018, 1112044, 1112061, 1102849, 1112089, 1112100, 1112105, 1112107, 
1112141, 1112154, 316436, 1112195, 1112210, 1112234, 1112240, 1112250, 1112262, 1112291, 1112297, 1112302, 1112306, 1112313, 1112324, 1112327, 850214, 1112366, 63795, 1112375, 1112377, 1112382, 1112384, 1112390, 1112394, 1112396, 1112406, 1112442, 1112452, 1112479, 1112487, 1112494, 1112501, 1112506, 1112514, 1112517, 588308, 1112606, 1112614, 1112625, 1102942, 1112656, 1112663, 588408, 1112702, 1102959, 1112755, 1112770, 1112819, 1112827, 1112834, 1112838, 1112847, 1112893, 1112897, 1112908, 1112928, 1112939, 1112944, 1112947, 1112954, 64384, 1102998, 1112968, 1112985, 588712, 64430, 1113041, 1113073, 1113090, 1113092, 1113109, 64535, 1113125, 1113147, 1113148, 1113158, 1113163, 1113170, 1113201, 1113216, 326787, 326797, 1113231, 1113258, 1113269, 1113304, 1113307, 1113318, 1113347, 1113352, 326921, 1113380, 1113381, 1113393, 1113398, 1113416, 1113422, 1113429, 1113439, 64882, 589171, 1113461, 1113490, 1113496, 1113498, 1113506, 1113520, 1113525, 1113526, 1113528, 1113545, 1113597, 1113608, 1113622, 65052, 1113654, 1113683, 1113699, 1113710, 327292, 1113728, 1113756, 1113767, 1113792, 1113796, 1113802, 1113808, 1113831, 1113840, 1113861, 1113877, 1113944, 1113959, 1113970, 1114019, 1114026, 1114047, 1114055, 1114066, 1114078, 1114092, 1114093, 1114099, 1114108, 1114125, 1114131, 589844, 1114149, 1114153, 589875, 1114164, 1114181, 1114185, 1114188, 1114198, 1114200, 1114206, 1114215, 404156, 327812, 65692, 1114275, 327855, 1114292, 65752, 1114358, 1114383, 65809, 1114428, 1114460, 1114471, 1114476, 65904, 1114483, 1114488, 1114495, 1114498, 1114502, 1114510, 1114512, 1114521, 1114523, 1114539, 65965, 328110, 1114549, 535288, 1114584, 1114585, 1114588, 1114589, 1114634, 1114635, 1114650, 590365, 1114654, 1114655, 1114664, 1114669, 1114686, 1114700, 590445, 1114743, 1114750, 1114753, 1114757, 1114782, 1103303, 1114805, 1114817, 1114828, 1114838, 590560, 1114856, 1114870, 328464, 1114901, 1114905, 1114946, 1114947, 1114948, 1114979, 852842, 1115004, 590726, 1115021, 1115030, 1115032, 1115072, 1115075, 1115086, 1115092, 1115097, 1115100, 1115106, 1115109, 1115118, 1115154, 1115172, 1115181, 1115191, 1115197, 1115206, 1115255, 1115281, 66745, 1115325, 1115332, 1115355, 1115372, 1115388, 1115423, 1115425, 1115432, 1115462, 1115463, 1115485, 1115511, 853376, 1115526, 1115539, 1115544, 1115584, 1115585, 1115586, 1115595, 1115599, 591326, 1115617, 1115649, 1115650, 1115651, 1103446, 1115660, 1115667, 853533, 1115708, 1115716, 67147, 1115748, 1115783, 1115784, 1115796, 67222, 67225, 1115819, 591543, 1115870, 1115877, 1115881, 1115887, 1115898, 1115929, 1115933, 1115949, 1115961, 1115970, 1115976, 1115983, 67422, 591718, 1116013, 1116015, 1116019, 1116021, 1116025, 1116028, 1116037, 591784, 1116090, 1116103, 1116112, 1116121, 1116139, 1116161, 1116162, 1116164, 1116168, 1116169, 1116185, 1116201, 1116211, 1116228, 1116234, 1116236, 1116264, 1116265, 1116268, 1116273, 1116301, 1116304, 1116324, 1116353, 1116361, 1116368, 1116369, 1116395, 1116419, 1116429, 1116433, 1116440, 1116450, 1116452, 1116467, 186086, 1116531, 1116537, 1116548, 330117, 1116553, 1116554, 1116592, 1116606, 1116607, 592329, 1116657, 1116663, 1116694, 1116695, 1116702, 1116706, 1116728, 1116775, 1116776, 1116799, 1116821, 1116846, 1116862, 1116864, 1116867, 1116871, 1116877, 1116896, 1116902, 854766, 330504, 1116964, 1116996, 854884, 1117033, 1117055, 1117062, 1117066, 1117104, 1117148, 1117150, 1117154, 1117178, 68618, 54659, 68626, 1117206, 1117261, 1117263, 1117271, 1117294, 593007, 1117299, 1117307, 1117313, 1117331, 1117337, 
68764, 1117350, 1117357, 1117361, 1117375, 68811, 1117402, 1117446, 1117450, 1117505, 317325, 1117566, 1117579, 1117581, 1117584, 1117589, 1117616, 1117623, 1103776, 69075, 1117672, 593386, 1117689, 1117691, 1117700, 1117708, 1117709, 331284, 1117725, 1117732, 1117740, 593455, 1117748, 1117757, 1117765, 593489, 1117787, 1117798, 855667, 1117826, 1117858, 1117875, 1117883, 1117892, 1117911, 1117925, 331496, 1117935, 1117951, 1117978, 1117979, 1118012, 1118014, 1118042, 855901, 1118055, 331648, 1118140, 1118145, 1118169, 1118172, 1118176, 1118187, 1118199, 1118209, 1118227, 1118229, 1118230, 1118232, 1118240, 1118248, 1118251, 1118259, 593986, 1118293, 1118294, 1118310, 1118326, 1118333, 1118388, 1118389, 1118394, 1118416, 69841, 1118423, 1118429, 1118434, 1118455, 1118456, 1118457, 1118522, 856393, 1118585, 1118595, 1118627, 594352, 1118641, 1118651, 1118659, 1118671, 1118676, 1118677, 1118683, 70108, 70145, 1118737, 1118759, 594476, 1118768, 1118771, 1118792, 1118793, 1118797, 1118799, 1118869, 1118871, 1118883, 1118884, 1118889, 1118892, 1118926, 1118927, 1118953, 1118954, 1118976, 1119004, 1119013, 1119015, 1119021, 1119038, 1119040, 70472, 1119060, 1119076, 856951, 1119110, 1119112, 1119115, 1119121, 856978, 594835, 1119128, 70555, 1119132, 1119150, 1119168, 1119169, 1119179, 594910, 1119207, 1119225, 332824, 1119264, 1119271, 1119280, 70714, 332859, 1119305, 1119307, 595026, 1119316, 70752, 1119330, 1119338, 1119347, 1119374, 1119392, 1119440, 1119444, 1119496, 1119501, 1119528, 1119529, 1119592, 1119593, 1119597, 1119603, 333182, 1119617, 1119620, 1119627, 857537, 595422, 71138, 1119744, 1119755, 1119758, 1119764, 1119774, 857643, 1119828, 1119862, 1119884, 333454, 1119904, 333489, 595641, 1119943, 1119953, 71390, 595689, 1120006, 1120010, 1120019, 1120041, 1120044, 1120049, 1120084, 1120089, 595808, 1120119, 1120128, 71557, 1120162, 1120167, 1120180, 1120187, 1120189, 595910, 1120248, 1120253, 1120261, 1120268, 596007, 858159, 1120316, 1120328, 1120361, 858242, 1120391, 1120395, 1120399, 1120418, 1120453, 1120462, 858328, 1104250, 71908, 1120515, 1120519, 1120541, 1120559, 1120564, 1120574, 1120576, 72015, 858461, 1120606, 1120619, 1120633, 334219, 334222, 1120667, 1120668, 1120672, 1120676, 334251, 1120703, 1120704, 1120706, 1120736, 1120744, 1120773, 1120775, 1120776, 1120781, 55727, 858675, 596533, 1120834, 1120835, 858696, 1120842, 1120867, 1120877, 1120887, 1120891, 1120904, 1120919, 1120924, 1120926, 596645, 596659, 1120963, 1114962, 1120982, 1120986, 596699, 1120988, 1120994, 1121000, 1121017, 1121022, 1121044, 1121050, 72476, 1121057, 334633, 1121068, 1121082, 1121083, 1121104, 1121118, 596837, 334716, 72577, 1121156, 1121162, 72587, 1121167, 859072, 859101, 1121249, 1121260, 1121268, 1121269, 1121309, 597025, 1121327, 1121366, 1121380, 1121397, 1121412, 1121424, 1121426, 1121459, 1121466, 1121474, 72904, 1121501, 1121523, 859387, 1121532, 335114, 1121551, 1121559, 1121566, 859431, 1121576, 1121618, 1121624, 1121631, 1121641, 1121642, 597377, 1121667, 1121673, 335288, 1121736, 1121748, 1121759, 1121799, 335374, 1121814, 1121817, 1121830, 1121861, 1121875, 73328, 1121922, 1121931, 1121937, 1121940, 1121963, 1121967, 597686, 73415, 1121993, 1122033, 1122064, 580227, 1122082, 1122084, 1122087, 859955, 1122155, 1122168, 73619, 1122212, 1122220, 1122221, 1122222, 1122233, 1122234, 1122237, 1122242, 1122247, 1122255, 1122283, 1122286, 860145, 1122305, 1122306, 1122308, 1122325, 1122334, 1122336, 1122341, 1122342, 1122343, 1122348, 598116, 1122409, 1122442, 336011, 1122446, 73882, 
1122467, 1122471, 1122476, 1122488, 1122501, 1122504, 1122510, 1122557, 598286, 74000, 1122584, 1122586, 1122593, 1122594, 1122606, 1122619, 74057, 598348, 1122648, 1122652, 1122658, 1122686, 1122690, 1122695, 1122706, 929863, 1122734, 1122745, 1122760, 1122772, 1122776, 1122785, 860643, 1122792, 1122811, 1122826, 1122846, 74274, 1122853, 1122859, 1122892, 336462, 1122908, 1122915, 1122936, 1122966, 1122997, 1123028, 1123052, 1123055, 1123074, 1123090, 1123103, 1123112, 860972, 1123133, 1123134, 1123136, 1123168, 1123184, 1123209, 1123211, 1123220, 598934, 1123227, 1123257, 598973, 1123298, 1123307, 1123350, 318302, 1123383, 1123397, 1123405, 1123408, 1123412, 599137, 1123465, 599178, 1123492, 1123499, 1123518, 1123520, 1123544, 1123586, 1123603, 1123626, 1123627, 1123629, 1123636, 1123649, 1123660, 1123709, 599432, 1123721, 1123755, 1123761, 1123765, 1123776, 599504, 599538, 1123840, 75286, 861724, 1123888, 1123927, 1123930, 1123952, 1123953, 599673, 1123968, 1123969, 1123971, 1123997, 599724, 1124030, 1124040, 1124067, 1124087, 1124093, 1124114, 1124122, 1056710, 1124150, 1124152, 1124159, 1124160, 1124170, 1124171, 1124194, 1124218, 1124226, 1124251, 1124276, 1124300, 600013, 1124306, 1124324, 1124335, 75799, 337952, 1124388, 1124391, 1124451, 1124462, 613923, 1124472, 274797, 1124501, 1124504, 1124522, 1124530, 1124531, 1124534, 1124542, 1124549, 1124569, 1124573, 1124601, 1124621, 1124660, 1124663, 76102, 1124695, 1124699, 1124703, 76140, 76154, 76171, 1124753, 1124767, 1124796, 1124802, 1124803, 1124820, 1124822, 1124831, 1124834, 1124844, 1124866, 1124872, 580697, 1124915, 1124925, 1124926, 1124953, 1124958, 1124982, 1124989, 1124990, 1124998, 1125015, 1125036, 1125041, 859732, 1125079, 1125111, 1125116, 1125132, 1125138, 1125153, 1125194, 1125238, 1125245, 600959, 1125251, 1105046, 1125272, 1125273, 1125292, 1119872, 1125342, 1125352, 842923, 1125394, 1125395, 1125409, 1125443, 1125455, 1125459, 596136, 76945, 1125550, 1125555, 1125559, 1125583, 1125587, 1125590, 1125591, 1125592, 1125596, 1125599, 1125626, 1125628, 1125633, 1125644, 1125645, 1125667, 1125680, 1125684, 1125688, 1125694, 1125699, 339286, 1061433, 1125743, 1125745, 1125750, 1125782, 1125820, 1125829, 1125841, 1125864, 1125920, 1125924, 1125930, 601649, 1125945, 1125947, 1125956, 1125959, 601682, 1125975, 1125979, 1126018, 1126028, 1126035, 1126069, 601783, 1126090, 1126106, 77565, 1126144, 1126146, 1126149, 1126156, 1126178, 77619, 1126215, 1126244, 1126245, 1126267, 339841, 788201, 77725, 1126317, 1126331, 1126361, 1126374, 1126377, 1126380, 1126414, 1126416, 1126425, 1126490, 1126491, 1126494, 1126499, 56678, 1126507, 1126517, 77947, 1126525, 1126531, 1126533, 1126545, 1126578, 1126587, 1126609, 1126651, 1126654, 1126658, 1126673, 843139, 1126691, 1126742, 1126750, 1126761, 1126807, 340377, 1126815, 843163, 864694, 602575, 1126875, 1126880, 864762, 78332, 1126909, 1126910, 1126914, 1126925, 1126931, 1126932, 1126935, 1126945, 1126948, 1126958, 1126963, 1126971, 340541, 1126994, 1127025, 1127044, 1127064, 1127081, 1127084, 1127097, 1127110, 1127117, 1127131, 1127132, 581075, 1127162, 1127177, 1127184, 1127188, 1127195, 78640, 1127228, 1127246, 1127280, 1127325, 1127331, 1127351, 1127359, 1127364, 1127367, 603085, 1127378, 1127398, 1119942, 1127411, 1127425, 1127434, 1127444, 1127448, 1127454, 1127462, 1127468, 1127474, 1127475, 1127489, 1127493, 1127498, 1127525, 1127547, 1127554, 1127561, 1127562, 1127567, 1127570, 1127588, 1127621, 1127668, 1127674, 1127684, 1127695, 1127697, 1127730, 1127741, 1127752, 1127753, 
1127754, 79203, 865638, 1127790, 1127810, 1127822, 1127866, 1127895, 1127896, 1127897, 865754, 1127914, 1127932, 1127938, 1127959, 1127969, 79438, 1128028, 1128040, 865909, 1128064, 1128071, 581229, 1128080, 603796, 1128087, 1128119, 1128152, 1128160, 1128164, 1128166, 1128185, 56962, 1128212, 1128264, 1128276, 1128280, 1128283, 1128287, 1128297, 1128319, 1128337, 1128342, 1128346, 1128362, 1128380, 1128381, 1128385, 604102, 604126, 1128417, 1128432, 1128434, 1128450, 1105582, 1128484, 1128492, 1128494, 1128508, 1128511, 581303, 1128524, 1128539, 1128549, 1128567, 342150, 342159, 1128620, 604334, 1128624, 1128668, 80098, 1128676, 1128681, 1128689, 1128691, 1102527, 1128726, 1128738, 1128752, 1128760, 1128801, 1128818, 1128821, 1128840, 1128860, 1128872, 319218, 1128888, 1128889, 1128900, 1128903, 604621, 1128914, 1128916, 1128917, 1128927, 1128958, 1128976, 1128979, 1128981, 1128986, 1128995, 1129055, 1129067, 1129084, 1129085, 1129103, 1129145, 1129167, 1129226, 1129227, 1129232, 1129274, 1129315, 1129356, 1129357, 605098, 867262, 1129422, 1129433, 1129444, 1129448, 1129499, 1129514, 1129537, 1102556, 1129589, 81017, 81030, 1129630, 1129642, 1129644, 1129650, 1129700, 1129731, 1129733, 1129743, 1129748, 1105797, 81184, 1129769, 1129770, 1117767, 81225, 605516, 1129822, 1129838, 1129841, 1129878, 1129888, 1129896, 1129905, 1129959, 1129973, 1129981, 1130006, 1130013, 1130015, 1130023, 1130051, 343640, 1105850, 1130084, 1130091, 1130115, 1130123, 1130156, 1130193, 868055, 1130214, 1130232, 1130240, 1130254, 1105882, 1130277, 1130307, 1130312, 1130327, 1130332, 1130335, 1130340, 1105897, 1130431, 1130435, 1130511, 1130524, 1130536, 1130548, 1130575, 1130608, 1130640, 1130667, 1130670, 1130672, 1130684, 1130708, 1130726, 1130728, 1130732, 1130744, 1130772, 1130805, 1130806, 1130830, 1130837, 1130849, 1105982, 1130874, 1130885, 1130897, 1130916, 1130935, 1130937, 1130946, 82408, 1130988, 1130994, 1130996, 1131004, 1131005, 1131008, 1131013, 1131042, 1131048, 1131049, 868913, 1131062, 1131075, 606791, 1131092, 1131104, 1131106, 868965, 82578, 1131155, 1131173, 1131182, 1131192, 669130, 1131209, 1131216, 1131222, 1131227, 1131240, 1131260, 1131278, 1131279, 1131295, 1131300, 1131301, 1131307, 1131320, 1131343, 1131358, 344937, 82810, 1131396, 1131411, 1131415, 1131417, 607143, 869292, 1131446, 1106079, 1131467, 1131478, 1131510, 1131512, 1131533, 345102, 1131543, 82973, 1131554, 1131557, 1131559, 1131562, 1131588, 1131593, 1131596, 1131609, 1131650, 931357, 1131699, 1131703, 1131724, 1131729, 1131735, 1131738, 1131754, 1131777, 1131786, 1131813, 1131818, 1131821, 1131830, 1131840, 317018, 1131884, 1131892, 83320, 1131961, 83401, 1131983, 1132006, 1132047, 1132059, 869918, 1132147, 1106196, 1132162, 1106199, 1132255, 1132268, 345861, 1132309, 1132312, 870172, 1132336, 1132347, 1132352, 1106230, 608084, 1132399, 1132409, 608124, 1132444, 346023, 1132513, 1132529, 83959, 1132541, 1132549, 1132564, 1132574, 1132583, 1132588, 1132592, 1132613, 346202, 1132647, 1132651, 1132717, 870582, 1132734, 1132735, 1132737, 1132754, 608491, 625704, 1132790, 608508, 1132815, 1132829, 1132834, 1132890, 232508, 1132913, 1132921, 1132945, 1132952, 1132959, 1132960, 1132965, 1132977, 1106335, 1132996, 1133036, 1133057, 1133092, 756829, 1133113, 1133122, 1133153, 608870, 871016, 1133171, 1133173, 1133187, 1133190, 1133202, 1133231, 1133252, 1133254, 1133258, 1133288, 582098, 407339, 1133349, 1133366, 1133374, 1133376, 1133380, 1133405, 1133428, 1133431, 1133442, 1133444, 871301, 1133474, 1133527, 582138, 609252, 
1133557, 1133558, 1133611, 1133620, 1133644, 1133646, 1133652, 1133658, 1133710, 1133715, 1133721, 1133744, 1133757, 1133780, 1133792, 1133798, 1133799, 582183, 871720, 1133809, 1133810, 1133812, 1133854, 1133902, 1133907, 85348, 1133931, 1133983, 1133986, 1133988, 1133989, 1133990, 1134001, 347583, 1134024, 1134028, 1134030, 1134057, 1134064, 1134071, 713301, 320086, 1134135, 1134140, 1134184, 85610, 1134188, 1106537, 1134203, 1134212, 1134221, 1134263, 1134266, 1134281, 1134306, 1134309, 1134313, 1134343, 1134357, 85798, 1134394, 1134405, 713357, 1134420, 1134422, 1134429, 1134436, 1134444, 1134449, 1134499, 1134500, 1134539, 1134552, 1134557, 1134558, 1134560, 1106598, 1134572, 1134583, 1134614, 844464, 1134639, 1134656, 1134658, 1134666, 1134676, 610414, 1134723, 1134728, 407575, 1134752, 1134784, 14371, 1134806, 1134807, 1134835, 1134838, 1134839, 1134846, 1134850, 1134853, 1134862, 1134871, 1134920, 1134926, 1134931, 1134945, 1134949, 1134952, 1134967, 1134978, 1134987, 1134998, 1135028, 1135039, 1135042, 1135052, 1135081, 1135094, 1135098, 1135106, 1135121, 1135142, 1135150, 538718, 1135165, 1135190, 669798, 1135234, 1135238, 1135249, 1135262, 1135274, 1135280, 1135301, 611027, 611049, 1135362, 1135395, 1135397, 1135438, 1135448, 1135498, 1135522, 1135525, 1135533, 1135553, 1135563, 1135568, 1135570, 1135605, 1135606, 320340, 1135625, 1135650, 1135684, 611400, 1135722, 1135727, 1135738, 873607, 1135780, 1135818, 1135841, 1135848, 1135856, 1135859, 1135875, 1135894, 1135903, 1128954, 1135933, 1135935, 1135966, 1136008, 1136013, 1136014, 1136028, 1136042, 1136073, 1136108, 1136113, 873986, 1136152, 1136180, 1136183, 1136188, 1136198, 1136212, 1136215, 1136233, 1136245, 1136283, 87730, 1136350, 1136384, 1136388, 1136397, 1136424, 1136425, 1136429, 612158, 1136482, 1136491, 612251, 612252, 1136550, 1136559, 1136584, 1136634, 874523, 1136676, 1136686, 1136724, 1136726, 1136728, 1136756, 1136763, 1136771, 1136793, 1136811, 1136824, 1136830, 1106978, 1136859, 1136890, 1136918, 1136948, 1136966, 612831, 858395, 55325, 582756, 613079, 88808, 88882, 613222, 1107092, 875518, 875528, 89100, 613422, 1107123, 89149, 1107132, 613576, 351473, 613642, 89360, 875793, 976102, 89480, 351625, 875937, 613827, 408145, 876017, 613911, 89633, 89634, 932447, 876108, 102330, 876154, 89877, 320832, 614338, 90139, 1110927, 614540, 352420, 614567, 614620, 1102868, 90368, 876885, 876934, 877076, 90708, 615000, 352949, 1114423, 615149, 91055, 877546, 877556, 353282, 91157, 353333, 877676, 615543, 714335, 91778, 1107568, 1120375, 616142, 796695, 91913, 354076, 878401, 354123, 878415, 92008, 92143, 92176, 616483, 963267, 971998, 845517, 616705, 92622, 879076, 1064020, 92670, 464999, 92713, 92742, 617033, 1107731, 617125, 1107745, 879329, 617192, 617223, 92974, 355339, 355519, 355540, 617733, 617742, 1107851, 880092, 15612, 880110, 618024, 59330, 880353, 94039, 618349, 356260, 59381, 880839, 2375, 230103, 881067, 881070, 94642, 881142, 409003, 619013, 583772, 881246, 1108075, 94866, 881324, 583824, 881533, 619408, 357297, 95167, 357336, 1108138, 1064450, 881738, 881767, 95381, 889718, 95448, 882005, 95594, 1108199, 619994, 95764, 357934, 620231, 88116, 882803, 96443, 620788, 620810, 334918, 96565, 96597, 358771, 883176, 883184, 96857, 671507, 561885, 621190, 359283, 359286, 621550, 359463, 1108487, 864818, 97466, 883929, 72228, 884131, 621989, 622023, 97834, 97873, 1108579, 622187, 622238, 622262, 884436, 622467, 1108632, 846492, 360488, 710329, 98415, 360650, 81842, 885018, 628109, 885095, 98675, 360822, 579403, 
623112, 584499, 885663, 1108809, 1108811, 623603, 1108821, 55690, 1108867, 886048, 886243, 624210, 624304, 278827, 624426, 1108961, 886682, 409934, 1108993, 624662, 624689, 1109050, 195393, 100777, 887242, 1109070, 1109072, 100932, 100940, 625256, 1068306, 1077039, 1120685, 322709, 497477, 625633, 887806, 625685, 625691, 887840, 1109171, 887883, 101478, 888024, 578343, 60634, 101850, 626166, 626218, 364094, 888413, 847124, 143062, 789014, 1109288, 626517, 626536, 888689, 1129452, 888762, 102366, 497659, 626761, 626823, 626866, 1109365, 1065678, 1109381, 889197, 1109396, 191894, 410350, 1109407, 1065739, 365139, 675569, 103128, 60902, 889757, 103328, 103402, 323085, 103595, 890075, 628039, 628136, 366118, 247876, 366342, 890643, 628564, 61075, 628657, 1109680, 628808, 890953, 498021, 579601, 891083, 148515, 1109722, 104758, 54818, 891432, 629337, 629420, 105183, 891634, 1109806, 61240, 578560, 105367, 891987, 105574, 105604, 629913, 1111030, 630092, 630123, 885159, 892353, 368106, 368124, 892490, 892584, 1109969, 847829, 847831, 106320, 892768, 1110001, 630845, 893117, 368900, 893211, 631108, 631214, 369105, 61519, 631266, 893530, 369311, 1110129, 107205, 893657, 893658, 893756, 701898, 631665, 631789, 631852, 107701, 894139, 61659, 367519, 632456, 72298, 108391, 586049, 1111049, 370635, 108500, 370750, 632935, 1110391, 633073, 633124, 633137, 108965, 633411, 895721, 895787, 895931, 1110531, 633998, 634055, 848431, 544859, 542609, 804755, 896446, 491017, 848496, 896746, 634650, 110375, 324242, 634856, 841671, 1110710, 110736, 935962, 897222, 635081, 56067, 761250, 111077, 373224, 457609, 635379, 1110794, 326685, 635729, 897878, 897891, 897892, 1023476, 630264, 111704, 636093, 1110904, 972647, 636141, 111894, 636208, 334263, 898402, 636348, 636417, 334194, 112175, 636603, 898753, 936255, 636814, 898971, 899008, 374724, 899014, 112638, 899076, 637004, 637005, 637080, 899268, 112864, 637187, 112928, 62517, 62521, 899428, 375219, 899511, 1023767, 899741, 637695, 637698, 899876, 544978, 899891, 1111188, 637909, 113645, 637960, 1111214, 281106, 761705, 638163, 845725, 1111275, 674396, 638532, 900690, 900867, 900870, 901007, 638898, 901412, 639288, 1111439, 377304, 1111460, 639500, 901678, 115254, 639618, 325057, 377534, 237689, 115594, 82412, 115718, 412503, 377887, 237772, 115952, 902410, 378218, 902586, 325213, 902790, 378632, 902969, 640857, 841870, 640885, 116653, 116659, 1068025, 378866, 903268, 325310, 641164, 641193, 117115, 841979, 903643, 903661, 893358, 641617, 1103766, 903790, 903975, 588762, 500204, 1111874, 379846, 580040, 117829, 187234, 642144, 117862, 117910, 642252, 117965, 904461, 642328, 904565, 868111, 642587, 904780, 642647, 118372, 859388, 642741, 642760, 849869, 118484, 1068326, 642800, 642841, 905050, 150873, 643181, 544108, 643223, 1112097, 643303, 643328, 905574, 905638, 119263, 675245, 1112152, 119400, 282050, 643749, 803862, 1103838, 1112568, 119531, 643870, 769008, 1112203, 587923, 850078, 413172, 906203, 906238, 644204, 906391, 544294, 906491, 644356, 644392, 382254, 587999, 644416, 382407, 644637, 644678, 120398, 644699, 117487, 631766, 120593, 1121333, 645080, 907301, 907310, 383029, 907322, 121025, 645349, 500775, 383220, 121113, 907576, 1112486, 907807, 121431, 121484, 121488, 645818, 850384, 1130095, 68896, 383847, 121746, 121843, 719371, 646245, 1112596, 122011, 326176, 369873, 632020, 122298, 1079717, 1112658, 908897, 981606, 646814, 909052, 1112705, 122644, 1112709, 122690, 122724, 122795, 282652, 123090, 909549, 909560, 894466, 909736, 647597, 632223, 
647637, 64259, 588563, 123525, 647887, 544890, 123648, 56494, 981828, 910246, 648174, 123919, 386091, 1103987, 123980, 648296, 386213, 239143, 100154, 124244, 910690, 910699, 124291, 648638, 124607, 648925, 648975, 807223, 386934, 124798, 649104, 124895, 124938, 1043073, 125037, 649335, 911480, 588848, 1113175, 676274, 632671, 912165, 125791, 912272, 604884, 125929, 388130, 388265, 388319, 650476, 650559, 912931, 1113353, 912970, 126582, 650966, 388981, 126848, 913285, 126866, 326988, 1113425, 913435, 389192, 414393, 913574, 127150, 851318, 853864, 389501, 389506, 327062, 651679, 389541, 651708, 938754, 651821, 389739, 389808, 651969, 914186, 389908, 127812, 632755, 390313, 128174, 652495, 128291, 128365, 128521, 1113692, 108813, 128604, 390770, 652917, 1113724, 653077, 653090, 128874, 391077, 915433, 129008, 1026391, 239971, 653413, 653450, 653517, 653560, 653579, 129347, 653662, 129435, 129457, 1113847, 916050, 916107, 391829, 65305, 129695, 129700, 916214, 392124, 916453, 392181, 851807, 65383, 130306, 916768, 65416, 458638, 21744, 392610, 1114021, 654897, 1114044, 917100, 130694, 392882, 917233, 917252, 130858, 917298, 130951, 917436, 655332, 655373, 393238, 370985, 917567, 917606, 1114132, 131247, 393411, 655607, 917813, 131405, 655728, 393611, 415141, 131573, 131617, 1070541, 131850, 394040, 656267, 394148, 132007, 918446, 64421, 1114277, 394208, 1114279, 132133, 656422, 918605, 918750, 132345, 132423, 132469, 132495, 132575, 656994, 677485, 1114402, 394980, 657161, 132938, 919556, 56808, 395326, 109587, 919673, 919707, 657594, 919760, 919771, 133385, 22247, 1008285, 395660, 395665, 1114524, 1114547, 133747, 920289, 396098, 396122, 658273, 920435, 328169, 396197, 134127, 658427, 396312, 22364, 1114643, 1114660, 1121794, 134469, 1114690, 658865, 134628, 396870, 921193, 396967, 396974, 1114739, 396995, 134875, 921322, 134903, 134905, 659297, 659385, 153592, 659458, 1114804, 921716, 921746, 135347, 503164, 397550, 323874, 397564, 659825, 659901, 135634, 397803, 1114882, 135702, 1114887, 634302, 922237, 135821, 135936, 398127, 660426, 136168, 983896, 1114974, 398438, 660681, 1027626, 136473, 136476, 136578, 809209, 660983, 136726, 983987, 923189, 923221, 399035, 22836, 896672, 923726, 923757, 661623, 399503, 661717, 399595, 137468, 661763, 137554, 399701, 399730, 137662, 137674, 661986, 661990, 137712, 662028, 280245, 662108, 662167, 1115248, 849376, 318841, 138157, 400354, 138263, 940548, 400435, 400491, 662687, 1115334, 924854, 1115339, 400681, 924975, 925041, 925119, 662982, 788278, 400908, 842609, 547426, 925292, 663156, 663182, 138933, 139000, 139175, 401335, 139340, 401572, 663755, 198015, 663891, 139619, 663916, 926094, 664034, 926191, 853437, 402132, 402135, 402300, 664497, 664537, 664540, 664605, 1115656, 118151, 402532, 402595, 664751, 664775, 1115693, 402732, 402799, 402832, 927126, 665022, 140770, 927216, 402991, 403035, 547771, 403145, 141078, 1115798, 403399, 403520, 1028448, 927899, 141475, 853707, 665875, 842703, 403824, 928128, 403914, 666238, 67359, 928453, 772885, 404221, 198444, 142148, 404407, 928755, 666615, 1116016, 1114768, 142656, 404889, 404968, 929366, 1116092, 1116096, 143012, 405183, 897659, 67545, 405298, 405322, 1116134, 929693, 929714, 667607, 1116180, 405684, 405761, 405780, 143680, 143721, 1116221, 405974, 1116242, 143889, 668190, 143955, 1116260, 668300, 406187, 144050, 144051, 406237, 144138, 24041, 668648, 930833, 345241, 406582, 406640, 144498, 931027, 406838, 668999, 406920, 1116402, 144842, 407007, 144872, 144952, 931401, 931576, 111573, 407302, 
931621, 407383, 931678, 407457, 145391, 67937, 373795, 145662, 854417, 932294, 932299, 932340, 932412, 408157, 408210, 670360, 1116612, 592333, 146170, 1116633, 68088, 932859, 319123, 408686, 146574, 903976, 671027, 408905, 408908, 146783, 671117, 68164, 408986, 24480, 409031, 409057, 1116763, 409157, 147064, 327873, 933594, 854655, 1116816, 671630, 409515, 1116829, 671720, 1116845, 147537, 671829, 147550, 409708, 409736, 409754, 409853, 147746, 934193, 458832, 1116903, 680004, 1116927, 672262, 68360, 410169, 148159, 410329, 148209, 410387, 410391, 148322, 672626, 68432, 410635, 148503, 982127, 673041, 410946, 854924, 673175, 148898, 673223, 148977, 57068, 935437, 66067, 1073430, 935643, 411392, 411421, 673666, 673689, 935870, 411600, 149491, 1117182, 1117183, 68610, 810947, 411822, 1117235, 936261, 319235, 412073, 636661, 149975, 149979, 936422, 412136, 418216, 150029, 412182, 287159, 493826, 1117295, 150207, 412357, 674504, 674513, 412395, 412407, 412410, 674566, 674571, 936777, 150347, 674691, 150443, 1117343, 150505, 674867, 150595, 412750, 1029987, 674956, 412817, 858085, 412865, 937168, 1117394, 1117398, 1117405, 413054, 150926, 413075, 112541, 855288, 675316, 937486, 1117451, 675430, 937603, 413335, 937626, 767957, 937753, 1117495, 151408, 675811, 675823, 675920, 1117542, 413801, 676113, 676426, 414315, 505980, 1117650, 25398, 1073975, 676791, 939007, 152627, 939096, 414899, 414970, 677133, 415021, 677269, 1074076, 677292, 415155, 677304, 939453, 1117771, 415184, 939521, 153123, 677421, 153239, 939698, 415438, 153374, 415526, 415661, 415776, 1117872, 1117874, 153663, 678053, 1113709, 593611, 153809, 860569, 153868, 416161, 375206, 154164, 678466, 154285, 375244, 154316, 678672, 416561, 678724, 154441, 416646, 117174, 416672, 416692, 416705, 416738, 1118040, 941093, 416823, 941232, 681173, 154904, 941346, 1105021, 1113751, 679438, 1118127, 679482, 331716, 113269, 679625, 941780, 417717, 417895, 1118226, 417905, 1122505, 680190, 680225, 680230, 418157, 942491, 1118286, 856149, 156181, 680480, 418389, 942728, 156302, 680613, 942763, 418501, 840770, 418615, 156479, 156548, 680854, 418725, 943014, 415479, 680890, 594086, 418801, 418832, 156707, 418883, 681047, 156776, 418952, 200918, 1118435, 681317, 419192, 943613, 943638, 943913, 157565, 419729, 258390, 1074859, 157744, 856417, 944345, 944428, 420169, 158026, 158054, 840782, 682567, 1118647, 944730, 158469, 158569, 682859, 682902, 158714, 420872, 158752, 1118734, 945385, 1121860, 1118369, 683410, 159127, 1118806, 1118820, 683641, 419783, 869486, 159535, 1118868, 159767, 159842, 159867, 159922, 1118921, 946406, 159992, 114048, 160010, 684324, 1118941, 422217, 684502, 1118974, 684536, 1131383, 160276, 160309, 160313, 946751, 856861, 946910, 160574, 684899, 947066, 947119, 684994, 685004, 422890, 422938, 160801, 947416, 947430, 31548, 161027, 423205, 161162, 275968, 161346, 685661, 1119167, 161434, 161474, 947981, 423807, 685998, 423863, 948176, 161766, 1119230, 948232, 686139, 686229, 1119259, 245463, 686392, 686422, 424280, 686436, 424318, 96933, 1119355, 162657, 1105276, 162696, 949129, 1119384, 424856, 687020, 1119390, 162946, 949501, 949516, 425438, 949738, 145104, 425588, 1119514, 687792, 425694, 1119531, 687991, 1111417, 950222, 988508, 688208, 950390, 114725, 426403, 688646, 950862, 426666, 688815, 1119695, 71120, 164798, 689120, 1119740, 427033, 639157, 164940, 1119760, 188803, 165037, 427220, 427221, 951558, 427311, 689461, 427372, 165237, 333375, 165287, 427505, 689657, 165393, 952165, 165977, 71383, 952722, 952749, 166325, 
428479, 952768, 428494, 166356, 428503, 952866, 158817, 952938, 166508, 690869, 428744, 166683, 428836, 428928, 953258, 691188, 429119, 974485, 429182, 857963, 429205, 1105422, 429228, 726929, 691711, 691798, 167533, 954096, 691956, 429843, 840845, 429876, 1120236, 692151, 167873, 692201, 954363, 692238, 430114, 167974, 145260, 430258, 1120322, 430536, 168445, 692815, 430704, 955028, 430755, 168655, 168786, 955228, 693097, 1120404, 168854, 693157, 168906, 168959, 431159, 693353, 169166, 1120466, 693494, 169257, 955763, 955888, 431669, 169584, 693880, 1114236, 1120537, 956142, 1120563, 694087, 694106, 431988, 432012, 169964, 694270, 1120599, 432162, 1120621, 432315, 1131713, 956670, 1120651, 432503, 334232, 694683, 694739, 904767, 1120678, 170498, 1120689, 694851, 957181, 1120726, 695196, 957479, 84257, 1114290, 957686, 695697, 171431, 695737, 957990, 858717, 902411, 374178, 433786, 1123034, 596613, 171824, 696148, 434008, 696173, 696201, 28653, 958390, 696261, 958435, 858790, 1120945, 696375, 696381, 434295, 696519, 696533, 958846, 904849, 172940, 334662, 435130, 116221, 697374, 959564, 173181, 815308, 959723, 435526, 697773, 1121191, 697861, 843768, 173661, 960142, 435864, 173834, 1121251, 174034, 960571, 436285, 174157, 436325, 960734, 174305, 541429, 174351, 960803, 698703, 436582, 465990, 960998, 1121369, 1121374, 174722, 902935, 961305, 75656, 699218, 699279, 437191, 961525, 1114420, 175228, 175258, 509832, 699726, 961879, 699817, 903073, 72956, 699896, 962160, 437910, 291448, 437986, 946839, 700345, 176124, 700430, 1125466, 438344, 481998, 176276, 700590, 700756, 176499, 438891, 701050, 176781, 176953, 439146, 963471, 177167, 791513, 597520, 963943, 177610, 439766, 701919, 902014, 116991, 439879, 1121892, 177775, 439929, 440144, 702298, 1121941, 964482, 964554, 964608, 702508, 861865, 702598, 608244, 1125481, 702722, 554031, 178575, 178610, 702919, 702952, 728829, 178677, 1114542, 440973, 703134, 703145, 703240, 703298, 441204, 179066, 965627, 179219, 703554, 179309, 179395, 703736, 884569, 870157, 1070930, 861435, 179876, 704182, 179955, 1122267, 1122271, 180091, 442307, 442313, 966614, 58374, 423254, 1122316, 180298, 180370, 966925, 1122352, 180593, 705131, 967278, 705174, 180956, 180979, 967419, 1122458, 181213, 705551, 967706, 181305, 181329, 181479, 443694, 705905, 443766, 181644, 968186, 968238, 1122569, 443964, 1122591, 706291, 1122601, 1122610, 706373, 444233, 706401, 968552, 444318, 1024250, 706501, 968667, 1122643, 706624, 444485, 182350, 968788, 1122662, 336236, 968847, 444598, 706780, 968932, 444688, 968995, 706900, 706985, 445026, 707431, 336379, 707577, 707645, 707689, 969984, 707853, 183696, 1122894, 642301, 708094, 708144, 183919, 1122920, 708271, 446160, 970549, 1122957, 970577, 446290, 184221, 184223, 446448, 184333, 184355, 708671, 184497, 1123046, 1123057, 84713, 709056, 1123069, 642473, 184833, 971331, 971415, 971528, 185119, 971564, 971626, 709492, 709494, 709522, 709726, 1123191, 1079501, 447697, 972092, 447912, 448000, 448042, 448059, 972396, 186071, 448305, 380561, 1123337, 972896, 710756, 186484, 710887, 710970, 973336, 973416, 449154, 1123435, 449252, 973614, 1123469, 187317, 1123488, 187330, 187371, 449539, 711802, 973951, 449717, 187574, 187585, 852966, 187675, 712006, 187763, 712140, 188053, 450290, 188166, 712468, 1123584, 188273, 450426, 974727, 188318, 450493, 450498, 450543, 188445, 67379, 974906, 450640, 975031, 407816, 975140, 188784, 713127, 555772, 713278, 451150, 1123822, 975774, 189365, 975809, 975821, 1123837, 975865, 975875, 451642, 451643, 
189529, 976015, 451826, 976293, 1123915, 1114148, 1123917, 452155, 190070, 643361, 452298, 190164, 1123961, 452336, 452385, 976678, 452422, 976771, 976827, 714849, 714863, 714881, 966542, 977156, 1124059, 452924, 190845, 715140, 1124090, 191143, 453305, 191189, 453350, 137306, 905706, 715661, 977828, 977852, 715756, 715765, 993117, 978017, 1124198, 191625, 978096, 978121, 453857, 1124221, 716082, 163053, 978422, 978459, 282352, 192284, 192397, 716717, 978995, 979007, 979044, 1124369, 643775, 1124373, 979086, 454824, 716995, 717111, 192846, 1124418, 455256, 979571, 731251, 455359, 455371, 717538, 455425, 979749, 455513, 455561, 979890, 862380, 717849, 717873, 980023, 193676, 1124550, 980185, 980205, 455957, 980263, 718133, 456002, 980406, 718295, 1120348, 718364, 980533, 718489, 718490, 456383, 194320, 1037279, 456674, 194563, 980996, 722260, 731483, 456807, 163517, 981207, 981240, 1123804, 1037361, 194893, 207250, 457118, 719438, 195189, 981696, 457426, 195312, 981837, 981948, 1124863, 982019, 982104, 195677, 1124882, 982144, 982229, 457951, 982377, 600638, 195958, 720395, 1124957, 196233, 982696, 982810, 196450, 458616, 1125013, 458674, 982967, 458688, 720868, 196599, 983068, 720949, 688140, 853267, 983196, 1125075, 1125081, 721273, 459153, 197069, 819279, 197174, 459398, 983787, 459503, 721661, 197375, 76591, 721708, 197487, 1125183, 644600, 721882, 721942, 1125227, 984245, 460002, 197875, 722189, 722220, 1125260, 197965, 984476, 1115187, 984620, 1125306, 984702, 460506, 722676, 460663, 984952, 198536, 198610, 985070, 198698, 722996, 460855, 198894, 198905, 426367, 461052, 723295, 461190, 723457, 723486, 1125483, 426435, 1125495, 723688, 199407, 985840, 77398, 1125510, 985900, 985913, 199508, 461659, 723897, 986094, 723975, 601268, 461950, 199831, 251774, 462111, 722441, 724400, 200144, 1029607, 862111, 994564, 200228, 986693, 1125651, 724606, 724623, 724657, 1125670, 724767, 986927, 986960, 1125690, 1125706, 462794, 200695, 725044, 200782, 426684, 987306, 463021, 645149, 1125763, 463137, 463180, 725355, 463295, 201194, 463421, 463507, 201381, 725715, 725726, 201444, 987894, 463660, 463679, 725828, 863720, 725969, 725996, 988153, 689019, 726379, 464240, 329704, 988542, 1125954, 988595, 464354, 988661, 732756, 202245, 202250, 202310, 464465, 121109, 202384, 464548, 120219, 190054, 726834, 989078, 989086, 202664, 726971, 295928, 1132796, 464864, 202726, 464904, 989196, 1126064, 464930, 727154, 727181, 951320, 989398, 727291, 465156, 989526, 989622, 609071, 722550, 727572, 203321, 989795, 989850, 989869, 203578, 990049, 990059, 727943, 203720, 465920, 203783, 203790, 728112, 990480, 204088, 466338, 990628, 466400, 466454, 466456, 466536, 687671, 728735, 466657, 204520, 990969, 991000, 204701, 466865, 466878, 729023, 729058, 670004, 1126403, 466968, 991278, 204851, 24280, 991332, 204904, 991361, 204951, 204957, 78181, 991583, 991798, 205433, 729755, 602263, 908101, 992162, 730062, 467932, 383831, 730149, 340145, 730156, 992308, 468021, 205954, 205959, 992595, 34366, 730541, 468437, 468465, 992867, 252864, 992904, 730832, 468730, 993055, 1126709, 206738, 1126736, 731129, 469121, 469236, 733585, 469356, 993677, 233856, 1126817, 731662, 731691, 993838, 469566, 995778, 427801, 207572, 207703, 207754, 469976, 207879, 78352, 122049, 602653, 602654, 470372, 1126981, 994761, 994762, 1019506, 208344, 331343, 864853, 208394, 208417, 864864, 208493, 732794, 470680, 208657, 600524, 208702, 470933, 952306, 995250, 78497, 78501, 471017, 995529, 908665, 995599, 995721, 995898, 209497, 209531, 515531, 
321226, 733956, 996146, 209769, 209797, 996236, 340815, 472232, 996592, 734466, 734529, 603021, 996730, 472445, 734678, 996851, 996876, 734746, 734758, 865206, 734136, 734836, 865218, 997024, 865253, 210808, 472957, 997278, 473020, 473028, 473029, 473062, 997437, 473182, 515785, 735469, 735502, 997654, 997671, 473495, 253693, 909115, 736117, 736234, 998480, 736364, 212146, 862515, 474468, 736703, 212559, 736936, 216579, 1127718, 474875, 474961, 737112, 1115677, 737266, 999522, 999612, 166606, 999644, 1127802, 251513, 475394, 475408, 184640, 213353, 213365, 737654, 999829, 999910, 737913, 213726, 213758, 1127912, 738248, 691028, 214041, 738451, 341529, 79390, 172986, 1127990, 603714, 1000865, 1000893, 738788, 111891, 738829, 1000993, 738859, 79457, 214625, 909587, 739143, 739166, 739171, 477176, 1001465, 1001492, 1124480, 477474, 739636, 1115760, 477552, 862621, 1001961, 1001968, 739828, 1128198, 477793, 739954, 739996, 215744, 341802, 1002287, 740168, 1128270, 740366, 478255, 1002572, 1128291, 1107057, 1002690, 775366, 647777, 216656, 216731, 216736, 1128402, 866276, 1128427, 1003514, 1003630, 479358, 741514, 1003747, 479475, 479495, 479527, 1003909, 1003917, 479687, 1128562, 36299, 735360, 1004330, 480056, 742238, 217999, 604332, 480320, 1004774, 866523, 742695, 480646, 742800, 1004948, 742808, 1005149, 1005174, 779201, 480917, 866615, 1005440, 604500, 743489, 779295, 219453, 743777, 219723, 219797, 219809, 219844, 219898, 744230, 482144, 1128939, 80372, 1128949, 1128956, 429906, 482382, 744538, 220290, 220352, 220398, 482580, 744835, 1129042, 745060, 745317, 1007481, 745373, 483253, 692258, 745784, 745863, 745872, 671862, 36951, 1008191, 1129241, 221987, 1008453, 1008502, 1041905, 222132, 222133, 80718, 484350, 484467, 867190, 484556, 998270, 484646, 1008938, 1009016, 746963, 747004, 484881, 1009291, 1009408, 747285, 1129436, 941435, 223160, 747566, 1129483, 747597, 867346, 485567, 1009859, 1129508, 485594, 1010069, 747939, 998479, 748018, 485891, 1010270, 1010376, 1010426, 486173, 748434, 486312, 1129633, 224261, 486431, 748579, 748843, 486716, 486814, 224688, 1011124, 486839, 486939, 1011337, 1011348, 749215, 749326, 487275, 897859, 867648, 124943, 1011812, 1129835, 225419, 955087, 749813, 343439, 256066, 749976, 225703, 1129886, 750029, 487934, 750114, 750167, 1012318, 488073, 225986, 226012, 226190, 488426, 1012829, 488571, 924092, 226572, 750926, 488839, 226741, 488887, 1013228, 1013231, 489013, 1013322, 751255, 751289, 489166, 1130105, 489238, 227104, 1013556, 1013640, 227230, 1013679, 751571, 751621, 227362, 1013904, 227516, 1130199, 1014055, 452761, 955453, 1014167, 1014189, 490000, 490071, 605977, 1014437, 81693, 752388, 752441, 1130296, 752488, 868157, 490358, 752686, 911889, 490612, 1130345, 1014933, 490752, 752914, 1130378, 228670, 1130383, 228769, 606110, 490936, 753089, 490982, 1015289, 753220, 229045, 491200, 868300, 1130449, 229107, 1106615, 1015668, 229254, 1015949, 753921, 1130541, 229677, 1130558, 344155, 387851, 230013, 1016486, 868477, 1016577, 1130635, 169442, 754567, 1016732, 754589, 754609, 230365, 431674, 755062, 230808, 230824, 493020, 230878, 1017442, 493218, 1017519, 755381, 493262, 231134, 755461, 344368, 493439, 755688, 1017930, 493723, 493829, 493845, 493900, 1018202, 606609, 257085, 1018290, 25179, 494111, 1130933, 232008, 1018475, 494285, 606672, 1018792, 956231, 756681, 904125, 756790, 338487, 232609, 494786, 756934, 1131046, 1019246, 232919, 1019366, 495159, 495243, 1019607, 233178, 233185, 1019720, 1019740, 1125086, 495570, 495608, 495618, 233529, 
1020019, 233881, 233900, 869124, 1020489, 758422, 496334, 301180, 234277, 234583, 758918, 1131379, 759007, 496927, 759101, 759125, 1021302, 1021505, 497356, 759514, 235280, 235309, 497483, 913041, 1131507, 497632, 235560, 1022004, 869395, 497813, 497919, 760171, 941866, 1131613, 760387, 236105, 1022554, 853522, 236254, 1022698, 236269, 236359, 1107730, 498576, 760825, 761014, 236763, 236776, 236793, 236824, 761225, 761313, 499188, 761430, 1044414, 499364, 499385, 237335, 237441, 83266, 237465, 499666, 869721, 761883, 761907, 607582, 761941, 762035, 762072, 1131909, 563938, 1024432, 762434, 741173, 500447, 1024775, 500574, 1024923, 500646, 762803, 500680, 762865, 869887, 1025072, 500794, 1025104, 500832, 763063, 238804, 1132087, 1025444, 239147, 763443, 239245, 763534, 239250, 501442, 763641, 763748, 501645, 239589, 501878, 501894, 83666, 502000, 502104, 502221, 240102, 83712, 764533, 1026733, 502453, 127437, 870184, 1026842, 764766, 8008, 1132360, 765070, 240792, 240865, 880930, 1027534, 1132452, 1125347, 1027591, 346031, 846022, 503482, 1027793, 40228, 765659, 241399, 503613, 1110190, 695643, 503949, 503963, 766142, 1116643, 504057, 241937, 242042, 1107919, 766379, 242115, 1132628, 1028701, 1028772, 242583, 242603, 504751, 242796, 433683, 1125406, 243076, 767404, 767490, 767499, 243224, 767549, 767589, 243365, 1029816, 767678, 767703, 652242, 1029871, 1132847, 505647, 767991, 1030163, 1132903, 768114, 505992, 243874, 1132917, 243941, 1132925, 243972, 1030502, 506278, 768557, 1030770, 1132991, 506577, 244472, 1030949, 1030954, 1030994, 506768, 768916, 768939, 1031152, 608785, 507093, 1031384, 507139, 95449, 769310, 259312, 769449, 1031609, 507424, 245295, 565173, 1031850, 507583, 1031935, 303070, 1133194, 245620, 128322, 1032101, 245833, 958484, 508162, 1032499, 770356, 246076, 508254, 246118, 770472, 1032658, 783779, 770534, 1032680, 1032694, 1032719, 1032729, 1125519, 770613, 246327, 508476, 770648, 508510, 1032978, 84797, 508821, 246724, 508870, 509003, 1033381, 128543, 247025, 1033544, 1033642, 84901, 771555, 1125556, 771730, 783989, 247506, 509654, 1033989, 1033997, 1034015, 1133533, 1133535, 347113, 771994, 1034154, 200289, 247819, 247821, 1034261, 510004, 1034305, 1034337, 248028, 1034491, 772409, 772507, 128757, 1034724, 1034759, 248362, 510514, 248407, 510645, 772833, 772836, 303585, 772968, 1035153, 773025, 773040, 347294, 1035339, 1035340, 1035354, 511215, 1035658, 511401, 249267, 511466, 1133827, 249364, 773878, 609580, 1078819, 511806, 511841, 774027, 249814, 871767, 774158, 512069, 512146, 1036468, 512264, 774472, 512378, 512423, 1036759, 1036776, 512536, 1036830, 1036844, 512569, 512621, 959256, 1037088, 250673, 1134049, 1037159, 139285, 129183, 250836, 512985, 250927, 1037376, 8356, 1134109, 513232, 775430, 513303, 775459, 775487, 1003074, 1134157, 1037830, 1037871, 513591, 1037917, 1037969, 1037981, 609922, 513836, 513838, 514029, 1134251, 1046889, 1134272, 435223, 1134277, 514241, 252103, 514264, 514360, 776547, 514421, 252314, 252441, 872213, 776859, 776865, 1039098, 515005, 252887, 515064, 1129518, 515112, 515123, 515217, 515273, 1039636, 777578, 828649, 1039787, 253406, 653909, 777839, 777906, 515775, 253711, 129672, 778062, 1040212, 253834, 253837, 516087, 516185, 872463, 1040530, 1040637, 516429, 1040730, 1040752, 778620, 516532, 1040890, 254576, 86135, 779025, 254740, 1041217, 516941, 741537, 254923, 779256, 517135, 1134769, 255016, 255025, 1041478, 87055, 1041628, 779540, 1041743, 1029806, 86290, 517763, 517928, 255889, 610645, 60900, 855243, 518220, 1042543, 
256245, 122010, 518578, 907635, 518785, 518796, 256692, 1043138, 1043151, 130168, 1043229, 519027, 1043346, 1043433, 872978, 741803, 781539, 781689, 820372, 506130, 781808, 1043976, 602152, 610923, 781987, 257783, 174039, 782125, 782208, 1044449, 258032, 520202, 520273, 1016027, 1125919, 1044869, 1059674, 782783, 130467, 258617, 305251, 357777, 1045190, 521026, 130543, 521147, 783335, 259070, 521254, 1045593, 521329, 521342, 521367, 783583, 1045742, 130610, 259437, 783751, 584757, 521791, 1046115, 521869, 1046166, 784039, 784092, 796068, 1046316, 522054, 259924, 522212, 260080, 1135619, 522358, 1046684, 261830, 260302, 1046757, 130776, 1046881, 522821, 611442, 1047406, 523197, 261101, 1048410, 1135796, 261207, 43548, 261295, 864153, 261661, 218384, 786171, 524266, 567976], +} diff --git a/pyserini/resources/beir.yaml b/pyserini/resources/beir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e488ec92f37da2eb4afb9999304704ec3db45938 --- /dev/null +++ b/pyserini/resources/beir.yaml @@ -0,0 +1,741 @@ +conditions: + - name: bm25-flat + command: python -m pyserini.search.lucene --index beir-v1.0.0-${dataset}.flat --topics beir-v1.0.0-${dataset}-test --output $output --output-format trec --batch 36 --threads 12 --hits 1000 --bm25 --remove-query + datasets: + - dataset: trec-covid + scores: + - nDCG@10: 0.5947 + R@100: 0.1091 + R@1000: 0.3955 + - dataset: bioasq + scores: + - nDCG@10: 0.5225 + R@100: 0.7687 + R@1000: 0.9030 + - dataset: nfcorpus + scores: + - nDCG@10: 0.3218 + R@100: 0.2457 + R@1000: 0.3704 + - dataset: nq + scores: + - nDCG@10: 0.3055 + R@100: 0.7513 + R@1000: 0.8958 + - dataset: hotpotqa + scores: + - nDCG@10: 0.6330 + R@100: 0.7957 + R@1000: 0.8820 + - dataset: fiqa + scores: + - nDCG@10: 0.2361 + R@100: 0.5395 + R@1000: 0.7393 + - dataset: signal1m + scores: + - nDCG@10: 0.3304 + R@100: 0.3703 + R@1000: 0.5642 + - dataset: trec-news + scores: + - nDCG@10: 0.3952 + R@100: 0.4469 + R@1000: 0.7051 + - dataset: robust04 + scores: + - nDCG@10: 0.4070 + R@100: 0.3746 + R@1000: 0.6345 + - dataset: arguana + scores: + - nDCG@10: 0.3970 + R@100: 0.9324 + R@1000: 0.9872 + - dataset: webis-touche2020 + scores: + - nDCG@10: 0.4422 + R@100: 0.5822 + R@1000: 0.8621 + - dataset: cqadupstack-android + scores: + - nDCG@10: 0.3801 + R@100: 0.6829 + R@1000: 0.8632 + - dataset: cqadupstack-english + scores: + - nDCG@10: 0.3453 + R@100: 0.5757 + R@1000: 0.7323 + - dataset: cqadupstack-gaming + scores: + - nDCG@10: 0.4822 + R@100: 0.7651 + R@1000: 0.8945 + - dataset: cqadupstack-gis + scores: + - nDCG@10: 0.2901 + R@100: 0.6119 + R@1000: 0.8174 + - dataset: cqadupstack-mathematica + scores: + - nDCG@10: 0.2015 + R@100: 0.4877 + R@1000: 0.7221 + - dataset: cqadupstack-physics + scores: + - nDCG@10: 0.3214 + R@100: 0.6326 + R@1000: 0.8340 + - dataset: cqadupstack-programmers + scores: + - nDCG@10: 0.2802 + R@100: 0.5588 + R@1000: 0.7734 + - dataset: cqadupstack-stats + scores: + - nDCG@10: 0.2711 + R@100: 0.5338 + R@1000: 0.7310 + - dataset: cqadupstack-tex + scores: + - nDCG@10: 0.2244 + R@100: 0.4686 + R@1000: 0.6907 + - dataset: cqadupstack-unix + scores: + - nDCG@10: 0.2749 + R@100: 0.5417 + R@1000: 0.7616 + - dataset: cqadupstack-webmasters + scores: + - nDCG@10: 0.3059 + R@100: 0.5820 + R@1000: 0.8066 + - dataset: cqadupstack-wordpress + scores: + - nDCG@10: 0.2483 + R@100: 0.5152 + R@1000: 0.7552 + - dataset: quora + scores: + - nDCG@10: 0.7886 + R@100: 0.9733 + R@1000: 0.9950 + - dataset: dbpedia-entity + scores: + - nDCG@10: 0.3180 + R@100: 0.4682 + R@1000: 0.6760 + 
- dataset: scidocs + scores: + - nDCG@10: 0.1490 + R@100: 0.3477 + R@1000: 0.5638 + - dataset: fever + scores: + - nDCG@10: 0.6513 + R@100: 0.9185 + R@1000: 0.9589 + - dataset: climate-fever + scores: + - nDCG@10: 0.1651 + R@100: 0.4249 + R@1000: 0.6324 + - dataset: scifact + scores: + - nDCG@10: 0.6789 + R@100: 0.9253 + R@1000: 0.9767 + - name: bm25-multifield + command: python -m pyserini.search.lucene --index beir-v1.0.0-${dataset}.multifield --topics beir-v1.0.0-${dataset}-test --output $output --output-format trec --batch 36 --threads 12 --hits 1000 --bm25 --remove-query --fields contents=1.0 title=1.0 + datasets: + - dataset: trec-covid + scores: + - nDCG@10: 0.6559 + R@100: 0.1141 + R@1000: 0.3891 + - dataset: bioasq + scores: + - nDCG@10: 0.4646 + R@100: 0.7145 + R@1000: 0.8428 + - dataset: nfcorpus + scores: + - nDCG@10: 0.3254 + R@100: 0.2500 + R@1000: 0.3718 + - dataset: nq + scores: + - nDCG@10: 0.3285 + R@100: 0.7597 + R@1000: 0.9019 + - dataset: hotpotqa + scores: + - nDCG@10: 0.6027 + R@100: 0.7400 + R@1000: 0.8405 + - dataset: fiqa + scores: + - nDCG@10: 0.2361 + R@100: 0.5395 + R@1000: 0.7393 + - dataset: signal1m + scores: + - nDCG@10: 0.3304 + R@100: 0.3703 + R@1000: 0.5642 + - dataset: trec-news + scores: + - nDCG@10: 0.3977 + R@100: 0.4216 + R@1000: 0.6993 + - dataset: robust04 + scores: + - nDCG@10: 0.4070 + R@100: 0.3746 + R@1000: 0.6345 + - dataset: arguana + scores: + - nDCG@10: 0.4142 + R@100: 0.9431 + R@1000: 0.9893 + - dataset: webis-touche2020 + scores: + - nDCG@10: 0.3673 + R@100: 0.5376 + R@1000: 0.8668 + - dataset: cqadupstack-android + scores: + - nDCG@10: 0.3709 + R@100: 0.6889 + R@1000: 0.8712 + - dataset: cqadupstack-english + scores: + - nDCG@10: 0.3321 + R@100: 0.5842 + R@1000: 0.7574 + - dataset: cqadupstack-gaming + scores: + - nDCG@10: 0.4418 + R@100: 0.7571 + R@1000: 0.8882 + - dataset: cqadupstack-gis + scores: + - nDCG@10: 0.2904 + R@100: 0.6458 + R@1000: 0.8248 + - dataset: cqadupstack-mathematica + scores: + - nDCG@10: 0.2046 + R@100: 0.5215 + R@1000: 0.7559 + - dataset: cqadupstack-physics + scores: + - nDCG@10: 0.3248 + R@100: 0.6486 + R@1000: 0.8506 + - dataset: cqadupstack-programmers + scores: + - nDCG@10: 0.2963 + R@100: 0.6194 + R@1000: 0.8096 + - dataset: cqadupstack-stats + scores: + - nDCG@10: 0.2790 + R@100: 0.5719 + R@1000: 0.7619 + - dataset: cqadupstack-tex + scores: + - nDCG@10: 0.2086 + R@100: 0.4954 + R@1000: 0.7222 + - dataset: cqadupstack-unix + scores: + - nDCG@10: 0.2788 + R@100: 0.5721 + R@1000: 0.7783 + - dataset: cqadupstack-webmasters + scores: + - nDCG@10: 0.3008 + R@100: 0.6100 + R@1000: 0.8226 + - dataset: cqadupstack-wordpress + scores: + - nDCG@10: 0.2562 + R@100: 0.5526 + R@1000: 0.7848 + - dataset: quora + scores: + - nDCG@10: 0.7886 + R@100: 0.9733 + R@1000: 0.9950 + - dataset: dbpedia-entity + scores: + - nDCG@10: 0.3128 + R@100: 0.3981 + R@1000: 0.5848 + - dataset: scidocs + scores: + - nDCG@10: 0.1581 + R@100: 0.3561 + R@1000: 0.5599 + - dataset: fever + scores: + - nDCG@10: 0.7530 + R@100: 0.9309 + R@1000: 0.9599 + - dataset: climate-fever + scores: + - nDCG@10: 0.2129 + R@100: 0.4357 + R@1000: 0.6099 + - dataset: scifact + scores: + - nDCG@10: 0.6647 + R@100: 0.9076 + R@1000: 0.9800 + - name: splade-distil-cocodenser-medium + command: python -m pyserini.search.lucene --index beir-v1.0.0-${dataset}-splade_distil_cocodenser_medium --topics beir-v1.0.0-${dataset}-test-splade_distil_cocodenser_medium --output $output --output-format trec --batch 36 --threads 12 --hits 1000 --impact --remove-query + datasets: + 
- dataset: trec-covid + scores: + - nDCG@10: 0.7109 + R@100: 0.1308 + R@1000: 0.4433 + - dataset: bioasq + scores: + - nDCG@10: 0.5035 + R@100: 0.7422 + R@1000: 0.8904 + - dataset: nfcorpus + scores: + - nDCG@10: 0.3454 + R@100: 0.2891 + R@1000: 0.5694 + - dataset: nq + scores: + - nDCG@10: 0.5442 + R@100: 0.9285 + R@1000: 0.9812 + - dataset: hotpotqa + scores: + - nDCG@10: 0.6860 + R@100: 0.8144 + R@1000: 0.8945 + - dataset: fiqa + scores: + - nDCG@10: 0.3514 + R@100: 0.6298 + R@1000: 0.8323 + - dataset: signal1m + scores: + - nDCG@10: 0.2957 + R@100: 0.3311 + R@1000: 0.5514 + - dataset: trec-news + scores: + - nDCG@10: 0.3936 + R@100: 0.4323 + R@1000: 0.6977 + - dataset: robust04 + scores: + - nDCG@10: 0.4581 + R@100: 0.3773 + R@1000: 0.6099 + - dataset: arguana + scores: + - nDCG@10: 0.5210 + R@100: 0.9822 + R@1000: 0.9950 + - dataset: webis-touche2020 + scores: + - nDCG@10: 0.2435 + R@100: 0.4723 + R@1000: 0.8116 + - dataset: cqadupstack-android + scores: + - nDCG@10: 0.3954 + R@100: 0.7405 + R@1000: 0.9035 + - dataset: cqadupstack-english + scores: + - nDCG@10: 0.4026 + R@100: 0.6768 + R@1000: 0.8346 + - dataset: cqadupstack-gaming + scores: + - nDCG@10: 0.5061 + R@100: 0.8138 + R@1000: 0.9253 + - dataset: cqadupstack-gis + scores: + - nDCG@10: 0.3223 + R@100: 0.6419 + R@1000: 0.8385 + - dataset: cqadupstack-mathematica + scores: + - nDCG@10: 0.2423 + R@100: 0.5732 + R@1000: 0.7848 + - dataset: cqadupstack-physics + scores: + - nDCG@10: 0.3668 + R@100: 0.7286 + R@1000: 0.8931 + - dataset: cqadupstack-programmers + scores: + - nDCG@10: 0.3412 + R@100: 0.6653 + R@1000: 0.8451 + - dataset: cqadupstack-stats + scores: + - nDCG@10: 0.3142 + R@100: 0.5889 + R@1000: 0.7823 + - dataset: cqadupstack-tex + scores: + - nDCG@10: 0.2575 + R@100: 0.5231 + R@1000: 0.7372 + - dataset: cqadupstack-unix + scores: + - nDCG@10: 0.3292 + R@100: 0.6192 + R@1000: 0.8225 + - dataset: cqadupstack-webmasters + scores: + - nDCG@10: 0.3343 + R@100: 0.6404 + R@1000: 0.8767 + - dataset: cqadupstack-wordpress + scores: + - nDCG@10: 0.2839 + R@100: 0.5974 + R@1000: 0.8036 + - dataset: quora + scores: + - nDCG@10: 0.8136 + R@100: 0.9817 + R@1000: 0.9979 + - dataset: dbpedia-entity + scores: + - nDCG@10: 0.4416 + R@100: 0.5636 + R@1000: 0.7774 + - dataset: scidocs + scores: + - nDCG@10: 0.1590 + R@100: 0.3671 + R@1000: 0.5891 + - dataset: fever + scores: + - nDCG@10: 0.7962 + R@100: 0.9550 + R@1000: 0.9751 + - dataset: climate-fever + scores: + - nDCG@10: 0.2276 + R@100: 0.5140 + R@1000: 0.7084 + - dataset: scifact + scores: + - nDCG@10: 0.6992 + R@100: 0.9270 + R@1000: 0.9767 + - name: contriever + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/contriever --index beir-v1.0.0-${dataset}.contriever --topics beir-v1.0.0-${dataset}-test --output $output --batch 128 --threads 16 --hits 1000 --remove-query + datasets: + - dataset: trec-covid + scores: + - nDCG@10: 0.2732 + R@100: 0.0368 + R@1000: 0.1675 + - dataset: bioasq + scores: + - nDCG@10: 0.3016 + R@100: 0.5412 + R@1000: 0.7396 + - dataset: nfcorpus + scores: + - nDCG@10: 0.3173 + R@100: 0.2943 + R@1000: 0.6232 + - dataset: nq + scores: + - nDCG@10: 0.2536 + R@100: 0.7712 + R@1000: 0.9286 + - dataset: hotpotqa + scores: + - nDCG@10: 0.4807 + R@100: 0.7046 + R@1000: 0.8294 + - dataset: fiqa + scores: + - nDCG@10: 0.2449 + R@100: 0.5619 + R@1000: 0.8215 + - dataset: signal1m + scores: + - nDCG@10: 0.2338 + R@100: 0.2568 + R@1000: 0.4757 + - dataset: trec-news + scores: + - nDCG@10: 0.3484 + R@100: 0.4234 + R@1000: 0.7389 + - 
dataset: robust04 + scores: + - nDCG@10: 0.3155 + R@100: 0.2757 + R@1000: 0.5097 + - dataset: arguana + scores: + - nDCG@10: 0.3791 + R@100: 0.9011 + R@1000: 0.9851 + - dataset: webis-touche2020 + scores: + - nDCG@10: 0.1668 + R@100: 0.3736 + R@1000: 0.7144 + - dataset: cqadupstack-android + scores: + - nDCG@10: 0.3771 + R@100: 0.7436 + R@1000: 0.9173 + - dataset: cqadupstack-english + scores: + - nDCG@10: 0.3571 + R@100: 0.6442 + R@1000: 0.8042 + - dataset: cqadupstack-gaming + scores: + - nDCG@10: 0.4597 + R@100: 0.8092 + R@1000: 0.9354 + - dataset: cqadupstack-gis + scores: + - nDCG@10: 0.2411 + R@100: 0.5792 + R@1000: 0.8018 + - dataset: cqadupstack-mathematica + scores: + - nDCG@10: 0.1841 + R@100: 0.5127 + R@1000: 0.7757 + - dataset: cqadupstack-physics + scores: + - nDCG@10: 0.3430 + R@100: 0.7013 + R@1000: 0.8980 + - dataset: cqadupstack-programmers + scores: + - nDCG@10: 0.3029 + R@100: 0.6402 + R@1000: 0.8434 + - dataset: cqadupstack-stats + scores: + - nDCG@10: 0.2483 + R@100: 0.5269 + R@1000: 0.7417 + - dataset: cqadupstack-tex + scores: + - nDCG@10: 0.1540 + R@100: 0.4333 + R@1000: 0.6870 + - dataset: cqadupstack-unix + scores: + - nDCG@10: 0.2636 + R@100: 0.5879 + R@1000: 0.8212 + - dataset: cqadupstack-webmasters + scores: + - nDCG@10: 0.2878 + R@100: 0.6485 + R@1000: 0.8800 + - dataset: cqadupstack-wordpress + scores: + - nDCG@10: 0.1914 + R@100: 0.5364 + R@1000: 0.7551 + - dataset: quora + scores: + - nDCG@10: 0.8349 + R@100: 0.9871 + R@1000: 0.9981 + - dataset: dbpedia-entity + scores: + - nDCG@10: 0.2916 + R@100: 0.4529 + R@1000: 0.7142 + - dataset: scidocs + scores: + - nDCG@10: 0.1491 + R@100: 0.3601 + R@1000: 0.6105 + - dataset: fever + scores: + - nDCG@10: 0.6821 + R@100: 0.9356 + R@1000: 0.9655 + - dataset: climate-fever + scores: + - nDCG@10: 0.1550 + R@100: 0.4422 + R@1000: 0.7232 + - dataset: scifact + scores: + - nDCG@10: 0.6493 + R@100: 0.9260 + R@1000: 0.9967 + - name: contriever-msmarco + command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/contriever-msmarco --index beir-v1.0.0-${dataset}.contriever-msmarco --topics beir-v1.0.0-${dataset}-test --output $output --batch 128 --threads 16 --hits 1000 --remove-query + datasets: + - dataset: trec-covid + scores: + - nDCG@10: 0.5964 + R@100: 0.0907 + R@1000: 0.3351 + - dataset: bioasq + scores: + - nDCG@10: 0.3829 + R@100: 0.6072 + R@1000: 0.7666 + - dataset: nfcorpus + scores: + - nDCG@10: 0.3281 + R@100: 0.3008 + R@1000: 0.6305 + - dataset: nq + scores: + - nDCG@10: 0.4977 + R@100: 0.9252 + R@1000: 0.986 + - dataset: hotpotqa + scores: + - nDCG@10: 0.6376 + R@100: 0.7772 + R@1000: 0.8718 + - dataset: fiqa + scores: + - nDCG@10: 0.3293 + R@100: 0.6558 + R@1000: 0.8695 + - dataset: signal1m + scores: + - nDCG@10: 0.2783 + R@100: 0.322 + R@1000: 0.5419 + - dataset: trec-news + scores: + - nDCG@10: 0.4283 + R@100: 0.4924 + R@1000: 0.7752 + - dataset: robust04 + scores: + - nDCG@10: 0.4729 + R@100: 0.3917 + R@1000: 0.6552 + - dataset: arguana + scores: + - nDCG@10: 0.4461 + R@100: 0.9765 + R@1000: 0.9964 + - dataset: webis-touche2020 + scores: + - nDCG@10: 0.204 + R@100: 0.442 + R@1000: 0.829 + - dataset: cqadupstack-android + scores: + - nDCG@10: 0.4255 + R@100: 0.7503 + R@1000: 0.9304 + - dataset: cqadupstack-english + scores: + - nDCG@10: 0.4326 + R@100: 0.6935 + R@1000: 0.8435 + - dataset: cqadupstack-gaming + scores: + - nDCG@10: 0.5276 + R@100: 0.8481 + R@1000: 0.9427 + - dataset: cqadupstack-gis + scores: + - nDCG@10: 0.3022 + R@100: 0.6272 + R@1000: 0.8417 + - dataset: 
cqadupstack-mathematica + scores: + - nDCG@10: 0.2355 + R@100: 0.5726 + R@1000: 0.7995 + - dataset: cqadupstack-physics + scores: + - nDCG@10: 0.4159 + R@100: 0.7619 + R@1000: 0.9162 + - dataset: cqadupstack-programmers + scores: + - nDCG@10: 0.3574 + R@100: 0.7191 + R@1000: 0.8878 + - dataset: cqadupstack-stats + scores: + - nDCG@10: 0.3095 + R@100: 0.586 + R@1000: 0.7805 + - dataset: cqadupstack-tex + scores: + - nDCG@10: 0.2209 + R@100: 0.4985 + R@1000: 0.7348 + - dataset: cqadupstack-unix + scores: + - nDCG@10: 0.3257 + R@100: 0.6161 + R@1000: 0.8373 + - dataset: cqadupstack-webmasters + scores: + - nDCG@10: 0.3392 + R@100: 0.7032 + R@1000: 0.8956 + - dataset: cqadupstack-wordpress + scores: + - nDCG@10: 0.2532 + R@100: 0.5769 + R@1000: 0.7929 + - dataset: quora + scores: + - nDCG@10: 0.8648 + R@100: 0.9935 + R@1000: 0.9994 + - dataset: dbpedia-entity + scores: + - nDCG@10: 0.4128 + R@100: 0.5414 + R@1000: 0.7751 + - dataset: scidocs + scores: + - nDCG@10: 0.1652 + R@100: 0.3783 + R@1000: 0.6216 + - dataset: fever + scores: + - nDCG@10: 0.7583 + R@100: 0.9494 + R@1000: 0.9705 + - dataset: climate-fever + scores: + - nDCG@10: 0.2371 + R@100: 0.5746 + R@1000: 0.8019 + - dataset: scifact + scores: + - nDCG@10: 0.6768 + R@100: 0.947 + R@1000: 0.9833 diff --git a/pyserini/resources/index-metadata/faiss-flat.wiki-all-6-3.dpr2-multi-retriever.20230103.186fa7.README.md b/pyserini/resources/index-metadata/faiss-flat.wiki-all-6-3.dpr2-multi-retriever.20230103.186fa7.README.md new file mode 100644 index 0000000000000000000000000000000000000000..549d7c4e804f57ed27cc6eb795040ba57940bf7b --- /dev/null +++ b/pyserini/resources/index-metadata/faiss-flat.wiki-all-6-3.dpr2-multi-retriever.20230103.186fa7.README.md @@ -0,0 +1,19 @@ +# wiki-all-6-3-dpr2-multi + +Faiss FlatIP index of wiki-all-6-3 (https://huggingface.co/datasets/castorini/odqa-wiki-corpora) encoded by a 2nd iteration DPR model trained on multiple QA datasets (castorini/wiki-all-6-3-multi-dpr2-passage-encoder). +This index was generated on 2023/01/03 on `narval` at commits: + ++ Pyserini commit ['186fa7'](https://github.com/castorini/pyserini/commit/186fa793867f7572d62dc323322ba92926f12ce4) (2023/01/03) ++ [Tevatron](https://github.com/texttron/tevatron) commit [`7a5afe`](https://github.com/texttron/tevatron/commit/7a5afedb5893009154a0e915a2597e1a95e9d2a8) (2023/01/03) + +with the following command to generate the embeddings (from Tevatron repo): + +```bash +python -m tevatron.driver.jax_encode \ + --output_dir=temp \ + --model_name_or_path wiki-all-6-3-multi-dpr2-passage-encoder \ + --per_device_eval_batch_size 1248 \ + --dataset_name wiki_all_6_3.jsonl \ + --encoded_save_path corpus_emb.pkl \ + --p_max_len 256 +``` diff --git a/pyserini/resources/index-metadata/faiss-flat.wikipedia.dkrr-dpr-nq-retriever.20220217.25ed1f.cc91b2.README.md b/pyserini/resources/index-metadata/faiss-flat.wikipedia.dkrr-dpr-nq-retriever.20220217.25ed1f.cc91b2.README.md new file mode 100644 index 0000000000000000000000000000000000000000..faa97b4a33f5dd414332071bd635e9885f3a6cd6 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss-flat.wikipedia.dkrr-dpr-nq-retriever.20220217.25ed1f.cc91b2.README.md @@ -0,0 +1,27 @@ +# wikipedia-dpr-dkrr-nq + +Faiss FlatIP index of Wikipedia DPR encoded by the retriever model from [Distilling Knowledge from Reader to Retriever for Question Answering](https://arxiv.org/abs/2012.04584) trained on NQ. 
+This index was generated on 2022/02/17 on `orca` at commits: + ++ Pyserini commit [`cc91b2`](https://github.com/castorini/pyserini/commit/cc91b22f549702068cea1283f91b31d28d127b2f) (2022/02/17) ++ [FiD](https://github.com/facebookresearch/FiD) commit [`25ed1f`](https://github.com/facebookresearch/FiD/commit/25ed1ff0fe0288b80fb5e9e5de8d6346b94b8d48) (2022/02/17) + +with the following command to generate the embeddings (from FiD repo): + +```bash +python generate_passage_embeddings.py \ + --model_path nq_retriever \ + --passages passages.tsv \ + --output_path wikipedia_embeddings_nq \ + --shard_id 0 \ + --num_shards 1 \ + --per_gpu_batch_size 500 +``` + +and the following command to convert the embeddings to faiss IndexFlatIP form: + +```bash +python convert_dkrr_embeddings_to_faiss.py \ + --embeddings wikipedia_embeddings_nq \ + --output faiss-flat.wikipedia.dkrr-dpr-nq-retriever +``` diff --git a/pyserini/resources/index-metadata/faiss-flat.wikipedia.dkrr-dpr-tqa-retriever.20220217.25ed1f.cc91b2.README.md b/pyserini/resources/index-metadata/faiss-flat.wikipedia.dkrr-dpr-tqa-retriever.20220217.25ed1f.cc91b2.README.md new file mode 100644 index 0000000000000000000000000000000000000000..2c1cc89fab10436c8e8266dffb1f1dc3a0d2e305 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss-flat.wikipedia.dkrr-dpr-tqa-retriever.20220217.25ed1f.cc91b2.README.md @@ -0,0 +1,27 @@ +# wikipedia-dpr-dkrr-tqa + +Faiss FlatIP index of Wikipedia DPR encoded by the retriever model from [Distilling Knowledge from Reader to Retriever for Question Answering](https://arxiv.org/abs/2012.04584) trained on TriviaQA. +This index was generated on 2022/02/17 on `orca` at commits: + ++ Pyserini commit [`cc91b2`](https://github.com/castorini/pyserini/commit/cc91b22f549702068cea1283f91b31d28d127b2f) (2022/02/17) ++ [FiD](https://github.com/facebookresearch/FiD) commit [`25ed1f`](https://github.com/facebookresearch/FiD/commit/25ed1ff0fe0288b80fb5e9e5de8d6346b94b8d48) (2022/02/17) + +with the following command to generate the embeddings (from FiD repo): + +```bash +python generate_passage_embeddings.py \ + --model_path tqa_retriever \ + --passages passages.tsv \ + --output_path wikipedia_embeddings_tqa \ + --shard_id 0 \ + --num_shards 1 \ + --per_gpu_batch_size 500 +``` + +and the following command to convert the embeddings to faiss IndexFlatIP form: + +```bash +python convert_dkrr_embeddings_to_faiss.py \ + --embeddings wikipedia_embeddings_tqa \ + --output faiss-flat.wikipedia.dkrr-dpr-tqa-retriever +``` diff --git a/pyserini/resources/index-metadata/faiss-hnsw.cast2019.tct_colbert-v2-readme.txt b/pyserini/resources/index-metadata/faiss-hnsw.cast2019.tct_colbert-v2-readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fb3cd09f7c5b26b29a839d3f3c4bf5fdbe2cfdc --- /dev/null +++ b/pyserini/resources/index-metadata/faiss-hnsw.cast2019.tct_colbert-v2-readme.txt @@ -0,0 +1,5 @@ +This faiss hnsw index was generated on 2021/10/23 using the repo https://github.com/castorini/CQE (see command Inference section) on Orca (Passage encoding on graham). + +Both the hyperparameter M and efConstruction are set to 256. 
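For reference, here is a minimal sketch of how an HNSW index with these settings could be built directly in Faiss (assumptions: 768-dimensional TCT-ColBERT-v2 passage embeddings have already been computed and saved to a hypothetical `embeddings.npy`, and inner-product similarity is used):

```python
import faiss
import numpy as np

# Hypothetical input: precomputed passage embeddings, shape (num_passages, 768), float32.
embeddings = np.load('embeddings.npy').astype('float32')

# HNSW index with the hyperparameters described above: M = 256, efConstruction = 256.
index = faiss.IndexHNSWFlat(embeddings.shape[1], 256, faiss.METRIC_INNER_PRODUCT)
index.hnsw.efConstruction = 256
index.add(embeddings)

faiss.write_index(index, 'index')
```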
+Note that in the future the index name should be renamed as faiss-hnsw.cast2019.tct_colbert-v2 + diff --git a/pyserini/resources/index-metadata/faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md b/pyserini/resources/index-metadata/faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md new file mode 100644 index 0000000000000000000000000000000000000000..5da58d8377d4842a6192028bf24e9f6f1d15f22c --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.beir-v1.0.0.contriever-msmarco.20230124.README.md @@ -0,0 +1,19 @@ +# BEIR v1.0.0 contriever-msmarco + +This index was generated on 20230124 using Tevatron with following command: + +``` +python -m tevatron.driver.encode \ +--output_dir=temp \ +--model_name_or_path facebook/contriever-msmarco \ +--fp16 \ +--tokenizer_name bert-base-uncased \ +--per_device_eval_batch_size 156 \ +--p_max_len 512 \ +--dataset_name Tevatron/beir-corpus:$subdataset \ +--encoded_save_path beir_embeddings/corpus_emb.$subdataset.pkl +``` + +where the `subdataset` is one of the BEIR dataset, e.g. `scifact`. + +The Embedding is then converted to Pyserini index format. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.beir-v1.0.0.contriever.20230124.README.md b/pyserini/resources/index-metadata/faiss.beir-v1.0.0.contriever.20230124.README.md new file mode 100644 index 0000000000000000000000000000000000000000..761bf6627edc5411e3254ae6169129f344284372 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.beir-v1.0.0.contriever.20230124.README.md @@ -0,0 +1,19 @@ +# BEIR v1.0.0 contriever + +This index was generated on 20230124 using Tevatron with following command: + +``` +python -m tevatron.driver.encode \ +--output_dir=temp \ +--model_name_or_path facebook/contriever \ +--fp16 \ +--tokenizer_name bert-base-uncased \ +--per_device_eval_batch_size 156 \ +--p_max_len 512 \ +--dataset_name Tevatron/beir-corpus:$subdataset \ +--encoded_save_path beir_embeddings/corpus_emb.$subdataset.pkl +``` + +where the `subdataset` is one of the BEIR dataset, e.g. `scifact`. + +The Embedding is then converted to Pyserini index format. 
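The conversion step is not spelled out here; a minimal sketch of what it could look like, mirroring the `convert_index.py` script shown for the MIRACL indexes further down (all paths are hypothetical):

```python
import os
import pickle

import faiss
import numpy as np

emb_path = 'beir_embeddings/corpus_emb.scifact.pkl'   # hypothetical Tevatron output
out_dir = 'faiss.beir-v1.0.0-scifact.contriever'      # hypothetical Pyserini index directory
os.makedirs(out_dir, exist_ok=True)

# Tevatron saves a (representations, docid lookup) pair per pickle.
with open(emb_path, 'rb') as f:
    reps, docids = pickle.load(f)
reps = np.asarray(reps, dtype='float32')

# Pyserini's Faiss index layout: an `index` file plus a `docid` file with one id per line.
index = faiss.IndexFlatIP(reps.shape[1])
index.add(reps)
faiss.write_index(index, os.path.join(out_dir, 'index'))

with open(os.path.join(out_dir, 'docid'), 'w') as f:
    for docid in docids:
        f.write(f'{docid}\n')
```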
\ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md b/pyserini/resources/index-metadata/faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md new file mode 100644 index 0000000000000000000000000000000000000000..31b70df5f4ee3a9dc5353a096b50b0be142e9a25 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco-ft-all.README.md @@ -0,0 +1,24 @@ +# miracl-v1.0-mdpr-tied-pft-msmarco-ft-all + +This index was generated on 2022/10/04 at Pyserini commit [`2b2856`](https://github.com/castorini/pyserini/commit/2b2856a9037c11061470cbf3d0961c7d041f1342) on `basilisk` with the following command: + +``` +corpus=./corpus/miracl-corpus-v1.0-${lang} + +encoder=castorini/mdpr-tied-pft-msmarco-ft-all +shard_id=0 +shard_num=1 + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + --shard-id $shard_id \ + --shard-num $shard_num \ + output --embeddings $index_dir-$shard_id \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --encoder-class 'auto' \ + --fp16 +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md b/pyserini/resources/index-metadata/faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md new file mode 100644 index 0000000000000000000000000000000000000000..90686de22d6609ea339163a2020cef3956249e13 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.miracl-v1.0.20221004.2b2856.mdpr-tied-pft-msmarco.README.md @@ -0,0 +1,24 @@ +# miracl-v1.0-mdpr-tied-pft-msmarco + +This index was generated on 2022/10/04 at Pyserini commit [`2b2856`](https://github.com/castorini/pyserini/commit/2b2856a9037c11061470cbf3d0961c7d041f1342) on `basilisk` with the following command: + +``` +corpus=./corpus/miracl-corpus-v1.0-${lang} + +encoder=castorini/mdpr-tied-pft-msmarco +shard_id=0 +shard_num=1 + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + --shard-id $shard_id \ + --shard-num $shard_num \ + output --embeddings $index_dir-$shard_id \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --encoder-class 'auto' \ + --fp16 +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md b/pyserini/resources/index-metadata/faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md new file mode 100644 index 0000000000000000000000000000000000000000..48d86c12fd33aa2910284accf87cb897b46df215 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.miracl-v1.0.20230313.e40d4a.mcontriever-tied-pft-msmarco.README.md @@ -0,0 +1,42 @@ +# miracl-v1.0-mdpr-tied-pft-msmarco-ft-miracl-${lang} + +This index was generated on 2023/03/13 on commit 20230313. 
+ +## Index from Pyserini +```bash +lang=ar # or any lang abbreviation + +encoder=facebook/mcontriever-msmarco +index_dir=faiss.miracl-v1.0-$lang.mcontriever-tied-pft-msmarco.20230313.e40d4a +echo $index_dir + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + --shard-id $shard_id \ + --shard-num $shard_num \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --encoder-class contriever \ + --fp16 +``` + +## To use as Search +``` +index= +output=run.miracl.mdpr-tied-pft-msmarco.$lang.dev.txt + +python -m pyserini.search.faiss \ + --encoder-class contriever \ + --encoder facebook/mcontriever-msmarco \ + --topics miracl-v1.0-$lang-dev \ + --index miracl-v1.0-$lang-mcontriever-pft-msmarco \ + --output $output \ + --batch 128 --threads 16 --hits 100 +``` + + +python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-$lang-dev --index miracl-v1.0-$lang-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 100 \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md b/pyserini/resources/index-metadata/faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd131f1999b230bd209feb2338372c0ffe14746b --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.miracl-v1.0.mdpr-tied-pft-msmarco-ft-miracl.20230329.e40d4a.README.md @@ -0,0 +1,142 @@ +# miracl-v1.0-mdpr-tied-pft-msmarco-ft-miracl-${lang} + +This index was generated on 2023/03/21 using [tevatron](https://github.com/texttron/tevatron) with the following commands: + +## Create Train Directory + +> **`create_train_dir.py`** +> ```python +> import json +> from pyserini.search.lucene import LuceneSearcher +> from datasets import load_dataset +> from random import shuffle +> from tqdm import tqdm +> +> searcher = LuceneSearcher.from_prebuilt_index('miracl-v1.0-${lang}') +> searcher.set_language('${lang}') +> +> miracl_train = load_dataset('miracl/miracl', '${lang}', split='train') +> with open('miracl_train_bm25_neg_top100_random30.${lang}.jsonl', 'w') as f: +> for data in tqdm(miracl_train): +> query = data['query'] +> positives = data['positive_passages'] +> negatives = data['negative_passages'] +> positive_ids = [p['docid'] for p in positives] +> negative_ids = [p['docid'] for p in negatives] +> hits = searcher.search(query, k=100) +> bm25_negatives = [] +> for hit in hits: +> info = json.loads(hit.raw) +> if info['docid'] not in positive_ids and info['docid'] not in negative_ids: +> bm25_negatives.append(info) +> all_negatives = negatives + bm25_negatives +> shuffle(all_negatives) +> random_30_negatives = all_negatives[:30] +> data['negative_passages'] = random_30_negatives +> if len(random_30_negatives) > 0: +> f.write(json.dumps(data, ensure_ascii=False)+'\n') +> ``` + +```bash +python create_train_dir.py +``` + +## Train +```bash +CUDA_VISIBLE_DEVICES=0 python -m tevatron.driver.train \ + --output_dir model_miracl_${lang} \ + --model_name_or_path castorini/mdpr-tied-pft-msmarco \ + --tokenizer_name bert-base-multilingual-cased \ + --save_steps 20000 \ + --dataset_name Tevatron/msmarco-passage \ + --per_device_train_batch_size 64 \ + --train_dir miracl_train_bm25_neg_top100_random30.${lang}.jsonl \ + --train_n_passages 2 \ + --learning_rate 1e-5 \ + --q_max_len 
32 \ + --p_max_len 256 \ + --num_train_epochs 40 \ + --logging_steps 10 \ + --overwrite_output_dir \ + --fp16 +``` + +## Encode Corpus +```bash +CUDA_VISIBLE_DEVICES=0 python -m tevatron.driver.encode \ + --output_dir=temp_out \ + --model_name_or_path model_miracl_${lang} \ + --fp16 \ + --per_device_eval_batch_size 256 \ + --dataset_name miracl/miracl-corpus:${lang} \ + --p_max_len 256 \ + --encoded_save_path model_miracl_${lang}_corpus/${lang}_corpus_emb.pt +``` + +## Convert Index + +> #### **`convert_index.py`** +> ```python +> import numpy as np +> import faiss +> import pickle +> import os +> from tqdm import tqdm +> import argparse +> +> parser = argparse.ArgumentParser() +> parser.add_argument('--input', type=str, required=True) +> parser.add_argument('--output', type=str, required=True) +> args = parser.parse_args() +> +> def pickle_load(path): +> with open(path, 'rb') as f: +> reps, lookup = pickle.load(f) +> return np.array(reps), lookup +> +> index = faiss.IndexFlatIP(768) +> +> all_ids = [] +> for name in tqdm(os.listdir(args.input)): +> if 'corpus_emb' not in name: +> continue +> path = os.path.join(args.input, name) +> reps, ids = pickle_load(path) +> all_ids.extend(ids) +> index.add(reps) +> +> faiss.write_index(index, f'{args.output}/index') +> with open(f'{args.output}/docid', 'w') as f: +> for i in all_ids: +> f.write(f'{i}\n') +> ``` + +```bash +python test.py --input=model_miracl_${lang}_corpus --output=${lang}_index +``` + + +## Index from Pyserini +Tested to use the same checkpoint to index directly via Pyserini using the following command, got the same score. (on basilisk) +(only tested on Swahili) +```bash +encoder=castorini/mdpr-tied-pft-msmarco-ft-miracl-$lang + +index_dir=miracl-v1.0-$lang-mdpr-tied-pft-msmarco-ft-miracl-$lang +echo $index_dir + + +CUDA_VISIBLE_DEVICES=1 \ +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + --shard-id $shard_id \ + --shard-num $shard_num \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --encoder-class 'auto' \ + --fp16 +``` diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-arabic.20220207.5df364.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-arabic.20220207.5df364.README.md new file mode 100644 index 0000000000000000000000000000000000000000..9d9fecd2e88ac3edbeb8c25b9c7383e0433075ff --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-arabic.20220207.5df364.README.md @@ -0,0 +1,46 @@ +# mrtydi-v1.1-arabic + +Faiss flat index for Mr.TyDi v1.1 (Arabic), using mDPR fine-tuned on NQ. + +This index was generated on 2022/02/07 at commit [5df364](https://github.com/castorini/pyserini/commit/5df3649b128ece125ce8a9171ed4001ce3a6ef23) on `narval` with the following command: + +```bash +lang=arabic + +tarfn=mrtydi-v1.1-$lang.tar.gz +encoder=models/mdpr-context-encoder +corpus=mrtydi-v1.1-$lang/collection/docs.jsonl +index_dir=mrtydi-mdpr-dindex/$lang + +wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn +tar –xvf $tarfn +gzip -cvf $corpus.gz > $corpus + +mkdir -p $index_dir + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --fp16 +``` + +Note that the delimiter was manually changed from "`\n`" into "`\n\n`" in `pyserini.encode`. 
+This was later generalized into a command-line option in [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104). + +Here's a sample retrieval command (on the test set): + +```bash +set_name=test +python -m pyserini.dsearch \ + --encoder castorini/mdpr-question-nq \ + --topics mrtydi-v1.1-${lang}-${set_name} \ + --index ${index_dir} \ + --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt + --batch-size 36 \ + --threads 12 +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-bengali.20220207.5df364.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-bengali.20220207.5df364.README.md new file mode 100644 index 0000000000000000000000000000000000000000..f585aa07dcf732f7404565caa611e14fd58b724f --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-bengali.20220207.5df364.README.md @@ -0,0 +1,46 @@ +# mrtydi-v1.1-bengali + +Faiss flat index for Mr.TyDi v1.1 (Bengali), using mDPR fine-tuned on NQ. + +This index was generated on 2022/02/07 at commit [5df364](https://github.com/castorini/pyserini/commit/5df3649b128ece125ce8a9171ed4001ce3a6ef23) on `narval` with the following command: + +```bash +lang=bengali + +tarfn=mrtydi-v1.1-$lang.tar.gz +encoder=models/mdpr-context-encoder +corpus=mrtydi-v1.1-$lang/collection/docs.jsonl +index_dir=mrtydi-mdpr-dindex/$lang + +wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn +tar –xvf $tarfn +gzip -cvf $corpus.gz > $corpus + +mkdir -p $index_dir + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --fp16 +``` + +Note that the delimiter was manually changed from "`\n`" into "`\n\n`" in `pyserini.encode`. +This was later generalized into a command-line option in [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104). + +Here's a sample retrieval command (on the test set): + +```bash +set_name=test +python -m pyserini.dsearch \ + --encoder castorini/mdpr-question-nq \ + --topics mrtydi-v1.1-${lang}-${set_name} \ + --index ${index_dir} \ + --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt + --batch-size 36 \ + --threads 12 +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-english.20220207.5df364.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-english.20220207.5df364.README.md new file mode 100644 index 0000000000000000000000000000000000000000..6190b570e61dc90e76d9209c8ac9fb0004972fc1 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-english.20220207.5df364.README.md @@ -0,0 +1,46 @@ +# mrtydi-v1.1-english + +Faiss flat index for Mr.TyDi v1.1 (English), using mDPR fine-tuned on NQ. 
+ +This index was generated on 2022/02/07 at commit [5df364](https://github.com/castorini/pyserini/commit/5df3649b128ece125ce8a9171ed4001ce3a6ef23) on `narval` with the following command: + +```bash +lang=english + +tarfn=mrtydi-v1.1-$lang.tar.gz +encoder=models/mdpr-context-encoder +corpus=mrtydi-v1.1-$lang/collection/docs.jsonl +index_dir=mrtydi-mdpr-dindex/$lang + +wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn +tar –xvf $tarfn +gzip -cvf $corpus.gz > $corpus + +mkdir -p $index_dir + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --fp16 +``` + +Note that the delimiter was manually changed from "`\n`" into "`\n\n`" in `pyserini.encode`. +This was later generalized into a command-line option in [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104). + +Here's a sample retrieval command (on the test set): + +```bash +set_name=test +python -m pyserini.dsearch \ + --encoder castorini/mdpr-question-nq \ + --topics mrtydi-v1.1-${lang}-${set_name} \ + --index ${index_dir} \ + --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt + --batch-size 36 \ + --threads 12 +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-finnish.20220207.5df364.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-finnish.20220207.5df364.README.md new file mode 100644 index 0000000000000000000000000000000000000000..f04b2d99985cd6583c0b17ece6185a9500b6f541 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-finnish.20220207.5df364.README.md @@ -0,0 +1,46 @@ +# mrtydi-v1.1-finnish + +Faiss flat index for Mr.TyDi v1.1 (Finnish), using mDPR fine-tuned on NQ. + +This index was generated on 2022/02/07 at commit [5df364](https://github.com/castorini/pyserini/commit/5df3649b128ece125ce8a9171ed4001ce3a6ef23) on `narval` with the following command: + +```bash +lang=finnish + +tarfn=mrtydi-v1.1-$lang.tar.gz +encoder=models/mdpr-context-encoder +corpus=mrtydi-v1.1-$lang/collection/docs.jsonl +index_dir=mrtydi-mdpr-dindex/$lang + +wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn +tar –xvf $tarfn +gzip -cvf $corpus.gz > $corpus + +mkdir -p $index_dir + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --fp16 +``` + +Note that the delimiter was manually changed from "`\n`" into "`\n\n`" in `pyserini.encode`. +This was later generalized into a command-line option in [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104). 
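The same retrieval can also be driven from Python instead of the CLI command below; a minimal sketch (assumptions: the locally built `index_dir` from above, and `DprQueryEncoder` as the query-encoder class for `castorini/mdpr-question-nq`):

```python
from pyserini.search.faiss import DprQueryEncoder, FaissSearcher

# Load the locally built flat index together with the mDPR question encoder fine-tuned on NQ.
encoder = DprQueryEncoder('castorini/mdpr-question-nq')
searcher = FaissSearcher('mrtydi-mdpr-dindex/finnish', encoder)

# "What is the capital of Finland?"
hits = searcher.search('Mikä on Suomen pääkaupunki?', k=100)
for i, hit in enumerate(hits[:10]):
    print(f'{i + 1:2} {hit.docid:20} {hit.score:.5f}')
```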
+ +Here's a sample retrieval command (on the test set): + +```bash +set_name=test +python -m pyserini.dsearch \ + --encoder castorini/mdpr-question-nq \ + --topics mrtydi-v1.1-${lang}-${set_name} \ + --index ${index_dir} \ + --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt + --batch-size 36 \ + --threads 12 +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-indonesian.20220207.5df364.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-indonesian.20220207.5df364.README.md new file mode 100644 index 0000000000000000000000000000000000000000..6745b03862b707ce500493da118fade400fed8f0 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-indonesian.20220207.5df364.README.md @@ -0,0 +1,46 @@ +# mrtydi-v1.1-indonesian + +Faiss flat index for Mr.TyDi v1.1 (Indonesian), using mDPR fine-tuned on NQ. + +This index was generated on 2022/02/07 at commit [5df364](https://github.com/castorini/pyserini/commit/5df3649b128ece125ce8a9171ed4001ce3a6ef23) on `narval` with the following command: + +```bash +lang=indonesian + +tarfn=mrtydi-v1.1-$lang.tar.gz +encoder=models/mdpr-context-encoder +corpus=mrtydi-v1.1-$lang/collection/docs.jsonl +index_dir=mrtydi-mdpr-dindex/$lang + +wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn +tar –xvf $tarfn +gzip -cvf $corpus.gz > $corpus + +mkdir -p $index_dir + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --fp16 +``` + +Note that the delimiter was manually changed from "`\n`" into "`\n\n`" in `pyserini.encode`. +This was later generalized into a command-line option in [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104). + +Here's a sample retrieval command (on the test set): + +```bash +set_name=test +python -m pyserini.dsearch \ + --encoder castorini/mdpr-question-nq \ + --topics mrtydi-v1.1-${lang}-${set_name} \ + --index ${index_dir} \ + --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt + --batch-size 36 \ + --threads 12 +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-japanese.20220207.5df364.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-japanese.20220207.5df364.README.md new file mode 100644 index 0000000000000000000000000000000000000000..c7b0e58a79e5909dbf09a77f7936a339de58df78 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-japanese.20220207.5df364.README.md @@ -0,0 +1,46 @@ +# mrtydi-v1.1-japanese + +Faiss flat index for Mr.TyDi v1.1 (Japanese), using mDPR fine-tuned on NQ. 
+ +This index was generated on 2022/02/07 at commit [5df364](https://github.com/castorini/pyserini/commit/5df3649b128ece125ce8a9171ed4001ce3a6ef23) on `narval` with the following command: + +```bash +lang=japanese + +tarfn=mrtydi-v1.1-$lang.tar.gz +encoder=models/mdpr-context-encoder +corpus=mrtydi-v1.1-$lang/collection/docs.jsonl +index_dir=mrtydi-mdpr-dindex/$lang + +wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn +tar –xvf $tarfn +gzip -cvf $corpus.gz > $corpus + +mkdir -p $index_dir + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --fp16 +``` + +Note that the delimiter was manually changed from "`\n`" into "`\n\n`" in `pyserini.encode`. +This was later generalized into a command-line option in [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104). + +Here's a sample retrieval command (on the test set): + +```bash +set_name=test +python -m pyserini.dsearch \ + --encoder castorini/mdpr-question-nq \ + --topics mrtydi-v1.1-${lang}-${set_name} \ + --index ${index_dir} \ + --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt + --batch-size 36 \ + --threads 12 +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-korean.20220207.5df364.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-korean.20220207.5df364.README.md new file mode 100644 index 0000000000000000000000000000000000000000..87451ed03cd7fc1bd7591f36718b87c4a44991d1 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-korean.20220207.5df364.README.md @@ -0,0 +1,46 @@ +# mrtydi-v1.1-korean + +Faiss flat index for Mr.TyDi v1.1 (Korean), using mDPR fine-tuned on NQ. + +This index was generated on 2022/02/07 at commit [5df364](https://github.com/castorini/pyserini/commit/5df3649b128ece125ce8a9171ed4001ce3a6ef23) on `narval` with the following command: + +```bash +lang=korean + +tarfn=mrtydi-v1.1-$lang.tar.gz +encoder=models/mdpr-context-encoder +corpus=mrtydi-v1.1-$lang/collection/docs.jsonl +index_dir=mrtydi-mdpr-dindex/$lang + +wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn +tar –xvf $tarfn +gzip -cvf $corpus.gz > $corpus + +mkdir -p $index_dir + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --fp16 +``` + +Note that the delimiter was manually changed from "`\n`" into "`\n\n`" in `pyserini.encode`. +This was later generalized into a command-line option in [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104). 
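After the indexing command above finishes, the resulting index can be sanity-checked with a few lines of Faiss before running retrieval (a sketch; the paths assume the `index_dir` used above):

```python
import faiss

index_dir = 'mrtydi-mdpr-dindex/korean'  # the index_dir set in the command above

# A Pyserini Faiss index is a directory holding an `index` file and a `docid` file.
index = faiss.read_index(f'{index_dir}/index')
with open(f'{index_dir}/docid') as f:
    num_docids = sum(1 for _ in f)

print(f'dimension: {index.d}, vectors: {index.ntotal}, docids: {num_docids}')
assert index.ntotal == num_docids, 'embedding count and docid count should match'
```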
+ +Here's a sample retrieval command (on the test set): + +```bash +set_name=test +python -m pyserini.dsearch \ + --encoder castorini/mdpr-question-nq \ + --topics mrtydi-v1.1-${lang}-${set_name} \ + --index ${index_dir} \ + --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt + --batch-size 36 \ + --threads 12 +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-russian.20220207.5df364.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-russian.20220207.5df364.README.md new file mode 100644 index 0000000000000000000000000000000000000000..d804cba8f71bece12840497fdb5a8a3eb0e6609e --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-russian.20220207.5df364.README.md @@ -0,0 +1,46 @@ +# mrtydi-v1.1-russian + +Faiss flat index for Mr.TyDi v1.1 (Russian), using mDPR fine-tuned on NQ. + +This index was generated on 2022/02/07 at commit [5df364](https://github.com/castorini/pyserini/commit/5df3649b128ece125ce8a9171ed4001ce3a6ef23) on `narval` with the following command: + +```bash +lang=russian + +tarfn=mrtydi-v1.1-$lang.tar.gz +encoder=models/mdpr-context-encoder +corpus=mrtydi-v1.1-$lang/collection/docs.jsonl +index_dir=mrtydi-mdpr-dindex/$lang + +wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn +tar –xvf $tarfn +gzip -cvf $corpus.gz > $corpus + +mkdir -p $index_dir + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --fp16 +``` + +Note that the delimiter was manually changed from "`\n`" into "`\n\n`" in `pyserini.encode`. +This was later generalized into a command-line option in [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104). + +Here's a sample retrieval command (on the test set): + +```bash +set_name=test +python -m pyserini.dsearch \ + --encoder castorini/mdpr-question-nq \ + --topics mrtydi-v1.1-${lang}-${set_name} \ + --index ${index_dir} \ + --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt + --batch-size 36 \ + --threads 12 +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-swahili.20220207.5df364.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-swahili.20220207.5df364.README.md new file mode 100644 index 0000000000000000000000000000000000000000..3324357be037f2ffaf70852c44abe1aa9cbbbc63 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-swahili.20220207.5df364.README.md @@ -0,0 +1,46 @@ +# mrtydi-v1.1-swahili + +Faiss flat index for Mr.TyDi v1.1 (Swahili), using mDPR fine-tuned on NQ. 
+ +This index was generated on 2022/02/07 at commit [5df364](https://github.com/castorini/pyserini/commit/5df3649b128ece125ce8a9171ed4001ce3a6ef23) on `narval` with the following command: + +```bash +lang=swahili + +tarfn=mrtydi-v1.1-$lang.tar.gz +encoder=models/mdpr-context-encoder +corpus=mrtydi-v1.1-$lang/collection/docs.jsonl +index_dir=mrtydi-mdpr-dindex/$lang + +wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn +tar –xvf $tarfn +gzip -cvf $corpus.gz > $corpus + +mkdir -p $index_dir + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --fp16 +``` + +Note that the delimiter was manually changed from "`\n`" into "`\n\n`" in `pyserini.encode`. +This was later generalized into a command-line option in [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104). + +Here's a sample retrieval command (on the test set): + +```bash +set_name=test +python -m pyserini.dsearch \ + --encoder castorini/mdpr-question-nq \ + --topics mrtydi-v1.1-${lang}-${set_name} \ + --index ${index_dir} \ + --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt + --batch-size 36 \ + --threads 12 +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-telugu.20220207.5df364.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-telugu.20220207.5df364.README.md new file mode 100644 index 0000000000000000000000000000000000000000..3607478d0b00d2bd6fe6926f5ecd45e00d4a67a2 --- /dev/null +++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-telugu.20220207.5df364.README.md @@ -0,0 +1,46 @@ +# mrtydi-v1.1-telugu + +Faiss flat index for Mr.TyDi v1.1 (Telugu), using mDPR fine-tuned on NQ. + +This index was generated on 2022/02/07 at commit [5df364](https://github.com/castorini/pyserini/commit/5df3649b128ece125ce8a9171ed4001ce3a6ef23) on `narval` with the following command: + +```bash +lang=telugu + +tarfn=mrtydi-v1.1-$lang.tar.gz +encoder=models/mdpr-context-encoder +corpus=mrtydi-v1.1-$lang/collection/docs.jsonl +index_dir=mrtydi-mdpr-dindex/$lang + +wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn +tar –xvf $tarfn +gzip -cvf $corpus.gz > $corpus + +mkdir -p $index_dir + +python -m pyserini.encode input --corpus $corpus \ + --fields title text \ + --delimiter "\n\n" \ + output --embeddings $index_dir \ + --to-faiss \ + encoder --encoder $encoder \ + --fields title text \ + --batch 128 \ + --fp16 +``` + +Note that the delimiter was manually changed from "`\n`" into "`\n\n`" in `pyserini.encode`. +This was later generalized into a command-line option in [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104). 
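Once the retrieval command below has produced a run file, it can be scored with Pyserini's `trec_eval` wrapper. A sketch, assuming the qrels are registered under the same `mrtydi-v1.1-${lang}-${set_name}` name as the topics, and using the MRR@100 and Recall@100 metrics conventionally reported for Mr.TyDi:

```bash
set_name=test
python -m pyserini.eval.trec_eval -c -M 100 -m recip_rank \
  mrtydi-v1.1-$lang-$set_name runs/run.mrtydi-v1.1-$lang.${set_name}.txt
python -m pyserini.eval.trec_eval -c -m recall.100 \
  mrtydi-v1.1-$lang-$set_name runs/run.mrtydi-v1.1-$lang.${set_name}.txt
```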
+
+Here's a sample retrieval command (on the test set):
+
+```bash
+set_name=test
+python -m pyserini.dsearch \
+ --encoder castorini/mdpr-question-nq \
+ --topics mrtydi-v1.1-${lang}-${set_name} \
+ --index ${index_dir} \
+ --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt \
+ --batch-size 36 \
+ --threads 12
+```
\ No newline at end of file
diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-thai.20220207.5df364.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-thai.20220207.5df364.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..233d6f7e2ae2e0a98121289f2d5f84265b855951
--- /dev/null
+++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1-thai.20220207.5df364.README.md
@@ -0,0 +1,46 @@
+# mrtydi-v1.1-thai
+
+Faiss flat index for Mr.TyDi v1.1 (Thai), using mDPR fine-tuned on NQ.
+
+This index was generated on 2022/02/07 at commit [5df364](https://github.com/castorini/pyserini/commit/5df3649b128ece125ce8a9171ed4001ce3a6ef23) on `narval` with the following command:
+
+```bash
+lang=thai
+
+tarfn=mrtydi-v1.1-$lang.tar.gz
+encoder=models/mdpr-context-encoder
+corpus=mrtydi-v1.1-$lang/collection/docs.jsonl
+index_dir=mrtydi-mdpr-dindex/$lang
+
+wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn
+tar -xvf $tarfn
+gzip -cvf $corpus.gz > $corpus
+
+mkdir -p $index_dir
+
+python -m pyserini.encode input --corpus $corpus \
+ --fields title text \
+ --delimiter "\n\n" \
+ output --embeddings $index_dir \
+ --to-faiss \
+ encoder --encoder $encoder \
+ --fields title text \
+ --batch 128 \
+ --fp16
+```
+
+Note that the delimiter was manually changed from "`\n`" into "`\n\n`" in `pyserini.encode`.
+This was later generalized into a command-line option in [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104).
+
+Here's a sample retrieval command (on the test set):
+
+```bash
+set_name=test
+python -m pyserini.dsearch \
+ --encoder castorini/mdpr-question-nq \
+ --topics mrtydi-v1.1-${lang}-${set_name} \
+ --index ${index_dir} \
+ --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt \
+ --batch-size 36 \
+ --threads 12
+```
\ No newline at end of file
diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bd327099d367d59521e436ff833b27396f2f5710
--- /dev/null
+++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1.20220413.aa1c0e9.mdpr-tied-pft-msmarco.README.md
@@ -0,0 +1,71 @@
+# mrtydi-v1.1-arabic (trained on MS MARCO)
+
+Faiss flat index for Mr.TyDi v1.1 (Arabic), using mDPR fine-tuned on MS MARCO.
+
+This index was generated on 2022/03/27 at commit [aa1c0e9](https://github.com/castorini/pyserini/commit/aa1c0e9a5bbfab406f8c73d23c91a009307096c6) on `cedar` with the following command:
+
+```bash
+lang=arabic
+
+tarfn=mrtydi-v1.1-$lang.tar.gz
+corpus=mrtydi-v1.1-$lang/collection/docs.jsonl
+index_dir=mrtydi-mdpr-dindex-msmarco/$lang
+
+wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn
+tar -xvf $tarfn
+gzip -cvf $corpus.gz > $corpus
+
+shard_num=1
+encoder=mdpr-mrtydi-0shot-msmarco-tied-encoder-converted
+
+for shard_id in $(seq 0 $((shard_num - 1))) ; do
+ index_dir=mdpr-dindex/$lang-$shard_id
+ mkdir -p $index_dir
+ python -m pyserini.encode input --corpus $corpus \
+ --fields title text \
+ --delimiter "\n\n" \
+ --shard-id $shard_id \
+ --shard-num $shard_num \
+ output --embeddings $index_dir \
+ --to-faiss \
+ encoder --encoder $encoder \
+ --fields title text \
+ --batch 128 \
+ --fp16
+done
+```
+
+Note that the delimiter option is only supported after [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104).
+
+The index can later be reproduced at commit [7b099d5](https://github.com/crystina-z/pyserini/commit/7b099d534901d1f0161982605cd40d039ddb701d) using:
+```
+encoder=castorini/mdpr-tied-pft-msmarco
+index_dir=mdpr-dindex/$lang-$shard_id
+mkdir -p $index_dir
+python -m pyserini.encode input --corpus $corpus \
+ --fields title text \
+ --delimiter "\n\n" \
+ --shard-id $shard_id \
+ --shard-num $shard_num \
+ output --embeddings $index_dir \
+ --to-faiss \
+ encoder --encoder $encoder \
+ --fields title text \
+ --batch 128 \
+ --encoder-class 'auto' \
+ --fp16
+```
+
+Here's a sample retrieval command (on the test set):
+
+```bash
+set_name=test
+python -m pyserini.dsearch \
+ --encoder castorini/mdpr-tied-pft-msmarco \
+ --topics mrtydi-v1.1-${lang}-${set_name} \
+ --index ${index_dir} \
+ --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt \
+ --batch-size 36 \
+ --threads 12 \
+ --encoder-class 'auto'
+```
diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3fc67edad6ee47b0d468b0b6337f36c9c5560ceb
--- /dev/null
+++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1.20220523.7b099d5.mdpr-tied-pft-nq.README.md
@@ -0,0 +1,54 @@
+# mrtydi-v1.1-arabic (trained on NQ)
+
+Faiss flat index for Mr.TyDi v1.1 (Arabic), using mDPR fine-tuned on NQ.
+
+This index was generated on 2022/05/23 at commit [7b099d5](https://github.com/crystina-z/pyserini/commit/7b099d534901d1f0161982605cd40d039ddb701d) on `basilisk` with the following command:
+
+```bash
+lang=arabic # any language in Mr. TyDi
+
+tarfn=mrtydi-v1.1-$lang.tar.gz
+corpus=mrtydi-v1.1-$lang/collection/docs.jsonl
+index_dir=mrtydi-mdpr-dindex-msmarco/$lang
+
+wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn
+tar -xvf $tarfn
+gzip -cvf $corpus.gz > $corpus
+
+shard_num=1
+encoder=castorini/mdpr-tied-pft-nq
+
+for shard_id in $(seq 0 $((shard_num - 1))) ; do
+ index_dir=mdpr-dindex/$lang-$shard_id
+ mkdir -p $index_dir
+ python -m pyserini.encode input --corpus $corpus \
+ --fields title text \
+ --delimiter "\n\n" \
+ --shard-id $shard_id \
+ --shard-num $shard_num \
+ output --embeddings $index_dir \
+ --to-faiss \
+ encoder --encoder $encoder \
+ --fields title text \
+ --batch 128 \
+ --encoder-class 'auto' \
+ --fp16
+done
+```
+
+Note that the delimiter option is only supported after [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104).
+
+
+Here's a sample retrieval command (on the test set):
+
+```bash
+set_name=test
+python -m pyserini.search.faiss \
+ --encoder castorini/mdpr-tied-pft-nq \
+ --topics mrtydi-v1.1-${lang}-${set_name} \
+ --index ${index_dir} \
+ --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt \
+ --batch-size 36 \
+ --threads 12 \
+ --encoder-class 'auto'
+```
diff --git a/pyserini/resources/index-metadata/faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ae5b26c5aa9df37841358aab566e38b55f98eca
--- /dev/null
+++ b/pyserini/resources/index-metadata/faiss.mrtydi-v1.1.20220524.7b099d5.mdpr-tied-pft-msmarco-ft-all.README.md
@@ -0,0 +1,54 @@
+# mrtydi-v1.1-arabic (pre-fine-tuned on MS MARCO then fine-tuned on all Mr. TyDi languages)
+
+Faiss flat index for Mr.TyDi v1.1 (Arabic), using mDPR pre-fine-tuned on MS MARCO and then fine-tuned on all Mr. TyDi languages.
+
+This index was generated on 2022/05/24 at commit [7b099d5](https://github.com/crystina-z/pyserini/commit/7b099d534901d1f0161982605cd40d039ddb701d) on `basilisk` with the following command:
+
+```bash
+lang=arabic # any language in Mr. TyDi
+
+tarfn=mrtydi-v1.1-$lang.tar.gz
+corpus=mrtydi-v1.1-$lang/collection/docs.jsonl
+index_dir=mrtydi-mdpr-dindex-msmarco/$lang
+
+wget https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/$tarfn
+tar -xvf $tarfn
+gzip -cvf $corpus.gz > $corpus
+
+shard_num=1
+encoder=castorini/mdpr-tied-pft-msmarco-ft-all
+
+for shard_id in $(seq 0 $((shard_num - 1))) ; do
+ index_dir=mdpr-dindex/$lang-$shard_id
+ mkdir -p $index_dir
+ python -m pyserini.encode input --corpus $corpus \
+ --fields title text \
+ --delimiter "\n\n" \
+ --shard-id $shard_id \
+ --shard-num $shard_num \
+ output --embeddings $index_dir \
+ --to-faiss \
+ encoder --encoder $encoder \
+ --fields title text \
+ --batch 128 \
+ --encoder-class 'auto' \
+ --fp16
+done
+```
+
+Note that the delimiter option is only supported after [Pyserini #1000](https://github.com/castorini/pyserini/pull/1000/commits/5021e12d1d2e1bc3d4015955bcf77076c5798ce6#diff-45356c3f5e9cd223bb23d7efea3f7ed834abbcd32f604eb7fdd138e364273241L104).
+
+
+Here's a sample retrieval command (on the test set):
+
+```bash
+set_name=test
+python -m pyserini.search.faiss \
+ --encoder castorini/mdpr-tied-pft-msmarco-ft-all \
+ --topics mrtydi-v1.1-${lang}-${set_name} \
+ --index ${index_dir} \
+ --output runs/run.mrtydi-v1.1-$lang.${set_name}.txt \
+ --batch-size 36 \
+ --threads 12 \
+ --encoder-class 'auto'
+```
diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-20201117-f87c94-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-20201117-f87c94-readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cd7fe0374595aaced509bb1fcaf66c6602910c0d
--- /dev/null
+++ b/pyserini/resources/index-metadata/index-msmarco-doc-20201117-f87c94-readme.txt
@@ -0,0 +1,15 @@
+This index was generated on 2020/11/17 at commit f87c945fd1c1e4174468194c72e3c05688dc45dd Mon Nov 16 16:17:20 2020 -0500
+with the following command:
+
+sh target/appassembler/bin/IndexCollection -collection CleanTrecCollection \
+ -generator DefaultLuceneDocumentGenerator -input collections/msmarco-doc \
+ -index index-msmarco-doc-20201117-f87c94 -threads 1 -storeRaw -optimize
+
+Note that to reduce index size:
+
++ positions are not indexed (so no phrase queries)
++ document vectors are not stored (so no query expansion)
+
+However, the raw documents are stored, so they can be fetched and fed to further downstream reranking components.
+
+index-msmarco-doc-20201117-f87c94.tar.gz MD5 checksum = ac747860e7a37aed37cc30ed3990f273
diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-doc-20201126-1b4d0a-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-doc-20201126-1b4d0a-readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..db57732f8afbf3f3e88b5b1d33eec15bb8c406ad
--- /dev/null
+++ b/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-doc-20201126-1b4d0a-readme.txt
@@ -0,0 +1,14 @@
+This index was generated on 2020/11/26 at
+
++ docTTTTTquery commit d2704c025c2bf6db652b4b27f49c4e59714ba898 (2020/11/24).
++ anserini commit 1b4d0a29879a867ca5d1f003f924acc3279455ba (2020/11/25).
+
+with the following command:
+
+sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \
+ -generator DefaultLuceneDocumentGenerator -threads 1 \
+ -input msmarco-doc-expanded -index index-msmarco-doc-expanded-per-doc-20201126-1b4d0a -optimize
+
+Note that this index does not store any "extras" (positions, document vectors, raw documents, etc.).
+
+index-msmarco-doc-expanded-per-doc-20201126-1b4d0a.tar.gz MD5 checksum = f7056191842ab77a01829cff68004782
diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-passage-20201126-1b4d0a-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-passage-20201126-1b4d0a-readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..29362ba57057b07c9facdece128d8b1ab8540cb1
--- /dev/null
+++ b/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-passage-20201126-1b4d0a-readme.txt
@@ -0,0 +1,14 @@
+This index was generated on 2020/11/26 at
+
++ docTTTTTquery commit d2704c025c2bf6db652b4b27f49c4e59714ba898 (2020/11/24).
++ anserini commit 1b4d0a29879a867ca5d1f003f924acc3279455ba (2020/11/25).
+
+with the following command:
+
+sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \
+ -generator DefaultLuceneDocumentGenerator -threads 1 \
+ -input msmarco-doc-expanded-passage -index index-msmarco-doc-expanded-per-passage-20201126-1b4d0a -optimize
+
+Note that this index does not store any "extras" (positions, document vectors, raw documents, etc.).
+
+index-msmarco-doc-expanded-per-passage-20201126-1b4d0a.tar.gz MD5 checksum = 54ea30c64515edf3c3741291b785be53
diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-20201204-f50dcc-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-20201204-f50dcc-readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6f250a5de3139b1ead215a7dee37c80dc94aee43
--- /dev/null
+++ b/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-20201204-f50dcc-readme.txt
@@ -0,0 +1,19 @@
+This index was generated on 2020/12/04 at
+
++ docTTTTTquery commit 5be1af130b4657ea117781f761c4e5d15c77cb42 (2020/12/01).
++ anserini commit f50dcceb6cd0ec3403c1e77066aa51bb3275d24e (2020/12/04).
+
+with the following command:
+
+sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \
+ -generator DefaultLuceneDocumentGenerator -threads 1 \
+ -input msmarco-doc-passage -index index-msmarco-doc-per-passage-20201204-f50dcc -storeRaw -optimize
+
+Note that to reduce index size:
+
++ positions are not indexed (so no phrase queries)
++ document vectors are not stored (so no query expansion)
+
+However, the raw documents are stored, so they can be fetched and fed to further downstream reranking components.
+
+index-msmarco-doc-per-passage-20201204-f50dcc.tar.gz MD5 checksum = 797367406a7542b649cefa6b41cf4c33
diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-ltr-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-ltr-readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..65aec72512cdb11473d59a693a10cef00486145d
--- /dev/null
+++ b/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-ltr-readme.txt
@@ -0,0 +1,12 @@
+This index was generated on 2021/10/31 at commit 33e4151e6d58f5b8ea0ef0768dc5308ec48b1aae 2021-10-31 16:53:36 +0800
+with the following command:
+
+sh target/appassembler/bin/IndexCollection -collection JsonCollection \
+ -generator DefaultLuceneDocumentGenerator -input collections/msmarco-ltr-document/ltr_msmarco_pass_doc_jsonl \
+ -index index-msmarco-doc-per-passage-ltr-20211031-33e4151 -threads 21 -storeRaw -optimize -storePositions -storeDocvectors -pretokenized
+
+Note that the -pretokenized option is used to keep the preprocessed tokenization.
+This index is built with spaCy 3.0.6.
+The max length is 3 and the stride is 1.
+
+index-msmarco-passage-ltr-20210519-e25e33f MD5 checksum = bd60e89041b4ebbabc4bf0cfac608a87
diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-slim-20201204-f50dcc-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-slim-20201204-f50dcc-readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..565915c8b7b71a5484a920e1fd7d61fa6ed86b60
--- /dev/null
+++ b/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-slim-20201204-f50dcc-readme.txt
@@ -0,0 +1,14 @@
+This index was generated on 2020/12/04 at
+
++ docTTTTTquery commit 5be1af130b4657ea117781f761c4e5d15c77cb42 (2020/12/01).
++ anserini commit f50dcceb6cd0ec3403c1e77066aa51bb3275d24e (2020/12/04).
+ +with the following command: + +sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input msmarco-doc-passage -index index-msmarco-doc-per-passage-slim-20201204-f50dcc -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). + +index-msmarco-doc-per-passage-slim-20201204-f50dcc.tar.gz MD5 checksum = 77c2409943a8c9faffabf57cb6adca69 diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-slim-20201202-ab6e28-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-slim-20201202-ab6e28-readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e79f60ca78240b3cc07d0e0fe0cfcfab76fb1db --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-doc-slim-20201202-ab6e28-readme.txt @@ -0,0 +1,10 @@ +This index was generated on 2020/12/02 at commit ab6e280b06a7a6476d001a5eb2319c191010c0e1 (2020/12/01) +with the following command: + +sh target/appassembler/bin/IndexCollection -collection CleanTrecCollection \ + -generator DefaultLuceneDocumentGenerator -input collections/msmarco-doc \ + -index index-msmarco-doc-slim-20201202-ab6e28 -threads 1 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). + +index-msmarco-doc-slim-20201202-ab6e28.tar.gz MD5 checksum = c56e752f7992bf6149761097641d515a diff --git a/pyserini/resources/index-metadata/index-msmarco-passage-20201117-f87c94-readme.txt b/pyserini/resources/index-metadata/index-msmarco-passage-20201117-f87c94-readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3a08f586af6a12e26bf35ca643b8df471402313 --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-passage-20201117-f87c94-readme.txt @@ -0,0 +1,15 @@ +This index was generated on 2020/11/17 at commit f87c945fd1c1e4174468194c72e3c05688dc45dd Mon Nov 16 16:17:20 2020 -0500 +with the following command: + +sh target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -input collections/msmarco-passage/collection_jsonl \ + -index index-msmarco-passage-20201117-f87c94 -threads 9 -storeRaw -optimize + +Note that to reduce index size: + ++ positions are not indexed (so no phrase queries) ++ document vectors are not stored (so no query expansion) + +However, the raw passages are stored, so they can be fetched and fed to further downstream reranking components. 
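+
+For example, a stored raw passage can be fetched through Pyserini's Python API and handed to a reranker. The sketch
+below is illustrative only: the prebuilt index name 'msmarco-v1-passage' and the docid are assumptions, and a local
+path to this index works equally well.
+
+ from pyserini.search.lucene import LuceneSearcher
+
+ searcher = LuceneSearcher.from_prebuilt_index('msmarco-v1-passage')  # or LuceneSearcher('/path/to/this/index')
+ doc = searcher.doc('7187158')  # hypothetical passage id
+ print(doc.raw())               # raw JSON stored at indexing time via -storeRaw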
+
+index-msmarco-passage-20201117-f87c94.tar.gz MD5 checksum = 1efad4f1ae6a77e235042eff4be1612d
diff --git a/pyserini/resources/index-metadata/index-msmarco-passage-expanded-20201121-e127fb-readme.txt b/pyserini/resources/index-metadata/index-msmarco-passage-expanded-20201121-e127fb-readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..23fa65442855f366c02dd94532e38cba6d0ea215
--- /dev/null
+++ b/pyserini/resources/index-metadata/index-msmarco-passage-expanded-20201121-e127fb-readme.txt
@@ -0,0 +1,14 @@
+This index was generated on 2020/11/21 at
+
++ docTTTTTquery commit 701ea0a72beeb8db46aa409352a72ba52cd2c36b Tue Nov 17 07:13:27 2020 -0500
++ anserini commit e127fbea6f5515d60eb7c325cd866657dbf13cc6 Sat Nov 21 07:58:03 2020 -0500
+
+with the following command:
+
+sh anserini/target/appassembler/bin/IndexCollection \
+ -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
+ -input msmarco-passage-expanded -index index-msmarco-passage-expanded-20201121-e127fb -threads 9 -optimize
+
+Note that this index does not store any "extras" (positions, document vectors, raw documents, etc.).
+
+index-msmarco-passage-expanded-20201121-e127fb.tar.gz MD5 checksum = e5762e9e065b6fe5000f9c18da778565
diff --git a/pyserini/resources/index-metadata/index-msmarco-passage-ltr-20210519-e25e33f-readme.txt b/pyserini/resources/index-metadata/index-msmarco-passage-ltr-20210519-e25e33f-readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4a5e758a89f7c02bce46111805426255c1da8a88
--- /dev/null
+++ b/pyserini/resources/index-metadata/index-msmarco-passage-ltr-20210519-e25e33f-readme.txt
@@ -0,0 +1,11 @@
+This index was generated on 2021/05/19 at commit e25e33f4a06e9c1ab4d795908cae4474fa019643 2021-05-17 21:48:48 -0400
+with the following command:
+
+sh target/appassembler/bin/IndexCollection -collection JsonCollection \
+ -generator DefaultLuceneDocumentGenerator -input collections/msmarco-ltr-passage/ltr_collection_jsonl \
+ -index index-msmarco-passage-ltr-20210519-e25e33f -threads 9 -storeRaw -optimize -storePositions -storeDocvectors -pretokenized
+
+Note that the -pretokenized option is used to keep the preprocessed tokenization.
+This index is built with spaCy 3.0.6.
+
+index-msmarco-passage-ltr-20210519-e25e33f MD5 checksum = a5de642c268ac1ed5892c069bdc29ae3
diff --git a/pyserini/resources/index-metadata/index-msmarco-passage-slim-20201202-ab6e28-readme.txt b/pyserini/resources/index-metadata/index-msmarco-passage-slim-20201202-ab6e28-readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..010eaab227bbd2a50082bd31623658015fad7a93
--- /dev/null
+++ b/pyserini/resources/index-metadata/index-msmarco-passage-slim-20201202-ab6e28-readme.txt
@@ -0,0 +1,10 @@
+This index was generated on 2020/12/02 at commit ab6e280b06a7a6476d001a5eb2319c191010c0e1 (2020/12/01)
+with the following command:
+
+sh target/appassembler/bin/IndexCollection -collection JsonCollection \
+ -generator DefaultLuceneDocumentGenerator -input collections/msmarco-passage/collection_jsonl \
+ -index index-msmarco-passage-slim-20201202-ab6e28 -threads 9 -optimize
+
+This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.).
+ +index-msmarco-passage-slim-20201202-ab6e28.tar.gz MD5 checksum = 5e11da4cebd2e8dda2e73c589ffb0b4c diff --git a/pyserini/resources/index-metadata/index-robust04-20191213-readme.txt b/pyserini/resources/index-metadata/index-robust04-20191213-readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc45b21c72c06257c3556cb88f87ae4a2ceb5350 --- /dev/null +++ b/pyserini/resources/index-metadata/index-robust04-20191213-readme.txt @@ -0,0 +1,7 @@ +This index was generated on 12/13/2019 with Anserini v0.7.0, with the following command: + +sh target/appassembler/bin/IndexCollection -collection TrecCollection \ + -input /tuna1/collections/newswire/disk45/ -index index-robust04-20191213 \ + -generator JsoupGenerator -threads 16 -storePositions -storeDocvectors -storeRawDocs -optimize + +index-robust04-20191213.tar.gz MD5 checksum = 15f3d001489c97849a010b0a4734d018 diff --git a/pyserini/resources/index-metadata/index-wikipedia-dpr-20210120-d1b9e6-readme.txt b/pyserini/resources/index-metadata/index-wikipedia-dpr-20210120-d1b9e6-readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..e449ad1048c18696b1638916a1011f0f7da255f7 --- /dev/null +++ b/pyserini/resources/index-metadata/index-wikipedia-dpr-20210120-d1b9e6-readme.txt @@ -0,0 +1,18 @@ +This index was generated on 2021/01/20 at + ++ anserini commit d1b9e67928aa60fa557113ace5d209b0c58e994c (2021/01/19). + +with the following command: + +sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 22 \ + -input wikipedia-dpr-jsonl -index index-wikipedia-dpr-20210120-d1b9e6 -storeRaw -optimize + +Note that to reduce index size: + ++ positions are not indexed (so no phrase queries) ++ document vectors are not stored (so no query expansion) + +However, the raw documents are stored, so they can be fetched and fed to further downstream reranking components. + +index-wikipedia-dpr-20210120-d1b9e6.tar.gz MD5 checksum = c28f3a56b2dfcef25bf3bf755c264d04 diff --git a/pyserini/resources/index-metadata/index-wikipedia-dpr-slim-20210120-d1b9e6-readme.txt b/pyserini/resources/index-metadata/index-wikipedia-dpr-slim-20210120-d1b9e6-readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ff6af6e28a5b08851f3b1e747a8d0024102c3a4 --- /dev/null +++ b/pyserini/resources/index-metadata/index-wikipedia-dpr-slim-20210120-d1b9e6-readme.txt @@ -0,0 +1,13 @@ +This index was generated on 2021/01/20 at + ++ anserini commit d1b9e67928aa60fa557113ace5d209b0c58e994c (2021/01/19). + +with the following command: + +sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 22 \ + -input wikipedia-dpr-jsonl -index index-wikipedia-dpr-slim-20210120-d1b9e6 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). 
+
+index-wikipedia-dpr-slim-20210120-d1b9e6.tar.gz MD5 checksum = 7d40604a824b5df37a1ae9d25ea38071
diff --git a/pyserini/resources/index-metadata/index-wikipedia-kilt-doc-20210421-f29307-readme.txt b/pyserini/resources/index-metadata/index-wikipedia-kilt-doc-20210421-f29307-readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8449100a55f1206cedcc1ba0ba32d44b070cef9d
--- /dev/null
+++ b/pyserini/resources/index-metadata/index-wikipedia-kilt-doc-20210421-f29307-readme.txt
@@ -0,0 +1,18 @@
+This index was generated on 2021/04/22 at
+
++ anserini commit f29307a9fb162ec7bef4919a164929a673d2304e (2021/04/21).
+
+with the following command:
+
+python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
+ -threads 40 -input collections/wikipedia-kilt-doc \
+ -index indexes/index-wikipedia-kilt-doc-20210421-f29307 -storeRaw -optimize
+
+Note that to reduce index size:
+
++ positions are not indexed (so no phrase queries)
++ document vectors are not stored (so no query expansion)
+
+However, the raw documents are stored, so they can be fetched and fed to further downstream reranking components.
+
+index-wikipedia-kilt-doc-20210421-f29307.tar.gz MD5 checksum = b8ec8feb654f7aaa86f9901dc6c804a8
diff --git a/pyserini/resources/index-metadata/lucene-index-wiki-all-6-3-tamber-20230111-40277a.README.md b/pyserini/resources/index-metadata/lucene-index-wiki-all-6-3-tamber-20230111-40277a.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6e04c56f067c243bce43282d67b6734cde21adb7
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-index-wiki-all-6-3-tamber-20230111-40277a.README.md
@@ -0,0 +1,24 @@
+# wiki-all-6-3-tamber Lucene index
+
+This Lucene index was generated on 2023/01/11 at Pyserini commit [`40277a`](https://github.com/castorini/pyserini/commit/40277ae007e4d28882af19d6ce1e899a0af04a68)
+with the following commands:
+
+First, make sure you have Git LFS installed so that you can clone the Hugging Face repository.
+```bash
+git lfs install
+```
+
+```bash
+git clone https://huggingface.co/datasets/castorini/odqa-wiki-corpora
+
+python -m pyserini.index.lucene \
+ --collection MrTyDiCollection \
+ --input odqa-wiki-corpora/wiki-all-6-3-tamber \
+ --index indexes/index-wiki-all-6-3-tamber-20230111-40277a \
+ --generator DefaultLuceneDocumentGenerator \
+ --threads 12 \
+ --optimize \
+ --storeRaw
+```
+
+lucene-index-wiki-all-6-3-tamber-20230111-40277a.tar.gz MD5 checksum = 018b45ee8c6278a879caa3145b2dc05d
diff --git a/pyserini/resources/index-metadata/lucene-index.atomic.20230525.a7df7f.README.md b/pyserini/resources/index-metadata/lucene-index.atomic.20230525.a7df7f.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..03901e39925b5b173355c99443c91c4d7c97cf56
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-index.atomic.20230525.a7df7f.README.md
@@ -0,0 +1,5 @@
+# AToMiC BM25 Indexes
+
+Lucene indexes for the AToMiC dataset (text collection v0.2.1, image collection v0.2).
+
+These indexes were generated on 2023/05/25 at Anserini commit [`a7df7f`](https://github.com/castorini/anserini/commit/a7df7fc5d527ede8f34ee60afa41dec4f6b0e93a) on Compute Canada's Cedar cluster running [this script](https://github.com/TREC-AToMiC/AToMiC/blob/f2f9b58ffd39d920c7599ba49de40a34dd1a21b8/examples/bm25_en_caption/run_bm25_baseline.py#L62) (in particular, the `create_index` function).
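+
+A query can be issued against one of these indexes with Pyserini's Python API. The following is a minimal sketch: the
+local index path and the query string are illustrative assumptions, not values taken from the script above.
+
+```python
+from pyserini.search.lucene import LuceneSearcher
+
+searcher = LuceneSearcher('indexes/atomic-text-v0.2.1')  # hypothetical path to one of the AToMiC BM25 indexes
+hits = searcher.search('rocky mountain goat', k=10)
+for hit in hits:
+    print(hit.docid, round(hit.score, 4))
+```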
diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-arguana-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-arguana-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..8b0d21d867892fee7e630177e58a6ab528e0522a --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-arguana-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — ArguAna + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/arguana \ + -index indexes/lucene-index.beir-v1.0.0-arguana-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-arguana-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-arguana-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-arguana-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..123ac08eeda746598b3acf4563e96b27df14abd3 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-arguana-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — ArguAna + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/arguana \ + -index indexes/lucene-index.beir-v1.0.0-arguana-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-arguana-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-arguana-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-arguana-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..e92ae63bb34cebb3e63795c0b9fcd9f89da487bd --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-arguana-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — ArguAna + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/arguana \ + -index indexes/lucene-index.beir-v1.0.0-arguana-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-arguana-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git 
a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-bioasq-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-bioasq-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..796c53d4264bebb6cfc87339adb5e943d49d10da --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-bioasq-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — BioASQ + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/bioasq \ + -index indexes/lucene-index.beir-v1.0.0-bioasq-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-bioasq-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-bioasq-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-bioasq-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..8de237d9da139a3d5e225adc913f2af905e62880 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-bioasq-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — BioASQ + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/bioasq \ + -index indexes/lucene-index.beir-v1.0.0-bioasq-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-bioasq-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-bioasq-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-bioasq-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..cdcdecfe13c0f62dc8a7b4b653cd5574839d94ff --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-bioasq-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — BioASQ + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/bioasq \ + -index indexes/lucene-index.beir-v1.0.0-bioasq-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-bioasq-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git 
a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-climate-fever-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-climate-fever-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc2750c1060a248d5574db3ba281841273bcfc48 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-climate-fever-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Climate-FEVER + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/climate-fever \ + -index indexes/lucene-index.beir-v1.0.0-climate-fever-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-climate-fever-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-climate-fever-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-climate-fever-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..3e7ae58fcf77cd215c9c319e0bca6e0a23e2f824 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-climate-fever-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Climate-FEVER + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/climate-fever \ + -index indexes/lucene-index.beir-v1.0.0-climate-fever-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-climate-fever-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-climate-fever-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-climate-fever-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..b383b388fb4f10cd703b8659e62cdfc455b22c6c --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-climate-fever-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Climate-FEVER + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/climate-fever \ + -index indexes/lucene-index.beir-v1.0.0-climate-fever-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized 
-optimize \ + >& logs/log.beir-v1.0.0-climate-fever-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-android-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-android-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..33fd2fd9c1fcdd376c5534c44492ffe190aee380 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-android-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-android + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-android \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-android-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-android-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-android-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-android-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e9c81607f3b1d2fe5103779992a90ea0ad405b7 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-android-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-android + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-android \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-android-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-android-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-android-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-android-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..4eb1eaf22339ae6d4e91e0922cf84517f8cf6ad2 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-android-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-android + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input 
/scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-android \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-android-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-android-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-english-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-english-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..b496abd0624fd55991fac16cd27fb12dc586e36a --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-english-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-english + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-english \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-english-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-english-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-english-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-english-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..e95065cf28a3351c2b241a970e9638e05290201b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-english-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-english + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-english \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-english-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-english-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-english-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-english-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..ea16f8f847e1e82b89f76c7ff2052a0c58ed4f0d --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-english-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-english + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit 
[`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-english \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-english-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-english-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gaming-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gaming-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..cfa3dd88b05e728c70f1f188c7cf231ffc0ed775 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gaming-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-gaming + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-gaming \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gaming-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gaming-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gaming-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gaming-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..53c76ac330a398541b04c629244230f09157edff --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gaming-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-gaming + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-gaming \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gaming-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gaming-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gaming-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gaming-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..64ea65499106fee22d08a8b3d48b789e39bcfb2a --- /dev/null +++ 
b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gaming-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-gaming + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-gaming \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gaming-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gaming-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gis-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gis-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..0140cef81d5b3c722859f6abb9db8ccb7fdb41cf --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gis-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-gis + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-gis \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gis-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gis-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gis-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gis-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..2c4e31ca24247dd0af11217feb1eacc16fedfe26 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gis-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-gis + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-gis \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gis-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gis-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gis-splade_distil_cocodenser_medium.20220501.1842ee.README.md 
b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gis-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..e1562022c60ce60cb8be99bd0a3f98281533b5b2 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-gis-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-gis + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-gis \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gis-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gis-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-mathematica-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-mathematica-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..ff959813ba669c27e1e750b1be340738414f21f1 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-mathematica-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-mathematica + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-mathematica \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-mathematica-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-mathematica-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-mathematica-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-mathematica-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..55970173077f57c2897050b926d2ae1f0430a952 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-mathematica-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-mathematica + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-mathematica \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-mathematica-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions 
-storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-mathematica-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-mathematica-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-mathematica-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..7a0e2e1791aa4c096dfbd05cffb4b12b085239a9 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-mathematica-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-mathematica + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-mathematica \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-mathematica-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-mathematica-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-physics-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-physics-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..75b8b30df7adc47c8b56b7bb1124ce29e38f8047 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-physics-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-physics + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-physics \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-physics-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-physics-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-physics-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-physics-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..c263e8ceb36832605b7bb1e4f989bbbe71386cd1 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-physics-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-physics + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection 
BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-physics \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-physics-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-physics-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-physics-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-physics-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..9ec68d8b7c8267e2e05ad6d01947a2265dc9adc3 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-physics-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-physics + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-physics \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-physics-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-physics-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-programmers-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-programmers-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..2509f4e1cb1ea9d64b1a1cf5fc6cf10a529eaf59 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-programmers-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-programmers + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-programmers \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-programmers-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-programmers-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-programmers-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-programmers-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..562532f768b8fc20e31519a8a96746ddd35856e3 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-programmers-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-programmers + +This 
**"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-programmers \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-programmers-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-programmers-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-programmers-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-programmers-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..9ec2b5b53070039d1229bc30e273e9dfb95428d4 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-programmers-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-programmers + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-programmers \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-programmers-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-programmers-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-stats-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-stats-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..c963fbb48612b78abe21071b62e01463e9360557 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-stats-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-stats + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-stats \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-stats-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-stats-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-stats-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-stats-multifield.20220501.1842ee.README.md new file mode 100644 index 
0000000000000000000000000000000000000000..c6e93c3d4b290fc040655e0605ad56a9d84bfefe --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-stats-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-stats + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-stats \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-stats-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-stats-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-stats-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-stats-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..77dee0ceaec027d3b0f06946fd82f04d630c5cb7 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-stats-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-stats + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-stats \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-stats-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-stats-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-tex-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-tex-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..4bc3e0ab4aea74b6a27380ebe3413992a677a18e --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-tex-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-tex + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-tex \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-tex-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-tex-flat.20220501.1842ee & +``` diff --git 
a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-tex-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-tex-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..f694890541777ca0b8ac764fc116d5f9b7909cf5 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-tex-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-tex + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-tex \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-tex-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-tex-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-tex-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-tex-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..273df443a41cdf3389ba023ca78776374439a61d --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-tex-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-tex + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-tex \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-tex-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-tex-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-unix-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-unix-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..f08109e6de338dbbacb4cd858c231d57ce22caf7 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-unix-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-unix + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-unix \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-unix-flat.20220501.1842ee/ \ + -generator 
DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-unix-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-unix-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-unix-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..668c33bb4295b14f80e9bc92efa64b8424dbdc8b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-unix-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-unix + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-unix \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-unix-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-unix-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-unix-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-unix-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..a80d04351790505fb9bbdbaf78d16ec502f886f6 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-unix-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-unix + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-unix \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-unix-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-unix-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-webmasters-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-webmasters-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..0c6705fd1c1e66846bfc3f8e5190d1ac43ff77f0 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-webmasters-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-webmasters + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection 
\ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-webmasters \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-webmasters-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-webmasters-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-webmasters-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-webmasters-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..2ee07650500a4391f9e45fa0f51de62cdc974b36 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-webmasters-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-webmasters + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-webmasters \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-webmasters-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-webmasters-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-webmasters-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-webmasters-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..fda3f5eb88b5b175946267b0c983463e159c62c0 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-webmasters-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-webmasters + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-webmasters \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-webmasters-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-webmasters-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-wordpress-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-wordpress-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..347a80a05cef0dd570d36978f6d596ef096dee32 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-wordpress-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR 
(v1.0.0) — CQADupStack-wordpress + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-wordpress \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-wordpress-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-wordpress-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-wordpress-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-wordpress-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..7ef78bd5e8091cb483fb03459b4640913a6fe4f5 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-wordpress-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-wordpress + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/cqadupstack-wordpress \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-wordpress-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-wordpress-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-wordpress-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-wordpress-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..e76790fdbd1a2a14238ec5d630d97601d36612af --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-cqadupstack-wordpress-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — CQADupStack-wordpress + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-wordpress \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-wordpress-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-wordpress-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-dbpedia-entity-flat.20220501.1842ee.README.md 
b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-dbpedia-entity-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..f0d7073698c7f446391141928a374931273c20af --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-dbpedia-entity-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — DBPedia + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/dbpedia-entity \ + -index indexes/lucene-index.beir-v1.0.0-dbpedia-entity-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-dbpedia-entity-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-dbpedia-entity-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-dbpedia-entity-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..69fb579b7718d752a18744beab6c742245806d81 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-dbpedia-entity-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — DBPedia + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/dbpedia-entity \ + -index indexes/lucene-index.beir-v1.0.0-dbpedia-entity-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-dbpedia-entity-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-dbpedia-entity-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-dbpedia-entity-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..f88b9e113cc0fc868c94a05d9c77de4e02c72bf2 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-dbpedia-entity-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — DBPedia + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/dbpedia-entity \ + -index indexes/lucene-index.beir-v1.0.0-dbpedia-entity-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-dbpedia-entity-splade_distil_cocodenser_medium.20220501.1842ee & +``` 
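A minimal usage sketch (not itself one of the README files in this diff): once one of the "flat" indexes above has been built locally, it can be opened directly from Python with Pyserini's `LuceneSearcher` as a quick sanity check. The index path below is copied from the DBPedia indexing command above and assumes that job has finished; the query text is purely illustrative.

```python
from pyserini.search.lucene import LuceneSearcher

# Open the locally built "flat" BEIR index; the path matches the -index
# argument of the DBPedia indexing command above (assumed to have completed).
searcher = LuceneSearcher('indexes/lucene-index.beir-v1.0.0-dbpedia-entity-flat.20220501.1842ee/')

# Run an illustrative BM25 query and print the top hits.
hits = searcher.search('who founded the city of berlin?', k=5)
for i, hit in enumerate(hits):
    print(f'{i + 1:2} {hit.docid:25} {hit.score:.4f}')
```

The impact indexes (`splade_distil_cocodenser_medium`) encode learned term weights rather than raw term frequencies, so they are typically queried through `LuceneImpactSearcher` with a SPLADE query encoder rather than with plain BM25.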
diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fever-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fever-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..4458baccb3f24ab4b325e33a7838cab68322ff79 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fever-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — FEVER + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/fever \ + -index indexes/lucene-index.beir-v1.0.0-fever-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-fever-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fever-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fever-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..9369c49b0a87854afb0cc56bc405a1b7fe89e671 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fever-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — FEVER + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/fever \ + -index indexes/lucene-index.beir-v1.0.0-fever-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-fever-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fever-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fever-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..1948f7920f98a29611ca1052172cf8300dfca9a1 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fever-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — FEVER + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/fever \ + -index indexes/lucene-index.beir-v1.0.0-fever-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-fever-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git 
a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fiqa-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fiqa-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..831fdea1d0dcf110b6f3893a85bdf9ff18710b54 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fiqa-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — FiQA-2018 + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/fiqa \ + -index indexes/lucene-index.beir-v1.0.0-fiqa-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-fiqa-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fiqa-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fiqa-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..d94a2598d60129af7dfef0873a028242a589b21e --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fiqa-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — FiQA-2018 + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/fiqa \ + -index indexes/lucene-index.beir-v1.0.0-fiqa-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-fiqa-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fiqa-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fiqa-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..7a30a31cf4debf5ab817e4e63e46d3f2a2c63614 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-fiqa-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — FiQA-2018 + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/fiqa \ + -index indexes/lucene-index.beir-v1.0.0-fiqa-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-fiqa-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git 
a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-flat.20221116.505594.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-flat.20221116.505594.README.md new file mode 100644 index 0000000000000000000000000000000000000000..2ef11bbf6f7bdb6899b98be579388cbcf6528a43 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-flat.20221116.505594.README.md @@ -0,0 +1,237 @@ +# BEIR (v1.0.0): "flat" Lucene indexes + +These "flat" Lucene indexes were generated on 2022/11/16 at Anserini commit [`505594`](https://github.com/castorini/anserini/commit/505594b6573294a9a4c72a8feee3416f8a9bd2d9) on `tuna` with the following commands: + +```bash +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/trec-covid \ + -index indexes/lucene-index.beir-v1.0.0-trec-covid-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-trec-covid-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/bioasq \ + -index indexes/lucene-index.beir-v1.0.0-bioasq-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-bioasq-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/nfcorpus \ + -index indexes/lucene-index.beir-v1.0.0-nfcorpus-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-nfcorpus-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/nq \ + -index indexes/lucene-index.beir-v1.0.0-nq-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-nq-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/hotpotqa \ + -index indexes/lucene-index.beir-v1.0.0-hotpotqa-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-hotpotqa-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/fiqa \ + -index indexes/lucene-index.beir-v1.0.0-fiqa-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-fiqa-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/signal1m \ + -index indexes/lucene-index.beir-v1.0.0-signal1m-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-signal1m-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/trec-news \ + -index 
indexes/lucene-index.beir-v1.0.0-trec-news-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-trec-news-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/robust04 \ + -index indexes/lucene-index.beir-v1.0.0-robust04-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-robust04-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/arguana \ + -index indexes/lucene-index.beir-v1.0.0-arguana-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-arguana-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/webis-touche2020 \ + -index indexes/lucene-index.beir-v1.0.0-webis-touche2020-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-webis-touche2020-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-android \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-android-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-android-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-english \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-english-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-english-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-gaming \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gaming-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gaming-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-gis \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gis-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gis-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-mathematica \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-mathematica-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& 
logs/log.beir-v1.0.0-cqadupstack-mathematica-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-physics \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-physics-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-physics-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-programmers \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-programmers-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-programmers-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-stats \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-stats-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-stats-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-tex \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-tex-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-tex-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-unix \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-unix-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-unix-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-webmasters \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-webmasters-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-webmasters-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-wordpress \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-wordpress-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-wordpress-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/quora \ + -index indexes/lucene-index.beir-v1.0.0-quora-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-quora-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input 
/tuna1/collections/beir-v1.0.0/corpus/dbpedia-entity \ + -index indexes/lucene-index.beir-v1.0.0-dbpedia-entity-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-dbpedia-entity-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/scidocs \ + -index indexes/lucene-index.beir-v1.0.0-scidocs-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-scidocs-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/fever \ + -index indexes/lucene-index.beir-v1.0.0-fever-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-fever-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/climate-fever \ + -index indexes/lucene-index.beir-v1.0.0-climate-fever-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-climate-fever-flat.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/scifact \ + -index indexes/lucene-index.beir-v1.0.0-scifact-flat.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-scifact-flat.20221116.505594 & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-hotpotqa-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-hotpotqa-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..7e07a916b987ba92da71e0e5a31d84fbd83b433a --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-hotpotqa-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — HotpotQA + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/hotpotqa \ + -index indexes/lucene-index.beir-v1.0.0-hotpotqa-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-hotpotqa-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-hotpotqa-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-hotpotqa-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..e97acc460d9725c3acafb92433b61963e13a5f70 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-hotpotqa-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — HotpotQA + +This **"multifield" Lucene index** was generated 
on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/hotpotqa \ + -index indexes/lucene-index.beir-v1.0.0-hotpotqa-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-hotpotqa-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-hotpotqa-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-hotpotqa-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..3567ef64b53c7135e1914533f09fdabf841bfcef --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-hotpotqa-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — HotpotQA + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/hotpotqa \ + -index indexes/lucene-index.beir-v1.0.0-hotpotqa-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-hotpotqa-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md new file mode 100644 index 0000000000000000000000000000000000000000..453718050415aec6b30aa1f3899580667e599d2f --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-multifield.20221116.505594.README.md @@ -0,0 +1,237 @@ +# BEIR (v1.0.0): "multifield" Lucene indexes + +These "multifield" Lucene indexes were generated on 2022/11/16 at Anserini commit [`505594`](https://github.com/castorini/anserini/commit/505594b6573294a9a4c72a8feee3416f8a9bd2d9) on `tuna` with the following commands: + +```bash +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/trec-covid \ + -index indexes/lucene-index.beir-v1.0.0-trec-covid-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-trec-covid-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/bioasq \ + -index indexes/lucene-index.beir-v1.0.0-bioasq-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-bioasq-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection 
BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/nfcorpus \ + -index indexes/lucene-index.beir-v1.0.0-nfcorpus-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-nfcorpus-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/nq \ + -index indexes/lucene-index.beir-v1.0.0-nq-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-nq-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/hotpotqa \ + -index indexes/lucene-index.beir-v1.0.0-hotpotqa-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-hotpotqa-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/fiqa \ + -index indexes/lucene-index.beir-v1.0.0-fiqa-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-fiqa-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/signal1m \ + -index indexes/lucene-index.beir-v1.0.0-signal1m-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-signal1m-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/trec-news \ + -index indexes/lucene-index.beir-v1.0.0-trec-news-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-trec-news-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/robust04 \ + -index indexes/lucene-index.beir-v1.0.0-robust04-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-robust04-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/arguana \ + -index indexes/lucene-index.beir-v1.0.0-arguana-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-arguana-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/webis-touche2020 \ + -index indexes/lucene-index.beir-v1.0.0-webis-touche2020-multifield.20221116.505594/ \ + 
-generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-webis-touche2020-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-android \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-android-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-android-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-english \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-english-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-english-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-gaming \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gaming-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gaming-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-gis \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gis-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gis-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-mathematica \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-mathematica-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-mathematica-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-physics \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-physics-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-physics-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-programmers \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-programmers-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-programmers-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input 
/tuna1/collections/beir-v1.0.0/corpus/cqadupstack-stats \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-stats-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-stats-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-tex \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-tex-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-tex-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-unix \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-unix-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-unix-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-webmasters \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-webmasters-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-webmasters-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/cqadupstack-wordpress \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-wordpress-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-wordpress-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/quora \ + -index indexes/lucene-index.beir-v1.0.0-quora-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-quora-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/dbpedia-entity \ + -index indexes/lucene-index.beir-v1.0.0-dbpedia-entity-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-dbpedia-entity-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/scidocs \ + -index indexes/lucene-index.beir-v1.0.0-scidocs-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-scidocs-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input 
/tuna1/collections/beir-v1.0.0/corpus/fever \ + -index indexes/lucene-index.beir-v1.0.0-fever-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-fever-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/climate-fever \ + -index indexes/lucene-index.beir-v1.0.0-climate-fever-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-climate-fever-multifield.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /tuna1/collections/beir-v1.0.0/corpus/scifact \ + -index indexes/lucene-index.beir-v1.0.0-scifact-multifield.20221116.505594/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-scifact-multifield.20221116.505594 & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nfcorpus-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nfcorpus-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..f6b0c4dd2aa20282e8bf8a28e3b2c112d5b77dbd --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nfcorpus-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — NFCorpus + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/nfcorpus \ + -index indexes/lucene-index.beir-v1.0.0-nfcorpus-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-nfcorpus-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nfcorpus-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nfcorpus-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..b66c5a2a377bc2bf7b5a148b38066163694cb5e4 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nfcorpus-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — NFCorpus + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/nfcorpus \ + -index indexes/lucene-index.beir-v1.0.0-nfcorpus-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-nfcorpus-multifield.20220501.1842ee & +``` diff --git 
a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nfcorpus-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nfcorpus-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..0705766a3a68097d058e9cb8e5ae2dc4f65dd5b6 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nfcorpus-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — NFCorpus + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/nfcorpus \ + -index indexes/lucene-index.beir-v1.0.0-nfcorpus-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-nfcorpus-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nq-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nq-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b9bcd6b966a9212d408730f21bfe80148ecfd32 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nq-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — NQ + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/nq \ + -index indexes/lucene-index.beir-v1.0.0-nq-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-nq-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nq-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nq-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..b256ea81d29b920138c965ba34822d35c27816a6 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nq-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — NQ + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/nq \ + -index indexes/lucene-index.beir-v1.0.0-nq-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-nq-multifield.20220501.1842ee & +``` diff --git 
a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nq-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nq-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..77352d39f3f1bbdc6bffcacf7358fb55b2d69ddc --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-nq-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — NQ + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/nq \ + -index indexes/lucene-index.beir-v1.0.0-nq-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-nq-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-quora-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-quora-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..1a06f63dfd59b1a5cee92c5e40dedcc4fe2586a6 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-quora-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Quora + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/quora \ + -index indexes/lucene-index.beir-v1.0.0-quora-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-quora-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-quora-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-quora-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..f4a8507dde1f2716bf665f6fb3a93db06749cf31 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-quora-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Quora + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/quora \ + -index indexes/lucene-index.beir-v1.0.0-quora-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-quora-multifield.20220501.1842ee & +``` diff --git 
a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-quora-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-quora-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..04cdccf3318e6e12943d8418a07aa6dc2a10264c --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-quora-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Quora + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/quora \ + -index indexes/lucene-index.beir-v1.0.0-quora-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-quora-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-robust04-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-robust04-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..25f2f0a4fb1e228eadb9f827321bea28a86ec3df --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-robust04-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Robust04 + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/robust04 \ + -index indexes/lucene-index.beir-v1.0.0-robust04-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-robust04-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-robust04-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-robust04-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..0b9e79e25327c76b754ea0af611099579024788a --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-robust04-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Robust04 + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/robust04 \ + -index indexes/lucene-index.beir-v1.0.0-robust04-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-robust04-multifield.20220501.1842ee & +``` diff --git 
a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-robust04-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-robust04-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..0d44304deb5afb9413d8f80163d5a9ceb3b11f7b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-robust04-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Robust04 + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/robust04 \ + -index indexes/lucene-index.beir-v1.0.0-robust04-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-robust04-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scidocs-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scidocs-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..2bf9d2ad8a723c03c0da71804f30144aa46cc9f3 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scidocs-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — SCIDOCS + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/scidocs \ + -index indexes/lucene-index.beir-v1.0.0-scidocs-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-scidocs-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scidocs-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scidocs-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..a3cbd51c3b345bcfa97d1ea06d5e917059ed4dda --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scidocs-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — SCIDOCS + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/scidocs \ + -index indexes/lucene-index.beir-v1.0.0-scidocs-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-scidocs-multifield.20220501.1842ee & +``` diff --git 
a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scidocs-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scidocs-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..1ba59c9233ad0e3b15650b6ae1b409575397eec9 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scidocs-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — SCIDOCS + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/scidocs \ + -index indexes/lucene-index.beir-v1.0.0-scidocs-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-scidocs-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scifact-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scifact-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..4151b71c3bf7ef0bc321af236cbfa87b6ea04c33 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scifact-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — SciFact + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/scifact \ + -index indexes/lucene-index.beir-v1.0.0-scifact-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-scifact-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scifact-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scifact-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..b6f0d29faff8db6ced0211bacd137ede06dea6f4 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scifact-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — SciFact + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/scifact \ + -index indexes/lucene-index.beir-v1.0.0-scifact-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-scifact-multifield.20220501.1842ee & +``` diff --git 
a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scifact-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scifact-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..f78d4f52b0824193dcf65db562068bea4cc04df8 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-scifact-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — SciFact + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/scifact \ + -index indexes/lucene-index.beir-v1.0.0-scifact-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-scifact-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-signal1m-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-signal1m-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..3a7e897c398b8311c06098306d311bdc86a94558 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-signal1m-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Signal-1M + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/signal1m \ + -index indexes/lucene-index.beir-v1.0.0-signal1m-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-signal1m-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-signal1m-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-signal1m-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..05b73aa4d025f815116432ab5a199df1d6a17cdc --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-signal1m-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Signal-1M + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/signal1m \ + -index indexes/lucene-index.beir-v1.0.0-signal1m-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-signal1m-multifield.20220501.1842ee & +``` diff 
--git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-signal1m-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-signal1m-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..98c672c1dd524b7c1212fdef5823c7af5bcda879 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-signal1m-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Signal-1M + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/signal1m \ + -index indexes/lucene-index.beir-v1.0.0-signal1m-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-signal1m-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md new file mode 100644 index 0000000000000000000000000000000000000000..21cc0eafa438acf841b8e750ce7da72cabc19b95 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-splade_distil_cocodenser_medium.20221116.505594.README.md @@ -0,0 +1,237 @@ +# BEIR (v1.0.0): SPLADE-distill CoCodenser-medium + +These Lucene impact indexes for SPLADE-distill CoCodenser-medium were generated on 2022/11/16 at Anserini commit [`505594`](https://github.com/castorini/anserini/commit/505594b6573294a9a4c72a8feee3416f8a9bd2d9) on `tuna` with the following commands: + +```bash +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/trec-covid \ + -index indexes/lucene-index.beir-v1.0.0-trec-covid-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-trec-covid-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/bioasq \ + -index indexes/lucene-index.beir-v1.0.0-bioasq-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-bioasq-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/nfcorpus \ + -index indexes/lucene-index.beir-v1.0.0-nfcorpus-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-nfcorpus-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ 
+ -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/nq \ + -index indexes/lucene-index.beir-v1.0.0-nq-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-nq-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/hotpotqa \ + -index indexes/lucene-index.beir-v1.0.0-hotpotqa-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-hotpotqa-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/fiqa \ + -index indexes/lucene-index.beir-v1.0.0-fiqa-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-fiqa-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/signal1m \ + -index indexes/lucene-index.beir-v1.0.0-signal1m-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-signal1m-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/trec-news \ + -index indexes/lucene-index.beir-v1.0.0-trec-news-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-trec-news-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/robust04 \ + -index indexes/lucene-index.beir-v1.0.0-robust04-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-robust04-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/arguana \ + -index indexes/lucene-index.beir-v1.0.0-arguana-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-arguana-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/webis-touche2020 \ + -index indexes/lucene-index.beir-v1.0.0-webis-touche2020-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-webis-touche2020-splade_distil_cocodenser_medium.20221116.505594 & + +nohup 
target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-android \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-android-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-android-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-english \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-english-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-english-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-gaming \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gaming-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gaming-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-gis \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-gis-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-gis-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-mathematica \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-mathematica-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-mathematica-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-physics \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-physics-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-physics-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-programmers \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-programmers-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-programmers-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input 
/tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-stats \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-stats-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-stats-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-tex \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-tex-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-tex-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-unix \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-unix-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-unix-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-webmasters \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-webmasters-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-webmasters-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/cqadupstack-wordpress \ + -index indexes/lucene-index.beir-v1.0.0-cqadupstack-wordpress-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-cqadupstack-wordpress-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/quora \ + -index indexes/lucene-index.beir-v1.0.0-quora-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-quora-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/dbpedia-entity \ + -index indexes/lucene-index.beir-v1.0.0-dbpedia-entity-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-dbpedia-entity-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/scidocs \ + -index indexes/lucene-index.beir-v1.0.0-scidocs-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& 
logs/log.beir-v1.0.0-scidocs-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/fever \ + -index indexes/lucene-index.beir-v1.0.0-fever-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-fever-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/climate-fever \ + -index indexes/lucene-index.beir-v1.0.0-climate-fever-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-climate-fever-splade_distil_cocodenser_medium.20221116.505594 & + +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /tuna1/collections/beir-v1.0.0/splade_distil_cocodenser_medium/scifact \ + -index indexes/lucene-index.beir-v1.0.0-scifact-splade_distil_cocodenser_medium.20221116.505594/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-scifact-splade_distil_cocodenser_medium.20221116.505594 & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-covid-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-covid-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..d3a2822ccf6b9a26c9fd2e4d10c556b6fcb8e978 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-covid-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — TREC-COVID + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/trec-covid \ + -index indexes/lucene-index.beir-v1.0.0-trec-covid-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-trec-covid-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-covid-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-covid-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..dc2e127f38b4c90a6a3a5fe9ca97b798092a14f2 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-covid-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — TREC-COVID + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/trec-covid \ + -index indexes/lucene-index.beir-v1.0.0-trec-covid-multifield.20220501.1842ee/ \ + 
-generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-trec-covid-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-covid-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-covid-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..641d2fcdb39d83e2fc042b746f05d9890eac8248 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-covid-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — TREC-COVID + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/trec-covid \ + -index indexes/lucene-index.beir-v1.0.0-trec-covid-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-trec-covid-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-news-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-news-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..4af25537673536c1311813c06815a7bd54b3b696 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-news-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — TREC-NEWS + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/trec-news \ + -index indexes/lucene-index.beir-v1.0.0-trec-news-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-trec-news-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-news-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-news-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..29296fa8c0a92974921b0e0cce7d5b2ac59911a5 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-news-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — TREC-NEWS + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/trec-news \ + -index 
indexes/lucene-index.beir-v1.0.0-trec-news-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-trec-news-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-news-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-news-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..62e905c208a5854b9f960da4ea4db12d7d217826 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-trec-news-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — TREC-NEWS + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/trec-news \ + -index indexes/lucene-index.beir-v1.0.0-trec-news-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-trec-news-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-webis-touche2020-flat.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-webis-touche2020-flat.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..9888c1109b4834f90282f16fa1c8ae4ec132cad7 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-webis-touche2020-flat.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Webis-Touche2020 + +This **"flat" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection BeirFlatCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/webis-touche2020 \ + -index indexes/lucene-index.beir-v1.0.0-webis-touche2020-flat.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-webis-touche2020-flat.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-webis-touche2020-multifield.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-webis-touche2020-multifield.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..3d0098ce7c725f493438c2e55ceff423932b6a8b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-webis-touche2020-multifield.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Webis-Touche2020 + +This **"multifield" Lucene index** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + 
-collection BeirMultifieldCollection \ + -input /scratch2/collections/beir-v1.0.0/corpus/webis-touche2020 \ + -index indexes/lucene-index.beir-v1.0.0-webis-touche2020-multifield.20220501.1842ee/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -fields title -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.beir-v1.0.0-webis-touche2020-multifield.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-webis-touche2020-splade_distil_cocodenser_medium.20220501.1842ee.README.md b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-webis-touche2020-splade_distil_cocodenser_medium.20220501.1842ee.README.md new file mode 100644 index 0000000000000000000000000000000000000000..880814b874258e217d572dc65967e760d7ba1e4e --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.beir-v1.0.0-webis-touche2020-splade_distil_cocodenser_medium.20220501.1842ee.README.md @@ -0,0 +1,13 @@ +# BEIR (v1.0.0) — Webis-Touche2020 + +This Lucene impact index for **SPLADE-distill CoCodenser-medium** was generated on 2022/05/01 at Anserini commit [`1842ee`](https://github.com/castorini/anserini/commit/1842eeffcbf4d18698d401b1c5a4b1c868f32fc6) on `damiano` with the following command: + +``` +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -generator DefaultLuceneDocumentGenerator \ + -input /scratch2/collections/beir-v1.0.0/splade_distil_cocodenser_medium/webis-touche2020 \ + -index indexes/lucene-index.beir-v1.0.0-webis-touche2020-splade_distil_cocodenser_medium.20220501.1842ee/ \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.beir-v1.0.0-webis-touche2020-splade_distil_cocodenser_medium.20220501.1842ee & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.ciral-v1.0.20230721.e850ea.README.md b/pyserini/resources/index-metadata/lucene-index.ciral-v1.0.20230721.e850ea.README.md new file mode 100644 index 0000000000000000000000000000000000000000..ccc621f0de8edf1bf3b8aff08bc882d055b9914d --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.ciral-v1.0.20230721.e850ea.README.md @@ -0,0 +1,19 @@ +# CIRAL v1.0 BM25 Indexes + +Lucene indexes for CIRAL covering all four languages. + +This index was generated on 2023/07/21 at Anserini commit [`e850ea`](https://github.com/castorini/anserini/commit/e850eaa5b0e3c0e406628cb1dbcf788ae46caf50) on `basilisk` with the following command: + +```bash +lang=ha # or yo, sw, so +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input ciral-passages-$lang/ \ + -index lucene-index.ciral-v1.0-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 \ + -language $lang \ + -pretokenized \ + -optimize \ + -storePositions -storeDocvectors -storeRaw +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.hc4-v1.0-fa.20220719.71c120.README.md b/pyserini/resources/index-metadata/lucene-index.hc4-v1.0-fa.20220719.71c120.README.md new file mode 100644 index 0000000000000000000000000000000000000000..68d9e2c13c484da162d7c85d6dd216ad3572729b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.hc4-v1.0-fa.20220719.71c120.README.md @@ -0,0 +1,14 @@ +# hc4-v1.0-fa + +Lucene index for HC4 v1.0 (Persian). 
+ +This index was generated on 2022/07/19 at Anserini commit [`71c120`](https://github.com/castorini/anserini/commit/71c1200d36ce17615cf4da510ac4ef2d2f0121f6) on `orca` with the following command: + + +``` +target/appassembler/bin/IndexCollection -collection NeuClirCollection \ + -generator DefaultLuceneDocumentGenerator -threads 8 \ + -input /store/collections/multilingual/hc4-v1.0-fa \ + -index indexes/lucene-index.hc4-v1.0-fa.20220719.71c120 \ + -storePositions -storeDocvectors -storeRaw -optimize -language fa +``` diff --git a/pyserini/resources/index-metadata/lucene-index.hc4-v1.0-ru.20220719.71c120.README.md b/pyserini/resources/index-metadata/lucene-index.hc4-v1.0-ru.20220719.71c120.README.md new file mode 100644 index 0000000000000000000000000000000000000000..5ed6c49f50bb420616b448cf9b7bde90a13f45e2 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.hc4-v1.0-ru.20220719.71c120.README.md @@ -0,0 +1,13 @@ +# hc4-v1.0-ru + +Lucene index for HC4 v1.0 (Russian). + +This index was generated on 2022/07/19 at Anserini commit [`71c120`](https://github.com/castorini/anserini/commit/71c1200d36ce17615cf4da510ac4ef2d2f0121f6) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection NeuClirCollection \ + -generator DefaultLuceneDocumentGenerator -threads 8 \ + -input /store/collections/multilingual/hc4-v1.0-ru \ + -index indexes/lucene-index.hc4-v1.0-ru.20220719.71c120 \ + -storePositions -storeDocvectors -storeRaw -optimize -language ru +``` diff --git a/pyserini/resources/index-metadata/lucene-index.hc4-v1.0-zh.20220719.71c120.README.md b/pyserini/resources/index-metadata/lucene-index.hc4-v1.0-zh.20220719.71c120.README.md new file mode 100644 index 0000000000000000000000000000000000000000..33b6541762a722bbf498f66c6b9dec75f2818071 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.hc4-v1.0-zh.20220719.71c120.README.md @@ -0,0 +1,13 @@ +# hc4-v1.0-zh + +Lucene index for HC4 v1.0 (Chinese). + +This index was generated on 2022/07/19 at Anserini commit [`71c120`](https://github.com/castorini/anserini/commit/71c1200d36ce17615cf4da510ac4ef2d2f0121f6) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection NeuClirCollection \ + -generator DefaultLuceneDocumentGenerator -threads 8 \ + -input /store/collections/multilingual/hc4-v1.0-zh \ + -index indexes/lucene-index.hc4-v1.0-zh.20220719.71c120 \ + -storePositions -storeDocvectors -storeRaw -optimize -language zh +``` diff --git a/pyserini/resources/index-metadata/lucene-index.hc4-v1.0.20221025.c4a8d0.README.md b/pyserini/resources/index-metadata/lucene-index.hc4-v1.0.20221025.c4a8d0.README.md new file mode 100644 index 0000000000000000000000000000000000000000..4ead6ffe5131727e51b38a08ddcf328583cb4bfc --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.hc4-v1.0.20221025.c4a8d0.README.md @@ -0,0 +1,34 @@ +# HC4 v1.0 Indexes + +Lucene indexes for HC4 v1.0 (Persian, Russian, and Chinese). 
+
+These indexes were generated on 2022/10/25 at Anserini commit [`c4a8d0`](https://github.com/castorini/anserini/commit/c4a8d00e3c218ed89dca8a4e51c3b2c7d577db00) on `tuna` with the following commands:
+
+```bash
+# HC4 fa
+nohup target/appassembler/bin/IndexCollection \
+  -collection NeuClirCollection \
+  -input /tuna1/collections/multilingual/hc4-v1.0-fa \
+  -index indexes/lucene-index.hc4-v1.0-fa.20221025.c4a8d0 \
+  -generator DefaultLuceneDocumentGenerator \
+  -threads 8 -storePositions -storeDocvectors -storeRaw -language fa -optimize \
+  >& logs/log.hc4-v1.0-fa.20221025.c4a8d0 &
+
+# HC4 ru
+nohup target/appassembler/bin/IndexCollection \
+  -collection NeuClirCollection \
+  -input /tuna1/collections/multilingual/hc4-v1.0-ru \
+  -index indexes/lucene-index.hc4-v1.0-ru.20221025.c4a8d0 \
+  -generator DefaultLuceneDocumentGenerator \
+  -threads 8 -storePositions -storeDocvectors -storeRaw -language ru -optimize \
+  >& logs/log.hc4-v1.0-ru.20221025.c4a8d0 &
+
+# HC4 zh
+nohup target/appassembler/bin/IndexCollection \
+  -collection NeuClirCollection \
+  -input /tuna1/collections/multilingual/hc4-v1.0-zh \
+  -index indexes/lucene-index.hc4-v1.0-zh.20221025.c4a8d0 \
+  -generator DefaultLuceneDocumentGenerator \
+  -threads 8 -storePositions -storeDocvectors -storeRaw -language zh -optimize \
+  >& logs/log.hc4-v1.0-zh.20221025.c4a8d0 &
+```
diff --git a/pyserini/resources/index-metadata/lucene-index.miracl-v1.0.20221004.2b2856.README.md b/pyserini/resources/index-metadata/lucene-index.miracl-v1.0.20221004.2b2856.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d95eed6ae4b69864fb050ca49124ea3bdd5e5aeb
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-index.miracl-v1.0.20221004.2b2856.README.md
@@ -0,0 +1,14 @@
+# miracl-v1.0
+
+Lucene index for MIRACL v1.0 (all languages).
+
+This index was generated on 2022/10/04 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command:
+```
+lang=ar # or: bn en fi fr hi id ja ko fa ru es sw te th zh
+target/appassembler/bin/IndexCollection \
+  -collection MrTyDiCollection \
+  -input MIRACL/miracl-corpus-v1.0-$lang \
+  -index lucene-index.miracl-v1.0-$lang \
+  -generator DefaultLuceneDocumentGenerator \
+  -threads 16 -storePositions -storeDocvectors -storeRaw -language $lang
+```
diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220108.6fcb89.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0b3e01f260ce45f107e06ee6bea61a4fb4b08d2c
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220108.6fcb89.README.md
@@ -0,0 +1,13 @@
+# mrtydi-v1.1-arabic
+
+Lucene index for Mr.TyDi v1.1 (Arabic).
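+
+To verify a local copy of this index, a short sketch using Pyserini's `IndexReader` (the prebuilt index name `mrtydi-v1.1-arabic` is assumed here):
+
+```python
+from pyserini.index.lucene import IndexReader
+
+# Print basic collection statistics to confirm the index downloaded and unpacked correctly.
+reader = IndexReader.from_prebuilt_index('mrtydi-v1.1-arabic')  # assumed prebuilt name
+stats = reader.stats()
+print(stats['documents'], stats['total_terms'])
+```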
+ +This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-arabic/ \ + -index indexes/lucene-index.mrtydi-v1.1-arabic.20220108.6fcb89/ \ + -storePositions -storeDocvectors -storeRaw -optimize -language ar +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.README.md new file mode 100644 index 0000000000000000000000000000000000000000..451c03a78332f4ff0daa99d62cc9eb3038ad243d --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-arabic.20220928.b5ecc5.README.md @@ -0,0 +1,17 @@ +# mrtydi-v1.1-arabic + +Lucene index for Mr.TyDi v1.1 (Arabic). + +This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: + +``` +lang=arabic +abbr=ar + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220108.6fcb89.README.md new file mode 100644 index 0000000000000000000000000000000000000000..8d6565e9dcc57e1b5eabcb7ad1e590997f662f94 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220108.6fcb89.README.md @@ -0,0 +1,13 @@ +# mrtydi-v1.1-bengali + +Lucene index for Mr.TyDi v1.1 (Bengali). + +This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-bengali/ \ + -index indexes/lucene-index.mrtydi-v1.1-bengali.20220108.6fcb89/ \ + -storePositions -storeDocvectors -storeRaw -optimize -language bn +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.README.md new file mode 100644 index 0000000000000000000000000000000000000000..a7a1e252af11910e9e31c33ab9682086daf43ead --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-bengali.20220928.b5ecc5.README.md @@ -0,0 +1,17 @@ +# mrtydi-v1.1-bengali + +Lucene index for Mr.TyDi v1.1 (Bengali). 
+ +This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: + +``` +lang=bengali +abbr=bn + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220108.6fcb89.README.md new file mode 100644 index 0000000000000000000000000000000000000000..8c4f02f197373f57ab0765a0b639a7951f02a877 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220108.6fcb89.README.md @@ -0,0 +1,13 @@ +# mrtydi-v1.1-english + +Lucene index for Mr.TyDi v1.1 (English). + +This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-english/ \ + -index indexes/lucene-index.mrtydi-v1.1-english.20220108.6fcb89/ \ + -storePositions -storeDocvectors -storeRaw -optimize -language en +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.README.md new file mode 100644 index 0000000000000000000000000000000000000000..b664d41a371476a3f6ef3799b9503837502fe40a --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-english.20220928.b5ecc5.README.md @@ -0,0 +1,17 @@ +# mrtydi-v1.1-english + +Lucene index for Mr.TyDi v1.1 (English). + +This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: + +``` +lang=english +abbr=en + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220108.6fcb89.README.md new file mode 100644 index 0000000000000000000000000000000000000000..10b161844fb9084bea40b96332f77087ffe34db7 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220108.6fcb89.README.md @@ -0,0 +1,13 @@ +# mrtydi-v1.1-finnish + +Lucene index for Mr.TyDi v1.1 (Finnish). 
+ +This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-finnish/ \ + -index indexes/lucene-index.mrtydi-v1.1-finnish.20220108.6fcb89/ \ + -storePositions -storeDocvectors -storeRaw -optimize -language fi +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.README.md new file mode 100644 index 0000000000000000000000000000000000000000..156c594906efc4a3d50d1454167a5696c011258b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-finnish.20220928.b5ecc5.README.md @@ -0,0 +1,17 @@ +# mrtydi-v1.1-finnish + +Lucene index for Mr.TyDi v1.1 (Finnish). + +This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: + +``` +lang=finnish +abbr=fi + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220108.6fcb89.README.md new file mode 100644 index 0000000000000000000000000000000000000000..13570f2745912620cf6efd5149777186861e05bb --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220108.6fcb89.README.md @@ -0,0 +1,13 @@ +# mrtydi-v1.1-indonesian + +Lucene index for Mr.TyDi v1.1 (Indonesian). + +This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-indonesian/ \ + -index indexes/lucene-index.mrtydi-v1.1-indonesian.20220108.6fcb89/ \ + -storePositions -storeDocvectors -storeRaw -optimize -language id +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.README.md new file mode 100644 index 0000000000000000000000000000000000000000..0a8b36fe6c6533daeca301e86fdc12151401278a --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-indonesian.20220928.b5ecc5.README.md @@ -0,0 +1,17 @@ +# mrtydi-v1.1-indonesian + +Lucene index for Mr.TyDi v1.1 (Indonesian). 
+ +This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: + +``` +lang=indonesian +abbr=id + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220108.6fcb89.README.md new file mode 100644 index 0000000000000000000000000000000000000000..a78a572a06d616d4ebf1cbdb79487575ae1a7935 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220108.6fcb89.README.md @@ -0,0 +1,13 @@ +# mrtydi-v1.1-japanese + +Lucene index for Mr.TyDi v1.1 (Japanese). + +This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-japanese/ \ + -index indexes/lucene-index.mrtydi-v1.1-japanese.20220108.6fcb89/ \ + -storePositions -storeDocvectors -storeRaw -optimize -language ja +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.README.md new file mode 100644 index 0000000000000000000000000000000000000000..3fc37ed50ebb6ca33fb135da4161dea5eff86c1d --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-japanese.20220928.b5ecc5.README.md @@ -0,0 +1,17 @@ +# mrtydi-v1.1-japanese + +Lucene index for Mr.TyDi v1.1 (Japanese). + +This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: + +``` +lang=japanese +abbr=ja + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220108.6fcb89.README.md new file mode 100644 index 0000000000000000000000000000000000000000..3810f62615e23573b17b54309851945d66afdd92 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220108.6fcb89.README.md @@ -0,0 +1,13 @@ +# mrtydi-v1.1-korean + +Lucene index for Mr.TyDi v1.1 (Korean). 
+ +This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-korean/ \ + -index indexes/lucene-index.mrtydi-v1.1-korean.20220108.6fcb89/ \ + -storePositions -storeDocvectors -storeRaw -optimize -language ko +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.README.md new file mode 100644 index 0000000000000000000000000000000000000000..ed106fd28b6c8223033beaec93b69186d2f8437b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-korean.20220928.b5ecc5.README.md @@ -0,0 +1,17 @@ +# mrtydi-v1.1-korean + +Lucene index for Mr.TyDi v1.1 (Korean). + +This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: + +``` +lang=korean +abbr=ko + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220108.6fcb89.README.md new file mode 100644 index 0000000000000000000000000000000000000000..7df1ccbf99d852cd2d5482d774e672c5dae2174a --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220108.6fcb89.README.md @@ -0,0 +1,13 @@ +# mrtydi-v1.1-russian + +Lucene index for Mr.TyDi v1.1 (Russian). + +This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-russian/ \ + -index indexes/lucene-index.mrtydi-v1.1-russian.20220108.6fcb89/ \ + -storePositions -storeDocvectors -storeRaw -optimize -language ru +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ee7340f3405e871dfe181c0e6e6c4addeaeff22 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-russian.20220928.b5ecc5.README.md @@ -0,0 +1,17 @@ +# mrtydi-v1.1-russian + +Lucene index for Mr.TyDi v1.1 (Russian). 
+ +This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: + +``` +lang=russian +abbr=ru + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220108.6fcb89.README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d7fd59cf679b0135f08f59f58dd21ef97380531 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220108.6fcb89.README.md @@ -0,0 +1,16 @@ +# mrtydi-v1.1-swahili + +Lucene index for Mr.TyDi v1.1 (Swahili). + +This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-swahili/ \ + -index indexes/lucene-index.mrtydi-v1.1-swahili.20220108.6fcb89/ \ + -storePositions -storeDocvectors -storeRaw -optimize -pretokenized +``` + +Note that `-language sw` gives identical results (and is more semantically accurate) but since we do not have a language-specific tokenizer here, we just use the whitespace tokenizer, which is what `-pretokenized` uses. +This index was built based on Anserini regressions at the time; see [Anserini #1727](https://github.com/castorini/anserini/pull/1727). \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.README.md new file mode 100644 index 0000000000000000000000000000000000000000..02e7bb87583e06f3ee9db6aeff24548106322360 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-swahili.20220928.b5ecc5.README.md @@ -0,0 +1,17 @@ +# mrtydi-v1.1-swahili + +Lucene index for Mr.TyDi v1.1 (Swahili). 
+ +This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: + +``` +lang=swahili +abbr=sw + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220108.6fcb89.README.md new file mode 100644 index 0000000000000000000000000000000000000000..eedd6e194610efd46b781998f78a8091124c682d --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220108.6fcb89.README.md @@ -0,0 +1,16 @@ +# mrtydi-v1.1-telugu + +Lucene index for Mr.TyDi v1.1 (Telugu). + +This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-telugu/ \ + -index indexes/lucene-index.mrtydi-v1.1-telugu.20220108.6fcb89/ \ + -storePositions -storeDocvectors -storeRaw -optimize -pretokenized +``` + +Note that `-language te` gives identical results (and is more semantically accurate) but since we do not have a language-specific tokenizer here, we just use the whitespace tokenizer, which is what `-pretokenized` uses. +This index was built based on Anserini regressions at the time; see [Anserini #1727](https://github.com/castorini/anserini/pull/1727). \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.README.md new file mode 100644 index 0000000000000000000000000000000000000000..2f4bcd9b05c9cbd5ad145521c89111476a1ac104 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-telugu.20220928.b5ecc5.README.md @@ -0,0 +1,17 @@ +# mrtydi-v1.1-telugu + +Lucene index for Mr.TyDi v1.1 (Telugu). + +This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: + +``` +lang=telugu +abbr=te + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220108.6fcb89.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220108.6fcb89.README.md new file mode 100644 index 0000000000000000000000000000000000000000..296e30f8aacfddbe7166db1f51a93c8806b59b5b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220108.6fcb89.README.md @@ -0,0 +1,13 @@ +# mrtydi-v1.1-thai + +Lucene index for Mr.TyDi v1.1 (Thai). 
+ +This index was generated on 2022/01/08 at Anserini commit [`6fcb89`](https://github.com/castorini/anserini/commit/6fcb896c61e2b8cf2f235def3e95dda5fe4cd2fc) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MrTyDiCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input /store/collections/mr-tydi-corpus/mrtydi-v1.1-thai/ \ + -index indexes/lucene-index.mrtydi-v1.1-thai.20220108.6fcb89/ \ + -storePositions -storeDocvectors -storeRaw -optimize -language th +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.README.md b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.README.md new file mode 100644 index 0000000000000000000000000000000000000000..5a23bafff699f7d3c0272d116c06691ba8b515db --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.mrtydi-v1.1-thai.20220928.b5ecc5.README.md @@ -0,0 +1,17 @@ +# mrtydi-v1.1-thai + +Lucene index for Mr.TyDi v1.1 (Thai). + +This index was generated on 2022/09/28 at Anserini commit [`b5ecc5`](https://github.com/castorini/anserini/commit/b5ecc5aff79ddfc82b175f6bd3048f5039f0480f) on `orca` with the following command: + +``` +lang=thai +abbr=th + +target/appassembler/bin/IndexCollection \ + -collection MrTyDiCollection \ + -input MrTyDi/miracl-corpus-v1.0-$lang \ + -index indexes-miracl/lucene-index.mrtydi-v1.1-$lang \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -storePositions -storeDocvectors -storeRaw -language $abbr +``` \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-doc-per-passage-expansion.unicoil-d2q.20211012.58d286.readme.txt b/pyserini/resources/index-metadata/lucene-index.msmarco-doc-per-passage-expansion.unicoil-d2q.20211012.58d286.readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8d2cc17fac6d057f46e43119cf1cb88ec3ede1c --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-doc-per-passage-expansion.unicoil-d2q.20211012.58d286.readme.txt @@ -0,0 +1,12 @@ +This index was generated on 2021/10/12 at commit 58d286c3f9fe845e261c271f2a0f514462844d97 (2021/10/05) +with the following command: + +python -m pyserini.index -collection JsonVectorCollection \ + -input collections/msmarco-doc-per-passage-expansion-unicoil-d2q-b8/ \ + -index indexes/lucene-index.msmarco-doc-per-passage-expansion.unicoil-d2q.20211012.58d286 \ + -generator DefaultLuceneDocumentGenerator -impact -pretokenized \ + -threads 36 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). 
+ +lucene-index.msmarco-doc-per-passage-expansion.unicoil-d2q.20211012.58d286.tar.gz MD5 checksum = 44bfc848f9a77302b10a59c5b136eb95 diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-passage.deepimpact.20211012.58d286.readme.txt b/pyserini/resources/index-metadata/lucene-index.msmarco-passage.deepimpact.20211012.58d286.readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..e356525181b2355570566101d29e3286697a7709 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-passage.deepimpact.20211012.58d286.readme.txt @@ -0,0 +1,12 @@ +This index was generated on 2021/10/12 at commit 58d286c3f9fe845e261c271f2a0f514462844d97 (2021/10/05) +with the following command: + +python -m pyserini.index -collection JsonVectorCollection \ + -input collections/msmarco-passage-deepimpact-b8/ \ + -index indexes/lucene-index.msmarco-passage.deepimpact.20211012.58d286 \ + -generator DefaultLuceneDocumentGenerator -impact -pretokenized \ + -threads 36 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). + +lucene-index.msmarco-passage.deepimpact.20211012.58d286.tar.gz MD5 checksum = 9938f5529fee5cdb405b8587746c9e93 diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-passage.distill-splade-max.20211012.58d286.readme.txt b/pyserini/resources/index-metadata/lucene-index.msmarco-passage.distill-splade-max.20211012.58d286.readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a3b451a3d3300eeb9ee0be65ab818e2616851be --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-passage.distill-splade-max.20211012.58d286.readme.txt @@ -0,0 +1,12 @@ +This index was generated on 2021/10/12 at commit 58d286c3f9fe845e261c271f2a0f514462844d97 (2021/10/05) +with the following command: + +python -m pyserini.index -collection JsonVectorCollection \ + -input collections/msmarco-passage-distill-splade-max \ + -index indexes/lucene-index.msmarco-passage.distill-splade-max.20211012.58d286 \ + -generator DefaultLuceneDocumentGenerator -impact -pretokenized \ + -threads 36 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). + +lucene-index.msmarco-passage.distill-splade-max.20211012.58d286.tar.gz MD5 checksum = 957c0dd1b78b61aeddc8685150fd8360 diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-passage.unicoil-d2q.20211012.58d286.readme.txt b/pyserini/resources/index-metadata/lucene-index.msmarco-passage.unicoil-d2q.20211012.58d286.readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9b427dc795ba581282d4226d4d43d877ff38f25 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-passage.unicoil-d2q.20211012.58d286.readme.txt @@ -0,0 +1,12 @@ +This index was generated on 2021/10/12 at commit 58d286c3f9fe845e261c271f2a0f514462844d97 (2021/10/05) +with the following command: + +python -m pyserini.index -collection JsonVectorCollection \ + -input collections/msmarco-passage-unicoil-b8/ \ + -index indexes/lucene-index.msmarco-passage.unicoil-d2q.20211012.58d286 \ + -generator DefaultLuceneDocumentGenerator -impact -pretokenized \ + -threads 36 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). 
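+
+For reference, a rough sketch of querying an impact index like this one from Python via
+Pyserini's LuceneImpactSearcher (the prebuilt index name and query-encoder checkpoint
+below are assumptions for illustration, not part of this release):
+
+from pyserini.search.lucene import LuceneImpactSearcher
+
+# The query encoder produces uniCOIL term weights, which are scored against the
+# precomputed impact weights stored in this index.
+searcher = LuceneImpactSearcher.from_prebuilt_index(
+    'msmarco-passage-unicoil-d2q',           # assumed prebuilt name
+    'castorini/unicoil-msmarco-passage')     # assumed query encoder
+hits = searcher.search('what is a lobster roll', k=10)
+print(hits[0].docid, round(hits[0].score, 2))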
+ +lucene-index.msmarco-passage.unicoil-d2q.20211012.58d286.tar.gz MD5 checksum = 4a8cb3b86a0d9085a0860c7f7bb7fe99 diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-passage.unicoil-tilde.20211012.58d286.readme.txt b/pyserini/resources/index-metadata/lucene-index.msmarco-passage.unicoil-tilde.20211012.58d286.readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..817abea4cbd7e2f91faeae7c40a6e7018d97ad96 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-passage.unicoil-tilde.20211012.58d286.readme.txt @@ -0,0 +1,12 @@ +This index was generated on 2021/10/12 at commit 58d286c3f9fe845e261c271f2a0f514462844d97 (2021/10/05) +with the following command: + +python -m pyserini.index -collection JsonVectorCollection \ + -input collections/msmarco-passage-unicoil-tilde-expansion-b8/ \ + -index indexes/lucene-index.msmarco-passage.unicoil-tilde.20211012.58d286 \ + -generator DefaultLuceneDocumentGenerator -impact -pretokenized \ + -threads 36 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). + +lucene-index.msmarco-passage.unicoil-tilde.20211012.58d286.tar.gz MD5 checksum = cc19cfe241053f5a303f7f05a7ac40a5 diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-d2q-t5-docvectors.20220525.30c997.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-d2q-t5-docvectors.20220525.30c997.README.md new file mode 100644 index 0000000000000000000000000000000000000000..d8b7721fe6cebe272126681a8a86d75a53344226 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-d2q-t5-docvectors.20220525.30c997.README.md @@ -0,0 +1,16 @@ +# msmarco-v1-doc-d2q-t5-docvectors + +Lucene index (+docvectors) of the MS MARCO V1 document corpus, with doc2query-T5 expansions. + +This index was generated on 2022/05/25 at Anserini commit [`30c997`](https://github.com/castorini/anserini/commit/30c9974f495a06c94d576d0e9c2c5861515e0e19) on `damiano` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 7 \ + -input /scratch2/collections/msmarco/msmarco-doc-docTTTTTquery/ \ + -index indexes/lucene-index.msmarco-v1-doc-d2q-t5-docvectors.20220525.30c997/ \ + -storeDocvectors -optimize +``` + +Note that this index stores term frequencies along with the docvectors: bag-of-words queries and relevance feedback are supported, but not phrase queries. +The raw text is not stored. diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-d2q-t5.20220201.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-d2q-t5.20220201.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..225bce6f8217691346370fc9c8f80ff92b876c9b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-d2q-t5.20220201.9ea315.README.md @@ -0,0 +1,15 @@ +# msmarco-v1-doc-d2q-t5 + +Lucene index of the MS MARCO V1 document corpus, with doc2query-T5 expansions. 
+
+This index was generated on 2022/02/01 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command:
+
+```
+target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator DefaultLuceneDocumentGenerator -threads 7 \
+  -input /store/collections/msmarco/msmarco-doc-docTTTTTquery/ \
+  -index indexes/lucene-index.msmarco-v1-doc-d2q-t5.20220201.9ea315/ \
+  -optimize
+```
+
+Note that this index stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. In addition, there is no way to fetch the raw text.
diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-d2q-t5.20221004.252b5e.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-d2q-t5.20221004.252b5e.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..34aaad4377167adca0a4b22d273ad501143edb67
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-d2q-t5.20221004.252b5e.README.md
@@ -0,0 +1,24 @@
+# msmarco-v1-doc-d2q-t5
+
+Lucene index of the MS MARCO V1 document corpus, with doc2query-T5 expansions.
+
+Note that there are two variants:
+
++ `msmarco-v1-doc-d2q-t5` (2.1G uncompressed): stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text.
++ `msmarco-v1-doc-d2q-t5-docvectors` (12G uncompressed): stores term frequencies and the docvectors, which enables pseudo-relevance feedback.
+
+These indexes were generated on 2022/10/04 at Anserini commit [`252b5e`](https://github.com/castorini/anserini/commit/252b5e2087dd7b3b994d41a444d4ae0044519819) on `tuna` with the following commands:
+
+```
+target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator DefaultLuceneDocumentGenerator -threads 7 \
+  -input /tuna1/collections/msmarco/msmarco-doc-docTTTTTquery/ \
+  -index indexes/lucene-index.msmarco-v1-doc-d2q-t5.20221004.252b5e/ \
+  -optimize >& logs/log.msmarco-v1-doc-d2q-t5.20221004.252b5e &
+
+target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator DefaultLuceneDocumentGenerator -threads 7 \
+  -input /tuna1/collections/msmarco/msmarco-doc-docTTTTTquery/ \
+  -index indexes/lucene-index.msmarco-v1-doc-d2q-t5-docvectors.20221004.252b5e/ \
+  -storeDocvectors -optimize >& logs/log.msmarco-v1-doc-d2q-t5-docvectors.20221004.252b5e &
+```
diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-full.20220131.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-full.20220131.9ea315.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1d6038a93f3d84bb9ee22a63b1a177ef1d091325
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-full.20220131.9ea315.README.md
@@ -0,0 +1,21 @@
+# msmarco-v1-doc-full
+
+Lucene index of the MS MARCO V1 document corpus.
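+
+Because this "full" build stores document vectors, pseudo-relevance feedback can be turned on at search time; a minimal sketch (assuming the prebuilt index name `msmarco-v1-doc-full`):
+
+```python
+from pyserini.search.lucene import LuceneSearcher
+
+# RM3 query expansion requires stored docvectors, which this build provides.
+searcher = LuceneSearcher.from_prebuilt_index('msmarco-v1-doc-full')  # assumed prebuilt name
+searcher.set_bm25(k1=0.9, b=0.4)  # illustrative parameters, not tuned values
+searcher.set_rm3(fb_terms=10, fb_docs=10, original_query_weight=0.5)
+
+hits = searcher.search('how long is a college basketball game', k=10)
+print(hits[0].docid, round(hits[0].score, 4))
+```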
+ +This index was generated on 2022/01/31 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 7 \ + -input /store/collections/msmarco/msmarco-doc/ \ + -index indexes/lucene-index.msmarco-v1-doc-full.20220131.9ea315/ \ + -storePositions -storeDocvectors -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v1-doc` (16G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-doc-slim` (2.0G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-doc-full` (28G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "full" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-d2q-t5-docvectors.20220525.30c997.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-d2q-t5-docvectors.20220525.30c997.README.md new file mode 100644 index 0000000000000000000000000000000000000000..9c269aa53d6c226e98956a84252959adb66cee6b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-d2q-t5-docvectors.20220525.30c997.README.md @@ -0,0 +1,16 @@ +# msmarco-v1-doc-segmented-d2q-t5-docvectors + +Lucene index (+docvectors) of the MS MARCO V1 segmented document corpus, with doc2query-T5 expansions. + +This index was generated on 2022/05/25 at Anserini commit [`30c997`](https://github.com/castorini/anserini/commit/30c9974f495a06c94d576d0e9c2c5861515e0e19) on `damiano` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 32 \ + -input /scratch2/collections/msmarco/msmarco-doc-segmented-docTTTTTquery/ \ + -index indexes/lucene-index.msmarco-v1-doc-segmented-d2q-t5-docvectors.20220525.30c997/ \ + -storeDocvectors -optimize +``` + +Note that this index stores term frequencies along with the docvectors: bag-of-words queries and relevance feedback are supported, but not phrase queries. +The raw text is not stored. diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-d2q-t5.20220201.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-d2q-t5.20220201.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..0cd6c61f11f03bf01a4644b5fa4cc09e7aeca52c --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-d2q-t5.20220201.9ea315.README.md @@ -0,0 +1,15 @@ +# msmarco-v1-doc-segmented-d2q-t5 + +Lucene index of the MS MARCO V1 segmented document corpus, with doc2query-T5 expansions. 
+
+This index was generated on 2022/02/01 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command:
+
+```
+target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator DefaultLuceneDocumentGenerator -threads 16 \
+  -input /store/collections/msmarco/msmarco-doc-segmented-docTTTTTquery/ \
+  -index indexes/lucene-index.msmarco-v1-doc-segmented-d2q-t5.20220201.9ea315/ \
+  -optimize
+```
+
+Note that this index stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. In addition, there is no way to fetch the raw text.
diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-d2q-t5.20221004.252b5e.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-d2q-t5.20221004.252b5e.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..00fa81f125c2bcd58af7b5abb9c4e5a90cfbbd24
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-d2q-t5.20221004.252b5e.README.md
@@ -0,0 +1,24 @@
+# msmarco-v1-doc-segmented-d2q-t5
+
+Lucene index of the MS MARCO V1 segmented document corpus, with doc2query-T5 expansions.
+
+Note that there are two variants:
+
++ `msmarco-v1-doc-segmented-d2q-t5` (4.1G uncompressed): stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text.
++ `msmarco-v1-doc-segmented-d2q-t5-docvectors` (19G uncompressed): stores term frequencies and the docvectors, which enables pseudo-relevance feedback.
+
+These indexes were generated on 2022/10/04 at Anserini commit [`252b5e`](https://github.com/castorini/anserini/commit/252b5e2087dd7b3b994d41a444d4ae0044519819) on `tuna` with the following commands:
+
+```
+target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator DefaultLuceneDocumentGenerator -threads 16 \
+  -input /tuna1/collections/msmarco/msmarco-doc-segmented-docTTTTTquery/ \
+  -index indexes/lucene-index.msmarco-v1-doc-segmented-d2q-t5.20221004.252b5e/ \
+  -optimize >& logs/log.msmarco-v1-doc-segmented-d2q-t5.20221004.252b5e &
+
+target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator DefaultLuceneDocumentGenerator -threads 32 \
+  -input /tuna1/collections/msmarco/msmarco-doc-segmented-docTTTTTquery/ \
+  -index indexes/lucene-index.msmarco-v1-doc-segmented-d2q-t5-docvectors.20221004.252b5e/ \
+  -storeDocvectors -optimize >& logs/log.msmarco-v1-doc-segmented-d2q-t5-docvectors.20221004.252b5e &
+```
diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-full.20220131.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-full.20220131.9ea315.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4a178b038aed81b22e410b0d047e0f9ea3215a7f
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-full.20220131.9ea315.README.md
@@ -0,0 +1,21 @@
+# msmarco-v1-doc-segmented-full
+
+Lucene index of the MS MARCO V1 segmented document corpus.
+ +This index was generated on 2022/01/31 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 16 \ + -input /store/collections/msmarco/msmarco-doc-segmented/ \ + -index indexes/lucene-index.msmarco-v1-doc-segmented-full.20220131.9ea315/ \ + -storePositions -storeDocvectors -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v1-doc-segmented` (20G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-doc-segmented-slim` (3.9G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-doc-segmented-full` (35G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "full" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-slim.20220131.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-slim.20220131.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..649cbb587fd3506f7cede3e382a87ae73ef04229 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-slim.20220131.9ea315.README.md @@ -0,0 +1,21 @@ +# msmarco-v1-doc-segmented-slim + +Lucene index of the MS MARCO V1 segmented document corpus. + +This index was generated on 2022/01/31 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 16 \ + -input /store/collections/msmarco/msmarco-doc-segmented/ \ + -index indexes/lucene-index.msmarco-v1-doc-segmented-slim.20220131.9ea315/ \ + -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v1-doc-segmented` (20G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-doc-segmented-slim` (3.9G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-doc-segmented-full` (35G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "slim" version. 
\ No newline at end of file
diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-unicoil-noexp.20220419.c47993.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-unicoil-noexp.20220419.c47993.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..35a6015224d48861db6c242b0277170381674a32
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-unicoil-noexp.20220419.c47993.README.md
@@ -0,0 +1,15 @@
+# msmarco-v1-doc-segmented-unicoil-noexp
+
+Lucene impact index of the MS MARCO V1 segmented document corpus for uniCOIL (noexp) with title prepended.
+
+This index was generated on 2022/04/19 at Pyserini commit [`c47993`](https://github.com/castorini/pyserini/commit/c47993aa2bebb8ab0a418214cfd299c0d0351c81) on `orca` with the following command:
+
+```
+python -m pyserini.index.lucene \
+  --collection JsonVectorCollection \
+  --input embeddings_msmarco-v1-doc-segmented-unicoil-noexp \
+  --index indexes/lucene-index.msmarco-v1-doc-segmented-unicoil-noexp \
+  --generator DefaultLuceneDocumentGenerator \
+  --threads 12 \
+  --impact --pretokenized --optimize
+```
diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-unicoil.20220219.6a7080.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-unicoil.20220219.6a7080.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1a13ac701612a01ecf6fe489d75ab62a5d3bbd12
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented-unicoil.20220219.6a7080.README.md
@@ -0,0 +1,14 @@
+# msmarco-v1-doc-segmented-unicoil
+
+Lucene impact index of the MS MARCO V1 segmented document corpus for uniCOIL.
+
+This index was generated on 2022/02/19 at Anserini commit [`6a7080`](https://github.com/castorini/anserini/commit/6a708047f71528f7d516c0dd45485204a36e6b1d) on `orca` with the following command:
+
+```
+target/appassembler/bin/IndexCollection \
+  -collection JsonVectorCollection \
+  -input /store/collections/msmarco/msmarco-doc-segmented-unicoil \
+  -index indexes/lucene-index.msmarco-v1-doc-segmented-unicoil.20220219.6a7080/ \
+  -generator DefaultLuceneDocumentGenerator \
+  -threads 16 -impact -pretokenized -optimize
+```
diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented.20220131.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented.20220131.9ea315.README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8e257c9861c34b545ec73a977cfdcdf0b7fc9f34
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented.20220131.9ea315.README.md
@@ -0,0 +1,21 @@
+# msmarco-v1-doc-segmented
+
+Lucene index of the MS MARCO V1 segmented document corpus.
+
+This index was generated on 2022/01/31 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command:
+
+```
+target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator DefaultLuceneDocumentGenerator -threads 16 \
+  -input /store/collections/msmarco/msmarco-doc-segmented/ \
+  -index indexes/lucene-index.msmarco-v1-doc-segmented.20220131.9ea315/ \
+  -storeRaw -optimize
+```
+
+Note that there are three variants of this index:
+
++ `msmarco-v1-doc-segmented` (20G uncompressed): the "default" version, which stores term frequencies and the raw text.
This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-doc-segmented-slim` (3.9G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-doc-segmented-full` (35G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "default" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented.20221004.252b5e.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented.20221004.252b5e.README.md new file mode 100644 index 0000000000000000000000000000000000000000..5766d69d57134ee7646f36fa920d001c9e0897d7 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-segmented.20221004.252b5e.README.md @@ -0,0 +1,31 @@ +# msmarco-v1-doc-segmented + +Lucene index of the MS MARCO V1 segmented document corpus. + +Note that there are three variants: + ++ `msmarco-v1-doc-segmented` (19G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-doc-segmented-slim` (3.9G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-doc-segmented-full` (33G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. 
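+
+The practical difference between the variants shows up when fetching stored content; a sketch (prebuilt index names are assumed to follow the identifiers above, and the docid is illustrative):
+
+```python
+from pyserini.search.lucene import LuceneSearcher
+
+# The "default" and "full" builds store the raw text; the "slim" build does not,
+# so fetching the raw document there is expected to come back empty.
+for name in ['msmarco-v1-doc-segmented', 'msmarco-v1-doc-segmented-slim']:
+    searcher = LuceneSearcher.from_prebuilt_index(name)
+    doc = searcher.doc('D1555982#0')  # illustrative segment docid
+    print(name, doc is not None and doc.raw() is not None)
+```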
+ +These indexes were generated on 2022/10/04 at Anserini commit [`252b5e`](https://github.com/castorini/anserini/commit/252b5e2087dd7b3b994d41a444d4ae0044519819) on `tuna` with the following commands: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 16 \ + -input /tuna1/collections/msmarco/msmarco-doc-segmented/ \ + -index indexes/lucene-index.msmarco-v1-doc-segmented.20221004.252b5e/ \ + -storeRaw -optimize >& logs/log.msmarco-v1-doc-segmented.20221004.252b5e & + +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 16 \ + -input /tuna1/collections/msmarco/msmarco-doc-segmented/ \ + -index indexes/lucene-index.msmarco-v1-doc-segmented-slim.20221004.252b5e/ \ + -optimize >& logs/log.msmarco-v1-doc-segmented-slim.20221004.252b5e & + +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 16 \ + -input /tuna1/collections/msmarco/msmarco-doc-segmented/ \ + -index indexes/lucene-index.msmarco-v1-doc-segmented-full.20221004.252b5e/ \ + -storePositions -storeDocvectors -storeRaw -optimize >& logs/log.msmarco-v1-doc-segmented-full.20221004.252b5e & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-slim.20220131.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-slim.20220131.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..114aedff6734473458913da8030ec5dc44227b49 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc-slim.20220131.9ea315.README.md @@ -0,0 +1,21 @@ +# msmarco-v1-doc-slim + +Lucene index of the MS MARCO V1 document corpus. + +This index was generated on 2022/01/31 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 7 \ + -input /store/collections/msmarco/msmarco-doc/ \ + -index indexes/lucene-index.msmarco-v1-doc-slim.20220131.9ea315/ \ + -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v1-doc` (16G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-doc-slim` (2.0G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-doc-full` (28G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "slim" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc.20220131.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc.20220131.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..ef63fc70345d5e84bcd5b423e841d6ea189f6397 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc.20220131.9ea315.README.md @@ -0,0 +1,21 @@ +# msmarco-v1-doc + +Lucene index of the MS MARCO V1 document corpus. 
+ +This index was generated on 2022/01/31 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 7 \ + -input /store/collections/msmarco/msmarco-doc/ \ + -index indexes/lucene-index.msmarco-v1-doc.20220131.9ea315/ \ + -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v1-doc` (16G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-doc-slim` (2.0G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-doc-full` (28G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "default" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc.20221004.252b5e.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc.20221004.252b5e.README.md new file mode 100644 index 0000000000000000000000000000000000000000..1719bab44f55c1b437fad8e7034c16acc8f86319 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-doc.20221004.252b5e.README.md @@ -0,0 +1,31 @@ +# msmarco-v1-doc + +Lucene index of the MS MARCO V1 document corpus. + +Note that there are three variants: + ++ `msmarco-v1-doc` (16G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-doc-slim` (2.0G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-doc-full` (28G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. 
+ +These indexes were generated on 2022/10/04 at Anserini commit [`252b5e`](https://github.com/castorini/anserini/commit/252b5e2087dd7b3b994d41a444d4ae0044519819) on `tuna` with the following commands: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 7 \ + -input /tuna1/collections/msmarco/msmarco-doc/ \ + -index indexes/lucene-index.msmarco-v1-doc.20221004.252b5e/ \ + -storeRaw -optimize >& logs/log.msmarco-v1-doc.20221004.252b5e & + +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 7 \ + -input /tuna1/collections/msmarco/msmarco-doc/ \ + -index indexes/lucene-index.msmarco-v1-doc-slim.20221004.252b5e/ \ + -optimize >& logs/log.msmarco-v1-doc-slim.20221004.252b5e & + +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 7 \ + -input /tuna1/collections/msmarco/msmarco-doc/ \ + -index indexes/lucene-index.msmarco-v1-doc-full.20221004.252b5e/ \ + -storePositions -storeDocvectors -storeRaw -optimize >& logs/log.msmarco-v1-doc-full.20221004.252b5e & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-d2q-t5-docvectors.20220525.30c997.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-d2q-t5-docvectors.20220525.30c997.README.md new file mode 100644 index 0000000000000000000000000000000000000000..b04b8f9e5f217d6f83a7ca5858fa8ff395724436 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-d2q-t5-docvectors.20220525.30c997.README.md @@ -0,0 +1,16 @@ +# msmarco-v1-passage-d2q-t5-docvectors + +Lucene index (+docvectors) of the MS MARCO V1 passage corpus, with doc2query-T5 expansions. + +This index was generated on 2022/05/25 at Anserini commit [`30c997`](https://github.com/castorini/anserini/commit/30c9974f495a06c94d576d0e9c2c5861515e0e19) on `damiano` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco-passage-docTTTTTquery/ \ + -index indexes/lucene-index.msmarco-v1-passage-d2q-t5-docvectors.20220525.30c997/ \ + -storeDocvectors -optimize +``` + +Note that this index stores term frequencies along with the docvectors: bag-of-words queries and relevance feedback are supported, but not phrase queries. +The raw text is not stored. diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-d2q-t5.20220201.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-d2q-t5.20220201.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..0d6a6e2c1849d231a2e6a911115e53d9a0b17627 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-d2q-t5.20220201.9ea315.README.md @@ -0,0 +1,15 @@ +# msmarco-v1-passage-d2q-t5 + +Lucene index of the MS MARCO V1 passage corpus, with doc2query-T5 expansions. 
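+
+Since doc2query-T5 expansion changes the term statistics of the corpus, BM25 parameters are usually retuned for retrieval over this index. A minimal sketch (the prebuilt index name is assumed, and the k1/b values are illustrative placeholders rather than official tuned settings):
+
+```python
+from pyserini.search.lucene import LuceneSearcher
+
+searcher = LuceneSearcher.from_prebuilt_index('msmarco-v1-passage-d2q-t5')
+searcher.set_bm25(k1=2.18, b=0.86)  # illustrative values; tune for your setup
+hits = searcher.search('how do antibiotics work', k=10)
+```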
+ +This index was generated on 2022/02/01 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco-passage-docTTTTTquery/ \ + -index indexes/lucene-index.msmarco-v1-passage-d2q-t5.20220201.9ea315/ \ + -optimize +``` + +Note that this index stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. In addition, there is no way to fetch the raw text. diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-d2q-t5.20221004.252b5e.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-d2q-t5.20221004.252b5e.README.md new file mode 100644 index 0000000000000000000000000000000000000000..013ec39c0aa10da5c60bc2463c55197abe3af735 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-d2q-t5.20221004.252b5e.README.md @@ -0,0 +1,24 @@ +# msmarco-v1-passage-d2q-t5 + +Lucene index of the MS MARCO V1 passage corpus, with doc2query-T5 expansions. + +Note that there are two variants: + ++ `msmarco-v1-passage-d2q-t5` (972M uncompressed): stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text. ++ `msmarco-v1-passage-d2q-t5-docvectors` (5.0G uncompressed): stores term frequencies and the docvectors, which enables pseudo-relevance feedback. + +These indexes were generated on 2022/10/04 at Anserini commit [`252b5e`](https://github.com/castorini/anserini/commit/252b5e2087dd7b3b994d41a444d4ae0044519819) on `tuna` with the following commands: + +``` +nohup target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /tuna1/collections/msmarco/msmarco-passage-docTTTTTquery/ \ + -index indexes/lucene-index.msmarco-v1-passage-d2q-t5.20221004.252b5e/ \ + -optimize >& logs/log.msmarco-v1-passage-d2q-t5.20221004.252b5e & + +nohup target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /tuna1/collections/msmarco/msmarco-passage-docTTTTTquery/ \ + -index indexes/lucene-index.msmarco-v1-passage-d2q-t5-docvectors.20221004.252b5e/ \ + -storeDocvectors -optimize >& logs/log.msmarco-v1-passage-d2q-t5-docvectors.20221004.252b5e & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-full.20220131.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-full.20220131.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..f508b2f9acd1f1d6911ef91a118f1a3665162ed1 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-full.20220131.9ea315.README.md @@ -0,0 +1,21 @@ +# msmarco-v1-passage-full + +Lucene index of the MS MARCO V1 passage corpus.
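+
+Because the "full" variant stores term positions and document vectors, per-document term statistics can be inspected from Python. A sketch, assuming the index is registered as the prebuilt index `msmarco-v1-passage-full` and using an arbitrary passage id:
+
+```python
+from pyserini.index.lucene import IndexReader
+
+reader = IndexReader.from_prebuilt_index('msmarco-v1-passage-full')
+print(reader.stats())  # document count, unique terms, total postings, etc.
+
+# Requires stored docvectors: returns {analyzed term: term frequency}.
+tf = reader.get_document_vector('7187158')
+print(sorted(tf.items(), key=lambda kv: -kv[1])[:10])
+```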
+ +This index was generated on 2022/01/31 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 9 \ + -input /store/collections/msmarco/passage/ \ + -index indexes/lucene-index.msmarco-v1-passage-full.20220131.9ea315/ \ + -storePositions -storeDocvectors -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v1-passage` (2.5G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-passage-slim` (616M uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-passage-full` (4.3G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "full" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-slim.20220131.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-slim.20220131.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..808d3b06fb3d29c553c337ab4bde4a5768dcf11f --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-slim.20220131.9ea315.README.md @@ -0,0 +1,21 @@ +# msmarco-v1-passage-slim + +Lucene index of the MS MARCO V1 passage corpus. + +This index was generated on 2022/01/31 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 9 \ + -input /store/collections/msmarco/passage/ \ + -index indexes/lucene-index.msmarco-v1-passage-slim.20220131.9ea315/ \ + -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v1-passage` (2.5G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-passage-slim` (616M uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-passage-full` (4.3G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "slim" version. 
\ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-slimr-pp.20230220.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-slimr-pp.20230220.md new file mode 100644 index 0000000000000000000000000000000000000000..f0889b95214a708f78a383caf93a3d11af4a1274 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-slimr-pp.20230220.md @@ -0,0 +1,13 @@ +This index was generated on 2023/02/20 with the following command: + +``` +python -m pyserini.index.lucene \ + --collection JsonVectorCollection \ + --input collections/slimr_qtopk20_ptopk20_hardneg7_nobalanced_hardneg_distilled \ + --index lucene-index.msmarco-v1-passage-slimr-pp.20230220 \ + --generator DefaultLuceneDocumentGenerator \ + --threads 48 \ + --impact --pretokenized +``` + +lucene-index.msmarco-v1-passage-slimr-pp.20230220.tar.gz MD5 checksum = 17b2edd909bcda4980a93fb0ab87e72b diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-slimr.20230220.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-slimr.20230220.md new file mode 100644 index 0000000000000000000000000000000000000000..db8cf12b250edd8ca68856ecbfc065dca0e088de --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-slimr.20230220.md @@ -0,0 +1,13 @@ +This index was generated on 2023/02/20 with the following command: + +``` +python -m pyserini.index.lucene \ + --collection JsonVectorCollection \ + --input collections/slimr_qtopk20_ptopk20_hardneg7_nobalanced \ + --index lucene-index.msmarco-v1-passage-slimr.20230220 \ + --generator DefaultLuceneDocumentGenerator \ + --threads 48 \ + --impact --pretokenized +``` + +lucene-index.msmarco-v1-passage-slimr.20230220.tar.gz MD5 checksum = 79e566fee4f376096e12a33cf67c8012 diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-splade-pp.20230524.a59610.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-splade-pp.20230524.a59610.README.md new file mode 100644 index 0000000000000000000000000000000000000000..057f8190d60343b216a9b5e11e0ca55cfa3d6e17 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-splade-pp.20230524.a59610.README.md @@ -0,0 +1,63 @@ +# SPLADE++ Indexes for MS MARCO V1 Passage + +These are Lucene impact indexes for MS MARCO V1 Passage using the SPLADE++ models. +There are two separate model variants (SPLADE++ CoCondenser-EnsembleDistil and SPLADE++ CoCondenser-SelfDistil), with three index types each: + ++ `msmarco-v1-passage-splade-pp-ed` (2.3G uncompressed): SPLADE++ CoCondenser-EnsembleDistil, minimal TF index. ++ `msmarco-v1-passage-splade-pp-ed-docvectors` (61G uncompressed): with docvectors stored. ++ `msmarco-v1-passage-splade-pp-ed-text` (12G uncompressed): with text stored. ++ `msmarco-v1-passage-splade-pp-sd` (2.6G uncompressed): SPLADE++ CoCondenser-SelfDistil, minimal TF index. ++ `msmarco-v1-passage-splade-pp-sd-docvectors` (67G uncompressed): with docvectors stored. ++ `msmarco-v1-passage-splade-pp-sd-text` (13G uncompressed): with text stored.
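+
+A minimal retrieval sketch for these impact indexes (assuming the prebuilt index name `msmarco-v1-passage-splade-pp-ed` and query-side encoding with the corresponding SPLADE++ model from Hugging Face):
+
+```python
+from pyserini.search.lucene import LuceneImpactSearcher
+
+searcher = LuceneImpactSearcher.from_prebuilt_index(
+    'msmarco-v1-passage-splade-pp-ed',
+    'naver/splade-cocondenser-ensembledistil')  # query encoder (assumed)
+hits = searcher.search('what is a lobster roll?')
+
+for i, hit in enumerate(hits[:10]):
+    print(f'{i + 1:2} {hit.docid:7} {hit.score:.2f}')
+```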
+ +These indexes were generated on 2023/05/24 at Anserini commit [`a59610`](https://github.com/castorini/anserini/commit/a59610795cf612f9f16264c4f9267c8d05f3a2e9) on `tuna` with the following commands: + +```bash +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /mnt/collections/msmarco/msmarco-passage-splade-pp-ed \ + -index indexes/lucene-index.msmarco-v1-passage-splade-pp-ed.20230524.a59610/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.msmarco-v1-passage-splade-pp-ed.20230524.a59610 & + +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /mnt/collections/msmarco/msmarco-passage-splade-pp-ed \ + -index indexes/lucene-index.msmarco-v1-passage-splade-pp-ed-docvectors.20230524.a59610/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -impact -pretokenized -storeDocvectors -optimize \ + >& logs/log.msmarco-v1-passage-splade-pp-ed-docvectors.20230524.a59610 & + +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /mnt/collections/msmarco/msmarco-passage-splade-pp-ed \ + -index indexes/lucene-index.msmarco-v1-passage-splade-pp-ed-text.20230524.a59610/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -impact -pretokenized -storeRaw -optimize \ + >& logs/log.msmarco-v1-passage-splade-pp-ed-text.20230524.a59610 & + +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /mnt/collections/msmarco/msmarco-passage-splade-pp-sd \ + -index indexes/lucene-index.msmarco-v1-passage-splade-pp-sd.20230524.a59610/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -impact -pretokenized -optimize \ + >& logs/log.msmarco-v1-passage-splade-pp-sd.20230524.a59610 & + +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /mnt/collections/msmarco/msmarco-passage-splade-pp-sd \ + -index indexes/lucene-index.msmarco-v1-passage-splade-pp-sd-docvectors.20230524.a59610/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -impact -pretokenized -storeDocvectors -optimize \ + >& logs/log.msmarco-v1-passage-splade-pp-sd-docvectors.20230524.a59610 & + +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /mnt/collections/msmarco/msmarco-passage-splade-pp-sd \ + -index indexes/lucene-index.msmarco-v1-passage-splade-pp-sd-text.20230524.a59610/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -impact -pretokenized -storeRaw -optimize \ + >& logs/log.msmarco-v1-passage-splade-pp-sd-text.20230524.a59610 & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-unicoil-noexp.20220322.2f4058.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-unicoil-noexp.20220322.2f4058.README.md new file mode 100644 index 0000000000000000000000000000000000000000..8b6c372cb9f64cd556ec13942f4c99dc9726f8a8 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-unicoil-noexp.20220322.2f4058.README.md @@ -0,0 +1,15 @@ +# msmarco-v1-passage-unicoil-noexp + +Lucene impact index of the MS MARCO V1 passage corpus for uniCOIL (noexp).
+ +This index was generated on 2022/03/22 at Anserini commit [`2f4058`](https://github.com/castorini/anserini/commit/2f4058fbac852ec483c43e9e43ce9864db5a0027) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /store/collections/msmarco/msmarco-passage-unicoil-noexp/ \ + -index indexes/lucene-index.msmarco-v1-passage-unicoil-noexp.20220322.2f4058/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 \ + -impact -pretokenized -optimize +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-unicoil.20220219.6a7080.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-unicoil.20220219.6a7080.README.md new file mode 100644 index 0000000000000000000000000000000000000000..4e1481ccbe6fcf32d5f7bee7ad044b983d098a82 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage-unicoil.20220219.6a7080.README.md @@ -0,0 +1,14 @@ +# msmarco-v1-passage-unicoil + +Lucene impact index of the MS MARCO V1 passage corpus for uniCOIL. + +This index was generated on 2022/02/19 at Anserini commit [`6a7080`](https://github.com/castorini/anserini/commit/6a708047f71528f7d516c0dd45485204a36e6b1d) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /store/collections/msmarco/msmarco-passage-unicoil \ + -index indexes/lucene-index.msmarco-v1-passage-unicoil.20220219.6a7080/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 16 -impact -pretokenized -optimize +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage.20220131.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage.20220131.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..d1ce4b1f20940432f5e6cf49b9269619efa7a158 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage.20220131.9ea315.README.md @@ -0,0 +1,21 @@ +# msmarco-v1-passage + +Lucene index of the MS MARCO V1 passage corpus. + +This index was generated on 2022/01/31 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 9 \ + -input /store/collections/msmarco/passage/ \ + -index indexes/lucene-index.msmarco-v1-passage.20220131.9ea315/ \ + -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v1-passage` (2.5G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-passage-slim` (616M uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-passage-full` (4.3G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "default" version.
\ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage.20221004.252b5e.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage.20221004.252b5e.README.md new file mode 100644 index 0000000000000000000000000000000000000000..6de41d6e0dca0ed64655f99ea5062cc33c5fca25 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v1-passage.20221004.252b5e.README.md @@ -0,0 +1,31 @@ +# msmarco-v1-passage + +Lucene index of the MS MARCO V1 passage corpus. + +Note that there are three variants: + ++ `msmarco-v1-passage` (2.6G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v1-passage-slim` (627M uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v1-passage-full` (4.3G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +These indexes were generated on 2022/10/04 at Anserini commit [`252b5e`](https://github.com/castorini/anserini/commit/252b5e2087dd7b3b994d41a444d4ae0044519819) on `tuna` with the following commands: + +``` +nohup target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 9 \ + -input /tuna1/collections/msmarco/passage/ \ + -index indexes/lucene-index.msmarco-v1-passage.20221004.252b5e/ \ + -storeRaw -optimize >& logs/log.msmarco-v1-passage.20221004.252b5e & + +nohup target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 9 \ + -input /tuna1/collections/msmarco/passage/ \ + -index indexes/lucene-index.msmarco-v1-passage-slim.20221004.252b5e/ \ + -optimize >& logs/log.msmarco-v1-passage-slim.20221004.252b5e & + +nohup target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 9 \ + -input /tuna1/collections/msmarco/passage/ \ + -index indexes/lucene-index.msmarco-v1-passage-full.20221004.252b5e/ \ + -storePositions -storeDocvectors -storeRaw -optimize >& logs/log.msmarco-v1-passage-full.20221004.252b5e & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-d2q-t5-docvectors.20220525.30c997.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-d2q-t5-docvectors.20220525.30c997.README.md new file mode 100644 index 0000000000000000000000000000000000000000..405a8aedf99d5083b24755d54458482466419262 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-d2q-t5-docvectors.20220525.30c997.README.md @@ -0,0 +1,16 @@ +# msmarco-v2-doc-d2q-t5-docvectors + +Lucene index (+docvectors) of the MS MARCO V2 document corpus, with doc2query-T5 expansions. 
+ +This index was generated on 2022/05/25 at Anserini commit [`30c997`](https://github.com/castorini/anserini/commit/30c9974f495a06c94d576d0e9c2c5861515e0e19) on `damiano` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 24 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-doc-d2q-t5-docvectors.20220525.30c997/ \ + -storeDocvectors -optimize +``` + +Note that this index stores term frequencies along with the docvectors: bag-of-words queries and relevance feedback are supported, but not phrase queries. +The raw text is not stored. diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-d2q-t5.20220201.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-d2q-t5.20220201.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..a4f532dcacf819c565fdae2f5379637e7e541e03 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-d2q-t5.20220201.9ea315.README.md @@ -0,0 +1,15 @@ +# msmarco-v2-doc-d2q-t5 + +Lucene index of the MS MARCO V2 document corpus, with doc2query-T5 expansions. + +This index was generated on 2022/02/01 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 24 \ + -input /store/collections/msmarco/msmarco_v2_doc_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-doc-d2q-t5.20220201.9ea315/ \ + -optimize +``` + +Note that this index stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. In addition, there is no way to fetch the raw text. diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-d2q-t5.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-d2q-t5.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..9a12701b0254659aeb4fc93da3e9b6fb58c3d1d3 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-d2q-t5.20220808.4d6d2a.README.md @@ -0,0 +1,26 @@ +# msmarco-v2-doc-d2q-t5 + +Lucene index of the MS MARCO V2 document corpus, with doc2query-T5 expansions. + +Note that there are two variants: + ++ `msmarco-v2-doc-d2q-t5` (9.0G uncompressed): stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text. ++ `msmarco-v2-doc-d2q-t5-docvectors` (59G uncompressed): stores term frequencies and the docvectors, which enables pseudo-relevance feedback.
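+
+A sketch of why the `-docvectors` variant matters (the prebuilt index name below is an assumption): pseudo-relevance feedback such as RM3 needs stored document vectors, so it is run against that variant rather than the minimal one.
+
+```python
+from pyserini.search.lucene import LuceneSearcher
+
+searcher = LuceneSearcher.from_prebuilt_index('msmarco-v2-doc-d2q-t5-docvectors')
+searcher.set_rm3(fb_terms=10, fb_docs=10, original_query_weight=0.5)
+hits = searcher.search('how to tie a bowline knot', k=10)
+```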
+ +These indexes were generated on 2022/08/08 at Anserini commit [`4d6d2a`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following commands: + +```bash +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 24 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-doc-d2q-t5.20220808.4d6d2a/ \ + -optimize \ + >& logs/log.msmarco-v2-doc-d2q-t5.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 24 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-doc-d2q-t5-docvectors.20220808.4d6d2a/ \ + -storeDocvectors -optimize \ + >& logs/log.msmarco-v2-doc-d2q-t5-docvectors.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-full.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-full.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..b5d513b58d95ef5981b3933183e0294838d1802e --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-full.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-doc-full + +Lucene index of the MS MARCO V2 document corpus. + +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_doc/ \ + -index indexes/lucene-index.msmarco-v2-doc-full.20220111.06fb4f/ \ + -storePositions -storeDocvectors -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-doc` (73G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-doc-slim` (8.2G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-doc-full` (132G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "full" version.
\ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-per-passage.unicoil-noexp-0shot.20211012.58d286.readme.txt b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-per-passage.unicoil-noexp-0shot.20211012.58d286.readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..5813b9669b1b2b3dfd0cfa2e701a1b3f60e0aab1 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-per-passage.unicoil-noexp-0shot.20211012.58d286.readme.txt @@ -0,0 +1,12 @@ +This index was generated on 2021/10/12 at commit 58d286c3f9fe845e261c271f2a0f514462844d97 (2021/10/05) +with the following command: + +python -m pyserini.index -collection JsonVectorCollection \ + -input collections/msmarco-v2-doc-seg-unicoil-noexp-0shot-b8 \ + -index indexes/lucene-index.msmarco-v2-doc-per-passage.unicoil-noexp-0shot.20211012.58d286 \ + -generator DefaultLuceneDocumentGenerator -impact -pretokenized \ + -threads 36 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). + +lucene-index.msmarco-v2-doc-per-passage.unicoil-noexp-0shot.20211012.58d286.tar.gz MD5 checksum = 1980db886d969c3393e4da20190eaa8f diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-d2q-t5-docvectors.20220525.30c997.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-d2q-t5-docvectors.20220525.30c997.README.md new file mode 100644 index 0000000000000000000000000000000000000000..9cab351b491ac339bed7677f9f47c07e0be02978 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-d2q-t5-docvectors.20220525.30c997.README.md @@ -0,0 +1,16 @@ +# msmarco-v2-doc-segmented-d2q-t5-docvectors + +Lucene index (+docvectors) of the MS MARCO V2 segmented document corpus, with doc2query-T5 expansions. + +This index was generated on 2022/05/25 at Anserini commit [`30c997`](https://github.com/castorini/anserini/commit/30c9974f495a06c94d576d0e9c2c5861515e0e19) on `damiano` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 24 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc_segmented_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-d2q-t5-docvectors.20220525.30c997/ \ + -storeDocvectors -optimize +``` + +Note that this index stores term frequencies along with the docvectors: bag-of-words queries and relevance feedback are supported, but not phrase queries. +The raw text is not stored. diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220201.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220201.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..4bb89c4eaf41fd72e6066e9dedfc380ef8aa8fab --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220201.9ea315.README.md @@ -0,0 +1,15 @@ +# msmarco-v2-doc-segmented-d2q-t5 + +Lucene index of the MS MARCO V2 segmented document corpus, with doc2query-T5 expansions. 
+ +This index was generated on 2022/02/01 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 24 \ + -input /store/collections/msmarco/msmarco_v2_doc_segmented_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220201.9ea315/ \ + -optimize +``` + +Note that this index stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. In addition, there is no way to fetch the raw text. diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..631e85c8242dafbaa0fbd8a22a6978e5dae3c4dc --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220808.4d6d2a.README.md @@ -0,0 +1,26 @@ +# msmarco-v2-doc-segmented-d2q-t5 + +Lucene index of the MS MARCO V2 segmented document corpus, with doc2query-T5 expansions. + +Note that there are two variants: + ++ `msmarco-v2-doc-segmented-d2q-t5` (29G uncompressed): stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text. ++ `msmarco-v2-doc-segmented-d2q-t5-docvectors` (130G uncompressed): stores term frequencies and the docvectors, which enables pseudo-relevance feedback. + +These indexes were generated on 2022/08/08 at Anserini commit [`4d6d2a`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following commands: + +```bash +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 24 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc_segmented_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-d2q-t5.20220808.4d6d2a/ \ + -optimize \ + >& logs/log.msmarco-v2-doc-segmented-d2q-t5.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 24 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc_segmented_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-d2q-t5-docvectors.20220808.4d6d2a/ \ + -storeDocvectors -optimize \ + >& logs/log.msmarco-v2-doc-segmented-d2q-t5-docvectors.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-full.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-full.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..ed75605287e1398147ed1b8d42b854caa10e3485 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-full.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-doc-segmented-full + +Lucene index of the MS MARCO V2 segmented document corpus.
+ +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_doc_segmented/ \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-full.20220111.06fb4f/ \ + -storePositions -storeDocvectors -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-doc-segmented` (128G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-doc-segmented-slim` (25G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-doc-segmented-full` (217G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "full" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-slim.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-slim.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..0f7ea6da2cf22048d1069bc4ca1389a5b422e0ff --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-slim.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-doc-segmented-slim + +Lucene index of the MS MARCO V2 segmented document corpus. + +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_doc_segmented/ \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-slim.20220111.06fb4f/ \ + -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-doc-segmented` (128G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-doc-segmented-slim` (25G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-doc-segmented-full` (217G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "slim" version. 
\ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot-v2.20220419.c47993.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot-v2.20220419.c47993.README.md new file mode 100644 index 0000000000000000000000000000000000000000..04e16e185b0020c4eabef825e1148862db3bd0d1 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot-v2.20220419.c47993.README.md @@ -0,0 +1,15 @@ +# msmarco-v2-doc-segmented-unicoil-0shot-v2 + +Lucene impact index of the MS MARCO V2 segmented document corpus for uniCOIL with title prepended. + +This index was generated on 2022/04/19 at Pyserini commit [`c47993`](https://github.com/castorini/pyserini/commit/c47993aa2bebb8ab0a418214cfd299c0d0351c81) on `orca` with the following command: + +``` +python -m pyserini.index.lucene \ + --collection JsonVectorCollection \ + --input embeddings_msmarco-v2-doc-segmented-unicoil-0shot-v2 \ + --index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot-v2 \ + --generator DefaultLuceneDocumentGenerator \ + --threads 12 \ + --impact --pretokenized --optimize +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot.20220219.6a7080.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot.20220219.6a7080.README.md new file mode 100644 index 0000000000000000000000000000000000000000..89cee4cbb9dfbc875f6f3c2c45b91e2ce4d8640b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot.20220219.6a7080.README.md @@ -0,0 +1,14 @@ +# msmarco-v2-doc-segmented-unicoil-0shot + +Lucene impact index of the MS MARCO V2 segmented document corpus for uniCOIL. + +This index was generated on 2022/02/19 at Anserini commit [`6a7080`](https://github.com/castorini/anserini/commit/6a708047f71528f7d516c0dd45485204a36e6b1d) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /store/collections/msmarco/msmarco_v2_doc_segmented_unicoil_0shot \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot.20220219.6a7080/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 18 -impact -pretokenized -optimize +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..09d20888e080e26390914aaf5a0e68887b03e8dc --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot.20220808.4d6d2a.README.md @@ -0,0 +1,15 @@ +# msmarco-v2-doc-segmented-unicoil-0shot + +Lucene impact index of the MS MARCO V2 segmented document corpus for uniCOIL with title prepended.
+ +This index was generated on 2022/08/08 at Anserini commit [`4d6d2a`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following command: + +```bash +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /scratch2/collections/msmarco/msmarco_v2_doc_segmented_unicoil_0shot_v2 \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot.20220808.4d6d2a/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 18 -impact -pretokenized -optimize \ + >& logs/log.msmarco-v2-doc-segmented-unicoil-0shot.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.20220419.c47993.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.20220419.c47993.README.md new file mode 100644 index 0000000000000000000000000000000000000000..e3f573d32dd2da65a5a242508224d683ec55d23b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.20220419.c47993.README.md @@ -0,0 +1,15 @@ +# msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 + +Lucene impact index of the MS MARCO V2 segmented document corpus for uniCOIL (noexp) with title prepended. + +This index was generated on 2022/04/19 at Pyserini commit [`c47993`](https://github.com/castorini/pyserini/commit/c47993aa2bebb8ab0a418214cfd299c0d0351c81) on `orca` with the following command: + +``` +python -m pyserini.index.lucene \ + --collection JsonVectorCollection \ + --input embeddings_msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 \ + --index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 \ + --generator DefaultLuceneDocumentGenerator \ + --threads 12 \ + --impact --pretokenized --optimize +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220219.6a7080.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220219.6a7080.README.md new file mode 100644 index 0000000000000000000000000000000000000000..6253e9ef7149a992183319541be25414ff9773d7 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220219.6a7080.README.md @@ -0,0 +1,14 @@ +# msmarco-v2-doc-segmented-unicoil-noexp-0shot + +Lucene impact index of the MS MARCO V2 segmented document corpus for uniCOIL (noexp).
+ +This index was generated on 2022/02/19 at Anserini commit [`6a7080`](https://github.com/castorini/anserini/commit/6a708047f71528f7d516c0dd45485204a36e6b1d) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /store/collections/msmarco/msmarco_v2_doc_segmented_unicoil_noexp_0shot \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220219.6a7080/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 18 -impact -pretokenized -optimize +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..2727c93143c25c327efb3bacb9ee4caf004bb249 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220808.4d6d2a.README.md @@ -0,0 +1,15 @@ +# msmarco-v2-doc-segmented-unicoil-noexp-0shot + +Lucene impact index of the MS MARCO V2 segmented document corpus for uniCOIL (noexp) with title prepended. + +This index was generated on 2022/08/08 at Anserini commit [`4d6d2a`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following command: + +```bash +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /scratch2/collections/msmarco/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2 \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220808.4d6d2a/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 18 -impact -pretokenized -optimize \ + >& logs/log.msmarco-v2-doc-segmented-unicoil-noexp-0shot.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..04835ddbbe6e2b9d0f9c0db6ca312a3c9bec434e --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-doc-segmented + +Lucene index of the MS MARCO V2 segmented document corpus. + +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_doc_segmented/ \ + -index indexes/lucene-index.msmarco-v2-doc-segmented.20220111.06fb4f/ \ + -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-doc-segmented` (128G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-doc-segmented-slim` (25G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index.
++ `msmarco-v2-doc-segmented-full` (217G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "default" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..c64d860b9985c8073afc34258e313cf51ce1ad94 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-segmented.20220808.4d6d2a.README.md @@ -0,0 +1,34 @@ +# msmarco-v2-doc-segmented + +Lucene index of the MS MARCO V2 segmented document corpus. + +Note that there are three variants of this index: + ++ `msmarco-v2-doc-segmented` (132G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-doc-segmented-slim` (26G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-doc-segmented-full` (233G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +These indexes were generated on 2022/08/08 at Anserini commit [`4d6d2a`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following commands: + +```bash +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc_segmented/ \ + -index indexes/lucene-index.msmarco-v2-doc-segmented.20220808.4d6d2a/ \ + -storeRaw -optimize \ + >& logs/log.msmarco-v2-doc-segmented.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc_segmented/ \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-slim.20220808.4d6d2a/ \ + -optimize \ + >& logs/log.msmarco-v2-doc-segmented-slim.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc_segmented/ \ + -index indexes/lucene-index.msmarco-v2-doc-segmented-full.20220808.4d6d2a/ \ + -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.msmarco-v2-doc-segmented-full.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-slim.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-slim.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..a99f257ebf45f311c2df6a7358674a1c4104f180 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc-slim.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-doc-slim + +Lucene index of the MS MARCO V2 document corpus.
+ +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_doc/ \ + -index indexes/lucene-index.msmarco-v2-doc-slim.20220111.06fb4f/ \ + -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-doc` (73G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-doc-slim` (8.2G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-doc-full` (132G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "slim" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..4e9164f02d5386adeffe75b7a42bfb09d10069a7 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-doc + +Lucene index of the MS MARCO V2 document corpus. + +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_doc/ \ + -index indexes/lucene-index.msmarco-v2-doc.20220111.06fb4f/ \ + -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-doc` (73G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-doc-slim` (8.2G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-doc-full` (132G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "default" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..77c05dcafb1799f362d9753b9ce9d642ee70f19d --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-doc.20220808.4d6d2a.README.md @@ -0,0 +1,34 @@ +# msmarco-v2-doc + +Lucene index of the MS MARCO V2 document corpus. 
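+
+For bulk experiments over this corpus, queries are usually issued in batches; a minimal sketch (assuming the prebuilt index name `msmarco-v2-doc`; queries and ids are placeholders):
+
+```python
+from pyserini.search.lucene import LuceneSearcher
+
+searcher = LuceneSearcher.from_prebuilt_index('msmarco-v2-doc')
+queries = ['hybrid car mileage', 'symptoms of strep throat']
+qids = ['q1', 'q2']
+# Multithreaded batch retrieval; returns {qid: list of hits}.
+results = searcher.batch_search(queries, qids, k=10, threads=4)
+print(results['q1'][0].docid, results['q1'][0].score)
+```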
+ +Note that there are three variants of this index: + ++ `msmarco-v2-doc` (73G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-doc-slim` (8.0G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-doc-full` (132G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +These indexes were generated on 2022/08/08 at Anserini commit [`4d6d2a`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following commands: + +```bash +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc/ \ + -index indexes/lucene-index.msmarco-v2-doc.20220808.4d6d2a/ \ + -storeRaw -optimize \ + >& logs/log.msmarco-v2-doc.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc/ \ + -index indexes/lucene-index.msmarco-v2-doc-slim.20220808.4d6d2a/ \ + -optimize \ + >& logs/log.msmarco-v2-doc-slim.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_doc/ \ + -index indexes/lucene-index.msmarco-v2-doc-full.20220808.4d6d2a/ \ + -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.msmarco-v2-doc-full.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-d2q-t5-docvectors.20220525.30c997.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-d2q-t5-docvectors.20220525.30c997.README.md new file mode 100644 index 0000000000000000000000000000000000000000..19538faf1e93bc36b0d066bbc4be3502707efac7 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-d2q-t5-docvectors.20220525.30c997.README.md @@ -0,0 +1,16 @@ +# msmarco-v2-passage-augmented-d2q-t5-docvectors + +Lucene index (+docvectors) of the MS MARCO V2 augmented passage corpus, with doc2query-T5 expansions. + +This index was generated on 2022/05/25 at Anserini commit [`30c997`](https://github.com/castorini/anserini/commit/30c9974f495a06c94d576d0e9c2c5861515e0e19) on `damiano` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_passage_augmented_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-passage-augmented-d2q-t5-docvectors.20220525.30c997/ \ + -storeDocvectors -optimize +``` + +Note that this index stores term frequencies along with the docvectors: bag-of-words queries and relevance feedback are supported, but not phrase queries. +The raw text is not stored.
diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220201.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220201.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..27fe35125aa6ecf3bf206f8b5ffcfd9164c9adbd --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220201.9ea315.README.md @@ -0,0 +1,15 @@ +# msmarco-v2-passage-augmented-d2q-t5 + +Lucene index of the MS MARCO V2 augmented passage corpus, with doc2query-T5 expansions. + +This index was generated on 2022/02/01 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_passage_augmented_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220201.9ea315/ \ + -optimize +``` + +Note that this index stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. In addition, there is no way to fetch the raw text. diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..782e3fe140660d83ae5ed7e7b084d48c83db8b20 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220808.4d6d2a.README.md @@ -0,0 +1,26 @@ +# msmarco-v2-passage-augmented-d2q-t5 + +Lucene index of the MS MARCO V2 augmented passage corpus, with doc2query-T5 expansions. + +Note that there are two variants: + ++ `msmarco-v2-passage-augmented-d2q-t5` (26G uncompressed): stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text. ++ `msmarco-v2-passage-augmented-d2q-t5-docvectors` (111G uncompressed): stores term frequencies and the docvectors, which enables pseudo-relevance feedback.
+ +These indexes were generated on 2022/08/08 at Anserini commit [`fbe35e`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following commands: + +```bash +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_passage_augmented_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-passage-augmented-d2q-t5.20220808.4d6d2a/ \ + -optimize \ + >& logs/log.msmarco-v2-passage-augmented-d2q-t5.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_passage_augmented_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-passage-augmented-d2q-t5-docvectors.20220808.4d6d2a/ \ + -storeDocvectors -optimize \ + >& logs/log.msmarco-v2-passage-augmented-d2q-t5-docvectors.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-full.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-full.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..ee627583a0691de7dd70ebeac6d0488d1b7e36cc --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-full.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-passage-augmented-full + +Lucene index of the MS MARCO V2 augmented passage corpus. + +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_passage_augmented/ \ + -index indexes/lucene-index.msmarco-v2-passage-augmented-full.20220111.06fb4f/ \ + -storePositions -storeDocvectors -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-passage-augmented` (82G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-passage-augmented-slim` (18G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-passage-augmented-full` (142G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "full" version. 
\ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-slim.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-slim.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..61e5d0090e304f689c7418c30f6407aa033077cf --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented-slim.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-passage-augmented-slim + +Lucene index of the MS MARCO V2 augmented passage corpus. + +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_passage_augmented/ \ + -index indexes/lucene-index.msmarco-v2-passage-augmented-slim.20220111.06fb4f/ \ + -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-passage-augmented` (82G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-passage-augmented-slim` (18G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-passage-augmented-full` (142G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "slim" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..3aff53355dc5a138a0012515d3bc516f398ff5af --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-passage-augmented + +Lucene index of the MS MARCO V2 augmented passage corpus. + +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_passage_augmented/ \ + -index indexes/lucene-index.msmarco-v2-passage-augmented.20220111.06fb4f/ \ + -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-passage-augmented` (82G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-passage-augmented-slim` (18G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. 
++ `msmarco-v2-passage-augmented-full` (142G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "default" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..6be4bee58b5d5d4ae0a459d634584d035c2ed94b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-augmented.20220808.4d6d2a.README.md @@ -0,0 +1,34 @@ +# msmarco-v2-passage-augmented + +Lucene index of the MS MARCO V2 augmented passage corpus. + +Note that there are three variants of this index: + ++ `msmarco-v2-passage-augmented` (93G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-passage-augmented-slim` (20G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-passage-augmented-full` (157G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +These indexes were generated on 2022/08/08 at Anserini commit [`fbe35e`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following commands: + +```bash +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_passage_augmented/ \ + -index indexes/lucene-index.msmarco-v2-passage-augmented.20220808.4d6d2a/ \ + -storeRaw -optimize \ + >& logs/log.msmarco-v2-passage-augmented.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_passage_augmented/ \ + -index indexes/lucene-index.msmarco-v2-passage-augmented-slim.20220808.4d6d2a/ \ + -optimize \ + >& logs/log.msmarco-v2-passage-augmented-slim.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_passage_augmented/ \ + -index indexes/lucene-index.msmarco-v2-passage-augmented-full.20220808.4d6d2a/ \ + -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.msmarco-v2-passage-augmented-full.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-d2q-t5-docvectors.20220525.30c997.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-d2q-t5-docvectors.20220525.30c997.README.md new file mode 100644 index 0000000000000000000000000000000000000000..59fd7e47baf841cf1dc6ccbd5eb93634630c8740 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-d2q-t5-docvectors.20220525.30c997.README.md @@ -0,0 +1,16 @@ +# 
msmarco-v2-passage-d2q-t5-docvectors + +Lucene index (+docvectors) of the MS MARCO V2 passage corpus, with doc2query-T5 expansions. + +This index was generated on 2022/05/25 at Anserini commit [`30c997`](https://github.com/castorini/anserini/commit/30c9974f495a06c94d576d0e9c2c5861515e0e19) on `damiano` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ +  -generator DefaultLuceneDocumentGenerator -threads 18 \ +  -input /scratch2/collections/msmarco/msmarco_v2_passage_d2q-t5/ \ +  -index indexes/lucene-index.msmarco-v2-passage-d2q-t5-docvectors.20220525.30c997/ \ +  -storeDocvectors -optimize +``` + +Note that this index stores term frequencies along with the docvectors: bag-of-words queries and relevance feedback are supported, but not phrase queries. +The raw text is not stored. diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-d2q-t5.20220201.9ea315.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-d2q-t5.20220201.9ea315.README.md new file mode 100644 index 0000000000000000000000000000000000000000..37f9b289a37cbc346067860dcee02a3fe919e2c5 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-d2q-t5.20220201.9ea315.README.md @@ -0,0 +1,15 @@ +# msmarco-v2-passage-d2q-t5 + +Lucene index of the MS MARCO V2 passage corpus, with doc2query-T5 expansions. + +This index was generated on 2022/02/01 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/9ea3159adeeffd84e10e197af4c36febb5b74c7b) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ +  -generator DefaultLuceneDocumentGenerator -threads 18 \ +  -input /store/collections/msmarco/msmarco_v2_passage_d2q-t5/ \ +  -index indexes/lucene-index.msmarco-v2-passage-d2q-t5.20220201.9ea315/ \ +  -optimize +``` + +Note that this index stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. In addition, there is no way to fetch the raw text. diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-d2q-t5.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-d2q-t5.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..23c350f0b81ca6776ce77551c437b6bc40a94c08 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-d2q-t5.20220808.4d6d2a.README.md @@ -0,0 +1,26 @@ +# msmarco-v2-passage-d2q-t5 + +Lucene index of the MS MARCO V2 passage corpus, with doc2query-T5 expansions. + +Note that there are two variants: + ++ `msmarco-v2-passage-d2q-t5` (19G uncompressed): stores term frequencies only, which supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text. ++ `msmarco-v2-passage-d2q-t5-docvectors` (71G uncompressed): stores term frequencies and the docvectors, which enables pseudo-relevance feedback. 
+ +These indexes were generated on 2022/08/08 at Anserini commit [`fbe35e`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following commands: + +```bash +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_passage_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-passage-d2q-t5.20220808.4d6d2a/ \ + -optimize \ + >& logs/log.msmarco-v2-passage-d2q-t5.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_passage_d2q-t5/ \ + -index indexes/lucene-index.msmarco-v2-passage-d2q-t5-docvectors.20220808.4d6d2a/ \ + -storeDocvectors -optimize \ + >& logs/log.msmarco-v2-passage-d2q-t5-docvectors.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-full.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-full.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..8fd87fdff81a9f0dc3326e890790f514906092c4 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-full.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-passage-full + +Lucene index of the MS MARCO V2 passage corpus. + +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_passage/ \ + -index indexes/lucene-index.msmarco-v2-passage-full.20220111.06fb4f/ \ + -storePositions -storeDocvectors -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-passage` (45G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-passage-slim` (11G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-passage-full` (69G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "full" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-slim.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-slim.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..e3f5e1b578987061a5547305f4aae5f2ac34ef83 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-slim.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-passage-slim + +Lucene index of the MS MARCO V2 passage corpus. 
+ +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_passage/ \ + -index indexes/lucene-index.msmarco-v2-passage-slim.20220111.06fb4f/ \ + -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-passage` (45G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-passage-slim` (11G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-passage-full` (69G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "slim" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-0shot.20220219.6a7080.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-0shot.20220219.6a7080.README.md new file mode 100644 index 0000000000000000000000000000000000000000..af3c94f3796f4c7c9dd34bb26ec39db9323caf7c --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-0shot.20220219.6a7080.README.md @@ -0,0 +1,14 @@ +# msmarco-v2-passage-unicoil-0shot + +Lucene impact index of the MS MARCO V2 passage corpus for uniCOIL. + +This index was generated on 2022/02/19 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/6a708047f71528f7d516c0dd45485204a36e6b1d) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /store/collections/msmarco/msmarco_v2_passage_unicoil_0shot \ + -index indexes/lucene-index.msmarco-v2-passage-unicoil-0shot.20220219.6a7080/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 18 -impact -pretokenized -optimize +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-0shot.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-0shot.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..6c837f8a06cf7f9f431c0d08b98d8752f8551ed4 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-0shot.20220808.4d6d2a.README.md @@ -0,0 +1,15 @@ +# msmarco-v2-passage-unicoil-0shot + +Lucene impact index of the MS MARCO V2 passage corpus for uniCOIL. 
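Unlike the BM25 indexes above, an impact index stores precomputed uniCOIL term weights, so the query has to be encoded into the same weighted-term space at search time. A rough sketch of how such an index might be queried from Python follows; the prebuilt-index key and the `castorini/unicoil-msmarco-passage` query-encoder checkpoint are assumptions, not something this README specifies.

```python
# Illustrative sketch; the prebuilt-index key and query-encoder name are assumptions.
from pyserini.search.lucene import LuceneImpactSearcher

searcher = LuceneImpactSearcher.from_prebuilt_index(
    'msmarco-v2-passage-unicoil-0shot',     # impact index built with -impact -pretokenized
    'castorini/unicoil-msmarco-passage')    # encodes queries into uniCOIL term weights
hits = searcher.search('what is a lobster roll', k=10)
for hit in hits[:3]:
    print(hit.docid, hit.score)
```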
+ +This index was generated on 2022/08/08 at Anserini commit [`fbe35e`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following command: + +```bash +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /scratch2/collections/msmarco/msmarco_v2_passage_unicoil_0shot \ + -index indexes/lucene-index.msmarco-v2-passage-unicoil-0shot.20220808.4d6d2a/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 18 -impact -pretokenized -optimize \ + >& logs/log.msmarco-v2-passage-unicoil-0shot.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot.20220219.6a7080.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot.20220219.6a7080.README.md new file mode 100644 index 0000000000000000000000000000000000000000..a7959c822e1b3db3f60c818c54fc2f518fa598fa --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot.20220219.6a7080.README.md @@ -0,0 +1,14 @@ +# msmarco-v2-passage-unicoil-noexp-0shot + +Lucene impact index of the MS MARCO V2 passage corpus for uniCOIL (noexp). + +This index was generated on 2022/02/19 at Anserini commit [`9ea315`](https://github.com/castorini/anserini/commit/6a708047f71528f7d516c0dd45485204a36e6b1d) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /store/collections/msmarco/msmarco_v2_passage_unicoil_noexp_0shot \ + -index indexes/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot.20220219.6a7080/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 18 -impact -pretokenized -optimize +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..97dd1db97231532bcb9aff56ede3f73789bd381b --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot.20220808.4d6d2a.README.md @@ -0,0 +1,15 @@ +# msmarco-v2-passage-unicoil-noexp-0shot + +Lucene impact index of the MS MARCO V2 passage corpus for uniCOIL (noexp). + +This index was generated on 2022/08/08 at Anserini commit [`fbe35e`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following command: + +```bash +nohup target/appassembler/bin/IndexCollection \ + -collection JsonVectorCollection \ + -input /scratch2/collections/msmarco/msmarco_v2_passage_unicoil_noexp_0shot \ + -index indexes/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot.20220808.4d6d2a/ \ + -generator DefaultLuceneDocumentGenerator \ + -threads 18 -impact -pretokenized -optimize \ + >& logs/log.msmarco-v2-passage-unicoil-noexp-0shot.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.20220111.06fb4f.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.20220111.06fb4f.README.md new file mode 100644 index 0000000000000000000000000000000000000000..9cbb2e23e868f75a140cb60dda4cf12fdc9236d8 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.20220111.06fb4f.README.md @@ -0,0 +1,21 @@ +# msmarco-v2-passage + +Lucene index of the MS MARCO V2 passage corpus. 
+ +This index was generated on 2022/01/11 at Anserini commit [`06fb4f`](https://github.com/castorini/anserini/commit/06fb4f9947ff2167c276d8893287453af7680786) on `orca` with the following command: + +``` +target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /store/collections/msmarco/msmarco_v2_passage/ \ + -index indexes/lucene-index.msmarco-v2-passage.20220111.06fb4f/ \ + -storeRaw -optimize +``` + +Note that there are three variants of this index: + ++ `msmarco-v2-passage` (45G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-passage-slim` (11G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-passage-full` (69G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. + +This is the "default" version. \ No newline at end of file diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.20220808.4d6d2a.README.md b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.20220808.4d6d2a.README.md new file mode 100644 index 0000000000000000000000000000000000000000..4ea429f61c9a94bd5cd7cbeb325455e7b9ffc214 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.20220808.4d6d2a.README.md @@ -0,0 +1,34 @@ +# msmarco-v2-passage + +Lucene index of the MS MARCO V2 passage corpus. + +Note that there are three variants of this index: + ++ `msmarco-v2-passage` (48G uncompressed): the "default" version, which stores term frequencies and the raw text. This supports bag-of-words queries, but no phrase queries and no relevance feedback. ++ `msmarco-v2-passage-slim` (13G uncompressed): the "slim" version, which stores term frequencies only. This supports bag-of-words queries, but no phrase queries and no relevance feedback. There is no way to fetch the raw text from this index. ++ `msmarco-v2-passage-full` (72G uncompressed): the "full" version, which stores term frequencies, term positions, document vectors, and the raw text. This supports bag-of-words queries, phrase queries, and relevance feedback. 
+ +These indexes were generated on 2022/08/08 at Anserini commit [`fbe35e`](https://github.com/castorini/anserini/commit/4d6d2a5a367424131331df2a8e9e00e6a9c68856) on `damiano` with the following commands: + +```bash +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_passage/ \ + -index indexes/lucene-index.msmarco-v2-passage.20220808.4d6d2a/ \ + -storeRaw -optimize \ + >& logs/log.msmarco-v2-passage.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_passage/ \ + -index indexes/lucene-index.msmarco-v2-passage-slim.20220808.4d6d2a/ \ + -optimize \ + >& logs/log.msmarco-v2-passage-slim.20220808.4d6d2a.txt & + +nohup target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \ + -generator DefaultLuceneDocumentGenerator -threads 18 \ + -input /scratch2/collections/msmarco/msmarco_v2_passage/ \ + -index indexes/lucene-index.msmarco-v2-passage-full.20220808.4d6d2a/ \ + -storePositions -storeDocvectors -storeRaw -optimize \ + >& logs/log.msmarco-v2-passage-full.20220808.4d6d2a.txt & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.unicoil-noexp-0shot.20211012.58d286.readme.txt b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.unicoil-noexp-0shot.20211012.58d286.readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..fea735fec452f7c8f1edea2ebb9ce79b5699aaa7 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.unicoil-noexp-0shot.20211012.58d286.readme.txt @@ -0,0 +1,12 @@ +This index was generated on 2021/10/12 at commit 58d286c3f9fe845e261c271f2a0f514462844d97 (2021/10/05) +with the following command: + +python -m pyserini.index -collection JsonVectorCollection \ + -input collections/msmarco-v2-passage-unicoil-noexp-0shot-b8 \ + -index indexes/lucene-index.msmarco-v2-passage.unicoil-noexp-0shot.20211012.58d286 \ + -generator DefaultLuceneDocumentGenerator -impact -pretokenized \ + -threads 36 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). + +lucene-index.msmarco-v2-passage.unicoil-noexp-0shot.20211012.58d286.tar.gz MD5 checksum = 8886a8d9599838bc6d8d61464da61086 diff --git a/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.unicoil-tilde.20211012.58d286.readme.txt b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.unicoil-tilde.20211012.58d286.readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e772713ce6ff6135182ca0984bf05b7ef67bb53 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.unicoil-tilde.20211012.58d286.readme.txt @@ -0,0 +1,12 @@ +This index was generated on 2021/10/12 at commit 58d286c3f9fe845e261c271f2a0f514462844d97 (2021/10/05) +with the following command: + +python -m pyserini.index -collection JsonVectorCollection \ + -input collections/msmarco-v2-passage-unicoil-tilde-expansion-b8/ \ + -index indexes/lucene-index.msmarco-v2-passage.unicoil-tilde.20211012.58d286 \ + -generator DefaultLuceneDocumentGenerator -impact -pretokenized \ + -threads 36 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). 
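Beyond the MD5 checksum recorded below, a quick way to sanity-check a downloaded copy of one of these minimal indexes is to open it and print its collection statistics. A hedged sketch, assuming the prebuilt-index key `msmarco-v2-passage-unicoil-noexp-0shot` exists and that `IndexReader` lives under `pyserini.index.lucene` in this version:

```python
# Illustrative sketch; the prebuilt-index key and the import path are assumptions.
from pyserini.index.lucene import IndexReader

reader = IndexReader.from_prebuilt_index('msmarco-v2-passage-unicoil-noexp-0shot')
print(reader.stats())  # document and term counts of the downloaded index
```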
+ +lucene-index.msmarco-v2-passage.unicoil-tilde.20211012.58d286.tar.gz MD5 checksum = 562f9534eefe04ab8c07beb304074d41 diff --git a/pyserini/resources/index-metadata/lucene-index.neuclir22-en.20221025.c4a8d0.README.md b/pyserini/resources/index-metadata/lucene-index.neuclir22-en.20221025.c4a8d0.README.md new file mode 100644 index 0000000000000000000000000000000000000000..1936a3fc7cb1fbe552525e0d4a4ff572e801bf54 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.neuclir22-en.20221025.c4a8d0.README.md @@ -0,0 +1,34 @@ +# NeuCLIR 2022 Indexes (English) + +Lucene indexes for the NeuCLIR 2022 corpora (official English translations from Persian, Russian, and Chinese). + +These indexes were generated on 2022/10/25 at Anserini commit [`c4a8d0`](https://github.com/castorini/anserini/commit/c4a8d00e3c218ed89dca8a4e51c3b2c7d577db00) on `tuna` with the following commands: + +```bash +# NeuCLIR22 fa -> en +nohup target/appassembler/bin/IndexCollection \ +  -collection NeuClirCollection \ +  -input /tuna1/collections/multilingual/neuclir22-fa-en \ +  -index indexes/lucene-index.neuclir22-fa-en.20221025.c4a8d0 \ +  -generator DefaultLuceneDocumentGenerator \ +  -threads 8 -storePositions -storeDocvectors -storeRaw -optimize \ +  >& logs/log.neuclir22-fa-en.20221025.c4a8d0 & + +# NeuCLIR22 ru -> en +nohup target/appassembler/bin/IndexCollection \ +  -collection NeuClirCollection \ +  -input /tuna1/collections/multilingual/neuclir22-ru-en \ +  -index indexes/lucene-index.neuclir22-ru-en.20221025.c4a8d0 \ +  -generator DefaultLuceneDocumentGenerator \ +  -threads 8 -storePositions -storeDocvectors -storeRaw -optimize \ +  >& logs/log.neuclir22-ru-en.20221025.c4a8d0 & + +# NeuCLIR22 zh -> en +nohup target/appassembler/bin/IndexCollection \ +  -collection NeuClirCollection \ +  -input /tuna1/collections/multilingual/neuclir22-zh-en \ +  -index indexes/lucene-index.neuclir22-zh-en.20221025.c4a8d0 \ +  -generator DefaultLuceneDocumentGenerator \ +  -threads 8 -storePositions -storeDocvectors -storeRaw -optimize \ +  >& logs/log.neuclir22-zh-en.20221025.c4a8d0 & +``` diff --git a/pyserini/resources/index-metadata/lucene-index.neuclir22-fa.20220719.71c120.README.md b/pyserini/resources/index-metadata/lucene-index.neuclir22-fa.20220719.71c120.README.md new file mode 100644 index 0000000000000000000000000000000000000000..8811a43297f7a06ddebb610d2463ae60f265b93a --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.neuclir22-fa.20220719.71c120.README.md @@ -0,0 +1,14 @@ +# neuclir22-fa + +Lucene index for Neuclir22 (Persian). 
+ +This index was generated on 2022/07/19 at Anserini commit [`71c120`](https://github.com/castorini/anserini/commit/71c1200d36ce17615cf4da510ac4ef2d2f0121f6) on `orca` with the following command: + + +``` +target/appassembler/bin/IndexCollection -collection NeuClirCollection \ + -generator DefaultLuceneDocumentGenerator -threads 8 \ + -input /store/collections/multilingual/neuclir22-fa \ + -index indexes/lucene-index.neuclir22-fa.20220719.71c120 \ + -storePositions -storeDocvectors -storeRaw -optimize -language fa +``` diff --git a/pyserini/resources/index-metadata/lucene-index.neuclir22-ru.20220719.71c120.README.md b/pyserini/resources/index-metadata/lucene-index.neuclir22-ru.20220719.71c120.README.md new file mode 100644 index 0000000000000000000000000000000000000000..81445ff3336a7629ce702e5cda912c19031f5eb5 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.neuclir22-ru.20220719.71c120.README.md @@ -0,0 +1,14 @@ +# neuclir22-ru + +Lucene index for Neuclir22 (Russian). + +This index was generated on 2022/07/19 at Anserini commit [`71c120`](https://github.com/castorini/anserini/commit/71c1200d36ce17615cf4da510ac4ef2d2f0121f6) on `orca` with the following command: + + +``` +target/appassembler/bin/IndexCollection -collection NeuClirCollection \ + -generator DefaultLuceneDocumentGenerator -threads 8 \ + -input /store/collections/multilingual/neuclir22-ru \ + -index indexes/lucene-index.neuclir22-ru.20220719.71c120 \ + -storePositions -storeDocvectors -storeRaw -optimize -language ru +``` diff --git a/pyserini/resources/index-metadata/lucene-index.neuclir22-zh.20220719.71c120.README.md b/pyserini/resources/index-metadata/lucene-index.neuclir22-zh.20220719.71c120.README.md new file mode 100644 index 0000000000000000000000000000000000000000..908ec4242bc70a2933e0aba9dceb85514abc0520 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.neuclir22-zh.20220719.71c120.README.md @@ -0,0 +1,14 @@ +# neuclir22-zh + +Lucene index for Neuclir22 (Chinese). + +This index was generated on 2022/07/19 at Anserini commit [`71c120`](https://github.com/castorini/anserini/commit/71c1200d36ce17615cf4da510ac4ef2d2f0121f6) on `orca` with the following command: + + +``` +target/appassembler/bin/IndexCollection -collection NeuClirCollection \ + -generator DefaultLuceneDocumentGenerator -threads 8 \ + -input /store/collections/multilingual/neuclir22-zh \ + -index indexes/lucene-index.neuclir22-zh.20220719.71c120 \ + -storePositions -storeDocvectors -storeRaw -optimize -language zh +``` diff --git a/pyserini/resources/index-metadata/lucene-index.neuclir22.20221025.c4a8d0.README.md b/pyserini/resources/index-metadata/lucene-index.neuclir22.20221025.c4a8d0.README.md new file mode 100644 index 0000000000000000000000000000000000000000..892efb9d39ea3e48b10876170f355c87cad30423 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-index.neuclir22.20221025.c4a8d0.README.md @@ -0,0 +1,34 @@ +# NeuCLIR 2022 Indexes + +Lucene indexes for the NeuCLIR 2022 corpora (Persian, Russian, and Chinese). 
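Because each of these monolingual indexes was built with a language-specific analyzer (the `-language fa`, `-language ru`, and `-language zh` flags above), queries should be analyzed the same way at search time so that query terms match the indexed forms. A minimal sketch against the Persian index path used above; the local path and the query string are illustrative only.

```python
# Illustrative sketch; assumes the index was built locally at the path from the README above.
from pyserini.search.lucene import LuceneSearcher

searcher = LuceneSearcher('indexes/lucene-index.neuclir22-fa.20220719.71c120')
searcher.set_language('fa')                  # mirror the -language fa flag used at indexing time
hits = searcher.search('واکسن کرونا', k=10)  # illustrative Persian query
for hit in hits[:3]:
    print(hit.docid, hit.score)
```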
+ +These indexes were generated on 2022/10/25 at Anserini commit [`c4a8d0`](https://github.com/castorini/anserini/commit/c4a8d00e3c218ed89dca8a4e51c3b2c7d577db00) on `tuna` with the following commands: + +```bash +# NeuCLIR22 fa +nohup target/appassembler/bin/IndexCollection \ +  -collection NeuClirCollection \ +  -input /tuna1/collections/multilingual/neuclir22-fa \ +  -index indexes/lucene-index.neuclir22-fa.20221025.c4a8d0 \ +  -generator DefaultLuceneDocumentGenerator \ +  -threads 8 -storePositions -storeDocvectors -storeRaw -language fa -optimize \ +  >& logs/log.neuclir22-fa.20221025.c4a8d0 & + +# NeuCLIR22 ru +nohup target/appassembler/bin/IndexCollection \ +  -collection NeuClirCollection \ +  -input /tuna1/collections/multilingual/neuclir22-ru \ +  -index indexes/lucene-index.neuclir22-ru.20221025.c4a8d0 \ +  -generator DefaultLuceneDocumentGenerator \ +  -threads 8 -storePositions -storeDocvectors -storeRaw -language ru -optimize \ +  >& logs/log.neuclir22-ru.20221025.c4a8d0 & + +# NeuCLIR22 zh +nohup target/appassembler/bin/IndexCollection \ +  -collection NeuClirCollection \ +  -input /tuna1/collections/multilingual/neuclir22-zh \ +  -index indexes/lucene-index.neuclir22-zh.20221025.c4a8d0 \ +  -generator DefaultLuceneDocumentGenerator \ +  -threads 8 -storePositions -storeDocvectors -storeRaw -language zh -optimize \ +  >& logs/log.neuclir22-zh.20221025.c4a8d0 & +``` diff --git a/pyserini/resources/jars/.placeholder b/pyserini/resources/jars/.placeholder new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pyserini/resources/jars/anserini-0.21.1-SNAPSHOT-fatjar.jar b/pyserini/resources/jars/anserini-0.21.1-SNAPSHOT-fatjar.jar new file mode 100644 index 0000000000000000000000000000000000000000..6277a16e9697f2a8f3358763d26bf6ed2cddf5d8 --- /dev/null +++ b/pyserini/resources/jars/anserini-0.21.1-SNAPSHOT-fatjar.jar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b33e0e837fe46235da136869148853d411be0adf64fe6753c4aa294746df9a88 +size 145665105 diff --git a/pyserini/resources/naturalquestion.yaml b/pyserini/resources/naturalquestion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8050af038b762844b122a8da6eb7b9d2ed989b9 --- /dev/null +++ b/pyserini/resources/naturalquestion.yaml @@ -0,0 +1,66 @@ +conditions: +  - model_name: BM25-k1_0.9_b_0.4 +    command: +      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics nq-test --output $output --bm25 --k1 0.9 --b 0.4 +    scores: +      - Top5: 44.82 +        Top20: 64.02 +        Top100: 79.20 +        Top500: 86.59 +        Top1000: 88.95 +  - model_name: BM25-k1_0.9_b_0.4_dpr-topics +    command: +      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-nq-test --output $output --bm25 --k1 0.9 --b 0.4 +    scores: +      - Top5: 43.77 +        Top20: 62.99 +        Top100: 78.23 +        Top500: 85.60 +        Top1000: 88.01 +  - model_name: GarT5-RRF +    command: +      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics nq-test-gar-t5-answers --output $output --bm25 --k1 0.9 --b 0.4 +      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics nq-test-gar-t5-titles --output $output --bm25 --k1 0.9 --b 0.4 +      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics nq-test-gar-t5-sentences --output $output --bm25 --k1 0.9 --b 0.4 +    scores: +      - Top5: 64.62 +        Top20: 77.17 +        Top100: 86.90 +        Top500: 
91.63 + Top1000: 92.91 + - model_name: DPR + command: + - python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-100w.dpr-single-nq --encoder facebook/dpr-question_encoder-single-nq-base --topics nq-test --output $output + scores: + - Top5: 68.61 + Top20: 80.58 + Top100: 86.68 + Top500: 90.91 + Top1000: 91.83 + - model_name: DPR-DKRR + command: + - 'python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-100w.dkrr-nq --encoder castorini/dkrr-dpr-nq-retriever --topics nq-test --output $output --query-prefix question: ' + scores: + - Top5: 73.80 + Top20: 84.27 + Top100: 89.34 + Top500: 92.24 + Top1000: 93.43 + - model_name: DPR-Hybrid + command: + - python -m pyserini.search.hybrid dense --index wikipedia-dpr-100w.dpr-single-nq --encoder facebook/dpr-question_encoder-single-nq-base sparse --index wikipedia-dpr-100w fusion --alpha 1.2 run --topics nq-test --output $output --threads 72 --batch-size 128 + scores: + - Top5: 72.52 + Top20: 83.43 + Top100: 89.03 + Top500: 92.16 + Top1000: 93.19 + - model_name: GarT5RRF-DKRR-RRF + command: + - '' + scores: + - Top5: 74.57 + Top20: 84.90 + Top100: 90.86 + Top500: 93.35 + Top1000: 94.18 diff --git a/pyserini/resources/triviaqa.yaml b/pyserini/resources/triviaqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6d466a3045a4180532f7c75097c763a448dffba --- /dev/null +++ b/pyserini/resources/triviaqa.yaml @@ -0,0 +1,66 @@ +conditions: + - model_name: BM25-k1_0.9_b_0.4 + command: + - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-trivia-test --output $output --bm25 --k1 0.9 --b 0.4 + scores: + - Top5: 66.29 + Top20: 76.41 + Top100: 83.14 + Top500: 87.35 + Top1000: 88.50 + - model_name: BM25-k1_0.9_b_0.4_dpr-topics + command: + - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-trivia-test --output $output --bm25 --k1 0.9 --b 0.4 + scores: + - Top5: 66.29 + Top20: 76.41 + Top100: 83.14 + Top500: 87.35 + Top1000: 88.50 + - model_name: GarT5-RRF + command: + - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-trivia-test-gar-t5-answers --output $output --bm25 --k1 0.9 --b 0.4 + - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-trivia-test-gar-t5-titles --output $output --bm25 --k1 0.9 --b 0.4 + - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-trivia-test-gar-t5-sentences --output $output --bm25 --k1 0.9 --b 0.4 + scores: + - Top5: 72.82 + Top20: 80.66 + Top100: 85.95 + Top500: 89.07 + Top1000: 90.06 + - model_name: DPR + command: + - python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-100w.dpr-multi --encoder facebook/dpr-question_encoder-multiset-base --topics dpr-trivia-test --output $output + scores: + - Top5: 69.80 + Top20: 78.87 + Top100: 84.79 + Top500: 88.19 + Top1000: 89.30 + - model_name: DPR-DKRR + command: + - 'python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-100w.dkrr-tqa --encoder castorini/dkrr-dpr-tqa-retriever --topics dpr-trivia-test --output $output --query-prefix question: ' + scores: + - Top5: 77.23 + Top20: 83.74 + Top100: 87.78 + Top500: 89.87 + Top1000: 90.63 + - model_name: DPR-Hybrid + command: + - python -m pyserini.search.hybrid dense --index wikipedia-dpr-100w.dpr-multi --encoder 
facebook/dpr-question_encoder-multiset-base sparse --index wikipedia-dpr-100w fusion --alpha 0.95 run --topics dpr-trivia-test --output $output --threads 72 --batch-size 128 + scores: + - Top5: 76.01 + Top20: 82.64 + Top100: 86.55 + Top500: 89.12 + Top1000: 89.90 + - model_name: GarT5RRF-DKRR-RRF + command: + - '' + scores: + - Top5: 78.63 + Top20: 85.02 + Top100: 88.41 + Top500: 90.29 + Top1000: 90.83 \ No newline at end of file diff --git a/pyserini/search/__init__.py b/pyserini/search/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8679e46c62311bd8aee6a125579c024e1391a677 --- /dev/null +++ b/pyserini/search/__init__.py @@ -0,0 +1,61 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from ._base import JQuery, JQueryGenerator, JDisjunctionMaxQueryGenerator, get_topics,\ + get_topics_with_reader, get_qrels_file, get_qrels +from .lucene import JLuceneSearcherResult, LuceneSimilarities, LuceneFusionSearcher, LuceneSearcher +from .lucene import JImpactSearcherResult, LuceneImpactSearcher +from ._deprecated import SimpleSearcher, ImpactSearcher, SimpleFusionSearcher + +from .faiss import DenseSearchResult, PRFDenseSearchResult, FaissSearcher, BinaryDenseSearcher, QueryEncoder, \ + DprQueryEncoder, BprQueryEncoder, DkrrDprQueryEncoder, TctColBertQueryEncoder, AnceQueryEncoder, AggretrieverQueryEncoder, AutoQueryEncoder +from .faiss import AnceEncoder +from .faiss import DenseVectorAveragePrf, DenseVectorRocchioPrf, DenseVectorAncePrf + + +__all__ = ['JQuery', + 'LuceneSimilarities', + 'LuceneFusionSearcher', + 'LuceneSearcher', + 'JLuceneSearcherResult', + 'LuceneImpactSearcher', + 'JImpactSearcherResult', + 'JDisjunctionMaxQueryGenerator', + 'JQueryGenerator', + 'get_topics', + 'get_topics_with_reader', + 'get_qrels_file', + 'get_qrels', + 'SimpleSearcher', + 'ImpactSearcher', + 'SimpleFusionSearcher', + 'DenseSearchResult', + 'PRFDenseSearchResult', + 'FaissSearcher', + 'BinaryDenseSearcher', + 'QueryEncoder', + 'DprQueryEncoder', + 'BprQueryEncoder', + 'DkrrDprQueryEncoder', + 'TctColBertQueryEncoder', + 'AnceEncoder', + 'AnceQueryEncoder', + 'AggretrieverQueryEncoder', + 'AutoQueryEncoder', + 'DenseVectorAveragePrf', + 'DenseVectorRocchioPrf', + 'DenseVectorAncePrf'] + diff --git a/pyserini/search/__main__.py b/pyserini/search/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..57e4826ce2700d8cdee229f50e02fc0fbdc57507 --- /dev/null +++ b/pyserini/search/__main__.py @@ -0,0 +1,25 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import os + +print('WARNING: directly calling pyserini.search is deprecated, please use pyserini.search.lucene instead') +args = " ".join(sys.argv[1:]) +os.system(f'python -m pyserini.search.lucene {args}') + + + diff --git a/pyserini/search/__pycache__/__init__.cpython-310.pyc b/pyserini/search/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69017ff61d166faee6586b3834558069376da184 Binary files /dev/null and b/pyserini/search/__pycache__/__init__.cpython-310.pyc differ diff --git a/pyserini/search/__pycache__/_base.cpython-310.pyc b/pyserini/search/__pycache__/_base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12fc2097c810320962ff2f6c746c098cba7304ca Binary files /dev/null and b/pyserini/search/__pycache__/_base.cpython-310.pyc differ diff --git a/pyserini/search/__pycache__/_deprecated.cpython-310.pyc b/pyserini/search/__pycache__/_deprecated.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7371a7bfe552215e41a3e8a81cef92ce5acc10e9 Binary files /dev/null and b/pyserini/search/__pycache__/_deprecated.cpython-310.pyc differ diff --git a/pyserini/search/_base.py b/pyserini/search/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..6e8cac4e2c2c1676885a0baa6aac672127c8eddc --- /dev/null +++ b/pyserini/search/_base.py @@ -0,0 +1,560 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This module provides Pyserini's Python search interface to Anserini. The main entry point is the ``LuceneSearcher`` +class, which wraps the Java class with the same name in Anserini. 
+""" + +import logging +import os + +from pyserini.util import get_cache_home +from pyserini.pyclass import autoclass + +logger = logging.getLogger(__name__) + +# Wrappers around Lucene classes +JQuery = autoclass('org.apache.lucene.search.Query') +JPath = autoclass('java.nio.file.Path') + +# Wrappers around Anserini classes +JQrels = autoclass('io.anserini.eval.Qrels') +JRelevanceJudgments = autoclass('io.anserini.eval.RelevanceJudgments') +JTopicReader = autoclass('io.anserini.search.topicreader.TopicReader') +JTopics = autoclass('io.anserini.search.topicreader.Topics') +JQueryGenerator = autoclass('io.anserini.search.query.QueryGenerator') +JBagOfWordsQueryGenerator = autoclass('io.anserini.search.query.BagOfWordsQueryGenerator') +JDisjunctionMaxQueryGenerator = autoclass('io.anserini.search.query.DisjunctionMaxQueryGenerator') +JCovid19QueryGenerator = autoclass('io.anserini.search.query.Covid19QueryGenerator') + +topics_mapping = { + 'trec1-adhoc': JTopics.TREC1_ADHOC, + 'trec2-adhoc': JTopics.TREC2_ADHOC, + 'trec3-adhoc': JTopics.TREC3_ADHOC, + 'robust04': JTopics.ROBUST04, + 'robust05': JTopics.ROBUST05, + 'core17': JTopics.CORE17, + 'core18': JTopics.CORE18, + 'wt10g': JTopics.WT10G, + 'trec2004-terabyte': JTopics.TREC2004_TERABYTE, + 'trec2005-terabyte': JTopics.TREC2005_TERABYTE, + 'trec2006-terabyte': JTopics.TREC2006_TERABYTE, + 'trec2007-million-query': JTopics.TREC2007_MILLION_QUERY, + 'trec2008-million-query': JTopics.TREC2008_MILLION_QUERY, + 'trec2009-million-query': JTopics.TREC2009_MILLION_QUERY, + 'trec2010-web': JTopics.TREC2010_WEB, + 'trec2011-web': JTopics.TREC2011_WEB, + 'trec2012-web': JTopics.TREC2012_WEB, + 'trec2013-web': JTopics.TREC2013_WEB, + 'trec2014-web': JTopics.TREC2014_WEB, + 'mb11': JTopics.MB11, + 'mb12': JTopics.MB12, + 'mb13': JTopics.MB13, + 'mb14': JTopics.MB14, + 'car17v1.5-benchmarkY1test': JTopics.CAR17V15_BENCHMARK_Y1_TEST, + 'car17v2.0-benchmarkY1test': JTopics.CAR17V20_BENCHMARK_Y1_TEST, + 'dl19-doc': JTopics.TREC2019_DL_DOC, + 'dl19-doc-unicoil': JTopics.TREC2019_DL_DOC_UNICOIL, + 'dl19-doc-unicoil-noexp': JTopics.TREC2019_DL_DOC_UNICOIL_NOEXP, + 'dl19-passage': JTopics.TREC2019_DL_PASSAGE, + 'dl19-passage-unicoil': JTopics.TREC2019_DL_PASSAGE_UNICOIL, + 'dl19-passage-unicoil-noexp': JTopics.TREC2019_DL_PASSAGE_UNICOIL_NOEXP, + 'dl20': JTopics.TREC2020_DL, + 'dl20-unicoil': JTopics.TREC2020_DL_UNICOIL, + 'dl20-unicoil-noexp': JTopics.TREC2020_DL_UNICOIL_NOEXP, + 'dl21': JTopics.TREC2021_DL, + 'dl21-unicoil': JTopics.TREC2021_DL_UNICOIL, + 'dl21-unicoil-noexp': JTopics.TREC2021_DL_UNICOIL_NOEXP, + 'msmarco-doc-dev': JTopics.MSMARCO_DOC_DEV, + 'msmarco-doc-dev-unicoil': JTopics.MSMARCO_DOC_DEV_UNICOIL, + 'msmarco-doc-dev-unicoil-noexp': JTopics.MSMARCO_DOC_DEV_UNICOIL_NOEXP, + 'msmarco-doc-test': JTopics.MSMARCO_DOC_TEST, + 'msmarco-passage-dev-subset': JTopics.MSMARCO_PASSAGE_DEV_SUBSET, + 'msmarco-passage-dev-subset-deepimpact': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_DEEPIMPACT, + 'msmarco-passage-dev-subset-unicoil': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL, + 'msmarco-passage-dev-subset-unicoil-noexp': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_NOEXP, + 'msmarco-passage-dev-subset-unicoil-tilde': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_TILDE, + 'msmarco-passage-dev-subset-distill-splade-max': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_DISTILL_SPLADE_MAX, + 'msmarco-passage-test-subset': JTopics.MSMARCO_PASSAGE_TEST_SUBSET, + 'msmarco-v2-doc-dev': JTopics.MSMARCO_V2_DOC_DEV, + 'msmarco-v2-doc-dev-unicoil': JTopics.MSMARCO_V2_DOC_DEV_UNICOIL, + 
'msmarco-v2-doc-dev-unicoil-noexp': JTopics.MSMARCO_V2_DOC_DEV_UNICOIL_NOEXP, + 'msmarco-v2-doc-dev2': JTopics.MSMARCO_V2_DOC_DEV2, + 'msmarco-v2-doc-dev2-unicoil': JTopics.MSMARCO_V2_DOC_DEV2_UNICOIL, + 'msmarco-v2-doc-dev2-unicoil-noexp': JTopics.MSMARCO_V2_DOC_DEV2_UNICOIL_NOEXP, + 'msmarco-v2-passage-dev': JTopics.MSMARCO_V2_PASSAGE_DEV, + 'msmarco-v2-passage-dev-unicoil': JTopics.MSMARCO_V2_PASSAGE_DEV_UNICOIL, + 'msmarco-v2-passage-dev-unicoil-noexp': JTopics.MSMARCO_V2_PASSAGE_DEV_UNICOIL_NOEXP, + 'msmarco-v2-passage-dev2': JTopics.MSMARCO_V2_PASSAGE_DEV2, + 'msmarco-v2-passage-dev2-unicoil': JTopics.MSMARCO_V2_PASSAGE_DEV2_UNICOIL, + 'msmarco-v2-passage-dev2-unicoil-noexp': JTopics.MSMARCO_V2_PASSAGE_DEV2_UNICOIL_NOEXP, + 'ntcir8-zh': JTopics.NTCIR8_ZH, + 'clef2006-fr': JTopics.CLEF2006_FR, + 'trec2002-ar': JTopics.TREC2002_AR, + 'fire2012-bn': JTopics.FIRE2012_BN, + 'fire2012-hi': JTopics.FIRE2012_HI, + 'fire2012-en': JTopics.FIRE2012_EN, + 'covid-round1': JTopics.COVID_ROUND1, + 'covid-round1-udel': JTopics.COVID_ROUND1_UDEL, + 'covid-round2': JTopics.COVID_ROUND2, + 'covid-round2-udel': JTopics.COVID_ROUND2_UDEL, + 'covid-round3': JTopics.COVID_ROUND3, + 'covid-round3-udel': JTopics.COVID_ROUND3_UDEL, + 'covid-round4': JTopics.COVID_ROUND4, + 'covid-round4-udel': JTopics.COVID_ROUND4_UDEL, + 'covid-round5': JTopics.COVID_ROUND5, + 'covid-round5-udel': JTopics.COVID_ROUND5_UDEL, + 'trec2018-bl': JTopics.TREC2018_BL, + 'trec2019-bl': JTopics.TREC2019_BL, + 'trec2020-bl': JTopics.TREC2020_BL, + 'epidemic-qa-expert-prelim': JTopics.EPIDEMIC_QA_EXPERT_PRELIM, + 'epidemic-qa-consumer-prelim': JTopics.EPIDEMIC_QA_CONSUMER_PRELIM, + 'dpr-nq-dev': JTopics.DPR_NQ_DEV, + 'dpr-nq-test': JTopics.DPR_NQ_TEST, + 'dpr-trivia-dev': JTopics.DPR_TRIVIA_DEV, + 'dpr-trivia-test': JTopics.DPR_TRIVIA_TEST, + 'dpr-wq-test': JTopics.DPR_WQ_TEST, + 'dpr-squad-test': JTopics.DPR_SQUAD_TEST, + 'dpr-curated-test': JTopics.DPR_CURATED_TEST, + 'dpr-trivia-test-gar-t5-answers': JTopics.DPR_TRIVIA_TEST_GART5_ANSWERS, + 'dpr-trivia-test-gar-t5-titles': JTopics.DPR_TRIVIA_TEST_GART5_TITLES, + 'dpr-trivia-test-gar-t5-sentences': JTopics.DPR_TRIVIA_TEST_GART5_SENTENCES, + 'dpr-trivia-test-gar-t5-all': JTopics.DPR_TRIVIA_TEST_GART5_ALL, + 'nq-test-gar-t5-answers': JTopics.NQ_TEST_GART5_ANSWERS, + 'nq-test-gar-t5-titles': JTopics.NQ_TEST_GART5_TITLES, + 'nq-test-gar-t5-sentences': JTopics.NQ_TEST_GART5_SENTENCES, + 'nq-test-gar-t5-all': JTopics.NQ_TEST_GART5_ALL, + 'nq-dev': JTopics.NQ_DEV, + 'nq-test': JTopics.NQ_TEST, + 'mrtydi-v1.1-arabic-train': JTopics.MRTYDI_V11_AR_TRAIN, + 'mrtydi-v1.1-arabic-dev': JTopics.MRTYDI_V11_AR_DEV, + 'mrtydi-v1.1-arabic-test': JTopics.MRTYDI_V11_AR_TEST, + 'mrtydi-v1.1-bengali-train': JTopics.MRTYDI_V11_BN_TRAIN, + 'mrtydi-v1.1-bengali-dev': JTopics.MRTYDI_V11_BN_DEV, + 'mrtydi-v1.1-bengali-test': JTopics.MRTYDI_V11_BN_TEST, + 'mrtydi-v1.1-english-train': JTopics.MRTYDI_V11_EN_TRAIN, + 'mrtydi-v1.1-english-dev': JTopics.MRTYDI_V11_EN_DEV, + 'mrtydi-v1.1-english-test': JTopics.MRTYDI_V11_EN_TEST, + 'mrtydi-v1.1-finnish-train': JTopics.MRTYDI_V11_FI_TRAIN, + 'mrtydi-v1.1-finnish-dev': JTopics.MRTYDI_V11_FI_DEV, + 'mrtydi-v1.1-finnish-test': JTopics.MRTYDI_V11_FI_TEST, + 'mrtydi-v1.1-indonesian-train': JTopics.MRTYDI_V11_ID_TRAIN, + 'mrtydi-v1.1-indonesian-dev': JTopics.MRTYDI_V11_ID_DEV, + 'mrtydi-v1.1-indonesian-test': JTopics.MRTYDI_V11_ID_TEST, + 'mrtydi-v1.1-japanese-train': JTopics.MRTYDI_V11_JA_TRAIN, + 'mrtydi-v1.1-japanese-dev': JTopics.MRTYDI_V11_JA_DEV, + 
'mrtydi-v1.1-japanese-test': JTopics.MRTYDI_V11_JA_TEST, + 'mrtydi-v1.1-korean-train': JTopics.MRTYDI_V11_KO_TRAIN, + 'mrtydi-v1.1-korean-dev': JTopics.MRTYDI_V11_KO_DEV, + 'mrtydi-v1.1-korean-test': JTopics.MRTYDI_V11_KO_TEST, + 'mrtydi-v1.1-russian-train': JTopics.MRTYDI_V11_RU_TRAIN, + 'mrtydi-v1.1-russian-dev': JTopics.MRTYDI_V11_RU_DEV, + 'mrtydi-v1.1-russian-test': JTopics.MRTYDI_V11_RU_TEST, + 'mrtydi-v1.1-swahili-train': JTopics.MRTYDI_V11_SW_TRAIN, + 'mrtydi-v1.1-swahili-dev': JTopics.MRTYDI_V11_SW_DEV, + 'mrtydi-v1.1-swahili-test': JTopics.MRTYDI_V11_SW_TEST, + 'mrtydi-v1.1-telugu-train': JTopics.MRTYDI_V11_TE_TRAIN, + 'mrtydi-v1.1-telugu-dev': JTopics.MRTYDI_V11_TE_DEV, + 'mrtydi-v1.1-telugu-test': JTopics.MRTYDI_V11_TE_TEST, + 'mrtydi-v1.1-thai-train': JTopics.MRTYDI_V11_TH_TRAIN, + 'mrtydi-v1.1-thai-dev': JTopics.MRTYDI_V11_TH_DEV, + 'mrtydi-v1.1-thai-test': JTopics.MRTYDI_V11_TH_TEST, + 'beir-v1.0.0-trec-covid-test': JTopics.BEIR_V1_0_0_TREC_COVID_TEST, + 'beir-v1.0.0-bioasq-test': JTopics.BEIR_V1_0_0_BIOASQ_TEST, + 'beir-v1.0.0-nfcorpus-test': JTopics.BEIR_V1_0_0_NFCORPUS_TEST, + 'beir-v1.0.0-nq-test': JTopics.BEIR_V1_0_0_NQ_TEST, + 'beir-v1.0.0-hotpotqa-test': JTopics.BEIR_V1_0_0_HOTPOTQA_TEST, + 'beir-v1.0.0-fiqa-test': JTopics.BEIR_V1_0_0_FIQA_TEST, + 'beir-v1.0.0-signal1m-test': JTopics.BEIR_V1_0_0_SIGNAL1M_TEST, + 'beir-v1.0.0-trec-news-test': JTopics.BEIR_V1_0_0_TREC_NEWS_TEST, + 'beir-v1.0.0-robust04-test': JTopics.BEIR_V1_0_0_ROBUST04_TEST, + 'beir-v1.0.0-arguana-test': JTopics.BEIR_V1_0_0_ARGUANA_TEST, + 'beir-v1.0.0-webis-touche2020-test': JTopics.BEIR_V1_0_0_WEBIS_TOUCHE2020_TEST, + 'beir-v1.0.0-cqadupstack-android-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_ANDROID_TEST, + 'beir-v1.0.0-cqadupstack-english-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_ENGLISH_TEST, + 'beir-v1.0.0-cqadupstack-gaming-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_GAMING_TEST, + 'beir-v1.0.0-cqadupstack-gis-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_GIS_TEST, + 'beir-v1.0.0-cqadupstack-mathematica-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_MATHEMATICA_TEST, + 'beir-v1.0.0-cqadupstack-physics-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_PHYSICS_TEST, + 'beir-v1.0.0-cqadupstack-programmers-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_PROGRAMMERS_TEST, + 'beir-v1.0.0-cqadupstack-stats-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_STATS_TEST, + 'beir-v1.0.0-cqadupstack-tex-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_TEX_TEST, + 'beir-v1.0.0-cqadupstack-unix-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_UNIX_TEST, + 'beir-v1.0.0-cqadupstack-webmasters-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_WEBMASTERS_TEST, + 'beir-v1.0.0-cqadupstack-wordpress-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_WORDPRESS_TEST, + 'beir-v1.0.0-quora-test': JTopics.BEIR_V1_0_0_QUORA_TEST, + 'beir-v1.0.0-dbpedia-entity-test': JTopics.BEIR_V1_0_0_DBPEDIA_ENTITY_TEST, + 'beir-v1.0.0-scidocs-test': JTopics.BEIR_V1_0_0_SCIDOCS_TEST, + 'beir-v1.0.0-fever-test': JTopics.BEIR_V1_0_0_FEVER_TEST, + 'beir-v1.0.0-climate-fever-test': JTopics.BEIR_V1_0_0_CLIMATE_FEVER_TEST, + 'beir-v1.0.0-scifact-test': JTopics.BEIR_V1_0_0_SCIFACT_TEST, + 'beir-v1.0.0-trec-covid-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_TREC_COVID_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-bioasq-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_BIOASQ_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-nfcorpus-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_NFCORPUS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-nq-test-splade_distil_cocodenser_medium': 
JTopics.BEIR_V1_0_0_NQ_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-hotpotqa-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_HOTPOTQA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-fiqa-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_FIQA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-signal1m-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_SIGNAL1M_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-trec-news-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_TREC_NEWS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-robust04-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_ROBUST04_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-arguana-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_ARGUANA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-webis-touche2020-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_WEBIS_TOUCHE2020_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-android-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_ANDROID_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-english-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_ENGLISH_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-gaming-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_GAMING_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-gis-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_GIS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-mathematica-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_MATHEMATICA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-physics-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_PHYSICS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-programmers-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_PROGRAMMERS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-stats-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_STATS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-tex-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_TEX_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-unix-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_UNIX_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-webmasters-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_WEBMASTERS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-cqadupstack-wordpress-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_WORDPRESS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-quora-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_QUORA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-dbpedia-entity-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_DBPEDIA_ENTITY_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-scidocs-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_SCIDOCS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-fever-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_FEVER_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-climate-fever-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CLIMATE_FEVER_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'beir-v1.0.0-scifact-test-splade_distil_cocodenser_medium': 
JTopics.BEIR_V1_0_0_SCIFACT_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM, + 'hc4-v1.0-fa-dev-title': JTopics.HC4_V1_0_FA_DEV_TITLE, + 'hc4-v1.0-fa-dev-desc': JTopics.HC4_V1_0_FA_DEV_DESC, + 'hc4-v1.0-fa-dev-desc-title': JTopics.HC4_V1_0_FA_DEV_DESC_TITLE, + 'hc4-v1.0-fa-test-title': JTopics.HC4_V1_0_FA_TEST_TITLE, + 'hc4-v1.0-fa-test-desc': JTopics.HC4_V1_0_FA_TEST_DESC, + 'hc4-v1.0-fa-test-desc-title': JTopics.HC4_V1_0_FA_TEST_DESC_TITLE, + 'hc4-v1.0-fa-en-test-title': JTopics.HC4_V1_0_FA_EN_TEST_TITLE, + 'hc4-v1.0-fa-en-test-desc': JTopics.HC4_V1_0_FA_EN_TEST_DESC, + 'hc4-v1.0-fa-en-test-desc-title': JTopics.HC4_V1_0_FA_EN_TEST_DESC_TITLE, + 'hc4-v1.0-ru-dev-title': JTopics.HC4_V1_0_RU_DEV_TITLE, + 'hc4-v1.0-ru-dev-desc': JTopics.HC4_V1_0_RU_DEV_DESC, + 'hc4-v1.0-ru-dev-desc-title': JTopics.HC4_V1_0_RU_DEV_DESC_TITLE, + 'hc4-v1.0-ru-test-title': JTopics.HC4_V1_0_RU_TEST_TITLE, + 'hc4-v1.0-ru-test-desc': JTopics.HC4_V1_0_RU_TEST_DESC, + 'hc4-v1.0-ru-test-desc-title': JTopics.HC4_V1_0_RU_TEST_DESC_TITLE, + 'hc4-v1.0-ru-en-test-title': JTopics.HC4_V1_0_RU_EN_TEST_TITLE, + 'hc4-v1.0-ru-en-test-desc': JTopics.HC4_V1_0_RU_EN_TEST_DESC, + 'hc4-v1.0-ru-en-test-desc-title': JTopics.HC4_V1_0_RU_EN_TEST_DESC_TITLE, + 'hc4-v1.0-zh-dev-title': JTopics.HC4_V1_0_ZH_DEV_TITLE, + 'hc4-v1.0-zh-dev-desc': JTopics.HC4_V1_0_ZH_DEV_DESC, + 'hc4-v1.0-zh-dev-desc-title': JTopics.HC4_V1_0_ZH_DEV_DESC_TITLE, + 'hc4-v1.0-zh-test-title': JTopics.HC4_V1_0_ZH_TEST_TITLE, + 'hc4-v1.0-zh-test-desc': JTopics.HC4_V1_0_ZH_TEST_DESC, + 'hc4-v1.0-zh-test-desc-title': JTopics.HC4_V1_0_ZH_TEST_DESC_TITLE, + 'hc4-v1.0-zh-en-test-title': JTopics.HC4_V1_0_ZH_EN_TEST_TITLE, + 'hc4-v1.0-zh-en-test-desc': JTopics.HC4_V1_0_ZH_EN_TEST_DESC, + 'hc4-v1.0-zh-en-test-desc-title': JTopics.HC4_V1_0_ZH_EN_TEST_DESC_TITLE, + # NeuCLIR 2022 topics + 'neuclir22-en-title': JTopics.NEUCLIR22_EN_TITLE, + 'neuclir22-en-desc': JTopics.NEUCLIR22_EN_DESC, + 'neuclir22-en-desc-title': JTopics.NEUCLIR22_EN_DESC_TITLE, + 'neuclir22-fa-ht-title': JTopics.NEUCLIR22_FA_HT_TITLE, + 'neuclir22-fa-ht-desc': JTopics.NEUCLIR22_FA_HT_DESC, + 'neuclir22-fa-ht-desc-title': JTopics.NEUCLIR22_FA_HT_DESC_TITLE, + 'neuclir22-fa-mt-title': JTopics.NEUCLIR22_FA_MT_TITLE, + 'neuclir22-fa-mt-desc': JTopics.NEUCLIR22_FA_MT_DESC, + 'neuclir22-fa-mt-desc-title': JTopics.NEUCLIR22_FA_MT_DESC_TITLE, + 'neuclir22-ru-ht-title': JTopics.NEUCLIR22_RU_HT_TITLE, + 'neuclir22-ru-ht-desc': JTopics.NEUCLIR22_RU_HT_DESC, + 'neuclir22-ru-ht-desc-title': JTopics.NEUCLIR22_RU_HT_DESC_TITLE, + 'neuclir22-ru-mt-title': JTopics.NEUCLIR22_RU_MT_TITLE, + 'neuclir22-ru-mt-desc': JTopics.NEUCLIR22_RU_MT_DESC, + 'neuclir22-ru-mt-desc-title': JTopics.NEUCLIR22_RU_MT_DESC_TITLE, + 'neuclir22-zh-ht-title': JTopics.NEUCLIR22_ZH_HT_TITLE, + 'neuclir22-zh-ht-desc': JTopics.NEUCLIR22_ZH_HT_DESC, + 'neuclir22-zh-ht-desc-title': JTopics.NEUCLIR22_ZH_HT_DESC_TITLE, + 'neuclir22-zh-mt-title': JTopics.NEUCLIR22_ZH_MT_TITLE, + 'neuclir22-zh-mt-desc': JTopics.NEUCLIR22_ZH_MT_DESC, + 'neuclir22-zh-mt-desc-title': JTopics.NEUCLIR22_ZH_MT_DESC_TITLE, + # MIRACL topics + 'miracl-v1.0-ar-dev': JTopics.MIRACL_V10_AR_DEV, + 'miracl-v1.0-bn-dev': JTopics.MIRACL_V10_BN_DEV, + 'miracl-v1.0-en-dev': JTopics.MIRACL_V10_EN_DEV, + 'miracl-v1.0-es-dev': JTopics.MIRACL_V10_ES_DEV, + 'miracl-v1.0-fa-dev': JTopics.MIRACL_V10_FA_DEV, + 'miracl-v1.0-fi-dev': JTopics.MIRACL_V10_FI_DEV, + 'miracl-v1.0-fr-dev': JTopics.MIRACL_V10_FR_DEV, + 'miracl-v1.0-hi-dev': JTopics.MIRACL_V10_HI_DEV, + 'miracl-v1.0-id-dev': JTopics.MIRACL_V10_ID_DEV, + 
'miracl-v1.0-ja-dev': JTopics.MIRACL_V10_JA_DEV, + 'miracl-v1.0-ko-dev': JTopics.MIRACL_V10_KO_DEV, + 'miracl-v1.0-ru-dev': JTopics.MIRACL_V10_RU_DEV, + 'miracl-v1.0-sw-dev': JTopics.MIRACL_V10_SW_DEV, + 'miracl-v1.0-te-dev': JTopics.MIRACL_V10_TE_DEV, + 'miracl-v1.0-th-dev': JTopics.MIRACL_V10_TH_DEV, + 'miracl-v1.0-zh-dev': JTopics.MIRACL_V10_ZH_DEV, + 'miracl-v1.0-de-dev': JTopics.MIRACL_V10_DE_DEV, + 'miracl-v1.0-yo-dev': JTopics.MIRACL_V10_YO_DEV, +} + +qrels_mapping = { + 'trec1-adhoc': JQrels.TREC1_ADHOC, + 'trec2-adhoc': JQrels.TREC2_ADHOC, + 'trec3-adhoc': JQrels.TREC3_ADHOC, + 'robust04': JQrels.ROBUST04, + 'robust05': JQrels.ROBUST05, + 'core17': JQrels.CORE17, + 'core18': JQrels.CORE18, + 'wt10g': JQrels.WT10G, + 'trec2004-terabyte': JQrels.TREC2004_TERABYTE, + 'trec2005-terabyte': JQrels.TREC2005_TERABYTE, + 'trec2006-terabyte': JQrels.TREC2006_TERABYTE, + 'trec2011-web': JQrels.TREC2011_WEB, + 'trec2012-web': JQrels.TREC2012_WEB, + 'trec2013-web': JQrels.TREC2013_WEB, + 'trec2014-web': JQrels.TREC2014_WEB, + 'mb11': JQrels.MB11, + 'mb12': JQrels.MB12, + 'mb13': JQrels.MB13, + 'mb14': JQrels.MB14, + 'car17v1.5-benchmarkY1test': JQrels.CAR17V15_BENCHMARK_Y1_TEST, + 'car17v2.0-benchmarkY1test': JQrels.CAR17V20_BENCHMARK_Y1_TEST, + 'dl19-doc': JQrels.TREC2019_DL_DOC, + 'dl19-passage': JQrels.TREC2019_DL_PASSAGE, + 'dl20-doc': JQrels.TREC2020_DL_DOC, + 'dl20-passage': JQrels.TREC2020_DL_PASSAGE, + 'dl21-doc': JQrels.TREC2021_DL_DOC, + 'dl21-passage': JQrels.TREC2021_DL_PASSAGE, + 'msmarco-doc-dev': JQrels.MSMARCO_DOC_DEV, + 'msmarco-passage-dev-subset': JQrels.MSMARCO_PASSAGE_DEV_SUBSET, + 'msmarco-v2-doc-dev': JQrels.MSMARCO_V2_DOC_DEV, + 'msmarco-v2-doc-dev2': JQrels.MSMARCO_V2_DOC_DEV2, + 'msmarco-v2-passage-dev': JQrels.MSMARCO_V2_PASSAGE_DEV, + 'msmarco-v2-passage-dev2': JQrels.MSMARCO_V2_PASSAGE_DEV2, + 'ntcir8-zh': JQrels.NTCIR8_ZH, + 'clef2006-fr': JQrels.CLEF2006_FR, + 'trec2002-ar': JQrels.TREC2002_AR, + 'fire2012-bn': JQrels.FIRE2012_BN, + 'fire2012-hi': JQrels.FIRE2012_HI, + 'fire2012-en': JQrels.FIRE2012_EN, + 'covid-complete': JQrels.COVID_COMPLETE, + 'covid-round1': JQrels.COVID_ROUND1, + 'covid-round2': JQrels.COVID_ROUND2, + 'covid-round3': JQrels.COVID_ROUND3, + 'covid-round3-cumulative': JQrels.COVID_ROUND3_CUMULATIVE, + 'covid-round4': JQrels.COVID_ROUND4, + 'covid-round4-cumulative': JQrels.COVID_ROUND4_CUMULATIVE, + 'covid-round5': JQrels.COVID_ROUND5, + 'trec2018-bl': JQrels.TREC2018_BL, + 'trec2019-bl': JQrels.TREC2019_BL, + 'trec2020-bl': JQrels.TREC2020_BL, + 'mrtydi-v1.1-arabic-train': JQrels.MRTYDI_V11_AR_TRAIN, + 'mrtydi-v1.1-arabic-dev': JQrels.MRTYDI_V11_AR_DEV, + 'mrtydi-v1.1-arabic-test': JQrels.MRTYDI_V11_AR_TEST, + 'mrtydi-v1.1-bengali-train': JQrels.MRTYDI_V11_BN_TRAIN, + 'mrtydi-v1.1-bengali-dev': JQrels.MRTYDI_V11_BN_DEV, + 'mrtydi-v1.1-bengali-test': JQrels.MRTYDI_V11_BN_TEST, + 'mrtydi-v1.1-english-train': JQrels.MRTYDI_V11_EN_TRAIN, + 'mrtydi-v1.1-english-dev': JQrels.MRTYDI_V11_EN_DEV, + 'mrtydi-v1.1-english-test': JQrels.MRTYDI_V11_EN_TEST, + 'mrtydi-v1.1-finnish-train': JQrels.MRTYDI_V11_FI_TRAIN, + 'mrtydi-v1.1-finnish-dev': JQrels.MRTYDI_V11_FI_DEV, + 'mrtydi-v1.1-finnish-test': JQrels.MRTYDI_V11_FI_TEST, + 'mrtydi-v1.1-indonesian-train': JQrels.MRTYDI_V11_ID_TRAIN, + 'mrtydi-v1.1-indonesian-dev': JQrels.MRTYDI_V11_ID_DEV, + 'mrtydi-v1.1-indonesian-test': JQrels.MRTYDI_V11_ID_TEST, + 'mrtydi-v1.1-japanese-train': JQrels.MRTYDI_V11_JA_TRAIN, + 'mrtydi-v1.1-japanese-dev': JQrels.MRTYDI_V11_JA_DEV, + 'mrtydi-v1.1-japanese-test': 
JQrels.MRTYDI_V11_JA_TEST, + 'mrtydi-v1.1-korean-train': JQrels.MRTYDI_V11_KO_TRAIN, + 'mrtydi-v1.1-korean-dev': JQrels.MRTYDI_V11_KO_DEV, + 'mrtydi-v1.1-korean-test': JQrels.MRTYDI_V11_KO_TEST, + 'mrtydi-v1.1-russian-train': JQrels.MRTYDI_V11_RU_TRAIN, + 'mrtydi-v1.1-russian-dev': JQrels.MRTYDI_V11_RU_DEV, + 'mrtydi-v1.1-russian-test': JQrels.MRTYDI_V11_RU_TEST, + 'mrtydi-v1.1-swahili-train': JQrels.MRTYDI_V11_SW_TRAIN, + 'mrtydi-v1.1-swahili-dev': JQrels.MRTYDI_V11_SW_DEV, + 'mrtydi-v1.1-swahili-test': JQrels.MRTYDI_V11_SW_TEST, + 'mrtydi-v1.1-telugu-train': JQrels.MRTYDI_V11_TE_TRAIN, + 'mrtydi-v1.1-telugu-dev': JQrels.MRTYDI_V11_TE_DEV, + 'mrtydi-v1.1-telugu-test': JQrels.MRTYDI_V11_TE_TEST, + 'mrtydi-v1.1-thai-train': JQrels.MRTYDI_V11_TH_TRAIN, + 'mrtydi-v1.1-thai-dev': JQrels.MRTYDI_V11_TH_DEV, + 'mrtydi-v1.1-thai-test': JQrels.MRTYDI_V11_TH_TEST, + 'beir-v1.0.0-trec-covid-test': JQrels.BEIR_V1_0_0_TREC_COVID_TEST, + 'beir-v1.0.0-bioasq-test': JQrels.BEIR_V1_0_0_BIOASQ_TEST, + 'beir-v1.0.0-nfcorpus-test': JQrels.BEIR_V1_0_0_NFCORPUS_TEST, + 'beir-v1.0.0-nq-test': JQrels.BEIR_V1_0_0_NQ_TEST, + 'beir-v1.0.0-hotpotqa-test': JQrels.BEIR_V1_0_0_HOTPOTQA_TEST, + 'beir-v1.0.0-fiqa-test': JQrels.BEIR_V1_0_0_FIQA_TEST, + 'beir-v1.0.0-signal1m-test': JQrels.BEIR_V1_0_0_SIGNAL1M_TEST, + 'beir-v1.0.0-trec-news-test': JQrels.BEIR_V1_0_0_TREC_NEWS_TEST, + 'beir-v1.0.0-robust04-test': JQrels.BEIR_V1_0_0_ROBUST04_TEST, + 'beir-v1.0.0-arguana-test': JQrels.BEIR_V1_0_0_ARGUANA_TEST, + 'beir-v1.0.0-webis-touche2020-test': JQrels.BEIR_V1_0_0_WEBIS_TOUCHE2020_TEST, + 'beir-v1.0.0-cqadupstack-android-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_ANDROID_TEST, + 'beir-v1.0.0-cqadupstack-english-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_ENGLISH_TEST, + 'beir-v1.0.0-cqadupstack-gaming-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_GAMING_TEST, + 'beir-v1.0.0-cqadupstack-gis-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_GIS_TEST, + 'beir-v1.0.0-cqadupstack-mathematica-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_MATHEMATICA_TEST, + 'beir-v1.0.0-cqadupstack-physics-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_PHYSICS_TEST, + 'beir-v1.0.0-cqadupstack-programmers-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_PROGRAMMERS_TEST, + 'beir-v1.0.0-cqadupstack-stats-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_STATS_TEST, + 'beir-v1.0.0-cqadupstack-tex-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_TEX_TEST, + 'beir-v1.0.0-cqadupstack-unix-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_UNIX_TEST, + 'beir-v1.0.0-cqadupstack-webmasters-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_WEBMASTERS_TEST, + 'beir-v1.0.0-cqadupstack-wordpress-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_WORDPRESS_TEST, + 'beir-v1.0.0-quora-test': JQrels.BEIR_V1_0_0_QUORA_TEST, + 'beir-v1.0.0-dbpedia-entity-test': JQrels.BEIR_V1_0_0_DBPEDIA_ENTITY_TEST, + 'beir-v1.0.0-scidocs-test': JQrels.BEIR_V1_0_0_SCIDOCS_TEST, + 'beir-v1.0.0-fever-test': JQrels.BEIR_V1_0_0_FEVER_TEST, + 'beir-v1.0.0-climate-fever-test': JQrels.BEIR_V1_0_0_CLIMATE_FEVER_TEST, + 'beir-v1.0.0-scifact-test': JQrels.BEIR_V1_0_0_SCIFACT_TEST, + 'hc4-v1.0-fa-dev': JQrels.HC4_V1_0_FA_DEV, + 'hc4-v1.0-fa-test': JQrels.HC4_V1_0_FA_TEST, + 'hc4-v1.0-ru-dev': JQrels.HC4_V1_0_RU_DEV, + 'hc4-v1.0-ru-test': JQrels.HC4_V1_0_RU_TEST, + 'hc4-v1.0-zh-dev': JQrels.HC4_V1_0_ZH_DEV, + 'hc4-v1.0-zh-test': JQrels.HC4_V1_0_ZH_TEST, + 'hc4-neuclir22-fa-test': JQrels.HC4_NEUCLIR22_FA_TEST, + 'hc4-neuclir22-ru-test': JQrels.HC4_NEUCLIR22_RU_TEST, + 'hc4-neuclir22-zh-test': JQrels.HC4_NEUCLIR22_ZH_TEST, + 'miracl-v1.0-ar-dev': JQrels.MIRACL_V10_AR_DEV, + 'miracl-v1.0-bn-dev': 
JQrels.MIRACL_V10_BN_DEV, + 'miracl-v1.0-en-dev': JQrels.MIRACL_V10_EN_DEV, + 'miracl-v1.0-es-dev': JQrels.MIRACL_V10_ES_DEV, + 'miracl-v1.0-fa-dev': JQrels.MIRACL_V10_FA_DEV, + 'miracl-v1.0-fi-dev': JQrels.MIRACL_V10_FI_DEV, + 'miracl-v1.0-fr-dev': JQrels.MIRACL_V10_FR_DEV, + 'miracl-v1.0-hi-dev': JQrels.MIRACL_V10_HI_DEV, + 'miracl-v1.0-id-dev': JQrels.MIRACL_V10_ID_DEV, + 'miracl-v1.0-ja-dev': JQrels.MIRACL_V10_JA_DEV, + 'miracl-v1.0-ko-dev': JQrels.MIRACL_V10_KO_DEV, + 'miracl-v1.0-ru-dev': JQrels.MIRACL_V10_RU_DEV, + 'miracl-v1.0-sw-dev': JQrels.MIRACL_V10_SW_DEV, + 'miracl-v1.0-te-dev': JQrels.MIRACL_V10_TE_DEV, + 'miracl-v1.0-th-dev': JQrels.MIRACL_V10_TH_DEV, + 'miracl-v1.0-zh-dev': JQrels.MIRACL_V10_ZH_DEV, + 'miracl-v1.0-de-dev': JQrels.MIRACL_V10_DE_DEV, + 'miracl-v1.0-yo-dev': JQrels.MIRACL_V10_YO_DEV, +} + + +def get_topics(collection_name): + """ + Parameters + ---------- + collection_name : str + collection_name + + Returns + ------- + result : dictionary + Topics as a dictionary + """ + if collection_name not in topics_mapping: + raise ValueError(f'Topic {collection_name} Not Found') + + topics = JTopicReader.getTopicsWithStringIds(topics_mapping[collection_name]) + + t = {} + for topic in topics.keySet().toArray(): + + if topic.isdigit(): + # parse the keys into integers + topic_key = int(topic) + else: + topic_key = topic + + t[topic_key] = {} + for key in topics.get(topic).keySet().toArray(): + t[topic_key][key] = topics.get(topic).get(key) + return t + + +def get_topics_with_reader(reader_class, file): + # Yes, this is an insanely ridiculous method name. + topics = JTopicReader.getTopicsWithStringIdsFromFileWithTopicReaderClass(reader_class, file) + if topics is None: + raise ValueError(f'Unable to initialize TopicReader {reader_class} with file {file}!') + + t = {} + for topic in topics.keySet().toArray(): + + if topic.isdigit(): + # parse the keys into integers + topic_key = int(topic) + else: + topic_key = topic + + t[topic_key] = {} + for key in topics.get(topic).keySet().toArray(): + t[topic_key][key] = topics.get(topic).get(key) + return t + + +def get_qrels_file(collection_name): + """ + Parameters + ---------- + collection_name : str + collection_name + + Returns + ------- + path : str + path of the qrels file + """ + if collection_name in qrels_mapping: + qrels = qrels_mapping[collection_name] + target_path = JRelevanceJudgments.getQrelsPath(JPath.of(qrels.path)).toString() + if os.path.exists(target_path): + return target_path + target_dir = os.path.split(target_path)[0] + if not os.path.exists(target_dir): + os.makedirs(target_dir) + with open(target_path, 'w') as file: + qrels_content = JRelevanceJudgments.getQrelsResource(JPath.of(target_path)) + file.write(qrels_content) + return target_path + + raise FileNotFoundError(f'no qrels file for {collection_name}') + + +def get_qrels(collection_name): + """ + Parameters + ---------- + collection_name : str + collection_name + + Returns + ------- + result : dictionary + qrels as a dictionary + """ + file_path = get_qrels_file(collection_name) + qrels = {} + with open(file_path, 'r') as f: + for line in f: + qid, _, docid, judgement = line.rstrip().split() + + if qid.isdigit(): + qrels_key = int(qid) + else: + qrels_key = qid + + if docid.isdigit(): + doc_key = int(docid) + else: + doc_key = docid + + if qrels_key in qrels: + qrels[qrels_key][doc_key] = judgement + else: + qrels[qrels_key] = {doc_key: judgement} + return qrels diff --git a/pyserini/search/_deprecated.py b/pyserini/search/_deprecated.py new file 
mode 100644 index 0000000000000000000000000000000000000000..0d877b086659df95c4e6d493765ec5d9ed2cd0c7 --- /dev/null +++ b/pyserini/search/_deprecated.py @@ -0,0 +1,38 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pyserini.search.lucene import LuceneImpactSearcher, LuceneSearcher, LuceneFusionSearcher + + +class SimpleSearcher(LuceneSearcher): + def __new__(cls, *args, **kwargs): + print('SimpleSearcher class has been deprecated, ' + 'please use LuceneSearcher from pyserini.search.lucene instead') + return super().__new__(cls) + + +class ImpactSearcher(LuceneImpactSearcher): + def __new__(cls, *args, **kwargs): + print('ImpactSearcher class has been deprecated, ' + 'please use LuceneImpactSearcher from pyserini.search.lucene instead') + return super().__new__(cls) + + +class SimpleFusionSearcher(LuceneFusionSearcher): + def __new__(cls, *args, **kwargs): + print('SimpleFusionSearcher class has been deprecated, ' + 'please use LuceneFusionSearcher from pyserini.search.lucene instead') + return super().__new__(cls) diff --git a/pyserini/search/faiss/__init__.py b/pyserini/search/faiss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8cb896ef1b21731a66851902e9ee80641a807517 --- /dev/null +++ b/pyserini/search/faiss/__init__.py @@ -0,0 +1,25 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
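For reference, a minimal usage sketch of the topic and qrels helpers defined above. It assumes `get_topics` and `get_qrels` are re-exported at the `pyserini.search` package level (as elsewhere in the library) and uses `beir-v1.0.0-scifact-test`, a key that appears in both mappings; treat it as an illustration, not part of this patch.

    from pyserini.search import get_topics, get_qrels  # assumed package-level re-exports

    topics = get_topics('beir-v1.0.0-scifact-test')    # {qid: {field: value, ...}, ...}
    qrels = get_qrels('beir-v1.0.0-scifact-test')      # {qid: {docid: judgement, ...}, ...}

    for qid in list(topics)[:3]:
        # Field names depend on the topic reader; 'title' is the usual query field.
        print(qid, topics[qid].get('title'), len(qrels.get(qid, {})))
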
+# + +from ._searcher import DenseSearchResult, PRFDenseSearchResult, FaissSearcher, BinaryDenseSearcher, QueryEncoder, \ + DprQueryEncoder, BprQueryEncoder, DkrrDprQueryEncoder, TctColBertQueryEncoder, AnceQueryEncoder, AggretrieverQueryEncoder, AutoQueryEncoder + +from ._model import AnceEncoder +from._prf import DenseVectorAveragePrf, DenseVectorRocchioPrf, DenseVectorAncePrf + +__all__ = ['DenseSearchResult', 'PRFDenseSearchResult', 'FaissSearcher', 'BinaryDenseSearcher', 'QueryEncoder', + 'DprQueryEncoder', 'BprQueryEncoder', 'DkrrDprQueryEncoder', 'TctColBertQueryEncoder', 'AnceEncoder', + 'AnceQueryEncoder', 'AggretrieverQueryEncoder', 'AutoQueryEncoder', 'DenseVectorAveragePrf', 'DenseVectorRocchioPrf', 'DenseVectorAncePrf'] diff --git a/pyserini/search/faiss/__main__.py b/pyserini/search/faiss/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..88d5090bbe3de58b28f184617c9791367a60409d --- /dev/null +++ b/pyserini/search/faiss/__main__.py @@ -0,0 +1,296 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import os +from typing import OrderedDict + +from tqdm import tqdm + +from pyserini.search import FaissSearcher, BinaryDenseSearcher, TctColBertQueryEncoder, QueryEncoder, \ + DprQueryEncoder, BprQueryEncoder, DkrrDprQueryEncoder, AnceQueryEncoder, AggretrieverQueryEncoder, AutoQueryEncoder, DenseVectorAveragePrf, \ + DenseVectorRocchioPrf, DenseVectorAncePrf + +from pyserini.encode import PcaEncoder +from pyserini.query_iterator import get_query_iterator, TopicsFormat +from pyserini.output_writer import get_output_writer, OutputFormat +from pyserini.search.lucene import LuceneSearcher + +# from ._prf import DenseVectorAveragePrf, DenseVectorRocchioPrf + +# Fixes this error: "OMP: Error #15: Initializing libomp.a, but found libomp.dylib already initialized." +# https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial +os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' + + +def define_dsearch_args(parser): + parser.add_argument('--index', type=str, metavar='path to index or index name', required=True, + help="Path to Faiss index or name of prebuilt index.") + parser.add_argument('--encoder-class', type=str, metavar='which query encoder class to use. `default` would infer from the args.encoder', + required=False, + choices=["dkrr", "dpr", "bpr", "tct_colbert", "ance", "sentence", "contriever", "auto", "aggretriever"], + default=None, + help='which query encoder class to use. 
`default` would infer from the args.encoder') + parser.add_argument('--encoder', type=str, metavar='path to query encoder checkpoint or encoder name', + required=False, + help="Path to query encoder pytorch checkpoint or hgf encoder model name") + parser.add_argument('--tokenizer', type=str, metavar='name or path', + required=False, + help="Path to a hgf tokenizer name or path") + parser.add_argument('--encoded-queries', type=str, metavar='path to query encoded queries dir or queries name', + required=False, + help="Path to query encoder pytorch checkpoint or hgf encoder model name") + parser.add_argument('--pca-model', type=str, metavar='path', required=False, + default=None, help="Path to a faiss pca model") + parser.add_argument('--device', type=str, metavar='device to run query encoder', required=False, default='cpu', + help="Device to run query encoder, cpu or [cuda:0, cuda:1, ...]") + parser.add_argument('--query-prefix', type=str, metavar='str', required=False, default=None, + help="Query prefix if exists.") + parser.add_argument('--searcher', type=str, metavar='str', required=False, default='simple', + help="dense searcher type") + parser.add_argument('--prf-depth', type=int, metavar='num of passages used for PRF', required=False, default=0, + help="Specify how many passages are used for PRF, 0: Simple retrieval with no PRF, > 0: perform PRF") + parser.add_argument('--prf-method', type=str, metavar='avg or rocchio', required=False, default='avg', + help="Choose PRF methods, avg or rocchio") + parser.add_argument('--rocchio-alpha', type=float, metavar='alpha parameter for rocchio', required=False, + default=0.9, + help="The alpha parameter to control the contribution from the query vector") + parser.add_argument('--rocchio-beta', type=float, metavar='beta parameter for rocchio', required=False, default=0.1, + help="The beta parameter to control the contribution from the average vector of the positive PRF passages") + parser.add_argument('--rocchio-gamma', type=float, metavar='gamma parameter for rocchio', required=False, default=0.1, + help="The gamma parameter to control the contribution from the average vector of the negative PRF passages") + parser.add_argument('--rocchio-topk', type=int, metavar='topk passages as positive for rocchio', required=False, default=3, + help="Set topk passages as positive PRF passages for rocchio") + parser.add_argument('--rocchio-bottomk', type=int, metavar='bottomk passages as negative for rocchio', required=False, default=0, + help="Set bottomk passages as negative PRF passages for rocchio, 0: do not use negatives prf passages.") + parser.add_argument('--sparse-index', type=str, metavar='sparse lucene index containing contents', required=False, + help='The path to sparse index containing the passage contents') + parser.add_argument('--ance-prf-encoder', type=str, metavar='query encoder path for ANCE-PRF', required=False, + help='The path or name to ANCE-PRF model checkpoint') + parser.add_argument('--ef-search', type=int, metavar='efSearch for HNSW index', required=False, default=None, + help="Set efSearch for HNSW index") + + +def init_query_encoder(encoder, encoder_class, tokenizer_name, topics_name, encoded_queries, device, prefix): + encoded_queries_map = { + 'msmarco-passage-dev-subset': 'tct_colbert-msmarco-passage-dev-subset', + 'dpr-nq-dev': 'dpr_multi-nq-dev', + 'dpr-nq-test': 'dpr_multi-nq-test', + 'dpr-trivia-dev': 'dpr_multi-trivia-dev', + 'dpr-trivia-test': 'dpr_multi-trivia-test', + 'dpr-wq-test': 'dpr_multi-wq-test', + 
'dpr-squad-test': 'dpr_multi-squad-test', + 'dpr-curated-test': 'dpr_multi-curated-test' + } + encoder_class_map = { + "dkrr": DkrrDprQueryEncoder, + "dpr": DprQueryEncoder, + "bpr": BprQueryEncoder, + "tct_colbert": TctColBertQueryEncoder, + "ance": AnceQueryEncoder, + "sentence": AutoQueryEncoder, + "contriever": AutoQueryEncoder, + "aggretriever": AggretrieverQueryEncoder, + "auto": AutoQueryEncoder, + } + + if encoder: + _encoder_class = encoder_class + + # determine encoder_class + if encoder_class is not None: + encoder_class = encoder_class_map[encoder_class] + else: + # if any class keyword was matched in the given encoder name, + # use that encoder class + for class_keyword in encoder_class_map: + if class_keyword in encoder.lower(): + encoder_class = encoder_class_map[class_keyword] + break + + # if none of the class keyword was matched, + # use the AutoQueryEncoder + if encoder_class is None: + encoder_class = AutoQueryEncoder + + # prepare arguments to encoder class + kwargs = dict(encoder_dir=encoder, tokenizer_name=tokenizer_name, device=device, prefix=prefix) + if (_encoder_class == "sentence") or ("sentence" in encoder): + kwargs.update(dict(pooling='mean', l2_norm=True)) + if (_encoder_class == "contriever") or ("contriever" in encoder): + kwargs.update(dict(pooling='mean', l2_norm=False)) + + return encoder_class(**kwargs) + + if encoded_queries: + if os.path.exists(encoded_queries): + if 'bpr' in encoded_queries: + return BprQueryEncoder(encoded_query_dir=encoded_queries) + else: + return QueryEncoder(encoded_queries) + else: + if 'bpr' in encoded_queries: + return BprQueryEncoder.load_encoded_queries(encoded_queries) + else: + return QueryEncoder.load_encoded_queries(encoded_queries) + + if topics_name in encoded_queries_map: + return QueryEncoder.load_encoded_queries(encoded_queries_map[topics_name]) + raise ValueError(f'No encoded queries for topic {topics_name}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Search a Faiss index.') + parser.add_argument('--topics', type=str, metavar='topic_name', required=True, + help="Name of topics. Available: msmarco-passage-dev-subset.") + parser.add_argument('--hits', type=int, metavar='num', required=False, default=1000, help="Number of hits.") + parser.add_argument('--binary-hits', type=int, metavar='num', required=False, default=1000, + help="Number of binary hits.") + parser.add_argument("--rerank", action="store_true", help='whethere rerank bpr sparse results.') + parser.add_argument('--topics-format', type=str, metavar='format', default=TopicsFormat.DEFAULT.value, + help=f"Format of topics. Available: {[x.value for x in list(TopicsFormat)]}") + parser.add_argument('--output-format', type=str, metavar='format', default=OutputFormat.TREC.value, + help=f"Format of output. 
Available: {[x.value for x in list(OutputFormat)]}") + parser.add_argument('--output', type=str, metavar='path', required=True, help="Path to output file.") + parser.add_argument('--max-passage', action='store_true', + default=False, help="Select only max passage from document.") + parser.add_argument('--max-passage-hits', type=int, metavar='num', required=False, default=100, + help="Final number of hits when selecting only max passage.") + parser.add_argument('--max-passage-delimiter', type=str, metavar='str', required=False, default='#', + help="Delimiter between docid and passage id.") + parser.add_argument('--batch-size', type=int, metavar='num', required=False, default=1, + help="search batch of queries in parallel") + parser.add_argument('--threads', type=int, metavar='num', required=False, default=1, + help="maximum threads to use during search") + # For some test collections, a query is doc from the corpus (e.g., arguana in BEIR). + # We want to remove the query from the results. This is equivalent to -removeQuery in Java. + parser.add_argument('--remove-query', action='store_true', default=False, help="Remove query from results list.") + define_dsearch_args(parser) + args = parser.parse_args() + + query_iterator = get_query_iterator(args.topics, TopicsFormat(args.topics_format)) + topics = query_iterator.topics + + query_encoder = init_query_encoder( + args.encoder, args.encoder_class, args.tokenizer, args.topics, args.encoded_queries, args.device, args.query_prefix) + if args.pca_model: + query_encoder = PcaEncoder(query_encoder, args.pca_model) + kwargs = {} + if os.path.exists(args.index): + # create searcher from index directory + if args.searcher.lower() == 'bpr': + kwargs = dict(binary_k=args.binary_hits, rerank=args.rerank) + searcher = BinaryDenseSearcher(args.index, query_encoder) + else: + searcher = FaissSearcher(args.index, query_encoder) + else: + # create searcher from prebuilt index name + if args.searcher.lower() == 'bpr': + kwargs = dict(binary_k=args.binary_hits, rerank=args.rerank) + searcher = BinaryDenseSearcher.from_prebuilt_index(args.index, query_encoder) + else: + searcher = FaissSearcher.from_prebuilt_index(args.index, query_encoder) + + if args.ef_search: + searcher.set_hnsw_ef_search(args.ef_search) + + if not searcher: + exit() + + # Check PRF Flag + if args.prf_depth > 0 and type(searcher) == FaissSearcher: + PRF_FLAG = True + if args.prf_method.lower() == 'avg': + prfRule = DenseVectorAveragePrf() + elif args.prf_method.lower() == 'rocchio': + prfRule = DenseVectorRocchioPrf(args.rocchio_alpha, args.rocchio_beta, args.rocchio_gamma, + args.rocchio_topk, args.rocchio_bottomk) + # ANCE-PRF is using a new query encoder, so the input to DenseVectorAncePrf is different + elif args.prf_method.lower() == 'ance-prf' and type(query_encoder) == AnceQueryEncoder: + if os.path.exists(args.sparse_index): + sparse_searcher = LuceneSearcher(args.sparse_index) + else: + sparse_searcher = LuceneSearcher.from_prebuilt_index(args.sparse_index) + prf_query_encoder = AnceQueryEncoder(encoder_dir=args.ance_prf_encoder, tokenizer_name=args.tokenizer, + device=args.device) + prfRule = DenseVectorAncePrf(prf_query_encoder, sparse_searcher) + print(f'Running FaissSearcher with {args.prf_method.upper()} PRF...') + else: + PRF_FLAG = False + + # build output path + output_path = args.output + + print(f'Running {args.topics} topics, saving to {output_path}...') + tag = 'Faiss' + + output_writer = get_output_writer(output_path, OutputFormat(args.output_format), 'w', + 
max_hits=args.hits, tag=tag, topics=topics, + use_max_passage=args.max_passage, + max_passage_delimiter=args.max_passage_delimiter, + max_passage_hits=args.max_passage_hits) + + with output_writer: + batch_topics = list() + batch_topic_ids = list() + for index, (topic_id, text) in enumerate(tqdm(query_iterator, total=len(topics.keys()))): + if args.batch_size <= 1 and args.threads <= 1: + if PRF_FLAG: + emb_q, prf_candidates = searcher.search(text, k=args.prf_depth, return_vector=True, **kwargs) + # ANCE-PRF input is different, do not need query embeddings + if args.prf_method.lower() == 'ance-prf': + prf_emb_q = prfRule.get_prf_q_emb(text, prf_candidates) + else: + prf_emb_q = prfRule.get_prf_q_emb(emb_q[0], prf_candidates) + prf_emb_q = np.expand_dims(prf_emb_q, axis=0).astype('float32') + hits = searcher.search(prf_emb_q, k=args.hits, **kwargs) + else: + hits = searcher.search(text, args.hits, **kwargs) + results = [(topic_id, hits)] + else: + batch_topic_ids.append(str(topic_id)) + batch_topics.append(text) + if (index + 1) % args.batch_size == 0 or \ + index == len(topics.keys()) - 1: + if PRF_FLAG: + q_embs, prf_candidates = searcher.batch_search(batch_topics, batch_topic_ids, + k=args.prf_depth, return_vector=True, **kwargs) + # ANCE-PRF input is different, do not need query embeddings + if args.prf_method.lower() == 'ance-prf': + prf_embs_q = prfRule.get_batch_prf_q_emb(batch_topics, batch_topic_ids, prf_candidates) + else: + prf_embs_q = prfRule.get_batch_prf_q_emb(batch_topic_ids, q_embs, prf_candidates) + results = searcher.batch_search(prf_embs_q, batch_topic_ids, k=args.hits, threads=args.threads, + **kwargs) + results = [(id_, results[id_]) for id_ in batch_topic_ids] + else: + results = searcher.batch_search(batch_topics, batch_topic_ids, args.hits, threads=args.threads, + **kwargs) + results = [(id_, results[id_]) for id_ in batch_topic_ids] + batch_topic_ids.clear() + batch_topics.clear() + else: + continue + + for topic, hits in results: + # For some test collections, a query is doc from the corpus (e.g., arguana in BEIR). + # We want to remove the query from the results. 
+ if args.remove_query: + hits = [hit for hit in hits if hit.docid != topic] + + output_writer.write(topic, hits) + + results.clear() diff --git a/pyserini/search/faiss/__pycache__/__init__.cpython-310.pyc b/pyserini/search/faiss/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0df54fa19276f8521076ae012d07e91ddb54455 Binary files /dev/null and b/pyserini/search/faiss/__pycache__/__init__.cpython-310.pyc differ diff --git a/pyserini/search/faiss/__pycache__/_model.cpython-310.pyc b/pyserini/search/faiss/__pycache__/_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9e82f879056b9defcaef21096ca87656a8c6afc Binary files /dev/null and b/pyserini/search/faiss/__pycache__/_model.cpython-310.pyc differ diff --git a/pyserini/search/faiss/__pycache__/_prf.cpython-310.pyc b/pyserini/search/faiss/__pycache__/_prf.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1591e715df74068927e0eb972926956ecfc8be97 Binary files /dev/null and b/pyserini/search/faiss/__pycache__/_prf.cpython-310.pyc differ diff --git a/pyserini/search/faiss/__pycache__/_searcher.cpython-310.pyc b/pyserini/search/faiss/__pycache__/_searcher.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25494531bdaa7b3b165bdc7651277050b5190be5 Binary files /dev/null and b/pyserini/search/faiss/__pycache__/_searcher.cpython-310.pyc differ diff --git a/pyserini/search/faiss/_model.py b/pyserini/search/faiss/_model.py new file mode 100644 index 0000000000000000000000000000000000000000..3c6e3e5fc15aab9d11d5cdfa2a1678dd040443d7 --- /dev/null +++ b/pyserini/search/faiss/_model.py @@ -0,0 +1,77 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
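As a rough sketch of what the CLI driver above wires together, the same flow can be run programmatically with the classes it imports from `pyserini.search`. The encoder checkpoint and prebuilt index names below are illustrative placeholders, not values taken from this diff.

    from pyserini.search import FaissSearcher, TctColBertQueryEncoder  # same imports the CLI uses

    # Hypothetical checkpoint / prebuilt-index names; substitute whatever your setup provides.
    encoder = TctColBertQueryEncoder('castorini/tct_colbert-v2-hnp-msmarco')
    searcher = FaissSearcher.from_prebuilt_index('msmarco-passage-tct_colbert-v2-hnp-bf', encoder)

    hits = searcher.search('what is a lobster roll?', k=10)
    for i, hit in enumerate(hits, start=1):
        print(f'{i:2} {hit.docid:15} {hit.score:.5f}')
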
+# + +from typing import Optional + +from transformers import PreTrainedModel, RobertaConfig, RobertaModel +from transformers.file_utils import is_torch_available, requires_backends + +if is_torch_available(): + import torch + + +class AnceEncoder(PreTrainedModel): + config_class = RobertaConfig + base_model_prefix = 'ance_encoder' + load_tf_weights = None + _keys_to_ignore_on_load_missing = [r'position_ids'] + _keys_to_ignore_on_load_unexpected = [r'pooler', r'classifier'] + + def __init__(self, config: RobertaConfig): + requires_backends(self, 'torch') + super().__init__(config) + self.config = config + self.roberta = RobertaModel(config) + self.embeddingHead = torch.nn.Linear(config.hidden_size, 768) + self.norm = torch.nn.LayerNorm(768) + self.init_weights() + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, torch.nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, torch.nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def init_weights(self): + self.roberta.init_weights() + self.embeddingHead.apply(self._init_weights) + self.norm.apply(self._init_weights) + + def forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ): + input_shape = input_ids.size() + device = input_ids.device + if attention_mask is None: + attention_mask = ( + torch.ones(input_shape, device=device) + if input_ids is None + else (input_ids != self.roberta.config.pad_token_id) + ) + outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask) + sequence_output = outputs.last_hidden_state + pooled_output = sequence_output[:, 0, :] + pooled_output = self.norm(self.embeddingHead(pooled_output)) + return pooled_output diff --git a/pyserini/search/faiss/_prf.py b/pyserini/search/faiss/_prf.py new file mode 100644 index 0000000000000000000000000000000000000000..68167318d6ae252568b9423ab3cc84b9b72346e6 --- /dev/null +++ b/pyserini/search/faiss/_prf.py @@ -0,0 +1,209 @@ +import numpy as np +from typing import List, Dict +from pyserini.search.faiss import PRFDenseSearchResult, AnceQueryEncoder +from pyserini.search.lucene import LuceneSearcher +import json + + +class DenseVectorPrf: + def __init__(self): + pass + + def get_prf_q_emb(self, **kwargs): + pass + + def get_batch_prf_q_emb(self, **kwargs): + pass + + +class DenseVectorAveragePrf(DenseVectorPrf): + + def get_prf_q_emb(self, emb_qs: np.ndarray = None, prf_candidates: List[PRFDenseSearchResult] = None): + """Perform Average PRF with Dense Vectors + + Parameters + ---------- + emb_qs : np.ndarray + Query embedding + prf_candidates : List[PRFDenseSearchResult] + List of PRFDenseSearchResult, contains document embeddings. 
+ + Returns + ------- + np.ndarray + return new query embeddings + """ + all_candidate_embs = [item.vectors for item in prf_candidates] + new_emb_qs = np.mean(np.vstack((emb_qs, all_candidate_embs)), axis=0) + return new_emb_qs + + def get_batch_prf_q_emb(self, topic_ids: List[str] = None, emb_qs: np.ndarray = None, + prf_candidates: Dict[str, List[PRFDenseSearchResult]] = None): + """Perform Average PRF with Dense Vectors + + Parameters + ---------- + topic_ids : List[str] + List of topic ids. + emb_qs : np.ndarray + Query embeddings + prf_candidates : List[PRFDenseSearchResult] + List of PRFDenseSearchResult, contains document embeddings. + + Returns + ------- + np.ndarray + return new query embeddings + """ + + qids = list() + new_emb_qs = list() + for index, topic_id in enumerate(topic_ids): + qids.append(topic_id) + new_emb_qs.append(self.get_prf_q_emb(emb_qs[index], prf_candidates[topic_id])) + new_emb_qs = np.array(new_emb_qs).astype('float32') + return new_emb_qs + + +class DenseVectorRocchioPrf(DenseVectorPrf): + def __init__(self, alpha: float, beta: float, gamma: float, topk: int, bottomk: int): + """ + Parameters + ---------- + alpha : float + Rocchio parameter, controls the weight assigned to the original query embedding. + beta : float + Rocchio parameter, controls the weight assigned to the positive document embeddings. + gamma : float + Rocchio parameter, controls the weight assigned to the negative document embeddings. + topk : int + Rocchio parameter, set topk documents as positive document feedbacks. + bottomk : int + Rocchio parameter, set bottomk documents as negative document feedbacks. + """ + DenseVectorPrf.__init__(self) + self.alpha = alpha + self.beta = beta + self.gamma = gamma + self.topk = topk + self.bottomk = bottomk + + def get_prf_q_emb(self, emb_qs: np.ndarray = None, prf_candidates: List[PRFDenseSearchResult] = None): + """Perform Rocchio PRF with Dense Vectors + + Parameters + ---------- + emb_qs : np.ndarray + query embedding + prf_candidates : List[PRFDenseSearchResult] + List of PRFDenseSearchResult, contains document embeddings. + + Returns + ------- + np.ndarray + return new query embeddings + """ + + all_candidate_embs = [item.vectors for item in prf_candidates] + weighted_query_embs = self.alpha * emb_qs + weighted_mean_pos_doc_embs = self.beta * np.mean(all_candidate_embs[:self.topk], axis=0) + new_emb_q = weighted_query_embs + weighted_mean_pos_doc_embs + if self.bottomk > 0: + weighted_mean_neg_doc_embs = self.gamma * np.mean(all_candidate_embs[-self.bottomk:], axis=0) + new_emb_q -= weighted_mean_neg_doc_embs + return new_emb_q + + def get_batch_prf_q_emb(self, topic_ids: List[str] = None, emb_qs: np.ndarray = None, + prf_candidates: Dict[str, List[PRFDenseSearchResult]] = None): + """Perform Rocchio PRF with Dense Vectors + + Parameters + ---------- + topic_ids : List[str] + List of topic ids. + emb_qs : np.ndarray + Query embeddings + prf_candidates : List[PRFDenseSearchResult] + List of PRFDenseSearchResult, contains document embeddings. 
+ + Returns + ------- + np.ndarray + return new query embeddings + """ + qids = list() + new_emb_qs = list() + for index, topic_id in enumerate(topic_ids): + qids.append(topic_id) + new_emb_qs.append(self.get_prf_q_emb(emb_qs[index], prf_candidates[topic_id])) + new_emb_qs = np.array(new_emb_qs).astype('float32') + return new_emb_qs + + +class DenseVectorAncePrf(DenseVectorPrf): + def __init__(self, encoder: AnceQueryEncoder, sparse_searcher: LuceneSearcher): + """ + Parameters + ---------- + encoder : AnceQueryEncoder + The new ANCE query encoder for ANCE-PRF. + sparse_searcher : LuceneSearcher + The sparse searcher using lucene index, for retrieving doc contents. + """ + DenseVectorPrf.__init__(self) + self.encoder = encoder + self.sparse_searcher = sparse_searcher + + def get_prf_q_emb(self, query: str = None, prf_candidates: List[PRFDenseSearchResult] = None): + """Perform single ANCE-PRF with Dense Vectors + + Parameters + ---------- + query : str + query text + prf_candidates : List[PRFDenseSearchResult] + List of PRFDenseSearchResult, contains document embeddings. + + Returns + ------- + np.ndarray + return new query embeddings + """ + passage_texts = [query] + for item in prf_candidates: + raw_text = json.loads(self.sparse_searcher.doc(item.docid).raw()) + passage_texts.append(raw_text['contents']) + full_text = f'{self.encoder.tokenizer.cls_token}{self.encoder.tokenizer.sep_token.join(passage_texts)}{self.encoder.tokenizer.sep_token}' + emb_q = self.encoder.prf_encode(full_text) + emb_q = emb_q.reshape((1, len(emb_q))) + return emb_q + + def get_batch_prf_q_emb(self, topics: List[str], topic_ids: List[str], + prf_candidates: Dict[str, List[PRFDenseSearchResult]]) -> np.ndarray: + """Perform batch ANCE-PRF with Dense Vectors + + Parameters + ---------- + topics : List[str] + List of query texts. + topic_ids: List[str] + List of topic ids. + prf_candidates : List[PRFDenseSearchResult] + List of PRFDenseSearchResult, contains document embeddings. + + Returns + ------- + np.ndarray + return new query embeddings + """ + prf_passage_texts = list() + for index, query in enumerate(topics): + passage_texts = [query] + prf_candidate = prf_candidates[topic_ids[index]] + for item in prf_candidate: + raw_text = json.loads(self.sparse_searcher.doc(item.docid).raw()) + passage_texts.append(raw_text['contents']) + full_text = f'{self.encoder.tokenizer.cls_token}{self.encoder.tokenizer.sep_token.join(passage_texts)}{self.encoder.tokenizer.sep_token}' + prf_passage_texts.append(full_text) + emb_q = self.encoder.prf_batch_encode(prf_passage_texts) + return emb_q diff --git a/pyserini/search/faiss/_searcher.py b/pyserini/search/faiss/_searcher.py new file mode 100644 index 0000000000000000000000000000000000000000..828fc5ad25d462024586d93f0d3130e0b2499b79 --- /dev/null +++ b/pyserini/search/faiss/_searcher.py @@ -0,0 +1,696 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
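To make the Rocchio arithmetic in `_prf.py` concrete, here is a small self-contained sketch with toy vectors. It assumes `DenseVectorRocchioPrf` and `PRFDenseSearchResult` are importable from `pyserini.search.faiss`, as the package `__init__` above exports them; the dimensions and scores are made up for illustration.

    import numpy as np
    from pyserini.search.faiss import DenseVectorRocchioPrf, PRFDenseSearchResult

    rng = np.random.default_rng(0)
    emb_q = rng.random(8, dtype=np.float32)  # toy 8-dimensional query embedding

    # Four fake feedback documents with vectors attached, mimicking what
    # FaissSearcher.search(..., return_vector=True) hands back.
    cands = [PRFDenseSearchResult(docid=f'doc{i}', score=1.0 - 0.1 * i,
                                  vectors=rng.random(8, dtype=np.float32))
             for i in range(4)]

    # q' = alpha * q + beta * mean(top-k doc vectors) - gamma * mean(bottom-k doc vectors)
    prf = DenseVectorRocchioPrf(alpha=0.9, beta=0.1, gamma=0.1, topk=3, bottomk=1)
    new_emb_q = prf.get_prf_q_emb(emb_q, cands)
    print(new_emb_q.shape)  # (8,)
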
+# + +""" +This module provides Pyserini's dense search interface to FAISS index. +The main entry point is the ``FaissSearcher`` class. +""" + +import os +from dataclasses import dataclass +from typing import Dict, List, Union, Optional, Tuple + +import numpy as np +import pandas as pd + +from transformers import (AutoModel, AutoTokenizer, BertModel, BertTokenizer, BertTokenizerFast, + DPRQuestionEncoder, DPRQuestionEncoderTokenizer, RobertaTokenizer) +from transformers.file_utils import is_faiss_available, requires_backends + +from pyserini.util import (download_encoded_queries, download_prebuilt_index, + get_dense_indexes_info, get_sparse_index) +from pyserini.search.lucene import LuceneSearcher +from pyserini.index import Document + +from ._model import AnceEncoder +import torch + +from ...encode import PcaEncoder +from ...encode._aggretriever import BERTAggretrieverEncoder, DistlBERTAggretrieverEncoder + +if is_faiss_available(): + import faiss + + +class QueryEncoder: + def __init__(self, encoded_query_dir: str = None): + self.has_model = False + self.has_encoded_query = False + if encoded_query_dir: + self.embedding = self._load_embeddings(encoded_query_dir) + self.has_encoded_query = True + + def encode(self, query: str): + return self.embedding[query] + + @classmethod + def load_encoded_queries(cls, encoded_query_name: str): + """Build a query encoder from a pre-encoded query; download the encoded queries if necessary. + + Parameters + ---------- + encoded_query_name : str + pre encoded query name. + + Returns + ------- + QueryEncoder + Encoder built from the pre encoded queries. + """ + print(f'Attempting to initialize pre-encoded queries {encoded_query_name}.') + try: + query_dir = download_encoded_queries(encoded_query_name) + except ValueError as e: + print(str(e)) + return None + + print(f'Initializing {encoded_query_name}...') + return cls(encoded_query_dir=query_dir) + + @staticmethod + def _load_embeddings(encoded_query_dir): + df = pd.read_pickle(os.path.join(encoded_query_dir, 'embedding.pkl')) + return dict(zip(df['text'].tolist(), df['embedding'].tolist())) + + +class AggretrieverQueryEncoder(QueryEncoder): + def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, + encoded_query_dir: str = None, device: str = 'cpu', **kwargs): + if encoder_dir: + self.device = device + if 'distilbert' in encoder_dir.lower(): + self.model = DistlBERTAggretrieverEncoder.from_pretrained(encoder_dir) + else: + self.model = BERTAggretrieverEncoder.from_pretrained(encoder_dir) + self.model.to(self.device) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.has_model = True + if (not self.has_model) and (not self.has_encoded_query): + raise Exception('Neither query encoder model nor encoded queries provided. 
Please provide at least one') + + def encode(self, query: str, max_length: int=32): + if self.has_model: + inputs = self.tokenizer( + query, + max_length=max_length, + padding="longest", + truncation=True, + add_special_tokens=True, + return_tensors='pt' + ) + inputs.to(self.device) + outputs = self.model(**inputs) + embeddings = outputs.detach().cpu().numpy() + return embeddings.flatten() + else: + return super().encode(query) + + +class TctColBertQueryEncoder(QueryEncoder): + + def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, + encoded_query_dir: str = None, device: str = 'cpu', **kwargs): + super().__init__(encoded_query_dir) + if encoder_dir: + self.device = device + self.model = BertModel.from_pretrained(encoder_dir) + self.model.to(self.device) + self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.has_model = True + if (not self.has_model) and (not self.has_encoded_query): + raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one') + + def encode(self, query: str): + if self.has_model: + max_length = 36 # hardcode for now + inputs = self.tokenizer( + '[CLS] [Q] ' + query + '[MASK]' * max_length, + max_length=max_length, + truncation=True, + add_special_tokens=False, + return_tensors='pt' + ) + inputs.to(self.device) + outputs = self.model(**inputs) + embeddings = outputs.last_hidden_state.detach().cpu().numpy() + return np.average(embeddings[:, 4:, :], axis=-2).flatten() + else: + return super().encode(query) + + +class DprQueryEncoder(QueryEncoder): + + def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, + encoded_query_dir: str = None, device: str = 'cpu', **kwargs): + super().__init__(encoded_query_dir) + if encoder_dir: + self.device = device + self.model = DPRQuestionEncoder.from_pretrained(encoder_dir) + self.model.to(self.device) + self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.has_model = True + if (not self.has_model) and (not self.has_encoded_query): + raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one') + + def encode(self, query: str): + if self.has_model: + input_ids = self.tokenizer(query, return_tensors='pt') + input_ids.to(self.device) + embeddings = self.model(input_ids["input_ids"]).pooler_output.detach().cpu().numpy() + return embeddings.flatten() + else: + return super().encode(query) + + +class BprQueryEncoder(QueryEncoder): + + def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, + encoded_query_dir: str = None, device: str = 'cpu', **kwargs): + self.has_model = False + self.has_encoded_query = False + if encoded_query_dir: + self.embedding = self._load_embeddings(encoded_query_dir) + self.has_encoded_query = True + + if encoder_dir: + self.device = device + self.model = DPRQuestionEncoder.from_pretrained(encoder_dir) + self.model.to(self.device) + self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.has_model = True + if (not self.has_model) and (not self.has_encoded_query): + raise Exception('Neither query encoder model nor encoded queries provided. 
Please provide at least one') + + def encode(self, query: str): + if self.has_model: + input_ids = self.tokenizer(query, return_tensors='pt') + input_ids.to(self.device) + embeddings = self.model(input_ids["input_ids"]).pooler_output.detach().cpu() + dense_embeddings = embeddings.numpy() + sparse_embeddings = self.convert_to_binary_code(embeddings).numpy() + return {'dense': dense_embeddings.flatten(), 'sparse': sparse_embeddings.flatten()} + else: + return super().encode(query) + + def convert_to_binary_code(self, input_repr: torch.Tensor): + return input_repr.new_ones(input_repr.size()).masked_fill_(input_repr < 0, -1.0) + + @staticmethod + def _load_embeddings(encoded_query_dir): + df = pd.read_pickle(os.path.join(encoded_query_dir, 'embedding.pkl')) + ret = {} + for text, dense, sparse in zip(df['text'].tolist(), df['dense_embedding'].tolist(), + df['sparse_embedding'].tolist()): + ret[text] = {'dense': dense, 'sparse': sparse} + return ret + + +class DkrrDprQueryEncoder(QueryEncoder): + + def __init__(self, encoder_dir: str = None, encoded_query_dir: str = None, device: str = 'cpu', + prefix: str = "question:", **kwargs): + super().__init__(encoded_query_dir) + self.device = device + self.model = BertModel.from_pretrained(encoder_dir) + self.model.to(self.device) + self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") + self.has_model = True + self.prefix = prefix + + @staticmethod + def _mean_pooling(model_output, attention_mask): + model_output = model_output[0].masked_fill(1 - attention_mask[:, :, None], 0.) + model_output = torch.sum(model_output, dim=1) / torch.clamp(torch.sum(attention_mask, dim=1), min=1e-9)[:, None] + return model_output.flatten() + + def encode(self, query: str): + if self.has_model: + if self.prefix: + query = f'{self.prefix} {query}' + inputs = self.tokenizer(query, return_tensors='pt', max_length=40, padding="max_length") + inputs.to(self.device) + outputs = self.model(input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"]) + embeddings = self._mean_pooling(outputs, inputs['attention_mask']).detach().cpu().numpy() + return embeddings.flatten() + else: + return super().encode(query) + + +class AnceQueryEncoder(QueryEncoder): + + def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, + encoded_query_dir: str = None, device: str = 'cpu', **kwargs): + super().__init__(encoded_query_dir) + if encoder_dir: + self.device = device + self.model = AnceEncoder.from_pretrained(encoder_dir) + self.model.to(self.device) + self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name or encoder_dir) + self.has_model = True + self.tokenizer.do_lower_case = True + if (not self.has_model) and (not self.has_encoded_query): + raise Exception('Neither query encoder model nor encoded queries provided. 
Please provide at least one') + + def encode(self, query: str): + if self.has_model: + inputs = self.tokenizer( + [query], + max_length=64, + padding='longest', + truncation=True, + add_special_tokens=True, + return_tensors='pt' + ) + inputs.to(self.device) + embeddings = self.model(inputs["input_ids"]).detach().cpu().numpy() + return embeddings.flatten() + else: + return super().encode(query) + + def prf_encode(self, query: str): + if self.has_model: + inputs = self.tokenizer( + [query], + max_length=512, + padding='longest', + truncation=True, + add_special_tokens=False, + return_tensors='pt' + ) + inputs.to(self.device) + embeddings = self.model(inputs["input_ids"]).detach().cpu().numpy() + return embeddings.flatten() + else: + return super().encode(query) + + def prf_batch_encode(self, query: List[str]): + inputs = self.tokenizer( + query, + max_length=512, + padding='longest', + truncation=True, + add_special_tokens=False, + return_tensors='pt' + ) + inputs.to(self.device) + embeddings = self.model(inputs["input_ids"]).detach().cpu().numpy() + return embeddings + + +class AutoQueryEncoder(QueryEncoder): + + def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, + encoded_query_dir: str = None, device: str = 'cpu', + pooling: str = 'cls', l2_norm: bool = False, **kwargs): + super().__init__(encoded_query_dir) + if encoder_dir: + self.device = device + self.model = AutoModel.from_pretrained(encoder_dir) + self.model.to(self.device) + try: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir) + except: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir, use_fast=False) + self.has_model = True + self.pooling = pooling + self.l2_norm = l2_norm + if (not self.has_model) and (not self.has_encoded_query): + raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one') + + @staticmethod + def _mean_pooling(model_output, attention_mask): + token_embeddings = model_output[0] # First element of model_output contains all token embeddings + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) + sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) + return sum_embeddings / sum_mask + + def encode(self, query: str): + if self.has_model: + inputs = self.tokenizer( + query, + add_special_tokens=True, + return_tensors='pt', + truncation='only_first', + padding='longest', + return_token_type_ids=False, + ) + + inputs.to(self.device) + outputs = self.model(**inputs) + if self.pooling == "mean": + embeddings = self._mean_pooling(outputs, inputs['attention_mask']).detach().cpu().numpy() + else: + embeddings = outputs[0][:, 0, :].detach().cpu().numpy() + if self.l2_norm: + faiss.normalize_L2(embeddings) + return embeddings.flatten() + else: + return super().encode(query) + + +@dataclass +class DenseSearchResult: + docid: str + score: float + + +@dataclass +class PRFDenseSearchResult: + docid: str + score: float + vectors: [float] + + +class FaissSearcher: + """Simple Searcher for dense representation + + Parameters + ---------- + index_dir : str + Path to faiss index directory. 
+ """ + + def __init__(self, index_dir: str, query_encoder: Union[QueryEncoder, str], + prebuilt_index_name: Optional[str] = None): + requires_backends(self, "faiss") + if isinstance(query_encoder, QueryEncoder) or isinstance(query_encoder, PcaEncoder): + self.query_encoder = query_encoder + else: + self.query_encoder = self._init_encoder_from_str(query_encoder) + self.index, self.docids = self.load_index(index_dir) + self.dimension = self.index.d + self.num_docs = self.index.ntotal + + assert self.docids is None or self.num_docs == len(self.docids) + if prebuilt_index_name: + sparse_index = get_sparse_index(prebuilt_index_name) + self.ssearcher = LuceneSearcher.from_prebuilt_index(sparse_index) + + @classmethod + def from_prebuilt_index(cls, prebuilt_index_name: str, query_encoder: QueryEncoder): + """Build a searcher from a pre-built index; download the index if necessary. + + Parameters + ---------- + query_encoder: QueryEncoder + the query encoder, which has `encode` method that convert query text to embedding + prebuilt_index_name : str + Prebuilt index name. + + Returns + ------- + FaissSearcher + Searcher built from the prebuilt faiss index. + """ + print(f'Attempting to initialize pre-built index {prebuilt_index_name}.') + try: + index_dir = download_prebuilt_index(prebuilt_index_name) + except ValueError as e: + print(str(e)) + return None + + print(f'Initializing {prebuilt_index_name}...') + return cls(index_dir, query_encoder, prebuilt_index_name) + + @staticmethod + def list_prebuilt_indexes(): + """Display information about available prebuilt indexes.""" + get_dense_indexes_info() + + def search(self, query: Union[str, np.ndarray], k: int = 10, threads: int = 1, return_vector: bool = False) \ + -> Union[List[DenseSearchResult], Tuple[np.ndarray, List[PRFDenseSearchResult]]]: + """Search the collection. + + Parameters + ---------- + query : Union[str, np.ndarray] + query text or query embeddings + k : int + Number of hits to return. + threads : int + Maximum number of threads to use for intra-query search. + return_vector : bool + Return the results with vectors + Returns + ------- + Union[List[DenseSearchResult], Tuple[np.ndarray, List[PRFDenseSearchResult]]] + Either returns a list of search results. + Or returns the query vector with the list of PRF dense search results with vectors. + """ + if isinstance(query, str): + emb_q = self.query_encoder.encode(query) + assert len(emb_q) == self.dimension + emb_q = emb_q.reshape((1, len(emb_q))) + else: + emb_q = query + faiss.omp_set_num_threads(threads) + if return_vector: + distances, indexes, vectors = self.index.search_and_reconstruct(emb_q, k) + vectors = vectors[0] + distances = distances.flat + indexes = indexes.flat + return emb_q, [PRFDenseSearchResult(self.docids[idx], score, vector) + for score, idx, vector in zip(distances, indexes, vectors) if idx != -1] + else: + distances, indexes = self.index.search(emb_q, k) + distances = distances.flat + indexes = indexes.flat + return [DenseSearchResult(self.docids[idx], score) + for score, idx in zip(distances, indexes) if idx != -1] + + def batch_search(self, queries: Union[List[str], np.ndarray], q_ids: List[str], k: int = 10, + threads: int = 1, return_vector: bool = False) \ + -> Union[Dict[str, List[DenseSearchResult]], Tuple[np.ndarray, Dict[str, List[PRFDenseSearchResult]]]]: + """ + + Parameters + ---------- + queries : Union[List[str], np.ndarray] + List of query texts or list of query embeddings + q_ids : List[str] + List of corresponding query ids. 
+ k : int + Number of hits to return. + threads : int + Maximum number of threads to use. + return_vector : bool + Return the results with vectors + + Returns + ------- + Union[Dict[str, List[DenseSearchResult]], Tuple[np.ndarray, Dict[str, List[PRFDenseSearchResult]]]] + Either returns a dictionary holding the search results, with the query ids as keys and the + corresponding lists of search results as the values. + Or returns a tuple with ndarray of query vectors and a dictionary of PRF Dense Search Results with vectors + """ + if isinstance(queries, np.ndarray): + q_embs = queries + else: + q_embs = np.array([self.query_encoder.encode(q) for q in queries]) + n, m = q_embs.shape + assert m == self.dimension + faiss.omp_set_num_threads(threads) + if return_vector: + D, I, V = self.index.search_and_reconstruct(q_embs, k) + return q_embs, {key: [PRFDenseSearchResult(self.docids[idx], score, vector) + for score, idx, vector in zip(distances, indexes, vectors) if idx != -1] + for key, distances, indexes, vectors in zip(q_ids, D, I, V)} + else: + D, I = self.index.search(q_embs, k) + return {key: [DenseSearchResult(self.docids[idx], score) + for score, idx in zip(distances, indexes) if idx != -1] + for key, distances, indexes in zip(q_ids, D, I)} + + def load_index(self, index_dir: str): + index_path = os.path.join(index_dir, 'index') + docid_path = os.path.join(index_dir, 'docid') + index = faiss.read_index(index_path) + docids = self.load_docids(docid_path) + return index, docids + + def doc(self, docid: Union[str, int]) -> Optional[Document]: + """Return the :class:`Document` corresponding to ``docid``. Since dense indexes don't store documents + but sparse indexes do, route over to corresponding sparse index (according to prebuilt_index_info.py) + and use its doc API + + Parameters + ---------- + docid : Union[str, int] + Overloaded ``docid``: either an external collection ``docid`` (``str``) or an internal Lucene ``docid`` + (``int``). + + Returns + ------- + Document + :class:`Document` corresponding to the ``docid``. + """ + return self.ssearcher.doc(docid) if self.ssearcher else None + + @staticmethod + def _init_encoder_from_str(encoder): + encoder_lower = encoder.lower() + if 'dpr' in encoder_lower: + return DprQueryEncoder(encoder_dir=encoder) + elif 'tct_colbert' in encoder_lower: + return TctColBertQueryEncoder(encoder_dir=encoder) + elif 'ance' in encoder_lower: + return AnceQueryEncoder(encoder_dir=encoder) + elif 'sentence' in encoder_lower: + return AutoQueryEncoder(encoder_dir=encoder, pooling='mean', l2_norm=True) + else: + return AutoQueryEncoder(encoder_dir=encoder) + + @staticmethod + def load_docids(docid_path: str) -> List[str]: + id_f = open(docid_path, 'r') + docids = [line.rstrip() for line in id_f.readlines()] + id_f.close() + return docids + + def set_hnsw_ef_search(self, ef_search: int): + self.index.hnsw.efSearch = ef_search + + +class BinaryDenseSearcher(FaissSearcher): + """Simple Searcher for binary-dense representation + + Parameters + ---------- + index_dir : str + Path to faiss index directory. + """ + + def __init__(self, index_dir: str, query_encoder: Union[QueryEncoder, str], + prebuilt_index_name: Optional[str] = None): + super().__init__(index_dir, query_encoder, prebuilt_index_name) + + def search(self, query: str, k: int = 10, binary_k: int = 100, rerank: bool = True, threads: int = 1) \ + -> List[DenseSearchResult]: + """Search the collection. 
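+
+        When ``rerank`` is true, first retrieves ``binary_k`` candidates from the binary index
+        (Hamming distance over the packed binary codes) and then rescores them with the dense
+        query embedding; otherwise the binary index is searched for ``k`` hits directly.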
+ + Parameters + ---------- + query : str + query text + k : int + Number of hits to return at second stage. + binary_k : int + Number of hits to return at first stage. + rerank: bool + Whether to use dense repr to rerank the binary ranking results. + threads : int + Maximum number of threads to use for intra-query search. + Returns + ------- + List[DenseSearchResult] + List of search results. + """ + ret = self.query_encoder.encode(query) + dense_emb_q = ret['dense'] + sparse_emb_q = ret['sparse'] + assert len(dense_emb_q) == self.dimension + assert len(sparse_emb_q) == self.dimension + + dense_emb_q = dense_emb_q.reshape((1, len(dense_emb_q))) + sparse_emb_q = sparse_emb_q.reshape((1, len(sparse_emb_q))) + faiss.omp_set_num_threads(threads) + distances, indexes = self.binary_dense_search(k, binary_k, rerank, dense_emb_q, sparse_emb_q) + distances = distances.flat + indexes = indexes.flat + return [DenseSearchResult(str(idx), score) + for score, idx in zip(distances, indexes) if idx != -1] + + def batch_search(self, queries: List[str], q_ids: List[str], k: int = 10, binary_k: int = 100, + rerank: bool = True, threads: int = 1) -> Dict[str, List[DenseSearchResult]]: + """ + + Parameters + ---------- + queries : List[str] + List of query texts + q_ids : List[str] + List of corresponding query ids. + k : int + Number of hits to return. + binary_k : int + Number of hits to return at first stage. + rerank: bool + Whether to use dense repr to rerank the binary ranking results. + threads : int + Maximum number of threads to use. + + Returns + ------- + Dict[str, List[DenseSearchResult]] + Dictionary holding the search results, with the query ids as keys and the corresponding lists of search + results as the values. + """ + dense_q_embs = [] + sparse_q_embs = [] + for q in queries: + ret = self.query_encoder.encode(q) + dense_q_embs.append(ret['dense']) + sparse_q_embs.append(ret['sparse']) + dense_q_embs = np.array(dense_q_embs) + sparse_q_embs = np.array(sparse_q_embs) + n, m = dense_q_embs.shape + assert m == self.dimension + faiss.omp_set_num_threads(threads) + D, I = self.binary_dense_search(k, binary_k, rerank, dense_q_embs, sparse_q_embs) + return {key: [DenseSearchResult(str(idx), score) + for score, idx in zip(distances, indexes) if idx != -1] + for key, distances, indexes in zip(q_ids, D, I)} + + def binary_dense_search(self, k, binary_k, rerank, dense_emb_q, sparse_emb_q): + num_queries = dense_emb_q.shape[0] + sparse_emb_q = np.packbits(np.where(sparse_emb_q > 0, 1, 0)).reshape(num_queries, -1) + + if not rerank: + distances, indexes = self.index.search(sparse_emb_q, k) + else: + raw_index = self.index.index + _, indexes = raw_index.search(sparse_emb_q, binary_k) + sparse_emb_p = np.vstack( + [np.unpackbits(raw_index.reconstruct(int(id_))) for id_ in indexes.reshape(-1)] + ) + sparse_emb_p = sparse_emb_p.reshape( + dense_emb_q.shape[0], binary_k, dense_emb_q.shape[1] + ) + sparse_emb_p = sparse_emb_p.astype(np.float32) + sparse_emb_p = sparse_emb_p * 2 - 1 + distances = np.einsum("ijk,ik->ij", sparse_emb_p, dense_emb_q) + sorted_indices = np.argsort(-distances, axis=1) + + indexes = indexes[np.arange(num_queries)[:, None], sorted_indices] + indexes = np.array([self.index.id_map.at(int(id_)) for id_ in indexes.reshape(-1)], dtype=np.int) + indexes = indexes.reshape(num_queries, -1)[:, :k] + distances = distances[np.arange(num_queries)[:, None], sorted_indices][:, :k] + return distances, indexes + + def load_index(self, index_dir: str): + index_path = os.path.join(index_dir, 'index') + 
index = faiss.read_index_binary(index_path) + return index, None + + @staticmethod + def _init_encoder_from_str(encoder): + encoder = encoder.lower() + if 'bpr' in encoder: + return BprQueryEncoder(encoder_dir=encoder) + else: + raise NotImplementedError diff --git a/pyserini/search/hybrid/__init__.py b/pyserini/search/hybrid/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..61ca8b0c8ed5d15cf1189c6468158999d869bdf5 --- /dev/null +++ b/pyserini/search/hybrid/__init__.py @@ -0,0 +1,19 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from ._searcher import HybridSearcher + +__all__ = ['HybridSearcher'] \ No newline at end of file diff --git a/pyserini/search/hybrid/__main__.py b/pyserini/search/hybrid/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..4d19a0c3c2afefdc286a5aa4d775d80bcac8bf4a --- /dev/null +++ b/pyserini/search/hybrid/__main__.py @@ -0,0 +1,185 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import json +import os +import sys + +from tqdm import tqdm + +from pyserini.search.faiss import FaissSearcher +from pyserini.query_iterator import get_query_iterator, TopicsFormat +from pyserini.output_writer import get_output_writer, OutputFormat +from pyserini.search.lucene import LuceneImpactSearcher, LuceneSearcher +from pyserini.search.hybrid import HybridSearcher + +from pyserini.search.faiss.__main__ import define_dsearch_args, init_query_encoder +from pyserini.search.lucene.__main__ import define_search_args, set_bm25_parameters + +# Fixes this error: "OMP: Error #15: Initializing libomp.a, but found libomp.dylib already initialized." 
+# https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial +os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' + + +def define_fusion_args(parser): + parser.add_argument('--alpha', type=float, metavar='num', required=False, default=0.1, + help="alpha for hybrid search") + parser.add_argument('--hits', type=int, required=False, default=1000, help='number of hits from dense and sparse') + parser.add_argument('--normalization', action='store_true', required=False, help='hybrid score with normalization') + parser.add_argument('--weight-on-dense', action='store_true', required=False, help='weight on dense part') + + +def parse_args(parser, commands): + # Divide argv by commands + split_argv = [[]] + for c in sys.argv[1:]: + if c in commands.choices: + split_argv.append([c]) + else: + split_argv[-1].append(c) + # Initialize namespace + args = argparse.Namespace() + for c in commands.choices: + setattr(args, c, None) + # Parse each command + parser.parse_args(split_argv[0], namespace=args) # Without command + for argv in split_argv[1:]: # Commands + n = argparse.Namespace() + setattr(args, argv[0], n) + parser.parse_args(argv, namespace=n) + return args + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Conduct a hybrid search on dense+sparse indexes.') + + commands = parser.add_subparsers(title='sub-commands') + + dense_parser = commands.add_parser('dense') + define_dsearch_args(dense_parser) + + sparse_parser = commands.add_parser('sparse') + define_search_args(sparse_parser) + + fusion_parser = commands.add_parser('fusion') + define_fusion_args(fusion_parser) + + run_parser = commands.add_parser('run') + run_parser.add_argument('--topics', type=str, metavar='topic_name', required=False, + help="Name of topics. Available: msmarco-passage-dev-subset.") + run_parser.add_argument('--hits', type=int, metavar='num', required=False, default=1000, help="Number of hits.") + run_parser.add_argument('--topics-format', type=str, metavar='format', default=TopicsFormat.DEFAULT.value, + help=f"Format of topics. Available: {[x.value for x in list(TopicsFormat)]}") + run_parser.add_argument('--output-format', type=str, metavar='format', default=OutputFormat.TREC.value, + help=f"Format of output. 
Available: {[x.value for x in list(OutputFormat)]}") + run_parser.add_argument('--output', type=str, metavar='path', required=False, help="Path to output file.") + run_parser.add_argument('--max-passage', action='store_true', + default=False, help="Select only max passage from document.") + run_parser.add_argument('--max-passage-hits', type=int, metavar='num', required=False, default=100, + help="Final number of hits when selecting only max passage.") + run_parser.add_argument('--max-passage-delimiter', type=str, metavar='str', required=False, default='#', + help="Delimiter between docid and passage id.") + run_parser.add_argument('--batch-size', type=int, metavar='num', required=False, + default=1, help="Specify batch size to search the collection concurrently.") + run_parser.add_argument('--threads', type=int, metavar='num', required=False, + default=1, help="Maximum number of threads to use.") + + args = parse_args(parser, commands) + + query_iterator = get_query_iterator(args.run.topics, TopicsFormat(args.run.topics_format)) + topics = query_iterator.topics + + query_encoder = init_query_encoder(args.dense.encoder, + args.dense.encoder_class, + args.dense.tokenizer, + args.run.topics, + args.dense.encoded_queries, + args.dense.device, + args.dense.query_prefix) + + if os.path.exists(args.dense.index): + # create searcher from index directory + dsearcher = FaissSearcher(args.dense.index, query_encoder) + else: + # create searcher from prebuilt index name + dsearcher = FaissSearcher.from_prebuilt_index(args.dense.index, query_encoder) + + if not dsearcher: + exit() + + if os.path.exists(args.sparse.index): + # create searcher from index directory + if args.sparse.impact: + ssearcher = LuceneImpactSearcher(args.sparse.index, args.sparse.encoder, args.sparse.min_idf) + else: + ssearcher = LuceneSearcher(args.sparse.index) + else: + # create searcher from prebuilt index name + if args.sparse.impact: + ssearcher = LuceneImpactSearcher.from_prebuilt_index(args.sparse.index, args.sparse.encoder, args.sparse.min_idf) + else: + ssearcher = LuceneSearcher.from_prebuilt_index(args.sparse.index) + + if not ssearcher: + exit() + + set_bm25_parameters(ssearcher, args.sparse.index, args.sparse.k1, args.sparse.b) + + if args.sparse.language != 'en': + ssearcher.set_language(args.sparse.language) + + hsearcher = HybridSearcher(dsearcher, ssearcher) + if not hsearcher: + exit() + + # build output path + output_path = args.run.output + + print(f'Running {args.run.topics} topics, saving to {output_path}...') + tag = 'hybrid' + + output_writer = get_output_writer(output_path, OutputFormat(args.run.output_format), 'w', + max_hits=args.run.hits, tag=tag, topics=topics, + use_max_passage=args.run.max_passage, + max_passage_delimiter=args.run.max_passage_delimiter, + max_passage_hits=args.run.max_passage_hits) + + with output_writer: + batch_topics = list() + batch_topic_ids = list() + for index, (topic_id, text) in enumerate(tqdm(query_iterator, total=len(topics.keys()))): + if args.run.batch_size <= 1 and args.run.threads <= 1: + hits = hsearcher.search(text, args.fusion.hits, args.run.hits, args.fusion.alpha, args.fusion.normalization, args.fusion.weight_on_dense) + results = [(topic_id, hits)] + else: + batch_topic_ids.append(str(topic_id)) + batch_topics.append(text) + if (index + 1) % args.run.batch_size == 0 or \ + index == len(topics.keys()) - 1: + results = hsearcher.batch_search( + batch_topics, batch_topic_ids, args.fusion.hits, args.run.hits, args.run.threads, + args.fusion.alpha, 
args.fusion.normalization, args.fusion.weight_on_dense) + results = [(id_, results[id_]) for id_ in batch_topic_ids] + batch_topic_ids.clear() + batch_topics.clear() + else: + continue + + for topic, hits in results: + output_writer.write(topic, hits) + + results.clear() diff --git a/pyserini/search/hybrid/_searcher.py b/pyserini/search/hybrid/_searcher.py new file mode 100644 index 0000000000000000000000000000000000000000..0817f6c85ddf4b5b5bf554421dd3155fdefb8621 --- /dev/null +++ b/pyserini/search/hybrid/_searcher.py @@ -0,0 +1,81 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This module provides Pyserini's hybrid searcher by Dense + Sparse +""" + +from typing import List, Dict +from pyserini.search.lucene import LuceneSearcher +from pyserini.search.faiss import FaissSearcher, DenseSearchResult + + +class HybridSearcher: + """Hybrid Searcher for dense + sparse + + Parameters + ---------- + dense_searcher : FaissSearcher + sparse_searcher : LuceneSearcher + """ + + def __init__(self, dense_searcher, sparse_searcher): + self.dense_searcher = dense_searcher + self.sparse_searcher = sparse_searcher + + def search(self, query: str, k0: int = 10, k: int = 10, alpha: float = 0.1, normalization: bool = False, weight_on_dense: bool = False) -> List[DenseSearchResult]: + dense_hits = self.dense_searcher.search(query, k0) + sparse_hits = self.sparse_searcher.search(query, k0) + return self._hybrid_results(dense_hits, sparse_hits, alpha, k, normalization, weight_on_dense) + + def batch_search(self, queries: List[str], q_ids: List[str], k0: int = 10, k: int = 10, threads: int = 1, + alpha: float = 0.1, normalization: bool = False, weight_on_dense: bool = False) \ + -> Dict[str, List[DenseSearchResult]]: + dense_result = self.dense_searcher.batch_search(queries, q_ids, k0, threads) + sparse_result = self.sparse_searcher.batch_search(queries, q_ids, k0, threads) + hybrid_result = { + key: self._hybrid_results(dense_result[key], sparse_result[key], alpha, k, normalization, weight_on_dense) + for key in dense_result + } + return hybrid_result + + @staticmethod + def _hybrid_results(dense_results, sparse_results, alpha, k, normalization=False, weight_on_dense=False): + dense_hits = {hit.docid: hit.score for hit in dense_results} + sparse_hits = {hit.docid: hit.score for hit in sparse_results} + hybrid_result = [] + min_dense_score = min(dense_hits.values()) if len(dense_hits) > 0 else 0 + max_dense_score = max(dense_hits.values()) if len(dense_hits) > 0 else 1 + min_sparse_score = min(sparse_hits.values()) if len(sparse_hits) > 0 else 0 + max_sparse_score = max(sparse_hits.values()) if len(sparse_hits) > 0 else 1 + for doc in set(dense_hits.keys()) | set(sparse_hits.keys()): + if doc not in dense_hits: + sparse_score = sparse_hits[doc] + dense_score = min_dense_score + elif doc not in sparse_hits: + sparse_score = min_sparse_score + dense_score = dense_hits[doc] + else: + sparse_score = sparse_hits[doc] + 
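+                # Below, each score is optionally min-max rescaled to roughly [-0.5, 0.5], then the
+                # two are interpolated as alpha * sparse + dense (or sparse + alpha * dense when
+                # weight_on_dense is set).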
dense_score = dense_hits[doc] + if normalization: + sparse_score = (sparse_score - (min_sparse_score + max_sparse_score) / 2) \ + / (max_sparse_score - min_sparse_score) + dense_score = (dense_score - (min_dense_score + max_dense_score) / 2) \ + / (max_dense_score - min_dense_score) + score = alpha * sparse_score + dense_score if not weight_on_dense else sparse_score + alpha * dense_score + hybrid_result.append(DenseSearchResult(doc, score)) + return sorted(hybrid_result, key=lambda x: x.score, reverse=True)[:k] diff --git a/pyserini/search/lucene/__init__.py b/pyserini/search/lucene/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..22063fc37248c5cfc310c5fc4bd74c020b02c264 --- /dev/null +++ b/pyserini/search/lucene/__init__.py @@ -0,0 +1,29 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from ._geo_searcher import LuceneGeoSearcher +from ._impact_searcher import JImpactSearcherResult, LuceneImpactSearcher, SlimSearcher +from ._searcher import JLuceneSearcherResult, LuceneSimilarities, \ + LuceneFusionSearcher, LuceneSearcher + +__all__ = ['JImpactSearcherResult', + 'JLuceneSearcherResult', + 'LuceneFusionSearcher', + 'LuceneGeoSearcher', + 'LuceneImpactSearcher', + 'LuceneSearcher', + 'SlimSearcher', + 'LuceneSimilarities'] diff --git a/pyserini/search/lucene/__main__.py b/pyserini/search/lucene/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..85dac82c7f532fafcdb3da23558b449e0aeecdf4 --- /dev/null +++ b/pyserini/search/lucene/__main__.py @@ -0,0 +1,346 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import os + +from tqdm import tqdm +from transformers import AutoTokenizer + +from pyserini.analysis import JDefaultEnglishAnalyzer, JWhiteSpaceAnalyzer +from pyserini.output_writer import OutputFormat, get_output_writer +from pyserini.pyclass import autoclass +from pyserini.query_iterator import get_query_iterator, TopicsFormat +from pyserini.search import JDisjunctionMaxQueryGenerator +from . 
import LuceneImpactSearcher, LuceneSearcher, SlimSearcher +from .reranker import ClassifierType, PseudoRelevanceClassifierReranker + + +def set_bm25_parameters(searcher, index, k1=None, b=None): + if k1 is not None or b is not None: + if k1 is None or b is None: + print('Must set *both* k1 and b for BM25!') + exit() + print(f'Setting BM25 parameters: k1={k1}, b={b}') + searcher.set_bm25(k1, b) + else: + # Automatically set bm25 parameters based on known index... + if index == 'msmarco-passage' or index == 'msmarco-passage-slim' or index == 'msmarco-v1-passage' or \ + index == 'msmarco-v1-passage-slim' or index == 'msmarco-v1-passage-full': + # See https://github.com/castorini/anserini/blob/master/docs/regressions-msmarco-passage.md + print('MS MARCO passage: setting k1=0.82, b=0.68') + searcher.set_bm25(0.82, 0.68) + elif index == 'msmarco-passage-expanded' or \ + index == 'msmarco-v1-passage-d2q-t5' or \ + index == 'msmarco-v1-passage-d2q-t5-docvectors': + # See https://github.com/castorini/anserini/blob/master/docs/regressions-msmarco-passage-docTTTTTquery.md + print('MS MARCO passage w/ doc2query-T5 expansion: setting k1=2.18, b=0.86') + searcher.set_bm25(2.18, 0.86) + elif index == 'msmarco-doc' or index == 'msmarco-doc-slim' or index == 'msmarco-v1-doc' or \ + index == 'msmarco-v1-doc-slim' or index == 'msmarco-v1-doc-full': + # See https://github.com/castorini/anserini/blob/master/docs/regressions-msmarco-doc.md + print('MS MARCO doc: setting k1=4.46, b=0.82') + searcher.set_bm25(4.46, 0.82) + elif index == 'msmarco-doc-per-passage' or index == 'msmarco-doc-per-passage-slim' or \ + index == 'msmarco-v1-doc-segmented' or index == 'msmarco-v1-doc-segmented-slim' or \ + index == 'msmarco-v1-doc-segmented-full': + # See https://github.com/castorini/anserini/blob/master/docs/regressions-msmarco-doc-segmented.md + print('MS MARCO doc, per passage: setting k1=2.16, b=0.61') + searcher.set_bm25(2.16, 0.61) + elif index == 'msmarco-doc-expanded-per-doc' or \ + index == 'msmarco-v1-doc-d2q-t5' or \ + index == 'msmarco-v1-doc-d2q-t5-docvectors': + # See https://github.com/castorini/anserini/blob/master/docs/regressions-msmarco-doc-docTTTTTquery.md + print('MS MARCO doc w/ doc2query-T5 (per doc) expansion: setting k1=4.68, b=0.87') + searcher.set_bm25(4.68, 0.87) + elif index == 'msmarco-doc-expanded-per-passage' or \ + index == 'msmarco-v1-doc-segmented-d2q-t5' or \ + index == 'msmarco-v1-doc-segmented-d2q-t5-docvectors': + # See https://github.com/castorini/anserini/blob/master/docs/regressions-msmarco-doc-segmented-docTTTTTquery.md + print('MS MARCO doc w/ doc2query-T5 (per passage) expansion: setting k1=2.56, b=0.59') + searcher.set_bm25(2.56, 0.59) + + +def define_search_args(parser): + parser.add_argument('--index', type=str, metavar='path to index or index name', required=True, + help="Path to Lucene index or name of prebuilt index.") + parser.add_argument('--encoded-corpus', type=str, default=None, help="path to stored sparse vectors") + + parser.add_argument('--impact', action='store_true', help="Use Impact.") + parser.add_argument('--encoder', type=str, default=None, help="encoder name") + parser.add_argument('--onnx-encoder', type=str, default=None, help="onnx encoder name") + parser.add_argument('--min-idf', type=int, default=0, help="minimum idf") + + parser.add_argument('--bm25', action='store_true', default=True, help="Use BM25 (default).") + parser.add_argument('--k1', type=float, help='BM25 k1 parameter.') + parser.add_argument('--b', type=float, help='BM25 b parameter.') + + 
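+    # Example invocation (a sketch; the index and topics names must be available locally or as
+    # prebuilt resources):
+    #   python -m pyserini.search.lucene --index msmarco-v1-passage \
+    #     --topics msmarco-passage-dev-subset --output run.msmarco-passage.txt --bm25 --k1 0.82 --b 0.68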
parser.add_argument('--rm3', action='store_true', help="Use RM3") + parser.add_argument('--rocchio', action='store_true', help="Use Rocchio") + parser.add_argument('--rocchio-use-negative', action='store_true', help="Use nonrelevant labels in Rocchio") + parser.add_argument('--qld', action='store_true', help="Use QLD") + + parser.add_argument('--language', type=str, help='language code for BM25, e.g. zh for Chinese', default='en') + parser.add_argument('--pretokenized', action='store_true', help="Boolean switch to accept pre-tokenized topics") + + parser.add_argument('--prcl', type=ClassifierType, nargs='+', default=[], + help='Specify the classifier PseudoRelevanceClassifierReranker uses.') + parser.add_argument('--prcl.vectorizer', dest='vectorizer', type=str, + help='Type of vectorizer. Available: TfidfVectorizer, BM25Vectorizer.') + parser.add_argument('--prcl.r', dest='r', type=int, default=10, + help='Number of positive labels in pseudo relevance feedback.') + parser.add_argument('--prcl.n', dest='n', type=int, default=100, + help='Number of negative labels in pseudo relevance feedback.') + parser.add_argument('--prcl.alpha', dest='alpha', type=float, default=0.5, + help='Alpha value for interpolation in pseudo relevance feedback.') + + parser.add_argument('--fields', metavar="key=value", nargs='+', + help='Fields to search with assigned float weights.') + parser.add_argument('--dismax', action='store_true', default=False, + help='Use disjunction max queries when searching multiple fields.') + parser.add_argument('--dismax.tiebreaker', dest='tiebreaker', type=float, default=0.0, + help='The tiebreaker weight to use in disjunction max queries.') + + parser.add_argument('--stopwords', type=str, help='Path to file with customstopwords.') + + +if __name__ == "__main__": + JLuceneSearcher = autoclass('io.anserini.search.SimpleSearcher') + parser = argparse.ArgumentParser(description='Search a Lucene index.') + define_search_args(parser) + parser.add_argument('--topics', type=str, metavar='topic_name', required=True, + help="Name of topics. Available: robust04, robust05, core17, core18.") + parser.add_argument('--hits', type=int, metavar='num', + required=False, default=1000, help="Number of hits.") + parser.add_argument('--topics-format', type=str, metavar='format', default=TopicsFormat.DEFAULT.value, + help=f"Format of topics. Available: {[x.value for x in list(TopicsFormat)]}") + parser.add_argument('--output-format', type=str, metavar='format', default=OutputFormat.TREC.value, + help=f"Format of output. 
Available: {[x.value for x in list(OutputFormat)]}") + parser.add_argument('--output', type=str, metavar='path', + help="Path to output file.") + parser.add_argument('--max-passage', action='store_true', + default=False, help="Select only max passage from document.") + parser.add_argument('--max-passage-hits', type=int, metavar='num', required=False, default=100, + help="Final number of hits when selecting only max passage.") + parser.add_argument('--max-passage-delimiter', type=str, metavar='str', required=False, default='#', + help="Delimiter between docid and passage id.") + parser.add_argument('--batch-size', type=int, metavar='num', required=False, + default=1, help="Specify batch size to search the collection concurrently.") + parser.add_argument('--threads', type=int, metavar='num', required=False, + default=1, help="Maximum number of threads to use.") + parser.add_argument('--tokenizer', type=str, help='tokenizer used to preprocess topics') + parser.add_argument('--remove-duplicates', action='store_true', default=False, help="Remove duplicate docs.") + # For some test collections, a query is doc from the corpus (e.g., arguana in BEIR). + # We want to remove the query from the results. This is equivalent to -removeQuery in Java. + parser.add_argument('--remove-query', action='store_true', default=False, help="Remove query from results list.") + + args = parser.parse_args() + + query_iterator = get_query_iterator(args.topics, TopicsFormat(args.topics_format)) + topics = query_iterator.topics + + if not args.impact: + if os.path.exists(args.index): + # create searcher from index directory + searcher = LuceneSearcher(args.index) + else: + # create searcher from prebuilt index name + searcher = LuceneSearcher.from_prebuilt_index(args.index) + elif args.impact: + if args.encoder and args.onnx_encoder: + raise ValueError("Cannot specify both --encoder and --onnx-encoder") + if args.encoder: + if os.path.exists(args.index): + if args.encoded_corpus is not None: + searcher = SlimSearcher(args.encoded_corpus, args.index, args.encoder, args.min_idf) + else: + searcher = LuceneImpactSearcher(args.index, args.encoder, args.min_idf) + else: + if args.encoded_corpus is not None: + searcher = SlimSearcher.from_prebuilt_index(args.encoded_corpus, args.index, args.encoder, args.min_idf) + else: + searcher = LuceneImpactSearcher.from_prebuilt_index(args.index, args.encoder, args.min_idf) + elif args.onnx_encoder: + if os.path.exists(args.index): + if args.encoded_corpus is not None: + searcher = SlimSearcher(args.encoded_corpus, args.index, args.onnx_encoder, args.min_idf) + else: + searcher = LuceneImpactSearcher(args.index, args.onnx_encoder, args.min_idf, 'onnx') + else: + if args.encoded_corpus is not None: + searcher = SlimSearcher.from_prebuilt_index(args.encoded_corpus, args.index, args.onnx_encoder, args.min_idf) + else: + searcher = LuceneImpactSearcher.from_prebuilt_index(args.index, args.onnx_encoder, args.min_idf, 'onnx') + # These are the cases where we're specifying pre-encoded queries + elif os.path.exists(args.index): + searcher = LuceneImpactSearcher(args.index, args.encoder, args.min_idf) + else: + searcher = LuceneImpactSearcher.from_prebuilt_index(args.index, args.encoder, args.min_idf) + + if args.language != 'en': + searcher.set_language(args.language) + + if not searcher: + exit() + + search_rankers = [] + + if args.qld: + search_rankers.append('qld') + searcher.set_qld() + elif args.bm25: + search_rankers.append('bm25') + set_bm25_parameters(searcher, args.index, args.k1, 
args.b) + + if args.rm3: + search_rankers.append('rm3') + searcher.set_rm3() + + if args.rocchio: + search_rankers.append('rocchio') + if args.rocchio_use_negative: + searcher.set_rocchio(gamma=0.15, use_negative=True) + else: + searcher.set_rocchio() + + fields = dict() + if args.fields: + fields = dict([pair.split('=') for pair in args.fields]) + print(f'Searching over fields: {fields}') + + query_generator = None + if args.dismax: + query_generator = JDisjunctionMaxQueryGenerator(args.tiebreaker) + print(f'Using dismax query generator with tiebreaker={args.tiebreaker}') + + if args.pretokenized: + analyzer = JWhiteSpaceAnalyzer() + searcher.set_analyzer(analyzer) + if args.tokenizer is not None: + raise ValueError(f"--tokenizer is not supported with when setting --pretokenized.") + + if args.tokenizer != None: + analyzer = JWhiteSpaceAnalyzer() + searcher.set_analyzer(analyzer) + print(f'Using whitespace analyzer because of pretokenized topics') + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + print(f'Using {args.tokenizer} to preprocess topics') + + if args.stopwords: + analyzer = JDefaultEnglishAnalyzer.fromArguments('porter', False, args.stopwords) + searcher.set_analyzer(analyzer) + print(f'Using custom stopwords={args.stopwords}') + + # get re-ranker + use_prcl = args.prcl and len(args.prcl) > 0 and args.alpha > 0 + if use_prcl is True: + ranker = PseudoRelevanceClassifierReranker( + searcher.index_dir, args.vectorizer, args.prcl, r=args.r, n=args.n, alpha=args.alpha) + + # build output path + output_path = args.output + if output_path is None: + if use_prcl is True: + clf_rankers = [] + for t in args.prcl: + if t == ClassifierType.LR: + clf_rankers.append('lr') + elif t == ClassifierType.SVM: + clf_rankers.append('svm') + + r_str = f'prcl.r_{args.r}' + n_str = f'prcl.n_{args.n}' + a_str = f'prcl.alpha_{args.alpha}' + clf_str = 'prcl_' + '+'.join(clf_rankers) + tokens1 = ['run', args.topics, '+'.join(search_rankers)] + tokens2 = [args.vectorizer, clf_str, r_str, n_str, a_str] + output_path = '.'.join(tokens1) + '-' + '-'.join(tokens2) + ".txt" + else: + tokens = ['run', args.topics, '+'.join(search_rankers), 'txt'] + output_path = '.'.join(tokens) + + print(f'Running {args.topics} topics, saving to {output_path}...') + tag = output_path[:-4] if args.output is None else 'Anserini' + + output_writer = get_output_writer(output_path, OutputFormat(args.output_format), 'w', + max_hits=args.hits, tag=tag, topics=topics, + use_max_passage=args.max_passage, + max_passage_delimiter=args.max_passage_delimiter, + max_passage_hits=args.max_passage_hits) + + with output_writer: + batch_topics = list() + batch_topic_ids = list() + for index, (topic_id, text) in enumerate(tqdm(query_iterator, total=len(topics.keys()))): + if (args.tokenizer != None): + toks = tokenizer.tokenize(text) + text = ' ' + text = text.join(toks) + if args.batch_size <= 1 and args.threads <= 1: + if args.impact: + hits = searcher.search(text, args.hits, fields=fields) + else: + hits = searcher.search(text, args.hits, query_generator=query_generator, fields=fields) + results = [(topic_id, hits)] + else: + batch_topic_ids.append(str(topic_id)) + batch_topics.append(text) + if (index + 1) % args.batch_size == 0 or \ + index == len(topics.keys()) - 1: + if args.impact: + results = searcher.batch_search( + batch_topics, batch_topic_ids, args.hits, args.threads, fields=fields + ) + else: + results = searcher.batch_search( + batch_topics, batch_topic_ids, args.hits, args.threads, + query_generator=query_generator, 
fields=fields + ) + results = [(id_, results[id_]) for id_ in batch_topic_ids] + batch_topic_ids.clear() + batch_topics.clear() + else: + continue + + for topic, hits in results: + # do rerank + if use_prcl and len(hits) > (args.r + args.n): + docids = [hit.docid.strip() for hit in hits] + scores = [hit.score for hit in hits] + scores, docids = ranker.rerank(docids, scores) + docid_score_map = dict(zip(docids, scores)) + for hit in hits: + hit.score = docid_score_map[hit.docid.strip()] + + if args.remove_duplicates: + seen_docids = set() + dedup_hits = [] + for hit in hits: + if hit.docid.strip() in seen_docids: + continue + seen_docids.add(hit.docid.strip()) + dedup_hits.append(hit) + hits = dedup_hits + + # For some test collections, a query is doc from the corpus (e.g., arguana in BEIR). + # We want to remove the query from the results. + if args.remove_query: + hits = [hit for hit in hits if hit.docid != topic] + + # write results + output_writer.write(topic, hits) + + results.clear() diff --git a/pyserini/search/lucene/__pycache__/__init__.cpython-310.pyc b/pyserini/search/lucene/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eea080b3f90ee9b33a5ce4551fdf5cf075b2e718 Binary files /dev/null and b/pyserini/search/lucene/__pycache__/__init__.cpython-310.pyc differ diff --git a/pyserini/search/lucene/__pycache__/_geo_searcher.cpython-310.pyc b/pyserini/search/lucene/__pycache__/_geo_searcher.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5adea0dd441895a94c023d9ffae989b8c870152e Binary files /dev/null and b/pyserini/search/lucene/__pycache__/_geo_searcher.cpython-310.pyc differ diff --git a/pyserini/search/lucene/__pycache__/_impact_searcher.cpython-310.pyc b/pyserini/search/lucene/__pycache__/_impact_searcher.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d406066d7b5c5c6518c7e06abf74775abc4285f Binary files /dev/null and b/pyserini/search/lucene/__pycache__/_impact_searcher.cpython-310.pyc differ diff --git a/pyserini/search/lucene/__pycache__/_searcher.cpython-310.pyc b/pyserini/search/lucene/__pycache__/_searcher.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f7a381c4b8e4fbc4ad0d7be6b0acdfcf9d9cdf4 Binary files /dev/null and b/pyserini/search/lucene/__pycache__/_searcher.cpython-310.pyc differ diff --git a/pyserini/search/lucene/_geo_searcher.py b/pyserini/search/lucene/_geo_searcher.py new file mode 100644 index 0000000000000000000000000000000000000000..24c1f7e467bc7068e36a849db030410ea0b01d86 --- /dev/null +++ b/pyserini/search/lucene/_geo_searcher.py @@ -0,0 +1,82 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This module provides Pyserini's Python search interface to Anserini. The main entry point is the ``LuceneGeoSearcher`` +class, which wraps the Java class ``SimpleGeoSearcher`` in Anserini. 
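+
+A minimal usage sketch; the index path, field name, and bounding box are illustrative, and the query
+is built through the wrapped Lucene classes defined below:
+
+    searcher = LuceneGeoSearcher('indexes/my-geo-index')
+    query = JLatLonShape.newBoxQuery('geometry', JQueryRelation.INTERSECTS, 40.0, 41.0, -74.5, -73.5)
+    hits = searcher.search(query, k=10)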
+""" + +import logging +from typing import List + +from pyserini.pyclass import autoclass +from pyserini.search import JQuery + + +logger = logging.getLogger(__name__) + + +# Wrappers around Lucene classes +JSort = autoclass('org.apache.lucene.search.Sort') +JLatLonDocValuesField = autoclass('org.apache.lucene.document.LatLonDocValuesField') +JLatLonShape = autoclass('org.apache.lucene.document.LatLonShape') +JQueryRelation = autoclass('org.apache.lucene.document.ShapeField$QueryRelation') +JLongPoint = autoclass('org.apache.lucene.document.LongPoint') + +# Wrappers around Anserini classes +JGeoSearcher = autoclass('io.anserini.search.SimpleGeoSearcher') +JGeoSearcherResult = autoclass('io.anserini.search.SimpleSearcher$Result') + + +class LuceneGeoSearcher: + """Wrapper class for ``SimpleGeoSearcher`` in Anserini. + + Parameters + ---------- + index_dir : str + Path to Lucene index directory. + """ + + def __init__(self, index_dir: str): + self.index_dir = index_dir + self.object = JGeoSearcher(index_dir) + + def search(self, q: JQuery, k: int = 10, sort: JSort = None) -> List[JGeoSearcherResult]: + """Search the collection. + + Parameters + ---------- + q : JQuery + Lucene query. + k : int + Number of hits to return. + sort : JSort + Optional distance sort that allows searcher to return results based on distance to a point. + + Returns + ------- + List[JGeoSearcherResult] + List of search results. + """ + if sort: + hits = self.object.searchGeo(q, k, sort) + else: + hits = self.object.searchGeo(q, k) + return hits + + def close(self): + """Close the searcher.""" + self.object.close() diff --git a/pyserini/search/lucene/_impact_searcher.py b/pyserini/search/lucene/_impact_searcher.py new file mode 100644 index 0000000000000000000000000000000000000000..c6d433526d6c56d0a885753b226e625362dd1434 --- /dev/null +++ b/pyserini/search/lucene/_impact_searcher.py @@ -0,0 +1,406 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This module provides Pyserini's Python search interface to Anserini. The main entry point is the ``LuceneImpactSearcher`` +class, which wraps the Java class with the same name in Anserini. 
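+
+A minimal usage sketch; the prebuilt index and query encoder names are illustrative:
+
+    searcher = LuceneImpactSearcher.from_prebuilt_index('msmarco-v1-passage-unicoil',
+                                                        'castorini/unicoil-msmarco-passage')
+    hits = searcher.search('what is a lobster roll', k=10)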
+""" + +import logging +import os +import pickle +from tqdm import tqdm +from typing import Dict, List, Optional, Union +from collections import namedtuple + +import numpy as np +import scipy + +from pyserini.encode import QueryEncoder, TokFreqQueryEncoder, UniCoilQueryEncoder, \ + CachedDataQueryEncoder, SpladeQueryEncoder, SlimQueryEncoder +from pyserini.index import Document +from pyserini.pyclass import autoclass, JFloat, JArrayList, JHashMap +from pyserini.util import download_prebuilt_index, download_encoded_corpus + +logger = logging.getLogger(__name__) + +# Wrappers around Anserini classes +JImpactSearcher = autoclass('io.anserini.search.SimpleImpactSearcher') +JImpactSearcherResult = autoclass('io.anserini.search.SimpleImpactSearcher$Result') + + +class LuceneImpactSearcher: + """Wrapper class for ``ImpactSearcher`` in Anserini. + + Parameters + ---------- + index_dir : str + Path to Lucene index directory. + query_encoder: QueryEncoder or str + QueryEncoder to encode query text + """ + + def __init__(self, index_dir: str, query_encoder: Union[QueryEncoder, str], min_idf=0, encoder_type: str='pytorch'): + self.index_dir = index_dir + self.idf = self._compute_idf(index_dir) + self.min_idf = min_idf + self.object = JImpactSearcher(index_dir) + self.num_docs = self.object.get_total_num_docs() + self.encoder_type = encoder_type + self.query_encoder = query_encoder + if encoder_type == 'onnx': + if isinstance(query_encoder, str) or query_encoder is None: + self.object.set_onnx_query_encoder(query_encoder) + else: + raise ValueError(f'Invalid query encoder type: {type(query_encoder)} for onnx encoder') + elif encoder_type == 'pytorch': + if isinstance(query_encoder, str) or query_encoder is None: + self.query_encoder = self._init_query_encoder_from_str(query_encoder) + else: + self.query_encoder = query_encoder + else: + raise ValueError(f'Invalid encoder type: {encoder_type}') + + @classmethod + def from_prebuilt_index(cls, prebuilt_index_name: str, query_encoder: Union[QueryEncoder, str], min_idf=0, encoder_type: str='pytorch'): + """Build a searcher from a pre-built index; download the index if necessary. + + Parameters + ---------- + prebuilt_index_name : str + Prebuilt index name. + query_encoder: QueryEncoder or str + QueryEncoder to encode query text + min_idf : int + Minimum idf for query tokens + encoder_type : str + Encoder type, either 'pytorch' or 'onnx' + + Returns + ------- + LuceneSearcher + Searcher built from the prebuilt index. + """ + print(f'Attempting to initialize pre-built index {prebuilt_index_name}.') + try: + index_dir = download_prebuilt_index(prebuilt_index_name) + except ValueError as e: + print(str(e)) + return None + + print(f'Initializing {prebuilt_index_name}...') + return cls(index_dir, query_encoder, min_idf, encoder_type) + + def encode(self, query): + if self.encoder_type == 'onnx': + encoded_query = self.object.encode_with_onnx(query) + else: + encoded_query = self.query_encoder.encode(query) + return encoded_query + + @staticmethod + def list_prebuilt_indexes(): + """Display information about available prebuilt indexes.""" + print("Not Implemented") + + def search(self, q: str, k: int = 10, fields=dict()) -> List[JImpactSearcherResult]: + """Search the collection. + + Parameters + ---------- + q : str + Query string. + k : int + Number of hits to return. + min_idf : int + Minimum idf for query tokens + fields : dict + Optional map of fields to search with associated boosts. 
+ + Returns + ------- + List[JImpactSearcherResult] + List of search results. + """ + + jfields = JHashMap() + for (field, boost) in fields.items(): + jfields.put(field, JFloat(boost)) + + encoded_query = self.encode(q) + + jquery = encoded_query + if self.encoder_type == 'pytorch': + for (token, weight) in encoded_query.items(): + if token in self.idf and self.idf[token] > self.min_idf: + jquery.put(token, JFloat(weight)) + + if not fields: + hits = self.object.search(jquery, k) + else: + hits = self.object.searchFields(jquery, jfields, k) + + return hits + + def batch_search(self, queries: List[str], qids: List[str], + k: int = 10, threads: int = 1, fields=dict()) -> Dict[str, List[JImpactSearcherResult]]: + """Search the collection concurrently for multiple queries, using multiple threads. + + Parameters + ---------- + queries : List[str] + List of query string. + qids : List[str] + List of corresponding query ids. + k : int + Number of hits to return. + threads : int + Maximum number of threads to use. + min_idf : int + Minimum idf for query tokens + fields : dict + Optional map of fields to search with associated boosts. + + Returns + ------- + Dict[str, List[JImpactSearcherResult]] + Dictionary holding the search results, with the query ids as keys and the corresponding lists of search + results as the values. + """ + query_lst = JArrayList() + qid_lst = JArrayList() + for q in queries: + encoded_query = self.encode(q) + jquery = JHashMap() + if self.encoder_type == 'pytorch': + for (token, weight) in encoded_query.items(): + if token in self.idf and self.idf[token] > self.min_idf: + jquery.put(token, JFloat(weight)) + else: + jquery = encoded_query + query_lst.add(jquery) + + for qid in qids: + jqid = qid + qid_lst.add(jqid) + + jfields = JHashMap() + for (field, boost) in fields.items(): + jfields.put(field, JFloat(boost)) + + if not fields: + results = self.object.batch_search(query_lst, qid_lst, int(k), int(threads)) + else: + results = self.object.batch_search_fields(query_lst, qid_lst, int(k), int(threads), jfields) + return {r.getKey(): r.getValue() for r in results.entrySet().toArray()} + + def doc(self, docid: Union[str, int]) -> Optional[Document]: + """Return the :class:`Document` corresponding to ``docid``. The ``docid`` is overloaded: if it is of type + ``str``, it is treated as an external collection ``docid``; if it is of type ``int``, it is treated as an + internal Lucene ``docid``. Method returns ``None`` if the ``docid`` does not exist in the index. + + Parameters + ---------- + docid : Union[str, int] + Overloaded ``docid``: either an external collection ``docid`` (``str``) or an internal Lucene ``docid`` + (``int``). + + Returns + ------- + Document + :class:`Document` corresponding to the ``docid``. + """ + lucene_document = self.object.document(docid) + if lucene_document is None: + return None + return Document(lucene_document) + + def doc_by_field(self, field: str, q: str) -> Optional[Document]: + """Return the :class:`Document` based on a ``field`` with ``id``. For example, this method can be used to fetch + document based on alternative primary keys that have been indexed, such as an article's DOI. Method returns + ``None`` if no such document exists. + + Parameters + ---------- + field : str + Field to look up. + q : str + Unique id of document. + + Returns + ------- + Document + :class:`Document` whose ``field`` is ``id``. 
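+
+        For example, if articles were indexed with a ``doi`` field, ``searcher.doc_by_field('doi', some_doi)``
+        would return the article whose DOI is ``some_doi`` (a hypothetical example; the field must
+        actually exist in the index).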
+ """ + lucene_document = self.object.documentByField(field, q) + if lucene_document is None: + return None + return Document(lucene_document) + + def close(self): + """Close the searcher.""" + self.object.close() + + @staticmethod + def _init_query_encoder_from_str(query_encoder): + if query_encoder is None: + return TokFreqQueryEncoder() + elif os.path.isfile(query_encoder) and (query_encoder.endswith('jsonl') or query_encoder.encode('json')): + return CachedDataQueryEncoder(query_encoder) + elif 'unicoil' in query_encoder.lower(): + return UniCoilQueryEncoder(query_encoder) + elif 'splade' in query_encoder.lower(): + return SpladeQueryEncoder(query_encoder) + elif 'slim' in query_encoder.lower(): + return SlimQueryEncoder(query_encoder) + + @staticmethod + def _compute_idf(index_path): + from pyserini.index.lucene import IndexReader + index_reader = IndexReader(index_path) + tokens = [] + dfs = [] + for term in index_reader.terms(): + dfs.append(term.df) + tokens.append(term.term) + idfs = np.log((index_reader.stats()['documents'] / (np.array(dfs)))) + return dict(zip(tokens, idfs)) + + +SlimResult = namedtuple("SlimResult", "docid score") + +def maxsim(entry): + q_embed, d_embeds, d_lens, qid, scores, docids = entry + if len(d_embeds) == 0: + return qid, scores, docids + d_embeds = scipy.sparse.vstack(d_embeds).transpose() # (LD x 1000) x D + max_scores = (q_embed@d_embeds).todense() # LQ x (LD x 1000) + scores = [] + start = 0 + for d_len in d_lens: + scores.append(max_scores[:, start:start+d_len].max(1).sum()) + start += d_len + scores, docids = list(zip(*sorted(list(zip(scores, docids)), key=lambda x: -x[0]))) + return qid, scores, docids + +class SlimSearcher(LuceneImpactSearcher): + def __init__(self, encoded_corpus, *args, **kwargs): + super().__init__(*args, **kwargs) + print("Loading sparse corpus vectors for fast reranking...") + with open(os.path.join(encoded_corpus, "sparse_range.pkl"), "rb") as f: + self.sparse_ranges = pickle.load(f) + sparse_vecs = scipy.sparse.load_npz(os.path.join(encoded_corpus, "sparse_vec.npz")) + self.sparse_vecs = [sparse_vecs[start:end] for start, end in tqdm(self.sparse_ranges)] + + @classmethod + def from_prebuilt_index(cls, encoded_corpus:str, prebuilt_index_name: str, query_encoder: Union[QueryEncoder, str], min_idf=0): + print(f'Attempting to initialize pre-built index {prebuilt_index_name}.') + try: + index_dir = download_prebuilt_index(prebuilt_index_name) + encoded_corpus = download_encoded_corpus(encoded_corpus) + except ValueError as e: + print(str(e)) + return None + + print(f'Initializing {prebuilt_index_name}...') + return cls(encoded_corpus, index_dir, query_encoder, min_idf) + + def search(self, q: str, k: int = 10, fields=dict()) -> List[JImpactSearcherResult]: + jfields = JHashMap() + for (field, boost) in fields.items(): + jfields.put(field, JFloat(boost)) + + fusion_encoded_query, sparse_encoded_query = self.query_encoder.encode(q, return_sparse=True) + jquery = JHashMap() + for (token, weight) in fusion_encoded_query.items(): + if token in self.idf and self.idf[token] > self.min_idf: + jquery.put(token, JFloat(weight)) + + if self.sparse_vecs is not None: + search_k = k * (self.min_idf + 1) + if not fields: + hits = self.object.search(jquery, search_k) + else: + hits = self.object.searchFields(jquery, jfields, search_k) + hits = self.fast_rerank([sparse_encoded_query], {0: hits}, k)[0] + return hits + + def batch_search(self, queries: List[str], qids: List[str], + k: int = 10, threads: int = 1, fields=dict()) -> Dict[str, 
List[JImpactSearcherResult]]: + query_lst = JArrayList() + qid_lst = JArrayList() + sparse_encoded_queries = {} + for qid, q in zip(qids, queries): + fusion_encoded_query, sparse_encoded_query = self.query_encoder.encode(q, return_sparse=True) + jquery = JHashMap() + for (token, weight) in fusion_encoded_query.items(): + if token in self.idf and self.idf[token] > self.min_idf: + jquery.put(token, JFloat(weight)) + query_lst.add(jquery) + sparse_encoded_queries[qid] = sparse_encoded_query + + for qid in qids: + jqid = qid + qid_lst.add(jqid) + + jfields = JHashMap() + for (field, boost) in fields.items(): + jfields.put(field, JFloat(boost)) + + if not fields: + results = self.object.batch_search(query_lst, qid_lst, k * (self.min_idf + 1), threads) + else: + results = self.object.batch_search_fields(query_lst, qid_lst, k * (self.min_idf + 1), threads, jfields) + + results = {r.getKey(): r.getValue() for r in results.entrySet().toArray()} + results = self.fast_rerank(sparse_encoded_queries, results, k) + return results + + def fast_rerank(self, q_embeds, results, k): + all_scores = [] + all_docids = [] + all_q_embeds = [] + all_d_embeds = [] + all_d_lens = [] + qids = [] + for qid in results.keys(): + all_q_embeds.append(q_embeds[qid]) + qids.append(qid) + hits = results[qid] + docids = [] + scores = [] + d_embeds = [] + d_lens = [] + for hit in hits: + docids.append(hit.docid) + scores.append(hit.score) + start, end = self.sparse_ranges[int(hit.docid)] + d_embeds.append(self.sparse_vecs[int(hit.docid)]) + d_lens.append(end-start) + all_scores.append(scores) + all_docids.append(docids) + all_d_embeds.append(d_embeds) + all_d_lens.append(d_lens) + + entries = list(zip(all_q_embeds, all_d_embeds, all_d_lens, qids, all_scores, all_docids)) + results = [maxsim(entry) for entry in entries] + anserini_results = {} + for qid, scores, docids in results: + hits = [] + for score, docid in list(zip(scores, docids))[:k]: + hits.append(SlimResult(docid, score)) + anserini_results[qid] = hits + return anserini_results diff --git a/pyserini/search/lucene/_searcher.py b/pyserini/search/lucene/_searcher.py new file mode 100644 index 0000000000000000000000000000000000000000..45677db52f428fc00bca5b75d7dd464cb6d846bc --- /dev/null +++ b/pyserini/search/lucene/_searcher.py @@ -0,0 +1,477 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This module provides Pyserini's Python search interface to Anserini. The main entry point is the ``LuceneSearcher`` +class, which wraps the Java class with the same name in Anserini. 
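+
+A minimal usage sketch; the prebuilt index name is illustrative:
+
+    searcher = LuceneSearcher.from_prebuilt_index('msmarco-v1-passage')
+    hits = searcher.search('what is a lobster roll', k=10)
+    for hit in hits:
+        print(hit.docid, hit.score)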
+""" + +import logging +from typing import Dict, List, Optional, Union + +from pyserini.fusion import FusionMethod, reciprocal_rank_fusion +from pyserini.index import Document, IndexReader +from pyserini.pyclass import autoclass, JFloat, JArrayList, JHashMap +from pyserini.search import JQuery, JQueryGenerator +from pyserini.trectools import TrecRun +from pyserini.util import download_prebuilt_index, get_sparse_indexes_info + +logger = logging.getLogger(__name__) + + +# Wrappers around Anserini classes +JLuceneSearcher = autoclass('io.anserini.search.SimpleSearcher') +JLuceneSearcherResult = autoclass('io.anserini.search.SimpleSearcher$Result') + + +class LuceneSearcher: + """Wrapper class for ``SimpleSearcher`` in Anserini. + + Parameters + ---------- + index_dir : str + Path to Lucene index directory. + """ + + def __init__(self, index_dir: str, prebuilt_index_name=None): + self.index_dir = index_dir + self.object = JLuceneSearcher(index_dir) + self.num_docs = self.object.get_total_num_docs() + # Keep track if self is a known pre-built index. + self.prebuilt_index_name = prebuilt_index_name + + @classmethod + def from_prebuilt_index(cls, prebuilt_index_name: str, verbose=False): + """Build a searcher from a pre-built index; download the index if necessary. + + Parameters + ---------- + prebuilt_index_name : str + Prebuilt index name. + verbose : bool + Print status information. + + Returns + ------- + LuceneSearcher + Searcher built from the prebuilt index. + """ + if verbose: + print(f'Attempting to initialize pre-built index {prebuilt_index_name}.') + + try: + index_dir = download_prebuilt_index(prebuilt_index_name, verbose=verbose) + except ValueError as e: + print(str(e)) + return None + + # Currently, the only way to validate stats is to create a separate IndexReader, because there is no method + # to obtain the underlying reader of a SimpleSearcher; see https://github.com/castorini/anserini/issues/2013 + index_reader = IndexReader(index_dir) + # This is janky as we're created a separate IndexReader for the sole purpose of validating index stats. + index_reader.validate(prebuilt_index_name, verbose=verbose) + + if verbose: + print(f'Initializing {prebuilt_index_name}...') + + return cls(index_dir, prebuilt_index_name=prebuilt_index_name) + + @staticmethod + def list_prebuilt_indexes(): + """Display information about available prebuilt indexes.""" + get_sparse_indexes_info() + + def search(self, q: Union[str, JQuery], k: int = 10, query_generator: JQueryGenerator = None, + fields=dict(), strip_segment_id=False, remove_dups=False) -> List[JLuceneSearcherResult]: + """Search the collection. + + Parameters + ---------- + q : Union[str, JQuery] + Query string or the ``JQuery`` objected. + k : int + Number of hits to return. + query_generator : JQueryGenerator + Generator to build queries. Set to ``None`` by default to use Anserini default. + fields : dict + Optional map of fields to search with associated boosts. + strip_segment_id : bool + Remove the .XXXXX suffix used to denote different segments from an document. + remove_dups : bool + Remove duplicate docids when writing final run output. + + Returns + ------- + List[JLuceneSearcherResult] + List of search results. 
+ """ + + jfields = JHashMap() + for (field, boost) in fields.items(): + jfields.put(field, JFloat(boost)) + + hits = None + if query_generator: + if not fields: + hits = self.object.search(query_generator, q, k) + else: + hits = self.object.searchFields(query_generator, q, jfields, k) + elif isinstance(q, JQuery): + # Note that RM3 requires the notion of a query (string) to estimate the appropriate models. If we're just + # given a Lucene query, it's unclear what the "query" is for this estimation. One possibility is to extract + # all the query terms from the Lucene query, although this might yield unexpected behavior from the user's + # perspective. Until we think through what exactly is the "right thing to do", we'll raise an exception + # here explicitly. + if self.is_using_rm3(): + raise NotImplementedError('RM3 incompatible with search using a Lucene query.') + if fields: + raise NotImplementedError('Cannot specify fields to search when using a Lucene query.') + hits = self.object.search(q, k) + else: + if not fields: + hits = self.object.search(q, k) + else: + hits = self.object.search_fields(q, jfields, k) + + docids = set() + filtered_hits = [] + + for hit in hits: + if strip_segment_id is True: + hit.docid = hit.docid.split('.')[0] + + if hit.docid in docids: + continue + + filtered_hits.append(hit) + + if remove_dups is True: + docids.add(hit.docid) + + return filtered_hits + + def batch_search(self, queries: List[str], qids: List[str], k: int = 10, threads: int = 1, + query_generator: JQueryGenerator = None, fields = dict()) -> Dict[str, List[JLuceneSearcherResult]]: + """Search the collection concurrently for multiple queries, using multiple threads. + + Parameters + ---------- + queries : List[str] + List of query strings. + qids : List[str] + List of corresponding query ids. + k : int + Number of hits to return. + threads : int + Maximum number of threads to use. + query_generator : JQueryGenerator + Generator to build queries. Set to ``None`` by default to use Anserini default. + fields : dict + Optional map of fields to search with associated boosts. + + Returns + ------- + Dict[str, List[JLuceneSearcherResult]] + Dictionary holding the search results, with the query ids as keys and the corresponding lists of search + results as the values. + """ + query_strings = JArrayList() + qid_strings = JArrayList() + for query in queries: + query_strings.add(query) + + for qid in qids: + qid_strings.add(qid) + + jfields = JHashMap() + for (field, boost) in fields.items(): + jfields.put(field, JFloat(boost)) + + if query_generator: + if not fields: + results = self.object.batch_search(query_generator, query_strings, qid_strings, int(k), int(threads)) + else: + results = self.object.batch_search_fields(query_generator, query_strings, qid_strings, int(k), int(threads), jfields) + else: + if not fields: + results = self.object.batch_search(query_strings, qid_strings, int(k), int(threads)) + else: + results = self.object.batch_search_fields(query_strings, qid_strings, int(k), int(threads), jfields) + return {r.getKey(): r.getValue() for r in results.entrySet().toArray()} + + def get_feedback_terms(self, q: str) -> Dict[str, float]: + """Returns feedback terms and their weights. + + Parameters + ---------- + q : str + Query string or the ``JQuery`` objected. + + Returns + ------- + Dict[str, float] + Feedback terms and their weights. 
+ """ + + terms_map = self.object.get_feedback_terms(q) + if terms_map: + return {r.getKey(): r.getValue() for r in terms_map.entrySet().toArray()} + else: + return None + + def set_analyzer(self, analyzer): + """Set the Java ``Analyzer`` to use. + + Parameters + ---------- + analyzer : JAnalyzer + Java ``Analyzer`` object. + """ + self.object.set_analyzer(analyzer) + + def set_language(self, language): + """Set language of LuceneSearcher""" + self.object.set_language(language) + + def set_rm3(self, fb_terms=10, fb_docs=10, original_query_weight=float(0.5), debug=False, filter_terms=True): + """Configure RM3 pseudo-relevance feedback. + + Parameters + ---------- + fb_terms : int + RM3 parameter for number of expansion terms. + fb_docs : int + RM3 parameter for number of expansion documents. + original_query_weight : float + RM3 parameter for weight to assign to the original query. + debug : bool + Print the original and expanded queries as debug output. + filter_terms: bool + Whether to remove non-English terms. + """ + if self.object.reader.getTermVectors(0): + self.object.set_rm3(None, fb_terms, fb_docs, original_query_weight, debug, filter_terms) + elif self.prebuilt_index_name in ['msmarco-v1-passage', 'msmarco-v1-doc', 'msmarco-v1-doc-segmented']: + self.object.set_rm3('JsonCollection', fb_terms, fb_docs, original_query_weight, debug, filter_terms) + elif self.prebuilt_index_name in ['msmarco-v2-passage', 'msmarco-v2-passage-augmented']: + self.object.set_rm3('MsMarcoV2PassageCollection', fb_terms, fb_docs, original_query_weight, debug, filter_terms) + elif self.prebuilt_index_name in ['msmarco-v2-doc', 'msmarco-v2-doc-segmented']: + self.object.set_rm3('MsMarcoV2DocCollection', fb_terms, fb_docs, original_query_weight, debug, filter_terms) + else: + raise TypeError("RM3 is not supported for indexes without document vectors.") + + def unset_rm3(self): + """Disable RM3 pseudo-relevance feedback.""" + self.object.unset_rm3() + + def is_using_rm3(self) -> bool: + """Check if RM3 pseudo-relevance feedback is being performed.""" + return self.object.use_rm3() + + def set_rocchio(self, top_fb_terms=10, top_fb_docs=10, bottom_fb_terms=10, bottom_fb_docs=10, + alpha=1, beta=0.75, gamma=0, debug=False, use_negative=False): + """Configure Rocchio pseudo-relevance feedback. + + Parameters + ---------- + top_fb_terms : int + Rocchio parameter for number of relevant expansion terms. + top_fb_docs : int + Rocchio parameter for number of relevant expansion documents. + bottom_fb_terms : int + Rocchio parameter for number of non-relevant expansion terms. + bottom_fb_docs : int + Rocchio parameter for number of non-relevant expansion documents. + alpha : float + Rocchio parameter for weight to assign to the original query. + beta: float + Rocchio parameter for weight to assign to the relevant document vector. + gamma: float + Rocchio parameter for weight to assign to the nonrelevant document vector. + debug : bool + Print the original and expanded queries as debug output. + use_negative : bool + Rocchio parameter to use negative labels. 
+ """ + if self.object.reader.getTermVectors(0): + self.object.set_rocchio(None, top_fb_terms, top_fb_docs, bottom_fb_terms, bottom_fb_docs, + alpha, beta, gamma, debug, use_negative) + elif self.prebuilt_index_name in ['msmarco-v1-passage', 'msmarco-v1-doc', 'msmarco-v1-doc-segmented']: + self.object.set_rocchio('JsonCollection', top_fb_terms, top_fb_docs, bottom_fb_terms, bottom_fb_docs, + alpha, beta, gamma, debug, use_negative) + # Note, we don't have any Pyserini 2CRs that use Rocchio for MS MARCO v2, so there's currently no + # corresponding code branch here. To avoid introducing bugs (without 2CR tests), we'll add when it's needed. + else: + raise TypeError("Rocchio is not supported for indexes without document vectors.") + + def unset_rocchio(self): + """Disable Rocchio pseudo-relevance feedback.""" + self.object.unset_rocchio() + + def is_using_rocchio(self) -> bool: + """Check if Rocchio pseudo-relevance feedback is being performed.""" + return self.object.use_rocchio() + + def set_qld(self, mu=float(1000)): + """Configure query likelihood with Dirichlet smoothing as the scoring function. + + Parameters + ---------- + mu : float + Dirichlet smoothing parameter mu. + """ + self.object.set_qld(float(mu)) + + def set_bm25(self, k1=float(0.9), b=float(0.4)): + """Configure BM25 as the scoring function. + + Parameters + ---------- + k1 : float + BM25 k1 parameter. + b : float + BM25 b parameter. + """ + self.object.set_bm25(float(k1), float(b)) + + def get_similarity(self): + """Return the Lucene ``Similarity`` used as the scoring function.""" + return self.object.get_similarity() + + def doc(self, docid: Union[str, int]) -> Optional[Document]: + """Return the :class:`Document` corresponding to ``docid``. The ``docid`` is overloaded: if it is of type + ``str``, it is treated as an external collection ``docid``; if it is of type ``int``, it is treated as an + internal Lucene ``docid``. Method returns ``None`` if the ``docid`` does not exist in the index. + + Parameters + ---------- + docid : Union[str, int] + Overloaded ``docid``: either an external collection ``docid`` (``str``) or an internal Lucene ``docid`` + (``int``). + + Returns + ------- + Document + :class:`Document` corresponding to the ``docid``. + """ + lucene_document = self.object.doc(docid) + if lucene_document is None: + return None + return Document(lucene_document) + + def batch_doc(self, docids: List[str], threads: int) -> Dict[str, Document]: + """Concurrently fetching documents for multiple document ids. + Return dictionary that maps ``docid`` to :class:`Document`. Returned dictionary does not + contain ``docid`` if a corresponding :class:`Document` does not exist in the index. + + Parameters + ---------- + docids : List[str] + An external collection ``docid`` (``str``). + threads : int + Maximum number of threads to use. + + Returns + ------- + Dict[str, Document] + Dictionary mapping the ``docid`` to the corresponding :class:`Document`. + """ + docid_strings = JArrayList() + for docid in docids: + docid_strings.add(docid) + + results = self.object.batch_get_docs(docid_strings, threads) + batch_document = {r.getKey(): Document(r.getValue()) + for r in results.entrySet().toArray()} + return batch_document + + def doc_by_field(self, field: str, q: str) -> Optional[Document]: + """Return the :class:`Document` based on a ``field`` with ``id``. For example, this method can be used to fetch + document based on alternative primary keys that have been indexed, such as an article's DOI. 
Method returns + ``None`` if no such document exists. + + Parameters + ---------- + field : str + Field to look up. + q : str + Unique id of document. + + Returns + ------- + Document + :class:`Document` whose ``field`` is ``id``. + """ + lucene_document = self.object.doc_by_field(field, q) + if lucene_document is None: + return None + return Document(lucene_document) + + def close(self): + """Close the searcher.""" + self.object.close() + + +class LuceneSimilarities: + @staticmethod + def bm25(k1=0.9, b=0.4): + return autoclass('org.apache.lucene.search.similarities.BM25Similarity')(k1, b) + + @staticmethod + def qld(mu=1000): + return autoclass('org.apache.lucene.search.similarities.LMDirichletSimilarity')(mu) + + +class LuceneFusionSearcher: + def __init__(self, index_dirs: List[str], method: FusionMethod): + self.method = method + self.searchers = [LuceneSearcher(index_dir) for index_dir in index_dirs] + + def get_searchers(self) -> List[LuceneSearcher]: + return self.searchers + + def search(self, q: Union[str, JQuery], k: int = 10, query_generator: JQueryGenerator = None, strip_segment_id=False, remove_dups=False) -> List[JLuceneSearcherResult]: + trec_runs, docid_to_search_result = list(), dict() + + for searcher in self.searchers: + docid_score_pair = list() + hits = searcher.search(q, k=k, query_generator=query_generator, + strip_segment_id=strip_segment_id, remove_dups=remove_dups) + + for hit in hits: + docid_to_search_result[hit.docid] = hit + docid_score_pair.append((hit.docid, hit.score)) + + run = TrecRun.from_search_results(docid_score_pair) + trec_runs.append(run) + + if self.method == FusionMethod.RRF: + fused_run = reciprocal_rank_fusion(trec_runs, rrf_k=60, depth=1000, k=1000) + else: + raise NotImplementedError() + + return self.convert_to_search_result(fused_run, docid_to_search_result) + + @staticmethod + def convert_to_search_result(run: TrecRun, docid_to_search_result: Dict[str, JLuceneSearcherResult]) -> List[JLuceneSearcherResult]: + search_results = [] + + for _, _, docid, _, score, _ in run.to_numpy(): + search_result = docid_to_search_result[docid] + search_result.score = score + search_results.append(search_result) + + return search_results diff --git a/pyserini/search/lucene/irst/__init__.py b/pyserini/search/lucene/irst/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..463ef3d6224259ae0c5e01f4f16a2a1e81da61ba --- /dev/null +++ b/pyserini/search/lucene/irst/__init__.py @@ -0,0 +1,18 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
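For reference, here is a minimal usage sketch of the LuceneSearcher API defined above. The prebuilt index name 'msmarco-v1-passage' and the query string are illustrative assumptions; any prebuilt or local Lucene index works the same way, and batched retrieval follows the batch_search signature shown above.

from pyserini.search.lucene import LuceneSearcher

# Build a searcher from a prebuilt index (downloaded and cached on first use);
# 'msmarco-v1-passage' is an assumed example index name.
searcher = LuceneSearcher.from_prebuilt_index('msmarco-v1-passage')
searcher.set_bm25(k1=0.9, b=0.4)  # BM25 scoring with the default parameters.

hits = searcher.search('what is a lobster roll?', k=10)
for i, hit in enumerate(hits):
    print(f'{i + 1:2} {hit.docid:15} {hit.score:.5f}')

# Fetch the stored document for the top hit (returns None if the docid is unknown).
doc = searcher.doc(hits[0].docid)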
+# + +from ._searcher import LuceneIrstSearcher +__all__ = ['LuceneIrstSearcher'] diff --git a/pyserini/search/lucene/irst/__main__.py b/pyserini/search/lucene/irst/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..27ed16b4a7933279081d4aa6756ddd3151e24312 --- /dev/null +++ b/pyserini/search/lucene/irst/__main__.py @@ -0,0 +1,162 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import argparse +from typing import List +from tqdm import tqdm +from transformers import AutoTokenizer +from pyserini.search import get_topics +from pyserini.search.lucene.irst import LuceneIrstSearcher + + +def normalize(scores: List[float]): + low = min(scores) + high = max(scores) + width = high - low + if width != 0: + return [(s-low)/width for s in scores] + return scores + + +def query_loader(topic: str): + queries = {} + bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + topics_dic = get_topics(topic) + line_num = 0 + for topic_id in topics_dic: + line_num += 1 + query_text = topics_dic[topic_id]['title'] + text_bert_tok = bert_tokenizer.tokenize(query_text.lower()) + if len(text_bert_tok) >= 0: + query = {"raw": query_text, + "contents": ' '.join(text_bert_tok)} + queries[topic_id] = query + if line_num % 10000 == 0: + print(f"Processed {line_num} queries") + print(f"Processed {line_num} queries") + return queries + + +def baseline_loader(base_path: str): + result_dic = {} + with open(base_path, 'r') as f: + for line in f: + tokens = line.strip().split() + topic = tokens[0] + doc_id = tokens[2] + score = float(tokens[-2]) + if topic in result_dic.keys(): + result_dic[topic][0].append(doc_id) + result_dic[topic][1].append(score) + else: + result_dic[topic] = [[doc_id], [score]] + + return result_dic + + +def generate_maxP(preds: List[float], docs: List[str]): + scores = {} + for index, (score, doc_id) in enumerate(zip(preds, docs)): + docid = doc_id.split('#')[0] + if (docid not in scores or score > scores[docid]): + scores[docid] = score + docid_scores = sorted(scores.items(), key=lambda kv: kv[1], reverse=True) + return docid_scores + + +def sort_dual_list(pred: List[float], docs: List[str]): + zipped_lists = zip(pred, docs) + sorted_pairs = sorted(zipped_lists) + + tuples = zip(*sorted_pairs) + pred, docs = [list(tuple) for tuple in tuples] + + pred.reverse() + docs.reverse() + return pred, docs + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='use ibm model 1 feature to rerank the base run file') + parser.add_argument('--tag', type=str, default="ibm", + metavar="tag_name", help='tag name for resulting Qrun') + parser.add_argument('--base-path', type=str, required=False, + metavar="path_to_base_run", help='path to base run') + parser.add_argument('--topics', type=str, required=True, + help='existing topics name or path to query topics') + parser.add_argument('--index', type=str, required=True, + metavar="path_to_lucene_index", help='path to 
lucene index folder') + parser.add_argument('--output', type=str, required=True, + metavar="path_to_reranked_run", help='the path to store reranked run file') + parser.add_argument('--alpha', type=float, default="0.3", + metavar="type of field", help='interpolation weight') + parser.add_argument('--num-threads', type=int, default="24", + metavar="num_of_threads", help='number of threads to use') + parser.add_argument('--max-sim', default=False, action="store_true", + help='whether we use max sim operator or avg instead') + parser.add_argument('--segments', default=False, action="store_true", + help='whether we use segmented index or not') + parser.add_argument('--k1', type=float, default="0.81", + metavar="bm25_k1_parameter", help='k1 parameter for bm25 search') + parser.add_argument('--b', type=float, default="0.68", + metavar="bm25_b_parameter", help='b parameter for bm25 search') + parser.add_argument('--hits', type=int, metavar='number of hits generated in runfile', + required=False, default=1000, help="Number of hits.") + args = parser.parse_args() + + print('Using max sim operator or not:', args.max_sim) + + f = open(args.output, 'w') + + reranker = LuceneIrstSearcher(args.index, args.k1, args.b, args.num_threads) + queries = query_loader(args.topics) + query_text_lst = [queries[topic]['raw'] for topic in queries.keys()] + qid_lst = [str(topic) for topic in queries.keys()] + i = 0 + for topic in queries: + if i % 100 == 0: + print(f'Reranking {i} topic') + query_text_field = queries[topic]['contents'] + query_text = queries[topic]['raw'] + bm25_results = reranker.bm25search.search(query_text, args.hits) + if args.base_path: + baseline_dic = baseline_loader(args.base_path) + docids, rank_scores, base_scores = reranker.rerank( + query_text, query_text_field, baseline_dic[topic], args.max_sim, bm25_results) + else: + docids, rank_scores, base_scores = reranker.search( + query_text, query_text_field, args.max_sim, bm25_results) + ibm_scores = normalize([p for p in rank_scores]) + base_scores = normalize([p for p in base_scores]) + + interpolated_scores = [ + a * args.alpha + b * (1-args.alpha) for a, b in zip(base_scores, ibm_scores)] + + preds, docs = sort_dual_list(interpolated_scores, docids) + i = i+1 + if args.segments: + docid_scores = generate_maxP(preds, docs) + rank = 1 + for doc_id, score in docid_scores: + if rank > 1000: + break + f.write(f'{topic} Q0 {doc_id} {rank} {score} {args.tag}\n') + rank = rank + 1 + else: + for index, (score, doc_id) in enumerate(zip(preds, docs)): + rank = index + 1 + f.write(f'{topic} Q0 {doc_id} {rank} {score} {args.tag}\n') + f.close() diff --git a/pyserini/search/lucene/irst/_searcher.py b/pyserini/search/lucene/irst/_searcher.py new file mode 100644 index 0000000000000000000000000000000000000000..85a69ad2cd475b40510a22ccf3bb17b921f4050e --- /dev/null +++ b/pyserini/search/lucene/irst/_searcher.py @@ -0,0 +1,288 @@ +# +# Pyserini: Python interface to the Anserini IR toolkit built on Lucene +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
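To tie the pieces together, the reranking entry point above might be driven end to end roughly as follows. The topics key 'msmarco-passage-dev-subset' and the output path are assumptions for illustration, not values prescribed by this script; the remaining flags fall back to the defaults declared above.

import subprocess

# Hypothetical end-to-end invocation of the IRST reranker defined above;
# adjust --topics, --index, and --output to your own setup.
cmd = ('python -m pyserini.search.lucene.irst '
       '--topics msmarco-passage-dev-subset '
       '--index msmarco-v1-passage '
       '--output runs/run.irst.msmarco-passage.dev.txt '
       '--hits 1000')
subprocess.run(cmd.split(), check=True)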
+# + +""" +This module provides Pyserini's Python translation probability search +interface on MS MARCO dataset. The main entry point is the +``TranslationProbabilitySearcher`` class. +""" + +import json +import math +import os +import pickle +import struct +from multiprocessing.pool import ThreadPool +from typing import Dict + +from transformers import AutoTokenizer + +from pyserini.pyclass import autoclass +from pyserini.search.lucene import LuceneSearcher +from pyserini.util import download_prebuilt_index, get_cache_home, download_url, download_and_unpack_index +from pyserini.prebuilt_index_info import TF_INDEX_INFO + +# Wrappers around Anserini classes +JQuery = autoclass('org.apache.lucene.search.Query') +JLuceneSearcher = autoclass('io.anserini.search.SimpleSearcher') +JIndexReader = autoclass('io.anserini.index.IndexReaderUtils') +JTerm = autoclass('org.apache.lucene.index.Term') + + +class LuceneIrstSearcher(object): + SELF_TRAN = 0.35 + MIN_PROB = 0.0025 + LAMBDA_VALUE = 0.3 + MIN_COLLECT_PROB = 1e-9 + + def __init__(self, index: str, k1: int, b: int, num_threads: int): + translation_url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-models/ibm_model_1_bert_tok_20211117.tar.gz' + translation_directory = os.path.join(get_cache_home(), 'models') + self.termfreq_dic = self.download_and_load_wp_stats(index) + # This is used to download and unpack translation model instead of index, we use the function (download_and_unpack_index) for convenience. + self.translation_model = download_and_unpack_index(translation_url, translation_directory) + self.bm25search = LuceneSearcher.from_prebuilt_index(index) + self.bm25search.set_bm25(k1, b) + index_directory = os.path.join(get_cache_home(), 'indexes') + if index == 'msmarco-v1-passage': + index_path = os.path.join(index_directory, + TF_INDEX_INFO['msmarco-v1-passage']['filename'][:-6] + + TF_INDEX_INFO['msmarco-v1-passage']['md5']) + elif index == 'msmarco-v1-doc': + index_path = os.path.join(index_directory, + TF_INDEX_INFO['msmarco-v1-doc']['filename'][:-6] + + TF_INDEX_INFO['msmarco-v1-doc']['md5']) + elif index == 'msmarco-v1-doc-segmented': + index_path = os.path.join(index_directory, + TF_INDEX_INFO['msmarco-v1-doc-segmented']['filename'][:-6] + + TF_INDEX_INFO['msmarco-v1-doc-segmented']['md5']) + else: + print("We currently only support three indexes: msmarco-passage, msmarco-v1-doc and msmarco-v1-doc-segmented but the index you inserted is not one of those") + self.object = JLuceneSearcher(index_path) + self.source_lookup, self.target_lookup, self.tran = self.load_tranprobs_table() + self.bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + self.pool = ThreadPool(num_threads) + + + @classmethod + def from_prebuilt_index(cls, prebuilt_index_name: str): + """Build a searcher from a pre-built index; download the index if necessary. + + Parameters + ---------- + prebuilt_index_name : str + Prebuilt index name. + + Returns + ------- + LuceneSearcher + Searcher built from the prebuilt index. 
+ """ + print(f'Attempting to initialize pre-built index {prebuilt_index_name}.') + try: + index_dir = download_prebuilt_index(prebuilt_index_name) + except ValueError as e: + print(str(e)) + return None + + print(f'Initializing {prebuilt_index_name}...') + return cls(index_dir) + + def download_and_load_wp_stats(self, index: str): + translation_directory = os.path.join(get_cache_home(), 'models') + if not os.path.exists(translation_directory): + os.makedirs(translation_directory) + if (index == 'msmarco-v1-passage'): + local_filename = 'bert_wp_term_freq.msmarco-passage.20220411.pickle' + wp_stats_path = os.path.join(translation_directory, local_filename) + url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/bert_wp_term_freq.msmarco-passage.20220411.pickle' + elif (index == 'msmarco-v1-doc'): + local_filename = 'bert_wp_term_freq.msmarco-doc.20220411.pickle' + wp_stats_path = os.path.join(translation_directory, local_filename) + url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/bert_wp_term_freq.msmarco-doc.20220411.pickle' + elif (index == 'msmarco-v1-doc-segmented'): + local_filename = 'bert_wp_term_freq.msmarco-doc-segmented.20220411.pickle' + wp_stats_path = os.path.join(translation_directory, local_filename) + url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/bert_wp_term_freq.msmarco-doc-segmented.20220411.pickle' + + if os.path.exists(wp_stats_path): + print(f'{wp_stats_path} already exists, skipping download.') + else: + download_url(url, translation_directory, local_filename) + with open(wp_stats_path, 'rb') as fin: + termfreq_dic = pickle.load(fin) + return termfreq_dic + + @staticmethod + def intbits_to_float(b: bytes): + s = struct.pack('>l', b) + return struct.unpack('>f', s)[0] + + def rescale( + self, source_lookup: Dict[str, int], target_lookup: Dict[str, int], + tran_lookup: Dict[str, Dict[str, float]], + target_voc: Dict[int, str], source_voc: Dict[int, str] + ): + + for target_id in tran_lookup: + if target_id > 0: + adjust_mult = (1 - self.SELF_TRAN) + else: + adjust_mult = 1 + # adjust the prob with adjust_mult + # add SELF_TRAN prob to self-translation pair + for source_id in tran_lookup[target_id].keys(): + tran_prob = tran_lookup[target_id][source_id] + if source_id > 0: + source_word = source_voc[source_id] + target_word = target_voc[target_id] + tran_prob *= adjust_mult + if (source_word == target_word): + tran_prob += self.SELF_TRAN + tran_lookup[target_id][source_id] = tran_prob + # in case if self-translation pair was not included in TransTable + if target_id not in tran_lookup[target_id].keys(): + target_word = target_voc[target_id] + source_id = source_lookup[target_word] + tran_lookup[target_id][source_id] = self.SELF_TRAN + return source_lookup, target_lookup, tran_lookup + + def load_tranprobs_table(self): + dir_path = self.translation_model + source_path = dir_path + "/source.vcb" + source_lookup = {} + source_voc = {} + with open(source_path) as f: + lines = f.readlines() + for line in lines: + id, voc, freq = line.split(" ") + source_voc[int(id)] = voc + source_lookup[voc] = int(id) + + target_path = dir_path + "/target.vcb" + target_lookup = {} + target_voc = {} + with open(target_path) as f: + lines = f.readlines() + for line in lines: + id, voc, freq = line.split(" ") + target_voc[int(id)] = voc + target_lookup[voc] = int(id) + tran_path = dir_path + "/output.t1.5.bin" + tran_lookup = {} + with open(tran_path, "rb") as file: + byte = file.read(4) + while byte: + source_id = int.from_bytes(byte, "big") + assert(source_id == 0 or 
source_id in source_voc.keys()) + byte = file.read(4) + target_id = int.from_bytes(byte, "big") + assert(target_id in target_voc.keys()) + byte = file.read(4) + tran_prob = self.intbits_to_float(int.from_bytes(byte, "big")) + if (target_id in tran_lookup.keys()) and (tran_prob > self.MIN_PROB): + tran_lookup[target_id][source_id] = tran_prob + elif tran_prob > self.MIN_PROB: + tran_lookup[target_id] = {} + tran_lookup[target_id][source_id] = tran_prob + byte = file.read(4) + return self.rescale( + source_lookup, target_lookup, + tran_lookup, target_voc, source_voc) + + def get_ibm_score(self, arguments): + (query_text_lst, test_doc, searcher, source_lookup, + target_lookup, tran, collect_probs, max_sim) = arguments + + if searcher.doc_raw(test_doc) is None: + print(f"{test_doc} is not found in searcher") + contents = json.loads(self.object.doc_raw(test_doc))['contents'] + doc_token_lst = self.bert_tokenizer.tokenize(contents.lower(), truncation=True) + total_query_prob = 0 + doc_size = len(doc_token_lst) + query_size = len(query_text_lst) + for querytoken in query_text_lst: + target_map = {} + total_tran_prob = 0 + collect_prob = collect_probs[querytoken] + max_sim_score = 0 + if querytoken in target_lookup.keys(): + query_word_id = target_lookup[querytoken] + if query_word_id in tran.keys(): + target_map = tran[query_word_id] + for doctoken in doc_token_lst: + tran_prob = 0 + doc_word_id = 0 + if doctoken in source_lookup.keys(): + doc_word_id = source_lookup[doctoken] + if doc_word_id in target_map.keys(): + tran_prob = max(target_map[doc_word_id], tran_prob) + max_sim_score = max(tran_prob, max_sim_score) + total_tran_prob += (tran_prob/doc_size) + if max_sim: + query_word_prob = math.log( + (1 - self.LAMBDA_VALUE) * max_sim_score + self.LAMBDA_VALUE * collect_prob) + else: + query_word_prob = math.log( + (1 - self.LAMBDA_VALUE) * total_tran_prob + self.LAMBDA_VALUE * collect_prob) + + total_query_prob += query_word_prob + return total_query_prob / query_size + + def search(self, query_text, query_field_text, max_sim, bm25_results): + origin_scores = [bm25_result.score for bm25_result in bm25_results] + test_docs = [bm25_result.docid for bm25_result in bm25_results] + if (test_docs == []): + print(query_text) + + query_field_text_lst = query_field_text.split(' ') + total_term_freq = self.termfreq_dic['TOTAL'] + collect_probs = {} + for querytoken in query_field_text_lst: + if querytoken in self.termfreq_dic: + collect_probs[querytoken] = max(self.termfreq_dic[querytoken] / total_term_freq, self.MIN_COLLECT_PROB) + else: + collect_probs[querytoken] = self.MIN_COLLECT_PROB + arguments = [( + query_field_text_lst, test_doc, self.object, + self.source_lookup, self.target_lookup, + self.tran, collect_probs, max_sim) + for test_doc in test_docs] + + rank_scores = self.pool.map(self.get_ibm_score, arguments) + return test_docs, rank_scores, origin_scores + + def rerank(self, query_text, query_field_text, baseline, max_sim, tf_table): + test_docs, origin_scores = baseline + if (test_docs == []): + print(query_text) + + query_field_text_lst = query_field_text.split(' ') + total_term_freq = self.termfreq_dic['TOTAL'] + collect_probs = {} + for querytoken in query_field_text_lst: + if querytoken in self.termfreq_dic: + collect_probs[querytoken] = max(self.termfreq_dic[querytoken] / total_term_freq, self.MIN_COLLECT_PROB) + else: + collect_probs[querytoken] = self.MIN_COLLECT_PROB + arguments = [( + query_field_text_lst, test_doc, self.object, + self.source_lookup, self.target_lookup, + self.tran, 
collect_probs, max_sim) + for test_doc in test_docs] + + rank_scores = self.pool.map(self.get_ibm_score, arguments) + return test_docs, rank_scores, origin_scores diff --git a/pyserini/search/lucene/ltr/__init__.py b/pyserini/search/lucene/ltr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ad82edd5febc24513b48efd4de59aba900660b02 --- /dev/null +++ b/pyserini/search/lucene/ltr/__init__.py @@ -0,0 +1,30 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from ._base import FeatureExtractor, BM25Stat, LmDirStat, DfrGl2Stat, DfrInExpB2Stat, DphStat, Proximity, TpScore, TpDist,\ + DocSize, MatchingTermCount, QueryLength, SCS, SumMatchingTF, UniqueTermCount, QueryCoverageRatio, \ + UnorderedSequentialPairs, OrderedSequentialPairs, UnorderedQueryPairs, OrderedQueryPairs, \ + AvgPooler, SumPooler, MedianPooler, MinPooler, MaxPooler, VarPooler, TfStat, TfIdfStat, NormalizedTfStat, \ + IdfStat, IcTfStat, ConfidencePooler, MaxMinRatioPooler, \ + NormalizedTfIdf, ProbalitySum, RunList, IbmModel1, SpacyTextParser +from ._search_msmarco import MsmarcoLtrSearcher + +__all__ = ['FeatureExtractor', 'BM25Stat', 'LmDirStat', 'DfrGl2Stat', 'DfrInExpB2Stat', 'DphStat', 'Proximity', 'TpScore', 'TpDist', + 'DocSize', 'MatchingTermCount', 'QueryLength', 'SCS', 'SumMatchingTF', 'UniqueTermCount', 'QueryCoverageRatio', + 'UnorderedSequentialPairs', 'OrderedSequentialPairs', 'UnorderedQueryPairs', 'OrderedQueryPairs', + 'AvgPooler', 'SumPooler', 'MedianPooler', 'MinPooler', 'MaxPooler', 'VarPooler', 'TfStat', 'TfIdfStat', + 'NormalizedTfStat','IdfStat', 'IcTfStat', 'ConfidencePooler', 'MaxMinRatioPooler','NormalizedTfIdf', + 'ProbalitySum', 'RunList', 'IbmModel1', 'MsmarcoLtrSearcher','SpacyTextParser'] \ No newline at end of file diff --git a/pyserini/search/lucene/ltr/__main__.py b/pyserini/search/lucene/ltr/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..1278487c38acb283db70f4d285fa9d06566957a1 --- /dev/null +++ b/pyserini/search/lucene/ltr/__main__.py @@ -0,0 +1,297 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys + +# We're going to explicitly use a local installation of Pyserini (as opposed to a pip-installed one). +# Comment these lines out to use a pip-installed one instead. 
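At its core, get_ibm_score above interpolates an IBM Model 1 translation probability with a smoothed collection language-model probability and averages the log scores over query tokens. A simplified pure-Python sketch of the non-max-sim path (string tokens instead of vocabulary ids, no thread pool) looks like this:

import math

def ibm_model1_score(query_tokens, doc_tokens, tran, collect_prob, lambda_value=0.3):
    # tran[q][d] holds the translation probability P(q | d) for query token q and
    # document token d; collect_prob[q] is the smoothed collection probability of q;
    # lambda_value mirrors LAMBDA_VALUE in LuceneIrstSearcher.
    doc_len = len(doc_tokens)
    total = 0.0
    for q in query_tokens:
        # Average translation probability of q over the document's tokens.
        p_translate = sum(tran.get(q, {}).get(d, 0.0) for d in doc_tokens) / doc_len
        total += math.log((1 - lambda_value) * p_translate + lambda_value * collect_prob[q])
    return total / len(query_tokens)

The max-sim variant simply replaces the averaged p_translate with the single best translation probability over the document's tokens.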
+sys.path.insert(0, './') + +import argparse +import numpy as np +import pandas as pd +from tqdm import tqdm +from collections import defaultdict +from transformers import AutoTokenizer +from pyserini.search.lucene.ltr._search_msmarco import MsmarcoLtrSearcher +from pyserini.search.lucene.ltr import * +from pyserini.search.lucene import LuceneSearcher +from pyserini.analysis import Analyzer, get_lucene_analyzer + +""" +Running prediction on candidates +""" +def dev_data_loader(file, format, topic, rerank, prebuilt, qrel, granularity, top=1000): + if rerank: + if format == 'tsv': + dev = pd.read_csv(file, sep="\t", + names=['qid', 'pid', 'rank'], + dtype={'qid': 'S','pid': 'S', 'rank':'i',}) + elif format == 'trec': + dev = pd.read_csv(file, sep="\s+", + names=['qid', 'q0', 'pid', 'rank', 'score', 'tag'], + usecols=['qid', 'pid', 'rank'], + dtype={'qid': 'S','pid': 'S', 'rank':'i',}) + else: + raise Exception('unknown parameters') + assert dev['qid'].dtype == object + assert dev['pid'].dtype == object + assert dev['rank'].dtype == np.int32 + dev = dev[dev['rank']<=top] + else: + if prebuilt: + bm25search = LuceneSearcher.from_prebuilt_index(args.index) + else: + bm25search = LuceneSearcher(args.index) + bm25search.set_bm25(0.82, 0.68) + dev_dic = {"qid":[], "pid":[], "rank":[]} + for topic in tqdm(queries.keys()): + query_text = queries[topic]['raw'] + bm25_dev = bm25search.search(query_text, args.hits) + doc_ids = [bm25_result.docid for bm25_result in bm25_dev] + qid = [topic for _ in range(len(doc_ids))] + rank = [i for i in range(1, len(doc_ids)+1)] + dev_dic['qid'].extend(qid) + dev_dic['pid'].extend(doc_ids) + dev_dic['rank'].extend(rank) + dev = pd.DataFrame(dev_dic) + dev['rank'].astype(np.int32) + if granularity == 'document': + seperation = "\t" + else: + seperation = " " + dev_qrel = pd.read_csv(qrel, sep=seperation, + names=["qid", "q0", "pid", "rel"], usecols=['qid', 'pid', 'rel'], + dtype={'qid': 'S','pid': 'S', 'rel':'i'}) + dev = dev.merge(dev_qrel, left_on=['qid', 'pid'], right_on=['qid', 'pid'], how='left') + dev['rel'] = dev['rel'].fillna(0).astype(np.int32) + dev = dev.sort_values(['qid', 'pid']).set_index(['qid', 'pid']) + + print(dev.shape) + print(dev.index.get_level_values('qid').drop_duplicates().shape) + print(dev.groupby('qid').count().mean()) + print(dev.head(10)) + print(dev.info()) + + dev_rel_num = dev_qrel[dev_qrel['rel'] > 0].groupby('qid').count()['rel'] + + recall_point = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000] + recall_curve = {k: [] for k in recall_point} + for qid, group in tqdm(dev.groupby('qid')): + group = group.reset_index() + assert len(group['pid'].tolist()) == len(set(group['pid'].tolist())) + total_rel = dev_rel_num.loc[qid] + query_recall = [0 for k in recall_point] + for t in group.sort_values('rank').itertuples(): + if t.rel > 0: + for i, p in enumerate(recall_point): + if t.rank <= p: + query_recall[i] += 1 + for i, p in enumerate(recall_point): + if total_rel > 0: + recall_curve[p].append(query_recall[i] / total_rel) + else: + recall_curve[p].append(0.) 
+ + for k, v in recall_curve.items(): + avg = np.mean(v) + print(f'recall@{k}:{avg}') + + return dev, dev_qrel + + +def query_loader(topic): + queries = {} + nlp = SpacyTextParser('en_core_web_sm', keep_only_alpha_num=True, lower_case=True) + analyzer = Analyzer(get_lucene_analyzer()) + bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + inp_file = open(topic) + ln = 0 + for line in tqdm(inp_file): + ln += 1 + line = line.strip() + if not line: + continue + fields = line.split('\t') + if len(fields) != 2: + print('Misformated line %d ignoring:' % ln) + print(line.replace('\t', '')) + continue + did, query = fields + query_lemmas, query_unlemm = nlp.proc_text(query) + analyzed = analyzer.analyze(query) + for token in analyzed: + if ' ' in token: + print(analyzed) + query_toks = query_lemmas.split() + if len(query_toks) >= 0: + query = {"raw" : query, + "text": query_lemmas.split(' '), + "text_unlemm": query_unlemm.split(' '), + "analyzed": analyzed, + "text_bert_tok": bert_tokenizer.tokenize(query.lower())} + queries[did] = query + + if ln % 10000 == 0: + print('Processed %d queries' % ln) + + print('Processed %d queries' % ln) + return queries + + +def eval_mrr(dev_data): + score_tie_counter = 0 + score_tie_query = set() + MRR = [] + for qid, group in tqdm(dev_data.groupby('qid')): + group = group.reset_index() + rank = 0 + prev_score = None + assert len(group['pid'].tolist()) == len(set(group['pid'].tolist())) + # stable sort is also used in LightGBM + + for t in group.sort_values('score', ascending=False, kind='mergesort').itertuples(): + if prev_score is not None and abs(t.score - prev_score) < 1e-8: + score_tie_counter += 1 + score_tie_query.add(qid) + prev_score = t.score + rank += 1 + if t.rel > 0: + MRR.append(1.0 / rank) + break + elif rank == 10 or rank == len(group): + MRR.append(0.) + break + + score_tie = f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries' + print(score_tie) + mrr_10 = np.mean(MRR).item() + print(f'MRR@10:{mrr_10} with {len(MRR)} queries') + return {'score_tie': score_tie, 'mrr_10': mrr_10} + + +def eval_recall(dev_qrel, dev_data): + dev_rel_num = dev_qrel[dev_qrel['rel'] > 0].groupby('qid').count()['rel'] + + score_tie_counter = 0 + score_tie_query = set() + + recall_point = [10,20,50,100,200,250,300,333,400,500,1000] + recall_curve = {k: [] for k in recall_point} + for qid, group in tqdm(dev_data.groupby('qid')): + group = group.reset_index() + rank = 0 + prev_score = None + assert len(group['pid'].tolist()) == len(set(group['pid'].tolist())) + # stable sort is also used in LightGBM + total_rel = dev_rel_num.loc[qid] + query_recall = [0 for k in recall_point] + for t in group.sort_values('score', ascending=False, kind='mergesort').itertuples(): + if prev_score is not None and abs(t.score - prev_score) < 1e-8: + score_tie_counter += 1 + score_tie_query.add(qid) + prev_score = t.score + rank += 1 + if t.rel > 0: + for i, p in enumerate(recall_point): + if rank <= p: + query_recall[i] += 1 + for i, p in enumerate(recall_point): + if total_rel > 0: + recall_curve[p].append(query_recall[i] / total_rel) + else: + recall_curve[p].append(0.) 
+ + score_tie = f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries' + print(score_tie) + res = {'score_tie': score_tie} + + for k, v in recall_curve.items(): + avg = np.mean(v) + print(f'recall@{k}:{avg}') + res[f'recall@{k}'] = avg + + return res + + +def output(file, dev_data, format, maxp): + score_tie_counter = 0 + score_tie_query = set() + output_file = open(file,'w') + results = defaultdict(dict) + idx = 0 + for qid, group in tqdm(dev_data.groupby('qid')): + group = group.reset_index() + rank = 0 + prev_score = None + assert len(group['pid'].tolist()) == len(set(group['pid'].tolist())) + # stable sort is also used in LightGBM + for t in group.sort_values('score', ascending=False, kind='mergesort').itertuples(): + if prev_score is not None and abs(t.score - prev_score) < 1e-8: + score_tie_counter += 1 + score_tie_query.add(qid) + prev_score = t.score + if maxp: + docid = t.pid.split('#')[0] + if qid not in results or docid not in results[qid] or t.score > results[qid][docid]: + results[qid][docid] = t.score + else: + results[qid][t.pid] = t.score + + + for qid in tqdm(results.keys()): + rank = 1 + docid_score = results[qid] + docid_score = sorted(docid_score.items(),key=lambda kv: kv[1], reverse=True) + for docid, score in docid_score: + if format=='trec': + output_file.write(f"{qid}\tQ0\t{docid}\t{rank}\t{score}\tltr\n") + else: + output_file.write(f"{qid}\t{docid}\t{rank}\n") + rank += 1 + score_tie = f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries' + print(score_tie) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Learning to rank reranking') + parser.add_argument('--input', default='') + parser.add_argument('--hits', type=int, default=1000) + parser.add_argument('--input-format', default = 'trec') + parser.add_argument('--model', required=True) + parser.add_argument('--index', required=True) + parser.add_argument('--output', required=True) + parser.add_argument('--ibm-model', required=True) + parser.add_argument('--topic', required=True) + parser.add_argument('--output-format', default='tsv') + parser.add_argument('--max-passage', action='store_true') + parser.add_argument('--rerank', action='store_true') + parser.add_argument('--qrel', required=True) + parser.add_argument('--granularity', default='passage') + + args = parser.parse_args() + queries = query_loader(args.topic) + print("---------------------loading dev----------------------------------------") + prebuilt = args.index == 'msmarco-passage-ltr' or args.index == 'msmarco-doc-per-passage-ltr' + dev, dev_qrel = dev_data_loader(args.input, args.input_format, args.topic, args.rerank, prebuilt, args.qrel, args.granularity, args.hits) + searcher = MsmarcoLtrSearcher(args.model, args.ibm_model, args.index, args.granularity, prebuilt, args.topic) + searcher.add_fe() + batch_info = searcher.search(dev, queries) + del dev, queries + + eval_res = eval_mrr(batch_info) + eval_recall(dev_qrel, batch_info) + output(args.output, batch_info,args.output_format, args.max_passage) + print('Done!') \ No newline at end of file diff --git a/pyserini/search/lucene/ltr/_base.py b/pyserini/search/lucene/ltr/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..879b897ec7d7afeba989fc37744d578033e17c82 --- /dev/null +++ b/pyserini/search/lucene/ltr/_base.py @@ -0,0 +1,369 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not 
use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pyserini.pyclass import autoclass +import json +import numpy as np +import pandas as pd +import spacy +import re + +class Feature: + def name(self): + return self.extractor.getName() + +class NormalizedTfIdf(Feature): + def __init__(self, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.NormalizedTfIdf') + self.extractor = Jclass(field, qfield) + +class ProbalitySum(Feature): + def __init__(self, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.ProbalitySum') + self.extractor = Jclass(field, qfield) + +class IbmModel1(Feature): + def __init__(self, path, field, tag, qfield): + Jclass = autoclass('io.anserini.ltr.feature.IbmModel1') + self.extractor = Jclass(path, field, tag, qfield) + +class Proximity(Feature): + def __init__(self, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.Proximity') + self.extractor = Jclass(field, qfield) + +class TpScore(Feature): + def __init__(self, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.TpScore') + self.extractor = Jclass(field, qfield) + +class TpDist(Feature): + def __init__(self, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.TpDist') + self.extractor = Jclass(field, qfield) + +class DocSize(Feature): + def __init__(self, field='contents'): + Jclass = autoclass('io.anserini.ltr.feature.DocSize') + self.extractor = Jclass(field) + +class MatchingTermCount(Feature): + def __init__(self, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.MatchingTermCount') + self.extractor = Jclass(field, qfield) + +class QueryLength(Feature): + def __init__(self, qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.QueryLength') + self.extractor = Jclass(qfield) + +class SCS(Feature): + def __init__(self, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.SCS') + self.extractor = Jclass(field, qfield) + +class SumMatchingTF(Feature): + def __init__(self, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.SumMatchingTF') + self.extractor = Jclass(field, qfield) + +class QueryCoverageRatio(Feature): + def __init__(self, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.QueryCoverageRatio') + self.extractor = Jclass(field, qfield) + +class RunList(Feature): + def __init__(self,filename,tag): + Jclass = autoclass('io.anserini.ltr.feature.RunList') + self.extractor = Jclass(filename,tag) + +class UniqueTermCount(Feature): + def __init__(self, qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.UniqueTermCount') + self.extractor = Jclass(qfield) + +class UnorderedSequentialPairs(Feature): + def __init__(self, gap=8, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.UnorderedSequentialPairs') + self.extractor = Jclass(gap, field, qfield) + +class OrderedSequentialPairs(Feature): + def __init__(self, gap=8, field='contents', 
qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.OrderedSequentialPairs') + self.extractor = Jclass(gap, field, qfield) + +class UnorderedQueryPairs(Feature): + def __init__(self, gap=8, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.UnorderedQueryPairs') + self.extractor = Jclass(gap, field, qfield) + +class OrderedQueryPairs(Feature): + def __init__(self, gap=8, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.OrderedQueryPairs') + self.extractor = Jclass(gap, field, qfield) + +class AvgPooler(Feature): + def __init__(self): + Jclass = autoclass('io.anserini.ltr.AvgPooler') + self.extractor = Jclass() + +class SumPooler(Feature): + def __init__(self): + Jclass = autoclass('io.anserini.ltr.SumPooler') + self.extractor = Jclass() + +class MedianPooler(Feature): + def __init__(self): + Jclass = autoclass('io.anserini.ltr.MedianPooler') + self.extractor = Jclass() + +class MinPooler(Feature): + def __init__(self): + Jclass = autoclass('io.anserini.ltr.MinPooler') + self.extractor = Jclass() + +class MaxPooler(Feature): + def __init__(self): + Jclass = autoclass('io.anserini.ltr.MaxPooler') + self.extractor = Jclass() + +class VarPooler(Feature): + def __init__(self): + Jclass = autoclass('io.anserini.ltr.VarPooler') + self.extractor = Jclass() + +class ConfidencePooler(Feature): + def __init__(self): + Jclass = autoclass('io.anserini.ltr.ConfidencePooler') + self.extractor = Jclass() + +class MaxMinRatioPooler(Feature): + def __init__(self): + Jclass = autoclass('io.anserini.ltr.MaxMinRatioPooler') + self.extractor = Jclass() + +class TfStat(Feature): + def __init__(self, pooler, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.TfStat') + self.extractor = Jclass(pooler.extractor, field, qfield) + +class TfIdfStat(Feature): + def __init__(self, sublinear, pooler, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.TfIdfStat') + JBoolean = autoclass('java.lang.Boolean') + self.extractor = Jclass(JBoolean(sublinear), pooler.extractor, field, qfield) + +class NormalizedTfStat(Feature): + def __init__(self, pooler, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.NormalizedTfStat') + self.extractor = Jclass(pooler.extractor, field, qfield) + +class IdfStat(Feature): + def __init__(self, pooler, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.IdfStat') + self.extractor = Jclass(pooler.extractor, field, qfield) + +class IcTfStat(Feature): + def __init__(self, pooler, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.IcTfStat') + self.extractor = Jclass(pooler.extractor, field, qfield) + +class BM25Stat(Feature): + def __init__(self, pooler, k1=0.9, b=0.4, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.BM25Stat') + self.extractor = Jclass(pooler.extractor, k1, b, field, qfield) + +class DfrInExpB2Stat(Feature): + def __init__(self, pooler, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.DfrInExpB2Stat') + self.extractor = Jclass(pooler.extractor, field, qfield) + +class DphStat(Feature): + def __init__(self, pooler, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.DphStat') + self.extractor = Jclass(pooler.extractor, field, qfield) + +class LmDirStat(Feature): + def __init__(self, pooler, mu=1000, field='contents', qfield='analyzed'): + 
Jclass = autoclass('io.anserini.ltr.feature.LmDirStat') + self.extractor = Jclass(pooler.extractor, mu, field, qfield) + +class DfrGl2Stat(Feature): + def __init__(self, pooler, field='contents', qfield='analyzed'): + Jclass = autoclass('io.anserini.ltr.feature.DfrGl2Stat') + self.extractor = Jclass(pooler.extractor, field, qfield) + + +class FeatureExtractor: + def __init__(self, index_dir, worker_num=1): + JFeatureExtractorUtils = autoclass('io.anserini.ltr.FeatureExtractorUtils') + self.utils = JFeatureExtractorUtils(index_dir, worker_num) + self.feature_name = [] + + def add(self, pyclass): + """ + add feature extractor; cannot add feature extractors in the middle of extraction + Parameters + ---------- + pyclass: Feature + an initialized feature extractor + """ + self.utils.add(pyclass.extractor) + self.feature_name.append(pyclass.name()) + + def feature_names(self): + """ + get all feature names + Returns + ------- + List[str] all the feature names in order + """ + return self.feature_name + + def lazy_extract(self, qid, doc_ids, query_dict): + input = {'qid': qid, 'docIds': doc_ids} + input.update(query_dict) + self.utils.lazyExtract(json.dumps(input)) + + def batch_extract(self, tasks): + need_rows = 0 + for task in tasks: + self.lazy_extract(task['qid'], task['docIds'], task['query_dict']) + need_rows += len(task['docIds']) + feature_name = self.feature_names() + feature = np.zeros(shape=(need_rows, len(feature_name)), dtype=np.float32) + idx = 0 + for task in tasks: + flattened = self.get_result(task['qid']) + feature[idx:idx+len(task['docIds']),:] = flattened.reshape(len(task['docIds']), len(feature_name)) + idx += len(task['docIds']) + return pd.DataFrame(feature, columns=feature_name) + + + def get_result(self, qid): + res = self.utils.getResult(qid).tostring() + dt = np.dtype(np.float32) + dt = dt.newbyteorder('>') + return np.frombuffer(res, dt) + +class SpacyTextParser: + def __init__(self, model_name, + remove_punct=True, + sent_split=False, + keep_only_alpha_num=False, + lower_case=True, + enable_POS=True): + + disable_list = ['ner', 'parser'] + if not enable_POS: + disable_list.append('tagger') + print('Disabled Spacy components: ', disable_list) + + self._nlp = spacy.load(model_name, disable=disable_list) + if sent_split: + sentencizer = self._nlp.create_pipe("sentencizer") + self._nlp.add_pipe(sentencizer) + + self._remove_punct = remove_punct + sw = ['a', 'about', 'above', 'according', 'across', 'after', + 'afterwards', 'again', 'against', 'albeit', 'all', 'almost', + 'alone', 'along', 'already', 'also', 'although', 'always', 'am', + 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', + 'anyone', 'anything', 'anyway', 'anywhere', 'apart', 'are', 'around', + 'as', 'at', 'av', 'be', 'became', 'because', 'become', 'becomes', + 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', + 'beside', 'besides', 'between', 'beyond', 'both', 'but', 'by', 'can', + 'cannot', 'canst', 'certain', 'cf', 'choose', 'contrariwise', 'cos', + 'could', 'cu', 'day', 'do', 'does', "doesn't", 'doing', 'dost', 'doth', + 'double', 'down', 'dual', 'during', 'each', 'either', 'else', 'elsewhere', + 'enough', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', + 'everything', 'everywhere', 'except', 'excepted', 'excepting', 'exception', + 'exclude', 'excluding', 'exclusive', 'far', 'farther', 'farthest', 'few', + 'ff', 'first', 'for', 'formerly', 'forth', 'forward', 'from', 'front', + 'further', 'furthermore', 'furthest', 'get', 'go', 'had', 'halves', 
'hardly', + 'has', 'hast', 'hath', 'have', 'he', 'hence', 'henceforth', 'her', 'here', + 'hereabouts', 'hereafter', 'hereby', 'herein', 'hereto', 'hereupon', 'hers', + 'herself', 'him', 'himself', 'hindmost', 'his', 'hither', 'hitherto', 'how', + 'however', 'howsoever', 'i', 'ie', 'if', 'in', 'inasmuch', 'inc', 'include', + 'included', 'including', 'indeed', 'indoors', 'inside', 'insomuch', 'instead', + 'into', 'inward', 'inwards', 'is', 'it', 'its', 'itself', 'just', 'kind', 'kg', + 'km', 'last', 'latter', 'latterly', 'less', 'lest', 'let', 'like', 'little', 'ltd', + 'many', 'may', 'maybe', 'me', 'meantime', 'meanwhile', 'might', 'moreover', 'most', + 'mostly', 'more', 'mr', 'mrs', 'ms', 'much', 'must', 'my', 'myself', 'namely', 'need', + 'neither', 'never', 'nevertheless', 'next', 'no', 'nobody', 'none', 'nonetheless', + 'noone', 'nope', 'nor', 'not', 'nothing', 'notwithstanding', 'now', 'nowadays', + 'nowhere', 'of', 'off', 'often', 'ok', 'on', 'once', 'one', 'only', 'onto', 'or', + 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', + 'over', 'own', 'per', 'perhaps', 'plenty', 'provide', 'quite', 'rather', 'really', + 'round', 'said', 'sake', 'same', 'sang', 'save', 'saw', 'see', 'seeing', 'seem', 'seemed', + 'seeming', 'seems', 'seen', 'seldom', 'selves', 'sent', 'several', 'shalt', 'she', 'should', + 'shown', 'sideways', 'since', 'slept', 'slew', 'slung', 'slunk', 'smote', 'so', 'some', + 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', + 'spake', 'spat', 'spoke', 'spoken', 'sprang', 'sprung', 'stave', 'staves', 'still', 'such', + 'supposing', 'than', 'that', 'the', 'thee', 'their', 'them', 'themselves', 'then', 'thence', + 'thenceforth', 'there', 'thereabout', 'thereabouts', 'thereafter', 'thereby', 'therefore', + 'therein', 'thereof', 'thereon', 'thereto', 'thereupon', 'these', 'they', 'this', 'those', + 'thou', 'though', 'thrice', 'through', 'throughout', 'thru', 'thus', 'thy', 'thyself', 'till', + 'to', 'together', 'too', 'toward', 'towards', 'ugh', 'unable', 'under', 'underneath', 'unless', + 'unlike', 'until', 'up', 'upon', 'upward', 'upwards', 'us', 'use', 'used', 'using', 'very', 'via', + 'vs', 'want', 'was', 'we', 'week', 'well', 'were', 'what', 'whatever', 'whatsoever', 'when', + 'whence', 'whenever', 'whensoever', 'where', 'whereabouts', 'whereafter', 'whereas', 'whereat', + 'whereby', 'wherefore', 'wherefrom', 'wherein', 'whereinto', 'whereof', 'whereon', 'wheresoever', + 'whereto', 'whereunto', 'whereupon', 'wherever', 'wherewith', 'whether', 'whew', 'which', + 'whichever', 'whichsoever', 'while', 'whilst', 'whither', 'who', 'whoa', 'whoever', 'whole', + 'whom', 'whomever', 'whomsoever', 'whose', 'whosoever', 'why', 'will', 'wilt', 'with', 'within', + 'without', 'worse', 'worst', 'would', 'wow', 'ye', 'yet', 'year', 'yippee', 'you', 'your', 'yours', + 'yourself', 'yourselves', "n't", "'d", "'ll", "'m", "'re", "'s", "'ves"] + stopwords = set(sw) + self._stopwords = frozenset([w.lower() for w in stopwords]) + self._keep_only_alpha_num = keep_only_alpha_num + self._lower_case = lower_case + + @staticmethod + def _basic_clean(text): + return text.replace("’", "'") + + def __call__(self, text): + return self._nlp(SpacyTextParser._basic_clean(text)) + + def is_alpha_num(self, s): + return s and (re.match("^[a-zA-Z-_.0-9]+$", s) is not None) + + def proc_text(self, text): + lemmas = [] + tokens = [] + doc = self(text) + for tokObj in doc: + if self._remove_punct and tokObj.is_punct: + continue + 
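+            # For each remaining token, keep both its lemma and its surface form; optionally skip
+            # non-alphanumeric tokens, and drop tokens whose lowercased text or lemma is a stopword.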
lemma = tokObj.lemma_ + text = tokObj.text + if self._keep_only_alpha_num and not self.is_alpha_num(text): + continue + tok1 = text.lower() + tok2 = lemma.lower() + if tok1 in self._stopwords or tok2 in self._stopwords: + continue + + if self._lower_case: + text = text.lower() + lemma = lemma.lower() + + lemmas.append(lemma) + tokens.append(text) + + return ' '.join(lemmas), ' '.join(tokens) + \ No newline at end of file diff --git a/pyserini/search/lucene/ltr/_search_msmarco.py b/pyserini/search/lucene/ltr/_search_msmarco.py new file mode 100644 index 0000000000000000000000000000000000000000..bc6ced6ae12343cb89b318bbd5d99d061d536005 --- /dev/null +++ b/pyserini/search/lucene/ltr/_search_msmarco.py @@ -0,0 +1,255 @@ +# +# Pyserini: Python interface to the Anserini IR toolkit built on Lucene +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This module provides Pyserini's Python ltr search interface on MS MARCO passage. The main entry point is the ``MsmarcoPassageLtrSearcher`` +class. +""" + +import logging +import multiprocessing +import time +import os +from tqdm import tqdm +import pickle +from pyserini.index.lucene import IndexReader +from pyserini.search.lucene import LuceneSearcher +from pyserini.util import get_cache_home + +from pyserini.search.lucene.ltr._base import * + + +logger = logging.getLogger(__name__) + +class MsmarcoLtrSearcher: + def __init__(self, model: str, ibm_model:str, index:str, data: str, prebuilt: bool, topic: str): + #msmarco-ltr-passage + self.model = model + self.ibm_model = ibm_model + if prebuilt: + self.lucene_searcher = LuceneSearcher.from_prebuilt_index(index) + index_directory = os.path.join(get_cache_home(), 'indexes') + if data == 'passage': + index_path = os.path.join(index_directory, 'index-msmarco-passage-ltr-20210519-e25e33f.a5de642c268ac1ed5892c069bdc29ae3') + else: + index_path = os.path.join(index_directory, 'index-msmarco-doc-per-passage-ltr-20211031-33e4151.bd60e89041b4ebbabc4bf0cfac608a87') + self.index_reader = IndexReader.from_prebuilt_index(index) + else: + index_path = index + self.index_reader = IndexReader(index) + self.fe = FeatureExtractor(index_path, max(multiprocessing.cpu_count()//2, 1)) + self.data = data + + + def add_fe(self): + #self.fe.add(RunList('collections/msmarco-ltr-passage/run.monot5.run_list.whole.trec','t5')) + #self.fe.add(RunList('../bert.whole.doc.trec','bert')) + for qfield, ifield in [('analyzed', 'contents'), + ('text_unlemm', 'text_unlemm'), + ('text_bert_tok', 'text_bert_tok')]: + print(qfield, ifield) + self.fe.add(BM25Stat(SumPooler(), k1=2.0, b=0.75, field=ifield, qfield=qfield)) + self.fe.add(BM25Stat(AvgPooler(), k1=2.0, b=0.75, field=ifield, qfield=qfield)) + self.fe.add(BM25Stat(MedianPooler(), k1=2.0, b=0.75, field=ifield, qfield=qfield)) + self.fe.add(BM25Stat(MaxPooler(), k1=2.0, b=0.75, field=ifield, qfield=qfield)) + self.fe.add(BM25Stat(MinPooler(), k1=2.0, b=0.75, field=ifield, qfield=qfield)) + self.fe.add(BM25Stat(MaxMinRatioPooler(), k1=2.0, b=0.75, field=ifield, qfield=qfield)) + + 
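+            # The remaining feature groups in this loop register, for the same (query field,
+            # index field) pair: pooled LMDir and DFR/DPH statistics, proximity features,
+            # document and query length statistics, tf/idf/ictf variants, and ordered/unordered
+            # term-pair window features.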
self.fe.add(LmDirStat(SumPooler(), mu=1000, field=ifield, qfield=qfield)) + self.fe.add(LmDirStat(AvgPooler(), mu=1000, field=ifield, qfield=qfield)) + self.fe.add(LmDirStat(MedianPooler(), mu=1000, field=ifield, qfield=qfield)) + self.fe.add(LmDirStat(MaxPooler(), mu=1000, field=ifield, qfield=qfield)) + self.fe.add(LmDirStat(MinPooler(), mu=1000, field=ifield, qfield=qfield)) + self.fe.add(LmDirStat(MaxMinRatioPooler(), mu=1000, field=ifield, qfield=qfield)) + + self.fe.add(NormalizedTfIdf(field=ifield, qfield=qfield)) + self.fe.add(ProbalitySum(field=ifield, qfield=qfield)) + + self.fe.add(DfrGl2Stat(SumPooler(), field=ifield, qfield=qfield)) + self.fe.add(DfrGl2Stat(AvgPooler(), field=ifield, qfield=qfield)) + self.fe.add(DfrGl2Stat(MedianPooler(), field=ifield, qfield=qfield)) + self.fe.add(DfrGl2Stat(MaxPooler(), field=ifield, qfield=qfield)) + self.fe.add(DfrGl2Stat(MinPooler(), field=ifield, qfield=qfield)) + self.fe.add(DfrGl2Stat(MaxMinRatioPooler(), field=ifield, qfield=qfield)) + + self.fe.add(DfrInExpB2Stat(SumPooler(), field=ifield, qfield=qfield)) + self.fe.add(DfrInExpB2Stat(AvgPooler(), field=ifield, qfield=qfield)) + self.fe.add(DfrInExpB2Stat(MedianPooler(), field=ifield, qfield=qfield)) + self.fe.add(DfrInExpB2Stat(MaxPooler(), field=ifield, qfield=qfield)) + self.fe.add(DfrInExpB2Stat(MinPooler(), field=ifield, qfield=qfield)) + self.fe.add(DfrInExpB2Stat(MaxMinRatioPooler(), field=ifield, qfield=qfield)) + + self.fe.add(DphStat(SumPooler(), field=ifield, qfield=qfield)) + self.fe.add(DphStat(AvgPooler(), field=ifield, qfield=qfield)) + self.fe.add(DphStat(MedianPooler(), field=ifield, qfield=qfield)) + self.fe.add(DphStat(MaxPooler(), field=ifield, qfield=qfield)) + self.fe.add(DphStat(MinPooler(), field=ifield, qfield=qfield)) + self.fe.add(DphStat(MaxMinRatioPooler(), field=ifield, qfield=qfield)) + + self.fe.add(Proximity(field=ifield, qfield=qfield)) + self.fe.add(TpScore(field=ifield, qfield=qfield)) + self.fe.add(TpDist(field=ifield, qfield=qfield)) + + self.fe.add(DocSize(field=ifield)) + + self.fe.add(QueryLength(qfield=qfield)) + self.fe.add(QueryCoverageRatio(qfield=qfield)) + self.fe.add(UniqueTermCount(qfield=qfield)) + self.fe.add(MatchingTermCount(field=ifield, qfield=qfield)) + self.fe.add(SCS(field=ifield, qfield=qfield)) + + self.fe.add(TfStat(AvgPooler(), field=ifield, qfield=qfield)) + self.fe.add(TfStat(MedianPooler(), field=ifield, qfield=qfield)) + self.fe.add(TfStat(SumPooler(), field=ifield, qfield=qfield)) + self.fe.add(TfStat(MinPooler(), field=ifield, qfield=qfield)) + self.fe.add(TfStat(MaxPooler(), field=ifield, qfield=qfield)) + self.fe.add(TfStat(MaxMinRatioPooler(), field=ifield, qfield=qfield)) + + self.fe.add(TfIdfStat(True, AvgPooler(), field=ifield, qfield=qfield)) + self.fe.add(TfIdfStat(True, MedianPooler(), field=ifield, qfield=qfield)) + self.fe.add(TfIdfStat(True, SumPooler(), field=ifield, qfield=qfield)) + self.fe.add(TfIdfStat(True, MinPooler(), field=ifield, qfield=qfield)) + self.fe.add(TfIdfStat(True, MaxPooler(), field=ifield, qfield=qfield)) + self.fe.add(TfIdfStat(True, MaxMinRatioPooler(), field=ifield, qfield=qfield)) + + self.fe.add(NormalizedTfStat(AvgPooler(), field=ifield, qfield=qfield)) + self.fe.add(NormalizedTfStat(MedianPooler(), field=ifield, qfield=qfield)) + self.fe.add(NormalizedTfStat(SumPooler(), field=ifield, qfield=qfield)) + self.fe.add(NormalizedTfStat(MinPooler(), field=ifield, qfield=qfield)) + self.fe.add(NormalizedTfStat(MaxPooler(), field=ifield, qfield=qfield)) + 
self.fe.add(NormalizedTfStat(MaxMinRatioPooler(), field=ifield, qfield=qfield)) + + self.fe.add(IdfStat(AvgPooler(), field=ifield, qfield=qfield)) + self.fe.add(IdfStat(MedianPooler(), field=ifield, qfield=qfield)) + self.fe.add(IdfStat(SumPooler(), field=ifield, qfield=qfield)) + self.fe.add(IdfStat(MinPooler(), field=ifield, qfield=qfield)) + self.fe.add(IdfStat(MaxPooler(), field=ifield, qfield=qfield)) + self.fe.add(IdfStat(MaxMinRatioPooler(), field=ifield, qfield=qfield)) + + self.fe.add(IcTfStat(AvgPooler(), field=ifield, qfield=qfield)) + self.fe.add(IcTfStat(MedianPooler(), field=ifield, qfield=qfield)) + self.fe.add(IcTfStat(SumPooler(), field=ifield, qfield=qfield)) + self.fe.add(IcTfStat(MinPooler(), field=ifield, qfield=qfield)) + self.fe.add(IcTfStat(MaxPooler(), field=ifield, qfield=qfield)) + self.fe.add(IcTfStat(MaxMinRatioPooler(), field=ifield, qfield=qfield)) + + self.fe.add(UnorderedSequentialPairs(3, field=ifield, qfield=qfield)) + self.fe.add(UnorderedSequentialPairs(8, field=ifield, qfield=qfield)) + self.fe.add(UnorderedSequentialPairs(15, field=ifield, qfield=qfield)) + self.fe.add(OrderedSequentialPairs(3, field=ifield, qfield=qfield)) + self.fe.add(OrderedSequentialPairs(8, field=ifield, qfield=qfield)) + self.fe.add(OrderedSequentialPairs(15, field=ifield, qfield=qfield)) + self.fe.add(UnorderedQueryPairs(3, field=ifield, qfield=qfield)) + self.fe.add(UnorderedQueryPairs(8, field=ifield, qfield=qfield)) + self.fe.add(UnorderedQueryPairs(15, field=ifield, qfield=qfield)) + self.fe.add(OrderedQueryPairs(3, field=ifield, qfield=qfield)) + self.fe.add(OrderedQueryPairs(8, field=ifield, qfield=qfield)) + self.fe.add(OrderedQueryPairs(15, field=ifield, qfield=qfield)) + + start = time.time() + self.fe.add(IbmModel1(f"{self.ibm_model}/title_unlemm", "text_unlemm", "title_unlemm", "text_unlemm")) + end = time.time() + print('IBM model Load takes %.2f seconds' % (end - start)) + start = end + self.fe.add(IbmModel1(f"{self.ibm_model}url_unlemm", "text_unlemm", "url_unlemm", "text_unlemm")) + end = time.time() + print('IBM model Load takes %.2f seconds' % (end - start)) + start = end + self.fe.add(IbmModel1(f"{self.ibm_model}body", "text_unlemm", "body", "text_unlemm")) + end = time.time() + print('IBM model Load takes %.2f seconds' % (end - start)) + start = end + self.fe.add(IbmModel1(f"{self.ibm_model}text_bert_tok", "text_bert_tok", "text_bert_tok", "text_bert_tok")) + end = time.time() + print('IBM model Load takes %.2f seconds' % (end - start)) + start = end + + def batch_extract(self, df, queries, fe): + tasks = [] + task_infos = [] + group_lst = [] + + for qid, group in tqdm(df.groupby('qid')): + task = { + "qid": qid, + "docIds": [], + "rels": [], + "query_dict": queries[qid] + } + for t in group.reset_index().itertuples(): + if self.data == 'document': + if self.index_reader.doc(t.pid) != None: + task["docIds"].append(t.pid) + task_infos.append((qid, t.pid, t.rel)) + else: + task["docIds"].append(t.pid) + task_infos.append((qid, t.pid, t.rel)) + tasks.append(task) + group_lst.append((qid, len(task['docIds']))) + if len(tasks) == 1000: + features = fe.batch_extract(tasks) + task_infos = pd.DataFrame(task_infos, columns=['qid', 'pid', 'rel']) + group = pd.DataFrame(group_lst, columns=['qid', 'count']) + print(features.shape) + print(task_infos.qid.drop_duplicates().shape) + print(group.mean()) + print(features.head(10)) + print(features.info()) + yield task_infos, features, group + tasks = [] + task_infos = [] + group_lst = [] + # deal with rest + if len(tasks) > 
0: + features = fe.batch_extract(tasks) + task_infos = pd.DataFrame(task_infos, columns=['qid', 'pid', 'rel']) + group = pd.DataFrame(group_lst, columns=['qid', 'count']) + print(features.shape) + print(task_infos.qid.drop_duplicates().shape) + print(group.mean()) + print(features.head(10)) + print(features.info()) + yield task_infos, features, group + + return + + def batch_predict(self, models, dev_extracted, feature_name): + task_infos, features, group = dev_extracted + dev_X = features.loc[:, feature_name] + + task_infos['score'] = 0. + for gbm in models: + task_infos['score'] += gbm.predict(dev_X) + + def search(self, dev, queries): + batch_info = [] + start_extract = time.time() + models = pickle.load(open(self.model+'/model.pkl', 'rb')) + metadata = json.load(open(self.model+'/metadata.json', 'r')) + feature_used = metadata['feature_names'] + for dev_extracted in self.batch_extract(dev, queries, self.fe): + end_extract = time.time() + print(f'extract 1000 queries take {end_extract - start_extract}s') + task_infos, features, group = dev_extracted + start_predict = time.time() + self.batch_predict(models, dev_extracted, feature_used) + end_predict = time.time() + print(f'predict 1000 queries take {end_predict - start_predict}s') + batch_info.append(task_infos) + start_extract = time.time() + batch_info = pd.concat(batch_info, axis=0, ignore_index=True) + return batch_info + diff --git a/pyserini/search/lucene/querybuilder.py b/pyserini/search/lucene/querybuilder.py new file mode 100644 index 0000000000000000000000000000000000000000..7627121c2bd79f616a2a5854f1670454acfc6e4e --- /dev/null +++ b/pyserini/search/lucene/querybuilder.py @@ -0,0 +1,90 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This module provides Pyserini's Python interface query building for Anserini. +""" +import logging +from enum import Enum + +from pyserini.analysis import get_lucene_analyzer, Analyzer +from pyserini.pyclass import autoclass + +logger = logging.getLogger(__name__) + + +# Wrapper around Lucene clases +JTerm = autoclass('org.apache.lucene.index.Term') +JBooleanClause = autoclass('org.apache.lucene.search.BooleanClause') +JBoostQuery = autoclass('org.apache.lucene.search.BoostQuery') +JTermQuery = autoclass('org.apache.lucene.search.TermQuery') + +# Wrappers around Anserini classes +JQueryGeneratorUtils = autoclass('io.anserini.search.query.QueryGeneratorUtils') + + +class JBooleanClauseOccur(Enum): + should = JQueryGeneratorUtils.getBooleanClauseShould() + must = JQueryGeneratorUtils.getBooleanClauseMust() + must_not = JQueryGeneratorUtils.getBooleanClauseMustNot() + filter = JQueryGeneratorUtils.getBooleanClauseFilter() + + +def get_boolean_query_builder(): + """Get a BooleanQueryBuilder object. + + Returns + ------- + JBooleanQueryBuilder + """ + return JQueryGeneratorUtils.getBooleanQueryBuilder() + + +def get_term_query(term, field="contents", analyzer=get_lucene_analyzer()): + """Searches the collection. 
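+
+    More precisely, this builds a Lucene ``TermQuery`` over the first token produced by
+    analyzing ``term`` with the given analyzer for the specified ``field``; it does not
+    run a search by itself.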
+ + Parameters + ---------- + term : str + The query term string. + field : str + Field to search. + analyzer : Analyzer + Analyzer to use for tokenizing the query term. + + Returns + ------- + JTermQuery + """ + analyzer = Analyzer(analyzer) + return JTermQuery(JTerm(field, analyzer.analyze(term)[0])) + + +def get_boost_query(query, boost): + """Get boost query. + + Parameters + ---------- + query : str + The query object to boost. + boost : float + Score multiplier. + + Returns + ------- + JBoostQuery + """ + return JBoostQuery(query, boost) diff --git a/pyserini/search/lucene/reranker.py b/pyserini/search/lucene/reranker.py new file mode 100644 index 0000000000000000000000000000000000000000..e5fa177593b01189059621b5cccba5861c430425 --- /dev/null +++ b/pyserini/search/lucene/reranker.py @@ -0,0 +1,123 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import enum +import importlib +import os +import uuid +from sklearn.linear_model import LogisticRegression +from sklearn.svm import SVC +from typing import List + + +class ClassifierType(enum.Enum): + LR = 'lr' + SVM = 'svm' + + +class FusionMethod(enum.Enum): + AVG = 'avg' + + +class PseudoRelevanceClassifierReranker: + def __init__(self, lucene_index: str, vectorizer_class: str, clf_type: List[ClassifierType], r=10, n=100, alpha=0.5): + self.r = r + self.n = n + self.alpha = alpha + self.clf_type = clf_type + + # get vectorizer + module = importlib.import_module("pyserini.vectorizer") + VectorizerClass = getattr(module, vectorizer_class) + self.vectorizer = VectorizerClass(lucene_index, min_df=5) + + if len(clf_type) > 2: + raise Exception('Re-ranker takes at most two classifiers') + + def _set_classifier(self, clf_type: ClassifierType): + if clf_type == ClassifierType.LR: + self.clf = LogisticRegression(random_state=42) + elif clf_type == ClassifierType.SVM: + self.clf = SVC(kernel='linear', probability=True, random_state=42) + else: + raise Exception("Invalid classifier type") + + def _get_prf_vectors(self, doc_ids: List[str]): + train_docs = doc_ids[:self.r] + doc_ids[-self.n:] + train_labels = [1] * self.r + [0] * self.n + + train_vecs = self.vectorizer.get_vectors(train_docs) + test_vecs = self.vectorizer.get_vectors(doc_ids) + + return train_vecs, train_labels, test_vecs + + def _rerank_with_classifier(self, doc_ids: List[str], search_scores: List[float]): + train_vecs, train_labels, test_vecs = self._get_prf_vectors(doc_ids) + + # classification + self.clf.fit(train_vecs, train_labels) + pred = self.clf.predict_proba(test_vecs) + classifier_scores = self._normalize([p[1] for p in pred]) + search_scores = self._normalize(search_scores) + + # interpolation + interpolated_scores = [a * self.alpha + b * (1-self.alpha) for a, b in zip(classifier_scores, search_scores)] + + return self._sort_dual_list(interpolated_scores, doc_ids) + + def rerank(self, doc_ids: List[str], search_scores: List[float]): + # one classifier + if len(self.clf_type) == 1: + 
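+        # With one classifier the candidates are reranked directly; with two, each classifier
+        # reranks independently and the per-docid scores are averaged (FusionMethod.AVG).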
self._set_classifier(self.clf_type[0]) + return self._rerank_with_classifier(doc_ids, search_scores) + + # two classifier with FusionMethod.AVG + doc_score_dict = {} + for i in range(2): + self._set_classifier(self.clf_type[i]) + i_scores, i_doc_ids = self._rerank_with_classifier(doc_ids, search_scores) + + for score, doc_id in zip(i_scores, i_doc_ids): + if doc_id not in doc_score_dict: + doc_score_dict[doc_id] = set() + doc_score_dict[doc_id].add(score) + + r_scores, r_doc_ids = [], [] + for doc_id, score in doc_score_dict.items(): + avg = sum(score) / len(score) + r_doc_ids.append(doc_id) + r_scores.append(avg) + + return r_scores, r_doc_ids + + def _normalize(self, scores: List[float]): + low = min(scores) + high = max(scores) + width = high - low + + return [(s-low)/width for s in scores] + + # sort both list in decreasing order by using the list1 to compare + def _sort_dual_list(self, list1, list2): + zipped_lists = zip(list1, list2) + sorted_pairs = sorted(zipped_lists) + + tuples = zip(*sorted_pairs) + list1, list2 = [list(tuple) for tuple in tuples] + + list1.reverse() + list2.reverse() + return list1, list2 diff --git a/pyserini/search/nmslib/__init__.py b/pyserini/search/nmslib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e3188206e5cf2f8ce6d91dddf748fcd8bac193ff --- /dev/null +++ b/pyserini/search/nmslib/__init__.py @@ -0,0 +1,19 @@ +# +# Pyserini: Python interface to the Anserini IR toolkit built on Lucene +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from ._searcher import SearchResult, NmslibSearcher + +__all__ = ['SearchResult', 'NmslibSearcher'] diff --git a/pyserini/search/nmslib/__main__.py b/pyserini/search/nmslib/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..581fafa916a1c3e586308b7491688802b382d0b2 --- /dev/null +++ b/pyserini/search/nmslib/__main__.py @@ -0,0 +1,94 @@ +# +# Pyserini: Python interface to the Anserini IR toolkit built on Lucene +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse +import json +import time +from tqdm import tqdm + +from ._searcher import NmslibSearcher +from pyserini.output_writer import get_output_writer, OutputFormat, tie_breaker + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Search a nmslib index.') + parser.add_argument('--index', type=str, metavar='path to index or index name', required=True, + help="Path to nmslib index.") + parser.add_argument('--topics', type=str, required=True, help="path to topics") + parser.add_argument('--hits', type=int, metavar='num', required=False, default=1000, help="Number of hits.") + parser.add_argument('--output-format', type=str, metavar='format', default=OutputFormat.TREC.value, + help=f"Format of output. Available: {[x.value for x in list(OutputFormat)]}") + parser.add_argument('--output', type=str, metavar='path', required=True, help="Path to output file.") + parser.add_argument('--ef', type=int, required=False, default=256, help="hnsw ef_search") + parser.add_argument('--threads', type=int, metavar='num', required=False, default=1, + help="maximum threads to use during search") + parser.add_argument('--batch-size', type=int, metavar='num', required=False, default=1, + help="search batch of queries in parallel") + parser.add_argument('--is-sparse', action='store_true', required=False) + args = parser.parse_args() + + searcher = NmslibSearcher(args.index, ef_search=args.ef, is_sparse=args.is_sparse) + + topic_ids = [] + topic_vectors = [] + with open(args.topics) as topic_f: + for line in topic_f: + info = json.loads(line) + topic_ids.append(info['id']) + topic_vectors.append(info['vector']) + + if not searcher: + exit() + + # build output path + output_path = args.output + + print(f'Running {args.topics} topics, saving to {output_path}...') + tag = 'HNSW' + + # support trec and msmarco format only for now + output_writer = get_output_writer(output_path, OutputFormat(args.output_format), max_hits=args.hits, tag=tag) + + search_time = 0 + with output_writer: + batch_topic_vectors = list() + batch_topic_ids = list() + for index, (topic_id, vec) in enumerate(tqdm(zip(topic_ids, topic_vectors))): + if args.batch_size <= 1 and args.threads <= 1: + start = time.time() + hits = searcher.search(vec, args.hits) + search_time += time.time() - start + results = [(topic_id, hits)] + else: + batch_topic_ids.append(str(topic_id)) + batch_topic_vectors.append(vec) + if (index + 1) % args.batch_size == 0 or \ + index == len(topic_ids) - 1: + start = time.time() + results = searcher.batch_search( + batch_topic_vectors, batch_topic_ids, args.hits, args.threads) + search_time += time.time() - start + results = [(id_, results[id_]) for id_ in batch_topic_ids] + batch_topic_ids.clear() + batch_topic_vectors.clear() + else: + continue + + for topic, hits in results: + output_writer.write(topic, tie_breaker(hits)) + + results.clear() + + print(f'Search {len(topic_ids)} topics in {search_time} seconds') diff --git a/pyserini/search/nmslib/_searcher.py b/pyserini/search/nmslib/_searcher.py new file mode 100644 index 0000000000000000000000000000000000000000..e0db6a24743c5e479d221f906c1bcc3bc107dc87 --- /dev/null +++ b/pyserini/search/nmslib/_searcher.py @@ -0,0 +1,142 @@ +# +# Pyserini: Python interface to the Anserini IR toolkit built on Lucene +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import json +import os +from dataclasses import dataclass +from typing import Dict, List + +import nmslib +import numpy as np +from scipy.sparse import csr_matrix, vstack + + +@dataclass +class SearchResult: + docid: str + score: float + + +class NmslibSearcher: + """Simple Searcher for vector representation + """ + + def __init__(self, index_dir: str, ef_search: int = 1000, is_sparse=False): + self.is_sparse = is_sparse + self.index, self.docids, self.token2id, self.metadata = self._load_index(index_dir, self.is_sparse) + self.index.setQueryTimeParams({'efSearch': ef_search}) + self.dimension = len(self.token2id) if self.is_sparse else None + + def search(self, query, k: int = 10) -> List[SearchResult]: + """Search the collection. + + Parameters + ---------- + query : query vector + k : int + Number of hits to return. + threads : int + Maximum number of threads to use for intra-query search. + Returns + ------- + List[SearchResult] + List of search results. + """ + if self.is_sparse: + query = self._token_dict_to_sparse_vector(query) + else: + query = np.array([query]) + indexes, scores = self.index.knnQueryBatch(query, k=k, num_threads=1)[0] + return [SearchResult(self.docids[idx], -score) + for score, idx in zip(scores, indexes) if idx != -1] + + def batch_search(self, queries, q_ids: List[str], k: int = 10, threads: int = 1) \ + -> Dict[str, List[SearchResult]]: + """ + + Parameters + ---------- + queries : vectors + q_ids : List[str] + List of corresponding query ids. + k : int + Number of hits to return. + threads : int + Maximum number of threads to use. + + Returns + ------- + Dict[str, List[SearchResult]] + Dictionary holding the search results, with the query ids as keys and the corresponding lists of search + results as the values. 
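+
+        Example (hypothetical index directory and toy query vectors; real vectors must match
+        the dimensionality of the underlying index):
+
+            from pyserini.search.nmslib import NmslibSearcher
+            searcher = NmslibSearcher('path/to/nmslib-index')
+            results = searcher.batch_search([[0.1, 0.3], [0.5, 0.2]], ['q1', 'q2'], k=10, threads=2)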
+ """ + if self.is_sparse: + queries = [self._token_dict_to_sparse_vector(query) for query in queries] + queries = vstack(queries) + else: + queries = np.array(queries) + I, D = zip(*self.index.knnQueryBatch(queries, k=k, num_threads=threads)) + return {key: [SearchResult(self.docids[idx], -score) + for score, idx in zip(distances, indexes) if idx != -1] + for key, distances, indexes in zip(q_ids, D, I)} + + def _load_index(self, index_dir: str, is_sparse: bool): + if is_sparse: + index = nmslib.init(method='hnsw', space='negdotprod_sparse', data_type=nmslib.DataType.SPARSE_VECTOR) + else: + index = nmslib.init(method='hnsw', space='negdotprod', data_type=nmslib.DataType.DENSE_VECTOR) + index_path = os.path.join(index_dir, 'index.bin') + docid_path = os.path.join(index_dir, 'docid') + tokens_path = os.path.join(index_dir, 'tokens') + metadata_path = os.path.join(index_dir, 'meta') + index.loadIndex(index_path, load_data=True) + docids = self._load_docids(docid_path) + token2id = self._load_tokens(tokens_path) + metadata = self._load_metadata(metadata_path) + return index, docids, token2id, metadata + + def _token_dict_to_sparse_vector(self, token_dict): + matrix_row, matrix_col, matrix_data = [], [], [] + tokens = token_dict.keys() + col = [] + data = [] + for tok in tokens: + if tok in self.token2id: + col.append(self.token2id[tok]) + data.append(token_dict[tok]) + matrix_row.extend([0] * len(col)) + matrix_col.extend(col) + matrix_data.extend(data) + vector = csr_matrix((matrix_data, (matrix_row, matrix_col)), shape=(1, self.dimension)) + return vector + + @staticmethod + def _load_docids(docid_path: str) -> List[str]: + docids = [line.rstrip() for line in open(docid_path, 'r').readlines()] + return docids + + @staticmethod + def _load_tokens(tokens_path: str): + if not os.path.exists(tokens_path): + return None + tokens = [line.rstrip() for line in open(tokens_path, 'r').readlines()] + return dict(zip(tokens, range(len(tokens)))) + + @staticmethod + def _load_metadata(metadata_path): + if not os.path.exists(metadata_path): + return None + meta = json.load(open(metadata_path)) + return meta diff --git a/pyserini/setup.py b/pyserini/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..1cc1560c24235a5242bad80c9735a52fa29c2ef5 --- /dev/null +++ b/pyserini/setup.py @@ -0,0 +1,40 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Module for adding Anserini jar to classpath for pyjnius usage +""" + +import glob +import os + +import jnius_config + + +def configure_classpath(anserini_root="."): + """ + Parameters + ---------- + anserini_root : str + (Optional) path to root anserini directory. 
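+
+    Example (illustrative path only; assumes an Anserini fatjar has already been built into
+    the given directory):
+
+        from pyserini.setup import configure_classpath
+        configure_classpath('/path/to/anserini/target')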
+ + """ + paths = glob.glob(os.path.join(anserini_root, 'anserini-*-fatjar.jar')) + if not paths: + raise Exception('No matching jar file found in {}'.format(os.path.abspath(anserini_root))) + + latest = max(paths, key=os.path.getctime) + jnius_config.add_classpath(latest) diff --git a/pyserini/tokenize_json_collection.py b/pyserini/tokenize_json_collection.py new file mode 100644 index 0000000000000000000000000000000000000000..b5ae47d6e6f6b7793bca32965c1946db1ffcb4fc --- /dev/null +++ b/pyserini/tokenize_json_collection.py @@ -0,0 +1,62 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import json +import os + +from transformers import BertTokenizer, T5Tokenizer + + +def write_to_file(tokenizer, input, output): + with open(input, encoding='utf-8') as f: + out_f = open(output, 'w') + for i, line in enumerate(f): + fdict = json.loads(line) + contents = fdict['contents'] + tok = tokenizer.tokenize(contents) + tokcont = ' ' + fdict['contents'] = tokcont.join(tok) + out_f.write(json.dumps(fdict) + '\n') + if i % 10000 == 0: + print(f'Converted {i:,} docs, writing into file {output}') + out_f.close() + + +def main(args): + if ('bert' in args.tokenizer): + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + else: + tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco') + if (os.path.isdir(args.input)): + for i, inf in enumerate(sorted(os.listdir(args.input))): + if not os.path.isdir(args.output): + os.mkdir(args.output) + outf = os.path.join(args.output, 'docs{:02d}.json'.format(i)) + write_to_file(tokenizer,os.path.join(args.input, inf), outf) + else: + write_to_file(tokenizer,args.input, args.output) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=str, help='Input file/dir', required=True) + parser.add_argument("--output", type=str, help='Output file/dir', required=True) + parser.add_argument("--tokenizer", type=str, help='full name of tokenizer', default='bert-base-uncased') + + args = parser.parse_args() + + main(parser.parse_args()) \ No newline at end of file diff --git a/pyserini/trectools/__init__.py b/pyserini/trectools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c5faca29258764830765c3a4709f81c4fded322f --- /dev/null +++ b/pyserini/trectools/__init__.py @@ -0,0 +1,19 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from ._base import AggregationMethod, RescoreMethod, TrecRun, Qrels + +__all__ = ['AggregationMethod', 'RescoreMethod', 'TrecRun', 'Qrels'] diff --git a/pyserini/trectools/__pycache__/__init__.cpython-310.pyc b/pyserini/trectools/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2bb310a213c36dc35fe93cfbcabfcc12b3383049 Binary files /dev/null and b/pyserini/trectools/__pycache__/__init__.cpython-310.pyc differ diff --git a/pyserini/trectools/__pycache__/_base.cpython-310.pyc b/pyserini/trectools/__pycache__/_base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ec8cd46da44b58792e55ac20c2dfe55222dbea1 Binary files /dev/null and b/pyserini/trectools/__pycache__/_base.cpython-310.pyc differ diff --git a/pyserini/trectools/_base.py b/pyserini/trectools/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..e75a13a8b11cc57c5b5f777f84b7f9be7935fc42 --- /dev/null +++ b/pyserini/trectools/_base.py @@ -0,0 +1,351 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import itertools +import numpy as np +import pandas as pd + +from concurrent.futures import ThreadPoolExecutor +from copy import deepcopy +from enum import Enum +from typing import List, Set, Tuple + + +class AggregationMethod(Enum): + SUM = 'sum' + + +class RescoreMethod(Enum): + RRF = 'rrf' + SCALE = 'scale' + NORMALIZE = 'normalize' + + +class Qrels: + """Wrapper class for TREC Qrels. + + Parameters + ---------- + filepath : str + File path of a given TREC Qrels. + """ + + columns = ['topic', 'q0', 'docid', 'relevance_grade'] + + def __init__(self, filepath: str = None): + self.filepath = filepath + self.qrels_data = pd.DataFrame(columns=Qrels.columns) + + if filepath is not None: + self.read_run(self.filepath) + + def read_run(self, filepath: str): + self.qrels_data = pd.read_csv(filepath, sep='\s+', names=Qrels.columns) + + def get_relevance_grades(self) -> Set[str]: + """Return a set with all relevance grades.""" + + return set(sorted(self.qrels_data["relevance_grade"].unique())) + + def topics(self) -> Set[str]: + """Return a set with all topics.""" + + return set(sorted(self.qrels_data["topic"].unique())) + + def get_docids(self, topic, relevance_grades=None) -> List[str]: + """"Return a list of docids for a given topic and a list relevance grades. + + Parameters: + ---------- + relevance : List[int] + E.g. [0, 1, 2]. If not provided, then all relevance will be returned. + topic : int + """ + + if relevance_grades is None: + relevance_grades = self.get_relevance_grades() + + filtered_df = self.qrels_data[self.qrels_data['topic'] == topic] + filtered_df = filtered_df[filtered_df['relevance_grade'].isin(relevance_grades)] + + return filtered_df['docid'].tolist() + + +class TrecRun: + """Wrapper class for a TREC run. 
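+
+    A run is held internally as a pandas DataFrame whose columns follow ``TrecRun.columns``
+    ('topic', 'q0', 'docid', 'rank', 'score', 'tag').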
+ + Parameters + ---------- + filepath : str + File path of a given TREC Run. + """ + + columns = ['topic', 'q0', 'docid', 'rank', 'score', 'tag'] + + def __init__(self, filepath: str = None, resort: bool = False): + self.reset_data() + self.filepath = filepath + self.resort = resort + + if filepath is not None: + self.read_run(self.filepath,self.resort) + + def reset_data(self): + self.run_data = pd.DataFrame(columns=TrecRun.columns) + + def read_run(self, filepath: str, resort: bool = False) -> None: + self.run_data = pd.read_csv(filepath, sep='\s+', names=TrecRun.columns, dtype={'docid': 'str'}) + if resort: + self.run_data.sort_values(["topic", "score"], inplace=True, ascending=[True, False]) + self.run_data["rank"] = self.run_data.groupby("topic")["score"].rank(ascending=False,method='first') + + def topics(self) -> Set[str]: + """Return a set with all topics.""" + return set(sorted(self.run_data["topic"].unique())) + + def clone(self): + """Return a deep copy of the current instance.""" + return deepcopy(self) + + def save_to_txt(self, output_path: str, tag: str = None) -> None: + if len(self.run_data) == 0: + raise Exception('Nothing to save. TrecRun is empty') + + if tag is not None: + self.run_data['tag'] = tag + + self.run_data = self.run_data.sort_values(by=['topic', 'score'], ascending=[True, False]) + self.run_data.to_csv(output_path, sep=' ', header=False, index=False) + + def get_docs_by_topic(self, topic: str, max_docs: int = None): + docs = self.run_data[self.run_data['topic'] == topic] + + if max_docs is not None: + docs = docs.head(max_docs) + + return docs + + def rescore(self, method: RescoreMethod, rrf_k: int = None, scale: float = None): + # Refer to this guide on how to efficiently manipulate dataframes: https://engineering.upside.com/a-beginners-guide-to-optimizing-pandas-code-for-speed-c09ef2c6a4d6 + if method == RescoreMethod.RRF: + assert rrf_k is not None, 'Parameter "rrf_k" must be a valid integer.' + self.run_data['score'] = 1 / (rrf_k + self.run_data['rank'].values) + elif method == RescoreMethod.SCALE: + assert scale is not None, 'Parameter "scale" must not be none.' + self.run_data['score'] = self.run_data['score'].values * scale + elif method == RescoreMethod.NORMALIZE: + for topic in self.topics(): + scores = self.run_data[self.run_data['topic'] == topic]['score'].copy().values + low = np.min(scores) + high = np.max(scores) + + if high - low == 0: + self.run_data.loc[self.run_data['topic'] == topic, 'score'] = 1 + else: + scores = (scores - low) / (high - low) + scores = [float(score) for score in scores] + self.run_data.loc[self.run_data['topic'] == topic, 'score'] = scores + else: + raise NotImplementedError() + + return self + + def to_numpy(self) -> np.ndarray: + return self.run_data.to_numpy(copy=True) + + def discard_qrels(self, qrels: Qrels, clone=True): + """Discard each docid in self if docid is also in the given qrels. + This operation is performed on each topic separately. + + Parameters: + ---------- + qrels : Qrels + Qrels with docids to remove from TrecRun. + clone : Bool + Return a new TrecRun object if True, else self will be modified and returned. + """ + + return self._filter_from_qrels(qrels, False, clone=clone) + + def retain_qrels(self, qrels: Qrels, clone=True): + """Retain each docid in self if docid is also in the given qrels. + This operation is performed on each topic separately. + After this operation, judged@x based on the given qrels should be 1. 
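+        In other words, only judged documents (at any relevance grade) are kept; unjudged
+        documents are dropped from every topic.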
+ + Parameters: + ---------- + qrels : Qrels + Qrels with docids to keep in TrecRun. + clone : Bool + Return a new TrecRun object if True, else self will be modified and returned. + """ + + return self._filter_from_qrels(qrels, True, clone=clone) + + def _filter_from_qrels(self, qrels: Qrels, keep: bool, clone=True): + """Private helper function to remove/keep each docid in self if docid is also in the given Qrels object. + This operation is performed on each topic separately. + + Parameters: + ---------- + qrels : Qrels + Qrels with docids to remove from or keep in TrecRun. + clone : Bool + Return a new TrecRun object if True, else self will be modified and returned. + """ + + df_list = [] + for topic in self.topics(): + if topic not in qrels.topics(): + continue + + qrels_docids = qrels.get_docids(topic) + topic_df = self.run_data[self.run_data['topic'] == topic] + if keep is True: + topic_df = topic_df[topic_df['docid'].isin(qrels_docids)] + else: + topic_df = topic_df[~topic_df['docid'].isin(qrels_docids)] + df_list.append(topic_df) + + run = TrecRun() if clone is True else self + return TrecRun.from_dataframes(df_list, run) + + @staticmethod + def get_all_topics_from_runs(runs) -> Set[str]: + all_topics = set() + for run in runs: + all_topics = all_topics.union(run.topics()) + + return all_topics + + @staticmethod + def merge(runs, aggregation: AggregationMethod, depth: int = None, k: int = None): + """Return a TrecRun by aggregating docid in various ways such as summing scores + + Parameters + ---------- + runs : List[TrecRun] + List of ``TrecRun`` objects. + aggregation : AggregationMethod + The aggregation method to use. + depth : int + Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates that + the complete list of results is considered. + k : int + Length of final results list. Set to ``None`` by default, which indicates that the union of all input documents + are ranked. + """ + + if len(runs) < 2: + raise Exception('Merge requires at least 2 runs.') + + rows = [] + + if aggregation == AggregationMethod.SUM: + topics = list(TrecRun.get_all_topics_from_runs(runs)) + + def merge_topic(topic): + doc_scores = dict() + + for run in runs: + for docid, score in run.get_docs_by_topic(topic, depth)[['docid', 'score']].values: + doc_scores[docid] = doc_scores.get(docid, 0.0) + score + + sorted_doc_scores = sorted(iter(doc_scores.items()), key=lambda x: (-x[1], x[0])) + sorted_doc_scores = sorted_doc_scores if k is None else sorted_doc_scores[:k] + + return [ + (topic, 'Q0', docid, rank, score, 'merge_sum') + for rank, (docid, score) in enumerate(sorted_doc_scores, start=1) + ] + + max_workers = max(len(topics)/10, 1) + with ThreadPoolExecutor(max_workers=int(max_workers)) as exec: + results = list(exec.map(merge_topic, topics)) + + rows = list(itertools.chain.from_iterable(results)) + else: + raise NotImplementedError() + + return TrecRun.from_list(rows) + + @staticmethod + def from_dataframes(dfs, run=None): + """Return a TrecRun by populating dataframe with the provided list of dataframes. + + Parameters + ---------- + dfs: List[Dataframe] + A list of Dataframes conforming to TrecRun.columns + + run: TrecRun + Set to ``None`` by default. If None, then a new instance of TrecRun will be created. + Else, the given TrecRun will be modified. 
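+
+        Example (toy dataframe for illustration; the columns must follow ``TrecRun.columns``):
+
+            import pandas as pd
+            from pyserini.trectools import TrecRun
+            df = pd.DataFrame([('q1', 'Q0', 'doc1', 1, 12.5, 'tag')], columns=TrecRun.columns)
+            run = TrecRun.from_dataframes([df])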
+ """ + + res = TrecRun() if run is None else run + res.reset_data() + res.run_data = pd.concat([df for df in dfs]) + + return res + + @staticmethod + def from_list(rows, run=None): + """Return a TrecRun by populating dataframe with the provided list of tuples. + For performance reasons, df.to_numpy() is faster than df.iterrows(). + When manipulating dataframes, we first dump to np.ndarray and construct a list of tuples with new values. + Then use this function to convert the list of tuples to a TrecRun object. + + Parameters + ---------- + rows: List[tuples] + List of tuples in the following format: (topic, 'Q0', docid, rank, score, tag) + + run: TrecRun + Set to ``None`` by default. If None, then a new instance of TrecRun will be created. + Else, the given TrecRun will be modified. + """ + + res = TrecRun() if run is None else run + + df = pd.DataFrame(rows) + df.columns = TrecRun.columns + res.run_data = df.copy() + + return res + + @staticmethod + def from_search_results(docid_score_pair: Tuple[str, float], topic=1): + rows = [] + + for rank, (docid, score) in enumerate(docid_score_pair, start=1): + rows.append((topic, 'Q0', docid, rank, score, 'searcher')) + + return TrecRun.from_list(rows) + + @staticmethod + def concat(runs): + """Return a new TrecRun by concatenating a list of TrecRuns + + Parameters + ---------- + runs : List[TrecRun] + List of ``TrecRun`` objects. + """ + + run = TrecRun() + run.run_data = pd.concat([run.run_data for run in runs]) + return run diff --git a/pyserini/util.py b/pyserini/util.py new file mode 100644 index 0000000000000000000000000000000000000000..5309153425c59e89cbb61e9db48aee8a06900bcd --- /dev/null +++ b/pyserini/util.py @@ -0,0 +1,283 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import hashlib +import os +import re +import shutil +import tarfile +import logging +from urllib.error import HTTPError, URLError +from urllib.request import urlretrieve + +import pandas as pd +from tqdm import tqdm + +from pyserini.encoded_query_info import QUERY_INFO +from pyserini.encoded_corpus_info import CORPUS_INFO +from pyserini.evaluate_script_info import EVALUATION_INFO +from pyserini.prebuilt_index_info import TF_INDEX_INFO, FAISS_INDEX_INFO, IMPACT_INDEX_INFO + + +logger = logging.getLogger(__name__) + + +# https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5 +class TqdmUpTo(tqdm): + def update_to(self, b=1, bsize=1, tsize=None): + """ + b : int, optional + Number of blocks transferred so far [default: 1]. + bsize : int, optional + Size of each block (in tqdm units) [default: 1]. + tsize : int, optional + Total size (in tqdm units). If [default: None] remains unchanged. + """ + if tsize is not None: + self.total = tsize + self.update(b * bsize - self.n) # will also set self.n = b * bsize + + +# For large files, we need to compute MD5 block by block. 
See: +# https://stackoverflow.com/questions/1131220/get-md5-hash-of-big-files-in-python +def compute_md5(file, block_size=2**20): + m = hashlib.md5() + with open(file, 'rb') as f: + while True: + buf = f.read(block_size) + if not buf: + break + m.update(buf) + return m.hexdigest() + + +def download_url(url, save_dir, local_filename=None, md5=None, force=False, verbose=True): + # If caller does not specify local filename, figure it out from the download URL: + if not local_filename: + filename = url.split('/')[-1] + filename = re.sub('\\?dl=1$', '', filename) # Remove the Dropbox 'force download' parameter + else: + # Otherwise, use the specified local_filename: + filename = local_filename + + destination_path = os.path.join(save_dir, filename) + + if verbose: + print(f'Downloading {url} to {destination_path}...') + + # Check to see if file already exists, if so, simply return (quietly) unless force=True, in which case we remove + # destination file and download fresh copy. + if os.path.exists(destination_path): + if verbose: + print(f'{destination_path} already exists!') + if not force: + if verbose: + print(f'Skipping download.') + return destination_path + if verbose: + print(f'force=True, removing {destination_path}; fetching fresh copy...') + os.remove(destination_path) + + with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc=filename) as t: + urlretrieve(url, filename=destination_path, reporthook=t.update_to) + + if md5: + md5_computed = compute_md5(destination_path) + assert md5_computed == md5, f'{destination_path} does not match checksum! Expecting {md5} got {md5_computed}.' + + return destination_path + + +def get_cache_home(): + custom_dir = os.environ.get("PYSERINI_CACHE") + if custom_dir is not None and custom_dir != '': + return custom_dir + return os.path.expanduser(os.path.join(f'~{os.path.sep}.cache', "pyserini")) + +def download_and_unpack_index(url, index_directory='indexes', local_filename=False, + force=False, verbose=True, prebuilt=False, md5=None): + # If caller does not specify local filename, figure it out from the download URL: + if not local_filename: + index_name = url.split('/')[-1] + else: + # Otherwise, use the specified local_filename: + index_name = local_filename + # Remove the suffix: + index_name = re.sub('''.tar.gz.*$''', '', index_name) + + if prebuilt: + index_directory = os.path.join(get_cache_home(), index_directory) + index_path = os.path.join(index_directory, f'{index_name}.{md5}') + + if not os.path.exists(index_directory): + os.makedirs(index_directory) + + local_tarball = os.path.join(index_directory, f'{index_name}.tar.gz') + # If there's a local tarball, it's likely corrupted, because we remove the local tarball on success (below). + # So, we want to remove. + if os.path.exists(local_tarball): + os.remove(local_tarball) + else: + local_tarball = os.path.join(index_directory, f'{index_name}.tar.gz') + index_path = os.path.join(index_directory, f'{index_name}') + + # Check to see if index already exists, if so, simply return (quietly) unless force=True, in which case we remove + # index and download fresh copy. 
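+    # For prebuilt artifacts the target path is <cache>/<index_directory>/<name>.<md5>, where
+    # <cache> is $PYSERINI_CACHE if set and ~/.cache/pyserini otherwise.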
+ if os.path.exists(index_path): + if not force: + if verbose: + print(f'{index_path} already exists, skipping download.') + return index_path + if verbose: + print(f'{index_path} already exists, but force=True, removing {index_path} and fetching fresh copy...') + shutil.rmtree(index_path) + + print(f'Downloading index at {url}...') + download_url(url, index_directory, local_filename=local_filename, verbose=False, md5=md5) + + if verbose: + print(f'Extracting {local_tarball} into {index_path}...') + try: + tarball = tarfile.open(local_tarball) + except: + local_tarball = os.path.join(index_directory, f'{index_name}') + tarball = tarfile.open(local_tarball) + dirs_in_tarball = [member.name for member in tarball if member.isdir()] + assert len(dirs_in_tarball), f"Detect multiple members ({', '.join(dirs_in_tarball)}) under the tarball {local_tarball}." + tarball.extractall(index_directory) + tarball.close() + os.remove(local_tarball) + + if prebuilt: + dir_in_tarball = dirs_in_tarball[0] + if dir_in_tarball != index_name: + logger.info(f"Renaming {index_directory}/{dir_in_tarball} into {index_directory}/{index_name}.") + index_name = dir_in_tarball + os.rename(os.path.join(index_directory, f'{index_name}'), index_path) + + return index_path + + +def check_downloaded(index_name): + if index_name in TF_INDEX_INFO: + target_index = TF_INDEX_INFO[index_name] + elif index_name in IMPACT_INDEX_INFO: + target_index = IMPACT_INDEX_INFO[index_name] + else: + target_index = FAISS_INDEX_INFO[index_name] + index_url = target_index['urls'][0] + index_md5 = target_index['md5'] + index_name = index_url.split('/')[-1] + index_name = re.sub('''.tar.gz.*$''', '', index_name) + index_directory = os.path.join(get_cache_home(), 'indexes') + index_path = os.path.join(index_directory, f'{index_name}.{index_md5}') + + return os.path.exists(index_path) + + +def get_sparse_indexes_info(): + df = pd.DataFrame.from_dict({**TF_INDEX_INFO, **IMPACT_INDEX_INFO}) + for index in df.keys(): + df[index]['downloaded'] = check_downloaded(index) + + with pd.option_context('display.max_rows', None, 'display.max_columns', + None, 'display.max_colwidth', None, 'display.colheader_justify', 'left'): + print(df) + + +def get_impact_indexes_info(): + df = pd.DataFrame.from_dict(IMPACT_INDEX_INFO) + for index in df.keys(): + df[index]['downloaded'] = check_downloaded(index) + + with pd.option_context('display.max_rows', None, 'display.max_columns', + None, 'display.max_colwidth', None, 'display.colheader_justify', 'left'): + print(df) + + +def get_dense_indexes_info(): + df = pd.DataFrame.from_dict(FAISS_INDEX_INFO) + for index in df.keys(): + df[index]['downloaded'] = check_downloaded(index) + + with pd.option_context('display.max_rows', None, 'display.max_columns', + None, 'display.max_colwidth', None, 'display.colheader_justify', 'left'): + print(df) + + +def download_prebuilt_index(index_name, force=False, verbose=True, mirror=None): + if index_name not in TF_INDEX_INFO and index_name not in FAISS_INDEX_INFO and index_name not in IMPACT_INDEX_INFO: + raise ValueError(f'Unrecognized index name {index_name}') + if index_name in TF_INDEX_INFO: + target_index = TF_INDEX_INFO[index_name] + elif index_name in IMPACT_INDEX_INFO: + target_index = IMPACT_INDEX_INFO[index_name] + else: + target_index = FAISS_INDEX_INFO[index_name] + index_md5 = target_index['md5'] + for url in target_index['urls']: + local_filename = target_index['filename'] if 'filename' in target_index else None + try: + return download_and_unpack_index(url, 
local_filename=local_filename, + prebuilt=True, md5=index_md5, verbose=verbose) + except (HTTPError, URLError) as e: + print(f'Unable to download pre-built index at {url}, trying next URL...') + raise ValueError(f'Unable to download pre-built index at any known URLs.') + + +def download_encoded_queries(query_name, force=False, verbose=True, mirror=None): + if query_name not in QUERY_INFO: + raise ValueError(f'Unrecognized query name {query_name}') + query_md5 = QUERY_INFO[query_name]['md5'] + for url in QUERY_INFO[query_name]['urls']: + try: + return download_and_unpack_index(url, index_directory='queries', prebuilt=True, md5=query_md5) + except (HTTPError, URLError) as e: + print(f'Unable to download encoded query at {url}, trying next URL...') + raise ValueError(f'Unable to download encoded query at any known URLs.') + + +def download_encoded_corpus(corpus_name, force=False, verbose=True, mirror=None): + if corpus_name not in CORPUS_INFO: + raise ValueError(f'Unrecognized corpus name {corpus_name}') + corpus_md5 = CORPUS_INFO[corpus_name]['md5'] + for url in CORPUS_INFO[corpus_name]['urls']: + local_filename = CORPUS_INFO[corpus_name]['filename'] if 'filename' in CORPUS_INFO[corpus_name] else None + try: + return download_and_unpack_index(url, local_filename=local_filename, index_directory='corpus', prebuilt=True, md5=corpus_md5) + except (HTTPError, URLError) as e: + print(f'Unable to download encoded corpus at {url}, trying next URL...') + raise ValueError(f'Unable to download encoded corpus at any known URLs.') + + +def download_evaluation_script(evaluation_name, force=False, verbose=True, mirror=None): + if evaluation_name not in EVALUATION_INFO: + raise ValueError(f'Unrecognized evaluation name {evaluation_name}') + for url in EVALUATION_INFO[evaluation_name]['urls']: + try: + save_dir = os.path.join(get_cache_home(), 'eval') + if not os.path.exists(save_dir): + os.makedirs(save_dir) + return download_url(url, save_dir=save_dir) + except HTTPError: + print(f'Unable to download evaluation script at {url}, trying next URL...') + raise ValueError(f'Unable to download evaluation script at any known URLs.') + + +def get_sparse_index(index_name): + if index_name not in FAISS_INDEX_INFO: + raise ValueError(f'Unrecognized index name {index_name}') + return FAISS_INDEX_INFO[index_name]["texts"] diff --git a/pyserini/vectorizer/__init__.py b/pyserini/vectorizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dafc6252140de2bc22696d1d19d5df7da82ccc67 --- /dev/null +++ b/pyserini/vectorizer/__init__.py @@ -0,0 +1,19 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from ._base import BM25Vectorizer, TfidfVectorizer + +__all__ = ['BM25Vectorizer', 'TfidfVectorizer'] diff --git a/pyserini/vectorizer/_base.py b/pyserini/vectorizer/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..255656c3848cfc3b6e436c2775cefc6ab05b05d1 --- /dev/null +++ b/pyserini/vectorizer/_base.py @@ -0,0 +1,194 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import math +from typing import List, Optional +from sklearn.preprocessing import normalize + +from scipy.sparse import csr_matrix + +from pyserini import index, search +from pyserini.analysis import Analyzer, get_lucene_analyzer +from tqdm import tqdm + + +class Vectorizer: + """Base class for vectorizer implemented on top of Pyserini. + + Parameters + ---------- + lucene_index_path : str + Path to lucene index folder + min_df : int + Minimum acceptable document frequency + verbose : bool + Whether to print out debugging information + """ + + def __init__(self, lucene_index_path: str, min_df: int = 1, verbose: bool = False): + self.min_df: int = min_df + self.verbose: bool = verbose + self.index_reader = index.IndexReader(lucene_index_path) + self.searcher = search.LuceneSearcher(lucene_index_path) + self.num_docs: int = self.searcher.num_docs + self.stats = self.index_reader.stats() + self.analyzer = Analyzer(get_lucene_analyzer()) + + # build vocabulary + self.vocabulary_ = set() + for term in self.index_reader.terms(): + if term.df > self.min_df: + self.vocabulary_.add(term.term) + self.vocabulary_ = sorted(self.vocabulary_) + + # build term to index mapping + self.term_to_index = {} + for i, term in enumerate(self.vocabulary_): + self.term_to_index[term] = i + self.vocabulary_size = len(self.vocabulary_) + + if self.verbose: + print(f'Found {self.vocabulary_size} terms with min_df={self.min_df}') + + def get_query_vector(self, query: str): + matrix_row, matrix_col, matrix_data = [], [], [] + tokens = self.analyzer.analyze(query) + for term in tokens: + if term in self.vocabulary_: + matrix_row.append(0) + matrix_col.append(self.term_to_index[term]) + matrix_data.append(1) + vectors = csr_matrix((matrix_data, (matrix_row, matrix_col)), shape=(1, self.vocabulary_size)) + return vectors + + +class TfidfVectorizer(Vectorizer): + """Wrapper class for tf-idf vectorizer implemented on top of Pyserini. 
+
+    Parameters
+    ----------
+    lucene_index_path : str
+        Path to lucene index folder
+    min_df : int
+        Minimum acceptable document frequency
+    verbose : bool
+        Whether to print out debugging information
+    """
+
+    def __init__(self, lucene_index_path: str, min_df: int = 1, verbose: bool = False):
+        super().__init__(lucene_index_path, min_df, verbose)
+
+        self.idf_ = {}
+        for term in self.index_reader.terms():
+            self.idf_[term.term] = math.log(self.num_docs / term.df)
+
+    def get_vectors(self, docids: List[str], norm: Optional[str] = 'l2'):
+        """Get the tf-idf vectors given a list of docids.
+
+        Parameters
+        ----------
+        docids : List[str]
+            List of document ids to vectorize.
+        norm : Optional[str]
+            Norm (e.g., 'l1' or 'l2') used to normalize the rows of the sparse matrix; None to skip normalization.
+
+        Returns
+        -------
+        csr_matrix
+            Sparse matrix representation of tf-idf vectors
+        """
+        matrix_row, matrix_col, matrix_data = [], [], []
+        num_docs = len(docids)
+
+        for i, doc_id in enumerate(tqdm(docids)):
+            # Term frequencies for this document
+            tf = self.index_reader.get_document_vector(doc_id)
+            if tf is None:
+                continue
+
+            # Filter out terms that are not in the vocabulary
+            tf = {t: tf[t] for t in tf if t in self.term_to_index}
+
+            # Convert from dict to sparse matrix
+            for term in tf:
+                tfidf = tf[term] * self.idf_[term]
+                matrix_row.append(i)
+                matrix_col.append(self.term_to_index[term])
+                matrix_data.append(tfidf)
+
+        vectors = csr_matrix((matrix_data, (matrix_row, matrix_col)), shape=(num_docs, self.vocabulary_size))
+
+        if norm:
+            return normalize(vectors, norm=norm)
+        return vectors
+
+
+class BM25Vectorizer(Vectorizer):
+    """Wrapper class for BM25 vectorizer implemented on top of Pyserini.
+
+    Parameters
+    ----------
+    lucene_index_path : str
+        Path to lucene index folder
+    min_df : int
+        Minimum acceptable document frequency
+    verbose : bool
+        Whether to print out debugging information
+    """
+
+    def __init__(self, lucene_index_path: str, min_df: int = 1, verbose: bool = False):
+        super().__init__(lucene_index_path, min_df, verbose)
+
+    def get_vectors(self, docids: List[str], norm: Optional[str] = 'l2'):
+        """Get the BM25 vectors given a list of docids.
+
+        Parameters
+        ----------
+        docids : List[str]
+            List of document ids to vectorize.
+        norm : Optional[str]
+            Norm (e.g., 'l1' or 'l2') used to normalize the rows of the sparse matrix; None to skip normalization.
+
+        Returns
+        -------
+        csr_matrix
+            Sparse matrix representation of BM25 vectors
+        """
+        matrix_row, matrix_col, matrix_data = [], [], []
+        num_docs = len(docids)
+
+        for i, doc_id in enumerate(tqdm(docids)):
+            # Term frequencies for this document
+            tf = self.index_reader.get_document_vector(doc_id)
+            if tf is None:
+                continue
+
+            # Filter out terms that are not in the vocabulary
+            tf = {t: tf[t] for t in tf if t in self.term_to_index}
+
+            # Convert from dict to sparse matrix
+            for term in tf:
+                bm25_weight = self.index_reader.compute_bm25_term_weight(doc_id, term, analyzer=None)
+                matrix_row.append(i)
+                matrix_col.append(self.term_to_index[term])
+                matrix_data.append(bm25_weight)
+
+        vectors = csr_matrix((matrix_data, (matrix_row, matrix_col)), shape=(num_docs, self.vocabulary_size))
+
+        if norm:
+            return normalize(vectors, norm=norm)
+        return vectors
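
To make the intended use of the vectorizer classes above concrete, here is a minimal usage sketch (not part of the patch). The index path and document ids are placeholders; any Lucene index readable by Pyserini, with docids that actually exist in that index, would work the same way, and TfidfVectorizer is a drop-in replacement for BM25Vectorizer here.

    from pyserini.vectorizer import BM25Vectorizer

    # Hypothetical local index path and document ids -- substitute your own.
    vectorizer = BM25Vectorizer('indexes/sample_collection_jsonl', min_df=5, verbose=True)
    docids = ['doc1', 'doc2', 'doc3']

    # Rows correspond to docids, columns to the vocabulary built at construction time.
    doc_matrix = vectorizer.get_vectors(docids, norm='l2')   # scipy.sparse.csr_matrix

    # Queries are analyzed with the same Lucene analyzer and mapped into the same vocabulary,
    # so a sparse dot product scores each document against the query.
    query_vector = vectorizer.get_query_vector('black bear attacks')
    scores = doc_matrix.dot(query_vector.T).toarray().ravel()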
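
Similarly, a rough sketch of how the download helpers from the earlier hunk are typically driven. The module path (pyserini.util) and the index name are assumptions: the file header for that hunk is not visible above, and the name must be a key in TF_INDEX_INFO, IMPACT_INDEX_INFO, or FAISS_INDEX_INFO.

    # Assumed module path; adjust to wherever these helpers actually live.
    from pyserini.util import check_downloaded, download_prebuilt_index, get_cache_home

    index_name = 'some-prebuilt-index'   # hypothetical key in one of the *_INDEX_INFO registries

    print(get_cache_home())              # ~/.cache/pyserini by default, or $PYSERINI_CACHE if set
    print(check_downloaded(index_name))  # True once <cache>/indexes/<name>.<md5> exists

    # Tries each registered URL in turn, verifies the md5, unpacks the tarball, and returns the
    # local path; if the index is already cached, the existing path is returned without re-downloading.
    index_path = download_prebuilt_index(index_name, verbose=True)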