Liangrj5 committed
Commit 5019d3f (1 Parent(s): 876e08a)
.gitattributes CHANGED
@@ -1,3 +1,6 @@
+ *.json filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
  *.7z filter=lfs diff=lfs merge=lfs -text
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
@@ -33,5 +36,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
- *.json filter=lfs diff=lfs merge=lfs -text
- *.csv filter=lfs diff=lfs merge=lfs -text
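(For reference, LFS patterns like the three added here are normally registered with `git lfs track`, which appends the matching lines to `.gitattributes`; the commands below are illustrative, not part of this commit.)

```shell
git lfs track "*.json" "*.h5" "*.csv"
git add .gitattributes
```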
 
.gitignore ADDED
@@ -0,0 +1,139 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ unused
10
+
11
+ results
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ pip-wheel-metadata/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # IPython
84
+ profile_default/
85
+ ipython_config.py
86
+
87
+ # pyenv
88
+ .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # custom
135
+ .idea/
136
+ .vscode/
137
+ data/tvr_feature_release/
138
+
139
+
LICENSE ADDED
@@ -0,0 +1,121 @@
1
+ Creative Commons Legal Code
2
+
3
+ CC0 1.0 Universal
4
+
5
+ CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6
+ LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7
+ ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8
+ INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9
+ REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10
+ PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11
+ THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12
+ HEREUNDER.
13
+
14
+ Statement of Purpose
15
+
16
+ The laws of most jurisdictions throughout the world automatically confer
17
+ exclusive Copyright and Related Rights (defined below) upon the creator
18
+ and subsequent owner(s) (each and all, an "owner") of an original work of
19
+ authorship and/or a database (each, a "Work").
20
+
21
+ Certain owners wish to permanently relinquish those rights to a Work for
22
+ the purpose of contributing to a commons of creative, cultural and
23
+ scientific works ("Commons") that the public can reliably and without fear
24
+ of later claims of infringement build upon, modify, incorporate in other
25
+ works, reuse and redistribute as freely as possible in any form whatsoever
26
+ and for any purposes, including without limitation commercial purposes.
27
+ These owners may contribute to the Commons to promote the ideal of a free
28
+ culture and the further production of creative, cultural and scientific
29
+ works, or to gain reputation or greater distribution for their Work in
30
+ part through the use and efforts of others.
31
+
32
+ For these and/or other purposes and motivations, and without any
33
+ expectation of additional consideration or compensation, the person
34
+ associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35
+ is an owner of Copyright and Related Rights in the Work, voluntarily
36
+ elects to apply CC0 to the Work and publicly distribute the Work under its
37
+ terms, with knowledge of his or her Copyright and Related Rights in the
38
+ Work and the meaning and intended legal effect of CC0 on those rights.
39
+
40
+ 1. Copyright and Related Rights. A Work made available under CC0 may be
41
+ protected by copyright and related or neighboring rights ("Copyright and
42
+ Related Rights"). Copyright and Related Rights include, but are not
43
+ limited to, the following:
44
+
45
+ i. the right to reproduce, adapt, distribute, perform, display,
46
+ communicate, and translate a Work;
47
+ ii. moral rights retained by the original author(s) and/or performer(s);
48
+ iii. publicity and privacy rights pertaining to a person's image or
49
+ likeness depicted in a Work;
50
+ iv. rights protecting against unfair competition in regards to a Work,
51
+ subject to the limitations in paragraph 4(a), below;
52
+ v. rights protecting the extraction, dissemination, use and reuse of data
53
+ in a Work;
54
+ vi. database rights (such as those arising under Directive 96/9/EC of the
55
+ European Parliament and of the Council of 11 March 1996 on the legal
56
+ protection of databases, and under any national implementation
57
+ thereof, including any amended or successor version of such
58
+ directive); and
59
+ vii. other similar, equivalent or corresponding rights throughout the
60
+ world based on applicable law or treaty, and any national
61
+ implementations thereof.
62
+
63
+ 2. Waiver. To the greatest extent permitted by, but not in contravention
64
+ of, applicable law, Affirmer hereby overtly, fully, permanently,
65
+ irrevocably and unconditionally waives, abandons, and surrenders all of
66
+ Affirmer's Copyright and Related Rights and associated claims and causes
67
+ of action, whether now known or unknown (including existing as well as
68
+ future claims and causes of action), in the Work (i) in all territories
69
+ worldwide, (ii) for the maximum duration provided by applicable law or
70
+ treaty (including future time extensions), (iii) in any current or future
71
+ medium and for any number of copies, and (iv) for any purpose whatsoever,
72
+ including without limitation commercial, advertising or promotional
73
+ purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74
+ member of the public at large and to the detriment of Affirmer's heirs and
75
+ successors, fully intending that such Waiver shall not be subject to
76
+ revocation, rescission, cancellation, termination, or any other legal or
77
+ equitable action to disrupt the quiet enjoyment of the Work by the public
78
+ as contemplated by Affirmer's express Statement of Purpose.
79
+
80
+ 3. Public License Fallback. Should any part of the Waiver for any reason
81
+ be judged legally invalid or ineffective under applicable law, then the
82
+ Waiver shall be preserved to the maximum extent permitted taking into
83
+ account Affirmer's express Statement of Purpose. In addition, to the
84
+ extent the Waiver is so judged Affirmer hereby grants to each affected
85
+ person a royalty-free, non transferable, non sublicensable, non exclusive,
86
+ irrevocable and unconditional license to exercise Affirmer's Copyright and
87
+ Related Rights in the Work (i) in all territories worldwide, (ii) for the
88
+ maximum duration provided by applicable law or treaty (including future
89
+ time extensions), (iii) in any current or future medium and for any number
90
+ of copies, and (iv) for any purpose whatsoever, including without
91
+ limitation commercial, advertising or promotional purposes (the
92
+ "License"). The License shall be deemed effective as of the date CC0 was
93
+ applied by Affirmer to the Work. Should any part of the License for any
94
+ reason be judged legally invalid or ineffective under applicable law, such
95
+ partial invalidity or ineffectiveness shall not invalidate the remainder
96
+ of the License, and in such case Affirmer hereby affirms that he or she
97
+ will not (i) exercise any of his or her remaining Copyright and Related
98
+ Rights in the Work or (ii) assert any associated claims and causes of
99
+ action with respect to the Work, in either case contrary to Affirmer's
100
+ express Statement of Purpose.
101
+
102
+ 4. Limitations and Disclaimers.
103
+
104
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
105
+ surrendered, licensed or otherwise affected by this document.
106
+ b. Affirmer offers the Work as-is and makes no representations or
107
+ warranties of any kind concerning the Work, express, implied,
108
+ statutory or otherwise, including without limitation warranties of
109
+ title, merchantability, fitness for a particular purpose, non
110
+ infringement, or the absence of latent or other defects, accuracy, or
111
+ the present or absence of errors, whether or not discoverable, all to
112
+ the greatest extent permissible under applicable law.
113
+ c. Affirmer disclaims responsibility for clearing rights of other persons
114
+ that may apply to the Work or any use thereof, including without
115
+ limitation any person's Copyright and Related Rights in the Work.
116
+ Further, Affirmer disclaims responsibility for obtaining any necessary
117
+ consents, permissions or other rights required for any use of the
118
+ Work.
119
+ d. Affirmer understands and acknowledges that Creative Commons is not a
120
+ party to this document and has no duty or obligation with respect to
121
+ this CC0 or use of the Work.
README.md CHANGED
@@ -1,3 +1,81 @@
1
- ---
2
- license: cc
3
- ---
1
+ # Video Moment Retrieval in Practical Setting: A Dataset of Ranked Moments for Imprecise Queries
2
+
3
+ The benchmark and dataset for the paper "Video Moment Retrieval in Practical Settings: A Dataset of Ranked Moments for Imprecise Queries" are coming soon.
4
+
5
+ We recommend cloning the code, data, and feature files from the Hugging Face repository at [TVR-Ranking](https://huggingface.co/axgroup/TVR-Ranking).
6
+
7
+ ![TVR_Ranking_overview](./figures/taskComparisonV.png)
8
+
9
+
10
+
11
+
12
+ ## Getting started
13
+ ### 1. Install the prerequisites
14
+
15
+ The Python packages we used are listed below; in most cases, the latest versions work well.
16
+
17
+
18
+ ```shell
19
+ conda create --name tvr_ranking python=3.11
20
+ conda activate tvr_ranking
21
+ pip install torch  # 2.2.1+cu121
22
+ pip install tensorboard
23
+ pip install h5py pandas tqdm easydict pyyaml
24
+ ```
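+ To confirm that the CUDA build of PyTorch was picked up, a quick check:
+ 
+ ```shell
+ python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
+ ```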
25
+
26
+ ### 2. Download the full dataset
27
+ The full dataset can be downloaded from Hugging Face [TVR-Ranking](https://huggingface.co/axgroup/TVR-Ranking). \
28
+ A detailed introduction and the raw annotations are available at [Dataset Introduction](data/TVR_Ranking/readme.md).
29
+
30
+
31
+ ```
32
+ TVR_Ranking/
33
+ -val.json
34
+ -test.json
35
+ -train_top01.json
36
+ -train_top20.json
37
+ -train_top40.json
38
+ -video_corpus.json
39
+ ```
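+ 
+ A minimal loading sketch (assuming the annotation schema consumed by `modules/dataset_tvrr.py`: each entry has a `query`, a `query_id`, and a list of `relevant_moment` dicts with fields such as `video_name`, `timestamp`, and `similarity`; the path below is illustrative):
+ 
+ ```python
+ import json
+ 
+ with open("data/TVR_Ranking/train_top20.json") as f:
+     annotations = json.load(f)
+ 
+ first = annotations[0]
+ print(first["query_id"], first["query"])
+ for moment in first["relevant_moment"]:
+     # each moment points into video_corpus.json with a time span and a query-caption similarity
+     print(moment["video_name"], moment["timestamp"], moment["similarity"])
+ ```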
40
+
41
+ ### 3. Download features
42
+
43
+ The query BERT features can be downloaded from Hugging Face [TVR-Ranking](https://huggingface.co/axgroup/TVR-Ranking). \
44
+ For the video and subtitle features, please request them at [TVR](https://tvr.cs.unc.edu/).
45
+
46
+ ```shell
47
+ tar -xf tvr_feature_release.tar.gz -C data/TVR_Ranking/feature
48
+ ```
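+ 
+ A quick sanity check that the extracted HDF5 files open correctly (the file name follows `infer_top20.sh`; the exact path depends on where the archive is extracted):
+ 
+ ```python
+ import h5py
+ 
+ # query features are keyed by query_id; video and subtitle features are keyed by video name
+ with h5py.File("data/TVR_Ranking/feature/query_bert.h5", "r") as f:
+     print("num queries:", len(f))
+ ```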
49
+
50
+ ### 4. Training
51
+ ```shell
52
+ # modify the data path first
53
+ sh run_top20.sh
54
+ ```
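+ 
+ `run_top20.sh` itself is not part of this commit; judging from `infer_top20.sh` (added below), it presumably calls a training entry point — named `train.py` here purely for illustration — with the same data and feature paths, for example:
+ 
+ ```shell
+ # hypothetical sketch; flag names follow infer_top20.sh, the entry-point name is an assumption
+ python train.py \
+     --results_path results/tvr_ranking \
+     --train_path data/TVR_Ranking/train_top20.json \
+     --val_path data/TVR_Ranking/val.json \
+     --test_path data/TVR_Ranking/test.json \
+     --corpus_path data/TVR_Ranking/video_corpus.json \
+     --desc_bert_path data/TVR_Ranking/feature/query_bert.h5 \
+     --video_feat_path data/TVR_Ranking/feature/tvr_i3d_rgb600_avg_cl-1.5.h5 \
+     --sub_bert_path data/TVR_Ranking/feature/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 \
+     --exp_id top20
+ ```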
55
+
56
+ ## Baseline
57
+ (ToDo: running the new version...) \
58
+ The baseline performance, measured by $NDCG@20$ at several IoU thresholds, is shown below.
59
+ The pseudo training set consists of the top $N$ moments ranked by query-caption similarity.
60
+ | Model | $N$ | IoU = 0.3, val | IoU = 0.3, test | IoU = 0.5, val | IoU = 0.5, test | IoU = 0.7, val | IoU = 0.7, test |
61
+ |----------------|-----|----------------|-----------------|----------------|-----------------|----------------|-----------------|
62
+ | **XML** | 1 | 0.1050 | 0.1047 | 0.0767 | 0.0751 | 0.0287 | 0.0314 |
63
+ | | 20 | 0.1948 | 0.1964 | 0.1417 | 0.1434 | 0.0519 | 0.0583 |
64
+ | | 40 | 0.2101 | 0.2110 | 0.1525 | 0.1533 | 0.0613 | 0.0617 |
65
+ | **CONQUER** | 1 | 0.0979 | 0.0830 | 0.0817 | 0.0686 | 0.0547 | 0.0479 |
66
+ | | 20 | 0.2007 | 0.1935 | 0.1844 | 0.1803 | 0.1391 | 0.1341 |
67
+ | | 40 | 0.2094 | 0.1943 | 0.1930 | 0.1825 | 0.1481 | 0.1334 |
68
+ | **ReLoCLNet** | 1 | 0.1306 | 0.1299 | 0.1169 | 0.1154 | 0.0738 | 0.0789 |
69
+ | | 20 | 0.3264 | 0.3214 | 0.3007 | 0.2956 | 0.2074 | 0.2084 |
70
+ | | 40 | 0.3479 | 0.3473 | 0.3221 | 0.3217 | 0.2218 | 0.2275 |
71
+
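+ As a rough illustration of the metric (the repository computes it in `modules/ndcg_iou.py`, which is imported by `modules/infer_lib.py` but not shown in this commit, so the sketch below is an assumption rather than that module's code): a predicted moment only earns gain when it falls in the correct video and overlaps a ground-truth moment with temporal IoU at or above the threshold, and gains are rank-discounted as in standard NDCG. Field names such as `relevance` are illustrative.
+ 
+ ```python
+ import math
+ 
+ def temporal_iou(pred, gt):
+     """IoU of two [start, end] spans in seconds."""
+     inter = max(0.0, min(pred[1], gt[1]) - max(pred[0], gt[0]))
+     union = max(pred[1], gt[1]) - min(pred[0], gt[0])
+     return inter / union if union > 0 else 0.0
+ 
+ def ndcg_at_k(pred_moments, gt_moments, k=20, iou_thd=0.5):
+     # pred_moments: [(video_name, start, end), ...] sorted by model score
+     # gt_moments:   [{"video_name": ..., "timestamp": [st, ed], "relevance": r}, ...]  (assumed schema)
+     gains = []
+     for video_name, st, ed in pred_moments[:k]:
+         hits = [g["relevance"] for g in gt_moments
+                 if g["video_name"] == video_name and temporal_iou((st, ed), g["timestamp"]) >= iou_thd]
+         gains.append(max(hits) if hits else 0.0)
+     dcg = sum(g / math.log2(i + 2) for i, g in enumerate(gains))
+     ideal = sorted((g["relevance"] for g in gt_moments), reverse=True)[:k]
+     idcg = sum(g / math.log2(i + 2) for i, g in enumerate(ideal))
+     return dcg / idcg if idcg > 0 else 0.0
+ ```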
72
+
73
+ ### 5. Inference
74
+ [ToDo] The checkpoints can be accessed from Hugging Face [TVR-Ranking](https://huggingface.co/axgroup/TVR-Ranking).
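+ 
+ Once a checkpoint is available, inference can be launched with the script added in this commit (after adjusting the feature and checkpoint paths inside it):
+ 
+ ```shell
+ sh infer_top20.sh
+ ```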
75
+
76
+
77
+ ## Citation
78
+ If you find this project helpful for your research, please cite our work.
79
+ ```
80
+
81
+ ```
figures/taskComparisonV.png ADDED
infer.py ADDED
@@ -0,0 +1,33 @@
1
+ import os, json
2
+ import torch
3
+ from tqdm import tqdm
4
+
5
+ from modules.dataset_init import prepare_dataset
6
+ from modules.infer_lib import grab_corpus_feature, eval_epoch
7
+
8
+ from utils.basic_utils import AverageMeter, get_logger
9
+ from utils.setup import set_seed, get_args
10
+ from utils.run_utils import prepare_optimizer, prepare_model, logger_ndcg_iou
11
+
12
+ def main():
13
+ opt = get_args()
14
+ logger = get_logger(opt.results_path, opt.exp_id)
15
+ set_seed(opt.seed)
16
+ logger.info("Arguments:\n%s", json.dumps(vars(opt), indent=4))
17
+ opt.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
18
+ logger.info(f"device: {opt.device}")
19
+
20
+ train_loader, corpus_loader, corpus_video_list, val_loader, test_loader, val_gt, test_gt = prepare_dataset(opt)
21
+
22
+ model = prepare_model(opt, logger)
23
+ # optimizer = prepare_optimizer(model, opt, len(train_loader) * opt.n_epoch)
24
+
25
+ corpus_feature = grab_corpus_feature(model, corpus_loader, opt.device)
26
+ val_ndcg_iou = eval_epoch(model, corpus_feature, val_loader, val_gt, opt, corpus_video_list)
27
+ test_ndcg_iou = eval_epoch(model, corpus_feature, test_loader, test_gt, opt, corpus_video_list)
28
+
29
+ logger_ndcg_iou(val_ndcg_iou, logger, "VAL")
30
+ logger_ndcg_iou(test_ndcg_iou, logger, "TEST")
31
+
32
+ if __name__ == '__main__':
33
+ main()
infer_top20.sh ADDED
@@ -0,0 +1,17 @@
1
+ python infer.py \
2
+ --results_path results/tvr_ranking \
3
+ --checkpoint results/tvr_ranking/best_model.pt \
4
+ --train_path data/TVR_Ranking/train_top20.json \
5
+ --val_path data/TVR_Ranking/val.json \
6
+ --test_path data/TVR_Ranking/test.json \
7
+ --corpus_path data/TVR_Ranking/video_corpus.json \
8
+ --desc_bert_path /home/renjie.liang/datasets/TVR_Ranking/features/query_bert.h5 \
9
+ --video_feat_path /home/share/czzhang/Dataset/TVR/TVR_feature/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 \
10
+ --sub_bert_path /home/share/czzhang/Dataset/TVR/TVR_feature/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 \
11
+ --exp_id infer
12
+
13
+ # qsub -I -l select=1:ngpus=1 -P gs_slab -q slab_gpu8
14
+ # cd /home/renjie.liang/11_TVR-Ranking/ReLoCLNet; conda activate py11; sh infer_top20.sh
15
+ # --hard_negative_start_epoch 0 \
16
+ # --no_norm_vfeat \
17
+ # --use_hard_negative
modules/ReLoCLNet.py ADDED
@@ -0,0 +1,362 @@
1
+ import copy
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from easydict import EasyDict as edict
6
+ from modules.model_components import BertAttention, LinearLayer, BertSelfAttention, TrainablePositionalEncoding
7
+ from modules.model_components import MILNCELoss
8
+ from modules.contrastive import batch_video_query_loss
9
+
10
+
11
+ class ReLoCLNet(nn.Module):
12
+ def __init__(self, config):
13
+ super(ReLoCLNet, self).__init__()
14
+ self.config = config
15
+
16
+
17
+ self.query_pos_embed = TrainablePositionalEncoding(max_position_embeddings=config.max_desc_l,
18
+ hidden_size=config.hidden_size, dropout=config.input_drop)
19
+ self.ctx_pos_embed = TrainablePositionalEncoding(max_position_embeddings=config.max_ctx_l,
20
+ hidden_size=config.hidden_size, dropout=config.input_drop)
21
+
22
+ self.query_input_proj = LinearLayer(config.query_input_size, config.hidden_size, layer_norm=True,
23
+ dropout=config.input_drop, relu=True)
24
+
25
+ self.query_encoder = BertAttention(edict(hidden_size=config.hidden_size, intermediate_size=config.hidden_size,
26
+ hidden_dropout_prob=config.drop, num_attention_heads=config.n_heads,
27
+ attention_probs_dropout_prob=config.drop))
28
+ self.query_encoder1 = copy.deepcopy(self.query_encoder)
29
+
30
+ cross_att_cfg = edict(hidden_size=config.hidden_size, num_attention_heads=config.n_heads,
31
+ attention_probs_dropout_prob=config.drop)
32
+ # use_video
33
+ self.video_input_proj = LinearLayer(config.visual_input_size, config.hidden_size, layer_norm=True,
34
+ dropout=config.input_drop, relu=True)
35
+ self.video_encoder1 = copy.deepcopy(self.query_encoder)
36
+ self.video_encoder2 = copy.deepcopy(self.query_encoder)
37
+ self.video_encoder3 = copy.deepcopy(self.query_encoder)
38
+ self.video_cross_att = BertSelfAttention(cross_att_cfg)
39
+ self.video_cross_layernorm = nn.LayerNorm(config.hidden_size)
40
+ self.video_query_linear = nn.Linear(config.hidden_size, config.hidden_size)
41
+
42
+ # use_sub
43
+ self.sub_input_proj = LinearLayer(config.sub_input_size, config.hidden_size, layer_norm=True,
44
+ dropout=config.input_drop, relu=True)
45
+ self.sub_encoder1 = copy.deepcopy(self.query_encoder)
46
+ self.sub_encoder2 = copy.deepcopy(self.query_encoder)
47
+ self.sub_encoder3 = copy.deepcopy(self.query_encoder)
48
+ self.sub_cross_att = BertSelfAttention(cross_att_cfg)
49
+ self.sub_cross_layernorm = nn.LayerNorm(config.hidden_size)
50
+ self.sub_query_linear = nn.Linear(config.hidden_size, config.hidden_size)
51
+
52
+ self.modular_vector_mapping = nn.Linear(in_features=config.hidden_size, out_features=2, bias=False)
53
+
54
+ conv_cfg = dict(in_channels=1, out_channels=1, kernel_size=config.conv_kernel_size,
55
+ stride=config.conv_stride, padding=config.conv_kernel_size // 2, bias=False)
56
+ self.merged_st_predictor = nn.Conv1d(**conv_cfg)
57
+ self.merged_ed_predictor = nn.Conv1d(**conv_cfg)
58
+
59
+ # self.temporal_criterion = nn.CrossEntropyLoss(reduction="mean")
60
+ self.temporal_criterion = nn.CrossEntropyLoss(reduction="none")
61
+ self.nce_criterion = MILNCELoss(reduction=False)
62
+ # self.nce_criterion = MILNCELoss(reduction='mean')
63
+
64
+ self.reset_parameters()
65
+
66
+ def reset_parameters(self):
67
+ """ Initialize the weights."""
68
+ def re_init(module):
69
+ if isinstance(module, (nn.Linear, nn.Embedding)):
70
+ # Slightly different from the TF version which uses truncated_normal for initialization
71
+ # cf https://github.com/pytorch/pytorch/pull/5617
72
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
73
+ elif isinstance(module, nn.LayerNorm):
74
+ module.bias.data.zero_()
75
+ module.weight.data.fill_(1.0)
76
+ elif isinstance(module, nn.Conv1d):
77
+ module.reset_parameters()
78
+ if isinstance(module, nn.Linear) and module.bias is not None:
79
+ module.bias.data.zero_()
80
+
81
+ self.apply(re_init)
82
+
83
+ def set_hard_negative(self, use_hard_negative, hard_pool_size):
84
+ """use_hard_negative: bool; hard_pool_size: int, """
85
+ self.config.use_hard_negative = use_hard_negative
86
+ self.config.hard_pool_size = hard_pool_size
87
+
88
+
89
+ def forward(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, st_ed_indices, match_labels, simi):
90
+ """
91
+ Args:
92
+ query_feat: (N, Lq, Dq)
93
+ query_mask: (N, Lq)
94
+ video_feat: (N, Lv, Dv) or None
95
+ video_mask: (N, Lv) or None
96
+ sub_feat: (N, Lv, Ds) or None
97
+ sub_mask: (N, Lv) or None
98
+ st_ed_indices: (N, 2), torch.LongTensor, 1st, 2nd columns are st, ed labels respectively.
99
+ match_labels: (N, Lv), torch.LongTensor, matching labels for detecting foreground and background (not used)
100
+ """
101
+ video_feat, sub_feat, mid_x_video_feat, mid_x_sub_feat, x_video_feat, x_sub_feat = self.encode_context(
102
+ video_feat, video_mask, sub_feat, sub_mask, return_mid_output=True)
103
+ video_query, sub_query, query_context_scores, st_prob, ed_prob = self.get_pred_from_raw_query(
104
+ query_feat, query_mask, x_video_feat, video_mask, x_sub_feat, sub_mask, cross=False,
105
+ return_query_feats=True)
106
+ # frame level contrastive learning loss (FrameCL)
107
+ loss_fcl = 0
108
+ if self.config.lw_fcl != 0:
109
+ loss_fcl_vq = batch_video_query_loss(mid_x_video_feat, video_query, match_labels, video_mask, measure='JSD')
110
+ loss_fcl_sq = batch_video_query_loss(mid_x_sub_feat, sub_query, match_labels, sub_mask, measure='JSD')
111
+ loss_fcl = (loss_fcl_vq + loss_fcl_sq) / 2.0
112
+ loss_fcl = self.config.lw_fcl * loss_fcl
113
+ # video level contrastive learning loss (VideoCL)
114
+ loss_vcl = 0
115
+ if self.config.lw_vcl != 0:
116
+ mid_video_q2ctx_scores = self.get_unnormalized_video_level_scores(video_query, mid_x_video_feat, video_mask)
117
+ mid_sub_q2ctx_scores = self.get_unnormalized_video_level_scores(sub_query, mid_x_sub_feat, sub_mask)
118
+ mid_video_q2ctx_scores, _ = torch.max(mid_video_q2ctx_scores, dim=1)
119
+ mid_sub_q2ctx_scores, _ = torch.max(mid_sub_q2ctx_scores, dim=1)
120
+ # exclude the contrastive loss for the same query
121
+ mid_q2ctx_scores = (mid_video_q2ctx_scores + mid_sub_q2ctx_scores) / 2.0 # * video_contrastive_mask
122
+ loss_vcl = self.nce_criterion(mid_q2ctx_scores)
123
+ loss_vcl = self.config.lw_vcl * loss_vcl
124
+ # moment localization loss
125
+ loss_st_ed = 0
126
+ if self.config.lw_st_ed != 0:
127
+ loss_st = self.temporal_criterion(st_prob, st_ed_indices[:, 0])
128
+ loss_ed = self.temporal_criterion(ed_prob, st_ed_indices[:, 1])
129
+ loss_st_ed = loss_st + loss_ed
130
+ loss_st_ed = self.config.lw_st_ed * loss_st_ed
131
+ # video level retrieval loss
132
+ loss_neg_ctx, loss_neg_q = 0, 0
133
+ if self.config.lw_neg_ctx != 0 or self.config.lw_neg_q != 0:
134
+ loss_neg_ctx, loss_neg_q = self.get_video_level_loss(query_context_scores)
135
+ loss_neg_ctx = self.config.lw_neg_ctx * loss_neg_ctx
136
+ loss_neg_q = self.config.lw_neg_q * loss_neg_q
137
+ # sum loss
138
+ # loss = loss_fcl + loss_vcl + loss_st_ed + loss_neg_ctx + loss_neg_q
139
+ # simi = torch.exp(10*(simi-0.7))
140
+ # weight the per-sample losses by the annotated query-moment similarity (simi)
141
+ loss = ((loss_fcl + loss_vcl + loss_st_ed) * simi).mean() + loss_neg_ctx + loss_neg_q
142
+ return loss
143
+
144
+ def encode_query(self, query_feat, query_mask):
145
+ encoded_query = self.encode_input(query_feat, query_mask, self.query_input_proj, self.query_encoder,
146
+ self.query_pos_embed) # (N, Lq, D)
147
+ encoded_query = self.query_encoder1(encoded_query, query_mask.unsqueeze(1))
148
+ video_query, sub_query = self.get_modularized_queries(encoded_query, query_mask) # (N, D) * 2
149
+ return video_query, sub_query
150
+
151
+ def encode_context(self, video_feat, video_mask, sub_feat, sub_mask, return_mid_output=False):
152
+ # encoding video and subtitle features, respectively
153
+ encoded_video_feat = self.encode_input(video_feat, video_mask, self.video_input_proj, self.video_encoder1,
154
+ self.ctx_pos_embed)
155
+ encoded_sub_feat = self.encode_input(sub_feat, sub_mask, self.sub_input_proj, self.sub_encoder1,
156
+ self.ctx_pos_embed)
157
+ # cross encoding subtitle features
158
+ x_encoded_video_feat = self.cross_context_encoder(encoded_video_feat, video_mask, encoded_sub_feat, sub_mask,
159
+ self.video_cross_att, self.video_cross_layernorm) # (N, L, D)
160
+ x_encoded_video_feat_ = self.video_encoder2(x_encoded_video_feat, video_mask.unsqueeze(1))
161
+ # cross encoding video features
162
+ x_encoded_sub_feat = self.cross_context_encoder(encoded_sub_feat, sub_mask, encoded_video_feat, video_mask,
163
+ self.sub_cross_att, self.sub_cross_layernorm) # (N, L, D)
164
+ x_encoded_sub_feat_ = self.sub_encoder2(x_encoded_sub_feat, sub_mask.unsqueeze(1))
165
+ # additional self encoding process
166
+ x_encoded_video_feat = self.video_encoder3(x_encoded_video_feat_, video_mask.unsqueeze(1))
167
+ x_encoded_sub_feat = self.sub_encoder3(x_encoded_sub_feat_, sub_mask.unsqueeze(1))
168
+ if return_mid_output:
169
+ return (encoded_video_feat, encoded_sub_feat, x_encoded_video_feat_, x_encoded_sub_feat_,
170
+ x_encoded_video_feat, x_encoded_sub_feat)
171
+ else:
172
+ return x_encoded_video_feat, x_encoded_sub_feat
173
+
174
+ @staticmethod
175
+ def cross_context_encoder(main_context_feat, main_context_mask, side_context_feat, side_context_mask,
176
+ cross_att_layer, norm_layer):
177
+ """
178
+ Args:
179
+ main_context_feat: (N, Lq, D)
180
+ main_context_mask: (N, Lq)
181
+ side_context_feat: (N, Lk, D)
182
+ side_context_mask: (N, Lk)
183
+ cross_att_layer: cross attention layer
184
+ norm_layer: layer norm layer
185
+ """
186
+ cross_mask = torch.einsum("bm,bn->bmn", main_context_mask, side_context_mask) # (N, Lq, Lk)
187
+ cross_out = cross_att_layer(main_context_feat, side_context_feat, side_context_feat, cross_mask) # (N, Lq, D)
188
+ residual_out = norm_layer(cross_out + main_context_feat)
189
+ return residual_out
190
+
191
+ @staticmethod
192
+ def encode_input(feat, mask, input_proj_layer, encoder_layer, pos_embed_layer):
193
+ """
194
+ Args:
195
+ feat: (N, L, D_input), torch.float32
196
+ mask: (N, L), torch.float32, with 1 indicates valid query, 0 indicates mask
197
+ input_proj_layer: down project input
198
+ encoder_layer: encoder layer
199
+ pos_embed_layer: positional embedding layer
200
+ """
201
+ feat = input_proj_layer(feat)
202
+ feat = pos_embed_layer(feat)
203
+ mask = mask.unsqueeze(1) # (N, 1, L), torch.FloatTensor
204
+ return encoder_layer(feat, mask) # (N, L, D_hidden)
205
+
206
+ def get_modularized_queries(self, encoded_query, query_mask, return_modular_att=False):
207
+ """
208
+ Args:
209
+ encoded_query: (N, L, D)
210
+ query_mask: (N, L)
211
+ return_modular_att: bool
212
+ """
213
+ modular_attention_scores = self.modular_vector_mapping(encoded_query) # (N, L, 2 or 1)
214
+ modular_attention_scores = F.softmax(mask_logits(modular_attention_scores, query_mask.unsqueeze(2)), dim=1)
215
+ modular_queries = torch.einsum("blm,bld->bmd", modular_attention_scores, encoded_query) # (N, 2 or 1, D)
216
+ if return_modular_att:
217
+ assert modular_queries.shape[1] == 2
218
+ return modular_queries[:, 0], modular_queries[:, 1], modular_attention_scores
219
+ else:
220
+ assert modular_queries.shape[1] == 2
221
+ return modular_queries[:, 0], modular_queries[:, 1] # (N, D) * 2
222
+
223
+ @staticmethod
224
+ def get_video_level_scores(modularied_query, context_feat, context_mask):
225
+ """ Calculate video2query scores for each pair of video and query inside the batch.
226
+ Args:
227
+ modularied_query: (N, D)
228
+ context_feat: (N, L, D), output of the first transformer encoder layer
229
+ context_mask: (N, L)
230
+ Returns:
231
+ context_query_scores: (N, N) score of each query w.r.t. each video inside the batch,
232
+ diagonal positions are positive. used to get negative samples.
233
+ """
234
+ modularied_query = F.normalize(modularied_query, dim=-1)
235
+ context_feat = F.normalize(context_feat, dim=-1)
236
+ query_context_scores = torch.einsum("md,nld->mln", modularied_query, context_feat) # (N, L, N)
237
+ context_mask = context_mask.transpose(0, 1).unsqueeze(0) # (1, L, N)
238
+ query_context_scores = mask_logits(query_context_scores, context_mask) # (N, L, N)
239
+ query_context_scores, _ = torch.max(query_context_scores, dim=1) # (N, N) diagonal positions are positive pairs
240
+ return query_context_scores
241
+
242
+ @staticmethod
243
+ def get_unnormalized_video_level_scores(modularied_query, context_feat, context_mask):
244
+ """ Calculate video2query scores for each pair of video and query inside the batch.
245
+ Args:
246
+ modularied_query: (N, D)
247
+ context_feat: (N, L, D), output of the first transformer encoder layer
248
+ context_mask: (N, L)
249
+ Returns:
250
+ context_query_scores: (N, N) score of each query w.r.t. each video inside the batch,
251
+ diagonal positions are positive. used to get negative samples.
252
+ """
253
+ query_context_scores = torch.einsum("md,nld->mln", modularied_query, context_feat) # (N, L, N)
254
+ context_mask = context_mask.transpose(0, 1).unsqueeze(0) # (1, L, N)
255
+ query_context_scores = mask_logits(query_context_scores, context_mask) # (N, L, N)
256
+ return query_context_scores
257
+
258
+ def get_merged_score(self, video_query, video_feat, sub_query, sub_feat, cross=False):
259
+ video_query = self.video_query_linear(video_query)
260
+ sub_query = self.sub_query_linear(sub_query)
261
+ if cross:
262
+ video_similarity = torch.einsum("md,nld->mnl", video_query, video_feat)
263
+ sub_similarity = torch.einsum("md,nld->mnl", sub_query, sub_feat)
264
+ similarity = (video_similarity + sub_similarity) / 2 # (Nq, Nv, L) from query to all videos.
265
+ else:
266
+ video_similarity = torch.einsum("bd,bld->bl", video_query, video_feat) # (N, L)
267
+ sub_similarity = torch.einsum("bd,bld->bl", sub_query, sub_feat) # (N, L)
268
+ similarity = (video_similarity + sub_similarity) / 2
269
+ return similarity
270
+
271
+ def get_merged_st_ed_prob(self, similarity, context_mask, cross=False):
272
+ if cross:
273
+ n_q, n_c, length = similarity.shape
274
+ similarity = similarity.view(n_q * n_c, 1, length)
275
+ st_prob = self.merged_st_predictor(similarity).view(n_q, n_c, length) # (Nq, Nv, L)
276
+ ed_prob = self.merged_ed_predictor(similarity).view(n_q, n_c, length) # (Nq, Nv, L)
277
+ else:
278
+ st_prob = self.merged_st_predictor(similarity.unsqueeze(1)).squeeze() # (N, L)
279
+ ed_prob = self.merged_ed_predictor(similarity.unsqueeze(1)).squeeze() # (N, L)
280
+ st_prob = mask_logits(st_prob, context_mask) # (N, L)
281
+ ed_prob = mask_logits(ed_prob, context_mask)
282
+ return st_prob, ed_prob
283
+
284
+ def get_pred_from_raw_query(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, cross=False,
285
+ return_query_feats=False):
286
+ """
287
+ Args:
288
+ query_feat: (N, Lq, Dq)
289
+ query_mask: (N, Lq)
290
+ video_feat: (N, Lv, D) or None
291
+ video_mask: (N, Lv)
292
+ sub_feat: (N, Lv, D) or None
293
+ sub_mask: (N, Lv)
294
+ cross:
295
+ return_query_feats:
296
+ """
297
+ video_query, sub_query = self.encode_query(query_feat, query_mask)
298
+ # get video-level retrieval scores
299
+ video_q2ctx_scores = self.get_video_level_scores(video_query, video_feat, video_mask)
300
+ sub_q2ctx_scores = self.get_video_level_scores(sub_query, sub_feat, sub_mask)
301
+ q2ctx_scores = (video_q2ctx_scores + sub_q2ctx_scores) / 2 # (N, N)
302
+ # compute start and end probs
303
+ similarity = self.get_merged_score(video_query, video_feat, sub_query, sub_feat, cross=cross)
304
+ st_prob, ed_prob = self.get_merged_st_ed_prob(similarity, video_mask, cross=cross)
305
+ if return_query_feats:
306
+ return video_query, sub_query, q2ctx_scores, st_prob, ed_prob
307
+ else:
308
+ return q2ctx_scores, st_prob, ed_prob # un-normalized masked probabilities!!!!!
309
+
310
+ def get_video_level_loss(self, query_context_scores):
311
+ """ ranking loss between (pos. query + pos. video) and (pos. query + neg. video) or (neg. query + pos. video)
312
+ Args:
313
+ query_context_scores: (N, N), cosine similarity [-1, 1],
314
+ Each row contains the scores between the query to each of the videos inside the batch.
315
+ """
316
+ bsz = len(query_context_scores)
317
+ diagonal_indices = torch.arange(bsz).to(query_context_scores.device)
318
+ pos_scores = query_context_scores[diagonal_indices, diagonal_indices] # (N, )
319
+ query_context_scores_masked = copy.deepcopy(query_context_scores.data)
320
+ # impossibly large for cosine similarity, the copy is created as modifying the original will cause error
321
+ query_context_scores_masked[diagonal_indices, diagonal_indices] = 999
322
+ pos_query_neg_context_scores = self.get_neg_scores(query_context_scores, query_context_scores_masked)
323
+ neg_query_pos_context_scores = self.get_neg_scores(query_context_scores.transpose(0, 1),
324
+ query_context_scores_masked.transpose(0, 1))
325
+ loss_neg_ctx = self.get_ranking_loss(pos_scores, pos_query_neg_context_scores)
326
+ loss_neg_q = self.get_ranking_loss(pos_scores, neg_query_pos_context_scores)
327
+ return loss_neg_ctx, loss_neg_q
328
+
329
+ def get_neg_scores(self, scores, scores_masked):
330
+ """
331
+ scores: (N, N), cosine similarity [-1, 1],
332
+ Each row are scores: query --> all videos. Transposed version: video --> all queries.
333
+ scores_masked: (N, N) the same as scores, except that the diagonal (positive) positions
334
+ are masked with a large value.
335
+ """
336
+ bsz = len(scores)
337
+ batch_indices = torch.arange(bsz).to(scores.device)
338
+ _, sorted_scores_indices = torch.sort(scores_masked, descending=True, dim=1)
339
+ sample_min_idx = 1 # skip the masked positive
340
+ sample_max_idx = min(sample_min_idx + self.config.hard_pool_size, bsz) if self.config.use_hard_negative else bsz
341
+ # (N, )
342
+ sampled_neg_score_indices = sorted_scores_indices[batch_indices, torch.randint(sample_min_idx, sample_max_idx,
343
+ size=(bsz,)).to(scores.device)]
344
+ sampled_neg_scores = scores[batch_indices, sampled_neg_score_indices] # (N, )
345
+ return sampled_neg_scores
346
+
347
+ def get_ranking_loss(self, pos_score, neg_score):
348
+ """ Note here we encourage positive scores to be larger than negative scores.
349
+ Args:
350
+ pos_score: (N, ), torch.float32
351
+ neg_score: (N, ), torch.float32
352
+ """
353
+ if self.config.ranking_loss_type == "hinge": # max(0, m + S_neg - S_pos)
354
+ return torch.clamp(self.config.margin + neg_score - pos_score, min=0).sum() / len(pos_score)
355
+ elif self.config.ranking_loss_type == "lse": # log[1 + exp(S_neg - S_pos)]
356
+ return torch.log1p(torch.exp(neg_score - pos_score)).sum() / len(pos_score)
357
+ else:
358
+ raise NotImplementedError("Only support 'hinge' and 'lse'")
359
+
360
+
361
+ def mask_logits(target, mask):
362
+ return target * mask + (1 - mask) * (-1e10)
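A minimal usage sketch for the model above (not part of the repository, and only a sketch: it assumes the repo's `modules` package is importable and uses illustrative feature sizes and loss weights; the contrastive terms are disabled with `lw_fcl = lw_vcl = 0` to keep it short):

```python
import torch
from easydict import EasyDict as edict
from modules.ReLoCLNet import ReLoCLNet

config = edict(
    query_input_size=768, sub_input_size=768, visual_input_size=1024,   # illustrative dims
    hidden_size=384, n_heads=8, input_drop=0.1, drop=0.1,
    max_desc_l=30, max_ctx_l=128, conv_kernel_size=5, conv_stride=1,
    initializer_range=0.02, ranking_loss_type="hinge", margin=0.1,
    lw_st_ed=1.0, lw_fcl=0.0, lw_vcl=0.0, lw_neg_ctx=1.0, lw_neg_q=1.0,
    use_hard_negative=False, hard_pool_size=20,
)
model = ReLoCLNet(config)

N, Lq, Lv = 4, 30, 128
loss = model(
    query_feat=torch.randn(N, Lq, config.query_input_size), query_mask=torch.ones(N, Lq),
    video_feat=torch.randn(N, Lv, config.visual_input_size), video_mask=torch.ones(N, Lv),
    sub_feat=torch.randn(N, Lv, config.sub_input_size), sub_mask=torch.ones(N, Lv),
    st_ed_indices=torch.tensor([[2, 10]] * N),          # start/end clip indices
    match_labels=torch.zeros(N, Lv, dtype=torch.long),  # only used by the contrastive terms
    simi=torch.ones(N),                                 # per-sample similarity weights
)
print(float(loss))
```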
modules/contrastive.py ADDED
@@ -0,0 +1,167 @@
1
+ import torch
2
+ import math
3
+ import torch.nn.functional as F
4
+
5
+
6
+ def log_sum_exp(x, axis=None):
7
+ """
8
+ Log sum exp function
9
+ Args:
10
+ x: Input.
11
+ axis: Axis over which to perform sum.
12
+ Returns:
13
+ torch.Tensor: log sum exp
14
+ """
15
+ x_max = torch.max(x, axis)[0]
16
+ y = torch.log((torch.exp(x - x_max)).sum(axis)) + x_max
17
+ return y
18
+
19
+
20
+ def get_positive_expectation(p_samples, measure='JSD', average=True):
21
+ """
22
+ Computes the positive part of a divergence / difference.
23
+ Args:
24
+ p_samples: Positive samples.
25
+ measure: Measure to compute for.
26
+ average: Average the result over samples.
27
+ Returns:
28
+ torch.Tensor
29
+ """
30
+ log_2 = math.log(2.)
31
+ if measure == 'GAN':
32
+ Ep = - F.softplus(-p_samples)
33
+ elif measure == 'JSD':
34
+ Ep = log_2 - F.softplus(-p_samples)
35
+ elif measure == 'X2':
36
+ Ep = p_samples ** 2
37
+ elif measure == 'KL':
38
+ Ep = p_samples + 1.
39
+ elif measure == 'RKL':
40
+ Ep = -torch.exp(-p_samples)
41
+ elif measure == 'DV':
42
+ Ep = p_samples
43
+ elif measure == 'H2':
44
+ Ep = torch.ones_like(p_samples) - torch.exp(-p_samples)
45
+ elif measure == 'W1':
46
+ Ep = p_samples
47
+ else:
48
+ raise ValueError('Unknown measurement {}'.format(measure))
49
+ if average:
50
+ return Ep.mean()
51
+ else:
52
+ return Ep
53
+
54
+
55
+ def get_negative_expectation(q_samples, measure='JSD', average=True):
56
+ """
57
+ Computes the negative part of a divergence / difference.
58
+ Args:
59
+ q_samples: Negative samples.
60
+ measure: Measure to compute for.
61
+ average: Average the result over samples.
62
+ Returns:
63
+ torch.Tensor
64
+ """
65
+ log_2 = math.log(2.)
66
+ if measure == 'GAN':
67
+ Eq = F.softplus(-q_samples) + q_samples
68
+ elif measure == 'JSD':
69
+ Eq = F.softplus(-q_samples) + q_samples - log_2
70
+ elif measure == 'X2':
71
+ Eq = -0.5 * ((torch.sqrt(q_samples ** 2) + 1.) ** 2)
72
+ elif measure == 'KL':
73
+ Eq = torch.exp(q_samples)
74
+ elif measure == 'RKL':
75
+ Eq = q_samples - 1.
76
+ elif measure == 'DV':
77
+ Eq = log_sum_exp(q_samples, 0) - math.log(q_samples.size(0))
78
+ elif measure == 'H2':
79
+ Eq = torch.exp(q_samples) - 1.
80
+ elif measure == 'W1':
81
+ Eq = q_samples
82
+ else:
83
+ raise ValueError('Unknown measurement {}'.format(measure))
84
+ if average:
85
+ return Eq.mean()
86
+ else:
87
+ return Eq
88
+
89
+
90
+ def batch_video_query_loss(video, query, match_labels, mask, measure='JSD'):
91
+ """
92
+ QV-CL module
93
+ Computing the Contrastive Loss between the video and query.
94
+ :param video: video rep (bsz, Lv, dim)
95
+ :param query: query rep (bsz, dim)
96
+ :param match_labels: match labels (bsz, Lv)
97
+ :param mask: mask (bsz, Lv)
98
+ :param measure: estimator of the mutual information
99
+ :return: L_{qv}
100
+ """
101
+ # generate mask
102
+ pos_mask = match_labels.type(torch.float32) # (bsz, Lv)
103
+ neg_mask = (torch.ones_like(pos_mask) - pos_mask) * mask # (bsz, Lv)
104
+
105
+ # compute scores
106
+ query = query.unsqueeze(2) # (bsz, dim, 1)
107
+ res = torch.matmul(video, query).squeeze(2) # (bsz, Lv)
108
+
109
+ # computing expectation for the MI between the target moment (positive samples) and query.
110
+ E_pos = get_positive_expectation(res * pos_mask, measure, average=False)
111
+ E_pos = torch.sum(E_pos * pos_mask, dim=1) / (torch.sum(pos_mask, dim=1) + 1e-12) # (bsz, )
112
+
113
+ # computing expectation for the MI between clips except target moment (negative samples) and query.
114
+ E_neg = get_negative_expectation(res * neg_mask, measure, average=False)
115
+ E_neg = torch.sum(E_neg * neg_mask, dim=1) / (torch.sum(neg_mask, dim=1) + 1e-12) # (bsz, )
116
+
117
+ E = E_neg - E_pos # (bsz, )
118
+ # return torch.mean(E)
119
+ return E
120
+
121
+
122
+ def batch_video_video_loss(video, st_ed_indices, match_labels, mask, measure='JSD'):
123
+ """
124
+ VV-CL module
125
+ Computing the Contrastive loss between the start/end clips and the video
126
+ :param video: video rep (bsz, Lv, dim)
127
+ :param st_ed_indices: (bsz, 2)
128
+ :param match_labels: match labels (bsz, Lv)
129
+ :param mask: mask (bsz, Lv)
130
+ :param measure: estimator of the mutual information
131
+ :return: L_{vv}
132
+ """
133
+ # generate mask
134
+ pos_mask = match_labels.type(torch.float32) # (bsz, Lv)
135
+ neg_mask = (torch.ones_like(pos_mask) - pos_mask) * mask # (bsz, Lv)
136
+
137
+ # select start and end indices features
138
+ st_indices, ed_indices = st_ed_indices[:, 0], st_ed_indices[:, 1] # (bsz, )
139
+ batch_indices = torch.arange(0, video.shape[0]).long() # (bsz, )
140
+ video_s = video[batch_indices, st_indices, :] # (bsz, dim)
141
+ video_e = video[batch_indices, ed_indices, :] # (bsz, dim)
142
+
143
+ # compute scores
144
+ video_s = video_s.unsqueeze(2) # (bsz, dim, 1)
145
+ res_s = torch.matmul(video, video_s).squeeze(2) # (bsz, Lv), fusion between the start clips and the video
146
+ video_e = video_e.unsqueeze(2) # (bsz, dim, 1)
147
+ res_e = torch.matmul(video, video_e).squeeze(2) # (bsz, Lv), fusion between the end clips and the video
148
+
149
+ # start clips: MI expectation for all positive samples
150
+ E_s_pos = get_positive_expectation(res_s * pos_mask, measure, average=False)
151
+ E_s_pos = torch.sum(E_s_pos * pos_mask, dim=1) / (torch.sum(pos_mask, dim=1) + 1e-12) # (bsz, )
152
+ # end clips: MI expectation for all positive samples
153
+ E_e_pos = get_positive_expectation(res_e * pos_mask, measure, average=False)
154
+ E_e_pos = torch.sum(E_e_pos * pos_mask, dim=1) / (torch.sum(pos_mask, dim=1) + 1e-12)
155
+ E_pos = E_s_pos + E_e_pos
156
+
157
+ # start clips: MI expectation for all negative samples
158
+ E_s_neg = get_negative_expectation(res_s * neg_mask, measure, average=False)
159
+ E_s_neg = torch.sum(E_s_neg * neg_mask, dim=1) / (torch.sum(neg_mask, dim=1) + 1e-12)
160
+
161
+ # end clips: MI expectation for all negative samples
162
+ E_e_neg = get_negative_expectation(res_e * neg_mask, measure, average=False)
163
+ E_e_neg = torch.sum(E_e_neg * neg_mask, dim=1) / (torch.sum(neg_mask, dim=1) + 1e-12)
164
+ E_neg = E_s_neg + E_e_neg
165
+
166
+ E = E_neg - E_pos # (bsz, )
167
+ return torch.mean(E)
modules/dataset_init.py ADDED
@@ -0,0 +1,82 @@
1
+ from modules.dataset_tvrr import TrainDataset, QueryEvalDataset, CorpusEvalDataset
2
+ import torch
3
+ from torch.utils.data import DataLoader
4
+ from utils.tensor_utils import pad_sequences_1d
5
+ import numpy as np
6
+
7
+ def collate_fn(batch, task):
8
+ fixed_length = 128
9
+ batch_data = dict()
10
+
11
+ if task == "train":
12
+ simis = [e["simi"] for e in batch]
13
+ batch_data["simi"] = torch.tensor(simis)
14
+
15
+
16
+
17
+ query_feat_mask = pad_sequences_1d([e["query_feat"] for e in batch], dtype=torch.float32, fixed_length=None)
18
+ batch_data["query_feat"] = query_feat_mask[0]
19
+ batch_data["query_mask"] = query_feat_mask[1]
20
+ video_feat_mask = pad_sequences_1d([e["video_feat"] for e in batch], dtype=torch.float32, fixed_length=fixed_length)
21
+ batch_data["video_feat"] = video_feat_mask[0]
22
+ batch_data["video_mask"] = video_feat_mask[1]
23
+ sub_feat_mask = pad_sequences_1d([e["sub_feat"] for e in batch], dtype=torch.float32, fixed_length=fixed_length)
24
+ batch_data["sub_feat"] = sub_feat_mask[0]
25
+ batch_data["sub_mask"] = sub_feat_mask[1]
26
+
27
+ st_ed_indices = [e["st_ed_indices"] for e in batch]
28
+ batch_data["st_ed_indices"] = torch.stack(st_ed_indices, dim=0)
29
+ match_labels = np.zeros(shape=(len(st_ed_indices), fixed_length), dtype=np.int32)
30
+ for idx, st_ed_index in enumerate(st_ed_indices):
31
+ st_ed = st_ed_index.cpu().numpy()
32
+ st, ed = st_ed[0], st_ed[1]
33
+ match_labels[idx][st:(ed + 1)] = 1
34
+ batch_data['match_labels'] = torch.tensor(match_labels, dtype=torch.long)
35
+
36
+ if task == "corpus":
37
+ video_feat_mask = pad_sequences_1d([e["video_feat"] for e in batch], dtype=torch.float32, fixed_length=fixed_length)
38
+ batch_data["video_feat"] = video_feat_mask[0]
39
+ batch_data["video_mask"] = video_feat_mask[1]
40
+ sub_feat_mask = pad_sequences_1d([e["sub_feat"] for e in batch], dtype=torch.float32, fixed_length=fixed_length)
41
+ batch_data["sub_feat"] = sub_feat_mask[0]
42
+ batch_data["sub_mask"] = sub_feat_mask[1]
43
+
44
+ if task == "eval":
45
+ query_feat_mask = pad_sequences_1d([e["query_feat"] for e in batch], dtype=torch.float32, fixed_length=None)
46
+ batch_data["query_feat"] = query_feat_mask[0]
47
+ batch_data["query_mask"] = query_feat_mask[1]
48
+
49
+ query_id = [e["query_id"] for e in batch]
50
+ batch_data["query_id"] = torch.tensor(query_id)
51
+
52
+ return batch_data
53
+
54
+
55
+
56
+
57
+ def prepare_dataset(opt):
58
+ train_set = TrainDataset(
59
+ data_path=opt.train_path,
60
+ desc_bert_path=opt.desc_bert_path,
61
+ sub_bert_path=opt.sub_bert_path,
62
+ max_desc_len=opt.max_desc_l,
63
+ max_ctx_len=opt.max_ctx_l,
64
+ video_feat_path=opt.video_feat_path,
65
+ clip_length=opt.clip_length,
66
+ ctx_mode=opt.ctx_mode,
67
+ normalize_vfeat=not opt.no_norm_vfeat,
68
+ normalize_tfeat=not opt.no_norm_tfeat)
69
+ train_loader = DataLoader(train_set, collate_fn=lambda batch: collate_fn(batch, task='train'), batch_size=opt.bsz, num_workers=opt.num_workers, shuffle=True, pin_memory=True)
70
+
71
+ corpus_set = CorpusEvalDataset(corpus_path=opt.corpus_path, max_ctx_len=opt.max_ctx_l, sub_bert_path=opt.sub_bert_path, video_feat_path=opt.video_feat_path, ctx_mode=opt.ctx_mode)
72
+ corpus_loader = DataLoader(corpus_set, collate_fn=lambda batch: collate_fn(batch, task='corpus'), batch_size=opt.bsz, num_workers=opt.num_workers, shuffle=False, pin_memory=True)
73
+
74
+ val_set = QueryEvalDataset(data_path=opt.val_path, desc_bert_path=opt.desc_bert_path, max_desc_len=opt.max_desc_l)
75
+ val_loader = DataLoader(val_set, collate_fn=lambda batch: collate_fn(batch, task='eval'), batch_size=opt.bsz_eval, num_workers=opt.num_workers, shuffle=False, pin_memory=True)
76
+ test_set = QueryEvalDataset(data_path=opt.test_path, desc_bert_path=opt.desc_bert_path, max_desc_len=opt.max_desc_l)
77
+ test_loader = DataLoader(test_set, collate_fn=lambda batch: collate_fn(batch, task='eval'), batch_size=opt.bsz_eval, num_workers=opt.num_workers, shuffle=False, pin_memory=True)
78
+
79
+ val_gt = val_set.ground_truth
80
+ test_gt = test_set.ground_truth
81
+ corpus_video_list = corpus_set.corpus_video_list
82
+ return train_loader, corpus_loader, corpus_video_list, val_loader, test_loader, val_gt, test_gt
modules/dataset_tvrr.py ADDED
@@ -0,0 +1,208 @@
1
+ import h5py
2
+ import math
3
+ import numpy as np
4
+ import torch
5
+ from torch.utils.data import Dataset
6
+ from utils.basic_utils import load_json, l2_normalize_np_array, uniform_feature_sampling
7
+ from utils.tensor_utils import pad_sequences_1d
8
+
9
+
10
+
11
+ class TrainDataset(Dataset):
12
+
13
+ def __init__(self, data_path, desc_bert_path, sub_bert_path, max_desc_len,
14
+ max_ctx_len, video_feat_path, clip_length, ctx_mode, normalize_vfeat=True,
15
+ normalize_tfeat=True):
16
+
17
+ self.annotations = self.expand_annotations(load_json(data_path))
18
+
19
+ self.max_desc_len = max_desc_len
20
+ self.max_ctx_len = max_ctx_len
21
+ self.clip_length = clip_length
22
+
23
+ # prepare desc data
24
+ self.use_video = "video" in ctx_mode
25
+ self.use_sub = "sub" in ctx_mode
26
+
27
+ self.desc_bert_h5 = h5py.File(desc_bert_path, "r")
28
+ if self.use_video:
29
+ self.vid_feat_h5 = h5py.File(video_feat_path, "r")
30
+ if self.use_sub:
31
+ self.sub_bert_h5 = h5py.File(sub_bert_path, "r")
32
+
33
+ self.normalize_vfeat = normalize_vfeat
34
+ self.normalize_tfeat = normalize_tfeat
35
+
36
+ def __len__(self):
37
+ return len(self.annotations)
38
+
39
+ def __getitem__(self, index):
40
+ raw_data = self.annotations[index]
41
+ # initialize with basic data
42
+ # meta = dict(query_id=raw_data["query_id"], desc=raw_data["query"], vid_name=raw_data["video_name"],
43
+ # duration=raw_data["duration"], ts=raw_data["timestamp"], simi=raw_data["similarity"], caption=raw_data["caption"])
44
+
45
+ '''
46
+ return a dictionary:
47
+ {
48
+ "simi":
49
+ "query_feat":
50
+ "video_feat":
51
+ "sub_feat":
52
+ "st_ed_indices":
53
+ }
54
+
55
+ '''
56
+ query_id=raw_data["query_id"]
57
+ video_name=raw_data["video_name"]
58
+ timestamp = raw_data["timestamp"]
59
+
60
+ model_inputs = dict()
61
+ model_inputs["simi"] = raw_data["similarity"]
62
+ model_inputs["query_feat"] = self.get_query_feat_by_query_id(query_id)
63
+
64
+ ctx_l = 0
65
+ if self.use_video:
66
+ video_feat = uniform_feature_sampling(self.vid_feat_h5[video_name][:], self.max_ctx_len)
67
+ if self.normalize_vfeat:
68
+ video_feat = l2_normalize_np_array(video_feat)
69
+ model_inputs["video_feat"] = torch.from_numpy(video_feat)
70
+ ctx_l = len(video_feat)
71
+ else:
72
+ model_inputs["video_feat"] = torch.zeros((2, 2))
73
+
74
+ if self.use_sub: # no need for ctx feature, as the features are already contextualized
75
+ sub_feat = uniform_feature_sampling(self.sub_bert_h5[video_name][:], self.max_ctx_len)
76
+ if self.normalize_tfeat:
77
+ sub_feat = l2_normalize_np_array(sub_feat)
78
+ model_inputs["sub_feat"] = torch.from_numpy(sub_feat)
79
+ ctx_l = len(sub_feat)
80
+ else:
81
+ model_inputs["sub_feat"] = torch.zeros((2, 2))
82
+
83
+ # print(ctx_l)
84
+ # print(timestamp)
85
+ model_inputs["st_ed_indices"] = self.get_st_ed_label(timestamp, max_idx=ctx_l - 1)
86
+ # print(model_inputs["st_ed_indices"])
87
+ return model_inputs
88
+ # return dict(meta=meta, model_inputs=model_inputs)
89
+
90
+ def get_st_ed_label(self, ts, max_idx):
91
+ """
92
+ Args:
93
+ ts: [st (float), ed (float)] in seconds, ed > st
94
+ max_idx: length of the video
95
+ Returns:
96
+ [st_idx, ed_idx]: int,
97
+ Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6,
98
+ clips should be indexed as [2: 6), the translated back ts should be [3:9].
99
+ """
100
+ st_idx = min(math.floor(ts[0] / self.clip_length), max_idx)
101
+ ed_idx = min(math.ceil(ts[1] / self.clip_length), max_idx) # -1
102
+ return torch.tensor([st_idx, ed_idx], dtype=torch.long)
103
+
104
+ def get_query_feat_by_query_id(self, query_id):
105
+ query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len]
106
+ if self.normalize_tfeat:
107
+ query_feat = l2_normalize_np_array(query_feat)
108
+ return torch.from_numpy(query_feat)
109
+
110
+ def expand_annotations(self, annotations):
111
+ new_annotations = []
112
+ for i in annotations:
113
+ query = i["query"]
114
+ query_id = i["query_id"]
115
+ for moment in i["relevant_moment"]:
116
+ moment.update({'query': query, 'query_id': query_id})
117
+ new_annotations.append(moment)
118
+ return new_annotations
119
+
120
+
121
+ class QueryEvalDataset(Dataset):
122
+ def __init__(self, data_path, desc_bert_path, max_desc_len, normalize_tfeat=True):
123
+
124
+ self.max_desc_len = max_desc_len
125
+ self.desc_bert_h5 = h5py.File(desc_bert_path, "r")
126
+
127
+ self.annotations = load_json(data_path)
128
+ self.normalize_tfeat = normalize_tfeat
129
+ self.ground_truth = self.get_relevant_moment_gt()
130
+
131
+ def __len__(self):
132
+ return len(self.annotations)
133
+
134
+ def __getitem__(self, index):
135
+ raw_data = self.annotations[index]
136
+ query_id = raw_data["query_id"]
137
+ query = raw_data["query"]
138
+ model_inputs = {"query_id": query_id,
139
+ "query_feat": self.get_query_feat_by_query_id(query_id)}
140
+ return model_inputs
141
+
142
+ def get_query_feat_by_query_id(self, query_id):
143
+ query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len]
144
+ if self.normalize_tfeat:
145
+ query_feat = l2_normalize_np_array(query_feat)
146
+ return torch.from_numpy(query_feat)
147
+
148
+ def get_relevant_moment_gt(self):
149
+ gt_all = {}
150
+ for data in self.annotations:
151
+ gt_all[data["query_id"]] = data["relevant_moment"]
152
+ # gt_all.append({
153
+ # "query_id": data["query_id"],
154
+ # "relevant_moment": data["relevant_moment"]})
155
+ return gt_all
156
+
157
+ def get_st_ed_label(self, ts, max_idx):
158
+ st_idx = min(math.floor(ts[0] / self.clip_length), max_idx)
159
+ ed_idx = min(math.ceil(ts[1] / self.clip_length), max_idx)
160
+ return torch.tensor([st_idx, ed_idx], dtype=torch.long)
161
+
162
+
163
+ class CorpusEvalDataset(Dataset):
164
+ def __init__(self, corpus_path, max_ctx_len, sub_bert_path, video_feat_path, ctx_mode,
165
+ normalize_vfeat=True, normalize_tfeat=True):
166
+ self.normalize_vfeat = normalize_vfeat
167
+ self.normalize_tfeat = normalize_tfeat
168
+
169
+ self.max_ctx_len = max_ctx_len
170
+
171
+ video_data = load_json(corpus_path)
172
+ self.video_data = [{"vid_name": k, "duration": v} for k, v in video_data.items()]
173
+ self.corpus_video_list = list(video_data.keys())
174
+
175
+
176
+ self.use_video = "video" in ctx_mode
177
+ self.use_sub = "sub" in ctx_mode
178
+ if self.use_video:
179
+ self.vid_feat_h5 = h5py.File(video_feat_path, "r")
180
+ if self.use_sub:
181
+ self.sub_bert_h5 = h5py.File(sub_bert_path, "r")
182
+
183
+ def __len__(self):
184
+ return len(self.video_data)
185
+
186
+ def __getitem__(self, index):
187
+ """No need to batch, since it has already been batched here"""
188
+ raw_data = self.video_data[index]
189
+ # initialize with basic data
190
+ meta = dict(vid_name=raw_data["vid_name"], duration=raw_data["duration"])
191
+ model_inputs = dict()
192
+
193
+ if self.use_video:
194
+ video_feat = uniform_feature_sampling(self.vid_feat_h5[meta["vid_name"]][:], self.max_ctx_len)
195
+ if self.normalize_vfeat:
196
+ video_feat = l2_normalize_np_array(video_feat)
197
+ model_inputs["video_feat"] = torch.from_numpy(video_feat)
198
+ else:
199
+ model_inputs["video_feat"] = torch.zeros((2, 2))
200
+
201
+ if self.use_sub: # no need for ctx feature, as the features are already contextualized
202
+ sub_feat = uniform_feature_sampling(self.sub_bert_h5[meta["vid_name"]][:], self.max_ctx_len)
203
+ if self.normalize_tfeat:
204
+ sub_feat = l2_normalize_np_array(sub_feat)
205
+ model_inputs["sub_feat"] = torch.from_numpy(sub_feat)
206
+ else:
207
+ model_inputs["sub_feat"] = torch.zeros((2, 2))
208
+ return model_inputs
modules/infer_lib.py ADDED
@@ -0,0 +1,101 @@
1
+ from tqdm import tqdm, trange
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+
6
+ from utils.run_utils import topk_3d, generate_min_max_length_mask, extract_topk_elements
7
+ from modules.ndcg_iou import calculate_ndcg_iou
8
+
9
+ def grab_corpus_feature(model, corpus_loader, device):
10
+ model.eval()
11
+ all_video_feat, all_video_mask = [], []
12
+ all_sub_feat, all_sub_mask = [], []
13
+
14
+ for batch_input in tqdm(corpus_loader, desc="Compute Corpus Feature: ", total=len(corpus_loader)):
15
+ batch_input = {k: v.to(device) for k, v in batch_input.items()}
16
+ _video_feat, _sub_feat = model.encode_context(batch_input["video_feat"], batch_input["video_mask"],
17
+ batch_input["sub_feat"], batch_input["sub_mask"])
18
+
19
+ all_video_feat.append(_video_feat.detach().cpu())
20
+ all_video_mask.append(batch_input["video_mask"].detach().cpu())
21
+ all_sub_feat.append(_sub_feat.detach().cpu())
22
+ all_sub_mask.append(batch_input["sub_mask"].detach().cpu())
23
+
24
+ all_video_feat = torch.cat(all_video_feat, dim=0)
25
+ all_video_mask = torch.cat(all_video_mask, dim=0)
26
+ all_sub_feat = torch.cat(all_sub_feat, dim=0)
27
+ all_sub_mask = torch.cat(all_sub_mask, dim=0)
28
+
29
+ return { "all_video_feat": all_video_feat,
30
+ "all_video_mask": all_video_mask,
31
+ "all_sub_feat": all_sub_feat,
32
+ "all_sub_mask": all_sub_mask}
33
+
34
+
35
+ def eval_epoch(model, corpus_feature, eval_loader, eval_gt, opt, corpus_video_list):
36
+ topn_video = 100
37
+ device = opt.device
38
+ model.eval()
39
+ all_query_id = []
40
+ all_video_feat = corpus_feature["all_video_feat"].to(device)
41
+ all_video_mask = corpus_feature["all_video_mask"].to(device)
42
+ all_sub_feat = corpus_feature["all_sub_feat"].to(device)
43
+ all_sub_mask = corpus_feature["all_sub_mask"].to(device)
44
+ all_query_score, all_end_prob, all_start_prob = [], [], []
45
+ for batch_input in tqdm(eval_loader, desc="Compute Query Scores: ", total=len(eval_loader)):
46
+ batch_input = {k: v.to(device) for k, v in batch_input.items()}
47
+ query_scores, start_probs, end_probs = model.get_pred_from_raw_query(
48
+ query_feat = batch_input["query_feat"],
49
+ query_mask = batch_input["query_mask"],
50
+ video_feat = all_video_feat,
51
+ video_mask = all_video_mask,
52
+ sub_feat = all_sub_feat,
53
+ sub_mask = all_sub_mask,
54
+ cross=True)
55
+ query_scores = torch.exp(opt.q2c_alpha * query_scores)
56
+ start_probs = F.softmax(start_probs, dim=-1)
57
+ end_probs = F.softmax(end_probs, dim=-1)
58
+
59
+ query_scores, start_probs, end_probs = extract_topk_elements(query_scores, start_probs, end_probs, topn_video)
60
+
61
+ all_query_id.append(batch_input["query_id"].detach().cpu())
62
+ all_query_score.append(query_scores.detach().cpu())
63
+ all_start_prob.append(start_probs.detach().cpu())
64
+ all_end_prob.append(end_probs.detach().cpu())
65
+
66
+ all_query_id = torch.cat(all_query_id, dim=0)
67
+ all_query_id = all_query_id.tolist()
68
+
69
+ all_query_score = torch.cat(all_query_score, dim=0)
70
+ all_start_prob = torch.cat(all_start_prob, dim=0)
71
+ all_end_prob = torch.cat(all_end_prob, dim=0)
72
+ average_ndcg = calculate_average_ndcg(all_query_id, all_start_prob, all_query_score, all_end_prob, corpus_video_list, eval_gt, opt)
73
+ return average_ndcg
74
+
75
+ def calculate_average_ndcg(all_query_id, all_start_prob, all_query_score, all_end_prob, corpus_video_list, eval_gt, opt):
76
+ topn_moment = max(opt.ndcg_topk)
77
+
78
+ all_2D_map = torch.einsum("qvm,qv,qvn->qvmn", all_start_prob, all_query_score, all_end_prob)
79
+ map_mask = generate_min_max_length_mask(all_2D_map.shape, min_l=opt.min_pred_l, max_l=opt.max_pred_l)
80
+ all_2D_map = all_2D_map * map_mask
81
+ all_pred = {}
82
+ for i in trange(len(all_2D_map), desc="Collect Predictions: "):
83
+ query_id = all_query_id[i]
84
+ score_map = all_2D_map[i]
85
+ top_score, top_idx = topk_3d(score_map, topn_moment)
86
+ pred_videos = [corpus_video_list[i[0]] for i in top_idx]
87
+ pre_start_time = [i[1].item() * opt.clip_length for i in top_idx]
88
+ pre_end_time = [i[2].item() * opt.clip_length for i in top_idx]
89
+
90
+ pred_result = []
91
+ for video_name, s, e, score in zip(pred_videos, pre_start_time, pre_end_time, top_score):
92
+ pred_result.append({
93
+ "video_name": video_name,
94
+ "timestamp": [s, e],
95
+ "model_scores": score
96
+ })
97
+ # print(pred_result)  # debug output; disabled to avoid flooding the log for every query
98
+ all_pred[query_id] = pred_result
99
+
100
+ average_ndcg = calculate_ndcg_iou(eval_gt, all_pred, opt.iou_threshold, opt.ndcg_topk)
101
+ return average_ndcg
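
To make the score fusion in calculate_average_ndcg concrete, here is a tiny self-contained illustration with made-up numbers: the einsum builds, for every query-video pair, a 2D map whose (m, n) entry is start_prob[m] * query_score * end_prob[n].

import torch

start_prob = torch.tensor([[[0.6, 0.3, 0.1],
                            [0.2, 0.5, 0.3]]])   # (n_query=1, n_video=2, n_clip=3)
query_score = torch.tensor([[2.0, 1.0]])          # (n_query=1, n_video=2)
end_prob = torch.tensor([[[0.1, 0.4, 0.5],
                          [0.3, 0.3, 0.4]]])      # (n_query=1, n_video=2, n_clip=3)

score_map = torch.einsum("qvm,qv,qvn->qvmn", start_prob, query_score, end_prob)
# Video 0, start clip 0, end clip 2: 0.6 * 2.0 * 0.5 = 0.6
print(score_map[0, 0, 0, 2])  # tensor(0.6000)

generate_min_max_length_mask then zeroes entries whose span is shorter than min_pred_l clips or longer than max_pred_l clips before topk_3d selects the top moments.
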
modules/model_components.py ADDED
@@ -0,0 +1,317 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+
7
+ def onehot(indexes, N=None):
8
+ """
9
+ Creates a one-hot representation of indexes with N possible entries.
10
+ If N is not specified, it defaults to the maximum index appearing plus one.
11
+ indexes is a long-tensor of indexes
12
+ """
13
+ if N is None:
14
+ N = indexes.max() + 1
15
+ sz = list(indexes.size())
16
+ output = indexes.new().long().resize_(*sz, N).zero_()
17
+ output.scatter_(-1, indexes.unsqueeze(-1), 1)
18
+ return output
19
+
20
+
21
+ class SmoothedCrossEntropyLoss(nn.Module):
22
+ def __init__(self, reduction='mean'):
23
+ super(SmoothedCrossEntropyLoss, self).__init__()
24
+ self.reduction = reduction
25
+
26
+ def forward(self, logits, labels, smooth_eps=0.1, mask=None, from_logits=True):
27
+ """
28
+ Args:
29
+ logits: (N, Lv), unnormalized probabilities, torch.float32
30
+ labels: (N, Lv) or (N, ), one hot labels or indices labels, torch.float32 or torch.int64
31
+ smooth_eps: float
32
+ mask: (N, Lv)
33
+ from_logits: bool
34
+ """
35
+ if from_logits:
36
+ probs = F.log_softmax(logits, dim=-1)
37
+ else:
38
+ probs = logits
39
+ num_classes = probs.size()[-1]
40
+ if len(probs.size()) > len(labels.size()):
41
+ labels = onehot(labels, num_classes).type(probs.dtype)
42
+ if mask is None:
43
+ labels = labels * (1 - smooth_eps) + smooth_eps / num_classes
44
+ else:
45
+ mask = mask.type(probs.dtype)
46
+ valid_samples = torch.sum(mask, dim=-1, keepdim=True, dtype=probs.dtype) # (N, 1)
47
+ eps_per_sample = smooth_eps / valid_samples
48
+ labels = (labels * (1 - smooth_eps) + eps_per_sample) * mask
49
+ loss = -torch.sum(labels * probs, dim=-1)
50
+ if self.reduction == 'sum':
51
+ return torch.sum(loss)
52
+ elif self.reduction == 'mean':
53
+ return torch.mean(loss)
54
+ else:
55
+ return loss # (N, )
56
+
57
+
58
+ class MILNCELoss(nn.Module):
59
+ def __init__(self, reduction='mean'):
60
+ super(MILNCELoss, self).__init__()
61
+ self.reduction = reduction
62
+
63
+ def forward(self, q2ctx_scores=None, contexts=None, queries=None):
64
+ if q2ctx_scores is None:
65
+ assert contexts is not None and queries is not None
66
+ x = torch.matmul(contexts, queries.t())
67
+ device = contexts.device
68
+ bsz = contexts.shape[0]
69
+ else:
70
+ x = q2ctx_scores
71
+ device = q2ctx_scores.device
72
+ bsz = q2ctx_scores.shape[0]
73
+ x = x.view(bsz, bsz, -1)
74
+ nominator = x * torch.eye(x.shape[0], dtype=torch.float32, device=device)[:, :, None]
75
+ nominator = nominator.sum(dim=1)
76
+ nominator = torch.logsumexp(nominator, dim=1)
77
+ denominator = torch.cat((x, x.permute(1, 0, 2)), dim=1).view(x.shape[0], -1)
78
+ denominator = torch.logsumexp(denominator, dim=1)
79
+ if self.reduction:
80
+ return torch.mean(denominator - nominator)
81
+ else:
82
+ return denominator - nominator
83
+
84
+
85
+ class DepthwiseSeparableConv(nn.Module):
86
+ """
87
+ Depth-wise separable convolution uses fewer parameters than a standard convolution to produce its output.
88
+ :Examples:
89
+ >>> m = DepthwiseSeparableConv(300, 200, 5, dim=1)
90
+ >>> input_tensor = torch.randn(32, 300, 20)
91
+ >>> output = m(input_tensor)
92
+ """
93
+ def __init__(self, in_ch, out_ch, k, dim=1, relu=True):
94
+ """
95
+ :param in_ch: input hidden dimension size
96
+ :param out_ch: output hidden dimension size
97
+ :param k: kernel size
98
+ :param dim: default 1. 1D conv or 2D conv
99
+ """
100
+ super(DepthwiseSeparableConv, self).__init__()
101
+ self.relu = relu
102
+ if dim == 1:
103
+ self.depthwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=in_ch, kernel_size=k, groups=in_ch,
104
+ padding=k // 2)
105
+ self.pointwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=out_ch, kernel_size=1, padding=0)
106
+ elif dim == 2:
107
+ self.depthwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=in_ch, kernel_size=k, groups=in_ch,
108
+ padding=k // 2)
109
+ self.pointwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=out_ch, kernel_size=1, padding=0)
110
+ else:
111
+ raise Exception("Incorrect dimension!")
112
+
113
+ def forward(self, x):
114
+ """
115
+ :Input: (N, L_in, D)
116
+ :Output: (N, L_out, D)
117
+ """
118
+ x = x.transpose(1, 2)
119
+ if self.relu:
120
+ out = F.relu(self.pointwise_conv(self.depthwise_conv(x)), inplace=True)
121
+ else:
122
+ out = self.pointwise_conv(self.depthwise_conv(x))
123
+ return out.transpose(1, 2) # (N, L, D)
124
+
125
+
126
+ class ConvEncoder(nn.Module):
127
+ def __init__(self, kernel_size=7, n_filters=128, dropout=0.1):
128
+ super(ConvEncoder, self).__init__()
129
+ self.dropout = nn.Dropout(dropout)
130
+ self.layer_norm = nn.LayerNorm(n_filters)
131
+ self.conv = DepthwiseSeparableConv(in_ch=n_filters, out_ch=n_filters, k=kernel_size, relu=True)
132
+
133
+ def forward(self, x):
134
+ """
135
+ :param x: (N, L, D)
136
+ :return: (N, L, D)
137
+ """
138
+ return self.layer_norm(self.dropout(self.conv(x)) + x) # (N, L, D)
139
+
140
+
141
+ class TrainablePositionalEncoding(nn.Module):
142
+ """Construct the embeddings from word, position and token_type embeddings."""
143
+ def __init__(self, max_position_embeddings, hidden_size, dropout=0.1):
144
+ super(TrainablePositionalEncoding, self).__init__()
145
+ self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
146
+ self.LayerNorm = nn.LayerNorm(hidden_size)
147
+ self.dropout = nn.Dropout(dropout)
148
+
149
+ def forward(self, input_feat):
150
+ bsz, seq_length = input_feat.shape[:2]
151
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=input_feat.device)
152
+ position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L)
153
+ position_embeddings = self.position_embeddings(position_ids)
154
+ embeddings = self.LayerNorm(input_feat + position_embeddings)
155
+ embeddings = self.dropout(embeddings)
156
+ return embeddings
157
+
158
+ def add_position_emb(self, input_feat):
159
+ bsz, seq_length = input_feat.shape[:2]
160
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=input_feat.device)
161
+ position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L)
162
+ position_embeddings = self.position_embeddings(position_ids)
163
+ return input_feat + position_embeddings
164
+
165
+
166
+ class LinearLayer(nn.Module):
167
+ """linear layer configurable with layer normalization, dropout, ReLU."""
168
+ def __init__(self, in_hsz, out_hsz, layer_norm=True, dropout=0.1, relu=True):
169
+ super(LinearLayer, self).__init__()
170
+ self.relu = relu
171
+ self.layer_norm = layer_norm
172
+ if layer_norm:
173
+ self.LayerNorm = nn.LayerNorm(in_hsz)
174
+ layers = [nn.Dropout(dropout), nn.Linear(in_hsz, out_hsz)]
175
+ self.net = nn.Sequential(*layers)
176
+
177
+ def forward(self, x):
178
+ """(N, L, D)"""
179
+ if self.layer_norm:
180
+ x = self.LayerNorm(x)
181
+ x = self.net(x)
182
+ if self.relu:
183
+ x = F.relu(x, inplace=True)
184
+ return x # (N, L, D)
185
+
186
+
187
+ class BertLayer(nn.Module):
188
+ def __init__(self, config, use_self_attention=True):
189
+ super(BertLayer, self).__init__()
190
+ self.use_self_attention = use_self_attention
191
+ if use_self_attention:
192
+ self.attention = BertAttention(config)
193
+ self.intermediate = BertIntermediate(config)
194
+ self.output = BertOutput(config)
195
+
196
+ def forward(self, hidden_states, attention_mask):
197
+ """
198
+ Args:
199
+ hidden_states: (N, L, D)
200
+ attention_mask: (N, L) with 1 indicate valid, 0 indicates invalid
201
+ """
202
+ if self.use_self_attention:
203
+ attention_output = self.attention(hidden_states, attention_mask)
204
+ else:
205
+ attention_output = hidden_states
206
+ intermediate_output = self.intermediate(attention_output)
207
+ layer_output = self.output(intermediate_output, attention_output)
208
+ return layer_output
209
+
210
+
211
+ class BertAttention(nn.Module):
212
+ def __init__(self, config):
213
+ super(BertAttention, self).__init__()
214
+ self.self = BertSelfAttention(config)
215
+ self.output = BertSelfOutput(config)
216
+
217
+ def forward(self, input_tensor, attention_mask):
218
+ """
219
+ Args:
220
+ input_tensor: (N, L, D)
221
+ attention_mask: (N, L)
222
+ """
223
+ self_output = self.self(input_tensor, input_tensor, input_tensor, attention_mask)
224
+ attention_output = self.output(self_output, input_tensor)
225
+ return attention_output
226
+
227
+
228
+ class BertIntermediate(nn.Module):
229
+ def __init__(self, config):
230
+ super(BertIntermediate, self).__init__()
231
+ self.dense = nn.Sequential(nn.Linear(config.hidden_size, config.intermediate_size), nn.ReLU(True))
232
+
233
+ def forward(self, hidden_states):
234
+ return self.dense(hidden_states)
235
+
236
+
237
+ class BertOutput(nn.Module):
238
+ def __init__(self, config):
239
+ super(BertOutput, self).__init__()
240
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
241
+ self.LayerNorm = nn.LayerNorm(config.hidden_size)
242
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
243
+
244
+ def forward(self, hidden_states, input_tensor):
245
+ hidden_states = self.dense(hidden_states)
246
+ hidden_states = self.dropout(hidden_states)
247
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
248
+ return hidden_states
249
+
250
+
251
+ class BertSelfAttention(nn.Module):
252
+ def __init__(self, config):
253
+ super(BertSelfAttention, self).__init__()
254
+ if config.hidden_size % config.num_attention_heads != 0:
255
+ raise ValueError("The hidden size (%d) is not a multiple of the number of attention heads (%d)" % (
256
+ config.hidden_size, config.num_attention_heads))
257
+ self.num_attention_heads = config.num_attention_heads
258
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
259
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
260
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
261
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
262
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
263
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
264
+
265
+ def transpose_for_scores(self, x):
266
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) # (N, L, nh, dh)
267
+ x = x.view(*new_x_shape)
268
+ return x.permute(0, 2, 1, 3) # (N, nh, L, dh)
269
+
270
+ def forward(self, query_states, key_states, value_states, attention_mask):
271
+ """
272
+ Args:
273
+ query_states: (N, Lq, D)
274
+ key_states: (N, L, D)
275
+ value_states: (N, L, D)
276
+ attention_mask: (N, Lq, L)
277
+ """
278
+ # only need to mask the dimension where the softmax (last dim) is applied, as another dim (second last)
279
+ # will be ignored in future computation anyway
280
+ attention_mask = (1 - attention_mask.unsqueeze(1)) * -10000. # (N, 1, Lq, L)
281
+ mixed_query_layer = self.query(query_states)
282
+ mixed_key_layer = self.key(key_states)
283
+ mixed_value_layer = self.value(value_states)
284
+ # transpose
285
+ query_layer = self.transpose_for_scores(mixed_query_layer) # (N, nh, Lq, dh)
286
+ key_layer = self.transpose_for_scores(mixed_key_layer) # (N, nh, L, dh)
287
+ value_layer = self.transpose_for_scores(mixed_value_layer) # (N, nh, L, dh)
288
+ # Take the dot product between "query" and "key" to get the raw attention scores.
289
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) # (N, nh, Lq, L)
290
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
291
+ # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
292
+ attention_scores = attention_scores + attention_mask
293
+ # Normalize the attention scores to probabilities.
294
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
295
+ # This is actually dropping out entire tokens to attend to, which might
296
+ # seem a bit unusual, but is taken from the original Transformer paper.
297
+ attention_probs = self.dropout(attention_probs)
298
+ # compute output context
299
+ context_layer = torch.matmul(attention_probs, value_layer)
300
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
301
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
302
+ context_layer = context_layer.view(*new_context_layer_shape)
303
+ return context_layer
304
+
305
+
306
+ class BertSelfOutput(nn.Module):
307
+ def __init__(self, config):
308
+ super(BertSelfOutput, self).__init__()
309
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
310
+ self.LayerNorm = nn.LayerNorm(config.hidden_size)
311
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
312
+
313
+ def forward(self, hidden_states, input_tensor):
314
+ hidden_states = self.dense(hidden_states)
315
+ hidden_states = self.dropout(hidden_states)
316
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
317
+ return hidden_states
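
A minimal usage sketch for SmoothedCrossEntropyLoss defined above; the shapes are arbitrary and the mask simply marks all positions as valid.

import torch
from modules.model_components import SmoothedCrossEntropyLoss

criterion = SmoothedCrossEntropyLoss(reduction="mean")
logits = torch.randn(4, 10)           # (N, Lv) unnormalized scores
labels = torch.randint(0, 10, (4,))   # (N,) index labels
mask = torch.ones(4, 10)              # 1 = valid position
loss = criterion(logits, labels, smooth_eps=0.1, mask=mask)
print(loss.item())
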
modules/ndcg_iou.py ADDED
@@ -0,0 +1,64 @@
1
+ import pandas as pd
2
+ from tqdm import tqdm, trange
3
+ import numpy as np
4
+ from collections import defaultdict
5
+ import copy
6
+
7
+ def calculate_iou(pred_start: float, pred_end: float, gt_start: float, gt_end: float) -> float:
8
+ intersection_start = max(pred_start, gt_start)
9
+ intersection_end = min(pred_end, gt_end)
10
+ intersection = max(0, intersection_end - intersection_start)
11
+ union = (pred_end - pred_start) + (gt_end - gt_start) - intersection
12
+ return intersection / union if union > 0 else 0
13
+
14
+
15
+ # Function to calculate DCG
16
+ def calculate_dcg(scores):
17
+ return sum((2**score - 1) / np.log2(idx + 2) for idx, score in enumerate(scores))
18
+
19
+ # Function to calculate NDCG
20
+ def calculate_ndcg(pred_scores, true_scores):
21
+ dcg = calculate_dcg(pred_scores)
22
+ idcg = calculate_dcg(sorted(true_scores, reverse=True))
23
+ return dcg / idcg if idcg > 0 else 0
24
+
25
+ def calculate_ndcg_iou(all_gt, all_pred, TS, KS):
26
+ performance = defaultdict(lambda: defaultdict(list))
27
+ performance_avg = defaultdict(lambda: defaultdict(float))
28
+ for k in all_pred.keys():
29
+ one_pred = all_pred[k]
30
+ one_gt = all_gt[k]
31
+
32
+ one_gt.sort(key=lambda x: x["relevance"], reverse=True)
33
+ for T in TS:
34
+ one_gt_drop = copy.deepcopy(one_gt)
35
+ predictions_with_scores = []
36
+
37
+ for pred in one_pred:
38
+ pred_video_name, pred_time = pred["video_name"], pred["timestamp"]
39
+ matched_rows = [gt for gt in one_gt_drop if gt["video_name"] == pred_video_name]
40
+ if not matched_rows:
41
+ pred["pred_relevance"] = 0
42
+ else:
43
+ ious = [calculate_iou(pred_time[0], pred_time[1], gt["timestamp"][0], gt["timestamp"][1]) for gt in matched_rows]
44
+ max_iou_idx = np.argmax(ious)
45
+ max_iou_row = matched_rows[max_iou_idx]
46
+
47
+ if ious[max_iou_idx] > T:
48
+ pred["pred_relevance"] = max_iou_row["relevance"]
49
+ # Remove the matched ground truth row
50
+ original_idx = one_gt_drop.index(max_iou_row)
51
+ one_gt_drop.pop(original_idx)
52
+ else:
53
+ pred["pred_relevance"] = 0
54
+ predictions_with_scores.append(pred)
55
+ for K in KS:
56
+ true_scores = [gt["relevance"] for gt in one_gt][:K]
57
+ pred_scores = [pred["pred_relevance"] for pred in predictions_with_scores][:K]
58
+ ndcg_score = calculate_ndcg(pred_scores, true_scores)
59
+ performance[K][T].append(ndcg_score)
60
+ for K, vs in performance.items():
61
+ for T, v in vs.items():
62
+ performance_avg[K][T] = np.mean(v)
63
+ return performance_avg
64
+
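
A small self-contained example of the input format calculate_ndcg_iou expects; the ground-truth moments, relevance levels, and predictions below are made up.

from modules.ndcg_iou import calculate_ndcg_iou

gt = {"q1": [{"video_name": "v1", "timestamp": [0.0, 10.0], "relevance": 2},
             {"video_name": "v2", "timestamp": [5.0, 15.0], "relevance": 1}]}
pred = {"q1": [{"video_name": "v1", "timestamp": [0.0, 9.0]},    # IoU 0.9 with the first gt moment
               {"video_name": "v3", "timestamp": [2.0, 8.0]}]}   # no gt moment in v3 -> relevance 0

scores = calculate_ndcg_iou(gt, pred, TS=[0.5], KS=[2])
print(scores[2][0.5])  # about 0.83 for this toy case
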
modules/optimization.py ADDED
@@ -0,0 +1,343 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch optimization for BERT model."""
16
+
17
+ import math
18
+ import torch
19
+ from torch.optim import Optimizer
20
+ from torch.optim.optimizer import required
21
+ from torch.nn.utils import clip_grad_norm_
22
+ import logging
23
+ import abc
24
+ import sys
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ if sys.version_info >= (3, 4):
30
+ ABC = abc.ABC
31
+ else:
32
+ ABC = abc.ABCMeta('ABC', (), {})
33
+
34
+
35
+ class _LRSchedule(ABC):
36
+ """ Parent of all LRSchedules here. """
37
+ warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense
38
+
39
+ def __init__(self, warmup=0.002, t_total=-1, **kw):
40
+ """
41
+ :param warmup: what fraction of t_total steps will be used for linear warmup
42
+ :param t_total: how many training steps (updates) are planned
43
+ :param kw:
44
+ """
45
+ super(_LRSchedule, self).__init__(**kw)
46
+ if t_total < 0:
47
+ logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
48
+ if not 0.0 <= warmup < 1.0 and not warmup == -1:
49
+ raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
50
+ warmup = max(warmup, 0.)
51
+ self.warmup, self.t_total = float(warmup), float(t_total)
52
+ self.warned_for_t_total_at_progress = -1
53
+
54
+ def get_lr(self, step, nowarn=False):
55
+ """
56
+ :param step: which of t_total steps we're on
57
+ :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps
58
+ :return: learning rate multiplier for current update
59
+ """
60
+ if self.t_total < 0:
61
+ return 1.
62
+ progress = float(step) / self.t_total
63
+ ret = self.get_lr_(progress)
64
+ # warning for exceeding t_total (only active with warmup_linear)
65
+ if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
66
+ logger.warning("Training beyond specified 't_total'. Learning rate multiplier set to {}. Please "
67
+ "set 't_total' of {} correctly.".format(ret, self.__class__.__name__))
68
+ self.warned_for_t_total_at_progress = progress
69
+ # end warning
70
+ return ret
71
+
72
+ @abc.abstractmethod
73
+ def get_lr_(self, progress):
74
+ """
75
+ :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress
76
+ :return: learning rate multiplier for current update
77
+ """
78
+ return 1.
79
+
80
+
81
+ class ConstantLR(_LRSchedule):
82
+ def get_lr_(self, progress):
83
+ return 1.
84
+
85
+
86
+ class WarmupCosineSchedule(_LRSchedule):
87
+ """
88
+ Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
89
+ Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
90
+ If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
91
+ """
92
+ warn_t_total = True
93
+
94
+ def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
95
+ """
96
+ :param warmup: see LRSchedule
97
+ :param t_total: see LRSchedule
98
+ :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1.
99
+ at progress==warmup and 0 at progress==1.
100
+ :param kw:
101
+ """
102
+ super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
103
+ self.cycles = cycles
104
+
105
+ def get_lr_(self, progress):
106
+ if progress < self.warmup:
107
+ return progress / self.warmup
108
+ else:
109
+ progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
110
+ return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
111
+
112
+
113
+ class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
114
+ """
115
+ Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
116
+ If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
117
+ learning rate (with hard restarts).
118
+ """
119
+ def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
120
+ super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
121
+ assert(cycles >= 1.)
122
+
123
+ def get_lr_(self, progress):
124
+ if progress < self.warmup:
125
+ return progress / self.warmup
126
+ else:
127
+ progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
128
+ ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
129
+ return ret
130
+
131
+
132
+ class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
133
+ """
134
+ All training progress is divided in `cycles` (default=1.) parts of equal length.
135
+ Every part follows a schedule with the first `warmup` fraction of training steps linearly increasing from 0. to 1.,
136
+ followed by a learning rate decreasing from 1. to 0. following a cosine curve.
137
+ """
138
+ def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
139
+ assert(warmup * cycles < 1.)
140
+ warmup = warmup * cycles if warmup >= 0 else warmup
141
+ super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles,
142
+ **kw)
143
+
144
+ def get_lr_(self, progress):
145
+ progress = progress * self.cycles % 1.
146
+ if progress < self.warmup:
147
+ return progress / self.warmup
148
+ else:
149
+ progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
150
+ ret = 0.5 * (1. + math.cos(math.pi * progress))
151
+ return ret
152
+
153
+
154
+ class WarmupConstantSchedule(_LRSchedule):
155
+ """
156
+ Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
157
+ Keeps learning rate equal to 1. after warmup.
158
+ """
159
+ def get_lr_(self, progress):
160
+ if progress < self.warmup:
161
+ return progress / self.warmup
162
+ return 1.
163
+
164
+
165
+ class WarmupLinearSchedule(_LRSchedule):
166
+ """
167
+ Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
168
+ Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
169
+ """
170
+ warn_t_total = True
171
+
172
+ def get_lr_(self, progress):
173
+ if progress < self.warmup:
174
+ return progress / self.warmup
175
+ return max((progress - 1.) / (self.warmup - 1.), 0.)
176
+
177
+
178
+ SCHEDULES = {
179
+ None: ConstantLR,
180
+ "none": ConstantLR,
181
+ "warmup_cosine": WarmupCosineSchedule,
182
+ "warmup_constant": WarmupConstantSchedule,
183
+ "warmup_linear": WarmupLinearSchedule
184
+ }
185
+
186
+
187
+ class EMA(object):
188
+ """ Exponential Moving Average for model parameters.
189
+ references:
190
+ [1] https://github.com/BangLiu/QANet-PyTorch/blob/master/model/modules/ema.py
191
+ [2] https://github.com/hengruo/QANet-pytorch/blob/e2de07cd2c711d525f5ffee35c3764335d4b501d/main.py"""
192
+ def __init__(self, decay):
193
+ self.decay = decay
194
+ self.shadow = {}
195
+ self.original = {}
196
+
197
+ def register(self, name, val):
198
+ self.shadow[name] = val.clone()
199
+
200
+ def __call__(self, model, step):
201
+ decay = min(self.decay, (1 + step) / (10.0 + step))
202
+ for name, param in model.named_parameters():
203
+ if param.requires_grad:
204
+ assert name in self.shadow
205
+ new_average = \
206
+ (1.0 - decay) * param.data + decay * self.shadow[name]
207
+ self.shadow[name] = new_average.clone()
208
+
209
+ def assign(self, model):
210
+ for name, param in model.named_parameters():
211
+ if param.requires_grad:
212
+ assert name in self.shadow
213
+ self.original[name] = param.data.clone()
214
+ param.data = self.shadow[name]
215
+
216
+ def resume(self, model):
217
+ for name, param in model.named_parameters():
218
+ if param.requires_grad:
219
+ assert name in self.shadow
220
+ param.data = self.original[name]
221
+
222
+
223
+ class BertAdam(Optimizer):
224
+ """Implements BERT version of Adam algorithm with weight decay fix.
225
+ Params:
226
+ lr: learning rate
227
+ warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
228
+ t_total: total number of training steps for the learning
229
+ rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
230
+ schedule: schedule to use for the warmup (see above).
231
+ Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object
232
+ (see below).
233
+ If `None` or `'none'`, learning rate is always kept constant.
234
+ Default : `'warmup_linear'`
235
+ b1: Adams b1. Default: 0.9
236
+ b2: Adams b2. Default: 0.999
237
+ e: Adams epsilon. Default: 1e-6
238
+ weight_decay: Weight decay. Default: 0.01
239
+ max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
240
+ """
241
+ def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
242
+ b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
243
+ if lr is not required and lr < 0.0:
244
+ raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
245
+ if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
246
+ raise ValueError("Invalid schedule parameter: {}".format(schedule))
247
+ if not 0.0 <= b1 < 1.0:
248
+ raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
249
+ if not 0.0 <= b2 < 1.0:
250
+ raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
251
+ if not e >= 0.0:
252
+ raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
253
+ # initialize schedule object
254
+ if not isinstance(schedule, _LRSchedule):
255
+ schedule_type = SCHEDULES[schedule]
256
+ schedule = schedule_type(warmup=warmup, t_total=t_total)
257
+ else:
258
+ if warmup != -1 or t_total != -1:
259
+ logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is "
260
+ "provided as schedule. Please specify custom warmup and t_total in _LRSchedule object.")
261
+ defaults = dict(lr=lr, schedule=schedule,
262
+ b1=b1, b2=b2, e=e, weight_decay=weight_decay,
263
+ max_grad_norm=max_grad_norm)
264
+ super(BertAdam, self).__init__(params, defaults)
265
+
266
+ def get_lr(self):
267
+ lr = []
268
+ for group in self.param_groups:
269
+ for p in group['params']:
270
+ state = self.state[p]
271
+ if len(state) == 0:
272
+ return [0]
273
+ lr_scheduled = group['lr']
274
+ lr_scheduled *= group['schedule'].get_lr(state['step'])
275
+ lr.append(lr_scheduled)
276
+ return lr
277
+
278
+ def step(self, closure=None):
279
+ """Performs a single optimization step.
280
+
281
+ Arguments:
282
+ closure (callable, optional): A closure that reevaluates the model
283
+ and returns the loss.
284
+ """
285
+ loss = None
286
+ if closure is not None:
287
+ loss = closure()
288
+
289
+ for group in self.param_groups:
290
+ for p in group['params']:
291
+ if p.grad is None:
292
+ continue
293
+ grad = p.grad.data
294
+ if grad.is_sparse:
295
+ raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
296
+
297
+ state = self.state[p]
298
+
299
+ # State initialization
300
+ if len(state) == 0:
301
+ state['step'] = 0
302
+ # Exponential moving average of gradient values
303
+ state['next_m'] = torch.zeros_like(p.data)
304
+ # Exponential moving average of squared gradient values
305
+ state['next_v'] = torch.zeros_like(p.data)
306
+
307
+ next_m, next_v = state['next_m'], state['next_v']
308
+ beta1, beta2 = group['b1'], group['b2']
309
+
310
+ # Add grad clipping
311
+ if group['max_grad_norm'] > 0:
312
+ clip_grad_norm_(p, group['max_grad_norm'])
313
+
314
+ # Decay the first and second moment running average coefficient
315
+ # In-place operations to update the averages at the same time
316
+ next_m.mul_(beta1).add_(grad, alpha=1 - beta1)
317
+ next_v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
318
+ update = next_m / (next_v.sqrt() + group['e'])
319
+
320
+ # Just adding the square of the weights to the loss function is *not*
321
+ # the correct way of using L2 regularization/weight decay with Adam,
322
+ # since that will interact with the m and v parameters in strange ways.
323
+ #
324
+ # Instead we want to decay the weights in a manner that doesn't interact
325
+ # with the m/v parameters. This is equivalent to adding the square
326
+ # of the weights to the loss with plain (non-momentum) SGD.
327
+ if group['weight_decay'] > 0.0:
328
+ update += group['weight_decay'] * p.data
329
+
330
+ lr_scheduled = group['lr']
331
+ lr_scheduled *= group['schedule'].get_lr(state['step'])
332
+
333
+ update_with_lr = lr_scheduled * update
334
+ p.data.add_(-update_with_lr)
335
+
336
+ state['step'] += 1
337
+
338
+ # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
339
+ # No bias correction
340
+ # bias_correction1 = 1 - beta1 ** state['step']
341
+ # bias_correction2 = 1 - beta2 ** state['step']
342
+
343
+ return loss
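
To illustrate the warmup_linear schedule that BertAdam uses by default, a short sketch printing the learning-rate multiplier at a few steps; the values follow directly from WarmupLinearSchedule.get_lr_.

from modules.optimization import WarmupLinearSchedule

schedule = WarmupLinearSchedule(warmup=0.1, t_total=100)
for step in [0, 5, 10, 50, 100]:
    print(step, round(schedule.get_lr(step), 4))
# 0 0.0 | 5 0.5 | 10 1.0 | 50 0.5556 | 100 0.0
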
run_top20.sh ADDED
@@ -0,0 +1,14 @@
1
+ python train.py \
2
+ --results_path results/tvr_ranking \
3
+ --train_path data/TVR_Ranking/train_top20.json \
4
+ --val_path data/TVR_Ranking/val.json \
5
+ --test_path data/TVR_Ranking/test.json \
6
+ --corpus_path data/TVR_Ranking/video_corpus.json \
7
+ --desc_bert_path data/TVR_Ranking/features/query_bert.h5 \
8
+ --video_feat_path data/TVR_Ranking/features/tvr_i3d_rgb600_avg_cl-1.5.h5 \
9
+ --sub_bert_path data/TVR_Ranking/features/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 \
10
+ --n_epoch 100 \
11
+ --eval_num_per_epoch 1 \
12
+ --seed 2024 \
13
+ --exp_id new_version
14
+
train.py ADDED
@@ -0,0 +1,69 @@
1
+ import os, json
2
+ import torch
3
+ from tqdm import tqdm
4
+
5
+ from modules.dataset_init import prepare_dataset
6
+ from modules.infer_lib import grab_corpus_feature, eval_epoch
7
+
8
+ from utils.basic_utils import AverageMeter, get_logger
9
+ from utils.setup import set_seed, get_args
10
+ from utils.run_utils import prepare_optimizer, prepare_model, logger_ndcg_iou
11
+
12
+ def main():
13
+ opt = get_args()
14
+ logger = get_logger(opt.results_path, opt.exp_id)
15
+ set_seed(opt.seed)
16
+ logger.info("Arguments:\n%s", json.dumps(vars(opt), indent=4))
17
+ opt.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
18
+ logger.info(f"device: {opt.device}")
19
+
20
+
21
+
22
+ train_loader, corpus_loader, corpus_video_list, val_loader, test_loader, val_gt, test_gt = prepare_dataset(opt)
23
+
24
+ model = prepare_model(opt, logger)
25
+ optimizer = prepare_optimizer(model, opt, len(train_loader) * opt.n_epoch)
26
+
27
+ eval_step = len(train_loader) // opt.eval_num_per_epoch
28
+ best_val_ndcg = 0
29
+ for epoch_i in range(0, opt.n_epoch):
30
+ logger.info(f"TRAIN EPOCH: {epoch_i}|{opt.n_epoch}")
31
+ model.train()
32
+ if opt.hard_negative_start_epoch != -1 and epoch_i >= opt.hard_negative_start_epoch:
33
+ model.set_hard_negative(True, opt.hard_pool_size)
34
+
35
+ model.train()
36
+ for step, batch_input in tqdm(enumerate(train_loader), desc="Training", total=len(train_loader)):
37
+ step += 1
38
+ batch_input = {k: v.to(opt.device) for k, v in batch_input.items()}
39
+ loss = model(**batch_input)
40
+ optimizer.zero_grad()
41
+ loss.backward()
42
+ # nn.utils.clip_grad_norm_(model.parameters())
43
+ optimizer.step()
44
+
45
+ if step % opt.log_step == 0:
46
+ logger.info(f"EPOCH {epoch_i}/{opt.n_epoch} | STEP: {step}|{len(train_loader)} | Loss: {loss.item():.6f}")
47
+
48
+ if step % eval_step == 0 or step == len(train_loader):
49
+ corpus_feature = grab_corpus_feature(model, corpus_loader, opt.device)
50
+ val_ndcg_iou = eval_epoch(model, corpus_feature, val_loader, val_gt, opt, corpus_video_list)
51
+ test_ndcg_iou = eval_epoch(model, corpus_feature, test_loader, test_gt, opt, corpus_video_list)
52
+
53
+ logger_ndcg_iou(val_ndcg_iou, logger, "VAL")
54
+ logger_ndcg_iou(test_ndcg_iou, logger, "TEST")
55
+
56
+ if val_ndcg_iou[20][0.5] > best_val_ndcg:
57
+ best_val_ndcg = val_ndcg_iou[20][0.5]
58
+ logger_ndcg_iou(val_ndcg_iou, logger, "BEST VAL")
59
+ logger_ndcg_iou(test_ndcg_iou, logger, "BEST TEST")
60
+
61
+ checkpoint = {"model": model.state_dict(), "model_cfg": model.config, "epoch": epoch_i}
62
+
63
+ bestmodel_path = os.path.join(opt.results_path, "best_model.pt")
64
+ torch.save(checkpoint, bestmodel_path)
65
+ logger.info(f"Save checkpoint at {bestmodel_path}")
66
+ logger.info("")
67
+
68
+ if __name__ == '__main__':
69
+ main()
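
The checkpoint written above stores the weights, the model config, and the epoch index; a minimal sketch of reloading it, mirroring what prepare_model does when --checkpoint is passed (the path follows run_top20.sh's --results_path).

import torch

checkpoint = torch.load("results/tvr_ranking/best_model.pt", map_location="cpu")
print(checkpoint["epoch"], list(checkpoint.keys()))  # e.g. 42 ['model', 'model_cfg', 'epoch']
# Rebuilding the network from model_cfg assumes ReLoCLNet's constructor accepts it:
# model = ReLoCLNet(checkpoint["model_cfg"]); model.load_state_dict(checkpoint["model"])
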
utils/__init__.py ADDED
File without changes
utils/basic_utils.py ADDED
@@ -0,0 +1,270 @@
1
+ import os
2
+ import json
3
+ import zipfile
4
+ import numpy as np
5
+ import pickle
6
+ import yaml
7
+
8
+ def uniform_feature_sampling(features, max_len):
9
+ num_clips = features.shape[0]
10
+ if max_len is None or num_clips <= max_len:
11
+ return features
12
+ idxs = np.arange(0, max_len + 1, 1.0) / max_len * num_clips
13
+ idxs = np.round(idxs).astype(np.int32)
14
+ idxs[idxs > num_clips - 1] = num_clips - 1
15
+ new_features = []
16
+ for i in range(max_len):
17
+ s_idx, e_idx = idxs[i], idxs[i + 1]
18
+ if s_idx < e_idx:
19
+ new_features.append(np.mean(features[s_idx:e_idx], axis=0))
20
+ else:
21
+ new_features.append(features[s_idx])
22
+ new_features = np.asarray(new_features)
23
+ return new_features
24
+
25
+
26
+ def compute_overlap(pred, gt):
27
+ # check format
28
+ assert isinstance(pred, list) and isinstance(gt, list)
29
+ pred_is_list = isinstance(pred[0], list)
30
+ gt_is_list = isinstance(gt[0], list)
31
+ pred = pred if pred_is_list else [pred]
32
+ gt = gt if gt_is_list else [gt]
33
+ # compute overlap
34
+ pred, gt = np.array(pred), np.array(gt)
35
+ inter_left = np.maximum(pred[:, 0, None], gt[None, :, 0])
36
+ inter_right = np.minimum(pred[:, 1, None], gt[None, :, 1])
37
+ inter = np.maximum(0.0, inter_right - inter_left)
38
+ union_left = np.minimum(pred[:, 0, None], gt[None, :, 0])
39
+ union_right = np.maximum(pred[:, 1, None], gt[None, :, 1])
40
+ union = np.maximum(1e-12, union_right - union_left)
41
+ overlap = 1.0 * inter / union
42
+ # reformat output
43
+ overlap = overlap if gt_is_list else overlap[:, 0]
44
+ overlap = overlap if pred_is_list else overlap[0]
45
+ return overlap
46
+
47
+
48
+ def time_to_index(start_time, end_time, num_units, duration):
49
+ s_times = np.arange(0, num_units).astype(np.float32) / float(num_units) * duration
50
+ e_times = np.arange(1, num_units + 1).astype(np.float32) / float(num_units) * duration
51
+ candidates = np.stack([np.repeat(s_times[:, None], repeats=num_units, axis=1),
52
+ np.repeat(e_times[None, :], repeats=num_units, axis=0)], axis=2).reshape((-1, 2))
53
+ overlaps = compute_overlap(candidates.tolist(), [start_time, end_time]).reshape(num_units, num_units)
54
+ start_index = np.argmax(overlaps) // num_units
55
+ end_index = np.argmax(overlaps) % num_units
56
+ return start_index, end_index
57
+
58
+
59
+ def load_yaml(filename):
60
+ try:
61
+ with open(filename, 'r') as file:
62
+ return yaml.safe_load(file)
63
+ except yaml.YAMLError as exc:
64
+ print(f"Error parsing YAML file: {exc}")
65
+ return None
66
+ except FileNotFoundError:
67
+ print(f"File not found: {filename}")
68
+ return None
69
+
70
+
71
+ def load_pickle(filename):
72
+ with open(filename, "rb") as f:
73
+ return pickle.load(f)
74
+
75
+
76
+ def save_pickle(data, filename):
77
+ with open(filename, "wb") as f:
78
+ pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
79
+
80
+
81
+ def load_json(filename):
82
+ with open(filename, "r") as f:
83
+ return json.load(f)
84
+
85
+
86
+ def save_json(data, filename, save_pretty=False, sort_keys=False):
87
+ with open(filename, "w") as f:
88
+ if save_pretty:
89
+ f.write(json.dumps(data, indent=4, sort_keys=sort_keys))
90
+ else:
91
+ json.dump(data, f)
92
+
93
+
94
+ def load_jsonl(filename):
95
+ with open(filename, "r") as f:
96
+ return [json.loads(l.strip("\n")) for l in f.readlines()]
97
+
98
+
99
+ def save_jsonl(data, filename):
100
+ """data is a list"""
101
+ with open(filename, "w") as f:
102
+ f.write("\n".join([json.dumps(e) for e in data]))
103
+
104
+
105
+ def save_lines(list_of_str, filepath):
106
+ with open(filepath, "w") as f:
107
+ f.write("\n".join(list_of_str))
108
+
109
+
110
+ def read_lines(filepath):
111
+ with open(filepath, "r") as f:
112
+ return [e.strip("\n") for e in f.readlines()]
113
+
114
+
115
+ def mkdirp(p):
116
+ if not os.path.exists(p):
117
+ os.makedirs(p)
118
+
119
+
120
+ def flat_list_of_lists(l):
121
+ """flatten a list of lists [[1,2], [3,4]] to [1,2,3,4]"""
122
+ return [item for sublist in l for item in sublist]
123
+
124
+
125
+ def convert_to_seconds(hms_time):
126
+ """ convert '00:01:12' to 72 seconds.
127
+ :hms_time (str): time as a colon-separated string, e.g. '00:01:12'
128
+ :return (int): time in seconds, e.g. 72
129
+ """
130
+ times = [float(t) for t in hms_time.split(":")]
131
+ return times[0] * 3600 + times[1] * 60 + times[2]
132
+
133
+
134
+ def get_video_name_from_url(url):
135
+ return url.split("/")[-1][:-4]
136
+
137
+
138
+ def merge_dicts(list_dicts):
139
+ merged_dict = list_dicts[0].copy()
140
+ for i in range(1, len(list_dicts)):
141
+ merged_dict.update(list_dicts[i])
142
+ return merged_dict
143
+
144
+
145
+ def l2_normalize_np_array(np_array, eps=1e-5):
146
+ """np_array: np.ndarray, (*, D), where the last dim will be normalized"""
147
+ return np_array / (np.linalg.norm(np_array, axis=-1, keepdims=True) + eps)
148
+
149
+
150
+ def make_zipfile(src_dir, save_path, enclosing_dir="", exclude_dirs=None, exclude_extensions=None,
151
+ exclude_dirs_substring=None):
152
+ """make a zip file of root_dir, save it to save_path.
153
+ Directories in exclude_dirs are excluded if they are subdirectories of src_dir.
154
+ An enclosing_dir is added if specified.
155
+ """
156
+ abs_src = os.path.abspath(src_dir)
157
+ with zipfile.ZipFile(save_path, "w") as zf:
158
+ for dirname, subdirs, files in os.walk(src_dir):
159
+ if exclude_dirs is not None:
160
+ for e_p in exclude_dirs:
161
+ if e_p in subdirs:
162
+ subdirs.remove(e_p)
163
+ if exclude_dirs_substring is not None:
164
+ to_rm = []
165
+ for d in subdirs:
166
+ if exclude_dirs_substring in d:
167
+ to_rm.append(d)
168
+ for e in to_rm:
169
+ subdirs.remove(e)
170
+ arcname = os.path.join(enclosing_dir, dirname[len(abs_src) + 1:])
171
+ zf.write(dirname, arcname)
172
+ for filename in files:
173
+ if exclude_extensions is not None:
174
+ if os.path.splitext(filename)[1] in exclude_extensions:
175
+ continue # do not zip it
176
+ absname = os.path.join(dirname, filename)
177
+ arcname = os.path.join(enclosing_dir, absname[len(abs_src) + 1:])
178
+ zf.write(absname, arcname)
179
+
180
+
181
+ class AverageMeter(object):
182
+ """Computes and stores the average and current/max/min value"""
183
+ def __init__(self):
184
+ self.val = 0
185
+ self.avg = 0
186
+ self.sum = 0
187
+ self.count = 0
188
+ self.max = -1e10
189
+ self.min = 1e10
190
+ self.reset()
191
+
192
+ def reset(self):
193
+ self.val = 0
194
+ self.avg = 0
195
+ self.sum = 0
196
+ self.count = 0
197
+ self.max = -1e10
198
+ self.min = 1e10
199
+
200
+ def update(self, val, n=1):
201
+ self.max = max(val, self.max)
202
+ self.min = min(val, self.min)
203
+ self.val = val
204
+ self.sum += val * n
205
+ self.count += n
206
+ self.avg = self.sum / self.count
207
+
208
+
209
+ def dissect_by_lengths(np_array, lengths, dim=0, assert_equal=True):
210
+ """Dissect an array (N, D) into a list a sub-array,
211
+ np_array.shape[0] == sum(lengths). Output is a list of nd arrays; the singleton dimension is kept."""
212
+ if assert_equal:
213
+ assert len(np_array) == sum(lengths)
214
+ length_indices = [0, ]
215
+ for i in range(len(lengths)):
216
+ length_indices.append(length_indices[i] + lengths[i])
217
+ if dim == 0:
218
+ array_list = [np_array[length_indices[i]:length_indices[i+1]] for i in range(len(lengths))]
219
+ elif dim == 1:
220
+ array_list = [np_array[:, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))]
221
+ elif dim == 2:
222
+ array_list = [np_array[:, :, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))]
223
+ else:
224
+ raise NotImplementedError
225
+ return array_list
226
+
227
+
228
+ def get_ratio_from_counter(counter_obj, threshold=200):
229
+ keys = counter_obj.keys()
230
+ values = counter_obj.values()
231
+ filtered_values = [counter_obj[k] for k in keys if k > threshold]
232
+ return float(sum(filtered_values)) / sum(values)
233
+
234
+
235
+ def get_show_name(vid_name):
236
+ """
237
+ get tvshow name from vid_name
238
+ :param vid_name: video clip name
239
+ :return: tvshow name
240
+ """
241
+ show_list = ["friends", "met", "castle", "house", "grey"]
242
+ vid_name_prefix = vid_name.split("_")[0]
243
+ show_name = vid_name_prefix if vid_name_prefix in show_list else "bbt"
244
+ return show_name
245
+
246
+
247
+ import time
248
+ import logging
249
+ import os
250
+
251
+ def get_logger(log_dir, exp_id):
252
+ os.makedirs(log_dir, exist_ok=True)
253
+ log_file = time.strftime("%Y%m%d_%H%M%S", time.localtime())
254
+ log_file = os.path.join(log_dir, "{}_{}.log".format(log_file, exp_id))
255
+
256
+ logger = logging.getLogger()
257
+ logger.setLevel('DEBUG')
258
+ BASIC_FORMAT = "%(levelname)s:%(message)s"
259
+ # DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
260
+ formatter = logging.Formatter(BASIC_FORMAT)
261
+ chlr = logging.StreamHandler()
262
+ chlr.setFormatter(formatter)
263
+
264
+ fhlr = logging.FileHandler(log_file)
265
+ fhlr.setFormatter(formatter)
266
+ fhlr.setLevel('INFO')
267
+
268
+ logger.addHandler(chlr)
269
+ logger.addHandler(fhlr)
270
+ return logger
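
A quick illustration of uniform_feature_sampling and l2_normalize_np_array from this file, on a toy array.

import numpy as np
from utils.basic_utils import uniform_feature_sampling, l2_normalize_np_array

feat = np.arange(20, dtype=np.float32).reshape(10, 2)  # 10 clips, 2-d features
down = uniform_feature_sampling(feat, max_len=4)       # averaged into 4 segments
print(down.shape)                                      # (4, 2)
print(np.linalg.norm(l2_normalize_np_array(down), axis=-1))  # ~[1. 1. 1. 1.]
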
utils/run_utils.py ADDED
@@ -0,0 +1,112 @@
1
+ import torch
2
+ from modules.ReLoCLNet import ReLoCLNet
3
+ from modules.optimization import BertAdam
4
+ import numpy as np
5
+
6
+ def count_parameters(model, verbose=True):
7
+ """Count number of parameters in PyTorch model,
8
+ References: https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/7.
9
+
10
+ from utils.utils import count_parameters
11
+ count_parameters(model)
12
+ import sys
13
+ sys.exit(1)
14
+ """
15
+ n_all = sum(p.numel() for p in model.parameters())
16
+ n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
17
+ if verbose:
18
+ print("Parameter Count: all {:,d}; trainable {:,d}".format(n_all, n_trainable))
19
+ return n_all, n_trainable
20
+
21
+ def prepare_model(opt, logger):
22
+ model = ReLoCLNet(opt)
23
+ count_parameters(model)
24
+
25
+ if opt.checkpoint is not None:
26
+ checkpoint = torch.load(opt.checkpoint, map_location=opt.device)
27
+ model.load_state_dict(checkpoint['model'])
28
+ logger.info(f"Loading checkpoint from {opt.checkpoint}")
29
+
30
+ # Prepare optimizer (unchanged)
31
+ if opt.device.type == "cuda":
32
+ logger.info("CUDA enabled.")
33
+ model.to(opt.device)
34
+ return model
35
+
36
+ def prepare_optimizer(model, opt, total_train_steps):
37
+
38
+ param_optimizer = list(model.named_parameters())
39
+ no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
40
+ optimizer_grouped_parameters = [
41
+ {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
42
+ {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}]
43
+
44
+ optimizer = BertAdam(optimizer_grouped_parameters, lr=opt.lr, weight_decay=opt.wd, warmup=opt.lr_warmup_proportion,
45
+ t_total=total_train_steps, schedule="warmup_linear")
46
+
47
+ return optimizer
48
+
49
+
50
+ def topk_3d(tensor, k):
51
+ """
52
+ Find the top k values and their corresponding indices in a 3D tensor.
53
+
54
+ Args:
55
+ tensor (torch.Tensor): A 3D tensor of shape [v, m, n].
56
+ k (int): The number of top elements to find.
57
+
58
+ Returns:
59
+ topk_values (torch.Tensor): The top k values.
60
+ indices_3d (torch.Tensor): The indices of the top k values in the format [i, j, k].
61
+ """
62
+ # Step 1: Flatten the tensor to 1D
63
+ flat_tensor = tensor.view(-1)
64
+
65
+ # Step 2: Find the top k values and their indices in the flattened tensor
66
+ topk_values, topk_indices = torch.topk(flat_tensor, k)
67
+
68
+ # Step 3: Convert the flat indices back to the original 3D tensor's indices
69
+ v, m, n = tensor.shape
70
+ indices_3d = torch.stack(torch.unravel_index(topk_indices, (v, m, n)), dim=1)
71
+
72
+ return topk_values, indices_3d
73
+
74
+
75
+ def generate_min_max_length_mask(array_shape, min_l, max_l):
76
+ """ The last two dimension denotes matrix of upper-triangle with upper-right corner masked,
77
+ below is the case for 4x4.
78
+ [[0, 1, 1, 0],
79
+ [0, 0, 1, 1],
80
+ [0, 0, 0, 1],
81
+ [0, 0, 0, 0]]
82
+ Args:
83
+ array_shape: tuple of ints (e.g. from np.shape); the last two dimensions should be the same
84
+ min_l: int, minimum length of predicted span
85
+ max_l: int, maximum length of predicted span
86
+ Returns:
87
+ """
88
+ single_dims = (1, ) * (len(array_shape) - 2)
89
+ mask_shape = single_dims + array_shape[-2:]
90
+ extra_length_mask_array = np.ones(mask_shape, dtype=np.float32) # (1, ..., 1, L, L)
91
+ mask_triu = np.triu(extra_length_mask_array, k=min_l)
92
+ mask_triu_reversed = 1 - np.triu(extra_length_mask_array, k=max_l)
93
+ final_prob_mask = mask_triu * mask_triu_reversed
94
+ return final_prob_mask # with valid bit to be 1
95
+
96
+
97
+ def extract_topk_elements(query_scores, start_probs, end_probs, k):
98
+
99
+ # Step 1: Find the top k values and their indices in query_scores
100
+ topk_values, topk_indices = torch.topk(query_scores, k)
101
+
102
+ # Step 2: Use these indices to select the corresponding elements from start_probs and end_probs
103
+ selected_start_probs = torch.stack([start_probs[i, indices] for i, indices in enumerate(topk_indices)], dim=0)
104
+ selected_end_probs = torch.stack([end_probs[i, indices] for i, indices in enumerate(topk_indices)], dim=0)
105
+
106
+ return topk_values, selected_start_probs, selected_end_probs
107
+
108
+ def logger_ndcg_iou(val_ndcg_iou, logger, suffix):
109
+ for K, vs in val_ndcg_iou.items():
110
+ for T, v in vs.items():
111
+ logger.info(f"{suffix} NDCG@{K}, IoU={T}: {v:.6f}")
112
+ logger.info("")
utils/setup.py ADDED
@@ -0,0 +1,101 @@
1
+ import random, torch, os
2
+ import numpy as np
3
+ import argparse
4
+
5
+
6
+ def get_args():
7
+ parser = argparse.ArgumentParser()
8
+ parser.add_argument("--train_path", type=str, default=None)
9
+ parser.add_argument("--corpus_path", type=str, default=None)
10
+ parser.add_argument("--val_path", type=str, default=None)
11
+ parser.add_argument("--test_path", type=str, default=None)
12
+ parser.add_argument("--video_feat_path", type=str, default="")
13
+
14
+ parser.add_argument("--desc_bert_path", type=str, default=None)
15
+ parser.add_argument("--sub_bert_path", type=str, default=None)
16
+ parser.add_argument("--results_path", type=str, default="results")
17
+
18
+ # setup
19
+ parser.add_argument("--checkpoint", type=str, default=None)
20
+ parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training")
21
+ parser.add_argument("--seed", type=int, default=2024, help="random seed")
22
+ parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu")
23
+ parser.add_argument("--num_workers", type=int, default=4, help="num subprocesses used to load the data, 0: use main process")
24
+
25
+ # dataloader
26
+
27
+
28
+ # training config
29
+ parser.add_argument("--bsz", type=int, default=128, help="mini-batch size")
30
+ parser.add_argument("--bsz_eval", type=int, default=16, help="mini-batch size")
31
+ parser.add_argument("--n_epoch", type=int, default=100, help="number of epochs to run")
32
+ parser.add_argument("--eval_num_per_epoch", type=float, default=1.0, help="eval times during each epoch")
33
+ parser.add_argument("--log_step", type=int, default=100)
34
+ parser.add_argument("--lr", type=float, default=1e-4, help="learning rate")
35
+ parser.add_argument("--lr_warmup_proportion", type=float, default=0.01, help="Proportion of training to perform linear learning rate warmup.")
36
+ parser.add_argument("--wd", type=float, default=0.01, help="weight decay")
37
+
38
+
39
+ # Model loss
40
+ parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss")
41
+ parser.add_argument("--lw_neg_q", type=float, default=1, help="weight for ranking loss with negative query and positive context")
42
+ parser.add_argument("--lw_neg_ctx", type=float, default=1, help="weight for ranking loss with positive query and negative context")
43
+ parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for st ed prediction loss")
44
+ parser.add_argument("--lw_fcl", type=float, default=0.03, help="weight for frame CL loss")
45
+ parser.add_argument("--lw_vcl", type=float, default=0.03, help="weight for video CL loss")
46
+ parser.add_argument("--ranking_loss_type", type=str, default="hinge", choices=["hinge", "lse"], help="att loss type, can be hinge loss or its smooth approximation LogSumExp")
47
+ parser.add_argument("--hard_negative_start_epoch", type=int, default=20, help="which epoch to start hard negative sampling for video-level ranking loss, use -1 to disable")
48
+ parser.add_argument("--hard_pool_size", type=int, default=20, help="hard negatives are still sampled, but from a harder pool.")
49
+ parser.add_argument("--use_hard_negative", type=bool, default=False)
50
+ # Data config
51
+ parser.add_argument("--ctx_mode", type=str, default="video_sub", help="which context to use a combination of [video, sub, tef]")
52
+ parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions")
53
+ parser.add_argument("--max_ctx_l", type=int, default=128, help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100")
54
+ parser.add_argument("--clip_length", type=float, default=1.5, help="each video will be uniformly segmented into small clips, will automatically loaded from ProposalConfigs if None")
55
+
56
+ parser.add_argument("--no_norm_vfeat", action="store_true", help="Do not do normalization on video feat, use it only when using resnet_i3d feat")
57
+ parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat")
58
+
59
+ # Model config
60
+ parser.add_argument("--visual_input_size", type=int, default=1024)
61
+ parser.add_argument("--sub_input_size", type=int, default=768)
62
+ parser.add_argument("--query_input_size", type=int, default=768)
63
+
64
+ parser.add_argument("--max_position_embeddings", type=int, default=300)
65
+ parser.add_argument("--hidden_size", type=int, default=384)
66
+ parser.add_argument("--n_heads", type=int, default=8)
67
+ parser.add_argument("--input_drop", type=float, default=0.1, help="Applied to all inputs")
68
+ parser.add_argument("--drop", type=float, default=0.1, help="Applied to all other layers")
69
+ parser.add_argument("--conv_kernel_size", type=int, default=5)
70
+ parser.add_argument("--conv_stride", type=int, default=1)
71
+ parser.add_argument("--initializer_range", type=float, default=0.02, help="initializer range for layers")
72
+
73
+
74
+ # post processing
75
+ parser.add_argument("--min_pred_l", type=int, default=2, help="constrain the [st, ed] with ed - st >= 2 (2 clips with length 1.5 each, 3 secs in total this is the min length for proposal-based backup_method)")
76
+ parser.add_argument("--max_pred_l", type=int, default=16, help="constrain the [st, ed] pairs with ed - st <= 16, 24 secs in total (16 clips with length 1.5 each, this is the max length for proposal-based backup_method)")
77
+ parser.add_argument("--q2c_alpha", type=float, default=30, help="give more importance to top scored videos' spans, the new score will be: s_new = exp(alpha * s), igher alpha indicates more importance. Note s in [-1, 1]")
78
+ parser.add_argument("--max_before_nms", type=int, default=200)
79
+ parser.add_argument("--max_vcmr_video", type=int, default=100, help="re-ranking in top-max_vcmr_video")
80
+ parser.add_argument("--nms_thd", type=float, default=-1, help="additionally use non-maximum suppression (or non-minimum suppression for distance) to post-processing the predictions. -1: do not use nms. 0.6 for charades_sta, 0.5 for anet_cap")
81
+
82
+ # evaluation
83
+ parser.add_argument("--iou_threshold", type=float, nargs='+', default=[0.3, 0.5, 0.7], help="List of IOU thresholds")
84
+ parser.add_argument("--ndcg_topk", type=int, nargs='+', default=[10, 20, 40], help="List of NDCG top k values")
85
+ args = parser.parse_args()
86
+
87
+
88
+ os.makedirs(args.results_path, exist_ok=True)
89
+ if args.hard_negative_start_epoch != -1:
90
+ if args.hard_pool_size > args.bsz:
91
+ print("[WARNING] hard_pool_size is larger than bsz")
92
+
93
+ return args
94
+
95
+
96
+ def set_seed(seed, use_cuda=True):
97
+ random.seed(seed)
98
+ np.random.seed(seed)
99
+ torch.manual_seed(seed)
100
+ if use_cuda:
101
+ torch.cuda.manual_seed_all(seed)
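For reference, a minimal sketch of how this configuration and set_seed might be wired into a training entry point. The module name config and the function name get_args() are assumptions for illustration only; they are not part of this commit.

import torch

from config import get_args, set_seed  # hypothetical module/function names, not from this commit

def main():
    args = get_args()  # parses all of the flags defined above
    use_cuda = args.device >= 0 and torch.cuda.is_available()
    set_seed(args.seed, use_cuda=use_cuda)
    device = torch.device("cuda:%d" % args.device if use_cuda else "cpu")
    print("exp_id=%s, device=%s, results_path=%s" % (args.exp_id, device, args.results_path))

if __name__ == "__main__":
    main()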
utils/temporal_nms.py ADDED
@@ -0,0 +1,74 @@
+ """
+ Non-Maximum Suppression for video proposals.
+ """
+
+
+ def compute_temporal_iou(pred, gt):
+     """ deprecated due to performance concerns
+     compute intersection-over-union along temporal axis
+     Args:
+         pred: [st (float), ed (float)]
+         gt: [st (float), ed (float)]
+     Returns:
+         iou (float)
+
+     Ref: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py
+     """
+     intersection = max(0, min(pred[1], gt[1]) - max(pred[0], gt[0]))
+     union = max(pred[1], gt[1]) - min(pred[0], gt[0])  # not the correct union though
+     if union == 0:
+         return 0
+     else:
+         return 1.0 * intersection / union
+
+
+ def temporal_non_maximum_suppression(predictions, nms_threshold, max_after_nms=100):
+     """
+     Args:
+         predictions: list(sublist), each sublist is [st (float), ed (float), score (float)],
+             note larger scores are better and are preserved. For metrics that are better when smaller,
+             please convert to their negatives, e.g., convert distance to negative distance.
+         nms_threshold: float in [0, 1]
+         max_after_nms: int, maximum number of predictions to keep after nms
+     Returns:
+         predictions_after_nms: list(sublist), each sublist is [st (float), ed (float), score (float)]
+     References:
+         https://github.com/wzmsltw/BSN-boundary-sensitive-network/blob/7b101fc5978802aa3c95ba5779eb54151c6173c6/Post_processing.py#L42
+     """
+     if len(predictions) == 1:  # only has one prediction, no need for nms
+         return predictions
+
+     predictions = sorted(predictions, key=lambda x: x[2], reverse=True)  # descending order
+
+     tstart = [e[0] for e in predictions]
+     tend = [e[1] for e in predictions]
+     tscore = [e[2] for e in predictions]
+     rstart = []
+     rend = []
+     rscore = []
+     while len(tstart) > 1 and len(rscore) < max_after_nms:  # max 100 after nms
+         idx = 1
+         while idx < len(tstart):  # compare with every prediction in the list.
+             if compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]]) > nms_threshold:
+                 # rm highly overlapped lower score entries.
+                 tstart.pop(idx)
+                 tend.pop(idx)
+                 tscore.pop(idx)
+                 # print("--------------------------------")
+                 # print(compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]]))
+                 # print([tstart[0], tend[0]], [tstart[idx], tend[idx]])
+                 # print(tstart.pop(idx), tend.pop(idx), tscore.pop(idx))
+             else:
+                 # move to next
+                 idx += 1
+         rstart.append(tstart.pop(0))
+         rend.append(tend.pop(0))
+         rscore.append(tscore.pop(0))
+
+     if len(rscore) < max_after_nms and len(tstart) >= 1:  # add the last remaining prediction, if any
+         rstart.append(tstart.pop(0))
+         rend.append(tend.pop(0))
+         rscore.append(tscore.pop(0))
+
+     predictions_after_nms = [[st, ed, s] for s, st, ed in zip(rscore, rstart, rend)]
+     return predictions_after_nms
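To make the behavior of the NMS routine above concrete, a small self-contained example (the proposal values are made up for illustration):

from utils.temporal_nms import temporal_non_maximum_suppression

# proposals as [st, ed, score]; the 2nd and 4th heavily overlap the 1st
proposals = [
    [0.0, 10.0, 0.9],
    [1.0, 11.0, 0.8],   # IoU with the first ~0.82 -> suppressed at threshold 0.5
    [30.0, 40.0, 0.7],  # no overlap with the first -> kept
    [0.5, 9.0, 0.6],    # IoU with the first ~0.85 -> suppressed
]
kept = temporal_non_maximum_suppression(proposals, nms_threshold=0.5)
# kept == [[0.0, 10.0, 0.9], [30.0, 40.0, 0.7]]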
utils/tensor_utils.py ADDED
@@ -0,0 +1,141 @@
+ import numpy as np
+ import torch
+
+
+ def pad_sequences_1d(sequences, dtype=torch.long, device=torch.device("cpu"), fixed_length=None):
+     """ Pad a single-nested list or a sequence of n-d arrays (torch.tensor or np.ndarray)
+     into a (n+1)-d array; only the first dim is allowed to have variable lengths.
+     Args:
+         sequences: list(n-d tensor or list)
+         dtype: np.dtype or torch.dtype
+         device: torch.device, used only for torch dtypes
+         fixed_length: pad all seqs in sequences to this fixed length. All seqs should have a length <= fixed_length.
+             The return will be of shape [len(sequences), fixed_length, ...]
+     Returns:
+         padded_seqs: ((n+1)-d tensor) padded with zeros
+         mask: (2d tensor) of the same shape as the first two dims of padded_seqs,
+             1 indicates valid, 0 otherwise
+     Examples:
+         >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]]
+         >>> pad_sequences_1d(test_data_list, dtype=torch.long)
+         >>> test_data_3d = [torch.randn(2,3,4), torch.randn(4,3,4), torch.randn(1,3,4)]
+         >>> pad_sequences_1d(test_data_3d, dtype=torch.float)
+         >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]]
+         >>> pad_sequences_1d(test_data_list, dtype=np.float32)
+         >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)]
+         >>> pad_sequences_1d(test_data_3d, dtype=np.float32)
+     """
+     if isinstance(sequences[0], list):
+         if "torch" in str(dtype):
+             sequences = [torch.tensor(s, dtype=dtype, device=device) for s in sequences]
+         else:
+             sequences = [np.asarray(s, dtype=dtype) for s in sequences]
+
+     extra_dims = sequences[0].shape[1:]  # the extra dims should be the same for all elements
+     lengths = [len(seq) for seq in sequences]
+     if fixed_length is not None:
+         max_length = fixed_length
+     else:
+         max_length = max(lengths)
+     if isinstance(sequences[0], torch.Tensor):
+         assert "torch" in str(dtype), "dtype and input type do not match"
+         padded_seqs = torch.zeros((len(sequences), max_length) + extra_dims, dtype=dtype, device=device)
+         mask = torch.zeros((len(sequences), max_length), dtype=torch.float32, device=device)
+     else:  # np
+         assert "numpy" in str(dtype), "dtype and input type do not match"
+         padded_seqs = np.zeros((len(sequences), max_length) + extra_dims, dtype=dtype)
+         mask = np.zeros((len(sequences), max_length), dtype=np.float32)
+
+     for idx, seq in enumerate(sequences):
+         end = lengths[idx]
+         padded_seqs[idx, :end] = seq
+         mask[idx, :end] = 1
+     return padded_seqs, mask  # , lengths
+
+
+ def pad_sequences_2d(sequences, dtype=torch.long):
+     """ Pad a double-nested list or a sequence of n-d torch tensors into a (n+1)-d tensor;
+     only the first two dims are allowed to have variable lengths.
+     Args:
+         sequences: list(n-d tensor or list)
+         dtype: torch.long for word indices / torch.float (float32) for other cases
+     Returns:
+     Examples:
+         >>> test_data_list = [[[1, 3, 5], [3, 7, 4, 1]], [[98, 34, 11, 89, 90], [22], [34, 56]],]
+         >>> pad_sequences_2d(test_data_list, dtype=torch.long)  # torch.Size([2, 3, 5])
+         >>> test_data_3d = [torch.randn(2,2,4), torch.randn(4,3,4), torch.randn(1,5,4)]
+         >>> pad_sequences_2d(test_data_3d, dtype=torch.float)  # torch.Size([3, 4, 5, 4])
+         >>> test_data_3d2 = [[torch.randn(2,4), ], [torch.randn(3,4), torch.randn(5,4)]]
+         >>> pad_sequences_2d(test_data_3d2, dtype=torch.float)  # torch.Size([2, 2, 5, 4])
+     # TODO add support for numpy array
+     """
+     bsz = len(sequences)
+     para_lengths = [len(seq) for seq in sequences]
+     max_para_len = max(para_lengths)
+     sen_lengths = [[len(word_seq) for word_seq in seq] for seq in sequences]
+     max_sen_len = max([max(e) for e in sen_lengths])
+
+     if isinstance(sequences[0], torch.Tensor):
+         extra_dims = sequences[0].shape[2:]
+     elif isinstance(sequences[0][0], torch.Tensor):
+         extra_dims = sequences[0][0].shape[1:]
+     else:
+         sequences = [[torch.tensor(word_seq, dtype=dtype) for word_seq in seq] for seq in sequences]
+         extra_dims = ()
+
+     padded_seqs = torch.zeros((bsz, max_para_len, max_sen_len) + extra_dims, dtype=dtype)
+     mask = torch.zeros(bsz, max_para_len, max_sen_len).float()
+
+     for b_i in range(bsz):
+         for sen_i, sen_l in enumerate(sen_lengths[b_i]):
+             padded_seqs[b_i, sen_i, :sen_l] = sequences[b_i][sen_i]
+             mask[b_i, sen_i, :sen_l] = 1
+     return padded_seqs, mask  # , sen_lengths
+
+
+ def find_max_triples(st_prob, ed_prob, top_n=5, prob_thd=None, tensor_type="torch"):
+     """ Find a list of (k1, k2) where k1 < k2 with the maximum values of st_prob[k1] * ed_prob[k2]
+     Args:
+         st_prob (torch.Tensor or np.ndarray): (N, L) batched start_idx probabilities
+         ed_prob (torch.Tensor or np.ndarray): (N, L) batched end_idx probabilities
+         top_n (int): return topN pairs with highest values
+         prob_thd (float or None): keep only pairs with st_prob * ed_prob >= prob_thd
+         tensor_type: str, np or torch
+     Returns:
+         batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...]
+     """
+     if tensor_type == "torch":
+         st_prob, ed_prob = st_prob.detach().cpu().numpy(), ed_prob.detach().cpu().numpy()
+     product = np.einsum("bm,bn->bmn", st_prob, ed_prob)
+     # (N, L, L) the lower part becomes zeros, start_idx < ed_idx
+     upper_product = np.triu(product, k=1)
+     return find_max_triples_from_upper_triangle_product(upper_product, top_n=top_n, prob_thd=prob_thd)
+
+
+ def find_max_triples_from_upper_triangle_product(upper_product, top_n=5, prob_thd=None):
+     """ Find a list of (k1, k2) where k1 < k2 with the maximum values of p1[k1] * p2[k2]
+     Args:
+         upper_product (torch.Tensor or np.ndarray): (N, L, L), the lower part becomes zeros, end_idx > start_idx
+         top_n (int): return topN pairs with highest values
+         prob_thd (float or None):
+     Returns:
+         batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...]
+     """
+     batched_sorted_triple = []
+     for idx, e in enumerate(upper_product):
+         sorted_triple = top_n_array_2d(e, top_n=top_n)
+         if prob_thd is not None:
+             sorted_triple = sorted_triple[sorted_triple[:, 2] >= prob_thd]
+         batched_sorted_triple.append(sorted_triple)
+     return batched_sorted_triple
+
+
+ def top_n_array_2d(array_2d, top_n):
+     """ Get the top_n largest entries of a 2d array; returns a (top_n, 3) array whose rows are
+     [row_idx, column_idx, value], ranked by value in descending order.
+     """
+     row_indices, column_indices = np.unravel_index(np.argsort(array_2d, axis=None), array_2d.shape)
+     row_indices = row_indices[::-1][:top_n]
+     column_indices = column_indices[::-1][:top_n]
+     sorted_values = array_2d[row_indices, column_indices]
+     return np.stack([row_indices, column_indices, sorted_values], axis=1)  # (top_n, 3)
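As a quick sanity check of the helpers above, a minimal usage sketch (the probability values are made up; shapes follow the docstrings):

import torch

from utils.tensor_utils import find_max_triples, pad_sequences_1d

# pad two variable-length feature sequences into one batch tensor plus a validity mask
feats = [torch.randn(3, 8), torch.randn(5, 8)]
padded, mask = pad_sequences_1d(feats, dtype=torch.float32)
# padded.shape == (2, 5, 8); mask.shape == (2, 5), with 1 marking valid positions

# pick the top (start, end) pairs with start < end from batched span probabilities
st_prob = torch.tensor([[0.1, 0.6, 0.2, 0.1]])
ed_prob = torch.tensor([[0.1, 0.1, 0.3, 0.5]])
triples = find_max_triples(st_prob, ed_prob, top_n=2)
# triples[0] is a (2, 3) array of [st_idx, ed_idx, st_prob*ed_prob],
# here approximately [[1., 3., 0.30], [1., 2., 0.18]]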