English
File size: 6,778 Bytes
5019d3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae63ab
5019d3f
dae63ab
5019d3f
 
 
dae63ab
5019d3f
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import random, torch, os
import numpy as np
import argparse


def _str2bool(value):
    """Convert a command-line token into a real boolean.

    argparse's ``type=bool`` simply calls ``bool(token)``, which is True for
    every non-empty string (including "False"), so an explicit parser is used.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


def get_args(argv=None):
    """Build the argument parser, parse the options and prepare the results dir.

    Args:
        argv: optional list of argument strings. When None (the default),
            argparse falls back to ``sys.argv[1:]``, i.e. the original behavior.

    Returns:
        argparse.Namespace with all options. ``results_path`` is extended with
        ``exp_id`` (when one is given) and the directory is created.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_path", type=str, default=None)
    parser.add_argument("--corpus_path", type=str, default=None)
    parser.add_argument("--val_path", type=str, default=None)
    parser.add_argument("--test_path", type=str, default=None)
    parser.add_argument("--video_feat_path", type=str, default="")

    parser.add_argument("--desc_bert_path", type=str, default=None)
    parser.add_argument("--sub_bert_path", type=str, default=None)
    parser.add_argument("--results_path", type=str, default="results")

    # setup
    parser.add_argument("--checkpoint", type=str, default=None)
    parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training")
    parser.add_argument("--seed", type=int, default=2024, help="random seed")
    parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu")
    parser.add_argument("--num_workers", type=int, default=4, help="num subprocesses used to load the data, 0: use main process")

    # dataloader

    # training config
    parser.add_argument("--bsz", type=int, default=128, help="mini-batch size")
    parser.add_argument("--bsz_eval", type=int, default=16, help="mini-batch size")
    parser.add_argument("--n_epoch", type=int, default=100, help="number of epochs to run")
    parser.add_argument("--eval_num_per_epoch", type=float, default=1.0, help="eval times during each epoch")
    parser.add_argument("--log_step", type=int, default=100)
    parser.add_argument("--lr", type=float, default=1e-4, help="learning rate")
    parser.add_argument("--lr_warmup_proportion", type=float, default=0.01, help="Proportion of training to perform linear learning rate warmup.")
    parser.add_argument("--wd", type=float, default=0.01, help="weight decay")

    # Model loss
    parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss")
    parser.add_argument("--lw_neg_q", type=float, default=1, help="weight for ranking loss with negative query and positive context")
    parser.add_argument("--lw_neg_ctx", type=float, default=1, help="weight for ranking loss with positive query and negative context")
    parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for st ed prediction loss")
    parser.add_argument("--lw_fcl", type=float, default=0.03, help="weight for frame CL loss")
    parser.add_argument("--lw_vcl", type=float, default=0.03, help="weight for video CL loss")
    parser.add_argument("--ranking_loss_type", type=str, default="hinge", choices=["hinge", "lse"], help="att loss type, can be hinge loss or its smooth approximation LogSumExp")
    parser.add_argument("--hard_negative_start_epoch", type=int, default=20, help="which epoch to start hard negative sampling for video-level ranking loss, use -1 to disable")
    parser.add_argument("--hard_pool_size", type=int, default=20, help="hard negatives are still sampled, but from a harder pool.")
    # type=bool would turn "--use_hard_negative False" into True; parse the
    # token explicitly. nargs="?" + const lets the bare flag mean True.
    parser.add_argument("--use_hard_negative", type=_str2bool, nargs="?", const=True, default=False)
    # Data config
    parser.add_argument("--ctx_mode", type=str, default="video_sub", help="which context to use a combination of [video, sub, tef]")
    parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions")
    parser.add_argument("--max_ctx_l", type=int, default=128, help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100")
    parser.add_argument("--clip_length", type=float, default=1.5, help="each video will be uniformly segmented into small clips,  will automatically loaded from ProposalConfigs if None")

    parser.add_argument("--no_norm_vfeat", action="store_true", help="Do not do normalization on video feat, use it only when using resnet_i3d feat")
    parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat")

    # Model config
    parser.add_argument("--visual_input_size", type=int, default=1024)
    parser.add_argument("--sub_input_size", type=int, default=768)
    parser.add_argument("--query_input_size", type=int, default=768)

    parser.add_argument("--max_position_embeddings", type=int, default=300)
    parser.add_argument("--hidden_size", type=int, default=384)
    parser.add_argument("--n_heads", type=int, default=8)
    parser.add_argument("--input_drop", type=float, default=0.1, help="Applied to all inputs")
    parser.add_argument("--drop", type=float, default=0.1, help="Applied to all other layers")
    parser.add_argument("--conv_kernel_size", type=int, default=5)
    parser.add_argument("--conv_stride", type=int, default=1)
    parser.add_argument("--initializer_range", type=float, default=0.02, help="initializer range for layers")

    # post processing
    parser.add_argument("--min_pred_l", type=int, default=2, help="constrain the [st, ed] with ed - st >= 2 (2 clips with length 1.5 each, 3 secs  in total this is the min length for proposal-based backup_method)")
    parser.add_argument("--max_pred_l", type=int, default=16, help="constrain the [st, ed] pairs with ed - st <= 16, 24 secs in total (16 clips  with length 1.5 each, this is the max length for proposal-based backup_method)")
    parser.add_argument("--q2c_alpha", type=float, default=30, help="give more importance to top scored videos' spans, the new score will be: s_new = exp(alpha * s),  igher alpha indicates more importance. Note s in [-1, 1]")
    parser.add_argument("--max_before_nms", type=int, default=200)
    parser.add_argument("--max_vcmr_video", type=int, default=100, help="re-ranking in top-max_vcmr_video")
    parser.add_argument("--nms_thd", type=float, default=-1, help="additionally use non-maximum suppression (or non-minimum suppression for distance) to post-processing the predictions. -1: do not use nms. 0.6 for charades_sta, 0.5 for anet_cap")

    # evaluation
    parser.add_argument("--iou_threshold", type=float, nargs='+', default=[0.3, 0.5, 0.7], help="List of IOU thresholds")
    parser.add_argument("--ndcg_topk", type=int, nargs='+', default=[10, 20, 40], help="List of NDCG top k values")
    args = parser.parse_args(argv)

    # exp_id defaults to None and os.path.join(str, None) raises TypeError,
    # so only nest the results under exp_id when one was actually supplied.
    if args.exp_id is not None:
        args.results_path = os.path.join(args.results_path, args.exp_id)
    else:
        print("[WARNING] exp_id is not set, results are written directly to results_path")
    os.makedirs(args.results_path, exist_ok=True)

    if args.hard_negative_start_epoch != -1 and args.hard_pool_size > args.bsz:
        print("[WARNING] hard_pool_size is larger than bsz")

    return args


def set_seed(seed, use_cuda=True):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: integer seed handed to every generator.
        use_cuda: when true, torch.cuda.manual_seed_all is called with the
            same seed as well.
    """
    # Same order as before: stdlib random, then NumPy, then torch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not use_cuda:
        return
    torch.cuda.manual_seed_all(seed)