Spaces:
Runtime error
Runtime error
initial commit
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- DOCKERFILE +29 -0
- README.md +5 -8
- app.py +71 -0
- configs/datasets/bdd/bdd_dataset.py +44 -0
- configs/datasets/tao/tao_dataset_v05.py +43 -0
- configs/datasets/tao/tao_dataset_v1.py +44 -0
- configs/default_runtime.py +23 -0
- configs/masa-detic/bdd_test/masa_detic_bdd_mot_test.py +224 -0
- configs/masa-detic/bdd_test/masa_detic_bdd_mots_test.py +227 -0
- configs/masa-detic/open_vocabulary_mot_test/masa_detic_swinb_open_vocabulary_test.py +236 -0
- configs/masa-detic/tao_teta_test/masa_detic_swinb_tao_test_detic_dets.py +219 -0
- configs/masa-detic/tao_teta_test/masa_detic_swinb_tao_test_teter_swinT_dets.py +219 -0
- configs/masa-gdino/bdd_test/masa_gdino_bdd_mot_test.py +226 -0
- configs/masa-gdino/bdd_test/masa_gdino_bdd_mots_test.py +227 -0
- configs/masa-gdino/masa_gdino_swinb_inference.py +216 -0
- configs/masa-gdino/masa_gdino_swinb_plug_and_play.py +218 -0
- configs/masa-gdino/open_vocabulary_mot_test/masa_gdino_swinb_open_vocabulary_test.py +236 -0
- configs/masa-gdino/tao_teta_test/masa_gdino_swinb_tao_test_detic_dets.py +235 -0
- configs/masa-gdino/tao_teta_test/masa_gdino_swinb_tao_test_teter_swinT_dets.py +240 -0
- configs/masa-one/bdd_test/masa_r50_bdd_mot_test.py +235 -0
- configs/masa-one/bdd_test/masa_r50_bdd_mots_test.py +238 -0
- configs/masa-one/masa_r50_plug_and_play.py +214 -0
- configs/masa-one/open_vocabulary_mot_test/masa_r50_open_vocabulary_test.py +231 -0
- configs/masa-one/tao_teta_test/masa_r50_tao_test_detic_dets.py +230 -0
- configs/masa-one/tao_teta_test/masa_r50_tao_test_teter_swinT_dets.py +230 -0
- configs/masa-sam/bdd_test/masa_sam_vitb_bdd_mot_test.py +245 -0
- configs/masa-sam/bdd_test/masa_sam_vitb_bdd_mots_test.py +241 -0
- configs/masa-sam/bdd_test/masa_sam_vith_bdd_mot_test.py +246 -0
- configs/masa-sam/bdd_test/masa_sam_vith_bdd_mots_test.py +240 -0
- configs/masa-sam/open_vocabulary_mot_test/masa_sam_vitb_open_vocabulary_test.py +233 -0
- configs/masa-sam/open_vocabulary_mot_test/masa_sam_vith_open_vocabulary_test.py +234 -0
- configs/masa-sam/sam-vitb.py +30 -0
- configs/masa-sam/sam-vith.py +30 -0
- configs/masa-sam/tao_teta_test/masa_sam_vitb_tao_test_detic_dets.py +232 -0
- configs/masa-sam/tao_teta_test/masa_sam_vitb_tao_test_teter_swinT_dets.py +238 -0
- configs/masa-sam/tao_teta_test/masa_sam_vith_tao_test_detic_dets.py +233 -0
- configs/masa-sam/tao_teta_test/masa_sam_vith_tao_test_teter_swinT_dets.py +239 -0
- environment_docker.yml +302 -0
- masa/__init__.py +3 -0
- masa/__pycache__/__init__.cpython-311.pyc +0 -0
- masa/apis/__init__.py +10 -0
- masa/apis/__pycache__/__init__.cpython-311.pyc +0 -0
- masa/apis/__pycache__/masa_inference.cpython-311.pyc +0 -0
- masa/apis/masa_inference.py +297 -0
- masa/datasets/__init__.py +19 -0
- masa/datasets/__pycache__/__init__.cpython-311.pyc +0 -0
- masa/datasets/__pycache__/bdd_masa_dataset.cpython-311.pyc +0 -0
- masa/datasets/__pycache__/dataset_wrappers.cpython-311.pyc +0 -0
- masa/datasets/__pycache__/masa_dataset.cpython-311.pyc +0 -0
- masa/datasets/__pycache__/rsconcat_dataset.cpython-311.pyc +0 -0
DOCKERFILE
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM continuumio/anaconda3:main
|
2 |
+
|
3 |
+
WORKDIR /code
|
4 |
+
COPY ./environment_docker.yml /code/environment_docker.yml
|
5 |
+
|
6 |
+
# Create the conda environment from environment_docker.yml
|
7 |
+
RUN conda env create -f /code/environment_docker.yml
|
8 |
+
|
9 |
+
# Set up a new user named "user" with user ID 1000
|
10 |
+
RUN useradd -m -u 1000 user
|
11 |
+
# Switch to the "user" user
|
12 |
+
USER user
|
13 |
+
# Set home to the user's home directory
|
14 |
+
# HOME gets its own instruction: Docker expands variables with the values they
# had *before* the current instruction, so referencing $HOME inside the same
# ENV that defines it would not expand to /home/user.
ENV HOME=/home/user
# Runtime settings for the Gradio app. $HOME now expands to /home/user, so
# PYTHONPATH matches the WORKDIR the application is copied into.
ENV PYTHONPATH=$HOME/app \
    PYTHONUNBUFFERED=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    SYSTEM=spaces
|
22 |
+
|
23 |
+
# Set the working directory to the app directory inside the user's home
|
24 |
+
WORKDIR $HOME/app
|
25 |
+
|
26 |
+
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
27 |
+
COPY --chown=user . $HOME/app
|
28 |
+
|
29 |
+
CMD ["./run.sh"]
|
README.md
CHANGED
@@ -1,11 +1,8 @@
|
|
1 |
---
|
2 |
-
title: MASA GroundingDINO
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
7 |
-
|
8 |
-
license: mit
|
9 |
---
|
10 |
-
|
11 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: MASA + GroundingDINO Space
|
3 |
+
emoji: 🐳
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: gray
|
6 |
sdk: docker
|
7 |
+
app_port: 7860
|
|
|
8 |
---
|
|
|
|
app.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import tempfile
|
4 |
+
import subprocess
|
5 |
+
|
6 |
+
# Define the function to call the command line script
|
7 |
+
def process_video(uploaded_video_path, texts):
    """Run the MASA + GroundingDINO demo on a video and re-encode the result.

    The tracking itself is delegated to ``demo/video_demo_with_text.py``,
    which is executed as a subprocess; its output is then re-encoded with
    ffmpeg and the path of the re-encoded file is returned.

    Args:
        uploaded_video_path: Filesystem path of the input video.
        texts: Text prompt forwarded to the demo script via ``--texts``.
            ``None`` is treated as an empty prompt.

    Returns:
        Path of the re-encoded ``.mp4`` file.

    Raises:
        ValueError: If no input video path was provided.
        subprocess.CalledProcessError: If the demo script or ffmpeg exits
            with a non-zero status.
    """
    # Fail early with a readable message instead of letting ``None`` reach
    # ``subprocess.run`` and surface as an opaque TypeError.
    if not uploaded_video_path:
        raise ValueError("No input video was provided.")
    if texts is None:
        texts = ""

    # Reserve a unique path for the raw tracker output; the demo script
    # writes to this path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
        output_video_path = tmpfile.name

    # Derive the final path from the extension only, so a '.mp4' substring
    # elsewhere in the temp directory path is left untouched.
    stem, extension = os.path.splitext(output_video_path)
    converted_output_path = f"{stem}_converted{extension}"

    command = [
        "python", "demo/video_demo_with_text.py", uploaded_video_path,
        "--out", output_video_path,
        "--masa_config", "configs/masa-gdino/masa_gdino_swinb_inference.py",
        "--masa_checkpoint", "saved_models/masa_models/gdino_masa.pth",
        "--texts", texts,
        "--score-thr", "0.2",
        "--unified",
        "--show_fps"
    ]

    # Ensure the video is in a compatible format using ffmpeg.
    # NOTE(review): "mpeg4" (MPEG-4 Part 2) may not play in every browser;
    # confirm whether an H.264 encoder is available in the image.
    ffmpeg_command = [
        "ffmpeg", "-i", output_video_path, "-c:v", "mpeg4",
        "-c:a", "aac", "-b:a", "128k", "-movflags", "+faststart", converted_output_path
    ]

    try:
        subprocess.run(command, check=True)
        subprocess.run(ffmpeg_command, check=True)
    finally:
        # The raw tracker output is only an intermediate artifact; remove it
        # on success and on failure so temp files do not accumulate.
        try:
            os.remove(output_video_path)
        except OSError:
            pass

    return converted_output_path
|
33 |
+
|
34 |
+
css = """
|
35 |
+
#img-display-container {
|
36 |
+
max-height: 100vh;
|
37 |
+
}
|
38 |
+
#img-display-input {
|
39 |
+
max-height: 80vh;
|
40 |
+
}
|
41 |
+
#img-display-output {
|
42 |
+
max-height: 80vh;
|
43 |
+
}
|
44 |
+
"""
|
45 |
+
|
46 |
+
title = "# MASA Track Everything Demo"
|
47 |
+
description = """ MASA + GroundingDINO on your video files!
|
48 |
+
Please refer to our [paper](https://arxiv.org/abs/2406.04221), [project page](https://matchinganything.github.io/), or [github](https://github.com/siyuanliii/masa/tree/main?tab=readme-ov-file) for more details."""
|
49 |
+
|
50 |
+
# Build the Gradio UI: a video + text prompt go in, the tracked video comes out.
# NOTE(review): the original indentation was not recoverable from the diff
# view; the button and output video are placed below the input row here.
with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("### Video Object Tracking demo")

    with gr.Row():
        input_video = gr.Video(label="Input Video")
        input_texts = gr.Textbox(label="Input Texts")

    submit = gr.Button("Submit")
    processed_video = gr.Video(label="Processed Video")

    submit.click(process_video, inputs=[input_video, input_texts], outputs=processed_video)

    example_dir = 'assets/examples_video'
    example_files = [
        os.path.join(example_dir, filename)
        for filename in sorted(os.listdir(example_dir))
    ]
    # Each example is a single video path, so it is bound to the video input
    # only. A flat list is only accepted by gr.Examples when there is exactly
    # one input component; pairing it with two inputs is rejected. Examples
    # are not cached because that would run the full tracker on every example
    # at startup, and no text prompt is known for them.
    examples = gr.Examples(examples=example_files, inputs=[input_video])

if __name__ == '__main__':
    demo.queue().launch()
|
71 |
+
|
configs/datasets/bdd/bdd_dataset.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# dataset settings
|
2 |
+
img_norm_cfg = dict(
|
3 |
+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
4 |
+
|
5 |
+
|
6 |
+
test_dataset_tpye = 'BDDVideoDataset'
|
7 |
+
|
8 |
+
test_pipeline = [
|
9 |
+
dict(
|
10 |
+
type='TransformBroadcaster',
|
11 |
+
transforms=[
|
12 |
+
dict(type='LoadImageFromFile'),
|
13 |
+
dict(type='Resize', scale=(1333, 800), keep_ratio=True),
|
14 |
+
dict(type='LoadTrackAnnotations')
|
15 |
+
]),
|
16 |
+
dict(type='PackTrackInputs')
|
17 |
+
]
|
18 |
+
|
19 |
+
val_dataloader = dict(
|
20 |
+
batch_size=1,
|
21 |
+
num_workers=2,
|
22 |
+
persistent_workers=True,
|
23 |
+
sampler=dict(type='TrackImgSampler'),
|
24 |
+
dataset=dict(
|
25 |
+
type=test_dataset_tpye,
|
26 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
27 |
+
data_prefix=dict(img_path='data/bdd/bdd100k/images/track/val/'),
|
28 |
+
test_mode=True,
|
29 |
+
pipeline=test_pipeline
|
30 |
+
))
|
31 |
+
|
32 |
+
test_dataloader = val_dataloader
|
33 |
+
|
34 |
+
# evaluator
|
35 |
+
val_evaluator = dict(
|
36 |
+
type='BDDTETAMetric',
|
37 |
+
dataset_type=test_dataset_tpye,
|
38 |
+
format_only=False,
|
39 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
40 |
+
scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
|
41 |
+
metric=['TETA'])
|
42 |
+
test_evaluator = val_evaluator
|
43 |
+
|
44 |
+
|
configs/datasets/tao/tao_dataset_v05.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# data pipeline
|
2 |
+
|
3 |
+
test_pipeline = [
|
4 |
+
dict(
|
5 |
+
type='TransformBroadcaster',
|
6 |
+
transforms=[
|
7 |
+
dict(type='LoadImageFromFile'),
|
8 |
+
dict(type='Resize', scale=(1333, 800), keep_ratio=True),
|
9 |
+
dict(type='LoadTrackAnnotations')
|
10 |
+
]),
|
11 |
+
dict(type='PackTrackInputs')
|
12 |
+
]
|
13 |
+
|
14 |
+
# dataloader
|
15 |
+
|
16 |
+
test_dataset_tpye = 'Taov05Dataset'
|
17 |
+
|
18 |
+
val_dataloader = dict(
|
19 |
+
batch_size=1,
|
20 |
+
num_workers=2,
|
21 |
+
persistent_workers=True,
|
22 |
+
# Now we support two ways to test, image_based and video_based
|
23 |
+
# Image-based sampling is configured below; swap in a video-based sampler to use video-based testing instead
|
24 |
+
sampler=dict(type='TrackImgSampler'), # image-based sampling
|
25 |
+
dataset=dict(
|
26 |
+
type=test_dataset_tpye,
|
27 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
|
28 |
+
data_prefix=dict(img_path='data/tao/frames/'),
|
29 |
+
test_mode=True,
|
30 |
+
pipeline=test_pipeline
|
31 |
+
))
|
32 |
+
test_dataloader = val_dataloader
|
33 |
+
|
34 |
+
# evaluator
|
35 |
+
val_evaluator = dict(
|
36 |
+
type='TaoTETAMetric',
|
37 |
+
dataset_type=test_dataset_tpye,
|
38 |
+
format_only=False,
|
39 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
|
40 |
+
metric=['TETA'])
|
41 |
+
test_evaluator = val_evaluator
|
42 |
+
|
43 |
+
|
configs/datasets/tao/tao_dataset_v1.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# data pipeline
|
2 |
+
|
3 |
+
test_pipeline = [
|
4 |
+
dict(
|
5 |
+
type='TransformBroadcaster',
|
6 |
+
transforms=[
|
7 |
+
dict(type='LoadImageFromFile'),
|
8 |
+
dict(type='Resize', scale=(1333, 800), keep_ratio=True),
|
9 |
+
dict(type='LoadTrackAnnotations')
|
10 |
+
]),
|
11 |
+
dict(type='PackTrackInputs')
|
12 |
+
]
|
13 |
+
|
14 |
+
# dataloader
|
15 |
+
|
16 |
+
test_dataset_tpye = 'Taov1Dataset'
|
17 |
+
|
18 |
+
val_dataloader = dict(
|
19 |
+
batch_size=1,
|
20 |
+
num_workers=2,
|
21 |
+
persistent_workers=True,
|
22 |
+
# Now we support two ways to test, image_based and video_based
|
23 |
+
# Image-based sampling is configured below; swap in a video-based sampler to use video-based testing instead
|
24 |
+
sampler=dict(type='TrackImgSampler'), # image-based sampling
|
25 |
+
dataset=dict(
|
26 |
+
type=test_dataset_tpye,
|
27 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
28 |
+
data_prefix=dict(img_path='data/tao/frames/'),
|
29 |
+
test_mode=True,
|
30 |
+
pipeline=test_pipeline
|
31 |
+
))
|
32 |
+
|
33 |
+
test_dataloader = val_dataloader
|
34 |
+
|
35 |
+
# evaluator
|
36 |
+
val_evaluator = dict(
|
37 |
+
type='TaoTETAMetric',
|
38 |
+
dataset_type=test_dataset_tpye,
|
39 |
+
format_only=False,
|
40 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
41 |
+
metric=['TETA'])
|
42 |
+
test_evaluator = val_evaluator
|
43 |
+
|
44 |
+
|
configs/default_runtime.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
default_scope = 'mmdet'
|
2 |
+
default_hooks = dict(
|
3 |
+
timer=dict(type='IterTimerHook'),
|
4 |
+
logger=dict(type='LoggerHook', interval=50),
|
5 |
+
param_scheduler=dict(type='ParamSchedulerHook'),
|
6 |
+
checkpoint=dict(type='CheckpointHook', interval=1),
|
7 |
+
sampler_seed=dict(type='DistSamplerSeedHook'),
|
8 |
+
visualization=dict(type='DetVisualizationHook'))
|
9 |
+
|
10 |
+
env_cfg = dict(
|
11 |
+
cudnn_benchmark=False,
|
12 |
+
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
|
13 |
+
dist_cfg=dict(backend='nccl'),
|
14 |
+
)
|
15 |
+
|
16 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
17 |
+
visualizer = dict(
|
18 |
+
type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
19 |
+
log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
|
20 |
+
|
21 |
+
log_level = 'INFO'
|
22 |
+
load_from = None
|
23 |
+
resume = False
|
configs/masa-detic/bdd_test/masa_detic_bdd_mot_test.py
ADDED
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../../projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-masa.py',
|
3 |
+
'../../datasets/bdd/bdd_dataset.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector.pop('data_preprocessor')
|
9 |
+
detector['init_cfg'] = dict(
|
10 |
+
type='Pretrained',
|
11 |
+
checkpoint= 'saved_models/tsa_models/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth'
|
12 |
+
# noqa: E501
|
13 |
+
)
|
14 |
+
detector['type'] = 'DeticMasa'
|
15 |
+
|
16 |
+
del _base_.model
|
17 |
+
|
18 |
+
model = dict(
|
19 |
+
type='MASA',
|
20 |
+
freeze_detector=True,
|
21 |
+
unified_backbone=True,
|
22 |
+
load_public_dets = True,
|
23 |
+
benchmark = 'bdd',
|
24 |
+
public_det_path = 'results/public_dets/bdd_mot_yolox_dets/',
|
25 |
+
data_preprocessor=dict(
|
26 |
+
type='TrackDataPreprocessor',
|
27 |
+
# Image normalization parameters
|
28 |
+
mean=[123.675, 116.28, 103.53],
|
29 |
+
std=[58.395, 57.12, 57.375],
|
30 |
+
bgr_to_rgb=True,
|
31 |
+
# Image padding parameters
|
32 |
+
pad_mask=True, # In instance segmentation, the mask needs to be padded
|
33 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
34 |
+
|
35 |
+
detector=detector,
|
36 |
+
masa_adapter=[
|
37 |
+
dict(
|
38 |
+
type='FPN',
|
39 |
+
in_channels=[256, 512, 1024],
|
40 |
+
out_channels=256,
|
41 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
42 |
+
num_outs=5),
|
43 |
+
dict(
|
44 |
+
type='DeformFusion',
|
45 |
+
in_channels=256,
|
46 |
+
out_channels=256,
|
47 |
+
num_blocks=3)],
|
48 |
+
rpn_head=dict(
|
49 |
+
type='RPNHead',
|
50 |
+
in_channels=256,
|
51 |
+
feat_channels=256,
|
52 |
+
anchor_generator=dict(
|
53 |
+
type='AnchorGenerator',
|
54 |
+
scales=[8],
|
55 |
+
ratios=[0.5, 1.0, 2.0],
|
56 |
+
strides=[8, 16, 32, 64, 128]),
|
57 |
+
bbox_coder=dict(
|
58 |
+
type='DeltaXYWHBBoxCoder',
|
59 |
+
target_means=[.0, .0, .0, .0],
|
60 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
61 |
+
loss_cls=dict(
|
62 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
63 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
64 |
+
),
|
65 |
+
roi_head=dict(
|
66 |
+
type='StandardRoIHead',
|
67 |
+
bbox_roi_extractor=dict(
|
68 |
+
type='SingleRoIExtractor',
|
69 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
70 |
+
out_channels=256,
|
71 |
+
featmap_strides=[8, 16, 32]),
|
72 |
+
bbox_head=dict(
|
73 |
+
type='Shared2FCBBoxHead',
|
74 |
+
in_channels=256,
|
75 |
+
fc_out_channels=1024,
|
76 |
+
roi_feat_size=7,
|
77 |
+
num_classes=1,
|
78 |
+
bbox_coder=dict(
|
79 |
+
type='DeltaXYWHBBoxCoder',
|
80 |
+
target_means=[0., 0., 0., 0.],
|
81 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
82 |
+
reg_class_agnostic=True,
|
83 |
+
loss_cls=dict(
|
84 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
85 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
86 |
+
# model training and testing settings
|
87 |
+
train_cfg=dict(
|
88 |
+
rpn=dict(
|
89 |
+
assigner=dict(
|
90 |
+
type='MaxIoUAssigner',
|
91 |
+
pos_iou_thr=0.7,
|
92 |
+
neg_iou_thr=0.3,
|
93 |
+
min_pos_iou=0.3,
|
94 |
+
match_low_quality=True,
|
95 |
+
ignore_iof_thr=-1),
|
96 |
+
sampler=dict(
|
97 |
+
type='RandomSampler',
|
98 |
+
num=256,
|
99 |
+
pos_fraction=0.5,
|
100 |
+
neg_pos_ub=-1,
|
101 |
+
add_gt_as_proposals=False),
|
102 |
+
allowed_border=-1,
|
103 |
+
pos_weight=-1,
|
104 |
+
debug=False),
|
105 |
+
rpn_proposal=dict(
|
106 |
+
nms_pre=2000,
|
107 |
+
max_per_img=1000,
|
108 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
109 |
+
min_bbox_size=0),
|
110 |
+
rcnn=dict(
|
111 |
+
assigner=dict(
|
112 |
+
type='MaxIoUAssigner',
|
113 |
+
pos_iou_thr=0.5,
|
114 |
+
neg_iou_thr=0.5,
|
115 |
+
min_pos_iou=0.5,
|
116 |
+
match_low_quality=False,
|
117 |
+
ignore_iof_thr=-1),
|
118 |
+
sampler=dict(
|
119 |
+
type='RandomSampler',
|
120 |
+
num=512,
|
121 |
+
pos_fraction=0.25,
|
122 |
+
neg_pos_ub=-1,
|
123 |
+
add_gt_as_proposals=True),
|
124 |
+
pos_weight=-1,
|
125 |
+
debug=False)),
|
126 |
+
test_cfg=dict(
|
127 |
+
rpn=dict(
|
128 |
+
nms_pre=1000,
|
129 |
+
max_per_img=1000,
|
130 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
131 |
+
min_bbox_size=0),
|
132 |
+
rcnn=dict(
|
133 |
+
score_thr=0.02,
|
134 |
+
nms=dict(type='nms',
|
135 |
+
iou_threshold=0.5,
|
136 |
+
class_agnostic=True,
|
137 |
+
split_thr=100000),
|
138 |
+
max_per_img=50,
|
139 |
+
mask_thr_binary=0.5)
|
140 |
+
# soft-nms is also supported for rcnn testing
|
141 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
142 |
+
),
|
143 |
+
track_head=dict(
|
144 |
+
type='MasaTrackHead',
|
145 |
+
roi_extractor=dict(
|
146 |
+
type='SingleRoIExtractor',
|
147 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
148 |
+
out_channels=256,
|
149 |
+
featmap_strides=[8, 16, 32]),
|
150 |
+
embed_head=dict(
|
151 |
+
type='QuasiDenseEmbedHead',
|
152 |
+
num_convs=4,
|
153 |
+
num_fcs=1,
|
154 |
+
embed_channels=256,
|
155 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
156 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
157 |
+
loss_track_aux=dict(
|
158 |
+
type='MarginL2Loss',
|
159 |
+
neg_pos_ub=3,
|
160 |
+
pos_margin=0,
|
161 |
+
neg_margin=0.1,
|
162 |
+
hard_mining=True,
|
163 |
+
loss_weight=1.0)),
|
164 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
165 |
+
train_cfg=dict(
|
166 |
+
assigner=dict(
|
167 |
+
type='MaxIoUAssigner',
|
168 |
+
pos_iou_thr=0.7,
|
169 |
+
neg_iou_thr=0.3,
|
170 |
+
min_pos_iou=0.5,
|
171 |
+
match_low_quality=False,
|
172 |
+
ignore_iof_thr=-1),
|
173 |
+
sampler=dict(
|
174 |
+
type='CombinedSampler',
|
175 |
+
num=512,
|
176 |
+
pos_fraction=0.5,
|
177 |
+
neg_pos_ub=3,
|
178 |
+
add_gt_as_proposals=True,
|
179 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
180 |
+
neg_sampler=dict(type='RandomSampler')))),
|
181 |
+
tracker=dict(
|
182 |
+
type='MasaBDDTracker',
|
183 |
+
init_score_thr=0.5,
|
184 |
+
obj_score_thr=0.3,
|
185 |
+
match_score_thr=0.6,
|
186 |
+
memo_tracklet_frames=10,
|
187 |
+
memo_backdrop_frames=1,
|
188 |
+
memo_momentum=0.8,
|
189 |
+
nms_conf_thr=0.5,
|
190 |
+
nms_backdrop_iou_thr=0.3,
|
191 |
+
nms_class_iou_thr=0.7,
|
192 |
+
with_cats=False,
|
193 |
+
match_metric='bisoftmax')
|
194 |
+
)
|
195 |
+
|
196 |
+
# runtime settings
|
197 |
+
train_dataloader = None
|
198 |
+
train_cfg = None
|
199 |
+
val_cfg = dict(type='ValLoop')
|
200 |
+
test_cfg = dict(type='TestLoop')
|
201 |
+
|
202 |
+
default_hooks = dict(
|
203 |
+
logger=dict(type='LoggerHook', interval=50),
|
204 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
205 |
+
checkpoint=dict(type='CheckpointHook', interval=1),
|
206 |
+
)
|
207 |
+
|
208 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
209 |
+
visualizer = dict(
|
210 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
211 |
+
|
212 |
+
val_dataloader = dict(
|
213 |
+
dataset=dict(
|
214 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
215 |
+
)
|
216 |
+
)
|
217 |
+
test_dataloader = val_dataloader
|
218 |
+
val_evaluator = dict(
|
219 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
220 |
+
scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
|
221 |
+
outfile_prefix='results/detic_masa_trained_bdd_demo',
|
222 |
+
metric=['TETA', 'HOTA', 'CLEAR']
|
223 |
+
)
|
224 |
+
test_evaluator = val_evaluator
|
configs/masa-detic/bdd_test/masa_detic_bdd_mots_test.py
ADDED
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-masa.py',
|
3 |
+
'../datasets/bdd/bdd_dataset.py',
|
4 |
+
'../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector.pop('data_preprocessor')
|
9 |
+
detector['init_cfg'] = dict(
|
10 |
+
type='Pretrained',
|
11 |
+
checkpoint= 'saved_models/tsa_models/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth'
|
12 |
+
# noqa: E501
|
13 |
+
)
|
14 |
+
detector['type'] = 'DeticMasa'
|
15 |
+
|
16 |
+
del _base_.model
|
17 |
+
|
18 |
+
model = dict(
|
19 |
+
type='MASA',
|
20 |
+
freeze_detector=True,
|
21 |
+
unified_backbone=True,
|
22 |
+
load_public_dets = True,
|
23 |
+
with_segm=True,
|
24 |
+
benchmark = 'bdd',
|
25 |
+
public_det_path = 'results/public_dets/bdd_mots_val_uninext_dets/',
|
26 |
+
data_preprocessor=dict(
|
27 |
+
type='TrackDataPreprocessor',
|
28 |
+
# Image normalization parameters
|
29 |
+
mean=[123.675, 116.28, 103.53],
|
30 |
+
std=[58.395, 57.12, 57.375],
|
31 |
+
bgr_to_rgb=True,
|
32 |
+
# Image padding parameters
|
33 |
+
pad_mask=True, # In instance segmentation, the mask needs to be padded
|
34 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
35 |
+
|
36 |
+
detector=detector,
|
37 |
+
masa_adapter=[
|
38 |
+
dict(
|
39 |
+
type='FPN',
|
40 |
+
in_channels=[256, 512, 1024],
|
41 |
+
out_channels=256,
|
42 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
43 |
+
num_outs=5),
|
44 |
+
dict(
|
45 |
+
type='DeformFusion',
|
46 |
+
in_channels=256,
|
47 |
+
out_channels=256,
|
48 |
+
num_blocks=3)],
|
49 |
+
rpn_head=dict(
|
50 |
+
type='RPNHead',
|
51 |
+
in_channels=256,
|
52 |
+
feat_channels=256,
|
53 |
+
anchor_generator=dict(
|
54 |
+
type='AnchorGenerator',
|
55 |
+
scales=[8],
|
56 |
+
ratios=[0.5, 1.0, 2.0],
|
57 |
+
strides=[8, 16, 32, 64, 128]),
|
58 |
+
bbox_coder=dict(
|
59 |
+
type='DeltaXYWHBBoxCoder',
|
60 |
+
target_means=[.0, .0, .0, .0],
|
61 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
62 |
+
loss_cls=dict(
|
63 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
64 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
65 |
+
),
|
66 |
+
roi_head=dict(
|
67 |
+
type='StandardRoIHead',
|
68 |
+
bbox_roi_extractor=dict(
|
69 |
+
type='SingleRoIExtractor',
|
70 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
71 |
+
out_channels=256,
|
72 |
+
featmap_strides=[8, 16, 32]),
|
73 |
+
bbox_head=dict(
|
74 |
+
type='Shared2FCBBoxHead',
|
75 |
+
in_channels=256,
|
76 |
+
fc_out_channels=1024,
|
77 |
+
roi_feat_size=7,
|
78 |
+
num_classes=1,
|
79 |
+
bbox_coder=dict(
|
80 |
+
type='DeltaXYWHBBoxCoder',
|
81 |
+
target_means=[0., 0., 0., 0.],
|
82 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
83 |
+
reg_class_agnostic=True,
|
84 |
+
loss_cls=dict(
|
85 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
86 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
87 |
+
# model training and testing settings
|
88 |
+
train_cfg=dict(
|
89 |
+
rpn=dict(
|
90 |
+
assigner=dict(
|
91 |
+
type='MaxIoUAssigner',
|
92 |
+
pos_iou_thr=0.7,
|
93 |
+
neg_iou_thr=0.3,
|
94 |
+
min_pos_iou=0.3,
|
95 |
+
match_low_quality=True,
|
96 |
+
ignore_iof_thr=-1),
|
97 |
+
sampler=dict(
|
98 |
+
type='RandomSampler',
|
99 |
+
num=256,
|
100 |
+
pos_fraction=0.5,
|
101 |
+
neg_pos_ub=-1,
|
102 |
+
add_gt_as_proposals=False),
|
103 |
+
allowed_border=-1,
|
104 |
+
pos_weight=-1,
|
105 |
+
debug=False),
|
106 |
+
rpn_proposal=dict(
|
107 |
+
nms_pre=2000,
|
108 |
+
max_per_img=1000,
|
109 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
110 |
+
min_bbox_size=0),
|
111 |
+
rcnn=dict(
|
112 |
+
assigner=dict(
|
113 |
+
type='MaxIoUAssigner',
|
114 |
+
pos_iou_thr=0.5,
|
115 |
+
neg_iou_thr=0.5,
|
116 |
+
min_pos_iou=0.5,
|
117 |
+
match_low_quality=False,
|
118 |
+
ignore_iof_thr=-1),
|
119 |
+
sampler=dict(
|
120 |
+
type='RandomSampler',
|
121 |
+
num=512,
|
122 |
+
pos_fraction=0.25,
|
123 |
+
neg_pos_ub=-1,
|
124 |
+
add_gt_as_proposals=True),
|
125 |
+
pos_weight=-1,
|
126 |
+
debug=False)),
|
127 |
+
test_cfg=dict(
|
128 |
+
rpn=dict(
|
129 |
+
nms_pre=1000,
|
130 |
+
max_per_img=1000,
|
131 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
132 |
+
min_bbox_size=0),
|
133 |
+
rcnn=dict(
|
134 |
+
score_thr=0.02,
|
135 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
136 |
+
nms=dict(type='nms',
|
137 |
+
iou_threshold=0.5,
|
138 |
+
class_agnostic=True,
|
139 |
+
split_thr=100000),
|
140 |
+
max_per_img=50,
|
141 |
+
mask_thr_binary=0.5)
|
142 |
+
# soft-nms is also supported for rcnn testing
|
143 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
144 |
+
),
|
145 |
+
track_head=dict(
|
146 |
+
type='MasaTrackHead',
|
147 |
+
roi_extractor=dict(
|
148 |
+
type='SingleRoIExtractor',
|
149 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
150 |
+
out_channels=256,
|
151 |
+
featmap_strides=[8, 16, 32]),
|
152 |
+
embed_head=dict(
|
153 |
+
type='QuasiDenseEmbedHead',
|
154 |
+
num_convs=4,
|
155 |
+
num_fcs=1,
|
156 |
+
embed_channels=256,
|
157 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
158 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
159 |
+
loss_track_aux=dict(
|
160 |
+
type='MarginL2Loss',
|
161 |
+
neg_pos_ub=3,
|
162 |
+
pos_margin=0,
|
163 |
+
neg_margin=0.1,
|
164 |
+
hard_mining=True,
|
165 |
+
loss_weight=1.0)),
|
166 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
167 |
+
train_cfg=dict(
|
168 |
+
assigner=dict(
|
169 |
+
type='MaxIoUAssigner',
|
170 |
+
pos_iou_thr=0.7,
|
171 |
+
neg_iou_thr=0.3,
|
172 |
+
min_pos_iou=0.5,
|
173 |
+
match_low_quality=False,
|
174 |
+
ignore_iof_thr=-1),
|
175 |
+
sampler=dict(
|
176 |
+
type='CombinedSampler',
|
177 |
+
num=512,
|
178 |
+
pos_fraction=0.5,
|
179 |
+
neg_pos_ub=3,
|
180 |
+
add_gt_as_proposals=True,
|
181 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
182 |
+
neg_sampler=dict(type='RandomSampler')))),
|
183 |
+
tracker=dict(
|
184 |
+
type='MasaBDDTracker',
|
185 |
+
init_score_thr=0.5,
|
186 |
+
obj_score_thr=0.3,
|
187 |
+
match_score_thr=0.6,
|
188 |
+
memo_tracklet_frames=10,
|
189 |
+
memo_backdrop_frames=1,
|
190 |
+
memo_momentum=0.8,
|
191 |
+
nms_conf_thr=0.5,
|
192 |
+
nms_backdrop_iou_thr=0.3,
|
193 |
+
nms_class_iou_thr=0.7,
|
194 |
+
with_cats=False,
|
195 |
+
match_metric='bisoftmax')
|
196 |
+
)
|
197 |
+
|
198 |
+
# runtime settings
|
199 |
+
train_dataloader = None
|
200 |
+
train_cfg = None
|
201 |
+
val_cfg = dict(type='ValLoop')
|
202 |
+
test_cfg = dict(type='TestLoop')
|
203 |
+
|
204 |
+
default_hooks = dict(
|
205 |
+
logger=dict(type='LoggerHook', interval=50),
|
206 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
207 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
208 |
+
)
|
209 |
+
|
210 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
211 |
+
visualizer = dict(
|
212 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
213 |
+
|
214 |
+
val_dataloader = dict(
|
215 |
+
dataset=dict(
|
216 |
+
ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
|
217 |
+
)
|
218 |
+
)
|
219 |
+
test_dataloader = val_dataloader
|
220 |
+
val_evaluator = dict(
|
221 |
+
ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
|
222 |
+
scalabel_gt='data/bdd/annotations/scalabel_gt/seg_track_20/val/',
|
223 |
+
outfile_prefix='results/masa_results/masa-groundingdino-release-bdd-mots-test',
|
224 |
+
metric=['TETA', 'HOTA', 'CLEAR'],
|
225 |
+
with_mask=True,
|
226 |
+
)
|
227 |
+
test_evaluator = val_evaluator
|
configs/masa-detic/open_vocabulary_mot_test/masa_detic_swinb_open_vocabulary_test.py
ADDED
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../../projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-masa.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v1.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector.pop('data_preprocessor')
|
9 |
+
detector['init_cfg'] = dict(
|
10 |
+
type='Pretrained',
|
11 |
+
checkpoint= 'saved_models/tsa_models/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth'
|
12 |
+
# noqa: E501
|
13 |
+
)
|
14 |
+
detector['type'] = 'DeticMasa'
|
15 |
+
detector['test_cfg'] =dict(
|
16 |
+
rpn=dict(
|
17 |
+
score_thr=0.0001,
|
18 |
+
nms_pre=1000,
|
19 |
+
max_per_img=256,
|
20 |
+
nms=dict(type='nms', iou_threshold=0.9),
|
21 |
+
min_bbox_size=0),
|
22 |
+
rcnn=dict(
|
23 |
+
score_thr=0.02,
|
24 |
+
nms=dict(type='nms',
|
25 |
+
iou_threshold=0.5,
|
26 |
+
class_agnostic=True,
|
27 |
+
split_thr=100000),
|
28 |
+
max_per_img=50,
|
29 |
+
mask_thr_binary=0.5)
|
30 |
+
)
|
31 |
+
|
32 |
+
del _base_.model
|
33 |
+
|
34 |
+
model = dict(
|
35 |
+
type='MASA',
|
36 |
+
freeze_detector=True,
|
37 |
+
unified_backbone=True,
|
38 |
+
load_public_dets = False,
|
39 |
+
benchmark = 'tao',
|
40 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
|
41 |
+
data_preprocessor=dict(
|
42 |
+
type='TrackDataPreprocessor',
|
43 |
+
# Image normalization parameters
|
44 |
+
mean=[123.675, 116.28, 103.53],
|
45 |
+
std=[58.395, 57.12, 57.375],
|
46 |
+
bgr_to_rgb=True,
|
47 |
+
# Image padding parameters
|
48 |
+
pad_mask=True, # In instance segmentation, the mask needs to be padded
|
49 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
50 |
+
detector=detector,
|
51 |
+
masa_adapter=[
|
52 |
+
dict(
|
53 |
+
type='FPN',
|
54 |
+
in_channels=[256, 512, 1024],
|
55 |
+
out_channels=256,
|
56 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
57 |
+
num_outs=5),
|
58 |
+
dict(
|
59 |
+
type='DeformFusion',
|
60 |
+
in_channels=256,
|
61 |
+
out_channels=256,
|
62 |
+
num_blocks=3)],
|
63 |
+
rpn_head=dict(
|
64 |
+
type='RPNHead',
|
65 |
+
in_channels=256,
|
66 |
+
feat_channels=256,
|
67 |
+
anchor_generator=dict(
|
68 |
+
type='AnchorGenerator',
|
69 |
+
scales=[8],
|
70 |
+
ratios=[0.5, 1.0, 2.0],
|
71 |
+
strides=[8, 16, 32, 64, 128]),
|
72 |
+
bbox_coder=dict(
|
73 |
+
type='DeltaXYWHBBoxCoder',
|
74 |
+
target_means=[.0, .0, .0, .0],
|
75 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
76 |
+
loss_cls=dict(
|
77 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
78 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
79 |
+
),
|
80 |
+
roi_head=dict(
|
81 |
+
type='StandardRoIHead',
|
82 |
+
bbox_roi_extractor=dict(
|
83 |
+
type='SingleRoIExtractor',
|
84 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
85 |
+
out_channels=256,
|
86 |
+
featmap_strides=[8, 16, 32]),
|
87 |
+
bbox_head=dict(
|
88 |
+
type='Shared2FCBBoxHead',
|
89 |
+
in_channels=256,
|
90 |
+
fc_out_channels=1024,
|
91 |
+
roi_feat_size=7,
|
92 |
+
num_classes=1,
|
93 |
+
bbox_coder=dict(
|
94 |
+
type='DeltaXYWHBBoxCoder',
|
95 |
+
target_means=[0., 0., 0., 0.],
|
96 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
97 |
+
reg_class_agnostic=True,
|
98 |
+
loss_cls=dict(
|
99 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
100 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
101 |
+
# model training and testing settings
|
102 |
+
train_cfg=dict(
|
103 |
+
rpn=dict(
|
104 |
+
assigner=dict(
|
105 |
+
type='MaxIoUAssigner',
|
106 |
+
pos_iou_thr=0.7,
|
107 |
+
neg_iou_thr=0.3,
|
108 |
+
min_pos_iou=0.3,
|
109 |
+
match_low_quality=True,
|
110 |
+
ignore_iof_thr=-1),
|
111 |
+
sampler=dict(
|
112 |
+
type='RandomSampler',
|
113 |
+
num=256,
|
114 |
+
pos_fraction=0.5,
|
115 |
+
neg_pos_ub=-1,
|
116 |
+
add_gt_as_proposals=False),
|
117 |
+
allowed_border=-1,
|
118 |
+
pos_weight=-1,
|
119 |
+
debug=False),
|
120 |
+
rpn_proposal=dict(
|
121 |
+
nms_pre=2000,
|
122 |
+
max_per_img=1000,
|
123 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
124 |
+
min_bbox_size=0),
|
125 |
+
rcnn=dict(
|
126 |
+
assigner=dict(
|
127 |
+
type='MaxIoUAssigner',
|
128 |
+
pos_iou_thr=0.5,
|
129 |
+
neg_iou_thr=0.5,
|
130 |
+
min_pos_iou=0.5,
|
131 |
+
match_low_quality=False,
|
132 |
+
ignore_iof_thr=-1),
|
133 |
+
sampler=dict(
|
134 |
+
type='RandomSampler',
|
135 |
+
num=512,
|
136 |
+
pos_fraction=0.25,
|
137 |
+
neg_pos_ub=-1,
|
138 |
+
add_gt_as_proposals=True),
|
139 |
+
pos_weight=-1,
|
140 |
+
debug=False)),
|
141 |
+
test_cfg=dict(
|
142 |
+
rpn=dict(
|
143 |
+
nms_pre=1000,
|
144 |
+
max_per_img=1000,
|
145 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
146 |
+
min_bbox_size=0),
|
147 |
+
rcnn=dict(
|
148 |
+
score_thr=0.05,
|
149 |
+
nms=dict(type='nms', iou_threshold=0.5),
|
150 |
+
max_per_img=100)
|
151 |
+
# soft-nms is also supported for rcnn testing
|
152 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
153 |
+
),
|
154 |
+
track_head=dict(
|
155 |
+
type='MasaTrackHead',
|
156 |
+
roi_extractor=dict(
|
157 |
+
type='SingleRoIExtractor',
|
158 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
159 |
+
out_channels=256,
|
160 |
+
featmap_strides=[8, 16, 32]),
|
161 |
+
embed_head=dict(
|
162 |
+
type='QuasiDenseEmbedHead',
|
163 |
+
num_convs=4,
|
164 |
+
num_fcs=1,
|
165 |
+
embed_channels=256,
|
166 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
167 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
168 |
+
loss_track_aux=dict(
|
169 |
+
type='MarginL2Loss',
|
170 |
+
neg_pos_ub=3,
|
171 |
+
pos_margin=0,
|
172 |
+
neg_margin=0.1,
|
173 |
+
hard_mining=True,
|
174 |
+
loss_weight=1.0)),
|
175 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
176 |
+
train_cfg=dict(
|
177 |
+
assigner=dict(
|
178 |
+
type='MaxIoUAssigner',
|
179 |
+
pos_iou_thr=0.7,
|
180 |
+
neg_iou_thr=0.5,
|
181 |
+
min_pos_iou=0.5,
|
182 |
+
match_low_quality=False,
|
183 |
+
ignore_iof_thr=-1),
|
184 |
+
sampler=dict(
|
185 |
+
type='CombinedSampler',
|
186 |
+
num=512,
|
187 |
+
pos_fraction=0.8,
|
188 |
+
neg_pos_ub=3,
|
189 |
+
add_gt_as_proposals=True,
|
190 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
191 |
+
neg_sampler=dict(type='RandomSampler')))),
|
192 |
+
tracker=dict(
|
193 |
+
type='MasaTaoTracker',
|
194 |
+
init_score_thr=0.0001,
|
195 |
+
obj_score_thr=0.0001,
|
196 |
+
match_score_thr=0.5,
|
197 |
+
memo_tracklet_frames=10,
|
198 |
+
memo_momentum=0.8,
|
199 |
+
with_cats=False,
|
200 |
+
max_distance=-1,
|
201 |
+
fps=1,
|
202 |
+
)
|
203 |
+
)
|
204 |
+
|
205 |
+
train_dataloader = None
|
206 |
+
train_cfg = None
|
207 |
+
val_cfg = dict(type='ValLoop')
|
208 |
+
test_cfg = dict(type='TestLoop')
|
209 |
+
|
210 |
+
default_hooks = dict(
|
211 |
+
logger=dict(type='LoggerHook', interval=50),
|
212 |
+
visualization=dict(type='TrackVisualizationHook', draw=False))
|
213 |
+
|
214 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
215 |
+
visualizer = dict(
|
216 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
217 |
+
|
218 |
+
# custom hooks
|
219 |
+
custom_hooks = [
|
220 |
+
# Synchronize model buffers such as running_mean and running_var in BN
|
221 |
+
# at the end of each epoch
|
222 |
+
dict(type='SyncBuffersHook')
|
223 |
+
]
|
224 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
225 |
+
val_dataloader = dict(
|
226 |
+
dataset=dict(
|
227 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
228 |
+
)
|
229 |
+
)
|
230 |
+
test_dataloader = val_dataloader
|
231 |
+
val_evaluator = dict(
|
232 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
233 |
+
outfile_prefix='results/masa_results/masa-detic-release-ovmot-test',
|
234 |
+
open_vocabulary=True,
|
235 |
+
)
|
236 |
+
test_evaluator = val_evaluator
|
configs/masa-detic/tao_teta_test/masa_detic_swinb_tao_test_detic_dets.py
ADDED
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../../projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-masa.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v1.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector.pop('data_preprocessor')
|
9 |
+
detector['init_cfg'] = dict(
|
10 |
+
type='Pretrained',
|
11 |
+
checkpoint= 'saved_models/tsa_models/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth'
|
12 |
+
# noqa: E501
|
13 |
+
)
|
14 |
+
detector['type'] = 'DeticMasa'
|
15 |
+
|
16 |
+
del _base_.model
|
17 |
+
|
18 |
+
model = dict(
|
19 |
+
type='MASA',
|
20 |
+
freeze_detector=True,
|
21 |
+
unified_backbone=True,
|
22 |
+
load_public_dets = True,
|
23 |
+
benchmark = 'tao',
|
24 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
|
25 |
+
data_preprocessor=dict(
|
26 |
+
type='TrackDataPreprocessor',
|
27 |
+
# Image normalization parameters
|
28 |
+
mean=[123.675, 116.28, 103.53],
|
29 |
+
std=[58.395, 57.12, 57.375],
|
30 |
+
bgr_to_rgb=True,
|
31 |
+
# Image padding parameters
|
32 |
+
pad_mask=True, # In instance segmentation, the mask needs to be padded
|
33 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
34 |
+
detector=detector,
|
35 |
+
masa_adapter=[
|
36 |
+
dict(
|
37 |
+
type='FPN',
|
38 |
+
in_channels=[256, 512, 1024],
|
39 |
+
out_channels=256,
|
40 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
41 |
+
num_outs=5),
|
42 |
+
dict(
|
43 |
+
type='DeformFusion',
|
44 |
+
in_channels=256,
|
45 |
+
out_channels=256,
|
46 |
+
num_blocks=3)],
|
47 |
+
rpn_head=dict(
|
48 |
+
type='RPNHead',
|
49 |
+
in_channels=256,
|
50 |
+
feat_channels=256,
|
51 |
+
anchor_generator=dict(
|
52 |
+
type='AnchorGenerator',
|
53 |
+
scales=[8],
|
54 |
+
ratios=[0.5, 1.0, 2.0],
|
55 |
+
strides=[8, 16, 32, 64, 128]),
|
56 |
+
bbox_coder=dict(
|
57 |
+
type='DeltaXYWHBBoxCoder',
|
58 |
+
target_means=[.0, .0, .0, .0],
|
59 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
60 |
+
loss_cls=dict(
|
61 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
62 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
63 |
+
),
|
64 |
+
roi_head=dict(
|
65 |
+
type='StandardRoIHead',
|
66 |
+
bbox_roi_extractor=dict(
|
67 |
+
type='SingleRoIExtractor',
|
68 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
69 |
+
out_channels=256,
|
70 |
+
featmap_strides=[8, 16, 32]),
|
71 |
+
bbox_head=dict(
|
72 |
+
type='Shared2FCBBoxHead',
|
73 |
+
in_channels=256,
|
74 |
+
fc_out_channels=1024,
|
75 |
+
roi_feat_size=7,
|
76 |
+
num_classes=1,
|
77 |
+
bbox_coder=dict(
|
78 |
+
type='DeltaXYWHBBoxCoder',
|
79 |
+
target_means=[0., 0., 0., 0.],
|
80 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
81 |
+
reg_class_agnostic=True,
|
82 |
+
loss_cls=dict(
|
83 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
84 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
85 |
+
# model training and testing settings
|
86 |
+
train_cfg=dict(
|
87 |
+
rpn=dict(
|
88 |
+
assigner=dict(
|
89 |
+
type='MaxIoUAssigner',
|
90 |
+
pos_iou_thr=0.7,
|
91 |
+
neg_iou_thr=0.3,
|
92 |
+
min_pos_iou=0.3,
|
93 |
+
match_low_quality=True,
|
94 |
+
ignore_iof_thr=-1),
|
95 |
+
sampler=dict(
|
96 |
+
type='RandomSampler',
|
97 |
+
num=256,
|
98 |
+
pos_fraction=0.5,
|
99 |
+
neg_pos_ub=-1,
|
100 |
+
add_gt_as_proposals=False),
|
101 |
+
allowed_border=-1,
|
102 |
+
pos_weight=-1,
|
103 |
+
debug=False),
|
104 |
+
rpn_proposal=dict(
|
105 |
+
nms_pre=2000,
|
106 |
+
max_per_img=1000,
|
107 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
108 |
+
min_bbox_size=0),
|
109 |
+
rcnn=dict(
|
110 |
+
assigner=dict(
|
111 |
+
type='MaxIoUAssigner',
|
112 |
+
pos_iou_thr=0.5,
|
113 |
+
neg_iou_thr=0.5,
|
114 |
+
min_pos_iou=0.5,
|
115 |
+
match_low_quality=False,
|
116 |
+
ignore_iof_thr=-1),
|
117 |
+
sampler=dict(
|
118 |
+
type='RandomSampler',
|
119 |
+
num=512,
|
120 |
+
pos_fraction=0.25,
|
121 |
+
neg_pos_ub=-1,
|
122 |
+
add_gt_as_proposals=True),
|
123 |
+
pos_weight=-1,
|
124 |
+
debug=False)),
|
125 |
+
test_cfg=dict(
|
126 |
+
rpn=dict(
|
127 |
+
nms_pre=1000,
|
128 |
+
max_per_img=1000,
|
129 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
130 |
+
min_bbox_size=0),
|
131 |
+
rcnn=dict(
|
132 |
+
score_thr=0.05,
|
133 |
+
nms=dict(type='nms', iou_threshold=0.5),
|
134 |
+
max_per_img=100)
|
135 |
+
# soft-nms is also supported for rcnn testing
|
136 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
137 |
+
),
|
138 |
+
track_head=dict(
|
139 |
+
type='MasaTrackHead',
|
140 |
+
roi_extractor=dict(
|
141 |
+
type='SingleRoIExtractor',
|
142 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
143 |
+
out_channels=256,
|
144 |
+
featmap_strides=[8, 16, 32]),
|
145 |
+
embed_head=dict(
|
146 |
+
type='QuasiDenseEmbedHead',
|
147 |
+
num_convs=4,
|
148 |
+
num_fcs=1,
|
149 |
+
embed_channels=256,
|
150 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
151 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
152 |
+
loss_track_aux=dict(
|
153 |
+
type='MarginL2Loss',
|
154 |
+
neg_pos_ub=3,
|
155 |
+
pos_margin=0,
|
156 |
+
neg_margin=0.1,
|
157 |
+
hard_mining=True,
|
158 |
+
loss_weight=1.0)),
|
159 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
160 |
+
train_cfg=dict(
|
161 |
+
assigner=dict(
|
162 |
+
type='MaxIoUAssigner',
|
163 |
+
pos_iou_thr=0.7,
|
164 |
+
neg_iou_thr=0.5,
|
165 |
+
min_pos_iou=0.5,
|
166 |
+
match_low_quality=False,
|
167 |
+
ignore_iof_thr=-1),
|
168 |
+
sampler=dict(
|
169 |
+
type='CombinedSampler',
|
170 |
+
num=512,
|
171 |
+
pos_fraction=0.8,
|
172 |
+
neg_pos_ub=3,
|
173 |
+
add_gt_as_proposals=True,
|
174 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
175 |
+
neg_sampler=dict(type='RandomSampler')))),
|
176 |
+
tracker=dict(
|
177 |
+
type='MasaTaoTracker',
|
178 |
+
init_score_thr=0.0001,
|
179 |
+
obj_score_thr=0.0001,
|
180 |
+
match_score_thr=0.5,
|
181 |
+
memo_tracklet_frames=10,
|
182 |
+
memo_momentum=0.8,
|
183 |
+
with_cats=False,
|
184 |
+
max_distance=-1,
|
185 |
+
fps=1,
|
186 |
+
)
|
187 |
+
)
|
188 |
+
|
189 |
+
train_dataloader = None
|
190 |
+
train_cfg = None
|
191 |
+
val_cfg = dict(type='ValLoop')
|
192 |
+
test_cfg = dict(type='TestLoop')
|
193 |
+
|
194 |
+
default_hooks = dict(
|
195 |
+
logger=dict(type='LoggerHook', interval=50),
|
196 |
+
visualization=dict(type='TrackVisualizationHook', draw=False))
|
197 |
+
|
198 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
199 |
+
visualizer = dict(
|
200 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
201 |
+
|
202 |
+
# custom hooks
|
203 |
+
custom_hooks = [
|
204 |
+
# Synchronize model buffers such as running_mean and running_var in BN
|
205 |
+
# at the end of each epoch
|
206 |
+
dict(type='SyncBuffersHook')
|
207 |
+
]
|
208 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
209 |
+
val_dataloader = dict(
|
210 |
+
dataset=dict(
|
211 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
212 |
+
)
|
213 |
+
)
|
214 |
+
test_dataloader = val_dataloader
|
215 |
+
val_evaluator = dict(
|
216 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
217 |
+
outfile_prefix='results/masa_results/masa-detic-release-detic-dets-tao-test',
|
218 |
+
)
|
219 |
+
test_evaluator = val_evaluator
|
configs/masa-detic/tao_teta_test/masa_detic_swinb_tao_test_teter_swinT_dets.py
ADDED
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../../projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-masa.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v05.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector.pop('data_preprocessor')
|
9 |
+
detector['init_cfg'] = dict(
|
10 |
+
type='Pretrained',
|
11 |
+
checkpoint= 'saved_models/tsa_models/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth'
|
12 |
+
# noqa: E501
|
13 |
+
)
|
14 |
+
detector['type'] = 'DeticMasa'
|
15 |
+
|
16 |
+
del _base_.model
|
17 |
+
|
18 |
+
model = dict(
|
19 |
+
type='MASA',
|
20 |
+
freeze_detector=True,
|
21 |
+
unified_backbone=True,
|
22 |
+
load_public_dets = True,
|
23 |
+
benchmark='tao',
|
24 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/',
|
25 |
+
data_preprocessor=dict(
|
26 |
+
type='TrackDataPreprocessor',
|
27 |
+
# Image normalization parameters
|
28 |
+
mean=[123.675, 116.28, 103.53],
|
29 |
+
std=[58.395, 57.12, 57.375],
|
30 |
+
bgr_to_rgb=True,
|
31 |
+
# Image padding parameters
|
32 |
+
pad_mask=True, # In instance segmentation, the mask needs to be padded
|
33 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
34 |
+
detector=detector,
|
35 |
+
masa_adapter=[
|
36 |
+
dict(
|
37 |
+
type='FPN',
|
38 |
+
in_channels=[256, 512, 1024],
|
39 |
+
out_channels=256,
|
40 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
41 |
+
num_outs=5),
|
42 |
+
dict(
|
43 |
+
type='DeformFusion',
|
44 |
+
in_channels=256,
|
45 |
+
out_channels=256,
|
46 |
+
num_blocks=3)],
|
47 |
+
rpn_head=dict(
|
48 |
+
type='RPNHead',
|
49 |
+
in_channels=256,
|
50 |
+
feat_channels=256,
|
51 |
+
anchor_generator=dict(
|
52 |
+
type='AnchorGenerator',
|
53 |
+
scales=[8],
|
54 |
+
ratios=[0.5, 1.0, 2.0],
|
55 |
+
strides=[8, 16, 32, 64, 128]),
|
56 |
+
bbox_coder=dict(
|
57 |
+
type='DeltaXYWHBBoxCoder',
|
58 |
+
target_means=[.0, .0, .0, .0],
|
59 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
60 |
+
loss_cls=dict(
|
61 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
62 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
63 |
+
),
|
64 |
+
roi_head=dict(
|
65 |
+
type='StandardRoIHead',
|
66 |
+
bbox_roi_extractor=dict(
|
67 |
+
type='SingleRoIExtractor',
|
68 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
69 |
+
out_channels=256,
|
70 |
+
featmap_strides=[8, 16, 32]),
|
71 |
+
bbox_head=dict(
|
72 |
+
type='Shared2FCBBoxHead',
|
73 |
+
in_channels=256,
|
74 |
+
fc_out_channels=1024,
|
75 |
+
roi_feat_size=7,
|
76 |
+
num_classes=1,
|
77 |
+
bbox_coder=dict(
|
78 |
+
type='DeltaXYWHBBoxCoder',
|
79 |
+
target_means=[0., 0., 0., 0.],
|
80 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
81 |
+
reg_class_agnostic=True,
|
82 |
+
loss_cls=dict(
|
83 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
84 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
85 |
+
# model training and testing settings
|
86 |
+
train_cfg=dict(
|
87 |
+
rpn=dict(
|
88 |
+
assigner=dict(
|
89 |
+
type='MaxIoUAssigner',
|
90 |
+
pos_iou_thr=0.7,
|
91 |
+
neg_iou_thr=0.3,
|
92 |
+
min_pos_iou=0.3,
|
93 |
+
match_low_quality=True,
|
94 |
+
ignore_iof_thr=-1),
|
95 |
+
sampler=dict(
|
96 |
+
type='RandomSampler',
|
97 |
+
num=256,
|
98 |
+
pos_fraction=0.5,
|
99 |
+
neg_pos_ub=-1,
|
100 |
+
add_gt_as_proposals=False),
|
101 |
+
allowed_border=-1,
|
102 |
+
pos_weight=-1,
|
103 |
+
debug=False),
|
104 |
+
rpn_proposal=dict(
|
105 |
+
nms_pre=2000,
|
106 |
+
max_per_img=1000,
|
107 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
108 |
+
min_bbox_size=0),
|
109 |
+
rcnn=dict(
|
110 |
+
assigner=dict(
|
111 |
+
type='MaxIoUAssigner',
|
112 |
+
pos_iou_thr=0.5,
|
113 |
+
neg_iou_thr=0.5,
|
114 |
+
min_pos_iou=0.5,
|
115 |
+
match_low_quality=False,
|
116 |
+
ignore_iof_thr=-1),
|
117 |
+
sampler=dict(
|
118 |
+
type='RandomSampler',
|
119 |
+
num=512,
|
120 |
+
pos_fraction=0.25,
|
121 |
+
neg_pos_ub=-1,
|
122 |
+
add_gt_as_proposals=True),
|
123 |
+
pos_weight=-1,
|
124 |
+
debug=False)),
|
125 |
+
test_cfg=dict(
|
126 |
+
rpn=dict(
|
127 |
+
nms_pre=1000,
|
128 |
+
max_per_img=1000,
|
129 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
130 |
+
min_bbox_size=0),
|
131 |
+
rcnn=dict(
|
132 |
+
score_thr=0.05,
|
133 |
+
nms=dict(type='nms', iou_threshold=0.5),
|
134 |
+
max_per_img=100)
|
135 |
+
# soft-nms is also supported for rcnn testing
|
136 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
137 |
+
),
|
138 |
+
track_head=dict(
|
139 |
+
type='MasaTrackHead',
|
140 |
+
roi_extractor=dict(
|
141 |
+
type='SingleRoIExtractor',
|
142 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
143 |
+
out_channels=256,
|
144 |
+
featmap_strides=[8, 16, 32]),
|
145 |
+
embed_head=dict(
|
146 |
+
type='QuasiDenseEmbedHead',
|
147 |
+
num_convs=4,
|
148 |
+
num_fcs=1,
|
149 |
+
embed_channels=256,
|
150 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
151 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
152 |
+
loss_track_aux=dict(
|
153 |
+
type='MarginL2Loss',
|
154 |
+
neg_pos_ub=3,
|
155 |
+
pos_margin=0,
|
156 |
+
neg_margin=0.1,
|
157 |
+
hard_mining=True,
|
158 |
+
loss_weight=1.0)),
|
159 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
160 |
+
train_cfg=dict(
|
161 |
+
assigner=dict(
|
162 |
+
type='MaxIoUAssigner',
|
163 |
+
pos_iou_thr=0.7,
|
164 |
+
neg_iou_thr=0.5,
|
165 |
+
min_pos_iou=0.5,
|
166 |
+
match_low_quality=False,
|
167 |
+
ignore_iof_thr=-1),
|
168 |
+
sampler=dict(
|
169 |
+
type='CombinedSampler',
|
170 |
+
num=512,
|
171 |
+
pos_fraction=0.8,
|
172 |
+
neg_pos_ub=3,
|
173 |
+
add_gt_as_proposals=True,
|
174 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
175 |
+
neg_sampler=dict(type='RandomSampler')))),
|
176 |
+
tracker=dict(
|
177 |
+
type='MasaTaoTracker',
|
178 |
+
init_score_thr=0.0001,
|
179 |
+
obj_score_thr=0.0001,
|
180 |
+
match_score_thr=0.5,
|
181 |
+
memo_tracklet_frames=10,
|
182 |
+
memo_momentum=0.8,
|
183 |
+
with_cats=False,
|
184 |
+
max_distance=-1,
|
185 |
+
fps=1,
|
186 |
+
)
|
187 |
+
)
|
188 |
+
|
189 |
+
train_dataloader = None
|
190 |
+
train_cfg = None
|
191 |
+
val_cfg = dict(type='ValLoop')
|
192 |
+
test_cfg = dict(type='TestLoop')
|
193 |
+
|
194 |
+
default_hooks = dict(
|
195 |
+
logger=dict(type='LoggerHook', interval=50),
|
196 |
+
visualization=dict(type='TrackVisualizationHook', draw=False))
|
197 |
+
|
198 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
199 |
+
visualizer = dict(
|
200 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
201 |
+
|
202 |
+
# custom hooks
|
203 |
+
custom_hooks = [
|
204 |
+
# Synchronize model buffers such as running_mean and running_var in BN
|
205 |
+
# at the end of each epoch
|
206 |
+
dict(type='SyncBuffersHook')
|
207 |
+
]
|
208 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
209 |
+
val_dataloader = dict(
|
210 |
+
dataset=dict(
|
211 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json'
|
212 |
+
)
|
213 |
+
)
|
214 |
+
test_dataloader = val_dataloader
|
215 |
+
val_evaluator = dict(
|
216 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
|
217 |
+
outfile_prefix='results/masa_results/masa-detic-release-test',
|
218 |
+
)
|
219 |
+
test_evaluator = val_evaluator
|
configs/masa-gdino/bdd_test/masa_gdino_bdd_mot_test.py
ADDED
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
|
3 |
+
'../../datasets/bdd/bdd_dataset.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
# detector.backbone.update(dict(out_indices=(1, 2, 3)))
|
9 |
+
detector.pop('data_preprocessor')
|
10 |
+
detector['init_cfg'] = dict(
|
11 |
+
type='Pretrained',
|
12 |
+
checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
|
13 |
+
# noqa: E501
|
14 |
+
)
|
15 |
+
detector['type'] = 'GroundingDINOMasa'
|
16 |
+
|
17 |
+
del _base_.model
|
18 |
+
|
19 |
+
model = dict(
|
20 |
+
type='MASA',
|
21 |
+
freeze_detector=True,
|
22 |
+
unified_backbone=True,
|
23 |
+
load_public_dets = True,
|
24 |
+
benchmark = 'bdd',
|
25 |
+
public_det_path = 'results/public_dets/bdd_mot_yolox_dets/',
|
26 |
+
data_preprocessor=dict(
|
27 |
+
type='TrackDataPreprocessor',
|
28 |
+
# Image normalization parameters
|
29 |
+
mean=[123.675, 116.28, 103.53],
|
30 |
+
std=[58.395, 57.12, 57.375],
|
31 |
+
bgr_to_rgb=True,
|
32 |
+
# Image padding parameters
|
33 |
+
pad_mask=False, # Mask padding is disabled in this config
|
34 |
+
pad_size_divisor=1024, # Padding the image to multiples of 1024
|
35 |
+
),
|
36 |
+
detector=detector,
|
37 |
+
masa_adapter=[
|
38 |
+
dict(
|
39 |
+
type='FPN',
|
40 |
+
in_channels=[256, 512, 1024],
|
41 |
+
out_channels=256,
|
42 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
43 |
+
num_outs=5),
|
44 |
+
dict(
|
45 |
+
type='DeformFusion',
|
46 |
+
in_channels=256,
|
47 |
+
out_channels=256,
|
48 |
+
num_blocks=3)],
|
49 |
+
rpn_head=dict(
|
50 |
+
type='RPNHead',
|
51 |
+
in_channels=256,
|
52 |
+
feat_channels=256,
|
53 |
+
anchor_generator=dict(
|
54 |
+
type='AnchorGenerator',
|
55 |
+
scales=[8],
|
56 |
+
ratios=[0.5, 1.0, 2.0],
|
57 |
+
strides=[8, 16, 32, 64, 128]),
|
58 |
+
bbox_coder=dict(
|
59 |
+
type='DeltaXYWHBBoxCoder',
|
60 |
+
target_means=[.0, .0, .0, .0],
|
61 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
62 |
+
loss_cls=dict(
|
63 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
64 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
65 |
+
),
|
66 |
+
roi_head=dict(
|
67 |
+
type='StandardRoIHead',
|
68 |
+
bbox_roi_extractor=dict(
|
69 |
+
type='SingleRoIExtractor',
|
70 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
71 |
+
out_channels=256,
|
72 |
+
featmap_strides=[8, 16, 32]),
|
73 |
+
bbox_head=dict(
|
74 |
+
type='Shared2FCBBoxHead',
|
75 |
+
in_channels=256,
|
76 |
+
fc_out_channels=1024,
|
77 |
+
roi_feat_size=7,
|
78 |
+
num_classes=1,
|
79 |
+
bbox_coder=dict(
|
80 |
+
type='DeltaXYWHBBoxCoder',
|
81 |
+
target_means=[0., 0., 0., 0.],
|
82 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
83 |
+
reg_class_agnostic=True,
|
84 |
+
loss_cls=dict(
|
85 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
86 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
87 |
+
# model training and testing settings
|
88 |
+
train_cfg=dict(
|
89 |
+
rpn=dict(
|
90 |
+
assigner=dict(
|
91 |
+
type='MaxIoUAssigner',
|
92 |
+
pos_iou_thr=0.7,
|
93 |
+
neg_iou_thr=0.3,
|
94 |
+
min_pos_iou=0.3,
|
95 |
+
match_low_quality=True,
|
96 |
+
ignore_iof_thr=-1),
|
97 |
+
sampler=dict(
|
98 |
+
type='RandomSampler',
|
99 |
+
num=256,
|
100 |
+
pos_fraction=0.5,
|
101 |
+
neg_pos_ub=-1,
|
102 |
+
add_gt_as_proposals=False),
|
103 |
+
allowed_border=-1,
|
104 |
+
pos_weight=-1,
|
105 |
+
debug=False),
|
106 |
+
rpn_proposal=dict(
|
107 |
+
nms_pre=2000,
|
108 |
+
max_per_img=1000,
|
109 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
110 |
+
min_bbox_size=0),
|
111 |
+
rcnn=dict(
|
112 |
+
assigner=dict(
|
113 |
+
type='MaxIoUAssigner',
|
114 |
+
pos_iou_thr=0.5,
|
115 |
+
neg_iou_thr=0.5,
|
116 |
+
min_pos_iou=0.5,
|
117 |
+
match_low_quality=False,
|
118 |
+
ignore_iof_thr=-1),
|
119 |
+
sampler=dict(
|
120 |
+
type='RandomSampler',
|
121 |
+
num=512,
|
122 |
+
pos_fraction=0.25,
|
123 |
+
neg_pos_ub=-1,
|
124 |
+
add_gt_as_proposals=True),
|
125 |
+
pos_weight=-1,
|
126 |
+
debug=False)),
|
127 |
+
test_cfg=dict(
|
128 |
+
rpn=dict(
|
129 |
+
nms_pre=1000,
|
130 |
+
max_per_img=1000,
|
131 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
132 |
+
min_bbox_size=0),
|
133 |
+
rcnn=dict(
|
134 |
+
score_thr=0.02,
|
135 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
136 |
+
nms=dict(type='nms',
|
137 |
+
iou_threshold=0.5,
|
138 |
+
class_agnostic=True,
|
139 |
+
split_thr=100000),
|
140 |
+
max_per_img=50,
|
141 |
+
mask_thr_binary=0.5)
|
142 |
+
# soft-nms is also supported for rcnn testing
|
143 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
144 |
+
),
|
145 |
+
track_head=dict(
|
146 |
+
type='MasaTrackHead',
|
147 |
+
roi_extractor=dict(
|
148 |
+
type='SingleRoIExtractor',
|
149 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
150 |
+
out_channels=256,
|
151 |
+
featmap_strides=[8, 16, 32]),
|
152 |
+
embed_head=dict(
|
153 |
+
type='QuasiDenseEmbedHead',
|
154 |
+
num_convs=4,
|
155 |
+
num_fcs=1,
|
156 |
+
embed_channels=256,
|
157 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
158 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
159 |
+
loss_track_aux=dict(
|
160 |
+
type='MarginL2Loss',
|
161 |
+
neg_pos_ub=3,
|
162 |
+
pos_margin=0,
|
163 |
+
neg_margin=0.1,
|
164 |
+
hard_mining=True,
|
165 |
+
loss_weight=1.0)),
|
166 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
167 |
+
train_cfg=dict(
|
168 |
+
assigner=dict(
|
169 |
+
type='MaxIoUAssigner',
|
170 |
+
pos_iou_thr=0.7,
|
171 |
+
neg_iou_thr=0.3,
|
172 |
+
min_pos_iou=0.5,
|
173 |
+
match_low_quality=False,
|
174 |
+
ignore_iof_thr=-1),
|
175 |
+
sampler=dict(
|
176 |
+
type='CombinedSampler',
|
177 |
+
num=512,
|
178 |
+
pos_fraction=0.5,
|
179 |
+
neg_pos_ub=3,
|
180 |
+
add_gt_as_proposals=True,
|
181 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
182 |
+
neg_sampler=dict(type='RandomSampler')))),
|
183 |
+
tracker=dict(
|
184 |
+
type='MasaBDDTracker',
|
185 |
+
init_score_thr=0.5,
|
186 |
+
obj_score_thr=0.3,
|
187 |
+
match_score_thr=0.6,
|
188 |
+
memo_tracklet_frames=10,
|
189 |
+
memo_backdrop_frames=1,
|
190 |
+
memo_momentum=0.8,
|
191 |
+
nms_conf_thr=0.5,
|
192 |
+
nms_backdrop_iou_thr=0.3,
|
193 |
+
nms_class_iou_thr=0.7,
|
194 |
+
with_cats=False,
|
195 |
+
match_metric='bisoftmax')
|
196 |
+
)
|
197 |
+
|
198 |
+
# runtime settings
|
199 |
+
train_dataloader = None
|
200 |
+
train_cfg = None
|
201 |
+
val_cfg = dict(type='ValLoop')
|
202 |
+
test_cfg = dict(type='TestLoop')
|
203 |
+
|
204 |
+
default_hooks = dict(
|
205 |
+
logger=dict(type='LoggerHook', interval=50),
|
206 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
207 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
208 |
+
)
|
209 |
+
|
210 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
211 |
+
visualizer = dict(
|
212 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
213 |
+
|
214 |
+
val_dataloader = dict(
|
215 |
+
dataset=dict(
|
216 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
217 |
+
)
|
218 |
+
)
|
219 |
+
test_dataloader = val_dataloader
|
220 |
+
val_evaluator = dict(
|
221 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
222 |
+
scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
|
223 |
+
outfile_prefix='results/detic_masa_trained_bdd_demo',
|
224 |
+
metric=['TETA', 'HOTA', 'CLEAR']
|
225 |
+
)
|
226 |
+
test_evaluator = val_evaluator
|
configs/masa-gdino/bdd_test/masa_gdino_bdd_mots_test.py
ADDED
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
|
3 |
+
'../../datasets/bdd/bdd_dataset.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector.pop('data_preprocessor')
|
9 |
+
detector['init_cfg'] = dict(
|
10 |
+
type='Pretrained',
|
11 |
+
checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
|
12 |
+
# noqa: E501
|
13 |
+
)
|
14 |
+
detector['type'] = 'GroundingDINOMasa'
|
15 |
+
|
16 |
+
del _base_.model
|
17 |
+
|
18 |
+
model = dict(
|
19 |
+
type='MASA',
|
20 |
+
freeze_detector=True,
|
21 |
+
unified_backbone=True,
|
22 |
+
load_public_dets = True,
|
23 |
+
with_segm=True,
|
24 |
+
benchmark = 'bdd',
|
25 |
+
public_det_path = 'results/public_dets/bdd_mots_val_uninext_dets/',
|
26 |
+
data_preprocessor=dict(
|
27 |
+
type='TrackDataPreprocessor',
|
28 |
+
# Image normalization parameters
|
29 |
+
mean=[123.675, 116.28, 103.53],
|
30 |
+
std=[58.395, 57.12, 57.375],
|
31 |
+
bgr_to_rgb=True,
|
32 |
+
# Image padding parameters
|
33 |
+
pad_mask=False, # In instance segmentation, the mask needs to be padded
|
34 |
+
pad_size_divisor=1024, # Padding the image to multiples of 1024
|
35 |
+
),
|
36 |
+
detector=detector,
|
37 |
+
masa_adapter=[
|
38 |
+
dict(
|
39 |
+
type='FPN',
|
40 |
+
in_channels=[256, 512, 1024],
|
41 |
+
out_channels=256,
|
42 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
43 |
+
num_outs=5),
|
44 |
+
dict(
|
45 |
+
type='DeformFusion',
|
46 |
+
in_channels=256,
|
47 |
+
out_channels=256,
|
48 |
+
num_blocks=3)],
|
49 |
+
rpn_head=dict(
|
50 |
+
type='RPNHead',
|
51 |
+
in_channels=256,
|
52 |
+
feat_channels=256,
|
53 |
+
anchor_generator=dict(
|
54 |
+
type='AnchorGenerator',
|
55 |
+
scales=[8],
|
56 |
+
ratios=[0.5, 1.0, 2.0],
|
57 |
+
strides=[8, 16, 32, 64, 128]),
|
58 |
+
bbox_coder=dict(
|
59 |
+
type='DeltaXYWHBBoxCoder',
|
60 |
+
target_means=[.0, .0, .0, .0],
|
61 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
62 |
+
loss_cls=dict(
|
63 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
64 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
65 |
+
),
|
66 |
+
roi_head=dict(
|
67 |
+
type='StandardRoIHead',
|
68 |
+
bbox_roi_extractor=dict(
|
69 |
+
type='SingleRoIExtractor',
|
70 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
71 |
+
out_channels=256,
|
72 |
+
featmap_strides=[8, 16, 32]),
|
73 |
+
bbox_head=dict(
|
74 |
+
type='Shared2FCBBoxHead',
|
75 |
+
in_channels=256,
|
76 |
+
fc_out_channels=1024,
|
77 |
+
roi_feat_size=7,
|
78 |
+
num_classes=1,
|
79 |
+
bbox_coder=dict(
|
80 |
+
type='DeltaXYWHBBoxCoder',
|
81 |
+
target_means=[0., 0., 0., 0.],
|
82 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
83 |
+
reg_class_agnostic=True,
|
84 |
+
loss_cls=dict(
|
85 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
86 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
87 |
+
# model training and testing settings
|
88 |
+
train_cfg=dict(
|
89 |
+
rpn=dict(
|
90 |
+
assigner=dict(
|
91 |
+
type='MaxIoUAssigner',
|
92 |
+
pos_iou_thr=0.7,
|
93 |
+
neg_iou_thr=0.3,
|
94 |
+
min_pos_iou=0.3,
|
95 |
+
match_low_quality=True,
|
96 |
+
ignore_iof_thr=-1),
|
97 |
+
sampler=dict(
|
98 |
+
type='RandomSampler',
|
99 |
+
num=256,
|
100 |
+
pos_fraction=0.5,
|
101 |
+
neg_pos_ub=-1,
|
102 |
+
add_gt_as_proposals=False),
|
103 |
+
allowed_border=-1,
|
104 |
+
pos_weight=-1,
|
105 |
+
debug=False),
|
106 |
+
rpn_proposal=dict(
|
107 |
+
nms_pre=2000,
|
108 |
+
max_per_img=1000,
|
109 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
110 |
+
min_bbox_size=0),
|
111 |
+
rcnn=dict(
|
112 |
+
assigner=dict(
|
113 |
+
type='MaxIoUAssigner',
|
114 |
+
pos_iou_thr=0.5,
|
115 |
+
neg_iou_thr=0.5,
|
116 |
+
min_pos_iou=0.5,
|
117 |
+
match_low_quality=False,
|
118 |
+
ignore_iof_thr=-1),
|
119 |
+
sampler=dict(
|
120 |
+
type='RandomSampler',
|
121 |
+
num=512,
|
122 |
+
pos_fraction=0.25,
|
123 |
+
neg_pos_ub=-1,
|
124 |
+
add_gt_as_proposals=True),
|
125 |
+
pos_weight=-1,
|
126 |
+
debug=False)),
|
127 |
+
test_cfg=dict(
|
128 |
+
rpn=dict(
|
129 |
+
nms_pre=1000,
|
130 |
+
max_per_img=1000,
|
131 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
132 |
+
min_bbox_size=0),
|
133 |
+
rcnn=dict(
|
134 |
+
score_thr=0.02,
|
135 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
136 |
+
nms=dict(type='nms',
|
137 |
+
iou_threshold=0.5,
|
138 |
+
class_agnostic=True,
|
139 |
+
split_thr=100000),
|
140 |
+
max_per_img=50,
|
141 |
+
mask_thr_binary=0.5)
|
142 |
+
# soft-nms is also supported for rcnn testing
|
143 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
144 |
+
),
|
145 |
+
track_head=dict(
|
146 |
+
type='MasaTrackHead',
|
147 |
+
roi_extractor=dict(
|
148 |
+
type='SingleRoIExtractor',
|
149 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
150 |
+
out_channels=256,
|
151 |
+
featmap_strides=[8, 16, 32]),
|
152 |
+
embed_head=dict(
|
153 |
+
type='QuasiDenseEmbedHead',
|
154 |
+
num_convs=4,
|
155 |
+
num_fcs=1,
|
156 |
+
embed_channels=256,
|
157 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
158 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
159 |
+
loss_track_aux=dict(
|
160 |
+
type='MarginL2Loss',
|
161 |
+
neg_pos_ub=3,
|
162 |
+
pos_margin=0,
|
163 |
+
neg_margin=0.1,
|
164 |
+
hard_mining=True,
|
165 |
+
loss_weight=1.0)),
|
166 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
167 |
+
train_cfg=dict(
|
168 |
+
assigner=dict(
|
169 |
+
type='MaxIoUAssigner',
|
170 |
+
pos_iou_thr=0.7,
|
171 |
+
neg_iou_thr=0.3,
|
172 |
+
min_pos_iou=0.5,
|
173 |
+
match_low_quality=False,
|
174 |
+
ignore_iof_thr=-1),
|
175 |
+
sampler=dict(
|
176 |
+
type='CombinedSampler',
|
177 |
+
num=512,
|
178 |
+
pos_fraction=0.5,
|
179 |
+
neg_pos_ub=3,
|
180 |
+
add_gt_as_proposals=True,
|
181 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
182 |
+
neg_sampler=dict(type='RandomSampler')))),
|
183 |
+
tracker=dict(
|
184 |
+
type='MasaBDDTracker',
|
185 |
+
init_score_thr=0.5,
|
186 |
+
obj_score_thr=0.3,
|
187 |
+
match_score_thr=0.6,
|
188 |
+
memo_tracklet_frames=10,
|
189 |
+
memo_backdrop_frames=1,
|
190 |
+
memo_momentum=0.8,
|
191 |
+
nms_conf_thr=0.5,
|
192 |
+
nms_backdrop_iou_thr=0.3,
|
193 |
+
nms_class_iou_thr=0.7,
|
194 |
+
with_cats=False,
|
195 |
+
match_metric='bisoftmax')
|
196 |
+
)
|
197 |
+
|
198 |
+
# runtime settings
|
199 |
+
train_dataloader = None
|
200 |
+
train_cfg = None
|
201 |
+
val_cfg = dict(type='ValLoop')
|
202 |
+
test_cfg = dict(type='TestLoop')
|
203 |
+
|
204 |
+
default_hooks = dict(
|
205 |
+
logger=dict(type='LoggerHook', interval=50),
|
206 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
207 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
208 |
+
)
|
209 |
+
|
210 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
211 |
+
visualizer = dict(
|
212 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
213 |
+
|
214 |
+
val_dataloader = dict(
|
215 |
+
dataset=dict(
|
216 |
+
ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
|
217 |
+
)
|
218 |
+
)
|
219 |
+
test_dataloader = val_dataloader
|
220 |
+
val_evaluator = dict(
|
221 |
+
ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
|
222 |
+
scalabel_gt='data/bdd/annotations/scalabel_gt/seg_track_20/val/',
|
223 |
+
outfile_prefix='results/masa_results/masa-groundingdino-release-bdd-mots-test',
|
224 |
+
metric=['TETA', 'HOTA', 'CLEAR'],
|
225 |
+
with_mask=True,
|
226 |
+
)
|
227 |
+
test_evaluator = val_evaluator
|
configs/masa-gdino/masa_gdino_swinb_inference.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
|
3 |
+
'../default_runtime.py'
|
4 |
+
]
|
5 |
+
default_scope = 'mmdet'
|
6 |
+
detector = _base_.model
|
7 |
+
detector.pop('data_preprocessor')
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'GroundingDINOMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = False,
|
22 |
+
data_preprocessor=dict(
|
23 |
+
type='TrackDataPreprocessor',
|
24 |
+
# Image normalization parameters
|
25 |
+
mean=[123.675, 116.28, 103.53],
|
26 |
+
std=[58.395, 57.12, 57.375],
|
27 |
+
bgr_to_rgb=True,
|
28 |
+
# Image padding parameters
|
29 |
+
pad_mask=False, # In instance segmentation, the mask needs to be padded
|
30 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
31 |
+
detector=detector,
|
32 |
+
masa_adapter=[
|
33 |
+
dict(
|
34 |
+
type='FPN',
|
35 |
+
in_channels=[256, 512, 1024],
|
36 |
+
out_channels=256,
|
37 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
38 |
+
num_outs=5),
|
39 |
+
dict(
|
40 |
+
type='DeformFusion',
|
41 |
+
in_channels=256,
|
42 |
+
out_channels=256,
|
43 |
+
num_blocks=3)],
|
44 |
+
rpn_head=dict(
|
45 |
+
type='RPNHead',
|
46 |
+
in_channels=256,
|
47 |
+
feat_channels=256,
|
48 |
+
anchor_generator=dict(
|
49 |
+
type='AnchorGenerator',
|
50 |
+
scales=[8],
|
51 |
+
ratios=[0.5, 1.0, 2.0],
|
52 |
+
strides=[8, 16, 32, 64, 128]),
|
53 |
+
bbox_coder=dict(
|
54 |
+
type='DeltaXYWHBBoxCoder',
|
55 |
+
target_means=[.0, .0, .0, .0],
|
56 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
57 |
+
loss_cls=dict(
|
58 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
59 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
60 |
+
),
|
61 |
+
roi_head=dict(
|
62 |
+
type='StandardRoIHead',
|
63 |
+
bbox_roi_extractor=dict(
|
64 |
+
type='SingleRoIExtractor',
|
65 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
66 |
+
out_channels=256,
|
67 |
+
featmap_strides=[8, 16, 32]),
|
68 |
+
bbox_head=dict(
|
69 |
+
type='Shared2FCBBoxHead',
|
70 |
+
in_channels=256,
|
71 |
+
fc_out_channels=1024,
|
72 |
+
roi_feat_size=7,
|
73 |
+
num_classes=1,
|
74 |
+
bbox_coder=dict(
|
75 |
+
type='DeltaXYWHBBoxCoder',
|
76 |
+
target_means=[0., 0., 0., 0.],
|
77 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
78 |
+
reg_class_agnostic=True,
|
79 |
+
loss_cls=dict(
|
80 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
81 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
82 |
+
# model training and testing settings
|
83 |
+
train_cfg=dict(
|
84 |
+
rpn=dict(
|
85 |
+
assigner=dict(
|
86 |
+
type='MaxIoUAssigner',
|
87 |
+
pos_iou_thr=0.7,
|
88 |
+
neg_iou_thr=0.3,
|
89 |
+
min_pos_iou=0.3,
|
90 |
+
match_low_quality=True,
|
91 |
+
ignore_iof_thr=-1),
|
92 |
+
sampler=dict(
|
93 |
+
type='RandomSampler',
|
94 |
+
num=256,
|
95 |
+
pos_fraction=0.5,
|
96 |
+
neg_pos_ub=-1,
|
97 |
+
add_gt_as_proposals=False),
|
98 |
+
allowed_border=-1,
|
99 |
+
pos_weight=-1,
|
100 |
+
debug=False),
|
101 |
+
rpn_proposal=dict(
|
102 |
+
nms_pre=2000,
|
103 |
+
max_per_img=1000,
|
104 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
105 |
+
min_bbox_size=0),
|
106 |
+
rcnn=dict(
|
107 |
+
assigner=dict(
|
108 |
+
type='MaxIoUAssigner',
|
109 |
+
pos_iou_thr=0.5,
|
110 |
+
neg_iou_thr=0.5,
|
111 |
+
min_pos_iou=0.5,
|
112 |
+
match_low_quality=False,
|
113 |
+
ignore_iof_thr=-1),
|
114 |
+
sampler=dict(
|
115 |
+
type='RandomSampler',
|
116 |
+
num=512,
|
117 |
+
pos_fraction=0.25,
|
118 |
+
neg_pos_ub=-1,
|
119 |
+
add_gt_as_proposals=True),
|
120 |
+
pos_weight=-1,
|
121 |
+
debug=False)),
|
122 |
+
test_cfg=dict(
|
123 |
+
rpn=dict(
|
124 |
+
nms_pre=1000,
|
125 |
+
max_per_img=1000,
|
126 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
127 |
+
min_bbox_size=0),
|
128 |
+
rcnn=dict(
|
129 |
+
score_thr=0.02,
|
130 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
131 |
+
nms=dict(type='nms',
|
132 |
+
iou_threshold=0.5,
|
133 |
+
class_agnostic=True,
|
134 |
+
split_thr=100000),
|
135 |
+
max_per_img=50,
|
136 |
+
mask_thr_binary=0.5)
|
137 |
+
# soft-nms is also supported for rcnn testing
|
138 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
139 |
+
),
|
140 |
+
track_head=dict(
|
141 |
+
type='MasaTrackHead',
|
142 |
+
roi_extractor=dict(
|
143 |
+
type='SingleRoIExtractor',
|
144 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
145 |
+
out_channels=256,
|
146 |
+
featmap_strides=[8, 16, 32]),
|
147 |
+
embed_head=dict(
|
148 |
+
type='QuasiDenseEmbedHead',
|
149 |
+
num_convs=4,
|
150 |
+
num_fcs=1,
|
151 |
+
embed_channels=256,
|
152 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
153 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
154 |
+
loss_track_aux=dict(
|
155 |
+
type='MarginL2Loss',
|
156 |
+
neg_pos_ub=3,
|
157 |
+
pos_margin=0,
|
158 |
+
neg_margin=0.1,
|
159 |
+
hard_mining=True,
|
160 |
+
loss_weight=1.0)),
|
161 |
+
train_cfg=dict(
|
162 |
+
assigner=dict(
|
163 |
+
type='MaxIoUAssigner',
|
164 |
+
pos_iou_thr=0.7,
|
165 |
+
neg_iou_thr=0.3,
|
166 |
+
min_pos_iou=0.5,
|
167 |
+
match_low_quality=False,
|
168 |
+
ignore_iof_thr=-1),
|
169 |
+
sampler=dict(
|
170 |
+
type='CombinedSampler',
|
171 |
+
num=512,
|
172 |
+
pos_fraction=0.5,
|
173 |
+
neg_pos_ub=3,
|
174 |
+
add_gt_as_proposals=True,
|
175 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
176 |
+
neg_sampler=dict(type='RandomSampler')))),
|
177 |
+
tracker=dict(
|
178 |
+
type='MasaTaoTracker',
|
179 |
+
init_score_thr=0.1,
|
180 |
+
obj_score_thr=0.01,
|
181 |
+
match_score_thr=0.5,
|
182 |
+
memo_tracklet_frames=10,
|
183 |
+
memo_momentum=0.8,
|
184 |
+
with_cats=False,
|
185 |
+
max_distance=100,
|
186 |
+
fps=30,
|
187 |
+
)
|
188 |
+
)
|
189 |
+
|
190 |
+
inference_pipeline = [
|
191 |
+
dict(
|
192 |
+
type='TransformBroadcaster',
|
193 |
+
transforms=[
|
194 |
+
dict(
|
195 |
+
type='Resize',
|
196 |
+
scale=(1333, 800),
|
197 |
+
keep_ratio=True),
|
198 |
+
]),
|
199 |
+
dict(type='PackTrackInputs')
|
200 |
+
]
|
201 |
+
|
202 |
+
# runtime settings
|
203 |
+
train_cfg = None
|
204 |
+
val_cfg = dict(type='ValLoop')
|
205 |
+
test_cfg = dict(type='TestLoop')
|
206 |
+
|
207 |
+
default_hooks = dict(
|
208 |
+
logger=dict(type='LoggerHook', interval=50),
|
209 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
210 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
211 |
+
)
|
212 |
+
|
213 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
214 |
+
visualizer = dict(
|
215 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
216 |
+
|
configs/masa-gdino/masa_gdino_swinb_plug_and_play.py
ADDED
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
|
3 |
+
'../default_runtime.py'
|
4 |
+
]
|
5 |
+
default_scope = 'mmdet'
|
6 |
+
detector = _base_.model
|
7 |
+
detector.pop('data_preprocessor')
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'GroundingDINOMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = False,
|
22 |
+
given_dets = True,
|
23 |
+
data_preprocessor=dict(
|
24 |
+
type='TrackDataPreprocessor',
|
25 |
+
# Image normalization parameters
|
26 |
+
mean=[123.675, 116.28, 103.53],
|
27 |
+
std=[58.395, 57.12, 57.375],
|
28 |
+
bgr_to_rgb=True,
|
29 |
+
# Image padding parameters
|
30 |
+
pad_mask=False, # In instance segmentation, the mask needs to be padded
|
31 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
32 |
+
detector=detector,
|
33 |
+
masa_adapter=[
|
34 |
+
dict(
|
35 |
+
type='FPN',
|
36 |
+
in_channels=[256, 512, 1024],
|
37 |
+
out_channels=256,
|
38 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
39 |
+
num_outs=5),
|
40 |
+
dict(
|
41 |
+
type='DeformFusion',
|
42 |
+
in_channels=256,
|
43 |
+
out_channels=256,
|
44 |
+
num_blocks=3)],
|
45 |
+
rpn_head=dict(
|
46 |
+
type='RPNHead',
|
47 |
+
in_channels=256,
|
48 |
+
feat_channels=256,
|
49 |
+
anchor_generator=dict(
|
50 |
+
type='AnchorGenerator',
|
51 |
+
scales=[8],
|
52 |
+
ratios=[0.5, 1.0, 2.0],
|
53 |
+
strides=[8, 16, 32, 64, 128]),
|
54 |
+
bbox_coder=dict(
|
55 |
+
type='DeltaXYWHBBoxCoder',
|
56 |
+
target_means=[.0, .0, .0, .0],
|
57 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
58 |
+
loss_cls=dict(
|
59 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
60 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
61 |
+
),
|
62 |
+
roi_head=dict(
|
63 |
+
type='StandardRoIHead',
|
64 |
+
bbox_roi_extractor=dict(
|
65 |
+
type='SingleRoIExtractor',
|
66 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
67 |
+
out_channels=256,
|
68 |
+
featmap_strides=[8, 16, 32]),
|
69 |
+
bbox_head=dict(
|
70 |
+
type='Shared2FCBBoxHead',
|
71 |
+
in_channels=256,
|
72 |
+
fc_out_channels=1024,
|
73 |
+
roi_feat_size=7,
|
74 |
+
num_classes=1,
|
75 |
+
bbox_coder=dict(
|
76 |
+
type='DeltaXYWHBBoxCoder',
|
77 |
+
target_means=[0., 0., 0., 0.],
|
78 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
79 |
+
reg_class_agnostic=True,
|
80 |
+
loss_cls=dict(
|
81 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
82 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
83 |
+
# model training and testing settings
|
84 |
+
train_cfg=dict(
|
85 |
+
rpn=dict(
|
86 |
+
assigner=dict(
|
87 |
+
type='MaxIoUAssigner',
|
88 |
+
pos_iou_thr=0.7,
|
89 |
+
neg_iou_thr=0.3,
|
90 |
+
min_pos_iou=0.3,
|
91 |
+
match_low_quality=True,
|
92 |
+
ignore_iof_thr=-1),
|
93 |
+
sampler=dict(
|
94 |
+
type='RandomSampler',
|
95 |
+
num=256,
|
96 |
+
pos_fraction=0.5,
|
97 |
+
neg_pos_ub=-1,
|
98 |
+
add_gt_as_proposals=False),
|
99 |
+
allowed_border=-1,
|
100 |
+
pos_weight=-1,
|
101 |
+
debug=False),
|
102 |
+
rpn_proposal=dict(
|
103 |
+
nms_pre=2000,
|
104 |
+
max_per_img=1000,
|
105 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
106 |
+
min_bbox_size=0),
|
107 |
+
rcnn=dict(
|
108 |
+
assigner=dict(
|
109 |
+
type='MaxIoUAssigner',
|
110 |
+
pos_iou_thr=0.5,
|
111 |
+
neg_iou_thr=0.5,
|
112 |
+
min_pos_iou=0.5,
|
113 |
+
match_low_quality=False,
|
114 |
+
ignore_iof_thr=-1),
|
115 |
+
sampler=dict(
|
116 |
+
type='RandomSampler',
|
117 |
+
num=512,
|
118 |
+
pos_fraction=0.25,
|
119 |
+
neg_pos_ub=-1,
|
120 |
+
add_gt_as_proposals=True),
|
121 |
+
pos_weight=-1,
|
122 |
+
debug=False)),
|
123 |
+
test_cfg=dict(
|
124 |
+
rpn=dict(
|
125 |
+
nms_pre=1000,
|
126 |
+
max_per_img=1000,
|
127 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
128 |
+
min_bbox_size=0),
|
129 |
+
rcnn=dict(
|
130 |
+
score_thr=0.02,
|
131 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
132 |
+
nms=dict(type='nms',
|
133 |
+
iou_threshold=0.5,
|
134 |
+
class_agnostic=True,
|
135 |
+
split_thr=100000),
|
136 |
+
max_per_img=50,
|
137 |
+
mask_thr_binary=0.5)
|
138 |
+
# soft-nms is also supported for rcnn testing
|
139 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
140 |
+
),
|
141 |
+
track_head=dict(
|
142 |
+
type='MasaTrackHead',
|
143 |
+
roi_extractor=dict(
|
144 |
+
type='SingleRoIExtractor',
|
145 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
146 |
+
out_channels=256,
|
147 |
+
featmap_strides=[8, 16, 32]),
|
148 |
+
embed_head=dict(
|
149 |
+
type='QuasiDenseEmbedHead',
|
150 |
+
num_convs=4,
|
151 |
+
num_fcs=1,
|
152 |
+
embed_channels=256,
|
153 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
154 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
155 |
+
loss_track_aux=dict(
|
156 |
+
type='MarginL2Loss',
|
157 |
+
neg_pos_ub=3,
|
158 |
+
pos_margin=0,
|
159 |
+
neg_margin=0.1,
|
160 |
+
hard_mining=True,
|
161 |
+
loss_weight=1.0)),
|
162 |
+
train_cfg=dict(
|
163 |
+
assigner=dict(
|
164 |
+
type='MaxIoUAssigner',
|
165 |
+
pos_iou_thr=0.7,
|
166 |
+
neg_iou_thr=0.3,
|
167 |
+
min_pos_iou=0.5,
|
168 |
+
match_low_quality=False,
|
169 |
+
ignore_iof_thr=-1),
|
170 |
+
sampler=dict(
|
171 |
+
type='CombinedSampler',
|
172 |
+
num=512,
|
173 |
+
pos_fraction=0.5,
|
174 |
+
neg_pos_ub=3,
|
175 |
+
add_gt_as_proposals=True,
|
176 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
177 |
+
neg_sampler=dict(type='RandomSampler')))),
|
178 |
+
tracker=dict(
|
179 |
+
type='MasaTaoTracker',
|
180 |
+
init_score_thr=0.1,
|
181 |
+
obj_score_thr=0.01,
|
182 |
+
match_score_thr=0.5,
|
183 |
+
memo_tracklet_frames=10,
|
184 |
+
memo_momentum=0.8,
|
185 |
+
with_cats=False,
|
186 |
+
max_distance=100,
|
187 |
+
fps=30,
|
188 |
+
)
|
189 |
+
)
|
190 |
+
|
191 |
+
inference_pipeline = [
|
192 |
+
dict(
|
193 |
+
type='TransformBroadcaster',
|
194 |
+
transforms=[
|
195 |
+
dict(
|
196 |
+
type='Resize',
|
197 |
+
scale=(1333, 800),
|
198 |
+
keep_ratio=True),
|
199 |
+
]),
|
200 |
+
dict(type='PackTrackInputs')
|
201 |
+
]
|
202 |
+
|
203 |
+
|
204 |
+
# runtime settings
|
205 |
+
train_cfg = None
|
206 |
+
val_cfg = dict(type='ValLoop')
|
207 |
+
test_cfg = dict(type='TestLoop')
|
208 |
+
|
209 |
+
default_hooks = dict(
|
210 |
+
logger=dict(type='LoggerHook', interval=50),
|
211 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
212 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
213 |
+
)
|
214 |
+
|
215 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
216 |
+
visualizer = dict(
|
217 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
218 |
+
|
configs/masa-gdino/open_vocabulary_mot_test/masa_gdino_swinb_open_vocabulary_test.py
ADDED
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v1.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
# detector.backbone.update(dict(out_indices=(1, 2, 3)))
|
9 |
+
detector.pop('data_preprocessor')
|
10 |
+
detector['init_cfg'] = dict(
|
11 |
+
type='Pretrained',
|
12 |
+
checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
|
13 |
+
# noqa: E501
|
14 |
+
)
|
15 |
+
detector['type'] = 'GroundingDINOMasa'
|
16 |
+
|
17 |
+
del _base_.model
|
18 |
+
|
19 |
+
model = dict(
|
20 |
+
type='MASA',
|
21 |
+
freeze_detector=True,
|
22 |
+
unified_backbone=True,
|
23 |
+
load_public_dets = True,
|
24 |
+
benchmark = 'tao',
|
25 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
|
26 |
+
data_preprocessor=dict(
|
27 |
+
type='TrackDataPreprocessor',
|
28 |
+
# Image normalization parameters
|
29 |
+
mean=[123.675, 116.28, 103.53],
|
30 |
+
std=[58.395, 57.12, 57.375],
|
31 |
+
bgr_to_rgb=True,
|
32 |
+
# Image padding parameters
|
33 |
+
pad_mask=False, # In instance segmentation, the mask needs to be padded
|
34 |
+
pad_size_divisor=1024, # Padding the image to multiples of 1024
|
35 |
+
),
|
36 |
+
detector=detector,
|
37 |
+
masa_adapter=[
|
38 |
+
dict(
|
39 |
+
type='FPN',
|
40 |
+
in_channels=[256, 512, 1024],
|
41 |
+
out_channels=256,
|
42 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
43 |
+
num_outs=5),
|
44 |
+
dict(
|
45 |
+
type='DeformFusion',
|
46 |
+
in_channels=256,
|
47 |
+
out_channels=256,
|
48 |
+
num_blocks=3)],
|
49 |
+
rpn_head=dict(
|
50 |
+
type='RPNHead',
|
51 |
+
in_channels=256,
|
52 |
+
feat_channels=256,
|
53 |
+
anchor_generator=dict(
|
54 |
+
type='AnchorGenerator',
|
55 |
+
scales=[8],
|
56 |
+
ratios=[0.5, 1.0, 2.0],
|
57 |
+
strides=[8, 16, 32, 64, 128]),
|
58 |
+
bbox_coder=dict(
|
59 |
+
type='DeltaXYWHBBoxCoder',
|
60 |
+
target_means=[.0, .0, .0, .0],
|
61 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
62 |
+
loss_cls=dict(
|
63 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
64 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
65 |
+
),
|
66 |
+
roi_head=dict(
|
67 |
+
type='StandardRoIHead',
|
68 |
+
bbox_roi_extractor=dict(
|
69 |
+
type='SingleRoIExtractor',
|
70 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
71 |
+
out_channels=256,
|
72 |
+
featmap_strides=[8, 16, 32]),
|
73 |
+
bbox_head=dict(
|
74 |
+
type='Shared2FCBBoxHead',
|
75 |
+
in_channels=256,
|
76 |
+
fc_out_channels=1024,
|
77 |
+
roi_feat_size=7,
|
78 |
+
num_classes=1,
|
79 |
+
bbox_coder=dict(
|
80 |
+
type='DeltaXYWHBBoxCoder',
|
81 |
+
target_means=[0., 0., 0., 0.],
|
82 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
83 |
+
reg_class_agnostic=True,
|
84 |
+
loss_cls=dict(
|
85 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
86 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
87 |
+
# model training and testing settings
|
88 |
+
train_cfg=dict(
|
89 |
+
rpn=dict(
|
90 |
+
assigner=dict(
|
91 |
+
type='MaxIoUAssigner',
|
92 |
+
pos_iou_thr=0.7,
|
93 |
+
neg_iou_thr=0.3,
|
94 |
+
min_pos_iou=0.3,
|
95 |
+
match_low_quality=True,
|
96 |
+
ignore_iof_thr=-1),
|
97 |
+
sampler=dict(
|
98 |
+
type='RandomSampler',
|
99 |
+
num=256,
|
100 |
+
pos_fraction=0.5,
|
101 |
+
neg_pos_ub=-1,
|
102 |
+
add_gt_as_proposals=False),
|
103 |
+
allowed_border=-1,
|
104 |
+
pos_weight=-1,
|
105 |
+
debug=False),
|
106 |
+
rpn_proposal=dict(
|
107 |
+
nms_pre=2000,
|
108 |
+
max_per_img=1000,
|
109 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
110 |
+
min_bbox_size=0),
|
111 |
+
rcnn=dict(
|
112 |
+
assigner=dict(
|
113 |
+
type='MaxIoUAssigner',
|
114 |
+
pos_iou_thr=0.5,
|
115 |
+
neg_iou_thr=0.5,
|
116 |
+
min_pos_iou=0.5,
|
117 |
+
match_low_quality=False,
|
118 |
+
ignore_iof_thr=-1),
|
119 |
+
sampler=dict(
|
120 |
+
type='RandomSampler',
|
121 |
+
num=512,
|
122 |
+
pos_fraction=0.25,
|
123 |
+
neg_pos_ub=-1,
|
124 |
+
add_gt_as_proposals=True),
|
125 |
+
pos_weight=-1,
|
126 |
+
debug=False)),
|
127 |
+
test_cfg=dict(
|
128 |
+
rpn=dict(
|
129 |
+
nms_pre=1000,
|
130 |
+
max_per_img=1000,
|
131 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
132 |
+
min_bbox_size=0),
|
133 |
+
rcnn=dict(
|
134 |
+
score_thr=0.02,
|
135 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
136 |
+
nms=dict(type='nms',
|
137 |
+
iou_threshold=0.5,
|
138 |
+
class_agnostic=True,
|
139 |
+
split_thr=100000),
|
140 |
+
max_per_img=50,
|
141 |
+
mask_thr_binary=0.5)
|
142 |
+
# soft-nms is also supported for rcnn testing
|
143 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
144 |
+
),
|
145 |
+
track_head=dict(
|
146 |
+
type='MasaTrackHead',
|
147 |
+
roi_extractor=dict(
|
148 |
+
type='SingleRoIExtractor',
|
149 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
150 |
+
out_channels=256,
|
151 |
+
featmap_strides=[8, 16, 32]),
|
152 |
+
embed_head=dict(
|
153 |
+
type='QuasiDenseEmbedHead',
|
154 |
+
num_convs=4,
|
155 |
+
num_fcs=1,
|
156 |
+
embed_channels=256,
|
157 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
158 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
159 |
+
loss_track_aux=dict(
|
160 |
+
type='MarginL2Loss',
|
161 |
+
neg_pos_ub=3,
|
162 |
+
pos_margin=0,
|
163 |
+
neg_margin=0.1,
|
164 |
+
hard_mining=True,
|
165 |
+
loss_weight=1.0)),
|
166 |
+
train_cfg=dict(
|
167 |
+
assigner=dict(
|
168 |
+
type='MaxIoUAssigner',
|
169 |
+
pos_iou_thr=0.7,
|
170 |
+
neg_iou_thr=0.3,
|
171 |
+
min_pos_iou=0.5,
|
172 |
+
match_low_quality=False,
|
173 |
+
ignore_iof_thr=-1),
|
174 |
+
sampler=dict(
|
175 |
+
type='CombinedSampler',
|
176 |
+
num=512,
|
177 |
+
pos_fraction=0.5,
|
178 |
+
neg_pos_ub=3,
|
179 |
+
add_gt_as_proposals=True,
|
180 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
181 |
+
neg_sampler=dict(type='RandomSampler')))),
|
182 |
+
tracker=dict(
|
183 |
+
type='MasaTaoTracker',
|
184 |
+
init_score_thr=0.0001,
|
185 |
+
obj_score_thr=0.0001,
|
186 |
+
match_score_thr=0.5,
|
187 |
+
memo_tracklet_frames=10,
|
188 |
+
memo_momentum=0.8,
|
189 |
+
with_cats=False,
|
190 |
+
max_distance=-1,
|
191 |
+
fps=1,
|
192 |
+
)
|
193 |
+
)
|
194 |
+
|
195 |
+
test_pipeline = [
|
196 |
+
dict(
|
197 |
+
type='TransformBroadcaster',
|
198 |
+
transforms=[
|
199 |
+
dict(type='LoadImageFromFile'),
|
200 |
+
dict(
|
201 |
+
type='Resize',
|
202 |
+
scale=(1024, 1024),
|
203 |
+
keep_ratio=True),
|
204 |
+
dict(type='LoadTrackAnnotations')
|
205 |
+
]),
|
206 |
+
dict(type='PackTrackInputs')
|
207 |
+
]
|
208 |
+
|
209 |
+
# runtime settings
|
210 |
+
train_dataloader = None
|
211 |
+
train_cfg = None
|
212 |
+
val_cfg = dict(type='ValLoop')
|
213 |
+
test_cfg = dict(type='TestLoop')
|
214 |
+
|
215 |
+
default_hooks = dict(
|
216 |
+
logger=dict(type='LoggerHook', interval=50),
|
217 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
218 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
219 |
+
)
|
220 |
+
|
221 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
222 |
+
visualizer = dict(
|
223 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
224 |
+
|
225 |
+
val_dataloader = dict(
|
226 |
+
dataset=dict(
|
227 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
228 |
+
pipeline=test_pipeline,
|
229 |
+
)
|
230 |
+
)
|
231 |
+
test_dataloader = val_dataloader
|
232 |
+
test_evaluator = dict(
|
233 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
234 |
+
outfile_prefix='results/masa_results/masa-groundingdino-release-ovmot-test',
|
235 |
+
open_vocabulary=True,
|
236 |
+
)
|
configs/masa-gdino/tao_teta_test/masa_gdino_swinb_tao_test_detic_dets.py
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v1.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
# detector.backbone.update(dict(out_indices=(1, 2, 3)))
|
9 |
+
detector.pop('data_preprocessor')
|
10 |
+
detector['init_cfg'] = dict(
|
11 |
+
type='Pretrained',
|
12 |
+
checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
|
13 |
+
# noqa: E501
|
14 |
+
)
|
15 |
+
detector['type'] = 'GroundingDINOMasa'
|
16 |
+
|
17 |
+
del _base_.model
|
18 |
+
|
19 |
+
model = dict(
|
20 |
+
type='MASA',
|
21 |
+
freeze_detector=True,
|
22 |
+
unified_backbone=True,
|
23 |
+
load_public_dets = True,
|
24 |
+
benchmark = 'tao',
|
25 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
|
26 |
+
data_preprocessor=dict(
|
27 |
+
type='TrackDataPreprocessor',
|
28 |
+
# Image normalization parameters
|
29 |
+
mean=[123.675, 116.28, 103.53],
|
30 |
+
std=[58.395, 57.12, 57.375],
|
31 |
+
bgr_to_rgb=True,
|
32 |
+
# Image padding parameters
|
33 |
+
pad_mask=False, # Mask padding is disabled here; instance segmentation would need the masks padded
|
34 |
+
pad_size_divisor=1024, # Padding the image to multiples of 1024
|
35 |
+
),
|
36 |
+
detector=detector,
|
37 |
+
masa_adapter=[
|
38 |
+
dict(
|
39 |
+
type='FPN',
|
40 |
+
in_channels=[256, 512, 1024],
|
41 |
+
out_channels=256,
|
42 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
43 |
+
num_outs=5),
|
44 |
+
dict(
|
45 |
+
type='DeformFusion',
|
46 |
+
in_channels=256,
|
47 |
+
out_channels=256,
|
48 |
+
num_blocks=3)],
|
49 |
+
rpn_head=dict(
|
50 |
+
type='RPNHead',
|
51 |
+
in_channels=256,
|
52 |
+
feat_channels=256,
|
53 |
+
anchor_generator=dict(
|
54 |
+
type='AnchorGenerator',
|
55 |
+
scales=[8],
|
56 |
+
ratios=[0.5, 1.0, 2.0],
|
57 |
+
strides=[8, 16, 32, 64, 128]),
|
58 |
+
bbox_coder=dict(
|
59 |
+
type='DeltaXYWHBBoxCoder',
|
60 |
+
target_means=[.0, .0, .0, .0],
|
61 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
62 |
+
loss_cls=dict(
|
63 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
64 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
65 |
+
),
|
66 |
+
roi_head=dict(
|
67 |
+
type='StandardRoIHead',
|
68 |
+
bbox_roi_extractor=dict(
|
69 |
+
type='SingleRoIExtractor',
|
70 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
71 |
+
out_channels=256,
|
72 |
+
featmap_strides=[8, 16, 32]),
|
73 |
+
bbox_head=dict(
|
74 |
+
type='Shared2FCBBoxHead',
|
75 |
+
in_channels=256,
|
76 |
+
fc_out_channels=1024,
|
77 |
+
roi_feat_size=7,
|
78 |
+
num_classes=1,
|
79 |
+
bbox_coder=dict(
|
80 |
+
type='DeltaXYWHBBoxCoder',
|
81 |
+
target_means=[0., 0., 0., 0.],
|
82 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
83 |
+
reg_class_agnostic=True,
|
84 |
+
loss_cls=dict(
|
85 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
86 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
87 |
+
# model training and testing settings
|
88 |
+
train_cfg=dict(
|
89 |
+
rpn=dict(
|
90 |
+
assigner=dict(
|
91 |
+
type='MaxIoUAssigner',
|
92 |
+
pos_iou_thr=0.7,
|
93 |
+
neg_iou_thr=0.3,
|
94 |
+
min_pos_iou=0.3,
|
95 |
+
match_low_quality=True,
|
96 |
+
ignore_iof_thr=-1),
|
97 |
+
sampler=dict(
|
98 |
+
type='RandomSampler',
|
99 |
+
num=256,
|
100 |
+
pos_fraction=0.5,
|
101 |
+
neg_pos_ub=-1,
|
102 |
+
add_gt_as_proposals=False),
|
103 |
+
allowed_border=-1,
|
104 |
+
pos_weight=-1,
|
105 |
+
debug=False),
|
106 |
+
rpn_proposal=dict(
|
107 |
+
nms_pre=2000,
|
108 |
+
max_per_img=1000,
|
109 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
110 |
+
min_bbox_size=0),
|
111 |
+
rcnn=dict(
|
112 |
+
assigner=dict(
|
113 |
+
type='MaxIoUAssigner',
|
114 |
+
pos_iou_thr=0.5,
|
115 |
+
neg_iou_thr=0.5,
|
116 |
+
min_pos_iou=0.5,
|
117 |
+
match_low_quality=False,
|
118 |
+
ignore_iof_thr=-1),
|
119 |
+
sampler=dict(
|
120 |
+
type='RandomSampler',
|
121 |
+
num=512,
|
122 |
+
pos_fraction=0.25,
|
123 |
+
neg_pos_ub=-1,
|
124 |
+
add_gt_as_proposals=True),
|
125 |
+
pos_weight=-1,
|
126 |
+
debug=False)),
|
127 |
+
test_cfg=dict(
|
128 |
+
rpn=dict(
|
129 |
+
nms_pre=1000,
|
130 |
+
max_per_img=1000,
|
131 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
132 |
+
min_bbox_size=0),
|
133 |
+
rcnn=dict(
|
134 |
+
score_thr=0.02,
|
135 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
136 |
+
nms=dict(type='nms',
|
137 |
+
iou_threshold=0.5,
|
138 |
+
class_agnostic=True,
|
139 |
+
split_thr=100000),
|
140 |
+
max_per_img=50,
|
141 |
+
mask_thr_binary=0.5)
|
142 |
+
# soft-nms is also supported for rcnn testing
|
143 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
144 |
+
),
|
145 |
+
track_head=dict(
|
146 |
+
type='MasaTrackHead',
|
147 |
+
roi_extractor=dict(
|
148 |
+
type='SingleRoIExtractor',
|
149 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
150 |
+
out_channels=256,
|
151 |
+
featmap_strides=[8, 16, 32]),
|
152 |
+
embed_head=dict(
|
153 |
+
type='QuasiDenseEmbedHead',
|
154 |
+
num_convs=4,
|
155 |
+
num_fcs=1,
|
156 |
+
embed_channels=256,
|
157 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
158 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
159 |
+
loss_track_aux=dict(
|
160 |
+
type='MarginL2Loss',
|
161 |
+
neg_pos_ub=3,
|
162 |
+
pos_margin=0,
|
163 |
+
neg_margin=0.1,
|
164 |
+
hard_mining=True,
|
165 |
+
loss_weight=1.0)),
|
166 |
+
train_cfg=dict(
|
167 |
+
assigner=dict(
|
168 |
+
type='MaxIoUAssigner',
|
169 |
+
pos_iou_thr=0.7,
|
170 |
+
neg_iou_thr=0.3,
|
171 |
+
min_pos_iou=0.5,
|
172 |
+
match_low_quality=False,
|
173 |
+
ignore_iof_thr=-1),
|
174 |
+
sampler=dict(
|
175 |
+
type='CombinedSampler',
|
176 |
+
num=512,
|
177 |
+
pos_fraction=0.5,
|
178 |
+
neg_pos_ub=3,
|
179 |
+
add_gt_as_proposals=True,
|
180 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
181 |
+
neg_sampler=dict(type='RandomSampler')))),
|
182 |
+
tracker=dict(
|
183 |
+
type='MasaTaoTracker',
|
184 |
+
init_score_thr=0.0001,
|
185 |
+
obj_score_thr=0.0001,
|
186 |
+
match_score_thr=0.5,
|
187 |
+
memo_tracklet_frames=10,
|
188 |
+
memo_momentum=0.8,
|
189 |
+
with_cats=False,
|
190 |
+
max_distance=-1,
|
191 |
+
fps=1,
|
192 |
+
)
|
193 |
+
)
|
194 |
+
|
195 |
+
test_pipeline = [
|
196 |
+
dict(
|
197 |
+
type='TransformBroadcaster',
|
198 |
+
transforms=[
|
199 |
+
dict(type='LoadImageFromFile'),
|
200 |
+
dict(
|
201 |
+
type='Resize',
|
202 |
+
scale=(1024, 1024),
|
203 |
+
keep_ratio=True),
|
204 |
+
dict(type='LoadTrackAnnotations')
|
205 |
+
]),
|
206 |
+
dict(type='PackTrackInputs')
|
207 |
+
]
|
208 |
+
|
209 |
+
# runtime settings
|
210 |
+
train_dataloader = None
|
211 |
+
train_cfg = None
|
212 |
+
val_cfg = dict(type='ValLoop')
|
213 |
+
test_cfg = dict(type='TestLoop')
|
214 |
+
|
215 |
+
default_hooks = dict(
|
216 |
+
logger=dict(type='LoggerHook', interval=50),
|
217 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
218 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
219 |
+
)
|
220 |
+
|
221 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
222 |
+
visualizer = dict(
|
223 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
224 |
+
|
225 |
+
val_dataloader = dict(
|
226 |
+
dataset=dict(
|
227 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
228 |
+
pipeline=test_pipeline,
|
229 |
+
)
|
230 |
+
)
|
231 |
+
test_dataloader = val_dataloader
|
232 |
+
test_evaluator = dict(
|
233 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
234 |
+
outfile_prefix='results/masa_results/masa-groundingdino-release_detic_dets-test',
|
235 |
+
)
|
configs/masa-gdino/tao_teta_test/masa_gdino_swinb_tao_test_teter_swinT_dets.py
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v05.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
# detector.backbone.update(dict(out_indices=(1, 2, 3)))
|
9 |
+
detector.pop('data_preprocessor')
|
10 |
+
detector['init_cfg'] = dict(
|
11 |
+
type='Pretrained',
|
12 |
+
checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
|
13 |
+
# noqa: E501
|
14 |
+
)
|
15 |
+
detector['type'] = 'GroundingDINOMasa'
|
16 |
+
|
17 |
+
del _base_.model
|
18 |
+
|
19 |
+
model = dict(
|
20 |
+
type='MASA',
|
21 |
+
freeze_detector=True,
|
22 |
+
unified_backbone=True,
|
23 |
+
load_public_dets = True,
|
24 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/',
|
25 |
+
data_preprocessor=dict(
|
26 |
+
type='TrackDataPreprocessor',
|
27 |
+
# Image normalization parameters
|
28 |
+
mean=[123.675, 116.28, 103.53],
|
29 |
+
std=[58.395, 57.12, 57.375],
|
30 |
+
bgr_to_rgb=True,
|
31 |
+
# Image padding parameters
|
32 |
+
pad_mask=False, # Mask padding is disabled here; instance segmentation would need the masks padded
|
33 |
+
pad_size_divisor=1024, # Padding the image to multiples of 1024
|
34 |
+
),
|
35 |
+
detector=detector,
|
36 |
+
masa_adapter=[
|
37 |
+
dict(
|
38 |
+
type='FPN',
|
39 |
+
in_channels=[256, 512, 1024],
|
40 |
+
out_channels=256,
|
41 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
42 |
+
num_outs=5),
|
43 |
+
dict(
|
44 |
+
type='DeformFusion',
|
45 |
+
in_channels=256,
|
46 |
+
out_channels=256,
|
47 |
+
num_blocks=3)],
|
48 |
+
rpn_head=dict(
|
49 |
+
type='RPNHead',
|
50 |
+
in_channels=256,
|
51 |
+
feat_channels=256,
|
52 |
+
anchor_generator=dict(
|
53 |
+
type='AnchorGenerator',
|
54 |
+
scales=[8],
|
55 |
+
ratios=[0.5, 1.0, 2.0],
|
56 |
+
strides=[8, 16, 32, 64, 128]),
|
57 |
+
bbox_coder=dict(
|
58 |
+
type='DeltaXYWHBBoxCoder',
|
59 |
+
target_means=[.0, .0, .0, .0],
|
60 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
61 |
+
loss_cls=dict(
|
62 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
63 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
64 |
+
),
|
65 |
+
roi_head=dict(
|
66 |
+
type='StandardRoIHead',
|
67 |
+
bbox_roi_extractor=dict(
|
68 |
+
type='SingleRoIExtractor',
|
69 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
70 |
+
out_channels=256,
|
71 |
+
featmap_strides=[8, 16, 32]),
|
72 |
+
bbox_head=dict(
|
73 |
+
type='Shared2FCBBoxHead',
|
74 |
+
in_channels=256,
|
75 |
+
fc_out_channels=1024,
|
76 |
+
roi_feat_size=7,
|
77 |
+
num_classes=1,
|
78 |
+
bbox_coder=dict(
|
79 |
+
type='DeltaXYWHBBoxCoder',
|
80 |
+
target_means=[0., 0., 0., 0.],
|
81 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
82 |
+
reg_class_agnostic=True,
|
83 |
+
loss_cls=dict(
|
84 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
85 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
86 |
+
# model training and testing settings
|
87 |
+
train_cfg=dict(
|
88 |
+
rpn=dict(
|
89 |
+
assigner=dict(
|
90 |
+
type='MaxIoUAssigner',
|
91 |
+
pos_iou_thr=0.7,
|
92 |
+
neg_iou_thr=0.3,
|
93 |
+
min_pos_iou=0.3,
|
94 |
+
match_low_quality=True,
|
95 |
+
ignore_iof_thr=-1),
|
96 |
+
sampler=dict(
|
97 |
+
type='RandomSampler',
|
98 |
+
num=256,
|
99 |
+
pos_fraction=0.5,
|
100 |
+
neg_pos_ub=-1,
|
101 |
+
add_gt_as_proposals=False),
|
102 |
+
allowed_border=-1,
|
103 |
+
pos_weight=-1,
|
104 |
+
debug=False),
|
105 |
+
rpn_proposal=dict(
|
106 |
+
nms_pre=2000,
|
107 |
+
max_per_img=1000,
|
108 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
109 |
+
min_bbox_size=0),
|
110 |
+
rcnn=dict(
|
111 |
+
assigner=dict(
|
112 |
+
type='MaxIoUAssigner',
|
113 |
+
pos_iou_thr=0.5,
|
114 |
+
neg_iou_thr=0.5,
|
115 |
+
min_pos_iou=0.5,
|
116 |
+
match_low_quality=False,
|
117 |
+
ignore_iof_thr=-1),
|
118 |
+
sampler=dict(
|
119 |
+
type='RandomSampler',
|
120 |
+
num=512,
|
121 |
+
pos_fraction=0.25,
|
122 |
+
neg_pos_ub=-1,
|
123 |
+
add_gt_as_proposals=True),
|
124 |
+
pos_weight=-1,
|
125 |
+
debug=False)),
|
126 |
+
test_cfg=dict(
|
127 |
+
rpn=dict(
|
128 |
+
nms_pre=1000,
|
129 |
+
max_per_img=1000,
|
130 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
131 |
+
min_bbox_size=0),
|
132 |
+
rcnn=dict(
|
133 |
+
score_thr=0.02,
|
134 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
135 |
+
nms=dict(type='nms',
|
136 |
+
iou_threshold=0.5,
|
137 |
+
class_agnostic=True,
|
138 |
+
split_thr=100000),
|
139 |
+
max_per_img=50,
|
140 |
+
mask_thr_binary=0.5)
|
141 |
+
# soft-nms is also supported for rcnn testing
|
142 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
143 |
+
),
|
144 |
+
track_head=dict(
|
145 |
+
type='MasaTrackHead',
|
146 |
+
roi_extractor=dict(
|
147 |
+
type='SingleRoIExtractor',
|
148 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
149 |
+
out_channels=256,
|
150 |
+
featmap_strides=[8, 16, 32]),
|
151 |
+
embed_head=dict(
|
152 |
+
type='QuasiDenseEmbedHead',
|
153 |
+
num_convs=4,
|
154 |
+
num_fcs=1,
|
155 |
+
embed_channels=256,
|
156 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
157 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
158 |
+
loss_track_aux=dict(
|
159 |
+
type='MarginL2Loss',
|
160 |
+
neg_pos_ub=3,
|
161 |
+
pos_margin=0,
|
162 |
+
neg_margin=0.1,
|
163 |
+
hard_mining=True,
|
164 |
+
loss_weight=1.0)),
|
165 |
+
train_cfg=dict(
|
166 |
+
assigner=dict(
|
167 |
+
type='MaxIoUAssigner',
|
168 |
+
pos_iou_thr=0.7,
|
169 |
+
neg_iou_thr=0.3,
|
170 |
+
min_pos_iou=0.5,
|
171 |
+
match_low_quality=False,
|
172 |
+
ignore_iof_thr=-1),
|
173 |
+
sampler=dict(
|
174 |
+
type='CombinedSampler',
|
175 |
+
num=512,
|
176 |
+
pos_fraction=0.5,
|
177 |
+
neg_pos_ub=3,
|
178 |
+
add_gt_as_proposals=True,
|
179 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
180 |
+
neg_sampler=dict(type='RandomSampler')))),
|
181 |
+
tracker=dict(
|
182 |
+
type='MasaTaoTracker',
|
183 |
+
init_score_thr=0.0001,
|
184 |
+
obj_score_thr=0.0001,
|
185 |
+
match_score_thr=0.5,
|
186 |
+
memo_tracklet_frames=10,
|
187 |
+
memo_momentum=0.8,
|
188 |
+
with_cats=False,
|
189 |
+
max_distance=-1,
|
190 |
+
fps=1,
|
191 |
+
)
|
192 |
+
)
|
193 |
+
|
194 |
+
test_pipeline = [
|
195 |
+
dict(
|
196 |
+
type='TransformBroadcaster',
|
197 |
+
transforms=[
|
198 |
+
dict(type='LoadImageFromFile'),
|
199 |
+
dict(
|
200 |
+
type='Resize',
|
201 |
+
scale=(1024, 1024),
|
202 |
+
keep_ratio=True),
|
203 |
+
dict(type='LoadTrackAnnotations')
|
204 |
+
]),
|
205 |
+
dict(type='PackTrackInputs')
|
206 |
+
]
|
207 |
+
|
208 |
+
|
209 |
+
train_dataloader = None
|
210 |
+
train_cfg = None
|
211 |
+
val_cfg = dict(type='ValLoop')
|
212 |
+
test_cfg = dict(type='TestLoop')
|
213 |
+
|
214 |
+
default_hooks = dict(
|
215 |
+
logger=dict(type='LoggerHook', interval=50),
|
216 |
+
visualization=dict(type='TrackVisualizationHook', draw=False))
|
217 |
+
|
218 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
219 |
+
visualizer = dict(
|
220 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
221 |
+
|
222 |
+
# custom hooks
|
223 |
+
custom_hooks = [
|
224 |
+
# Synchronize model buffers such as running_mean and running_var in BN
|
225 |
+
# at the end of each epoch
|
226 |
+
dict(type='SyncBuffersHook')
|
227 |
+
]
|
228 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
229 |
+
val_dataloader = dict(
|
230 |
+
dataset=dict(
|
231 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
|
232 |
+
pipeline=test_pipeline,
|
233 |
+
)
|
234 |
+
)
|
235 |
+
test_dataloader = val_dataloader
|
236 |
+
val_evaluator = dict(
|
237 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
|
238 |
+
outfile_prefix='results/masa_results/masa-groundingdino-release-tao-teter-test',
|
239 |
+
)
|
240 |
+
test_evaluator = val_evaluator
|
configs/masa-one/bdd_test/masa_r50_bdd_mot_test.py
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../default_runtime.py',
|
3 |
+
'../../datasets/bdd/bdd_dataset.py',
|
4 |
+
]
|
5 |
+
default_scope = 'mmdet'
|
6 |
+
|
7 |
+
model = dict(
|
8 |
+
type='MASA',
|
9 |
+
unified_backbone=False,
|
10 |
+
load_public_dets = True,
|
11 |
+
use_masa_backbone = True,
|
12 |
+
benchmark='bdd',
|
13 |
+
public_det_path='results/public_dets/bdd_mot_yolox_dets/',
|
14 |
+
data_preprocessor=dict(
|
15 |
+
type='TrackDataPreprocessor',
|
16 |
+
# Image normalization parameters
|
17 |
+
mean=[123.675, 116.28, 103.53],
|
18 |
+
std=[58.395, 57.12, 57.375],
|
19 |
+
bgr_to_rgb=True,
|
20 |
+
# Image padding parameters
|
21 |
+
pad_mask=True, # In instance segmentation, the mask needs to be padded
|
22 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
23 |
+
backbone=dict(
|
24 |
+
type='ResNet',
|
25 |
+
depth=50,
|
26 |
+
num_stages=4,
|
27 |
+
out_indices=(0, 1, 2, 3),
|
28 |
+
frozen_stages=-1,
|
29 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
30 |
+
norm_eval=True,
|
31 |
+
style='caffe',),
|
32 |
+
masa_adapter=[
|
33 |
+
dict(
|
34 |
+
type='FPN',
|
35 |
+
in_channels=[256, 512, 1024, 2048],
|
36 |
+
out_channels=256,
|
37 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
38 |
+
num_outs=5),
|
39 |
+
dict(
|
40 |
+
type='DeformFusion',
|
41 |
+
in_channels=256,
|
42 |
+
out_channels=256,
|
43 |
+
num_blocks=3)],
|
44 |
+
rpn_head=dict(
|
45 |
+
type='RPNHead',
|
46 |
+
in_channels=256,
|
47 |
+
feat_channels=256,
|
48 |
+
anchor_generator=dict(
|
49 |
+
type='AnchorGenerator',
|
50 |
+
scales=[8],
|
51 |
+
ratios=[0.5, 1.0, 2.0],
|
52 |
+
strides=[4, 8, 16, 32, 64]),
|
53 |
+
bbox_coder=dict(
|
54 |
+
type='DeltaXYWHBBoxCoder',
|
55 |
+
target_means=[.0, .0, .0, .0],
|
56 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
57 |
+
loss_cls=dict(
|
58 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
59 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
60 |
+
roi_head=dict(
|
61 |
+
type='StandardRoIHead',
|
62 |
+
bbox_roi_extractor=dict(
|
63 |
+
type='SingleRoIExtractor',
|
64 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
65 |
+
out_channels=256,
|
66 |
+
featmap_strides=[4, 8, 16, 32]),
|
67 |
+
bbox_head=dict(
|
68 |
+
type='Shared4Conv1FCBBoxHead',
|
69 |
+
in_channels=256,
|
70 |
+
fc_out_channels=1024,
|
71 |
+
roi_feat_size=7,
|
72 |
+
num_classes=1,
|
73 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
74 |
+
bbox_coder=dict(
|
75 |
+
type='DeltaXYWHBBoxCoder',
|
76 |
+
target_means=[0., 0., 0., 0.],
|
77 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
78 |
+
reg_class_agnostic=True,
|
79 |
+
loss_cls=dict(
|
80 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
81 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
82 |
+
# model training and testing settings
|
83 |
+
train_cfg=dict(
|
84 |
+
rpn=dict(
|
85 |
+
assigner=dict(
|
86 |
+
type='MaxIoUAssigner',
|
87 |
+
pos_iou_thr=0.7,
|
88 |
+
neg_iou_thr=0.3,
|
89 |
+
min_pos_iou=0.3,
|
90 |
+
match_low_quality=True,
|
91 |
+
ignore_iof_thr=-1),
|
92 |
+
sampler=dict(
|
93 |
+
type='RandomSampler',
|
94 |
+
num=256,
|
95 |
+
pos_fraction=0.5,
|
96 |
+
neg_pos_ub=-1,
|
97 |
+
add_gt_as_proposals=False),
|
98 |
+
allowed_border=-1,
|
99 |
+
pos_weight=-1,
|
100 |
+
debug=False),
|
101 |
+
rpn_proposal=dict(
|
102 |
+
nms_pre=2000,
|
103 |
+
max_per_img=1000,
|
104 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
105 |
+
min_bbox_size=0),
|
106 |
+
rcnn=dict(
|
107 |
+
assigner=dict(
|
108 |
+
type='MaxIoUAssigner',
|
109 |
+
pos_iou_thr=0.5,
|
110 |
+
neg_iou_thr=0.5,
|
111 |
+
min_pos_iou=0.5,
|
112 |
+
match_low_quality=False,
|
113 |
+
ignore_iof_thr=-1),
|
114 |
+
sampler=dict(
|
115 |
+
type='RandomSampler',
|
116 |
+
num=512,
|
117 |
+
pos_fraction=0.25,
|
118 |
+
neg_pos_ub=-1,
|
119 |
+
add_gt_as_proposals=True),
|
120 |
+
pos_weight=-1,
|
121 |
+
debug=False)),
|
122 |
+
test_cfg=dict(
|
123 |
+
rpn=dict(
|
124 |
+
nms_pre=1000,
|
125 |
+
max_per_img=1000,
|
126 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
127 |
+
min_bbox_size=0),
|
128 |
+
rcnn=dict(
|
129 |
+
score_thr=0.02,
|
130 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
131 |
+
nms=dict(type='nms',
|
132 |
+
iou_threshold=0.5,
|
133 |
+
class_agnostic=True,
|
134 |
+
split_thr=100000),
|
135 |
+
max_per_img=50,
|
136 |
+
mask_thr_binary=0.5)
|
137 |
+
# soft-nms is also supported for rcnn testing
|
138 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
139 |
+
),
|
140 |
+
track_head=dict(
|
141 |
+
type='MasaTrackHead',
|
142 |
+
roi_extractor=dict(
|
143 |
+
type='SingleRoIExtractor',
|
144 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
145 |
+
out_channels=256,
|
146 |
+
featmap_strides=[4, 8, 16, 32]),
|
147 |
+
embed_head=dict(
|
148 |
+
type='QuasiDenseEmbedHead',
|
149 |
+
num_convs=4,
|
150 |
+
num_fcs=1,
|
151 |
+
embed_channels=256,
|
152 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
153 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
154 |
+
loss_track_aux=dict(
|
155 |
+
type='MarginL2Loss',
|
156 |
+
neg_pos_ub=3,
|
157 |
+
pos_margin=0,
|
158 |
+
neg_margin=0.1,
|
159 |
+
hard_mining=True,
|
160 |
+
loss_weight=1.0)),
|
161 |
+
train_cfg=dict(
|
162 |
+
assigner=dict(
|
163 |
+
type='MaxIoUAssigner',
|
164 |
+
pos_iou_thr=0.7,
|
165 |
+
neg_iou_thr=0.3,
|
166 |
+
min_pos_iou=0.5,
|
167 |
+
match_low_quality=False,
|
168 |
+
ignore_iof_thr=-1),
|
169 |
+
sampler=dict(
|
170 |
+
type='CombinedSampler',
|
171 |
+
num=512,
|
172 |
+
pos_fraction=0.5,
|
173 |
+
neg_pos_ub=3,
|
174 |
+
add_gt_as_proposals=True,
|
175 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
176 |
+
neg_sampler=dict(type='RandomSampler')))),
|
177 |
+
tracker=dict(
|
178 |
+
type='MasaBDDTracker',
|
179 |
+
init_score_thr=0.5,
|
180 |
+
obj_score_thr=0.3,
|
181 |
+
match_score_thr=0.6,
|
182 |
+
memo_tracklet_frames=10,
|
183 |
+
memo_backdrop_frames=1,
|
184 |
+
memo_momentum=0.8,
|
185 |
+
nms_conf_thr=0.5,
|
186 |
+
nms_backdrop_iou_thr=0.3,
|
187 |
+
nms_class_iou_thr=0.7,
|
188 |
+
with_cats=False,
|
189 |
+
match_metric='bisoftmax')
|
190 |
+
)
|
191 |
+
|
192 |
+
test_pipeline = [
|
193 |
+
dict(
|
194 |
+
type='TransformBroadcaster',
|
195 |
+
transforms=[
|
196 |
+
dict(type='LoadImageFromFile'),
|
197 |
+
dict(
|
198 |
+
type='Resize',
|
199 |
+
scale=(1024, 1024),
|
200 |
+
keep_ratio=True),
|
201 |
+
dict(type='LoadTrackAnnotations')
|
202 |
+
]),
|
203 |
+
dict(type='PackTrackInputs')
|
204 |
+
]
|
205 |
+
|
206 |
+
# runtime settings
|
207 |
+
train_dataloader = None
|
208 |
+
train_cfg = None
|
209 |
+
val_cfg = dict(type='ValLoop')
|
210 |
+
test_cfg = dict(type='TestLoop')
|
211 |
+
|
212 |
+
default_hooks = dict(
|
213 |
+
logger=dict(type='LoggerHook', interval=50),
|
214 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
215 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
216 |
+
)
|
217 |
+
|
218 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
219 |
+
visualizer = dict(
|
220 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
221 |
+
|
222 |
+
val_dataloader = dict(
|
223 |
+
dataset=dict(
|
224 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
225 |
+
pipeline=test_pipeline,
|
226 |
+
)
|
227 |
+
)
|
228 |
+
test_dataloader = val_dataloader
|
229 |
+
val_evaluator = dict(
|
230 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
231 |
+
scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
|
232 |
+
outfile_prefix='results/masa_results/masa-r50-release-bdd-mot-test',
|
233 |
+
metric=['TETA', 'HOTA', 'CLEAR']
|
234 |
+
)
|
235 |
+
test_evaluator = val_evaluator
|
configs/masa-one/bdd_test/masa_r50_bdd_mots_test.py
ADDED
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../default_runtime.py',
|
3 |
+
'../../datasets/bdd/bdd_dataset.py',
|
4 |
+
]
|
5 |
+
default_scope = 'mmdet'
|
6 |
+
|
7 |
+
model = dict(
|
8 |
+
type='MASA',
|
9 |
+
unified_backbone=False,
|
10 |
+
load_public_dets = True,
|
11 |
+
use_masa_backbone = True,
|
12 |
+
benchmark='bdd',
|
13 |
+
with_segm=True,
|
14 |
+
public_det_path = 'results/public_dets/bdd_mots_val_uninext_dets/',
|
15 |
+
data_preprocessor=dict(
|
16 |
+
type='TrackDataPreprocessor',
|
17 |
+
# Image normalization parameters
|
18 |
+
mean=[123.675, 116.28, 103.53],
|
19 |
+
std=[58.395, 57.12, 57.375],
|
20 |
+
bgr_to_rgb=True,
|
21 |
+
# Image padding parameters
|
22 |
+
pad_mask=True, # In instance segmentation, the mask needs to be padded
|
23 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
24 |
+
backbone=dict(
|
25 |
+
type='ResNet',
|
26 |
+
depth=50,
|
27 |
+
num_stages=4,
|
28 |
+
out_indices=(0, 1, 2, 3),
|
29 |
+
frozen_stages=-1,
|
30 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
31 |
+
norm_eval=True,
|
32 |
+
style='caffe',),
|
33 |
+
masa_adapter=[
|
34 |
+
dict(
|
35 |
+
type='FPN',
|
36 |
+
in_channels=[256, 512, 1024, 2048],
|
37 |
+
out_channels=256,
|
38 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
39 |
+
num_outs=5),
|
40 |
+
dict(
|
41 |
+
type='DeformFusion',
|
42 |
+
in_channels=256,
|
43 |
+
out_channels=256,
|
44 |
+
num_blocks=3)],
|
45 |
+
rpn_head=dict(
|
46 |
+
type='RPNHead',
|
47 |
+
in_channels=256,
|
48 |
+
feat_channels=256,
|
49 |
+
anchor_generator=dict(
|
50 |
+
type='AnchorGenerator',
|
51 |
+
scales=[8],
|
52 |
+
ratios=[0.5, 1.0, 2.0],
|
53 |
+
strides=[4, 8, 16, 32, 64]),
|
54 |
+
bbox_coder=dict(
|
55 |
+
type='DeltaXYWHBBoxCoder',
|
56 |
+
target_means=[.0, .0, .0, .0],
|
57 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
58 |
+
loss_cls=dict(
|
59 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
60 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
61 |
+
roi_head=dict(
|
62 |
+
type='StandardRoIHead',
|
63 |
+
bbox_roi_extractor=dict(
|
64 |
+
type='SingleRoIExtractor',
|
65 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
66 |
+
out_channels=256,
|
67 |
+
featmap_strides=[4, 8, 16, 32]),
|
68 |
+
bbox_head=dict(
|
69 |
+
type='Shared4Conv1FCBBoxHead',
|
70 |
+
in_channels=256,
|
71 |
+
fc_out_channels=1024,
|
72 |
+
roi_feat_size=7,
|
73 |
+
num_classes=1,
|
74 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
75 |
+
bbox_coder=dict(
|
76 |
+
type='DeltaXYWHBBoxCoder',
|
77 |
+
target_means=[0., 0., 0., 0.],
|
78 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
79 |
+
reg_class_agnostic=True,
|
80 |
+
loss_cls=dict(
|
81 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
82 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
83 |
+
# model training and testing settings
|
84 |
+
train_cfg=dict(
|
85 |
+
rpn=dict(
|
86 |
+
assigner=dict(
|
87 |
+
type='MaxIoUAssigner',
|
88 |
+
pos_iou_thr=0.7,
|
89 |
+
neg_iou_thr=0.3,
|
90 |
+
min_pos_iou=0.3,
|
91 |
+
match_low_quality=True,
|
92 |
+
ignore_iof_thr=-1),
|
93 |
+
sampler=dict(
|
94 |
+
type='RandomSampler',
|
95 |
+
num=256,
|
96 |
+
pos_fraction=0.5,
|
97 |
+
neg_pos_ub=-1,
|
98 |
+
add_gt_as_proposals=False),
|
99 |
+
allowed_border=-1,
|
100 |
+
pos_weight=-1,
|
101 |
+
debug=False),
|
102 |
+
rpn_proposal=dict(
|
103 |
+
nms_pre=2000,
|
104 |
+
max_per_img=1000,
|
105 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
106 |
+
min_bbox_size=0),
|
107 |
+
rcnn=dict(
|
108 |
+
assigner=dict(
|
109 |
+
type='MaxIoUAssigner',
|
110 |
+
pos_iou_thr=0.5,
|
111 |
+
neg_iou_thr=0.5,
|
112 |
+
min_pos_iou=0.5,
|
113 |
+
match_low_quality=False,
|
114 |
+
ignore_iof_thr=-1),
|
115 |
+
sampler=dict(
|
116 |
+
type='RandomSampler',
|
117 |
+
num=512,
|
118 |
+
pos_fraction=0.25,
|
119 |
+
neg_pos_ub=-1,
|
120 |
+
add_gt_as_proposals=True),
|
121 |
+
pos_weight=-1,
|
122 |
+
debug=False)),
|
123 |
+
test_cfg=dict(
|
124 |
+
rpn=dict(
|
125 |
+
nms_pre=1000,
|
126 |
+
max_per_img=1000,
|
127 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
128 |
+
min_bbox_size=0),
|
129 |
+
rcnn=dict(
|
130 |
+
score_thr=0.02,
|
131 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
132 |
+
nms=dict(type='nms',
|
133 |
+
iou_threshold=0.5,
|
134 |
+
class_agnostic=True,
|
135 |
+
split_thr=100000),
|
136 |
+
max_per_img=50,
|
137 |
+
mask_thr_binary=0.5)
|
138 |
+
# soft-nms is also supported for rcnn testing
|
139 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
140 |
+
),
|
141 |
+
track_head=dict(
|
142 |
+
type='MasaTrackHead',
|
143 |
+
roi_extractor=dict(
|
144 |
+
type='SingleRoIExtractor',
|
145 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
146 |
+
out_channels=256,
|
147 |
+
featmap_strides=[4, 8, 16, 32]),
|
148 |
+
embed_head=dict(
|
149 |
+
type='QuasiDenseEmbedHead',
|
150 |
+
num_convs=4,
|
151 |
+
num_fcs=1,
|
152 |
+
embed_channels=256,
|
153 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
154 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
155 |
+
loss_track_aux=dict(
|
156 |
+
type='MarginL2Loss',
|
157 |
+
neg_pos_ub=3,
|
158 |
+
pos_margin=0,
|
159 |
+
neg_margin=0.1,
|
160 |
+
hard_mining=True,
|
161 |
+
loss_weight=1.0)),
|
162 |
+
train_cfg=dict(
|
163 |
+
assigner=dict(
|
164 |
+
type='MaxIoUAssigner',
|
165 |
+
pos_iou_thr=0.7,
|
166 |
+
neg_iou_thr=0.3,
|
167 |
+
min_pos_iou=0.5,
|
168 |
+
match_low_quality=False,
|
169 |
+
ignore_iof_thr=-1),
|
170 |
+
sampler=dict(
|
171 |
+
type='CombinedSampler',
|
172 |
+
num=512,
|
173 |
+
pos_fraction=0.5,
|
174 |
+
neg_pos_ub=3,
|
175 |
+
add_gt_as_proposals=True,
|
176 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
177 |
+
neg_sampler=dict(type='RandomSampler')))),
|
178 |
+
tracker=dict(
|
179 |
+
type='MasaBDDTracker',
|
180 |
+
init_score_thr=0.5,
|
181 |
+
obj_score_thr=0.3,
|
182 |
+
match_score_thr=0.6,
|
183 |
+
memo_tracklet_frames=10,
|
184 |
+
memo_backdrop_frames=1,
|
185 |
+
memo_momentum=0.8,
|
186 |
+
nms_conf_thr=0.5,
|
187 |
+
nms_backdrop_iou_thr=0.3,
|
188 |
+
nms_class_iou_thr=0.7,
|
189 |
+
with_cats=False,
|
190 |
+
match_metric='bisoftmax')
|
191 |
+
)
|
192 |
+
|
193 |
+
test_pipeline = [
|
194 |
+
dict(
|
195 |
+
type='TransformBroadcaster',
|
196 |
+
transforms=[
|
197 |
+
dict(type='LoadImageFromFile'),
|
198 |
+
dict(
|
199 |
+
type='Resize',
|
200 |
+
scale=(1024, 1024),
|
201 |
+
keep_ratio=True),
|
202 |
+
dict(type='LoadTrackAnnotations')
|
203 |
+
]),
|
204 |
+
dict(type='PackTrackInputs')
|
205 |
+
]
|
206 |
+
|
207 |
+
# runtime settings
|
208 |
+
train_dataloader = None
|
209 |
+
train_cfg = None
|
210 |
+
val_cfg = dict(type='ValLoop')
|
211 |
+
test_cfg = dict(type='TestLoop')
|
212 |
+
|
213 |
+
default_hooks = dict(
|
214 |
+
logger=dict(type='LoggerHook', interval=50),
|
215 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
216 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
217 |
+
)
|
218 |
+
|
219 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
220 |
+
visualizer = dict(
|
221 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
222 |
+
|
223 |
+
val_dataloader = dict(
|
224 |
+
dataset=dict(
|
225 |
+
ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
|
226 |
+
pipeline=test_pipeline,
|
227 |
+
)
|
228 |
+
)
|
229 |
+
|
230 |
+
test_dataloader = val_dataloader
|
231 |
+
val_evaluator = dict(
|
232 |
+
ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
|
233 |
+
scalabel_gt='data/bdd/annotations/scalabel_gt/seg_track_20/val/',
|
234 |
+
outfile_prefix='results/masa_results/masa-r50-release-bdd-mots-test',
|
235 |
+
metric=['TETA', 'HOTA', 'CLEAR'],
|
236 |
+
with_mask=True,
|
237 |
+
)
|
238 |
+
test_evaluator = val_evaluator
|
configs/masa-one/masa_r50_plug_and_play.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../default_runtime.py'
|
3 |
+
]
|
4 |
+
default_scope = 'mmdet'
|
5 |
+
|
6 |
+
model = dict(
|
7 |
+
type='MASA',
|
8 |
+
unified_backbone=False,
|
9 |
+
load_public_dets = False,
|
10 |
+
use_masa_backbone = True,
|
11 |
+
given_dets = True,
|
12 |
+
data_preprocessor=dict(
|
13 |
+
type='TrackDataPreprocessor',
|
14 |
+
# Image normalization parameters
|
15 |
+
mean=[123.675, 116.28, 103.53],
|
16 |
+
std=[58.395, 57.12, 57.375],
|
17 |
+
bgr_to_rgb=True,
|
18 |
+
# Image padding parameters
|
19 |
+
pad_mask=True, # In instance segmentation, the mask needs to be padded
|
20 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
21 |
+
# detector=detector,
|
22 |
+
backbone=dict(
|
23 |
+
type='ResNet',
|
24 |
+
depth=50,
|
25 |
+
num_stages=4,
|
26 |
+
out_indices=(0, 1, 2, 3),
|
27 |
+
frozen_stages=-1,
|
28 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
29 |
+
norm_eval=True,
|
30 |
+
style='caffe',),
|
31 |
+
masa_adapter=[
|
32 |
+
dict(
|
33 |
+
type='FPN',
|
34 |
+
in_channels=[256, 512, 1024, 2048],
|
35 |
+
out_channels=256,
|
36 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
37 |
+
num_outs=5),
|
38 |
+
dict(
|
39 |
+
type='DeformFusion',
|
40 |
+
in_channels=256,
|
41 |
+
out_channels=256,
|
42 |
+
num_blocks=3)],
|
43 |
+
rpn_head=dict(
|
44 |
+
type='RPNHead',
|
45 |
+
in_channels=256,
|
46 |
+
feat_channels=256,
|
47 |
+
anchor_generator=dict(
|
48 |
+
type='AnchorGenerator',
|
49 |
+
scales=[8],
|
50 |
+
ratios=[0.5, 1.0, 2.0],
|
51 |
+
strides=[4, 8, 16, 32, 64]),
|
52 |
+
bbox_coder=dict(
|
53 |
+
type='DeltaXYWHBBoxCoder',
|
54 |
+
target_means=[.0, .0, .0, .0],
|
55 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
56 |
+
loss_cls=dict(
|
57 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
58 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
59 |
+
roi_head=dict(
|
60 |
+
type='StandardRoIHead',
|
61 |
+
bbox_roi_extractor=dict(
|
62 |
+
type='SingleRoIExtractor',
|
63 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
64 |
+
out_channels=256,
|
65 |
+
featmap_strides=[4, 8, 16, 32]),
|
66 |
+
bbox_head=dict(
|
67 |
+
type='Shared4Conv1FCBBoxHead',
|
68 |
+
in_channels=256,
|
69 |
+
fc_out_channels=1024,
|
70 |
+
roi_feat_size=7,
|
71 |
+
num_classes=1,
|
72 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
73 |
+
bbox_coder=dict(
|
74 |
+
type='DeltaXYWHBBoxCoder',
|
75 |
+
target_means=[0., 0., 0., 0.],
|
76 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
77 |
+
reg_class_agnostic=True,
|
78 |
+
loss_cls=dict(
|
79 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
80 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
81 |
+
# model training and testing settings
|
82 |
+
train_cfg=dict(
|
83 |
+
rpn=dict(
|
84 |
+
assigner=dict(
|
85 |
+
type='MaxIoUAssigner',
|
86 |
+
pos_iou_thr=0.7,
|
87 |
+
neg_iou_thr=0.3,
|
88 |
+
min_pos_iou=0.3,
|
89 |
+
match_low_quality=True,
|
90 |
+
ignore_iof_thr=-1),
|
91 |
+
sampler=dict(
|
92 |
+
type='RandomSampler',
|
93 |
+
num=256,
|
94 |
+
pos_fraction=0.5,
|
95 |
+
neg_pos_ub=-1,
|
96 |
+
add_gt_as_proposals=False),
|
97 |
+
allowed_border=-1,
|
98 |
+
pos_weight=-1,
|
99 |
+
debug=False),
|
100 |
+
rpn_proposal=dict(
|
101 |
+
nms_pre=2000,
|
102 |
+
max_per_img=1000,
|
103 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
104 |
+
min_bbox_size=0),
|
105 |
+
rcnn=dict(
|
106 |
+
assigner=dict(
|
107 |
+
type='MaxIoUAssigner',
|
108 |
+
pos_iou_thr=0.5,
|
109 |
+
neg_iou_thr=0.5,
|
110 |
+
min_pos_iou=0.5,
|
111 |
+
match_low_quality=False,
|
112 |
+
ignore_iof_thr=-1),
|
113 |
+
sampler=dict(
|
114 |
+
type='RandomSampler',
|
115 |
+
num=512,
|
116 |
+
pos_fraction=0.25,
|
117 |
+
neg_pos_ub=-1,
|
118 |
+
add_gt_as_proposals=True),
|
119 |
+
pos_weight=-1,
|
120 |
+
debug=False)),
|
121 |
+
test_cfg=dict(
|
122 |
+
rpn=dict(
|
123 |
+
nms_pre=1000,
|
124 |
+
max_per_img=1000,
|
125 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
126 |
+
min_bbox_size=0),
|
127 |
+
rcnn=dict(
|
128 |
+
score_thr=0.02,
|
129 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
130 |
+
nms=dict(type='nms',
|
131 |
+
iou_threshold=0.5,
|
132 |
+
class_agnostic=True,
|
133 |
+
split_thr=100000),
|
134 |
+
max_per_img=50,
|
135 |
+
mask_thr_binary=0.5)
|
136 |
+
# soft-nms is also supported for rcnn testing
|
137 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
138 |
+
),
|
139 |
+
track_head=dict(
|
140 |
+
type='QuasiDenseTrackHead',
|
141 |
+
roi_extractor=dict(
|
142 |
+
type='SingleRoIExtractor',
|
143 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
144 |
+
out_channels=256,
|
145 |
+
featmap_strides=[4, 8, 16, 32]),
|
146 |
+
embed_head=dict(
|
147 |
+
type='QuasiDenseEmbedHead',
|
148 |
+
num_convs=4,
|
149 |
+
num_fcs=1,
|
150 |
+
embed_channels=256,
|
151 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
152 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
153 |
+
loss_track_aux=dict(
|
154 |
+
type='MarginL2Loss',
|
155 |
+
neg_pos_ub=3,
|
156 |
+
pos_margin=0,
|
157 |
+
neg_margin=0.1,
|
158 |
+
hard_mining=True,
|
159 |
+
loss_weight=1.0)),
|
160 |
+
train_cfg=dict(
|
161 |
+
assigner=dict(
|
162 |
+
type='MaxIoUAssigner',
|
163 |
+
pos_iou_thr=0.7,
|
164 |
+
neg_iou_thr=0.3,
|
165 |
+
min_pos_iou=0.5,
|
166 |
+
match_low_quality=False,
|
167 |
+
ignore_iof_thr=-1),
|
168 |
+
sampler=dict(
|
169 |
+
type='CombinedSampler',
|
170 |
+
num=512,
|
171 |
+
pos_fraction=0.5,
|
172 |
+
neg_pos_ub=3,
|
173 |
+
add_gt_as_proposals=True,
|
174 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
175 |
+
neg_sampler=dict(type='RandomSampler')))),
|
176 |
+
tracker=dict(
|
177 |
+
type='MasaTaoTracker',
|
178 |
+
init_score_thr=0.1,
|
179 |
+
obj_score_thr=0.01,
|
180 |
+
match_score_thr=0.5,
|
181 |
+
memo_tracklet_frames=10,
|
182 |
+
memo_momentum=0.8,
|
183 |
+
with_cats=False,
|
184 |
+
max_distance=100,
|
185 |
+
fps=30,
|
186 |
+
)
|
187 |
+
)
|
188 |
+
|
189 |
+
inference_pipeline = [
|
190 |
+
dict(
|
191 |
+
type='TransformBroadcaster',
|
192 |
+
transforms=[
|
193 |
+
dict(
|
194 |
+
type='Resize',
|
195 |
+
scale=(1024, 1024),
|
196 |
+
keep_ratio=True),
|
197 |
+
]),
|
198 |
+
dict(type='PackTrackInputs')
|
199 |
+
]
|
200 |
+
|
201 |
+
# runtime settings
|
202 |
+
train_cfg = None
|
203 |
+
val_cfg = dict(type='ValLoop')
|
204 |
+
test_cfg = dict(type='TestLoop')
|
205 |
+
|
206 |
+
default_hooks = dict(
|
207 |
+
logger=dict(type='LoggerHook', interval=50),
|
208 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
209 |
+
checkpoint=dict(type='CheckpointHook', interval=12),
|
210 |
+
)
|
211 |
+
|
212 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
213 |
+
visualizer = dict(
|
214 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
configs/masa-one/open_vocabulary_mot_test/masa_r50_open_vocabulary_test.py
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../default_runtime.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v1.py',
|
4 |
+
]
|
5 |
+
default_scope = 'mmdet'
|
6 |
+
|
7 |
+
model = dict(
|
8 |
+
type='MASA',
|
9 |
+
unified_backbone=False,
|
10 |
+
load_public_dets = True,
|
11 |
+
use_masa_backbone = True,
|
12 |
+
benchmark = 'tao',
|
13 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
|
14 |
+
data_preprocessor=dict(
|
15 |
+
type='TrackDataPreprocessor',
|
16 |
+
# Image normalization parameters
|
17 |
+
mean=[123.675, 116.28, 103.53],
|
18 |
+
std=[58.395, 57.12, 57.375],
|
19 |
+
bgr_to_rgb=True,
|
20 |
+
# Image padding parameters
|
21 |
+
pad_mask=True, # In instance segmentation, the mask needs to be padded
|
22 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
23 |
+
backbone=dict(
|
24 |
+
type='ResNet',
|
25 |
+
depth=50,
|
26 |
+
num_stages=4,
|
27 |
+
out_indices=(0, 1, 2, 3),
|
28 |
+
frozen_stages=-1,
|
29 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
30 |
+
norm_eval=True,
|
31 |
+
style='caffe',),
|
32 |
+
masa_adapter=[
|
33 |
+
dict(
|
34 |
+
type='FPN',
|
35 |
+
in_channels=[256, 512, 1024, 2048],
|
36 |
+
out_channels=256,
|
37 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
38 |
+
num_outs=5),
|
39 |
+
dict(
|
40 |
+
type='DeformFusion',
|
41 |
+
in_channels=256,
|
42 |
+
out_channels=256,
|
43 |
+
num_blocks=3)],
|
44 |
+
rpn_head=dict(
|
45 |
+
type='RPNHead',
|
46 |
+
in_channels=256,
|
47 |
+
feat_channels=256,
|
48 |
+
anchor_generator=dict(
|
49 |
+
type='AnchorGenerator',
|
50 |
+
scales=[8],
|
51 |
+
ratios=[0.5, 1.0, 2.0],
|
52 |
+
strides=[4, 8, 16, 32, 64]),
|
53 |
+
bbox_coder=dict(
|
54 |
+
type='DeltaXYWHBBoxCoder',
|
55 |
+
target_means=[.0, .0, .0, .0],
|
56 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
57 |
+
loss_cls=dict(
|
58 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
59 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
60 |
+
roi_head=dict(
|
61 |
+
type='StandardRoIHead',
|
62 |
+
bbox_roi_extractor=dict(
|
63 |
+
type='SingleRoIExtractor',
|
64 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
65 |
+
out_channels=256,
|
66 |
+
featmap_strides=[4, 8, 16, 32]),
|
67 |
+
bbox_head=dict(
|
68 |
+
type='Shared4Conv1FCBBoxHead',
|
69 |
+
in_channels=256,
|
70 |
+
fc_out_channels=1024,
|
71 |
+
roi_feat_size=7,
|
72 |
+
num_classes=1,
|
73 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
74 |
+
bbox_coder=dict(
|
75 |
+
type='DeltaXYWHBBoxCoder',
|
76 |
+
target_means=[0., 0., 0., 0.],
|
77 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
78 |
+
reg_class_agnostic=True,
|
79 |
+
loss_cls=dict(
|
80 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
81 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
82 |
+
# model training and testing settings
|
83 |
+
train_cfg=dict(
|
84 |
+
rpn=dict(
|
85 |
+
assigner=dict(
|
86 |
+
type='MaxIoUAssigner',
|
87 |
+
pos_iou_thr=0.7,
|
88 |
+
neg_iou_thr=0.3,
|
89 |
+
min_pos_iou=0.3,
|
90 |
+
match_low_quality=True,
|
91 |
+
ignore_iof_thr=-1),
|
92 |
+
sampler=dict(
|
93 |
+
type='RandomSampler',
|
94 |
+
num=256,
|
95 |
+
pos_fraction=0.5,
|
96 |
+
neg_pos_ub=-1,
|
97 |
+
add_gt_as_proposals=False),
|
98 |
+
allowed_border=-1,
|
99 |
+
pos_weight=-1,
|
100 |
+
debug=False),
|
101 |
+
rpn_proposal=dict(
|
102 |
+
nms_pre=2000,
|
103 |
+
max_per_img=1000,
|
104 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
105 |
+
min_bbox_size=0),
|
106 |
+
rcnn=dict(
|
107 |
+
assigner=dict(
|
108 |
+
type='MaxIoUAssigner',
|
109 |
+
pos_iou_thr=0.5,
|
110 |
+
neg_iou_thr=0.5,
|
111 |
+
min_pos_iou=0.5,
|
112 |
+
match_low_quality=False,
|
113 |
+
ignore_iof_thr=-1),
|
114 |
+
sampler=dict(
|
115 |
+
type='RandomSampler',
|
116 |
+
num=512,
|
117 |
+
pos_fraction=0.25,
|
118 |
+
neg_pos_ub=-1,
|
119 |
+
add_gt_as_proposals=True),
|
120 |
+
pos_weight=-1,
|
121 |
+
debug=False)),
|
122 |
+
test_cfg=dict(
|
123 |
+
rpn=dict(
|
124 |
+
nms_pre=1000,
|
125 |
+
max_per_img=1000,
|
126 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
127 |
+
min_bbox_size=0),
|
128 |
+
rcnn=dict(
|
129 |
+
score_thr=0.02,
|
130 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
131 |
+
nms=dict(type='nms',
|
132 |
+
iou_threshold=0.5,
|
133 |
+
class_agnostic=True,
|
134 |
+
split_thr=100000),
|
135 |
+
max_per_img=50,
|
136 |
+
mask_thr_binary=0.5)
|
137 |
+
# soft-nms is also supported for rcnn testing
|
138 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
139 |
+
),
|
140 |
+
track_head=dict(
|
141 |
+
type='MasaTrackHead',
|
142 |
+
roi_extractor=dict(
|
143 |
+
type='SingleRoIExtractor',
|
144 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
145 |
+
out_channels=256,
|
146 |
+
featmap_strides=[4, 8, 16, 32]),
|
147 |
+
embed_head=dict(
|
148 |
+
type='QuasiDenseEmbedHead',
|
149 |
+
num_convs=4,
|
150 |
+
num_fcs=1,
|
151 |
+
embed_channels=256,
|
152 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
153 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
154 |
+
loss_track_aux=dict(
|
155 |
+
type='MarginL2Loss',
|
156 |
+
neg_pos_ub=3,
|
157 |
+
pos_margin=0,
|
158 |
+
neg_margin=0.1,
|
159 |
+
hard_mining=True,
|
160 |
+
loss_weight=1.0)),
|
161 |
+
train_cfg=dict(
|
162 |
+
assigner=dict(
|
163 |
+
type='MaxIoUAssigner',
|
164 |
+
pos_iou_thr=0.7,
|
165 |
+
neg_iou_thr=0.3,
|
166 |
+
min_pos_iou=0.5,
|
167 |
+
match_low_quality=False,
|
168 |
+
ignore_iof_thr=-1),
|
169 |
+
sampler=dict(
|
170 |
+
type='CombinedSampler',
|
171 |
+
num=512,
|
172 |
+
pos_fraction=0.5,
|
173 |
+
neg_pos_ub=3,
|
174 |
+
add_gt_as_proposals=True,
|
175 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
176 |
+
neg_sampler=dict(type='RandomSampler')))),
|
177 |
+
tracker=dict(
|
178 |
+
type='MasaTaoTracker',
|
179 |
+
init_score_thr=0.0001,
|
180 |
+
obj_score_thr=0.0001,
|
181 |
+
match_score_thr=0.5,
|
182 |
+
memo_tracklet_frames=10,
|
183 |
+
memo_momentum=0.8,
|
184 |
+
with_cats=False,
|
185 |
+
max_distance=-1,
|
186 |
+
fps=1,
|
187 |
+
)
|
188 |
+
)
|
189 |
+
|
190 |
+
test_pipeline = [
|
191 |
+
dict(
|
192 |
+
type='TransformBroadcaster',
|
193 |
+
transforms=[
|
194 |
+
dict(type='LoadImageFromFile'),
|
195 |
+
dict(
|
196 |
+
type='Resize',
|
197 |
+
scale=(1024, 1024),
|
198 |
+
keep_ratio=True),
|
199 |
+
dict(type='LoadTrackAnnotations')
|
200 |
+
]),
|
201 |
+
dict(type='PackTrackInputs')
|
202 |
+
]
|
203 |
+
|
204 |
+
# runtime settings
|
205 |
+
train_dataloader = None
|
206 |
+
train_cfg = None
|
207 |
+
val_cfg = dict(type='ValLoop')
|
208 |
+
test_cfg = dict(type='TestLoop')
|
209 |
+
|
210 |
+
default_hooks = dict(
|
211 |
+
logger=dict(type='LoggerHook', interval=50),
|
212 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
213 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
214 |
+
)
|
215 |
+
|
216 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
217 |
+
visualizer = dict(
|
218 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
219 |
+
|
220 |
+
val_dataloader = dict(
|
221 |
+
dataset=dict(
|
222 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
223 |
+
pipeline=test_pipeline,
|
224 |
+
)
|
225 |
+
)
|
226 |
+
test_dataloader = val_dataloader
|
227 |
+
test_evaluator = dict(
|
228 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
229 |
+
outfile_prefix='results/masa_results/masa-r50-release-ovmot-test',
|
230 |
+
open_vocabulary=True,
|
231 |
+
)
|
configs/masa-one/tao_teta_test/masa_r50_tao_test_detic_dets.py
ADDED
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../default_runtime.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v1.py',
|
4 |
+
]
|
5 |
+
default_scope = 'mmdet'
|
6 |
+
|
7 |
+
model = dict(
|
8 |
+
type='MASA',
|
9 |
+
unified_backbone=False,
|
10 |
+
load_public_dets = True,
|
11 |
+
use_masa_backbone = True,
|
12 |
+
benchmark = 'tao',
|
13 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
|
14 |
+
data_preprocessor=dict(
|
15 |
+
type='TrackDataPreprocessor',
|
16 |
+
# Image normalization parameters
|
17 |
+
mean=[123.675, 116.28, 103.53],
|
18 |
+
std=[58.395, 57.12, 57.375],
|
19 |
+
bgr_to_rgb=True,
|
20 |
+
# Image padding parameters
|
21 |
+
pad_mask=True, # In instance segmentation, the mask needs to be padded
|
22 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
23 |
+
backbone=dict(
|
24 |
+
type='ResNet',
|
25 |
+
depth=50,
|
26 |
+
num_stages=4,
|
27 |
+
out_indices=(0, 1, 2, 3),
|
28 |
+
frozen_stages=-1,
|
29 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
30 |
+
norm_eval=True,
|
31 |
+
style='caffe',),
|
32 |
+
masa_adapter=[
|
33 |
+
dict(
|
34 |
+
type='FPN',
|
35 |
+
in_channels=[256, 512, 1024, 2048],
|
36 |
+
out_channels=256,
|
37 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
38 |
+
num_outs=5),
|
39 |
+
dict(
|
40 |
+
type='DeformFusion',
|
41 |
+
in_channels=256,
|
42 |
+
out_channels=256,
|
43 |
+
num_blocks=3)],
|
44 |
+
rpn_head=dict(
|
45 |
+
type='RPNHead',
|
46 |
+
in_channels=256,
|
47 |
+
feat_channels=256,
|
48 |
+
anchor_generator=dict(
|
49 |
+
type='AnchorGenerator',
|
50 |
+
scales=[8],
|
51 |
+
ratios=[0.5, 1.0, 2.0],
|
52 |
+
strides=[4, 8, 16, 32, 64]),
|
53 |
+
bbox_coder=dict(
|
54 |
+
type='DeltaXYWHBBoxCoder',
|
55 |
+
target_means=[.0, .0, .0, .0],
|
56 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
57 |
+
loss_cls=dict(
|
58 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
59 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
60 |
+
roi_head=dict(
|
61 |
+
type='StandardRoIHead',
|
62 |
+
bbox_roi_extractor=dict(
|
63 |
+
type='SingleRoIExtractor',
|
64 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
65 |
+
out_channels=256,
|
66 |
+
featmap_strides=[4, 8, 16, 32]),
|
67 |
+
bbox_head=dict(
|
68 |
+
type='Shared4Conv1FCBBoxHead',
|
69 |
+
in_channels=256,
|
70 |
+
fc_out_channels=1024,
|
71 |
+
roi_feat_size=7,
|
72 |
+
num_classes=1,
|
73 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
74 |
+
bbox_coder=dict(
|
75 |
+
type='DeltaXYWHBBoxCoder',
|
76 |
+
target_means=[0., 0., 0., 0.],
|
77 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
78 |
+
reg_class_agnostic=True,
|
79 |
+
loss_cls=dict(
|
80 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
81 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
82 |
+
# model training and testing settings
|
83 |
+
train_cfg=dict(
|
84 |
+
rpn=dict(
|
85 |
+
assigner=dict(
|
86 |
+
type='MaxIoUAssigner',
|
87 |
+
pos_iou_thr=0.7,
|
88 |
+
neg_iou_thr=0.3,
|
89 |
+
min_pos_iou=0.3,
|
90 |
+
match_low_quality=True,
|
91 |
+
ignore_iof_thr=-1),
|
92 |
+
sampler=dict(
|
93 |
+
type='RandomSampler',
|
94 |
+
num=256,
|
95 |
+
pos_fraction=0.5,
|
96 |
+
neg_pos_ub=-1,
|
97 |
+
add_gt_as_proposals=False),
|
98 |
+
allowed_border=-1,
|
99 |
+
pos_weight=-1,
|
100 |
+
debug=False),
|
101 |
+
rpn_proposal=dict(
|
102 |
+
nms_pre=2000,
|
103 |
+
max_per_img=1000,
|
104 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
105 |
+
min_bbox_size=0),
|
106 |
+
rcnn=dict(
|
107 |
+
assigner=dict(
|
108 |
+
type='MaxIoUAssigner',
|
109 |
+
pos_iou_thr=0.5,
|
110 |
+
neg_iou_thr=0.5,
|
111 |
+
min_pos_iou=0.5,
|
112 |
+
match_low_quality=False,
|
113 |
+
ignore_iof_thr=-1),
|
114 |
+
sampler=dict(
|
115 |
+
type='RandomSampler',
|
116 |
+
num=512,
|
117 |
+
pos_fraction=0.25,
|
118 |
+
neg_pos_ub=-1,
|
119 |
+
add_gt_as_proposals=True),
|
120 |
+
pos_weight=-1,
|
121 |
+
debug=False)),
|
122 |
+
test_cfg=dict(
|
123 |
+
rpn=dict(
|
124 |
+
nms_pre=1000,
|
125 |
+
max_per_img=1000,
|
126 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
127 |
+
min_bbox_size=0),
|
128 |
+
rcnn=dict(
|
129 |
+
score_thr=0.02,
|
130 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
131 |
+
nms=dict(type='nms',
|
132 |
+
iou_threshold=0.5,
|
133 |
+
class_agnostic=True,
|
134 |
+
split_thr=100000),
|
135 |
+
max_per_img=50,
|
136 |
+
mask_thr_binary=0.5)
|
137 |
+
# soft-nms is also supported for rcnn testing
|
138 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
139 |
+
),
|
140 |
+
track_head=dict(
|
141 |
+
type='MasaTrackHead',
|
142 |
+
roi_extractor=dict(
|
143 |
+
type='SingleRoIExtractor',
|
144 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
145 |
+
out_channels=256,
|
146 |
+
featmap_strides=[4, 8, 16, 32]),
|
147 |
+
embed_head=dict(
|
148 |
+
type='QuasiDenseEmbedHead',
|
149 |
+
num_convs=4,
|
150 |
+
num_fcs=1,
|
151 |
+
embed_channels=256,
|
152 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
153 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
154 |
+
loss_track_aux=dict(
|
155 |
+
type='MarginL2Loss',
|
156 |
+
neg_pos_ub=3,
|
157 |
+
pos_margin=0,
|
158 |
+
neg_margin=0.1,
|
159 |
+
hard_mining=True,
|
160 |
+
loss_weight=1.0)),
|
161 |
+
train_cfg=dict(
|
162 |
+
assigner=dict(
|
163 |
+
type='MaxIoUAssigner',
|
164 |
+
pos_iou_thr=0.7,
|
165 |
+
neg_iou_thr=0.3,
|
166 |
+
min_pos_iou=0.5,
|
167 |
+
match_low_quality=False,
|
168 |
+
ignore_iof_thr=-1),
|
169 |
+
sampler=dict(
|
170 |
+
type='CombinedSampler',
|
171 |
+
num=512,
|
172 |
+
pos_fraction=0.5,
|
173 |
+
neg_pos_ub=3,
|
174 |
+
add_gt_as_proposals=True,
|
175 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
176 |
+
neg_sampler=dict(type='RandomSampler')))),
|
177 |
+
tracker=dict(
|
178 |
+
type='MasaTaoTracker',
|
179 |
+
init_score_thr=0.0001,
|
180 |
+
obj_score_thr=0.0001,
|
181 |
+
match_score_thr=0.5,
|
182 |
+
memo_tracklet_frames=10,
|
183 |
+
memo_momentum=0.8,
|
184 |
+
with_cats=False,
|
185 |
+
max_distance=-1,
|
186 |
+
fps=1,
|
187 |
+
)
|
188 |
+
)
|
189 |
+
|
190 |
+
test_pipeline = [
|
191 |
+
dict(
|
192 |
+
type='TransformBroadcaster',
|
193 |
+
transforms=[
|
194 |
+
dict(type='LoadImageFromFile'),
|
195 |
+
dict(
|
196 |
+
type='Resize',
|
197 |
+
scale=(1024, 1024),
|
198 |
+
keep_ratio=True),
|
199 |
+
dict(type='LoadTrackAnnotations')
|
200 |
+
]),
|
201 |
+
dict(type='PackTrackInputs')
|
202 |
+
]
|
203 |
+
|
204 |
+
# runtime settings
|
205 |
+
train_dataloader = None
|
206 |
+
train_cfg = None
|
207 |
+
val_cfg = dict(type='ValLoop')
|
208 |
+
test_cfg = dict(type='TestLoop')
|
209 |
+
|
210 |
+
default_hooks = dict(
|
211 |
+
logger=dict(type='LoggerHook', interval=50),
|
212 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
213 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
214 |
+
)
|
215 |
+
|
216 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
217 |
+
visualizer = dict(
|
218 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
219 |
+
|
220 |
+
val_dataloader = dict(
|
221 |
+
dataset=dict(
|
222 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
223 |
+
pipeline=test_pipeline,
|
224 |
+
)
|
225 |
+
)
|
226 |
+
test_dataloader = val_dataloader
|
227 |
+
test_evaluator = dict(
|
228 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
229 |
+
outfile_prefix='results/masa_results/masa-r50-release_detic_dets-test',
|
230 |
+
)
|
configs/masa-one/tao_teta_test/masa_r50_tao_test_teter_swinT_dets.py
ADDED
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../../default_runtime.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v05.py',
|
4 |
+
]
|
5 |
+
default_scope = 'mmdet'
|
6 |
+
|
7 |
+
model = dict(
|
8 |
+
type='MASA',
|
9 |
+
unified_backbone=False,
|
10 |
+
load_public_dets = True,
|
11 |
+
use_masa_backbone = True,
|
12 |
+
benchmark = 'tao',
|
13 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/',
|
14 |
+
data_preprocessor=dict(
|
15 |
+
type='TrackDataPreprocessor',
|
16 |
+
# Image normalization parameters
|
17 |
+
mean=[123.675, 116.28, 103.53],
|
18 |
+
std=[58.395, 57.12, 57.375],
|
19 |
+
bgr_to_rgb=True,
|
20 |
+
# Image padding parameters
|
21 |
+
pad_mask=True, # In instance segmentation, the mask needs to be padded
|
22 |
+
pad_size_divisor=32), # Padding the image to multiples of 32
|
23 |
+
backbone=dict(
|
24 |
+
type='ResNet',
|
25 |
+
depth=50,
|
26 |
+
num_stages=4,
|
27 |
+
out_indices=(0, 1, 2, 3),
|
28 |
+
frozen_stages=-1,
|
29 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
30 |
+
norm_eval=True,
|
31 |
+
style='caffe',),
|
32 |
+
masa_adapter=[
|
33 |
+
dict(
|
34 |
+
type='FPN',
|
35 |
+
in_channels=[256, 512, 1024, 2048],
|
36 |
+
out_channels=256,
|
37 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
38 |
+
num_outs=5),
|
39 |
+
dict(
|
40 |
+
type='DeformFusion',
|
41 |
+
in_channels=256,
|
42 |
+
out_channels=256,
|
43 |
+
num_blocks=3)],
|
44 |
+
rpn_head=dict(
|
45 |
+
type='RPNHead',
|
46 |
+
in_channels=256,
|
47 |
+
feat_channels=256,
|
48 |
+
anchor_generator=dict(
|
49 |
+
type='AnchorGenerator',
|
50 |
+
scales=[8],
|
51 |
+
ratios=[0.5, 1.0, 2.0],
|
52 |
+
strides=[4, 8, 16, 32, 64]),
|
53 |
+
bbox_coder=dict(
|
54 |
+
type='DeltaXYWHBBoxCoder',
|
55 |
+
target_means=[.0, .0, .0, .0],
|
56 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
57 |
+
loss_cls=dict(
|
58 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
59 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
60 |
+
roi_head=dict(
|
61 |
+
type='StandardRoIHead',
|
62 |
+
bbox_roi_extractor=dict(
|
63 |
+
type='SingleRoIExtractor',
|
64 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
65 |
+
out_channels=256,
|
66 |
+
featmap_strides=[4, 8, 16, 32]),
|
67 |
+
bbox_head=dict(
|
68 |
+
type='Shared4Conv1FCBBoxHead',
|
69 |
+
in_channels=256,
|
70 |
+
fc_out_channels=1024,
|
71 |
+
roi_feat_size=7,
|
72 |
+
num_classes=1,
|
73 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
74 |
+
bbox_coder=dict(
|
75 |
+
type='DeltaXYWHBBoxCoder',
|
76 |
+
target_means=[0., 0., 0., 0.],
|
77 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
78 |
+
reg_class_agnostic=True,
|
79 |
+
loss_cls=dict(
|
80 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
81 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
82 |
+
# model training and testing settings
|
83 |
+
train_cfg=dict(
|
84 |
+
rpn=dict(
|
85 |
+
assigner=dict(
|
86 |
+
type='MaxIoUAssigner',
|
87 |
+
pos_iou_thr=0.7,
|
88 |
+
neg_iou_thr=0.3,
|
89 |
+
min_pos_iou=0.3,
|
90 |
+
match_low_quality=True,
|
91 |
+
ignore_iof_thr=-1),
|
92 |
+
sampler=dict(
|
93 |
+
type='RandomSampler',
|
94 |
+
num=256,
|
95 |
+
pos_fraction=0.5,
|
96 |
+
neg_pos_ub=-1,
|
97 |
+
add_gt_as_proposals=False),
|
98 |
+
allowed_border=-1,
|
99 |
+
pos_weight=-1,
|
100 |
+
debug=False),
|
101 |
+
rpn_proposal=dict(
|
102 |
+
nms_pre=2000,
|
103 |
+
max_per_img=1000,
|
104 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
105 |
+
min_bbox_size=0),
|
106 |
+
rcnn=dict(
|
107 |
+
assigner=dict(
|
108 |
+
type='MaxIoUAssigner',
|
109 |
+
pos_iou_thr=0.5,
|
110 |
+
neg_iou_thr=0.5,
|
111 |
+
min_pos_iou=0.5,
|
112 |
+
match_low_quality=False,
|
113 |
+
ignore_iof_thr=-1),
|
114 |
+
sampler=dict(
|
115 |
+
type='RandomSampler',
|
116 |
+
num=512,
|
117 |
+
pos_fraction=0.25,
|
118 |
+
neg_pos_ub=-1,
|
119 |
+
add_gt_as_proposals=True),
|
120 |
+
pos_weight=-1,
|
121 |
+
debug=False)),
|
122 |
+
test_cfg=dict(
|
123 |
+
rpn=dict(
|
124 |
+
nms_pre=1000,
|
125 |
+
max_per_img=1000,
|
126 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
127 |
+
min_bbox_size=0),
|
128 |
+
rcnn=dict(
|
129 |
+
score_thr=0.02,
|
130 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
131 |
+
nms=dict(type='nms',
|
132 |
+
iou_threshold=0.5,
|
133 |
+
class_agnostic=True,
|
134 |
+
split_thr=100000),
|
135 |
+
max_per_img=50,
|
136 |
+
mask_thr_binary=0.5)
|
137 |
+
# soft-nms is also supported for rcnn testing
|
138 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
139 |
+
),
|
140 |
+
track_head=dict(
|
141 |
+
type='MasaTrackHead',
|
142 |
+
roi_extractor=dict(
|
143 |
+
type='SingleRoIExtractor',
|
144 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
145 |
+
out_channels=256,
|
146 |
+
featmap_strides=[4, 8, 16, 32]),
|
147 |
+
embed_head=dict(
|
148 |
+
type='QuasiDenseEmbedHead',
|
149 |
+
num_convs=4,
|
150 |
+
num_fcs=1,
|
151 |
+
embed_channels=256,
|
152 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
153 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
154 |
+
loss_track_aux=dict(
|
155 |
+
type='MarginL2Loss',
|
156 |
+
neg_pos_ub=3,
|
157 |
+
pos_margin=0,
|
158 |
+
neg_margin=0.1,
|
159 |
+
hard_mining=True,
|
160 |
+
loss_weight=1.0)),
|
161 |
+
train_cfg=dict(
|
162 |
+
assigner=dict(
|
163 |
+
type='MaxIoUAssigner',
|
164 |
+
pos_iou_thr=0.7,
|
165 |
+
neg_iou_thr=0.3,
|
166 |
+
min_pos_iou=0.5,
|
167 |
+
match_low_quality=False,
|
168 |
+
ignore_iof_thr=-1),
|
169 |
+
sampler=dict(
|
170 |
+
type='CombinedSampler',
|
171 |
+
num=512,
|
172 |
+
pos_fraction=0.5,
|
173 |
+
neg_pos_ub=3,
|
174 |
+
add_gt_as_proposals=True,
|
175 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
176 |
+
neg_sampler=dict(type='RandomSampler')))),
|
177 |
+
tracker=dict(
|
178 |
+
type='MasaTaoTracker',
|
179 |
+
init_score_thr=0.0001,
|
180 |
+
obj_score_thr=0.0001,
|
181 |
+
match_score_thr=0.5,
|
182 |
+
memo_tracklet_frames=10,
|
183 |
+
memo_momentum=0.8,
|
184 |
+
with_cats=False,
|
185 |
+
max_distance=-1,
|
186 |
+
fps=1,
|
187 |
+
)
|
188 |
+
)
|
189 |
+
|
190 |
+
test_pipeline = [
|
191 |
+
dict(
|
192 |
+
type='TransformBroadcaster',
|
193 |
+
transforms=[
|
194 |
+
dict(type='LoadImageFromFile'),
|
195 |
+
dict(
|
196 |
+
type='Resize',
|
197 |
+
scale=(1024, 1024),
|
198 |
+
keep_ratio=True),
|
199 |
+
dict(type='LoadTrackAnnotations')
|
200 |
+
]),
|
201 |
+
dict(type='PackTrackInputs')
|
202 |
+
]
|
203 |
+
|
204 |
+
# runtime settings
|
205 |
+
train_dataloader = None
|
206 |
+
train_cfg = None
|
207 |
+
val_cfg = dict(type='ValLoop')
|
208 |
+
test_cfg = dict(type='TestLoop')
|
209 |
+
|
210 |
+
default_hooks = dict(
|
211 |
+
logger=dict(type='LoggerHook', interval=50),
|
212 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
213 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
214 |
+
)
|
215 |
+
|
216 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
217 |
+
visualizer = dict(
|
218 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
219 |
+
|
220 |
+
val_dataloader = dict(
|
221 |
+
dataset=dict(
|
222 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
|
223 |
+
pipeline=test_pipeline,
|
224 |
+
)
|
225 |
+
)
|
226 |
+
test_dataloader = val_dataloader
|
227 |
+
test_evaluator = dict(
|
228 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
|
229 |
+
outfile_prefix='results/masa_results/masa-r50-release-tao-teter-test',
|
230 |
+
)
|
configs/masa-sam/bdd_test/masa_sam_vitb_bdd_mot_test.py
ADDED
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../sam-vitb.py',
|
3 |
+
'../../datasets/bdd/bdd_dataset.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/pretrain_weights/sam_vit_b_01ec64_mmdet.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'SamMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = True,
|
22 |
+
benchmark = 'bdd',
|
23 |
+
public_det_path = 'results/public_dets/bdd_mot_yolox_dets/',
|
24 |
+
data_preprocessor=dict(
|
25 |
+
type='TrackDataPreprocessor',
|
26 |
+
# Image normalization parameters
|
27 |
+
mean=[123.675, 116.28, 103.53],
|
28 |
+
std=[58.395, 57.12, 57.375],
|
29 |
+
bgr_to_rgb=True,
|
30 |
+
# Image padding parameters
|
31 |
+
pad_mask=False, # Mask padding is disabled here; enable it when instance segmentation masks need to be padded
|
32 |
+
pad_size_divisor=1024), # Padding the image to multiples of 1024
|
33 |
+
detector=detector,
|
34 |
+
masa_adapter=[
|
35 |
+
dict(
|
36 |
+
type='SimpleFPN',
|
37 |
+
in_channels=[768, 768, 768, 768],
|
38 |
+
out_channels=256,
|
39 |
+
use_residual=True,
|
40 |
+
num_outs=5),
|
41 |
+
dict(
|
42 |
+
type='DyHead',
|
43 |
+
in_channels=256,
|
44 |
+
out_channels=256,
|
45 |
+
num_blocks=3)
|
46 |
+
],
|
47 |
+
rpn_head=dict(
|
48 |
+
type='RPNHead',
|
49 |
+
in_channels=256,
|
50 |
+
feat_channels=256,
|
51 |
+
anchor_generator=dict(
|
52 |
+
type='AnchorGenerator',
|
53 |
+
scales=[8],
|
54 |
+
ratios=[0.5, 1.0, 2.0],
|
55 |
+
strides=[4, 8, 16, 32, 64]),
|
56 |
+
bbox_coder=dict(
|
57 |
+
type='DeltaXYWHBBoxCoder',
|
58 |
+
target_means=[.0, .0, .0, .0],
|
59 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
60 |
+
loss_cls=dict(
|
61 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
62 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
63 |
+
),
|
64 |
+
roi_head=dict(
|
65 |
+
type='StandardRoIHead',
|
66 |
+
bbox_roi_extractor=dict(
|
67 |
+
type='SingleRoIExtractor',
|
68 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
69 |
+
out_channels=256,
|
70 |
+
featmap_strides=[4, 8, 16, 32]),
|
71 |
+
bbox_head=dict(
|
72 |
+
type='Shared2FCBBoxHead',
|
73 |
+
in_channels=256,
|
74 |
+
fc_out_channels=1024,
|
75 |
+
roi_feat_size=7,
|
76 |
+
num_classes=1,
|
77 |
+
bbox_coder=dict(
|
78 |
+
type='DeltaXYWHBBoxCoder',
|
79 |
+
target_means=[0., 0., 0., 0.],
|
80 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
81 |
+
reg_class_agnostic=True,
|
82 |
+
loss_cls=dict(
|
83 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
84 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
85 |
+
# model training and testing settings
|
86 |
+
train_cfg=dict(
|
87 |
+
rpn=dict(
|
88 |
+
assigner=dict(
|
89 |
+
type='MaxIoUAssigner',
|
90 |
+
pos_iou_thr=0.7,
|
91 |
+
neg_iou_thr=0.3,
|
92 |
+
min_pos_iou=0.3,
|
93 |
+
match_low_quality=True,
|
94 |
+
ignore_iof_thr=-1),
|
95 |
+
sampler=dict(
|
96 |
+
type='RandomSampler',
|
97 |
+
num=256,
|
98 |
+
pos_fraction=0.5,
|
99 |
+
neg_pos_ub=-1,
|
100 |
+
add_gt_as_proposals=False),
|
101 |
+
allowed_border=-1,
|
102 |
+
pos_weight=-1,
|
103 |
+
debug=False),
|
104 |
+
rpn_proposal=dict(
|
105 |
+
nms_pre=2000,
|
106 |
+
max_per_img=1000,
|
107 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
108 |
+
min_bbox_size=0),
|
109 |
+
rcnn=dict(
|
110 |
+
assigner=dict(
|
111 |
+
type='MaxIoUAssigner',
|
112 |
+
pos_iou_thr=0.5,
|
113 |
+
neg_iou_thr=0.5,
|
114 |
+
min_pos_iou=0.5,
|
115 |
+
match_low_quality=False,
|
116 |
+
ignore_iof_thr=-1),
|
117 |
+
sampler=dict(
|
118 |
+
type='RandomSampler',
|
119 |
+
num=512,
|
120 |
+
pos_fraction=0.25,
|
121 |
+
neg_pos_ub=-1,
|
122 |
+
add_gt_as_proposals=True),
|
123 |
+
pos_weight=-1,
|
124 |
+
debug=False)),
|
125 |
+
test_cfg=dict(
|
126 |
+
rpn=dict(
|
127 |
+
nms_pre=1000,
|
128 |
+
max_per_img=1000,
|
129 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
130 |
+
min_bbox_size=0),
|
131 |
+
rcnn=dict(
|
132 |
+
score_thr=0.02,
|
133 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
134 |
+
nms=dict(type='nms',
|
135 |
+
iou_threshold=0.5,
|
136 |
+
class_agnostic=True,
|
137 |
+
split_thr=100000),
|
138 |
+
max_per_img=50,
|
139 |
+
mask_thr_binary=0.5)
|
140 |
+
# soft-nms is also supported for rcnn testing
|
141 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
142 |
+
),
|
143 |
+
track_head=dict(
|
144 |
+
type='MasaTrackHead',
|
145 |
+
roi_extractor=dict(
|
146 |
+
type='SingleRoIExtractor',
|
147 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
148 |
+
out_channels=256,
|
149 |
+
featmap_strides=[4, 8, 16, 32]),
|
150 |
+
embed_head=dict(
|
151 |
+
type='QuasiDenseEmbedHead',
|
152 |
+
num_convs=4,
|
153 |
+
num_fcs=1,
|
154 |
+
embed_channels=256,
|
155 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
156 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
157 |
+
loss_track_aux=dict(
|
158 |
+
type='MarginL2Loss',
|
159 |
+
neg_pos_ub=3,
|
160 |
+
pos_margin=0,
|
161 |
+
neg_margin=0.1,
|
162 |
+
hard_mining=True,
|
163 |
+
loss_weight=1.0)),
|
164 |
+
train_cfg=dict(
|
165 |
+
assigner=dict(
|
166 |
+
type='MaxIoUAssigner',
|
167 |
+
pos_iou_thr=0.7,
|
168 |
+
neg_iou_thr=0.3,
|
169 |
+
min_pos_iou=0.5,
|
170 |
+
match_low_quality=False,
|
171 |
+
ignore_iof_thr=-1),
|
172 |
+
sampler=dict(
|
173 |
+
type='CombinedSampler',
|
174 |
+
num=512,
|
175 |
+
pos_fraction=0.5,
|
176 |
+
neg_pos_ub=3,
|
177 |
+
add_gt_as_proposals=True,
|
178 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
179 |
+
neg_sampler=dict(type='RandomSampler')))),
|
180 |
+
tracker=dict(
|
181 |
+
type='MasaBDDTracker',
|
182 |
+
init_score_thr=0.5,
|
183 |
+
obj_score_thr=0.3,
|
184 |
+
match_score_thr=0.6,
|
185 |
+
memo_tracklet_frames=10,
|
186 |
+
memo_backdrop_frames=1,
|
187 |
+
memo_momentum=0.8,
|
188 |
+
nms_conf_thr=0.5,
|
189 |
+
nms_backdrop_iou_thr=0.3,
|
190 |
+
nms_class_iou_thr=0.7,
|
191 |
+
with_cats=False,
|
192 |
+
match_metric='bisoftmax')
|
193 |
+
)
|
194 |
+
|
195 |
+
test_pipeline = [
|
196 |
+
dict(
|
197 |
+
type='TransformBroadcaster',
|
198 |
+
transforms=[
|
199 |
+
dict(type='LoadImageFromFile'),
|
200 |
+
dict(
|
201 |
+
type='Resize',
|
202 |
+
scale=(1024, 1024),
|
203 |
+
keep_ratio=True),
|
204 |
+
dict(type='LoadTrackAnnotations')
|
205 |
+
]),
|
206 |
+
dict(type='PackTrackInputs')
|
207 |
+
]
|
208 |
+
|
209 |
+
# runtime settings
|
210 |
+
train_dataloader = None
|
211 |
+
train_cfg = None
|
212 |
+
val_cfg = dict(type='ValLoop')
|
213 |
+
test_cfg = dict(type='TestLoop')
|
214 |
+
|
215 |
+
default_hooks = dict(
|
216 |
+
logger=dict(type='LoggerHook', interval=50),
|
217 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
218 |
+
checkpoint=dict(type='CheckpointHook', interval=12),
|
219 |
+
)
|
220 |
+
|
221 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
222 |
+
visualizer = dict(
|
223 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
224 |
+
|
225 |
+
# custom hooks
|
226 |
+
custom_hooks = [
|
227 |
+
# Synchronize model buffers such as running_mean and running_var in BN
|
228 |
+
# at the end of each epoch
|
229 |
+
dict(type='SyncBuffersHook')
|
230 |
+
]
|
231 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
232 |
+
val_dataloader = dict(
|
233 |
+
dataset=dict(
|
234 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
235 |
+
pipeline=test_pipeline,
|
236 |
+
)
|
237 |
+
)
|
238 |
+
test_dataloader = val_dataloader
|
239 |
+
val_evaluator = dict(
|
240 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
241 |
+
scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
|
242 |
+
outfile_prefix='results/masa_results/masa-sam-vitb-bdd-mot-test',
|
243 |
+
metric=['TETA', 'HOTA', 'CLEAR']
|
244 |
+
)
|
245 |
+
test_evaluator = val_evaluator
|
configs/masa-sam/bdd_test/masa_sam_vitb_bdd_mots_test.py
ADDED
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../sam-vitb.py',
|
3 |
+
'../../datasets/bdd/bdd_dataset.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/pretrain_weights/sam_vit_b_01ec64_mmdet.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'SamMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = True,
|
22 |
+
with_segm=True,
|
23 |
+
benchmark = 'bdd',
|
24 |
+
public_det_path = 'results/public_dets/bdd_mots_val_uninext_dets/',
|
25 |
+
data_preprocessor=dict(
|
26 |
+
type='TrackDataPreprocessor',
|
27 |
+
# Image normalization parameters
|
28 |
+
mean=[123.675, 116.28, 103.53],
|
29 |
+
std=[58.395, 57.12, 57.375],
|
30 |
+
bgr_to_rgb=True,
|
31 |
+
# Image padding parameters
|
32 |
+
pad_mask=False, # Mask padding is disabled here; enable it when instance segmentation masks need to be padded
|
33 |
+
pad_size_divisor=1024), # Padding the image to multiples of 1024
|
34 |
+
detector=detector,
|
35 |
+
masa_adapter=[
|
36 |
+
dict(
|
37 |
+
type='SimpleFPN',
|
38 |
+
in_channels=[768, 768, 768, 768],
|
39 |
+
out_channels=256,
|
40 |
+
use_residual=True,
|
41 |
+
num_outs=5),
|
42 |
+
dict(
|
43 |
+
type='DyHead',
|
44 |
+
in_channels=256,
|
45 |
+
out_channels=256,
|
46 |
+
num_blocks=3)
|
47 |
+
],
|
48 |
+
rpn_head=dict(
|
49 |
+
type='RPNHead',
|
50 |
+
in_channels=256,
|
51 |
+
feat_channels=256,
|
52 |
+
anchor_generator=dict(
|
53 |
+
type='AnchorGenerator',
|
54 |
+
scales=[8],
|
55 |
+
ratios=[0.5, 1.0, 2.0],
|
56 |
+
strides=[4, 8, 16, 32, 64]),
|
57 |
+
bbox_coder=dict(
|
58 |
+
type='DeltaXYWHBBoxCoder',
|
59 |
+
target_means=[.0, .0, .0, .0],
|
60 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
61 |
+
loss_cls=dict(
|
62 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
63 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
64 |
+
),
|
65 |
+
roi_head=dict(
|
66 |
+
type='StandardRoIHead',
|
67 |
+
bbox_roi_extractor=dict(
|
68 |
+
type='SingleRoIExtractor',
|
69 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
70 |
+
out_channels=256,
|
71 |
+
featmap_strides=[4, 8, 16, 32]),
|
72 |
+
bbox_head=dict(
|
73 |
+
type='Shared2FCBBoxHead',
|
74 |
+
in_channels=256,
|
75 |
+
fc_out_channels=1024,
|
76 |
+
roi_feat_size=7,
|
77 |
+
num_classes=1,
|
78 |
+
bbox_coder=dict(
|
79 |
+
type='DeltaXYWHBBoxCoder',
|
80 |
+
target_means=[0., 0., 0., 0.],
|
81 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
82 |
+
reg_class_agnostic=True,
|
83 |
+
loss_cls=dict(
|
84 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
85 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
86 |
+
# model training and testing settings
|
87 |
+
train_cfg=dict(
|
88 |
+
rpn=dict(
|
89 |
+
assigner=dict(
|
90 |
+
type='MaxIoUAssigner',
|
91 |
+
pos_iou_thr=0.7,
|
92 |
+
neg_iou_thr=0.3,
|
93 |
+
min_pos_iou=0.3,
|
94 |
+
match_low_quality=True,
|
95 |
+
ignore_iof_thr=-1),
|
96 |
+
sampler=dict(
|
97 |
+
type='RandomSampler',
|
98 |
+
num=256,
|
99 |
+
pos_fraction=0.5,
|
100 |
+
neg_pos_ub=-1,
|
101 |
+
add_gt_as_proposals=False),
|
102 |
+
allowed_border=-1,
|
103 |
+
pos_weight=-1,
|
104 |
+
debug=False),
|
105 |
+
rpn_proposal=dict(
|
106 |
+
nms_pre=2000,
|
107 |
+
max_per_img=1000,
|
108 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
109 |
+
min_bbox_size=0),
|
110 |
+
rcnn=dict(
|
111 |
+
assigner=dict(
|
112 |
+
type='MaxIoUAssigner',
|
113 |
+
pos_iou_thr=0.5,
|
114 |
+
neg_iou_thr=0.5,
|
115 |
+
min_pos_iou=0.5,
|
116 |
+
match_low_quality=False,
|
117 |
+
ignore_iof_thr=-1),
|
118 |
+
sampler=dict(
|
119 |
+
type='RandomSampler',
|
120 |
+
num=512,
|
121 |
+
pos_fraction=0.25,
|
122 |
+
neg_pos_ub=-1,
|
123 |
+
add_gt_as_proposals=True),
|
124 |
+
pos_weight=-1,
|
125 |
+
debug=False)),
|
126 |
+
test_cfg=dict(
|
127 |
+
rpn=dict(
|
128 |
+
nms_pre=1000,
|
129 |
+
max_per_img=1000,
|
130 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
131 |
+
min_bbox_size=0),
|
132 |
+
rcnn=dict(
|
133 |
+
score_thr=0.02,
|
134 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
135 |
+
nms=dict(type='nms',
|
136 |
+
iou_threshold=0.5,
|
137 |
+
class_agnostic=True,
|
138 |
+
split_thr=100000),
|
139 |
+
max_per_img=50,
|
140 |
+
mask_thr_binary=0.5)
|
141 |
+
# soft-nms is also supported for rcnn testing
|
142 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
143 |
+
),
|
144 |
+
track_head=dict(
|
145 |
+
type='MasaTrackHead',
|
146 |
+
roi_extractor=dict(
|
147 |
+
type='SingleRoIExtractor',
|
148 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
149 |
+
out_channels=256,
|
150 |
+
featmap_strides=[4, 8, 16, 32]),
|
151 |
+
embed_head=dict(
|
152 |
+
type='QuasiDenseEmbedHead',
|
153 |
+
num_convs=4,
|
154 |
+
num_fcs=1,
|
155 |
+
embed_channels=256,
|
156 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
157 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
158 |
+
loss_track_aux=dict(
|
159 |
+
type='MarginL2Loss',
|
160 |
+
neg_pos_ub=3,
|
161 |
+
pos_margin=0,
|
162 |
+
neg_margin=0.1,
|
163 |
+
hard_mining=True,
|
164 |
+
loss_weight=1.0)),
|
165 |
+
train_cfg=dict(
|
166 |
+
assigner=dict(
|
167 |
+
type='MaxIoUAssigner',
|
168 |
+
pos_iou_thr=0.7,
|
169 |
+
neg_iou_thr=0.3,
|
170 |
+
min_pos_iou=0.5,
|
171 |
+
match_low_quality=False,
|
172 |
+
ignore_iof_thr=-1),
|
173 |
+
sampler=dict(
|
174 |
+
type='CombinedSampler',
|
175 |
+
num=512,
|
176 |
+
pos_fraction=0.5,
|
177 |
+
neg_pos_ub=3,
|
178 |
+
add_gt_as_proposals=True,
|
179 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
180 |
+
neg_sampler=dict(type='RandomSampler')))),
|
181 |
+
tracker=dict(
|
182 |
+
type='MasaBDDTracker',
|
183 |
+
init_score_thr=0.5,
|
184 |
+
obj_score_thr=0.3,
|
185 |
+
match_score_thr=0.6,
|
186 |
+
memo_tracklet_frames=10,
|
187 |
+
memo_backdrop_frames=1,
|
188 |
+
memo_momentum=0.8,
|
189 |
+
nms_conf_thr=0.5,
|
190 |
+
nms_backdrop_iou_thr=0.3,
|
191 |
+
nms_class_iou_thr=0.7,
|
192 |
+
with_cats=False,
|
193 |
+
match_metric='bisoftmax')
|
194 |
+
)
|
195 |
+
|
196 |
+
test_pipeline = [
|
197 |
+
dict(
|
198 |
+
type='TransformBroadcaster',
|
199 |
+
transforms=[
|
200 |
+
dict(type='LoadImageFromFile'),
|
201 |
+
dict(
|
202 |
+
type='Resize',
|
203 |
+
scale=(1024, 1024),
|
204 |
+
keep_ratio=True),
|
205 |
+
dict(type='LoadTrackAnnotations')
|
206 |
+
]),
|
207 |
+
dict(type='PackTrackInputs')
|
208 |
+
]
|
209 |
+
|
210 |
+
# runtime settings
|
211 |
+
train_dataloader = None
|
212 |
+
train_cfg = None
|
213 |
+
val_cfg = dict(type='ValLoop')
|
214 |
+
test_cfg = dict(type='TestLoop')
|
215 |
+
|
216 |
+
default_hooks = dict(
|
217 |
+
logger=dict(type='LoggerHook', interval=50),
|
218 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
219 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
220 |
+
)
|
221 |
+
|
222 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
223 |
+
visualizer = dict(
|
224 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
225 |
+
|
226 |
+
val_dataloader = dict(
|
227 |
+
dataset=dict(
|
228 |
+
ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
|
229 |
+
pipeline=test_pipeline,
|
230 |
+
)
|
231 |
+
)
|
232 |
+
|
233 |
+
test_dataloader = val_dataloader
|
234 |
+
val_evaluator = dict(
|
235 |
+
ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
|
236 |
+
scalabel_gt='data/bdd/annotations/scalabel_gt/seg_track_20/val/',
|
237 |
+
outfile_prefix='results/masa_results/masa-sam-vitb-bdd-mots-test',
|
238 |
+
metric=['TETA', 'HOTA', 'CLEAR'],
|
239 |
+
with_mask=True,
|
240 |
+
)
|
241 |
+
test_evaluator = val_evaluator
|
configs/masa-sam/bdd_test/masa_sam_vith_bdd_mot_test.py
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../sam-vith.py',
|
3 |
+
'../../datasets/bdd/bdd_dataset.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/pretrain_weights/sam_vit_h_4b8939_mmdet.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'SamMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = True,
|
22 |
+
benchmark = 'bdd',
|
23 |
+
public_det_path = 'results/public_dets/bdd_mot_yolox_dets/',
|
24 |
+
data_preprocessor=dict(
|
25 |
+
type='TrackDataPreprocessor',
|
26 |
+
# Image normalization parameters
|
27 |
+
mean=[123.675, 116.28, 103.53],
|
28 |
+
std=[58.395, 57.12, 57.375],
|
29 |
+
bgr_to_rgb=True,
|
30 |
+
# Image padding parameters
|
31 |
+
pad_mask=False, # Mask padding is disabled here; enable it when instance segmentation masks need to be padded
|
32 |
+
pad_size_divisor=1024), # Padding the image to multiples of 1024
|
33 |
+
detector=detector,
|
34 |
+
masa_adapter=[
|
35 |
+
dict(
|
36 |
+
type='SimpleFPN',
|
37 |
+
in_channels=[1280, 1280, 1280, 1280],
|
38 |
+
out_channels=256,
|
39 |
+
use_residual=True,
|
40 |
+
num_outs=5),
|
41 |
+
dict(
|
42 |
+
type='DyHead',
|
43 |
+
in_channels=256,
|
44 |
+
out_channels=256,
|
45 |
+
num_blocks=3)
|
46 |
+
],
|
47 |
+
rpn_head=dict(
|
48 |
+
type='RPNHead',
|
49 |
+
in_channels=256,
|
50 |
+
feat_channels=256,
|
51 |
+
anchor_generator=dict(
|
52 |
+
type='AnchorGenerator',
|
53 |
+
scales=[8],
|
54 |
+
ratios=[0.5, 1.0, 2.0],
|
55 |
+
strides=[4, 8, 16, 32, 64]),
|
56 |
+
bbox_coder=dict(
|
57 |
+
type='DeltaXYWHBBoxCoder',
|
58 |
+
target_means=[.0, .0, .0, .0],
|
59 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
60 |
+
loss_cls=dict(
|
61 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
62 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
63 |
+
),
|
64 |
+
roi_head=dict(
|
65 |
+
type='StandardRoIHead',
|
66 |
+
bbox_roi_extractor=dict(
|
67 |
+
type='SingleRoIExtractor',
|
68 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
69 |
+
out_channels=256,
|
70 |
+
featmap_strides=[4, 8, 16, 32]),
|
71 |
+
bbox_head=dict(
|
72 |
+
type='Shared2FCBBoxHead',
|
73 |
+
in_channels=256,
|
74 |
+
fc_out_channels=1024,
|
75 |
+
roi_feat_size=7,
|
76 |
+
num_classes=1,
|
77 |
+
bbox_coder=dict(
|
78 |
+
type='DeltaXYWHBBoxCoder',
|
79 |
+
target_means=[0., 0., 0., 0.],
|
80 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
81 |
+
reg_class_agnostic=True,
|
82 |
+
loss_cls=dict(
|
83 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
84 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
85 |
+
# model training and testing settings
|
86 |
+
train_cfg=dict(
|
87 |
+
rpn=dict(
|
88 |
+
assigner=dict(
|
89 |
+
type='MaxIoUAssigner',
|
90 |
+
pos_iou_thr=0.7,
|
91 |
+
neg_iou_thr=0.3,
|
92 |
+
min_pos_iou=0.3,
|
93 |
+
match_low_quality=True,
|
94 |
+
ignore_iof_thr=-1),
|
95 |
+
sampler=dict(
|
96 |
+
type='RandomSampler',
|
97 |
+
num=256,
|
98 |
+
pos_fraction=0.5,
|
99 |
+
neg_pos_ub=-1,
|
100 |
+
add_gt_as_proposals=False),
|
101 |
+
allowed_border=-1,
|
102 |
+
pos_weight=-1,
|
103 |
+
debug=False),
|
104 |
+
rpn_proposal=dict(
|
105 |
+
nms_pre=2000,
|
106 |
+
max_per_img=1000,
|
107 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
108 |
+
min_bbox_size=0),
|
109 |
+
rcnn=dict(
|
110 |
+
assigner=dict(
|
111 |
+
type='MaxIoUAssigner',
|
112 |
+
pos_iou_thr=0.5,
|
113 |
+
neg_iou_thr=0.5,
|
114 |
+
min_pos_iou=0.5,
|
115 |
+
match_low_quality=False,
|
116 |
+
ignore_iof_thr=-1),
|
117 |
+
sampler=dict(
|
118 |
+
type='RandomSampler',
|
119 |
+
num=512,
|
120 |
+
pos_fraction=0.25,
|
121 |
+
neg_pos_ub=-1,
|
122 |
+
add_gt_as_proposals=True),
|
123 |
+
pos_weight=-1,
|
124 |
+
debug=False)),
|
125 |
+
test_cfg=dict(
|
126 |
+
rpn=dict(
|
127 |
+
nms_pre=1000,
|
128 |
+
max_per_img=1000,
|
129 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
130 |
+
min_bbox_size=0),
|
131 |
+
rcnn=dict(
|
132 |
+
score_thr=0.02,
|
133 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
134 |
+
nms=dict(type='nms',
|
135 |
+
iou_threshold=0.5,
|
136 |
+
class_agnostic=True,
|
137 |
+
split_thr=100000),
|
138 |
+
max_per_img=50,
|
139 |
+
mask_thr_binary=0.5)
|
140 |
+
# soft-nms is also supported for rcnn testing
|
141 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
142 |
+
),
|
143 |
+
track_head=dict(
|
144 |
+
type='MasaTrackHead',
|
145 |
+
roi_extractor=dict(
|
146 |
+
type='SingleRoIExtractor',
|
147 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
148 |
+
out_channels=256,
|
149 |
+
featmap_strides=[4, 8, 16, 32]),
|
150 |
+
embed_head=dict(
|
151 |
+
type='QuasiDenseEmbedHead',
|
152 |
+
num_convs=4,
|
153 |
+
num_fcs=1,
|
154 |
+
embed_channels=256,
|
155 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
156 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
157 |
+
loss_track_aux=dict(
|
158 |
+
type='MarginL2Loss',
|
159 |
+
neg_pos_ub=3,
|
160 |
+
pos_margin=0,
|
161 |
+
neg_margin=0.1,
|
162 |
+
hard_mining=True,
|
163 |
+
loss_weight=1.0)),
|
164 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
165 |
+
train_cfg=dict(
|
166 |
+
assigner=dict(
|
167 |
+
type='MaxIoUAssigner',
|
168 |
+
pos_iou_thr=0.7,
|
169 |
+
neg_iou_thr=0.3,
|
170 |
+
min_pos_iou=0.5,
|
171 |
+
match_low_quality=False,
|
172 |
+
ignore_iof_thr=-1),
|
173 |
+
sampler=dict(
|
174 |
+
type='CombinedSampler',
|
175 |
+
num=512,
|
176 |
+
pos_fraction=0.5,
|
177 |
+
neg_pos_ub=3,
|
178 |
+
add_gt_as_proposals=True,
|
179 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
180 |
+
neg_sampler=dict(type='RandomSampler')))),
|
181 |
+
tracker=dict(
|
182 |
+
type='MasaBDDTracker',
|
183 |
+
init_score_thr=0.5,
|
184 |
+
obj_score_thr=0.3,
|
185 |
+
match_score_thr=0.6,
|
186 |
+
memo_tracklet_frames=10,
|
187 |
+
memo_backdrop_frames=1,
|
188 |
+
memo_momentum=0.8,
|
189 |
+
nms_conf_thr=0.5,
|
190 |
+
nms_backdrop_iou_thr=0.3,
|
191 |
+
nms_class_iou_thr=0.7,
|
192 |
+
with_cats=False,
|
193 |
+
match_metric='bisoftmax')
|
194 |
+
)
|
195 |
+
|
196 |
+
test_pipeline = [
|
197 |
+
dict(
|
198 |
+
type='TransformBroadcaster',
|
199 |
+
transforms=[
|
200 |
+
dict(type='LoadImageFromFile'),
|
201 |
+
dict(
|
202 |
+
type='Resize',
|
203 |
+
scale=(1024, 1024),
|
204 |
+
keep_ratio=True),
|
205 |
+
dict(type='LoadTrackAnnotations')
|
206 |
+
]),
|
207 |
+
dict(type='PackTrackInputs')
|
208 |
+
]
|
209 |
+
|
210 |
+
|
211 |
+
train_dataloader = None
|
212 |
+
train_cfg = None
|
213 |
+
val_cfg = dict(type='ValLoop')
|
214 |
+
test_cfg = dict(type='TestLoop')
|
215 |
+
|
216 |
+
default_hooks = dict(
|
217 |
+
logger=dict(type='LoggerHook', interval=50),
|
218 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
219 |
+
checkpoint=dict(type='CheckpointHook', interval=12),
|
220 |
+
)
|
221 |
+
|
222 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
223 |
+
visualizer = dict(
|
224 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
225 |
+
|
226 |
+
# custom hooks
|
227 |
+
custom_hooks = [
|
228 |
+
# Synchronize model buffers such as running_mean and running_var in BN
|
229 |
+
# at the end of each epoch
|
230 |
+
dict(type='SyncBuffersHook')
|
231 |
+
]
|
232 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
233 |
+
val_dataloader = dict(
|
234 |
+
dataset=dict(
|
235 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
236 |
+
pipeline=test_pipeline,
|
237 |
+
)
|
238 |
+
)
|
239 |
+
test_dataloader = val_dataloader
|
240 |
+
val_evaluator = dict(
|
241 |
+
ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
|
242 |
+
scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
|
243 |
+
outfile_prefix='results/masa_results/masa-sam-vith-bdd-mot-test',
|
244 |
+
metric=['TETA', 'HOTA', 'CLEAR']
|
245 |
+
)
|
246 |
+
test_evaluator = val_evaluator
|
configs/masa-sam/bdd_test/masa_sam_vith_bdd_mots_test.py
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../sam-vith.py',
|
3 |
+
'../../datasets/bdd/bdd_dataset.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/pretrain_weights/sam_vit_h_4b8939_mmdet.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'SamMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = True,
|
22 |
+
with_segm=True,
|
23 |
+
benchmark = 'bdd',
|
24 |
+
public_det_path = 'results/public_dets/bdd_mots_val_uninext_dets/',
|
25 |
+
data_preprocessor=dict(
|
26 |
+
type='TrackDataPreprocessor',
|
27 |
+
# Image normalization parameters
|
28 |
+
mean=[123.675, 116.28, 103.53],
|
29 |
+
std=[58.395, 57.12, 57.375],
|
30 |
+
bgr_to_rgb=True,
|
31 |
+
# Image padding parameters
|
32 |
+
pad_mask=False, # Mask padding is disabled here; enable it when instance segmentation masks need to be padded
|
33 |
+
pad_size_divisor=1024), # Padding the image to multiples of 1024
|
34 |
+
detector=detector,
|
35 |
+
masa_adapter=[
|
36 |
+
dict(
|
37 |
+
type='SimpleFPN',
|
38 |
+
in_channels=[1280, 1280, 1280, 1280],
|
39 |
+
out_channels=256,
|
40 |
+
use_residual=True,
|
41 |
+
num_outs=5),
|
42 |
+
dict(
|
43 |
+
type='DyHead',
|
44 |
+
in_channels=256,
|
45 |
+
out_channels=256,
|
46 |
+
num_blocks=3)
|
47 |
+
],
|
48 |
+
rpn_head=dict(
|
49 |
+
type='RPNHead',
|
50 |
+
in_channels=256,
|
51 |
+
feat_channels=256,
|
52 |
+
anchor_generator=dict(
|
53 |
+
type='AnchorGenerator',
|
54 |
+
scales=[8],
|
55 |
+
ratios=[0.5, 1.0, 2.0],
|
56 |
+
strides=[4, 8, 16, 32, 64]),
|
57 |
+
bbox_coder=dict(
|
58 |
+
type='DeltaXYWHBBoxCoder',
|
59 |
+
target_means=[.0, .0, .0, .0],
|
60 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
61 |
+
loss_cls=dict(
|
62 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
63 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
64 |
+
),
|
65 |
+
roi_head=dict(
|
66 |
+
type='StandardRoIHead',
|
67 |
+
bbox_roi_extractor=dict(
|
68 |
+
type='SingleRoIExtractor',
|
69 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
70 |
+
out_channels=256,
|
71 |
+
featmap_strides=[4, 8, 16, 32]),
|
72 |
+
bbox_head=dict(
|
73 |
+
type='Shared2FCBBoxHead',
|
74 |
+
in_channels=256,
|
75 |
+
fc_out_channels=1024,
|
76 |
+
roi_feat_size=7,
|
77 |
+
num_classes=1,
|
78 |
+
bbox_coder=dict(
|
79 |
+
type='DeltaXYWHBBoxCoder',
|
80 |
+
target_means=[0., 0., 0., 0.],
|
81 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
82 |
+
reg_class_agnostic=True,
|
83 |
+
loss_cls=dict(
|
84 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
85 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
86 |
+
# model training and testing settings
|
87 |
+
train_cfg=dict(
|
88 |
+
rpn=dict(
|
89 |
+
assigner=dict(
|
90 |
+
type='MaxIoUAssigner',
|
91 |
+
pos_iou_thr=0.7,
|
92 |
+
neg_iou_thr=0.3,
|
93 |
+
min_pos_iou=0.3,
|
94 |
+
match_low_quality=True,
|
95 |
+
ignore_iof_thr=-1),
|
96 |
+
sampler=dict(
|
97 |
+
type='RandomSampler',
|
98 |
+
num=256,
|
99 |
+
pos_fraction=0.5,
|
100 |
+
neg_pos_ub=-1,
|
101 |
+
add_gt_as_proposals=False),
|
102 |
+
allowed_border=-1,
|
103 |
+
pos_weight=-1,
|
104 |
+
debug=False),
|
105 |
+
rpn_proposal=dict(
|
106 |
+
nms_pre=2000,
|
107 |
+
max_per_img=1000,
|
108 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
109 |
+
min_bbox_size=0),
|
110 |
+
rcnn=dict(
|
111 |
+
assigner=dict(
|
112 |
+
type='MaxIoUAssigner',
|
113 |
+
pos_iou_thr=0.5,
|
114 |
+
neg_iou_thr=0.5,
|
115 |
+
min_pos_iou=0.5,
|
116 |
+
match_low_quality=False,
|
117 |
+
ignore_iof_thr=-1),
|
118 |
+
sampler=dict(
|
119 |
+
type='RandomSampler',
|
120 |
+
num=512,
|
121 |
+
pos_fraction=0.25,
|
122 |
+
neg_pos_ub=-1,
|
123 |
+
add_gt_as_proposals=True),
|
124 |
+
pos_weight=-1,
|
125 |
+
debug=False)),
|
126 |
+
test_cfg=dict(
|
127 |
+
rpn=dict(
|
128 |
+
nms_pre=1000,
|
129 |
+
max_per_img=1000,
|
130 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
131 |
+
min_bbox_size=0),
|
132 |
+
rcnn=dict(
|
133 |
+
score_thr=0.02,
|
134 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
135 |
+
nms=dict(type='nms',
|
136 |
+
iou_threshold=0.5,
|
137 |
+
class_agnostic=True,
|
138 |
+
split_thr=100000),
|
139 |
+
max_per_img=50,
|
140 |
+
mask_thr_binary=0.5)
|
141 |
+
# soft-nms is also supported for rcnn testing
|
142 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
143 |
+
),
|
144 |
+
track_head=dict(
|
145 |
+
type='MasaTrackHead',
|
146 |
+
roi_extractor=dict(
|
147 |
+
type='SingleRoIExtractor',
|
148 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
149 |
+
out_channels=256,
|
150 |
+
featmap_strides=[4, 8, 16, 32]),
|
151 |
+
embed_head=dict(
|
152 |
+
type='QuasiDenseEmbedHead',
|
153 |
+
num_convs=4,
|
154 |
+
num_fcs=1,
|
155 |
+
embed_channels=256,
|
156 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
157 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
158 |
+
loss_track_aux=dict(
|
159 |
+
type='MarginL2Loss',
|
160 |
+
neg_pos_ub=3,
|
161 |
+
pos_margin=0,
|
162 |
+
neg_margin=0.1,
|
163 |
+
hard_mining=True,
|
164 |
+
loss_weight=1.0)),
|
165 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
166 |
+
train_cfg=dict(
|
167 |
+
assigner=dict(
|
168 |
+
type='MaxIoUAssigner',
|
169 |
+
pos_iou_thr=0.7,
|
170 |
+
neg_iou_thr=0.3,
|
171 |
+
min_pos_iou=0.5,
|
172 |
+
match_low_quality=False,
|
173 |
+
ignore_iof_thr=-1),
|
174 |
+
sampler=dict(
|
175 |
+
type='CombinedSampler',
|
176 |
+
num=512,
|
177 |
+
pos_fraction=0.5,
|
178 |
+
neg_pos_ub=3,
|
179 |
+
add_gt_as_proposals=True,
|
180 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
181 |
+
neg_sampler=dict(type='RandomSampler')))),
|
182 |
+
tracker=dict(
|
183 |
+
type='MasaBDDTracker',
|
184 |
+
init_score_thr=0.5,
|
185 |
+
obj_score_thr=0.3,
|
186 |
+
match_score_thr=0.6,
|
187 |
+
memo_tracklet_frames=10,
|
188 |
+
memo_backdrop_frames=1,
|
189 |
+
memo_momentum=0.8,
|
190 |
+
nms_conf_thr=0.5,
|
191 |
+
nms_backdrop_iou_thr=0.3,
|
192 |
+
nms_class_iou_thr=0.7,
|
193 |
+
with_cats=False,
|
194 |
+
match_metric='bisoftmax')
|
195 |
+
)
|
196 |
+
|
197 |
+
test_pipeline = [
|
198 |
+
dict(
|
199 |
+
type='TransformBroadcaster',
|
200 |
+
transforms=[
|
201 |
+
dict(type='LoadImageFromFile'),
|
202 |
+
dict(
|
203 |
+
type='Resize',
|
204 |
+
scale=(1024, 1024),
|
205 |
+
keep_ratio=True),
|
206 |
+
dict(type='LoadTrackAnnotations')
|
207 |
+
]),
|
208 |
+
dict(type='PackTrackInputs')
|
209 |
+
]
|
210 |
+
|
211 |
+
|
212 |
+
train_dataloader = None
|
213 |
+
train_cfg = None
|
214 |
+
val_cfg = dict(type='ValLoop')
|
215 |
+
test_cfg = dict(type='TestLoop')
|
216 |
+
|
217 |
+
default_hooks = dict(
|
218 |
+
logger=dict(type='LoggerHook', interval=50),
|
219 |
+
visualization=dict(type='TrackVisualizationHook', draw=False),
|
220 |
+
checkpoint = dict(type='CheckpointHook', interval=1),
|
221 |
+
)
|
222 |
+
|
223 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
224 |
+
visualizer = dict(
|
225 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
226 |
+
|
227 |
+
val_dataloader = dict(
|
228 |
+
dataset=dict(
|
229 |
+
ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
|
230 |
+
pipeline=test_pipeline,
|
231 |
+
)
|
232 |
+
)
|
233 |
+
|
234 |
+
test_dataloader = val_dataloader
|
235 |
+
val_evaluator = dict(
|
236 |
+
outfile_prefix='results/masa_results/masa-sam-vith-bdd-mots-test',
|
237 |
+
metric=['TETA'],
|
238 |
+
with_mask=True,
|
239 |
+
)
|
240 |
+
test_evaluator = val_evaluator
|
configs/masa-sam/open_vocabulary_mot_test/masa_sam_vitb_open_vocabulary_test.py
ADDED
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../sam-vitb.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v1.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/pretrain_weights/sam_vit_b_01ec64_mmdet.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'SamMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = True,
|
22 |
+
benchmark = 'tao',
|
23 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
|
24 |
+
data_preprocessor=dict(
|
25 |
+
type='TrackDataPreprocessor',
|
26 |
+
# Image normalization parameters
|
27 |
+
mean=[123.675, 116.28, 103.53],
|
28 |
+
std=[58.395, 57.12, 57.375],
|
29 |
+
bgr_to_rgb=True,
|
30 |
+
# Image padding parameters
|
31 |
+
pad_mask=False, # Masks are not padded here; enable only when instance masks must be padded along with the image
|
32 |
+
pad_size_divisor=1024), # Pad the image so each side is a multiple of 1024
|
33 |
+
detector=detector,
|
34 |
+
masa_adapter=[
|
35 |
+
dict(
|
36 |
+
type='SimpleFPN',
|
37 |
+
in_channels=[768, 768, 768, 768],
|
38 |
+
out_channels=256,
|
39 |
+
use_residual=True,
|
40 |
+
num_outs=5),
|
41 |
+
dict(
|
42 |
+
type='DyHead',
|
43 |
+
in_channels=256,
|
44 |
+
out_channels=256,
|
45 |
+
num_blocks=3)
|
46 |
+
],
|
47 |
+
rpn_head=dict(
|
48 |
+
type='RPNHead',
|
49 |
+
in_channels=256,
|
50 |
+
feat_channels=256,
|
51 |
+
anchor_generator=dict(
|
52 |
+
type='AnchorGenerator',
|
53 |
+
scales=[8],
|
54 |
+
ratios=[0.5, 1.0, 2.0],
|
55 |
+
strides=[4, 8, 16, 32, 64]),
|
56 |
+
bbox_coder=dict(
|
57 |
+
type='DeltaXYWHBBoxCoder',
|
58 |
+
target_means=[.0, .0, .0, .0],
|
59 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
60 |
+
loss_cls=dict(
|
61 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
62 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
63 |
+
),
|
64 |
+
roi_head=dict(
|
65 |
+
type='StandardRoIHead',
|
66 |
+
bbox_roi_extractor=dict(
|
67 |
+
type='SingleRoIExtractor',
|
68 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
69 |
+
out_channels=256,
|
70 |
+
featmap_strides=[4, 8, 16, 32]),
|
71 |
+
bbox_head=dict(
|
72 |
+
type='Shared2FCBBoxHead',
|
73 |
+
in_channels=256,
|
74 |
+
fc_out_channels=1024,
|
75 |
+
roi_feat_size=7,
|
76 |
+
num_classes=1,
|
77 |
+
bbox_coder=dict(
|
78 |
+
type='DeltaXYWHBBoxCoder',
|
79 |
+
target_means=[0., 0., 0., 0.],
|
80 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
81 |
+
reg_class_agnostic=True,
|
82 |
+
loss_cls=dict(
|
83 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
84 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
85 |
+
# model training and testing settings
|
86 |
+
train_cfg=dict(
|
87 |
+
rpn=dict(
|
88 |
+
assigner=dict(
|
89 |
+
type='MaxIoUAssigner',
|
90 |
+
pos_iou_thr=0.7,
|
91 |
+
neg_iou_thr=0.3,
|
92 |
+
min_pos_iou=0.3,
|
93 |
+
match_low_quality=True,
|
94 |
+
ignore_iof_thr=-1),
|
95 |
+
sampler=dict(
|
96 |
+
type='RandomSampler',
|
97 |
+
num=256,
|
98 |
+
pos_fraction=0.5,
|
99 |
+
neg_pos_ub=-1,
|
100 |
+
add_gt_as_proposals=False),
|
101 |
+
allowed_border=-1,
|
102 |
+
pos_weight=-1,
|
103 |
+
debug=False),
|
104 |
+
rpn_proposal=dict(
|
105 |
+
nms_pre=2000,
|
106 |
+
max_per_img=1000,
|
107 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
108 |
+
min_bbox_size=0),
|
109 |
+
rcnn=dict(
|
110 |
+
assigner=dict(
|
111 |
+
type='MaxIoUAssigner',
|
112 |
+
pos_iou_thr=0.5,
|
113 |
+
neg_iou_thr=0.5,
|
114 |
+
min_pos_iou=0.5,
|
115 |
+
match_low_quality=False,
|
116 |
+
ignore_iof_thr=-1),
|
117 |
+
sampler=dict(
|
118 |
+
type='RandomSampler',
|
119 |
+
num=512,
|
120 |
+
pos_fraction=0.25,
|
121 |
+
neg_pos_ub=-1,
|
122 |
+
add_gt_as_proposals=True),
|
123 |
+
pos_weight=-1,
|
124 |
+
debug=False)),
|
125 |
+
test_cfg=dict(
|
126 |
+
rpn=dict(
|
127 |
+
nms_pre=1000,
|
128 |
+
max_per_img=1000,
|
129 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
130 |
+
min_bbox_size=0),
|
131 |
+
rcnn=dict(
|
132 |
+
score_thr=0.02,
|
133 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
134 |
+
nms=dict(type='nms',
|
135 |
+
iou_threshold=0.5,
|
136 |
+
class_agnostic=True,
|
137 |
+
split_thr=100000),
|
138 |
+
max_per_img=50,
|
139 |
+
mask_thr_binary=0.5)
|
140 |
+
# soft-nms is also supported for rcnn testing
|
141 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
142 |
+
),
|
143 |
+
track_head=dict(
|
144 |
+
type='MasaTrackHead',
|
145 |
+
roi_extractor=dict(
|
146 |
+
type='SingleRoIExtractor',
|
147 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
148 |
+
out_channels=256,
|
149 |
+
featmap_strides=[4, 8, 16, 32]),
|
150 |
+
embed_head=dict(
|
151 |
+
type='QuasiDenseEmbedHead',
|
152 |
+
num_convs=4,
|
153 |
+
num_fcs=1,
|
154 |
+
embed_channels=256,
|
155 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
156 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
157 |
+
loss_track_aux=dict(
|
158 |
+
type='MarginL2Loss',
|
159 |
+
neg_pos_ub=3,
|
160 |
+
pos_margin=0,
|
161 |
+
neg_margin=0.1,
|
162 |
+
hard_mining=True,
|
163 |
+
loss_weight=1.0)),
|
164 |
+
train_cfg=dict(
|
165 |
+
assigner=dict(
|
166 |
+
type='MaxIoUAssigner',
|
167 |
+
pos_iou_thr=0.7,
|
168 |
+
neg_iou_thr=0.3,
|
169 |
+
min_pos_iou=0.5,
|
170 |
+
match_low_quality=False,
|
171 |
+
ignore_iof_thr=-1),
|
172 |
+
sampler=dict(
|
173 |
+
type='CombinedSampler',
|
174 |
+
num=512,
|
175 |
+
pos_fraction=0.5,
|
176 |
+
neg_pos_ub=3,
|
177 |
+
add_gt_as_proposals=True,
|
178 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
179 |
+
neg_sampler=dict(type='RandomSampler')))),
|
180 |
+
tracker=dict(
|
181 |
+
type='MasaTaoTracker',
|
182 |
+
init_score_thr=0.0001,
|
183 |
+
obj_score_thr=0.0001,
|
184 |
+
match_score_thr=0.5,
|
185 |
+
memo_tracklet_frames=10,
|
186 |
+
memo_momentum=0.8,
|
187 |
+
with_cats=False,
|
188 |
+
max_distance=-1,
|
189 |
+
fps=1,
|
190 |
+
)
|
191 |
+
)
|
192 |
+
|
193 |
+
test_pipeline = [
|
194 |
+
dict(
|
195 |
+
type='TransformBroadcaster',
|
196 |
+
transforms=[
|
197 |
+
dict(type='LoadImageFromFile'),
|
198 |
+
dict(
|
199 |
+
type='Resize',
|
200 |
+
scale=(1024, 1024),
|
201 |
+
keep_ratio=True),
|
202 |
+
dict(type='LoadTrackAnnotations')
|
203 |
+
]),
|
204 |
+
dict(type='PackTrackInputs')
|
205 |
+
]
|
206 |
+
|
207 |
+
# runtime settings
|
208 |
+
train_dataloader = None
|
209 |
+
train_cfg = None
|
210 |
+
val_cfg = dict(type='ValLoop')
|
211 |
+
test_cfg = dict(type='TestLoop')
|
212 |
+
|
213 |
+
default_hooks = dict(
|
214 |
+
logger=dict(type='LoggerHook', interval=50),
|
215 |
+
visualization=dict(type='TrackVisualizationHook', draw=False))
|
216 |
+
|
217 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
218 |
+
visualizer = dict(
|
219 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
220 |
+
|
221 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
222 |
+
val_dataloader = dict(
|
223 |
+
dataset=dict(
|
224 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
225 |
+
pipeline=test_pipeline,
|
226 |
+
)
|
227 |
+
)
|
228 |
+
test_dataloader = val_dataloader
|
229 |
+
test_evaluator = dict(
|
230 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
231 |
+
outfile_prefix='results/masa_results/masa-sam-b-release-ovmot-test',
|
232 |
+
open_vocabulary=True,
|
233 |
+
)
|
configs/masa-sam/open_vocabulary_mot_test/masa_sam_vith_open_vocabulary_test.py
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../sam-vith.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v1.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/pretrain_weights/sam_vit_h_4b8939_mmdet.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'SamMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = True,
|
22 |
+
benchmark = 'tao',
|
23 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
|
24 |
+
data_preprocessor=dict(
|
25 |
+
type='TrackDataPreprocessor',
|
26 |
+
# Image normalization parameters
|
27 |
+
mean=[123.675, 116.28, 103.53],
|
28 |
+
std=[58.395, 57.12, 57.375],
|
29 |
+
bgr_to_rgb=True,
|
30 |
+
# Image padding parameters
|
31 |
+
pad_mask=False, # Masks are not padded here; enable only when instance masks must be padded along with the image
|
32 |
+
pad_size_divisor=1024), # Pad the image so each side is a multiple of 1024
|
33 |
+
detector=detector,
|
34 |
+
masa_adapter=[
|
35 |
+
dict(
|
36 |
+
type='SimpleFPN',
|
37 |
+
in_channels=[1280, 1280, 1280, 1280],
|
38 |
+
out_channels=256,
|
39 |
+
use_residual=True,
|
40 |
+
num_outs=5),
|
41 |
+
dict(
|
42 |
+
type='DyHead',
|
43 |
+
in_channels=256,
|
44 |
+
out_channels=256,
|
45 |
+
num_blocks=3)
|
46 |
+
],
|
47 |
+
rpn_head=dict(
|
48 |
+
type='RPNHead',
|
49 |
+
in_channels=256,
|
50 |
+
feat_channels=256,
|
51 |
+
anchor_generator=dict(
|
52 |
+
type='AnchorGenerator',
|
53 |
+
scales=[8],
|
54 |
+
ratios=[0.5, 1.0, 2.0],
|
55 |
+
strides=[4, 8, 16, 32, 64]),
|
56 |
+
bbox_coder=dict(
|
57 |
+
type='DeltaXYWHBBoxCoder',
|
58 |
+
target_means=[.0, .0, .0, .0],
|
59 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
60 |
+
loss_cls=dict(
|
61 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
62 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
63 |
+
),
|
64 |
+
roi_head=dict(
|
65 |
+
type='StandardRoIHead',
|
66 |
+
bbox_roi_extractor=dict(
|
67 |
+
type='SingleRoIExtractor',
|
68 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
69 |
+
out_channels=256,
|
70 |
+
featmap_strides=[4, 8, 16, 32]),
|
71 |
+
bbox_head=dict(
|
72 |
+
type='Shared2FCBBoxHead',
|
73 |
+
in_channels=256,
|
74 |
+
fc_out_channels=1024,
|
75 |
+
roi_feat_size=7,
|
76 |
+
num_classes=1,
|
77 |
+
bbox_coder=dict(
|
78 |
+
type='DeltaXYWHBBoxCoder',
|
79 |
+
target_means=[0., 0., 0., 0.],
|
80 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
81 |
+
reg_class_agnostic=True,
|
82 |
+
loss_cls=dict(
|
83 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
84 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
85 |
+
# model training and testing settings
|
86 |
+
train_cfg=dict(
|
87 |
+
rpn=dict(
|
88 |
+
assigner=dict(
|
89 |
+
type='MaxIoUAssigner',
|
90 |
+
pos_iou_thr=0.7,
|
91 |
+
neg_iou_thr=0.3,
|
92 |
+
min_pos_iou=0.3,
|
93 |
+
match_low_quality=True,
|
94 |
+
ignore_iof_thr=-1),
|
95 |
+
sampler=dict(
|
96 |
+
type='RandomSampler',
|
97 |
+
num=256,
|
98 |
+
pos_fraction=0.5,
|
99 |
+
neg_pos_ub=-1,
|
100 |
+
add_gt_as_proposals=False),
|
101 |
+
allowed_border=-1,
|
102 |
+
pos_weight=-1,
|
103 |
+
debug=False),
|
104 |
+
rpn_proposal=dict(
|
105 |
+
nms_pre=2000,
|
106 |
+
max_per_img=1000,
|
107 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
108 |
+
min_bbox_size=0),
|
109 |
+
rcnn=dict(
|
110 |
+
assigner=dict(
|
111 |
+
type='MaxIoUAssigner',
|
112 |
+
pos_iou_thr=0.5,
|
113 |
+
neg_iou_thr=0.5,
|
114 |
+
min_pos_iou=0.5,
|
115 |
+
match_low_quality=False,
|
116 |
+
ignore_iof_thr=-1),
|
117 |
+
sampler=dict(
|
118 |
+
type='RandomSampler',
|
119 |
+
num=512,
|
120 |
+
pos_fraction=0.25,
|
121 |
+
neg_pos_ub=-1,
|
122 |
+
add_gt_as_proposals=True),
|
123 |
+
pos_weight=-1,
|
124 |
+
debug=False)),
|
125 |
+
test_cfg=dict(
|
126 |
+
rpn=dict(
|
127 |
+
nms_pre=1000,
|
128 |
+
max_per_img=1000,
|
129 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
130 |
+
min_bbox_size=0),
|
131 |
+
rcnn=dict(
|
132 |
+
score_thr=0.02,
|
133 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
134 |
+
nms=dict(type='nms',
|
135 |
+
iou_threshold=0.5,
|
136 |
+
class_agnostic=True,
|
137 |
+
split_thr=100000),
|
138 |
+
max_per_img=50,
|
139 |
+
mask_thr_binary=0.5)
|
140 |
+
# soft-nms is also supported for rcnn testing
|
141 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
142 |
+
),
|
143 |
+
track_head=dict(
|
144 |
+
type='MasaTrackHead',
|
145 |
+
roi_extractor=dict(
|
146 |
+
type='SingleRoIExtractor',
|
147 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
148 |
+
out_channels=256,
|
149 |
+
featmap_strides=[4, 8, 16, 32]),
|
150 |
+
embed_head=dict(
|
151 |
+
type='QuasiDenseEmbedHead',
|
152 |
+
num_convs=4,
|
153 |
+
num_fcs=1,
|
154 |
+
embed_channels=256,
|
155 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
156 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
157 |
+
loss_track_aux=dict(
|
158 |
+
type='MarginL2Loss',
|
159 |
+
neg_pos_ub=3,
|
160 |
+
pos_margin=0,
|
161 |
+
neg_margin=0.1,
|
162 |
+
hard_mining=True,
|
163 |
+
loss_weight=1.0)),
|
164 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
165 |
+
train_cfg=dict(
|
166 |
+
assigner=dict(
|
167 |
+
type='MaxIoUAssigner',
|
168 |
+
pos_iou_thr=0.7,
|
169 |
+
neg_iou_thr=0.5,
|
170 |
+
min_pos_iou=0.5,
|
171 |
+
match_low_quality=False,
|
172 |
+
ignore_iof_thr=-1),
|
173 |
+
sampler=dict(
|
174 |
+
type='CombinedSampler',
|
175 |
+
num=512,
|
176 |
+
pos_fraction=0.8,
|
177 |
+
neg_pos_ub=3,
|
178 |
+
add_gt_as_proposals=True,
|
179 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
180 |
+
neg_sampler=dict(type='RandomSampler')))),
|
181 |
+
tracker=dict(
|
182 |
+
type='MasaTaoTracker',
|
183 |
+
init_score_thr=0.0001,
|
184 |
+
obj_score_thr=0.0001,
|
185 |
+
match_score_thr=0.5,
|
186 |
+
memo_tracklet_frames=10,
|
187 |
+
memo_momentum=0.8,
|
188 |
+
with_cats=False,
|
189 |
+
max_distance=-1,
|
190 |
+
fps=1,
|
191 |
+
)
|
192 |
+
)
|
193 |
+
|
194 |
+
test_pipeline = [
|
195 |
+
dict(
|
196 |
+
type='TransformBroadcaster',
|
197 |
+
transforms=[
|
198 |
+
dict(type='LoadImageFromFile'),
|
199 |
+
dict(
|
200 |
+
type='Resize',
|
201 |
+
scale=(1024, 1024),
|
202 |
+
keep_ratio=True),
|
203 |
+
dict(type='LoadTrackAnnotations')
|
204 |
+
]),
|
205 |
+
dict(type='PackTrackInputs')
|
206 |
+
]
|
207 |
+
|
208 |
+
|
209 |
+
train_dataloader = None
|
210 |
+
train_cfg = None
|
211 |
+
val_cfg = dict(type='ValLoop')
|
212 |
+
test_cfg = dict(type='TestLoop')
|
213 |
+
|
214 |
+
default_hooks = dict(
|
215 |
+
logger=dict(type='LoggerHook', interval=50),
|
216 |
+
visualization=dict(type='TrackVisualizationHook', draw=False))
|
217 |
+
|
218 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
219 |
+
visualizer = dict(
|
220 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
221 |
+
|
222 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
223 |
+
val_dataloader = dict(
|
224 |
+
dataset=dict(
|
225 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
226 |
+
pipeline=test_pipeline,
|
227 |
+
)
|
228 |
+
)
|
229 |
+
test_dataloader = val_dataloader
|
230 |
+
test_evaluator = dict(
|
231 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
232 |
+
outfile_prefix='results/masa_results/masa-sam-h-release-ovmot-test',
|
233 |
+
open_vocabulary=True,
|
234 |
+
)
|
configs/masa-sam/sam-vitb.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
prompt_embed_dim=256
|
2 |
+
model = dict(
|
3 |
+
type='SamMasa',
|
4 |
+
backbone=dict(
|
5 |
+
type='ImageEncoderViT',
|
6 |
+
depth=12,
|
7 |
+
embed_dim=768,
|
8 |
+
img_size=1024,
|
9 |
+
mlp_ratio=4,
|
10 |
+
num_heads=12,
|
11 |
+
patch_size=16,
|
12 |
+
qkv_bias=True,
|
13 |
+
use_rel_pos=True,
|
14 |
+
global_attn_indexes=[2, 5, 8, 11],
|
15 |
+
window_size=14,
|
16 |
+
out_chans=prompt_embed_dim,
|
17 |
+
out_indices=[2, 5, 8, 11]),
|
18 |
+
mask_decoder=dict(
|
19 |
+
type='MaskDecoder',
|
20 |
+
num_multimask_outputs=3,
|
21 |
+
transformer_dim=prompt_embed_dim,
|
22 |
+
iou_head_depth=3,
|
23 |
+
iou_head_hidden_dim=256),
|
24 |
+
prompt_encoder=dict(
|
25 |
+
type='PromptEncoder',
|
26 |
+
embed_dim=prompt_embed_dim,
|
27 |
+
image_embedding_size=(64, 64),
|
28 |
+
input_image_size=(1024, 1024),
|
29 |
+
mask_in_chans=16),
|
30 |
+
)
|
configs/masa-sam/sam-vith.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
prompt_embed_dim=256
|
2 |
+
model = dict(
|
3 |
+
type='SamMasa',
|
4 |
+
backbone=dict(
|
5 |
+
type='ImageEncoderViT',
|
6 |
+
depth=32,
|
7 |
+
embed_dim=1280,
|
8 |
+
img_size=1024,
|
9 |
+
mlp_ratio=4,
|
10 |
+
num_heads=16,
|
11 |
+
patch_size=16,
|
12 |
+
qkv_bias=True,
|
13 |
+
use_rel_pos=True,
|
14 |
+
global_attn_indexes=[7, 15, 23, 31],
|
15 |
+
window_size=14,
|
16 |
+
out_chans=prompt_embed_dim,
|
17 |
+
out_indices=[7, 15, 23, 31]),
|
18 |
+
mask_decoder=dict(
|
19 |
+
type='MaskDecoder',
|
20 |
+
num_multimask_outputs=3,
|
21 |
+
transformer_dim=prompt_embed_dim,
|
22 |
+
iou_head_depth=3,
|
23 |
+
iou_head_hidden_dim=256),
|
24 |
+
prompt_encoder=dict(
|
25 |
+
type='PromptEncoder',
|
26 |
+
embed_dim=prompt_embed_dim,
|
27 |
+
image_embedding_size=(64, 64),
|
28 |
+
input_image_size=(1024, 1024),
|
29 |
+
mask_in_chans=16),
|
30 |
+
)
|
configs/masa-sam/tao_teta_test/masa_sam_vitb_tao_test_detic_dets.py
ADDED
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../sam-vitb.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v1.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/pretrain_weights/sam_vit_b_01ec64_mmdet.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'SamMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = True,
|
22 |
+
benchmark = 'tao',
|
23 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
|
24 |
+
data_preprocessor=dict(
|
25 |
+
type='TrackDataPreprocessor',
|
26 |
+
# Image normalization parameters
|
27 |
+
mean=[123.675, 116.28, 103.53],
|
28 |
+
std=[58.395, 57.12, 57.375],
|
29 |
+
bgr_to_rgb=True,
|
30 |
+
# Image padding parameters
|
31 |
+
pad_mask=False, # Masks are not padded here; enable only when instance masks must be padded along with the image
|
32 |
+
pad_size_divisor=1024), # Pad the image so each side is a multiple of 1024
|
33 |
+
detector=detector,
|
34 |
+
masa_adapter=[
|
35 |
+
dict(
|
36 |
+
type='SimpleFPN',
|
37 |
+
in_channels=[768, 768, 768, 768],
|
38 |
+
out_channels=256,
|
39 |
+
use_residual=True,
|
40 |
+
num_outs=5),
|
41 |
+
dict(
|
42 |
+
type='DyHead',
|
43 |
+
in_channels=256,
|
44 |
+
out_channels=256,
|
45 |
+
num_blocks=3)
|
46 |
+
],
|
47 |
+
rpn_head=dict(
|
48 |
+
type='RPNHead',
|
49 |
+
in_channels=256,
|
50 |
+
feat_channels=256,
|
51 |
+
anchor_generator=dict(
|
52 |
+
type='AnchorGenerator',
|
53 |
+
scales=[8],
|
54 |
+
ratios=[0.5, 1.0, 2.0],
|
55 |
+
strides=[4, 8, 16, 32, 64]),
|
56 |
+
bbox_coder=dict(
|
57 |
+
type='DeltaXYWHBBoxCoder',
|
58 |
+
target_means=[.0, .0, .0, .0],
|
59 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
60 |
+
loss_cls=dict(
|
61 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
62 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
63 |
+
),
|
64 |
+
roi_head=dict(
|
65 |
+
type='StandardRoIHead',
|
66 |
+
bbox_roi_extractor=dict(
|
67 |
+
type='SingleRoIExtractor',
|
68 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
69 |
+
out_channels=256,
|
70 |
+
featmap_strides=[4, 8, 16, 32]),
|
71 |
+
bbox_head=dict(
|
72 |
+
type='Shared2FCBBoxHead',
|
73 |
+
in_channels=256,
|
74 |
+
fc_out_channels=1024,
|
75 |
+
roi_feat_size=7,
|
76 |
+
num_classes=1,
|
77 |
+
bbox_coder=dict(
|
78 |
+
type='DeltaXYWHBBoxCoder',
|
79 |
+
target_means=[0., 0., 0., 0.],
|
80 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
81 |
+
reg_class_agnostic=True,
|
82 |
+
loss_cls=dict(
|
83 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
84 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
85 |
+
# model training and testing settings
|
86 |
+
train_cfg=dict(
|
87 |
+
rpn=dict(
|
88 |
+
assigner=dict(
|
89 |
+
type='MaxIoUAssigner',
|
90 |
+
pos_iou_thr=0.7,
|
91 |
+
neg_iou_thr=0.3,
|
92 |
+
min_pos_iou=0.3,
|
93 |
+
match_low_quality=True,
|
94 |
+
ignore_iof_thr=-1),
|
95 |
+
sampler=dict(
|
96 |
+
type='RandomSampler',
|
97 |
+
num=256,
|
98 |
+
pos_fraction=0.5,
|
99 |
+
neg_pos_ub=-1,
|
100 |
+
add_gt_as_proposals=False),
|
101 |
+
allowed_border=-1,
|
102 |
+
pos_weight=-1,
|
103 |
+
debug=False),
|
104 |
+
rpn_proposal=dict(
|
105 |
+
nms_pre=2000,
|
106 |
+
max_per_img=1000,
|
107 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
108 |
+
min_bbox_size=0),
|
109 |
+
rcnn=dict(
|
110 |
+
assigner=dict(
|
111 |
+
type='MaxIoUAssigner',
|
112 |
+
pos_iou_thr=0.5,
|
113 |
+
neg_iou_thr=0.5,
|
114 |
+
min_pos_iou=0.5,
|
115 |
+
match_low_quality=False,
|
116 |
+
ignore_iof_thr=-1),
|
117 |
+
sampler=dict(
|
118 |
+
type='RandomSampler',
|
119 |
+
num=512,
|
120 |
+
pos_fraction=0.25,
|
121 |
+
neg_pos_ub=-1,
|
122 |
+
add_gt_as_proposals=True),
|
123 |
+
pos_weight=-1,
|
124 |
+
debug=False)),
|
125 |
+
test_cfg=dict(
|
126 |
+
rpn=dict(
|
127 |
+
nms_pre=1000,
|
128 |
+
max_per_img=1000,
|
129 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
130 |
+
min_bbox_size=0),
|
131 |
+
rcnn=dict(
|
132 |
+
score_thr=0.02,
|
133 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
134 |
+
nms=dict(type='nms',
|
135 |
+
iou_threshold=0.5,
|
136 |
+
class_agnostic=True,
|
137 |
+
split_thr=100000),
|
138 |
+
max_per_img=50,
|
139 |
+
mask_thr_binary=0.5)
|
140 |
+
# soft-nms is also supported for rcnn testing
|
141 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
142 |
+
),
|
143 |
+
track_head=dict(
|
144 |
+
type='MasaTrackHead',
|
145 |
+
roi_extractor=dict(
|
146 |
+
type='SingleRoIExtractor',
|
147 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
148 |
+
out_channels=256,
|
149 |
+
featmap_strides=[4, 8, 16, 32]),
|
150 |
+
embed_head=dict(
|
151 |
+
type='QuasiDenseEmbedHead',
|
152 |
+
num_convs=4,
|
153 |
+
num_fcs=1,
|
154 |
+
embed_channels=256,
|
155 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
156 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
157 |
+
loss_track_aux=dict(
|
158 |
+
type='MarginL2Loss',
|
159 |
+
neg_pos_ub=3,
|
160 |
+
pos_margin=0,
|
161 |
+
neg_margin=0.1,
|
162 |
+
hard_mining=True,
|
163 |
+
loss_weight=1.0)),
|
164 |
+
train_cfg=dict(
|
165 |
+
assigner=dict(
|
166 |
+
type='MaxIoUAssigner',
|
167 |
+
pos_iou_thr=0.7,
|
168 |
+
neg_iou_thr=0.3,
|
169 |
+
min_pos_iou=0.5,
|
170 |
+
match_low_quality=False,
|
171 |
+
ignore_iof_thr=-1),
|
172 |
+
sampler=dict(
|
173 |
+
type='CombinedSampler',
|
174 |
+
num=512,
|
175 |
+
pos_fraction=0.5,
|
176 |
+
neg_pos_ub=3,
|
177 |
+
add_gt_as_proposals=True,
|
178 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
179 |
+
neg_sampler=dict(type='RandomSampler')))),
|
180 |
+
tracker=dict(
|
181 |
+
type='MasaTaoTracker',
|
182 |
+
init_score_thr=0.0001,
|
183 |
+
obj_score_thr=0.0001,
|
184 |
+
match_score_thr=0.5,
|
185 |
+
memo_tracklet_frames=10,
|
186 |
+
memo_momentum=0.8,
|
187 |
+
with_cats=False,
|
188 |
+
max_distance=-1,
|
189 |
+
fps=1,
|
190 |
+
)
|
191 |
+
)
|
192 |
+
|
193 |
+
test_pipeline = [
|
194 |
+
dict(
|
195 |
+
type='TransformBroadcaster',
|
196 |
+
transforms=[
|
197 |
+
dict(type='LoadImageFromFile'),
|
198 |
+
dict(
|
199 |
+
type='Resize',
|
200 |
+
scale=(1024, 1024),
|
201 |
+
keep_ratio=True),
|
202 |
+
dict(type='LoadTrackAnnotations')
|
203 |
+
]),
|
204 |
+
dict(type='PackTrackInputs')
|
205 |
+
]
|
206 |
+
|
207 |
+
# runtime settings
|
208 |
+
train_dataloader = None
|
209 |
+
train_cfg = None
|
210 |
+
val_cfg = dict(type='ValLoop')
|
211 |
+
test_cfg = dict(type='TestLoop')
|
212 |
+
|
213 |
+
default_hooks = dict(
|
214 |
+
logger=dict(type='LoggerHook', interval=50),
|
215 |
+
visualization=dict(type='TrackVisualizationHook', draw=False))
|
216 |
+
|
217 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
218 |
+
visualizer = dict(
|
219 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
220 |
+
|
221 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
222 |
+
val_dataloader = dict(
|
223 |
+
dataset=dict(
|
224 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
225 |
+
pipeline=test_pipeline,
|
226 |
+
)
|
227 |
+
)
|
228 |
+
test_dataloader = val_dataloader
|
229 |
+
test_evaluator = dict(
|
230 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
231 |
+
outfile_prefix='results/masa_results/masa-sam-vitb-tao-test-detic-dets',
|
232 |
+
)
|
configs/masa-sam/tao_teta_test/masa_sam_vitb_tao_test_teter_swinT_dets.py
ADDED
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../sam-vitb.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v05.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/pretrain_weights/sam_vit_b_01ec64_mmdet.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'SamMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = True,
|
22 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/',
|
23 |
+
data_preprocessor=dict(
|
24 |
+
type='TrackDataPreprocessor',
|
25 |
+
# Image normalization parameters
|
26 |
+
mean=[123.675, 116.28, 103.53],
|
27 |
+
std=[58.395, 57.12, 57.375],
|
28 |
+
bgr_to_rgb=True,
|
29 |
+
# Image padding parameters
|
30 |
+
pad_mask=False, # Masks are not padded here (in instance segmentation, the mask needs to be padded)
|
31 |
+
pad_size_divisor=1024), # Padding the image to multiples of 1024
|
32 |
+
detector=detector,
|
33 |
+
masa_adapter=[
|
34 |
+
dict(
|
35 |
+
type='SimpleFPN',
|
36 |
+
in_channels=[768, 768, 768, 768],
|
37 |
+
out_channels=256,
|
38 |
+
use_residual=True,
|
39 |
+
num_outs=5),
|
40 |
+
dict(
|
41 |
+
type='DyHead',
|
42 |
+
in_channels=256,
|
43 |
+
out_channels=256,
|
44 |
+
num_blocks=3)
|
45 |
+
],
|
46 |
+
rpn_head=dict(
|
47 |
+
type='RPNHead',
|
48 |
+
in_channels=256,
|
49 |
+
feat_channels=256,
|
50 |
+
anchor_generator=dict(
|
51 |
+
type='AnchorGenerator',
|
52 |
+
scales=[8],
|
53 |
+
ratios=[0.5, 1.0, 2.0],
|
54 |
+
strides=[4, 8, 16, 32, 64]),
|
55 |
+
bbox_coder=dict(
|
56 |
+
type='DeltaXYWHBBoxCoder',
|
57 |
+
target_means=[.0, .0, .0, .0],
|
58 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
59 |
+
loss_cls=dict(
|
60 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
61 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
62 |
+
),
|
63 |
+
roi_head=dict(
|
64 |
+
type='StandardRoIHead',
|
65 |
+
bbox_roi_extractor=dict(
|
66 |
+
type='SingleRoIExtractor',
|
67 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
68 |
+
out_channels=256,
|
69 |
+
featmap_strides=[4, 8, 16, 32]),
|
70 |
+
bbox_head=dict(
|
71 |
+
type='Shared2FCBBoxHead',
|
72 |
+
in_channels=256,
|
73 |
+
fc_out_channels=1024,
|
74 |
+
roi_feat_size=7,
|
75 |
+
num_classes=1,
|
76 |
+
bbox_coder=dict(
|
77 |
+
type='DeltaXYWHBBoxCoder',
|
78 |
+
target_means=[0., 0., 0., 0.],
|
79 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
80 |
+
reg_class_agnostic=True,
|
81 |
+
loss_cls=dict(
|
82 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
83 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
84 |
+
# model training and testing settings
|
85 |
+
train_cfg=dict(
|
86 |
+
rpn=dict(
|
87 |
+
assigner=dict(
|
88 |
+
type='MaxIoUAssigner',
|
89 |
+
pos_iou_thr=0.7,
|
90 |
+
neg_iou_thr=0.3,
|
91 |
+
min_pos_iou=0.3,
|
92 |
+
match_low_quality=True,
|
93 |
+
ignore_iof_thr=-1),
|
94 |
+
sampler=dict(
|
95 |
+
type='RandomSampler',
|
96 |
+
num=256,
|
97 |
+
pos_fraction=0.5,
|
98 |
+
neg_pos_ub=-1,
|
99 |
+
add_gt_as_proposals=False),
|
100 |
+
allowed_border=-1,
|
101 |
+
pos_weight=-1,
|
102 |
+
debug=False),
|
103 |
+
rpn_proposal=dict(
|
104 |
+
nms_pre=2000,
|
105 |
+
max_per_img=1000,
|
106 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
107 |
+
min_bbox_size=0),
|
108 |
+
rcnn=dict(
|
109 |
+
assigner=dict(
|
110 |
+
type='MaxIoUAssigner',
|
111 |
+
pos_iou_thr=0.5,
|
112 |
+
neg_iou_thr=0.5,
|
113 |
+
min_pos_iou=0.5,
|
114 |
+
match_low_quality=False,
|
115 |
+
ignore_iof_thr=-1),
|
116 |
+
sampler=dict(
|
117 |
+
type='RandomSampler',
|
118 |
+
num=512,
|
119 |
+
pos_fraction=0.25,
|
120 |
+
neg_pos_ub=-1,
|
121 |
+
add_gt_as_proposals=True),
|
122 |
+
pos_weight=-1,
|
123 |
+
debug=False)),
|
124 |
+
test_cfg=dict(
|
125 |
+
rpn=dict(
|
126 |
+
nms_pre=1000,
|
127 |
+
max_per_img=1000,
|
128 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
129 |
+
min_bbox_size=0),
|
130 |
+
rcnn=dict(
|
131 |
+
score_thr=0.02,
|
132 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
133 |
+
nms=dict(type='nms',
|
134 |
+
iou_threshold=0.5,
|
135 |
+
class_agnostic=True,
|
136 |
+
split_thr=100000),
|
137 |
+
max_per_img=50,
|
138 |
+
mask_thr_binary=0.5)
|
139 |
+
# soft-nms is also supported for rcnn testing
|
140 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
141 |
+
),
|
142 |
+
track_head=dict(
|
143 |
+
type='MasaTrackHead',
|
144 |
+
roi_extractor=dict(
|
145 |
+
type='SingleRoIExtractor',
|
146 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
147 |
+
out_channels=256,
|
148 |
+
featmap_strides=[4, 8, 16, 32]),
|
149 |
+
embed_head=dict(
|
150 |
+
type='QuasiDenseEmbedHead',
|
151 |
+
num_convs=4,
|
152 |
+
num_fcs=1,
|
153 |
+
embed_channels=256,
|
154 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
155 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
156 |
+
loss_track_aux=dict(
|
157 |
+
type='MarginL2Loss',
|
158 |
+
neg_pos_ub=3,
|
159 |
+
pos_margin=0,
|
160 |
+
neg_margin=0.1,
|
161 |
+
hard_mining=True,
|
162 |
+
loss_weight=1.0)),
|
163 |
+
train_cfg=dict(
|
164 |
+
assigner=dict(
|
165 |
+
type='MaxIoUAssigner',
|
166 |
+
pos_iou_thr=0.7,
|
167 |
+
neg_iou_thr=0.3,
|
168 |
+
min_pos_iou=0.5,
|
169 |
+
match_low_quality=False,
|
170 |
+
ignore_iof_thr=-1),
|
171 |
+
sampler=dict(
|
172 |
+
type='CombinedSampler',
|
173 |
+
num=512,
|
174 |
+
pos_fraction=0.5,
|
175 |
+
neg_pos_ub=3,
|
176 |
+
add_gt_as_proposals=True,
|
177 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
178 |
+
neg_sampler=dict(type='RandomSampler')))),
|
179 |
+
tracker=dict(
|
180 |
+
type='MasaTaoTracker',
|
181 |
+
init_score_thr=0.0001,
|
182 |
+
obj_score_thr=0.0001,
|
183 |
+
match_score_thr=0.5,
|
184 |
+
memo_tracklet_frames=10,
|
185 |
+
memo_momentum=0.8,
|
186 |
+
with_cats=False,
|
187 |
+
max_distance=-1,
|
188 |
+
fps=1,
|
189 |
+
)
|
190 |
+
)
|
191 |
+
|
192 |
+
test_pipeline = [
|
193 |
+
dict(
|
194 |
+
type='TransformBroadcaster',
|
195 |
+
transforms=[
|
196 |
+
dict(type='LoadImageFromFile'),
|
197 |
+
dict(
|
198 |
+
type='Resize',
|
199 |
+
scale=(1024, 1024),
|
200 |
+
keep_ratio=True),
|
201 |
+
dict(type='LoadTrackAnnotations')
|
202 |
+
]),
|
203 |
+
dict(type='PackTrackInputs')
|
204 |
+
]
|
205 |
+
|
206 |
+
|
207 |
+
train_dataloader = None
|
208 |
+
train_cfg = None
|
209 |
+
val_cfg = dict(type='ValLoop')
|
210 |
+
test_cfg = dict(type='TestLoop')
|
211 |
+
|
212 |
+
default_hooks = dict(
|
213 |
+
logger=dict(type='LoggerHook', interval=50),
|
214 |
+
visualization=dict(type='TrackVisualizationHook', draw=False))
|
215 |
+
|
216 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
217 |
+
visualizer = dict(
|
218 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
219 |
+
|
220 |
+
# custom hooks
|
221 |
+
custom_hooks = [
|
222 |
+
# Synchronize model buffers such as running_mean and running_var in BN
|
223 |
+
# at the end of each epoch
|
224 |
+
dict(type='SyncBuffersHook')
|
225 |
+
]
|
226 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
227 |
+
val_dataloader = dict(
|
228 |
+
dataset=dict(
|
229 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
|
230 |
+
pipeline=test_pipeline,
|
231 |
+
)
|
232 |
+
)
|
233 |
+
test_dataloader = val_dataloader
|
234 |
+
val_evaluator = dict(
|
235 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
|
236 |
+
outfile_prefix='results/masa_results/masa-sam-vitb-tao-test-teter-swinT-dets',
|
237 |
+
)
|
238 |
+
test_evaluator = val_evaluator
|
configs/masa-sam/tao_teta_test/masa_sam_vith_tao_test_detic_dets.py
ADDED
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../sam-vith.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v1.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/pretrain_weights/sam_vit_h_4b8939_mmdet.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'SamMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = True,
|
22 |
+
benchmark = 'tao',
|
23 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
|
24 |
+
data_preprocessor=dict(
|
25 |
+
type='TrackDataPreprocessor',
|
26 |
+
# Image normalization parameters
|
27 |
+
mean=[123.675, 116.28, 103.53],
|
28 |
+
std=[58.395, 57.12, 57.375],
|
29 |
+
bgr_to_rgb=True,
|
30 |
+
# Image padding parameters
|
31 |
+
pad_mask=False, # Masks are not padded here (in instance segmentation, the mask needs to be padded)
|
32 |
+
pad_size_divisor=1024), # Padding the image to multiples of 1024
|
33 |
+
detector=detector,
|
34 |
+
masa_adapter=[
|
35 |
+
dict(
|
36 |
+
type='SimpleFPN',
|
37 |
+
in_channels=[1280, 1280, 1280, 1280],
|
38 |
+
out_channels=256,
|
39 |
+
use_residual=True,
|
40 |
+
num_outs=5),
|
41 |
+
dict(
|
42 |
+
type='DyHead',
|
43 |
+
in_channels=256,
|
44 |
+
out_channels=256,
|
45 |
+
num_blocks=3)
|
46 |
+
],
|
47 |
+
rpn_head=dict(
|
48 |
+
type='RPNHead',
|
49 |
+
in_channels=256,
|
50 |
+
feat_channels=256,
|
51 |
+
anchor_generator=dict(
|
52 |
+
type='AnchorGenerator',
|
53 |
+
scales=[8],
|
54 |
+
ratios=[0.5, 1.0, 2.0],
|
55 |
+
strides=[4, 8, 16, 32, 64]),
|
56 |
+
bbox_coder=dict(
|
57 |
+
type='DeltaXYWHBBoxCoder',
|
58 |
+
target_means=[.0, .0, .0, .0],
|
59 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
60 |
+
loss_cls=dict(
|
61 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
62 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
63 |
+
),
|
64 |
+
roi_head=dict(
|
65 |
+
type='StandardRoIHead',
|
66 |
+
bbox_roi_extractor=dict(
|
67 |
+
type='SingleRoIExtractor',
|
68 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
69 |
+
out_channels=256,
|
70 |
+
featmap_strides=[4, 8, 16, 32]),
|
71 |
+
bbox_head=dict(
|
72 |
+
type='Shared2FCBBoxHead',
|
73 |
+
in_channels=256,
|
74 |
+
fc_out_channels=1024,
|
75 |
+
roi_feat_size=7,
|
76 |
+
num_classes=1,
|
77 |
+
bbox_coder=dict(
|
78 |
+
type='DeltaXYWHBBoxCoder',
|
79 |
+
target_means=[0., 0., 0., 0.],
|
80 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
81 |
+
reg_class_agnostic=True,
|
82 |
+
loss_cls=dict(
|
83 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
84 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
85 |
+
# model training and testing settings
|
86 |
+
train_cfg=dict(
|
87 |
+
rpn=dict(
|
88 |
+
assigner=dict(
|
89 |
+
type='MaxIoUAssigner',
|
90 |
+
pos_iou_thr=0.7,
|
91 |
+
neg_iou_thr=0.3,
|
92 |
+
min_pos_iou=0.3,
|
93 |
+
match_low_quality=True,
|
94 |
+
ignore_iof_thr=-1),
|
95 |
+
sampler=dict(
|
96 |
+
type='RandomSampler',
|
97 |
+
num=256,
|
98 |
+
pos_fraction=0.5,
|
99 |
+
neg_pos_ub=-1,
|
100 |
+
add_gt_as_proposals=False),
|
101 |
+
allowed_border=-1,
|
102 |
+
pos_weight=-1,
|
103 |
+
debug=False),
|
104 |
+
rpn_proposal=dict(
|
105 |
+
nms_pre=2000,
|
106 |
+
max_per_img=1000,
|
107 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
108 |
+
min_bbox_size=0),
|
109 |
+
rcnn=dict(
|
110 |
+
assigner=dict(
|
111 |
+
type='MaxIoUAssigner',
|
112 |
+
pos_iou_thr=0.5,
|
113 |
+
neg_iou_thr=0.5,
|
114 |
+
min_pos_iou=0.5,
|
115 |
+
match_low_quality=False,
|
116 |
+
ignore_iof_thr=-1),
|
117 |
+
sampler=dict(
|
118 |
+
type='RandomSampler',
|
119 |
+
num=512,
|
120 |
+
pos_fraction=0.25,
|
121 |
+
neg_pos_ub=-1,
|
122 |
+
add_gt_as_proposals=True),
|
123 |
+
pos_weight=-1,
|
124 |
+
debug=False)),
|
125 |
+
test_cfg=dict(
|
126 |
+
rpn=dict(
|
127 |
+
nms_pre=1000,
|
128 |
+
max_per_img=1000,
|
129 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
130 |
+
min_bbox_size=0),
|
131 |
+
rcnn=dict(
|
132 |
+
score_thr=0.02,
|
133 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
134 |
+
nms=dict(type='nms',
|
135 |
+
iou_threshold=0.5,
|
136 |
+
class_agnostic=True,
|
137 |
+
split_thr=100000),
|
138 |
+
max_per_img=50,
|
139 |
+
mask_thr_binary=0.5)
|
140 |
+
# soft-nms is also supported for rcnn testing
|
141 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
142 |
+
),
|
143 |
+
track_head=dict(
|
144 |
+
type='MasaTrackHead',
|
145 |
+
roi_extractor=dict(
|
146 |
+
type='SingleRoIExtractor',
|
147 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
148 |
+
out_channels=256,
|
149 |
+
featmap_strides=[4, 8, 16, 32]),
|
150 |
+
embed_head=dict(
|
151 |
+
type='QuasiDenseEmbedHead',
|
152 |
+
num_convs=4,
|
153 |
+
num_fcs=1,
|
154 |
+
embed_channels=256,
|
155 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
156 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
157 |
+
loss_track_aux=dict(
|
158 |
+
type='MarginL2Loss',
|
159 |
+
neg_pos_ub=3,
|
160 |
+
pos_margin=0,
|
161 |
+
neg_margin=0.1,
|
162 |
+
hard_mining=True,
|
163 |
+
loss_weight=1.0)),
|
164 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
165 |
+
train_cfg=dict(
|
166 |
+
assigner=dict(
|
167 |
+
type='MaxIoUAssigner',
|
168 |
+
pos_iou_thr=0.7,
|
169 |
+
neg_iou_thr=0.5,
|
170 |
+
min_pos_iou=0.5,
|
171 |
+
match_low_quality=False,
|
172 |
+
ignore_iof_thr=-1),
|
173 |
+
sampler=dict(
|
174 |
+
type='CombinedSampler',
|
175 |
+
num=512,
|
176 |
+
pos_fraction=0.8,
|
177 |
+
neg_pos_ub=3,
|
178 |
+
add_gt_as_proposals=True,
|
179 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
180 |
+
neg_sampler=dict(type='RandomSampler')))),
|
181 |
+
tracker=dict(
|
182 |
+
type='MasaTaoTracker',
|
183 |
+
init_score_thr=0.0001,
|
184 |
+
obj_score_thr=0.0001,
|
185 |
+
match_score_thr=0.5,
|
186 |
+
memo_tracklet_frames=10,
|
187 |
+
memo_momentum=0.8,
|
188 |
+
with_cats=False,
|
189 |
+
max_distance=-1,
|
190 |
+
fps=1,
|
191 |
+
)
|
192 |
+
)
|
193 |
+
|
194 |
+
test_pipeline = [
|
195 |
+
dict(
|
196 |
+
type='TransformBroadcaster',
|
197 |
+
transforms=[
|
198 |
+
dict(type='LoadImageFromFile'),
|
199 |
+
dict(
|
200 |
+
type='Resize',
|
201 |
+
scale=(1024, 1024),
|
202 |
+
keep_ratio=True),
|
203 |
+
dict(type='LoadTrackAnnotations')
|
204 |
+
]),
|
205 |
+
dict(type='PackTrackInputs')
|
206 |
+
]
|
207 |
+
|
208 |
+
|
209 |
+
train_dataloader = None
|
210 |
+
train_cfg = None
|
211 |
+
val_cfg = dict(type='ValLoop')
|
212 |
+
test_cfg = dict(type='TestLoop')
|
213 |
+
|
214 |
+
default_hooks = dict(
|
215 |
+
logger=dict(type='LoggerHook', interval=50),
|
216 |
+
visualization=dict(type='TrackVisualizationHook', draw=False))
|
217 |
+
|
218 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
219 |
+
visualizer = dict(
|
220 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
221 |
+
|
222 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
223 |
+
val_dataloader = dict(
|
224 |
+
dataset=dict(
|
225 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
226 |
+
pipeline=test_pipeline,
|
227 |
+
)
|
228 |
+
)
|
229 |
+
test_dataloader = val_dataloader
|
230 |
+
test_evaluator = dict(
|
231 |
+
ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
|
232 |
+
outfile_prefix='results/masa_results/masa-sam-vith-tao-test-detic-dets',
|
233 |
+
)
|
configs/masa-sam/tao_teta_test/masa_sam_vith_tao_test_teter_swinT_dets.py
ADDED
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_base_ = [
|
2 |
+
'../sam-vith.py',
|
3 |
+
'../../datasets/tao/tao_dataset_v05.py',
|
4 |
+
'../../default_runtime.py'
|
5 |
+
]
|
6 |
+
default_scope = 'mmdet'
|
7 |
+
detector = _base_.model
|
8 |
+
detector['init_cfg'] = dict(
|
9 |
+
type='Pretrained',
|
10 |
+
checkpoint= 'saved_models/pretrain_weights/sam_vit_h_4b8939_mmdet.pth'
|
11 |
+
# noqa: E501
|
12 |
+
)
|
13 |
+
detector['type'] = 'SamMasa'
|
14 |
+
|
15 |
+
del _base_.model
|
16 |
+
|
17 |
+
model = dict(
|
18 |
+
type='MASA',
|
19 |
+
freeze_detector=True,
|
20 |
+
unified_backbone=True,
|
21 |
+
load_public_dets = True,
|
22 |
+
public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/',
|
23 |
+
data_preprocessor=dict(
|
24 |
+
type='TrackDataPreprocessor',
|
25 |
+
# Image normalization parameters
|
26 |
+
mean=[123.675, 116.28, 103.53],
|
27 |
+
std=[58.395, 57.12, 57.375],
|
28 |
+
bgr_to_rgb=True,
|
29 |
+
# Image padding parameters
|
30 |
+
pad_mask=False, # Masks are not padded here (in instance segmentation, the mask needs to be padded)
|
31 |
+
pad_size_divisor=1024), # Padding the image to multiples of 1024
|
32 |
+
detector=detector,
|
33 |
+
masa_adapter=[
|
34 |
+
dict(
|
35 |
+
type='SimpleFPN',
|
36 |
+
in_channels=[1280, 1280, 1280, 1280],
|
37 |
+
out_channels=256,
|
38 |
+
use_residual=True,
|
39 |
+
num_outs=5),
|
40 |
+
dict(
|
41 |
+
type='DyHead',
|
42 |
+
in_channels=256,
|
43 |
+
out_channels=256,
|
44 |
+
num_blocks=3)
|
45 |
+
],
|
46 |
+
rpn_head=dict(
|
47 |
+
type='RPNHead',
|
48 |
+
in_channels=256,
|
49 |
+
feat_channels=256,
|
50 |
+
anchor_generator=dict(
|
51 |
+
type='AnchorGenerator',
|
52 |
+
scales=[8],
|
53 |
+
ratios=[0.5, 1.0, 2.0],
|
54 |
+
strides=[4, 8, 16, 32, 64]),
|
55 |
+
bbox_coder=dict(
|
56 |
+
type='DeltaXYWHBBoxCoder',
|
57 |
+
target_means=[.0, .0, .0, .0],
|
58 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
59 |
+
loss_cls=dict(
|
60 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
61 |
+
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
|
62 |
+
),
|
63 |
+
roi_head=dict(
|
64 |
+
type='StandardRoIHead',
|
65 |
+
bbox_roi_extractor=dict(
|
66 |
+
type='SingleRoIExtractor',
|
67 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
68 |
+
out_channels=256,
|
69 |
+
featmap_strides=[4, 8, 16, 32]),
|
70 |
+
bbox_head=dict(
|
71 |
+
type='Shared2FCBBoxHead',
|
72 |
+
in_channels=256,
|
73 |
+
fc_out_channels=1024,
|
74 |
+
roi_feat_size=7,
|
75 |
+
num_classes=1,
|
76 |
+
bbox_coder=dict(
|
77 |
+
type='DeltaXYWHBBoxCoder',
|
78 |
+
target_means=[0., 0., 0., 0.],
|
79 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
80 |
+
reg_class_agnostic=True,
|
81 |
+
loss_cls=dict(
|
82 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
83 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
84 |
+
# model training and testing settings
|
85 |
+
train_cfg=dict(
|
86 |
+
rpn=dict(
|
87 |
+
assigner=dict(
|
88 |
+
type='MaxIoUAssigner',
|
89 |
+
pos_iou_thr=0.7,
|
90 |
+
neg_iou_thr=0.3,
|
91 |
+
min_pos_iou=0.3,
|
92 |
+
match_low_quality=True,
|
93 |
+
ignore_iof_thr=-1),
|
94 |
+
sampler=dict(
|
95 |
+
type='RandomSampler',
|
96 |
+
num=256,
|
97 |
+
pos_fraction=0.5,
|
98 |
+
neg_pos_ub=-1,
|
99 |
+
add_gt_as_proposals=False),
|
100 |
+
allowed_border=-1,
|
101 |
+
pos_weight=-1,
|
102 |
+
debug=False),
|
103 |
+
rpn_proposal=dict(
|
104 |
+
nms_pre=2000,
|
105 |
+
max_per_img=1000,
|
106 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
107 |
+
min_bbox_size=0),
|
108 |
+
rcnn=dict(
|
109 |
+
assigner=dict(
|
110 |
+
type='MaxIoUAssigner',
|
111 |
+
pos_iou_thr=0.5,
|
112 |
+
neg_iou_thr=0.5,
|
113 |
+
min_pos_iou=0.5,
|
114 |
+
match_low_quality=False,
|
115 |
+
ignore_iof_thr=-1),
|
116 |
+
sampler=dict(
|
117 |
+
type='RandomSampler',
|
118 |
+
num=512,
|
119 |
+
pos_fraction=0.25,
|
120 |
+
neg_pos_ub=-1,
|
121 |
+
add_gt_as_proposals=True),
|
122 |
+
pos_weight=-1,
|
123 |
+
debug=False)),
|
124 |
+
test_cfg=dict(
|
125 |
+
rpn=dict(
|
126 |
+
nms_pre=1000,
|
127 |
+
max_per_img=1000,
|
128 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
129 |
+
min_bbox_size=0),
|
130 |
+
rcnn=dict(
|
131 |
+
score_thr=0.02,
|
132 |
+
# nms=dict(type='nms', iou_threshold=0.5),
|
133 |
+
nms=dict(type='nms',
|
134 |
+
iou_threshold=0.5,
|
135 |
+
class_agnostic=True,
|
136 |
+
split_thr=100000),
|
137 |
+
max_per_img=50,
|
138 |
+
mask_thr_binary=0.5)
|
139 |
+
# soft-nms is also supported for rcnn testing
|
140 |
+
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
141 |
+
),
|
142 |
+
track_head=dict(
|
143 |
+
type='MasaTrackHead',
|
144 |
+
roi_extractor=dict(
|
145 |
+
type='SingleRoIExtractor',
|
146 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
147 |
+
out_channels=256,
|
148 |
+
featmap_strides=[4, 8, 16, 32]),
|
149 |
+
embed_head=dict(
|
150 |
+
type='QuasiDenseEmbedHead',
|
151 |
+
num_convs=4,
|
152 |
+
num_fcs=1,
|
153 |
+
embed_channels=256,
|
154 |
+
norm_cfg=dict(type='GN', num_groups=32),
|
155 |
+
loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
|
156 |
+
loss_track_aux=dict(
|
157 |
+
type='MarginL2Loss',
|
158 |
+
neg_pos_ub=3,
|
159 |
+
pos_margin=0,
|
160 |
+
neg_margin=0.1,
|
161 |
+
hard_mining=True,
|
162 |
+
loss_weight=1.0)),
|
163 |
+
# loss_bbox=dict(type='L1Loss', loss_weight=1.0),
|
164 |
+
train_cfg=dict(
|
165 |
+
assigner=dict(
|
166 |
+
type='MaxIoUAssigner',
|
167 |
+
pos_iou_thr=0.7,
|
168 |
+
neg_iou_thr=0.5,
|
169 |
+
min_pos_iou=0.5,
|
170 |
+
match_low_quality=False,
|
171 |
+
ignore_iof_thr=-1),
|
172 |
+
sampler=dict(
|
173 |
+
type='CombinedSampler',
|
174 |
+
num=512,
|
175 |
+
pos_fraction=0.8,
|
176 |
+
neg_pos_ub=3,
|
177 |
+
add_gt_as_proposals=True,
|
178 |
+
pos_sampler=dict(type='InstanceBalancedPosSampler'),
|
179 |
+
neg_sampler=dict(type='RandomSampler')))),
|
180 |
+
tracker=dict(
|
181 |
+
type='MasaTaoTracker',
|
182 |
+
init_score_thr=0.0001,
|
183 |
+
obj_score_thr=0.0001,
|
184 |
+
match_score_thr=0.5,
|
185 |
+
memo_tracklet_frames=10,
|
186 |
+
memo_momentum=0.8,
|
187 |
+
with_cats=False,
|
188 |
+
max_distance=-1,
|
189 |
+
fps=1,
|
190 |
+
)
|
191 |
+
)
|
192 |
+
|
193 |
+
test_pipeline = [
|
194 |
+
dict(
|
195 |
+
type='TransformBroadcaster',
|
196 |
+
transforms=[
|
197 |
+
dict(type='LoadImageFromFile'),
|
198 |
+
dict(
|
199 |
+
type='Resize',
|
200 |
+
scale=(1024, 1024),
|
201 |
+
keep_ratio=True),
|
202 |
+
dict(type='LoadTrackAnnotations')
|
203 |
+
]),
|
204 |
+
dict(type='PackTrackInputs')
|
205 |
+
]
|
206 |
+
|
207 |
+
|
208 |
+
train_dataloader = None
|
209 |
+
train_cfg = None
|
210 |
+
val_cfg = dict(type='ValLoop')
|
211 |
+
test_cfg = dict(type='TestLoop')
|
212 |
+
|
213 |
+
default_hooks = dict(
|
214 |
+
logger=dict(type='LoggerHook', interval=50),
|
215 |
+
visualization=dict(type='TrackVisualizationHook', draw=False))
|
216 |
+
|
217 |
+
vis_backends = [dict(type='LocalVisBackend')]
|
218 |
+
visualizer = dict(
|
219 |
+
type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
220 |
+
|
221 |
+
# custom hooks
|
222 |
+
custom_hooks = [
|
223 |
+
# Synchronize model buffers such as running_mean and running_var in BN
|
224 |
+
# at the end of each epoch
|
225 |
+
dict(type='SyncBuffersHook')
|
226 |
+
]
|
227 |
+
auto_scale_lr = dict(enable=False, base_batch_size=16)
|
228 |
+
val_dataloader = dict(
|
229 |
+
dataset=dict(
|
230 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
|
231 |
+
pipeline=test_pipeline,
|
232 |
+
)
|
233 |
+
)
|
234 |
+
test_dataloader = val_dataloader
|
235 |
+
val_evaluator = dict(
|
236 |
+
ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
|
237 |
+
outfile_prefix='results/masa_results/masa-sam-vith-tao-test-teter-swinT-dets',
|
238 |
+
)
|
239 |
+
test_evaluator = val_evaluator
|
environment_docker.yml
ADDED
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: masaenv
|
2 |
+
channels:
|
3 |
+
- pytorch
|
4 |
+
- nvidia
|
5 |
+
- conda-forge
|
6 |
+
- defaults
|
7 |
+
dependencies:
|
8 |
+
- _libgcc_mutex=0.1=conda_forge
|
9 |
+
- _openmp_mutex=4.5=2_gnu
|
10 |
+
- aom=3.9.1=hac33072_0
|
11 |
+
- blas=1.0=mkl
|
12 |
+
- brotli-python=1.0.9=py311h6a678d5_8
|
13 |
+
- bzip2=1.0.8=h5eee18b_6
|
14 |
+
- ca-certificates=2024.6.2=hbcca054_0
|
15 |
+
- cairo=1.18.0=h3faef2a_0
|
16 |
+
- certifi=2024.6.2=pyhd8ed1ab_0
|
17 |
+
- charset-normalizer=2.0.4=pyhd3eb1b0_0
|
18 |
+
- cuda-cudart=11.8.89=0
|
19 |
+
- cuda-cupti=11.8.87=0
|
20 |
+
- cuda-libraries=11.8.0=0
|
21 |
+
- cuda-nvrtc=11.8.89=0
|
22 |
+
- cuda-nvtx=11.8.86=0
|
23 |
+
- cuda-runtime=11.8.0=0
|
24 |
+
- cudatoolkit=11.8.0=h6a678d5_0
|
25 |
+
- dav1d=1.2.1=hd590300_0
|
26 |
+
- expat=2.6.2=h59595ed_0
|
27 |
+
- ffmpeg=7.0.1=gpl_hb399a10_100
|
28 |
+
- font-ttf-dejavu-sans-mono=2.37=hab24e00_0
|
29 |
+
- font-ttf-inconsolata=3.000=h77eed37_0
|
30 |
+
- font-ttf-source-code-pro=2.038=h77eed37_0
|
31 |
+
- font-ttf-ubuntu=0.83=h77eed37_2
|
32 |
+
- fontconfig=2.14.2=h14ed4e7_0
|
33 |
+
- fonts-conda-ecosystem=1=0
|
34 |
+
- fonts-conda-forge=1=0
|
35 |
+
- freetype=2.12.1=h4a9f257_0
|
36 |
+
- fribidi=1.0.10=h36c2ea0_0
|
37 |
+
- gmp=6.3.0=h59595ed_1
|
38 |
+
- gmpy2=2.1.2=py311hc9b5ff0_0
|
39 |
+
- gnutls=3.7.9=hb077bed_0
|
40 |
+
- graphite2=1.3.13=h59595ed_1003
|
41 |
+
- harfbuzz=8.5.0=hfac3d4d_0
|
42 |
+
- icu=73.2=h59595ed_0
|
43 |
+
- idna=3.7=py311h06a4308_0
|
44 |
+
- intel-openmp=2023.1.0=hdb19cb5_46306
|
45 |
+
- jinja2=3.1.4=py311h06a4308_0
|
46 |
+
- jpeg=9e=h5eee18b_1
|
47 |
+
- lame=3.100=h7b6447c_0
|
48 |
+
- lcms2=2.12=h3be6417_0
|
49 |
+
- ld_impl_linux-64=2.38=h1181459_1
|
50 |
+
- lerc=3.0=h295c915_0
|
51 |
+
- libabseil=20240116.2=cxx17_h59595ed_0
|
52 |
+
- libass=0.17.1=h8fe9dca_1
|
53 |
+
- libcublas=11.11.3.6=0
|
54 |
+
- libcufft=10.9.0.58=0
|
55 |
+
- libcufile=1.9.1.3=0
|
56 |
+
- libcurand=10.3.5.147=0
|
57 |
+
- libcusolver=11.4.1.48=0
|
58 |
+
- libcusparse=11.7.5.86=0
|
59 |
+
- libdeflate=1.17=h5eee18b_1
|
60 |
+
- libdrm=2.4.120=hd590300_0
|
61 |
+
- libexpat=2.6.2=h59595ed_0
|
62 |
+
- libffi=3.4.4=h6a678d5_1
|
63 |
+
- libgcc-ng=13.2.0=h77fa898_10
|
64 |
+
- libglib=2.80.2=hf974151_0
|
65 |
+
- libgomp=13.2.0=h77fa898_10
|
66 |
+
- libhwloc=2.10.0=default_h5622ce7_1001
|
67 |
+
- libiconv=1.17=hd590300_2
|
68 |
+
- libidn2=2.3.4=h5eee18b_0
|
69 |
+
- libjpeg-turbo=2.0.0=h9bf148f_0
|
70 |
+
- libnpp=11.8.0.86=0
|
71 |
+
- libnsl=2.0.1=hd590300_0
|
72 |
+
- libnvjpeg=11.9.0.86=0
|
73 |
+
- libopenvino=2024.1.0=h2da1b83_7
|
74 |
+
- libopenvino-auto-batch-plugin=2024.1.0=hb045406_7
|
75 |
+
- libopenvino-auto-plugin=2024.1.0=hb045406_7
|
76 |
+
- libopenvino-hetero-plugin=2024.1.0=h5c03a75_7
|
77 |
+
- libopenvino-intel-cpu-plugin=2024.1.0=h2da1b83_7
|
78 |
+
- libopenvino-intel-gpu-plugin=2024.1.0=h2da1b83_7
|
79 |
+
- libopenvino-intel-npu-plugin=2024.1.0=he02047a_7
|
80 |
+
- libopenvino-ir-frontend=2024.1.0=h5c03a75_7
|
81 |
+
- libopenvino-onnx-frontend=2024.1.0=h07e8aee_7
|
82 |
+
- libopenvino-paddle-frontend=2024.1.0=h07e8aee_7
|
83 |
+
- libopenvino-pytorch-frontend=2024.1.0=he02047a_7
|
84 |
+
- libopenvino-tensorflow-frontend=2024.1.0=h39126c6_7
|
85 |
+
- libopenvino-tensorflow-lite-frontend=2024.1.0=he02047a_7
|
86 |
+
- libopus=1.3.1=h7f98852_1
|
87 |
+
- libpciaccess=0.18=hd590300_0
|
88 |
+
- libpng=1.6.39=h5eee18b_0
|
89 |
+
- libprotobuf=4.25.3=h08a7969_0
|
90 |
+
- libsqlite=3.46.0=hde9e2c9_0
|
91 |
+
- libstdcxx-ng=13.2.0=hc0a3c3a_10
|
92 |
+
- libtasn1=4.19.0=h5eee18b_0
|
93 |
+
- libtiff=4.5.1=h6a678d5_0
|
94 |
+
- libunistring=0.9.10=h27cfd23_0
|
95 |
+
- libuuid=2.38.1=h0b41bf4_0
|
96 |
+
- libva=2.21.0=h4ab18f5_2
|
97 |
+
- libvpx=1.14.1=hac33072_0
|
98 |
+
- libwebp-base=1.3.2=h5eee18b_0
|
99 |
+
- libxcb=1.15=h0b41bf4_0
|
100 |
+
- libxcrypt=4.4.36=hd590300_1
|
101 |
+
- libxml2=2.12.7=hc051c1a_1
|
102 |
+
- libzlib=1.2.13=h4ab18f5_6
|
103 |
+
- llvm-openmp=14.0.6=h9e868ea_0
|
104 |
+
- lz4-c=1.9.4=h6a678d5_1
|
105 |
+
- markupsafe=2.1.3=py311h5eee18b_0
|
106 |
+
- mkl=2023.1.0=h213fc3f_46344
|
107 |
+
- mkl-service=2.4.0=py311h5eee18b_1
|
108 |
+
- mkl_fft=1.3.8=py311h5eee18b_0
|
109 |
+
- mkl_random=1.2.4=py311hdb19cb5_0
|
110 |
+
- mpc=1.1.0=h10f8cd9_1
|
111 |
+
- mpfr=4.0.2=hb69a4c5_1
|
112 |
+
- mpmath=1.3.0=py311h06a4308_0
|
113 |
+
- ncurses=6.4=h6a678d5_0
|
114 |
+
- nettle=3.9.1=h7ab15ed_0
|
115 |
+
- networkx=3.2.1=py311h06a4308_0
|
116 |
+
- numpy=1.26.4=py311h08b1b3b_0
|
117 |
+
- numpy-base=1.26.4=py311hf175353_0
|
118 |
+
- ocl-icd=2.3.2=hd590300_1
|
119 |
+
- openh264=2.4.1=h59595ed_0
|
120 |
+
- openjpeg=2.4.0=h3ad879b_0
|
121 |
+
- openssl=3.3.1=h4ab18f5_0
|
122 |
+
- p11-kit=0.24.1=hc5aa10d_0
|
123 |
+
- pcre2=10.43=hcad00b1_0
|
124 |
+
- pillow=10.3.0=py311h5eee18b_0
|
125 |
+
- pip=24.0=py311h06a4308_0
|
126 |
+
- pixman=0.43.2=h59595ed_0
|
127 |
+
- pthread-stubs=0.4=h36c2ea0_1001
|
128 |
+
- pugixml=1.14=h59595ed_0
|
129 |
+
- pysocks=1.7.1=py311h06a4308_0
|
130 |
+
- python=3.11.8=hab00c5b_0_cpython
|
131 |
+
- pytorch=2.1.2=py3.11_cuda11.8_cudnn8.7.0_0
|
132 |
+
- pytorch-cuda=11.8=h7e8668a_5
|
133 |
+
- pytorch-mutex=1.0=cuda
|
134 |
+
- pyyaml=6.0.1=py311h5eee18b_0
|
135 |
+
- readline=8.2=h5eee18b_0
|
136 |
+
- snappy=1.2.0=hdb0a2a9_1
|
137 |
+
- sqlite=3.45.3=h5eee18b_0
|
138 |
+
- svt-av1=2.1.0=hac33072_0
|
139 |
+
- sympy=1.12=py311h06a4308_0
|
140 |
+
- tbb=2021.12.0=h297d8ca_1
|
141 |
+
- tk=8.6.14=h39e8969_0
|
142 |
+
- torchaudio=2.1.2=py311_cu118
|
143 |
+
- torchtriton=2.1.0=py311
|
144 |
+
- torchvision=0.16.2=py311_cu118
|
145 |
+
- typing_extensions=4.11.0=py311h06a4308_0
|
146 |
+
- wheel=0.43.0=py311h06a4308_0
|
147 |
+
- x264=1!164.3095=h166bdaf_2
|
148 |
+
- x265=3.5=h924138e_3
|
149 |
+
- xorg-fixesproto=5.0=h7f98852_1002
|
150 |
+
- xorg-kbproto=1.0.7=h7f98852_1002
|
151 |
+
- xorg-libice=1.1.1=hd590300_0
|
152 |
+
- xorg-libsm=1.2.4=h7391055_0
|
153 |
+
- xorg-libx11=1.8.9=h8ee46fc_0
|
154 |
+
- xorg-libxau=1.0.11=hd590300_0
|
155 |
+
- xorg-libxdmcp=1.1.3=h7f98852_0
|
156 |
+
- xorg-libxext=1.3.4=h0b41bf4_2
|
157 |
+
- xorg-libxfixes=5.0.3=h7f98852_1004
|
158 |
+
- xorg-libxrender=0.9.11=hd590300_0
|
159 |
+
- xorg-renderproto=0.11.1=h7f98852_1002
|
160 |
+
- xorg-xextproto=7.3.0=h0b41bf4_1003
|
161 |
+
- xorg-xproto=7.0.31=h7f98852_1007
|
162 |
+
- xz=5.4.6=h5eee18b_1
|
163 |
+
- yaml=0.2.5=h7b6447c_0
|
164 |
+
- zlib=1.2.13=h4ab18f5_6
|
165 |
+
- zstd=1.5.5=hc292b87_2
|
166 |
+
- pip:
|
167 |
+
- addict==2.4.0
|
168 |
+
- aiofiles==23.2.1
|
169 |
+
- aliyun-python-sdk-core==2.15.1
|
170 |
+
- aliyun-python-sdk-kms==2.16.3
|
171 |
+
- altair==5.3.0
|
172 |
+
- annotated-types==0.7.0
|
173 |
+
- anyio==4.4.0
|
174 |
+
- attrs==23.2.0
|
175 |
+
- boto3==1.34.128
|
176 |
+
- botocore==1.34.128
|
177 |
+
- cffi==1.16.0
|
178 |
+
- click==8.1.7
|
179 |
+
- clip==1.0
|
180 |
+
- colorama==0.4.6
|
181 |
+
- contourpy==1.2.1
|
182 |
+
- crcmod==1.7
|
183 |
+
- cryptography==42.0.8
|
184 |
+
- cycler==0.12.1
|
185 |
+
- cython==3.0.10
|
186 |
+
- decorator==4.4.2
|
187 |
+
- defusedxml==0.7.1
|
188 |
+
- dnspython==2.6.1
|
189 |
+
- einops==0.8.0
|
190 |
+
- email-validator==2.1.2
|
191 |
+
- fairscale==0.4.13
|
192 |
+
- fastapi==0.111.0
|
193 |
+
- fastapi-cli==0.0.4
|
194 |
+
- ffmpy==0.3.2
|
195 |
+
- filelock==3.14.0
|
196 |
+
- fonttools==4.53.0
|
197 |
+
- fsspec==2024.6.0
|
198 |
+
- ftfy==6.2.0
|
199 |
+
- gradio==4.36.1
|
200 |
+
- gradio-client==1.0.1
|
201 |
+
- h11==0.14.0
|
202 |
+
- h5py==3.11.0
|
203 |
+
- httpcore==1.0.5
|
204 |
+
- httptools==0.6.1
|
205 |
+
- httpx==0.27.0
|
206 |
+
- huggingface-hub==0.23.4
|
207 |
+
- imageio==2.34.1
|
208 |
+
- importlib-metadata==7.1.0
|
209 |
+
- importlib-resources==6.4.0
|
210 |
+
- jmespath==0.10.0
|
211 |
+
- joblib==1.4.2
|
212 |
+
- jsonschema==4.22.0
|
213 |
+
- jsonschema-specifications==2023.12.1
|
214 |
+
- kiwisolver==1.4.5
|
215 |
+
- llvmlite==0.43.0
|
216 |
+
- lvis==0.5.3
|
217 |
+
- markdown==3.6
|
218 |
+
- markdown-it-py==3.0.0
|
219 |
+
- matplotlib==3.9.0
|
220 |
+
- mdurl==0.1.2
|
221 |
+
- mmcv==2.1.0
|
222 |
+
- mmdet==3.3.0
|
223 |
+
- mmengine==0.10.4
|
224 |
+
- model-index==0.1.11
|
225 |
+
- motmetrics==1.4.0
|
226 |
+
- moviepy==0.2.3.5
|
227 |
+
- nanoid==2.0.0
|
228 |
+
- natsort==8.4.0
|
229 |
+
- nltk==3.8.1
|
230 |
+
- numba==0.60.0
|
231 |
+
- opencv-python==4.10.0.84
|
232 |
+
- opencv-python-headless==4.10.0.84
|
233 |
+
- opendatalab==0.0.10
|
234 |
+
- openmim==0.3.9
|
235 |
+
- openxlab==0.1.0
|
236 |
+
- ordered-set==4.1.0
|
237 |
+
- orjson==3.10.5
|
238 |
+
- oss2==2.17.0
|
239 |
+
- packaging==24.1
|
240 |
+
- pandas==2.2.2
|
241 |
+
- platformdirs==4.2.2
|
242 |
+
- plyfile==1.0.3
|
243 |
+
- psutil==5.9.8
|
244 |
+
- pycocotools==2.0.8
|
245 |
+
- pycparser==2.22
|
246 |
+
- pycryptodome==3.20.0
|
247 |
+
- pydantic==2.7.4
|
248 |
+
- pydantic-core==2.18.4
|
249 |
+
- pydub==0.25.1
|
250 |
+
- pygments==2.18.0
|
251 |
+
- pyparsing==3.1.2
|
252 |
+
- python-dateutil==2.9.0.post0
|
253 |
+
- python-dotenv==1.0.1
|
254 |
+
- python-multipart==0.0.9
|
255 |
+
- pytz==2023.4
|
256 |
+
- referencing==0.35.1
|
257 |
+
- regex==2024.5.15
|
258 |
+
- requests==2.32.3
|
259 |
+
- rich==13.4.2
|
260 |
+
- rpds-py==0.18.1
|
261 |
+
- ruff==0.4.9
|
262 |
+
- s3transfer==0.10.1
|
263 |
+
- safetensors==0.4.3
|
264 |
+
- scalabel==0.3.0
|
265 |
+
- scipy==1.13.1
|
266 |
+
- script-utils==0.0.1
|
267 |
+
- seaborn==0.13.2
|
268 |
+
- semantic-version==2.10.0
|
269 |
+
- setuptools==60.2.0
|
270 |
+
- shapely==2.0.4
|
271 |
+
- shellingham==1.5.4
|
272 |
+
- six==1.16.0
|
273 |
+
- sniffio==1.3.1
|
274 |
+
- starlette==0.37.2
|
275 |
+
- supervision==0.21.0
|
276 |
+
- tabulate==0.9.0
|
277 |
+
- tao==0.1.0
|
278 |
+
- termcolor==2.4.0
|
279 |
+
- terminaltables==3.1.10
|
280 |
+
- teta==0.1.0
|
281 |
+
- tokenizers==0.15.2
|
282 |
+
- toml==0.10.2
|
283 |
+
- tomli==2.0.1
|
284 |
+
- tomlkit==0.12.0
|
285 |
+
- toolz==0.12.1
|
286 |
+
- tqdm==4.65.2
|
287 |
+
- trackeval==1.0.dev1
|
288 |
+
- transformers==4.38.2
|
289 |
+
- typer==0.12.3
|
290 |
+
- tzdata==2024.1
|
291 |
+
- ujson==5.10.0
|
292 |
+
- urllib3==2.2.2
|
293 |
+
- uvicorn==0.30.1
|
294 |
+
- uvloop==0.19.0
|
295 |
+
- watchfiles==0.22.0
|
296 |
+
- wcwidth==0.2.13
|
297 |
+
- websockets==11.0.3
|
298 |
+
- xmltodict==0.13.0
|
299 |
+
- yacs==0.1.8
|
300 |
+
- yapf==0.40.2
|
301 |
+
- youtube-dl==2021.12.17
|
302 |
+
- zipp==3.19.2
|
masa/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from .datasets import * # noqa
|
2 |
+
from .models import * # noqa
|
3 |
+
from .visualization import * # noqa
|
masa/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (267 Bytes). View file
|
|
masa/apis/__init__.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
from .masa_inference import (build_test_pipeline, inference_detector,
|
3 |
+
inference_masa, init_masa)
|
4 |
+
|
5 |
+
__all__ = [
|
6 |
+
"inference_masa",
|
7 |
+
"init_masa",
|
8 |
+
"inference_detector",
|
9 |
+
"build_test_pipeline",
|
10 |
+
]
|
masa/apis/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (418 Bytes). View file
|
|
masa/apis/__pycache__/masa_inference.cpython-311.pyc
ADDED
Binary file (13 kB). View file
|
|
masa/apis/masa_inference.py
ADDED
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
import time
|
3 |
+
import warnings
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import Optional, Sequence, Union
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
from mmcv.ops import RoIPool
|
11 |
+
from mmcv.transforms import Compose
|
12 |
+
from mmdet.evaluation import get_classes
|
13 |
+
from mmdet.registry import MODELS
|
14 |
+
from mmdet.structures import DetDataSample, SampleList
|
15 |
+
from mmdet.utils import ConfigType, get_test_pipeline_cfg
|
16 |
+
from mmengine.config import Config
|
17 |
+
from mmengine.dataset import default_collate
|
18 |
+
from mmengine.model.utils import revert_sync_batchnorm
|
19 |
+
from mmengine.registry import init_default_scope
|
20 |
+
from mmengine.runner import autocast, load_checkpoint
|
21 |
+
|
22 |
+
ImagesType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]]
|
23 |
+
|
24 |
+
|
25 |
+
def init_masa(
    config: Union[str, Path, Config],
    checkpoint: Optional[str] = None,
    palette: str = "none",
    device: str = "cuda:0",
    cfg_options: Optional[dict] = None,
) -> nn.Module:
    """Build a MASA model from a config and optionally load its weights.

    Args:
        config (str, :obj:`Path`, or :obj:`mmengine.Config`): Path to a config
            file, or an already loaded config object.
        checkpoint (str, optional): Checkpoint to load. When None, no weights
            are loaded and the COCO class names are attached to the model.
        palette (str): Palette stored in ``model.dataset_meta``. Any value
            other than ``"none"`` overrides the palette found in the
            checkpoint meta; with ``"none"`` an existing palette is kept and
            ``"random"`` is used when there is none. Defaults to ``"none"``.
        device (str): Device the model is moved to. Defaults to ``"cuda:0"``.
        cfg_options (dict, optional): Overrides merged into ``config``. When
            given, the backbone ``init_cfg`` is left untouched.

    Returns:
        nn.Module: The built model, switched to eval mode, carrying ``cfg``
        and ``dataset_meta`` attributes.

    Raises:
        TypeError: If ``config`` is neither a path nor a ``Config`` object.
    """
    if isinstance(config, (str, Path)):
        config = Config.fromfile(config)
    elif not isinstance(config, Config):
        raise TypeError(
            "config must be a filename or Config object, " f"but got {type(config)}"
        )

    # The backbone sits either directly on the model or on a nested detector.
    # This is decided before any override is merged into the config.
    has_own_backbone = config.model.get("backbone", False)
    if cfg_options is not None:
        config.merge_from_dict(cfg_options)
    elif has_own_backbone:
        if "init_cfg" in config.model.backbone:
            config.model.backbone.init_cfg = None
    elif "init_cfg" in config.model.detector.backbone:
        config.model.detector.backbone.init_cfg = None

    scope = config.get("default_scope", "mmdet")
    if scope is not None:
        init_default_scope(scope)

    model = revert_sync_batchnorm(MODELS.build(config.model))

    if checkpoint is None:
        warnings.simplefilter("once")
        warnings.warn("checkpoint is None, use COCO classes by default.")
        model.dataset_meta = {"classes": get_classes("coco")}
    else:
        loaded = load_checkpoint(model, checkpoint, map_location="cpu")
        # Converted weights may come without a "meta" entry at all.
        meta = loaded.get("meta", {})

        if "dataset_meta" in meta:
            # mmdet 3.x layout: normalise every key to lowercase.
            model.dataset_meta = {
                key.lower(): value for key, value in meta["dataset_meta"].items()
            }
        elif "CLASSES" in meta:
            # Layout used before mmdet 3.x.
            model.dataset_meta = {"classes": meta["CLASSES"]}
        else:
            warnings.simplefilter("once")
            warnings.warn(
                "dataset_meta or class names are not saved in the "
                "checkpoint's meta data, use COCO classes by default."
            )
            model.dataset_meta = {"classes": get_classes("coco")}

    # An explicit palette wins; otherwise keep the one already present in
    # dataset_meta and fall back to "random" when there is none.
    if palette != "none":
        model.dataset_meta["palette"] = palette
    elif "palette" not in model.dataset_meta:
        warnings.warn(
            "palette does not exist, random is used by default. "
            "You can also set the palette to customize."
        )
        model.dataset_meta["palette"] = "random"

    # Keep the config on the model so inference helpers can read it back.
    model.cfg = config
    model.to(device)
    model.eval()
    return model
|
118 |
+
|
119 |
+
|
120 |
+
def inference_detector(
    model: nn.Module,
    imgs: ImagesType,
    test_pipeline: Optional[Compose] = None,
    text_prompt: Optional[str] = None,
    custom_entities: bool = False,
    fp16: bool = False,
) -> Union[DetDataSample, SampleList]:
    """Run the detector on one image or on a sequence of images.

    Args:
        model (nn.Module): The loaded detector. Its ``cfg`` attribute is read
            to build the test pipeline when none is given.
        imgs (str, ndarray, Sequence[str/ndarray]):
            Either image files or loaded images.
        test_pipeline (:obj:`Compose`, optional): Test pipeline. When None it
            is built from ``model.cfg``; if the first image is an ndarray the
            first pipeline step is switched to ``mmdet.LoadImageFromNDArray``.
        text_prompt (str, optional): When truthy, stored under ``"text"`` in
            each sample together with ``custom_entities``.
        custom_entities (bool): Stored under ``"custom_entities"`` when a
            text prompt is given. Defaults to False.
        fp16 (bool): Passed to ``autocast(enabled=...)`` around the forward
            pass. Defaults to False.

    Returns:
        :obj:`DetDataSample` or list[:obj:`DetDataSample`]:
        If imgs is a list or tuple, a list with one result per image is
        returned (empty for an empty sequence), otherwise the single
        detection result is returned directly.
    """
    is_batch = isinstance(imgs, (list, tuple))
    if not is_batch:
        imgs = [imgs]

    cfg = model.cfg

    if test_pipeline is None:
        cfg = cfg.copy()
        test_pipeline = get_test_pipeline_cfg(cfg)
        # An empty batch has no first image to inspect; skipping the probe
        # lets it return [] exactly as it does when a pipeline is supplied.
        if imgs and isinstance(imgs[0], np.ndarray):
            # Calling this method across libraries will result
            # in module unregistered error if not prefixed with mmdet.
            test_pipeline[0].type = "mmdet.LoadImageFromNDArray"

        test_pipeline = Compose(test_pipeline)

    if model.data_preprocessor.device.type == "cpu":
        for m in model.modules():
            assert not isinstance(
                m, RoIPool
            ), "CPU inference with RoIPool is not supported currently."

    result_list = []
    for img in imgs:
        # Loaded arrays go in under "img", anything else is treated as a path.
        if isinstance(img, np.ndarray):
            # TODO: remove img_id.
            data_ = dict(img=img, img_id=0)
        else:
            # TODO: remove img_id.
            data_ = dict(img_path=img, img_id=0)

        if text_prompt:
            data_["text"] = text_prompt
            data_["custom_entities"] = custom_entities

        # Run the data pipeline on this single sample.
        data_ = test_pipeline(data_)

        # test_step receives lists, so wrap the sample as a batch of one.
        data_["inputs"] = [data_["inputs"]]
        data_["data_samples"] = [data_["data_samples"]]

        # Forward the model without tracking gradients.
        with torch.no_grad():
            with autocast(enabled=fp16):
                results = model.test_step(data_)[0]

        result_list.append(results)

    if not is_batch:
        return result_list[0]
    return result_list
|
197 |
+
|
198 |
+
|
199 |
+
def inference_masa(
    model: nn.Module,
    img: np.ndarray,
    frame_id: int,
    video_len: int,
    test_pipeline: Optional[Compose] = None,
    text_prompt=None,
    custom_entities: bool = False,
    det_bboxes=None,
    det_labels=None,
    fp16=False,
    detector_type="mmdet",
    show_fps=False,
) -> SampleList:
    """Run the MASA model on a single video frame.

    Args:
        model (nn.Module): The loaded mot model.
        img (np.ndarray): Loaded image; it is cast to float32 before being
            handed to the pipeline.
        frame_id (int): Frame id. ``frame_id + 1`` is used as ``img_id``.
        video_len (int): Demo video length.
        test_pipeline (:obj:`Compose`): Pipeline applied to the frame data.
            Required despite the None default (see ``build_test_pipeline``).
        text_prompt: Optional text prompt. Stored under ``"text"`` for
            ``detector_type="mmdet"`` and under ``"texts"`` for
            ``"yolo-world"``; ignored for any other detector type.
        custom_entities (bool): Stored alongside the text prompt.
        det_bboxes: Optional detections. When not None they are attached to
            the first video data sample together with ``det_labels``.
        det_labels: Labels attached together with ``det_bboxes``.
        fp16 (bool): Passed to ``autocast(enabled=...)`` around the forward
            pass.
        detector_type (str): ``"mmdet"`` or ``"yolo-world"``; selects the key
            used for the text prompt.
        show_fps (bool): When True, also return the measured frames per
            second of the forward pass.

    Returns:
        The first element of ``model.test_step``'s output. When ``show_fps``
        is True, a ``(result, fps)`` tuple instead.

    Raises:
        TypeError: If ``test_pipeline`` is None.
    """
    if test_pipeline is None:
        # Same exception type the bare call on None used to raise, but with a
        # message that tells the caller what is missing.
        raise TypeError(
            "test_pipeline must be provided; build one with build_test_pipeline()."
        )

    data = dict(
        img=[img.astype(np.float32)],
        frame_id=[frame_id],
        ori_shape=[img.shape[:2]],
        img_id=[frame_id + 1],
        ori_video_length=[video_len],
    )

    if text_prompt is not None:
        if detector_type == "mmdet":
            data["text"] = [text_prompt]
            data["custom_entities"] = [custom_entities]
        elif detector_type == "yolo-world":
            data["texts"] = [text_prompt]
            data["custom_entities"] = [custom_entities]

    data = test_pipeline(data)

    # Forward the model without tracking gradients.
    with torch.no_grad():
        data = default_collate([data])
        if det_bboxes is not None:
            data["data_samples"][0].video_data_samples[0].det_bboxes = det_bboxes
            data["data_samples"][0].video_data_samples[0].det_labels = det_labels

        # perf_counter is monotonic and high resolution, unlike time.time().
        start = time.perf_counter()
        with autocast(enabled=fp16):
            result = model.test_step(data)[0]
        elapsed = time.perf_counter() - start

    if show_fps:
        # Guard against a zero reading from the timer instead of dividing by it.
        fps = 1 / elapsed if elapsed > 0 else float("inf")
        return result, fps
    return result
|
261 |
+
|
262 |
+
|
263 |
+
def build_test_pipeline(
    cfg: ConfigType, with_text=False, detector_type="mmdet"
) -> ConfigType:
    """Assemble the two-step pipeline used for mot/vis demo inference.

    The first step is a copy of ``cfg.inference_pipeline[0]`` whose
    ``"transforms"`` entry is reduced to resize transforms only; the second
    step is a copy of a packing transform taken from the config.

    Args:
        cfg (ConfigDict): The loaded config.
        with_text (bool): For non ``"yolo-world"`` detectors, set the packing
            step's ``meta_keys`` to ``("text", "custom_entities")``. Not
            consulted for ``"yolo-world"``.
        detector_type (str): ``"yolo-world"`` keeps the
            ``mmyolo.YOLOv5KeepRatioResize`` / ``mmyolo.LetterResize``
            transforms and packs with the last step of
            ``cfg.test_dataloader.dataset.pipeline``. Any other value keeps
            the last transform whose type contains ``"Resize"`` and packs
            with the last step of ``cfg.inference_pipeline``.

    Returns:
        The composed pipeline (a ``Compose`` instance, despite the
        ``ConfigType`` annotation).
    """
    # Work on a copy of the first step so its "transforms" key can be
    # reassigned below.
    broadcaster_cfg = cfg.inference_pipeline[0].copy()

    if detector_type == "yolo-world":
        yolo_resize_types = ("mmyolo.YOLOv5KeepRatioResize", "mmyolo.LetterResize")
        broadcaster_cfg["transforms"] = [
            step
            for step in broadcaster_cfg["transforms"]
            if step["type"] in yolo_resize_types
        ]
        packer_cfg = cfg.test_dataloader.dataset.pipeline[-1].copy()
        return Compose([broadcaster_cfg, packer_cfg])

    resize_steps = [
        step for step in broadcaster_cfg["transforms"] if "Resize" in step["type"]
    ]
    if resize_steps:
        # When several transforms match, the last one is the one kept; it is
        # stored as a single config, not wrapped in a list.
        broadcaster_cfg["transforms"] = resize_steps[-1]

    packer_cfg = cfg.inference_pipeline[-1].copy()
    if with_text:
        packer_cfg["meta_keys"] = ("text", "custom_entities")
    return Compose([broadcaster_cfg, packer_cfg])
|
masa/datasets/__init__.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Tencent Inc. All rights reserved.
|
2 |
+
from .bdd_masa_dataset import BDDVideoDataset
|
3 |
+
from .dataset_wrappers import SeqMultiImageMixDataset
|
4 |
+
from .evaluation import * # NOQA
|
5 |
+
from .masa_dataset import MASADataset
|
6 |
+
from .pipelines import * # NOQA
|
7 |
+
from .rsconcat_dataset import RandomSampleConcatDataset
|
8 |
+
from .tao_masa_dataset import Taov1Dataset, Taov05Dataset
|
9 |
+
from .utils import yolow_collate
|
10 |
+
|
11 |
+
__all__ = [
|
12 |
+
"yolow_collate",
|
13 |
+
"RandomSampleConcatDataset",
|
14 |
+
"MASADataset",
|
15 |
+
"SeqMultiImageMixDataset",
|
16 |
+
"Taov05Dataset",
|
17 |
+
"Taov1Dataset",
|
18 |
+
"BDDVideoDataset",
|
19 |
+
]
|
masa/datasets/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (771 Bytes). View file
|
|
masa/datasets/__pycache__/bdd_masa_dataset.cpython-311.pyc
ADDED
Binary file (4.74 kB). View file
|
|
masa/datasets/__pycache__/dataset_wrappers.cpython-311.pyc
ADDED
Binary file (19.5 kB). View file
|
|
masa/datasets/__pycache__/masa_dataset.cpython-311.pyc
ADDED
Binary file (12.3 kB). View file
|
|
masa/datasets/__pycache__/rsconcat_dataset.cpython-311.pyc
ADDED
Binary file (11.1 kB). View file
|
|