Seokju Cho committed on
Commit f1586f7
1 Parent(s): 058b9ed

initial commit

.gitignore ADDED
@@ -0,0 +1,162 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
app.py ADDED
@@ -0,0 +1,444 @@
1
+ import os
2
+ import sys
3
+ import uuid
4
+
5
+ import gradio as gr
6
+ import mediapy
7
+ import numpy as np
8
+ import cv2
9
+ import matplotlib
10
+ import torch
11
+
12
+ from locotrack_pytorch.models.locotrack_model import load_model
13
+ from viz_utils import paint_point_track
14
+
15
+
16
+ PREVIEW_WIDTH = 768 # Width of the preview video
17
+ VIDEO_INPUT_RESO = (256, 256) # Resolution of the input video
18
+ POINT_SIZE = 4 # Size of the query point in the preview video
19
+ FRAME_LIMIT = 300 # Limit the number of frames to process
20
+
21
+
22
+ def get_point(frame_num, video_queried_preview, query_points, query_points_color, query_count, evt: gr.SelectData):
23
+ print(f"You selected {(evt.index[0], evt.index[1], frame_num)}")
24
+
25
+ current_frame = video_queried_preview[int(frame_num)]
26
+
27
+ # Get the mouse click
28
+ query_points[int(frame_num)].append((evt.index[0], evt.index[1], frame_num))
29
+
30
+ # Choose the color for the point from matplotlib colormap
31
+ color = matplotlib.colormaps.get_cmap("gist_rainbow")(query_count % 20 / 20)
32
+ color = (int(color[0] * 255), int(color[1] * 255), int(color[2] * 255))
33
+ print(f"Color: {color}")
34
+ query_points_color[int(frame_num)].append(color)
35
+
36
+ # Draw the point on the frame
37
+ x, y = evt.index
38
+ current_frame_draw = cv2.circle(current_frame, (x, y), POINT_SIZE, color, -1)
39
+
40
+ # Update the frame
41
+ video_queried_preview[int(frame_num)] = current_frame_draw
42
+
43
+ # Update the query count
44
+ query_count += 1
45
+ return (
46
+ current_frame_draw, # Updated frame for preview
47
+ video_queried_preview, # Updated preview video
48
+ query_points, # Updated query points
49
+ query_points_color, # Updated query points color
50
+ query_count # Updated query count
51
+ )
52
+
53
+
54
+ def undo_point(frame_num, video_preview, video_queried_preview, query_points, query_points_color, query_count):
55
+ if len(query_points[int(frame_num)]) == 0:
56
+ return (
57
+ video_queried_preview[int(frame_num)],
58
+ video_queried_preview,
59
+ query_points,
60
+ query_points_color,
61
+ query_count
62
+ )
63
+
64
+ # Remove the last point and its color
65
+ query_points[int(frame_num)].pop(-1)
66
+ query_points_color[int(frame_num)].pop(-1)
67
+
68
+ # Redraw the frame
69
+ current_frame_draw = video_preview[int(frame_num)].copy()
70
+ for point, color in zip(query_points[int(frame_num)], query_points_color[int(frame_num)]):
71
+ x, y, _ = point
72
+ current_frame_draw = cv2.circle(current_frame_draw, (x, y), POINT_SIZE, color, -1)
73
+
74
+ # Update the query count
75
+ query_count -= 1
76
+
77
+ # Update the frame
78
+ video_queried_preview[int(frame_num)] = current_frame_draw
79
+ return (
80
+ current_frame_draw, # Updated frame for preview
81
+ video_queried_preview, # Updated preview video
82
+ query_points, # Updated query points
83
+ query_points_color, # Updated query points color
84
+ query_count # Updated query count
85
+ )
86
+
87
+
88
+ def clear_frame_fn(frame_num, video_preview, video_queried_preview, query_points, query_points_color, query_count):
89
+ query_count -= len(query_points[int(frame_num)])
90
+
91
+ query_points[int(frame_num)] = []
92
+ query_points_color[int(frame_num)] = []
93
+
94
+ video_queried_preview[int(frame_num)] = video_preview[int(frame_num)].copy()
95
+
96
+ return (
97
+ video_preview[int(frame_num)], # Set the preview frame to the original frame
98
+ video_queried_preview,
99
+ query_points, # Cleared query points
100
+ query_points_color, # Cleared query points color
101
+ query_count # New query count
102
+ )
103
+
104
+
105
+
106
+ def clear_all_fn(frame_num, video_preview):
107
+ return (
108
+ video_preview[int(frame_num)],
109
+ video_preview.copy(),
110
+ [[] for _ in range(len(video_preview))],
111
+ [[] for _ in range(len(video_preview))],
112
+ 0
113
+ )
114
+
115
+
116
+ def choose_frame(frame_num, video_preview_array):
117
+ return video_preview_array[int(frame_num)]
118
+
119
+
120
+ def extract_feature(video_input, model_size="small"):
121
+ device = "cuda" if torch.cuda.is_available() else "cpu"
122
+ dtype = torch.bfloat16 if device == "cuda" else torch.float16
123
+
124
+ model = load_model(model_size=model_size).to(device)
125
+
126
+ video_input = (video_input / 255.0) * 2 - 1
127
+ video_input = torch.tensor(video_input).unsqueeze(0).to(device, dtype)
128
+
129
+ with torch.autocast(device_type=device, dtype=dtype):
130
+ with torch.no_grad():
131
+ feature = model.get_feature_grids(video_input)
132
+
133
+ return feature
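+ # Note: feature grids are extracted once at upload time (see
+ # preprocess_video_input below) and stored in Gradio state, so repeated
+ # calls to track() reuse them instead of re-running the backbone.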
134
+
135
+
136
+ def preprocess_video_input(video_path, model_size):
137
+ video_arr = mediapy.read_video(video_path)
138
+ video_fps = video_arr.metadata.fps
139
+ num_frames = video_arr.shape[0]
140
+ if num_frames > FRAME_LIMIT:
141
+ gr.Warning(f"The video is too long. Only the first {FRAME_LIMIT} frames will be used.", duration=5)
142
+ video_arr = video_arr[:FRAME_LIMIT]
143
+ num_frames = FRAME_LIMIT
144
+
145
+ # Resize to preview size for faster processing, width = PREVIEW_WIDTH
146
+ height, width = video_arr.shape[1:3]
147
+ new_height, new_width = int(PREVIEW_WIDTH * height / width), PREVIEW_WIDTH
148
+
149
+ preview_video = mediapy.resize_video(video_arr, (new_height, new_width))
150
+ input_video = mediapy.resize_video(video_arr, VIDEO_INPUT_RESO)
151
+
152
+ preview_video = np.array(preview_video)
153
+ input_video = np.array(input_video)
154
+
155
+ video_feature = extract_feature(input_video, model_size)
156
+
157
+ return (
158
+ video_arr, # Original video
159
+ preview_video, # Original preview video, resized for faster processing
160
+ preview_video.copy(), # Copy of preview video for visualization
161
+ input_video, # Resized video input for model
162
+ video_feature, # Extracted feature
163
+ video_fps, # Set the video FPS
164
+ gr.update(open=False), # Close the video input drawer
165
+ model_size, # Set the model size
166
+ preview_video[0], # Set the preview frame to the first frame
167
+ gr.update(minimum=0, maximum=num_frames - 1, value=0, interactive=True), # Set slider interactive
168
+ [[] for _ in range(num_frames)], # Set query_points to empty
169
+ [[] for _ in range(num_frames)], # Set query_points_color to empty
170
+ [[] for _ in range(num_frames)],
171
+ 0, # Set query count to 0
172
+ gr.update(interactive=True), # Make the buttons interactive
173
+ gr.update(interactive=True),
174
+ gr.update(interactive=True),
175
+ gr.update(interactive=True),
176
+ )
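+ # NOTE: the order of this returned tuple must match the `outputs` list
+ # registered for video_in.upload() at the bottom of this file.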
177
+
178
+
179
+ def track(
180
+ model_size,
181
+ video_preview,
182
+ video_input,
183
+ video_feature,
184
+ video_fps,
185
+ query_points,
186
+ query_points_color,
187
+ query_count,
188
+ ):
189
+ if query_count == 0:
190
+ gr.Warning("Please add query points before tracking.", duration=5)
191
+ return None
192
+
193
+ device = "cuda" if torch.cuda.is_available() else "cpu"
194
+ dtype = torch.bfloat16 if device == "cuda" else torch.float16
195
+
196
+ # Convert query points to tensor, normalize to input resolution
197
+ query_points_tensor = []
198
+ for frame_points in query_points:
199
+ query_points_tensor.extend(frame_points)
200
+
201
+ query_points_tensor = torch.tensor(query_points_tensor).float()
202
+ query_points_tensor *= torch.tensor([
203
+ VIDEO_INPUT_RESO[1], VIDEO_INPUT_RESO[0], 1
204
+ ]) / torch.tensor([
205
+ [video_preview.shape[2], video_preview.shape[1], 1]
206
+ ])
207
+ query_points_tensor = query_points_tensor[None].flip(-1).to(device, dtype) # xyt -> tyx
208
+
209
+ # Preprocess video input
210
+ video_input = (video_input / 255.0) * 2 - 1
211
+ video_input = torch.tensor(video_input).unsqueeze(0).to(device, dtype)
212
+
213
+ model = load_model(model_size=model_size).to(device)
214
+ with torch.autocast(device_type=device, dtype=dtype):
215
+ with torch.no_grad():
216
+ output = model(video_input, query_points_tensor, feature_grids=video_feature)
217
+
218
+ tracks = output['tracks'][0].cpu()
219
+ tracks = tracks * torch.tensor([
220
+ video_preview.shape[2], video_preview.shape[1]
221
+ ]) / torch.tensor([
222
+ VIDEO_INPUT_RESO[1], VIDEO_INPUT_RESO[0]
223
+ ])
224
+ tracks = tracks.numpy()
225
+
226
+
227
+ occlusion_logits = output['occlusion']
228
+ pred_occ = torch.sigmoid(occlusion_logits)
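+ # If the model also predicts an expected localization error, fold it into
+ # the occlusion probability: a point is treated as unreliable when it is
+ # predicted occluded or its position estimate is expected to be far off.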
229
+ if 'expected_dist' in output:
230
+ expected_dist = output['expected_dist']
231
+ pred_occ = 1 - (1 - pred_occ) * (1 - torch.sigmoid(expected_dist))
232
+
233
+ pred_occ = (pred_occ > 0.5)[0].cpu().numpy()
234
+
235
+ # make color array
236
+ colors = []
237
+ for frame_colors in query_points_color:
238
+ colors.extend(frame_colors)
239
+ colors = np.array(colors)
240
+
241
+ painted_video = paint_point_track(
242
+ video_preview,
243
+ tracks,
244
+ ~pred_occ,
245
+ colors,
246
+ )
247
+
248
+ # save video
249
+ video_file_name = uuid.uuid4().hex + ".mp4"
250
+ video_path = os.path.join(os.path.dirname(__file__), "tmp")
251
+ video_file_path = os.path.join(video_path, video_file_name)
252
+ os.makedirs(video_path, exist_ok=True)
253
+
254
+ mediapy.write_video(video_file_path, painted_video, fps=video_fps)
255
+
256
+ return video_file_path
257
+
258
+
259
+ with gr.Blocks() as demo:
260
+ video = gr.State()
261
+ video_queried_preview = gr.State()
262
+ video_preview = gr.State()
263
+ video_input = gr.State()
264
+ video_feature = gr.State()
265
+ video_fps = gr.State(24)
266
+ model_size = gr.State("small")
267
+
268
+ query_points = gr.State([])
269
+ query_points_color = gr.State([])
270
+ is_tracked_query = gr.State([])
271
+ query_count = gr.State(0)
272
+
273
+ gr.Markdown("# LocoTrack Demo")
274
+ gr.Markdown("This is an interactive demo for LocoTrack. For more details, please refer to the [GitHub repository](https://github.com/KU-CVLAB/LocoTrack) or the [paper](https://arxiv.org/abs/2407.15420).")
275
+
276
+ gr.Markdown("## First step: Choose the model size and upload your video")
277
+ with gr.Row():
278
+ with gr.Accordion("Your video input", open=True) as video_in_drawer:
279
+ model_size_selection = gr.Radio(
280
+ label="Model Size",
281
+ choices=["small", "base"],
282
+ value="small",
283
+ )
284
+ video_in = gr.Video(label="Video Input", format="mp4")
285
+
286
+ gr.Markdown("## Second step: Add query points to track")
287
+ with gr.Row():
288
+
289
+ with gr.Column():
290
+ with gr.Row():
291
+ query_frames = gr.Slider(
292
+ minimum=0, maximum=100, value=0, step=1, label="Choose Frame", interactive=False)
293
+ with gr.Row():
294
+ undo = gr.Button("Undo", interactive=False)
295
+ clear_frame = gr.Button("Clear Frame", interactive=False)
296
+ clear_all = gr.Button("Clear All", interactive=False)
297
+
298
+ with gr.Row():
299
+ current_frame = gr.Image(
300
+ label="Click to add query points",
301
+ type="numpy",
302
+ interactive=False
303
+ )
304
+
305
+ with gr.Row():
306
+ track_button = gr.Button("Track", interactive=False)
307
+
308
+ with gr.Column():
309
+ output_video = gr.Video(
310
+ label="Output Video",
311
+ interactive=False,
312
+ autoplay=True,
313
+ loop=True,
314
+ )
315
+
316
+ video_in.upload(
317
+ fn = preprocess_video_input,
318
+ inputs = [video_in, model_size_selection],
319
+ outputs = [
320
+ video,
321
+ video_preview,
322
+ video_queried_preview,
323
+ video_input,
324
+ video_feature,
325
+ video_fps,
326
+ video_in_drawer,
327
+ model_size,
328
+ current_frame,
329
+ query_frames,
330
+ query_points,
331
+ query_points_color,
332
+ is_tracked_query,
333
+ query_count,
334
+ undo,
335
+ clear_frame,
336
+ clear_all,
337
+ track_button,
338
+ ],
339
+ queue = False
340
+ )
341
+
342
+ query_frames.change(
343
+ fn = choose_frame,
344
+ inputs = [query_frames, video_queried_preview],
345
+ outputs = [
346
+ current_frame,
347
+ ],
348
+ queue = False
349
+ )
350
+
351
+ current_frame.select(
352
+ fn = get_point,
353
+ inputs = [
354
+ query_frames,
355
+ video_queried_preview,
356
+ query_points,
357
+ query_points_color,
358
+ query_count,
359
+ ],
360
+ outputs = [
361
+ current_frame,
362
+ video_queried_preview,
363
+ query_points,
364
+ query_points_color,
365
+ query_count
366
+ ],
367
+ queue = False
368
+ )
369
+
370
+ undo.click(
371
+ fn = undo_point,
372
+ inputs = [
373
+ query_frames,
374
+ video_preview,
375
+ video_queried_preview,
376
+ query_points,
377
+ query_points_color,
378
+ query_count
379
+ ],
380
+ outputs = [
381
+ current_frame,
382
+ video_queried_preview,
383
+ query_points,
384
+ query_points_color,
385
+ query_count
386
+ ],
387
+ queue = False
388
+ )
389
+
390
+ clear_frame.click(
391
+ fn = clear_frame_fn,
392
+ inputs = [
393
+ query_frames,
394
+ video_preview,
395
+ video_queried_preview,
396
+ query_points,
397
+ query_points_color,
398
+ query_count
399
+ ],
400
+ outputs = [
401
+ current_frame,
402
+ video_queried_preview,
403
+ query_points,
404
+ query_points_color,
405
+ query_count
406
+ ],
407
+ queue = False
408
+ )
409
+
410
+ clear_all.click(
411
+ fn = clear_all_fn,
412
+ inputs = [
413
+ query_frames,
414
+ video_preview,
415
+ ],
416
+ outputs = [
417
+ current_frame,
418
+ video_queried_preview,
419
+ query_points,
420
+ query_points_color,
421
+ query_count
422
+ ],
423
+ queue = False
424
+ )
425
+
426
+ track_button.click(
427
+ fn = track,
428
+ inputs = [
429
+ model_size,
430
+ video_preview,
431
+ video_input,
432
+ video_feature,
433
+ video_fps,
434
+ query_points,
435
+ query_points_color,
436
+ query_count,
437
+ ],
438
+ outputs = [
439
+ output_video,
440
+ ],
441
+ queue = True,
442
+ )
443
+
444
+ demo.launch(show_api=False, show_error=True, debug=True)
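+ # Running `python app.py` serves the demo locally (Gradio's default address is
+ # http://127.0.0.1:7860); API docs are hidden and errors are surfaced in the UI
+ # per the launch() arguments above.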
locotrack_pytorch/README.md ADDED
@@ -0,0 +1,62 @@
1
+ # PyTorch Implementation of LocoTrack
2
+
3
+ ## Preparing the Environment
4
+
5
+ ```bash
6
+ git clone https://github.com/google-research/kubric.git
7
+
8
+ conda create -n locotrack-pytorch python=3.11
9
+ conda activate locotrack-pytorch
10
+
11
+ pip install torch torchvision torchaudio lightning==2.3.3 tensorflow_datasets tensorflow matplotlib mediapy tensorflow_graphics einshape wandb
12
+ ```
13
+
14
+ ## LocoTrack Evaluation
15
+
16
+ ### 1. Download Pre-trained Weights
17
+
18
+ To evaluate LocoTrack on the benchmarks, first download the pre-trained weights.
19
+
20
+ | Model | Pre-trained Weights |
21
+ |-------------|---------------------|
22
+ | LocoTrack-S | [Link](https://huggingface.co/datasets/hamacojr/LocoTrack-pytorch-weights/resolve/main/locotrack_small.ckpt) |
23
+ | LocoTrack-B | [Link](https://huggingface.co/datasets/hamacojr/LocoTrack-pytorch-weights/resolve/main/locotrack_base.ckpt) |
24
+
25
+ ### 2. Adjust the Config File
26
+
27
+ In `config/default.ini` (or any other config file), add the path to the evaluation datasets to `[TRAINING]-val_dataset_path`. Additionally, adjust the model size for evaluation in `[MODEL]-model_kwargs-model_size`.
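+ For example, to evaluate LocoTrack-S, change the entry to `model_kwargs = {"model_size": "small", ...}`; the provided `config/default.ini` ships with `"model_size": "base"`.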
28
+
29
+ ### 3. Run Evaluation
30
+
31
+ To evaluate the LocoTrack model, use the `experiment.py` script with the following command-line arguments:
32
+
33
+ ```bash
34
+ python experiment.py --config config/default.ini --mode eval_{dataset_to_eval_1}_..._{dataset_to_eval_N}[_q_first] --ckpt_path /path/to/checkpoint --save_path ./path_to_save_checkpoints/
35
+ ```
36
+
37
+ - `--config`: Specifies the path to the configuration file. Default is `config/default.ini`.
38
+ - `--mode`: Specifies the mode to run the script. Use `eval` followed by the names of the evaluation datasets, optionally appending `q_first` for query-first mode. For example:
39
+ - Evaluation of the DAVIS dataset: `eval_davis`
40
+ - Evaluation of DAVIS and RoboTAP in query first mode: `eval_davis_robotap_q_first`
41
+ - `--ckpt_path`: Specifies the path to the checkpoint file. If not provided, the script will use the default checkpoint.
42
+ - `--save_path`: Specifies the path to save logs.
43
+
44
+ Replace `/path/to/checkpoint` with the actual path to your checkpoint file. This command will run the evaluation process and save the results in the specified `save_path`.
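+ For example, assuming the LocoTrack-S weights were downloaded as `./locotrack_small.ckpt`, the command `python experiment.py --config config/default.ini --mode eval_davis_q_first --ckpt_path ./locotrack_small.ckpt --save_path ./eval_logs/` evaluates on DAVIS in query-first mode and writes the logs under `./eval_logs/`.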
45
+
46
+ ## LocoTrack Training
47
+
48
+ ### Training Dataset Preparation
49
+
50
+ Download the panning-MOVi-E dataset used for training (approximately 273 GB) from Hugging Face with the following command. Git LFS must be installed to download the dataset; to install it, please refer to this [link](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage?platform=linux). General instructions for downloading Hugging Face datasets are available at this [link](https://huggingface.co/docs/hub/en/datasets-downloading).
51
+
52
+ ```bash
53
+ git clone git@hf.co:datasets/hamacojr/LocoTrack-panning-MOVi-E
54
+ ```
55
+
56
+ ### Training Script
57
+
58
+ Add the path to the downloaded panning-MOVi-E to the `[TRAINING]-kubric_dir` entry in `config/default.ini` (or any other config file). Optionally, for efficient training, change `[TRAINING]-precision` in the config file to `bf16-mixed` to use `bfloat16`. Then, run the training with the following script:
59
+
60
+ ```bash
61
+ python experiment.py --config config/default.ini --mode train_davis --save_path ./path_to_save_checkpoints/
62
+ ```
locotrack_pytorch/config/default.ini ADDED
@@ -0,0 +1,25 @@
1
+ [TRAINING]
2
+ val_dataset_path = {"davis": "/home/seokjuc/sensei-fs-link/tapvid/tapvid_davis/tapvid_davis.pkl", "robotics": "", "kinetics": "", "robotap": ""}
3
+ kubric_dir = ./kubric
4
+ precision = 32
5
+ batch_size = 4
6
+ val_check_interval = 1000
7
+ log_every_n_steps = 5
8
+ gradient_clip_val = 1.0
9
+ max_steps = 300000
10
+
11
+ [MODEL]
12
+ model_kwargs = {"model_size": "base", "num_pips_iter": 4}
13
+ model_forward_kwargs = {"refinement_resolutions": ((256, 256),), "query_chunk_size": 256}
14
+
15
+ [LOSS]
16
+ loss_name = tapir_loss
17
+ loss_kwargs = {}
18
+
19
+ [OPTIMIZER]
20
+ optimizer_name = AdamW
21
+ optimizer_kwargs = {"lr": 1e-3, "weight_decay": 1e-3, "betas": (0.9, 0.95)}
22
+
23
+ [SCHEDULER]
24
+ scheduler_name = OneCycleLR
25
+ scheduler_kwargs = {"max_lr": 1e-3, "pct_start": 0.003, "total_steps": 300000}
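The option values above are Python literals embedded in an INI file. Below is a minimal sketch of how such entries could be loaded, assuming a `configparser` + `ast.literal_eval` approach; the actual parsing in `experiment.py` may differ.

```python
import ast
import configparser

config = configparser.ConfigParser()
config.read("config/default.ini")

def parse(value: str):
    """Turn a config value into a Python object, falling back to the raw
    string when it is not a literal (e.g. kubric_dir, loss_name)."""
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return value

model_kwargs = parse(config["MODEL"]["model_kwargs"])        # {'model_size': 'base', 'num_pips_iter': 4}
optimizer_kwargs = parse(config["OPTIMIZER"]["optimizer_kwargs"])
print(model_kwargs["model_size"], optimizer_kwargs["lr"])
```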
locotrack_pytorch/data/evaluation_datasets.py ADDED
@@ -0,0 +1,784 @@
1
+ # Copyright 2024 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Evaluation dataset creation functions."""
17
+
18
+ import csv
19
+ import functools
20
+ import io
21
+ import os
22
+ from os import path
23
+ import pickle
24
+ import random
25
+ from typing import Iterable, Mapping, Optional, Tuple, Union
26
+
27
+ from absl import logging
28
+
29
+ import mediapy as media
30
+ import numpy as np
31
+ from PIL import Image
32
+ import scipy.io as sio
33
+ import tensorflow as tf
34
+ import tensorflow_datasets as tfds
35
+
36
+ from models.utils import convert_grid_coordinates
37
+
38
+ DatasetElement = Mapping[str, Mapping[str, Union[np.ndarray, str]]]
39
+
40
+
41
+ def resize_video(video: np.ndarray, output_size: Tuple[int, int]) -> np.ndarray:
42
+ """Resize a video to output_size."""
43
+ # If you have a GPU, consider replacing this with a GPU-enabled resize op,
44
+ # such as a jitted jax.image.resize. It will make things faster.
45
+ return media.resize_video(video, output_size)
46
+
47
+
48
+ def compute_tapvid_metrics(
49
+ query_points: np.ndarray,
50
+ gt_occluded: np.ndarray,
51
+ gt_tracks: np.ndarray,
52
+ pred_occluded: np.ndarray,
53
+ pred_tracks: np.ndarray,
54
+ query_mode: str,
55
+ get_trackwise_metrics: bool = False,
56
+ ) -> Mapping[str, np.ndarray]:
57
+ """Computes TAP-Vid metrics (Jaccard, Pts.
58
+
59
+ Within Thresh, Occ.
60
+
61
+ Acc.)
62
+
63
+ See the TAP-Vid paper for details on the metric computation. All inputs are
64
+ given in raster coordinates. The first three arguments should be the direct
65
+ outputs of the reader: the 'query_points', 'occluded', and 'target_points'.
66
+ The paper metrics assume these are scaled relative to 256x256 images.
67
+ pred_occluded and pred_tracks are your algorithm's predictions.
68
+
69
+ This function takes a batch of inputs, and computes metrics separately for
70
+ each video. The metrics for the full benchmark are a simple mean of the
71
+ metrics across the full set of videos. These numbers are between 0 and 1,
72
+ but the paper multiplies them by 100 to ease reading.
73
+
74
+ Args:
75
+ query_points: The query points, in the format [t, y, x]. Its size is
76
+ [b, n, 3], where b is the batch size and n is the number of queries
77
+ gt_occluded: A boolean array of shape [b, n, t], where t is the number of
78
+ frames. True indicates that the point is occluded.
79
+ gt_tracks: The target points, of shape [b, n, t, 2]. Each point is in the
80
+ format [x, y]
81
+ pred_occluded: A boolean array of predicted occlusions, in the same format
82
+ as gt_occluded.
83
+ pred_tracks: An array of track predictions from your algorithm, in the same
84
+ format as gt_tracks.
85
+ query_mode: Either 'first' or 'strided', depending on how queries are
86
+ sampled. If 'first', we assume the prior knowledge that all points
87
+ before the query point are occluded, and these are removed from the
88
+ evaluation.
89
+ get_trackwise_metrics: if True, the metrics will be computed for every
90
+ track (rather than every video, which is the default). This means
91
+ every output tensor will have an extra axis [batch, num_tracks] rather
92
+ than simply (batch).
93
+
94
+ Returns:
95
+ A dict with the following keys:
96
+
97
+ occlusion_accuracy: Accuracy at predicting occlusion.
98
+ pts_within_{x} for x in [1, 2, 4, 8, 16]: Fraction of points
99
+ predicted to be within the given pixel threshold, ignoring occlusion
100
+ prediction.
101
+ jaccard_{x} for x in [1, 2, 4, 8, 16]: Jaccard metric for the given
102
+ threshold
103
+ average_pts_within_thresh: average across pts_within_{x}
104
+ average_jaccard: average across jaccard_{x}
105
+ """
106
+
107
+ summing_axis = (2,) if get_trackwise_metrics else (1, 2)
108
+
109
+ metrics = {}
110
+
111
+ eye = np.eye(gt_tracks.shape[2], dtype=np.int32)
112
+ if query_mode == 'first':
113
+ # evaluate frames after the query frame
114
+ query_frame_to_eval_frames = np.cumsum(eye, axis=1) - eye
115
+ elif query_mode == 'strided':
116
+ # evaluate all frames except the query frame
117
+ query_frame_to_eval_frames = 1 - eye
118
+ else:
119
+ raise ValueError('Unknown query mode ' + query_mode)
120
+
121
+ query_frame = query_points[..., 0]
122
+ query_frame = np.round(query_frame).astype(np.int32)
123
+ evaluation_points = query_frame_to_eval_frames[query_frame] > 0
124
+
125
+ # Occlusion accuracy is simply how often the predicted occlusion equals the
126
+ # ground truth.
127
+ occ_acc = np.sum(
128
+ np.equal(pred_occluded, gt_occluded) & evaluation_points,
129
+ axis=summing_axis,
130
+ ) / np.sum(evaluation_points, axis=summing_axis)
131
+ metrics['occlusion_accuracy'] = occ_acc
132
+
133
+ # Next, convert the predictions and ground truth positions into pixel
134
+ # coordinates.
135
+ visible = np.logical_not(gt_occluded)
136
+ pred_visible = np.logical_not(pred_occluded)
137
+ all_frac_within = []
138
+ all_jaccard = []
139
+ for thresh in [1, 2, 4, 8, 16]:
140
+ # True positives are points that are within the threshold and where both
141
+ # the prediction and the ground truth are listed as visible.
142
+ within_dist = np.sum(
143
+ np.square(pred_tracks - gt_tracks),
144
+ axis=-1,
145
+ ) < np.square(thresh)
146
+ is_correct = np.logical_and(within_dist, visible)
147
+
148
+ # Compute the frac_within_threshold, which is the fraction of points
149
+ # within the threshold among points that are visible in the ground truth,
150
+ # ignoring whether they're predicted to be visible.
151
+ count_correct = np.sum(
152
+ is_correct & evaluation_points,
153
+ axis=summing_axis,
154
+ )
155
+ count_visible_points = np.sum(
156
+ visible & evaluation_points, axis=summing_axis
157
+ )
158
+ frac_correct = count_correct / count_visible_points
159
+ metrics['pts_within_' + str(thresh)] = frac_correct
160
+ all_frac_within.append(frac_correct)
161
+
162
+ true_positives = np.sum(
163
+ is_correct & pred_visible & evaluation_points, axis=summing_axis
164
+ )
165
+
166
+ # The denominator of the jaccard metric is the true positives plus
167
+ # false positives plus false negatives. However, note that true positives
168
+ # plus false negatives is simply the number of points in the ground truth
169
+ # which is easier to compute than trying to compute all three quantities.
170
+ # Thus we just add the number of points in the ground truth to the number
171
+ # of false positives.
172
+ #
173
+ # False positives are simply points that are predicted to be visible,
174
+ # but the ground truth is not visible or too far from the prediction.
175
+ gt_positives = np.sum(visible & evaluation_points, axis=summing_axis)
176
+ false_positives = (~visible) & pred_visible
177
+ false_positives = false_positives | ((~within_dist) & pred_visible)
178
+ false_positives = np.sum(
179
+ false_positives & evaluation_points, axis=summing_axis
180
+ )
181
+ jaccard = true_positives / (gt_positives + false_positives)
182
+ metrics['jaccard_' + str(thresh)] = jaccard
183
+ all_jaccard.append(jaccard)
184
+ metrics['average_jaccard'] = np.mean(
185
+ np.stack(all_jaccard, axis=1),
186
+ axis=1,
187
+ )
188
+ metrics['average_pts_within_thresh'] = np.mean(
189
+ np.stack(all_frac_within, axis=1),
190
+ axis=1,
191
+ )
192
+ return metrics
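+ # Usage sketch (shapes are illustrative): for a batch of 1 video with 10 query
+ # points over 24 frames,
+ # query_points has shape (1, 10, 3) in [t, y, x],
+ # gt_occluded / pred_occluded have shape (1, 10, 24),
+ # gt_tracks / pred_tracks have shape (1, 10, 24, 2) in [x, y],
+ # and compute_tapvid_metrics(..., query_mode='strided') returns per-video
+ # scalars such as 'average_jaccard', 'average_pts_within_thresh' and
+ # 'occlusion_accuracy'.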
193
+
194
+
195
+ def latex_table(mean_scalars: Mapping[str, float]) -> str:
196
+ """Generate a latex table for displaying TAP-Vid and PCK metrics."""
197
+ if 'average_jaccard' in mean_scalars:
198
+ latex_fields = [
199
+ 'average_jaccard',
200
+ 'average_pts_within_thresh',
201
+ 'occlusion_accuracy',
202
+ 'jaccard_1',
203
+ 'jaccard_2',
204
+ 'jaccard_4',
205
+ 'jaccard_8',
206
+ 'jaccard_16',
207
+ 'pts_within_1',
208
+ 'pts_within_2',
209
+ 'pts_within_4',
210
+ 'pts_within_8',
211
+ 'pts_within_16',
212
+ ]
213
+ header = (
214
+ 'AJ & $<\\delta^{x}_{avg}$ & OA & Jac. $\\delta^{0}$ & '
215
+ + 'Jac. $\\delta^{1}$ & Jac. $\\delta^{2}$ & '
216
+ + 'Jac. $\\delta^{3}$ & Jac. $\\delta^{4}$ & $<\\delta^{0}$ & '
217
+ + '$<\\delta^{1}$ & $<\\delta^{2}$ & $<\\delta^{3}$ & '
218
+ + '$<\\delta^{4}$'
219
+ )
220
+ else:
221
+ latex_fields = ['PCK@0.1', 'PCK@0.2', 'PCK@0.3', 'PCK@0.4', 'PCK@0.5']
222
+ header = ' & '.join(latex_fields)
223
+
224
+ body = ' & '.join(
225
+ [f'{float(np.array(mean_scalars[x]*100)):.3}' for x in latex_fields]
226
+ )
227
+ return '\n'.join([header, body])
228
+
229
+
230
+ def sample_queries_strided(
231
+ target_occluded: np.ndarray,
232
+ target_points: np.ndarray,
233
+ frames: np.ndarray,
234
+ query_stride: int = 5,
235
+ ) -> Mapping[str, np.ndarray]:
236
+ """Package a set of frames and tracks for use in TAPNet evaluations.
237
+
238
+ Given a set of frames and tracks with no query points, sample queries
239
+ strided every query_stride frames, ignoring points that are not visible
240
+ at the selected frames.
241
+
242
+ Args:
243
+ target_occluded: Boolean occlusion flag, of shape [n_tracks, n_frames],
244
+ where True indicates occluded.
245
+ target_points: Position, of shape [n_tracks, n_frames, 2], where each point
246
+ is [x,y] scaled between 0 and 1.
247
+ frames: Video tensor, of shape [n_frames, height, width, 3]. Scaled between
248
+ -1 and 1.
249
+ query_stride: When sampling query points, search for un-occluded points
250
+ every query_stride frames and convert each one into a query.
251
+
252
+ Returns:
253
+ A dict with the keys:
254
+ video: Video tensor of shape [1, n_frames, height, width, 3]. The video
255
+ has floats scaled to the range [-1, 1].
256
+ query_points: Query points of shape [1, n_queries, 3] where
257
+ each point is [t, y, x] scaled to the range [-1, 1].
258
+ target_points: Target points of shape [1, n_queries, n_frames, 2] where
259
+ each point is [x, y] scaled to the range [-1, 1].
260
+ trackgroup: Index of the original track that each query point was
261
+ sampled from. This is useful for visualization.
262
+ """
263
+ tracks = []
264
+ occs = []
265
+ queries = []
266
+ trackgroups = []
267
+ total = 0
268
+ trackgroup = np.arange(target_occluded.shape[0])
269
+ for i in range(0, target_occluded.shape[1], query_stride):
270
+ mask = target_occluded[:, i] == 0
271
+ query = np.stack(
272
+ [
273
+ i * np.ones(target_occluded.shape[0:1]),
274
+ target_points[:, i, 1],
275
+ target_points[:, i, 0],
276
+ ],
277
+ axis=-1,
278
+ )
279
+ queries.append(query[mask])
280
+ tracks.append(target_points[mask])
281
+ occs.append(target_occluded[mask])
282
+ trackgroups.append(trackgroup[mask])
283
+ total += np.array(np.sum(target_occluded[:, i] == 0))
284
+
285
+ return {
286
+ 'video': frames[np.newaxis, ...],
287
+ 'query_points': np.concatenate(queries, axis=0)[np.newaxis, ...],
288
+ 'target_points': np.concatenate(tracks, axis=0)[np.newaxis, ...],
289
+ 'occluded': np.concatenate(occs, axis=0)[np.newaxis, ...],
290
+ 'trackgroup': np.concatenate(trackgroups, axis=0)[np.newaxis, ...],
291
+ }
292
+
293
+
294
+ def sample_queries_first(
295
+ target_occluded: np.ndarray,
296
+ target_points: np.ndarray,
297
+ frames: np.ndarray,
298
+ ) -> Mapping[str, np.ndarray]:
299
+ """Package a set of frames and tracks for use in TAPNet evaluations.
300
+
301
+ Given a set of frames and tracks with no query points, use the first
302
+ visible point in each track as the query.
303
+
304
+ Args:
305
+ target_occluded: Boolean occlusion flag, of shape [n_tracks, n_frames],
306
+ where True indicates occluded.
307
+ target_points: Position, of shape [n_tracks, n_frames, 2], where each point
308
+ is [x,y] scaled between 0 and 1.
309
+ frames: Video tensor, of shape [n_frames, height, width, 3]. Scaled between
310
+ -1 and 1.
311
+
312
+ Returns:
313
+ A dict with the keys:
314
+ video: Video tensor of shape [1, n_frames, height, width, 3]
315
+ query_points: Query points of shape [1, n_queries, 3] where
316
+ each point is [t, y, x] scaled to the range [-1, 1]
317
+ target_points: Target points of shape [1, n_queries, n_frames, 2] where
318
+ each point is [x, y] scaled to the range [-1, 1]
319
+ """
320
+
321
+ valid = np.sum(~target_occluded, axis=1) > 0
322
+ target_points = target_points[valid, :]
323
+ target_occluded = target_occluded[valid, :]
324
+
325
+ query_points = []
326
+ for i in range(target_points.shape[0]):
327
+ index = np.where(target_occluded[i] == 0)[0][0]
328
+ x, y = target_points[i, index, 0], target_points[i, index, 1]
329
+ query_points.append(np.array([index, y, x])) # [t, y, x]
330
+ query_points = np.stack(query_points, axis=0)
331
+
332
+ return {
333
+ 'video': frames[np.newaxis, ...],
334
+ 'query_points': query_points[np.newaxis, ...],
335
+ 'target_points': target_points[np.newaxis, ...],
336
+ 'occluded': target_occluded[np.newaxis, ...],
337
+ }
338
+
339
+
340
+ def create_jhmdb_dataset(
341
+ jhmdb_path: str, resolution: Optional[Tuple[int, int]] = (256, 256)
342
+ ) -> Iterable[DatasetElement]:
343
+ """JHMDB dataset, including fields required for PCK evaluation."""
344
+ gt_dir = jhmdb_path  # root directory containing 'splits', 'joint_positions' and 'Rename_Images'
+ videos = []
345
+ for file in tf.io.gfile.listdir(path.join(gt_dir, 'splits')):
346
+ # JHMDB file containing the first split, which is standard for this type of
347
+ # evaluation.
348
+ if not file.endswith('split1.txt'):
349
+ continue
350
+
351
+ video_folder = '_'.join(file.split('_')[:-2])
352
+ for video in tf.io.gfile.GFile(path.join(gt_dir, 'splits', file), 'r'):
353
+ video, traintest = video.split()
354
+ video, _ = video.split('.')
355
+
356
+ traintest = int(traintest)
357
+ video_path = path.join(video_folder, video)
358
+
359
+ if traintest == 2:
360
+ videos.append(video_path)
361
+
362
+ if not videos:
363
+ raise ValueError('No JHMDB videos found in directory ' + str(jhmdb_path))
364
+
365
+ # Shuffle so numbers converge faster.
366
+ random.shuffle(videos)
367
+
368
+ for video in videos:
369
+ logging.info(video)
370
+ joints = path.join(gt_dir, 'joint_positions', video, 'joint_positions.mat')
371
+
372
+ if not tf.io.gfile.exists(joints):
373
+ logging.info('skip %s', video)
374
+ continue
375
+
376
+ gt_pose = sio.loadmat(tf.io.gfile.GFile(joints, 'rb'))['pos_img']
377
+ gt_pose = np.transpose(gt_pose, [1, 2, 0])
378
+ frames = path.join(gt_dir, 'Rename_Images', video, '*.png')
379
+ framefil = tf.io.gfile.glob(frames)
380
+ framefil.sort()
381
+
382
+ def read_frame(f):
383
+ im = Image.open(tf.io.gfile.GFile(f, 'rb'))
384
+ im = im.convert('RGB')
385
+ im_data = np.array(im.getdata(), np.uint8)
386
+ return im_data.reshape([im.size[1], im.size[0], 3])
387
+
388
+ frames = [read_frame(x) for x in framefil]
389
+ frames = np.stack(frames)
390
+ height = frames.shape[1]
391
+ width = frames.shape[2]
392
+ invalid_x = np.logical_or(
393
+ gt_pose[:, 0:1, 0] < 0,
394
+ gt_pose[:, 0:1, 0] >= width,
395
+ )
396
+ invalid_y = np.logical_or(
397
+ gt_pose[:, 0:1, 1] < 0,
398
+ gt_pose[:, 0:1, 1] >= height,
399
+ )
400
+ invalid = np.logical_or(invalid_x, invalid_y)
401
+ invalid = np.tile(invalid, [1, gt_pose.shape[1]])
402
+ invalid = invalid[:, :, np.newaxis].astype(np.float32)
403
+ gt_pose_orig = gt_pose
404
+
405
+ if resolution is not None and resolution != frames.shape[1:3]:
406
+ frames = resize_video(frames, resolution)
407
+ frames = frames / (255.0 / 2.0) - 1.0
408
+ queries = gt_pose[:, 0]
409
+ queries = np.concatenate(
410
+ [queries[..., 0:1] * 0, queries[..., ::-1]],
411
+ axis=-1,
412
+ )
413
+ gt_pose = convert_grid_coordinates(
414
+ gt_pose,
415
+ np.array([width, height]),
416
+ np.array([frames.shape[2], frames.shape[1]]),
417
+ )
418
+ # Set invalid poses to -1 (outside the frame)
419
+ gt_pose = (1.0 - invalid) * gt_pose + invalid * (-1.0)
420
+
421
+ if gt_pose.shape[1] < frames.shape[0]:
422
+ # Some videos have pose sequences that are shorter than the frame
423
+ # sequence (usually because the person disappears). In this case,
424
+ # truncate the video.
425
+ logging.warning('short video!!')
426
+ frames = frames[: gt_pose.shape[1]]
427
+
428
+ converted = {
429
+ 'video': frames[np.newaxis, ...],
430
+ 'query_points': queries[np.newaxis, ...],
431
+ 'target_points': gt_pose[np.newaxis, ...],
432
+ 'gt_pose': gt_pose[np.newaxis, ...],
433
+ 'gt_pose_orig': gt_pose_orig[np.newaxis, ...],
434
+ 'occluded': gt_pose[np.newaxis, ..., 0] * 0,
435
+ 'fname': video,
436
+ 'im_size': np.array([height, width]),
437
+ }
438
+ yield {'jhmdb': converted}
439
+
440
+
441
+ def create_kubric_eval_train_dataset(
442
+ mode: str,
443
+ train_size: Tuple[int, int] = (256, 256),
444
+ max_dataset_size: int = 100,
445
+ ) -> Iterable[DatasetElement]:
446
+ """Dataset for evaluating performance on Kubric training data."""
447
+
448
+ # Lazy import kubric because requirements_inference doesn't include it.
449
+ from kubric.challenges.point_tracking import dataset
450
+ res = dataset.create_point_tracking_dataset(
451
+ split='train',
452
+ train_size=train_size,
453
+ batch_dims=[1],
454
+ shuffle_buffer_size=None,
455
+ repeat=False,
456
+ vflip='vflip' in mode,
457
+ random_crop=False,
458
+ )
459
+ np_ds = tfds.as_numpy(res)
460
+
461
+ num_returned = 0
462
+ for data in np_ds:
463
+ if num_returned >= max_dataset_size:
464
+ break
465
+ num_returned += 1
466
+ yield {'kubric': data}
467
+
468
+
469
+ def create_kubric_eval_dataset(
470
+ mode: str, train_size: Tuple[int, int] = (256, 256)
471
+ ) -> Iterable[DatasetElement]:
472
+ """Dataset for evaluating performance on Kubric val data."""
473
+ # Lazy import kubric because requirements_inference doesn't include it.
474
+ from kubric.challenges.point_tracking import dataset
475
+ res = dataset.create_point_tracking_dataset(
476
+ split='validation',
477
+ train_size=train_size,
478
+ batch_dims=[1],
479
+ shuffle_buffer_size=None,
480
+ repeat=False,
481
+ vflip='vflip' in mode,
482
+ random_crop=False,
483
+ )
484
+ np_ds = tfds.as_numpy(res)
485
+
486
+ for data in np_ds:
487
+ yield {'kubric': data}
488
+
489
+
490
+ def create_davis_dataset(
491
+ davis_points_path: str,
492
+ query_mode: str = 'strided',
493
+ full_resolution=False,
494
+ resolution: Optional[Tuple[int, int]] = (256, 256),
495
+ ) -> Iterable[DatasetElement]:
496
+ """Dataset for evaluating performance on DAVIS data."""
497
+ pickle_path = davis_points_path
498
+
499
+ with tf.io.gfile.GFile(pickle_path, 'rb') as f:
500
+ davis_points_dataset = pickle.load(f)
501
+
502
+ if full_resolution:
503
+ ds, _ = tfds.load(
504
+ 'davis/full_resolution', split='validation', with_info=True
505
+ )
506
+ to_iterate = tfds.as_numpy(ds)
507
+ else:
508
+ to_iterate = davis_points_dataset.keys()
509
+
510
+ for tmp in to_iterate:
511
+ if full_resolution:
512
+ frames = tmp['video']['frames']
513
+ video_name = tmp['metadata']['video_name'].decode()
514
+ else:
515
+ video_name = tmp
516
+ frames = davis_points_dataset[video_name]['video']
517
+ if resolution is not None and resolution != frames.shape[1:3]:
518
+ frames = resize_video(frames, resolution)
519
+
520
+ frames = frames.astype(np.float32) / 255.0 * 2.0 - 1.0
521
+ target_points = davis_points_dataset[video_name]['points']
522
+ target_occ = davis_points_dataset[video_name]['occluded']
523
+ target_points = target_points * np.array([frames.shape[2], frames.shape[1]])
524
+
525
+ if query_mode == 'strided':
526
+ converted = sample_queries_strided(target_occ, target_points, frames)
527
+ elif query_mode == 'first':
528
+ converted = sample_queries_first(target_occ, target_points, frames)
529
+ else:
530
+ raise ValueError(f'Unknown query mode {query_mode}.')
531
+
532
+ yield {'davis': converted}
533
+
534
+
535
+ def create_rgb_stacking_dataset(
536
+ robotics_points_path: str,
537
+ query_mode: str = 'strided',
538
+ resolution: Optional[Tuple[int, int]] = (256, 256),
539
+ ) -> Iterable[DatasetElement]:
540
+ """Dataset for evaluating performance on robotics data."""
541
+ pickle_path = robotics_points_path
542
+
543
+ with tf.io.gfile.GFile(pickle_path, 'rb') as f:
544
+ robotics_points_dataset = pickle.load(f)
545
+
546
+ for example in robotics_points_dataset:
547
+ frames = example['video']
548
+ if resolution is not None and resolution != frames.shape[1:3]:
549
+ frames = resize_video(frames, resolution)
550
+ frames = frames.astype(np.float32) / 255.0 * 2.0 - 1.0
551
+ target_points = example['points']
552
+ target_occ = example['occluded']
553
+ target_points = target_points * np.array([frames.shape[2], frames.shape[1]])
554
+
555
+ if query_mode == 'strided':
556
+ converted = sample_queries_strided(target_occ, target_points, frames)
557
+ elif query_mode == 'first':
558
+ converted = sample_queries_first(target_occ, target_points, frames)
559
+ else:
560
+ raise ValueError(f'Unknown query mode {query_mode}.')
561
+
562
+ yield {'robotics': converted}
563
+
564
+
565
+ def create_kinetics_dataset(
566
+ kinetics_path: str, query_mode: str = 'strided',
567
+ resolution: Optional[Tuple[int, int]] = (256, 256),
568
+ ) -> Iterable[DatasetElement]:
569
+ """Dataset for evaluating performance on Kinetics point tracking."""
570
+
571
+ all_paths = tf.io.gfile.glob(path.join(kinetics_path, '*_of_0010.pkl'))
572
+ for pickle_path in all_paths:
573
+ with open(pickle_path, 'rb') as f:
574
+ data = pickle.load(f)
575
+ if isinstance(data, dict):
576
+ data = list(data.values())
577
+
578
+ # idx = random.randint(0, len(data) - 1)
579
+ for idx in range(len(data)):
580
+ example = data[idx]
581
+
582
+ frames = example['video']
583
+
584
+ if isinstance(frames[0], bytes):
585
+ # TAP-Vid frames are stored as JPEG bytes rather than `np.ndarray`s.
586
+ def decode(frame):
587
+ byteio = io.BytesIO(frame)
588
+ img = Image.open(byteio)
589
+ return np.array(img)
590
+
591
+ frames = np.array([decode(frame) for frame in frames])
592
+
593
+ if resolution is not None and resolution != frames.shape[1:3]:
594
+ frames = resize_video(frames, resolution)
595
+
596
+ frames = frames.astype(np.float32) / 255.0 * 2.0 - 1.0
597
+ target_points = example['points']
598
+ target_occ = example['occluded']
599
+ target_points *= np.array([frames.shape[2], frames.shape[1]])
600
+
601
+ if query_mode == 'strided':
602
+ converted = sample_queries_strided(target_occ, target_points, frames)
603
+ elif query_mode == 'first':
604
+ converted = sample_queries_first(target_occ, target_points, frames)
605
+ else:
606
+ raise ValueError(f'Unknown query mode {query_mode}.')
607
+
608
+ yield {'kinetics': converted}
609
+
610
+
611
+ def create_robotap_dataset(
612
+ robotics_points_path: str,
613
+ query_mode: str = 'strided',
614
+ resolution: Optional[Tuple[int, int]] = (256, 256),
615
+ ) -> Iterable[DatasetElement]:
616
+ """Dataset for evaluating performance on robotics data."""
617
+ pickle_path = robotics_points_path
618
+
619
+ # with tf.io.gfile.GFile(pickle_path, 'rb') as f:
620
+ # robotics_points_dataset = pickle.load(f)
621
+ robotics_points_dataset = []
622
+ all_paths = tf.io.gfile.glob(path.join(robotics_points_path, '*.pkl'))
623
+ for pickle_path in all_paths:
624
+ with open(pickle_path, 'rb') as f:
625
+ data = pickle.load(f)
626
+ robotics_points_dataset.extend(data.values())
627
+
628
+ for example in robotics_points_dataset:
629
+ frames = example['video']
630
+ if resolution is not None and resolution != frames.shape[1:3]:
631
+ frames = resize_video(frames, resolution)
632
+ frames = frames.astype(np.float32) / 255.0 * 2.0 - 1.0
633
+ target_points = example['points']
634
+ target_occ = example['occluded']
635
+ target_points = target_points * np.array([frames.shape[2], frames.shape[1]])
636
+
637
+ if query_mode == 'strided':
638
+ converted = sample_queries_strided(target_occ, target_points, frames)
639
+ elif query_mode == 'first':
640
+ converted = sample_queries_first(target_occ, target_points, frames)
641
+ else:
642
+ raise ValueError(f'Unknown query mode {query_mode}.')
643
+
644
+ yield {'robotap': converted}
645
+
646
+
647
+ def create_csv_dataset(
648
+ dataset_name: str,
649
+ csv_path: str,
650
+ video_base_path: str,
651
+ query_mode: str = 'strided',
652
+ resolution: Optional[Tuple[int, int]] = (256, 256),
653
+ max_video_frames: Optional[int] = 1000,
654
+ ) -> Iterable[DatasetElement]:
655
+ """Create an evaluation iterator out of human annotations and videos.
656
+
657
+ Args:
658
+ dataset_name: Name to the dataset.
659
+ csv_path: Path to annotations csv.
660
+ video_base_path: Path to annotated videos.
661
+ query_mode: sample query points from first frame or strided.
662
+ resolution: The video resolution in (height, width).
663
+ max_video_frames: Max length of annotated video.
664
+
665
+ Yields:
666
+ Samples for evaluation.
667
+ """
668
+ point_tracks_all = dict()
669
+ with tf.io.gfile.GFile(csv_path, 'r') as f:
670
+ reader = csv.reader(f, delimiter=',')
671
+ for row in reader:
672
+ video_id = row[0]
673
+ point_tracks = np.array(row[1:]).reshape(-1, 3)
674
+ if video_id in point_tracks_all:
675
+ point_tracks_all[video_id].append(point_tracks)
676
+ else:
677
+ point_tracks_all[video_id] = [point_tracks]
678
+
679
+ for video_id in point_tracks_all:
680
+ if video_id.endswith('.mp4'):
681
+ video_path = path.join(video_base_path, video_id)
682
+ else:
683
+ video_path = path.join(video_base_path, video_id + '.mp4')
684
+ frames = media.read_video(video_path)
685
+ if resolution is not None and resolution != frames.shape[1:3]:
686
+ frames = media.resize_video(frames, resolution)
687
+ frames = frames.astype(np.float32) / 255.0 * 2.0 - 1.0
688
+
689
+ point_tracks = np.stack(point_tracks_all[video_id], axis=0)
690
+ point_tracks = point_tracks.astype(np.float32)
691
+ if frames.shape[0] < point_tracks.shape[1]:
692
+ logging.info('Warning: short video!')
693
+ point_tracks = point_tracks[:, : frames.shape[0]]
694
+ point_tracks, occluded = point_tracks[..., 0:2], point_tracks[..., 2]
695
+ occluded = occluded > 0
696
+ target_points = point_tracks * np.array([frames.shape[2], frames.shape[1]])
697
+
698
+ num_splits = int(np.ceil(frames.shape[0] / max_video_frames))
699
+ if num_splits > 1:
700
+ print(f'Going to split the video {video_id} into {num_splits}')
701
+ for i in range(num_splits):
702
+ start_index = i * frames.shape[0] // num_splits
703
+ end_index = (i + 1) * frames.shape[0] // num_splits
704
+ sub_occluded = occluded[:, start_index:end_index]
705
+ sub_target_points = target_points[:, start_index:end_index]
706
+ sub_frames = frames[start_index:end_index]
707
+
708
+ if query_mode == 'strided':
709
+ converted = sample_queries_strided(
710
+ sub_occluded, sub_target_points, sub_frames
711
+ )
712
+ elif query_mode == 'first':
713
+ converted = sample_queries_first(
714
+ sub_occluded, sub_target_points, sub_frames
715
+ )
716
+ else:
717
+ raise ValueError(f'Unknown query mode {query_mode}.')
718
+
719
+ yield {dataset_name: converted}
720
+
721
+
722
+ import torch
723
+ from torch.utils.data import Dataset
724
+
725
+
726
+ class CustomDataset(Dataset):
727
+ def __init__(self, data_generator: Iterable[DatasetElement], key: str):
728
+ self.data = list(data_generator)
729
+ self.key = key
730
+
731
+ def __len__(self):
732
+ return len(self.data)
733
+
734
+ def __getitem__(self, idx):
735
+ data = self.data[idx][self.key]
736
+ data = {k: torch.tensor(v)[0] if isinstance(v, np.ndarray) else v for k, v in data.items()}
737
+ # Convert double to float
738
+ data = {k: v.float() if v.dtype == torch.float64 else v for k, v in data.items()}
739
+ return data
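+ # Usage sketch: wrap one of the generators above and iterate with a standard
+ # DataLoader, e.g.
+ # ds = CustomDataset(create_davis_dataset(pickle_path), key='davis')
+ # loader = torch.utils.data.DataLoader(ds, batch_size=1)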
740
+
741
+
742
+ def get_eval_dataset(mode, path, resolution=(256, 256)):
743
+ query_mode = 'first' if 'q_first' in mode else 'strided'
744
+ datasets = {}
745
+ if 'jhmdb' in mode:
746
+ key = 'jhmdb'
747
+ dataset = create_jhmdb_dataset(path[key], resolution)
748
+ datasets[key] = CustomDataset(dataset, key)
749
+ if 'davis' in mode:
750
+ key = 'davis'
751
+ dataset = create_davis_dataset(path[key], query_mode, False, resolution=resolution)
752
+ datasets[key] = CustomDataset(dataset, key)
753
+ if 'robotics' in mode:
754
+ key = 'robotics'
755
+ dataset = create_rgb_stacking_dataset(path[key], query_mode, resolution)
756
+ datasets[key] = CustomDataset(dataset, key)
757
+ if 'kinetics' in mode:
758
+ key = 'kinetics'
759
+ dataset = create_kinetics_dataset(path[key], query_mode, resolution)
760
+ datasets[key] = CustomDataset(dataset, key)
761
+ if 'robotap' in mode:
762
+ key = 'robotap'
763
+ dataset = create_robotap_dataset(path[key], query_mode, resolution)
764
+ datasets[key] = CustomDataset(dataset, key)
765
+
766
+ if len(datasets) == 0:
767
+ raise ValueError(f'No dataset found for mode {mode}.')
768
+
769
+ return datasets
770
+
771
+
772
+ if __name__ == '__main__':
773
+ # Disable all GPUS
774
+ tf.config.set_visible_devices([], 'GPU')
775
+ visible_devices = tf.config.get_visible_devices()
776
+ for device in visible_devices:
777
+ assert device.device_type != 'GPU'
778
+
779
+ dataset_name = 'davis'
780
+ dataset_path = '/media/data2/PointTracking/tapvid/tapnet_dataset/tapvid_davis/tapvid_davis.pkl'
781
+
782
+ dataset = get_eval_dataset(dataset_name, {dataset_name: dataset_path}, resolution=(256, 256))
783
+ breakpoint()
784
+ pass
locotrack_pytorch/data/kubric_data.py ADDED
@@ -0,0 +1,243 @@
1
+ from typing import Mapping
2
+
3
+ import torch
4
+ import numpy as np
5
+
6
+ import functools
7
+ import tensorflow_datasets as tfds
8
+ import tensorflow as tf
9
+ import torch.distributed
10
+ from kubric.challenges.point_tracking.dataset import add_tracks
11
+
12
+
13
+ # Disable all GPUS
14
+ tf.config.set_visible_devices([], 'GPU')
15
+ visible_devices = tf.config.get_visible_devices()
16
+ for device in visible_devices:
17
+ assert device.device_type != 'GPU'
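+ # TensorFlow is only used for the data pipeline here; hiding GPUs from TF
+ # prevents it from reserving GPU memory that the PyTorch training process needs.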
18
+
19
+
20
+ def default_color_augmentation_fn(
21
+ inputs: Mapping[str, tf.Tensor]) -> Mapping[str, tf.Tensor]:
22
+ """Standard color augmentation for videos.
23
+
24
+ Args:
25
+ inputs: A DatasetElement containing the item 'video' which will have
26
+ augmentations applied to it.
27
+
28
+ Returns:
29
+ A DatasetElement with all the same data as the original, except that
30
+ the video has augmentations applied.
31
+ """
32
+ zero_centering_image = True
33
+ prob_color_augment = 0.8
34
+ prob_color_drop = 0.2
35
+
36
+ frames = inputs['video']
37
+ if frames.dtype != tf.float32:
38
+ raise ValueError('`frames` should be in float32.')
39
+
40
+ def color_augment(video: tf.Tensor) -> tf.Tensor:
41
+ """Do standard color augmentations."""
42
+ # Note the same augmentation will be applied to all frames of the video.
43
+ if zero_centering_image:
44
+ video = 0.5 * (video + 1.0)
45
+ video = tf.image.random_brightness(video, max_delta=32. / 255.)
46
+ video = tf.image.random_saturation(video, lower=0.6, upper=1.4)
47
+ video = tf.image.random_contrast(video, lower=0.6, upper=1.4)
48
+ video = tf.image.random_hue(video, max_delta=0.2)
49
+ video = tf.clip_by_value(video, 0.0, 1.0)
50
+ if zero_centering_image:
51
+ video = 2 * (video-0.5)
52
+ return video
53
+
54
+ def color_drop(video: tf.Tensor) -> tf.Tensor:
55
+ video = tf.image.rgb_to_grayscale(video)
56
+ video = tf.tile(video, [1, 1, 1, 1, 3])
57
+ return video
58
+
59
+ # Randomly applies color augmentation with probability prob_color_augment.
60
+ coin_toss_color_augment = tf.random.uniform(
61
+ [], minval=0, maxval=1, dtype=tf.float32)
62
+ frames = tf.cond(
63
+ pred=tf.less(coin_toss_color_augment,
64
+ tf.cast(prob_color_augment, tf.float32)),
65
+ true_fn=lambda: color_augment(frames),
66
+ false_fn=lambda: frames)
67
+
68
+ # Randomly applies color drop (grayscale) with probability prob_color_drop.
69
+ coin_toss_color_drop = tf.random.uniform(
70
+ [], minval=0, maxval=1, dtype=tf.float32)
71
+ frames = tf.cond(
72
+ pred=tf.less(coin_toss_color_drop, tf.cast(prob_color_drop, tf.float32)),
73
+ true_fn=lambda: color_drop(frames),
74
+ false_fn=lambda: frames)
75
+ result = {**inputs}
76
+ result['video'] = frames
77
+
78
+ return result
79
+
80
+
81
+ def add_default_data_augmentation(ds: tf.data.Dataset) -> tf.data.Dataset:
82
+ return ds.map(
83
+ default_color_augmentation_fn, num_parallel_calls=tf.data.AUTOTUNE)
84
+
85
+
86
+ def create_point_tracking_dataset(
87
+ data_dir=None,
88
+ color_augmentation=True,
89
+ train_size=(256, 256),
90
+ shuffle_buffer_size=256,
91
+ split='train',
92
+ # batch_dims=tuple(),
93
+ batch_size=1,
94
+ repeat=True,
95
+ vflip=False,
96
+ random_crop=True,
97
+ tracks_to_sample=256,
98
+ sampling_stride=4,
99
+ max_seg_id=40,
100
+ max_sampled_frac=0.1,
101
+ num_parallel_point_extraction_calls=16,
102
+ **kwargs):
103
+ """Construct a dataset for point tracking using Kubric.
104
+
105
+ Args:
106
+ train_size: Tuple of 2 ints. Cropped output will be at this resolution
107
+ shuffle_buffer_size: Int. Size of the shuffle buffer
108
+ split: Which split to construct from Kubric. Can be 'train' or
109
+ 'validation'.
110
+ batch_size: Int. Number of examples to combine into each batch of the
111
+ output dataset.
112
+ repeat: Bool. whether to repeat the dataset.
113
+ vflip: Bool. whether to vertically flip the dataset to test generalization.
114
+ random_crop: Bool. whether to randomly crop videos
115
+ tracks_to_sample: Int. Total number of tracks to sample per video.
116
+ sampling_stride: Int. For efficiency, query points are sampled from a
117
+ random grid of this stride.
118
+ max_seg_id: Int. The maximum segment id in the video. Note the size of
119
+ the constructed graph is proportional to this number, so prefer small values.
120
+ max_sampled_frac: Float. The maximum fraction of points to sample from each
121
+ object, out of all points that lie on the sampling grid.
122
+ num_parallel_point_extraction_calls: Int. The num_parallel_calls for the
123
+ map function for point extraction.
124
+ snap_to_occluder: If true, query points within 1 pixel of occlusion
125
+ boundaries will track the occluding surface rather than the background.
126
+ This results in models which are biased to track foreground objects
127
+ instead of background. Whether this is desirable depends on downstream
128
+ applications.
129
+ **kwargs: additional args to pass to tfds.load.
130
+
131
+ Returns:
132
+ The dataset generator.
133
+ """
134
+ ds = tfds.load(
135
+ 'panning_movi_e/256x256',
136
+ data_dir=data_dir,
137
+ shuffle_files=shuffle_buffer_size is not None,
138
+ **kwargs)
139
+
140
+ ds = ds[split]
141
+ if repeat:
142
+ ds = ds.repeat()
143
+ ds = ds.map(
144
+ functools.partial(
145
+ add_tracks,
146
+ train_size=train_size,
147
+ vflip=vflip,
148
+ random_crop=random_crop,
149
+ tracks_to_sample=tracks_to_sample,
150
+ sampling_stride=sampling_stride,
151
+ max_seg_id=max_seg_id,
152
+ max_sampled_frac=max_sampled_frac),
153
+ num_parallel_calls=num_parallel_point_extraction_calls)
154
+ if shuffle_buffer_size is not None:
155
+ ds = ds.shuffle(shuffle_buffer_size)
156
+
157
+ ds = ds.batch(batch_size)
158
+
159
+ if color_augmentation:
160
+ ds = add_default_data_augmentation(ds)
161
+ ds = tfds.as_numpy(ds)
162
+
163
+ it = iter(ds)
164
+ while True:
165
+ data = next(it)
166
+ yield data
167
+
168
+
169
+ class KubricData:
170
+ def __init__(
171
+ self,
172
+ global_rank,
173
+ data_dir,
174
+ **kwargs
175
+ ):
176
+ self.global_rank = global_rank
177
+
178
+ if self.global_rank == 0:
179
+ self.data = create_point_tracking_dataset(
180
+ data_dir=data_dir,
181
+ **kwargs
182
+ )
183
+
184
+ def __getitem__(self, idx):
185
+ if self.global_rank == 0:
186
+ batch_all = next(self.data)
187
+ batch_list = []
188
+
189
+ world_size = torch.distributed.get_world_size()
190
+ batch_size = batch_all['video'].shape[0] // world_size
191
+
192
+
193
+ for i in range(world_size):
194
+ batch = {}
195
+ for k, v in batch_all.items():
196
+ if isinstance(v, (np.ndarray, torch.Tensor)):
197
+ batch[k] = torch.tensor(v[i * batch_size: (i + 1) * batch_size])
198
+ batch_list.append(batch)
199
+ else:
200
+ batch_list = [None] * torch.distributed.get_world_size()
201
+
202
+
203
+ batch = [None]
204
+ torch.distributed.scatter_object_list(batch, batch_list, src=0)
205
+
206
+ return batch[0]
207
+
208
+
209
+ if __name__ == '__main__':
210
+
211
+ import torch.nn as nn
212
+ import lightning as L
213
+ from lightning.pytorch.strategies import DDPStrategy
214
+
215
+ class Model(L.LightningModule):
216
+ def __init__(self):
217
+ super().__init__()
218
+ self.model = nn.Linear(256 * 256 * 3 * 24, 1)
219
+
220
+ def forward(self, x):
221
+ return self.model(x)
222
+
223
+ def training_step(self, batch, batch_idx):
224
+ breakpoint()
225
+ x = batch['video']
226
+ x = x.reshape(x.shape[0], -1)
227
+ y = self(x)
228
+ return y
229
+
230
+ def configure_optimizers(self):
231
+ return torch.optim.Adam(self.parameters(), lr=1e-3)
232
+
233
+ model = Model()
234
+
235
+ trainer = L.Trainer(accelerator="cpu", strategy=DDPStrategy(), max_steps=1000, devices=1)
236
+
237
+ dataloader = KubricData(
238
+ global_rank=trainer.global_rank,
239
+ data_dir='/media/data2/PointTracking/tensorflow_datasets',
240
+ batch_size=1 * trainer.world_size,
241
+ )
242
+
243
+ trainer.fit(model, dataloader)
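KubricData above pulls Kubric batches only on global rank 0 and hands each rank its slice via torch.distributed.scatter_object_list, so the TensorFlow input pipeline runs in a single process. The following is a minimal, self-contained sketch of that pattern (single-process gloo group, dummy batch source); it is not the repository's code, just the mechanics.

import os
import torch
import torch.distributed as dist


def dummy_loader():
    # Stand-in for create_point_tracking_dataset: yields batches shaped like Kubric output.
    while True:
        yield {'video': torch.zeros(2, 24, 256, 256, 3)}


def main():
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=0, world_size=1)

    world_size = dist.get_world_size()
    data = dummy_loader() if dist.get_rank() == 0 else None

    if dist.get_rank() == 0:
        batch_all = next(data)
        per_rank = batch_all['video'].shape[0] // world_size
        batch_list = [
            {k: v[i * per_rank:(i + 1) * per_rank] for k, v in batch_all.items()}
            for i in range(world_size)
        ]
    else:
        batch_list = [None] * world_size

    out = [None]
    dist.scatter_object_list(out, batch_list, src=0)  # each rank receives its own slice
    print(out[0]['video'].shape)  # torch.Size([2, 24, 256, 256, 3]) with world_size == 1

    dist.destroy_process_group()


if __name__ == '__main__':
    main()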
locotrack_pytorch/environment.yml ADDED
@@ -0,0 +1,151 @@
1
+ name: locotrack-pytorch
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - bzip2=1.0.8=h5eee18b_6
8
+ - ca-certificates=2024.7.2=h06a4308_0
9
+ - ld_impl_linux-64=2.38=h1181459_1
10
+ - libffi=3.4.4=h6a678d5_1
11
+ - libgcc-ng=11.2.0=h1234567_1
12
+ - libgomp=11.2.0=h1234567_1
13
+ - libstdcxx-ng=11.2.0=h1234567_1
14
+ - libuuid=1.41.5=h5eee18b_0
15
+ - ncurses=6.4=h6a678d5_0
16
+ - openssl=3.0.14=h5eee18b_0
17
+ - pip=24.0=py311h06a4308_0
18
+ - python=3.11.9=h955ad1f_0
19
+ - readline=8.2=h5eee18b_0
20
+ - setuptools=69.5.1=py311h06a4308_0
21
+ - sqlite=3.45.3=h5eee18b_0
22
+ - tk=8.6.14=h39e8969_0
23
+ - tzdata=2024a=h04d1e81_0
24
+ - wheel=0.43.0=py311h06a4308_0
25
+ - xz=5.4.6=h5eee18b_1
26
+ - zlib=1.2.13=h5eee18b_1
27
+ - pip:
28
+ - absl-py==2.1.0
29
+ - aiohttp==3.9.5
30
+ - aiosignal==1.3.1
31
+ - array-record==0.5.1
32
+ - asttokens==2.4.1
33
+ - astunparse==1.6.3
34
+ - attrs==23.2.0
35
+ - certifi==2024.7.4
36
+ - charset-normalizer==3.3.2
37
+ - click==8.1.7
38
+ - contourpy==1.2.1
39
+ - cycler==0.12.1
40
+ - decorator==5.1.1
41
+ - dm-tree==0.1.8
42
+ - docker-pycreds==0.4.0
43
+ - docstring-parser==0.16
44
+ - einshape==1.0
45
+ - etils==1.9.2
46
+ - executing==2.0.1
47
+ - filelock==3.13.1
48
+ - flatbuffers==24.3.25
49
+ - fonttools==4.53.1
50
+ - frozenlist==1.4.1
51
+ - fsspec==2024.2.0
52
+ - gast==0.6.0
53
+ - gitdb==4.0.11
54
+ - gitpython==3.1.43
55
+ - google-pasta==0.2.0
56
+ - googleapis-common-protos==1.63.2
57
+ - grpcio==1.65.1
58
+ - h5py==3.11.0
59
+ - idna==3.7
60
+ - immutabledict==4.2.0
61
+ - importlib-resources==6.4.0
62
+ - ipython==8.26.0
63
+ - jedi==0.19.1
64
+ - jinja2==3.1.3
65
+ - keras==3.4.1
66
+ - kiwisolver==1.4.5
67
+ - libclang==18.1.1
68
+ - lightning==2.3.3
69
+ - lightning-utilities==0.11.6
70
+ - markdown==3.6
71
+ - markdown-it-py==3.0.0
72
+ - markupsafe==2.1.5
73
+ - matplotlib==3.9.1
74
+ - matplotlib-inline==0.1.7
75
+ - mdurl==0.1.2
76
+ - mediapy==1.2.2
77
+ - ml-dtypes==0.4.0
78
+ - mpmath==1.3.0
79
+ - multidict==6.0.5
80
+ - namex==0.0.8
81
+ - networkx==3.2.1
82
+ - numpy==1.26.3
83
+ - nvidia-cublas-cu12==12.4.2.65
84
+ - nvidia-cuda-cupti-cu12==12.4.99
85
+ - nvidia-cuda-nvrtc-cu12==12.4.99
86
+ - nvidia-cuda-runtime-cu12==12.4.99
87
+ - nvidia-cudnn-cu12==9.1.0.70
88
+ - nvidia-cufft-cu12==11.2.0.44
89
+ - nvidia-curand-cu12==10.3.5.119
90
+ - nvidia-cusolver-cu12==11.6.0.99
91
+ - nvidia-cusparse-cu12==12.3.0.142
92
+ - nvidia-nccl-cu12==2.20.5
93
+ - nvidia-nvjitlink-cu12==12.4.99
94
+ - nvidia-nvtx-cu12==12.4.99
95
+ - openexr==3.2.4
96
+ - opt-einsum==3.3.0
97
+ - optree==0.12.1
98
+ - packaging==24.1
99
+ - parso==0.8.4
100
+ - pexpect==4.9.0
101
+ - pillow==10.2.0
102
+ - platformdirs==4.2.2
103
+ - promise==2.3
104
+ - prompt-toolkit==3.0.47
105
+ - protobuf==4.25.4
106
+ - psutil==6.0.0
107
+ - ptyprocess==0.7.0
108
+ - pure-eval==0.2.3
109
+ - pyarrow==17.0.0
110
+ - pygments==2.18.0
111
+ - pyparsing==3.1.2
112
+ - python-dateutil==2.9.0.post0
113
+ - pytorch-lightning==2.3.3
114
+ - pyyaml==6.0.1
115
+ - requests==2.32.3
116
+ - rich==13.7.1
117
+ - scipy==1.14.0
118
+ - sentry-sdk==2.11.0
119
+ - setproctitle==1.3.3
120
+ - simple-parsing==0.1.5
121
+ - six==1.16.0
122
+ - smmap==5.0.1
123
+ - stack-data==0.6.3
124
+ - sympy==1.12
125
+ - tensorboard==2.17.0
126
+ - tensorboard-data-server==0.7.2
127
+ - tensorflow==2.17.0
128
+ - tensorflow-addons==0.23.0
129
+ - tensorflow-datasets==4.9.6
130
+ - tensorflow-graphics==2021.12.3
131
+ - tensorflow-io-gcs-filesystem==0.37.1
132
+ - tensorflow-metadata==1.15.0
133
+ - termcolor==2.4.0
134
+ - toml==0.10.2
135
+ - torch==2.4.0+cu124
136
+ - torchaudio==2.4.0+cu124
137
+ - torchmetrics==1.4.0.post0
138
+ - torchvision==0.19.0+cu124
139
+ - tqdm==4.66.4
140
+ - traitlets==5.14.3
141
+ - trimesh==4.4.3
142
+ - triton==3.0.0
143
+ - typeguard==2.13.3
144
+ - typing-extensions==4.9.0
145
+ - urllib3==2.2.2
146
+ - wandb==0.17.5
147
+ - wcwidth==0.2.13
148
+ - werkzeug==3.0.3
149
+ - wrapt==1.16.0
150
+ - yarl==1.9.4
151
+ - zipp==3.19.2
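The environment mixes CUDA PyTorch with CPU-side TensorFlow/tfds for data loading, so version drift between the two stacks is a common failure mode. Below is a small optional check (not part of the repository) that compares a few of the pins above against the active environment; the package list is illustrative, edit it to match your own file.

from importlib.metadata import PackageNotFoundError, version

expected = {
    'torch': '2.4.0+cu124',
    'tensorflow': '2.17.0',
    'tensorflow-datasets': '4.9.6',
    'lightning': '2.3.3',
}

for pkg, want in expected.items():
    try:
        got = version(pkg)
    except PackageNotFoundError:
        got = None
    status = 'OK' if got == want else 'MISMATCH'
    print(f'{pkg}: pinned {want}, installed {got} [{status}]')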
locotrack_pytorch/experiment.py ADDED
@@ -0,0 +1,238 @@
1
+ import os
2
+ import configparser
3
+ import argparse
4
+ import logging
5
+ from functools import partial
6
+ from typing import Any, Dict, Optional, Union
7
+
8
+ import lightning as L
9
+ from lightning.pytorch import seed_everything
10
+ from lightning.pytorch.loggers import WandbLogger
11
+ from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor, TQDMProgressBar
12
+ import torch
13
+ from torch.utils.data import DataLoader
14
+
15
+ from data.kubric_data import KubricData
16
+ from models.locotrack_model import LocoTrack
17
+ import model_utils
18
+ from data.evaluation_datasets import get_eval_dataset
19
+
20
+
21
+ class LocoTrackModel(L.LightningModule):
22
+ def __init__(
23
+ self,
24
+ model_kwargs: Optional[Dict[str, Any]] = None,
25
+ model_forward_kwargs: Optional[Dict[str, Any]] = None,
26
+ loss_name: Optional[str] = 'tapir_loss',
27
+ loss_kwargs: Optional[Dict[str, Any]] = None,
28
+ query_first: Optional[bool] = False,
29
+ optimizer_name: Optional[str] = 'Adam',
30
+ optimizer_kwargs: Optional[Dict[str, Any]] = None,
31
+ scheduler_name: Optional[str] = 'OneCycleLR',
32
+ scheduler_kwargs: Optional[Dict[str, Any]] = None,
33
+ ):
34
+ super().__init__()
35
+ self.model = LocoTrack(**(model_kwargs or {}))
36
+ self.model_forward_kwargs = model_forward_kwargs or {}
37
+ self.loss = partial(model_utils.__dict__[loss_name], **(loss_kwargs or {}))
38
+ self.query_first = query_first
39
+
40
+ self.optimizer_name = optimizer_name
41
+ self.optimizer_kwargs = optimizer_kwargs or {'lr': 2e-3}
42
+ self.scheduler_name = scheduler_name
43
+ self.scheduler_kwargs = scheduler_kwargs or {'max_lr': 2e-3, 'pct_start': 0.05, 'total_steps': 300000}
44
+
45
+ def training_step(self, batch, batch_idx):
46
+ output = self.model(batch['video'], batch['query_points'], **self.model_forward_kwargs)
47
+ loss, loss_scalars = self.loss(batch, output)
48
+
49
+ self.log_dict(
50
+ {f'train/{k}': v.item() for k, v in loss_scalars.items()},
51
+ logger=True,
52
+ on_step=True,
53
+ sync_dist=True,
54
+ )
55
+
56
+ return loss
57
+
58
+ def validation_step(self, batch, batch_idx, dataloader_idx=None):
59
+ output = self.model(batch['video'], batch['query_points'], **self.model_forward_kwargs)
60
+ loss, loss_scalars = self.loss(batch, output)
61
+ metrics = model_utils.eval_batch(batch, output, query_first=self.query_first)
62
+
63
+ if self.trainer.global_rank == 0:
64
+ log_prefix = 'val/'
65
+ if dataloader_idx is not None:
66
+ log_prefix = f'val/data_{dataloader_idx}/'
67
+
68
+ self.log_dict(
69
+ {log_prefix + k: v for k, v in loss_scalars.items()},
70
+ logger=True,
71
+ rank_zero_only=True,
72
+ )
73
+ self.log_dict(
74
+ {log_prefix + k: v.item() for k, v in metrics.items()},
75
+ logger=True,
76
+ rank_zero_only=True,
77
+ )
78
+ logging.info(f"Batch {batch_idx}: {metrics}")
79
+
80
+ def test_step(self, batch, batch_idx, dataloader_idx=None):
81
+ output = self.model(batch['video'], batch['query_points'], **self.model_forward_kwargs)
82
+ loss, loss_scalars = self.loss(batch, output)
83
+ metrics = model_utils.eval_batch(batch, output, query_first=self.query_first)
84
+
85
+ if self.trainer.global_rank == 0:
86
+ log_prefix = 'test/'
87
+ if dataloader_idx is not None:
88
+ log_prefix = f'test/data_{dataloader_idx}/'
89
+
90
+ self.log_dict(
91
+ {log_prefix + k: v for k, v in loss_scalars.items()},
92
+ logger=True,
93
+ rank_zero_only=True,
94
+ )
95
+ self.log_dict(
96
+ {log_prefix + k: v.item() for k, v in metrics.items()},
97
+ logger=True,
98
+ rank_zero_only=True,
99
+ )
100
+ logging.info(f"Batch {batch_idx}: {metrics}")
101
+
102
+ def configure_optimizers(self):
103
+ weights = [p for n, p in self.named_parameters() if 'bias' not in n]
104
+ bias = [p for n, p in self.named_parameters() if 'bias' in n]
105
+
106
+ optimizer = torch.optim.__dict__[self.optimizer_name](
107
+ [
108
+ {'params': weights, **self.optimizer_kwargs},
109
+ {'params': bias, **self.optimizer_kwargs, 'weight_decay': 0.}
110
+ ]
111
+ )
112
+ scheduler = torch.optim.lr_scheduler.__dict__[self.scheduler_name](optimizer, **self.scheduler_kwargs)
113
+
114
+ return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
115
+
116
+
117
+ def train(
118
+ mode: str,
119
+ save_path: str,
120
+ val_dataset_path: str,
121
+ ckpt_path: str = None,
122
+ kubric_dir: str = '',
123
+ precision: str = '32',
124
+ batch_size: int = 1,
125
+ val_check_interval: Union[int, float] = 5000,
126
+ log_every_n_steps: int = 10,
127
+ gradient_clip_val: float = 1.0,
128
+ max_steps: int = 300_000,
129
+ model_kwargs: Optional[Dict[str, Any]] = None,
130
+ model_forward_kwargs: Optional[Dict[str, Any]] = None,
131
+ loss_name: str = 'tapir_loss',
132
+ loss_kwargs: Optional[Dict[str, Any]] = None,
133
+ optimizer_name: str = 'Adam',
134
+ optimizer_kwargs: Optional[Dict[str, Any]] = None,
135
+ scheduler_name: str = 'OneCycleLR',
136
+ scheduler_kwargs: Optional[Dict[str, Any]] = None,
137
+ # query_first: bool = False,
138
+ ):
139
+ """Train the LocoTrack model with specified configurations."""
140
+ seed_everything(42, workers=True)
141
+
142
+ model = LocoTrackModel(
143
+ model_kwargs=model_kwargs,
144
+ model_forward_kwargs=model_forward_kwargs,
145
+ loss_name=loss_name,
146
+ loss_kwargs=loss_kwargs,
147
+ query_first='q_first' in mode,
148
+ optimizer_name=optimizer_name,
149
+ optimizer_kwargs=optimizer_kwargs,
150
+ scheduler_name=scheduler_name,
151
+ scheduler_kwargs=scheduler_kwargs,
152
+ )
153
+ if ckpt_path is not None and 'train' in mode:
154
+ model.load_state_dict(torch.load(ckpt_path)['state_dict'])
155
+
156
+ logger = WandbLogger(project='LocoTrack_Pytorch', save_dir=save_path, id=os.path.basename(save_path))
157
+ lr_monitor = LearningRateMonitor(logging_interval='step')
158
+ checkpoint_callback = ModelCheckpoint(
159
+ dirpath=save_path,
160
+ save_last=True,
161
+ save_top_k=3,
162
+ mode="max",
163
+ monitor="val/average_pts_within_thresh",
164
+ auto_insert_metric_name=True,
165
+ save_on_train_epoch_end=False,
166
+ )
167
+
168
+ eval_dataset = get_eval_dataset(
169
+ mode=mode,
170
+ path=val_dataset_path,
171
+ )
172
+ eval_dataloader = {
173
+ k: DataLoader(
174
+ v,
175
+ batch_size=1,
176
+ shuffle=False,
177
+ ) for k, v in eval_dataset.items()
178
+ }
179
+
180
+ if 'train' in mode:
181
+ trainer = L.Trainer(
182
+ strategy='ddp',
183
+ logger=logger,
184
+ precision=precision,
185
+ val_check_interval=val_check_interval,
186
+ log_every_n_steps=log_every_n_steps,
187
+ gradient_clip_val=gradient_clip_val,
188
+ max_steps=max_steps,
189
+ sync_batchnorm=True,
190
+ callbacks=[checkpoint_callback, lr_monitor],
191
+ )
192
+ train_dataloader = KubricData(
193
+ global_rank=trainer.global_rank,
194
+ data_dir=kubric_dir,
195
+ batch_size=batch_size * trainer.world_size,
196
+ )
197
+ trainer.fit(model, train_dataloader, eval_dataloader, ckpt_path=ckpt_path)
198
+ elif 'eval' in mode:
199
+ trainer = L.Trainer(strategy='ddp', logger=logger, precision=precision)
200
+ trainer.test(model, eval_dataloader, ckpt_path=ckpt_path)
201
+ else:
202
+ raise ValueError(f"Invalid mode: {mode}")
203
+
204
+ if __name__ == '__main__':
205
+ parser = argparse.ArgumentParser(description="Train or evaluate the LocoTrack model.")
206
+ parser.add_argument('--config', type=str, default='config.ini', help="Path to the configuration file.")
207
+ parser.add_argument('--mode', type=str, required=True, help="Mode to run: 'train' or 'eval', optionally combined with 'q_first' and the name of the evaluation dataset.")
208
+ parser.add_argument('--ckpt_path', type=str, default=None, help="Path to the checkpoint file")
209
+ parser.add_argument('--save_path', type=str, default='snapshots', help="Path to save the logs and checkpoints.")
210
+
211
+ args = parser.parse_args()
212
+ config = configparser.ConfigParser()
213
+ config.read(args.config)
214
+
215
+ # Extract parameters from the config file
216
+ train_params = {
217
+ 'mode': args.mode,
218
+ 'ckpt_path': args.ckpt_path,
219
+ 'save_path': args.save_path,
220
+ 'val_dataset_path': eval(config.get('TRAINING', 'val_dataset_path', fallback='{}')),
221
+ 'kubric_dir': config.get('TRAINING', 'kubric_dir', fallback=''),
222
+ 'precision': config.get('TRAINING', 'precision', fallback='32'),
223
+ 'batch_size': config.getint('TRAINING', 'batch_size', fallback=1),
224
+ 'val_check_interval': config.getfloat('TRAINING', 'val_check_interval', fallback=5000),
225
+ 'log_every_n_steps': config.getint('TRAINING', 'log_every_n_steps', fallback=10),
226
+ 'gradient_clip_val': config.getfloat('TRAINING', 'gradient_clip_val', fallback=1.0),
227
+ 'max_steps': config.getint('TRAINING', 'max_steps', fallback=300000),
228
+ 'model_kwargs': eval(config.get('MODEL', 'model_kwargs', fallback='{}')),
229
+ 'model_forward_kwargs': eval(config.get('MODEL', 'model_forward_kwargs', fallback='{}')),
230
+ 'loss_name': config.get('LOSS', 'loss_name', fallback='tapir_loss'),
231
+ 'loss_kwargs': eval(config.get('LOSS', 'loss_kwargs', fallback='{}')),
232
+ 'optimizer_name': config.get('OPTIMIZER', 'optimizer_name', fallback='Adam'),
233
+ 'optimizer_kwargs': eval(config.get('OPTIMIZER', 'optimizer_kwargs', fallback='{"lr": 2e-3}')),
234
+ 'scheduler_name': config.get('SCHEDULER', 'scheduler_name', fallback='OneCycleLR'),
235
+ 'scheduler_kwargs': eval(config.get('SCHEDULER', 'scheduler_kwargs', fallback='{"max_lr": 2e-3, "pct_start": 0.05, "total_steps": 300000}')),
236
+ }
237
+
238
+ train(**train_params)
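experiment.py reads everything except --mode, --ckpt_path, and --save_path from an INI file, and the *_kwargs entries are passed through eval(), so they must be valid Python expressions. Below is a hypothetical config.ini with the sections and keys the script reads; the values are placeholders rather than recommended settings, and it is parsed here the same way the script parses it.

import configparser

EXAMPLE_CONFIG = """\
[TRAINING]
val_dataset_path = {'davis': '/path/to/tapvid_davis.pkl'}
kubric_dir = /path/to/tensorflow_datasets
precision = 32
batch_size = 1
val_check_interval = 5000
log_every_n_steps = 10
gradient_clip_val = 1.0
max_steps = 300000

[MODEL]
model_kwargs = {'model_size': 'base'}
model_forward_kwargs = {}

[LOSS]
loss_name = tapir_loss
loss_kwargs = {}

[OPTIMIZER]
optimizer_name = Adam
optimizer_kwargs = {'lr': 2e-3}

[SCHEDULER]
scheduler_name = OneCycleLR
scheduler_kwargs = {'max_lr': 2e-3, 'pct_start': 0.05, 'total_steps': 300000}
"""

config = configparser.ConfigParser()
config.read_string(EXAMPLE_CONFIG)
print(eval(config.get('TRAINING', 'val_dataset_path')))  # parsed with eval(), as in the script
print(config.getint('TRAINING', 'max_steps'))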
locotrack_pytorch/model_utils.py ADDED
@@ -0,0 +1,165 @@
1
+ from typing import Sequence, Optional
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+ from models.utils import convert_grid_coordinates
7
+ from data.evaluation_datasets import compute_tapvid_metrics
8
+
9
+ def huber_loss(tracks, target_points, occluded, delta=4.0, reduction_axes=(1, 2)):
10
+ """Huber loss for point trajectories."""
11
+ error = tracks - target_points
12
+ distsqr = torch.sum(error ** 2, dim=-1)
13
+ dist = torch.sqrt(distsqr + 1e-12) # add eps to prevent nan
14
+ loss_huber = torch.where(dist < delta, distsqr / 2, delta * (torch.abs(dist) - delta / 2))
15
+ loss_huber = loss_huber * (1.0 - occluded.float())
16
+
17
+ if reduction_axes:
18
+ loss_huber = torch.mean(loss_huber, dim=reduction_axes)
19
+
20
+ return loss_huber
21
+
22
+ def prob_loss(tracks, expd, target_points, occluded, expected_dist_thresh=8.0, reduction_axes=(1, 2)):
23
+ """Loss for classifying if a point is within pixel threshold of its target."""
24
+ err = torch.sum((tracks - target_points) ** 2, dim=-1)
25
+ invalid = (err > expected_dist_thresh ** 2).float()
26
+ logprob = F.binary_cross_entropy_with_logits(expd, invalid, reduction='none')
27
+ logprob = logprob * (1.0 - occluded.float())
28
+
29
+ if reduction_axes:
30
+ logprob = torch.mean(logprob, dim=reduction_axes)
31
+
32
+ return logprob
33
+
34
+ def tapnet_loss(points, occlusion, target_points, target_occ, shape, mask=None, expected_dist=None,
35
+ position_loss_weight=0.05, expected_dist_thresh=6.0, huber_loss_delta=4.0,
36
+ rebalance_factor=None, occlusion_loss_mask=None):
37
+ """TAPNet loss."""
38
+
39
+ if mask is None:
40
+ mask = torch.tensor(1.0)
41
+
42
+ points = convert_grid_coordinates(points, shape[3:1:-1], (256, 256), coordinate_format='xy')
43
+ target_points = convert_grid_coordinates(target_points, shape[3:1:-1], (256, 256), coordinate_format='xy')
44
+
45
+ loss_huber = huber_loss(points, target_points, target_occ, delta=huber_loss_delta, reduction_axes=None) * mask
46
+ loss_huber = torch.mean(loss_huber) * position_loss_weight
47
+
48
+ if expected_dist is None:
49
+ loss_prob = torch.tensor(0.0)
50
+ else:
51
+ loss_prob = prob_loss(points.detach(), expected_dist, target_points, target_occ, expected_dist_thresh, reduction_axes=None) * mask
52
+ loss_prob = torch.mean(loss_prob)
53
+
54
+ target_occ = target_occ.to(dtype=occlusion.dtype)
55
+ loss_occ = F.binary_cross_entropy_with_logits(occlusion, target_occ, reduction='none') * mask
56
+
57
+ if rebalance_factor is not None:
58
+ loss_occ = loss_occ * ((1 + rebalance_factor) - rebalance_factor * target_occ)
59
+
60
+ if occlusion_loss_mask is not None:
61
+ loss_occ = loss_occ * occlusion_loss_mask
62
+
63
+ loss_occ = torch.mean(loss_occ)
64
+
65
+ return loss_huber, loss_occ, loss_prob
66
+
67
+
68
+ def tapir_loss(
69
+ batch,
70
+ output,
71
+ position_loss_weight=0.05,
72
+ expected_dist_thresh=6.0,
73
+ ):
74
+ loss_scalars = {}
75
+ loss_huber, loss_occ, loss_prob = tapnet_loss(
76
+ output['tracks'],
77
+ output['occlusion'],
78
+ batch['target_points'],
79
+ batch['occluded'],
80
+ batch['video'].shape, # pytype: disable=attribute-error # numpy-scalars
81
+ expected_dist=output['expected_dist']
82
+ if 'expected_dist' in output
83
+ else None,
84
+ position_loss_weight=position_loss_weight,
85
+ expected_dist_thresh=expected_dist_thresh,
86
+ )
87
+ loss = loss_huber + loss_occ + loss_prob
88
+ loss_scalars['position_loss'] = loss_huber
89
+ loss_scalars['occlusion_loss'] = loss_occ
90
+ if 'expected_dist' in output:
91
+ loss_scalars['prob_loss'] = loss_prob
92
+
93
+ if 'unrefined_tracks' in output:
94
+ for l in range(len(output['unrefined_tracks'])):
95
+ loss_huber, loss_occ, loss_prob = tapnet_loss(
96
+ output['unrefined_tracks'][l],
97
+ output['unrefined_occlusion'][l],
98
+ batch['target_points'],
99
+ batch['occluded'],
100
+ batch['video'].shape, # pytype: disable=attribute-error # numpy-scalars
101
+ expected_dist=output['unrefined_expected_dist'][l]
102
+ if 'unrefined_expected_dist' in output
103
+ else None,
104
+ position_loss_weight=position_loss_weight,
105
+ expected_dist_thresh=expected_dist_thresh,
106
+ )
107
+ loss = loss + loss_huber + loss_occ + loss_prob
108
+ loss_scalars[f'position_loss_{l}'] = loss_huber
109
+ loss_scalars[f'occlusion_loss_{l}'] = loss_occ
110
+ if 'unrefined_expected_dist' in output:
111
+ loss_scalars[f'prob_loss_{l}'] = loss_prob
112
+
113
+ loss_scalars['loss'] = loss
114
+ return loss, loss_scalars
115
+
116
+
117
+
118
+ def eval_batch(
119
+ batch,
120
+ output,
121
+ eval_metrics_resolution = (256, 256),
122
+ query_first = False,
123
+ ):
124
+ query_points = batch['query_points']
125
+ query_points = convert_grid_coordinates(
126
+ query_points,
127
+ (1,) + batch['video'].shape[2:4], # (1, height, width)
128
+ (1,) + eval_metrics_resolution, # (1, height, width)
129
+ coordinate_format='tyx',
130
+ )
131
+ gt_target_points = batch['target_points']
132
+ gt_target_points = convert_grid_coordinates(
133
+ gt_target_points,
134
+ batch['video'].shape[3:1:-1], # (width, height)
135
+ eval_metrics_resolution[::-1], # (width, height)
136
+ coordinate_format='xy',
137
+ )
138
+ gt_occluded = batch['occluded']
139
+
140
+ tracks = output['tracks']
141
+ tracks = convert_grid_coordinates(
142
+ tracks,
143
+ batch['video'].shape[3:1:-1], # (width, height)
144
+ eval_metrics_resolution[::-1], # (width, height)
145
+ coordinate_format='xy',
146
+ )
147
+
148
+ occlusion_logits = output['occlusion']
149
+ pred_occ = torch.sigmoid(occlusion_logits)
150
+ if 'expected_dist' in output:
151
+ expected_dist = output['expected_dist']
152
+ pred_occ = 1 - (1 - pred_occ) * (1 - torch.sigmoid(expected_dist))
153
+ pred_occ = pred_occ > 0.5 # threshold
154
+
155
+ query_mode = 'first' if query_first else 'strided'
156
+ metrics = compute_tapvid_metrics(
157
+ query_points=query_points.detach().cpu().numpy(),
158
+ gt_occluded=gt_occluded.detach().cpu().numpy(),
159
+ gt_tracks=gt_target_points.detach().cpu().numpy(),
160
+ pred_occluded=pred_occ.detach().cpu().numpy(),
161
+ pred_tracks=tracks.detach().cpu().numpy(),
162
+ query_mode=query_mode,
163
+ )
164
+
165
+ return metrics
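huber_loss above applies a piecewise penalty per point, quadratic for distances below delta and linear (slope delta) beyond it, then masks out occluded points. The standalone snippet below re-implements just the elementwise rule, with no repo imports, to show the two regimes numerically.

import torch


def huber(dist, delta=4.0):
    # Same elementwise rule as huber_loss: 0.5*d^2 below delta, delta*(d - delta/2) above.
    return torch.where(dist < delta, dist ** 2 / 2, delta * (dist - delta / 2))


d = torch.tensor([0.5, 2.0, 4.0, 10.0])
print(huber(d))          # tensor([ 0.1250,  2.0000,  8.0000, 32.0000])
print(d ** 2 / 2)        # quadratic branch: matches the first three entries
print(4.0 * (d - 2.0))   # linear branch: matches the last two entries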
locotrack_pytorch/models/cmdtop.py ADDED
@@ -0,0 +1,45 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from models import utils
5
+
6
+
7
+ class CMDTop(nn.Module):
8
+ def __init__(self, in_channel, out_channels, kernel_shapes, strides):
9
+ super(CMDTop, self).__init__()
10
+ self.in_channels = [in_channel] + list(out_channels[:-1])
11
+ self.out_channels = out_channels
12
+ self.kernel_shapes = kernel_shapes
13
+ self.strides = strides
14
+
15
+ self.conv = nn.ModuleList([
16
+ nn.Sequential(
17
+ utils.Conv2dSamePadding(
18
+ in_channels=self.in_channels[i],
19
+ out_channels=self.out_channels[i],
20
+ kernel_size=self.kernel_shapes[i],
21
+ stride=self.strides[i],
22
+ ),
23
+ nn.GroupNorm(out_channels[i] // 16, out_channels[i]),
24
+ nn.ReLU()
25
+ ) for i in range(len(out_channels))
26
+ ])
27
+
28
+ def forward(self, x):
29
+ """
30
+ x: (b, h, w, i, j)
31
+ """
32
+ out1 = utils.einshape('bhwij->b(ij)hw', x)
33
+ out2 = utils.einshape('bhwij->b(hw)ij', x)
34
+
35
+ for i in range(len(self.out_channels)):
36
+ out1 = self.conv[i](out1)
37
+
38
+ for i in range(len(self.out_channels)):
39
+ out2 = self.conv[i](out2)
40
+
41
+ out1 = torch.mean(out1, dim=(2, 3)) # (b, out_channels[-1])
42
+ out2 = torch.mean(out2, dim=(2, 3)) # (b, out_channels[-1])
43
+
44
+ return torch.cat([out1, out2], dim=-1) # (b, 2*out_channels[-1])
45
+
locotrack_pytorch/models/locotrack_model.py ADDED
@@ -0,0 +1,1053 @@
1
+ # Copyright 2024 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """TAPIR models definition."""
17
+
18
+ import functools
19
+ from typing import Any, List, Mapping, NamedTuple, Optional, Sequence, Tuple
20
+
21
+ import torch
22
+ from torch import nn
23
+ import torch.nn.functional as F
24
+ import numpy as np
25
+
26
+ from models import nets, utils
27
+ from models.cmdtop import CMDTop
28
+
29
+
30
+ def posenc(x, min_deg, max_deg, legacy_posenc_order=False):
31
+ """Cat x with a positional encoding of x with scales 2^[min_deg, max_deg-1].
32
+
33
+ Instead of computing [sin(x), cos(x)], we use the trig identity
34
+ cos(x) = sin(x + pi/2) and do one vectorized call to sin([x, x+pi/2]).
35
+
36
+ Args:
37
+ x: torch.Tensor, variables to be encoded. Note that x should be in [-pi, pi].
38
+ min_deg: int, the minimum (inclusive) degree of the encoding.
39
+ max_deg: int, the maximum (exclusive) degree of the encoding.
40
+ legacy_posenc_order: bool, keep the same ordering as the original tf code.
41
+
42
+ Returns:
43
+ encoded: torch.Tensor, encoded variables.
44
+ """
45
+ if min_deg == max_deg:
46
+ return x
47
+ scales = torch.tensor([2**i for i in range(min_deg, max_deg)], dtype=x.dtype, device=x.device)
48
+ if legacy_posenc_order:
49
+ xb = x[..., None, :] * scales[:, None]
50
+ four_feat = torch.reshape(
51
+ torch.sin(torch.stack([xb, xb + 0.5 * np.pi], dim=-2)),
52
+ list(x.shape[:-1]) + [-1]
53
+ )
54
+ else:
55
+ xb = torch.reshape((x[..., None, :] * scales[:, None]), list(x.shape[:-1]) + [-1])
56
+ four_feat = torch.sin(torch.cat([xb, xb + 0.5 * np.pi], dim=-1))
57
+ return torch.cat([x] + [four_feat], dim=-1)
58
+
59
+
60
+ def get_relative_positions(seq_len, reverse=False):
61
+ x = torch.arange(seq_len)[None, :]
62
+ y = torch.arange(seq_len)[:, None]
63
+ return torch.tril(x - y) if not reverse else torch.triu(y - x)
64
+
65
+
66
+ def get_alibi_slope(num_heads):
67
+ x = (24) ** (1 / num_heads)
68
+ return torch.tensor([1 / x ** (i + 1) for i in range(num_heads)], dtype=torch.float32).view(-1, 1, 1)
69
+
70
+
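The two helpers above build the pieces of an ALiBi-style attention bias: signed token distances restricted to one direction, and one decaying slope per head (MultiHeadAttention below uses half of the heads for the forward direction and half for the backward one). A standalone illustration of the values they produce for seq_len = 4 and num_heads = 2:

import torch

seq_len, num_heads = 4, 2
x = torch.arange(seq_len)[None, :]
y = torch.arange(seq_len)[:, None]
forward = torch.tril(x - y)    # zero on/above the diagonal, -1, -2, ... below it
backward = torch.triu(y - x)   # mirrored distances for the reverse direction

base = 24 ** (1 / num_heads)
slopes = torch.tensor([1 / base ** (i + 1) for i in range(num_heads)]).view(-1, 1, 1)

print(forward)
print(slopes * forward)        # one additive (seq_len x seq_len) bias per head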
71
+ class MultiHeadAttention(nn.Module):
72
+ """Multi-headed attention (MHA) module."""
73
+
74
+ def __init__(self, num_heads, key_size, w_init_scale=None, w_init=None, with_bias=True, b_init=None, value_size=None, model_size=None):
75
+ super(MultiHeadAttention, self).__init__()
76
+ self.num_heads = num_heads
77
+ self.key_size = key_size
78
+ self.value_size = value_size or key_size
79
+ self.model_size = model_size or key_size * num_heads
80
+
81
+ self.with_bias = with_bias
82
+
83
+ self.query_proj = nn.Linear(num_heads * key_size, num_heads * key_size, bias=with_bias)
84
+ self.key_proj = nn.Linear(num_heads * key_size, num_heads * key_size, bias=with_bias)
85
+ self.value_proj = nn.Linear(num_heads * self.value_size, num_heads * self.value_size, bias=with_bias)
86
+ self.final_proj = nn.Linear(num_heads * self.value_size, self.model_size, bias=with_bias)
87
+
88
+ def forward(self, query, key, value, mask=None):
89
+ batch_size, sequence_length, _ = query.size()
90
+
91
+ query_heads = self._linear_projection(query, self.key_size, self.query_proj) # [T', H, Q=K]
92
+ key_heads = self._linear_projection(key, self.key_size, self.key_proj) # [T, H, K]
93
+ value_heads = self._linear_projection(value, self.value_size, self.value_proj) # [T, H, V]
94
+
95
+ bias_forward = get_alibi_slope(self.num_heads // 2) * get_relative_positions(sequence_length)
96
+ bias_forward = bias_forward + torch.triu(torch.full_like(bias_forward, -1e9), diagonal=1)
97
+ bias_backward = get_alibi_slope(self.num_heads // 2) * get_relative_positions(sequence_length, reverse=True)
98
+ bias_backward = bias_backward + torch.tril(torch.full_like(bias_backward, -1e9), diagonal=-1)
99
+ attn_bias = torch.cat([bias_forward, bias_backward], dim=0).to(query.device)
100
+
101
+ attn_logits = torch.einsum("...thd,...Thd->...htT", query_heads, key_heads)
102
+ attn_logits = attn_logits / np.sqrt(self.key_size) + attn_bias
103
+
104
+ if mask is not None:
105
+ if mask.ndim != attn_logits.ndim:
106
+ raise ValueError(f"Mask dimensionality {mask.ndim} must match logits dimensionality {attn_logits.ndim}.")
107
+ attn_logits = torch.where(mask, attn_logits, torch.tensor(-1e30))
108
+
109
+ attn_weights = F.softmax(attn_logits, dim=-1) # [H, T', T]
110
+
111
+ attn = torch.einsum("...htT,...Thd->...thd", attn_weights, value_heads)
112
+ attn = attn.reshape(batch_size, sequence_length, -1) # [T', H*V]
113
+
114
+ return self.final_proj(attn) # [T', D']
115
+
116
+ def _linear_projection(self, x, head_size, proj_layer):
117
+ y = proj_layer(x)
118
+ *leading_dims, _ = x.shape
119
+ return y.reshape((*leading_dims, self.num_heads, head_size))
120
+
121
+
122
+ class Transformer(nn.Module):
123
+ """A transformer stack."""
124
+
125
+ def __init__(self, num_heads, num_layers, attn_size, dropout_rate, widening_factor=4):
126
+ super(Transformer, self).__init__()
127
+ self.num_heads = num_heads
128
+ self.num_layers = num_layers
129
+ self.attn_size = attn_size
130
+ self.dropout_rate = dropout_rate
131
+ self.widening_factor = widening_factor
132
+
133
+ self.layers = nn.ModuleList([
134
+ nn.ModuleDict({
135
+ 'attn': MultiHeadAttention(num_heads, attn_size, model_size=attn_size * num_heads),
136
+ 'dense': nn.Sequential(
137
+ nn.Linear(attn_size * num_heads, widening_factor * attn_size * num_heads),
138
+ nn.GELU(),
139
+ nn.Linear(widening_factor * attn_size * num_heads, attn_size * num_heads)
140
+ ),
141
+ 'layer_norm1': nn.LayerNorm(attn_size * num_heads),
142
+ 'layer_norm2': nn.LayerNorm(attn_size * num_heads)
143
+ })
144
+ for _ in range(num_layers)
145
+ ])
146
+
147
+ self.ln_out = nn.LayerNorm(attn_size * num_heads)
148
+
149
+ def forward(self, embeddings, mask=None):
150
+ h = embeddings
151
+ for layer in self.layers:
152
+ h_norm = layer['layer_norm1'](h)
153
+ h_attn = layer['attn'](h_norm, h_norm, h_norm, mask=mask)
154
+ h_attn = F.dropout(h_attn, p=self.dropout_rate, training=self.training)
155
+ h = h + h_attn
156
+
157
+ h_norm = layer['layer_norm2'](h)
158
+ h_dense = layer['dense'](h_norm)
159
+ h_dense = F.dropout(h_dense, p=self.dropout_rate, training=self.training)
160
+ h = h + h_dense
161
+
162
+ return self.ln_out(h)
163
+
164
+
165
+ class PIPSTransformer(nn.Module):
166
+ def __init__(self, input_channels, output_channels, dim=512, num_heads=8, num_layers=1):
167
+ super(PIPSTransformer, self).__init__()
168
+ self.dim = dim
169
+
170
+ self.transformer = Transformer(
171
+ num_heads=num_heads,
172
+ num_layers=num_layers,
173
+ attn_size=dim // num_heads,
174
+ dropout_rate=0.,
175
+ widening_factor=4,
176
+ )
177
+ self.input_proj = nn.Linear(input_channels, dim)
178
+ self.output_proj = nn.Linear(dim, output_channels)
179
+
180
+ def forward(self, x):
181
+ x = self.input_proj(x)
182
+ x = self.transformer(x, mask=None)
183
+ return self.output_proj(x)
184
+
185
+
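PIPSTransformer wraps the bidirectional-ALiBi Transformer with input and output projections; in LocoTrack it is instantiated with 854 input channels and 4 + 128 + 256 output channels (see the LocoTrack constructor below). A hypothetical shape check, assuming the repository root is importable; the (batch, frames, channels) token layout is an assumption made only for illustration.

import torch

from models.locotrack_model import PIPSTransformer

mixer = PIPSTransformer(
    input_channels=854,
    output_channels=4 + 128 + 256,
    dim=384, num_heads=6, num_layers=3,   # 'base' model_params from LocoTrack.__init__
)
tokens = torch.randn(2, 24, 854)          # assumed layout: (batch of tracks, frames, channels)
out = mixer(tokens)
print(out.shape)                          # torch.Size([2, 24, 388])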
186
+ class FeatureGrids(NamedTuple):
187
+ """Feature grids for a video, used to compute trajectories.
188
+
189
+ These are per-frame outputs of the encoding resnet.
190
+
191
+ Attributes:
192
+ lowres: Low-resolution features, one for each resolution; 256 channels.
193
+ hires: High-resolution features, one for each resolution; 64 channels.
194
+ resolutions: Resolutions used for trajectory computation. There will be one
195
+ entry for the initialization, and then an entry for each PIPs refinement
196
+ resolution.
197
+ """
198
+
199
+ lowres: Sequence[torch.Tensor]
200
+ hires: Sequence[torch.Tensor]
201
+ highest: Sequence[torch.Tensor]
202
+ resolutions: Sequence[Tuple[int, int]]
203
+
204
+
205
+ class QueryFeatures(NamedTuple):
206
+ """Query features used to compute trajectories.
207
+
208
+ These are sampled from the query frames and are a full descriptor of the
209
+ tracked points. They can be acquired from a query image and then reused in a
210
+ separate video.
211
+
212
+ Attributes:
213
+ lowres: Low-resolution features, one for each resolution; each has shape
214
+ [batch, num_query_points, 256]
215
+ hires: High-resolution features, one for each resolution; each has shape
216
+ [batch, num_query_points, 64]
217
+ resolutions: Resolutions used for trajectory computation. There will be one
218
+ entry for the initialization, and then an entry for each PIPs refinement
219
+ resolution.
220
+ """
221
+
222
+ lowres: Sequence[torch.Tensor]
223
+ hires: Sequence[torch.Tensor]
224
+ highest: Sequence[torch.Tensor]
225
+ lowres_supp: Sequence[torch.Tensor]
226
+ hires_supp: Sequence[torch.Tensor]
227
+ highest_supp: Sequence[torch.Tensor]
228
+ resolutions: Sequence[Tuple[int, int]]
229
+
230
+
231
+ class LocoTrack(nn.Module):
232
+ """TAPIR model."""
233
+
234
+ def __init__(
235
+ self,
236
+ bilinear_interp_with_depthwise_conv: bool = False,
237
+ num_pips_iter: int = 4,
238
+ pyramid_level: int = 0,
239
+ mixer_hidden_dim: int = 512,
240
+ num_mixer_blocks: int = 12,
241
+ mixer_kernel_shape: int = 3,
242
+ patch_size: int = 7,
243
+ softmax_temperature: float = 20.0,
244
+ parallelize_query_extraction: bool = False,
245
+ initial_resolution: Tuple[int, int] = (256, 256),
246
+ blocks_per_group: Sequence[int] = (2, 2, 2, 2),
247
+ feature_extractor_chunk_size: int = 256,
248
+ extra_convs: bool = False,
249
+ use_casual_conv: bool = False,
250
+ model_size: str = 'base',
251
+ ):
252
+ super().__init__()
253
+
254
+ if model_size == 'small':
255
+ model_params = {
256
+ 'dim': 256,
257
+ 'num_heads': 4,
258
+ 'num_layers': 3,
259
+ }
260
+ cmdtop_params = {
261
+ 'in_channel': 49,
262
+ 'out_channels': (64, 128),
263
+ 'kernel_shapes': (5, 2),
264
+ 'strides': (4, 2),
265
+ }
266
+ elif model_size == 'base':
267
+ model_params = {
268
+ 'dim': 384,
269
+ 'num_heads': 6,
270
+ 'num_layers': 3,
271
+ }
272
+ cmdtop_params = {
273
+ 'in_channel': 49,
274
+ 'out_channels': (64, 128, 128),
275
+ 'kernel_shapes': (3, 3, 2),
276
+ 'strides': (2, 2, 2),
277
+ }
278
+ else:
279
+ raise ValueError(f"Unknown model size '{model_size}'")
280
+
281
+ self.highres_dim = 128
282
+ self.lowres_dim = 256
283
+ self.bilinear_interp_with_depthwise_conv = (
284
+ bilinear_interp_with_depthwise_conv
285
+ )
286
+ self.parallelize_query_extraction = parallelize_query_extraction
287
+
288
+ self.num_pips_iter = num_pips_iter
289
+ self.pyramid_level = pyramid_level
290
+ self.patch_size = patch_size
291
+ self.softmax_temperature = softmax_temperature
292
+ self.initial_resolution = tuple(initial_resolution)
293
+ self.feature_extractor_chunk_size = feature_extractor_chunk_size
294
+ self.num_mixer_blocks = num_mixer_blocks
295
+ self.use_casual_conv = use_casual_conv
296
+
297
+ highres_dim = 128
298
+ lowres_dim = 256
299
+ strides = (1, 2, 2, 1)
300
+ blocks_per_group = (2, 2, 2, 2)
301
+ channels_per_group = (64, highres_dim, 256, lowres_dim)
302
+ use_projection = (True, True, True, True)
303
+
304
+ self.resnet_torch = nets.ResNet(
305
+ blocks_per_group=blocks_per_group,
306
+ channels_per_group=channels_per_group,
307
+ use_projection=use_projection,
308
+ strides=strides,
309
+ )
310
+
311
+ self.torch_pips_mixer = PIPSTransformer(
312
+ input_channels=854,
313
+ output_channels=4 + self.highres_dim + self.lowres_dim,
314
+ **model_params
315
+ )
316
+
317
+ self.cmdtop = nn.ModuleList([
318
+ CMDTop(
319
+ **cmdtop_params
320
+ ) for _ in range(3)
321
+ ])
322
+
323
+ self.cost_conv = utils.Conv2dSamePadding(2, 1, 3, 1)
324
+ self.occ_linear = nn.Linear(6, 2)
325
+
326
+ if extra_convs:
327
+ self.extra_convs = nets.ExtraConvs()
328
+ else:
329
+ self.extra_convs = None
330
+
331
+ def forward(
332
+ self,
333
+ video: torch.Tensor,
334
+ query_points: torch.Tensor,
335
+ feature_grids: Optional[FeatureGrids] = None,
336
+ is_training: bool = False,
337
+ query_chunk_size: Optional[int] = 64,
338
+ get_query_feats: bool = False,
339
+ refinement_resolutions: Optional[List[Tuple[int, int]]] = None,
340
+ ) -> Mapping[str, torch.Tensor]:
341
+ """Runs a forward pass of the model.
342
+
343
+ Args:
344
+ video: A 5-D tensor representing a batch of sequences of images.
345
+ query_points: The query points for which we compute tracks.
346
+ is_training: Whether we are training.
347
+ query_chunk_size: When computing cost volumes, break the queries into
348
+ chunks of this size to save memory.
349
+ get_query_feats: Return query features for other losses like contrastive.
350
+ Not supported in the current version.
351
+ refinement_resolutions: A list of (height, width) tuples. Refinement will
352
+ be repeated at each specified resolution, in order to achieve high
353
+ accuracy on resolutions higher than what TAPIR was trained on. If None,
354
+ reasonable refinement resolutions will be inferred from the input video
355
+ size.
356
+
357
+ Returns:
358
+ A dict of outputs, including:
359
+ occlusion: Occlusion logits, of shape [batch, num_queries, num_frames]
360
+ where higher indicates more likely to be occluded.
361
+ tracks: predicted point locations, of shape
362
+ [batch, num_queries, num_frames, 2], where each point is [x, y]
363
+ in raster coordinates
364
+ expected_dist: uncertainty estimate logits, of shape
365
+ [batch, num_queries, num_frames], where higher indicates more likely
366
+ to be far from the correct answer.
367
+ """
368
+ if get_query_feats:
369
+ raise ValueError('Get query feats not supported in TAPIR.')
370
+
371
+ if feature_grids is None:
372
+ feature_grids = self.get_feature_grids(
373
+ video,
374
+ is_training,
375
+ refinement_resolutions,
376
+ )
377
+
378
+ query_features = self.get_query_features(
379
+ video,
380
+ is_training,
381
+ query_points,
382
+ feature_grids,
383
+ refinement_resolutions,
384
+ )
385
+
386
+ trajectories = self.estimate_trajectories(
387
+ video.shape[-3:-1],
388
+ is_training,
389
+ feature_grids,
390
+ query_features,
391
+ query_points,
392
+ query_chunk_size,
393
+ )
394
+
395
+ p = self.num_pips_iter
396
+ out = dict(
397
+ occlusion=torch.mean(
398
+ torch.stack(trajectories['occlusion'][p::p]), dim=0
399
+ ),
400
+ tracks=torch.mean(torch.stack(trajectories['tracks'][p::p]), dim=0),
401
+ expected_dist=torch.mean(
402
+ torch.stack(trajectories['expected_dist'][p::p]), dim=0
403
+ ),
404
+ unrefined_occlusion=trajectories['occlusion'][:-1],
405
+ unrefined_tracks=trajectories['tracks'][:-1],
406
+ unrefined_expected_dist=trajectories['expected_dist'][:-1],
407
+ )
408
+
409
+ return out
410
+
411
+ def get_query_features(
412
+ self,
413
+ video: torch.Tensor,
414
+ is_training: bool,
415
+ query_points: torch.Tensor,
416
+ feature_grids: Optional[FeatureGrids] = None,
417
+ refinement_resolutions: Optional[List[Tuple[int, int]]] = None,
418
+ ) -> QueryFeatures:
419
+ """Computes query features, which can be used for estimate_trajectories.
420
+
421
+ Args:
422
+ video: A 5-D tensor representing a batch of sequences of images.
423
+ is_training: Whether we are training.
424
+ query_points: The query points for which we compute tracks.
425
+ feature_grids: If passed, we'll use these feature grids rather than
426
+ computing new ones.
427
+ refinement_resolutions: A list of (height, width) tuples. Refinement will
428
+ be repeated at each specified resolution, in order to achieve high
429
+ accuracy on resolutions higher than what TAPIR was trained on. If None,
430
+ reasonable refinement resolutions will be inferred from the input video
431
+ size.
432
+
433
+ Returns:
434
+ A QueryFeatures object which contains the required features for every
435
+ required resolution.
436
+ """
437
+
438
+ if feature_grids is None:
439
+ feature_grids = self.get_feature_grids(
440
+ video,
441
+ is_training=is_training,
442
+ refinement_resolutions=refinement_resolutions,
443
+ )
444
+
445
+ feature_grid = feature_grids.lowres
446
+ hires_feats = feature_grids.hires
447
+ highest_feats = feature_grids.highest
448
+ resize_im_shape = feature_grids.resolutions
449
+
450
+ shape = video.shape
451
+ # shape is [batch_size, time, height, width, channels]; conversion needs
452
+ # [time, width, height]
453
+ curr_resolution = (-1, -1)
454
+ query_feats = []
455
+ hires_query_feats = []
456
+ highest_query_feats = []
457
+ query_supp = []
458
+ hires_query_supp = []
459
+ highest_query_supp = []
460
+ for i, resolution in enumerate(resize_im_shape):
461
+ if utils.is_same_res(curr_resolution, resolution):
462
+ query_feats.append(query_feats[-1])
463
+ hires_query_feats.append(hires_query_feats[-1])
464
+ highest_query_feats.append(highest_query_feats[-1])
465
+ query_supp.append(query_supp[-1])
466
+ hires_query_supp.append(hires_query_supp[-1])
467
+ highest_query_supp.append(highest_query_supp[-1])
468
+ continue
469
+ position_in_grid = utils.convert_grid_coordinates(
470
+ query_points,
471
+ shape[1:4],
472
+ feature_grid[i].shape[1:4],
473
+ coordinate_format='tyx',
474
+ )
475
+ position_in_grid_hires = utils.convert_grid_coordinates(
476
+ query_points,
477
+ shape[1:4],
478
+ hires_feats[i].shape[1:4],
479
+ coordinate_format='tyx',
480
+ )
481
+ position_in_grid_highest = utils.convert_grid_coordinates(
482
+ query_points,
483
+ shape[1:4],
484
+ highest_feats[i].shape[1:4],
485
+ coordinate_format='tyx',
486
+ )
487
+
488
+ support_size = 7
489
+ ctxx, ctxy = torch.meshgrid(
490
+ torch.arange(-(support_size // 2), support_size // 2 + 1),
491
+ torch.arange(-(support_size // 2), support_size // 2 + 1),
492
+ indexing='xy',
493
+ )
494
+ ctx = torch.stack([torch.zeros_like(ctxy), ctxy, ctxx], axis=-1)
495
+ ctx = torch.reshape(ctx, [-1, 3]).to(video.device) # s*s 3
496
+
497
+ position_support = position_in_grid[..., None, :] + ctx[None, None, ...] # b n s*s 3
498
+ position_support = utils.einshape('bnsc->b(ns)c', position_support)
499
+ interp_supp = utils.map_coordinates_3d(
500
+ feature_grid[i], position_support
501
+ )
502
+ interp_supp = utils.einshape('b(nhw)c->bnhwc', interp_supp, h=support_size, w=support_size)
503
+
504
+ position_support_hires = position_in_grid_hires[..., None, :] + ctx[None, None, ...]
505
+ position_support_hires = utils.einshape('bnsc->b(ns)c', position_support_hires)
506
+ hires_interp_supp = utils.map_coordinates_3d(
507
+ hires_feats[i], position_support_hires
508
+ )
509
+ hires_interp_supp = utils.einshape('b(nhw)c->bnhwc', hires_interp_supp, h=support_size, w=support_size)
510
+
511
+ position_support_highest = position_in_grid_highest[..., None, :] + ctx[None, None, ...]
512
+ position_support_highest = utils.einshape('bnsc->b(ns)c', position_support_highest)
513
+ highest_interp_supp = utils.map_coordinates_3d(
514
+ highest_feats[i], position_support_highest
515
+ )
516
+ highest_interp_supp = utils.einshape('b(nhw)c->bnhwc', highest_interp_supp, h=support_size, w=support_size)
517
+
518
+ interp_features = interp_supp[..., support_size // 2, support_size // 2, :]
519
+ hires_interp = hires_interp_supp[..., support_size // 2, support_size // 2, :]
520
+ highest_interp = highest_interp_supp[..., support_size // 2, support_size // 2, :]
521
+
522
+ hires_query_feats.append(hires_interp)
523
+ query_feats.append(interp_features)
524
+ highest_query_feats.append(highest_interp)
525
+ query_supp.append(interp_supp)
526
+ hires_query_supp.append(hires_interp_supp)
527
+ highest_query_supp.append(highest_interp_supp)
528
+
529
+ return QueryFeatures(
530
+ tuple(query_feats), tuple(hires_query_feats), tuple(highest_query_feats),
531
+ tuple(query_supp), tuple(hires_query_supp), tuple(highest_query_supp), tuple(resize_im_shape),
532
+ )
533
+
534
+ def get_feature_grids(
535
+ self,
536
+ video: torch.Tensor,
537
+ is_training: Optional[bool] = False,
538
+ refinement_resolutions: Optional[List[Tuple[int, int]]] = None,
539
+ ) -> FeatureGrids:
540
+ """Computes feature grids.
541
+
542
+ Args:
543
+ video: A 5-D tensor representing a batch of sequences of images.
544
+ is_training: Whether we are training.
545
+ refinement_resolutions: A list of (height, width) tuples. Refinement will
546
+ be repeated at each specified resolution, to achieve high accuracy on
547
+ resolutions higher than what TAPIR was trained on. If None, reasonable
548
+ refinement resolutions will be inferred from the input video size.
549
+
550
+ Returns:
551
+ A FeatureGrids object containing the required features for every
552
+ required resolution. Note that there will be one more feature grid
553
+ than there are refinement_resolutions, because there is always a
554
+ feature grid computed for TAP-Net initialization.
555
+ """
556
+ del is_training
557
+ if refinement_resolutions is None:
558
+ refinement_resolutions = utils.generate_default_resolutions(
559
+ video.shape[2:4], self.initial_resolution
560
+ )
561
+
562
+ all_required_resolutions = [self.initial_resolution]
563
+ all_required_resolutions.extend(refinement_resolutions)
564
+
565
+ feature_grid = []
566
+ hires_feats = []
567
+ highest_feats = []
568
+ resize_im_shape = []
569
+ curr_resolution = (-1, -1)
570
+
571
+ latent = None
572
+ hires = None
573
+ video_resize = None
574
+ for resolution in all_required_resolutions:
575
+ if resolution[0] % 8 != 0 or resolution[1] % 8 != 0:
576
+ raise ValueError('Image resolution must be a multiple of 8.')
577
+
578
+ if not utils.is_same_res(curr_resolution, resolution):
579
+ if utils.is_same_res(curr_resolution, video.shape[-3:-1]):
580
+ video_resize = video
581
+ else:
582
+ video_resize = utils.bilinear(video, resolution)
583
+
584
+ curr_resolution = resolution
585
+ n, f, h, w, c = video_resize.shape
586
+ video_resize = video_resize.view(n*f, h, w, c).permute(0, 3, 1, 2)
587
+
588
+ if self.feature_extractor_chunk_size > 0:
589
+ latent_list = []
590
+ hires_list = []
591
+ highest_list = []
592
+ chunk_size = self.feature_extractor_chunk_size
593
+ for start_idx in range(0, video_resize.shape[0], chunk_size):
594
+ video_chunk = video_resize[start_idx:start_idx + chunk_size]
595
+ resnet_out = self.resnet_torch(video_chunk)
596
+
597
+ u3 = resnet_out['resnet_unit_3'].permute(0, 2, 3, 1)
598
+ latent_list.append(u3)
599
+ u1 = resnet_out['resnet_unit_1'].permute(0, 2, 3, 1)
600
+ hires_list.append(u1)
601
+ u0 = resnet_out['resnet_unit_0'].permute(0, 2, 3, 1)
602
+ highest_list.append(u0)
603
+
604
+ latent = torch.cat(latent_list, dim=0)
605
+ hires = torch.cat(hires_list, dim=0)
606
+ highest = torch.cat(highest_list, dim=0)
607
+
608
+ else:
609
+ resnet_out = self.resnet_torch(video_resize)
610
+ latent = resnet_out['resnet_unit_3'].permute(0, 2, 3, 1)
611
+ hires = resnet_out['resnet_unit_1'].permute(0, 2, 3, 1)
612
+ highest = resnet_out['resnet_unit_0'].permute(0, 2, 3, 1)
613
+
614
+ if self.extra_convs:
615
+ latent = self.extra_convs(latent)
616
+
617
+ latent = latent / torch.sqrt(
618
+ torch.maximum(
619
+ torch.sum(torch.square(latent), axis=-1, keepdims=True),
620
+ torch.tensor(1e-12, device=latent.device),
621
+ )
622
+ )
623
+ hires = hires / torch.sqrt(
624
+ torch.maximum(
625
+ torch.sum(torch.square(hires), axis=-1, keepdims=True),
626
+ torch.tensor(1e-12, device=hires.device),
627
+ )
628
+ )
629
+ highest = highest / torch.sqrt(
630
+ torch.maximum(
631
+ torch.sum(torch.square(highest), axis=-1, keepdims=True),
632
+ torch.tensor(1e-12, device=highest.device),
633
+ )
634
+ )
635
+
636
+ latent = latent.view(n, f, *latent.shape[1:])
637
+ hires = hires.view(n, f, *hires.shape[1:])
638
+ highest = highest.view(n, f, *highest.shape[1:])
639
+
640
+ feature_grid.append(latent)
641
+ hires_feats.append(hires)
642
+ highest_feats.append(highest)
643
+ resize_im_shape.append(video_resize.shape[2:4])
644
+
645
+ return FeatureGrids(
646
+ tuple(feature_grid), tuple(hires_feats), tuple(highest_feats), tuple(resize_im_shape)
647
+ )
648
+
649
+ def estimate_trajectories(
650
+ self,
651
+ video_size: Tuple[int, int],
652
+ is_training: bool,
653
+ feature_grids: FeatureGrids,
654
+ query_features: QueryFeatures,
655
+ query_points_in_video: Optional[torch.Tensor],
656
+ query_chunk_size: Optional[int] = None,
657
+ causal_context: Optional[dict[str, torch.Tensor]] = None,
658
+ get_causal_context: bool = False,
659
+ ) -> Mapping[str, Any]:
660
+ """Estimates trajectories given features for a video and query features.
661
+
662
+ Args:
663
+ video_size: A 2-tuple containing the original [height, width] of the
664
+ video. Predictions will be scaled with respect to this resolution.
665
+ is_training: Whether we are training.
666
+ feature_grids: a FeatureGrids object computed for the given video.
667
+ query_features: a QueryFeatures object computed for the query points.
668
+ query_points_in_video: If provided, assume that the query points come from
669
+ the same video as feature_grids, and therefore constrain the resulting
670
+ trajectories to (approximately) pass through them.
671
+ query_chunk_size: When computing cost volumes, break the queries into
672
+ chunks of this size to save memory.
673
+ causal_context: If provided, a dict of causal context to use for
674
+ refinement.
675
+ get_causal_context: If True, return causal context in the output.
676
+
677
+ Returns:
678
+ A dict of outputs, including:
679
+ occlusion: Occlusion logits, of shape [batch, num_queries, num_frames]
680
+ where higher indicates more likely to be occluded.
681
+ tracks: predicted point locations, of shape
682
+ [batch, num_queries, num_frames, 2], where each point is [x, y]
683
+ in raster coordinates
684
+ expected_dist: uncertainty estimate logits, of shape
685
+ [batch, num_queries, num_frames], where higher indicates more likely
686
+ to be far from the correct answer.
687
+ """
688
+ del is_training
689
+
690
+ def train2orig(x):
691
+ return utils.convert_grid_coordinates(
692
+ x,
693
+ self.initial_resolution[::-1],
694
+ video_size[::-1],
695
+ coordinate_format='xy',
696
+ )
697
+
698
+ occ_iters = []
699
+ pts_iters = []
700
+ expd_iters = []
701
+ new_causal_context = []
702
+ num_iters = self.num_pips_iter * (len(feature_grids.lowres) - 1)
703
+ for _ in range(num_iters + 1):
704
+ occ_iters.append([])
705
+ pts_iters.append([])
706
+ expd_iters.append([])
707
+ new_causal_context.append([])
708
+ del new_causal_context[-1]
709
+
710
+ infer = functools.partial(
711
+ self.tracks_from_cost_volume,
712
+ im_shp=feature_grids.lowres[0].shape[0:2]
713
+ + self.initial_resolution
714
+ + (3,),
715
+ )
716
+
717
+ num_queries = query_features.lowres[0].shape[1]
718
+ if causal_context is None:
719
+ perm = torch.randperm(num_queries)
720
+ else:
721
+ perm = torch.arange(num_queries)
722
+
723
+ inv_perm = torch.zeros_like(perm)
724
+ inv_perm[perm] = torch.arange(num_queries)
725
+
726
+ for ch in range(0, num_queries, query_chunk_size):
727
+ perm_chunk = perm[ch : ch + query_chunk_size]
728
+ chunk = query_features.lowres[0][:, perm_chunk]
729
+ chunk_hires = query_features.hires[0][:, perm_chunk]
730
+
731
+ cc_chunk = []
732
+ if causal_context is not None:
733
+ for d in range(len(causal_context)):
734
+ tmp_dict = {}
735
+ for k, v in causal_context[d].items():
736
+ tmp_dict[k] = v[:, perm_chunk]
737
+ cc_chunk.append(tmp_dict)
738
+
739
+ if query_points_in_video is not None:
740
+ infer_query_points = query_points_in_video[
741
+ :, perm[ch : ch + query_chunk_size]
742
+ ]
743
+ num_frames = feature_grids.lowres[0].shape[1]
744
+ infer_query_points = utils.convert_grid_coordinates(
745
+ infer_query_points,
746
+ (num_frames,) + video_size,
747
+ (num_frames,) + self.initial_resolution,
748
+ coordinate_format='tyx',
749
+ )
750
+ else:
751
+ infer_query_points = None
752
+
753
+ points, occlusion, expected_dist, cost_volume = infer(
754
+ chunk,
755
+ chunk_hires,
756
+ feature_grids.lowres[0],
757
+ feature_grids.hires[0],
758
+ infer_query_points,
759
+ )
760
+ pts_iters[0].append(train2orig(points))
761
+ occ_iters[0].append(occlusion)
762
+ expd_iters[0].append(expected_dist)
763
+
764
+ mixer_feats = None
765
+ for i in range(num_iters):
766
+ feature_level = i // self.num_pips_iter + 1
767
+ queries = [
768
+ query_features.hires[feature_level][:, perm_chunk],
769
+ query_features.lowres[feature_level][:, perm_chunk],
770
+ query_features.highest[feature_level][:, perm_chunk],
771
+ ]
772
+ supports = [
773
+ query_features.hires_supp[feature_level][:, perm_chunk],
774
+ query_features.lowres_supp[feature_level][:, perm_chunk],
775
+ query_features.highest_supp[feature_level][:, perm_chunk],
776
+ ]
777
+ for _ in range(self.pyramid_level):
778
+ queries.append(queries[-1])
779
+ pyramid = [
780
+ feature_grids.hires[feature_level],
781
+ feature_grids.lowres[feature_level],
782
+ feature_grids.highest[feature_level],
783
+ ]
784
+ for _ in range(self.pyramid_level):
785
+ pyramid.append(
786
+ F.avg_pool3d(
787
+ pyramid[-1],
788
+ kernel_size=(2, 2, 1),
789
+ stride=(2, 2, 1),
790
+ padding=0,
791
+ )
792
+ )
793
+ cc = cc_chunk[i] if causal_context is not None else None
794
+ refined = self.refine_pips(
795
+ queries,
796
+ supports,
797
+ None,
798
+ pyramid,
799
+ points.detach(),
800
+ occlusion.detach(),
801
+ expected_dist.detach(),
802
+ orig_hw=self.initial_resolution,
803
+ last_iter=mixer_feats,
804
+ mixer_iter=i,
805
+ resize_hw=feature_grids.resolutions[feature_level],
806
+ causal_context=cc,
807
+ get_causal_context=get_causal_context,
808
+ cost_volume=cost_volume
809
+ )
810
+ points, occlusion, expected_dist, mixer_feats, new_causal = refined
811
+ pts_iters[i + 1].append(train2orig(points))
812
+ occ_iters[i + 1].append(occlusion)
813
+ expd_iters[i + 1].append(expected_dist)
814
+ new_causal_context[i].append(new_causal)
815
+
816
+ if (i + 1) % self.num_pips_iter == 0:
817
+ mixer_feats = None
818
+ expected_dist = expd_iters[0][-1]
819
+ occlusion = occ_iters[0][-1]
820
+
821
+ occlusion = []
822
+ points = []
823
+ expd = []
824
+ for i, _ in enumerate(occ_iters):
825
+ occlusion.append(torch.cat(occ_iters[i], dim=1)[:, inv_perm])
826
+ points.append(torch.cat(pts_iters[i], dim=1)[:, inv_perm])
827
+ expd.append(torch.cat(expd_iters[i], dim=1)[:, inv_perm])
828
+
829
+ out = dict(
830
+ occlusion=occlusion,
831
+ tracks=points,
832
+ expected_dist=expd,
833
+ )
834
+ return out
835
+
836
+ def refine_pips(
837
+ self,
838
+ target_feature,
839
+ support_feature,
840
+ frame_features,
841
+ pyramid,
842
+ pos_guess,
843
+ occ_guess,
844
+ expd_guess,
845
+ orig_hw,
846
+ last_iter=None,
847
+ mixer_iter=0.0,
848
+ resize_hw=None,
849
+ causal_context=None,
850
+ get_causal_context=False,
851
+ cost_volume=None,
852
+ ):
853
+ del frame_features
854
+ del mixer_iter
855
+ orig_h, orig_w = orig_hw
856
+ resized_h, resized_w = resize_hw
857
+ corrs_pyr = []
858
+ assert len(target_feature) == len(pyramid)
859
+ for pyridx, (query, supp, grid) in enumerate(zip(target_feature, support_feature, pyramid)):
860
+ # note: interp needs [y,x]
861
+ coords = utils.convert_grid_coordinates(
862
+ pos_guess, (orig_w, orig_h), grid.shape[-2:-4:-1]
863
+ )
864
+ coords = torch.flip(coords, dims=(-1,))
865
+
866
+ support_size = 7
867
+ ctxx, ctxy = torch.meshgrid(
868
+ torch.arange(-(support_size // 2), support_size // 2 + 1),
869
+ torch.arange(-(support_size // 2), support_size // 2 + 1),
870
+ indexing='xy',
871
+ )
872
+ ctx = torch.stack([ctxy, ctxx], dim=-1)
873
+ ctx = ctx.reshape(-1, 2).to(coords.device)
874
+ coords2 = coords.unsqueeze(3) + ctx.unsqueeze(0).unsqueeze(0).unsqueeze(0)
875
+ neighborhood = utils.map_coordinates_2d(grid, coords2)
876
+
877
+ neighborhood = utils.einshape('bnt(hw)c->bnthwc', neighborhood, h=support_size, w=support_size)
878
+ patches_input = torch.einsum('bnthwc,bnijc->bnthwij', neighborhood, supp)
879
+ patches_input = utils.einshape('bnthwij->(bnt)hwij', patches_input)
880
+ patches_emb = self.cmdtop[pyridx](patches_input)
881
+ patches = utils.einshape('(bnt)c->bntc', patches_emb, b=neighborhood.shape[0], n=neighborhood.shape[1])
882
+
883
+ corrs_pyr.append(patches)
884
+ corrs_pyr = torch.concatenate(corrs_pyr, dim=-1)
885
+
886
+ corrs_chunked = corrs_pyr
887
+ pos_guess_input = pos_guess
888
+ occ_guess_input = occ_guess[..., None]
889
+ expd_guess_input = expd_guess[..., None]
890
+
891
+ # mlp_input is batch, num_points, num_chunks, frames_per_chunk, channels
892
+ if last_iter is None:
893
+ both_feature = torch.cat([target_feature[0], target_feature[1]], axis=-1)
894
+ mlp_input_features = torch.tile(
895
+ both_feature.unsqueeze(2), (1, 1, corrs_chunked.shape[-2], 1)
896
+ )
897
+ else:
898
+ mlp_input_features = last_iter
899
+
900
+ mlp_input_list = [
901
+ occ_guess_input,
902
+ expd_guess_input,
903
+ corrs_chunked
904
+ ]
905
+
906
+ rel_pos_forward = F.pad(pos_guess_input[..., :-1, :] - pos_guess_input[..., 1:, :], (0, 0, 0, 1))
907
+ rel_pos_backward = F.pad(pos_guess_input[..., 1:, :] - pos_guess_input[..., :-1, :], (0, 0, 1, 0))
908
+ scale = torch.tensor([resized_w / orig_w, resized_h / orig_h]) / torch.tensor([orig_w, orig_h])
909
+ scale = scale.to(pos_guess_input.device)
910
+ rel_pos_forward = rel_pos_forward * scale
911
+ rel_pos_backward = rel_pos_backward * scale
912
+ rel_pos_emb_input = posenc(torch.cat([rel_pos_forward, rel_pos_backward], axis=-1), min_deg=0, max_deg=10) # batch, num_points, num_frames, 84
913
+ mlp_input_list.append(rel_pos_emb_input)
914
+ mlp_input = torch.cat(mlp_input_list, axis=-1)
915
+
916
+ x = utils.einshape('bnfc->(bn)fc', mlp_input)
917
+
918
+ if causal_context is not None:
919
+ for k, v in causal_context.items():
920
+ causal_context[k] = utils.einshape('bn...->(bn)...', v)
921
+ # Note: the mixer returns (features, causal_context); only the features are used here.
+ res, _ = self.torch_pips_mixer(x, causal_context, get_causal_context)
922
+
923
+ res = utils.einshape('(bn)fc->bnfc', res, b=mlp_input.shape[0])
924
+
925
+ pos_update = utils.convert_grid_coordinates(
926
+ res[..., :2],
927
+ (resized_w, resized_h),
928
+ (orig_w, orig_h),
929
+ )
930
+ return (
931
+ pos_update + pos_guess,
932
+ res[..., 2] + occ_guess,
933
+ res[..., 3] + expd_guess,
934
+ res[..., 4:] + (mlp_input_features if last_iter is None else last_iter),
935
+ None,
936
+ )
937
+
938
+ def tracks_from_cost_volume(
939
+ self,
940
+ interp_feature: torch.Tensor,
941
+ interp_feature_hires: torch.Tensor,
942
+ feature_grid: torch.Tensor,
943
+ feature_grid_hires: torch.Tensor,
944
+ query_points: Optional[torch.Tensor],
945
+ im_shp=None,
946
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
947
+ """Converts features into tracks by computing a cost volume.
948
+
949
+ The computed cost volume will have shape
950
+ [batch, num_queries, time, height, width], which can be very
951
+ memory intensive.
952
+
953
+ Args:
954
+ interp_feature: A tensor of features for each query point, of shape
955
+ [batch, num_queries, channels].
+ interp_feature_hires: Same as interp_feature, but computed from the
+ higher-resolution feature grid.
956
+ feature_grid: A tensor of features for the video, of shape [batch, time,
957
+ height, width, channels].
+ feature_grid_hires: Same as feature_grid, but at a higher spatial
+ resolution.
958
+ query_points: When computing tracks, we assume these points are given as
959
+ ground truth and we reproduce them exactly. This is a set of points of
960
+ shape [batch, num_points, 3], where each entry is [t, y, x] in frame/
961
+ raster coordinates.
962
+ im_shp: The shape of the original video, i.e., [batch, num_frames,
963
+ height, width, 3].
964
+
965
+ Returns:
966
+ A 4-tuple of the inferred points (of shape
967
+ [batch, num_points, num_frames, 2] where each point is [x, y]),
968
+ inferred occlusion (of shape [batch, num_points, num_frames], where
969
+ each is a logit and higher means occluded), expected-distance logits
+ of the same shape as the occlusion, and the cost volume (of shape
+ [batch, num_points, num_frames, height, width]).
970
+ """
971
+
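+ # Correlate every query feature with every video feature; the resulting cost volumes are laid out as [time, batch, num_queries, height, width].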
972
+ cost_volume = torch.einsum(
973
+ 'bnc,bthwc->tbnhw',
974
+ interp_feature,
975
+ feature_grid,
976
+ )
977
+ cost_volume_hires = torch.einsum(
978
+ 'bnc,bthwc->tbnhw',
979
+ interp_feature_hires,
980
+ feature_grid_hires,
981
+ )
982
+
983
+ shape = cost_volume.shape
984
+ batch_size, num_points = cost_volume.shape[1:3]
985
+
986
+ interp_cost = utils.einshape('tbnhw->(tbn)1hw', cost_volume)
987
+ interp_cost = F.interpolate(interp_cost, cost_volume_hires.shape[3:], mode='bilinear', align_corners=False)
988
+ # TODO: not sure if this is correct
989
+ interp_cost = utils.einshape('(tbn)1hw->tbnhw', interp_cost, b=batch_size, n=num_points)
990
+ cost_volume_stack = torch.stack(
991
+ [
992
+ # jax.image.resize(cost_volume, cost_volume_hires.shape, method='bilinear'),
993
+ interp_cost,
994
+ cost_volume_hires,
995
+ ], dim=-1
996
+ )
997
+ pos = utils.einshape('tbnhwc->(tbn)chw', cost_volume_stack)
998
+ pos = self.cost_conv(pos)
999
+ pos = utils.einshape('(tbn)1hw->bnthw', pos, b=batch_size, n=num_points)
1000
+
1001
+ pos_sm = pos.reshape(pos.size(0), pos.size(1), pos.size(2), -1)
1002
+ softmaxed = F.softmax(pos_sm * self.softmax_temperature, dim=-1)
1003
+ pos = softmaxed.view_as(pos)
1004
+
1005
+ points = utils.heatmaps_to_points(pos, im_shp, query_points=query_points)
1006
+
1007
+ occlusion = torch.cat(
1008
+ [
1009
+ torch.mean(cost_volume_stack, dim=(-2, -3)),
1010
+ torch.amax(cost_volume_stack, dim=(-2, -3)),
1011
+ torch.amin(cost_volume_stack, dim=(-2, -3)),
1012
+ ], dim=-1
1013
+ )
1014
+ occlusion = self.occ_linear(occlusion)
1015
+ expected_dist = utils.einshape(
1016
+ 'tbn1->bnt', occlusion[..., 1:2]
1017
+ )
1018
+ occlusion = utils.einshape(
1019
+ 'tbn1->bnt', occlusion[..., 0:1]
1020
+ )
1021
+
1022
+ return points, occlusion, expected_dist, utils.einshape('tbnhw->bnthw', cost_volume)
1023
+
1024
+ def construct_initial_causal_state(self, num_points, num_resolutions=1):
1025
+ """Construct initial causal state."""
1026
+ value_shapes = {}
1027
+ for i in range(self.num_mixer_blocks):
1028
+ value_shapes[f'block_{i}_causal_1'] = (1, num_points, 2, 512)
1029
+ value_shapes[f'block_{i}_causal_2'] = (1, num_points, 2, 2048)
1030
+ fake_ret = {
1031
+ k: torch.zeros(v, dtype=torch.float32) for k, v in value_shapes.items()
1032
+ }
1033
+ return [fake_ret] * num_resolutions * 4
1034
+
1035
+
1036
+ CHECKPOINT_LINK = {
1037
+ 'small': 'https://huggingface.co/datasets/hamacojr/LocoTrack-pytorch-weights/resolve/main/locotrack_small.ckpt',
1038
+ 'base': 'https://huggingface.co/datasets/hamacojr/LocoTrack-pytorch-weights/resolve/main/locotrack_base.ckpt',
1039
+ }
1040
+
1041
+ def load_model(ckpt_path=None, model_size='base'):
1042
+ if ckpt_path is None:
1043
+ ckpt_link = CHECKPOINT_LINK[model_size]
1044
+ state_dict = torch.hub.load_state_dict_from_url(ckpt_link, map_location='cpu')['state_dict']
1045
+ else:
1046
+ state_dict = torch.load(ckpt_path)['state_dict']
1047
+ state_dict = {k.replace('model.', ''): v for k, v in state_dict.items()}
1048
+
1049
+ model = LocoTrack(model_size=model_size)
1050
+ model.load_state_dict(state_dict)
1051
+ model.eval()
1052
+
1053
+ return model
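Usage sketch (illustrative, not part of the committed file): only the entry points defined above are used here; how the loaded model is then called on a video is defined elsewhere in the repo, so it is not shown. The device handling below is ordinary PyTorch convention, not something this diff prescribes.

import torch

# Downloads and loads the 'base' checkpoint from the Hugging Face link above.
model = load_model(model_size='base')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# For online/causal use, an all-zeros causal state can be prepared per query set
# (num_points=256 is an arbitrary example value).
causal_state = model.construct_initial_causal_state(num_points=256, num_resolutions=1)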
locotrack_pytorch/models/nets.py ADDED
@@ -0,0 +1,429 @@
1
+ # Copyright 2024 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Pytorch neural network definitions."""
17
+
18
+ from typing import Sequence, Union
19
+
20
+ import torch
21
+ from torch import nn
22
+ import torch.nn.functional as F
23
+
24
+ from models.utils import Conv2dSamePadding
25
+
26
+
27
+ class ExtraConvBlock(nn.Module):
28
+ """Additional convolution block."""
29
+
30
+ def __init__(
31
+ self,
32
+ channel_dim,
33
+ channel_multiplier,
34
+ ):
35
+ super().__init__()
36
+ self.channel_dim = channel_dim
37
+ self.channel_multiplier = channel_multiplier
38
+
39
+ self.layer_norm = nn.LayerNorm(
40
+ normalized_shape=channel_dim, elementwise_affine=True, bias=True
41
+ )
42
+ self.conv = nn.Conv2d(
43
+ self.channel_dim,
44
+ self.channel_dim * self.channel_multiplier,
45
+ kernel_size=3,
46
+ stride=1,
47
+ padding=1,
48
+ )
49
+ self.conv_1 = nn.Conv2d(
50
+ self.channel_dim * self.channel_multiplier,
51
+ self.channel_dim,
52
+ kernel_size=3,
53
+ stride=1,
54
+ padding=1,
55
+ )
56
+
57
+ def forward(self, x):
58
+ x = self.layer_norm(x)
59
+ x = x.permute(0, 3, 1, 2)
60
+ res = self.conv(x)
61
+ res = F.gelu(res, approximate='tanh')
62
+ x = x + self.conv_1(res)
63
+ x = x.permute(0, 2, 3, 1)
64
+ return x
65
+
66
+
67
+ class ExtraConvs(nn.Module):
68
+ """Additional CNN."""
69
+
70
+ def __init__(
71
+ self,
72
+ num_layers=5,
73
+ channel_dim=256,
74
+ channel_multiplier=4,
75
+ ):
76
+ super().__init__()
77
+ self.num_layers = num_layers
78
+ self.channel_dim = channel_dim
79
+ self.channel_multiplier = channel_multiplier
80
+
81
+ self.blocks = nn.ModuleList()
82
+ for _ in range(self.num_layers):
83
+ self.blocks.append(
84
+ ExtraConvBlock(self.channel_dim, self.channel_multiplier)
85
+ )
86
+
87
+ def forward(self, x):
88
+ for block in self.blocks:
89
+ x = block(x)
90
+
91
+ return x
92
+
93
+
94
+ class ConvChannelsMixer(nn.Module):
95
+ """Linear activation block for PIPs's MLP Mixer."""
96
+
97
+ def __init__(self, in_channels):
98
+ super().__init__()
99
+ self.mlp2_up = nn.Linear(in_channels, in_channels * 4)
100
+ self.mlp2_down = nn.Linear(in_channels * 4, in_channels)
101
+
102
+ def forward(self, x):
103
+ x = self.mlp2_up(x)
104
+ x = F.gelu(x, approximate='tanh')
105
+ x = self.mlp2_down(x)
106
+ return x
107
+
108
+
109
+ class PIPsConvBlock(nn.Module):
110
+ """Convolutional block for PIPs's MLP Mixer."""
111
+
112
+ def __init__(
113
+ self, in_channels, kernel_shape=3, use_causal_conv=False, block_idx=None
114
+ ):
115
+ super().__init__()
116
+ self.use_causal_conv = use_causal_conv
117
+ self.block_name = f'block_{block_idx}'
118
+ self.kernel_shape = kernel_shape
119
+
120
+ self.layer_norm = nn.LayerNorm(
121
+ normalized_shape=in_channels, elementwise_affine=True, bias=False
122
+ )
123
+ self.mlp1_up = nn.Conv1d(
124
+ in_channels,
125
+ in_channels * 4,
126
+ kernel_shape,
127
+ stride=1,
128
+ padding=0 if self.use_causal_conv else 1,
129
+ groups=in_channels,
130
+ )
131
+
132
+ self.mlp1_up_1 = nn.Conv1d(
133
+ in_channels * 4,
134
+ in_channels * 4,
135
+ kernel_shape,
136
+ stride=1,
137
+ padding=0 if self.use_causal_conv else 1,
138
+ groups=in_channels * 4,
139
+ )
140
+ self.layer_norm_1 = nn.LayerNorm(
141
+ normalized_shape=in_channels, elementwise_affine=True, bias=False
142
+ )
143
+ self.conv_channels_mixer = ConvChannelsMixer(in_channels)
144
+
145
+ def forward(self, x, causal_context=None, get_causal_context=False):
146
+ to_skip = x
147
+ x = self.layer_norm(x)
148
+ new_causal_context = {}
149
+ num_extra = 0
150
+
151
+ if causal_context is not None:
152
+ name1 = self.block_name + '_causal_1'
153
+ x = torch.cat([causal_context[name1], x], dim=-2)
154
+ num_extra = causal_context[name1].shape[-2]
155
+ new_causal_context[name1] = x[..., -(self.kernel_shape - 1) :, :]
156
+
157
+ x = x.permute(0, 2, 1)
158
+ if self.use_causal_conv:
159
+ x = F.pad(x, (2, 0))
160
+ x = self.mlp1_up(x)
161
+
162
+ x = F.gelu(x, approximate='tanh')
163
+
164
+ if causal_context is not None:
165
+ x = x.permute(0, 2, 1)
166
+ name2 = self.block_name + '_causal_2'
167
+ num_extra = causal_context[name2].shape[-2]
168
+ x = torch.cat([causal_context[name2], x[..., num_extra:, :]], dim=-2)
169
+ new_causal_context[name2] = x[..., -(self.kernel_shape - 1) :, :]
170
+ x = x.permute(0, 2, 1)
171
+
172
+ if self.use_causal_conv:
173
+ x = F.pad(x, (2, 0))
174
+ x = self.mlp1_up_1(x)
175
+ x = x.permute(0, 2, 1)
176
+
177
+ if causal_context is not None:
178
+ x = x[..., num_extra:, :]
179
+
180
+ x = x[..., 0::4] + x[..., 1::4] + x[..., 2::4] + x[..., 3::4]
181
+
182
+ x = x + to_skip
183
+ to_skip = x
184
+ x = self.layer_norm_1(x)
185
+ x = self.conv_channels_mixer(x)
186
+
187
+ x = x + to_skip
188
+ return x, new_causal_context
189
+
190
+
191
+ class PIPSMLPMixer(nn.Module):
192
+ """Depthwise-conv version of PIPs's MLP Mixer."""
193
+
194
+ def __init__(
195
+ self,
196
+ input_channels: int,
197
+ output_channels: int,
198
+ hidden_dim: int = 512,
199
+ num_blocks: int = 12,
200
+ kernel_shape: int = 3,
201
+ use_causal_conv: bool = False,
202
+ ):
203
+ """Inits Mixer module.
204
+
205
+ A depthwise-convolutional version of an MLP Mixer for processing images.
206
+
207
+ Args:
208
+ input_channels (int): The number of input channels.
209
+ output_channels (int): The number of output channels.
210
+ hidden_dim (int, optional): The dimension of the hidden layer. Defaults
211
+ to 512.
212
+ num_blocks (int, optional): The number of convolution blocks in the
213
+ mixer. Defaults to 12.
214
+ kernel_shape (int, optional): The size of the kernel in the convolution
215
+ blocks. Defaults to 3.
216
+ use_causal_conv (bool, optional): Whether to use causal convolutions.
217
+ Defaults to False.
218
+ """
219
+
220
+ super().__init__()
221
+ self.hidden_dim = hidden_dim
222
+ self.num_blocks = num_blocks
223
+ self.use_causal_conv = use_causal_conv
224
+ self.linear = nn.Linear(input_channels, self.hidden_dim)
225
+ self.layer_norm = nn.LayerNorm(
226
+ normalized_shape=hidden_dim, elementwise_affine=True, bias=False
227
+ )
228
+ self.linear_1 = nn.Linear(hidden_dim, output_channels)
229
+ self.blocks = nn.ModuleList([
230
+ PIPsConvBlock(
231
+ hidden_dim, kernel_shape, self.use_causal_conv, block_idx=i
232
+ )
233
+ for i in range(num_blocks)
234
+ ])
235
+
236
+ def forward(self, x, causal_context=None, get_causal_context=False):
237
+ x = self.linear(x)
238
+ all_causal_context = {}
239
+ for block in self.blocks:
240
+ x, new_causal_context = block(x, causal_context, get_causal_context)
241
+ if get_causal_context:
242
+ all_causal_context.update(new_causal_context)
243
+
244
+ x = self.layer_norm(x)
245
+ x = self.linear_1(x)
246
+ return x, all_causal_context
247
+
248
+
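Shape-check sketch (illustrative; the channel sizes are made up and the import path is assumed from the repo layout): the mixer maps a per-track sequence of shape (batch, frames, input_channels) to (batch, frames, output_channels), and additionally returns a causal-context dict that stays empty unless get_causal_context=True.

import torch
from models.nets import PIPSMLPMixer  # import path assumed

mixer = PIPSMLPMixer(input_channels=100, output_channels=36)
x = torch.randn(2, 24, 100)       # (batch * num_points, num_frames, channels)
out, causal = mixer(x)            # causal context is only filled when requested
print(out.shape, causal)          # torch.Size([2, 24, 36]) {}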
249
+ class BlockV2(nn.Module):
250
+ """ResNet V2 block."""
251
+
252
+ def __init__(
253
+ self,
254
+ channels_in: int,
255
+ channels_out: int,
256
+ stride: Union[int, Sequence[int]],
257
+ use_projection: bool,
258
+ ):
259
+ super().__init__()
260
+ self.padding = (1, 1, 1, 1)
261
+ # Handle asymmetric padding created by padding="SAME" in JAX/LAX
262
+ if stride == 1:
263
+ self.padding = (1, 1, 1, 1)
264
+ elif stride == 2:
265
+ self.padding = (0, 2, 0, 2)
266
+ else:
267
+ raise ValueError(
268
+ 'Check correct padding using padtype_to_pads in jax._src.lax.lax'
269
+ )
270
+
271
+ self.use_projection = use_projection
272
+ if self.use_projection:
273
+ self.proj_conv = Conv2dSamePadding(
274
+ in_channels=channels_in,
275
+ out_channels=channels_out,
276
+ kernel_size=1,
277
+ stride=stride,
278
+ padding=0,
279
+ bias=False,
280
+ )
281
+
282
+ self.bn_0 = nn.InstanceNorm2d(
283
+ num_features=channels_in,
284
+ eps=1e-05,
285
+ momentum=0.1,
286
+ affine=True,
287
+ track_running_stats=False,
288
+ )
289
+ self.conv_0 = Conv2dSamePadding(
290
+ in_channels=channels_in,
291
+ out_channels=channels_out,
292
+ kernel_size=3,
293
+ stride=stride,
294
+ padding=0,
295
+ bias=False,
296
+ )
297
+
298
+ self.conv_1 = Conv2dSamePadding(
299
+ in_channels=channels_out,
300
+ out_channels=channels_out,
301
+ kernel_size=3,
302
+ stride=1,
303
+ padding=1,
304
+ bias=False,
305
+ )
306
+ self.bn_1 = nn.InstanceNorm2d(
307
+ num_features=channels_out,
308
+ eps=1e-05,
309
+ momentum=0.1,
310
+ affine=True,
311
+ track_running_stats=False,
312
+ )
313
+
314
+ def forward(self, inputs):
315
+ x = shortcut = inputs
316
+
317
+ x = self.bn_0(x)
318
+ x = torch.relu(x)
319
+ if self.use_projection:
320
+ shortcut = self.proj_conv(x)
321
+
322
+ x = self.conv_0(x)
323
+
324
+ x = self.bn_1(x)
325
+ x = torch.relu(x)
326
+ # no issues with padding here as this layer always has stride 1
327
+ x = self.conv_1(x)
328
+
329
+ return x + shortcut
330
+
331
+
332
+ class BlockGroup(nn.Module):
333
+ """Higher level block for ResNet implementation."""
334
+
335
+ def __init__(
336
+ self,
337
+ channels_in: int,
338
+ channels_out: int,
339
+ num_blocks: int,
340
+ stride: Union[int, Sequence[int]],
341
+ use_projection: bool,
342
+ ):
343
+ super().__init__()
344
+ blocks = []
345
+ for i in range(num_blocks):
346
+ blocks.append(
347
+ BlockV2(
348
+ channels_in=channels_in if i == 0 else channels_out,
349
+ channels_out=channels_out,
350
+ stride=(1 if i else stride),
351
+ use_projection=(i == 0 and use_projection),
352
+ )
353
+ )
354
+ self.blocks = nn.ModuleList(blocks)
355
+
356
+ def forward(self, inputs):
357
+ out = inputs
358
+ for block in self.blocks:
359
+ out = block(out)
360
+ return out
361
+
362
+
363
+ class ResNet(nn.Module):
364
+ """ResNet model."""
365
+
366
+ def __init__(
367
+ self,
368
+ blocks_per_group: Sequence[int],
369
+ channels_per_group: Sequence[int] = (64, 128, 256, 512),
370
+ use_projection: Sequence[bool] = (True, True, True, True),
371
+ strides: Sequence[int] = (1, 2, 2, 2),
372
+ ):
373
+ """Initializes a ResNet model with customizable layers and configurations.
374
+
375
+ This constructor allows defining the architecture of a ResNet model by
376
+ setting the number of blocks, channels, projection usage, and strides for
377
+ each group of blocks within the network. It provides flexibility in
378
+ creating various ResNet configurations.
379
+
380
+ Args:
381
+ blocks_per_group: A sequence of 4 integers, each indicating the number
382
+ of residual blocks in each group.
383
+ channels_per_group: A sequence of 4 integers, each specifying the number
384
+ of output channels for the blocks in each group. Defaults to (64, 128,
385
+ 256, 512).
386
+ use_projection: A sequence of 4 booleans, each indicating whether to use
387
+ a projection shortcut (True) or an identity shortcut (False) in each
388
+ group. Defaults to (True, True, True, True).
389
+ strides: A sequence of 4 integers, each specifying the stride size for
390
+ the convolutions in each group. Defaults to (1, 2, 2, 2).
391
+
392
+ The ResNet model created will have 4 groups, with each group's
393
+ architecture defined by the corresponding elements in these sequences.
394
+ """
395
+ super().__init__()
396
+
397
+ self.initial_conv = Conv2dSamePadding(
398
+ in_channels=3,
399
+ out_channels=channels_per_group[0],
400
+ kernel_size=(7, 7),
401
+ stride=2,
402
+ padding=0,
403
+ bias=False,
404
+ )
405
+
406
+ block_groups = []
407
+ for i, _ in enumerate(strides):
408
+ block_groups.append(
409
+ BlockGroup(
410
+ channels_in=channels_per_group[i - 1] if i > 0 else 64,
411
+ channels_out=channels_per_group[i],
412
+ num_blocks=blocks_per_group[i],
413
+ stride=strides[i],
414
+ use_projection=use_projection[i],
415
+ )
416
+ )
417
+ self.block_groups = nn.ModuleList(block_groups)
418
+
419
+ def forward(self, inputs):
420
+ result = {}
421
+ out = inputs
422
+ out = self.initial_conv(out)
423
+ result['initial_conv'] = out
424
+
425
+ for block_id, block_group in enumerate(self.block_groups):
426
+ out = block_group(out)
427
+ result[f'resnet_unit_{block_id}'] = out
428
+
429
+ return result
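Feature-dict sketch (illustrative; the block configuration below is just an example, not the configuration LocoTrack actually instantiates): the forward pass returns intermediate activations keyed 'initial_conv' and 'resnet_unit_0' through 'resnet_unit_3', which is what the feature extractor above indexes into.

import torch
from models.nets import ResNet  # import path assumed

resnet = ResNet(blocks_per_group=(2, 2, 2, 2))   # example configuration only
feats = resnet(torch.randn(1, 3, 256, 256))
print(sorted(feats.keys()))
# ['initial_conv', 'resnet_unit_0', 'resnet_unit_1', 'resnet_unit_2', 'resnet_unit_3']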
locotrack_pytorch/models/utils.py ADDED
@@ -0,0 +1,344 @@
1
+ # Copyright 2024 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Pytorch model utilities."""
17
+ import math
18
+ from typing import Any, Sequence, Union
19
+ from einshape.src import abstract_ops
20
+ from einshape.src import backend
21
+ import numpy as np
22
+ import torch
23
+ import torch.nn.functional as F
24
+
25
+
26
+ def bilinear(x: torch.Tensor, resolution: tuple[int, int]) -> torch.Tensor:
27
+ """Resizes a 5D tensor using bilinear interpolation.
28
+
29
+ Args:
30
+ x: A 5D tensor of shape (B, T, H, W, C) where B is batch size, T is
31
+ time, H is height, W is width, and C is the number of channels.
32
+ resolution: The target resolution as a tuple (new_height, new_width).
33
+
34
+ Returns:
35
+ The resized tensor.
36
+ """
37
+ b, t, h, w, c = x.size()
38
+ x = x.permute(0, 1, 4, 2, 3).reshape(b, t * c, h, w)
39
+ x = F.interpolate(x, size=resolution, mode='bilinear', align_corners=False)
40
+ b, _, h, w = x.size()
41
+ x = x.reshape(b, t, c, h, w).permute(0, 1, 3, 4, 2)
42
+ return x
43
+
44
+
45
+ def map_coordinates_3d(
46
+ feats: torch.Tensor, coordinates: torch.Tensor
47
+ ) -> torch.Tensor:
48
+ """Maps 3D coordinates to corresponding features using bilinear interpolation.
49
+
50
+ Args:
51
+ feats: A 5D tensor of features with shape (B, W, H, D, C), where B is batch
52
+ size, W is width, H is height, D is depth, and C is the number of
53
+ channels.
54
+ coordinates: A 3D tensor of coordinates with shape (B, N, 3), where N is the
55
+ number of coordinates and the last dimension represents (W, H, D)
56
+ coordinates.
57
+
58
+ Returns:
59
+ The mapped features tensor.
60
+ """
61
+ x = feats.permute(0, 4, 1, 2, 3)
62
+ y = coordinates[:, :, None, None, :].float().clone()
63
+ y[..., 0] = y[..., 0] + 0.5
64
+ y = 2 * (y / torch.tensor(x.shape[2:], device=y.device)) - 1
65
+ y = torch.flip(y, dims=(-1,))
66
+ out = (
67
+ F.grid_sample(
68
+ x, y, mode='bilinear', align_corners=False, padding_mode='border'
69
+ )
70
+ .squeeze(dim=(3, 4))
71
+ .permute(0, 2, 1)
72
+ )
73
+ return out
74
+
75
+
76
+ def map_coordinates_2d(
77
+ feats: torch.Tensor, coordinates: torch.Tensor
78
+ ) -> torch.Tensor:
79
+ """Maps 2D coordinates to feature maps using bilinear interpolation.
80
+
81
+ The function performs bilinear interpolation on the feature maps (`feats`)
82
+ at the specified `coordinates`. The coordinates are normalized between
83
+ -1 and 1. The result is a tensor of sampled features corresponding
84
+ to these coordinates.
85
+
86
+ Args:
87
+ feats (Tensor): A 5D tensor of shape (N, T, H, W, C) representing feature
88
+ maps, where N is the batch size, T is the number of frames, H and W are
89
+ height and width, and C is the number of channels.
90
+ coordinates (Tensor): A 5D tensor of shape (N, P, T, S, XY) representing
91
+ coordinates, where N is the batch size, P is the number of points, T is
92
+ the number of frames, S is the number of samples, and XY represents the 2D
93
+ coordinates.
94
+
95
+ Returns:
96
+ Tensor: A 5D tensor of the sampled features corresponding to the
97
+ given coordinates, of shape (N, P, T, S, C).
98
+ """
99
+ n, t, h, w, c = feats.shape
100
+ x = feats.permute(0, 1, 4, 2, 3).view(n * t, c, h, w)
101
+
102
+ n, p, t, s, xy = coordinates.shape
103
+ y = coordinates.permute(0, 2, 1, 3, 4).reshape(n * t, p, s, xy)
104
+ y = 2 * (y / h) - 1
105
+ y = torch.flip(y, dims=(-1,)).float()
106
+
107
+ out = F.grid_sample(
108
+ x, y, mode='bilinear', align_corners=False, padding_mode='zeros'
109
+ )
110
+ _, c, _, _ = out.shape
111
+ out = out.permute(0, 2, 3, 1).view(n, t, p, s, c).permute(0, 2, 1, 3, 4)
112
+
113
+ return out
114
+
115
+
116
+ def soft_argmax_heatmap_batched(softmax_val, threshold=5):
117
+ """Test if two image resolutions are the same."""
118
+ b, h, w, d1, d2 = softmax_val.shape
119
+ y, x = torch.meshgrid(
120
+ torch.arange(d1, device=softmax_val.device),
121
+ torch.arange(d2, device=softmax_val.device),
122
+ indexing='ij',
123
+ )
124
+ coords = torch.stack([x + 0.5, y + 0.5], dim=-1).to(softmax_val.device)
125
+ softmax_val_flat = softmax_val.reshape(b, h, w, -1)
126
+ argmax_pos = torch.argmax(softmax_val_flat, dim=-1)
127
+
128
+ pos = coords.reshape(-1, 2)[argmax_pos]
129
+ valid = (
130
+ torch.sum(
131
+ torch.square(
132
+ coords[None, None, None, :, :, :] - pos[:, :, :, None, None, :]
133
+ ),
134
+ dim=-1,
135
+ keepdims=True,
136
+ )
137
+ < threshold**2
138
+ )
139
+
140
+ weighted_sum = torch.sum(
141
+ coords[None, None, None, :, :, :]
142
+ * valid
143
+ * softmax_val[:, :, :, :, :, None],
144
+ dim=(3, 4),
145
+ )
146
+ sum_of_weights = torch.maximum(
147
+ torch.sum(valid * softmax_val[:, :, :, :, :, None], dim=(3, 4)),
148
+ torch.tensor(1e-12, device=softmax_val.device),
149
+ )
150
+ return weighted_sum / sum_of_weights
151
+
152
+
153
+ def heatmaps_to_points(
154
+ all_pairs_softmax,
155
+ image_shape,
156
+ threshold=5,
157
+ query_points=None,
158
+ ):
159
+ """Convert heatmaps to points using soft argmax."""
160
+
161
+ out_points = soft_argmax_heatmap_batched(all_pairs_softmax, threshold)
162
+ feature_grid_shape = all_pairs_softmax.shape[1:]
163
+ # Note: out_points is now [x, y]; we need to divide by [width, height].
164
+ # image_shape[3] is width and image_shape[2] is height.
165
+ out_points = convert_grid_coordinates(
166
+ out_points,
167
+ feature_grid_shape[3:1:-1],
168
+ image_shape[3:1:-1],
169
+ )
170
+ assert feature_grid_shape[1] == image_shape[1]
171
+ if query_points is not None:
172
+ # The [..., 0:1] is because we only care about the frame index.
173
+ query_frame = convert_grid_coordinates(
174
+ query_points.detach(),
175
+ image_shape[1:4],
176
+ feature_grid_shape[1:4],
177
+ coordinate_format='tyx',
178
+ )[..., 0:1]
179
+
180
+ query_frame = torch.round(query_frame)
181
+ frame_indices = torch.arange(image_shape[1], device=query_frame.device)[
182
+ None, None, :
183
+ ]
184
+ is_query_point = query_frame == frame_indices
185
+
186
+ is_query_point = is_query_point[:, :, :, None]
187
+ out_points = (
188
+ out_points * ~is_query_point
189
+ + torch.flip(query_points[:, :, None], dims=(-1,))[..., 0:2]
190
+ * is_query_point
191
+ )
192
+
193
+ return out_points
194
+
195
+
196
+ def is_same_res(r1, r2):
197
+ """Test if two image resolutions are the same."""
198
+ return all([x == y for x, y in zip(r1, r2)])
199
+
200
+
201
+ def convert_grid_coordinates(
202
+ coords: torch.Tensor,
203
+ input_grid_size: Sequence[int],
204
+ output_grid_size: Sequence[int],
205
+ coordinate_format: str = 'xy',
206
+ ) -> torch.Tensor:
207
+ """Convert grid coordinates to correct format."""
208
+ if isinstance(input_grid_size, tuple):
209
+ input_grid_size = torch.tensor(input_grid_size, device=coords.device)
210
+ if isinstance(output_grid_size, tuple):
211
+ output_grid_size = torch.tensor(output_grid_size, device=coords.device)
212
+
213
+ if coordinate_format == 'xy':
214
+ if input_grid_size.shape[0] != 2 or output_grid_size.shape[0] != 2:
215
+ raise ValueError(
216
+ 'If coordinate_format is xy, the shapes must be length 2.'
217
+ )
218
+ elif coordinate_format == 'tyx':
219
+ if input_grid_size.shape[0] != 3 or output_grid_size.shape[0] != 3:
220
+ raise ValueError(
221
+ 'If coordinate_format is tyx, the shapes must be length 3.'
222
+ )
223
+ if input_grid_size[0] != output_grid_size[0]:
224
+ raise ValueError('converting frame count is not supported.')
225
+ else:
226
+ raise ValueError('Recognized coordinate formats are xy and tyx.')
227
+
228
+ position_in_grid = coords
229
+ position_in_grid = position_in_grid * output_grid_size / input_grid_size
230
+
231
+ return position_in_grid
232
+
233
+
234
+ class _JaxBackend(backend.Backend[torch.Tensor]):
235
+ """Einshape implementation for PyTorch."""
236
+
237
+ # https://github.com/vacancy/einshape/blob/main/einshape/src/pytorch/pytorch_ops.py
238
+
239
+ def reshape(self, x: torch.Tensor, op: abstract_ops.Reshape) -> torch.Tensor:
240
+ return x.reshape(op.shape)
241
+
242
+ def transpose(
243
+ self, x: torch.Tensor, op: abstract_ops.Transpose
244
+ ) -> torch.Tensor:
245
+ return x.permute(op.perm)
246
+
247
+ def broadcast(
248
+ self, x: torch.Tensor, op: abstract_ops.Broadcast
249
+ ) -> torch.Tensor:
250
+ shape = op.transform_shape(x.shape)
251
+ for axis_position in sorted(op.axis_sizes.keys()):
252
+ x = x.unsqueeze(axis_position)
253
+ return x.expand(shape)
254
+
255
+
256
+ def einshape(
257
+ equation: str, value: Union[torch.Tensor, Any], **index_sizes: int
258
+ ) -> torch.Tensor:
259
+ """Reshapes `value` according to the given Shape Equation.
260
+
261
+ Args:
262
+ equation: The Shape Equation specifying the index regrouping and reordering.
263
+ value: Input tensor, or tensor-like object.
264
+ **index_sizes: Sizes of indices, where they cannot be inferred from
265
+ `input_shape`.
266
+
267
+ Returns:
268
+ Tensor derived from `value` by reshaping as specified by `equation`.
269
+ """
270
+ if not isinstance(value, torch.Tensor):
271
+ value = torch.tensor(value)
272
+ return _JaxBackend().exec(equation, value, value.shape, **index_sizes)
273
+
274
+
275
+ def generate_default_resolutions(full_size, train_size, num_levels=None):
276
+ """Generate a list of logarithmically-spaced resolutions.
277
+
278
+ Generated resolutions are between train_size and full_size, inclusive, with
279
+ num_levels different resolutions total. Useful for generating the input to
280
+ refinement_resolutions in PIPs.
281
+
282
+ Args:
283
+ full_size: 2-tuple of ints. The full image size desired.
284
+ train_size: 2-tuple of ints. The smallest refinement level. Should
285
+ typically match the training resolution, which is (256, 256) for TAPIR.
286
+ num_levels: number of levels. Typically each resolution should be less than
287
+ twice the size of prior resolutions.
288
+
289
+ Returns:
290
+ A list of resolutions.
291
+ """
292
+ if all([x == y for x, y in zip(train_size, full_size)]):
293
+ return [train_size]
294
+
295
+ if num_levels is None:
296
+ size_ratio = np.array(full_size) / np.array(train_size)
297
+ num_levels = int(np.ceil(np.max(np.log2(size_ratio))) + 1)
298
+
299
+ if num_levels <= 1:
300
+ return [train_size]
301
+
302
+ h, w = full_size[0:2]
303
+ if h % 8 != 0 or w % 8 != 0:
304
+ print(
305
+ 'Warning: output size is not a multiple of 8. Final layer '
306
+ + 'will round size down.'
307
+ )
308
+ ll_h, ll_w = train_size[0:2]
309
+
310
+ sizes = []
311
+ for i in range(num_levels):
312
+ size = (
313
+ int(round((ll_h * (h / ll_h) ** (i / (num_levels - 1))) // 8)) * 8,
314
+ int(round((ll_w * (w / ll_w) ** (i / (num_levels - 1))) // 8)) * 8,
315
+ )
316
+ sizes.append(size)
317
+ return sizes
318
+
319
+
320
+ class Conv2dSamePadding(torch.nn.Conv2d):
321
+
322
+ def calc_same_pad(self, i: int, k: int, s: int, d: int) -> int:
323
+ return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)
324
+
325
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
326
+ ih, iw = x.size()[-2:]
327
+
328
+ pad_h = self.calc_same_pad(i=ih, k=self.kernel_size[0], s=self.stride[0], d=self.dilation[0])
329
+ pad_w = self.calc_same_pad(i=iw, k=self.kernel_size[1], s=self.stride[1], d=self.dilation[1])
330
+
331
+ if pad_h > 0 or pad_w > 0:
332
+ x = F.pad(
333
+ x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]
334
+ )
335
+ return F.conv2d(
336
+ x,
337
+ self.weight,
338
+ self.bias,
339
+ self.stride,
340
+ # self.padding,
341
+ 0,
342
+ self.dilation,
343
+ self.groups,
344
+ )
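Usage sketch for the helpers above (illustrative values; the import path is assumed from the repo layout):

import torch
from models.utils import (  # import path assumed
    convert_grid_coordinates, einshape, generate_default_resolutions)

# einshape: group/ungroup axes with a single equation string.
x = torch.randn(2, 3, 4, 5)
y = einshape('bthw->(bt)hw', x)         # shape (6, 4, 5)
z = einshape('(bt)hw->bthw', y, b=2)    # back to (2, 3, 4, 5)

# convert_grid_coordinates: rescale [x, y] points from one grid size to another.
pts = torch.tensor([[32.0, 16.0]])
print(convert_grid_coordinates(pts, (64, 32), (256, 128)))    # tensor([[128., 64.]])

# generate_default_resolutions: log-spaced sizes between the training and full
# resolution, each rounded down to a multiple of 8.
print(generate_default_resolutions((512, 512), (256, 256)))   # [(256, 256), (512, 512)]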
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ einshape==1.0
2
+ gradio==4.40.0
3
+ mediapy==1.2.2
4
+ opencv-python==4.10.0.84
5
+ torch==2.4.0
6
+ torchaudio==2.4.0
7
+ torchvision==0.19.0
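Note (not part of the diff): these pins can be installed with pip install -r requirements.txt; torch, torchaudio and torchvision are pinned to matching 2.4.0-series releases so they resolve together.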
viz_utils.py ADDED
@@ -0,0 +1,104 @@
1
+ """Visualization utility functions."""
2
+
3
+ import colorsys
4
+ import random
5
+ from typing import List, Optional, Sequence, Tuple
6
+
7
+ import numpy as np
8
+
9
+
10
+ # Generate random colormaps for visualizing different points.
11
+ def get_colors(num_colors: int) -> List[Tuple[int, int, int]]:
12
+ """Gets colormap for points."""
13
+ colors = []
14
+ for i in np.arange(0.0, 360.0, 360.0 / num_colors):
15
+ hue = i / 360.0
16
+ lightness = (50 + np.random.rand() * 10) / 100.0
17
+ saturation = (90 + np.random.rand() * 10) / 100.0
18
+ color = colorsys.hls_to_rgb(hue, lightness, saturation)
19
+ colors.append(
20
+ (int(color[0] * 255), int(color[1] * 255), int(color[2] * 255))
21
+ )
22
+ random.shuffle(colors)
23
+ return colors
24
+
25
+
26
+ def paint_point_track(
27
+ frames: np.ndarray,
28
+ point_tracks: np.ndarray,
29
+ visibles: np.ndarray,
30
+ colormap: Optional[List[Tuple[int, int, int]]] = None,
31
+ ) -> np.ndarray:
32
+ """Converts a sequence of points to color code video.
33
+
34
+ Args:
35
+ frames: [num_frames, height, width, 3], np.uint8, [0, 255]
36
+ point_tracks: [num_points, num_frames, 2], np.float32, [0, width / height]
37
+ visibles: [num_points, num_frames], bool
38
+ colormap: colormap for points, each point has a different RGB color.
39
+
40
+ Returns:
41
+ video: [num_frames, height, width, 3], np.uint8, [0, 255]
42
+ """
43
+ num_points, num_frames = point_tracks.shape[0:2]
44
+ if colormap is None:
45
+ colormap = get_colors(num_colors=num_points)
46
+ height, width = frames.shape[1:3]
47
+ dot_size_as_fraction_of_min_edge = 0.015
48
+ radius = int(round(min(height, width) * dot_size_as_fraction_of_min_edge))
49
+ diam = radius * 2 + 1
50
+ quadratic_y = np.square(np.arange(diam)[:, np.newaxis] - radius - 1)
51
+ quadratic_x = np.square(np.arange(diam)[np.newaxis, :] - radius - 1)
52
+ icon = (quadratic_y + quadratic_x) - (radius**2) / 2.0
53
+ sharpness = 0.15
54
+ icon = np.clip(icon / (radius * 2 * sharpness), 0, 1)
55
+ icon = 1 - icon[:, :, np.newaxis]
56
+ icon1 = np.pad(icon, [(0, 1), (0, 1), (0, 0)])
57
+ icon2 = np.pad(icon, [(1, 0), (0, 1), (0, 0)])
58
+ icon3 = np.pad(icon, [(0, 1), (1, 0), (0, 0)])
59
+ icon4 = np.pad(icon, [(1, 0), (1, 0), (0, 0)])
60
+
61
+ video = frames.copy()
62
+ for t in range(num_frames):
63
+ # Pad so that points that extend outside the image frame don't crash us
64
+ image = np.pad(
65
+ video[t],
66
+ [
67
+ (radius + 1, radius + 1),
68
+ (radius + 1, radius + 1),
69
+ (0, 0),
70
+ ],
71
+ )
72
+ for i in range(num_points):
73
+ # The icon is centered at the center of a pixel, but the input coordinates
74
+ # are raster coordinates. Therefore, to render a point at (1,1) (which
75
+ # lies on the corner between four pixels), we need 1/4 of the icon placed
76
+ # centered on the 0'th row, 0'th column, etc. We need to subtract
77
+ # 0.5 to make the fractional position come out right.
78
+ x, y = point_tracks[i, t, :] + 0.5
79
+ x = min(max(x, 0.0), width)
80
+ y = min(max(y, 0.0), height)
81
+
82
+ if visibles[i, t]:
83
+ x1, y1 = np.floor(x).astype(np.int32), np.floor(y).astype(np.int32)
84
+ x2, y2 = x1 + 1, y1 + 1
85
+
86
+ # bilinear interpolation
87
+ patch = (
88
+ icon1 * (x2 - x) * (y2 - y)
89
+ + icon2 * (x2 - x) * (y - y1)
90
+ + icon3 * (x - x1) * (y2 - y)
91
+ + icon4 * (x - x1) * (y - y1)
92
+ )
93
+ x_ub = x1 + 2 * radius + 2
94
+ y_ub = y1 + 2 * radius + 2
95
+ image[y1:y_ub, x1:x_ub, :] = (1 - patch) * image[
96
+ y1:y_ub, x1:x_ub, :
97
+ ] + patch * np.array(colormap[i])[np.newaxis, np.newaxis, :]
98
+
99
+ # Remove the pad
100
+ video[t] = image[
101
+ radius + 1 : -radius - 1, radius + 1 : -radius - 1
102
+ ].astype(np.uint8)
103
+ return video
104
+
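Rendering sketch (illustrative shapes only; the arrays below are synthetic): paint_point_track overlays one colored, softly shaded dot per visible point on every frame and returns a uint8 video with the same shape as the input frames.

import numpy as np
from viz_utils import get_colors, paint_point_track

frames = np.zeros((8, 64, 64, 3), dtype=np.uint8)                   # dummy 8-frame video
point_tracks = (np.random.rand(3, 8, 2) * 64.0).astype(np.float32)  # 3 tracks of [x, y]
visibles = np.ones((3, 8), dtype=bool)

video = paint_point_track(frames, point_tracks, visibles, colormap=get_colors(3))
print(video.shape, video.dtype)   # (8, 64, 64, 3) uint8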