|
Same architecture as [timm/vit_base_patch14_dinov2.lvd142m](https://huggingface.co/timm/vit_base_patch14_dinov2.lvd142m).

```shell
# Fetch the Depth-Anything-V2 sources. The conversion script imports
# `depth_anything_v2.dpt` from this checkout, so run it from inside the repo.
git clone https://github.com/DepthAnything/Depth-Anything-V2
cd Depth-Anything-V2
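```

# translate

Extract the ViT backbone from a Depth-Anything-V2 checkpoint and save it as `pytorch_model.bin` so that timm can load it:

```python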
"""Export the ViT backbone of a Depth-Anything-V2 checkpoint as pytorch_model.bin.

Download the checkpoint that matches `encoder` before running this script.
The URLs deliberately omit the `?download=true` query string: an unquoted `?`
is a glob character (zsh aborts with "no matches found"), and wget would
otherwise save the file under a name that ends in `?download=true`.

wget https://huggingface.co/depth-anything/Depth-Anything-V2-Small/resolve/main/depth_anything_v2_vits.pth
wget https://huggingface.co/depth-anything/Depth-Anything-V2-Base/resolve/main/depth_anything_v2_vitb.pth
wget https://huggingface.co/depth-anything/Depth-Anything-V2-Large/resolve/main/depth_anything_v2_vitl.pth
"""
import torch

from depth_anything_v2.dpt import DepthAnythingV2

# Constructor arguments for each released Depth-Anything-V2 size.
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
}

# The usage example loads the result into `vit_base_patch14_dinov2`, so keep
# 'vitb' unless the timm model name is changed to match.
encoder = 'vitb'  # or 'vits', 'vitl'

model = DepthAnythingV2(**model_configs[encoder])

# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted
# source. map_location='cpu' keeps the conversion independent of any GPU.
checkpoint = torch.load(f'depth_anything_v2_{encoder}.pth', map_location='cpu')
model.load_state_dict(checkpoint)

# `pretrained` is the attribute under which the model holds its ViT encoder.
vit = model.pretrained

# Optional: uncomment to list every backbone parameter and the total count.
# total_params = 0
# for name, param in vit.named_parameters():
#     print(f"Parameter: {name} - Size: {param.size()} - Total Elements: {param.numel()}")
#     total_params += param.numel()
# print(f"Total number of parameters in ViT: {total_params}")

# Drop `mask_token` before saving -- presumably because the timm ViT has no
# such parameter and the checkpoint keys must match the timm model.
filtered_state_dict = {k: v for k, v in vit.state_dict().items() if 'mask_token' not in k}
torch.save(filtered_state_dict, "pytorch_model.bin")
```

# usage

```python
from urllib.request import urlopen

import timm
import torch
from PIL import Image

img = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))

# Build the architecture only (pretrained=False). The weights come from the
# converted checkpoint, which create_model loads after the model is
# initialised, so downloading the stock DINOv2 weights first would only be
# overwritten.
model = timm.create_model(
    'vit_base_patch14_dinov2.lvd142m',
    pretrained=False,
    num_classes=0,  # remove classifier nn.Linear
    checkpoint_path="pytorch_model.bin",
)
model = model.eval()

# Optional: uncomment to list the loaded parameters.
# for name, param in model.named_parameters():
#     print(f"Parameter: {name} - Size: {param.size()} - Total Elements: {param.numel()}")

# get model specific transforms (normalization, resize)
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

batch = transforms(img).unsqueeze(0)  # add the batch dimension

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    output = model(batch)  # output is (batch_size, num_features) shaped tensor

    # or equivalently (without needing to set num_classes=0)
    output = model.forward_features(batch)
    # output is unpooled, a (1, 1370, 768) shaped tensor at the default
    # 518x518 input: 37 * 37 patches + 1 class token, ViT-B width 768

    output = model.forward_head(output, pre_logits=True)

print(output)
```
|
|
|
|
|
Copyright saved. |