Upload model

- common.py +7 -0
- config.json +20 -33
- extra_timm_models.py +16 -6
- model.safetensors +3 -0
common.py
CHANGED

@@ -59,6 +59,13 @@ RESOURCE_MAP = {
         preferred_resolution=(896, 896),
         vitdet_num_global=8,
     ),
+    "c-radio_v2.5-g": RadioResource(
+        "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.5-g.pth.tar?download=true",
+        patch_size=16,
+        max_resolution=2048,
+        preferred_resolution=(768, 768),
+        vitdet_num_global=8,
+    ),
     # RADIO
     "radio_v2.1": RadioResource(
         "https://huggingface.co/nvidia/RADIO/resolve/main/radio_v2.1_bf16.pth.tar?download=true",
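The new "c-radio_v2.5-g" entry makes this checkpoint resolvable by name through RESOURCE_MAP. A minimal lookup sketch, assuming common.py is importable as a module and that RadioResource stores the constructor keywords shown above as attributes of the same names:

    # Sketch: look up the entry added by this commit and inspect its geometry.
    # Import path and attribute names are assumptions based on the diff above.
    from common import RESOURCE_MAP

    res = RESOURCE_MAP["c-radio_v2.5-g"]
    print(res.patch_size)            # expected: 16
    print(res.max_resolution)        # expected: 2048
    print(res.preferred_resolution)  # expected: (768, 768)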
config.json
CHANGED

@@ -16,7 +16,7 @@
   "cache_dir": null,
   "channels_last": false,
   "checkpoint_hist": 10,
-  "chk_keep_forever":
+  "chk_keep_forever": 50,
   "class_map": "",
   "clip_grad": null,
   "clip_mode": "norm",
@@ -25,8 +25,7 @@
   "coco_image_dir": "/datasets/coco2017-adlsa/val2017",
   "color_jitter": 0.4,
   "cooldown_epochs": 0,
-  "cpe_max_size":
-  "cpe_num_registers": 4,
+  "cpe_max_size": 2048,
   "crd_loss": false,
   "crd_loss_weight": 0.8,
   "crop_pct": null,
@@ -59,7 +58,7 @@
   "eval_throughput": false,
   "fast_norm": false,
   "fd_loss_fn": "MSE",
-  "feature_normalization": "
+  "feature_normalization": "PHI_STANDARDIZE",
   "feature_summarizer": "cls_token",
   "feature_upscale_factor": null,
   "force_new_wandb_id": false,
@@ -74,8 +73,8 @@
   "head_init_bias": null,
   "head_init_scale": null,
   "head_lr": null,
-  "head_warmup":
-  "head_weight_decay": 0.
+  "head_warmup": 5,
+  "head_weight_decay": 0.03,
   "hflip": 0.5,
   "img_size": null,
   "in_chans": null,
@@ -106,10 +105,10 @@
   "mixup_off_epoch": 0,
   "mixup_prob": 1.0,
   "mixup_switch_prob": 0.5,
-  "mlp_hidden_size":
-  "mlp_num_inner":
+  "mlp_hidden_size": 1520,
+  "mlp_num_inner": 1,
   "mlp_version": "v2",
-  "model": "
+  "model": "vit_giant_patch16_224",
   "model_kwargs": {},
   "model_norm": false,
   "momentum": 0.9,
@@ -137,10 +136,10 @@
   ],
   "recount": 1,
   "recovery_interval": 0,
-  "register_multiple":
+  "register_multiple": 8,
   "remode": "pixel",
   "reprob": 0.0,
-  "reset_loss_state":
+  "reset_loss_state": true,
   "resplit": false,
   "sample_tracking": false,
   "save_images": false,
@@ -169,29 +168,17 @@
       "model": "ViT-H-14-378-quickgelu",
       "name": "clip",
       "pretrained": "dfn5b",
-      "type": "open_clip"
-    },
-    {
-      "feature_distillation": true,
-      "input_size": 448,
-      "name": "paligemma-448",
-      "type": "paligemma",
-      "use_summary": false
+      "type": "open_clip",
+      "use_summary": true
     },
     {
       "fd_normalize": false,
       "feature_distillation": true,
-      "input_size":
+      "input_size": 448,
       "model": "dinov2_vitg14_reg",
       "name": "dino_v2",
-      "type": "dino_v2"
-    },
-    {
-      "feature_distillation": true,
-      "input_size": 378,
-      "name": "aimv2",
-      "type": "aimv2",
-      "use_summary": false
+      "type": "dino_v2",
+      "use_summary": true
     },
     {
       "fd_normalize": false,
@@ -230,14 +217,14 @@
   },
   "feature_normalizer_config": null,
   "inter_feature_normalizer_config": null,
-  "max_resolution":
-  "patch_size":
+  "max_resolution": 2048,
+  "patch_size": 16,
   "preferred_resolution": [
-
-
+    768,
+    768
   ],
   "torch_dtype": "float32",
   "transformers_version": "4.47.0.dev0",
-  "version": "radio_v2.5-g",
+  "version": "c-radio_v2.5-g",
   "vitdet_window_size": null
 }
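The values written here describe the new c-radio_v2.5-g release: a ViT-g/16 backbone (patch size 16), a maximum input resolution of 2048, and a preferred resolution of 768x768. A minimal sanity-check sketch using only the standard library, assuming config.json is read from the repository root:

    # Sketch: confirm the top-level geometry fields this commit sets in config.json.
    import json

    with open("config.json") as f:
        cfg = json.load(f)

    assert cfg["patch_size"] == 16
    assert cfg["max_resolution"] == 2048
    assert cfg["preferred_resolution"] == [768, 768]
    print(cfg["version"])  # "c-radio_v2.5-g"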
extra_timm_models.py
CHANGED

@@ -24,7 +24,7 @@ from . import dinov2_arch
 def vit_tiny_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Tiny (Vit-Ti/16)
     """
-    model_args = dict(patch_size=14, embed_dim=192, depth=12, num_heads=3)
+    model_args = dict(patch_size=14, embed_dim=192, depth=12, num_heads=3, weight_init='skip')
     model = _create_vision_transformer('vit_tiny_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model

@@ -33,7 +33,7 @@ def vit_tiny_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_small_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Small (ViT-S/16)
     """
-    model_args = dict(patch_size=14, embed_dim=384, depth=12, num_heads=6)
+    model_args = dict(patch_size=14, embed_dim=384, depth=12, num_heads=6, weight_init='skip')
     model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model

@@ -43,7 +43,7 @@ def vit_base_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Base (ViT-B/14) from original paper (https://arxiv.org/abs/2010.11929).
     ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
     """
-    model_args = dict(patch_size=14, embed_dim=768, depth=12, num_heads=12)
+    model_args = dict(patch_size=14, embed_dim=768, depth=12, num_heads=12, weight_init='skip')
     model = _create_vision_transformer('vit_base_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model

@@ -52,7 +52,7 @@ def vit_base_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_huge_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Huge model (ViT-H/16) from original paper (https://arxiv.org/abs/2010.11929).
     """
-    model_args = dict(patch_size=16, embed_dim=1280, depth=32, num_heads=16)
+    model_args = dict(patch_size=16, embed_dim=1280, depth=32, num_heads=16, weight_init='skip')
     if pretrained:
         # There is no pretrained version of ViT-H/16, but we can adapt a ViT-H/14 for this purpose
         model = _create_vision_transformer('vit_huge_patch14_224', pretrained=True, **dict(model_args, **kwargs))

@@ -65,7 +65,7 @@ def vit_huge_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
 def vit_huge_patch16_224_mlpnorm(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-Huge model (ViT-H/16) from original paper (https://arxiv.org/abs/2010.11929).
     """
-    model = vit_huge_patch16_224(pretrained=pretrained, **kwargs)
+    model = vit_huge_patch16_224(pretrained=pretrained, weight_init='skip', **kwargs)

     for m in model.modules():
         if isinstance(m, Mlp) and not isinstance(m.norm, nn.LayerNorm):

@@ -74,9 +74,18 @@ def vit_huge_patch16_224_mlpnorm(pretrained=False, **kwargs) -> VisionTransformer:
     return model


+@register_model
+def vit_giant_patch16_224(pretrained=False, **kwargs) -> VisionTransformer:
+    """ ViT-giant model (ViT-g/16) from original paper (https://arxiv.org/abs/2010.11929).
+    """
+    model_args = dict(patch_size=16, embed_dim=1536, depth=40, num_heads=24, weight_init='skip')
+    model = _create_vision_transformer('vit_giant_patch16_224', pretrained=False, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_bigG_patch14_224(pretrained=False, **kwargs) -> VisionTransformer:
-    model_args = dict(patch_size=14, embed_dim=1664, depth=48, num_heads=16, init_values=1e-6)
+    model_args = dict(patch_size=14, embed_dim=1664, depth=48, num_heads=16, init_values=1e-6, weight_init='skip')
     model = _create_vision_transformer('vit_bigG_patch14', pretrained=False, **dict(model_args, **kwargs))
     return model

@@ -102,3 +111,4 @@ def _patch_layer_scale(model: VisionTransformer):
         if isinstance(mod.ls2, TIMMLayerScale):
             mod.ls2 = replace_ls(mod.ls2)
     pass
+
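This file now registers vit_giant_patch16_224 (embed_dim 1536, depth 40, 24 heads), matching the "model" value written into config.json. A usage sketch, assuming this module has been imported so timm's @register_model decorator has run, and that name resolution behaves as it does for the other entries in this file; the import path is an assumption:

    # Sketch: build the newly registered ViT-g/16 by name via timm's registry.
    # Importing the module is what triggers the @register_model registration.
    import timm
    import extra_timm_models  # noqa: F401  (import path is an assumption)

    model = timm.create_model("vit_giant_patch16_224", pretrained=False)
    print(sum(p.numel() for p in model.parameters()))  # roughly 1.1e9 parameters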
model.safetensors
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa6e741d60c99c87d8be4f74439daaadf1eb831bf78d4cfbe1e97ce672204bd1
+size 4638530048
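What is committed here is the Git LFS pointer, not the weights themselves; the referenced blob is about 4.6 GB. A minimal loading sketch, assuming the LFS object has been fetched (for example with git lfs pull) and sits in the working directory:

    # Sketch: load the fetched weights on CPU with the safetensors library.
    from safetensors.torch import load_file

    state_dict = load_file("model.safetensors")
    print(f"{len(state_dict)} tensors, "
          f"{sum(t.numel() for t in state_dict.values()):,} values")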