# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
from collections import OrderedDict

import torch
import torch.nn as nn

from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
class Bottleneck(nn.Module):
    """CLIP-style anti-aliased bottleneck residual block.

    Every convolution runs with stride 1; spatial reduction (when
    ``stride > 1``) is performed by an average pool placed after the
    second convolution, and the shortcut likewise average-pools before
    its 1x1 projection.
    """

    # Channel multiplier applied by the final 1x1 convolution.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1):
        super().__init__()

        # 1x1 reduce.
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        # 3x3 (optionally dilated) spatial convolution, stride kept at 1.
        self.conv2 = nn.Conv2d(
            planes, planes, 3, padding=dilation, bias=False, dilation=dilation
        )
        self.bn2 = nn.BatchNorm2d(planes)

        # Anti-aliasing: pool instead of a strided convolution.
        self.avgpool = nn.Identity() if stride <= 1 else nn.AvgPool2d(stride)

        # 1x1 expand.
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        self.stride = stride

        # A shortcut projection is required whenever the residual branch
        # changes the tensor shape (spatial downsampling or a channel
        # count mismatch). Pool-then-conv mirrors the main branch's
        # anti-aliasing scheme.
        if stride > 1 or inplanes != planes * Bottleneck.expansion:
            self.downsample = nn.Sequential(
                OrderedDict(
                    [
                        ("-1", nn.AvgPool2d(stride)),
                        (
                            "0",
                            nn.Conv2d(
                                inplanes,
                                planes * self.expansion,
                                1,
                                stride=1,
                                bias=False,
                            ),
                        ),
                        ("1", nn.BatchNorm2d(planes * self.expansion)),
                    ]
                )
            )

    def forward(self, x: torch.Tensor):
        # Shortcut path: projected when shapes differ, identity otherwise.
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)
class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    def __init__(self, layers, width=64, strides=[2, 1, 2, 2, 2], multi_grid=[1, 1, 1]):
        super().__init__()

        half = width // 2
        # Three-layer stem; only the first convolution is strided, and an
        # avgpool (when strides[0] > 1) finishes the downsampling.
        self.conv1 = nn.Conv2d(3, half, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(half)
        self.conv2 = nn.Conv2d(half, half, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(half)
        self.conv3 = nn.Conv2d(half, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.Identity() if strides[0] <= 1 else nn.AvgPool2d(strides[0])
        self.relu = nn.ReLU(inplace=True)

        # Residual stages. ``_inplanes`` is a *mutable* bookkeeping value
        # advanced by each _make_layer call during construction.
        self._inplanes = width
        self.layer1 = self._make_layer(width, layers[0], stride=strides[1])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=strides[2])
        self.layer3 = self._make_layer(width * 4, layers[2], stride=strides[3])
        self.layer4 = self._make_layer(
            width * 8, layers[3], stride=strides[4], dilations=multi_grid
        )

        # Channel counts of res2..res5 (Bottleneck.expansion == 4).
        self.num_features = [width * 4, width * 8, width * 16, width * 32]

    def _make_layer(self, planes, blocks, stride=1, dilations=None):
        # Build one residual stage; only its first block downsamples.
        if dilations is None:
            dilations = [1] * blocks
        stage = [Bottleneck(self._inplanes, planes, stride, dilation=dilations[0])]
        self._inplanes = planes * Bottleneck.expansion
        stage.extend(
            Bottleneck(self._inplanes, planes, dilation=dilations[i])
            for i in range(1, blocks)
        )
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run the backbone and return {"res2", "res3", "res4", "res5"} feature maps."""
        # Match the parameter dtype (e.g. under mixed precision).
        x = x.type(self.conv1.weight.dtype)

        # Stem: three conv-bn-relu units, then the (possibly identity)
        # avgpool -> 1/4 resolution with the default strides.
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.avgpool(x)

        output = {}
        x = self.layer1(x)
        output["res2"] = x  # 1/4, 1/4
        x = self.layer2(x)
        output["res3"] = x  # 1/8, 1/8
        x = self.layer3(x)
        output["res4"] = x  # 1/16, 1/16
        x = self.layer4(x)
        output["res5"] = x  # 1/32, 1/32
        return output
class D2ModifiedResNet(ModifiedResNet, Backbone):
    """detectron2 ``Backbone`` wrapper around :class:`ModifiedResNet`.

    Reads the architecture from ``cfg.MODEL.RESNETS`` and exposes the
    subset of {res2, res3, res4, res5} named in ``OUT_FEATURES``.
    """

    def __init__(self, cfg, input_shape):
        depth = cfg.MODEL.RESNETS.DEPTH
        num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
        width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
        bottleneck_channels = num_groups * width_per_group
        # Stage depths keyed by total network depth.
        # NOTE(review): the 18/34 entries are BasicBlock counts, yet
        # ModifiedResNet only builds Bottleneck blocks — confirm those
        # depths are actually meant to be supported.
        num_blocks_per_stage = {
            18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
        }[depth]
        strides = [2, 1, 2, 2, 2]
        multi_grid = cfg.MODEL.RESNETS.RES5_MULTI_GRID
        if cfg.MODEL.RESNETS.STEM_TYPE == "deeplab":
            # DeepLab-style stem: strides[0] == 1 disables the stem avgpool
            # in ModifiedResNet, keeping higher resolution after the stem.
            strides = [1, 1, 2, 2, 2]
        super().__init__(
            num_blocks_per_stage,
            bottleneck_channels,
            strides=strides,
            multi_grid=multi_grid,
        )
        self._out_features = cfg.MODEL.RESNETS.OUT_FEATURES
        self._out_feature_strides = {
            "res2": 4,
            "res3": 8,
            "res4": 16,
            "res5": 32,
        }
        self._out_feature_channels = {
            "res2": self.num_features[0],
            "res3": self.num_features[1],
            "res4": self.num_features[2],
            "res5": self.num_features[3],
        }

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        y = super().forward(x)
        # Keep only the feature maps requested via OUT_FEATURES.
        return {k: v for k, v in y.items() if k in self._out_features}

    def output_shape(self):
        """Return a ``ShapeSpec`` (channels, stride) per selected output feature."""
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name],
                stride=self._out_feature_strides[name],
            )
            for name in self._out_features
        }

    @property
    def size_divisibility(self):
        # Fix: detectron2's Backbone declares ``size_divisibility`` as a
        # property. Without @property, ``backbone.size_divisibility`` would
        # evaluate to a bound method (always truthy, never 32) and break
        # the padding contract documented in forward() above.
        return 32