# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
from collections import OrderedDict

import torch
import torch.nn as nn

from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
class Bottleneck(nn.Module):
    """CLIP-style anti-aliased bottleneck residual block.

    Every convolution runs with stride 1; spatial reduction (when
    ``stride > 1``) is performed by an average pool placed after the
    second convolution, and the shortcut likewise average-pools before
    its 1x1 projection.
    """

    # Channel multiplier applied by the final 1x1 convolution.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1):
        super().__init__()

        # 1x1 reduce.
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        # 3x3 (optionally dilated) spatial convolution, stride kept at 1.
        self.conv2 = nn.Conv2d(
            planes, planes, 3, padding=dilation, bias=False, dilation=dilation
        )
        self.bn2 = nn.BatchNorm2d(planes)

        # Anti-aliasing: pool instead of a strided convolution.
        self.avgpool = nn.Identity() if stride <= 1 else nn.AvgPool2d(stride)

        # 1x1 expand.
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        self.stride = stride

        # A shortcut projection is required whenever the residual branch
        # changes the tensor shape (spatial downsampling or a channel
        # count mismatch). Pool-then-conv mirrors the main branch's
        # anti-aliasing scheme.
        if stride > 1 or inplanes != planes * Bottleneck.expansion:
            self.downsample = nn.Sequential(
                OrderedDict(
                    [
                        ("-1", nn.AvgPool2d(stride)),
                        (
                            "0",
                            nn.Conv2d(
                                inplanes,
                                planes * self.expansion,
                                1,
                                stride=1,
                                bias=False,
                            ),
                        ),
                        ("1", nn.BatchNorm2d(planes * self.expansion)),
                    ]
                )
            )

    def forward(self, x: torch.Tensor):
        # Shortcut path: projected when shapes differ, identity otherwise.
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)
class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    def __init__(self, layers, width=64, strides=[2, 1, 2, 2, 2], multi_grid=[1, 1, 1]):
        super().__init__()

        half = width // 2
        # Three-layer stem; only the first convolution is strided, and an
        # avgpool (when strides[0] > 1) finishes the downsampling.
        self.conv1 = nn.Conv2d(3, half, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(half)
        self.conv2 = nn.Conv2d(half, half, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(half)
        self.conv3 = nn.Conv2d(half, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.Identity() if strides[0] <= 1 else nn.AvgPool2d(strides[0])
        self.relu = nn.ReLU(inplace=True)

        # Residual stages. ``_inplanes`` is a *mutable* bookkeeping value
        # advanced by each _make_layer call during construction.
        self._inplanes = width
        self.layer1 = self._make_layer(width, layers[0], stride=strides[1])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=strides[2])
        self.layer3 = self._make_layer(width * 4, layers[2], stride=strides[3])
        self.layer4 = self._make_layer(
            width * 8, layers[3], stride=strides[4], dilations=multi_grid
        )

        # Channel counts of res2..res5 (Bottleneck.expansion == 4).
        self.num_features = [width * 4, width * 8, width * 16, width * 32]

    def _make_layer(self, planes, blocks, stride=1, dilations=None):
        # Build one residual stage; only its first block downsamples.
        if dilations is None:
            dilations = [1] * blocks
        stage = [Bottleneck(self._inplanes, planes, stride, dilation=dilations[0])]
        self._inplanes = planes * Bottleneck.expansion
        stage.extend(
            Bottleneck(self._inplanes, planes, dilation=dilations[i])
            for i in range(1, blocks)
        )
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run the backbone and return {"res2", "res3", "res4", "res5"} feature maps."""
        # Match the parameter dtype (e.g. under mixed precision).
        x = x.type(self.conv1.weight.dtype)

        # Stem: three conv-bn-relu units, then the (possibly identity)
        # avgpool -> 1/4 resolution with the default strides.
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.avgpool(x)

        output = {}
        x = self.layer1(x)
        output["res2"] = x  # 1/4, 1/4
        x = self.layer2(x)
        output["res3"] = x  # 1/8, 1/8
        x = self.layer3(x)
        output["res4"] = x  # 1/16, 1/16
        x = self.layer4(x)
        output["res5"] = x  # 1/32, 1/32
        return output
class D2ModifiedResNet(ModifiedResNet, Backbone):
    """detectron2 ``Backbone`` wrapper around :class:`ModifiedResNet`.

    Reads the architecture from ``cfg.MODEL.RESNETS`` and exposes the
    subset of {res2, res3, res4, res5} named in ``OUT_FEATURES``.
    """

    def __init__(self, cfg, input_shape):
        depth = cfg.MODEL.RESNETS.DEPTH
        num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
        width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
        bottleneck_channels = num_groups * width_per_group
        # Stage depths keyed by total network depth.
        # NOTE(review): the 18/34 entries are BasicBlock counts, yet
        # ModifiedResNet only builds Bottleneck blocks — confirm those
        # depths are actually meant to be supported.
        num_blocks_per_stage = {
            18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
        }[depth]
        strides = [2, 1, 2, 2, 2]
        multi_grid = cfg.MODEL.RESNETS.RES5_MULTI_GRID
        if cfg.MODEL.RESNETS.STEM_TYPE == "deeplab":
            # DeepLab-style stem: strides[0] == 1 disables the stem avgpool
            # in ModifiedResNet, keeping higher resolution after the stem.
            strides = [1, 1, 2, 2, 2]
        super().__init__(
            num_blocks_per_stage,
            bottleneck_channels,
            strides=strides,
            multi_grid=multi_grid,
        )
        self._out_features = cfg.MODEL.RESNETS.OUT_FEATURES
        self._out_feature_strides = {
            "res2": 4,
            "res3": 8,
            "res4": 16,
            "res5": 32,
        }
        self._out_feature_channels = {
            "res2": self.num_features[0],
            "res3": self.num_features[1],
            "res4": self.num_features[2],
            "res5": self.num_features[3],
        }

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        y = super().forward(x)
        # Keep only the feature maps requested via OUT_FEATURES.
        return {k: v for k, v in y.items() if k in self._out_features}

    def output_shape(self):
        """Return a ``ShapeSpec`` (channels, stride) per selected output feature."""
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name],
                stride=self._out_feature_strides[name],
            )
            for name in self._out_features
        }

    @property
    def size_divisibility(self):
        # Fix: detectron2's Backbone declares ``size_divisibility`` as a
        # property. Without @property, ``backbone.size_divisibility`` would
        # evaluate to a bound method (always truthy, never 32) and break
        # the padding contract documented in forward() above.
        return 32