Grounded-Segment-Anything
/
grounded-sam-osx
/transformer_utils
/mmpose
/models
/backbones
/v2v_net.py
| # ------------------------------------------------------------------------------ | |
| # Copyright and License Information | |
| # Adapted from | |
| # https://github.com/microsoft/voxelpose-pytorch/blob/main/lib/models/v2v_net.py | |
| # Original Licence: MIT License | |
| # ------------------------------------------------------------------------------ | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from mmcv.cnn import ConvModule | |
| from ..builder import BACKBONES | |
| from .base_backbone import BaseBackbone | |
class Basic3DBlock(nn.Module):
    """Single 3D convolution stage built on mmcv's ``ConvModule``.

    The convolution always uses stride 1 and a padding of
    ``(kernel_size - 1) // 2``; normalisation and activation are whatever
    ``ConvModule`` builds from the given configs.

    Args:
        in_channels (int): Number of channels of the input volume.
        out_channels (int): Number of channels produced by this block.
        kernel_size (int): Kernel size of the convolution.
        conv_cfg (dict): Config dict used to build the conv layer.
            Default: dict(type='Conv3d')
        norm_cfg (dict): Config dict used to build the norm layer.
            Default: dict(type='BN3d')
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 conv_cfg=dict(type='Conv3d'),
                 norm_cfg=dict(type='BN3d')):
        super().__init__()
        # Half-kernel padding: with stride 1 and an odd kernel this keeps
        # the spatial size unchanged (assuming the default dilation of 1).
        half_kernel = (kernel_size - 1) // 2
        self.block = ConvModule(
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            padding=half_kernel,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            bias=True)

    def forward(self, x):
        """Apply the wrapped conv module to ``x`` and return its output."""
        out = self.block(x)
        return out
class Res3DBlock(nn.Module):
    """Residual block made of two 3D convolutions plus a skip connection.

    The residual branch stacks two ``ConvModule`` layers (the second one is
    built with ``act_cfg=None``). The skip connection is an empty
    ``nn.Sequential`` when the channel count does not change, otherwise a
    1x1x1 ``ConvModule`` (also with ``act_cfg=None``) that maps
    ``in_channels`` to ``out_channels``. Both branches are summed and passed
    through a ReLU.

    Args:
        in_channels (int): Number of channels of the input volume.
        out_channels (int): Number of channels produced by this block.
        kernel_size (int): Kernel size of the residual-branch convolutions.
            Default: 3
        conv_cfg (dict): Config dict used to build the conv layers.
            Default: dict(type='Conv3d')
        norm_cfg (dict): Config dict used to build the norm layers.
            Default: dict(type='BN3d')
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 conv_cfg=dict(type='Conv3d'),
                 norm_cfg=dict(type='BN3d')):
        super().__init__()

        # Keyword arguments shared by both convolutions of the residual
        # branch; only the activation config differs between them.
        branch_kwargs = dict(
            stride=1,
            padding=(kernel_size - 1) // 2,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            bias=True)

        first_conv = ConvModule(
            in_channels, out_channels, kernel_size, **branch_kwargs)
        # No activation here: the ReLU is applied after the skip addition.
        second_conv = ConvModule(
            out_channels,
            out_channels,
            kernel_size,
            act_cfg=None,
            **branch_kwargs)
        self.res_branch = nn.Sequential(first_conv, second_conv)

        if in_channels != out_channels:
            # Channel counts differ, so project the input with a 1x1x1 conv
            # to make it addable to the residual branch output.
            self.skip_con = ConvModule(
                in_channels,
                out_channels,
                1,
                stride=1,
                padding=0,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=None,
                bias=True)
        else:
            # An empty Sequential acts as an identity mapping.
            self.skip_con = nn.Sequential()

    def forward(self, x):
        """Return ``relu(res_branch(x) + skip_con(x))``."""
        residual = self.res_branch(x)
        shortcut = self.skip_con(x)
        merged = residual + shortcut
        # In-place ReLU is safe: ``merged`` is a freshly allocated tensor.
        return F.relu(merged, inplace=True)
class Pool3DBlock(nn.Module):
    """Parameter-free 3D max-pooling block.

    The same value is used as both the pooling window and the stride, so
    the pooling windows do not overlap.

    Args:
        pool_size (int): Window size and stride of the 3D max-pool.
    """

    def __init__(self, pool_size):
        super().__init__()
        self.pool_size = pool_size

    def forward(self, x):
        """Max-pool ``x`` with window and stride equal to ``pool_size``."""
        window = self.pool_size
        return F.max_pool3d(x, kernel_size=window, stride=window)
class Upsample3DBlock(nn.Module):
    """3D upsampling block: transposed conv, batch norm, then ReLU.

    Only ``kernel_size == 2`` and ``stride == 2`` are accepted; any other
    value trips an assertion in the constructor.

    Args:
        in_channels (int): Number of channels of the input volume.
        out_channels (int): Number of channels produced by this block.
        kernel_size (int): Kernel size of the transposed convolution.
            Must be 2. Default: 2
        stride (int): Stride of the transposed convolution.
            Must be 2. Default: 2
    """

    def __init__(self, in_channels, out_channels, kernel_size=2, stride=2):
        super().__init__()
        # The block is only defined for the fixed 2x configuration.
        assert kernel_size == 2
        assert stride == 2

        deconv = nn.ConvTranspose3d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=0,
            output_padding=0)
        norm = nn.BatchNorm3d(out_channels)
        activation = nn.ReLU(True)
        self.block = nn.Sequential(deconv, norm, activation)

    def forward(self, x):
        """Upsample ``x`` with the transposed-conv / BN / ReLU stack."""
        upsampled = self.block(x)
        return upsampled
class EncoderDecorder(nn.Module):
    """Two-level 3D encoder-decoder with additive skip connections.

    The encoder applies two (max-pool by 2, residual block) stages that
    widen the channels to 2x and then 4x ``in_channels``. After a middle
    residual block, the decoder applies two (residual block, 2x upsample)
    stages that narrow the channels back to ``in_channels``. The features
    taken before each pooling step go through their own residual block and
    are added to the decoder output of the matching level.

    Because the skip features are added element-wise to the upsampled
    features, the spatial sizes must match; with two pool-by-2 steps this
    presumably requires each spatial dimension of the input to be divisible
    by 4 -- confirm against callers.

    Args:
        in_channels (int): Number of channels of the input (and output)
            volume. Default: 32
    """

    def __init__(self, in_channels=32):
        super().__init__()
        # Channel widths at the three resolution levels.
        width1 = in_channels
        width2 = in_channels * 2
        width4 = in_channels * 4

        # Encoder path (attribute order is kept so that module and
        # state-dict ordering stay the same).
        self.encoder_pool1 = Pool3DBlock(2)
        self.encoder_res1 = Res3DBlock(width1, width2)
        self.encoder_pool2 = Pool3DBlock(2)
        self.encoder_res2 = Res3DBlock(width2, width4)

        # Bottleneck.
        self.mid_res = Res3DBlock(width4, width4)

        # Decoder path.
        self.decoder_res2 = Res3DBlock(width4, width4)
        self.decoder_upsample2 = Upsample3DBlock(width4, width2, 2, 2)
        self.decoder_res1 = Res3DBlock(width2, width2)
        self.decoder_upsample1 = Upsample3DBlock(width2, width1, 2, 2)

        # Residual blocks applied on the skip connections.
        self.skip_res1 = Res3DBlock(width1, width1)
        self.skip_res2 = Res3DBlock(width2, width2)

    def forward(self, x):
        """Run the encoder-decoder and return the fused feature volume."""
        # Level 1: keep a skip feature at full resolution, then downsample.
        full_res_skip = self.skip_res1(x)
        feat = self.encoder_res1(self.encoder_pool1(x))

        # Level 2: keep a skip feature at half resolution, then downsample.
        half_res_skip = self.skip_res2(feat)
        feat = self.encoder_res2(self.encoder_pool2(feat))

        # Bottleneck at quarter resolution.
        feat = self.mid_res(feat)

        # Decode back to half resolution and fuse with the level-2 skip.
        feat = self.decoder_upsample2(self.decoder_res2(feat))
        feat = feat + half_res_skip

        # Decode back to full resolution and fuse with the level-1 skip.
        feat = self.decoder_upsample1(self.decoder_res1(feat))
        feat = feat + full_res_skip
        return feat
# NOTE(review): ``BACKBONES`` is imported by this file but is not used in the
# visible source -- confirm whether a ``@BACKBONES.register_module()``
# decorator was intended for this class.
class V2VNet(BaseBackbone):
    """V2VNet: a 3D convolutional volume-to-volume backbone.

    Please refer to the `paper <https://arxiv.org/abs/1711.07399>`
    for details.

    The network is a front stack (a 7x7x7 ``Basic3DBlock`` followed by a
    ``Res3DBlock``), an ``EncoderDecorder`` block, and a final 1x1x1
    ``nn.Conv3d`` that maps to ``output_channels``.

    Args:
        input_channels (int):
            Number of channels of the input feature volume.
        output_channels (int):
            Number of channels of the output volume.
        mid_channels (int):
            Input and output channels of the encoder-decoder block.
            Default: 32
    """

    def __init__(self, input_channels, output_channels, mid_channels=32):
        super().__init__()
        # The stem first maps to half of ``mid_channels`` and then widens
        # to ``mid_channels`` in the residual block.
        stem_channels = mid_channels // 2
        self.front_layers = nn.Sequential(
            Basic3DBlock(input_channels, stem_channels, 7),
            Res3DBlock(stem_channels, mid_channels),
        )
        self.encoder_decoder = EncoderDecorder(in_channels=mid_channels)
        self.output_layer = nn.Conv3d(
            mid_channels, output_channels, kernel_size=1, stride=1, padding=0)

        self._initialize_weights()

    def forward(self, x):
        """Forward function: front layers, encoder-decoder, output conv."""
        features = self.front_layers(x)
        features = self.encoder_decoder(features)
        return self.output_layer(features)

    def _initialize_weights(self):
        """Initialise every 3D (transposed) convolution in the network.

        Weights are drawn from a normal distribution with mean 0 and
        standard deviation 0.001; biases are set to 0.
        """
        conv_types = (nn.Conv3d, nn.ConvTranspose3d)
        for layer in self.modules():
            if not isinstance(layer, conv_types):
                continue
            nn.init.normal_(layer.weight, 0, 0.001)
            nn.init.constant_(layer.bias, 0)