Duplicate from befozg/stylematte
Co-authored-by: Karen Efremyan <befozg@users.noreply.huggingface.co>
- .gitattributes +34 -0
- .gitignore +1 -0
- README.md +13 -0
- app.py +30 -0
- base.yaml +61 -0
- logo.jpeg +0 -0
- models.py +481 -0
- requirements.txt +38 -0
- stylematte.pth +3 -0
- test.py +1002 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
*.pth
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: Stylematte
emoji: 💻
colorFrom: indigo
colorTo: pink
sdk: gradio
sdk_version: 3.29.0
app_file: app.py
pinned: false
duplicated_from: befozg/stylematte
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,30 @@
import gradio as gr
from test import inference_img
from models import *

device = 'cpu'
model = StyleMatte()
model = model.to(device)
checkpoint = f"stylematte.pth"
state_dict = torch.load(checkpoint, map_location=f'{device}')

model.load_state_dict(state_dict)
model.eval()

def predict(inp):
    print("***********Inference****************")
    res = inference_img(model, inp)
    print("***********Inference finish****************")

    return res

print("MODEL LOADED")
print("************************************")

iface = gr.Interface(fn=predict,
                     inputs=gr.Image(type="numpy"),
                     outputs=gr.Image(type="numpy"),
                     examples=["./logo.jpeg"])
print("****************Interface created******************")

iface.launch()
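For debugging outside the Gradio UI, the same pieces can be exercised directly. A minimal sketch, assuming stylematte.pth sits in the working directory and that inference_img from test.py takes an RGB numpy array exactly as app.py calls it; "sample.jpg" is only a placeholder path:

import numpy as np
import torch
from PIL import Image

from models import StyleMatte
from test import inference_img

device = 'cpu'
model = StyleMatte().to(device)
model.load_state_dict(torch.load("stylematte.pth", map_location=device))
model.eval()

img = np.array(Image.open("sample.jpg").convert("RGB"))  # placeholder input image
with torch.no_grad():
    matte = inference_img(model, img)  # same call the Gradio predict() wrapper makes
print(type(matte))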
base.yaml
ADDED
@@ -0,0 +1,61 @@
world_size: 1
experiment_name: "test"
datasets:
  synthetic_fg: "/home/jovyan/datasets/synthetic_psi/"
  synthetic_animals: "/home/jovyan/datasets/synthetic_psiny/"
  bg: "/home/jovyan/datasets/matting/background/testval/"
  ppm100: "/home/jovyan/kvanchiani/stylegan3/PPM-100/image"
  aim500: "/home/jovyan/datasets/AIM-500"
  am2k:
    train_original: "/home/jovyan/datasets/matting/am-2k/train/original"
    train_mask: "/home/jovyan/datasets/matting/am-2k/train/mask"
    background: "/home/jovyan/datasets/matting/am-2k/background/train"
    validation_original: "/home/jovyan/datasets/matting/am-2k/validation/original/"
    validation_mask: "/home/jovyan/datasets/matting/am-2k/validation/mask/"
    validation_trimap: "/home/jovyan/datasets/matting/am-2k/validation/trimap/"
  tiktok: "/home/jovyan/datasets/tiktokdataset/dataset"
  p3m10k: "/home/jovyan/datasets/matting/P3M-10k"
  p3m10k_test:
    VAL500P:
      ROOT_PATH: "P3M-500-P/"
      ORIGINAL_PATH: "P3M-500-P/blurred_image/"
      MASK_PATH: "P3M-500-P/mask/"
      TRIMAP_PATH: "P3M-500-P/trimap/"
      SAMPLE_NUMBER: 500
    VAL500NP:
      ROOT_PATH: "P3M-500-NP/"
      ORIGINAL_PATH: "P3M-500-NP/original_image/"
      MASK_PATH: "P3M-500-NP/mask/"
      TRIMAP_PATH: "P3M-500-NP/trimap/"
      SAMPLE_NUMBER: 500
MAX_SIZE_H: 1600
MAX_SIZE_W: 1600
image_crop: 800
max_image_count: 10000
dataset_to_use: MixedDataset
pretrained_model: "microsoft/swinv2-tiny-patch4-window8-256" #"nielsr/mask2former-swin-base-youtubevis-2021" #"nvidia/mit-b2"
batch_size: 4
num_workers: 4
log_dir: "log"
checkpoint_dir: "checkpoints"
checkpoint: "best-89.pth"
distributed_addr: "localhost"
distributed_port: "12357"
image_size: 800
lr: 1e-7
epochs: 200
disable_validation: False
warmup_steps: 2
validate_each_epoch: 5
max_images_for_validation: 500
disable_mixed_precision: True
log_image_interval: 500
log_image_number: 8
save_model_interval: 10000
switch: 3
lambda_losses:
  default: 1.
  Laplassian: 1.
  Grad: 3.
  L1: 1.
  switch: 1e-6
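test.py loads this file with OmegaConf, so the keys above become attribute-style config fields. A minimal sketch of reading them; note the nesting of the dataset paths shown above is reconstructed from a flattened listing, so the exact hierarchy is an assumption:

from omegaconf import OmegaConf

config = OmegaConf.load("base.yaml")  # same call test.py makes

print(config.pretrained_model)        # "microsoft/swinv2-tiny-patch4-window8-256"
print(config.lambda_losses.Grad)      # 3.0
print(config.datasets.am2k.train_original)  # path access, if nested as shown above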
logo.jpeg
ADDED
models.py
ADDED
@@ -0,0 +1,481 @@
import cv2
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from typing import List
from itertools import chain

from transformers import SegformerForSemanticSegmentation,Mask2FormerForUniversalSegmentation
device='cpu'
class EncoderDecoder(nn.Module):
    def __init__(
        self,
        encoder,
        decoder,
        prefix=nn.Conv2d(3, 3, kernel_size=3, padding=1, bias=True),
    ):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.prefix = prefix

    def forward(self, x):
        if self.prefix is not None:
            x = self.prefix(x)
        x = self.encoder(x)["hidden_states"] #transformers
        return self.decoder(x)


def conv2d_relu(input_filters,output_filters,kernel_size=3, bias=True):
    return nn.Sequential(
        nn.Conv2d(input_filters, output_filters, kernel_size=kernel_size, padding=kernel_size//2, bias=bias),
        nn.LeakyReLU(0.2, inplace=True),
        nn.BatchNorm2d(output_filters)
    )

def up_and_add(x, y):
    return F.interpolate(x, size=(y.size(2), y.size(3)), mode='bilinear', align_corners=True) + y

class FPN_fuse(nn.Module):
    def __init__(self, feature_channels=[256, 512, 1024, 2048], fpn_out=256):
        super(FPN_fuse, self).__init__()
        assert feature_channels[0] == fpn_out
        self.conv1x1 = nn.ModuleList([nn.Conv2d(ft_size, fpn_out, kernel_size=1)
                                      for ft_size in feature_channels[1:]])
        self.smooth_conv = nn.ModuleList([nn.Conv2d(fpn_out, fpn_out, kernel_size=3, padding=1)]
                                         * (len(feature_channels)-1))
        self.conv_fusion = nn.Sequential(
            nn.Conv2d(2*fpn_out, fpn_out, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(fpn_out),
            nn.ReLU(inplace=True),
        )

    def forward(self, features):

        features[:-1] = [conv1x1(feature) for feature, conv1x1 in zip(features[:-1], self.conv1x1)]##
        feature=up_and_add(self.smooth_conv[0](features[0]),features[1])
        feature=up_and_add(self.smooth_conv[1](feature),features[2])
        feature=up_and_add(self.smooth_conv[2](feature),features[3])


        H, W = features[-1].size(2), features[-1].size(3)
        x = [feature,features[-1]]
        x = [F.interpolate(x_el, size=(H, W), mode='bilinear', align_corners=True) for x_el in x]

        x = self.conv_fusion(torch.cat(x, dim=1))
        #x = F.interpolate(x, size=(H*4, W*4), mode='bilinear', align_corners=True)
        return x

class PSPModule(nn.Module):
    # In the original implementation they use precise RoI pooling
    # instead of adaptive average pooling
    def __init__(self, in_channels, bin_sizes=[1, 2, 4, 6]):
        super(PSPModule, self).__init__()
        out_channels = in_channels // len(bin_sizes)
        self.stages = nn.ModuleList([self._make_stages(in_channels, out_channels, b_s)
                                     for b_s in bin_sizes])
        self.bottleneck = nn.Sequential(
            nn.Conv2d(in_channels+(out_channels * len(bin_sizes)), in_channels,
                      kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.1)
        )

    def _make_stages(self, in_channels, out_channels, bin_sz):
        prior = nn.AdaptiveAvgPool2d(output_size=bin_sz)
        conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        bn = nn.BatchNorm2d(out_channels)
        relu = nn.ReLU(inplace=True)
        return nn.Sequential(prior, conv, bn, relu)

    def forward(self, features):
        h, w = features.size()[2], features.size()[3]
        pyramids = [features]
        pyramids.extend([F.interpolate(stage(features), size=(h, w), mode='bilinear',
                                       align_corners=True) for stage in self.stages])
        output = self.bottleneck(torch.cat(pyramids, dim=1))
        return output
class UperNet_swin(nn.Module):
    # Implementing only the object path
    def __init__(self, backbone,pretrained=True):
        super(UperNet_swin, self).__init__()


        self.backbone = backbone
        feature_channels = [192,384,768,768]
        self.PPN = PSPModule(feature_channels[-1])
        self.FPN = FPN_fuse(feature_channels, fpn_out=feature_channels[0])
        self.head = nn.Conv2d(feature_channels[0], 1, kernel_size=3, padding=1)


    def forward(self, x):
        input_size = (x.size()[2], x.size()[3])
        features = self.backbone(x)["hidden_states"]
        features[-1] = self.PPN(features[-1])
        x = self.head(self.FPN(features))

        x = F.interpolate(x, size=input_size, mode='bilinear')
        return x

    def get_backbone_params(self):
        return self.backbone.parameters()

    def get_decoder_params(self):
        return chain(self.PPN.parameters(), self.FPN.parameters(), self.head.parameters())

class UnetDecoder(nn.Module):
    def __init__(
        self,
        encoder_channels= (3,192,384,768,768),
        decoder_channels=(512,256,128,64),
        n_blocks=4,
        use_batchnorm=True,
        attention_type=None,
        center=False,
    ):
        super().__init__()

        if n_blocks != len(decoder_channels):
            raise ValueError(
                "Model depth is {}, but you provide `decoder_channels` for {} blocks.".format(
                    n_blocks, len(decoder_channels)
                )
            )

        # remove first skip with same spatial resolution
        encoder_channels = encoder_channels[1:]
        # reverse channels to start from head of encoder
        encoder_channels = encoder_channels[::-1]

        # computing blocks input and output channels
        head_channels = encoder_channels[0]
        in_channels = [head_channels] + list(decoder_channels[:-1])
        skip_channels = list(encoder_channels[1:]) + [0]

        out_channels = decoder_channels

        if center:
            self.center = CenterBlock(head_channels, head_channels, use_batchnorm=use_batchnorm)
        else:
            self.center = nn.Identity()

        # combine decoder keyword arguments
        kwargs = dict(use_batchnorm=use_batchnorm, attention_type=attention_type)
        blocks = [
            DecoderBlock(in_ch, skip_ch, out_ch, **kwargs)
            for in_ch, skip_ch, out_ch in zip(in_channels, skip_channels, out_channels)
        ]
        self.blocks = nn.ModuleList(blocks)
        upscale_factor=4
        self.matting_head = nn.Sequential(
            nn.Conv2d(64,1, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.UpsamplingBilinear2d(scale_factor=upscale_factor),
        )

    def preprocess_features(self,x):
        features=[]
        for out_tensor in x:
            bs,n,f=out_tensor.size()
            h = int(n**0.5)
            feature = out_tensor.view(-1,h,h,f).permute(0, 3, 1, 2).contiguous()
            features.append(feature)
        return features

    def forward(self, features):
        features = features[1:]  # remove first skip with same spatial resolution
        features = features[::-1]  # reverse channels to start from head of encoder

        features = self.preprocess_features(features)

        head = features[0]
        skips = features[1:]

        x = self.center(head)
        for i, decoder_block in enumerate(self.blocks):
            skip = skips[i] if i < len(skips) else None
            x = decoder_block(x, skip)
        #y_i = self.upsample1(y_i)
        #hypercol = torch.cat([y0,y1,y2,y3,y4], dim=1)
        x = self.matting_head(x)
        x=1-nn.ReLU()(1-x)
        return x


class SegmentationHead(nn.Sequential):
    def __init__(self, in_channels, out_channels, kernel_size=3, upsampling=1):
        conv2d = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=kernel_size // 2)
        upsampling = nn.UpsamplingBilinear2d(scale_factor=upsampling) if upsampling > 1 else nn.Identity()
        super().__init__(conv2d, upsampling)


class DecoderBlock(nn.Module):
    def __init__(
        self,
        in_channels,
        skip_channels,
        out_channels,
        use_batchnorm=True,
        attention_type=None,
    ):
        super().__init__()
        self.conv1 = conv2d_relu(
            in_channels + skip_channels,
            out_channels,
            kernel_size=3
        )
        self.conv2 = conv2d_relu(
            out_channels,
            out_channels,
            kernel_size=3,
        )
        self.in_channels=in_channels
        self.out_channels = out_channels
        self.skip_channels = skip_channels
    def forward(self, x, skip=None):
        if skip is None:
            x = F.interpolate(x, scale_factor=2, mode="nearest")
        else:
            if x.shape[-1]!=skip.shape[-1]:
                x = F.interpolate(x, scale_factor=2, mode="nearest")
        if skip is not None:
            #print(x.shape,skip.shape)
            x = torch.cat([x, skip], dim=1)
        x = self.conv1(x)
        x = self.conv2(x)
        return x


class CenterBlock(nn.Sequential):
    # use_batchnorm is accepted to match the call site in UnetDecoder; conv2d_relu always applies BatchNorm
    def __init__(self, in_channels, out_channels, use_batchnorm=True):
        conv1 = conv2d_relu(
            in_channels,
            out_channels,
            kernel_size=3,
        )
        conv2 = conv2d_relu(
            out_channels,
            out_channels,
            kernel_size=3,
        )
        super().__init__(conv1, conv2)



class SegForm(nn.Module):
    def __init__(self):
        super(SegForm, self).__init__()
        # configuration = SegformerConfig.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
        # configuration.num_labels = 1 ## set output as 1
        # self.model = SegformerForSemanticSegmentation(config=configuration)

        self.model = SegformerForSemanticSegmentation.from_pretrained("nvidia/mit-b0", num_labels=1, ignore_mismatched_sizes=True
        )
    def forward(self, image):
        img_segs = self.model(image)
        upsampled_logits = nn.functional.interpolate(img_segs.logits,
                                                     scale_factor=4,
                                                     mode='nearest',
                                                     )
        return upsampled_logits


class StyleMatte(nn.Module):
    def __init__(self):
        super(StyleMatte, self).__init__()
        # configuration = SegformerConfig.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
        # configuration.num_labels = 1 ## set output as 1
        self.fpn = FPN_fuse(feature_channels=[256, 256, 256, 256],fpn_out=256)
        self.pixel_decoder = Mask2FormerForUniversalSegmentation.from_pretrained("facebook/mask2former-swin-tiny-coco-instance").base_model.pixel_level_module
        self.fgf = FastGuidedFilter()
        self.conv = nn.Conv2d(256,1,kernel_size=3,padding=1)
        # self.mean = torch.Tensor([0.43216, 0.394666, 0.37645]).float().view(-1, 1, 1)
        # self.register_buffer('image_net_mean', self.mean)
        # self.std = torch.Tensor([0.22803, 0.22145, 0.216989]).float().view(-1, 1, 1)
        # self.register_buffer('image_net_std', self.std)
    def forward(self, image, normalize=False):
        # if normalize:
        #     image.sub_(self.get_buffer("image_net_mean")).div_(self.get_buffer("image_net_std"))

        decoder_out = self.pixel_decoder(image)
        decoder_states=list(decoder_out.decoder_hidden_states)
        decoder_states.append(decoder_out.decoder_last_hidden_state)
        out_pure=self.fpn(decoder_states)

        image_lr=nn.functional.interpolate(image.mean(1, keepdim=True),
                                           scale_factor=0.25,
                                           mode='bicubic',
                                           align_corners=True
                                           )
        out = self.conv(out_pure)
        out = self.fgf(image_lr,out,image.mean(1, keepdim=True))#.clip(0,1)
        # out = nn.Sigmoid()(out)
        # out = nn.functional.interpolate(out,
        #                                 scale_factor=4,
        #                                 mode='bicubic',
        #                                 align_corners=True
        #                                 )

        return torch.sigmoid(out)

    def get_training_params(self):
        return list(self.fpn.parameters())+list(self.conv.parameters())#+list(self.fgf.parameters())

class GuidedFilter(nn.Module):
    def __init__(self, r, eps=1e-8):
        super(GuidedFilter, self).__init__()

        self.r = r
        self.eps = eps
        self.boxfilter = BoxFilter(r)


    def forward(self, x, y):
        n_x, c_x, h_x, w_x = x.size()
        n_y, c_y, h_y, w_y = y.size()

        assert n_x == n_y
        assert c_x == 1 or c_x == c_y
        assert h_x == h_y and w_x == w_y
        assert h_x > 2 * self.r + 1 and w_x > 2 * self.r + 1

        # N
        N = self.boxfilter((x.data.new().resize_((1, 1, h_x, w_x)).fill_(1.0)))

        # mean_x
        mean_x = self.boxfilter(x) / N
        # mean_y
        mean_y = self.boxfilter(y) / N
        # cov_xy
        cov_xy = self.boxfilter(x * y) / N - mean_x * mean_y
        # var_x
        var_x = self.boxfilter(x * x) / N - mean_x * mean_x

        # A
        A = cov_xy / (var_x + self.eps)
        # b
        b = mean_y - A * mean_x

        # mean_A; mean_b
        mean_A = self.boxfilter(A) / N
        mean_b = self.boxfilter(b) / N

        return mean_A * x + mean_b
class FastGuidedFilter(nn.Module):
    def __init__(self, r=1, eps=1e-8):
        super(FastGuidedFilter, self).__init__()

        self.r = r
        self.eps = eps
        self.boxfilter = BoxFilter(r)


    def forward(self, lr_x, lr_y, hr_x):
        n_lrx, c_lrx, h_lrx, w_lrx = lr_x.size()
        n_lry, c_lry, h_lry, w_lry = lr_y.size()
        n_hrx, c_hrx, h_hrx, w_hrx = hr_x.size()

        assert n_lrx == n_lry and n_lry == n_hrx
        assert c_lrx == c_hrx and (c_lrx == 1 or c_lrx == c_lry)
        assert h_lrx == h_lry and w_lrx == w_lry
        assert h_lrx > 2*self.r+1 and w_lrx > 2*self.r+1

        ## N
        N = self.boxfilter(lr_x.new().resize_((1, 1, h_lrx, w_lrx)).fill_(1.0))

        ## mean_x
        mean_x = self.boxfilter(lr_x) / N
        ## mean_y
        mean_y = self.boxfilter(lr_y) / N
        ## cov_xy
        cov_xy = self.boxfilter(lr_x * lr_y) / N - mean_x * mean_y
        ## var_x
        var_x = self.boxfilter(lr_x * lr_x) / N - mean_x * mean_x

        ## A
        A = cov_xy / (var_x + self.eps)
        ## b
        b = mean_y - A * mean_x

        ## mean_A; mean_b
        mean_A = F.interpolate(A, (h_hrx, w_hrx), mode='bilinear', align_corners=True)
        mean_b = F.interpolate(b, (h_hrx, w_hrx), mode='bilinear', align_corners=True)

        return mean_A*hr_x+mean_b
class DeepGuidedFilterRefiner(nn.Module):
    def __init__(self, hid_channels=16):
        super().__init__()
        self.box_filter = nn.Conv2d(4, 4, kernel_size=3, padding=1, bias=False, groups=4)
        self.box_filter.weight.data[...] = 1 / 9
        self.conv = nn.Sequential(
            nn.Conv2d(4 * 2 + hid_channels, hid_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(hid_channels),
            nn.ReLU(True),
            nn.Conv2d(hid_channels, hid_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(hid_channels),
            nn.ReLU(True),
            nn.Conv2d(hid_channels, 4, kernel_size=1, bias=True)
        )

    def forward(self, fine_src, base_src, base_fgr, base_pha, base_hid):
        fine_x = torch.cat([fine_src, fine_src.mean(1, keepdim=True)], dim=1)
        base_x = torch.cat([base_src, base_src.mean(1, keepdim=True)], dim=1)
        base_y = torch.cat([base_fgr, base_pha], dim=1)

        mean_x = self.box_filter(base_x)
        mean_y = self.box_filter(base_y)
        cov_xy = self.box_filter(base_x * base_y) - mean_x * mean_y
        var_x = self.box_filter(base_x * base_x) - mean_x * mean_x

        A = self.conv(torch.cat([cov_xy, var_x, base_hid], dim=1))
        b = mean_y - A * mean_x

        H, W = fine_src.shape[2:]
        A = F.interpolate(A, (H, W), mode='bilinear', align_corners=False)
        b = F.interpolate(b, (H, W), mode='bilinear', align_corners=False)

        out = A * fine_x + b
        fgr, pha = out.split([3, 1], dim=1)
        return fgr, pha

def diff_x(input, r):
    assert input.dim() == 4

    left = input[:, :, r:2 * r + 1]
    middle = input[:, :, 2 * r + 1: ] - input[:, :, :-2 * r - 1]
    right = input[:, :, -1: ] - input[:, :, -2 * r - 1: -r - 1]

    output = torch.cat([left, middle, right], dim=2)

    return output

def diff_y(input, r):
    assert input.dim() == 4

    left = input[:, :, :, r:2 * r + 1]
    middle = input[:, :, :, 2 * r + 1: ] - input[:, :, :, :-2 * r - 1]
    right = input[:, :, :, -1: ] - input[:, :, :, -2 * r - 1: -r - 1]

    output = torch.cat([left, middle, right], dim=3)

    return output

class BoxFilter(nn.Module):
    def __init__(self, r):
        super(BoxFilter, self).__init__()

        self.r = r

    def forward(self, x):
        assert x.dim() == 4

        return diff_y(diff_x(x.cumsum(dim=2), self.r).cumsum(dim=3), self.r)

if __name__ == '__main__':
    model = StyleMatte().to(device)
    out = model(torch.randn(1,3,640,480).to(device))
    print(out.shape)
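The guided-filter upsampling in StyleMatte relies on BoxFilter, which gets local window sums from two cumulative sums plus the diff_x/diff_y slicing trick instead of an explicit sliding window. A small self-contained check of that equivalence, using only BoxFilter as defined above and comparing against a plain convolution with an all-ones kernel:

import torch
import torch.nn.functional as F
from models import BoxFilter

r = 2
x = torch.rand(1, 1, 16, 16)

box = BoxFilter(r)(x)  # cumulative-sum implementation from models.py

# Reference: full (2r+1) x (2r+1) window sums via an explicit convolution.
kernel = torch.ones(1, 1, 2 * r + 1, 2 * r + 1)
ref = F.conv2d(F.pad(x, (r, r, r, r)), kernel)

# BoxFilter truncates the window at the image borders, so compare interiors only.
print(torch.allclose(box[..., r:-r, r:-r], ref[..., r:-r, r:-r], atol=1e-5))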
requirements.txt
ADDED
@@ -0,0 +1,38 @@
gradio==3.30.0
gradio_client==0.2.4
huggingface-hub==0.14.1
imageio==2.25.1
imgcat==0.5.0
ipykernel==6.16.0
ipython==8.5.0
ipywidgets==8.0.2
kiwisolver==1.4.2
kornia==0.6.9
legacy==0.1.6
numpy==1.21.6
omegaconf==2.2.3
opencv-python==4.5.5.62
opencv-python-headless==4.7.0.68
packaging==21.3
pandas==1.4.2
parso==0.8.3
Pillow==9.4.0
protobuf==3.20.1
Pygments==2.13.0
PyMatting==1.1.8
pyparsing==3.0.9
pyrsistent==0.19.3
scikit-image==0.19.3
scikit-learn==1.1.1
scipy==1.10.0
seaborn==0.12.2
sklearn==0.0
sniffio==1.3.0
soupsieve==2.4
timm==0.6.12
torch==1.11.0
torchaudio==0.11.0
torchvision==0.12.0
tornado==6.2
tqdm==4.64.1
transformers==4.28.1
stylematte.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ce985571e909b6677d7d25e560216fa3f620e5cd337a8382ee0799c6d9af16c
size 140040541
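stylematte.pth is stored through Git LFS, so a clone without LFS support only yields the three pointer lines above rather than the ~140 MB checkpoint. One way to fetch the real file is huggingface_hub; a minimal sketch, where the repo id comes from the duplicated_from field above and repo_type="space" is an assumption about which repo actually hosts the weight:

from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="befozg/stylematte",
    filename="stylematte.pth",
    repo_type="space",  # assumption: pulling from the Space rather than a model repo
)
print(path)  # local cache path of the downloaded checkpoint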
test.py
ADDED
|
@@ -0,0 +1,1002 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#modified from Github repo: https://github.com/JizhiziLi/P3M
|
| 2 |
+
#added inference code for other networks
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import cv2
|
| 7 |
+
import argparse
|
| 8 |
+
import numpy as np
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from PIL import Image
|
| 11 |
+
from skimage.transform import resize
|
| 12 |
+
from torchvision import transforms,models
|
| 13 |
+
import os
|
| 14 |
+
from models import *
|
| 15 |
+
import torch.nn.functional as F
|
| 16 |
+
import torch
|
| 17 |
+
import torch.nn as nn
|
| 18 |
+
import math
|
| 19 |
+
from torch.autograd import Variable
|
| 20 |
+
import torch.nn.functional as fnn
|
| 21 |
+
import glob
|
| 22 |
+
import tqdm
|
| 23 |
+
from torch.autograd import Variable
|
| 24 |
+
from typing import Type, Any, Callable, Union, List, Optional
|
| 25 |
+
import logging
|
| 26 |
+
import time
|
| 27 |
+
from omegaconf import OmegaConf
|
| 28 |
+
config = OmegaConf.load("base.yaml")
|
| 29 |
+
device = "cpu"
|
| 30 |
+
|
| 31 |
+
def conv3x3(in_planes, out_planes, stride=1):
|
| 32 |
+
"3x3 convolution with padding"
|
| 33 |
+
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
|
| 34 |
+
padding=1, bias=False)
|
| 35 |
+
class TFI(nn.Module):
|
| 36 |
+
expansion = 1
|
| 37 |
+
def __init__(self, planes,stride=1):
|
| 38 |
+
super(TFI, self).__init__()
|
| 39 |
+
middle_planes = int(planes/2)
|
| 40 |
+
self.transform = conv1x1(planes, middle_planes)
|
| 41 |
+
self.conv1 = conv3x3(middle_planes*3, planes, stride)
|
| 42 |
+
self.bn1 = nn.BatchNorm2d(planes)
|
| 43 |
+
self.relu = nn.ReLU(inplace=True)
|
| 44 |
+
self.stride = stride
|
| 45 |
+
def forward(self, input_s_guidance, input_m_decoder, input_m_encoder):
|
| 46 |
+
input_s_guidance_transform = self.transform(input_s_guidance)
|
| 47 |
+
input_m_decoder_transform = self.transform(input_m_decoder)
|
| 48 |
+
input_m_encoder_transform = self.transform(input_m_encoder)
|
| 49 |
+
x = torch.cat((input_s_guidance_transform,input_m_decoder_transform,input_m_encoder_transform),1)
|
| 50 |
+
out = self.conv1(x)
|
| 51 |
+
out = self.bn1(out)
|
| 52 |
+
out = self.relu(out)
|
| 53 |
+
return out
|
| 54 |
+
class SBFI(nn.Module):
|
| 55 |
+
def __init__(self, planes,stride=1):
|
| 56 |
+
super(SBFI, self).__init__()
|
| 57 |
+
self.stride = stride
|
| 58 |
+
self.transform1 = conv1x1(planes, int(planes/2))
|
| 59 |
+
self.transform2 = conv1x1(64, int(planes/2))
|
| 60 |
+
self.maxpool = nn.MaxPool2d(2, stride=stride)
|
| 61 |
+
self.conv1 = conv3x3(planes, planes, 1)
|
| 62 |
+
self.bn1 = nn.BatchNorm2d(planes)
|
| 63 |
+
self.relu = nn.ReLU(inplace=True)
|
| 64 |
+
def forward(self, input_m_decoder,e0):
|
| 65 |
+
input_m_decoder_transform = self.transform1(input_m_decoder)
|
| 66 |
+
e0_maxpool = self.maxpool(e0)
|
| 67 |
+
e0_transform = self.transform2(e0_maxpool)
|
| 68 |
+
x = torch.cat((input_m_decoder_transform,e0_transform),1)
|
| 69 |
+
out = self.conv1(x)
|
| 70 |
+
out = self.bn1(out)
|
| 71 |
+
out = self.relu(out)
|
| 72 |
+
out = out+input_m_decoder
|
| 73 |
+
return out
|
| 74 |
+
class DBFI(nn.Module):
|
| 75 |
+
def __init__(self, planes,stride=1):
|
| 76 |
+
super(DBFI, self).__init__()
|
| 77 |
+
self.stride = stride
|
| 78 |
+
self.transform1 = conv1x1(planes, int(planes/2))
|
| 79 |
+
self.transform2 = conv1x1(512, int(planes/2))
|
| 80 |
+
self.upsample = nn.Upsample(scale_factor=stride, mode='bilinear')
|
| 81 |
+
self.conv1 = conv3x3(planes, planes, 1)
|
| 82 |
+
self.bn1 = nn.BatchNorm2d(planes)
|
| 83 |
+
self.relu = nn.ReLU(inplace=True)
|
| 84 |
+
self.conv2 = conv3x3(planes, 3, 1)
|
| 85 |
+
self.upsample2 = nn.Upsample(scale_factor=int(32/stride), mode='bilinear')
|
| 86 |
+
def forward(self, input_s_decoder,e4):
|
| 87 |
+
input_s_decoder_transform = self.transform1(input_s_decoder)
|
| 88 |
+
e4_transform = self.transform2(e4)
|
| 89 |
+
e4_upsample = self.upsample(e4_transform)
|
| 90 |
+
x = torch.cat((input_s_decoder_transform,e4_upsample),1)
|
| 91 |
+
out = self.conv1(x)
|
| 92 |
+
out = self.bn1(out)
|
| 93 |
+
out = self.relu(out)
|
| 94 |
+
out = out+input_s_decoder
|
| 95 |
+
out_side = self.conv2(out)
|
| 96 |
+
out_side = self.upsample2(out_side)
|
| 97 |
+
return out, out_side
|
| 98 |
+
class P3mNet(nn.Module):
|
| 99 |
+
def __init__(self):
|
| 100 |
+
super().__init__()
|
| 101 |
+
self.resnet = resnet34_mp()
|
| 102 |
+
############################
|
| 103 |
+
### Encoder part - RESNETMP
|
| 104 |
+
############################
|
| 105 |
+
self.encoder0 = nn.Sequential(
|
| 106 |
+
self.resnet.conv1,
|
| 107 |
+
self.resnet.bn1,
|
| 108 |
+
self.resnet.relu,
|
| 109 |
+
)
|
| 110 |
+
self.mp0 = self.resnet.maxpool1
|
| 111 |
+
self.encoder1 = nn.Sequential(
|
| 112 |
+
self.resnet.layer1)
|
| 113 |
+
self.mp1 = self.resnet.maxpool2
|
| 114 |
+
self.encoder2 = self.resnet.layer2
|
| 115 |
+
self.mp2 = self.resnet.maxpool3
|
| 116 |
+
self.encoder3 = self.resnet.layer3
|
| 117 |
+
self.mp3 = self.resnet.maxpool4
|
| 118 |
+
self.encoder4 = self.resnet.layer4
|
| 119 |
+
self.mp4 = self.resnet.maxpool5
|
| 120 |
+
|
| 121 |
+
self.tfi_3 = TFI(256)
|
| 122 |
+
self.tfi_2 = TFI(128)
|
| 123 |
+
self.tfi_1 = TFI(64)
|
| 124 |
+
self.tfi_0 = TFI(64)
|
| 125 |
+
|
| 126 |
+
self.sbfi_2 = SBFI(128, 8)
|
| 127 |
+
self.sbfi_1 = SBFI(64, 4)
|
| 128 |
+
self.sbfi_0 = SBFI(64, 2)
|
| 129 |
+
|
| 130 |
+
self.dbfi_2 = DBFI(128, 4)
|
| 131 |
+
self.dbfi_1 = DBFI(64, 8)
|
| 132 |
+
self.dbfi_0 = DBFI(64, 16)
|
| 133 |
+
|
| 134 |
+
##########################
|
| 135 |
+
### Decoder part - GLOBAL
|
| 136 |
+
##########################
|
| 137 |
+
self.decoder4_g = nn.Sequential(
|
| 138 |
+
nn.Conv2d(512,512,3,padding=1),
|
| 139 |
+
nn.BatchNorm2d(512),
|
| 140 |
+
nn.ReLU(inplace=True),
|
| 141 |
+
nn.Conv2d(512,512,3,padding=1),
|
| 142 |
+
nn.BatchNorm2d(512),
|
| 143 |
+
nn.ReLU(inplace=True),
|
| 144 |
+
nn.Conv2d(512,256,3,padding=1),
|
| 145 |
+
nn.BatchNorm2d(256),
|
| 146 |
+
nn.ReLU(inplace=True),
|
| 147 |
+
nn.Upsample(scale_factor=2, mode='bilinear') )
|
| 148 |
+
self.decoder3_g = nn.Sequential(
|
| 149 |
+
nn.Conv2d(256,256,3,padding=1),
|
| 150 |
+
nn.BatchNorm2d(256),
|
| 151 |
+
nn.ReLU(inplace=True),
|
| 152 |
+
nn.Conv2d(256,256,3,padding=1),
|
| 153 |
+
nn.BatchNorm2d(256),
|
| 154 |
+
nn.ReLU(inplace=True),
|
| 155 |
+
nn.Conv2d(256,128,3,padding=1),
|
| 156 |
+
nn.BatchNorm2d(128),
|
| 157 |
+
nn.ReLU(inplace=True),
|
| 158 |
+
nn.Upsample(scale_factor=2, mode='bilinear') )
|
| 159 |
+
self.decoder2_g = nn.Sequential(
|
| 160 |
+
nn.Conv2d(128,128,3,padding=1),
|
| 161 |
+
nn.BatchNorm2d(128),
|
| 162 |
+
nn.ReLU(inplace=True),
|
| 163 |
+
nn.Conv2d(128,128,3,padding=1),
|
| 164 |
+
nn.BatchNorm2d(128),
|
| 165 |
+
nn.ReLU(inplace=True),
|
| 166 |
+
nn.Conv2d(128,64,3,padding=1),
|
| 167 |
+
nn.BatchNorm2d(64),
|
| 168 |
+
nn.ReLU(inplace=True),
|
| 169 |
+
nn.Upsample(scale_factor=2, mode='bilinear'))
|
| 170 |
+
self.decoder1_g = nn.Sequential(
|
| 171 |
+
nn.Conv2d(64,64,3,padding=1),
|
| 172 |
+
nn.BatchNorm2d(64),
|
| 173 |
+
nn.ReLU(inplace=True),
|
| 174 |
+
nn.Conv2d(64,64,3,padding=1),
|
| 175 |
+
nn.BatchNorm2d(64),
|
| 176 |
+
nn.ReLU(inplace=True),
|
| 177 |
+
nn.Conv2d(64,64,3,padding=1),
|
| 178 |
+
nn.BatchNorm2d(64),
|
| 179 |
+
nn.ReLU(inplace=True),
|
| 180 |
+
nn.Upsample(scale_factor=2, mode='bilinear'))
|
| 181 |
+
self.decoder0_g = nn.Sequential(
|
| 182 |
+
nn.Conv2d(64,64,3,padding=1),
|
| 183 |
+
nn.BatchNorm2d(64),
|
| 184 |
+
nn.ReLU(inplace=True),
|
| 185 |
+
nn.Conv2d(64,64,3,padding=1),
|
| 186 |
+
nn.BatchNorm2d(64),
|
| 187 |
+
nn.ReLU(inplace=True),
|
| 188 |
+
nn.Conv2d(64,3,3,padding=1),
|
| 189 |
+
nn.Upsample(scale_factor=2, mode='bilinear'))
|
| 190 |
+
|
| 191 |
+
##########################
|
| 192 |
+
### Decoder part - LOCAL
|
| 193 |
+
##########################
|
| 194 |
+
self.decoder4_l = nn.Sequential(
|
| 195 |
+
nn.Conv2d(512,512,3,padding=1),
|
| 196 |
+
nn.BatchNorm2d(512),
|
| 197 |
+
nn.ReLU(inplace=True),
|
| 198 |
+
nn.Conv2d(512,512,3,padding=1),
|
| 199 |
+
nn.BatchNorm2d(512),
|
| 200 |
+
nn.ReLU(inplace=True),
|
| 201 |
+
nn.Conv2d(512,256,3,padding=1),
|
| 202 |
+
nn.BatchNorm2d(256),
|
| 203 |
+
nn.ReLU(inplace=True))
|
| 204 |
+
self.decoder3_l = nn.Sequential(
|
| 205 |
+
nn.Conv2d(256,256,3,padding=1),
|
| 206 |
+
nn.BatchNorm2d(256),
|
| 207 |
+
nn.ReLU(inplace=True),
|
| 208 |
+
nn.Conv2d(256,256,3,padding=1),
|
| 209 |
+
nn.BatchNorm2d(256),
|
| 210 |
+
nn.ReLU(inplace=True),
|
| 211 |
+
nn.Conv2d(256,128,3,padding=1),
|
| 212 |
+
nn.BatchNorm2d(128),
|
| 213 |
+
nn.ReLU(inplace=True))
|
| 214 |
+
self.decoder2_l = nn.Sequential(
|
| 215 |
+
nn.Conv2d(128,128,3,padding=1),
|
| 216 |
+
nn.BatchNorm2d(128),
|
| 217 |
+
nn.ReLU(inplace=True),
|
| 218 |
+
nn.Conv2d(128,128,3,padding=1),
|
| 219 |
+
nn.BatchNorm2d(128),
|
| 220 |
+
nn.ReLU(inplace=True),
|
| 221 |
+
nn.Conv2d(128,64,3,padding=1),
|
| 222 |
+
nn.BatchNorm2d(64),
|
| 223 |
+
nn.ReLU(inplace=True))
|
| 224 |
+
self.decoder1_l = nn.Sequential(
|
| 225 |
+
nn.Conv2d(64,64,3,padding=1),
|
| 226 |
+
nn.BatchNorm2d(64),
|
| 227 |
+
nn.ReLU(inplace=True),
|
| 228 |
+
nn.Conv2d(64,64,3,padding=1),
|
| 229 |
+
nn.BatchNorm2d(64),
|
| 230 |
+
nn.ReLU(inplace=True),
|
| 231 |
+
nn.Conv2d(64,64,3,padding=1),
|
| 232 |
+
nn.BatchNorm2d(64),
|
| 233 |
+
nn.ReLU(inplace=True))
|
| 234 |
+
self.decoder0_l = nn.Sequential(
|
| 235 |
+
nn.Conv2d(64,64,3,padding=1),
|
| 236 |
+
nn.BatchNorm2d(64),
|
| 237 |
+
nn.ReLU(inplace=True),
|
| 238 |
+
nn.Conv2d(64,64,3,padding=1),
|
| 239 |
+
nn.BatchNorm2d(64),
|
| 240 |
+
nn.ReLU(inplace=True))
|
| 241 |
+
self.decoder_final_l = nn.Conv2d(64,1,3,padding=1)
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def forward(self, input):
|
| 245 |
+
##########################
|
| 246 |
+
### Encoder part - RESNET
|
| 247 |
+
##########################
|
| 248 |
+
e0 = self.encoder0(input)
|
| 249 |
+
e0p, id0 = self.mp0(e0)
|
| 250 |
+
e1p, id1 = self.mp1(e0p)
|
| 251 |
+
e1 = self.encoder1(e1p)
|
| 252 |
+
e2p, id2 = self.mp2(e1)
|
| 253 |
+
e2 = self.encoder2(e2p)
|
| 254 |
+
e3p, id3 = self.mp3(e2)
|
| 255 |
+
e3 = self.encoder3(e3p)
|
| 256 |
+
e4p, id4 = self.mp4(e3)
|
| 257 |
+
e4 = self.encoder4(e4p)
|
| 258 |
+
###########################
|
| 259 |
+
### Decoder part - Global
|
| 260 |
+
###########################
|
| 261 |
+
d4_g = self.decoder4_g(e4)
|
| 262 |
+
d3_g = self.decoder3_g(d4_g)
|
| 263 |
+
d2_g, global_sigmoid_side2 = self.dbfi_2(d3_g, e4)
|
| 264 |
+
d2_g = self.decoder2_g(d2_g)
|
| 265 |
+
d1_g, global_sigmoid_side1 = self.dbfi_1(d2_g, e4)
|
| 266 |
+
d1_g = self.decoder1_g(d1_g)
|
| 267 |
+
d0_g, global_sigmoid_side0 = self.dbfi_0(d1_g, e4)
|
| 268 |
+
d0_g = self.decoder0_g(d0_g)
|
| 269 |
+
global_sigmoid = d0_g
|
| 270 |
+
###########################
|
| 271 |
+
### Decoder part - Local
|
| 272 |
+
###########################
|
| 273 |
+
d4_l = self.decoder4_l(e4)
|
| 274 |
+
d4_l = F.max_unpool2d(d4_l, id4, kernel_size=2, stride=2)
|
| 275 |
+
d3_l = self.tfi_3(d4_g, d4_l, e3)
|
| 276 |
+
d3_l = self.decoder3_l(d3_l)
|
| 277 |
+
d3_l = F.max_unpool2d(d3_l, id3, kernel_size=2, stride=2)
|
| 278 |
+
d2_l = self.tfi_2(d3_g, d3_l, e2)
|
| 279 |
+
d2_l = self.sbfi_2(d2_l, e0)
|
| 280 |
+
d2_l = self.decoder2_l(d2_l)
|
| 281 |
+
d2_l = F.max_unpool2d(d2_l, id2, kernel_size=2, stride=2)
|
| 282 |
+
d1_l = self.tfi_1(d2_g, d2_l, e1)
|
| 283 |
+
d1_l = self.sbfi_1(d1_l, e0)
|
| 284 |
+
d1_l = self.decoder1_l(d1_l)
|
| 285 |
+
d1_l = F.max_unpool2d(d1_l, id1, kernel_size=2, stride=2)
|
| 286 |
+
d0_l = self.tfi_0(d1_g, d1_l, e0p)
|
| 287 |
+
d0_l = self.sbfi_0(d0_l, e0)
|
| 288 |
+
d0_l = self.decoder0_l(d0_l)
|
| 289 |
+
d0_l = F.max_unpool2d(d0_l, id0, kernel_size=2, stride=2)
|
| 290 |
+
d0_l = self.decoder_final_l(d0_l)
|
| 291 |
+
local_sigmoid = F.sigmoid(d0_l)
|
| 292 |
+
##########################
|
| 293 |
+
### Fusion net - G/L
|
| 294 |
+
##########################
|
| 295 |
+
fusion_sigmoid = get_masked_local_from_global(global_sigmoid, local_sigmoid)
|
| 296 |
+
return global_sigmoid, local_sigmoid, fusion_sigmoid, global_sigmoid_side2, global_sigmoid_side1, global_sigmoid_side0
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
|
| 301 |
+
"""3x3 convolution with padding"""
|
| 302 |
+
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
|
| 303 |
+
padding=dilation, groups=groups, bias=False, dilation=dilation)
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def conv1x1(in_planes, out_planes, stride=1):
|
| 307 |
+
"""1x1 convolution"""
|
| 308 |
+
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
class BasicBlock(nn.Module):
|
| 312 |
+
expansion: int = 1
|
| 313 |
+
|
| 314 |
+
def __init__(
|
| 315 |
+
self,
|
| 316 |
+
inplanes: int,
|
| 317 |
+
planes: int,
|
| 318 |
+
stride: int = 1,
|
| 319 |
+
downsample: Optional[nn.Module] = None,
|
| 320 |
+
groups: int = 1,
|
| 321 |
+
base_width: int = 64,
|
| 322 |
+
dilation: int = 1,
|
| 323 |
+
norm_layer: Optional[Callable[..., nn.Module]] = None
|
| 324 |
+
) -> None:
|
| 325 |
+
super(BasicBlock, self).__init__()
|
| 326 |
+
if norm_layer is None:
|
| 327 |
+
norm_layer = nn.BatchNorm2d
|
| 328 |
+
if groups != 1 or base_width != 64:
|
| 329 |
+
raise ValueError('BasicBlock only supports groups=1 and base_width=64')
|
| 330 |
+
if dilation > 1:
|
| 331 |
+
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
|
| 332 |
+
|
| 333 |
+
self.conv1 = conv3x3(inplanes, planes, stride)
|
| 334 |
+
self.bn1 = norm_layer(planes)
|
| 335 |
+
self.relu = nn.ReLU(inplace=True)
|
| 336 |
+
self.conv2 = conv3x3(planes, planes)
|
| 337 |
+
self.bn2 = norm_layer(planes)
|
| 338 |
+
self.downsample = downsample
|
| 339 |
+
self.stride = stride
|
| 340 |
+
|
| 341 |
+
def forward(self, x):
|
| 342 |
+
identity = x
|
| 343 |
+
|
| 344 |
+
out = self.conv1(x)
|
| 345 |
+
out = self.bn1(out)
|
| 346 |
+
out = self.relu(out)
|
| 347 |
+
|
| 348 |
+
out = self.conv2(out)
|
| 349 |
+
out = self.bn2(out)
|
| 350 |
+
|
| 351 |
+
if self.downsample is not None:
|
| 352 |
+
identity = self.downsample(x)
|
| 353 |
+
out += identity
|
| 354 |
+
out = self.relu(out)
|
| 355 |
+
|
| 356 |
+
return out
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
class Bottleneck(nn.Module):
|
| 360 |
+
expansion = 4
|
| 361 |
+
__constants__ = ['downsample']
|
| 362 |
+
|
| 363 |
+
def __init__(self, inplanes, planes,stride=1, downsample=None, groups=1,
|
| 364 |
+
base_width=64, dilation=1, norm_layer=None):
|
| 365 |
+
super(Bottleneck, self).__init__()
|
| 366 |
+
if norm_layer is None:
|
| 367 |
+
norm_layer = nn.BatchNorm2d
|
| 368 |
+
width = int(planes * (base_width / 64.)) * groups
|
| 369 |
+
self.conv1 = conv1x1(inplanes, width)
|
| 370 |
+
self.bn1 = norm_layer(width)
|
| 371 |
+
self.conv2 = conv3x3(width, width, stride, groups, dilation)
|
| 372 |
+
self.bn2 = norm_layer(width)
|
| 373 |
+
self.conv3 = conv1x1(width, planes * self.expansion)
|
| 374 |
+
self.bn3 = norm_layer(planes * self.expansion)
|
| 375 |
+
self.relu = nn.ReLU(inplace=True)
|
| 376 |
+
self.downsample = downsample
|
| 377 |
+
self.stride = stride
|
| 378 |
+
|
| 379 |
+
def forward(self, x):
|
| 380 |
+
identity = x
|
| 381 |
+
|
| 382 |
+
out = self.conv1(x)
|
| 383 |
+
out = self.bn1(out)
|
| 384 |
+
out = self.relu(out)
|
| 385 |
+
|
| 386 |
+
out = self.conv2(out)
|
| 387 |
+
out = self.bn2(out)
|
| 388 |
+
out = self.relu(out)
|
| 389 |
+
out = self.attention(out)
|
| 390 |
+
|
| 391 |
+
out = self.conv3(out)
|
| 392 |
+
out = self.bn3(out)
|
| 393 |
+
if self.downsample is not None:
|
| 394 |
+
identity = self.downsample(x)
|
| 395 |
+
out += identity
|
| 396 |
+
out = self.relu(out)
|
| 397 |
+
|
| 398 |
+
return out
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
class ResNet(nn.Module):
|
| 402 |
+
|
| 403 |
+
def __init__(self, block, layers, zero_init_residual=False,
|
| 404 |
+
groups=1, width_per_group=64, replace_stride_with_dilation=None,
|
| 405 |
+
norm_layer=None):
|
| 406 |
+
super(ResNet, self).__init__()
|
| 407 |
+
if norm_layer is None:
|
| 408 |
+
norm_layer = nn.BatchNorm2d
|
| 409 |
+
self._norm_layer = norm_layer
|
| 410 |
+
self.inplanes = 64
|
| 411 |
+
self.dilation = 1
|
| 412 |
+
if replace_stride_with_dilation is None:
|
| 413 |
+
replace_stride_with_dilation = [False, False, False]
|
| 414 |
+
if len(replace_stride_with_dilation) != 3:
|
| 415 |
+
raise ValueError("replace_stride_with_dilation should be None "
|
| 416 |
+
"or a 3-element tuple, got {}".format(replace_stride_with_dilation))
|
| 417 |
+
self.groups = groups
|
| 418 |
+
self.base_width = width_per_group
|
| 419 |
+
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=1, padding=3,
|
| 420 |
+
bias=False)
|
| 421 |
+
self.bn1 = norm_layer(self.inplanes)
|
| 422 |
+
self.relu = nn.ReLU(inplace=True)
|
| 423 |
+
self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, return_indices=True)
|
| 424 |
+
self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, return_indices=True)
|
| 425 |
+
self.maxpool3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, return_indices=True)
|
| 426 |
+
self.maxpool4 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, return_indices=True)
|
| 427 |
+
self.maxpool5 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, return_indices=True)
|
| 428 |
+
#pdb.set_trace()
|
| 429 |
+
self.layer1 = self._make_layer(block, 64, layers[0])
|
| 430 |
+
self.layer2 = self._make_layer(block, 128, layers[1], stride=1,
|
| 431 |
+
dilate=replace_stride_with_dilation[0])
|
| 432 |
+
self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
|
| 433 |
+
dilate=replace_stride_with_dilation[1])
|
| 434 |
+
self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
|
| 435 |
+
dilate=replace_stride_with_dilation[2])
|
| 436 |
+
|
| 437 |
+
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
|
| 438 |
+
self.fc = nn.Linear(512 * block.expansion, 1000)
|
| 439 |
+
|
| 440 |
+
for m in self.modules():
|
| 441 |
+
if isinstance(m, nn.Conv2d):
|
| 442 |
+
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
|
| 443 |
+
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
|
| 444 |
+
nn.init.constant_(m.weight, 1)
|
| 445 |
+
nn.init.constant_(m.bias, 0)
|
| 446 |
+
if zero_init_residual:
|
| 447 |
+
for m in self.modules():
|
| 448 |
+
if isinstance(m, Bottleneck):
|
| 449 |
+
nn.init.constant_(m.bn3.weight, 0)
|
| 450 |
+
elif isinstance(m, BasicBlock):
|
| 451 |
+
nn.init.constant_(m.bn2.weight, 0)
|
| 452 |
+
|
| 453 |
+
    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        x1 = self.conv1(x)
        x1 = self.bn1(x1)
        x1 = self.relu(x1)
        x1, idx1 = self.maxpool1(x1)

        x2, idx2 = self.maxpool2(x1)
        x2 = self.layer1(x2)

        x3, idx3 = self.maxpool3(x2)
        x3 = self.layer2(x3)

        x4, idx4 = self.maxpool4(x3)
        x4 = self.layer3(x4)

        x5, idx5 = self.maxpool5(x4)
        x5 = self.layer4(x5)

        x_cls = self.avgpool(x5)
        x_cls = torch.flatten(x_cls, 1)
        x_cls = self.fc(x_cls)

        return x_cls

    def forward(self, x):
        return self._forward_impl(x)

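# Usage sketch for the encoder above (illustrative only; BasicBlock, Bottleneck and conv1x1
# are assumed to be defined or imported earlier in this file):
#   encoder = ResNet(BasicBlock, [3, 4, 6, 3])
#   logits = encoder(torch.randn(1, 3, 224, 224))   # classifier head -> shape (1, 1000)
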
def resnet34_mp(**kwargs):
    r"""ResNet-34 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    checkpoint = torch.load("checkpoints/r34mp_pretrained_imagenet.pth.tar")
    model.load_state_dict(checkpoint)
    return model

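# Note: resnet34_mp() loads "checkpoints/r34mp_pretrained_imagenet.pth.tar" relative to the
# working directory, so that file must exist before calling it. Illustrative sketch:
#   backbone = resnet34_mp()
#   backbone.eval()
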
##############################
### Training losses for P3M-NET
##############################
def get_crossentropy_loss(gt, pre):
    gt_copy = gt.clone()
    gt_copy[gt_copy == 0] = 0
    gt_copy[gt_copy == 255] = 2
    gt_copy[gt_copy > 2] = 1
    gt_copy = gt_copy.long()
    gt_copy = gt_copy[:, 0, :, :]
    criterion = nn.CrossEntropyLoss()
    entropy_loss = criterion(pre, gt_copy)
    return entropy_loss

def get_alpha_loss(predict, alpha, trimap):
    weighted = torch.zeros(trimap.shape).to(device)
    weighted[trimap == 128] = 1.
    alpha_f = alpha / 255.
    alpha_f = alpha_f.to(device)
    diff = predict - alpha_f
    diff = diff * weighted
    alpha_loss = torch.sqrt(diff ** 2 + 1e-12)
    alpha_loss_weighted = alpha_loss.sum() / (weighted.sum() + 1.)
    return alpha_loss_weighted

def get_alpha_loss_whole_img(predict, alpha):
    weighted = torch.ones(alpha.shape).to(device)
    alpha_f = alpha / 255.
    alpha_f = alpha_f.to(device)
    diff = predict - alpha_f
    alpha_loss = torch.sqrt(diff ** 2 + 1e-12)
    alpha_loss = alpha_loss.sum() / (weighted.sum())
    return alpha_loss

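# The two alpha losses above differ only in their pixel weighting: get_alpha_loss() applies
# the Charbonnier-style |pred - gt| penalty only inside the unknown trimap band
# (trimap == 128), while get_alpha_loss_whole_img() averages it over every pixel.
# Illustrative sketch with dummy tensors (shapes are assumptions, not taken from this repo):
#   pred   = torch.rand(1, 1, 64, 64, device=device)
#   alpha  = torch.rand(1, 1, 64, 64, device=device) * 255
#   trimap = torch.full((1, 1, 64, 64), 128., device=device)
#   l_unknown = get_alpha_loss(pred, alpha, trimap)
#   l_global  = get_alpha_loss_whole_img(pred, alpha)
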
## Laplacian loss refers to
## https://gist.github.com/MarcoForte/a07c40a2b721739bb5c5987671aa5270
def build_gauss_kernel(size=5, sigma=1.0, n_channels=1, cuda=False):
    if size % 2 != 1:
        raise ValueError("kernel size must be uneven")
    grid = np.float32(np.mgrid[0:size, 0:size].T)
    gaussian = lambda x: np.exp((x - size//2)**2/(-2*sigma**2))**2
    kernel = np.sum(gaussian(grid), axis=2)
    kernel /= np.sum(kernel)
    kernel = np.tile(kernel, (n_channels, 1, 1))
    kernel = torch.FloatTensor(kernel[:, None, :, :]).to(device)
    return Variable(kernel, requires_grad=False)

def conv_gauss(img, kernel):
    """ convolve img with a gaussian kernel that has been built with build_gauss_kernel """
    n_channels, _, kw, kh = kernel.shape
    img = fnn.pad(img, (kw//2, kh//2, kw//2, kh//2), mode='replicate')
    return fnn.conv2d(img, kernel, groups=n_channels)

def laplacian_pyramid(img, kernel, max_levels=5):
    current = img
    pyr = []
    for level in range(max_levels):
        filtered = conv_gauss(current, kernel)
        diff = current - filtered
        pyr.append(diff)
        current = fnn.avg_pool2d(filtered, 2)
    pyr.append(current)
    return pyr

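# Sketch of the pyramid helpers above (illustrative only): build_gauss_kernel() returns a
# depthwise Gaussian kernel, conv_gauss() applies it with replicate padding, and
# laplacian_pyramid() collects the band-pass differences plus the final low-pass level.
#   kernel = build_gauss_kernel(size=5, sigma=1.0, n_channels=1)
#   levels = laplacian_pyramid(torch.rand(1, 1, 64, 64, device=device), kernel, max_levels=5)
#   # len(levels) == 6: five difference images plus the coarsest residual
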
def get_laplacian_loss(predict, alpha, trimap):
    weighted = torch.zeros(trimap.shape).to(device)
    weighted[trimap == 128] = 1.
    alpha_f = alpha / 255.
    alpha_f = alpha_f.to(device)
    alpha_f = alpha_f.clone() * weighted
    predict = predict.clone() * weighted
    gauss_kernel = build_gauss_kernel(size=5, sigma=1.0, n_channels=1, cuda=True)
    pyr_alpha = laplacian_pyramid(alpha_f, gauss_kernel, 5)
    pyr_predict = laplacian_pyramid(predict, gauss_kernel, 5)
    laplacian_loss_weighted = sum(fnn.l1_loss(a, b) for a, b in zip(pyr_alpha, pyr_predict))
    return laplacian_loss_weighted

def get_laplacian_loss_whole_img(predict, alpha):
    alpha_f = alpha / 255.
    alpha_f = alpha_f.to(device)
    gauss_kernel = build_gauss_kernel(size=5, sigma=1.0, n_channels=1, cuda=True)
    pyr_alpha = laplacian_pyramid(alpha_f, gauss_kernel, 5)
    pyr_predict = laplacian_pyramid(predict, gauss_kernel, 5)
    laplacian_loss = sum(fnn.l1_loss(a, b) for a, b in zip(pyr_alpha, pyr_predict))
    return laplacian_loss

def get_composition_loss_whole_img(img, alpha, fg, bg, predict):
    weighted = torch.ones(alpha.shape).to(device)
    predict_3 = torch.cat((predict, predict, predict), 1)
    comp = predict_3 * fg + (1. - predict_3) * bg
    comp_loss = torch.sqrt((comp - img) ** 2 + 1e-12)
    comp_loss = comp_loss.sum() / (weighted.sum())
    return comp_loss

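# How the pieces above are typically combined during training (an unweighted sketch; the
# variable names are placeholders and the actual loss weighting is not defined in this file):
#   loss = (get_crossentropy_loss(trimap_gt, pred_trimap)
#           + get_alpha_loss(pred_alpha, alpha_gt, trimap_gt)
#           + get_laplacian_loss(pred_alpha, alpha_gt, trimap_gt)
#           + get_composition_loss_whole_img(img, alpha_gt, fg, bg, pred_alpha))
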
##############################
### Test loss for matting
##############################
def calculate_sad_mse_mad(predict_old, alpha, trimap):
    predict = np.copy(predict_old)
    pixel = float((trimap == 128).sum())
    predict[trimap == 255] = 1.
    predict[trimap == 0] = 0.
    sad_diff = np.sum(np.abs(predict - alpha))/1000
    if pixel == 0:
        pixel = trimap.shape[0]*trimap.shape[1] - float((trimap == 255).sum()) - float((trimap == 0).sum())
    mse_diff = np.sum((predict - alpha) ** 2)/pixel
    mad_diff = np.sum(np.abs(predict - alpha))/pixel
    return sad_diff, mse_diff, mad_diff

def calculate_sad_mse_mad_whole_img(predict, alpha):
    pixel = predict.shape[0]*predict.shape[1]
    sad_diff = np.sum(np.abs(predict - alpha))/1000
    mse_diff = np.sum((predict - alpha) ** 2)/pixel
    mad_diff = np.sum(np.abs(predict - alpha))/pixel
    return sad_diff, mse_diff, mad_diff

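# Conventions for the metrics above: predictions and ground truth are expected in [0, 1],
# SAD is reported in thousands of alpha units (hence the /1000), and MSE/MAD are averaged
# per pixel. For instance, a constant error of 0.01 over a 1000x1000 alpha map gives
# SAD = 10.0, MSE = 1e-4 and MAD = 0.01.
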
def calculate_sad_fgbg(predict, alpha, trimap):
    sad_diff = np.abs(predict - alpha)
    weight_fg = np.zeros(predict.shape)
    weight_bg = np.zeros(predict.shape)
    weight_trimap = np.zeros(predict.shape)
    weight_fg[trimap == 255] = 1.
    weight_bg[trimap == 0] = 1.
    weight_trimap[trimap == 128] = 1.
    sad_fg = np.sum(sad_diff*weight_fg)/1000
    sad_bg = np.sum(sad_diff*weight_bg)/1000
    sad_trimap = np.sum(sad_diff*weight_trimap)/1000
    return sad_fg, sad_bg

def compute_gradient_whole_image(pd, gt):
    from scipy.ndimage import gaussian_filter
    pd_x = gaussian_filter(pd, sigma=1.4, order=[1, 0], output=np.float32)
    pd_y = gaussian_filter(pd, sigma=1.4, order=[0, 1], output=np.float32)
    gt_x = gaussian_filter(gt, sigma=1.4, order=[1, 0], output=np.float32)
    gt_y = gaussian_filter(gt, sigma=1.4, order=[0, 1], output=np.float32)
    pd_mag = np.sqrt(pd_x**2 + pd_y**2)
    gt_mag = np.sqrt(gt_x**2 + gt_y**2)

    error_map = np.square(pd_mag - gt_mag)
    loss = np.sum(error_map) / 10
    return loss

def compute_connectivity_loss_whole_image(pd, gt, step=0.1):
    from scipy.ndimage import morphology
    from skimage.measure import label, regionprops
    h, w = pd.shape
    thresh_steps = np.arange(0, 1.1, step)
    l_map = -1 * np.ones((h, w), dtype=np.float32)
    lambda_map = np.ones((h, w), dtype=np.float32)
    for i in range(1, thresh_steps.size):
        pd_th = pd >= thresh_steps[i]
        gt_th = gt >= thresh_steps[i]
        label_image = label(pd_th & gt_th, connectivity=1)
        cc = regionprops(label_image)
        size_vec = np.array([c.area for c in cc])
        if len(size_vec) == 0:
            continue
        max_id = np.argmax(size_vec)
        coords = cc[max_id].coords
        omega = np.zeros((h, w), dtype=np.float32)
        omega[coords[:, 0], coords[:, 1]] = 1
        flag = (l_map == -1) & (omega == 0)
        l_map[flag == 1] = thresh_steps[i-1]
        dist_maps = morphology.distance_transform_edt(omega == 0)
        dist_maps = dist_maps / dist_maps.max()
    l_map[l_map == -1] = 1
    d_pd = pd - l_map
    d_gt = gt - l_map
    phi_pd = 1 - d_pd * (d_pd >= 0.15).astype(np.float32)
    phi_gt = 1 - d_gt * (d_gt >= 0.15).astype(np.float32)
    loss = np.sum(np.abs(phi_pd - phi_gt)) / 1000
    return loss

def gen_trimap_from_segmap_e2e(segmap):
    trimap = np.argmax(segmap, axis=1)[0]
    trimap = trimap.astype(np.int64)
    trimap[trimap == 1] = 128
    trimap[trimap == 2] = 255
    return trimap.astype(np.uint8)

def get_masked_local_from_global(global_sigmoid, local_sigmoid):
    values, index = torch.max(global_sigmoid, 1)
    index = index[:, None, :, :].float()
    ### index <===> [0, 1, 2]
    ### bg_mask <===> [1, 0, 0]
    bg_mask = index.clone()
    bg_mask[bg_mask == 2] = 1
    bg_mask = 1 - bg_mask
    ### trimap_mask <===> [0, 1, 0]
    trimap_mask = index.clone()
    trimap_mask[trimap_mask == 2] = 0
    ### fg_mask <===> [0, 0, 1]
    fg_mask = index.clone()
    fg_mask[fg_mask == 1] = 0
    fg_mask[fg_mask == 2] = 1
    fusion_sigmoid = local_sigmoid*trimap_mask + fg_mask
    return fusion_sigmoid

def get_masked_local_from_global_test(global_result, local_result):
    weighted_global = np.ones(global_result.shape)
    weighted_global[global_result == 255] = 0
    weighted_global[global_result == 0] = 0
    fusion_result = global_result*(1.-weighted_global)/255 + local_result*weighted_global
    return fusion_result

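# Fusion convention used above: the global branch predicts three classes per pixel
# (index 0 = background, 1 = unknown, 2 = foreground, i.e. 0 / 128 / 255 after
# gen_trimap_from_segmap_e2e), and the local alpha is kept only inside the unknown band.
# Illustrative call, assuming `global_scores` has shape (N, 3, H, W) and `local_alpha`
# has shape (N, 1, H, W):
#   fused = get_masked_local_from_global(global_scores, local_alpha)
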
def inference_once(model, scale_img, scale_trimap=None):
    pred_list = []
    tensor_img = torch.from_numpy(scale_img[:, :, :]).permute(2, 0, 1).to(device)
    input_t = tensor_img
    input_t = input_t/255.0
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    input_t = normalize(input_t)
    input_t = input_t.unsqueeze(0).float()
    # pred_global, pred_local, pred_fusion = model(input_t)[:3]
    pred_fusion = model(input_t)[:3]
    pred_global = pred_fusion
    pred_local = pred_fusion

    pred_global = pred_global.data.cpu().numpy()
    pred_global = gen_trimap_from_segmap_e2e(pred_global)
    pred_local = pred_local.data.cpu().numpy()[0, 0, :, :]
    pred_fusion = pred_fusion.data.cpu().numpy()[0, 0, :, :]
    return pred_global, pred_local, pred_fusion

# def inference_img(test_choice, model, img):
#     h, w, c = img.shape
#     new_h = min(config['datasets'].MAX_SIZE_H, h - (h % 32))
#     new_w = min(config['datasets'].MAX_SIZE_W, w - (w % 32))
#     if test_choice=='HYBRID':
#         global_ratio = 1/2
#         local_ratio = 1
#         resize_h = int(h*global_ratio)
#         resize_w = int(w*global_ratio)
#         new_h = min(config['datasets'].MAX_SIZE_H, resize_h - (resize_h % 32))
#         new_w = min(config['datasets'].MAX_SIZE_W, resize_w - (resize_w % 32))
#         scale_img = resize(img,(new_h,new_w))*255.0
#         pred_coutour_1, pred_retouching_1, pred_fusion_1 = inference_once(model, scale_img)
#         pred_coutour_1 = resize(pred_coutour_1,(h,w))*255.0
#         resize_h = int(h*local_ratio)
#         resize_w = int(w*local_ratio)
#         new_h = min(config['datasets'].MAX_SIZE_H, resize_h - (resize_h % 32))
#         new_w = min(config['datasets'].MAX_SIZE_W, resize_w - (resize_w % 32))
#         scale_img = resize(img,(new_h,new_w))*255.0
#         pred_coutour_2, pred_retouching_2, pred_fusion_2 = inference_once(model, scale_img)
#         pred_retouching_2 = resize(pred_retouching_2,(h,w))
#         pred_fusion = get_masked_local_from_global_test(pred_coutour_1, pred_retouching_2)
#         return pred_fusion
#     else:
#         resize_h = int(h/2)
#         resize_w = int(w/2)
#         new_h = min(config['datasets'].MAX_SIZE_H, resize_h - (resize_h % 32))
#         new_w = min(config['datasets'].MAX_SIZE_W, resize_w - (resize_w % 32))
#         scale_img = resize(img,(new_h,new_w))*255.0
#         pred_global, pred_local, pred_fusion = inference_once(model, scale_img)
#         pred_local = resize(pred_local,(h,w))
#         pred_global = resize(pred_global,(h,w))*255.0
#         pred_fusion = resize(pred_fusion,(h,w))
#         return pred_fusion

def inference_img(model, img):
    h, w, _ = img.shape
    # print(img.shape)
    if h % 8 != 0 or w % 8 != 0:
        img = cv2.copyMakeBorder(img, 8-h%8, 0, 8-w%8, 0, cv2.BORDER_REFLECT)
    # print(img.shape)

    tensor_img = torch.from_numpy(img).permute(2, 0, 1).to(device)
    input_t = tensor_img
    input_t = input_t/255.0
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    input_t = normalize(input_t)
    input_t = input_t.unsqueeze(0).float()
    with torch.no_grad():
        out = model(input_t)
    # print("out", out.shape)
    result = out[0][:, -h:, -w:].cpu().numpy()
    # print(result.shape)

    return result[0]

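# inference_img() expects an HxWx3 RGB array, reflect-pads the top/left so height and width
# become multiples of 8, normalizes with ImageNet statistics, and crops the prediction back
# to HxW. Minimal usage sketch (the file name below is only an example):
#   img = np.array(Image.open("example.jpg").convert("RGB"))
#   alpha = inference_img(model, img)
#   Image.fromarray(np.uint8(alpha * 255)).save("example_alpha.png")
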
def test_am2k(model):
    ############################
    # Some initial settings for paths
    ############################
    ORIGINAL_PATH = config['datasets']['am2k']['validation_original']
    MASK_PATH = config['datasets']['am2k']['validation_mask']
    TRIMAP_PATH = config['datasets']['am2k']['validation_trimap']
    img_paths = glob.glob(ORIGINAL_PATH+"/*.jpg")

    ############################
    # Start testing
    ############################
    sad_diffs = 0.
    mse_diffs = 0.
    mad_diffs = 0.
    grad_diffs = 0.
    conn_diffs = 0.
    sad_trimap_diffs = 0.
    mse_trimap_diffs = 0.
    mad_trimap_diffs = 0.
    sad_fg_diffs = 0.
    sad_bg_diffs = 0.

    total_number = len(img_paths)
    log("===============================")
    log(f'====> Start Testing\n\t--Dataset: AM2k\n\t-\n\t--Number: {total_number}')

    for img_path in tqdm.tqdm(img_paths):
        img_name = (img_path.split("/")[-1])[:-4]
        alpha_path = MASK_PATH+img_name+'.png'
        trimap_path = TRIMAP_PATH+img_name+'.png'
        pil_img = Image.open(img_path)
        img = np.array(pil_img)
        trimap = np.array(Image.open(trimap_path))
        alpha = np.array(Image.open(alpha_path))/255.
        img = img[:, :, :3] if img.ndim > 2 else img
        trimap = trimap[:, :, 0] if trimap.ndim > 2 else trimap
        alpha = alpha[:, :, 0] if alpha.ndim > 2 else alpha

        with torch.no_grad():
            # torch.cuda.empty_cache()
            predict = inference_img(model, img)

        sad_trimap_diff, mse_trimap_diff, mad_trimap_diff = calculate_sad_mse_mad(predict, alpha, trimap)
        sad_diff, mse_diff, mad_diff = calculate_sad_mse_mad_whole_img(predict, alpha)
        sad_fg_diff, sad_bg_diff = calculate_sad_fgbg(predict, alpha, trimap)
        conn_diff = compute_connectivity_loss_whole_image(predict, alpha)
        grad_diff = compute_gradient_whole_image(predict, alpha)

        log(f"[{img_paths.index(img_path)}/{total_number}]\nImage:{img_name}\nsad:{sad_diff}\nmse:{mse_diff}\nmad:{mad_diff}\nsad_trimap:{sad_trimap_diff}\nmse_trimap:{mse_trimap_diff}\nmad_trimap:{mad_trimap_diff}\nsad_fg:{sad_fg_diff}\nsad_bg:{sad_bg_diff}\nconn:{conn_diff}\ngrad:{grad_diff}\n-----------")

        sad_diffs += sad_diff
        mse_diffs += mse_diff
        mad_diffs += mad_diff
        mse_trimap_diffs += mse_trimap_diff
        sad_trimap_diffs += sad_trimap_diff
        mad_trimap_diffs += mad_trimap_diff
        sad_fg_diffs += sad_fg_diff
        sad_bg_diffs += sad_bg_diff
        conn_diffs += conn_diff
        grad_diffs += grad_diff
        Image.fromarray(np.uint8(predict*255)).save(f"test/{img_name}.png")

    log("===============================")
    log(f"Testing numbers: {total_number}")

    log("SAD: {}".format(sad_diffs / total_number))
    log("MSE: {}".format(mse_diffs / total_number))
    log("MAD: {}".format(mad_diffs / total_number))
    log("GRAD: {}".format(grad_diffs / total_number))
    log("CONN: {}".format(conn_diffs / total_number))
    log("SAD TRIMAP: {}".format(sad_trimap_diffs / total_number))
    log("MSE TRIMAP: {}".format(mse_trimap_diffs / total_number))
    log("MAD TRIMAP: {}".format(mad_trimap_diffs / total_number))
    log("SAD FG: {}".format(sad_fg_diffs / total_number))
    log("SAD BG: {}".format(sad_bg_diffs / total_number))
    return sad_diffs/total_number, mse_diffs/total_number, grad_diffs/total_number

def test_p3m10k(model, dataset_choice, max_image=-1):
    ############################
    # Some initial settings for paths
    ############################
    if dataset_choice == 'P3M_500_P':
        val_option = 'VAL500P'
    else:
        val_option = 'VAL500NP'
    ORIGINAL_PATH = config['datasets']['p3m10k']+"/validation/"+config['datasets']['p3m10k_test'][val_option]['ORIGINAL_PATH']
    MASK_PATH = config['datasets']['p3m10k']+"/validation/"+config['datasets']['p3m10k_test'][val_option]['MASK_PATH']
    TRIMAP_PATH = config['datasets']['p3m10k']+"/validation/"+config['datasets']['p3m10k_test'][val_option]['TRIMAP_PATH']
    ############################
    # Start testing
    ############################
    sad_diffs = 0.
    mse_diffs = 0.
    mad_diffs = 0.
    sad_trimap_diffs = 0.
    mse_trimap_diffs = 0.
    mad_trimap_diffs = 0.
    sad_fg_diffs = 0.
    sad_bg_diffs = 0.
    conn_diffs = 0.
    grad_diffs = 0.
    model.eval()
    img_paths = glob.glob(ORIGINAL_PATH+"/*.jpg")
    if max_image > 1:
        img_paths = img_paths[:max_image]
    total_number = len(img_paths)
    log("===============================")
    log(f'====> Start Testing\n\t----Test: {dataset_choice}\n\t--Number: {total_number}')

    for img_path in tqdm.tqdm(img_paths):
        img_name = (img_path.split("/")[-1])[:-4]
        alpha_path = MASK_PATH+img_name+'.png'
        trimap_path = TRIMAP_PATH+img_name+'.png'
        pil_img = Image.open(img_path)
        img = np.array(pil_img)

        trimap = np.array(Image.open(trimap_path))
        alpha = np.array(Image.open(alpha_path))/255.
        img = img[:, :, :3] if img.ndim > 2 else img
        trimap = trimap[:, :, 0] if trimap.ndim > 2 else trimap
        alpha = alpha[:, :, 0] if alpha.ndim > 2 else alpha
        with torch.no_grad():
            # torch.cuda.empty_cache()
            start = time.time()

            predict = inference_img(model, img)  # HYBRID shows less accuracy

            # tensorimg = transforms.ToTensor()(pil_img)
            # input_img = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(tensorimg)

            # predict = model(input_img.unsqueeze(0).to(device))[0][0].detach().cpu().numpy()
            # if predict.shape!=(pil_img.height,pil_img.width):
            #     print("resize for ",img_path)
            #     predict = resize(predict,(pil_img.height,pil_img.width))

        sad_trimap_diff, mse_trimap_diff, mad_trimap_diff = calculate_sad_mse_mad(predict, alpha, trimap)
        sad_diff, mse_diff, mad_diff = calculate_sad_mse_mad_whole_img(predict, alpha)

        sad_fg_diff, sad_bg_diff = calculate_sad_fgbg(predict, alpha, trimap)
        conn_diff = compute_connectivity_loss_whole_image(predict, alpha)
        grad_diff = compute_gradient_whole_image(predict, alpha)
        log(f"[{img_paths.index(img_path)}/{total_number}]\nImage:{img_name}\nsad:{sad_diff}\nmse:{mse_diff}\nmad:{mad_diff}\nconn:{conn_diff}\ngrad:{grad_diff}\n-----------")
        sad_diffs += sad_diff
        mse_diffs += mse_diff
        mad_diffs += mad_diff
        mse_trimap_diffs += mse_trimap_diff
        sad_trimap_diffs += sad_trimap_diff
        mad_trimap_diffs += mad_trimap_diff
        sad_fg_diffs += sad_fg_diff
        sad_bg_diffs += sad_bg_diff
        conn_diffs += conn_diff
        grad_diffs += grad_diff

        Image.fromarray(np.uint8(predict*255)).save(f"test/{img_name}.png")

    log("===============================")
    log(f"Testing numbers: {total_number}")
    log("SAD: {}".format(sad_diffs / total_number))
    log("MSE: {}".format(mse_diffs / total_number))
    log("MAD: {}".format(mad_diffs / total_number))
    log("SAD TRIMAP: {}".format(sad_trimap_diffs / total_number))
    log("MSE TRIMAP: {}".format(mse_trimap_diffs / total_number))
    log("MAD TRIMAP: {}".format(mad_trimap_diffs / total_number))
    log("SAD FG: {}".format(sad_fg_diffs / total_number))
    log("SAD BG: {}".format(sad_bg_diffs / total_number))
    log("CONN: {}".format(conn_diffs / total_number))
    log("GRAD: {}".format(grad_diffs / total_number))

    return sad_diffs/total_number, mse_diffs/total_number, grad_diffs/total_number

def log(str):
    print(str)
    logging.info(str)

if __name__ == '__main__':
    print('*********************************')
    config = OmegaConf.load("base.yaml")
    config = OmegaConf.merge(config, OmegaConf.from_cli())
    print(config)
    model = MaskForm()
    model = model.to(device)
    checkpoint = f"{config.checkpoint_dir}/{config.checkpoint}"
    state_dict = torch.load(checkpoint, map_location=f'{device}')
    print("loaded", checkpoint)
    model.load_state_dict(state_dict)
    model.eval()
    logging.basicConfig(filename=f'report/{config.checkpoint.replace("/","--")}.report', encoding='utf-8', filemode='w', level=logging.INFO)
    # ckpt = torch.load("checkpoints/p3mnet_pretrained_on_p3m10k.pth")
    # model.load_state_dict(ckpt['state_dict'], strict=True)
    # model = model.cuda()
    if config.dataset_to_use == "AM2K":
        test_am2k(model)
    else:
        for dataset_choice in ['P3M_500_P', 'P3M_500_NP']:
            test_p3m10k(model, dataset_choice)
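
# Example invocation (a sketch; keys come from base.yaml and can be overridden on the
# command line through OmegaConf.from_cli(); the values shown are placeholders):
#   python test.py dataset_to_use=AM2K checkpoint=stylematte.pth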