syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
// Configuration for the CenterNet meta architecture from the "Objects as
// Points" paper [1].
// [1]: https://arxiv.org/abs/1904.07850
message CenterNet {
  // Number of classes to predict.
  optional int32 num_classes = 1;

  // Feature extractor config.
  optional CenterNetFeatureExtractor feature_extractor = 2;

  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 3;

  // Parameters which are related to the object detection task.
  message ObjectDetection {
    // The original fields are moved to ObjectCenterParams or deleted.
    reserved 2, 5, 6, 7;

    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Weight for the offset localization loss.
    optional float offset_loss_weight = 3 [default = 1.0];

    // Weight for the height/width localization loss.
    optional float scale_loss_weight = 4 [default = 0.1];

    // Localization loss configuration for object scale and offset losses.
    optional LocalizationLoss localization_loss = 8;
  }
  optional ObjectDetection object_detection_task = 4;

  // Parameters related to object center prediction. This is required for both
  // object detection and keypoint estimation tasks.
  message ObjectCenterParams {
    // Weight for the object center loss.
    optional float object_center_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for object center loss.
    optional ClassificationLoss classification_loss = 2;

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 3 [default = -2.19];

    // The minimum IOU overlap boxes need to have to not be penalized.
    optional float min_box_overlap_iou = 4 [default = 0.7];

    // Maximum number of boxes to predict.
    optional int32 max_box_predictions = 5 [default = 100];

    // If set, loss is only computed for the labeled classes.
    optional bool use_labeled_classes = 6 [default = false];
  }
  optional ObjectCenterParams object_center_params = 5;

  // Path of the file that contains the label map along with the keypoint
  // information, including the keypoint indices, corresponding labels, and the
  // corresponding class. The file should be the same one as used in the input
  // pipeline. Note that a plain text of StringIntLabelMap proto is expected in
  // this file.
  // It is required only if the keypoint estimation task is specified.
  optional string keypoint_label_map_path = 6;

  // Parameters which are related to the keypoint estimation task.
  message KeypointEstimation {
    // Name of the task, e.g. "human pose". Note that the task name should be
    // unique to each keypoint task.
    optional string task_name = 1;

    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 2 [default = 1.0];

    // Loss configuration for keypoint heatmap, offset, regression losses. Note
    // that the localization loss is used for offset/regression losses and
    // classification loss is used for heatmap loss.
    optional Loss loss = 3;

    // The name of the class that contains the keypoints for this task. This is
    // used to retrieve the corresponding keypoint indices from the label map.
    // Note that this corresponds to the "name" field, not "display_name".
    optional string keypoint_class_name = 4;

    // The standard deviation of the Gaussian kernel used to generate the
    // keypoint heatmap. The unit is the pixel in the output image. It is to
    // provide the flexibility of using different sizes of Gaussian kernel for
    // each keypoint class. Note that if provided, the keypoint standard
    // deviations will be overridden by the specified values here, otherwise,
    // the default value 5.0 will be used.
    // TODO(yuhuic): Update the default value once we found the best value.
    map<string, float> keypoint_label_to_std = 5;

    // Loss weights corresponding to different heads.
    optional float keypoint_regression_loss_weight = 6 [default = 1.0];
    optional float keypoint_heatmap_loss_weight = 7 [default = 1.0];
    optional float keypoint_offset_loss_weight = 8 [default = 1.0];

    // The initial bias value of the convolution kernel of the keypoint heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 9 [default = -2.19];

    // The heatmap score threshold for a keypoint to become a valid candidate.
    optional float keypoint_candidate_score_threshold = 10 [default = 0.1];

    // The maximum number of candidates to retrieve for each keypoint.
    optional int32 num_candidates_per_keypoint = 11 [default = 100];

    // Max pool kernel size to use to pull off peak score locations in a
    // neighborhood (independently for each keypoint type).
    optional int32 peak_max_pool_kernel_size = 12 [default = 3];

    // The default score to use for regressed keypoints that are not
    // successfully snapped to a nearby candidate.
    optional float unmatched_keypoint_score = 13 [default = 0.1];

    // The multiplier to expand the bounding boxes (either the provided boxes or
    // those which tightly cover the regressed keypoints). Note that the new
    // expanded box for an instance becomes the feasible search window for all
    // associated keypoints.
    optional float box_scale = 14 [default = 1.2];

    // The scale parameter that multiplies the largest dimension of a bounding
    // box. The resulting distance becomes a search radius for candidates in the
    // vicinity of each regressed keypoint.
    optional float candidate_search_scale = 15 [default = 0.3];

    // One of ['min_distance', 'score_distance_ratio'] indicating how to select
    // the keypoint candidate.
    optional string candidate_ranking_mode = 16 [default = "min_distance"];

    // The radius (in the unit of output pixel) around heatmap peak to assign
    // the offset targets. If set to 0, then the offset target will only be
    // assigned to the heatmap peak (same behavior as the original paper).
    optional int32 offset_peak_radius = 17 [default = 0];

    // Indicates whether to assign offsets for each keypoint channel
    // separately. If set to false, the output offset target has the shape
    // [batch_size, out_height, out_width, 2] (same behavior as the original
    // paper). If set to true, the output offset target has the shape
    // [batch_size, out_height, out_width, 2 * num_keypoints] (recommended
    // when the offset_peak_radius is not zero).
    optional bool per_keypoint_offset = 18 [default = false];
  }
  repeated KeypointEstimation keypoint_estimation_task = 7;

  // Parameters which are related to the mask estimation task.
  // Note: Currently, CenterNet supports a weak instance segmentation, where
  // semantic segmentation masks are estimated, and then cropped based on
  // bounding box detections. Therefore, it is possible for the same image
  // pixel to be assigned to multiple instances.
  message MaskEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for segmentation loss.
    optional ClassificationLoss classification_loss = 2;

    // Each instance mask (one per detection) is cropped and resized (bilinear
    // resampling) from the predicted segmentation feature map. After
    // resampling, the masks are binarized with the provided score threshold.
    optional int32 mask_height = 4 [default = 256];
    optional int32 mask_width = 5 [default = 256];
    optional float score_threshold = 6 [default = 0.5];

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1.
    optional float heatmap_bias_init = 3 [default = -2.19];
  }
  optional MaskEstimation mask_estimation_task = 8;
}
// Feature extractor configuration for the CenterNet meta architecture.
message CenterNetFeatureExtractor {
  // Type (name) of the feature extractor to use.
  optional string type = 1;

  // Channel means to be subtracted from each image channel. If not specified,
  // we use a default value of 0.
  repeated float channel_means = 2;

  // Channel standard deviations. Each channel will be normalized by dividing
  // it by its standard deviation. If not specified, we use a default value
  // of 1.
  repeated float channel_stds = 3;

  // If set, will change channel order to be [blue, green, red]. This can be
  // useful to be compatible with some pre-trained feature extractors.
  optional bool bgr_ordering = 4 [default = false];
}