a fix on gt generation (tusen-ai#260)

kfxw · RogerChern · commit 070fa02d65ca · 2019-11-26T19:40:48.000+08:00
* a fix on gt generation

* refine the comments in all fcos related files
diff --git a/config/fcos_r50v1_fpn_1x.py b/config/fcos_r50v1_fpn_1x.py
@@ -101,7 +101,7 @@ class FCOSFPNAssignParam:
                   [512, INF],
                  ]
         stride = (8, 16, 32, 64, 128)
-        num_classifier = 81 - 1			# COCO: 80 object + 1 background
+        num_classifier = 81 - 1				# COCO: 80 object + 1 background
         ignore_label = RpnParam.loss_setting.ignore_label
         ignore_offset = RpnParam.loss_setting.ignore_offset
         data_size = [PadParam.short, PadParam.long]
diff --git a/models/FCOS/input.py b/models/FCOS/input.py
@@ -6,14 +6,11 @@
 import mxnet as mx
 import time
 
-class DetectionAugmentation(object):
-    def __init__(self):
-        pass
-
-    def apply(self, input_record):
-        pass
-
-
+# Preparation to generate gt
+# output: 
+#   loc_x/loc_y:            [int array] xy coordinates for sampling in each scale
+#   stage_lower/upperbound: [int array] the scale range of each FPN stage, used in FPN stage assignment, defined in config file
+#   nonignore_area:         [boolean array] non-padding area under each scale
 class PreMakeFCOSgt(mx.operator.CustomOp):
 
     def __init__(self, fcos_gt_setting):
@@ -22,7 +19,6 @@ def __init__(self, fcos_gt_setting):
         self.stride = self.p.stride
         self.stages = self.p.stages			# type: FCOSFPNAssignParam
 
-        # make locations
         self.data_size = self.p.data_size
         h, w = self.data_size
         self.loc_x = []
@@ -32,6 +28,7 @@ def __init__(self, fcos_gt_setting):
         self.stage_lowerbound = [-1e-5, 64, 128, 256, 512]
         self.stage_upperbound = [64, 128, 256, 512, 1e5]
         for idx, stride in enumerate(self.stride):
+            # make sampling coordinate maps
             x = np.array(range(0,w,stride), dtype=np.float32) + stride/2.
             y = np.array(range(0,h,stride), dtype=np.float32) + stride/2.
             x, y = np.meshgrid(x, y)
@@ -41,6 +38,7 @@ def __init__(self, fcos_gt_setting):
             self.loc_y.append(y.reshape(-1))
             self.loc_x_T.append(y.T.reshape(-1))
             self.loc_y_T.append(x.T.reshape(-1))
+            # convert numpy/list to ndarray
             self.stage_lowerbound[idx] = mx.nd.full(self.loc_x[-1].shape, self.stage_lowerbound[idx])
             self.stage_upperbound[idx] = mx.nd.full(self.loc_x[-1].shape, self.stage_upperbound[idx])
         self.loc_x = mx.nd.concat(*(self.loc_x), dim=0)
@@ -52,24 +50,26 @@ def __init__(self, fcos_gt_setting):
 
     def forward(self, is_train, req, in_data, out_data, aux):
         context = in_data[0].context
-        if self.loc_x.context != context:		# execute only once
+        if self.loc_x.context != context:		# execute only once, load arrays into gpu
             self.loc_x = self.loc_x.as_in_context(context)
             self.loc_y = self.loc_y.as_in_context(context)
             self.loc_x_T = self.loc_x_T.as_in_context(context)
             self.loc_y_T = self.loc_y_T.as_in_context(context)
             self.stage_lowerbound = self.stage_lowerbound.as_in_context(context)
             self.stage_upperbound = self.stage_upperbound.as_in_context(context)
 
-        ori_h = in_data[1][0,0]				# aspect_ratio_grouping ensures all aspect ratios within a batch are same
+        ori_h = in_data[1][0,0]				# 'aspect_ratio_grouping' in 'detection_input.py' ensures all aspect ratios within a batch are the same
         ori_w = in_data[1][0,1]
 
         if ori_h < ori_w:
             self.assign(out_data[0], req[0], self.loc_x)
             self.assign(out_data[1], req[1], self.loc_y)
+            # filter out image padding area
             nonignore_area = mx.nd.logical_and(lhs=(self.loc_x<ori_w), rhs=(self.loc_y<ori_h))
         else:
             self.assign(out_data[0], req[0], self.loc_x_T)
             self.assign(out_data[1], req[1], self.loc_y_T)
+            # filter out image padding area
             nonignore_area = mx.nd.logical_and(lhs=(self.loc_x_T<ori_w), rhs=(self.loc_y_T<ori_h))
 
         self.assign(out_data[2], req[2], self.stage_lowerbound)
@@ -80,7 +80,7 @@ def forward(self, is_train, req, in_data, out_data, aux):
 
     def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
         pass
-
+        
 @mx.operator.register("make_fcos_gt_preparation")
 class PreMakeFCOSGTProp(mx.operator.CustomOpProp):
     def __init__(self):
@@ -112,50 +112,113 @@ def infer_type(self, in_type):
     def create_operator(self, ctx, shapes, dtypes):
         return PreMakeFCOSgt(self.p)
 
+# --------------------------------------------------------------------------------
+# Preparation to generate classification gt
+# output:
+#   bbox_cls:      bbox's classification annotations, i.e. gt_bbox[:,:,4] (index 4 along the last axis of the bbox annotation)
+#   cls_batch_idx: used for array indexing
+class PrepareFCOS_cls_gt(mx.operator.CustomOp):
+
+    def __init__(self, fcos_gt_setting):
+        super(PrepareFCOS_cls_gt, self).__init__()
+        self.p = fcos_gt_setting
+        self.batch_idx = None
+        self.spatial_idx = None
 
+    def forward(self, is_train, req, in_data, out_data, aux):
+        bboxes = in_data[0]
+        N = bboxes.shape[0]
+        HW = in_data[1].shape[-1]
+        
+        if self.batch_idx is None:			# execute only once
+            self.batch_idx = mx.nd.arange(N).reshape((-1,1)).tile((1,HW))
+
+        self.assign(out_data[0], req[0], bboxes[:,:,4])
+        self.assign(out_data[1], req[1], self.batch_idx)
+        return
+
+    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
+        pass
+
+@mx.operator.register("prepare_fcos_cls_gt")
+class PrepareFCOS_CLS_GTProp(mx.operator.CustomOpProp):
+    def __init__(self):
+        super(PrepareFCOS_CLS_GTProp, self).__init__(need_top_grad=False)
+        from config.fcos_r50v1_fpn_1x import throwout_param
+        self.p = throwout_param
+        self.stride = self.p.stride
+        self.data_size = self.p.data_size
+
+    def list_arguments(self):
+        return ['gt_bbox', 'smallest_box_id']
+
+    def list_outputs(self):
+        return ['bbox_cls', 'cls_batch_idx']
+
+    def infer_shape(self, in_shape):
+        n = in_shape[0][0]
+        h, w = self.data_size
+        hw = 0
+        for stride in self.stride:
+            width = len(range(0,w,stride))
+            height = len(range(0,h,stride))
+            hw += height * width
+        return in_shape, [in_shape[0][:2], [n,hw]], []
+
+    def infer_type(self, in_type):
+        return in_type, [in_type[0], in_type[0]], []
+
+    def create_operator(self, ctx, shapes, dtypes):
+        return PrepareFCOS_cls_gt(self.p)
+
+# --------------------------------------------------------------------------------
+# To describe variable's shapes, 
+#   N: Batch size
+#   M: Number of bbox in gt
+#   HW: Overall spatial size of concatenated gt from different scales, sum(H_8s*W_8s, H_16s*W_16s,..., H_128s*W_128s)
+#   num_cls: Number of object categories, use 80 in coco dataset, defined in config file
 def make_fcos_gt(gt_bbox, im_info, ignore_offset, ignore_label, num_classifier):
+    # Preparations before generating gt
     loc_x, loc_y, stage_lowerbound, stage_upperbound, nonignore_area = mx.sym.Custom(gt_bbox=gt_bbox, im_info=im_info, op_type='make_fcos_gt_preparation', name='pre_fcos_gt')
+    bboxes = gt_bbox								# (N, M, 5), [x,y,x,y,cls]; index 4 is read as the class id by 'prepare_fcos_cls_gt'
 
-    bboxes = gt_bbox
-
-    # compute offset
-    #bboxes_ = mx.sym.expand_dims(bboxes, axis=-1)
+    # Compute offsets to bbox edges at each pixel
     l = mx.sym.broadcast_sub( lhs=loc_x, rhs=mx.sym.slice(bboxes, begin=(None,None,0), end=(None,None,1)) )
     t = mx.sym.broadcast_sub( lhs=loc_y, rhs=mx.sym.slice(bboxes, begin=(None,None,1), end=(None,None,2)) )
     r = mx.sym.broadcast_sub( lhs=mx.sym.slice(bboxes, begin=(None,None,2), end=(None,None,3)), rhs=loc_x )
     b = mx.sym.broadcast_sub( lhs=mx.sym.slice(bboxes, begin=(None,None,3), end=(None,None,4)), rhs=loc_y )
     offset_gt = mx.sym.stack(l,t,r,b, axis=1)					# (N, 4, M, HW)
-    # clean non-box area
+    # Reset non-box area, negative offsets indicate out-of-bbox area
     in_box_area = mx.sym.min(offset_gt, axis=1, keepdims=True) >= 0		# (N, 1, M, HW)
     offset_gt = mx.sym.broadcast_add( lhs=mx.sym.broadcast_mul(lhs=offset_gt, rhs=in_box_area), 
                                       rhs=(1 - in_box_area) * ignore_offset
                                     )						# offset_gt[!in_box_area] = self.ignore_offset
-    # assign stage
-    longest_side = mx.sym.max(offset_gt, axis=1, keepdims=True)			# (N, 1, M, HW)
-    stage_assign_mask = mx.sym.broadcast_logical_and( lhs=mx.sym.broadcast_greater_equal(lhs=longest_side, rhs=stage_lowerbound), 
-                                                      rhs=mx.sym.broadcast_lesser(lhs=longest_side, rhs=stage_upperbound)
-                                                    )					# (N, 1, M, HW)
+    # Assign FPN stage based on offset values
+    greatest_offset = mx.sym.max(offset_gt, axis=1, keepdims=True)		# (N, 1, M, HW)
+    stage_assign_mask = mx.sym.broadcast_logical_and( lhs=mx.sym.broadcast_greater_equal(lhs=greatest_offset, rhs=stage_lowerbound), 
+                                                      rhs=mx.sym.broadcast_lesser(lhs=greatest_offset, rhs=stage_upperbound)
+                                                    )				# (N, 1, M, HW)
     offset_gt = mx.sym.broadcast_add( lhs=mx.sym.broadcast_mul(lhs=offset_gt, rhs=stage_assign_mask), 
                                       rhs=(1 - stage_assign_mask) * ignore_offset
                                     )						# offset[!stage_assign_mask] = self.ignore_offset
-    # fuse box offsets based on box size
+    # Fuse offsets based on bbox sizes through dim M
+    # Smaller bboxes are on the top and cover the larger ones
     box_size = ( mx.sym.slice(offset_gt, begin=(None,0,None,None), end=(None,1,None,None)) + \
                  mx.sym.slice(offset_gt, begin=(None,2,None,None), end=(None,3,None,None)) ) \
              * ( mx.sym.slice(offset_gt, begin=(None,1,None,None), end=(None,2,None,None)) + \
                  mx.sym.slice(offset_gt, begin=(None,3,None,None), end=(None,4,None,None)) )
 										# (offset_gt[:,0,:,:] + offset_gt[:,2,:,:]) * (offset_gt[:,1,:,:] + offset_gt[:,3,:,:])
-    #box_size = mx.sym.expand_dims(box_size, axis=1)
+    # Bbox sizes in out-of-bbox area are set to MAX_BBOX_SIZE so that a bbox can always cover background
     box_size = mx.sym.broadcast_add( lhs=mx.sym.broadcast_mul(lhs=box_size, rhs=stage_assign_mask), 
                                      rhs=(1 - stage_assign_mask) * 1e10
                                    )						# box[!stage_assign_mask] = MAX_BBOX_SIZE
     smallest_box_id = mx.sym.argmin(box_size, axis=2)				# (N, 1, HW)
     smallest_box_ids = mx.sym.tile(smallest_box_id, reps=(1,4,1))		# (N, 4, HW)
     offset_gt = mx.sym.reshape_like( mx.sym.pick(offset_gt, smallest_box_ids, axis=2), smallest_box_ids )
                                                                                 # (N, 4, HW)
-
-    in_box_area = offset_gt != ignore_offset
-
-    # centerness
+                                             
+    # Calculate centerness values using the formula described in the paper                                   
+    in_box_area = offset_gt != ignore_offset					# centerness is only computed inside bboxes
     l_r_sorted = mx.sym.sort(mx.sym.slice(offset_gt, begin=(None,0,None), end=(None,3,None), step=(None,2,None)), axis=1)
 										# mx.nd.sort(offset_gt[:,[0,2],:], axis=1)
     term1_min = mx.sym.reshape(mx.sym.slice(l_r_sorted, begin=(None,0,None), end=(None,1,None)), shape=(0,-1))
@@ -172,20 +235,29 @@ def make_fcos_gt(gt_bbox, im_info, ignore_offset, ignore_label, num_classifier):
     centerness_gt = centerness_gt * mx.sym.reshape(mx.sym.slice(in_box_area, begin=(None,0,None), end=(None,1,None)), shape=(0,-1))
 										# (N, HW), centerness_gt*in_box_area[:,0,:]
 
-    # cls
+    # Classification gt
+    #  smallest_box_id: indicates which bbox is chosen at current position
     smallest_box_id = smallest_box_id.reshape((0,-1))				# (N, HW)
-    cls_gt = mx.sym.one_hot(smallest_box_id, num_classifier)			# (N, HW, num_cls)
+    bbox_cls, cls_batch_idx = mx.sym.Custom(gt_bbox=gt_bbox, smallest_box_id=smallest_box_ids, op_type='prepare_fcos_cls_gt', name='fcos_cls_gt')
+    										# bbox_cls = gt_bbox[:,:,4], cls_batch_idx = [[0,0,...,0],[1,1,...,1],...,[N-1,N-1,...,N-1]]
+    										# bbox_cls: (N, M), cls_batch_idx: (N, HW)
+    cls_id = mx.sym.stack(cls_batch_idx, smallest_box_id, axis=0)
+    # Transform bbox id to bbox's class id, e.g. (0th,0th,1st,5th,...)->(person,person,car,bike,...)
+    cls_gt = mx.sym.gather_nd(bbox_cls, cls_id)					# cls_gt = bbox_cls[cls_id], (N, HW)
+    cls_gt = cls_gt - 1								# shift 1-based class ids to 0-based, e.g. 1~80 -> 0~79 when num_classifier is 80 (one_hot depth is num_classifier)
+    cls_gt = mx.sym.one_hot(cls_gt, num_classifier)				# (N, HW, num_cls), one-hot matrix
     cls_gt = mx.sym.transpose(cls_gt, axes=(0,2,1))				# (N, num_cls, HW)
     cls_gt = mx.sym.broadcast_mul( lhs=cls_gt, rhs=mx.sym.slice(in_box_area, begin=(None,0,None), end=(None,1,None)) )
+    										# cls_gt[:,:,!in_box_area] = 0
         
-    # ignore label
+    # Set ignore labels in the image padding area
     nonignore_area = mx.sym.reshape(nonignore_area, shape=(1,-1))
     centerness_gt = mx.sym.broadcast_add( lhs=mx.sym.broadcast_mul(lhs=centerness_gt, rhs=nonignore_area), 
                                           rhs=(1 - nonignore_area) * ignore_label
-                                        )					# centerness_gt[!nonignore_area] = self.ignore_label
+                                        )					# centerness_gt[:,!nonignore_area] = self.ignore_label
     nonignore_area = mx.sym.reshape(nonignore_area, shape=(1,1,-1))
     cls_gt = mx.sym.broadcast_add( lhs=mx.sym.broadcast_mul(lhs=cls_gt, rhs=nonignore_area), 
                                    rhs=(1 - nonignore_area) * ignore_label
-                                 )		
+                                 )						# cls_gt[:,:,!nonignore_area] = self.ignore_label
 
     return centerness_gt, mx.sym.reshape(cls_gt, shape=(0,-1)), offset_gt
diff --git a/models/FCOS/loss.py b/models/FCOS/loss.py
@@ -1,6 +1,9 @@
 import mxnet as mx
 import mxnext as X
 
+# Use symbol for internal variables computation for better parallelization
+# Use custom python op to control loss/gradient flow
+
 """ ---Sigmoid Focal Loss---
     def forward(self, is_train, req, in_data, out_data, aux):
         logits = in_data[0]
@@ -46,6 +49,8 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
 
         self.assign(in_grad[0], req[0], grad)"""
 
+
+# the formula is better demonstrated above
 class ComputeSigmoidFocalLoss(mx.operator.CustomOp):
     def __init__(self):
         super(ComputeSigmoidFocalLoss, self).__init__()
@@ -79,12 +84,11 @@ def create_operator(self, ctx, shapes, dtypes):
         return ComputeSigmoidFocalLoss()
 
 def make_sigmoid_focal_loss(gamma, alpha, logits, labels, nonignore_mask):
-    # conduct most of calculations using symbol and control gradient flow with custom op
-    p = 1 / (1 + mx.sym.exp(-logits))						# sigmoid
+    p = 1 / (1 + mx.sym.exp(-logits))					# sigmoid
     mask_logits_GE_zero = mx.sym.broadcast_greater_equal(lhs=logits, rhs=mx.sym.zeros((1,1)))
-										# logits>=0
-    minus_logits_mask = -1. * logits * mask_logits_GE_zero			# -1 * logits * [logits>=0]
-    negative_abs_logits = logits - 2*logits*mask_logits_GE_zero			# logtis - 2 * logits * [logits>=0]
+									# logits>=0
+    minus_logits_mask = -1. * logits * mask_logits_GE_zero		# -1 * logits * [logits>=0]
+    negative_abs_logits = logits - 2*logits*mask_logits_GE_zero		# logits - 2 * logits * [logits>=0]
     log_one_exp_minus_abs = mx.sym.log(1. + mx.sym.exp(negative_abs_logits))
     minus_log = minus_logits_mask - log_one_exp_minus_abs
 
@@ -101,14 +105,13 @@ def make_sigmoid_focal_loss(gamma, alpha, logits, labels, nonignore_mask):
     backward_term2 = one_alpha_p_gamma_one_labels * (minus_log  * (1 - p) * gamma - p)
     grad = mx.sym.broadcast_div( lhs=-1 * (backward_term1 + backward_term2) * nonignore_mask, rhs=norm.reshape((1,1)) )
 
-    loss = X.block_grad(loss)
-    grad = X.block_grad(grad)
+    loss = X.block_grad(loss)						# symbols are only used for computation
+    grad = X.block_grad(grad)						# use custom op to control gradient flow instead
 
     loss = mx.sym.Custom(logits=logits, loss=loss, grad=grad, op_type='compute_focal_loss', name='focal_loss')
     return loss
 
-
-
+# -------------------------------------------------------
 class ComputeBCELoss(mx.operator.CustomOp):
     def __init__(self):
         super(ComputeBCELoss, self).__init__()
@@ -152,8 +155,7 @@ def make_binary_cross_entropy_loss(logits, labels, nonignore_mask):
 
     return mx.sym.Custom(logits=logits, loss=loss, grad=grad, op_type='compute_bce_loss', name='sigmoid_bce_loss')
 
-
-
+# -------------------------------------------------------
 def IoULoss(x_box, y_box, ignore_offset, centerness_label, name='iouloss'):
     centerness_label = mx.sym.reshape(centerness_label, shape=(0,1,-1))
     y_box = X.block_grad(y_box)
@@ -163,6 +165,7 @@ def IoULoss(x_box, y_box, ignore_offset, centerness_label, name='iouloss'):
     target_right = mx.sym.slice_axis(y_box, axis=1, begin=2, end=3)
     target_bottom = mx.sym.slice_axis(y_box, axis=1, begin=3, end=4)
 
+    # filter out out-of-bbox area, loss is only computed inside bboxes
     nonignore_mask = mx.sym.broadcast_logical_and(lhs = mx.sym.broadcast_not_equal(lhs=target_left, rhs=ignore_offset),
                                               rhs = mx.sym.broadcast_greater( lhs=centerness_label, rhs=mx.sym.full((1,1,1), 0) )
                                              )
diff --git a/models/FCOS/metric.py b/models/FCOS/metric.py
@@ -1,5 +1,3 @@
-import pdb
-
 import mxnet as mx
 import numpy as np
 
diff --git a/models/FCOS/utils.py b/models/FCOS/utils.py

Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,7 @@ class FCOSFPNAssignParam:`
`101`	`101`	`[512, INF],`
`102`	`102`	`]`
`103`	`103`	`stride = (8, 16, 32, 64, 128)`
`104`		`- num_classifier = 81 - 1 # COCO: 80 object + 1 background`
	`104`	`+ num_classifier = 81 - 1 # COCO: 80 object + 1 background`
`105`	`105`	`ignore_label = RpnParam.loss_setting.ignore_label`
`106`	`106`	`ignore_offset = RpnParam.loss_setting.ignore_offset`
`107`	`107`	`data_size = [PadParam.short, PadParam.long]`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,3 @@`
`1`		`-import pdb`
`2`		`-`
`3`	`1`	`import mxnet as mx`
`4`	`2`	`import numpy as np`
`5`	`3`