七扭八歪解faster rcnn(keras版)(三)

来源:互联网 发布:node dev 安装与使用 编辑:程序博客网 时间:2024/04/29 15:35

前边得到的anchor只区分了背景和圈中物体,并没有判别物体属于哪一类



目前看该代码,anchor后边并没有接softmax来判断是不是一个物体,而是用sigmoid给每个anchor输出一个"是物体"的概率(见下面的rpn_out_class层);每个anchor是正样本还是负样本,前边的代码已经确定了


def rpn(base_layers, num_anchors):
    """Attach the region proposal network head to a shared feature map.

    A 3x3 convolution with 512 filters is applied to ``base_layers``. Two 1x1
    convolutions branch off its output: a sigmoid layer with ``num_anchors``
    channels and a linear layer with ``4 * num_anchors`` channels.

    Returns:
        The list ``[x_class, x_regr, base_layers]``.
    """
    # Shared 3x3 convolution feeding both output branches.
    shared_conv = Convolution2D(
        512, (3, 3),
        padding='same',
        activation='relu',
        kernel_initializer='normal',
        name='rpn_conv1',
    )
    hidden = shared_conv(base_layers)

    # One sigmoid score per anchor at every spatial position.
    class_conv = Convolution2D(
        num_anchors, (1, 1),
        activation='sigmoid',
        kernel_initializer='uniform',
        name='rpn_out_class',
    )
    objectness = class_conv(hidden)

    # Four regression values per anchor at every spatial position.
    regr_conv = Convolution2D(
        num_anchors * 4, (1, 1),
        activation='linear',
        kernel_initializer='zero',
        name='rpn_out_regress',
    )
    box_deltas = regr_conv(hidden)

    return [objectness, box_deltas, base_layers]
很简单:先在特征图上做一次3×3卷积,再分别用两个1×1卷积输出空间大小与特征图相同的两个张量——num_anchors个通道的x_class(每个anchor一个是否为物体的概率),以及num_anchors*4个通道的x_regr(每个anchor对应中心点坐标和宽高共四个回归值)


# Compile the RPN model with the Adam optimizer (lr=1e-4) and a list of two
# losses built for `num_anchors` anchors: the classification loss first, the
# regression loss second (Keras matches a loss list to model outputs in order).
model_rpn.compile(optimizer=Adam(lr=1e-4), loss=[losses.rpn_loss_cls(num_anchors), losses.rpn_loss_regr(num_anchors)])

def rpn_loss_regr(num_anchors):
    """Build the RPN bounding-box regression loss for ``num_anchors`` anchors.

    The returned function expects ``y_true`` to hold two halves stacked along
    the channel axis: the first ``4 * num_anchors`` channels weight every
    regression term (they multiply the loss and also normalise it), and the
    remaining channels are the regression targets compared with ``y_pred``.

    Returns:
        A ``loss(y_true, y_pred)`` callable usable in ``model.compile``.
    """
    split = 4 * num_anchors

    def rpn_loss_regr_fixed_num(y_true, y_pred):
        # The channel axis is axis 1 for 'th' ordering and the last axis otherwise.
        if K.image_dim_ordering() == 'th':
            weights = y_true[:, :split, :, :]
            targets = y_true[:, split:, :, :]
        else:
            weights = y_true[:, :, :, :split]
            targets = y_true[:, :, :, split:]

        x = targets - y_pred
        x_abs = K.abs(x)
        # Cast the comparison to the backend float type in BOTH layouts so the
        # mask can be multiplied with float tensors regardless of backend.
        x_bool = K.cast(K.less_equal(x_abs, 1.0), K.floatx())
        # Smooth L1: quadratic where |x| <= 1, linear elsewhere.
        smooth_l1 = x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5)
        return lambda_rpn_regr * K.sum(weights * smooth_l1) / K.sum(epsilon + weights)

    return rpn_loss_regr_fixed_num


def rpn_loss_cls(num_anchors):
    """Build the RPN objectness (classification) loss for ``num_anchors`` anchors.

    The returned function expects ``y_true`` to hold two halves stacked along
    the channel axis: the first ``num_anchors`` channels weight every anchor
    (they multiply the loss and also normalise it), and the remaining channels
    are the binary targets compared with ``y_pred``.

    Returns:
        A ``loss(y_true, y_pred)`` callable usable in ``model.compile``.
    """

    def rpn_loss_cls_fixed_num(y_true, y_pred):
        if K.image_dim_ordering() == 'tf':
            weights = y_true[:, :, :, :num_anchors]
            targets = y_true[:, :, :, num_anchors:]
        else:
            weights = y_true[:, :num_anchors, :, :]
            targets = y_true[:, num_anchors:, :, :]

        # Pass the arguments by keyword: the positional order of the backend
        # crossentropy functions was inverted in Keras 2.0.7, so a positional
        # call silently swaps prediction and target on one side of that release.
        cross_entropy = K.binary_crossentropy(target=targets, output=y_pred[:, :, :, :])
        return lambda_rpn_class * K.sum(weights * cross_entropy) / K.sum(epsilon + weights)

    return rpn_loss_cls_fixed_num
求两个loss

由于在实际过程中,Ncls和Nreg差距过大,用参数λ平衡二者(如Ncls=256,Nreg=2400时设置λ=10),使总的网络Loss计算过程中能够均匀考虑2种Loss


# Fetch the next training sample and run one optimisation step of the RPN on it.
X, Y, img_data = next(data_gen_train)
loss_rpn = model_rpn.train_on_batch(X, Y)

# Run the just-updated RPN on the same input; its first two outputs are handed
# to rpn_to_roi, here with overlap_thresh=0.7 and at most 300 boxes.
P_rpn = model_rpn.predict_on_batch(X)
R = roi_helpers.rpn_to_roi(P_rpn[0], P_rpn[1], C, K.image_dim_ordering(), use_regr=True, overlap_thresh=0.7, max_boxes=300)
这里我理解是先训练了一下,然后用训练后的参数做预测,将预测得到的框住物体的概率和框的中心点坐标宽高拿到,进入roi层

def rpn_to_roi(rpn_layer, regr_layer, C, dim_ordering, use_regr=True, max_boxes=300, overlap_thresh=0.9):
    """Turn the raw RPN outputs for one image into candidate boxes.

    Args:
        rpn_layer: RPN score output with a leading batch axis of size 1, laid
            out as (1, rows, cols, anchors) for 'tf' ordering or
            (1, anchors, rows, cols) for 'th' ordering.
        regr_layer: RPN regression output in the same ordering, with four
            channels per anchor.
        C: configuration object; ``std_scaling``, ``anchor_box_scales``,
            ``anchor_box_ratios`` and ``rpn_stride`` are read from it.
        dim_ordering: ``'tf'`` or ``'th'``.
        use_regr: when true, every anchor box is adjusted by ``apply_regr_np``
            using the matching slice of ``regr_layer``.
        max_boxes: forwarded to ``non_max_suppression_fast``.
        overlap_thresh: forwarded to ``non_max_suppression_fast``.

    Returns:
        The first element of the value returned by
        ``non_max_suppression_fast`` (presumably the kept boxes as
        (x1, y1, x2, y2) rows in feature-map coordinates; confirm against
        that helper).

    Raises:
        ValueError: if ``dim_ordering`` is neither ``'tf'`` nor ``'th'``.
        AssertionError: if the batch axis of ``rpn_layer`` is not 1.
    """
    if dim_ordering not in ('th', 'tf'):
        # Previously this fell through and failed later with a NameError.
        raise ValueError("dim_ordering must be 'th' or 'tf', got {!r}".format(dim_ordering))

    regr_layer = regr_layer / C.std_scaling

    anchor_sizes = C.anchor_box_scales
    anchor_ratios = C.anchor_box_ratios

    assert rpn_layer.shape[0] == 1

    channels_first = dim_ordering == 'th'
    if channels_first:
        num_anchors, rows, cols = rpn_layer.shape[1:]
    else:
        rows, cols, num_anchors = rpn_layer.shape[1:]

    # A[k, row, col, anchor] holds coordinate k of the box for that anchor.
    A = np.zeros((4, rows, cols, num_anchors))

    # The grid of feature-map positions is the same for every anchor shape.
    X, Y = np.meshgrid(np.arange(cols), np.arange(rows))

    curr_layer = 0
    for anchor_size in anchor_sizes:
        for anchor_ratio in anchor_ratios:
            # Anchor width/height expressed in feature-map cells.
            anchor_x = (anchor_size * anchor_ratio[0]) / C.rpn_stride
            anchor_y = (anchor_size * anchor_ratio[1]) / C.rpn_stride

            # Regression values of the current anchor, arranged as (4, rows, cols).
            if channels_first:
                regr = regr_layer[0, 4 * curr_layer:4 * curr_layer + 4, :, :]
            else:
                regr = regr_layer[0, :, :, 4 * curr_layer:4 * curr_layer + 4]
                regr = np.transpose(regr, (2, 0, 1))

            # Start from (x, y, w, h) with the anchor centred on each cell.
            A[0, :, :, curr_layer] = X - anchor_x / 2
            A[1, :, :, curr_layer] = Y - anchor_y / 2
            A[2, :, :, curr_layer] = anchor_x
            A[3, :, :, curr_layer] = anchor_y

            if use_regr:
                A[:, :, :, curr_layer] = apply_regr_np(A[:, :, :, curr_layer], regr)

            # Enforce a minimum size of 1, then convert (x, y, w, h) -> (x1, y1, x2, y2).
            A[2, :, :, curr_layer] = np.maximum(1, A[2, :, :, curr_layer])
            A[3, :, :, curr_layer] = np.maximum(1, A[3, :, :, curr_layer])
            A[2, :, :, curr_layer] += A[0, :, :, curr_layer]
            A[3, :, :, curr_layer] += A[1, :, :, curr_layer]

            # Clip the boxes to the feature map.
            A[0, :, :, curr_layer] = np.maximum(0, A[0, :, :, curr_layer])
            A[1, :, :, curr_layer] = np.maximum(0, A[1, :, :, curr_layer])
            A[2, :, :, curr_layer] = np.minimum(cols - 1, A[2, :, :, curr_layer])
            A[3, :, :, curr_layer] = np.minimum(rows - 1, A[3, :, :, curr_layer])

            curr_layer += 1

    # Flatten boxes in (anchor, row, col) order, one (x1, y1, x2, y2) row each.
    all_boxes = np.reshape(A.transpose((0, 3, 1, 2)), (4, -1)).transpose((1, 0))

    # Flatten the scores in the SAME (anchor, row, col) order. For 'th' the
    # layer is already (1, anchors, rows, cols); transposing it as if it were
    # channels-last would misalign scores and boxes.
    if channels_first:
        all_probs = rpn_layer.reshape((-1))
    else:
        all_probs = rpn_layer.transpose((0, 3, 1, 2)).reshape((-1))

    x1 = all_boxes[:, 0]
    y1 = all_boxes[:, 1]
    x2 = all_boxes[:, 2]
    y2 = all_boxes[:, 3]

    # Drop degenerate boxes whose width or height is not positive.
    idxs = np.where((x1 - x2 >= 0) | (y1 - y2 >= 0))
    all_boxes = np.delete(all_boxes, idxs, 0)
    all_probs = np.delete(all_probs, idxs, 0)

    result = non_max_suppression_fast(all_boxes, all_probs, overlap_thresh=overlap_thresh, max_boxes=max_boxes)[0]
    return result
前面这一大段没有逐行细看,大意是:对每种anchor,在特征图的每个位置生成一个框,(use_regr为True时)用回归值修正,再把坐标裁剪到特征图范围内;最后把所有的框以及对应的"框住物体"的概率中满足x1-x2>=0或y1-y2>=0的去掉,因为这样的框宽或高不为正,明显不符合我们的逻辑


下边进行非最大值抑制:将所有框按照框住物体的概率大小排列,依次挑出概率最大的框和剩下的框做比较,把与它的重叠IOU(就是交集比并集)超过设置的阈值的框去除(rpn_to_roi里overlap_thresh的默认值是0.9,上面调用时传入的是0.7)


# note: calc_iou converts from (x1,y1,x2,y2) to (x,y,w,h) format
# The call must sit on its own line; when fused onto the comment line above it
# is part of the comment and never runs.
X2, Y1, Y2 = roi_helpers.calc_iou(R, img_data, C, class_mapping)
def calc_iou(R, img_data, C, class_mapping):
    """Match proposed ROIs to ground-truth boxes and build classifier targets.

    Args:
        R: array of proposals, one (x1, y1, x2, y2) row per ROI.
        img_data: dict with keys ``'bboxes'``, ``'width'`` and ``'height'``;
            every bbox entry provides ``'x1'``, ``'x2'``, ``'y1'``, ``'y2'``
            and ``'class'``.
        C: configuration object; ``im_size``, ``rpn_stride``,
            ``classifier_min_overlap``, ``classifier_max_overlap`` and
            ``classifier_regr_std`` are read from it.
        class_mapping: maps a class name (including ``'bg'``) to its index.

    Returns:
        ``(X, Y1, Y2)``, each with a leading axis of size 1 added: ``X`` holds
        the kept ROIs as (x, y, w, h), ``Y1`` the one-hot class labels and
        ``Y2`` the regression label mask concatenated with the regression
        targets. Returns ``(None, None, None)`` when no ROI is kept.
    """
    bboxes = img_data['bboxes']
    (width, height) = (img_data['width'], img_data['height'])
    # get image dimensions for resizing
    resized_width, resized_height, _ = data_generators.get_new_img_size(width, height, C.im_size)

    # One row per ground-truth box, stored in the order (x1, x2, y1, y2).
    gta = np.zeros((len(bboxes), 4))

    for bbox_num, bbox in enumerate(bboxes):
        # get the GT box coordinates, and resize to account for image resizing
        # (the extra division by C.rpn_stride puts them in feature-map units)
        gta[bbox_num, 0] = int(round(bbox['x1'] * (resized_width / float(width))/C.rpn_stride))
        gta[bbox_num, 1] = int(round(bbox['x2'] * (resized_width / float(width))/C.rpn_stride))
        gta[bbox_num, 2] = int(round(bbox['y1'] * (resized_height / float(height))/C.rpn_stride))
        gta[bbox_num, 3] = int(round(bbox['y2'] * (resized_height / float(height))/C.rpn_stride))

跟之前一样,先把坐标转换到resized后的尺寸,再除以rpn_stride换算到特征图上;gta数组的每一行是转换后的bounding box坐标,注意存放顺序是(x1, x2, y1, y2),而不是(x1, y1, x2, y2)

# Accumulators: kept ROIs, one-hot class labels, regression targets and the
# mask marking which regression slots are active.
x_roi = []
y_class_num = []
y_class_regr_coords = []
y_class_regr_label = []

for ix in range(R.shape[0]):
    (x1, y1, x2, y2) = R[ix, :]
    x1 = int(round(x1))
    y1 = int(round(y1))
    x2 = int(round(x2))
    y2 = int(round(y2))

    # Find the ground-truth box that overlaps this ROI the most.
    best_iou = 0.0
    best_bbox = -1
    for bbox_num in range(len(bboxes)):
        # gta rows are (x1, x2, y1, y2); reorder them to (x1, y1, x2, y2) for iou().
        curr_iou = data_generators.iou([gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1], gta[bbox_num, 3]], [x1, y1, x2, y2])
        if curr_iou > best_iou:
            best_iou = curr_iou
            best_bbox = bbox_num

    # ROIs that barely overlap any ground-truth box are not used at all.
    if best_iou < C.classifier_min_overlap:
        continue

    w = x2 - x1
    h = y2 - y1
    x_roi.append([x1, y1, w, h])

    if C.classifier_min_overlap <= best_iou < C.classifier_max_overlap:
        # hard negative example
        cls_name = 'bg'
    elif C.classifier_max_overlap <= best_iou:
        cls_name = bboxes[best_bbox]['class']
        # Centre of the matched ground-truth box and of the ROI.
        cxg = (gta[best_bbox, 0] + gta[best_bbox, 1]) / 2.0
        cyg = (gta[best_bbox, 2] + gta[best_bbox, 3]) / 2.0
        cx = x1 + w / 2.0
        cy = y1 + h / 2.0
        # Regression targets: centre offset relative to the ROI size and
        # log-ratio of the ground-truth size to the ROI size.
        tx = (cxg - cx) / float(w)
        ty = (cyg - cy) / float(h)
        tw = np.log((gta[best_bbox, 1] - gta[best_bbox, 0]) / float(w))
        th = np.log((gta[best_bbox, 3] - gta[best_bbox, 2]) / float(h))
    else:
        print('roi = {}'.format(best_iou))
        raise RuntimeError
拿到经过非最大值抑制的bounding box四个值,分别和所有的ground truth框计算交并比(IoU),取其中最大的作为该框的best_iou;如果best_iou小于C.classifier_min_overlap,

则忽略该框;如果best_iou在classifier_min_overlap和classifier_max_overlap之间,那么该框的cls_name为bg背景;当best_iou大于等于classifier_max_overlap(该值默认config为0.5)时,拿到该框对应的class类型,然后算出该框相对ground truth需要移动和缩放的值(tx、ty、tw、th)

    class_num = class_mapping[cls_name]    class_label = len(class_mapping) * [0]    class_label[class_num] = 1    y_class_num.append(copy.deepcopy(class_label))    coords = [0] * 4 * (len(class_mapping) - 1)    labels = [0] * 4 * (len(class_mapping) - 1)    if cls_name != 'bg':        label_pos = 4 * class_num        sx, sy, sw, sh = C.classifier_regr_std        coords[label_pos:4+label_pos] = [sx*tx, sy*ty, sw*tw, sh*th]        labels[label_pos:4+label_pos] = [1, 1, 1, 1]        y_class_regr_coords.append(copy.deepcopy(coords))        y_class_regr_label.append(copy.deepcopy(labels))    else:        y_class_regr_coords.append(copy.deepcopy(coords))        y_class_regr_label.append(copy.deepcopy(labels))if len(x_roi) == 0:    return None, None, NoneX = np.array(x_roi)Y1 = np.array(y_class_num)Y2 = np.concatenate([np.array(y_class_regr_label),np.array(y_class_regr_coords)],axis=1)return np.expand_dims(X, axis=0), np.expand_dims(Y1, axis=0), np.expand_dims(Y2, axis=0)

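先将对应的class_label置为1(从代码的用法看,class_mapping是"类别名→类别编号"的映射,class_mapping[cls_name]取到的就是该类别在one-hot向量中的下标),最后把数据叠在一起,返回标记和移动缩放坐标(这里乘的sx,sy,sw,sh取自C.classifier_regr_std,分别对tx,ty,tw,th四个回归目标做缩放,具体为什么这样取值我也不是多理解)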


参考文章链接:

https://zhuanlan.zhihu.com/p/28585873

https://zhuanlan.zhihu.com/p/24916624


原创粉丝点击