Faster RCNN roi_pooling_layer.cpp roi_pooling_layer.cu

These files mainly define a single class, ROIPoolingLayer.

The protected data members of ROIPoolingLayer are:

int channels_;
int height_;
int width_;
int pooled_height_;    // height of the pooled feature map
int pooled_width_;     // width of the pooled feature map
Dtype spatial_scale_;  // spatial scaling factor; 1/16 = 0.0625 for ZF
Blob<int> max_idx_;    // stores the index of the max element picked during pooling
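The ROI coordinates arriving in bottom[1] are in input-image space, so the layer multiplies them by spatial_scale_ to land on the conv feature map. A minimal sketch of that mapping with illustrative numbers (the rounding mirrors what Forward_cpu does below):

#include <cmath>
#include <cstdio>

int main() {
  // Illustrative ROI in image coordinates: [x1 y1 x2 y2] = [160 96 320 240].
  const float spatial_scale = 0.0625f;  // 1/16 for ZF's conv5
  const int roi_start_w = static_cast<int>(std::round(160 * spatial_scale));  // 10
  const int roi_start_h = static_cast<int>(std::round(96  * spatial_scale));  // 6
  const int roi_end_w   = static_cast<int>(std::round(320 * spatial_scale));  // 20
  const int roi_end_h   = static_cast<int>(std::round(240 * spatial_scale));  // 15
  // The ROI covers an 11 x 10 (w x h) window on the feature map.
  std::printf("%d x %d\n", roi_end_w - roi_start_w + 1, roi_end_h - roi_start_h + 1);
  return 0;
}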

LayerSetUp:

template <typename Dtype>
void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  ROIPoolingParameter roi_pool_param = this->layer_param_.roi_pooling_param();
  CHECK_GT(roi_pool_param.pooled_h(), 0)
      << "pooled_h must be > 0";
  CHECK_GT(roi_pool_param.pooled_w(), 0)
      << "pooled_w must be > 0";
  pooled_height_ = roi_pool_param.pooled_h();
  pooled_width_ = roi_pool_param.pooled_w();
  spatial_scale_ = roi_pool_param.spatial_scale();
  LOG(INFO) << "Spatial scale: " << spatial_scale_;
}
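The three values come from the layer's roi_pooling_param message. A minimal sketch of supplying them programmatically, assuming the standard protobuf-generated setters for ROIPoolingParameter in the Faster R-CNN Caffe fork (the 6x6 bins and 1/16 scale are the values typically used with ZF, shown here only as an example):

#include "caffe/proto/caffe.pb.h"

// Hypothetical setup helper, not part of the layer itself.
caffe::LayerParameter MakeRoiPoolingParam() {
  caffe::LayerParameter layer_param;
  caffe::ROIPoolingParameter* roi_param = layer_param.mutable_roi_pooling_param();
  roi_param->set_pooled_h(6);             // output bins per ROI (6x6 with ZF)
  roi_param->set_pooled_w(6);
  roi_param->set_spatial_scale(0.0625f);  // 1/16: the stride of ZF's conv5
  return layer_param;
}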

Reshape:

template <typename Dtype>
void ROIPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  // top[0] has the same number of channels as bottom[0]; the layer only
  // pools, it does not mix channels.
  top[0]->Reshape(bottom[1]->num(), channels_, pooled_height_,
      pooled_width_);
  max_idx_.Reshape(bottom[1]->num(), channels_, pooled_height_,
      pooled_width_);
}
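Note that the output batch dimension is the number of ROIs, not the image batch size: every ROI gets its own pooled_height_ x pooled_width_ map per channel, and max_idx_ matches that shape exactly. A quick shape check under hypothetical sizes (256-channel conv5, 128 ROIs, 6x6 pooling):

#include <cassert>

int main() {
  // Hypothetical sizes: bottom[0] is 1 x 256 x 39 x 64 (conv5),
  // bottom[1] is 128 x 5 x 1 x 1 (128 ROIs).
  const int num_rois = 128, channels = 256, pooled_h = 6, pooled_w = 6;
  // After Reshape, both top[0] and max_idx_ are num_rois x channels x 6 x 6.
  const int top_count = num_rois * channels * pooled_h * pooled_w;
  assert(top_count == 1179648);  // 128 * 256 * 36 = top[0]->count()
  return 0;
}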

Forward_cpu:

template <typename Dtype>
void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  // bottom_data is the conv feature map of the whole image (conv_5),
  // produced by the convolutional layers in front of this one.
  const Dtype* bottom_data = bottom[0]->cpu_data();
  // bottom_rois holds the ROIs: for each ROI, a batch_index plus the
  // coordinates of the two corner points.
  const Dtype* bottom_rois = bottom[1]->cpu_data();
  // Number of ROIs
  int num_rois = bottom[1]->num();
  int batch_size = bottom[0]->num();
  int top_count = top[0]->count();
  Dtype* top_data = top[0]->mutable_cpu_data();
  caffe_set(top_count, Dtype(-FLT_MAX), top_data);
  int* argmax_data = max_idx_.mutable_cpu_data();
  caffe_set(top_count, -1, argmax_data);

  // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
  for (int n = 0; n < num_rois; ++n) {
    int roi_batch_ind = bottom_rois[0];
    int roi_start_w = round(bottom_rois[1] * spatial_scale_);
    int roi_start_h = round(bottom_rois[2] * spatial_scale_);
    int roi_end_w = round(bottom_rois[3] * spatial_scale_);
    int roi_end_h = round(bottom_rois[4] * spatial_scale_);
    CHECK_GE(roi_batch_ind, 0);
    CHECK_LT(roi_batch_ind, batch_size);

    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    // bin_size_h and bin_size_w are meaningful whether they are > 1 or < 1.
    const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                             / static_cast<Dtype>(pooled_height_);
    const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                             / static_cast<Dtype>(pooled_width_);

    const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind);

    for (int c = 0; c < channels_; ++c) {
      // Each channel is pooled independently.
      for (int ph = 0; ph < pooled_height_; ++ph) {
        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Compute pooling region for this output unit:
          //  start (included) = floor(ph * roi_height / pooled_height_)
          //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
          int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
                                              * bin_size_h));
          int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
                                              * bin_size_w));
          int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                                           * bin_size_h));
          int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                                           * bin_size_w));

          // Locate the region of the conv_5 feature map that this pooling
          // unit covers, expressed as (hstart, hend, wstart, wend), and clip
          // it to the feature map so nothing goes out of bounds.
          hstart = min(max(hstart + roi_start_h, 0), height_);
          hend = min(max(hend + roi_start_h, 0), height_);
          wstart = min(max(wstart + roi_start_w, 0), width_);
          wend = min(max(wend + roi_start_w, 0), width_);

          bool is_empty = (hend <= hstart) || (wend <= wstart);

          const int pool_index = ph * pooled_width_ + pw;
          if (is_empty) {
            top_data[pool_index] = 0;
            argmax_data[pool_index] = -1;
          }

          // Scan the pixels inside the region covered by this pooling unit
          // and record the maximum element and its index.
          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              const int index = h * width_ + w;
              if (batch_data[index] > top_data[pool_index]) {
                top_data[pool_index] = batch_data[index];
                argmax_data[pool_index] = index;
              }
            }
          }
        }
      }
      // Increment all data pointers by one channel; because each channel is
      // pooled independently, the pointers advance together.
      batch_data += bottom[0]->offset(0, 1);
      top_data += top[0]->offset(0, 1);
      argmax_data += max_idx_.offset(0, 1);
    }
    // Increment ROI data pointer.
    // bottom[1] has shape num_rois x 5 x 1 x 1, so offset(1) moves the
    // pointer to the next ROI.
    bottom_rois += bottom[1]->offset(1);
  }
}
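To see the algorithm without the Caffe plumbing, here is a hypothetical, self-contained sketch of the same max pooling for one ROI on a single-channel feature map; all names and sizes are illustrative and not part of the layer:

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <vector>

// Illustrative re-implementation of the per-ROI, per-channel loop above.
// feature is a height x width single-channel map; the ROI corners are given
// in feature-map coordinates (already multiplied by spatial_scale).
std::vector<float> RoiMaxPool(const std::vector<float>& feature,
                              int height, int width,
                              int roi_start_h, int roi_start_w,
                              int roi_end_h, int roi_end_w,
                              int pooled_h, int pooled_w) {
  const int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
  const int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
  const float bin_size_h = static_cast<float>(roi_height) / pooled_h;
  const float bin_size_w = static_cast<float>(roi_width) / pooled_w;

  std::vector<float> out(pooled_h * pooled_w, 0.f);
  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      // Bin boundaries on the feature map, clipped to its extent.
      int hstart = std::min(std::max(static_cast<int>(std::floor(ph * bin_size_h))
                                     + roi_start_h, 0), height);
      int hend = std::min(std::max(static_cast<int>(std::ceil((ph + 1) * bin_size_h))
                                   + roi_start_h, 0), height);
      int wstart = std::min(std::max(static_cast<int>(std::floor(pw * bin_size_w))
                                     + roi_start_w, 0), width);
      int wend = std::min(std::max(static_cast<int>(std::ceil((pw + 1) * bin_size_w))
                                   + roi_start_w, 0), width);
      if (hend <= hstart || wend <= wstart) continue;  // empty bin stays 0

      float best = -FLT_MAX;
      for (int h = hstart; h < hend; ++h)
        for (int w = wstart; w < wend; ++w)
          best = std::max(best, feature[h * width + w]);
      out[ph * pooled_w + pw] = best;
    }
  }
  return out;
}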

Backward_gpu:

template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }
  const Dtype* bottom_rois = bottom[1]->gpu_data();
  const Dtype* top_diff = top[0]->gpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
  const int count = bottom[0]->count();
  caffe_gpu_set(count, Dtype(0.), bottom_diff);
  const int* argmax_data = max_idx_.gpu_data();
  // NOLINT_NEXT_LINE(whitespace/operators)
  ROIPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, top_diff, argmax_data, top[0]->num(), spatial_scale_, channels_,
      height_, width_, pooled_height_, pooled_width_, bottom_diff, bottom_rois);
  CUDA_POST_KERNEL_CHECK;
}
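The kernel is launched with one thread per element of bottom[0] (count = N x C x H x W), so each thread owns exactly one bottom_diff entry and no atomic additions are needed even when ROIs overlap. A rough sketch of the launch arithmetic, assuming Caffe's usual CAFFE_GET_BLOCKS / CAFFE_CUDA_NUM_THREADS definitions (the thread count shown is illustrative; it depends on the Caffe build):

// Illustrative numbers: bottom[0] is 1 x 256 x 39 x 64.
const int count = 1 * 256 * 39 * 64;         // 638976 bottom elements
const int threads_per_block = 512;           // stands in for CAFFE_CUDA_NUM_THREADS
const int blocks = (count + threads_per_block - 1) / threads_per_block;  // 1248
// ROIPoolBackward then processes one bottom element per thread, indexed by
// blockIdx.x * blockDim.x + threadIdx.x inside CUDA_KERNEL_LOOP.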
The CUDA kernel ROIPoolBackward referenced by the launch above:

template <typename Dtype>
__global__ void ROIPoolBackward(const int nthreads, const Dtype* top_diff,
    const int* argmax_data, const int num_rois, const Dtype spatial_scale,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, Dtype* bottom_diff,
    const Dtype* bottom_rois) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // (n, c, h, w) coords in bottom data; each thread handles one element
    // of the bottom blob.
    int w = index % width;
    int h = (index / width) % height;
    int c = (index / width / height) % channels;
    int n = index / width / height / channels;
    Dtype gradient = 0;
    // Accumulate gradient over all ROIs that pooled this element:
    // 1. Walk over every ROI. If, during the forward pass, the conv_5 element
    //    (n, c, h, w) was the argmax of one of that ROI's pooling units, the
    //    corresponding top_diff entry is added to gradient.
    // 2. bottom_rois may contain ROIs with several different roi_batch_ind
    //    values, but any ROI whose batch index differs from n is skipped.
    for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
      const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5;
      int roi_batch_ind = offset_bottom_rois[0];
      // Skip if ROI's batch index doesn't match n
      if (n != roi_batch_ind) {
        continue;
      }

      int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
      int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
      int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
      int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

      // Skip if ROI doesn't include (h, w)
      const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                           h >= roi_start_h && h <= roi_end_h);
      if (!in_roi) {
        continue;
      }

      int offset = (roi_n * channels + c) * pooled_height * pooled_width;
      const Dtype* offset_top_diff = top_diff + offset;
      const int* offset_argmax_data = argmax_data + offset;

      // Compute feasible set of pooled units that could have pooled
      // this bottom unit

      // Force malformed ROIs to be 1x1
      int roi_width = max(roi_end_w - roi_start_w + 1, 1);
      int roi_height = max(roi_end_h - roi_start_h + 1, 1);

      Dtype bin_size_h = static_cast<Dtype>(roi_height)
                         / static_cast<Dtype>(pooled_height);
      Dtype bin_size_w = static_cast<Dtype>(roi_width)
                         / static_cast<Dtype>(pooled_width);

      // Determine which pooled units could have (n, c, h, w) inside
      // their pooling region.
      int phstart = floor(static_cast<Dtype>(h - roi_start_h) / bin_size_h);
      int phend = ceil(static_cast<Dtype>(h - roi_start_h + 1) / bin_size_h);
      int pwstart = floor(static_cast<Dtype>(w - roi_start_w) / bin_size_w);
      int pwend = ceil(static_cast<Dtype>(w - roi_start_w + 1) / bin_size_w);

      // Clip to the pooled grid so nothing goes out of bounds.
      phstart = min(max(phstart, 0), pooled_height);
      phend = min(max(phend, 0), pooled_height);
      pwstart = min(max(pwstart, 0), pooled_width);
      pwend = min(max(pwend, 0), pooled_width);

      for (int ph = phstart; ph < phend; ++ph) {
        for (int pw = pwstart; pw < pwend; ++pw) {
          if (offset_argmax_data[ph * pooled_width + pw] == (h * width + w)) {
            gradient += offset_top_diff[ph * pooled_width + pw];
          }
        }
      }
    }
    bottom_diff[index] = gradient;
  }
}
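The phstart/phend and pwstart/pwend bounds invert the forward bin assignment: instead of scanning the whole pooled grid, the thread enumerates only the pooled units whose bins could contain (h, w), then checks their argmax entries. A worked example with hypothetical numbers (roi_height = 11, pooled_height = 6):

#include <cmath>
#include <cstdio>

// Which pooled rows could have the ROI-relative row (h - roi_start_h) = 5
// inside their forward-pass bin?
int main() {
  const double bin_size_h = 11.0 / 6.0;  // ~1.83 feature-map rows per bin
  const int dh = 5;                      // h - roi_start_h
  const int phstart = static_cast<int>(std::floor(dh / bin_size_h));      // 2
  const int phend = static_cast<int>(std::ceil((dh + 1) / bin_size_h));   // 4
  // Only bins ph = 2 and ph = 3 (the half-open range [phstart, phend)) have a
  // forward window [floor(ph*bin), ceil((ph+1)*bin)) that covers row 5, so
  // only their argmax entries need to be compared against this element.
  std::printf("candidate bins: [%d, %d)\n", phstart, phend);
  return 0;
}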
