caffe代码阅读6：Filler的实现细节-2016.3.18

来源：互联网发布：ce6.7源码编辑：程序博客网时间：2024/06/08 11:19

一、Filler的作用简介

Filler的作用实际上就是根据proto中给出的参数对权重进行初始化。初始化的方式有很多种，分别为：常量初始化（constant）、高斯分布初始化（gaussian）、positive_unitball初始化、均匀分布初始化（uniform）、xavier初始化、msra初始化、双线性初始化（bilinear）。

二、Filler类的详细介绍

首先了解一下与Filler类相关的第一个函数GetFiller：看懂这个函数，就能对整个Filler类体系一目了然。

// Factory: returns a newly allocated filler matching param.type().
// The caller receives a raw pointer to a heap-allocated object created with
// `new`. An unrecognised type name triggers CHECK(false) with a diagnostic
// message; the trailing NULL return follows that branch.
template <typename Dtype>
Filler<Dtype>* GetFiller(const FillerParameter& param) {
  const std::string& type = param.type();
  if (type == "constant") {
    return new ConstantFiller<Dtype>(param);
  } else if (type == "gaussian") {
    return new GaussianFiller<Dtype>(param);
  } else if (type == "positive_unitball") {
    return new PositiveUnitballFiller<Dtype>(param);
  } else if (type == "uniform") {
    return new UniformFiller<Dtype>(param);
  } else if (type == "xavier") {
    return new XavierFiller<Dtype>(param);
  } else if (type == "msra") {
    return new MSRAFiller<Dtype>(param);
  } else if (type == "bilinear") {
    return new BilinearFiller<Dtype>(param);
  } else {
    CHECK(false) << "Unknown filler name: " << param.type();
  }
  return (Filler<Dtype>*)(NULL);
}

根据给定的参数获取对应的Filler，由该段代码可以看出proto文件里面对于权重可以有哪些指定的初始化方式。

1）基类Filler

template <typename Dtype>class Filler { public: // 构造函数  explicit Filler(const FillerParameter& param) : filler_param_(param) {}  // 析构函数，并且是虚函数  virtual ~Filler() {}  // 纯虚函数，继承的子类必须要实现  virtual void Fill(Blob<Dtype>* blob) = 0; protected:  FillerParameter filler_param_;};  // class Filler

2）继承Filler的类

2-1常量初始化类

// Fills every element of the blob with the constant filler_param_.value().
template <typename Dtype>
class ConstantFiller : public Filler<Dtype> {
 public:
  explicit ConstantFiller(const FillerParameter& param)
      : Filler<Dtype>(param) {}

  virtual void Fill(Blob<Dtype>* blob) {
    // Pointer to the blob's data that the loop below writes through.
    Dtype* data = blob->mutable_cpu_data();
    // Number of elements in the blob.
    const int count = blob->count();
    // The constant every element is set to.
    const Dtype value = this->filler_param_.value();
    // The check fails when the blob has zero elements.
    CHECK(count);
    for (int i = 0; i < count; ++i) {
      data[i] = value;
    }
    // This filler requires sparse to be left at -1.
    CHECK_EQ(this->filler_param_.sparse(), -1)
         << "Sparsity not supported by this Filler.";
  }
};

2-2均匀分布初始化类

// Fills the blob with samples from caffe_rng_uniform, using
// filler_param_.min() and filler_param_.max() as the two bounds.
template <typename Dtype>
class UniformFiller : public Filler<Dtype> {
 public:
  explicit UniformFiller(const FillerParameter& param)
      : Filler<Dtype>(param) {}

  virtual void Fill(Blob<Dtype>* blob) {
    // The check fails when the blob has zero elements.
    CHECK(blob->count());
    // Delegate the sampling to caffe_rng_uniform.
    caffe_rng_uniform<Dtype>(blob->count(), Dtype(this->filler_param_.min()),
        Dtype(this->filler_param_.max()), blob->mutable_cpu_data());
    // Uniform initialization does not support sparsity (sparse must be -1).
    CHECK_EQ(this->filler_param_.sparse(), -1)
         << "Sparsity not supported by this Filler.";
  }
};

2-3高斯分布初始化类（支持稀疏特性）

// Fills the blob with Gaussian samples (mean = filler_param_.mean(),
// standard deviation = filler_param_.std()). When filler_param_.sparse() is
// >= 0, each element is additionally multiplied by a Bernoulli 0/1 mask.
template <typename Dtype>
class GaussianFiller : public Filler<Dtype> {
 public:
  explicit GaussianFiller(const FillerParameter& param)
      : Filler<Dtype>(param) {}

  virtual void Fill(Blob<Dtype>* blob) {
    Dtype* data = blob->mutable_cpu_data();
    // The check fails when the blob has zero elements.
    CHECK(blob->count());
    // Gaussian initialization with the configured mean and standard deviation.
    caffe_rng_gaussian<Dtype>(blob->count(), Dtype(this->filler_param_.mean()),
        Dtype(this->filler_param_.std()), blob->mutable_cpu_data());
    int sparse = this->filler_param_.sparse();
    // sparse must be >= -1; -1 leaves sparsification off.
    CHECK_GE(sparse, -1);
    if (sparse >= 0) {  // sparsification requested
      // Sparse initialization is implemented for "weight" blobs; i.e. matrices.
      // These have num == channels == 1; width is number of inputs; height is
      // number of outputs.  The 'sparse' variable specifies the mean number
      // of non-zero input weights for a given output.
      CHECK_GE(blob->num_axes(), 1);
      // The weight shape is assumed to be (num_outputs x num_inputs), so
      // shape(0) is taken as the number of output units.
      const int num_outputs = blob->shape(0);
      // Probability that an element is kept (non-zero) = sparse / num_outputs;
      // the probability that it is zeroed is 1 - sparse / num_outputs.
      Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs);
      // Allocate a buffer of one int per element to hold the Bernoulli mask.
      rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int)));
      int* mask = reinterpret_cast<int*>(rand_vec_->mutable_cpu_data());
      caffe_rng_bernoulli(blob->count(), non_zero_probability, mask);
      for (int i = 0; i < blob->count(); ++i) {
        // Multiply each element by its Bernoulli sample.
        data[i] *= mask[i];
      }
    }
  }

 protected:
  // Buffer backing the Bernoulli mask used by the sparse path of Fill().
  shared_ptr<SyncedMemory> rand_vec_;
};

2-4PositiveUnitballFiller初始化

不懂的可以看http://math.stackexchange.com/questions/520002/unit-ball-with-p-norm

相当于是一个单位球

// PositiveUnitballFiller首先用均匀分布填充W// 然后将W中的元素按行求和，然后该行每一个的元素都除以该行的和template <typename Dtype>class PositiveUnitballFiller : public Filler<Dtype> { public:  explicit PositiveUnitballFiller(const FillerParameter& param)      : Filler<Dtype>(param) {}  virtual void Fill(Blob<Dtype>* blob) {    Dtype* data = blob->mutable_cpu_data();    DCHECK(blob->count());// 我很奇怪为啥这里用DCHECK    // 先填充均匀分布到权重    caffe_rng_uniform<Dtype>(blob->count(), 0, 1, blob->mutable_cpu_data());    // We expect the filler to not be called very frequently, so we will    // just use a simple implementation    // count / num = 输入的维度    int dim = blob->count() / blob->num();    CHECK(dim);// 检查输入维度是否小于0    for (int i = 0; i < blob->num(); ++i) {// 遍历隐藏单元的个数（或者是输出单元的个数）      Dtype sum = 0;      for (int j = 0; j < dim; ++j) {        sum += data[i * dim + j];//sum += data[i][j] 也就是说要按行求和      }      for (int j = 0; j < dim; ++j) {        data[i * dim + j] /= sum;// 每一行都除以该行的和      }    }    CHECK_EQ(this->filler_param_.sparse(), -1)         << "Sparsity not supported by this Filler.";  }};

2-5 XavierFiller初始化（用于卷积核）

// 这里不明白的就是shape (num, a, b, c) where a * b * c = fan_in and num * b * c = fan_out // 扇入和扇出的定义了// 感谢王峰，后来才知道b*c=kernel size// a是输入的channel// num是输出的channeltemplate <typename Dtype>class XavierFiller : public Filler<Dtype> { public:  explicit XavierFiller(const FillerParameter& param)      : Filler<Dtype>(param) {}  virtual void Fill(Blob<Dtype>* blob) {    CHECK(blob->count());    int fan_in = blob->count() / blob->num();    int fan_out = blob->count() / blob->channels();    Dtype n = fan_in;  // default to fan_in    if (this->filler_param_.variance_norm() ==// 如果参数里面定义了方差归一化则n = 扇入+扇出        FillerParameter_VarianceNorm_AVERAGE) {      n = (fan_in + fan_out) / Dtype(2);    } else if (this->filler_param_.variance_norm() ==        FillerParameter_VarianceNorm_FAN_OUT) {      n = fan_out;    }    Dtype scale = sqrt(Dtype(3) / n);// scale = \frac{sqrt{3}}{n}    // 然后用[-scale,scale]的均匀分布初始化    caffe_rng_uniform<Dtype>(blob->count(), -scale, scale,        blob->mutable_cpu_data());    CHECK_EQ(this->filler_param_.sparse(), -1)         << "Sparsity not supported by this Filler.";  }};

2-6 MSRAFiller初始化方式（用于卷积核）

// MSRA initialization: zero-mean Gaussian samples with standard deviation
// sqrt(2 / n), where n is fan_in, fan_out or their average depending on
// filler_param_.variance_norm().
template <typename Dtype>
class MSRAFiller : public Filler<Dtype> {
 public:
  explicit MSRAFiller(const FillerParameter& param)
      : Filler<Dtype>(param) {}

  virtual void Fill(Blob<Dtype>* blob) {
    CHECK(blob->count());
    int fan_in = blob->count() / blob->num();
    int fan_out = blob->count() / blob->channels();
    Dtype n = fan_in;  // default to fan_in
    if (this->filler_param_.variance_norm() ==
        FillerParameter_VarianceNorm_AVERAGE) {
      n = (fan_in + fan_out) / Dtype(2);
    } else if (this->filler_param_.variance_norm() ==
        FillerParameter_VarianceNorm_FAN_OUT) {
      n = fan_out;
    }
    // Standard deviation is \sqrt{\frac{2}{n}}.
    Dtype std = sqrt(Dtype(2) / n);
    caffe_rng_gaussian<Dtype>(blob->count(), Dtype(0), std,
        blob->mutable_cpu_data());
    CHECK_EQ(this->filler_param_.sparse(), -1)
         << "Sparsity not supported by this Filler.";
  }
};

2-7 BilinearFiller初始化（用于反卷积核）

// 反卷积所用的初始化，不支持稀疏特性 // 没研究过。。。也不知道template <typename Dtype>class BilinearFiller : public Filler<Dtype> { public:  explicit BilinearFiller(const FillerParameter& param)      : Filler<Dtype>(param) {}  virtual void Fill(Blob<Dtype>* blob) {    CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim.";    CHECK_EQ(blob->width(), blob->height()) << "Filter must be square";    Dtype* data = blob->mutable_cpu_data();    // f是宽度除以2    int f = ceil(blob->width() / 2.);    // c的含义不明白    float c = (2 * f - 1 - f % 2) / (2. * f);    for (int i = 0; i < blob->count(); ++i) {      float x = i % blob->width();// x表示列的索引      float y = (i / blob->width()) % blob->height();// 行的索引%宽度      data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));    }    CHECK_EQ(this->filler_param_.sparse(), -1)         << "Sparsity not supported by this Filler.";  }};

三、与Filler类相关类的介绍

因为Filler用到了关于随机数生成的一些方法，下面来看下math_function的相关实现：

（1）高斯分布随机数的生成：

CPU上的实现（直接调用Boost的库了）

template <typename Dtype>void caffe_rng_gaussian(const int n, const Dtype a,                        const Dtype sigma, Dtype* r) {  CHECK_GE(n, 0);  CHECK(r);  CHECK_GT(sigma, 0);  // 直接调用boost中的正太分布了。  boost::normal_distribution<Dtype> random_distribution(a, sigma);  boost::variate_generator<caffe::rng_t*, boost::normal_distribution<Dtype> >      variate_generator(caffe_rng(), random_distribution);  for (int i = 0; i < n; ++i) {    r[i] = variate_generator();  }}

GPU的实现（直接调用CUDA的库了）

// float specialization: asks cuRAND (curandGenerateNormal) to generate n
// normally distributed values with mean mu and standard deviation sigma into
// r, using Caffe::curand_generator(). CURAND_CHECK wraps the call's status.
// NOTE(review): r is presumably a device pointer -- confirm against callers.
template <>
void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma,
                            float* r) {
  CURAND_CHECK(
      curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma));
}

// double specialization: same as above via curandGenerateNormalDouble.
template <>
void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma,
                            double* r) {
  CURAND_CHECK(
      curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma));
}

（2）均匀分布随机数的生成：

CPU：

template <typename Dtype>void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) {  CHECK_GE(n, 0);  CHECK(r);  CHECK_LE(a, b);  // 调用Boost的库  boost::uniform_real<Dtype> random_distribution(a, caffe_nextafter<Dtype>(b));  boost::variate_generator<caffe::rng_t*, boost::uniform_real<Dtype> >      variate_generator(caffe_rng(), random_distribution);  for (int i = 0; i < n; ++i) {    r[i] = variate_generator();  }}

GPU：

// Fills r with n unsigned ints generated by cuRAND (curandGenerate).
void caffe_gpu_rng_uniform(const int n, unsigned int* r) {
  CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n));
}

// float specialization: generates n values with curandGenerateUniform, then
// rescales them by (b - a) and shifts them by a. Each step is skipped when it
// would be a no-op (scale factor 1, offset 0).
template <>
void caffe_gpu_rng_uniform<float>(const int n, const float a, const float b,
                                  float* r) {
  CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n));
  const float width = b - a;
  const bool needs_scaling = (width != 1.0f);
  const bool needs_offset = (a != 0.0f);
  // Scale first, then shift: the order matters.
  if (needs_scaling) {
    caffe_gpu_scal(n, width, r);
  }
  if (needs_offset) {
    caffe_gpu_add_scalar(n, a, r);
  }
}

// double specialization: same procedure via curandGenerateUniformDouble.
template <>
void caffe_gpu_rng_uniform<double>(const int n, const double a, const double b,
                                   double* r) {
  CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n));
  const double width = b - a;
  const bool needs_scaling = (width != 1.0);
  const bool needs_offset = (a != 0.0);
  // Scale first, then shift: the order matters.
  if (needs_scaling) {
    caffe_gpu_scal(n, width, r);
  }
  if (needs_offset) {
    caffe_gpu_add_scalar(n, a, r);
  }
}

（3）伯努利分布（即试验次数n=1时的二项分布）随机数的生成（竟然没有GPU上的代码。。。）

template <typename Dtype>void caffe_rng_bernoulli(const int n, const Dtype p, int* r) {  CHECK_GE(n, 0);  CHECK(r);  CHECK_GE(p, 0);  CHECK_LE(p, 1);  boost::bernoulli_distribution<Dtype> random_distribution(p);  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >      variate_generator(caffe_rng(), random_distribution);  for (int i = 0; i < n; ++i) {    r[i] = variate_generator();  }}void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) {  CHECK_GE(n, 0);  CHECK(r);  CHECK_GE(p, 0);  CHECK_LE(p, 1);  boost::bernoulli_distribution<Dtype> random_distribution(p);  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >      variate_generator(caffe_rng(), random_distribution);  for (int i = 0; i < n; ++i) {    r[i] = static_cast<unsigned int>(variate_generator());  }}

四、总结

主要介绍了Filler中初始化权重各个算法的具体的实现，具体原理可以参考相关的论文。关于Filler其实没啥可以深挖的。已经被挖得差不多了。

3 0