NVCaffe 0.16.2 多 GPU 训练过程代码分析

来源：互联网发布：淘宝大悲咒水晶杯真假编辑：程序博客网时间：2024/06/08 20:11

NVIDA在Caffe的基础上对其进行了优化，这篇文章主要是针对其多 GPU 训练过程中参数更新方式及通讯方法进行相关代码的学习，如有不正确之处请指正。
先放主要的参考文章
1. NVCaffe github 主页
2. 博主 @KFXW 之前写了NVcaffe源码阅读系列文章，给了我很大启发，非常感谢！！
3. 另一位博主 @沤江一流对（Caffe，LeNet）的训练过程作了非常详细的介绍，前后向传播，权值更新几篇文章中让我学到了很多知识，同样非常感谢！！
4. 还参考了网络上其他博主的文章，很抱歉没有记录下来，但在此谢谢各位博主！

好了，进入正题，首先从主函数开始。
主函数main()

int main(int argc, char** argv) {  // Run tool or show usage.  caffe::GlobalInit(&argc, &argv);  // 设置设备   vector<int> gpus;  get_gpus(&gpus);#ifndef CPU_ONLY  if (gpus.size() > 0) {    Caffe::SetDevice(gpus[0]);  }#endif  if (argc == 2) {      // 若训练 caffe 的命令行为 ./build/tools/caffe train      // 则这里 g_brew_map 的 key 值为 argv[1]，也即是 'train'，则实际调用了 train()       return GetBrewFunction(caffe::string(argv[1]))();  // ------->  } else {    gflags::ShowUsageWithFlagsRestrict(argv[0], "tools/caffe");  }}

RegisterBrewFunction 宏在每一个实现主要功能的函数之后将这个函数的名字和其对应的函数指针添加到了 g_brew_map 中, 具体分别为 train()，test()，device_query()，time() 这四个函数。

#define RegisterBrewFunction(func) \namespace { \class __Registerer_##func { \ public: /* NOLINT */ \  __Registerer_##func() { \    g_brew_map[#func] = &func; \  } \}; \__Registerer_##func g_registerer_##func; \}

GetBrewFunction() 函数返回 g_brew_map[name], 即返回需要实现功能的函数。

static BrewFunction GetBrewFunction(const caffe::string& name) {  if (g_brew_map.count(name)) {    return g_brew_map[name];  }}

进入 train() 函数，首先是从 solver.prototxt 文件中读取训练模型参数，并设置 Caffe 的 mode（GPU 还是 CPU）以及设备 id[s]，该部分代码省略，主要分析使用 Solver 类完成整个训练的过程。

int train() {  // 通过调用 SolverRegistry 类的静态成员函数 CreateSolver() 得到一个指向 Solver 的指针来构造 shared_ptr 类型的 solver。  // 这里的 solver_param 就是网络的模型及求解文件 solver.prototxt, 当多个 GPU 时，这里创建的 Solver 为训练过程的 root_solver, device_id = 0 (GPU0)。  shared_ptr<caffe::Solver> solver(caffe::SolverRegistry::CreateSolver(solver_param)); //-----> solver_factory.hpp CreateSolver()  solver->SetActionFunction(signal_handler.GetActionFunction());  // 多 GPU 训练，需要涉及到 GPU 间通信与计算的异步处理问题。  if (gpus.size() > 1)   {    caffe::P2PManager p2p_mgr(solver, gpus.size(), solver->param());                          //-----> parallel.cpp   P2PManager::P2PManager()    p2p_mgr.Run(gpus); //-------> parallel.cpp  P2PManager::Run(const vector<int>& gpus)  }   else {  // gpus.size() <= 1)    LOG(INFO) << "Starting Optimization";    // 调用 Solver 的 Solve() 方法，开始优化。    solver->Solve();   //-------> solver.cpp  Solver::Solve(const char* resume_file)  }  LOG(INFO) << "Optimization Done in " << Caffe::time_from_init();  return 0;}

solver_factory.hpp 创建 Solver。

  static Solver* CreateSolver(const SolverParameter& param, size_t rank = 0U,      Solver* root_solver = NULL) {    const string& type = param.type();    CreatorRegistry& registry = Registry();    CHECK_EQ(registry.count(type), 1) << "Unknown solver type: " << type        << " (known types: " << SolverTypeListString() << ")";    Solver* solver = registry[type](param, rank, root_solver);    return solver;  }

尽管 solver 是一个指向基类 Solver 类型对象的指针，但由于 C++ 多态的特性，solver 这个智能指针调用各个成员函数时会调用到各个子类的函数。
由于 caffe.proto 文件中默认的优化方法为 SGD，所以会实例化一个 SGDSolver 的对象（sgd_solvers.hpp）, SGDSolver 类继承于 Solver 类。
class SGDSolver : public Solver
构造函数为：

  explicit SGDSolver(const SolverParameter& param,      size_t rank = 0U, Solver *root_solver = NULL)      : Solver(param, rank, root_solver) { PreSolve(); }

因此，需要先调用父类 Solver 的构造函数，而 Solver 类中包含 Net 类对象，而 Net 类对象又包含了 Layers 类对象和 Blob 类对象。最终整个初始化的工作大概是：

新建一个 SGDSolver 对象 -> 调用 SGDSolver 类的构造函数 -> 调用 Solver 类的构造函数 -> 新建 Net 类实例 -> 调用 Net 类的构造函数 -> 新建各个 Layer 的实例 -> 调用各个 Layer 类的构造函数 -> 设置每个 Blob，也由此完成整个网络的初始化。

parallel.cpp P2PManager 构造函数
注意： caffe.cpp 中创建的 solver 即为 root_solver

P2PManager::P2PManager(shared_ptr<Solver> root_solver,    int nranks, const SolverParameter& solver_param) :      nranks_(nranks),      syncs_(nranks),      root_solver_(root_solver)

parallel.cpp Run() 函数

void P2PManager::Run(const vector<int>& gpus) {  ......  SolverParameter param = root_solver_->param();  this->shared_ = make_shared<SharedScores<float>>(nranks_);  for (int i = 0; i < gpus.size(); ++i) {    param.set_device_id(gpus[i]);    // 返回一个 P2PSync 类型的 shared_ptr 智能指针 syncs_[i]    // 每个 GPU 对应一个 P2PSync, 用于多 GPU 间的 P2P 异步    syncs_[i] = make_shared<P2PSync>(this, root_solver_, i, gpus.size(), param);                                 // -------> parallel.cpp  P2PSync::P2PSync()#ifndef CPU_ONLY#ifdef USE_NCCL    syncs_[i]->aux_ = &nccl_id_;#else    LOG(FATAL) << "Multi-GPU execution not available - rebuild with USE_NCCL";#endif  // USE_NCCL#endif  // CPU_ONLY    syncs_[i]->shared_ = this->shared_;  }  LOG(INFO)<< "Starting Optimization";  for (int i = 0; i < syncs_.size(); ++i) {    // 开始内部线程    syncs_[i]->StartInternalThread(true, static_cast<uint64_t>(param.random_seed())); // -------> internal_thread.cpp        // InternalThread::StartInternalThread(bool set_cpu_affinity, uint64_t random_seed)  }  for (int i = 0; i < syncs_.size(); ++i) {    syncs_[i]->WaitAll();  }  ......}

P2PSync 类继承自 Solver::Callback 和 InternalThread

class P2PSync : public Solver::Callback, public InternalThread
构造函数：

P2PSync::P2PSync(P2PManager* mgr, shared_ptr<Solver> root_solver,    int rank, int nranks, const SolverParameter& solver_param)    : InternalThread(solver_param.device_id(), rank, 1, false),      mgr_(mgr),      rank_(rank),      nranks_(nranks),      initial_iter_(root_solver->iter()),      solver_(),      root_solver_(root_solver),      solver_param_(solver_param)

InternalThread 构造函数：

InternalThread::InternalThread(int target_device, size_t rank, size_t threads, bool delayed)    : target_device_(target_device),      rank_(rank),      aux_(nullptr),      threads_(threads),      delay_flags_(threads, make_shared<Flag>(!delayed))

InternalThread 开启线程函数
注意：创建 InternalThread 实例时传进的参数 threads = 1，因此这里 threads_.size() = 1

void InternalThread::StartInternalThread(bool set_cpu_affinity, uint64_t random_seed) {  ......  const int solver_count = Caffe::solver_count();  try {    for (size_t id = 0; id < threads_.size(); ++id) {      // 实例化一个 boost::thread 对象给 thread_[id] 指针，该线程的执行的是 entry 函数，实际只有一个线程       threads_[id] = boost::thread(&InternalThread::entry, this, id, target_device_, mode,          random_seed, solver_count, rank_, set_cpu_affinity);  // ------>    }  } catch (std::exception& e) {    LOG(FATAL) << "Thread exception: " << e.what();  }}

线程所要执行的函数 InternalThread::entry

void InternalThread::entry(int thread_id, int device, Caffe::Brew mode, uint64_t random_seed,    int solver_count, size_t rank, bool set_cpu_affinity) {  ......  Caffe::set_mode(mode);  Caffe::set_random_seed(random_seed);  Caffe::set_solver_count(solver_count);  if (threads_.size() == 1) {    InternalThreadEntry();   // ---------> internal_thread.hpp 虚函数，由其子类实现  } else {    InternalThreadEntryN(thread_id);  // ---------> internal_thread.hpp 虚函数，由其子类实现  }}

由于 threads_.size() = 1，因此下一步执行的是 InternalThreadEntry() 函数，该函数由其子类 P2PSync 实现。

parallel.cpp

void P2PSync::InternalThreadEntry() {  if (rank_ == 0) { // GPU0 为 root_solver, root_solver 在 caffe.cpp 的 train() 函数中创建    Caffe::set_root_solver(true);    solver_ = root_solver_;     solver_->root_add_callback(this);  } else { // 为其他 GPU 创建 Solver    Caffe::set_root_solver(false);    solver_.reset(caffe::SolverRegistry::CreateSolver(solver_param_, rank_, root_solver_.get()));  }  solver_->set_callback(this);#ifndef CPU_ONLY#ifdef USE_NCCL  ncclUniqueId* nccl_id = reinterpret_cast<ncclUniqueId*>(this->aux_);  soft_barrier();  NCCL_CHECK(ncclCommInitRank(&nccl_comm_, nranks_, *nccl_id, rank_));  soft_barrier();#endif#endif  init_streams();  //  调用 Solver 的 Solve() 方法，开始优化。  if (solver_->Solve()) {    mgr_->EarlyCancel(this);  }}

solver.cpp Solver::Solve() 主要是调用了Step函数完成迭代

bool Solver::Solve(const char* resume_file) {  ......  int start_iter = iter_;    ......  // 核心代码  // 参数 param.max_iter() 为 solver.prototxt 中的 max_iter, 参数 iter_ 在初始化 Solver 时被初始化为 0。  Step(param_.max_iter() - iter_);  ......  return false;}

进入 Step() 函数

void Solver::Step(int iters) {  //设置开始的迭代次数和结束的迭代次数  const int start_iter = iter_;  const int stop_iter = iter_ + iters;  // 输出的 loss 为前 average_loss 次 loss 的平均值，在 solver.prototxt 里设置，默认为 1，  // losses 存储之前的 average_loss 个 loss, smoothed_loss_ 为最后要输出的均值    int average_loss = this->param_.average_loss();  losses_.clear();  smoothed_loss_ = 0;  const Caffe::Brew mode = Caffe::mode();  const int solver_count = Caffe::solver_count();  const bool root_solver = this->is_root();  net_->set_solver(this);#ifndef CPU_ONLY  for (const shared_ptr<Blob>& param : net_->learnable_params()) {    // To prevent allocations inside on_start call:    param->allocate_data(mode == Caffe::GPU);  }  // 初始化网络的可学习(更新)参数的梯度数值存储空间，全部清0  net_->InitializeLearnableDiffSpace();  // 当有多个 GPU 设备时  if (solver_count > 1) {     // we need to sync all threads before starting, otherwise some cuda init,    // malloc or other cuda stuff could interlock with in-loop cuda GPU sync    // called in on_start.    // 需要在开始前同步所有的线程    callback_soft_barrier();    {      unique_ptr<unique_lock<shared_mutex>> lock;      if (root_solver) {        lock.reset(new unique_lock<shared_mutex>(GPUMemory::read_write_mutex()));      }      callback_soft_barrier();      // on_start() 使用 ncclBcast 并辅以一些同步函数，将 net 分发到各个 GPU 设备上      callback_->on_start(net_->learnable_params()); // -------> parallel.cpp P2PSync::on_start()    }    callback_soft_barrier();    LOG(INFO) << "Starting Optimization on GPU " << Caffe::current_device();  }  const bool use_multi_gpu_testing = Caffe::solver_count() > 1;  const string mgpu_str = use_multi_gpu_testing ? "[MultiGPU] " : "";#else  const bool use_multi_gpu_testing = false;  const string mgpu_str;#endif  uint64_t random_seed = param_.random_seed() >= 0 ?      static_cast<uint64_t>(param_.random_seed()) : Caffe::next_seed();  // *** 在循环迭代之前开启了一个新线程 reduce_thread_, 专门负责权重的更新，该线程调用 Solver::Reduce() 函数，以及进一步的 Net::ReduceAndUpdate() 函数，实现多 GPU 之间异步并行更新权重。  reduce_thread_.reset(new boost::thread(&Solver::Reduce, this,      Caffe::current_device(), mode, random_seed, solver_count, root_solver));  // 开始迭代  while (iter_ < stop_iter) {    if (param_.snapshot_diff()) {      net_->ClearParamDiffs();  // 权值梯度清 0    }  // we clean them in ApplyUpdate otherwise    // Just started or restored?    const bool first_loop = iter_ == 0 || iterations_last_ < 0;    // 测试    ......TestAll(); // 代码略    const bool display = this->display();    net_->set_debug_info(display && param_.debug_info());    // accumulate the loss and gradient    float loss = 0.F;    if (first_loop) {      iterations_last_ = iter_;      iteration_timer_.Start();      init_flag_.set();    }    iteration_start_signal();    // iter_size 是在 solver.prototxt 里设置（默认为 1），每次迭代都会以 batch_size 大小计算梯度和 loss，最后再取 iter_size 次迭代的平均。    // 当进行了 iter_size 次迭代时，参数 apply_update = true。可以看成进行 iter_size 次迭代训练，或者说训练 iter_size*batch_size 张图片时会更新一次参数。    // 这样的好处是比一次使用大的 batch_size 要节省存储。可在当 batch_size 设置的过大，导致GPU的显存不够（出现 out_of_memory）的时候使用。    for (int i = 0; i < param_.iter_size(); ++i) {      // *** 前向传播和反向传播，前向用于计算模型的最终输出和 Loss, 后向用于计算每一层网络和参数的梯度。       loss += net_->ForwardBackward(i + 1 == param_.iter_size());              //-------> net.cpp  Net::ForwardBackward(bool apply_update)      if (i == 0) {        if (first_loop) {          iter0_flag_.set();          net_->wait_layers_init();        }        iter_size_complete_ = true;      }    }    loss /= param_.iter_size(); // 最终的 loss 为 iter_size 次迭代的平均    iteration_wait();    if (requested_early_exit_) {      total_lapse_ += iteration_timer_.Seconds();      break;    }    // average the loss across iterations for smoothed reporting    // 对 loss 作平滑    // 由于 Caffe 的训练方式是 SGD, 我们无法把所有的数据同时放入模型进行训练，    // 那么部分数据产生的 Loss 就可能会和全样本的平均 Loss 不同，    // 在必要时候将 loss 和历史过程中更新的 loss 求平均就可以减少 Loss 的震荡问题    UpdateSmoothedLoss(loss, start_iter, average_loss);    if (display || iter_ <= 2 || iter_ + 1 >= stop_iter) {      ...... display // 代码省略    }    // Increment the internal iter_ counter -- its value should always indicate    // the number of times the weights have been updated.    ++iter_;    SolverAction::Enum request = GetRequestedAction();    // Save a snapshot if needed.    if ((param_.snapshot()         && iter_ % param_.snapshot() == 0         && Caffe::root_solver()) ||         (request == SolverAction::SNAPSHOT)) {      Snapshot();    }    if (SolverAction::STOP == request) {      requested_early_exit_ = true;      total_lapse_ += iteration_timer_.Seconds();      // Break out of training loop.      break;    }  }  Finalize();}

在 Step() 函数中实现了多 GPU 间计算与权重更新的异步模型，表现在以下
1. 在循环迭代之前开启了一个新线程 reduce_thread_, 专门负责权重的更新，该线程调用 Solver::Reduce() 函数，以及进一步的 Net::ReduceAndUpdate() 函数，实现多 GPU 之间异步并行更新权重。

reduce_thread_.reset(new boost::thread(&Solver::Reduce, this, Caffe::current_device(), mode, random_seed, solver_count, root_solver));

2. 对 net.cpp 中 ForwardBackward(bool apply_update) 函数，主要是 Backward(bool apply_update) 的改进。net.cpp维护了一个异步队列，该队列存储的元素是需要更新的参数的id。
BlockingQueue<int> reduction_queue_;

首先来看前向计算和反向计算部分，然后再看权重更新部分。
net.cpp 前向计算和反向计算函数

float Net::ForwardBackward(bool apply_update) {  float loss;  Forward(&loss); // 前向计算  Backward(apply_update); // 反向计算  return loss;}

重点关注反向计算 Backward() 函数进而调用 BackwardFromToAu() 函数。经过网络反向计算完一个层的梯度之后，且该层的参数需要被更新时，将需要更新的参数的 id 存入队列 reduction_queue_ 中。

void Net::Backward(bool apply_update) {  BackwardFromToAu(layers_.size() - 1, 0, apply_update);}void Net::BackwardFromToAu(int start, int end, bool apply_update) {  for (int i = start; i >= end; --i) {    // 对每一层进行反向计算，调用不同层的 Backward() 函数来计算每层的梯度。      layers_[i]->Backward(top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);    if (!apply_update) {      continue;    }    for (int j = 0; j < layers_[i]->blobs().size(); ++j) {      if (layers_[i]->skip_apply_update(j)) {        continue;      }      int param_id = layer_index_params_[make_pair(i, j)];      if (param_owners_[param_id] < 0) {        // 计算完一个层的数据，且该层的参数需要被更新时，将需要更新的参数的 id 存入队列 reduction_queue_ 中。        // 比如 LeNet 需要更新的参数有 4 个，id 为 0-3，reduction_queue_ 队列中将 push 进 0-3。AlexNet 需要更新的参数有 16 个，id 为 0-15，reduction_queue_ 队列中将 push 进 0-15。        reduction_queue_.push(learnable_param_ids_[param_id]);       }  // leave it to the owner otherwise    }  }  if (apply_update) {    // 在训练完 batch_size * iter_size 张图片后，插入 END_OF_ITERATION 标识符    reduction_queue_.push(END_OF_ITERATION);  }}

下面再来看参数更新的过程，线程 reduce_thread_ 负责权重的更新，调用 solver.cpp 中的 Reduce() 函数

void Solver::Reduce(int device, Caffe::Brew mode, uint64_t random_seed,    int solver_count, bool root_solver) {  Caffe::set_mode(mode);#ifndef CPU_ONLY  if (Caffe::mode() == Caffe::GPU) {    CUDA_CHECK(cudaSetDevice(device));#ifndef NO_NVML    nvml::setCpuAffinity(rank_);#endif  }#endif  Caffe::set_random_seed(random_seed);  Caffe::set_solver_count(solver_count);  Caffe::set_root_solver(root_solver);  net_->ReduceAndUpdate(); // ---------> net.cpp  Net::ReduceAndUpdate()}

进一步的 net.cpp 中的 ReduceAndUpdate() 函数。
在使用多个 GPUs 时, 我们必须在每次迭代后进行归约。为了达到更好的性能, 我们将多个 layers 组合到 buckets 中。Net 的参数 reduce_buckets 用于设置 buckets 的大概数量（默认为 6 ）。
reduce_buckets 的定义 caffe.proto 文件中：

  // While using multiple GPUs we have to run reduction process after every iteration.  // For better performance we unify multiple layers in buckets.  // This parameter sets approximate number of buckets to combine layers to.  // Default value is good for majority of nets.  // 在使用多个 GPUs 时, 我们必须在每次迭代后进行归约。  // 为了更好的性能, 我们将多个 layers 组合到 buckets 中。  // 此参数设置要组合 layers 的 buckets 的大概数量。  // 默认值（default = 6）可以适用于大多数网络。  optional int32 reduce_buckets = 18 [default = 6];

随后会利用这个参数进一步得到参数 max_params_per_bucket （每个 bucket 中最多可存的参数的个数）和参数 bucket_space_count （每个 bucket 所占的空间大小）来设置当 reduction_queue_ 累计了多少待处理参数时调用一次权重更新函数。
ReduceAndUpdate() 线程轮询 reduction_queue_ 中的元素，并记录所到达的元素所占空间大小（参数 received_count）。发现队列中有待处理参数信息，且满足一定要求（比如 received_count >= bucket_space_count，详细条件见代码中 if 语句）时便调用归约函数 ReduceBucket() ，并调用实例化的 solver 中的 ApplyUpdate() 函数（例如 sgd_solver.cpp 中的实现）进行参数更新。

Net::ReduceAndUpdate() 函数

void Net::ReduceAndUpdate() {#ifndef CPU_ONLY  cudaStream_t stream;  CUBLAS_CHECK(cublasGetStream(handle, &stream));  int max_params_per_bucket = 0;  size_t bucket_space_count = 0UL;  if (Caffe::solver_count() > 1) {    CHECK_GT(reduce_buckets_, 0);    max_params_per_bucket = (int) (learnable_params_.size() + 1UL) / (int) reduce_buckets_; // 每个 bucket 中最多可存的参数的个数，例如，AlexNet 参数个数为 16 个，reduce_buckets = 6，则 max_params_per_bucket = 2    if (max_params_per_bucket < 1) {      max_params_per_bucket = 1;    }    bucket_space_count =        size_t((float)(learnable_space_count_ + 1UL) /            learnable_params_ptrs_.size() * max_params_per_bucket); // 每个 bucket 所占的空间大小  }  int id_from = -1, id_to = -1;  size_t received_count = 0U; // reduction_queue_ 队列中待处理参数的所占空间大小  std::list<int> au_ids;#endif  const bool clear_grads = !solver_->param().snapshot_diff();  while (true) {    int param_id = reduction_queue_.pop(); // 将队列 reduction_queue_ 中的元素取出    SolverAction::Enum request = solver_->GetRequestedAction();    if (SolverAction::STOP == request) {#ifndef CPU_ONLY      CUDA_CHECK(cudaStreamSynchronize(stream));#endif      solver_->request_early_exit();      break;    }    if (param_id == END_OF_BATCH) {#ifndef CPU_ONLY      CUDA_CHECK(cudaStreamSynchronize(stream));#endif      break;    }    if (param_id != END_OF_ITERATION) {      if (Caffe::solver_count() > 1) { // 当有多个 GPU 时#ifndef CPU_ONLY        if (max_params_per_bucket == 1) { // 每个 bucket 中最多只有一个参数时          Reduce(param_id); // 调用 Reduce() 归约函数，这里不是很懂，因为调用了这里之后，仍然会调用下边的 ReduceBucket() 函数。。。不知道为啥要加这一步呢？？？         }#else        NO_GPU;#endif      } else { // 当 Caffe::solver_count() <= 1, 即只使用 CPU 或只有一个 GPU 时，直接调用 ApplyUpdate() 权值更新函数        if (global_grad_scale_ != 1.F) {          this->learnable_params()[param_id]->scale_diff(1.F/global_grad_scale_, handle, true);        }        solver_->ApplyUpdate(param_id, handle, clear_grads);        continue;      }    }#ifndef CPU_ONLY    //    if (learnable_params_.size() > 0 && Caffe::solver_count() > 1) {      // Is bucket big enough? Done with iteration? Next param_id doesn't fit?      // Type changed?      // 归约及权重更新判断条件：bucket 够大吗？是否完成迭代？      // 下一个 param_id 不适合？类型是否已更改？      if (received_count >= bucket_space_count ||          (param_id == END_OF_ITERATION && id_from != -1) || // leftovers          (id_from != -1 && param_id < id_from - 1) ||          (id_to != -1 && param_id > id_to + 1) ||          (id_from != -1 && learnable_params_[id_from]->diff_type()                         != learnable_params_[param_id]->diff_type())) {        Type dtype = learnable_params_[id_from]->diff_type();        size_t count = 0U;        for (int i = id_from; i <= id_to; ++i) {          count += even(learnable_params_[i]->count());        }        ReduceBucket(count, dtype, learnable_params_ptrs_[id_from]); // 调用 ReduceBucket() Bucket归约函数        for (int i : au_ids) {          if (global_grad_scale_ != 1.F) {            this->learnable_params()[i]->scale_diff(1.F/ global_grad_scale_, handle, true);          }          solver_->ApplyUpdate(i, handle, clear_grads); // 调用 ApplyUpdate() 权值更新函数        }        au_ids.clear();        // 归约及权值更新后，若迭代没有结束，则重新设置 id_from 和 id_to 以及当前 param_id 的 received_count。        if (param_id != END_OF_ITERATION) {          id_from = id_to = param_id;          received_count = (size_t) even(learnable_params_[param_id]->count());          au_ids.emplace_back(param_id);        }      } else if (param_id != END_OF_ITERATION) { // 不满足权值更新条件，同时迭代没有结束，则重新设置 id_from 和 id_to 并累计 received_count        if (id_from == -1 || param_id < id_from) {          id_from = param_id;        }        if (id_to == -1 || param_id > id_to) {          id_to = param_id;        }        received_count += even(learnable_params_[param_id]->count());        au_ids.emplace_back(param_id);      }    }#endif    // 迭代结束    if (param_id == END_OF_ITERATION) {#ifndef CPU_ONLY      CUDA_CHECK(cudaStreamSynchronize(stream));      received_count = 0U;      id_from = id_to = -1;      au_ids.clear();#endif      solver_->iteration_complete_signal();    }  }  DLOG(INFO) << "[" << Caffe::current_device() << "] Leaving ReduceAndUpdate thread";}

上述函数中涉及到两个变量：learnable_space_count_ 和 learnable_params_ptrs_，这两个变量是通过 Net::InitializeLearnableDiffSpace() 这个函数来设置的，先看这个函数。

void Net::InitializeLearnableDiffSpace() {  learnable_space_count_ = 0;  size_t workspace_size = 0UL;  // vector<void*> learnable_params_ptrs_;  // vector<shared_ptr<Blob>> learnable_params_  learnable_params_ptrs_.resize(learnable_params_.size());  for (int i = 0; i < learnable_params_.size(); ++i) {    learnable_params_[i]->lock_diff();    learnable_space_count_ += even(learnable_params_[i]->count()); // learnable_space_count_ 中存放的是参数的总个数    workspace_size += even(learnable_params_[i]->count()) *        tsize(learnable_params_[i]->diff_type());  // workspace_size 为所有参数所占空间  }  // Size have at least one byte, otherwise cudaMalloc fails if net has no  // learnable parameters. Times two.  if (workspace_size < 2) {    workspace_size = 2;  }  // GPUMemory::Workspace learnable_space_;  learnable_space_.reserve(workspace_size); // 为 learnable_space_ 分配 workspace_size 的 GPU 内存   gpu_memory.hpp  unsigned char* ptr = reinterpret_cast<unsigned char*>(learnable_space_.data());  // 返回 learnable_space_ 的指针  caffe_gpu_memset(workspace_size, 0, ptr);  // 用 0 初始化空间  for (int i = 0; i < learnable_params_.size(); ++i) {    learnable_params_[i]->set_gpu_diff(static_cast<void*>(ptr));    learnable_params_ptrs_[i] = ptr; // 每个参数的初始位置使用 learnable_params_ptrs_ 这个指针数组保存了起来    ptr += even(learnable_params_[i]->count()) * tsize(learnable_params_[i]->diff_type()); // ptr 指针指向下一个参数  }}

gpu_memory.hpp Workspace 结构体的函数 reserve(), 进一步调用 try_reserve() 函数。

    void reserve(size_t size, int device = current_device()) {      if (!try_reserve(size, device))        {        LOG(FATAL) << "Out of memory: failed to allocate " << size            << " bytes on device " << device;      }    }

gpu_memory.cpp

bool GPUMemory::Workspace::try_reserve(size_t size, int device) {  bool status = true;  if (size > size_ || ptr_ == nullptr) {    release();    if (device != INVALID_DEVICE) {      device_ = device;  // switch from default to specific one    }    status = mgr_.try_allocate(&ptr_, size, device_); // 调用 try_allocate() 函数分配内存在制定的设备上    if (status) {      CHECK_NOTNULL(ptr_);      size_ = size;    }  }  return status;}

最终会调用 gpu_memory.cpp 文件中的 try_allocate 函数进行分配内存操作，这部分代码不在此进行说明。

bool GPUMemory::Manager::try_allocate(void** ptr, size_t size, int device, int group)

回到归约函数 Reduce() 和 ReduceBucket()

#ifndef CPU_ONLYvoid Net::Reduce(int param_id) {  solver_->callback()->reduce_barrier();  {    unique_ptr<unique_lock<shared_mutex>> lock;    if (solver_->is_root()) {      lock.reset(new unique_lock<shared_mutex>(GPUMemory::read_write_mutex()));    }    solver_->callback()->reduce_barrier();    solver_->callback()->allreduce(param_id); //-------->solver.hpp virtual void allreduce(int param_id) = 0                    // -------->parallel.cpp  P2PSync::allreduce(int param_id)    solver_->callback()->reduce_barrier();  }  this->learnable_params()[param_id]->gpu_scale_diff(1.F / Caffe::solver_count(),      solver_->callback()->cublas_handle(), true);  // Also need to barrier to make sure lock isn't undone  // until all have completed, but the current nature of  // NCCL makes this unnecessary.  // solver_->callback()->reduce_barrier();}void Net::ReduceBucket(size_t count, Type bucket_type, void* bucket) {  solver_->callback()->reduce_barrier();  {    unique_ptr<unique_lock<shared_mutex>> lock;    if (solver_->is_root()) {      lock.reset(new unique_lock<shared_mutex>(GPUMemory::read_write_mutex()));    }    solver_->callback()->reduce_barrier();    solver_->callback()->allreduce_bucket(count, bucket, bucket_type);  //-------->solver.hpp virtual void allreduce_bucket(int count, void* bucket, Type type) = 0// -------->parallel.cpp  P2PSync::allreduce_bucket(int count, void* bucket, Type type)    solver_->callback()->reduce_barrier();  }  Tensor::gpu_scal(count, bucket_type, bucket, 1.F / Caffe::solver_count(),      solver_->callback()->cublas_handle(), true);}#endif

parallel.cpp allreduce() 和 allreduce_bucket()

void P2PSync::allreduce(int param_id) {#ifndef CPU_ONLY#ifdef USE_NCCL  const shared_ptr<Blob>& param = solver_->net()->learnable_params()[param_id];  NCCL_CHECK(ncclAllReduce(param->current_diff_memory(true),      param->current_mutable_diff_memory(true),      even(param->count()),      nccl::nccl_type(param->diff_type()),      ncclSum,      nccl_comm_,      comm_stream_->get()));  CUDA_CHECK(cudaStreamSynchronize(comm_stream_->get()));#endif  // USE_NCCL#endif  // CPU_ONLY}void P2PSync::allreduce_bucket(int count, void* bucket, Type type) {#ifndef CPU_ONLY#ifdef USE_NCCL  NCCL_CHECK(ncclAllReduce(bucket, bucket, count, nccl::nccl_type(type),                           ncclSum, nccl_comm_, comm_stream_->get()));  CUDA_CHECK(cudaStreamSynchronize(comm_stream_->get()));#endif  // USE_NCCL#endif  // CPU_ONLY}

ncclAllReduce() 的定义在 nccl.h 文件中，实现多个 GPU 间的全归约通信。

/* Reduction opperation selector */typedef enum { ncclSum        = 0,               ncclProd       = 1,               ncclMax        = 2,               ncclMin        = 3,               nccl_NUM_OPS   = 4 } ncclRedOp_t;/* Reduces data arrays of length count in sendbuff using op operation, and leaves * identical copies of result on each GPUs recvbuff. * Sendbuff and recvbuff are assumed to reside on the same device. * Must be called separately for each communicator in communicator clique. */ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, int count,    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);

权值更新函数，调用 SGDSolver 类的 ApplyUpdate() 函数

具体 SGD 的实现原理及公式这里略过，只分析代码。

sgd_solver.cpp

template<typename Dtype>void SGDSolver<Dtype>::ApplyUpdate(int param_id, void* handle, bool clear_grads) {  // 获取该轮迭代的学习率(learning rate)  float rate = GetLearningRate();   / 在计算当前梯度的时候，如果该值超过了阈值 clip_gradients，则将梯度直接设置为该阈值。    // clip_gradient 的引入是为了处理 gradient explosion 的问题。  // 当在一次迭代中权重的更新过于迅猛的话，很容易导致 loss divergence。  // clip_gradient 的直观作用就是让权重的更新限制在一个合适的范围。  ClipGradients(handle);  // 归一化, iter_size 大于 1 时梯度值再除以 iter_size  Normalize(param_id, handle);  // 正则化  Regularize(param_id, handle);  // 计算更新值  ComputeUpdateValue(param_id, handle, rate, clear_grads);}

正则化

template<typename Dtype>void SGDSolver<Dtype>::Regularize(int param_id, void* handle) {  if (Caffe::mode() == Caffe::CPU) {    // 获取所有要优化的参数      const vector<shared_ptr<Blob>>& net_params = this->net_->learnable_params();    // 获取所有要优化的参数的权重衰减向量      const vector<float>& net_params_weight_decay = this->net_->params_weight_decay();    // 获取网络模型整体的权重衰减    float weight_decay = this->param_.weight_decay();    // 获取网络的正则化类型，L1或者L2     string regularization_type = this->param_.regularization_type();    // 每一个参数的权重衰减等于每个参数的权重衰减乘以网络整体的权重衰减      float local_decay = weight_decay * net_params_weight_decay[param_id];    if (local_decay) {      if (regularization_type == "L2") {        // add weight decay          // 执行正则化，L2的梯度 diff_= weight_decay * data_ + diff_          // caffe_axpy means ax_plus_y. i.e., Y = alpha*X + Y        // template <typename Dtype>         // void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y);        caffe_axpy<Dtype>(net_params[param_id]->count(), local_decay,            net_params[param_id]->cpu_data<Dtype>(),            net_params[param_id]->mutable_cpu_diff<Dtype>());      } else if (regularization_type == "L1") {        caffe_cpu_sign<Dtype>(net_params[param_id]->count(),            net_params[param_id]->cpu_data<Dtype>(), temp_[param_id]->mutable_cpu_data());        caffe_axpy<Dtype>(net_params[param_id]->count(), local_decay, temp_[param_id]->cpu_data(),            net_params[param_id]->mutable_cpu_diff<Dtype>());      } else {        LOG(FATAL) << "Unknown regularization type: " << regularization_type;      }    }  } else if (Caffe::mode() == Caffe::GPU) {#ifndef CPU_ONLY    //Fused with ComputeUpdateValue#else    NO_GPU;#endif  } else {    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();  }}

计算更新值

template<typename Dtype>voidSGDSolver<Dtype>::ComputeUpdateValue(int param_id, void* handle, float rate, bool clear_grads) {  shared_ptr<Blob> param = this->net_->learnable_params()[param_id];  // history_ 存储了上一次的梯度  shared_ptr<TBlob<Dtype>> history = history_[param_id];  // 获取所有参数对应的 learning_rate 的 vector  const vector<float>& net_params_lr = this->net_->params_lr();  // 获取momentum值  float momentum = GetMomentum();  // 实际的 learning_rate 为全局的 learning_rate 乘以每个参数对应的 lr_mult  // local_rate = global_rate * lr_mult  // lr_mult 为该层学习率因子，在 train_test.prototxt 中设置  float local_rate = rate * net_params_lr[param_id];  // Compute the update to history, then copy it to the parameter diff.  if (Caffe::mode() == Caffe::CPU) {    // history_ = learning_rate*diff_ + momentum*history_    caffe_cpu_axpby<Dtype>(param->count(), local_rate, param->cpu_diff<Dtype>(), momentum,        history->mutable_cpu_data());    // 把当前的梯度拷贝给参数 Blob 的 diff_    caffe_copy<Dtype>(param->count(), history->cpu_data(), param->mutable_cpu_diff<Dtype>());    param->Update(); // 参数更新    if (clear_grads) {      param->set_diff(0.F);    }  } else if (Caffe::mode() == Caffe::GPU) {#ifndef CPU_ONLY    const std::string& regularization_type = this->param_.regularization_type();    const float decay = local_decay(param_id);    const Type gtype = param->diff_type();    // 调用 sgd_reg_update_all_and_clear_gpu() 函数    if (gtype == tp<float16>()) {      sgd_reg_update_all_and_clear_gpu<float16, Dtype>(param->count(),          param->mutable_gpu_diff<float16>(),          param->mutable_gpu_data<Dtype>(),          history->mutable_gpu_data(),          momentum, local_rate, regularization_type, decay,  handle, clear_grads);    } else if (gtype == tp<float>()) {      sgd_reg_update_all_and_clear_gpu<float, Dtype>(param->count(),          param->mutable_gpu_diff<float>(),          param->mutable_gpu_data<Dtype>(),          history->mutable_gpu_data(),          momentum, local_rate, regularization_type, decay,  handle, clear_grads);    } else if (gtype == tp<double>()) {      sgd_reg_update_all_and_clear_gpu<double, Dtype>(param->count(),          param->mutable_gpu_diff<double>(),          param->mutable_gpu_data<Dtype>(),          history->mutable_gpu_data(),          momentum, local_rate, regularization_type, decay,  handle, clear_grads);    } else {      LOG(FATAL) << "Gradient type " << Type_Name(gtype) << " is not supported";    }#else    NO_GPU;#endif  } else {    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();  }}

sgd_reg_update_all_and_clear_gpu() 函数的定义：

#ifndef CPU_ONLYtemplate<typename Gtype, typename Wtype>void sgd_reg_update_all_and_clear_gpu(int N,    Gtype* g, Wtype* w, Wtype* h,    float momentum, float local_rate, const std::string& regularization_type, float local_decay,    void* handle, bool clear_grads);#endif

sgd_reg_update_all_and_clear_gpu 函数的具体实现是在 sgd_solver.cu 文件中

template<typename Gtype, typename Wtype>void sgd_reg_update_all_and_clear_gpu(int N,  Gtype* g, Wtype* w, Wtype* h,  float momentum, float local_rate, const std::string& reg_type, float local_decay,  void* handle,  bool clear_grads) {  cublasHandle_t cublas_handle =      handle == nullptr ? Caffe::cublas_handle() : reinterpret_cast<cublasHandle_t>(handle);  cudaStream_t stream;  CUBLAS_CHECK(cublasGetStream(cublas_handle, &stream));  // NOLINT_NEXT_LINE(whitespace/operators)  SGDRegUpdateAllAndClear<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, 0, stream>>> (N,    g, w, h,    momentum, local_rate, local_decay, reg_type == "L2",  clear_grads);  CUDA_POST_KERNEL_CHECK;  CUDA_CHECK(cudaStreamSynchronize(stream));}

阅读全文

1 0