Notes on LR for Multiclass Classification
1. From a probabilistic point of view, the posterior probability of a sample is inferred as (PRML eq. 4.62-4.63):

    p(Ck|x) = p(x|Ck)p(Ck) / Σj p(x|Cj)p(Cj) = exp(ak) / Σj exp(aj)

where:

    ak = ln p(x|Ck)p(Ck)

Eq. 4.63 can take a fairly concise form, e.g. a linear expression.
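One practical detail not in the original note: computing exp(ak) directly can overflow for large activations (the training code below does exactly that); the usual fix is to subtract the largest activation before exponentiating, which leaves the softmax unchanged. A minimal C++ sketch (assumes a non-empty activation vector):

#include <algorithm>
#include <cmath>
#include <vector>

// Numerically stable softmax: subtracting the max activation before
// exponentiating leaves the result unchanged but avoids exp() overflow.
std::vector<double> Softmax(const std::vector<double>& a) {
  const double max_a = *std::max_element(a.begin(), a.end());
  std::vector<double> p(a.size());
  double total = 0.0;
  for (size_t k = 0; k < a.size(); ++k) {
    p[k] = std::exp(a[k] - max_a);
    total += p[k];
  }
  for (size_t k = 0; k < a.size(); ++k) {
    p[k] /= total;
  }
  return p;
}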
2. Assume p(x|Ck) is Gaussian, with a covariance Σ shared across classes.
Then ln p(x|Ck)p(Ck) reduces to an expression that is linear in x:

    ak(x) = wk'x + wk0,  where  wk = Σ^{-1}μk  and  wk0 = -1/2 μk'Σ^{-1}μk + ln p(Ck)
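A step the note leaves implicit (my addition, following PRML 4.68-4.70): expand the Gaussian log-density and observe that the quadratic term does not depend on the class, so it cancels in the softmax:

\ln p(x|\mathcal{C}_k)p(\mathcal{C}_k)
  = -\tfrac{1}{2}x^{\mathsf T}\Sigma^{-1}x
    + \mu_k^{\mathsf T}\Sigma^{-1}x
    - \tfrac{1}{2}\mu_k^{\mathsf T}\Sigma^{-1}\mu_k
    + \ln p(\mathcal{C}_k) + \mathrm{const}

Only the last three k-dependent terms survive, which gives the linear form ak(x) above. If each class had its own Σk, the quadratic term would not cancel and the decision boundaries would be quadratic instead of linear.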
3. Solving for the model parameters:
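The parameters are fit by gradient descent on the cross-entropy error with an L2 penalty; the gradient is the standard result (PRML eq. 4.108-4.109), and this update is exactly what TrainInternal in the code below implements:

E(w) = -\sum_n \sum_k t_{nk} \ln y_{nk}, \qquad y_{nk} = p(\mathcal{C}_k \mid x_n)

\nabla_{w_k} E = \sum_n \left( y_{nk} - t_{nk} \right) x_n

w_k \leftarrow w_k - \alpha \left( \nabla_{w_k} E + \lambda\, w_k \right)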
4. Fundamentally, the probabilistic route to classification is the more trustworthy one: intuitively, if points cluster densely in some region, the probability of the corresponding class is high there. With a squared-error criterion, the distance does not actually measure the distance to a class.
SVC does use a similar notion of distance, but SVC keeps only the support vectors and projects into a high-dimensional space.
5. sample code:
#include <string>
#include <vector>
#include <cmath>
#include <map>

#include "base/flags.h"
#include "base/string_util.h"
#include "utils/hash_tables.h"
#include "common/file/simple_line_reader.h"

DEFINE_string(train_path, "./test.txt", "training file");
DEFINE_double(lambda, 0.001, "the weight of regularization");
DEFINE_double(alpha, 0.1, "the learning rate");
DEFINE_int32(n, 5000, "iteration times");

// Original data sample: a label plus sparse named features.
struct DataSample {
  std::string label;
  double predict_prob;
  utils::hash_map<std::string, double> features;
  void AddFeature(const std::string& fn, const double& v) { features[fn] = v; }
};

// Inner class labels are [0, 1, ..., label_count_ - 1].
class TrainningDataSet {
 public:
  TrainningDataSet() {
    label_count_ = 0;
    inner_feature_map_["cb"] = 0;  // "cb" = constant bias, always dimension 0
    outer_feature_map_[0] = "cb";
    feature_count_ = 1;
  }

  // Line format: "A 1:0.2 2:0.4 3:77 4:0.3"
  bool LoadSamplesFromFile(const std::string& file_path) {
    file::SimpleLineReader line_reader;
    line_reader.OpenOrDie(file_path);
    std::vector<std::string> lines;
    line_reader.ReadLines(&lines);
    for (size_t i = 0; i < lines.size(); ++i) {
      std::vector<std::string> parts;
      SplitString(lines[i], ' ', &parts);
      if (parts.empty()) {
        continue;
      }
      DataSample sample;
      sample.label = parts[0];
      AddLabel(parts[0]);
      for (size_t j = 1; j < parts.size(); ++j) {
        std::vector<std::string> fn_v;
        SplitString(parts[j], ':', &fn_v);
        if (fn_v.size() != 2) {
          continue;
        }
        double v = 0.0;
        StringToDouble(fn_v[1], &v);
        sample.AddFeature(fn_v[0], v);
        AddFeature(fn_v[0]);
      }
      samples_.push_back(sample);
    }
    return true;
  }

  void Train() {
    AllocAuxParam();
    TrainInternal(FLAGS_n);
    Predict();
    FreeAuxParam();
  }

  // For every sample, report the predicted probability of its true label.
  void Predict() {
    std::vector<double> prob(label_count_);
    for (auto it = samples_.begin(); it != samples_.end(); ++it) {
      for (int k = 0; k < label_count_; ++k) {
        prob[k] = w[k][0];  // bias input x[0] = 1
        for (auto sit = it->features.begin(); sit != it->features.end(); ++sit) {
          prob[k] += sit->second * w[k][inner_feature_map_[sit->first]];
        }
        prob[k] = exp(prob[k]);
      }
      double total_exp = 0.0;
      for (int k = 0; k < label_count_; ++k) {
        total_exp += prob[k];
      }
      it->predict_prob = prob[inner_label_map_[it->label]] / total_exp;
      VLOG(0) << "sample predict [" << it->label << "]: " << it->predict_prob;
    }
  }

  void TrainInternal(int32 count) {
    for (int32 i = 0; i < count; ++i) {
      // Step 1: the posterior P(Ck | xn) via softmax over W*x.
      for (size_t n = 0; n < samples_.size(); ++n) {
        const DataSample& sample = samples_[n];
        for (int32 k = 0; k < label_count_; ++k) {
          post_prob[n][k] = w[k][0];  // bias input x[0] = 1
          for (auto it = sample.features.begin(); it != sample.features.end(); ++it) {
            int32 feature_idx = inner_feature_map_[it->first];
            post_prob[n][k] += it->second * w[k][feature_idx];
          }
          post_prob[n][k] = exp(post_prob[n][k]);
        }
        double exp_total = 0.0;
        for (int32 k = 0; k < label_count_; ++k) {
          exp_total += post_prob[n][k];
        }
        for (int32 k = 0; k < label_count_; ++k) {
          post_prob[n][k] /= exp_total;
        }
      }
      // Step 2: gradient dE/dWk = sum_n (Ynk - Tnk)*Xn, then the update.
      for (int32 k = 0; k < label_count_; ++k) {
        for (int32 d = 0; d < feature_count_; ++d) {
          grad[k][d] = 0.0;
        }
        for (size_t n = 0; n < samples_.size(); ++n) {
          double Tnk = GetTnk(n, k);
          double Ynk = post_prob[n][k];
          for (int32 d = 0; d < feature_count_; ++d) {
            grad[k][d] += GetXnd(n, d) * (Ynk - Tnk);
          }
        }
        for (int32 d = 0; d < feature_count_; ++d) {
          grad[k][d] += w[k][d] * FLAGS_lambda;  // L2 regularization
          w[k][d] -= FLAGS_alpha * grad[k][d];
        }
      }
    }
  }

  void Dump() {
    utils::hash_map<std::string, int>::iterator it;
    for (it = inner_label_map_.begin(); it != inner_label_map_.end(); ++it) {
      VLOG(0) << "label: " << it->first << ", " << it->second;
    }
    for (it = inner_feature_map_.begin(); it != inner_feature_map_.end(); ++it) {
      VLOG(0) << "feature: " << it->first << ", " << it->second;
    }
  }

 private:
  double** w;          // w[k][d]; update: w[k] -= alpha * grad[k]
  double** grad;       // grad[k][d] = sum_n (Ynk - Tnk)*Xn + lambda*w[k]
  double** post_prob;  // post_prob[n][k] = P(Ck | xn)

  std::vector<DataSample> samples_;

  utils::hash_map<std::string, int> inner_label_map_;  // "A" -> 0, "B" -> 1
  utils::hash_map<int, std::string> outer_label_map_;  // 0 -> "A", 1 -> "B"
  int32 label_count_;

  int AddLabel(const std::string& outer_label) {
    utils::hash_map<std::string, int>::iterator it = inner_label_map_.find(outer_label);
    if (it == inner_label_map_.end()) {
      // New label: assign the next inner index.
      inner_label_map_[outer_label] = label_count_;
      outer_label_map_[label_count_] = outer_label;
      return label_count_++;
    }
    return it->second;
  }

  utils::hash_map<std::string, int> inner_feature_map_;  // "cb" -> 0, "1" -> 1, "2" -> 2, ...
  utils::hash_map<int, std::string> outer_feature_map_;
  int32 feature_count_;

  void AddFeature(const std::string& feature_name) {
    utils::hash_map<std::string, int>::iterator it = inner_feature_map_.find(feature_name);
    if (it == inner_feature_map_.end()) {
      inner_feature_map_[feature_name] = feature_count_;
      outer_feature_map_[feature_count_] = feature_name;
      feature_count_++;
    }
  }

  // Xnd: value of dimension d for sample n; d == 0 is the constant bias 1.
  double GetXnd(const int32& n, const int32& d) {
    if (d == 0) {
      return 1.0;
    }
    DataSample& sample = samples_[n];
    const std::string& ol = outer_feature_map_[d];
    auto it = sample.features.find(ol);
    return it != sample.features.end() ? it->second : 0.0;
  }

  // Tnk: 1-of-K target; 1.0 iff sample n belongs to class k.
  double GetTnk(const int32& n, const int32& k) {
    return inner_label_map_[samples_[n].label] == k ? 1.0 : 0.0;
  }

  void FreeAuxParam() {
    for (int k = 0; k < label_count_; ++k) {
      delete[] w[k];
      delete[] grad[k];
    }
    delete[] w;
    delete[] grad;
    for (size_t n = 0; n < samples_.size(); ++n) {
      delete[] post_prob[n];
    }
    delete[] post_prob;
  }

  void AllocAuxParam() {
    // w[k][d], grad[k][d]
    w = new double*[label_count_];
    grad = new double*[label_count_];
    for (int k = 0; k < label_count_; ++k) {
      w[k] = new double[feature_count_];
      grad[k] = new double[feature_count_];
      for (int f = 0; f < feature_count_; ++f) {
        w[k][f] = 0.0;
      }
    }
    // post_prob[n][k]
    post_prob = new double*[samples_.size()];
    for (size_t n = 0; n < samples_.size(); ++n) {
      post_prob[n] = new double[label_count_];
    }
  }
};

int main(int argc, char* argv[]) {
  base::ParseCommandLineFlags(&argc, &argv, false);
  TrainningDataSet tds;
  tds.LoadSamplesFromFile(FLAGS_train_path);
  tds.Dump();
  tds.Train();
  return 0;
}
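To run it (assuming the binary is built as lr; the flag names come from the DEFINE_* declarations at the top and are parsed gflags-style):

    ./lr --train_path=./test.txt --alpha=0.1 --lambda=0.001 --n=5000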
Test data:
A 1:0.20 2:0.70
A 1:0.10 2:0.80
A 1:0.30 2:0.60
A 1:0.05 2:0.94
A 1:0.77 2:0.22
A 1:0.44 2:0.55
B 1:0.20 2:0.81
B 1:0.30 2:0.71
B 1:1.00 2:0.01
B 1:0.50 2:0.51
B 1:0.40 2:0.65
B 1:0.70 2:0.40
Results:
I0930 09:53:14.443656 32046 lr.cc:100] sample predict [A]: 0.926048
I0930 09:53:14.443763 32046 lr.cc:100] sample predict [A]: 0.932346
I0930 09:53:14.443836 32046 lr.cc:100] sample predict [A]: 0.919215
I0930 09:53:14.443896 32046 lr.cc:100] sample predict [A]: 0.592285
I0930 09:53:14.443940 32046 lr.cc:100] sample predict [A]: 0.421599
I0930 09:53:14.443987 32046 lr.cc:100] sample predict [A]: 0.499967
I0930 09:53:14.444035 32046 lr.cc:100] sample predict [B]: 0.569759
I0930 09:53:14.444111 32046 lr.cc:100] sample predict [B]: 0.593065
I0930 09:53:14.444164 32046 lr.cc:100] sample predict [B]: 0.740223
I0930 09:53:14.444211 32046 lr.cc:100] sample predict [B]: 0.638351
I0930 09:53:14.444258 32046 lr.cc:100] sample predict [B]: 0.816627
I0930 09:53:14.444300 32046 lr.cc:100] sample predict [B]: 0.955107
Several of the points in the middle lie very close to one another (near the class boundary), so their predicted probabilities are not that high.