Implementing LogisticRegression in C++
I spent some free time today learning C++ and tried implementing a simple logistic regression with it.
Since the goal is to practice C++ syntax and work through the concrete implementation details of logistic regression, the code does not use vectorized or matrix operations; instead it loops over the data and computes each gradient component step by step.
The code implements three training schemes: full-batch gradient descent, stochastic gradient descent, and mini-batch gradient descent.
This post covers only the implementation; for the theory behind logistic regression, see:
http://blog.csdn.net/abcjennifer/article/details/7716281
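As a quick reference, here is the math the code implements. For one sample with features $x$ and label $y \in \{0, 1\}$, the model predicts $p = \sigma(w_0 + \sum_j w_j x_j)$, and the per-sample gradient of the cross-entropy loss (what oneSampleGradient computes below) is:

$$
L = -\big[\, y \log p + (1 - y) \log(1 - p) \,\big], \qquad
\frac{\partial L}{\partial w_0} = p - y, \qquad
\frac{\partial L}{\partial w_j} = (p - y)\, x_j .
$$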
The code consists of two header files and one cpp file:
LogisticRegression.h
#ifndef _LOGISTICREGRESSION_H_
#define _LOGISTICREGRESSION_H_

#include <iostream>
#include <fstream>
#include <string>
#include <vector>

struct DataSet {
    int numData = 0;     // number of samples
    int numFeature = 0;  // number of features per sample
    std::vector<std::vector<float> > features;
    std::vector<int> labels;
};

class LogisticRegression {
public:
    LogisticRegression(int maxIter, float learnRate, float tol = 0.0001);
    ~LogisticRegression();
    DataSet loadData(std::string filename);
    void initWeights(int length);
    std::vector<float> oneSampleGradient(std::vector<float> feature, int label);
    void train(DataSet* dataset, std::string gdType);
    std::vector<float> getWeight();
    int predict(std::vector<float> feature);
    float predict_proba(std::vector<float> feature);
    float score(DataSet* dataset);

private:
    float computeLoss(DataSet* dataset);
    int maxIter_;
    float learnRate_;
    float tol_;
    std::vector<float> weights_;  // weights_[0] is the bias term
};

#endif
common.h
#ifndef _COMMON_H_
#define _COMMON_H_

#include <string>
#include <vector>
#include <cmath>

namespace common {

// Split `line` on every occurrence of `flag` and return the pieces.
inline std::vector<std::string> Split(std::string line, char flag) {
    std::vector<std::string> ret;
    size_t start = 0;
    size_t index = line.find_first_of(flag, start);
    while (index != std::string::npos) {
        // substr takes (position, length), so the length is index - start
        ret.push_back(line.substr(start, index - start));
        start = index + 1;
        index = line.find_first_of(flag, start);
    }
    ret.push_back(line.substr(start));  // the last token runs to the end of the line
    return ret;
}

// Logistic function: maps any real number into (0, 1).
inline float Sigmoid(float inx) {
    return 1.0f / (1.0f + std::exp(-inx));
}

}  // namespace common

#endif
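To see what the two helpers do, here is a minimal standalone sketch (it assumes the header above is saved as common.h; the sample line mimics one row of the iris CSV):

// split_demo.cpp: exercise common::Split and common::Sigmoid
#include "common.h"
#include <iostream>

int main() {
    // one CSV row: four features followed by the label
    std::vector<std::string> tokens = common::Split("5.1,3.5,1.4,0.2,0", ',');
    for (size_t i = 0; i < tokens.size(); ++i) {
        std::cout << "token " << i << ": " << tokens[i] << std::endl;
    }
    std::cout << "Sigmoid(0) = " << common::Sigmoid(0.0f) << std::endl;  // prints 0.5
    return 0;
}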
LogisticRegression.cpp
/*
 * author: july-zj
 * build:  g++ -std=c++0x LogisticRegression.cpp -I../include -o lr_test
 * run:    ./lr_test ../data/iris_train.data ../data/iris_train.data 10 0.1 0.0001 gd
 * data:   CSV file, one sample per line, label in the last column
 */
#include "LogisticRegression.h"
#include "common.h"

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdlib>
#include <random>

LogisticRegression::LogisticRegression(int maxIter, float learnRate, float tol) {
    this->maxIter_ = maxIter;
    this->learnRate_ = learnRate;
    this->tol_ = tol;
}

LogisticRegression::~LogisticRegression() {}

DataSet LogisticRegression::loadData(std::string filename) {
    std::ifstream ifile(filename);
    DataSet dataset;
    if (!ifile) {
        std::cout << "can not open file: " << filename << std::endl;
        return dataset;
    }
    std::string line;
    while (getline(ifile, line)) {
        std::vector<std::string> tokens = common::Split(line, ',');
        std::vector<float> feature;
        for (size_t i = 0; i < tokens.size(); ++i) {
            if (i == tokens.size() - 1) {
                dataset.labels.push_back(atoi(tokens[i].c_str()));  // last column is the label
            } else {
                feature.push_back(atof(tokens[i].c_str()));
            }
        }
        dataset.features.push_back(feature);
        dataset.numData += 1;
    }
    if (!dataset.features.empty()) {
        dataset.numFeature = dataset.features[0].size();
    }
    return dataset;
}

void LogisticRegression::initWeights(int length) {
    // one bias weight at index 0 plus one weight per feature, all initialized to 1.0
    this->weights_.assign(length + 1, 1.0f);
}

std::vector<float> LogisticRegression::oneSampleGradient(std::vector<float> feature, int label) {
    // gradient of the cross-entropy loss for a single sample:
    // (p - y) for the bias term and (p - y) * x_i for each feature
    std::vector<float> gradient(this->weights_.size(), 0.0);
    float predY = predict_proba(feature);
    gradient[0] = predY - label;
    for (size_t i = 0; i < feature.size(); ++i) {
        gradient[i + 1] = (predY - label) * feature[i];
    }
    return gradient;
}

float LogisticRegression::computeLoss(DataSet* dataset) {
    // average cross-entropy loss, with predictions clipped away from 0 and 1
    // so that log() stays finite
    float loss = 0.0;
    for (int i = 0; i < dataset->numData; ++i) {
        float predY = predict_proba(dataset->features[i]);
        if (predY < 1e-5) predY = 1e-5;
        if (predY > 1 - 1e-5) predY = 1 - 1e-5;
        loss -= dataset->labels[i] * std::log(predY) + (1 - dataset->labels[i]) * std::log(1 - predY);
    }
    return loss / dataset->numData;
}

void LogisticRegression::train(DataSet* dataset, std::string gdType) {
    DataSet* traindata = dataset;
    int dataNum = traindata->numData;
    int featureNum = traindata->numFeature;
    initWeights(featureNum);
    for (int iter = 0; iter < this->maxIter_; ++iter) {
        if (gdType == "gd") {
            // full-batch gradient descent: average the gradient over all samples
            std::vector<float> overallGradient(this->weights_.size(), 0.0);
            for (int i = 0; i < dataNum; ++i) {
                std::vector<float> tmpGrad =
                    oneSampleGradient(traindata->features[i], traindata->labels[i]);
                for (size_t j = 0; j < tmpGrad.size(); ++j) {
                    overallGradient[j] += tmpGrad[j];
                }
            }
            for (size_t i = 0; i < this->weights_.size(); ++i) {
                this->weights_[i] -= this->learnRate_ * overallGradient[i] / dataNum;
            }
            float loss = computeLoss(traindata);
            std::cout << "iter " << iter << " loss is: " << loss << std::endl;
            if (loss < this->tol_) {
                std::cout << "early stopping: loss is below tolerance" << std::endl;
                return;
            }
        } else if (gdType == "sgd") {
            // stochastic gradient descent: update after every sample,
            // visiting the samples in a fresh random order each iteration
            std::vector<int> indexes;
            for (int i = 0; i < dataNum; ++i) {
                indexes.push_back(i);
            }
            unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
            std::shuffle(indexes.begin(), indexes.end(), std::default_random_engine(seed));
            for (int i : indexes) {
                std::vector<float> gradient =
                    oneSampleGradient(traindata->features[i], traindata->labels[i]);
                for (size_t j = 0; j < this->weights_.size(); ++j) {
                    this->weights_[j] -= this->learnRate_ * gradient[j];
                }
                float loss = computeLoss(traindata);
                std::cout << "iter " << iter << " loss is: " << loss << std::endl;
                if (loss < this->tol_) {
                    std::cout << "early stopping: loss is below tolerance" << std::endl;
                    return;
                }
            }
        } else {
            // mini-batch gradient descent with a fixed batch size of 10,
            // drawing each batch from a shuffled index order
            std::vector<int> indexes;
            for (int i = 0; i < dataNum; ++i) {
                indexes.push_back(i);
            }
            unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
            std::shuffle(indexes.begin(), indexes.end(), std::default_random_engine(seed));
            for (int i = 0; i < dataNum; i += 10) {
                std::vector<float> batchGradient(this->weights_.size(), 0.0);
                int batchSize = 0;
                for (int j = 0; j < 10 && i + j < dataNum; ++j) {
                    int idx = indexes[i + j];
                    batchSize += 1;
                    std::vector<float> tmpGrad =
                        oneSampleGradient(traindata->features[idx], traindata->labels[idx]);
                    for (size_t m = 0; m < tmpGrad.size(); ++m) {
                        batchGradient[m] += tmpGrad[m];
                    }
                }
                for (size_t n = 0; n < this->weights_.size(); ++n) {
                    this->weights_[n] -= this->learnRate_ * batchGradient[n] / batchSize;
                }
                float loss = computeLoss(traindata);
                std::cout << "iter " << iter << " loss is: " << loss << std::endl;
                if (loss < this->tol_) {
                    std::cout << "early stopping: loss is below tolerance" << std::endl;
                    return;
                }
            }
        }
    }
}

std::vector<float> LogisticRegression::getWeight() {
    return this->weights_;
}

int LogisticRegression::predict(std::vector<float> feature) {
    return predict_proba(feature) > 0.5 ? 1 : 0;
}

float LogisticRegression::predict_proba(std::vector<float> feature) {
    // linear score: bias plus the dot product of weights and features,
    // squashed through the sigmoid
    float summation = this->weights_[0];
    for (size_t i = 0; i < feature.size(); ++i) {
        summation += this->weights_[i + 1] * feature[i];
    }
    return common::Sigmoid(summation);
}

float LogisticRegression::score(DataSet* dataset) {
    int rightNum = 0;
    for (int i = 0; i < dataset->numData; ++i) {
        if (predict(dataset->features[i]) == dataset->labels[i]) {
            rightNum++;
        }
    }
    std::cout << "right number: " << rightNum << std::endl;
    return rightNum * 1.0 / dataset->numData;
}

int main(int argc, char* argv[]) {
    if (argc < 7) {
        std::cout << "usage: trainfile valfile maxiter learnrate tol gdtype" << std::endl;
        return -1;
    }
    std::string trainFile = std::string(argv[1]);
    std::string valFile = std::string(argv[2]);
    int maxIter = atoi(argv[3]);
    float learnRate = atof(argv[4]);
    float tol = atof(argv[5]);
    std::string gdType = std::string(argv[6]);
    assert(gdType == "gd" || gdType == "sgd" || gdType == "batchgd");
    LogisticRegression lr(maxIter, learnRate, tol);
    DataSet trainData = lr.loadData(trainFile);
    DataSet valData = lr.loadData(valFile);
    lr.train(&trainData, gdType);
    std::cout << "train accuracy is: " << lr.score(&trainData) << std::endl;
    std::cout << "validation accuracy is: " << lr.score(&valData) << std::endl;
    return 0;
}
Just copy the files over and compile; pass the command-line arguments as described in the usage note. Six parameters are required: the training-data path, the validation-data path, the maximum number of iterations, the learning rate, the loss tolerance at which training stops early, and the training method (gd, sgd, or batchgd).
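Concretely, using the commands from the file header (the data paths reflect the author's own layout, and note that this example validates on the training file itself):

g++ -std=c++0x LogisticRegression.cpp -I../include -o lr_test
./lr_test ../data/iris_train.data ../data/iris_train.data 10 0.1 0.0001 gd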