决策树和决策森林
来源:互联网 发布:海问律师事务所 知乎 编辑:程序博客网 时间:2024/04/29 00:33
概念
回归是预测一个数值型数量,比如大小,收入和温度。而分类是指预测标号或类别,比如判断邮件是否为“垃圾邮件”,拼图游戏的图案是否是“猫”。回归问题的目标为数值型特征,而分类问题的目标为类别型特征。
精确度是二元分类问题中一个常用的指标。精确度就是被标记为“正”而且确实是“正”的样本占所有标记为“正”的样本比例。召回率是被分类器标记为“正”且确实是“正”的样本(即真正例)占所有本来就是“正”的样本的比率。
代码
/**
 * Decision trees and random forests on the Covtype dataset, using the
 * Spark MLlib RDD-based API.
 *
 * Pipeline: parse the raw CSV, split 80/10/10 into train/CV/test, then
 * run a series of increasingly refined experiments: a default decision
 * tree, a random-guessing baseline, a hyperparameter grid search, a
 * version that un-encodes the one-hot categorical features, and finally
 * a random forest.
 */
object RDF {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("RDF"))
    val rawData = sc.textFile("/user/ds/covtype.data")
    // Each line: 54 numeric feature columns followed by the class label (1-7).
    val data = rawData.map { line =>
      val values = line.split(',').map(_.toDouble)
      val featureVector = Vectors.dense(values.init)
      // MLlib expects 0-based labels; the dataset's labels are 1-based.
      val label = values.last - 1
      LabeledPoint(label, featureVector)
    }

    // 80% train, 10% cross-validation, 10% held-out test.
    val Array(trainData, cvData, testData) = data.randomSplit(Array(0.8, 0.1, 0.1))
    trainData.cache()
    cvData.cache()
    testData.cache()

    simpleDecisionTree(trainData, cvData)
    randomClassifier(trainData, cvData)
    evaluate(trainData, cvData, testData)
    evaluateCategorical(rawData)
    evaluateForest(rawData)

    trainData.unpersist()
    cvData.unpersist()
    testData.unpersist()
  }

  /**
   * Runs `model` over `data` and wraps the (prediction, label) pairs in
   * a MulticlassMetrics for accuracy / confusion-matrix reporting.
   */
  def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label)
    )
    new MulticlassMetrics(predictionsAndLabels)
  }

  /**
   * Builds a decision tree with default-ish hyperparameters (gini impurity,
   * depth 4, 100 bins) and prints the confusion matrix, overall precision,
   * and per-class (precision, recall) on the CV set.
   */
  def simpleDecisionTree(trainData: RDD[LabeledPoint], cvData: RDD[LabeledPoint]): Unit = {
    // Build a simple default DecisionTreeModel
    val model = DecisionTree.trainClassifier(trainData, 7, Map[Int, Int](), "gini", 4, 100)
    val metrics = getMetrics(model, cvData)
    println(metrics.confusionMatrix)
    println(metrics.precision)
    (0 until 7).map(category =>
      (metrics.precision(category), metrics.recall(category))
    ).foreach(println)
  }

  /** Returns the prior probability of each class, ordered by label value. */
  def classProbabilities(data: RDD[LabeledPoint]): Array[Double] = {
    // Count (label -> occurrences), then order by label.
    val countsByCategory = data.map(_.label).countByValue()
    val counts = countsByCategory.toArray.sortBy(_._1).map(_._2)
    // Hoist the total out of the map: the original re-summed per element.
    val total = counts.sum.toDouble
    counts.map(_ / total)
  }

  /**
   * Accuracy of a classifier that guesses a class at random according to
   * the training-set priors: sum over classes of P_train(c) * P_cv(c).
   * This is the baseline a real model must beat.
   */
  def randomClassifier(trainData: RDD[LabeledPoint], cvData: RDD[LabeledPoint]): Unit = {
    val trainPriorProbabilities = classProbabilities(trainData)
    val cvPriorProbabilities = classProbabilities(cvData)
    val accuracy = trainPriorProbabilities.zip(cvPriorProbabilities).map {
      case (trainProb, cvProb) => trainProb * cvProb
    }.sum
    println(accuracy)
  }

  /**
   * Grid search over (impurity, depth, bins): prints every configuration's
   * CV accuracy (best first), then retrains the best configuration on
   * train+CV and prints test accuracy and (for overfitting comparison)
   * train+CV accuracy.
   */
  def evaluate(
      trainData: RDD[LabeledPoint],
      cvData: RDD[LabeledPoint],
      testData: RDD[LabeledPoint]): Unit = {
    val evaluations =
      for (impurity <- Array("gini", "entropy");
           depth <- Array(1, 20);
           bins <- Array(10, 300))
      yield {
        val model = DecisionTree.trainClassifier(
          trainData, 7, Map[Int, Int](), impurity, depth, bins)
        val accuracy = getMetrics(model, cvData).precision
        ((impurity, depth, bins), accuracy)
      }

    evaluations.sortBy(_._2).reverse.foreach(println)

    // Retrain the winning configuration on train + CV and check on test.
    val model = DecisionTree.trainClassifier(
      trainData.union(cvData), 7, Map[Int, Int](), "entropy", 20, 300)
    println(getMetrics(model, testData).precision)
    println(getMetrics(model, trainData.union(cvData)).precision)
  }

  /**
   * Undoes the dataset's one-hot encoding: columns 10-13 encode the
   * wilderness area (4 values) and 14-53 the soil type (40 values).
   * Each group collapses into one categorical feature whose value is the
   * index of the 1.0 within the group.
   */
  def unencodeOneHot(rawData: RDD[String]): RDD[LabeledPoint] = {
    rawData.map { line =>
      val values = line.split(',').map(_.toDouble)
      val wilderness = values.slice(10, 14).indexOf(1.0).toDouble
      val soil = values.slice(14, 54).indexOf(1.0).toDouble
      val featureVector = Vectors.dense(values.slice(0, 10) :+ wilderness :+ soil)
      val label = values.last - 1
      LabeledPoint(label, featureVector)
    }
  }

  /**
   * Repeats the grid search with the two un-encoded categorical features
   * declared to MLlib (feature 10 has 4 values, feature 11 has 40), which
   * lets the tree split on them directly instead of on one-hot columns.
   */
  def evaluateCategorical(rawData: RDD[String]): Unit = {
    val data = unencodeOneHot(rawData)
    val Array(trainData, cvData, testData) = data.randomSplit(Array(0.8, 0.1, 0.1))
    trainData.cache()
    cvData.cache()
    testData.cache()

    val evaluations =
      for (impurity <- Array("gini", "entropy");
           depth <- Array(10, 20, 30);
           bins <- Array(40, 300))
      yield {
        // Specify value count for categorical features 10, 11
        val model = DecisionTree.trainClassifier(
          trainData, 7, Map(10 -> 4, 11 -> 40), impurity, depth, bins)
        val trainAccuracy = getMetrics(model, trainData).precision
        val cvAccuracy = getMetrics(model, cvData).precision
        // Return train and CV accuracy
        ((impurity, depth, bins), (trainAccuracy, cvAccuracy))
      }

    // Fix: the original printed this list twice (duplicated statement).
    evaluations.sortBy(_._2._2).reverse.foreach(println)

    val model = DecisionTree.trainClassifier(
      trainData.union(cvData), 7, Map(10 -> 4, 11 -> 40), "entropy", 30, 300)
    println(getMetrics(model, testData).precision)

    trainData.unpersist()
    cvData.unpersist()
    testData.unpersist()
  }

  /**
   * Trains a 20-tree random forest on the categorical-feature encoding
   * (90/10 train/CV split), prints its CV accuracy, then predicts the
   * class of one hand-written example vector.
   */
  def evaluateForest(rawData: RDD[String]): Unit = {
    val data = unencodeOneHot(rawData)
    val Array(trainData, cvData) = data.randomSplit(Array(0.9, 0.1))
    trainData.cache()
    cvData.cache()

    // 20 trees, "auto" feature-subset strategy, entropy, depth 30, 300 bins.
    val forest = RandomForest.trainClassifier(
      trainData, 7, Map(10 -> 4, 11 -> 40), 20, "auto", "entropy", 30, 300)

    val predictionsAndLabels = cvData.map(example =>
      (forest.predict(example.features), example.label)
    )
    println(new MulticlassMetrics(predictionsAndLabels).precision)

    // Predict a single hand-crafted example (12 features: 10 numeric + 2 categorical).
    val input = "2709,125,28,67,23,3224,253,207,61,6094,0,29"
    val vector = Vectors.dense(input.split(',').map(_.toDouble))
    println(forest.predict(vector))
  }
}
阅读全文
0 0
- 决策树和决策森林
- 决策树和决策森林
- 决策树和随机决策森林基本原理和应用实例
- 决策树和随机森林
- 决策树和随机森林
- 决策树和随机森林
- 决策树和随机森林算法
- 决策森林和卷积神经网络二道归一
- 决策森林和卷积神经网络二道归一
- 决策树, 森林
- R语言之决策树和随机森林
- 机器学习之决策树和随机森林
- 决策树、随机森林简单原理和实现
- 机器学习-决策树和随机森林
- 决策树、装袋、提升和随机森林
- 机器学习--决策树和随机森林简介
- Mahout决策森林
- 决策树与决策规则
- nginx编译安装的时候总是出现pcre.h没有那个文件或目录
- VPN篇(5.6) 06. 多条 IPsec VPN 冗余 ❀ 飞塔 (Fortinet) 防火墙
- 【Spring】【一】基本注解以及小实例
- ABAP开发CHECK, EXIT, RETURN, LEAVE PROGRAM用法
- C/C++ 之 define的使用
- 决策树和决策森林
- quartz调度器简单例子
- 探究 Java 虚拟机栈
- JDBC学习笔记
- 解决Django和EasyUI搭配使用时出现的CSRF问题
- Python文件头模板
- php黑魔法
- ubuntu系统下wireshark普通用户抓包设置
- 229. Majority Element II