Random Forests Revisited --- a Python Implementation
I searched GitHub for quite a while before finding this code, which is very well suited to beginners. My thanks to the original author: I learned a great deal from it.
from __future__ import division  # true division under Python 2; harmless on Python 3
import pandas as pd
import copy
import random
import math


# When the last remaining attribute still cannot separate the samples,
# the most frequent label is chosen as the final class.
def majorClass(classList):
    classDict = {}
    for cls in classList:
        classDict[cls] = classDict.get(cls, 0) + 1
    sortClass = sorted(classDict.items(), key=lambda item: item[1])
    return sortClass[-1][0]


# Compute the Gini index: Gini(D) = 1 - sum_k p_k^2
def calcGini(dataSet):
    labelCounts = {}
    # count the occurrences of every class
    for dt in dataSet:
        currentLabel = dt[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    Gini = 1
    for key in labelCounts:
        prob = labelCounts[key] / len(dataSet)
        Gini -= prob * prob
    return Gini


# Split the data set on a continuous variable
def splitDataSet(dataSet, featIndex, value):
    leftData, rightData = [], []
    for dt in dataSet:
        if dt[featIndex] <= value:
            leftData.append(dt)
        else:
            rightData.append(dt)
    return leftData, rightData


# Choose the best split of the data set
def chooseBestFeature(dataSet):
    bestGini = 1
    bestFeatureIndex = -1
    bestSplitValue = None
    # for the i-th feature
    for i in range(len(dataSet[0]) - 1):
        featList = [dt[i] for dt in dataSet]
        # generate the candidate split points (midpoints of adjacent values)
        sortfeatList = sorted(list(set(featList)))
        splitList = []
        for j in range(len(sortfeatList) - 1):
            splitList.append((sortfeatList[j] + sortfeatList[j + 1]) / 2)
        # try the j-th candidate split point and remember the best one
        for splitValue in splitList:
            newGini = 0
            subDataSet0, subDataSet1 = splitDataSet(dataSet, i, splitValue)
            newGini += len(subDataSet0) / len(dataSet) * calcGini(subDataSet0)
            newGini += len(subDataSet1) / len(dataSet) * calcGini(subDataSet1)
            if newGini < bestGini:
                bestGini = newGini
                bestFeatureIndex = i
                bestSplitValue = splitValue
    return bestFeatureIndex, bestSplitValue


# Drop the i-th attribute and build the two child data sets
def splitData(dataSet, featIndex, features, value):
    newFeatures = copy.deepcopy(features)
    newFeatures.remove(features[featIndex])
    leftData, rightData = [], []
    for dt in dataSet:
        temp = []
        temp.extend(dt[:featIndex])
        temp.extend(dt[featIndex + 1:])
        # In my experiments value occasionally comes back as None.
        # Two likely causes (rough guesses at how often each is to blame):
        # 1. a problem in chooseBestFeature (20%)
        # 2. a problem in your data set (80%)
        if dt[featIndex] <= value:
            leftData.append(temp)
        else:
            rightData.append(temp)
    return newFeatures, leftData, rightData


# Build the decision tree
def createTree(dataSet, features):
    classList = [dt[-1] for dt in dataSet]
    # all labels identical: the whole node is one class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # the last feature still cannot separate the samples: take the majority label
    if len(features) == 1:
        return majorClass(classList)
    bestFeatureIndex, bestSplitValue = chooseBestFeature(dataSet)
    bestFeature = features[bestFeatureIndex]
    # new data sets with bestFeature removed
    newFeatures, leftData, rightData = splitData(dataSet, bestFeatureIndex, features, bestSplitValue)
    # two subtrees: left holds values <= the best split point, right holds values > it
    myTree = {bestFeature: {'<' + str(bestSplitValue): {}, '>' + str(bestSplitValue): {}}}
    myTree[bestFeature]['<' + str(bestSplitValue)] = createTree(leftData, newFeatures)
    myTree[bestFeature]['>' + str(bestSplitValue)] = createTree(rightData, newFeatures)
    return myTree


# Classify a test sample with the generated decision tree
def treeClassify(decisionTree, featureLabel, testDataSet):
    firstFeature = list(decisionTree.keys())[0]
    secondFeatDict = decisionTree[firstFeature]
    # the key is '<' + str(bestSplitValue), so slice from index 1
    # to strip the leading '<' or '>'
    splitValue = float(list(secondFeatDict.keys())[0][1:])
    featureIndex = featureLabel.index(firstFeature)
    if testDataSet[featureIndex] <= splitValue:
        valueOfFeat = secondFeatDict['<' + str(splitValue)]
    else:
        valueOfFeat = secondFeatDict['>' + str(splitValue)]
    if isinstance(valueOfFeat, dict):
        pred_label = treeClassify(valueOfFeat, featureLabel, testDataSet)
    else:
        pred_label = valueOfFeat
    return pred_label


# Draw a bootstrap sample: as many rows as the original training set,
# with sqrt(m - 1) randomly chosen feature columns
def baggingDataSet(dataSet):
    n, m = dataSet.shape
    features = random.sample(list(dataSet.columns.values[:-1]), int(math.sqrt(m - 1)))
    # Don't be puzzled: the last column must be appended here. You might ask,
    # isn't drawing sqrt(#features) feature columns enough? This function
    # builds a whole new data set, so the label column has to come along.
    features.append(dataSet.columns.values[-1])
    rows = [random.randint(0, n - 1) for _ in range(n)]
    trainData = dataSet.iloc[rows][features]  # row selection via pandas.DataFrame.iloc
    return trainData.values.tolist(), features


def testWine():
    df = pd.read_csv('wine.txt', header=None)
    labels = df.columns.values.tolist()
    df = df[df[labels[-1]] != 3]
    # grow several decision trees and collect them in a list
    treeCounts = 10
    treeList = []
    for i in range(treeCounts):
        baggingData, bagginglabels = baggingDataSet(df)
        decisionTree = createTree(baggingData, bagginglabels)
        treeList.append(decisionTree)
    print(treeList)
    # classify the test sample
    labelPred = []
    for tree in treeList:
        testData = [12, 0.92, 2, 19, 86, 2.42, 2.26, 0.3, 1.43, 2.5, 1.38, 3.12, 278]
        label = treeClassify(tree, labels[:-1], testData)
        labelPred.append(label)
    # vote for the final class (this is essentially majorClass() again)
    labelDict = {}
    for label in labelPred:
        labelDict[label] = labelDict.get(label, 0) + 1
    sortClass = sorted(labelDict.items(), key=lambda item: item[1])
    print("The predicted label is: {}".format(sortClass[-1][0]))

testWine()
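To make the tree's data structure concrete, here is a quick sanity check of my own (the toy data and the expected output are illustrative assumptions, not part of the original post):

# Toy sanity check for the functions above.
toyData = [
    [1.0, 2.0, 'A'],
    [1.5, 1.8, 'A'],
    [5.0, 8.0, 'B'],
    [6.0, 9.0, 'B'],
]
tree = createTree(toyData, ['x1', 'x2'])
print(tree)                                          # {'x1': {'<3.25': 'A', '>3.25': 'B'}}
print(treeClassify(tree, ['x1', 'x2'], [5.5, 8.5]))  # 'B'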
This implementation resolves the two problems raised in the previous random-forest post.
What if we extend it to repeat the whole experiment ten times?
Note that this is not 10-fold cross-validation: in 10-fold CV the folds are fixed in advance, so a sample that has already served in the test set will never be assigned to the test set again. When I first wrote this I forgot that and assumed looping ten times was the same as 10-fold CV; I only noticed the mistake after finishing.
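For contrast, here is a minimal sketch of genuine 10-fold cross-validation using scikit-learn's KFold, where every sample lands in exactly one test fold. It assumes the helpers ad_vs_nc(), change_y_test() and accuracy_cal() from the combined code below:

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def ten_fold_cv(data, labels, n_trees=100):
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    scores = []
    for train_idx, test_idx in kf.split(data):
        # each sample appears in exactly one test fold
        train_set = pd.DataFrame(np.append(data[train_idx], labels[train_idx], axis=1))
        test_set = data[test_idx].tolist()
        actual = change_y_test(labels[test_idx])
        predicted = ad_vs_nc(train_set, test_set, n_trees)
        scores.append(accuracy_cal(actual, predicted))
    return sum(scores) / float(len(scores))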
As a reference, here is my own combination of the code from the two random-forest posts. My abilities are limited, so take it for what it is.
import scipy.io as sio
import numpy as np
import pandas as pd
import copy
import random
import math
# older scikit-learn versions exposed this as sklearn.cross_validation
from sklearn.model_selection import train_test_split


def load_data(filename):
    load_data = sio.loadmat(filename)
    PET = load_data['PET']
    MRI = load_data['MRI']
    # MRI = np.round(MRI, decimals=6)
    GND4 = load_data['GND4']
    GND3 = load_data['GND3']
    CSF = load_data['CSF']
    dataset = np.append(PET, MRI, axis=1)
    dataset = np.append(dataset, CSF, axis=1)
    dataset = np.append(dataset, GND3, axis=1)
    df = pd.DataFrame(dataset)
    labels = df.columns.values.tolist()
    df = df[df[labels[-1]] != 2]  # only classes 1 and 3 are kept
    dataset = df.iloc[:, 0:189]   # 189 feature columns; column 189 is the label
    label = df.iloc[:, 189]
    dataset = np.array(dataset)
    label = np.array(label).tolist()
    labels = []
    for i in label:
        labels.append([i])
    labels = np.array(labels)     # labels as a column vector
    return dataset, labels


def majorClass(classList):
    classDict = {}
    for cls in classList:
        classDict[cls] = classDict.get(cls, 0) + 1
    sortClass = sorted(classDict.items(), key=lambda item: item[1])
    return sortClass[-1][0]


def calcGini(dataSet):
    labelCounts = {}
    for dt in dataSet:
        currentLabel = dt[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    Gini = 1
    for key in labelCounts:
        prob = labelCounts[key] / len(dataSet)
        Gini -= prob * prob
    return Gini


def splitDataSet(dataSet, featIndex, value):
    leftData, rightData = [], []
    for dt in dataSet:
        if dt[featIndex] <= value:
            leftData.append(dt)
        else:
            rightData.append(dt)
    return leftData, rightData


def chooseBestFeature(dataSet):
    bestGini = 999
    bestFeatureIndex = -1
    bestSplitValue = None
    for i in range(len(dataSet[0]) - 1):
        featList = [dt[i] for dt in dataSet]
        sortfeatList = sorted(list(set(featList)))
        # candidate split points: the raw sorted feature values themselves;
        # the midpoint variant from the first version is kept for reference:
        # splitList = [(sortfeatList[j] + sortfeatList[j + 1]) / 2
        #              for j in range(len(sortfeatList) - 1)]
        for splitValue in sortfeatList:
            newGini = 0
            subDataSet0, subDataSet1 = splitDataSet(dataSet, i, splitValue)
            newGini += len(subDataSet0) / len(dataSet) * calcGini(subDataSet0)
            newGini += len(subDataSet1) / len(dataSet) * calcGini(subDataSet1)
            if newGini < bestGini:
                bestGini = newGini
                bestFeatureIndex = i
                bestSplitValue = splitValue
    return bestFeatureIndex, bestSplitValue


def splitData(dataSet, featIndex, features, value):
    newFeatures = copy.deepcopy(features)
    newFeatures.remove(features[featIndex])
    leftData, rightData = [], []
    for dt in dataSet:
        temp = []
        temp.extend(dt[:featIndex])
        temp.extend(dt[featIndex + 1:])
        # guard against value being None (see the note in the first version)
        if value is None or dt[featIndex] <= value:
            leftData.append(temp)
        else:
            rightData.append(temp)
    return newFeatures, leftData, rightData


def createTree(dataSet, features):
    classList = [dt[-1] for dt in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(features) == 1:
        return majorClass(classList)
    bestFeatureIndex, bestSplitValue = chooseBestFeature(dataSet)
    bestFeature = features[bestFeatureIndex]
    newFeatures, leftData, rightData = splitData(dataSet, bestFeatureIndex, features, bestSplitValue)
    myTree = {bestFeature: {'<' + str(bestSplitValue): {}, '>' + str(bestSplitValue): {}}}
    myTree[bestFeature]['<' + str(bestSplitValue)] = createTree(leftData, newFeatures)
    myTree[bestFeature]['>' + str(bestSplitValue)] = createTree(rightData, newFeatures)
    return myTree


def treeClassify(decisionTree, featureLabel, testDataSet):
    firstFeature = list(decisionTree.keys())[0]
    secondFeatDict = decisionTree[firstFeature]
    splitValue = float(list(secondFeatDict.keys())[0][1:])
    featureIndex = featureLabel.index(firstFeature)
    if testDataSet[featureIndex] <= splitValue:
        valueOfFeat = secondFeatDict['<' + str(splitValue)]
    else:
        valueOfFeat = secondFeatDict['>' + str(splitValue)]
    if isinstance(valueOfFeat, dict):
        pred_label = treeClassify(valueOfFeat, featureLabel, testDataSet)
    else:
        pred_label = valueOfFeat
    return pred_label


def baggingDataSet(dataSet):
    n, m = dataSet.shape
    features = random.sample(list(dataSet.columns.values[:-1]), int(math.sqrt(m - 1)) + 1)
    features.append(dataSet.columns.values[-1])
    rows = [random.randint(0, n - 1) for _ in range(n)]
    trainData = dataSet.iloc[rows][features]
    return trainData.values.tolist(), features


def ad_vs_nc(train_set, test_set, n_trees):
    df = pd.DataFrame(train_set)  # the training data set
    labels = df.columns.values.tolist()
    df = df[df[labels[-1]] != 2]
    treeCounts = n_trees
    treeList = []
    for i in range(treeCounts):  # grow the forest
        baggingData, bagginglabels = baggingDataSet(df)
        decisionTree = createTree(baggingData, bagginglabels)
        treeList.append(decisionTree)
    print(treeList, 'treelist')
    predictions = []
    for row in test_set:
        labelPred = []
        for tree in treeList:
            testData = row
            label = treeClassify(tree, labels[:-1], testData)
            labelPred.append(label)
        predictions.append(majorClass(labelPred))
    return predictions


def change_y_test(y_test):
    actualList = []
    for i in range(len(y_test)):
        actualList.extend(y_test[i])
    return actualList


def accuracy_cal(actual, predicted):
    print(actual, 'actual')
    print(predicted, 'predicted')
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0


def evaluate_algorithm():
    filename = 'f:\\ADNI202.mat'
    data, labels = load_data(filename)
    scores = []
    n_trees = 100
    for i in range(10):
        # sklearn helper that splits train and test sets automatically;
        # x_train holds the training features, y_train the training labels.
        # The seed varies with i: a fixed random_state would make all ten
        # splits identical, defeating the point of repeating ten times.
        x_train, x_test, y_train, y_test = train_test_split(
            data, labels, test_size=0.1, random_state=i)
        train_set = np.append(x_train, y_train, axis=1)
        train_set = pd.DataFrame(train_set)
        test_set = x_test.tolist()
        actual = change_y_test(y_test)
        predicted = ad_vs_nc(train_set, test_set, n_trees)
        accuracy = accuracy_cal(actual, predicted)
        scores.append(accuracy)
    print("trees {0}".format(n_trees))
    print('scores {0}'.format(scores))
    print('mean accuracy {0}'.format(sum(scores) / float(len(scores))))


evaluate_algorithm()
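As a final cross-check of my own (not part of the original post), the same kind of split can be fed to scikit-learn's built-in random forest; if its accuracy differs wildly from the hand-rolled forest, that usually points at a bug:

# Baseline with sklearn's RandomForestClassifier.
# Assumes data and labels as returned by load_data() above.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.1, random_state=0)
clf = RandomForestClassifier(n_estimators=100, max_features='sqrt')
clf.fit(x_train, y_train.ravel())
print(clf.score(x_test, y_test.ravel()))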