Python分类算法交叉验证
来源:互联网 发布:淘宝买家怎样修改差评 编辑:程序博客网 时间:2024/05/18 00:24
我们使用 scikit-learn 的 train_test_split 函数随机划分训练集和测试集,用法可参考:
http://blog.csdn.net/cherdw/article/details/54881167
实验代码:
# Train a Doc2Vec model on a one-document-per-line corpus, then fit a logistic
# regression on a random 60/40 split of the document vectors and print its
# accuracy on the held-out part.
import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split

# Corpus: one document per line, no labels.
CORPUS_PATH = u'/home/ubuntu/file/数据平衡无分类'
# Labels: one class label per line, in the same order as the corpus.
LABELS_PATH = u'/home/ubuntu/file/数据平衡分类结果'

# NOTE: the original script also opened u'/home/ubuntu/file/调参结果' in 'w' mode
# for a (commented-out) parameter sweep. Nothing was ever written to it, so the
# only effect was truncating earlier results on every run; the handle is gone.


def LR():
    """Return a new, unfitted LogisticRegression with default settings."""
    return LogisticRegression()


def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Args:
        testPre: iterable of predicted labels.
        testClass: iterable of true labels, in the same order as ``testPre``.

    Returns:
        float in [0, 1]: number of matching positions divided by the number
        of predictions.

    Raises:
        ValueError: if there are no predictions, or the two inputs differ in
            length.
    """
    # Materialise both sides so they are compared by position. Indexing
    # ``testClass[i]`` would look up by *label* on a pandas Series whose index
    # was shuffled by train_test_split.
    predicted = list(testPre)
    actual = list(testClass)
    if not predicted:
        raise ValueError('cannot compute a recognition rate for zero predictions')
    if len(predicted) != len(actual):
        raise ValueError(
            'got %d predictions but %d true labels' % (len(predicted), len(actual))
        )
    rightNum = sum(1 for pre, cls in zip(predicted, actual) if cls == pre)
    return float(rightNum) / float(len(predicted))


def getData(model):
    """Build the feature table from ``model`` and split it 60/40.

    Args:
        model: trained Doc2Vec model; ``model.docvecs[i]`` is used as the
            feature vector of the i-th labelled document.

    Returns:
        tuple ``(X_train, y_train, X_test, y_test)`` where the X parts are
        DataFrames (one row per document, index ``'p0'``, ``'p1'``, ...) and
        the y parts are lists of label strings.

    Raises:
        ValueError: if the model holds fewer document vectors than there are
            labels.
    """
    with open(LABELS_PATH, encoding='utf-8') as fileclass:
        tigs = [tig.strip() for tig in fileclass]

    # NOTE(review): the original hard-coded ``len(model.docvecs) - 1`` rows,
    # presumably because the corpus file has one trailing line without a
    # label -- confirm. Taking one vector per label gives the same rows in
    # that situation and also works when both files have equal length.
    rowCount = len(tigs)
    if len(model.docvecs) < rowCount:
        raise ValueError(
            'model has %d document vectors but %d labels were read'
            % (len(model.docvecs), rowCount)
        )

    data = pd.DataFrame(
        [model.docvecs[i] for i in range(rowCount)],
        index=['p' + str(i) for i in range(rowCount)],
    )
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        data, tigs, test_size=0.4, random_state=1
    )
    return X_train1, y_train1, X_test1, y_test1


# Keep the corpus handle open while the model is built, because the corpus
# object reads from it; the handle is closed as soon as the block exits.
with open(CORPUS_PATH, encoding='utf-8') as file:
    documents = gensim.models.doc2vec.TaggedLineDocument(file)
    model = gensim.models.Doc2Vec(documents, size=80, window=9, min_count=40, workers=8)

trainMatrix, trainClass, testMatrix, testClass = getData(model)

clf_LR = LR()
clf_LR.fit(trainMatrix, trainClass)
print('Logistic Regression recognition rate: '+str(getRecognitionRate(clf_LR.predict(testMatrix), testClass)))
# TODO: plot the ROC curve for clf_LR on the held-out set.
使用交叉验证方法计算平均正确率:
# Train a Doc2Vec model on a one-document-per-line corpus, then estimate the
# accuracy of a logistic regression on the document vectors with stratified
# k-fold cross-validation and print the mean accuracy over the folds.
import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc

# Corpus: one document per line, no labels.
CORPUS_PATH = u'/home/ubuntu/file/数据平衡无分类'
# Labels: one class label per line, in the same order as the corpus.
LABELS_PATH = u'/home/ubuntu/file/数据平衡分类结果'

# Keep the corpus handle open while the model is built, because the corpus
# object reads from it; the handle is closed as soon as the block exits.
with open(CORPUS_PATH, encoding='utf-8') as file:
    documents = gensim.models.doc2vec.TaggedLineDocument(file)
    model = gensim.models.Doc2Vec(documents, size=81, window=8, min_count=39, workers=8)


def LR():
    """Return a new, unfitted LogisticRegression with default settings."""
    return LogisticRegression()


def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Args:
        testPre: iterable of predicted labels.
        testClass: iterable of true labels, in the same order as ``testPre``.

    Returns:
        float in [0, 1]: number of matching positions divided by the number
        of predictions.

    Raises:
        ValueError: if there are no predictions, or the two inputs differ in
            length.
    """
    # Compare by position; ``testClass[i]`` would look up by *label* on a
    # pandas Series whose index was shuffled by train_test_split.
    predicted = list(testPre)
    actual = list(testClass)
    if not predicted:
        raise ValueError('cannot compute a recognition rate for zero predictions')
    if len(predicted) != len(actual):
        raise ValueError(
            'got %d predictions but %d true labels' % (len(predicted), len(actual))
        )
    rightNum = sum(1 for pre, cls in zip(predicted, actual) if cls == pre)
    return float(rightNum) / float(len(predicted))


def _readLabels():
    """Read the label file and return its lines, stripped, as a list."""
    # Opened per call: a shared module-level handle is exhausted after the
    # first reader, leaving every later caller with no labels.
    with open(LABELS_PATH, encoding='utf-8') as fileclass:
        return [tig.strip() for tig in fileclass]


def _docVectorFrame(rowCount):
    """Return a DataFrame holding the first ``rowCount`` document vectors.

    Rows are indexed ``'p0'``, ``'p1'``, ...; columns are vector components.
    """
    # NOTE(review): the original getData_3 hard-coded ``len(model.docvecs) - 1``
    # rows while getData used all of them; presumably the corpus file has one
    # trailing line without a label -- confirm. One vector per label matches
    # getData_3 in that situation and also works for equal-length files.
    if len(model.docvecs) < rowCount:
        raise ValueError(
            'model has %d document vectors but %d labels were read'
            % (len(model.docvecs), rowCount)
        )
    return pd.DataFrame(
        [model.docvecs[i] for i in range(rowCount)],
        index=['p' + str(i) for i in range(rowCount)],
    )


def getData():
    """Split the document vectors and their labels 60/40 at random.

    Returns:
        tuple ``(X_train, y_train, X_test, y_test)``; the X parts are
        DataFrames of vector components, the y parts are Series named
        ``'class0'`` sharing the X parts' index.
    """
    tigs = _readLabels()
    data = _docVectorFrame(len(tigs))
    # Keep the labels out of the feature frame: the original added a 'class0'
    # column to ``data`` and then passed ``data`` itself as the features.
    labels = pd.Series(tigs, index=data.index, name='class0')
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        data, labels, test_size=0.4, random_state=0
    )
    return X_train1, y_train1, X_test1, y_test1


# TODO: add an ROC plot (roc_curve / auc on predict_proba output); the earlier
# draft relied on a ``plt`` name that this file never imports.


def getData_3(n_splits=2):
    """Partition the samples into stratified cross-validation folds.

    Args:
        n_splits: number of folds passed to StratifiedKFold (default 2).

    Returns:
        dict keyed by ``'<k>train'``, ``'<k>trainclass'``, ``'<k>test'`` and
        ``'<k>testclass'`` for k = 1..n_splits. The ``train``/``test`` values
        are 2-D arrays with one row per sample; the ``*class`` values are 1-D
        arrays with the matching labels.
    """
    tigs = _readLabels()
    # One row per sample, one column per vector component.
    sampleM = _docVectorFrame(len(tigs)).values
    # One label per sample, same order as the rows of sampleM.
    classM = np.array(tigs)

    skf = StratifiedKFold(n_splits=n_splits)
    setDict = {}
    for count, (trainI, testI) in enumerate(skf.split(sampleM, classM), start=1):
        prefix = str(count)
        # Index arrays select whole rows at once; no per-element copying.
        setDict[prefix + 'train'] = sampleM[trainI]
        setDict[prefix + 'trainclass'] = classM[trainI]
        setDict[prefix + 'test'] = sampleM[testI]
        setDict[prefix + 'testclass'] = classM[testI]
    return setDict


clf_LR = LR()
setDict = getData_3()
setNums = len(setDict.keys())
print(setDict.keys())

# Each fold contributes four entries to setDict, so this follows n_splits
# instead of repeating the literal 2.
foldCount = setNums // 4
LR_rate = 0.0
for i in range(1, foldCount + 1):
    print(i)
    trainMatrix = setDict[str(i) + 'train']
    trainClass = setDict[str(i) + 'trainclass']
    print(len(trainClass))
    testMatrix = setDict[str(i) + 'test']
    testClass = setDict[str(i) + 'testclass']
    print(len(testClass))
    clf_LR.fit(trainMatrix, trainClass)
    LR_rate += getRecognitionRate(clf_LR.predict(testMatrix), testClass)
print('Logistic Regression mean recognition rate: ', LR_rate / foldCount)
getData_3 方法使用 StratifiedKFold 把数据集划分为若干折,对每一折分别训练模型并计算正确率,最后取各折正确率的平均值。
阅读全文
0 0
- Python分类算法交叉验证
- K折交叉验证-python
- python sklearn包----------交叉验证
- K-折交叉验证算法
- python机器学习——十次交叉验证训练的数据准备算法
- 机器学习-CrossValidation交叉验证Python实现
- Python 之 sklearn 交叉验证 数据拆分
- S折交叉验证 in Python
- Python 之 sklearn 交叉验证 数据拆分
- 机器学习-CrossValidation交叉验证Python实现
- CrossValidation十字交叉验证的Python实现
- 机器学习-CrossValidation交叉验证Python实现
- python中sklearn实现交叉验证
- python中sklearn实现交叉验证
- 机器学习 python 交叉验证实例
- CART算法实现之交叉验证
- KNN算法实现及其交叉验证
- LARS算法---十折交叉验证
- Eclipse neon for java ee开发android点击layout下文件闪退问题
- 页面中加入一个人体时钟和仓鼠动画
- dom4j解析多层xml
- postgres备份和恢复
- 统计数字,空格等个数
- Python分类算法交叉验证
- sqlserver语法点滴
- Tensorflow 可视化 Tensorboard 1
- 1.2 为什么选择Key-Value Store
- Cookie常用操作以及属性
- push-推送部署统计数据重复-问题处理
- hibernate api理解学习
- 第二周 项目4 汉诺塔
- TK1/TX1 Gstreamer 测试命令