机器学习实战代码详解(八)预测数值型数据:回归
来源:互联网 发布:oracle数据库主键类型 编辑:程序博客网 时间:2024/06/05 09:13
# coding=utf-8
"""Regression utilities from "Machine Learning in Action", chapter 8:
ordinary least squares, locally weighted linear regression (LWLR),
ridge regression, forward stagewise regression, and a cross-validation
driver plus a (defunct) Google Shopping API scraper for LEGO prices."""
from numpy import *
from time import sleep
import json


def loadDataSet(fileName):
    """Load a tab-separated data file.

    Every column but the last is a feature; the last column is the
    target value.  Returns (dataMat, labelMat) as plain Python lists.
    """
    # Infer the feature count from the first line (last field = label).
    with open(fileName) as fr:
        numFeat = len(fr.readline().split('\t')) - 1
    dataMat = []
    labelMat = []
    with open(fileName) as fr:  # re-open so the first row is included
        for line in fr:
            curLine = line.strip().split('\t')
            dataMat.append([float(curLine[i]) for i in range(numFeat)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


def standRegress(xArr, yArr):
    """Ordinary least squares: ws = (X^T X)^-1 X^T y.

    Returns the coefficient column vector (n x 1 matrix), or None
    (after a warning) when X^T X is singular and cannot be inverted.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:
        print("this matrix is singular, cannot do inverse")
        return
    return xTx.I * (xMat.T * yMat)


def lwlr(testPoint, xArr, yArr, k=1.0):
    """Locally weighted linear regression estimate at one point.

    Each training sample j is weighted with the Gaussian kernel
    w_j = exp(-||testPoint - x_j||^2 / (2*k^2)); smaller k gives a more
    local fit.  Returns the prediction as a 1x1 matrix, or None when
    the weighted normal matrix is singular.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye(m))  # diagonal weight matrix, starts as identity
    for j in range(m):
        diffMat = testPoint - xMat[j, :]
        # BUG FIX: the original applied /(-2.0*k**2) OUTSIDE exp(),
        # which is not a Gaussian kernel and yields negative weights.
        sqDist = (diffMat * diffMat.T)[0, 0]
        weights[j, j] = exp(sqDist / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("this matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws


def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Run lwlr() for every point of testArr.

    Returns a 1-D array with one prediction per test point.
    """
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        # [0, 0] extracts the scalar from the 1x1 result matrix.
        yHat[i] = lwlr(testArr[i], xArr, yArr, k)[0, 0]
    return yHat


def rssError(yArr, yHatArr):
    """Residual sum of squares between targets and estimates."""
    return ((yArr - yHatArr) ** 2).sum()


def ridgeRegress(xMat, yMat, lam=0.2):
    """Ridge regression coefficients: (X^T X + lam*I)^-1 X^T y.

    Returns None (after a warning) when the regularized matrix is
    still singular — only possible when lam is 0.
    """
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    return denom.I * (xMat.T * yMat)


def ridgeTest(xArr, yArr):
    """Compute ridge coefficients over 30 exponentially spaced lambdas.

    Features are standardized (zero mean, unit "variance" per column —
    note the book divides by the variance, not the std dev) and y is
    centered first.  Returns a (30, n) array: one coefficient row per
    lambda = exp(i - 10), i = 0..29.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    yMat = yMat - mean(yMat, 0)       # center the targets
    xMeans = mean(xMat, 0)            # column means
    xVar = var(xMat, 0)               # column variances
    xMat = (xMat - xMeans) / xVar     # standardize the features
    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):       # sweep 30 regularization strengths
        ws = ridgeRegress(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat


def regularize(xMat):
    """Standardize a matrix by columns: subtract the column mean and
    divide by the column variance.  The input matrix is not modified."""
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)
    inVar = var(inMat, 0)
    return (inMat - inMeans) / inVar


def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Forward stagewise linear regression.

    eps is the step applied to one coefficient per iteration and numIt
    the number of iterations.  Each round greedily nudges the single
    coefficient (in either direction) that most reduces the RSS.
    Returns a (numIt, n) history of the coefficient vector; also prints
    the current coefficients every iteration, as in the book.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    yMat = yMat - mean(yMat, 0)
    xMat = regularize(xMat)
    m, n = shape(xMat)
    returnMat = zeros((numIt, n))
    ws = zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        lowestError = inf  # best RSS seen this iteration
        for j in range(n):
            for sign in [-1, 1]:  # try stepping feature j down and up
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                rssE = rssError(yMat.A, (xMat * wsTest).A)
                if rssE < lowestError:  # keep the best single step
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat


def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    """Scrape selling prices for one LEGO set from the (now retired)
    Google Shopping API, appending rows to retX/retY in place.

    retX rows are [year, pieces, newFlag, originalPrice]; retY receives
    the matching selling prices.
    """
    # Lazy import keeps the module importable on Python 3, where
    # urllib2 was split into urllib.request.
    try:
        import urllib2 as urlreq               # Python 2
    except ImportError:
        from urllib import request as urlreq   # Python 3
    sleep(10)  # crude rate limiting between API calls
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    pg = urlreq.urlopen(searchURL)
    retDict = json.loads(pg.read())
    for i in range(len(retDict['items'])):
        try:
            currItem = retDict['items'][i]
            newFlag = 1 if currItem['product']['condition'] == 'new' else 0
            for item in currItem['product']['inventories']:
                sellingPrice = item['price']
                # Prices below half the list price are assumed to be
                # partial/incomplete sets and are skipped.
                if sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except Exception:  # best-effort scrape: skip malformed entries
            print('problem with item %d' % i)


def setDataCollect(retX, retY):
    """Collect price data for six LEGO sets into retX/retY."""
    searchForSet(retX, retY, 8288, 2006, 800, 49.99)
    searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
    searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
    searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
    searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
    searchForSet(retX, retY, 10196, 2009, 3263, 249.99)


def crossValidation(xArr, yArr, numVal=10):
    """Random-split cross-validation of ridge regression.

    For numVal rounds, shuffles the data, trains on 90% and evaluates
    all 30 ridge weight vectors on the held-out 10%; finally prints the
    best model mapped back to the unstandardized feature scale.
    """
    m = len(yArr)
    indexList = list(range(m))  # list() so shuffle works on Python 3
    errorMat = zeros((numVal, 30))  # one row per round, one col per lambda
    for i in range(numVal):
        trainX = []; trainY = []
        testX = []; testY = []
        random.shuffle(indexList)  # numpy.random via the star import
        for j in range(m):  # first 90% of the shuffled order trains
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)  # 30 ridge weight vectors
        for k in range(30):
            matTestX = mat(testX)
            matTrainX = mat(trainX)
            meanTrain = mean(matTrainX, 0)
            varTrain = var(matTrainX, 0)
            # Standardize the test fold with the TRAINING statistics.
            matTestX = (matTestX - meanTrain) / varTrain
            yEst = matTestX * mat(wMat[k, :]).T + mean(trainY)
            errorMat[i, k] = rssError(yEst.T.A, array(testY))
    meanErrors = mean(errorMat, 0)  # average error per lambda
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors == minMean)]
    # Undo the standardization so the weights apply to raw x:
    # Xreg = (x - meanX)/var(x)  =>  raw slope = w/var(x), and the
    # intercept absorbs -meanX*w/var(x) + meanY.
    xMat = mat(xArr)
    yMat = mat(yArr).T
    meanX = mean(xMat, 0)
    varX = var(xMat, 0)
    unReg = bestWeights / varX
    print("the best model from Ridge Regression is:\n%s" % unReg)
    print("with constant term: %s" % (-1 * sum(multiply(meanX, unReg)) + mean(yMat)))
总结
- 与分类一样,回归也是预测目标值的过程。回归与分类的不同点在于,前者预测连续型变量,后者预测离散型变量。
- 当数据的样本数比特征数还少的时候,矩阵xTx的逆不能直接计算。即便当样本数比特征数多时,xTx的逆仍然可能无法直接计算,这是因为特征有可能高度相关。这时可以考虑使用岭回归,因为当xTx的逆不能计算时,它仍保证能求得回归系数
- 岭回归是缩减法的一种,相当于对回归系数的大小施加限制。另一种很好的缩减法是lasso。Lasso难以求解,但可以使用计算简便的逐步线性回归求得近似结果。
- 缩减法还可以看做是对一个模型增加偏差的同时减少方差。偏差方差折中是一个重要的概念,可以帮助我们理解现有模型并做出改进,从而得到更好的模型。
阅读全文
0 0
- 机器学习实战代码详解(八)预测数值型数据:回归
- 《机器学习实战》预测数值型数据-回归(Regression)
- 《机器学习实战》预测数值型数据-回归(Regression)
- 《机器学习实战》笔记之八——预测数值型数据:回归
- 机器学习实战——预测数值型数据:回归
- 机器学习实战-预测数值型数据:回归
- 机器学习实战_08预测数值型数据-回归
- 机器学习实战笔记-预测数值型数据:回归
- 机器学习实战-8预测数值型数据-回归
- 代码注释:机器学习实战第8章 预测数值型数据:回归
- 机器学习实战 第八章 预测数值型数据:回归(Regression)
- 【机器学习实战】第8章 预测数值型数据:回归(Regression)
- 机器学习实战学习笔记(七)预测数值型数据—回归(python3实现)
- 机器学习实战——第八章:用回归预测数值型数据
- 【机器学习实战】第8章 预测数值型数据:回归
- 机器学习之预测数值型数据: 回归
- 机器学习(五):回归方法——预测数值型数据
- 预测数值型数据:回归(一)
- 业务日志查询,环境问题初步定为方法
- inception_v3迁移学习(GoogleNet)
- 练习项目 一款新闻app的开发 (三):通过Rxjava + Retrofit2框架获取网络数据
- jenkins定时编译节省测试用例执行时间
- Druid组件角色
- 机器学习实战代码详解(八)预测数值型数据:回归
- Linux平台卸载MySQL总结
- android图形系统组件(二)
- 连接池
- idea中mybatis+spring boot, mapper 提示Could not autowire. No beans of … type found解决办法
- Java 代码性能优化总结
- comm
- Activity切换动画
- 简单集合实现三(HashMap和HashSet)