代码注释:机器学习实战第8章 预测数值型数据:回归

来源:互联网 发布:汽车销售软件有哪些 编辑:程序博客网 时间:2024/05/16 05:52

写在开头的话:在学习《机器学习实战》的过程中发现书中很多代码并没有注释,这对新入门的同学是一个挑战,特此贴出我对代码做出的注释,仅供参考,欢迎指正。

# coding: gbk
# Regression helpers: ordinary least squares, locally weighted linear
# regression, ridge regression and forward stagewise regression, plus a
# scraper that collected LEGO set prices from a shopping search API.
#
# The code runs unchanged on Python 2 and Python 3: print is always called
# with a single pre-formatted string, and urllib2 falls back to
# urllib.request.

import json
from time import sleep

try:
    import urllib2
except ImportError:  # Python 3: same urlopen() entry point under a new name
    import urllib.request as urllib2

# The star import is kept because this module has always re-exported numpy's
# namespace; the explicit list names exactly what the functions below use.
from numpy import *  # noqa: F401,F403
from numpy import asmatrix, exp, eye, inf, linalg, mean, nan, shape, var, zeros


def loadDataSet(fileName):
    """Load a tab-separated numeric data file.

    Each non-blank line holds the feature values followed by the target
    value in the last column.  The number of features is taken from the
    first non-blank line.

    Args:
        fileName: path of the text file to read.

    Returns:
        (dataMat, labelMat): a list of feature rows (lists of floats) and
        the list of target values (floats).

    Raises:
        ValueError: if a field cannot be parsed as a float.
        IndexError: if a line has fewer fields than the first data line.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # The context manager closes the file even when parsing fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no sample; skip them instead of failing
                # on float('').
                continue
            curLine = stripped.split('\t')
            if numFeat is None:
                numFeat = len(curLine) - 1
            dataMat.append([float(curLine[i]) for i in range(numFeat)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


def standRegres(xArr, yArr):
    """Fit ordinary least squares via the normal equations.

    Args:
        xArr: m samples of n features each (nested sequence or matrix).
        yArr: the m target values.

    Returns:
        The (n, 1) matrix of regression coefficients, or None (after
        printing a message) when X^T X has a determinant of exactly zero.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    xTx = xMat.T * xMat
    # A zero determinant means X^T X has no inverse.
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    return xTx.I * (xMat.T * yMat)


def lwlr(testPoint, xArr, yArr, k=1.0):
    """Predict one point with locally weighted linear regression.

    Every training sample j gets the Gaussian kernel weight
    exp(-|testPoint - x_j|^2 / (2 * k^2)), and a weighted least-squares fit
    is solved for that single query point.

    Args:
        testPoint: feature vector of the query point.
        xArr: m training samples of n features each.
        yArr: the m training targets.
        k: kernel width; smaller values make the fit more local.

    Returns:
        A (1, 1) matrix holding the prediction, or None (after printing a
        message) when the weighted X^T W X has a determinant of exactly zero.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    testMat = asmatrix(testPoint)
    m = shape(xMat)[0]
    weights = asmatrix(eye(m))  # diagonal matrix of per-sample weights
    for j in range(m):
        diffMat = testMat - xMat[j, :]
        # Pull the scalar out of the 1x1 product: assigning a 1x1 matrix to
        # a single element is deprecated in recent numpy releases.
        sqDist = (diffMat * diffMat.T)[0, 0]
        weights[j, j] = exp(sqDist / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testMat * ws


def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Run lwlr for every row of testArr.

    Args:
        testArr: the query points, one per row.
        xArr: m training samples.
        yArr: the m training targets.
        k: kernel width forwarded to lwlr.

    Returns:
        A 1-D array with one prediction per query point.  An entry is nan
        when lwlr reported a singular system for that point.
    """
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        pred = lwlr(testArr[i], xArr, yArr, k)
        # lwlr returns a 1x1 matrix, or None for a singular system.
        yHat[i] = nan if pred is None else pred[0, 0]
    return yHat


def rssError(yArr, yHatArr):
    """Return the sum of squared differences between targets and predictions.

    Args:
        yArr: true values; at least one of the two arguments must be a numpy
            array so that the subtraction is element-wise.
        yHatArr: predicted values, same shape as yArr.

    Returns:
        The residual sum of squares as a scalar.
    """
    return ((yArr - yHatArr) ** 2).sum()


def ridgeRegres(xMat, yMat, lam=0.2):
    """Solve ridge regression for one value of the penalty.

    Args:
        xMat: (m, n) matrix of features.
        yMat: (m, 1) matrix of targets.
        lam: ridge penalty added to the diagonal of X^T X.

    Returns:
        The (n, 1) matrix of regression coefficients (not predictions), or
        None (after printing a message) when X^T X + lam * I has a
        determinant of exactly zero.
    """
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    return denom.I * (xMat.T * yMat)


def ridgeTest(xArr, yArr, numTestPts=30):
    """Compute ridge coefficients over an exponential grid of penalties.

    The targets are centred and the features are standardised with
    regularize() before fitting.  The penalties are exp(i - 10) for
    i = 0 .. numTestPts - 1.

    Args:
        xArr: m samples of n features each.
        yArr: the m target values.
        numTestPts: number of penalty values to try (default 30).

    Returns:
        A (numTestPts, n) array; row i holds the coefficients obtained with
        penalty exp(i - 10).
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    yMat = yMat - mean(yMat, 0)
    # Same centring/scaling as stageWise, so reuse the shared helper.
    xMat = regularize(xMat)
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat


def regularize(xMat):
    """Standardise a matrix column by column.

    Each column has its mean subtracted and is then divided by its variance
    (not its standard deviation).  A constant column has zero variance and
    therefore yields non-finite values.

    Args:
        xMat: matrix to standardise; it is not modified.

    Returns:
        A standardised copy of xMat.
    """
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)
    inVar = var(inMat, 0)
    return (inMat - inMeans) / inVar


def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Run forward stagewise linear regression.

    In every iteration each coefficient is tentatively moved by +eps and
    -eps; the single move giving the lowest residual sum of squares is kept.
    The current coefficients are printed at the start of every iteration.

    Args:
        xArr: m samples of n features each.
        yArr: the m target values.
        eps: step size applied to one coefficient per iteration.
        numIt: number of iterations.

    Returns:
        A (numIt, n) array; row i holds the coefficients after iteration i.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    yMat = yMat - mean(yMat, 0)  # centred targets
    xMat = regularize(xMat)      # standardised features
    n = shape(xMat)[1]
    returnMat = zeros((numIt, n))
    ws = zeros((n, 1))
    # Fallback for an iteration in which no candidate beats `inf` (possible
    # only when every error is nan).
    wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        lowestError = inf
        for j in range(n):
            for sign in (-1, 1):
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat


def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    """Query the shopping search API for one LEGO set and collect prices.

    For every returned offer priced above half of the original price, the
    row [yr, numPce, newFlag, origPrc] is appended to retX and the selling
    price to retY.  Matching offers are also printed.

    NOTE(review): the original author's header comment states that this web
    service has been withdrawn, so the request is expected to fail today.

    Args:
        retX: list that receives the feature rows (mutated in place).
        retY: list that receives the selling prices (mutated in place).
        setNum: LEGO set number used in the search query.
        yr: release year of the set.
        numPce: number of pieces in the set.
        origPrc: original retail price.
    """
    sleep(10)  # presumably throttles consecutive requests -- confirm
    myAPIstr = 'get from code.google.com'
    # Built from adjacent literals: a backslash continuation inside the
    # string would embed the next line's indentation in the URL.
    searchURL = ('https://www.googleapis.com/shopping/search/v1/public/products?'
                 'key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum))
    pg = urllib2.urlopen(searchURL)
    try:
        retDict = json.loads(pg.read().decode('utf-8'))
    finally:
        pg.close()
    for i, currItem in enumerate(retDict.get('items', [])):
        try:
            product = currItem['product']
            newFlag = 1 if product['condition'] == 'new' else 0
            for item in product['inventories']:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f" %
                          (yr, numPce, newFlag, origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except (KeyError, TypeError, ValueError):
            # Best effort: report a malformed item and carry on.
            print('problem with item %d' % i)


def setDataCollect(retX, retY):
    """Collect price data for six fixed LEGO sets into retX and retY.

    Args:
        retX: list that receives the feature rows (mutated in place).
        retY: list that receives the selling prices (mutated in place).
    """
    # (set number, release year, piece count, original price)
    legoSets = (
        (8288, 2006, 800, 49.99),
        (10030, 2002, 3096, 269.99),
        (10179, 2007, 5195, 499.99),
        (10181, 2007, 3428, 199.99),
        (10189, 2008, 5922, 299.99),
        (10196, 2009, 3263, 249.99),
    )
    for setNum, yr, numPce, origPrc in legoSets:
        searchForSet(retX, retY, setNum, yr, numPce, origPrc)


1 0
原创粉丝点击