python中实现PCA

来源:互联网 发布:做淘宝需要营业执照吗 编辑:程序博客网 时间:2024/06/05 04:25

主成分分析(PCA)的思想比较简单,可以和LDA线性判别方法做个比较。昨天下午到今天上午用python实现了一下,代码主要参考《机器学习实战》这本书,具体代码如下:

# encoding: utf-8
"""PCA demo following chapter 13 of "Machine Learning in Action".

Created on 2015-09-23

@author: ZHOUMEIXU204
"""
import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional: the numeric helpers below work without matplotlib.
    plt = None

path = u'D:\\Users\\zhoumeixu204\\Desktop\\python语言机器学习\\机器学习实战代码   python\\机器学习实战代码\\machinelearninginaction\\Ch13\\'


def loadDataSet(filename, delim='\t'):
    """Read a delimited text file of numbers into a float matrix.

    Args:
        filename: Path of the text file to read.
        delim: Field separator used on every line (tab by default).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be converted with ``float``.
    """
    # ``with`` closes the file even when a conversion fails.
    with open(filename) as fr:
        # Build real lists of floats (a bare ``map`` is lazy on Python 3) and
        # skip blank lines, which would otherwise crash on ``float('')``.
        rows = [[float(token) for token in line.strip().split(delim)]
                for line in fr if line.strip()]
    return np.asmatrix(rows)


def pca(dataMat, topNfeat=9999999):
    """Project the data onto its leading principal components.

    Args:
        dataMat: 2-D data, one sample per row and one feature per column.
        topNfeat: Number of principal components to keep; the default keeps
            all of them.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the centred data expressed in the
        kept components, and the data rebuilt from those components in the
        original feature space (mean added back).
    """
    dataMat = np.asmatrix(dataMat)
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # remove mean
    # ``np.cov`` collapses a single-feature result to 0-d; keep it 2-D.
    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=False))
    # Find the direction a that maximises the variance Var(a'X) = a' Cov(X) a.
    # The covariance matrix is symmetric, so ``eigh`` is used: unlike ``eig``
    # it always returns real eigenvalues and eigenvectors.
    eigVals, eigVects = np.linalg.eigh(covMat)
    eigVects = np.asmatrix(eigVects)
    eigValInd = np.argsort(eigVals)  # argsort goes smallest to largest
    eigValInd = eigValInd[:-(topNfeat + 1):-1]  # cut off unwanted dimensions
    redEigVects = eigVects[:, eigValInd]  # eigenvectors, largest first
    lowDDataMat = meanRemoved * redEigVects  # transform into new dimensions
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat


def replaceNanWithMean(datMat=None):
    """Replace every NaN with the mean of the non-NaN values in its column.

    Args:
        datMat: Optional 2-D data to clean. When omitted, ``secom.data``
            (space-delimited) is loaded from ``path``.

    Returns:
        A new ``numpy.matrix`` with the NaNs filled in; the argument itself is
        not modified. A column made up only of NaNs has no mean and is left
        unchanged.
    """
    if datMat is None:
        datMat = loadDataSet(path + 'secom.data', ' ')
    values = np.array(datMat, dtype=float)  # work on a copy
    for i in range(values.shape[1]):
        missing = np.isnan(values[:, i])
        if missing.any() and not missing.all():
            values[missing, i] = values[~missing, i].mean()
    return np.asmatrix(values)


def variancePercentages(dataMat):
    """Return the percentage of total variance carried by each component.

    Args:
        dataMat: 2-D data, one sample per row and one feature per column.

    Returns:
        A 1-D array of percentages, sorted from largest to smallest.
    """
    dataMat = np.asmatrix(dataMat)
    meanRemoved = dataMat - np.mean(dataMat, axis=0)  # remove mean
    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=False))
    # Symmetric matrix: ``eigvalsh`` keeps the eigenvalues real.
    sortedEigVals = np.sort(np.linalg.eigvalsh(covMat))[::-1]
    return sortedEigVals / np.sum(sortedEigVals) * 100


def _requirePyplot():
    """Return ``matplotlib.pyplot`` or raise a clear error if it is missing."""
    if plt is None:
        raise ImportError('matplotlib is required for plotting')
    return plt


def plotReconstruction(dataMat, recoMat):
    """Scatter the first two columns of the original and rebuilt data."""
    pyplot = _requirePyplot()
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    # ``.A1`` flattens the (n, 1) matrix columns to the 1-D arrays scatter expects.
    ax.scatter(dataMat[:, 0].A1, dataMat[:, 1].A1, marker='^', s=90)
    ax.scatter(recoMat[:, 0].A1, recoMat[:, 1].A1, marker='o', s=50, c='red')
    pyplot.show()


def plotVariance(varPercentage, maxComponents=20):
    """Plot the variance percentage of the leading principal components."""
    pyplot = _requirePyplot()
    shown = varPercentage[:maxComponents]  # may be shorter than maxComponents
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    ax.plot(range(1, len(shown) + 1), shown, marker='^')
    pyplot.xlabel('Principal Component Number')
    pyplot.ylabel('Percentage of Variance')
    pyplot.show()


def main():
    """Run both demos: 1-component PCA on testSet.txt, then secom variances."""
    dataMat = loadDataSet(path + 'testSet.txt')
    print(dataMat)
    lowDMat, recoMat = pca(dataMat, 1)
    # The labels used to say "eigenvalues"/"eigenvectors", but the values
    # printed are the projected data and the reconstructed data.
    print(u'降维后的数据是:')
    print(lowDMat)
    print(u'重构后的数据是:')
    print(recoMat)
    plotReconstruction(dataMat, recoMat)

    dataMat = replaceNanWithMean()
    # Percentage of variance carried by each principal component.
    plotVariance(variancePercentages(dataMat))


if __name__ == '__main__':
    main()


0 0