Python 机器学习 NumPy 依赖包:特征之间的相关性

来源:互联网 发布:mac下载office办公软件 编辑:程序博客网 时间:2024/06/07 19:33
# -*- coding: utf-8 -*-
"""Correlation between features: a NumPy walkthrough for machine learning.

@time 10 Nov 2017
@author gerry

Topics covered by the demonstrations below:
    * expectation: the average value taken by one feature of the samples
    * variance: how widely the values of one feature are spread
    * covariance matrix / correlation coefficient: how strongly two
      features are linearly related

The script was written for Python 2.  It is kept source-compatible with
Python 2 and additionally runs on Python 3.  Run it as a script to execute
every demonstration in order; importing it only defines the helpers.
"""
__author__ = 'gerry'

# The star import stays ahead of the stdlib imports, exactly as in the
# original, so nothing exported by numpy can shadow ``sys`` / ``os``.
# NOTE: it does shadow builtins such as ``sum``, ``any`` and ``all``.
from numpy import *
import sys
import os
try:
    import cPickle as pickle  # Python 2: C implementation of pickle
except ImportError:
    import pickle  # Python 3: cPickle was merged into pickle

# Two feature rows observed over 17 samples (2 x 17 matrix).
# ``asmatrix`` is what the old ``mat`` alias pointed to; ``mat`` itself was
# removed in NumPy 2.0.
featureMat = asmatrix([
    [88.5, 96.8, 104.1, 111.3, 117.7, 124.0, 130.0, 135.4, 140.2, 145.3,
     151.9, 159.5, 165.9, 169.8, 171.6, 172.3, 172.7],
    [12.54, 14.65, 16.64, 18.98, 21.26, 24.06, 27.33, 30.47, 33.74, 37.69,
     42.49, 48.08, 53.27, 57.08, 59.35, 60.68, 61.40],
])


def _as_text(raw):
    """Return *raw* as ``str``, decoding UTF-8 bytes when necessary.

    On Python 2 binary reads already produce ``str`` and are returned
    untouched; on Python 3 they produce ``bytes`` and are decoded.
    """
    return raw if isinstance(raw, str) else raw.decode("utf-8")


def file2matrix(path, delimiter):
    """Convert a delimited text data file into a NumPy matrix.

    Args:
        path: location of the data file.
        delimiter: separator between the fields of one line.

    Returns:
        A ``numpy.matrix`` holding one row per non-blank line of the file.

    Raises:
        IOError / OSError: if the file cannot be opened.
        Exception: whatever ``eval`` raises for a field that is not a valid
            Python expression.
    """
    with open(path, "rb") as fp:
        content = fp.read()
    # Split on the raw bytes first (same line boundaries as the original),
    # then convert each line to text so ``split(delimiter)`` works on both
    # Python 2 and Python 3.
    rowlist = [_as_text(row) for row in content.splitlines()]
    # SECURITY: eval() executes arbitrary expressions found in the file.
    # It is kept for backward compatibility with existing data files; only
    # feed this function files you trust.
    recordlist = [
        [eval(field) for field in row.split(delimiter)]
        for row in rowlist
        if row.strip()  # ignore blank lines
    ]
    return asmatrix(recordlist)


def readfilelines(path, nmax=0):
    """Lazily yield the lines of a (possibly large) file.

    Args:
        path: location of the file; it is opened in binary mode.
        nmax: maximum number of lines to yield; ``0`` means the whole file.

    Yields:
        Each line as read in binary mode, line terminator included.
    """
    # ``with`` closes the handle even when the consumer stops iterating
    # early or the generator is garbage-collected half-way through.
    with open(path, "rb") as fp:
        ncount = 0  # number of lines handed out so far
        while nmax == 0 or ncount < nmax:
            content = fp.readline()
            # An empty read marks end of file.  Truthiness covers both ""
            # (Python 2) and b"" (Python 3).
            if not content:
                break
            yield content
            ncount += 1


def _demo_correlation():
    """Print the correlation coefficient of the two feature rows."""
    # Correlation coefficient:
    #     rho_xy = Cov(X, Y) / (sqrt(D(X)) * sqrt(D(Y)))
    # It measures how strongly two features are related, lies in [-1, 1],
    # and a larger absolute value means a stronger relation.
    # Correlation distance: D_xy = 1 - rho_xy
    print(shape(featureMat))
    # Mean of each feature row.
    mv1 = mean(featureMat[0])
    mv2 = mean(featureMat[1])
    # Standard deviation of each feature row.
    dv1 = std(featureMat[0])
    dv2 = std(featureMat[1])
    corref = mean(multiply(featureMat[0] - mv1, featureMat[1] - mv2)) / (dv1 * dv2)
    print(corref)
    # The same quantity via NumPy's correlation-coefficient matrix.
    print(corrcoef(featureMat))


def _demo_mahalanobis():
    """Print the Mahalanobis distance between the first two samples."""
    # Given sample vectors X1..Xm with covariance matrix S and mean vector
    # mu, the distance from a sample X to mu is
    #     D(X) = sqrt((X - mu)' S^(-1) (X - mu))
    # and the Mahalanobis distance between Xi and Xj is
    #     D(Xi, Xj) = sqrt((Xi - Xj)' S^(-1) (Xi - Xj))
    # If S is the identity matrix this reduces to the Euclidean distance;
    # if S is diagonal it reduces to the standardized Euclidean distance.
    # Advantages: independent of measurement scale, and not distorted by
    # correlation between variables.
    print("===================求马氏距离====================================")
    a = cov(featureMat)        # covariance matrix of the two feature rows
    covinv = linalg.inv(a)     # its inverse
    # Rows of the transpose are samples: difference of samples 0 and 1.
    tp = featureMat.T[0] - featureMat.T[1]
    distma = sqrt(dot(dot(tp, covinv), tp.T))
    print(distma)


def _demo_eigen():
    """Print the eigenvalues and eigenvectors of a 3 x 3 matrix."""
    print("==========================矩阵的特征值和特征向量===================================")
    A = [[8, 1, 6], [3, 5, 7], [4, 9, 2]]
    evals, evecs = linalg.eig(A)
    # Joined with single spaces so the output matches the Python 2
    # ``print a, b, c, d`` statement on every interpreter.
    print(" ".join(["特征值:", str(evals), "\n特征向量:", str(evecs)]))


def _demo_normalization():
    """Print plain and normalized Euclidean distances of two vectors."""
    print("===========================数据的归一化=======================================================")
    # Normalization simplifies computation by turning a dimensional
    # expression into a dimensionless one.  Two common forms: mapping
    # values into (0, 1), or removing the unit of measure:
    #     X* = (X - M) / S
    # i.e. (value - mean of the component) / std of the component.
    vectormat = asmatrix([[1, 2, 3], [4, 5, 6]])
    print(mean(vectormat))
    v12 = vectormat[0] - vectormat[1]
    print(sqrt(v12 * v12.T))   # plain Euclidean distance
    # norm
    # Standard deviation (not variance) along axis 0 of the transpose,
    # i.e. one value per row of ``vectormat``.
    # NOTE(review): this scales each row by that row's own std and
    # subtracts the global mean, rather than using per-component
    # statistics -- confirm this is the intended normalization.
    varmat = std(vectormat.T, axis=0)
    print(varmat)
    normvmat = (vectormat - mean(vectormat)) / varmat.T
    normv12 = normvmat[0] - normvmat[1]
    print(sqrt(normv12 * normv12.T))


def _demo_data_import():
    """Load data files, round-trip a matrix through pickle, read lines."""
    print("====================数据的导入和内存管理===============================")
    # Python 2 only: force UTF-8 as the default string encoding.
    if sys.version_info[0] == 2:
        reload(sys)  # noqa: F821 - builtin on Python 2 only
        sys.setdefaultencoding('utf-8')

    root = "testdata"            # directory that holds the data files
    dump_name = "recordmat.dat"  # pickle written by this demo
    recordmat = None
    for name in os.listdir(root):
        # Skip our own pickle from a previous run: it lives in the same
        # directory and is not a text data file.
        if name == dump_name:
            continue
        recordmat = file2matrix(os.path.join(root, name), "\t")
        print(shape(recordmat))  # rows and columns of the parsed matrix

    # Persist the first row of the last matrix and read it back.  The
    # original wrote to root + "recordmat.dat" but read from
    # root + "/recordmat.dat"; both sides now use the same path.
    if recordmat is not None:    # nothing to persist for an empty directory
        dump_path = os.path.join(root, dump_name)
        with open(dump_path, "wb") as file_obj:
            pickle.dump(recordmat[0], file_obj)
        with open(dump_path, "rb") as read_obj:
            # pickle.load is acceptable here only because this file was
            # written a few lines above; never unpickle untrusted data.
            readmat = pickle.load(read_obj)
        print(shape(readmat))

    # Efficient reading of a large text file: first 10 lines only.
    path = "testdata/01.txt"
    for line in readfilelines(path, nmax=10):
        print(_as_text(line.strip()))


def _demo_noisy_sine(plt, banner):
    """Scatter a sine curve with uniform noise and overlay a shifted sine.

    Args:
        plt: the ``matplotlib.pyplot`` module.
        banner: section header printed before plotting.
    """
    print(banner)
    x = linspace(-5, 5, 200)
    y = sin(x)                                  # underlying relation
    yn = y + random.rand(1, len(y)) * 1.5       # noisy points, noise in [0, 1.5)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x, yn, c="blue", marker="o")
    ax.plot(x, y + 0.75, 'r')                   # sine shifted by the mean noise
    plt.show()


def _demo_graph(plt):
    """Plot labelled points and join them with a line.

    Args:
        plt: the ``matplotlib.pyplot`` module.
    """
    print("=================图与网络结构的可视化============================")
    dist = asmatrix([[0.1, 0.1], [0.9, 0.5], [0.9, 1], [0.45, 0.9],
                     [0.9, 0.8], [0.7, 0.9], [0.1, 0.45], [0.45, 0.1]])
    xs, ys = dist.T.tolist()                    # x and y coordinates
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xs, ys, c='blue', marker='o')
    for px, py in dist.tolist():
        plt.annotate("(" + str(px) + "," + str(py) + ")", xy=(px, py))
    # Column vectors (one value per row), as in the original script.
    xlist = [[px] for px in xs]
    ylist = [[py] for py in ys]
    ax.plot(xlist, ylist, 'r')
    plt.show()


def main():
    """Run every demonstration in the order of the original script."""
    # Imported first so a missing matplotlib fails before any output, as
    # the original top-level import did.
    import matplotlib.pyplot as plt

    _demo_correlation()
    _demo_mahalanobis()
    _demo_eigen()
    _demo_normalization()
    _demo_data_import()
    # The original script contains this section twice, with different
    # banners; both runs are preserved.
    _demo_noisy_sine(plt, "======================表与线性结构的可视化=============================")
    _demo_noisy_sine(plt, "=================表与线性结构的可视化============================")
    _demo_graph(plt)


if __name__ == "__main__":
    main()
阅读全文
0 0
原创粉丝点击