【数据挖掘】k-邻近算法
来源:互联网 发布:手机淘宝店铺主页装修 编辑:程序博客网 时间:2024/04/29 14:39
#!/usr/bin/python
# -*- encoding: utf-8 -*-
"""Minimal k-nearest-neighbours classifier with loading and scaling helpers."""
import operator

import numpy as np


def classify(vect, dataset, lables, k=5):
    """Return the majority label among the k samples nearest to ``vect``.

    Args:
        vect: Feature vector to classify (one value per dataset column).
        dataset: 2-D array-like of samples, one row per sample.
        lables: Sequence of labels; ``lables[i]`` belongs to ``dataset[i]``.
        k: Number of nearest neighbours that vote. If it exceeds the number
            of samples, every sample votes.

    Returns:
        The label with the most votes. On a tie, the label whose first vote
        came from the nearer neighbour wins.

    Raises:
        IndexError: If no neighbour votes (empty dataset or ``k == 0``).
    """
    dataset = np.asarray(dataset, dtype=float)
    # Broadcasting subtracts vect from every row; no explicit tiling needed.
    diff = dataset - np.asarray(vect, dtype=float)
    # Summing the squares along axis 1 collapses each row to one number.
    distances = (diff ** 2).sum(axis=1) ** 0.5
    # argsort returns the row indices ordered from nearest to farthest.
    nearest = distances.argsort()[:k]

    class_count = {}
    for index in nearest:
        voted_label = lables[index]
        # Each neighbour contributes exactly one vote.
        class_count[voted_label] = class_count.get(voted_label, 0) + 1

    # Rank labels by vote count; the sort is stable, so ties keep vote order.
    ranked = sorted(class_count.items(),
                    key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]


def load(filename, cols, max_rows=101):
    """Load a tab-separated sample file.

    Every non-blank line holds ``cols`` numeric features followed by a label
    in the last column.

    Args:
        filename: Path of the file to read.
        cols: Number of feature columns per line.
        max_rows: Maximum number of samples to read (default 101). Pass
            ``None`` to read the whole file.

    Returns:
        A tuple ``(mat, label, labelval)``:
        ``mat`` is a float array of shape ``(rows_read, cols)``;
        ``label`` is the list of label strings, one per row;
        ``labelval`` is a list of floats encoding each label as the 1-based
        row number at which that label first appeared.

    Raises:
        ValueError: If a line does not contain ``cols`` numeric features.
    """
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    # Blank lines carry no sample; dropping them avoids a broadcast error.
    lines = [line for line in stripped if line]
    if max_rows is not None:
        lines = lines[:max_rows]

    # Size the matrix by the rows actually kept so no zero padding remains.
    mat = np.zeros((len(lines), cols))
    label = []
    labelval = []
    valdict = {}
    for row_number, line in enumerate(lines, start=1):
        fields = line.split("\t")
        mat[row_number - 1, :] = [float(value) for value in fields[:-1]]
        name = fields[-1]
        label.append(name)
        if name not in valdict:
            valdict[name] = float(row_number)
        labelval.append(valdict[name])
    return mat, label, labelval


# Data normalisation
def normalize(dataset):
    """Min-max scale every column of ``dataset`` into the range [0, 1].

    Args:
        dataset: 2-D array-like of samples, one row per sample.

    Returns:
        A new float array of the same shape where each column has been
        mapped through ``(x - min) / (max - min)``. A column whose values
        are all equal maps to 0.

    Raises:
        ValueError: If ``dataset`` has no rows (NumPy cannot take the
            minimum of an empty axis).
    """
    dataset = np.asarray(dataset, dtype=float)
    # Column-wise minimum and maximum (reduction along axis 0).
    minval = dataset.min(0)
    maxval = dataset.max(0)
    ranges = maxval - minval
    # A constant column has zero range; divide by 1 instead to avoid 0/0.
    safe_ranges = np.where(ranges == 0, 1.0, ranges)
    return (dataset - minval) / safe_ranges