【数据挖掘】k-邻近算法

来源:互联网 发布:手机淘宝店铺主页装修 编辑:程序博客网 时间:2024/04/29 14:39
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Minimal k-nearest-neighbours classifier with data loading and scaling helpers."""
import numpy as np
import operator


def classify(vect, dataset, lables, k=5):
    """Classify ``vect`` by majority vote among its nearest training samples.

    Distances are Euclidean and computed against every row of ``dataset``.

    Args:
        vect: Feature vector to classify; must broadcast against one row of
            ``dataset``.
        dataset: 2-D array-like holding one training sample per row.
        lables: Sequence of class labels where ``lables[i]`` belongs to
            ``dataset[i]`` (parameter spelling kept for caller compatibility).
        k: Number of nearest neighbours that vote. Values larger than the
            number of rows are clamped to the number of rows.

    Returns:
        The label that received the most votes. On interpreters whose dicts
        preserve insertion order, a tie goes to the label that was voted for
        first, i.e. by the closest neighbour.

    Raises:
        ValueError: If ``dataset`` has no rows or ``k`` is not positive.
    """
    samples = np.asarray(dataset, dtype=float)
    rows = samples.shape[0]
    neighbours = min(k, rows)
    if neighbours <= 0:
        raise ValueError("classify needs a non-empty dataset and a positive k")

    # Broadcasting subtracts vect from every row; summing over axis 1 collapses
    # the per-feature squared differences into one distance per sample.
    diffmat = samples - np.asarray(vect, dtype=float)
    distances = np.sqrt((diffmat ** 2).sum(axis=1))
    sortedIndices = distances.argsort()  # sample indices, nearest first

    classCount = {}
    for index in sortedIndices[:neighbours]:
        votedLabel = lables[index]
        # Exactly one vote per neighbour.
        classCount[votedLabel] = classCount.get(votedLabel, 0) + 1

    # max() returns the first entry holding the highest vote count.
    return max(classCount.items(), key=operator.itemgetter(1))[0]


def load(filename, cols, max_rows=101):
    """Read a tab-separated sample file into a feature matrix and label lists.

    Every non-blank line holds the feature values followed by the class label
    as its last field. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.
        cols: Number of feature columns in the returned matrix.
        max_rows: Maximum number of samples to read. The default of 101 keeps
            the cap that used to be hard-coded; pass ``None`` to read the
            whole file.

    Returns:
        A tuple ``(mat, label, labelval)``:
            mat: float array of shape ``(samples_read, cols)``.
            label: list with the label string of each sample.
            labelval: list with a float code per sample; a label's code is the
                1-based position of the sample where it first appeared.

    Raises:
        ValueError: If a feature field is not numeric or a row does not fit
            into ``cols`` columns.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        arrayLines = fr.readlines()

    limit = len(arrayLines) if max_rows is None else min(len(arrayLines), max_rows)
    mat = np.zeros((limit, cols))
    label = []
    labelval = []
    valdict = {}
    indexCount = 0
    for line in arrayLines:
        if indexCount >= limit:
            break
        line = line.strip()
        if not line:
            continue  # tolerate blank lines such as a trailing newline
        listLine = line.split("\t")
        mat[indexCount, :] = [float(value) for value in listLine[:-1]]
        label.append(listLine[-1])
        indexCount += 1
        if listLine[-1] not in valdict:
            valdict[listLine[-1]] = float(indexCount)
        labelval.append(valdict[listLine[-1]])

    # Drop preallocated rows that were never filled so that the matrix and the
    # label lists always have the same length.
    return mat[:indexCount], label, labelval


def normalize(dataset):
    """Min-max scale every column of ``dataset`` into the range [0, 1].

    Args:
        dataset: Array-like with one sample per row and one feature per column.

    Returns:
        A float array of the same shape holding ``(x - min) / (max - min)``
        computed per column. Columns whose values are all equal map to 0
        instead of dividing by zero.
    """
    data = np.asarray(dataset, dtype=float)
    # Column-wise minimum and value range (reduction along axis 0).
    minval = data.min(axis=0)
    ranges = data.max(axis=0) - minval
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    return (data - minval) / safeRanges