文本分类学习笔记(5)- KNN

来源:互联网 发布:游戏充值用什么软件 编辑:程序博客网 时间:2024/06/10 16:23

KNN分类器实现,运行极慢,不推荐使用。

#coding=utf-8from numpy import *from scipy import sparse,iofrom sklearn.datasets import load_filesfrom sklearn.cross_validation import train_test_splitfrom sklearn.feature_extraction.text import CountVectorizerfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.feature_selection import SelectKBest, chi2from sklearn.naive_bayes import MultinomialNBfrom sklearn.linear_model import LogisticRegressionfrom sklearn import neighborsfrom sklearn import metricsfrom operator import itemgetterimport warningswarnings.filterwarnings("ignore")def calculate_result(actual,pred):    m_precision = metrics.precision_score(actual,pred)    m_recall = metrics.recall_score(actual,pred)    m_acc = metrics.accuracy_score(actual,pred)    print 'predict info:'    print 'accuracy:{0:.3f}'.format(m_acc)    print 'precision:{0:.3f}'.format(m_precision)    print 'recall:{0:0.3f}'.format(m_recall)    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual,pred))def text_classsfier(train_dir,test_dir):    #load datasets    doc_train = load_files(train_dir)    doc_test = load_files(test_dir)    #切分数据集,由于有单独的测试集,故省略    #doc_terms_train, doc_terms_test, doc_class_train, doc_class_test = train_test_split(doc_train.data, doc_train.target, test_size = 0.2)    #调用Vectorizer提取文本特征    #Bool型特征(one-hot)    #count_vec = CountVectorizer(binary = True,decode_error='replace')    #TF-IDF特征(词频)    count_vec =  TfidfVectorizer(min_df=1,decode_error='replace')       doc_train_bool = count_vec.fit_transform(doc_train.data)    doc_test_bool = count_vec.transform(doc_test.data)    #调用KNN分类器预测分类    predicted = []    test = doc_test_bool.toarray()    for i in doc_test.target:        print i,    print    for i in xrange(shape(test)[0]):        x = classify(test[i], doc_train_bool.toarray(), doc_train.target,10)        print x,        predicted.append(x)    #计算分类准确度信息    calculate_result(doc_test.target,predicted)    #保存分类结果    file_o = open('result_knn.txt', 'w')    
file_o.write(str(predicted))#KNN分类器,使用欧式距离度量,未使用kd树def classify(inX, dataSet, labels, k):    dataSetSize = dataSet.shape[0]    diffMat = tile(inX, (dataSetSize,1)) - dataSet#将数组A作为元素构造m行n列的数组    #以矩阵为单位计算距离    sqDiffMat = diffMat**2    sqDistances = sqDiffMat.sum(axis=1)#(axis=1)按行累加    distances = sqDistances**0.5    sortedDistIndicies = distances.argsort()#每个元素的排序序号    classCount = {}#sortedDistIndicies[0]表示排序后排在第一个的那个数在原来数组中的下标    for i in range(k):        voteIlabel = labels[sortedDistIndicies[i]]        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1#获取key对应的value,没有key返回0    sortedClassCount = sorted(classCount.iteritems(), key=itemgetter(1), reverse=True)#按照value逆向排序    return sortedClassCount[0][0]if __name__ == '__main__':    text_classsfier('training','test')

运行结果:

accuracy:0.811
precision:0.816
recall:0.811
f1-score:0.808
0 0