朴素贝叶斯分类

来源:互联网 发布:软件项目招标网 编辑:程序博客网 时间:2024/06/05 20:49
数据地址:https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews
# -*- coding: utf-8 -*-
# Naive Bayes sentiment classification of movie-review phrases.
#
# Pipeline: read the zipped TSV, split it into a training and a test slice,
# turn the phrases into TF-IDF vectors, fit a multinomial naive Bayes model
# and report every misclassified phrase plus the overall error rate/accuracy.
import sys
import zipfile

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes

try:
    from sklearn.utils import Bunch
except ImportError:
    # Older scikit-learn releases only expose Bunch at this location.
    from sklearn.datasets.base import Bunch

if sys.version_info[0] == 2:
    # Legacy Python 2 workaround kept for backward compatibility: makes
    # implicit str/unicode conversions use UTF-8 instead of ASCII, which the
    # non-ASCII report strings below rely on. Not needed on Python 3.
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf-8')

# Read the first member of the archive as a tab-separated table whose first
# row is the header. The archive is closed as soon as the frame is loaded.
with zipfile.ZipFile('train.tsv.zip') as z:
    df = pd.read_csv(z.open(z.namelist()[0]), header=0, delimiter='\t')

# `.values` replaces the removed `Series.as_matrix()`.
x = df['Phrase'].values
y = df['Sentiment'].values

# First 100000 rows train the model, last 50000 rows evaluate it.
# NOTE(review): the two slices are only disjoint when the file holds at least
# 150000 rows - confirm against the dataset actually used.
train_x = x[:100000]
train_y = y[:100000]
test_x = x[-50000:]
test_y = y[-50000:]

# tdm: TF-IDF term-document matrix, label: class labels,
# vocabulary: term -> column index mapping.
trainSpace = Bunch(tdm=[], label=[], vocabulary={})
testSpace = Bunch(tdm=[], label=[], vocabulary={})

vect_train = TfidfVectorizer(stop_words='english', sublinear_tf=True, max_df=0.25)
trainSpace.tdm = vect_train.fit_transform(train_x)
trainSpace.label = train_y
trainSpace.vocabulary = vect_train.vocabulary_

# Project the test phrases with the vectorizer fitted on the training data.
# Fitting a second vectorizer on the test phrases (as the script used to do)
# would learn IDF weights from the test set, so the test features would not be
# on the scale the classifier was trained with.
testSpace.tdm = vect_train.transform(test_x)
testSpace.label = test_y
testSpace.vocabulary = trainSpace.vocabulary

clf = MultinomialNB(alpha=0.00001).fit(trainSpace.tdm, trainSpace.label)
predicted = clf.predict(testSpace.tdm)

total = len(test_x)
rate = 0  # number of misclassified test phrases
for label, text, pre in zip(testSpace.label, test_x, predicted):
    if label != pre:
        rate += 1
        # Single pre-formatted argument so the call prints the same text on
        # Python 2 and Python 3.
        print("%s  实际类别: %s  预测类别: %s" % (text, label, pre))

print("错误率: %s %%" % (float(rate) * 100 / float(total)))
print(clf.score(testSpace.tdm, test_y))

原创粉丝点击