如何做情感分析,以京东评论为例(jieba+sklearn)

来源:互联网 发布:手机淘宝店铺分类链接 编辑:程序博客网 时间:2024/06/07 10:10

1、引言

作为我的自然语言处理的第一篇博客,就简单地给大家看看用 jieba 分词、提取特征,再利用机器学习算法做情感分析的过程。照例我不多做说明,请看代码,代码里有注释。

2、介绍

我的文本来自上一篇博客爬取的京东评论,都存放在我的 MySQL 库里,所以这里我只要从 MySQL 中提取我想要的数据就行。

3、代码

"""Sentiment analysis on JD.com product reviews.

Pipeline: load labelled review text from MySQL, tokenize with jieba,
build bag-of-words counts, select the top-k features by chi-squared
score, re-weight with TF-IDF, then train and evaluate a Multinomial
Naive Bayes classifier and a linear SVM.
"""
import jieba
import pymysql
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# --- Load (review_text, label) rows from MySQL ------------------------------
# NOTE(review): credentials are hard-coded; move them to config/env vars.
db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
cursor = db.cursor()
cursor.execute('set names utf8')
cursor.execute('SELECT * FROM newjd LIMIT 10000')
results = cursor.fetchall()
db.commit()
cursor.close()
db.close()

# Words dropped before feature extraction.
# (The original built this table but never applied it — now used below.)
stop_words = {'的', '包括', '等', '是', ' ', ',', '。'}

# Tokenize each review with jieba, filter stop words, and collect labels.
# row[0] is the review text, row[1] its sentiment label (per the schema
# used by the original code — confirm against the crawler that fills newjd).
documents = []  # space-joined token strings, one per review
labels = []     # sentiment label per review
for row in results:
    tokens = [w for w in jieba.cut(row[0], cut_all=False) if w not in stop_words]
    documents.append(' '.join(tokens))
    labels.append(row[1])

# Train/test split.  The original sliced at a fixed index of 300000,
# which exceeds the `LIMIT 10000` query above, so the test set was empty
# and prediction crashed — use a proportional 80/20 split instead.
split = int(len(documents) * 0.8)
train_docs, test_docs = documents[:split], documents[split:]
train_labels, test_labels = labels[:split], labels[split:]

# Bag-of-words counts, fitted on the training set only; the same fitted
# vectorizer transforms the test set (the original's extra
# `testvectorizer` was redundant and has been removed).
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(train_docs)
x_test = vectorizer.transform(test_docs)

# Keep the features with the highest chi-squared score w.r.t. the labels
# (capped at the vocabulary size so small corpora don't raise).
selector = SelectKBest(chi2, k=min(1000, x_train.shape[1]))
x_train = selector.fit_transform(x_train, train_labels)
x_test = selector.transform(x_test)
print(x_train.shape)

# Re-weight counts as TF-IDF.  The original referenced an undefined
# `traintfidf` transformer (NameError); create and fit it here.
tfidf = TfidfTransformer()
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)
print(x_train_tfidf)  # inspect structure: tfidf[i][j] is the weight of term j in doc i


def calculate_result(actual, pred):
    """Print precision, recall and F1 for `pred` against `actual`.

    The original computed F1 against the global `testlabel` instead of
    the `actual` parameter — fixed to use the parameter consistently.
    """
    print('predict info:')
    print(metrics.precision_score(actual, pred))
    print(metrics.recall_score(actual, pred))
    print(metrics.f1_score(actual, pred))


# --- Multinomial Naive Bayes -------------------------------------------------
clf = MultinomialNB(alpha=0.01)
clf.fit(x_train_tfidf, np.array(train_labels))
pred = clf.predict(x_test_tfidf)
print(pred)
calculate_result(test_labels, pred)

# --- Linear SVM --------------------------------------------------------------
print('*************************\nSVM\n*************************')
svclf = SVC(kernel='linear')  # default kernel would be 'rbf'
svclf.fit(x_train_tfidf, train_labels)
pred = svclf.predict(x_test_tfidf)
calculate_result(test_labels, pred)
原创粉丝点击