Python机器学习——利用sklearn进行情感分析

来源:互联网 发布:淘宝信鸽赛中二手 编辑:程序博客网 时间:2024/05/21 14:55
"""Dictionary-based sentiment scoring for Chinese text.

Pipeline: segment a sentence with jieba, drop stop words, locate sentiment /
negation / degree words by token position, then combine them into one score.
"""
import os
from collections import defaultdict  # retained from the original import list
from functools import lru_cache

# jieba and scikit-learn are imported inside the functions that use them so
# the pure-Python helpers (readfile, pos, score) stay importable without them.

# Resource locations used by the script (paths are relative to the CWD unless
# absolute).
STOPWORDS_FILE = "C:/Users/yyq/Desktop/毕业论文/停用词表.txt"
SENTIMENT_FILE = "情感语料库"
NEGATION_FILE = "否定词表"
DEGREE_FILE = "程度副词"
POSITIVE_DIR = "积极情感文件"


def readfile(filename):
    """Return the non-blank lines of a text file, each stripped of whitespace.

    The file is decoded as UTF-8; a leading byte-order mark, if present, is
    dropped so it does not become part of the first entry.
    """
    with open(filename, 'r', encoding='utf-8-sig') as fh:
        stripped = (line.strip() for line in fh)
        return [line for line in stripped if line]


@lru_cache(maxsize=None)
def _load_stopwords(path):
    """Read the stop-word list once per path and return it as a frozenset."""
    return frozenset(readfile(path))


def cut2wd(sentence, stopwords_file=STOPWORDS_FILE):
    """Segment *sentence* with jieba and drop stop words.

    Args:
        sentence: Text to segment.
        stopwords_file: Path of a file with one stop word per line.

    Returns:
        The list of remaining tokens, in sentence order.
    """
    import jieba

    stopwds = _load_stopwords(stopwords_file)
    return [w for w in jieba.cut(sentence) if w not in stopwds]


def Count(words):
    """Count how often each word occurs in *words*.

    Every element of *words* is treated as one document and tokenised by
    ``CountVectorizer`` (pattern ``\\b\\w+\\b``, so single-character words are
    kept); the per-document counts are then summed per vocabulary entry.

    Args:
        words: Iterable of word strings.

    Returns:
        ``{word: frequency}``; an empty dict for empty input.

    Raises:
        ValueError: Raised by scikit-learn when no element yields a token.
    """
    corpus = list(words)
    if not corpus:
        return {}

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    vectorizer = CountVectorizer(token_pattern="\\b\\w+\\b")
    # With norm=None and use_idf=False the transformer yields plain term counts.
    transformer = TfidfTransformer(norm=None, use_idf=False)
    tf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on releases that predate get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    # Sum over documents (rows); the original overwrote each word's entry with
    # the value from the last row only.
    totals = tf.toarray().sum(axis=0)
    return {str(word): int(total) for word, total in zip(vocabulary, totals)}


def _parse_pairs(lines, sep):
    """Build ``{field0: field1}`` from *lines* split on *sep*; skip short lines."""
    pairs = {}
    for line in lines:
        fields = line.split(sep)
        if len(fields) >= 2:
            pairs[fields[0]] = fields[1]
    return pairs


@lru_cache(maxsize=None)
def _load_lexicons(senfile, notfile, degreefile):
    """Load the three lexicons once per path triple.

    Returns:
        ``(sendict, notset, degreedict)``: word -> score string, a frozenset of
        negation words, and word -> multiplier string.
    """
    sendict = _parse_pairs(readfile(senfile), ' ')
    notset = frozenset(readfile(notfile))
    degreedict = _parse_pairs(readfile(degreefile), ',')
    return sendict, notset, degreedict


def pos(wddict, senfile=SENTIMENT_FILE, notfile=NEGATION_FILE,
        degreefile=DEGREE_FILE):
    """Classify the words of *wddict* and index them by their mapped value.

    ``score`` looks the returned keys up by token index, so *wddict* should map
    each word to its position in the segmented sentence.

    A word found in several lexicons is classified with this priority:
    degree adverb, then negation word, then sentiment word.

    Args:
        wddict: Mapping ``word -> position``.
        senfile: Sentiment lexicon, lines of ``word<space>score``.
        notfile: Negation lexicon, one word per line.
        degreefile: Degree-adverb lexicon, lines of ``word,multiplier``.

    Returns:
        ``(senwd, notwd, degreewd)``: position -> score string,
        position -> -1, and position -> multiplier string.
    """
    sendict, notset, degreedict = _load_lexicons(senfile, notfile, degreefile)

    senwd, notwd, degreewd = {}, {}, {}
    for word, loc in wddict.items():
        if word in degreedict:
            degreewd[loc] = degreedict[word]
        elif word in notset:
            notwd[loc] = -1
        elif word in sendict:
            senwd[loc] = sendict[word]
    return senwd, notwd, degreewd


def score(senwd, notwd, degreewd, cutrst):
    """Combine located sentiment, negation and degree words into one score.

    Sentiment words are visited in ascending position. Each adds
    ``weight * value`` to the total; afterwards the negation and degree words
    lying between it and the next sentiment word update ``weight`` (negation
    flips the sign, a degree adverb multiplies).

    NOTE: ``weight`` is never reset, so a modifier keeps affecting every later
    sentiment word, and modifiers before the first sentiment word are not
    scanned. Both behaviours are kept from the original algorithm.

    Args:
        senwd: Mapping position -> sentiment value (anything ``float`` accepts).
        notwd: Mapping whose keys are the positions of negation words.
        degreewd: Mapping position -> degree multiplier.
        cutrst: The segmented sentence; only its length is used.

    Returns:
        The accumulated score (``0`` when no sentiment word is present).
    """
    # Scanning indices in order yields ascending positions regardless of the
    # insertion order of senwd.
    positions = [i for i in range(len(cutrst)) if i in senwd]

    total = 0
    weight = 1
    for k, loc in enumerate(positions):
        total += weight * float(senwd[loc])
        if k + 1 < len(positions):
            for j in range(loc, positions[k + 1]):
                if j in notwd:
                    weight *= -1
                elif j in degreewd:
                    weight *= float(degreewd[j])
    return total


def _word_positions(words):
    """Map each word to its index in *words* (last index for repeated words)."""
    return {word: index for index, word in enumerate(words)}


def _analyze(text):
    """Segment *text* and return its sentiment score."""
    cut = cut2wd(text)
    # pos/score work on token positions, so feed them a position map rather
    # than the frequency map produced by Count.
    senwd, notwd, degreewd = pos(_word_positions(cut))
    return score(senwd, notwd, degreewd, cut)


def main():
    """Run the word-count demo, score one sentence, then score a directory."""
    # Word-frequency demo.
    print(Count(["我", "爱", "天安门", "爱", "明月"]))

    # Single-sentence demo.
    str1 = "这样的工作很好"
    print(_analyze(str1))

    # Batch test: score every file of the positive-sentiment directory.
    for thisfile in os.listdir(POSITIVE_DIR):
        path = os.path.join(POSITIVE_DIR, thisfile)
        with open(path, "r", encoding="gbk") as fh:
            text = fh.read()
        print(_analyze(text))


if __name__ == "__main__":
    main()
阅读全文
0 0
原创粉丝点击