Word segmentation with the jiebaR package in R and jieba in Python, plus keyword extraction and an LDA model


When software is open source, it tends to develop quickly. The jiebaR segmentation package for R was released at the end of last year; as I remember from my student days, the jieba package first appeared in Python, and it is good to see it available in R as well. A few days ago I took on a small freelance job and used this package to build something for a client (who, as it turned out, refused to pay in the end), which at least gave me a chance to get familiar with jiebaR. Overall the package is quite capable: Chinese segmentation is accurate, it can extract keywords, and it is quick to pick up. Below is the code, with Python's jieba package shown alongside for comparison:


library(jiebaRD)
library(jiebaR)
# Loading jiebaR does not start any segmentation engine;
# starting one is just a single assignment
x <- "众筹项目成功了,众筹绑卡成功了,一切都很顺利"
cutter = worker()
cutter <= x
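For comparison, the same basic segmentation in Python's jieba is a one-liner. A minimal sketch (assumes jieba is installed; the sentence is the example used above):

# Python counterpart of worker() / cutter <= x: default-mode segmentation
import jieba

x = u"众筹项目成功了,众筹绑卡成功了,一切都很顺利"
print(jieba.lcut(x))                  # precise mode, returns a list of tokens
print(jieba.lcut(x, cut_all=True))    # full mode, returns every possible word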
show_dictpath()   # show the dictionary path
edit_dict()       # edit the user dictionary; add your own words, then re-run cutter = worker() for them to take effect
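The Python-side counterpart to edit_dict() is loading a user dictionary file, or adding words at runtime. A sketch (the dictionary file name echoes the one used in the Python script further below):

# Python counterpart of edit_dict(): extend jieba's vocabulary
import jieba

jieba.load_userdict(u"分词.txt")   # one word per line, with optional frequency and POS tag
jieba.add_word(u"绑卡")            # or add individual words programmatically
print(jieba.lcut(u"众筹绑卡成功了"))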
cutter <= "D:\\Users\\xuguoxuan694\\Desktop\\新建文本文档.txt"show_dictpath() ### 可以显示默认词典路径segment(code= x , jiebar = cutter) ##一样的功能cutter=worker(type="tag")<pre name="code" class="plain">tryCatch(library(jiebaR),error=function(e){install.packages("jiebaR")},finally={  tryCatch(library(jiebaR)  ,error=function(e){print("请重新安装jiebaR包")})})tryCatch(library(ggplot2),error=function(e){install.packages("ggplot2")},finally={  tryCatch(library(ggplot2)  ,error=function(e){print("请重新安装ggplot2包")})})tryCatch(library(wordcloud),error=function(e){install.packages("wordcloud")},finally={  tryCatch(library(wordcloud)  ,error=function(e){print("请重新安装wordcloud包")})})result<-read.csv("C:\\Users\\Administrator\\Desktop\\质量记录.csv")head(result)# edit_dict()cutter=worker()x<-"众筹项目成功了,众筹绑卡成功了,一切都很顺利"  cutter<=x result$QUALITYDESC_d<-sapply(result$QUALITYDESC,function(x)gsub("[a-zA-Z0-9]","",x))result$QUALITYDESC_d<-sapply(result$QUALITYDESC_d,function(x)gsub("客户反应","",x))result$QUALITYDESC_d<-sapply(result$QUALITYDESC_d,function(x)gsub("客户","",x))result$QUALITYDESC_d<-sapply(result$QUALITYDESC_d,function(x)gsub("反映","",x))#把全身没气压、全身气压弱、全身气压很小转化为全身气压问题clear_stopwords<-function(x){   if(grepl("全身没气压",x)){    x<-gsub("全身没气压","全身气压",x)   }    if(grepl("全身不充气",x)){   x<-gsub("全身不充气","全身充气",x)   }    if(grepl("断了",x)){   x<-gsub("断了","断裂",x)   }   if(grepl("响声",x)){   x<-gsub("响声","异响",x)}  x}result$QUALITYDESC_d<-sapply(result$QUALITYDESC_d,clear_stopwords)jieba_result<-c()for(j in result$QUALITYDESC_d){  jieba_result<-c(jieba_result,cutter<=j)}stopwords1<-c("不能","不","了","有","在","没","少","一个","都","也","时","来","用","会","上","后","是","腿","走","无","左","大","没有","就","到","右","坏"," 部","不会"," 两个")stopword2<-c(stopwords1,"加","一","小","个","才","去","能","对","只","还","和","需要","过","倒","的","跟","已","掉","让","可以","掉","停","拨","亮","一下","下")stopword<-c(stopword2,"其他","下去","时候","使用","问题","正常","部","一边","一直","工作","响","说","好","买","但是","一样","不行","时有","夹")jjj_result<-as.data.frame(table(jieba_result))jj_result<-jjj_result[!jjj_result$jieba_result %in% stopword, ]op<-par(bg = "lightyellow")  wordcloud(jj_result$jieba_result,jj_result$Freq,col = rainbow(length(jj_result$Freq)),scale=c(5,1),min.freq=4,max.words=Inf,random.order=FALSE)  par(op)  last<-jj_result[order(jj_result$Freq),]p<-ggplot(tail(last,30),aes(x=reorder(jieba_result,Freq),y=Freq))+geom_bar(stat="identity",fill ="blue",width=0.5)+geom_text(label="",colour = "red", vjust=-1)p+theme(axis.text.x=element_text(angle=90,colour="black"))+scale_fill_manual(values=c("green","red"))+coord_flip()+theme(panel.background = element_rect(fill = "transparent", color = "gray"))+xlab("分词词汇")+ylab("出现频率")+ggtitle("天凰国际按摩椅质检报告频率最高前30名分析报告分析")+ geom_text(label=(tail(last,30))$Freq,colour = "red", hjust=0, vjust=0.5,aes(colour=factor(jieba_result)))p1<-ggplot( last[(nrow(last)-60):(nrow(last)-30),],aes(x=reorder(jieba_result,Freq),y=Freq))+geom_bar(stat="identity",fill ="blue",width=0.5)+geom_text(label="",colour = "red", vjust=-1)p1+theme(axis.text.x=element_text(angle=90,colour="black"))+scale_fill_manual(values=c("green","red"))+coord_flip()+theme(panel.background = element_rect(fill = "transparent", color = "gray"))+  xlab("分词词汇")+ylab("出现频率")+ggtitle("天凰国际按摩椅质检报告频率最高前30到60名分析报告分析")+ geom_text(label=(last[(nrow(last)-60):(nrow(last)-30),])$Freq,colour = "red", hjust=0, vjust=0.5,aes(colour=factor(jieba_result)))p2<-ggplot( last[(nrow(last)-90):(nrow(last)-60),],aes(x=reorder(jieba_result,Freq),y=Freq))+geom_bar(stat="identity",fill ="blue",width=0.5)+geom_text(label="",colour = "red", 
vjust=-1)p2+theme(axis.text.x=element_text(angle=90,colour="black"))+scale_fill_manual(values=c("green","red"))+coord_flip()+theme(panel.background = element_rect(fill = "transparent", color = "gray"))+xlab("分词词汇")+ylab("出现频率")+ggtitle("天凰国际按摩椅质检报告频率最高前60到90名分析报告分析")+geom_text(label=(last[(nrow(last)-90):(nrow(last)-60),])$Freq,colour = "red", hjust=0, vjust=0.5,aes(colour=factor(jieba_result)))
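The frequency counting and stopword filtering that the R script does with table() and %in% can be mirrored in Python with collections.Counter. A sketch under the same assumptions as the script (the CSV path, the QUALITYDESC column name and the abbreviated stopword list are placeholders):

# Python sketch of the word-frequency / stopword step
from collections import Counter
import pandas as pd
import jieba

df = pd.read_csv(u"质量记录.csv", encoding="gbk")
stopwords = set([u"不能", u"不", u"了", u"有", u"在", u"没有"])   # abbreviated example list

tokens = []
for text in df["QUALITYDESC"].dropna():
    tokens += [w for w in jieba.lcut(text) if w not in stopwords and len(w) > 1]

freq = Counter(tokens)
print(freq.most_common(30))   # top 30 words, like tail(last, 30) in the R code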

cutter_words <- cutter <= "我爱北京天安门"

# Keyword extraction
# Keyword extraction is based on inverse document frequency (IDF). The reference corpus
# can be switched to a custom corpus path; usage is the same as for segmentation.
# The topn argument sets the number of keywords to return.
cutter = worker(type = "keywords", topn = 2)
cutter_words <- cutter <= "我爱北京天安门"
cutter_words
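Python's jieba exposes the same TF-IDF keyword extraction through jieba.analyse; a sketch, with topK playing the role of jiebaR's topn:

# Python counterpart of worker(type = "keywords", topn = 2)
import jieba.analyse

text = u"我爱北京天安门"
print(jieba.analyse.extract_tags(text, topK=2, withWeight=True))   # TF-IDF keywords with weights
# jieba.analyse.textrank(text, topK=2) is an alternative, TextRank-based extractor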

https://qinwf.shinyapps.io/jiebaR-shiny/    (jiebaR online segmentation demo)

Finally, here is the Python code for keyword extraction and an LDA model:

# encoding: utf-8
'''
Created on 2015-10-25
@author: Administrator
'''
import pandas as pd
import re
import jieba
import nltk
import jieba.posseg as pseg
from gensim import corpora, models, similarities

df = pd.read_csv(u'C:\\Users\\Administrator\\Desktop\\质量记录.csv', encoding='gbk')
cont = df['QUALITYDESC'].map(lambda x: re.sub(u'客户反应|客户|反映', '', x)) \
                        .map(lambda x: re.sub(r'[a-zA-Z0-9\.]', '', x))

# load the user-defined dictionary
jieba.load_userdict(u'C:\\Users\\Administrator\\Desktop\\分词.txt')

# POS-tag every record and keep nouns, verbs and adjectives longer than one character
nwordall = []
for t in cont:
    words = pseg.cut(t)
    nword = []
    for w in words:
        if (w.flag == 'n' or w.flag == 'v' or w.flag == 'a') and len(w.word) > 1:
            nword.append(w.word)
    nwordall.append(nword)

# build a dictionary from the selected words (similar to table() / Counter)
dictionary = corpora.Dictionary(nwordall)
# print(dictionary.token2id)

# build the bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in nwordall]

# TF-IDF weighting
tfidf = models.TfidfModel(corpus)
# print(tfidf.dfs)
# print(tfidf.idfs)
corpus_tfidf = tfidf[corpus]

# 4. LDA topic model (can also be used for dimensionality reduction)
# online training on streamed data, chunks of 10000 records, 50 topics
lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=50,
                               update_every=1, chunksize=10000, passes=1)
# print the first 20 topics
for i in range(0, 20):
    print(lda.print_topic(i))

# batch training on the full data, 100 topics
# lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=100, update_every=0, passes=20)
# infer topics for new documents with the trained model
# doc_lda = lda[corpus_tfidf]

# 5. word2vec word vectors: word similarity, analogies, word clustering
# sentences = models.word2vec.LineSentence(nwordall)
# size is the vector dimension, window the context window,
# min_count drops words with frequency below 5, workers is the thread count
model = models.word2vec.Word2Vec(nwordall, size=100, window=5, min_count=5, workers=4)
print(model[u'指示灯'])                                     # the word's vector
sim = model.most_similar(positive=[u'指示灯', u'灯不亮'])    # most similar words
for s in sim:
    print("word:%s,similar:%s " % (s[0], s[1]))
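Once the LDA model above has been trained, it can also score a new complaint text. A minimal sketch reusing lda, dictionary, tfidf and pseg from the script above (the example sentence is made up):

# infer the topic mixture of a new document with the trained model
new_text = u"按摩椅指示灯不亮"   # hypothetical new complaint
bow = dictionary.doc2bow([w.word for w in pseg.cut(new_text)])
for topic_id, prob in lda[tfidf[bow]]:              # (topic id, probability) pairs
    print(topic_id, prob)
    print(lda.print_topic(topic_id, topn=5))        # top words of that topic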



