R软件中jiebaR包分词和用python中jieba分词以及做关键字提取及LDA模型
来源:互联网 发布:淘宝助理怎么编辑宝贝 编辑:程序博客网 时间:2024/04/29 21:28
对于一个软件来讲,若是开源其发展速度是很快的,在R软件中,去年年底就发布了jiebaR分词包,记得上学的那会jieba包首先是出现在python中,没想到在R软件中也用得到,前几天接了点私活,用这个包帮他做点东西出来,没想到,做到最后不愿意给钱,无良奸商。。。不过也正好熟悉了一下R中的jiebaR分词包,总体来讲这个包还是蛮强大的,中文分词很准确,能提取关键字,能快速的上手,直接上代码对比python中jieba包,看看吧:
# Load jiebaR (Chinese word segmentation); jiebaRD supplies its dictionaries.
library(jiebaRD)
library(jiebaR)

# Loading jiebaR does not start any segmentation engine; starting one is a
# single assignment. The "<=" operator segments the right-hand text.
x <- "众筹项目成功了,众筹绑卡成功了,一切都很顺利"
cutter <- worker()
cutter <= x

show_dictpath()  # show the dictionary path

# Edit the user dictionary to add your own words; a new engine must be
# created (cutter <- worker()) before the additions take effect.
edit_dict()
# Segmenting a file path makes jiebaR segment the whole file.
cutter <= "D:\\Users\\xuguoxuan694\\Desktop\\新建文本文档.txt"
show_dictpath()                     # show the default dictionary path
segment(code = x, jiebar = cutter)  # same functionality as cutter <= x
cutter <- worker(type = "tag")      # POS-tagging engine

# Install-on-demand loading of the packages used below; if installation
# also fails, ask the user to reinstall manually.
tryCatch(library(jiebaR), error = function(e) { install.packages("jiebaR") },
         finally = { tryCatch(library(jiebaR),
                              error = function(e) { print("请重新安装jiebaR包") }) })
tryCatch(library(ggplot2), error = function(e) { install.packages("ggplot2") },
         finally = { tryCatch(library(ggplot2),
                              error = function(e) { print("请重新安装ggplot2包") }) })
tryCatch(library(wordcloud), error = function(e) { install.packages("wordcloud") },
         finally = { tryCatch(library(wordcloud),
                              error = function(e) { print("请重新安装wordcloud包") }) })

# Quality-complaint records; column QUALITYDESC holds the free-text description.
result <- read.csv("C:\\Users\\Administrator\\Desktop\\质量记录.csv")
head(result)
# edit_dict()
cutter <- worker()
x <- "众筹项目成功了,众筹绑卡成功了,一切都很顺利"
cutter <= x

# Strip ASCII letters/digits and boilerplate lead-ins ("customer reports" etc.)
# from the complaint text before segmentation.
result$QUALITYDESC_d <- sapply(result$QUALITYDESC,
                               function(x) gsub("[a-zA-Z0-9]", "", x))
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d,
                               function(x) gsub("客户反应", "", x))
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d,
                               function(x) gsub("客户", "", x))
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d,
                               function(x) gsub("反映", "", x))

# Normalize synonymous phrasings so variants of the same defect
# (e.g. "no air pressure" vs "weak air pressure") count as one token.
clear_stopwords <- function(x) {
  if (grepl("全身没气压", x)) {
    x <- gsub("全身没气压", "全身气压", x)
  }
  if (grepl("全身不充气", x)) {
    x <- gsub("全身不充气", "全身充气", x)
  }
  if (grepl("断了", x)) {
    x <- gsub("断了", "断裂", x)
  }
  if (grepl("响声", x)) {
    x <- gsub("响声", "异响", x)
  }
  x
}
result$QUALITYDESC_d <- sapply(result$QUALITYDESC_d, clear_stopwords)

# Segment every record and pool all tokens into one vector.
jieba_result <- c()
for (j in result$QUALITYDESC_d) {
  jieba_result <- c(jieba_result, cutter <= j)
}

# Hand-built stopword list (single characters and filler words).
stopwords1 <- c("不能", "不", "了", "有", "在", "没", "少", "一个", "都", "也",
                "时", "来", "用", "会", "上", "后", "是", "腿", "走", "无",
                "左", "大", "没有", "就", "到", "右", "坏", " 部", "不会", " 两个")
stopword2 <- c(stopwords1, "加", "一", "小", "个", "才", "去", "能", "对", "只",
               "还", "和", "需要", "过", "倒", "的", "跟", "已", "掉", "让",
               "可以", "掉", "停", "拨", "亮", "一下", "下")
stopword <- c(stopword2, "其他", "下去", "时候", "使用", "问题", "正常", "部",
              "一边", "一直", "工作", "响", "说", "好", "买", "但是", "一样",
              "不行", "时有", "夹")

# Token frequency table, minus stopwords.
jjj_result <- as.data.frame(table(jieba_result))
jj_result <- jjj_result[!jjj_result$jieba_result %in% stopword, ]

# Word cloud of the remaining tokens.
op <- par(bg = "lightyellow")
wordcloud(jj_result$jieba_result, jj_result$Freq,
          col = rainbow(length(jj_result$Freq)),
          scale = c(5, 1), min.freq = 4, max.words = Inf,
          random.order = FALSE)
par(op)

# Sort ascending by frequency so tail(last, 30) is the top 30.
last <- jj_result[order(jj_result$Freq), ]

# Bar chart: top 30 tokens by frequency.
p <- ggplot(tail(last, 30), aes(x = reorder(jieba_result, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.5) +
  geom_text(label = "", colour = "red", vjust = -1)
p + theme(axis.text.x = element_text(angle = 90, colour = "black")) +
  scale_fill_manual(values = c("green", "red")) +
  coord_flip() +
  theme(panel.background = element_rect(fill = "transparent", color = "gray")) +
  xlab("分词词汇") + ylab("出现频率") +
  ggtitle("天凰国际按摩椅质检报告频率最高前30名分析报告分析") +
  geom_text(label = (tail(last, 30))$Freq, colour = "red",
            hjust = 0, vjust = 0.5, aes(colour = factor(jieba_result)))

# Bar chart: ranks 30-60.
p1 <- ggplot(last[(nrow(last) - 60):(nrow(last) - 30), ],
             aes(x = reorder(jieba_result, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.5) +
  geom_text(label = "", colour = "red", vjust = -1)
p1 + theme(axis.text.x = element_text(angle = 90, colour = "black")) +
  scale_fill_manual(values = c("green", "red")) +
  coord_flip() +
  theme(panel.background = element_rect(fill = "transparent", color = "gray")) +
  xlab("分词词汇") + ylab("出现频率") +
  ggtitle("天凰国际按摩椅质检报告频率最高前30到60名分析报告分析") +
  geom_text(label = (last[(nrow(last) - 60):(nrow(last) - 30), ])$Freq,
            colour = "red", hjust = 0, vjust = 0.5,
            aes(colour = factor(jieba_result)))

# Bar chart: ranks 60-90.
p2 <- ggplot(last[(nrow(last) - 90):(nrow(last) - 60), ],
             aes(x = reorder(jieba_result, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.5) +
  geom_text(label = "", colour = "red", vjust = -1)
p2 + theme(axis.text.x = element_text(angle = 90, colour = "black")) +
  scale_fill_manual(values = c("green", "red")) +
  coord_flip() +
  theme(panel.background = element_rect(fill = "transparent", color = "gray")) +
  xlab("分词词汇") + ylab("出现频率") +
  ggtitle("天凰国际按摩椅质检报告频率最高前60到90名分析报告分析") +
  geom_text(label = (last[(nrow(last) - 90):(nrow(last) - 60), ])$Freq,
            colour = "red", hjust = 0, vjust = 0.5,
            aes(colour = factor(jieba_result)))
# Plain segmentation with the current engine.
cutter_words <- cutter <= "我爱北京天安门"

# Keyword extraction uses inverse document frequency (IDF). The corpus can be
# switched to a custom corpus path; usage mirrors plain segmentation.
# topn is the number of keywords to return.
cutter <- worker(type = "keywords", topn = 2)
cutter_words <- cutter <= "我爱北京天安门"
cutter_words
https://qinwf.shinyapps.io/jiebaR-shiny/ jiebaR在线分词试用
附上python中关键词提取及LDA模型 的python代码
#encoding:utf-8'''Created on 2015年10月25日@author: Administrator'''import pandas as pdimport reimport jieba import nltk import jieba.posseg as pseg from gensim import corpora, models, similaritiesdf=pd.read_csv(u'C:\\Users\\Administrator\\Desktop\\质量记录.csv',encoding='gbk')cont=df['QUALITYDESC'].map(lambda x:re.sub(ur'客户反应|客户|反映','',x)).map(lambda x:re.sub(r'[a-zA-Z0-9\.]','',x))#导入自己添加的用户词语jieba.load_userdict(u'C:\\Users\\Administrator\\Desktop\\分词.txt')nwordall = [] for t in cont: words =pseg.cut(t) nword = [''] for w in words: if((w.flag == 'n'or w.flag == 'v' or w.flag == 'a') and len(w.word)>1): nword.append(w.word) nwordall.append(nword) # 选择后的词生成字典 dictionary = corpora.Dictionary(nwordall)#用于生成字典类似与table,Counter模块中count #print dictionary.token2id # 生成语料库 corpus = [dictionary.doc2bow(text) for text in nwordall] #tfidf加权 tfidf = models.TfidfModel(corpus) # print tfidf.dfsx # print tfidf.idf corpus_tfidf = tfidf[corpus] # 4. 主题模型lda,可用于降维 #lda流式数据建模计算,每块10000条记录,提取50个主题 lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=50, update_every=1, chunksize=10000, passes=1) #提取前面20个主题for i in range(0,20): print lda.print_topics(i)[0] #lda全部数据建模,提取100个主题 #lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=100, update_every=0, passes=20) #利用原模型预测新文本主题 #doc_lda = lda[corpus_tfidf] #5. word2vec 词向量化,可用于比较词相似度,寻找对应关系,词聚类 #sentences = models.word2vec.LineSentence(nwordall) #size为词向量维度数,windows窗口范围,min_count频数小于5的词忽略,workers是线程数 model = models.word2vec.Word2Vec(nwordall, size=100, window=5, min_count=5, workers=4)print model[u'指示灯'] #向量表示 sim = model.most_similar(positive=[u'指示灯', u'灯不亮']) #相近词 for s in sim: print "word:%s,similar:%s " %(s[0],s[1])
0 0
- R软件中jiebaR包分词和用python中jieba分词以及做关键字提取及LDA模型
- python 中jieba分词
- R语言中文分词包jiebaR
- python中jieba分词快速入门
- python中jieba分词快速入门
- python中jieba分词的简单应用
- R语言中文分词jiebaR
- Python安装jieba包,进行分词
- jieba做中文分词
- python jieba分词学习
- python中文分词---jieba
- python 分词工具 jieba
- python 分词工具 jieba
- Python--jieba分词
- python jieba分词
- Python-jieba分词
- python-jieba分词的安装和使用
- Jieba分词包解析系列
- 设计模式六大原则(1):单一职责原则
- linux下单引号和双引号区别
- Unsupported major.minor version 51.0Unsupported major.minor version 51.0
- JS不弹出网页文件下载本地
- 设计模式六大原则(2):里氏替换原则
- R软件中jiebaR包分词和用python中jieba分词以及做关键字提取及LDA模型
- 用ATL开发复合控件
- iOS 3D Touch (UIApplicationShortcutItem、UIViewControllerPreviewing、UIPreviewAction)
- oracle运行失败的类型
- 统计文件字数,并按出现次数打印结果
- 大数网:即将召开的华为首届开发者大会说什么?
- 设计模式六大原则(3):依赖倒置原则
- 飞机订票客户端-分段截断异常-处理
- Spring3.0核心组件的源码简单分析