R: Text Mining, Topic Models, and Text Classification


Reposted from: http://www.biostatistic.net/thread-94975-1-1.html

#### Several R packages need to be installed first; if you already have them, you can skip this step.

# install.packages("Rwordseg")
# install.packages("tm")
# install.packages("wordcloud")
# install.packages("topicmodels")

Data used in the example

The data come from the Sogou Labs dataset.
Data URL: http://download.labs.sogou.com/dl/sogoulabdown/SogouC.mini.20061102.tar.gz
Directory structure:
└─Sample
├─C000007 Automobile
├─C000008 Finance
├─C000010 IT
├─C000013 Health
├─C000014 Sports
├─C000016 Travel
├─C000020 Education
├─C000022 Recruitment
├─C000023
└─C000024 Military
Python is used to preprocess the data into a single train.csv file, with each file's text collapsed onto one line.

Preprocessing Python script:
#!/usr/bin/env python
# encoding=utf-8
from glob import glob
import os, sys, csv

def cur_file_dir():
    path = sys.path[0]
    if os.path.isdir(path):
        return path
    elif os.path.isfile(path):
        return os.path.dirname(path)

FILE_TRAIN = cur_file_dir() + os.sep + 'train.csv'
csvfile = open(FILE_TRAIN, 'w')
writer = csv.writer(csvfile)
writer.writerow(['type', 'text'])
csvfile.flush()
try:
    file_names = glob('./Sample/*/*.txt')
    for file_name in file_names:
        # Map each category folder to a label. The repost showed the same placeholder
        # for every class; only "auto" (used later in the R code) is known from the
        # original, the other labels are illustrative.
        if file_name.find("C000007") > -1:
            file_type = "auto"        # Automobile
        elif file_name.find("C000008") > -1:
            file_type = "finance"     # Finance
        elif file_name.find("C000010") > -1:
            file_type = "it"          # IT
        elif file_name.find("C000013") > -1:
            file_type = "health"      # Health
        elif file_name.find("C000014") > -1:
            file_type = "sport"       # Sports
        elif file_name.find("C000016") > -1:
            file_type = "travel"      # Travel
        elif file_name.find("C000020") > -1:
            file_type = "education"   # Education
        elif file_name.find("C000022") > -1:
            file_type = "employ"      # Recruitment
        elif file_name.find("C000023") > -1:
            file_type = "culture"     # C000023
        elif file_name.find("C000024") > -1:
            file_type = "military"    # Military
        textFile = open(file_name, 'r', encoding='utf-8')
        lines = textFile.readlines()
        texts = " ".join(lines).strip().replace('"', "").replace("'", "") \
                               .replace("\n", "").replace("\r", "").replace(" ", "")
        writer.writerow([file_type, texts])
        csvfile.flush()
        textFile.close()
finally:
    csvfile.close()

Required data

You can also use R directly to convert the raw data into the form stored in train.csv, as in the sketch below.
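A rough sketch (not from the original post), assuming the Sample folder sits in the R working directory and the text files can be read as UTF-8 (the Sogou originals may need a different encoding such as GB2312); paths and helper names are assumptions:

# Sketch: build train.csv directly in R instead of with the Python script
files <- list.files("Sample", pattern = "\\.txt$", recursive = TRUE, full.names = TRUE)
read_one <- function(f) {
    con <- file(f, encoding = "UTF-8")   # adjust encoding if the files are GB2312
    lines <- readLines(con, warn = FALSE)
    close(con)
    # strip quotes, spaces and line breaks, as the Python script does
    txt <- gsub("[\"' \r\n]", "", paste(lines, collapse = " "))
    # the folder code (e.g. "C000007") is used as the label here; map it to a readable
    # name such as "auto" if you want to mirror the Python script exactly
    data.frame(type = basename(dirname(f)), text = txt, stringsAsFactors = FALSE)
}
train <- do.call(rbind, lapply(files, read_one))
write.csv(train, "d://wb//train.csv", row.names = FALSE)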

Stopwords file used in this article

1. Read the corpus
csv <- read.csv("d://wb//train.csv", header = T, stringsAsFactors = F)
mystopwords <- unlist(read.table("d://wb//StopWords.txt", stringsAsFactors = F))
2. Data preprocessing (Chinese word segmentation and stopword removal)
library(tm)
# Remove digits (both ASCII and full-width)
removeNumbers = function(x) { ret = gsub("[0-90123456789]", "", x) }
sample.words <- lapply(csv$text, removeNumbers)
# Chinese word segmentation, using the Rwordseg package
wordsegment <- function(x) {
    library(Rwordseg)
    segmentCN(x)
}
sample.words <- lapply(sample.words, wordsegment)
### Stopword removal
### Segment first and remove stopwords afterwards, so a global substitution does not lose information
removeStopWords = function(x, words) {
    ret = character(0)
    index <- 1
    it_max <- length(x)
    while (index <= it_max) {
        if (length(words[words == x[index]]) < 1) ret <- c(ret, x[index])
        index <- index + 1
    }
    ret
}
sample.words <- lapply(sample.words, removeStopWords, mystopwords)
3. Wordcloud visualization
# Build the corpus
corpus = Corpus(VectorSource(sample.words))
meta(corpus, "cluster") <- csv$type
unique_type <- unique(csv$type)
# Build the document-term matrix
(sample.dtm <- DocumentTermMatrix(corpus, control = list(wordLengths = c(2, Inf))))
# install.packages("wordcloud")  ## the wordcloud package is required
library(wordcloud)
# Comparison wordcloud across documents
sample.tdm <- TermDocumentMatrix(corpus, control = list(wordLengths = c(2, Inf)))
tdm_matrix <- as.matrix(sample.tdm)
png(paste("d://wb//sample_comparison", ".png", sep = ""), width = 1500, height = 1500)
comparison.cloud(tdm_matrix, colors = rainbow(ncol(tdm_matrix)))  #### colors adjusted slightly because of palette limitations
title(main = "sample comparison")
dev.off()
# Comparison wordcloud aggregated by category
n <- nrow(csv)
zz1 = 1:n
cluster_matrix <- sapply(unique_type, function(type) { apply(tdm_matrix[, zz1[csv$type == type]], 1, sum) })
png(paste("d://wb//sample_cluster_comparison", ".png", sep = ""), width = 800, height = 800)
comparison.cloud(cluster_matrix, colors = brewer.pal(ncol(cluster_matrix), "Paired"))  ## palette changed slightly because the default has too few colors
title(main = "sample cluster comparison")
dev.off()
# Wordcloud for each category
sample.cloud <- function(cluster, maxwords = 100) {
    words <- sample.words[which(csv$type == cluster)]
    allwords <- unlist(words)
    wordsfreq <- sort(table(allwords), decreasing = T)
    wordsname <- names(wordsfreq)
    png(paste("d://wb//sample_", cluster, ".png", sep = ""), width = 600, height = 600)
    wordcloud(wordsname, wordsfreq, scale = c(6, 1.5), min.freq = 2, max.words = maxwords, colors = rainbow(100))
    title(main = paste("cluster:", cluster))
    dev.off()
}
lapply(unique_type, sample.cloud)  # unique(csv$type)
4. Topic model analysis
library(slam)
summary(col_sums(sample.dtm))
# Filter terms by mean tf-idf
term_tfidf <- tapply(sample.dtm$v / row_sums(sample.dtm)[sample.dtm$i], sample.dtm$j, mean) *
    log2(nDocs(sample.dtm) / col_sums(sample.dtm > 0))
summary(term_tfidf)
sample.dtm <- sample.dtm[, term_tfidf >= 0.1]
sample.dtm <- sample.dtm[row_sums(sample.dtm) > 0, ]
## The estimated alpha is far below the default, indicating the Dirichlet distribution is concentrated on part of the data: each document covers only some of the topics
library(topicmodels)
k <- 30
SEED <- 2010
sample_TM <- list(
    VEM = LDA(sample.dtm, k = k, control = list(seed = SEED)),
    VEM_fixed = LDA(sample.dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)),
    Gibbs = LDA(sample.dtm, k = k, method = "Gibbs",
                control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)),
    CTM = CTM(sample.dtm, k = k,
              control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3)))
)
sapply(sample_TM[1:2], slot, "alpha")
sapply(sample_TM, function(x) mean(apply(posterior(x)$topics, 1, function(z) -sum(z * log(z)))))
The estimated alpha is far below the default value, indicating that the Dirichlet distribution is concentrated on part of the data: each document covers only some of the topics.
The second line computes the mean entropy of the per-document topic distributions; higher values indicate a more even spread over topics.
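A tiny illustration (not part of the original analysis) of how this entropy behaves: a distribution concentrated on one topic gives a low value, while a uniform distribution over topics gives the maximum, log(k).

# Sketch: entropy of a concentrated vs. a uniform topic distribution
entropy <- function(z) -sum(z * log(z))      # same formula as used above
entropy(c(0.97, 0.01, 0.01, 0.01))           # concentrated on one topic -> about 0.17
entropy(rep(1/4, 4))                         # uniform over 4 topics -> log(4), about 1.39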
# The most likely topic for each document
Topic <- topics(sample_TM[["VEM"]], 1)
table(Topic)
# The top 5 terms of each topic
Terms <- terms(sample_TM[["VEM"]], 5)
Terms[, 1:10]
######### Topics assigned to each article in the "auto" category
(topics_auto <- topics(sample_TM[["VEM"]])[grep("auto", csv[[1]])])
most_frequent_auto <- which.max(tabulate(topics_auto))
######### The 10 terms most associated with the dominant "auto" topic
terms(sample_TM[["VEM"]], 10)[, most_frequent_auto]
5. Text clustering: unsupervised methods (hierarchical clustering, k-means, string kernels)
sample_matrix = as.matrix(sample.dtm)
rownames(sample_matrix) <- csv$type
# k-means clustering
sample_KMeans <- kmeans(sample_matrix, k)
library(clue)
# Maximum rate of agreement between the clustering and the true classes
cl_agreement(sample_KMeans, as.cl_partition(csv$type), "diag")
# String kernels
library("kernlab")
stringkern <- stringdot(type = "string")
stringC1 <- specc(corpus, 10, kernel = stringkern)
# Inspect the clustering against the true classes
table("String Kernel" = stringC1, cluster = csv$type)
6. Text classification: supervised methods (kNN and SVM)
Randomly take 90% of the data as the training set and the remaining 10% as the test set. In practice cross-validation should be used; for simplicity only one random split is drawn here (a minimal cross-validation sketch is given at the end of this section).
n <- nrow(csv)
set.seed(100)
zz1 <- 1:n
k <- length(unique(csv$type))  # 10 classes, so zz2 < 10 keeps roughly 90% for training
zz2 <- rep(1:k, ceiling(n/k))[1:n]
zz2 <- sample(zz2, n)
train <- sample_matrix[zz2 < 10, ]
test <- sample_matrix[zz2 == 10, ]
trainC1 <- as.factor(rownames(train))
# kNN classification
library(class)
sample_knnCl <- knn(train, test, trainC1)
trueC1 <- as.factor(rownames(test))
# Inspect the predictions
(nnTable <- table("1-NN" = sample_knnCl, sample = trueC1))
sum(diag(nnTable)) / nrow(test)  # with such a small sample the accuracy is poor
# SVM classification
rownames(train) <- NULL
train <- as.data.frame(train)
train$type <- trainC1
sample_ksvm <- ksvm(type ~ ., data = train)
svmCl <- predict(sample_ksvm, test)
(svmTable <- table(SVM = svmCl, sample = trueC1))
sum(diag(svmTable)) / nrow(test)

The Rwordseg Chinese word segmentation package is somewhat sensitive to the version of R you are running; see the following link for details:
http://www.biostatistic.net/foru ... hread&tid=94955
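As noted above, a single random split is a weak evaluation. A minimal cross-validation sketch for the kNN classifier, not part of the original post, assuming k was set to the number of classes so that zz2 holds fold labels 1 to 10:

# Sketch: 10-fold cross-validation for kNN, reusing zz2 as fold ids
library(class)
cv_acc <- sapply(1:10, function(fold) {
    train_m <- sample_matrix[zz2 != fold, ]
    test_m  <- sample_matrix[zz2 == fold, ]
    pred <- knn(train_m, test_m, as.factor(rownames(train_m)))
    mean(as.character(pred) == rownames(test_m))
})
mean(cv_acc)   # average accuracy over the 10 folds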
