Data Mining Text Classification Experiment
1. Text Classification
Text classification means automatically labeling a collection of documents (or other entities) according to a predefined classification scheme or standard. For this experiment we crawled more than 1.8 million news articles in ten categories (finance, international, IT, health, military, energy, automobile, sports, culture, entertainment) from China News Service (chinanews.com). The articles were word-segmented, filtered to keep only nouns, stripped of stopwords, and reduced in dimensionality with TF-IDF to produce the classifier input data. Naive Bayes served as the baseline, and SVM and LIBSVM classifiers were also used to classify the articles. The classification results and some additional evaluation metrics were then output.
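As a rough illustration of the classifier stage (not the code used in this post, which appears further below), a minimal Naive Bayes baseline over already-segmented text could look like the following sketch; the toy documents and labels are made up for illustration.

# Minimal sketch of a Naive Bayes baseline with scikit-learn.
# Inputs are space-separated segmented documents; the toy texts and
# labels below are illustrative only, not data from the experiment.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

texts = ["股市 上涨 经济", "球队 比赛 冠军", "战机 演习 部队"]
labels = ["finance", "sports", "military"]

clf = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())
clf.fit(texts, labels)
print(clf.predict(["球队 夺得 冠军"]))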
Building the corpus
For this experiment, the corpus is a dataset we collected with a Python crawler from China News Service (http://www.chinanews.com/). There are ten news categories: finance, international, internet (IT), health, military, energy, automobile, sports, culture, and entertainment.
2. Experiment Procedure
The processing flow of this experiment (shown as a flowchart in the original post) is: crawl the data, filter the articles, segment the text, remove stopwords, keep only nouns, compute per-category TF-IDF to reduce dimensionality, filter the segmentation results, generate the classifier input data, and train. A minimal sketch of the per-article segmentation step follows.
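The following is a small, hedged sketch (assumed, not taken from the post) of the core segmentation step: POS-tag a document with jieba, keep only words whose tag starts with 'n' (nouns), and drop stopwords. The stopwords.txt file name matches the preprocessing code further below; the sample sentence is made up.

# Sketch of: segment -> keep nouns (POS tag starts with 'n') -> drop stopwords.
# Assumes a one-word-per-line stopwords.txt, as in the preprocessing code below.
import jieba.posseg as pseg

with open('stopwords.txt', encoding='utf-8') as f:
    stopwords = {line.rstrip() for line in f}

text = "中国新闻网发布了一篇关于新能源汽车市场的财经报道"
nouns = [w.word for w in pseg.cut(text)
         if w.flag.startswith('n') and w.word not in stopwords]
print(nouns)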
Crawler code
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
import os
import codecs
import time
import datetime

# Root directory for the crawled finance ("caijing") articles;
# one sub-directory per day is created under it.
mkpath = "C:\\Users\\chenteng\\Desktop\\NEWS-PA\\newsPa\\caijing\\"


class NewsSpider(scrapy.Spider):
    name = 'news'
    allowed_domains = ['chinanews.com']
    start_urls = ['http://www.chinanews.com/']

    def parse(self, response):
        # Build the list of scroll-news pages, one per day since 2009-12-01.
        # "cj" is the finance channel; other categories use a different path segment.
        date_list = getBetweenDay("20091201")
        for day in date_list:
            year = day[0:4]
            dayday = day[4:8]
            total = "http://www.chinanews.com/scroll-news/cj/{0}/{1}/news.shtml".format(year, dayday)
            yield Request(total, meta={"day": day}, callback=self.info_1)

    def info_1(self, response):
        # Collect the article links on one day's scroll-news page.
        selector = Selector(response)
        day = response.meta["day"]
        base_dir = mkpath + day + "\\"
        mkdir(base_dir)
        txtid = 0
        print("===============base_dir==============")
        links = selector.xpath("//div[@class='dd_bt']/a/@href").extract()
        for url in links:
            txtid += 1
            filename = base_dir + str(txtid) + '.txt'
            yield Request(url, meta={"filename": filename}, callback=self.info_2)

    def info_2(self, response):
        # Extract the article body paragraphs and append them to the target file.
        selector = Selector(response)
        filename = response.meta["filename"]
        print("===============filename==============")
        paragraphs = selector.xpath("//div[@class='left_zw']/p/text()").extract()
        f = codecs.open(filename, 'a', 'utf-8')
        for i in range(len(paragraphs) - 1):
            f.write(paragraphs[i])
        f.close()


def mkdir(path):
    # Strip leading/trailing spaces and a trailing backslash,
    # then create the directory if it does not already exist.
    path = path.strip()
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        print(path + ' already exists')
        return False


def getBetweenDay(begin_date):
    # Return every date from begin_date (YYYYMMDD) up to today as strings.
    date_list = []
    begin_date = datetime.datetime.strptime(begin_date, "%Y%m%d")
    end_date = datetime.datetime.strptime(time.strftime('%Y%m%d', time.localtime(time.time())), "%Y%m%d")
    while begin_date <= end_date:
        date_list.append(begin_date.strftime("%Y%m%d"))
        begin_date += datetime.timedelta(days=1)
    return date_list
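The post does not show how the spider is launched; assuming it sits inside a standard Scrapy project, it would normally be started with `scrapy crawl news`. Alternatively, here is a hedged sketch of running it programmatically; the module name news_spider and the DOWNLOAD_DELAY value are assumptions, not from the post.

# Hedged sketch: run the spider without the Scrapy CLI.
# Assumes the spider above is saved as news_spider.py.
from scrapy.crawler import CrawlerProcess
from news_spider import NewsSpider  # hypothetical module name

process = CrawlerProcess(settings={"DOWNLOAD_DELAY": 0.5})
process.crawl(NewsSpider)
process.start()  # blocks until the crawl finishes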
Preprocessing code (every step writes an intermediate file to disk, so it is fairly slow)
# -*- coding: utf-8 -*-
import jieba
import jieba.posseg as pseg
import os, re, collections
import sys
import numpy as np
from numpy import nan as Na
import pandas as pd
from pandas import Series, DataFrame

# Raise the recursion limit for the hand-written recursive quicksort below.
sys.setrecursionlimit(999999999)

# Stopword list, one word per line.
stopwords = {}.fromkeys([line.rstrip() for line in open('stopwords.txt', 'r', encoding='utf-8')])


# Walk every txt file under filepath (recursing into sub-directories) and run
# segmentation / noun filtering / stopword removal on each article.
# filepath is the folder of category i.
def gci(filepath, i):
    files = os.listdir(filepath)
    for fi in files:
        path = os.path.join(filepath, fi)
        if os.path.isdir(path):
            gci(path, i)        # recurse (the original call was missing the category argument)
        else:
            sliptword(path, i)


# Segment one article, keep only nouns, drop stopwords.
# path is the article path, i is its category;
# oriDictPathList[i] is the (non-deduplicated) word file of category i.
def sliptword(path, i):
    print(path)
    # "news" is the folder holding the raw articles; replace it with "split"
    # so the segmentation result is written to a parallel folder tree.
    strinfo = re.compile('news')
    writepath = strinfo.sub('split', path)
    with open(path, "r", encoding='utf-8') as f:
        text = f.read()
    str = ""    # nouns of this article, one per line
    str2 = ""   # nouns of this article, space-separated, appended to the category word file
    result = pseg.cut(text)   # POS-tag the segmented words
    for w in result:
        if w.flag.startswith('n'):          # tags starting with 'n' are nouns; keep them
            if w.word not in stopwords:     # drop stopwords
                str = str + w.word + "\n"
                str2 = str2 + w.word + " "
    with open(writepath, "a") as f:
        f.write(str)
    with open(oriDictPathList[i], "a") as f:
        f.write(str2)


# Compute TF-IDF.
# oriDictPathList is the list of raw per-category word files (all words of a
# category, not deduplicated, so the dimensionality is very high).
# wordfrepath and tfidfpath are module-level lists of per-category output paths.
def tfidf(oriDictPathList):
    from sklearn.feature_extraction.text import CountVectorizer
    # One "document" per category: the concatenation of all its words.
    corpus = []
    for i in range(len(oriDictPathList)):
        print(i, "\n")
        with open(oriDictPathList[i], 'r', encoding='utf-8') as f:
            corpus.append(f.read())
        print(len(corpus[i]), "\n")
    # Turn the word lists into a term-frequency matrix.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    # All distinct words appearing across the ten categories.
    word = vectorizer.get_feature_names()
    with open("E:\\新建文件夹\\文本\\新建文件夹\\python\\DataMinning\\tfidf\\allword.txt", 'w', encoding='utf-8') as f:
        print(len(word))
        f.write('\n'.join(word))
    # Print/write full arrays instead of abbreviating them with "...".
    np.set_printoptions(threshold=np.inf)
    # Write each category's term frequencies (dimension = total vocabulary size).
    for i in range(len(wordfrepath)):
        print("i:", i)
        s = str(X.toarray()[i]).lstrip('[').rstrip(']')
        with open(wordfrepath[i], 'w', encoding='utf-8') as f:
            f.write(s)
    from sklearn.feature_extraction.text import TfidfTransformer
    transformer = TfidfTransformer()
    # Convert the term-frequency matrix X into TF-IDF weights;
    # tfidf[i][j] is the weight of word j in category i.
    tfidf = transformer.fit_transform(X)
    for i in range(len(tfidfpath)):
        print(i)
        s = str(tfidf.toarray()[i]).lstrip('[').rstrip(']')
        with open(tfidfpath[i], 'w', encoding='utf-8') as f:
            f.write(s)


# Hand-written quicksort (descending) that sorts the TF-IDF values and keeps
# the corresponding words aligned, so the top-weighted words of each category
# can be selected later.
def parttion(v1, v2, left, right):
    key1 = v1[left]
    key2 = v2[left]
    low = left
    high = right
    while low < high:
        while (low < high) and (v1[high] <= key1):
            high -= 1
        v1[low] = v1[high]
        v2[low] = v2[high]
        while (low < high) and (v1[low] >= key1):
            low += 1
        v1[high] = v1[low]
        v2[high] = v2[low]
    v1[low] = key1
    v2[low] = key2
    return low


def quicksort(v1, v2, left, right):
    if left < right:
        p = parttion(v1, v2, left, right)
        quicksort(v1, v2, left, p - 1)
        quicksort(v1, v2, p + 1, right)
    return v1, v2


# Dimensionality reduction: drop near-zero TF-IDF entries, sort the remaining
# values (and their words) in descending order, and write them to
# newtfidfpath / newdictpath.
def reducedimension(tfidfpath, allwordpath, newtfidfpath, newdictpath):
    for i in range(len(tfidfpath)):
        with open(tfidfpath[i], 'r', encoding='utf-8') as f:
            tfidftemp = f.read().split()
        with open(allwordpath, 'r', encoding='utf-8') as f:
            allwordlisttemp = f.read().split()
        tfidflist = []
        allwordlist = []
        for j in range(len(tfidftemp)):
            k = float(tfidftemp[j])
            if k > 9.99999999e-05:
                tfidflist.append(k)
                allwordlist.append(allwordlisttemp[j])
        newtfidflist, newallwordlist = quicksort(tfidflist, allwordlist, 0, len(tfidflist) - 1)
        with open(newtfidfpath[i], 'w', encoding='utf-8') as f:
            f.write(" ".join(str(e) for e in newtfidflist))
        with open(newdictpath[i], 'w', encoding='utf-8') as f:
            f.write(" ".join(newallwordlist))


# Build the merged dictionary: take the top-n (here 1000) TF-IDF words of each
# category and merge them with pandas Series/DataFrame; the DataFrame columns
# are the union of the selected words.
def createdict(newtfidfpath, newdictpath, dictpath):
    l = []
    for i in range(len(newtfidfpath)):
        with open(newtfidfpath[i], 'r', encoding='utf-8') as f:
            tfidflist = [float(e) for e in f.read().split()[0:1000]]
        with open(newdictpath[i], 'r', encoding='utf-8') as f:
            wordlist = f.read().split()[0:1000]
        l.append(Series(tfidflist, wordlist))
    dictdataframe = pd.DataFrame(l)
    pd.set_option('max_colwidth', 20000000)
    with open(dictpath, 'w', encoding='utf-8') as f:
        f.write(" ".join(dictdataframe.columns.tolist()))


# Re-filter the segmentation results against the new dictionary: words not in
# the dictionary are discarded and the result is written to the "newsplit" tree.
def createnewsplit(dictpath, splitPath):
    strinfo = re.compile('split')
    with open(dictpath, 'r', encoding='utf-8') as f:
        worddict = f.read().split()
    for i in range(len(splitPath)):
        files = os.listdir(splitPath[i])
        index1 = 1
        for fi in files:
            path = os.path.join(splitPath[i], fi)
            try:
                list1 = [line.rstrip('\n') for line in open(path, 'r', encoding='utf-8')]
            except:
                list1 = [line.rstrip('\n') for line in open(path, 'r')]
            list2 = [e for e in list1 if e in worddict]   # keep only dictionary words
            list3 = list(set(list2))                      # deduplicate
            writepath = strinfo.sub('newsplit', path)
            with open(writepath, 'w', encoding='utf-8') as f:
                f.write(" ".join(list3))
            index1 = index1 + 1


# Generate the SVM input data: for each article, count how often each
# dictionary word occurs and write lines of "doc_id feature_id count".
# The first 500 articles of each category go to train1, the next 500 to test1.
def svminputdata(dictpath, SplitPathList):
    with open(dictpath, 'r', encoding='utf-8') as f:
        dict = f.read().split()
    rindex = list(range(len(dict)))
    sd = Series(np.ones(len(dict)).tolist(), index=dict)
    trainindex = 1
    testindex = 1
    for i in range(len(SplitPathList)):
        index1 = 1
        num = i + 1   # category label (1..10)
        files = os.listdir(SplitPathList[i])
        for fi in files:
            path = os.path.join(SplitPathList[i], fi)
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    list1 = f.read().split()
            except:
                with open(path, 'r') as f:
                    list1 = f.read().split()
            list2 = [e for e in list1 if e in dict]
            s = Series(list2).value_counts()          # word -> count in this article
            s2 = Series(s.values, index=s.index)
            s3 = s2 * sd                              # align counts with the full dictionary
            s3 = Series(s3.values, index=rindex)      # switch to numeric feature ids
            s4 = s3[s3.notnull()]
            s4index = s4.index
            s4values = s4.values
            if index1 <= 500:
                str1 = ""
                for j in range(len(s4)):
                    str1 = str1 + str(trainindex) + " " + str(s4index[j]) + " " + str(int(s4values[j])) + "\n"
                with open("E:\\新建文件夹\\文本\\新建文件夹\\python\\DataMinning\\inputdata\\svm\\train1.data", 'a', encoding='utf-8') as f:
                    f.write(str1)
                with open("E:\\新建文件夹\\文本\\新建文件夹\\python\\DataMinning\\inputdata\\svm\\train1.label", 'a', encoding='utf-8') as f:
                    f.write(str(num) + "\n")
                trainindex += 1
            if index1 > 500 and index1 <= 1000:
                str1 = ""
                for j in range(len(s4)):
                    str1 = str1 + str(int(testindex)) + " " + str(s4index[j]) + " " + str(int(s4values[j])) + "\n"
                with open("E:\\新建文件夹\\文本\\新建文件夹\\python\\DataMinning\\inputdata\\svm\\test1.data", 'a', encoding='utf-8') as f:
                    f.write(str1)
                with open("E:\\新建文件夹\\文本\\新建文件夹\\python\\DataMinning\\inputdata\\svm\\test1.label", 'a', encoding='utf-8') as f:
                    f.write(str(num) + "\n")
                testindex += 1
            if index1 == 1000:
                break
            index1 += 1
        print("type trainindex testindex articleindex", num, " ", trainindex, " ", testindex, " ", index1)


# Generate the LIBSVM input data (TF-IDF weighted); the original notes that
# LIBSVM is very slow. This function is left largely as in the original post:
# the early "break" statements, the undefined trainindex and the empty output
# paths suggest it was still in a debugging / unfinished state.
def libsvminputdata(dictpath, newdictpath, newtfidfpath, newSplitPathList):
    with open(dictpath, 'r', encoding='utf-8') as f:
        dict = f.read().split()
    sd = Series(np.ones(len(dict)).tolist(), index=dict)
    print(len(dict))
    sl = []
    rindex = [float(e) for e in range(len(dict))]
    # One Series per category mapping word -> TF-IDF weight.
    for i in range(len(newdictpath)):
        with open(newdictpath[i], 'r', encoding='utf-8') as f:
            alldict = f.read().split()
        with open(newtfidfpath[i], 'r', encoding='utf-8') as f:
            alltfidf = [float(e) for e in f.read().split()]
        sl.append(Series(alltfidf, index=alldict))
    for i in range(len(newSplitPathList)):
        files = os.listdir(newSplitPathList[i])
        num = i + 1
        for fi in files:
            path = os.path.join(newSplitPathList[i], fi)
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    list1 = f.read().split()
            except:
                with open(path, 'r') as f:
                    list1 = f.read().split()
            s = Series(np.ones(len(list1)).tolist(), index=list1)
            s2 = s * sl[i]        # weight the article's words by the category TF-IDF
            s3 = s2 * sd          # align with the full dictionary
            break                 # debugging break kept from the original
            s4 = s3[s3.notnull()]
            s4index = s4.index
            s4values = s4.values
            str1 = ""
            for j in range(len(s4)):
                str1 = str1 + str(trainindex) + " " + str(s4index[j]) + " " + str(int(s4values[j])) + "\n"
            with open("", 'a', encoding='utf-8') as f:    # output path missing in the original
                f.write(str1)
            with open("", 'a', encoding='utf-8') as f:    # output path missing in the original
                f.write(str(num) + "\n")
        break