Text Classification Study Notes (2): Feature Extraction


A senior labmate gave me a set of English text-classification tasks, which make good practice.
The first problem is loading the text. The data format: one folder per class, each holding extension-less text files whose words are separated by spaces.
The natural approach is to scan all the text files under each folder and read every word into a dictionary.
Starting from the method at http://www.jb51.net/article/52218.htm, with small modifications, I arrived at the following:

#coding=utf-8
import os
import nltk
from numpy import *
from operator import itemgetter
from collections import OrderedDict, Counter
from math import exp
from scipy import sparse, io
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

def load_files(directory, prefix=None, postfix=None):
    # collect every file under the directory tree
    files_list = []
    # classlen[0] is the top-level directory itself (no files);
    # the ten classes occupy classlen[1..10]
    classlen = [0 for i in range(11)]
    i = 0
    for root, sub_dirs, files in os.walk(directory):
        classlen[i] = len(files)
        i += 1
        for special_file in files:
            if postfix:
                if special_file.endswith(postfix):
                    files_list.append(os.path.join(root, special_file))
            elif prefix:
                if special_file.startswith(prefix):
                    files_list.append(os.path.join(root, special_file))
            else:
                files_list.append(os.path.join(root, special_file))
    # scan the files and build one word-count dictionary per document
    articallist = [dict() for i in range(len(files_list))]
    filelen = [0 for l in range(len(files_list))]
    i = 0
    for eachfile in files_list:
        file_object = open(eachfile, 'r')
        t = 0
        for line in file_object:
            for word in line.split():
                # skip pure digits
                if not str(word).isdigit():
                    t += 1
                    # normalize case
                    word = str(word).lower()
                    if articallist[i].has_key(word):
                        articallist[i][word] += 1
                    else:
                        articallist[i][word] = 1
        filelen[i] = t
        i += 1
        file_object.close()
    #print 'total files:', len(files_list)
    print len(articallist[1])
    return articallist, classlen, filelen

# load the stop-word list, one word per line
def load_stop_en(filename):
    word_list = []
    file_object = open(filename, 'r')
    for line in file_object:
        word_list.append(line.strip())
    return word_list

# remove stop words from a document dictionary
def delet_stopword_en(stop_en_set, en_dict):
    for key in stop_en_set:
        if en_dict.has_key(key):
            del en_dict[key]

# TF-IDF of one word across all documents
def get_TFIDF(articallist, filelen, word):
    num = len(articallist)
    TFindex = [0 for i in range(num)]
    IDFindex = 0
    for i, eachdict in enumerate(articallist):
        if eachdict.has_key(word):
            TFindex[i] = eachdict[word] / float(filelen[i])
            IDFindex += 1
    for i in range(len(TFindex)):
        if IDFindex != 0:
            TFindex[i] = TFindex[i] * exp(IDFindex / float(num))
    #print TFindex
    return TFindex

# merge dict2 into dict1, summing the counts of shared keys
# (dict.update would overwrite them instead; see note 1 below)
def updatex(dict1, dict2):
    for key in dict2.keys():
        if dict1.has_key(key):
            dict1[key] += dict2[key]
        else:
            dict1[key] = dict2[key]

def get_Mat(trainfilepath='training', testfilepath='test', stop_enname='en.txt', matfilename='SetMat.mat'):
    # load the training files
    articallist, classlen, filelen = load_files(trainfilepath)
    # load the stop-word list
    stop_en_set = load_stop_en(stop_enname)
    # label set for the classifier
    #classlabel = [['acq'], ['corn'], ['crude'], ['earn'], ['grain'], ['interest'], ['money-fx'], ['ship'], ['trade'], ['wheat']]
    classlabel = [i + 1 for i in range(10)]
    labeled_names = [0 for i in range(len(articallist))]
    classr = 0
    finaldict = {}
    for i in range(10):
        classl = classr
        classr += classlen[i + 1]
        labeled_names[classl:classr] = [classlabel[i] for k in range(classlen[i + 1])]
        tempdict = {}
        for eachdict in articallist[classl:classr]:
            delet_stopword_en(stop_en_set, eachdict)
            updatex(tempdict, eachdict)
        # keep the top 3000 words of each class
        # (the OrderedDict sort is redundant: most_common already sorts by count)
        tempdict = OrderedDict(sorted(tempdict.iteritems(), key=itemgetter(1), reverse=True))
        tempdict = dict(Counter(tempdict).most_common(3000))
        # merge this class's top words into the global feature dictionary;
        # without this line finaldict stays empty
        updatex(finaldict, tempdict)
    print 'vector:', len(finaldict)
    # compute a TF-IDF value per keyword to obtain the document feature values
    vectormat = [get_TFIDF(articallist, filelen, each) for each in finaldict]
    # transpose: rows are files, columns are features
    vectormat = array(vectormat).transpose()
    articallist1, classlen1, filelen1 = load_files(testfilepath)
    vectormat1 = [get_TFIDF(articallist1, filelen1, each) for each in finaldict]
    vectormat1 = array(vectormat1).transpose()
    #classifier = LogisticRegression()  # default parameters
    #classifier.fit(vectormat, labeled_names)  # learn from the training data; no return value
    #print classifier.predict(vectormat1[2])

The design here is weak, and the bookkeeping of feature words got muddled: the per-class top-3000 dictionaries have to be merged into finaldict (the updatex(finaldict, tempdict) line above), otherwise the feature vocabulary comes out empty.
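As a sanity check, the whole load / stop-word / TF-IDF pipeline can also be delegated to scikit-learn. A minimal sketch, assuming the 'training' folder has one sub-folder per class as described above (the import is aliased so it does not clash with the load_files defined here). Note that scikit-learn's IDF is the usual log-damped form, roughly log(N/df), which down-weights common words, whereas get_TFIDF above multiplies TF by exp(df/N), a factor that grows with document frequency:

from sklearn.datasets import load_files as sk_load_files
from sklearn.feature_extraction.text import TfidfVectorizer

# one sub-folder per class; the folder names become the labels
train = sk_load_files('training', encoding='latin-1')
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(train.data)  # rows: documents, columns: features
y = train.target                          # integer class labels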
Below is a simple classifier implementation using the sklearn package; the other one, Kmeans, is a k-means algorithm I implemented myself, whose accuracy is not great (a rough sketch of the idea follows the code).

    # build the test feature matrix over the same feature words
    articallist1, classlen1, filelen1 = load_files('D:/Py/test')
    vectormat1 = [get_TFIDF(articallist1, filelen1, each) for each in finaldict]
    vectormat1 = array(vectormat1).transpose()
    # train the classifier
    classlabel = ['acq', 'corn', 'crude', 'earn', 'grain', 'interest', 'money-fx', 'ship', 'trade', 'wheat']
    # labels: one per training document (a flat list, as sklearn expects)
    labeled_names = ['' for i in range(len(articallist))]
    classr = 0
    for i in range(10):
        classl = classr
        classr += classlen[i + 1]
        #print classl, classr, classlabel[i]
        labeled_names[classl:classr] = [classlabel[i] for k in range(classlen[i + 1])]
    classifier = LogisticRegression()  # all parameters left at their defaults
    classifier.fit(vectormat, labeled_names)  # learn from the training data; no return value
    print classifier.predict(vectormat1[800])
    #print classifier.predict_proba(vectormat1)
    # k-means cluster prediction
    #myCentroids, clustAssing = Kmeans.kMeans(vectormat, 10, len(vectormat))
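My Kmeans module is not listed in this post. As a rough sketch of the same idea (not the exact code behind Kmeans.kMeans above, whose third argument belongs to that module's interface), a minimal NumPy k-means could look like this:

import numpy as np

# kMeans(data, k) -> (centroids, assignments); a sketch, not the module used above
def kMeans(data, k, max_iter=100):
    data = np.asarray(data, dtype=float)
    n = data.shape[0]
    # use k distinct random rows as the initial centroids
    centroids = data[np.random.choice(n, k, replace=False)].copy()
    assign = np.full(n, -1)
    for _ in range(max_iter):
        # squared distance from every point to every centroid
        dists = ((data[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
        new_assign = dists.argmin(axis=1)
        if np.array_equal(new_assign, assign):
            break  # assignments unchanged: converged
        assign = new_assign
        for j in range(k):
            members = data[assign == j]
            if len(members):
                centroids[j] = members.mean(axis=0)
    return centroids, assign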

Lessons from the fixes:
1. Python dictionaries provide an update method (e.g. dict.update(dict2), which writes dict2's key/value pairs into dict). I used it to merge the per-document dictionaries into the word table needed for feature extraction, but there is a catch: when dict and dict2 share a key, update keeps only the value from dict2, which corrupted the word counts. I replaced it with the custom function updatex(); its time complexity still leaves room for improvement (see the Counter sketch below).
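A minimal illustration of the difference, and of collections.Counter, whose update sums counts and could serve as a tidier drop-in for updatex:

from collections import Counter

a = {'oil': 3, 'trade': 1}
b = {'oil': 2, 'wheat': 5}

merged = dict(a)
merged.update(b)   # plain dict.update: 'oil' becomes 2, a's count is lost

c = Counter(a)
c.update(b)        # Counter.update adds counts: 'oil' becomes 5
print dict(c)      # e.g. {'oil': 5, 'trade': 1, 'wheat': 5}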
2. Term frequency TF = (occurrences of the word in a file) / (total length of that file). I had written TFindex[i] = eachdict[word]/float(len(eachdict)), which wrongly divides by the size of the file's word table; the fix divides by filelen[i], the file's token count (e.g. a 200-token file containing the word 4 times gives TF = 4/200 = 0.02).
3. My homemade tokenization only splits on spaces, so tokens may still carry special characters such as ",.'\"", which the stop-word removal step cannot handle either; this needs improvement.
nltk provides the sentence splitter sent_tokenize and the word tokenizer word_tokenize; on top of that, WordNet can be used to lemmatize words, stripping tense and other inflection. In practice I found that the WordNet lemmatizer fails when a token contains characters such as '\', so I wrote a helper, OnlyChar, that keeps letters only, which pushes the complexity up further...

# split a document into sentences, then tokenize each sentence
def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    return sentences

# keep only ASCII letters, dropping digits and punctuation
def OnlyChar(s, oth=''):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    return ''.join(c for c in s if c.lower() in letters)
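For reference, a minimal example of this route (assuming the punkt and wordnet NLTK data packages have already been downloaded):

import nltk
from nltk.stem import WordNetLemmatizer

# one-time setup: nltk.download('punkt'); nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
tokens = nltk.word_tokenize("The ships were carrying wheat and corn")
print [lemmatizer.lemmatize(OnlyChar(t).lower()) for t in tokens]
# roughly: ['the', 'ship', 'were', 'carrying', 'wheat', 'and', 'corn']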

4. The classification results are still unsatisfactory... (a sketch for putting a number on that follows.)
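"Unsatisfactory" ought to be a number. A sketch, assuming the true test labels are available in a list true_labels (a hypothetical name), built for the test set the same way labeled_names was built for the training set:

from sklearn.metrics import accuracy_score

# true_labels: hypothetical list of gold labels for the test files
pred = classifier.predict(vectormat1)
print accuracy_score(true_labels, pred)  # fraction of test files labeled correctly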
