
来源:互联网 发布:财务合并报表软件 编辑:程序博客网 时间:2024/06/05 04:01


word stemming


stop words

比如 a、the 这类词在处理时没有实际意义。这里先对词频进行统计,再人工界定停用词,并在处理时简单地将其全部去除(替换为空格)。但这种方式并不适用于所有情况:例如 "To be or not to be" 这样几乎完全由停用词组成的查询,去除停用词后就所剩无几,很难处理。


index.txt 记录需要处理的文件(每个标题对应一个同名的 .txt 文件)

thefile.txt 所有出现过的词(词频由高到低)
stop_word.txt 停词
data.pkl 所创建的索引

1 count.py 确定停词
2 index.py 建立倒排索引
3 query.py 用于查询





#-*- coding:utf-8 -*-'''@author birdy qian'''import sysfrom nltk import *                                                                                          #import natural-language-toolkitfrom operator import itemgetter                                                                 #for sortdef output_count(fdist):                                                                                #output the relative information    #vocabulary =fdist.items()    vocabulary =fdist.items()                                                                           #get all the vocabulary     vocabulary=sorted(vocabulary, key=itemgetter(1),reverse=True)               #sort the vocabulary in decreasing order    print vocabulary[:250]                                                                              #print top 250 vocabulary and its count on the screen    print 'drawing plot.....'                                                                               #show process    fdist.plot(120 , cumulative=False)                                                              #print the plot    #output in file    file_object = open('thefile.txt', 'w')                                                              #prepare the file for writing    for j in vocabulary:        file_object.write( j[0] + ' ')                                                                      #put put all the vocabulary in decreasing order     file_object.close( )                                                                                        #close the filedef pre_file(filename):     print("read file %s.txt....."%filename)                                                             #show process    content = open( str(filename) + '.txt', "r").read()    content = content.lower()    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~' :                                            #cancel the punction        content = content.replace(ch, " ")    plurals = content.split()                        
                                                       #split the file at '\n' or ' '    stemmer = PorterStemmer()                                                                       #prepare for stemming    singles = [stemmer.stem(plural) for plural in plurals]                                  #handling stemming    return singles#main functiondef main():     print "read index....."                                                                                 #show process    input = open('index.txt', 'r')                                                                      #titles that need to be handled    all_the_file =input.read( )    file=all_the_file.split()    input.close()                                                                                               #close the file    fdist1=FreqDist()                                                                                       #create a new dist    for x in range( 0, len(file) ):        #print file[x]        txt = pre_file( file[x] )                                                                                   #pre handing the txt        for words in txt :            words =words.decode('utf-8').encode(sys.getfilesystemencoding())        #change string typt from utf-8 to gbk            fdist1[words] +=1                                                                                   #add it to the dist    output_count(fdist1)#runfileif __name__ == '__main__':     main() 


#-*- coding:utf-8 -*-'''@author birdy qian'''import sysimport pickle                   from nltk import *                                                                                          #import natural-language-toolkitfrom operator import itemgetter                                                                 #for sortSTOPWORDS = []                                                                                          #grobal variabledef output_index(result):    #print result    output = open('data.pkl', 'wb')    pickle.dump(result, output)                                                                     # Pickle dictionary using protocol 0    output.close()def pre_file(filename):     global STOPWORDS    print("read file %s.txt....."%filename)                                                             #show process    content = open( str(filename) + '.txt', "r").read()    content = content.lower()    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_��{|}~' :                                           #cancel the punction        content = content.replace(ch, " ")    for ch in  STOPWORDS:                                                                               #cancel the stopwords        content = content.replace(ch, " ")          plurals = content.split()                                                                               #split the file at '\n' or ' '    stemmer = PorterStemmer()                                                                       #prepare for stemming    singles = [stemmer.stem(plural) for plural in plurals]                                  #handling stemming    return singlesdef readfile(filename):    input = open(filename, 'r')                                                                     #titles that need to be handled    all_the_file =input.read( )    words = all_the_file.split()                                                                            #split the file at '\n' or ' '    input.close()               
return words#main functiondef main():     global STOPWORDS    print "read index....."                                                                                 #show process    file=readfile('index.txt')    print "read stopwords....."     STOPWORDS = readfile('stop_word.txt')      print "create word list....."    word = list(readfile('thefile.txt'))                                                                        #the file with all the words in all the books    result = {}                                                                                                     #memorize the result     for x in range( 0, len(file) ):        #print file[x]        txt = pre_file( file[x] )                                                                                   # file[x] is the title         txt =  {}.fromkeys(txt).keys()                                                                      #cancel the repeat word        #can also use text.set()                                                                    for words in txt :            words =words.decode('utf-8').encode(sys.getfilesystemencoding())        #change string typt from utf-8 to gbk            if result.get(words) == None :                                                              #if the word is not in the dictionary                result[words]=[file[x]]            else:                                                                                                       #if the word is in the dictionary                t=result.get(words)                t.append(file[x])                result[words]=t    output_index(result)#runfileif __name__ == '__main__':     main()


#-*- coding:utf-8 -*-'''@author birdy qian'''import os import sysimport pprint, picklefrom nltk import PorterStemmerdef readfile(filename):    input = open(filename, 'r')                                                                 #titles that need to be handled    all_the_file =input.read( )    words = all_the_file.split()                                                                        #split the file at '\n' or ' '    input.close()                                                                                           #close the data    return wordsdef getdata():    pkl_file = open('data.pkl', 'rb')                                                               #index is saved in the file 'data.pkl'    data1 = pickle.load(pkl_file)                                                                   #change the type    #pprint.pprint(data1)    pkl_file.close()                                                                                        #close the file    return  data1                                                                                       #close the datadef output( result ):    #print result    if result == None:                                              #if the words is not in the index (one word return None)        print None        return    if len(result) == 0 :                                           #if the words is not in the index (more than one words return [] )        print None        return     if len(result) < 10 :                                               #if the records is less than 10        print result    else:                                                                   #if the records is more than 10        print 'get '+ str(len(result)) + ' records'                                                                         #the record number        for i in range( 0 , len(result) / 10 +1):            print '10 records start from ' +str(i*10+1)            if 10 * i + 9 < len(result) :                 
                                                                          #print from 10 * i to 10 * i + 10                print result[ 10 * i : 10 * i + 10 ]            else:                                                                                                                           #print from 10 * i to end                print result[ 10 * i :  len(result) ]                break            getstr = raw_input("Enter 'N' for next ten records & other input to quit : ")            if getstr != 'N':                break#main functiondef main():     data_list = getdata()                                                                                                   #read data                                                                      STOPWORDS = readfile('stop_word.txt')     stemmer = PorterStemmer()                                                                                       #prepare for stemming    while True:        get_str = raw_input("Enter your query('\\'to quit): ")        if get_str == '\\' :                                                                                                    #leave the loop            break        get_str = get_str.lower()        for ch in  STOPWORDS:                                                                                           #cancel the stopwords            get_str = get_str.replace(ch, " ")          query_list=get_str.split()                                                                                          #split the file at '\n' or ' '        query_list = [stemmer.stem(plural) for plural in query_list]                                        #handling stemming        while True:                 if query_list != [] :                break            get_str = raw_input("Please enter more information: ")            get_str = get_str.lower()            for ch in  STOPWORDS:                                                                                       #cancel the 
stopwords                 get_str = get_str.replace(ch, " ")             query_list=get_str.split()            query_list = [stemmer.stem(plural) for plural in query_list]                                    #handling stemming        result=[]        for k in range( 0 , len(query_list) ):              if k==0:                                                                                                            #if the list has not been built                 result = data_list.get( query_list[0] )            else:                                                                                                                   #if the list has been built                 result = list( set(result).intersection(data_list.get( query_list[k] ) ) )        output( result )#runfileif __name__ == '__main__':     main()