python实现获取文件列表中每个文件关键字

来源:互联网 发布:知乎v领毛衣配衬衫女 编辑:程序博客网 时间:2024/06/16 23:45

功能描述:

获取某个路径下的所有文件,提取出每个文件中出现频率最高的前300个词(只统计长度大于2的纯字母单词),并保存到数据库当中。

前提:你需要先配置好 nltk 和 MySQLdb,并且数据库 mydb 以及其中的 filekeywords 表(包含 id 和 name 两列)需要事先建好。



#!/usr/bin/python
# coding=utf-8
"""Store the most frequent words of every file in a directory in MySQL.

function : For each file under the given directory, take up to 300 of the
           most frequent purely alphabetic words longer than two characters
           and store them in columns key0..key299 of table ``filekeywords``
           in database ``mydb``. The files are expected to be privacy
           policy documents.
author   : Chicho
date     : 2014/7/28
running  : python key_extract.py -d path_of_file
           python key_extract.py path_of_file

The database ``mydb`` and a table ``filekeywords`` with ``id`` and ``name``
columns must already exist; this script only adds the ``keyN`` columns and
fills in one row per file.
"""
import sys
import getopt

import nltk
import MySQLdb
from nltk.corpus import PlaintextCorpusReader

# Connection settings -- replace the placeholders with your own values.
DB_HOST = 'your_personal_host_ip_address'
DB_USER = 'rusername'
DB_PASSWD = 'U_password'
DB_PORT = 3306  # MySQL's default port; change it if your server differs
DB_NAME = 'mydb'

MAX_KEYWORDS = 300      # number of keyN columns / keywords kept per file
MIN_WORD_LENGTH = 3     # words shorter than this are ignored
ER_DUP_FIELDNAME = 1060  # MySQL error code for "Duplicate column name"

USAGE = ("usage: python key_extract.py -d path_of_file\n"
         "       python key_extract.py path_of_file")

corpus_root = ""


def _parse_corpus_root(argv):
    """Return the corpus directory given on the command line.

    Accepts ``-d DIR`` / ``--directory DIR`` or, as a shortcut, the
    directory as the first positional argument. Prints the usage text and
    exits for ``-h`` / ``--help``, for unknown options, and when no
    directory is given.
    """
    try:
        # Long options must be a list; a single "directory=help" string
        # would declare one bogus option that takes no argument.
        opts, args = getopt.getopt(argv, "d:h", ["directory=", "help"])
    except getopt.GetoptError as err:
        sys.exit("%s\n%s" % (err, USAGE))

    root = ""
    for op, value in opts:
        if op in ("-h", "--help"):
            print(USAGE)
            sys.exit(0)
        if op in ("-d", "--directory"):
            root = value
    if not root and args:
        root = args[0]
    if not root:
        sys.exit(USAGE)
    return root


def _top_keywords(words, limit=MAX_KEYWORDS):
    """Return up to ``limit`` distinct words, most frequent first."""
    fdist = nltk.FreqDist(words)
    # Sort explicitly instead of relying on the order of fdist.keys(),
    # which is only frequency-sorted in old NLTK releases.
    ranked = sorted(fdist.items(), key=lambda item: item[1], reverse=True)
    return [word for word, _count in ranked[:limit]]


def _ensure_keyword_columns(curs, count=MAX_KEYWORDS):
    """Add columns key0..key<count-1> to ``filekeywords`` if missing."""
    for i in range(count):
        try:
            # The column name is built from an int only, so it is safe to
            # format it into the statement.
            curs.execute(
                "alter table filekeywords add key%d varchar(45)" % i)
        except MySQLdb.Error as e:
            if e.args and e.args[0] == ER_DUP_FIELDNAME:
                # Column is already there (e.g. from an earlier run);
                # keep going so that any missing ones are still created.
                continue
            # Any other failure: report it and stop altering, like the
            # original best-effort behaviour.
            print(e)
            return


def _store_keywords(root):
    """Extract the keywords of every file under ``root`` into MySQL."""
    # root is the directory holding the privacy policy files.
    filelists = PlaintextCorpusReader(root, '.*')
    files = filelists.fileids()

    conn = MySQLdb.connect(host=DB_HOST, user=DB_USER, port=DB_PORT,
                           passwd=DB_PASSWD)
    try:
        curs = conn.cursor()
        try:
            conn.set_character_set('utf8')
            curs.execute('set names utf8')
            curs.execute('SET CHARACTER SET utf8;')
            curs.execute('SET character_set_connection=utf8;')

            # The database is not created here; it must already exist.
            conn.select_db(DB_NAME)
            _ensure_keyword_columns(curs)

            for i, privacyfile in enumerate(files):
                # Values are passed as parameters so that quotes in file
                # names or words cannot break (or inject into) the SQL.
                curs.execute(
                    "insert into filekeywords set id = %s, name = %s",
                    (i, privacyfile))

                wordlist = [w for w in filelists.words(privacyfile)
                            if w.isalpha() and len(w) >= MIN_WORD_LENGTH]
                keywords = _top_keywords(wordlist)
                if not keywords:
                    continue

                assignments = ", ".join(
                    "key%d = %%s" % j for j in range(len(keywords)))
                curs.execute(
                    "update filekeywords set " + assignments
                    + " where id = %s",
                    tuple(keywords) + (i,))
            conn.commit()
        finally:
            curs.close()
    finally:
        conn.close()


if __name__ == '__main__':
    corpus_root = _parse_corpus_root(sys.argv[1:])
    _store_keywords(corpus_root)























转载注明出处:http://blog.csdn.net/chichoxian/article/details/42003603




0 0
原创粉丝点击