根据字典对文本分词

来源:互联网 发布:淘宝账号已被冻结 编辑:程序博客网 时间:2024/06/05 02:24

有一个文本文件作为字典;另有一个文件夹下的几千个文件是母本(待处理)文件。需要对这些文件进行分词操作,并去除字典之外的词,然后将处理好的文本写入新文件中。

#coding:utf-8from __future__ import print_function, unicode_literalsimport sysimport jiebaimport jieba.posseg as psegfrom test.test_userdict import UserDictTestfrom distutils.sysconfig import project_basefrom pip._vendor.pyparsing import linefrom jieba import del_wordfrom email import contentmanagerimport osimport os.pathfrom matplotlib.pyplot import title#sys.setdefaultencoding("utf8")sys.path.append("../")lines0 = ''jieba.load_userdict('字典路径')#载入字典lines0 = [line.strip() for line in open('字典路径',encoding='UTF-8')]for dirpath, dirnames, filenames in os.walk('母本路径'):        for filename in filenames:            if os.path.splitext(filename)[1] == '.txt':                filepath = os.path.join(dirpath, filename)                #print(str(filename))                newfile = 原文件路径'+filename                with open(newfile,newline='',encoding='UTF-8') as project2:                    parContent = project2.read()                words = jieba.cut(parContent)                #print(words)                content = ''                for linnn in words:                    linnn = linnn.strip()                    if linnn in lines0:                             content += linnn                        content += '\n'                title = filename                file_name = '{}.txt'.format(title)                with open(file_name, 'w', newline='', encoding='utf-8') as f:                    f.write(content)