jieba分词,并去除所有标点

来源:互联网 发布:mac mysql my.cnf 配置 编辑:程序博客网 时间:2024/06/05 11:47
# encoding=utf-8
# Two small utilities built on the jieba Chinese word segmenter:
#   * Scan -- tokenises a UTF-8 text file line by line after removing ASCII
#     punctuation, and prints the resulting token lists.
#   * a command-line script that prints the top-K keywords of a file,
#     optionally together with their weights.
import re
import sys
from optparse import OptionParser

import jieba
import jieba.analyse


class Scan(object):
    """Tokenise a UTF-8 text file with jieba, dropping ASCII punctuation.

    Args:
        path: Path of the text file to read.
    """

    # Runs of ASCII punctuation (plus the typographic apostrophe) to delete.
    # NOTE: inside the character class ",-." is a range from ',' to '.',
    # which happens to contain '-', so the hyphen is still matched.
    # NOTE(review): full-width CJK punctuation is not covered by this set.
    _PUNCTUATION = re.compile('[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+')

    def __init__(self, path):
        self.path = path

    def scan(self):
        """Read the file, tokenise every line and print the result.

        Each line is stripped of surrounding whitespace, has its punctuation
        removed, and is cut in jieba's accurate mode (``cut_all=False``).
        The printed value is a list holding one token list per input line;
        blank lines contribute an empty list.

        If the file cannot be opened or read, the error is printed and the
        method returns without printing a token list.

        Returns:
            None. All results are written to standard output.
        """
        try:
            # The context manager closes the file even if tokenising fails.
            with open(self.path, "r", encoding='UTF-8') as f:
                word_list = [
                    list(jieba.cut(self._PUNCTUATION.sub('', line.strip()),
                                   cut_all=False))
                    for line in f
                ]
        except OSError as err:
            print(err)
            return
        finally:
            print("文件读取结束")
        print(word_list)


# --- Tokenise and extract keywords (command-line script) --------------------

# Kept from the original script: makes modules in the parent directory
# importable.
sys.path.append('../')

USAGE = "usage:    python extract_tags_with_weight.py [file name] -k [top k] -w [with weight=1 or 0]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
parser.add_option("-w", dest="withWeight")
opt, args = parser.parse_args()

# The input file name is the only required positional argument.
if len(args) < 1:
    print(USAGE)
    sys.exit(1)

file_name = args[0]

# Number of keywords to report; defaults to 10 when -k is not given.
topK = 10 if opt.topK is None else int(opt.topK)

# Weights are shown only for "-w 1". Compare by value: the original identity
# test ("is 1") depended on CPython's small-integer caching.
withWeight = opt.withWeight is not None and int(opt.withWeight) == 1

# The raw bytes are handed to jieba unchanged, as in the original script.
with open(file_name, 'rb') as source:
    content = source.read()

tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)

if withWeight:
    # With weights enabled each entry is a (word, weight) pair.
    for word, weight in tags:
        print("tag: %s\t\t weight: %f" % (word, weight))
else:
    print(",".join(tags))