Chinese Word Segmentation with CRF++


This post implements Chinese word segmentation with CRF++.

  1. First, installing and using CRF++

    • CRF++
      To install the Linux version:
      i. Extract the archive to a directory.
      ii. Open a terminal and change into that directory.
      iii. Run the following commands in order:
      ./configure
      make
      su
      make install
      Note: root privileges are required for the installation to succeed.
      If su rejects your password, your machine may, like mine, have no root account set up; you will need to add one yourself, and instructions for that are easy to find.
    • Since the Python toolkit is used here for training and testing, it must be installed as well. Enter the python folder and run:
      python setup.py build
      sudo python setup.py install
      With that, the preparation is done.
  2. My approach:
    Because _CRFPP.so is not included under the Task2 root directory,
    only the contents of the Task2_B-I folder run as-is; that folder tags characters with just two labels, B and I.
    First, msr_training.utf8 is converted by the Python script make_crf_train_data.py into the format the training corpus needs, i.e. tag_train_data.utf8.
    Next, the model is trained to obtain model (a minimal feature template and training command are sketched below), and the Python toolkit bundled with CRF++ is used to segment the input text, implemented in the Python script crf_segment.py.
    Finally, msr_test.utf8 is segmented to produce crf_tag_result.utf8.
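The original write-up does not show the feature template or the exact training command. As a rough sketch only: a minimal character-level template (the file name template and the -f/-c hyperparameters below are assumptions, not taken from the original) together with the standard crf_learn invocation could look like this:

# template: unigram features over a window of the character column (column 0);
# the trailing B line enables bigram (tag-transition) features
U00:%x[-2,0]
U01:%x[-1,0]
U02:%x[0,0]
U03:%x[1,0]
U04:%x[2,0]
U05:%x[-1,0]/%x[0,0]
U06:%x[0,0]/%x[1,0]
B

# train: crf_learn <template> <training_data> <model>
crf_learn -f 3 -c 4.0 template tag_train_data.utf8 model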

crf_segment.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# crf_segment.py
# Usage: python crf_segment.py crf_model test_file result_file
# Segment the input text using the Python toolkit bundled with CRF++

import codecs
import sys

import CRFPP


def crf_segmenter(input_file, output_file, tagger):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        tagger.clear()
        # Iterating over the unicode line yields single characters;
        # each is added as one row with two placeholder columns ('o' and 'B').
        for word in line.strip():
            word = word.strip()
            if word:
                tagger.add((word + "\to\tB").encode('utf-8'))
        tagger.parse()
        size = tagger.size()    # number of tokens (characters)
        xsize = tagger.xsize()  # number of feature columns used by the model
        for i in range(0, size):
            for j in range(0, xsize):
                char = tagger.x(i, j).decode('utf-8')
                tag = tagger.y2(i)  # predicted tag as a string
                # Rebuild words from the character tags:
                # B starts a word, M continues it, E ends it, S is a single-character word.
                if tag == 'B':
                    output_data.write(' ' + char)
                elif tag == 'M':
                    output_data.write(char)
                elif tag == 'E':
                    output_data.write(char + ' ')
                else:  # tag == 'S'
                    output_data.write(' ' + char + ' ')
        output_data.write('\n')
    input_data.close()
    output_data.close()


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: python crf_segment.py crf_model test_file result_file")
        sys.exit()
    crf_model = sys.argv[1]
    input_file = sys.argv[2]
    output_file = sys.argv[3]
    tagger = CRFPP.Tagger("-m " + crf_model)
    crf_segmenter(input_file, output_file, tagger)
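With the trained model (assumed here to be named model, as in the description above), the test file from the write-up is segmented by running:

python crf_segment.py model msr_test.utf8 crf_tag_result.utf8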

make_crf_train_data.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_crf_train_data.py
# Produce a training file in the format required by CRF++
# Usage: python make_crf_train_data.py input_file output_file

import sys
import codecs


# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)
def character_4tagging(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word_list = line.strip().split()
        for word in word_list:
            if len(word) == 1:
                output_data.write(word + "\tS\n")
            else:
                output_data.write(word[0] + "\tB\n")
                for w in word[1:len(word) - 1]:
                    output_data.write(w + "\tM\n")
                output_data.write(word[len(word) - 1] + "\tE\n")
        # A blank line marks a sentence boundary for CRF++
        output_data.write("\n")
    input_data.close()
    output_data.close()


# 6 tags for character tagging: B(Begin), E(End), M(Middle), S(Single), M1, M2
def character_6tagging(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word_list = line.strip().split()
        for word in word_list:
            if len(word) == 1:
                output_data.write(word + "\tS\n")
            elif len(word) == 2:
                output_data.write(word[0] + "\tB\n")
                output_data.write(word[1] + "\tE\n")
            elif len(word) == 3:
                output_data.write(word[0] + "\tB\n")
                output_data.write(word[1] + "\tM\n")
                output_data.write(word[2] + "\tE\n")
            elif len(word) == 4:
                output_data.write(word[0] + "\tB\n")
                output_data.write(word[1] + "\tM1\n")
                output_data.write(word[2] + "\tM\n")
                output_data.write(word[3] + "\tE\n")
            elif len(word) == 5:
                output_data.write(word[0] + "\tB\n")
                output_data.write(word[1] + "\tM1\n")
                output_data.write(word[2] + "\tM2\n")
                output_data.write(word[3] + "\tM\n")
                output_data.write(word[4] + "\tE\n")
            elif len(word) > 5:
                output_data.write(word[0] + "\tB\n")
                output_data.write(word[1] + "\tM1\n")
                output_data.write(word[2] + "\tM2\n")
                for w in word[3:len(word) - 1]:
                    output_data.write(w + "\tM\n")
                output_data.write(word[len(word) - 1] + "\tE\n")
        output_data.write("\n")
    input_data.close()
    output_data.close()


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: python make_crf_train_data.py input_file output_file")
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    # The 4-tag scheme is used here; character_6tagging is available as an alternative.
    character_4tagging(input_file, output_file)
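For reference, the conversion step is run as below; the sample sentence is only an illustration of the 4-tag output format produced by character_4tagging (one character and its tag per line, tab-separated, with a blank line between sentences):

python make_crf_train_data.py msr_training.utf8 tag_train_data.utf8

Input line (words separated by spaces): 不 是 空话
Output:
不	S
是	S
空	B
话	E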
