Chinese Word Segmentation with CRF++


This post implements Chinese word segmentation with CRF++.

  1. First, installing and using CRF++

    • CRF++
      To install the Linux version:
      i. Extract the archive to a directory.
      ii. Open a terminal and change into that directory.
      iii. Run the following commands in order:
      ./configure
      make
      su
      make install
      Note: root privileges are required for the installation to succeed.
      If su rejects your password, your machine may, like mine, have no root account set up; you will need to add one yourself, and instructions for that are easy to find.
    • Since the Python toolkit is used here for training and testing, it must be installed as well. Enter the python folder and run:
      python setup.py build
      sudo python setup.py install
      With that, the preparation is done.
  2. My approach:
    Because _CRFPP.so is not included under the Task2 root directory,
    only the contents of the Task2_B-I folder run as-is; that folder tags characters with just two labels, B and I.
    First, msr_training.utf8 is converted by the Python script make_crf_train_data.py into the format the training corpus needs, i.e. tag_train_data.utf8.
    Next, the model is trained to obtain model (a minimal feature template and training command are sketched below), and the Python toolkit bundled with CRF++ is used to segment the input text, implemented in the Python script crf_segment.py.
    Finally, msr_test.utf8 is segmented to produce crf_tag_result.utf8.
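The original write-up does not show the feature template or the exact training command. As a rough sketch only: a minimal character-level template (the file name template and the -f/-c hyperparameters below are assumptions, not taken from the original) together with the standard crf_learn invocation could look like this:

# template: unigram features over a window of the character column (column 0);
# the trailing B line enables bigram (tag-transition) features
U00:%x[-2,0]
U01:%x[-1,0]
U02:%x[0,0]
U03:%x[1,0]
U04:%x[2,0]
U05:%x[-1,0]/%x[0,0]
U06:%x[0,0]/%x[1,0]
B

# train: crf_learn <template> <training_data> <model>
crf_learn -f 3 -c 4.0 template tag_train_data.utf8 model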

crf_segment.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# crf_segment.py
# Usage: python crf_segment.py crf_model test_file result_file
# Segment the input text using the Python toolkit bundled with CRF++

import codecs
import sys

import CRFPP


def crf_segmenter(input_file, output_file, tagger):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        tagger.clear()
        # Iterating over the unicode line yields single characters;
        # each is added as one row with two placeholder columns ('o' and 'B').
        for word in line.strip():
            word = word.strip()
            if word:
                tagger.add((word + "\to\tB").encode('utf-8'))
        tagger.parse()
        size = tagger.size()    # number of tokens (characters)
        xsize = tagger.xsize()  # number of feature columns used by the model
        for i in range(0, size):
            for j in range(0, xsize):
                char = tagger.x(i, j).decode('utf-8')
                tag = tagger.y2(i)  # predicted tag as a string
                # Rebuild words from the character tags:
                # B starts a word, M continues it, E ends it, S is a single-character word.
                if tag == 'B':
                    output_data.write(' ' + char)
                elif tag == 'M':
                    output_data.write(char)
                elif tag == 'E':
                    output_data.write(char + ' ')
                else:  # tag == 'S'
                    output_data.write(' ' + char + ' ')
        output_data.write('\n')
    input_data.close()
    output_data.close()


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: python crf_segment.py crf_model test_file result_file")
        sys.exit()
    crf_model = sys.argv[1]
    input_file = sys.argv[2]
    output_file = sys.argv[3]
    tagger = CRFPP.Tagger("-m " + crf_model)
    crf_segmenter(input_file, output_file, tagger)
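With the trained model (assumed here to be named model, as in the description above), the test file from the write-up is segmented by running:

python crf_segment.py model msr_test.utf8 crf_tag_result.utf8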

make_crf_train_data.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_crf_train_data.py
# Produce a training file in the format required by CRF++
# Usage: python make_crf_train_data.py input_file output_file

import sys
import codecs


# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)
def character_4tagging(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word_list = line.strip().split()
        for word in word_list:
            if len(word) == 1:
                output_data.write(word + "\tS\n")
            else:
                output_data.write(word[0] + "\tB\n")
                for w in word[1:len(word) - 1]:
                    output_data.write(w + "\tM\n")
                output_data.write(word[len(word) - 1] + "\tE\n")
        # A blank line marks a sentence boundary for CRF++
        output_data.write("\n")
    input_data.close()
    output_data.close()


# 6 tags for character tagging: B(Begin), E(End), M(Middle), S(Single), M1, M2
def character_6tagging(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word_list = line.strip().split()
        for word in word_list:
            if len(word) == 1:
                output_data.write(word + "\tS\n")
            elif len(word) == 2:
                output_data.write(word[0] + "\tB\n")
                output_data.write(word[1] + "\tE\n")
            elif len(word) == 3:
                output_data.write(word[0] + "\tB\n")
                output_data.write(word[1] + "\tM\n")
                output_data.write(word[2] + "\tE\n")
            elif len(word) == 4:
                output_data.write(word[0] + "\tB\n")
                output_data.write(word[1] + "\tM1\n")
                output_data.write(word[2] + "\tM\n")
                output_data.write(word[3] + "\tE\n")
            elif len(word) == 5:
                output_data.write(word[0] + "\tB\n")
                output_data.write(word[1] + "\tM1\n")
                output_data.write(word[2] + "\tM2\n")
                output_data.write(word[3] + "\tM\n")
                output_data.write(word[4] + "\tE\n")
            elif len(word) > 5:
                output_data.write(word[0] + "\tB\n")
                output_data.write(word[1] + "\tM1\n")
                output_data.write(word[2] + "\tM2\n")
                for w in word[3:len(word) - 1]:
                    output_data.write(w + "\tM\n")
                output_data.write(word[len(word) - 1] + "\tE\n")
        output_data.write("\n")
    input_data.close()
    output_data.close()


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: python make_crf_train_data.py input_file output_file")
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    # The 4-tag scheme is used here; character_6tagging is available as an alternative.
    character_4tagging(input_file, output_file)
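For reference, the conversion step is run as below; the sample sentence is only an illustration of the 4-tag output format produced by character_4tagging (one character and its tag per line, tab-separated, with a blank line between sentences):

python make_crf_train_data.py msr_training.utf8 tag_train_data.utf8

Input line (words separated by spaces): 不 是 空话
Output:
不	S
是	S
空	B
话	E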
