LibMMSeg for Python

来源:互联网 发布:李彦宏会编程吗 编辑:程序博客网 时间:2024/05/13 23:16

LibMMSeg 是 Coreseek.com 为 Sphinx 全文搜索引擎设计的中文分词软件包，采用 Chih-Hao Tsai 的 MMSEG 算法（原文此处指向算法说明的链接已丢失）。

下面是为Python所做的模块代码,根据mmseg_main做了部分修改:

char** segment(const char* dict_path, const char* file_path, int *nseg)
{
    char **segs = NULL;
    *nseg = 0;
    istream *is = new std::ifstream(file_path, ios::in | ios::binary);
    auto_ptr<istream> pauto_is(is);
    if (! *is) {
        return NULL;
    }
    Segmenter *seg = NULL;
    SegmenterManager* mgr = new SegmenterManager();
    auto_ptr<SegmenterManager> pauto_mgr(mgr);
    if(dict_path) {
        if ( mgr->init(dict_path) == 0 )
            seg = mgr->getSegmenter();
    }
    if (!seg) return NULL;
    std::string line;
    int n = 0;
    unsigned long srch,str;
    str = currentTimeMillis();
    //load data.
    int length;
    is->seekg (0, ios::end);
    length = is->tellg();
    is->seekg (0, ios::beg);
    char* buffer = new char [length+1];
    is->read (buffer,length);
    buffer[length] = 0;
    //begin seg
    seg->setBuffer((u1*)buffer,length);
    u2 len = 0, symlen = 0;
    //check 1st token.
    char txtHead[3] = {239,187,191};
    char* tok = (char*)seg->peekToken(len, symlen);
    seg->popToken(len);
    if(len == 3 && memcmp(tok,txtHead,sizeof(char)*3) == 0){
        //check is 0xFEFF
        //do nothing
    }
    else{
        //printf("%*.*s/X ",symlen,symlen,tok);
        segs = (char**)realloc(segs, (*nseg + 1) * sizeof(char*));
        segs[*nseg] = (char*)calloc(symlen, sizeof(char));
        memcpy(segs[*nseg], tok, symlen);
        ++(*nseg);
    }
    while(1){
        len = 0;
        char* tok = (char*)seg->peekToken(len,symlen);
        if(!tok || !*tok || !len)
            break;
        seg->popToken(len);
        if(*tok == '/r')
            continue;
        if(*tok == '/n'){
            //printf("/n");
            continue;
        }
        //printf("%*.*s/X ",symlen,symlen,tok);
        segs = (char**)realloc(segs, (*nseg + 1) * sizeof(char*));
        segs[*nseg] = (char*)calloc(symlen, sizeof(char));
        memcpy(segs[*nseg], tok, symlen);
        ++(*nseg);
    }
    srch = currentTimeMillis() - str;
    //printf("/n/nWord Splite took: %d ms./n", srch);
    //found out the result

    return segs;
}

#ifndef NOPYTHON
/*
 * Python entry point: seg(dict_path, file_path) -> list of str.
 *
 * Parses two string arguments, runs segment() on them and returns a new
 * list holding one Python string per segment. The list is empty when
 * segment() yields nothing.
 *
 * Returns NULL with a Python exception set when the arguments are not two
 * strings or when building the result list fails.
 */
PyObject* seg_text(PyObject *self, PyObject *args)
{
    char *dict_path = NULL;
    char *file_path = NULL;
    /* On failure PyArg_ParseTuple has already set the exception. */
    if (!PyArg_ParseTuple(args, "ss", &dict_path, &file_path))
        return NULL;
    fprintf(stderr, "Get Dict[%s], File[%s].\n", dict_path, file_path);

    PyObject *List = PyList_New(0);
    if (!List)
        return NULL;

    int nseg = 0;
    char **segs = segment(dict_path, file_path, &nseg);

    int failed = 0;
    for (int i = 0; i < nseg; i++) {
        if (!segs[i])
            continue;
        if (!failed) {
            PyObject *item = Py_BuildValue("s", segs[i]);
            if (!item || PyList_Append(List, item) != 0)
                failed = 1;
            /* PyList_Append takes its own reference; drop ours either way. */
            Py_XDECREF(item);
        }
        /* Keep freeing the C strings even after a failure. */
        free(segs[i]);
    }
    free(segs);

    if (failed) {
        Py_DECREF(List);
        return NULL;
    }
    return List;
}

/* Method table for the module: a single function, exposed to Python as seg(). */
static PyMethodDef Methods[] = {
    {
        "seg",          /* name as seen from Python */
        seg_text,       /* C implementation */
        METH_VARARGS,   /* called with a tuple of positional arguments */
        "Seg Text"      /* docstring */
    },
    /* Sentinel entry terminating the table. */
    {
        NULL,
        NULL,
        0,
        NULL
    }
};

void initLibPyMMSeg()
{
    PyObject *m;
    m = Py_InitModule("LibPyMMSeg", Methods);
}

 

原创粉丝点击