细胞词库的解码C源程序

来源:互联网 发布:mysql text使用 编辑:程序博客网 时间:2024/04/30 23:56

//细胞词库解码C源程序,输出为UNICODE的TXT格式

说明:sogo输入法细胞词库解析源程序dis_sogo_cell.c,将.scel文件解码输出为unicode格式的文本文件,可以提取出大量的中文词语。程序输出拼音及其对应的中文词语,每条记录占一行,每条记录的拼音和中文词语之间用分号分隔,同音词之间以逗号分隔,格式如下所示:

bei di ;北地,北堤
bei gou ;北沟,杯勾
cai feng xiao qu ;彩俸小区,彩凤小区,彩风小区
chang cheng shu dian ;长城书店
chang ge zhuang cun ;常各庄村
chang he da sha ;长和大厦,长河大厦

使用方法(Linux下):

1.编译:
gcc dis_sogo_cell.c -o dis_sogo_cell
或直接 make

2.使用:
./dis_sogo_cell sogo_scel_file.scel > sogo_scel_file.scel.txt

3.用shell脚本批量处理,将当前目录下的.scel文件转换为TXT文件:
#!/bin/bash
for scel_file in `ls *.scel`
do
./dis_sogo_cell ${scel_file} > ${scel_file}.txt
done

4.使用程序包内提供的cell2txt.sh,在shell下直接执行即可将当前目录下的所有.scel文件提取为文本文件(UNICODE格式的)。如:
# ./cell2txt.sh

5.如需将生成的unicode文本文件转为ANSI编码的TXT文件,可以利用Linux的iconv命令,如:
# iconv -futf-16 -tGB18030 sogo_scel_file.scel.txt   -o  sogo_scel_file.scel_ANSI.txt
转换编码后的文件sogo_scel_file.scel_ANSI.txt将比原来的sogo_scel_file.scel.txt文件的字节数减小约50%,可极大节省存储空间,同时便于使用不支持UNICODE的文本编辑器查看输出的结果。
C代码如下:

/*
 * dis_sogo_cell.c
 *
 * snallieATtomDOTcom
 * Sun Nov  7 06:30:00 CST 2014
 *
 * Decodes a sogo .scel cell-dictionary file and writes the Chinese PinYin
 * strings and the Chinese words to stdout as UTF-16LE text (with a BOM).
 *
 * Example of output data (one record per line, a space after every
 * syllable, ';' between pinyin and words, ',' between homophones):
 * bei di ;北地,北堤
 * bei gou ;北沟,杯勾
 * cai feng xiao qu ;彩俸小区,彩凤小区,彩风小区
 * chang cheng shu dian ;长城书店
 * chang ge zhuang cun ;常各庄村
 * chang he da sha ;长和大厦,长河大厦
 *
 * to make under Linux: # gcc dis_sogo_cell.c -o dis_sogo_cell
 * usage  : ./dis_sogo_cell sogo_scel_file.scel > sogo_scel_file.scel.txt
 *
 * !! sogo_scel_file.scel.txt is a TXT file in unicode !!
 *
 * to invoke in shell script:
 * for scel_file in `ls *.scel` ; do ./dis_sogo_cell ${scel_file} > ${scel_file}.txt ; done
 *
 * Fatal diagnostics (usage, open failure, bad magic, truncated input) go to
 * stderr so they never end up inside the redirected UTF-16 output.
 */

/*
 * A sogo .scel dictionary stores its text as 2-byte unicode code units
 * (Chinese characters or ASCII letters); decoding is a matter of knowing
 * the offset of each part.  All integers are read as little-endian.
 *
 * There are two main parts:
 *
 * 1. The global pinyin table (apparently every pinyin syllable, in
 *    dictionary order): a list of (index, len, pinyin)
 *        index:  2-byte integer, the index of this syllable
 *        len:    2-byte integer, byte length of the syllable
 *        pinyin: the syllable, 2 bytes per character, len bytes in total
 *
 * 2. The Chinese word table: a list of
 *    (same_pronounce_num, py_table_len, py_table,
 *     {word_len, word, ext_len, ext})
 *        same_pronounce_num: 2-byte integer, number of homophones
 *        py_table_len:       2-byte integer, byte length of py_table
 *        py_table:           list of 2-byte integers, each one an index
 *                            into the pinyin table
 *        word_len:           2-byte integer, byte length of the word
 *        word:               the word, 2 bytes per character
 *        ext_len:            2-byte integer, length of the extension
 *                            (seems to always be 10)
 *        ext:                extension data; the first two bytes are an
 *                            integer (possibly the word frequency), the
 *                            remaining eight bytes are all zero
 *    {word_len, word, ext_len, ext} is repeated same_pronounce_num times,
 *    once per homophone sharing the same py_table.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <stdarg.h>
#include <stdbool.h>
#include <inttypes.h>

/* Define DEBUG (e.g. -DDEBUG) to trace the pinyin table and every record. */
/* OUT_HEADER makes the Name/Type/Desc/Smpl header fields part of the output. */
#define OUT_HEADER

enum {
    OFF_TITLE = 0x130,          /* dictionary name */
    OFF_TYPE = 0x338,           /* dictionary type */
    OFF_DESC = 0x540,           /* description */
    OFF_SAMPLES = 0xd40,        /* sample words */
    OFF_PY_TABLE = 0x1540,      /* global pinyin table */
    OFF_WORDS_TYPE_D = 0x2628,  /* word table when header byte 4 is 'D' */
    OFF_WORDS_TYPE_E = 0x26c4,  /* word table when header byte 4 is 'E' */
    PY_TABLE_MAX = 0x280,       /* capacity of cel_py_tab */
    PY_BYTES_MAX = 30,          /* capacity of one pinyin syllable, in bytes */
    PY_INDEXES_MAX = 150,       /* max pinyin indexes in one record */
    SAME_PRONOUNCE_MAX = 20     /* records claiming more homophones end decoding */
};

typedef struct py_t {
    uint16_t index;
    uint16_t len;                        /* bytes used in pinyin[] */
    unsigned char pinyin[PY_BYTES_MAX];  /* raw 2-byte code units */
} py_tab;

static py_tab cel_py_tab[PY_TABLE_MAX];
static int count_py;    /* number of valid entries in cel_py_tab */
static FILE *in_file;

/* Writes a CR LF pair as two 16-bit little-endian code units. */
static void print_unicode_crlr(void)
{
    putchar(0x0d);
    putchar('\0');
    putchar(0x0a);
    putchar('\0');
}

/* Writes one space as a 16-bit little-endian code unit. */
static void print_unicode_space(void)
{
    putchar(' ');
    putchar('\0');
}

/* Writes one 7-bit ASCII character (high bit masked off) as a 16-bit unit. */
static void print_ascii_in_unicode(unsigned char a)
{
    putchar(a & 0x7f);
    putchar('\0');
}

/* Writes a NUL-terminated ASCII string, one 16-bit unit per character. */
static void print_ascii_str_in_unicode(const char *ascii_str)
{
    for (; *ascii_str != '\0'; ascii_str++) {
        print_ascii_in_unicode((unsigned char)*ascii_str);
    }
}

/*
 * printf-style variant of print_ascii_str_in_unicode().  The formatted text
 * is truncated to fit the local buffer instead of overflowing it.
 */
static void print_formatted_in_unicode(const char *format, ...)
{
    char text[200];
    va_list args;

    va_start(args, format);
    vsnprintf(text, sizeof text, format, args);
    va_end(args);
    print_ascii_str_in_unicode(text);
}

/*
 * Copies `count` raw bytes to stdout unchanged.  `update_py_tab` is unused;
 * it is kept only so existing call sites keep their shape.
 */
static void print_char2(const unsigned char *start_pos, int count,
                        bool update_py_tab)
{
    (void)update_py_tab;
    if (count > 0) {
        fwrite(start_pos, 1, (size_t)count, stdout);
    }
}

#ifdef DEBUG
/* Writes the raw bytes of pinyin-table entry `idx` (wrapped to the table). */
static void print_py(int idx)
{
    const py_tab *entry = &cel_py_tab[idx % count_py];

    print_char2(entry->pinyin, entry->len, false);
}
#endif

/*
 * Reads a 2-byte little-endian integer from `fp`, independent of host byte
 * order.  Returns false when fewer than two bytes are available.
 */
static bool read_u16le(FILE *fp, uint16_t *value)
{
    unsigned char raw[2];

    if (fread(raw, 1, sizeof raw, fp) != sizeof raw) {
        return false;
    }
    *value = (uint16_t)(raw[0] | ((unsigned)raw[1] << 8));
    return true;
}

/* Reports an unexpected end of data on stderr; returns the failure status. */
static int report_truncated(void)
{
    fprintf(stderr, "Error, .scel file is truncated or corrupt\n");
    return EXIT_FAILURE;
}

/*
 * Reads one (index, len, pinyin) entry at the current file position into
 * `item`.  Returns false on a short read or when `len` does not fit the
 * entry's buffer (the original code overflowed the buffer in that case).
 */
static bool read_py_item(py_tab *item)
{
    uint16_t index;
    uint16_t len;

    if (!read_u16le(in_file, &index) || !read_u16le(in_file, &len)) {
        return false;
    }
    if (len > sizeof item->pinyin) {
        return false;
    }
    if (fread(item->pinyin, 1, len, in_file) != len) {
        return false;
    }
    item->index = index;
    item->len = len;
    return true;
}

/*
 * Loads the global pinyin table into cel_py_tab and sets count_py.
 * The table starts with a 2-byte entry count followed by two bytes that
 * are skipped here.  A count of zero is rejected because count_py is
 * later used as a modulus; a count above the array capacity is rejected
 * to avoid writing past cel_py_tab.
 */
static bool load_py_table(void)
{
    uint16_t count;
    uint16_t skipped;

    if (fseek(in_file, OFF_PY_TABLE, SEEK_SET) != 0) {
        fprintf(stderr, "Seek error\n");
        return false;
    }
    if (!read_u16le(in_file, &count) || !read_u16le(in_file, &skipped)) {
        report_truncated();
        return false;
    }
    if (count == 0 || count > PY_TABLE_MAX) {
        fprintf(stderr, "Error, .scel file maybe corrupt: bad pinyin table size %u\n",
                (unsigned)count);
        return false;
    }
    for (unsigned i = 0; i < count; i++) {
        if (!read_py_item(&cel_py_tab[i])) {
            report_truncated();
            return false;
        }
    }
    count_py = count;
    return true;
}

#ifdef OUT_HEADER
/*
 * Prints `label`, then the header field stored in file bytes [start, end),
 * then CR LF.  The field is a fixed-size area of 2-byte code units; all-zero
 * code units (padding) are skipped so the text output contains no U+0000.
 */
static bool print_header_field(const char *label, long start, long end)
{
    unsigned char field[OFF_PY_TABLE - OFF_SAMPLES];    /* largest field */
    size_t size = (size_t)(end - start);

    print_ascii_str_in_unicode(label);
    if (size > sizeof field
        || fseek(in_file, start, SEEK_SET) != 0
        || fread(field, 1, size, in_file) != size) {
        report_truncated();
        return false;
    }
    for (size_t i = 0; i + 1 < size; i += 2) {
        if (field[i] != 0 || field[i + 1] != 0) {
            print_char2(field + i, 2, false);
        }
    }
    print_unicode_crlr();
    return true;
}
#endif

/*
 * Copies `count` bytes from the input file to stdout in small chunks, so a
 * word of any length is handled without a fixed-size word buffer.
 * Returns false when the file ends early.
 */
static bool copy_input_to_stdout(size_t count)
{
    unsigned char chunk[256];

    while (count > 0) {
        size_t want = count < sizeof chunk ? count : sizeof chunk;

        if (fread(chunk, 1, want, in_file) != want) {
            return false;
        }
        print_char2(chunk, (int)want, false);
        count -= want;
    }
    return true;
}

/*
 * Decodes the word table starting at `words_offset` and prints one line per
 * record: "<syllable> <syllable> ... ;<word>,<word>...".
 * Decoding stops quietly at the end of the file or at the first record
 * whose homophone count is 0 or above SAME_PRONOUNCE_MAX.
 */
static int decode_words(long words_offset)
{
    unsigned char raw_py[PY_INDEXES_MAX * 2];
    long file_size;

    if (fseek(in_file, 0L, SEEK_END) != 0
        || (file_size = ftell(in_file)) < 0
        || fseek(in_file, words_offset, SEEK_SET) != 0) {
        fprintf(stderr, "Seek error\n");
        return EXIT_FAILURE;
    }
#ifdef DEBUG
    print_formatted_in_unicode("\r\n" "file_size:%ld\r\n", file_size);
#endif

    for (;;) {
        uint16_t same_pronounce_num;
        uint16_t py_table_len;
        long record_pos = ftell(in_file);

        if (record_pos < 0 || record_pos >= file_size) {
            break;
        }
#ifdef DEBUG
        print_formatted_in_unicode("\r\n" "cur_fptr:%ld(0x%08lx)\r\n",
                                   record_pos, (unsigned long)record_pos);
#endif
        if (!read_u16le(in_file, &same_pronounce_num)
            || !read_u16le(in_file, &py_table_len)) {
            return report_truncated();
        }
        if (py_table_len > sizeof raw_py) {
            /*
             * "\r\n" is a separate literal on purpose: written as "\xd\xa"
             * directly in front of "Error", the hex escape swallows the 'E'.
             * As before, this is reported inside the output and decoding
             * simply stops.
             */
            print_formatted_in_unicode(
                "\r\n" "Error, .scel file maybe corrupt: too big size of py_table:%d(0x%08x), at file:0x%lx\r\n",
                (int)py_table_len, (unsigned)py_table_len,
                (unsigned long)ftell(in_file));
            break;
        }
        if (same_pronounce_num == 0 || same_pronounce_num > SAME_PRONOUNCE_MAX) {
            break;
        }
        if (fread(raw_py, 1, py_table_len, in_file) != py_table_len) {
            return report_truncated();
        }
#ifdef DEBUG
        print_formatted_in_unicode("same_pronounce_num:%d\r\n",
                                   (int)same_pronounce_num);
        print_formatted_in_unicode("py_table_len:%d\r\n", (int)py_table_len);
#endif

        /* print PY string , e.g. "Zuo You Wei Nan" */
        for (unsigned i = 0; i < py_table_len / 2u; i++) {
            unsigned idx = raw_py[2 * i] | ((unsigned)raw_py[2 * i + 1] << 8);
            /* out-of-range indexes wrap around the table, as before */
            const py_tab *entry = &cel_py_tab[idx % (unsigned)count_py];

            print_char2(entry->pinyin, entry->len, false);
            print_unicode_space();
        }
        print_ascii_in_unicode(';');

        for (unsigned j = 0; j < same_pronounce_num; j++) {
            uint16_t word_len;
            uint16_t ext_len;

            if (!read_u16le(in_file, &word_len)
                || !copy_input_to_stdout(word_len)
                || !read_u16le(in_file, &ext_len)
                /* skip the extension using its stored length rather than
                 * assuming it is always 10 bytes */
                || fseek(in_file, (long)ext_len, SEEK_CUR) != 0) {
                return report_truncated();
            }
            if (j + 1 != same_pronounce_num) {
                print_ascii_in_unicode(',');
            }
        }
        print_unicode_crlr();
    }
    return EXIT_SUCCESS;
}

/*
 * Validates the file header, then writes the BOM, the optional header
 * fields and every word record.  Returns the process exit status.
 */
static int decode_scel(void)
{
    static const unsigned char header_magic[12] = {
        0x40, 0x15, 0x00, 0x00, 0x44, 0x43, 0x53, 0x01, 0x01, 0x00, 0x00, 0x00
    };
    unsigned char header[sizeof header_magic];
    unsigned char scel_type;
    long words_offset;

    if (fread(header, 1, sizeof header, in_file) != sizeof header) {
        fprintf(stderr, "Not a .scel file, quit!\n");
        return EXIT_FAILURE;
    }
    /* Byte 4 selects the layout; normalise it before comparing the magic. */
    scel_type = header[4];
    header[4] = 0x44;
    if (memcmp(header, header_magic, sizeof header_magic) != 0) {
        fprintf(stderr, "Not a .scel file, quit!\n");
        return EXIT_FAILURE;
    }
    switch (scel_type) {
    case 0x44:                  /* 'D' */
        words_offset = OFF_WORDS_TYPE_D;
        break;
    case 0x45:                  /* 'E' */
        words_offset = OFF_WORDS_TYPE_E;
        break;
    default:
        fprintf(stderr, ".scel file corrupted, quit!\n");
        return EXIT_FAILURE;
    }

    /* UTF-16 little-endian byte-order mark */
    putchar(0xff);
    putchar(0xfe);

#ifdef OUT_HEADER
    /* display file header info */
    if (!print_header_field("Name: ", OFF_TITLE, OFF_TYPE)
        || !print_header_field("Type: ", OFF_TYPE, OFF_DESC)
        || !print_header_field("Desc: ", OFF_DESC, OFF_SAMPLES)
        || !print_header_field("Smpl: ", OFF_SAMPLES, OFF_PY_TABLE)) {
        return EXIT_FAILURE;
    }
    print_unicode_crlr();
#endif

    if (!load_py_table()) {
        return EXIT_FAILURE;
    }
#ifdef DEBUG
    for (int i = 0; i < count_py; i++) {
        print_formatted_in_unicode("%03d(0x%03x):", i, (unsigned)i);
        print_py(i);
        print_unicode_crlr();
    }
#endif

    return decode_words(words_offset);
}

/*
 * Usage: dis_sogo_cell scel_file > output.txt
 * Exits with EXIT_SUCCESS when the file was decoded, EXIT_FAILURE on a
 * usage error, an unreadable or invalid input file, or an output error.
 */
int main(int argc, char **argv)
{
    int status;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s scel_file\n",
                argc > 0 ? argv[0] : "dis_sogo_cell");
        return EXIT_FAILURE;
    }

    in_file = fopen(argv[1], "rb");
    if (!in_file) {
        fprintf(stderr, "Can't open input file '%s', %s\n",
                argv[1], strerror(errno));
        return EXIT_FAILURE;
    }

    status = decode_scel();
    fclose(in_file);

    if (fflush(stdout) != 0 || ferror(stdout)) {
        fprintf(stderr, "Error writing output\n");
        status = EXIT_FAILURE;
    }
    return status;
}



0 0
原创粉丝点击