细胞词库的解码C源程序
来源:互联网 发布:mysql text使用 编辑:程序博客网 时间:2024/04/30 23:56
//细胞词库解码C源程序,输出为UNICODE的TXT格式
说明:sogo输入法细胞词库解析源程序dis_sogo_cell.c,将.scel文件解码输出unicode格式的文本文件,可以提取出大量的中文词语,输出拼音及其中文词语,每一条记录一行,每条记录的拼音和中文词语用分号分隔,同音词之间以逗号分隔,格式如下所示:
bei di ;北地,北堤,
bei gou ;北沟,杯勾,
cai feng xiao qu ;彩俸小区,彩凤小区,彩风小区,
chang cheng shu dian ;长城书店,
chang ge zhuang cun ;常各庄村,
chang he da sha ;长和大厦,长河大厦,
使用方法(Linux下):
1. 编译:gcc dis_sogo_cell.c -o dis_sogo_cell 或直接 make
2. 使用:./dis_sogo_cell sogo_scel_file.scel > sogo_scel_file.scel.txt
3. 用shell脚本批量处理,将当前目录下的.scel文件转换为TXT文件:
#!/bin/bash
for scel_file in `ls *.scel`
do
./dis_sogo_cell ${scel_file} > ${scel_file}.txt
done
4. 使用程序包内提供的cell2txt.sh,在shell下直接执行即可将当前目录下的所有.scel文件提取为文本文件(UNICODE格式的)。如:
# ./cell2txt.sh
5. 如需将生成的unicode的文本文件转为ANSI编码的TXT文件,可以利用Linux的iconv命令,如:
# iconv -futf-16 -tGB18030 sogo_scel_file.scel.txt -o sogo_scel_file.scel_ANSI.txt
转换编码后的文件sogo_scel_file.scel_ANSI.txt将比原来的sogo_scel_file.scel.txt文件的字节数减小约50%,可极大节省存储空间,同时便于使用不支持UNICODE的文本编辑器查看输出的结果。
C代码如下:
/*
 * dis_sogo_cell.c
 *
 * snallieATtomDOTcom
 * Sun Nov 7 06:30:00 CST 2014
 *
 * Decodes a Sogou ".scel" cell dictionary and writes the pinyin strings and
 * the Chinese words to stdout as UTF-16LE text (preceded by a BOM).
 *
 * Example of output data (one record per line):
 *   bei di ;北地,北堤
 *   bei gou ;北沟,杯勾
 *   chang cheng shu dian ;长城书店
 *
 * Build : gcc dis_sogo_cell.c -o dis_sogo_cell
 * Usage : ./dis_sogo_cell sogo_scel_file.scel > sogo_scel_file.scel.txt
 * Batch : for f in *.scel ; do ./dis_sogo_cell "$f" > "$f.txt" ; done
 *
 * File layout as handled by this decoder (all integers are 16-bit
 * little-endian, all text is UTF-16LE, two bytes per character):
 *
 * 1. Global pinyin table, a list of (index, len, pinyin):
 *      index  : number identifying this pinyin syllable
 *      len    : length of the syllable in bytes
 *      pinyin : the syllable itself, len bytes
 *
 * 2. Word table, a list of
 *    (same_pronounce_num, py_table_len, py_table, {word_len, word, ext_len, ext}):
 *      same_pronounce_num : number of words sharing this pronunciation
 *      py_table_len       : size of py_table in bytes
 *      py_table           : list of pinyin indexes, one per syllable
 *      word_len           : size of the word in bytes
 *      word               : the word itself, word_len bytes
 *      ext_len            : size of the extension data (observed to be 10)
 *      ext                : extension data, skipped by this decoder
 *    The {word_len, word, ext_len, ext} group repeats same_pronounce_num times.
 *
 * Diagnostics are written to stderr so that they never end up inside the
 * redirected UTF-16 output file.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <stdbool.h>
#include <inttypes.h>

/* Comment this out to suppress the Name/Type/Desc/Smpl header lines. */
#define OUT_HEADER

enum {
    MAGIC_LEN          = 12,      /* bytes of file magic checked at offset 0  */
    TITLE_POS          = 0x130,   /* start of the dictionary name field       */
    TYPE_POS           = 0x338,   /* start of the dictionary type field       */
    DESC_POS           = 0x540,   /* start of the description field           */
    SAMPLES_POS        = 0xd40,   /* start of the sample words field          */
    PY_TABLE_POS       = 0x1540,  /* start of the global pinyin table         */
    WORDS_POS_TYPE_D   = 0x2628,  /* word table offset when magic byte 4 = 'D' */
    WORDS_POS_TYPE_E   = 0x26c4,  /* word table offset when magic byte 4 = 'E' */
    INFO_BUF_LEN       = 0x800,   /* largest header field (Desc / Smpl)       */
    MAX_PY_ENTRIES     = 0x280,   /* capacity of the in-memory pinyin table   */
    MAX_PY_BYTES       = 30,      /* largest pinyin syllable accepted, bytes  */
    MAX_PY_PER_WORD    = 150,     /* largest number of syllables per record   */
    MAX_WORD_BYTES     = 0xFFFF,  /* word_len is 16-bit, so this is the limit */
    MAX_SAME_PRONOUNCE = 20       /* sanity limit; larger values end decoding */
};

typedef struct py_t {
    uint16_t index;
    uint16_t len;                         /* bytes used in pinyin[] */
    unsigned char pinyin[MAX_PY_BYTES];   /* UTF-16LE, not NUL-terminated */
} py_tab;

static py_tab cel_py_tab[MAX_PY_ENTRIES];
static unsigned count_py;                 /* number of valid cel_py_tab entries */

/* Read one 16-bit little-endian integer; false on EOF or read error. */
static bool read_u16le(FILE *fp, uint16_t *out)
{
    unsigned char raw[2];

    if (fread(raw, 1, sizeof raw, fp) != sizeof raw) {
        return false;
    }
    *out = (uint16_t)(raw[0] | ((unsigned)raw[1] << 8));
    return true;
}

/* Emit one 7-bit ASCII character as a UTF-16LE code unit. */
static void print_ascii_in_unicode(unsigned char a)
{
    putchar(a & 0x7f);
    putchar('\0');
}

/* Emit a NUL-terminated ASCII string as UTF-16LE. */
static void print_ascii_str_in_unicode(const char *ascii_str)
{
    for (; *ascii_str != '\0'; ascii_str++) {
        print_ascii_in_unicode((unsigned char)*ascii_str);
    }
}

/* Emit a UTF-16LE CR LF pair. */
static void print_unicode_crlr(void)
{
    print_ascii_in_unicode('\r');
    print_ascii_in_unicode('\n');
}

/* Emit a UTF-16LE space. */
static void print_unicode_space(void)
{
    print_ascii_in_unicode(' ');
}

/* Copy count raw bytes (already UTF-16LE) to stdout unchanged. */
static void print_char2(const unsigned char *start_pos, size_t count)
{
    fwrite(start_pos, 1, count, stdout);
}

/*
 * Emit the pinyin syllable selected by idx.  The index is reduced modulo the
 * table size, as the original decoder did, so an out-of-range index cannot
 * read outside the table.  Requires count_py > 0.
 */
static void print_py(unsigned idx)
{
    const py_tab *entry = &cel_py_tab[idx % count_py];

    print_char2(entry->pinyin, entry->len);
}

#ifdef OUT_HEADER
/*
 * Print the four fixed-size header fields as "Label: <raw field>" lines
 * followed by one blank line.  Each field is emitted in full, including any
 * zero padding, which keeps the output identical to earlier versions.
 * Returns false if a field cannot be read completely.
 */
static bool print_file_info(FILE *fp)
{
    static const struct {
        const char *label;
        long start;
        long end;
    } fields[] = {
        { "Name: ", TITLE_POS,   TYPE_POS     },
        { "Type: ", TYPE_POS,    DESC_POS     },
        { "Desc: ", DESC_POS,    SAMPLES_POS  },
        { "Smpl: ", SAMPLES_POS, PY_TABLE_POS },
    };
    unsigned char buf[INFO_BUF_LEN];
    size_t i;

    for (i = 0; i < sizeof fields / sizeof fields[0]; i++) {
        size_t field_len = (size_t)(fields[i].end - fields[i].start);

        print_ascii_str_in_unicode(fields[i].label);
        if (fseek(fp, fields[i].start, SEEK_SET) != 0 ||
            fread(buf, 1, field_len, fp) != field_len) {
            fprintf(stderr, "Can't read header field at 0x%lx, file too short?\n",
                    (unsigned long)fields[i].start);
            return false;
        }
        print_char2(buf, field_len);
        print_unicode_crlr();
    }
    print_unicode_crlr();
    return true;
}
#endif

/*
 * Load the global pinyin table into cel_py_tab and set count_py.
 *
 * The table starts with a 4-byte header: the low 16 bits are used as the
 * entry count and the high 16 bits are expected to be zero.  Entries are
 * stored in file order.  Returns false when the table is missing, empty,
 * larger than cel_py_tab, or contains a syllable that does not fit.
 */
static bool load_py_table(FILE *fp)
{
    uint16_t count;
    uint16_t count_hi;
    unsigned i;

    if (fseek(fp, PY_TABLE_POS, SEEK_SET) != 0 ||
        !read_u16le(fp, &count) || !read_u16le(fp, &count_hi)) {
        return false;
    }
    if (count_hi != 0 || count == 0 || count > MAX_PY_ENTRIES) {
        return false;
    }

    for (i = 0; i < count; i++) {
        py_tab *entry = &cel_py_tab[i];

        if (!read_u16le(fp, &entry->index) || !read_u16le(fp, &entry->len)) {
            return false;
        }
        /* Reject lengths that would overflow the fixed-size syllable buffer. */
        if (entry->len == 0 || entry->len > MAX_PY_BYTES) {
            return false;
        }
        if (fread(entry->pinyin, 1, entry->len, fp) != entry->len) {
            return false;
        }
    }

    count_py = count;
    return true;
}

/*
 * Decode the word table starting at words_pos and print one line per record:
 * "<py> <py> ... ;<word>,<word>,...<CR><LF>" in UTF-16LE.
 *
 * Decoding stops quietly at end of file, at a truncated record, or at the
 * first record whose same_pronounce_num is 0 or above MAX_SAME_PRONOUNCE.
 */
static void dump_words(FILE *fp, long words_pos)
{
    static unsigned char word[MAX_WORD_BYTES];
    uint16_t py_table[MAX_PY_PER_WORD];

    if (fseek(fp, words_pos, SEEK_SET) != 0) {
        fprintf(stderr, "Seek error\n");
        return;
    }

    for (;;) {
        uint16_t same_pronounce_num;
        uint16_t py_table_len;
        size_t py_count;
        size_t i;
        unsigned j;
        bool complete = true;

        if (!read_u16le(fp, &same_pronounce_num) ||
            !read_u16le(fp, &py_table_len)) {
            break;              /* clean end of file */
        }
        if (py_table_len > sizeof py_table) {
            fprintf(stderr,
                    "Error, .scel file maybe corrupt: too big size of "
                    "py_table:%u(0x%08x), at file:0x%lx\n",
                    (unsigned)py_table_len, (unsigned)py_table_len,
                    (unsigned long)ftell(fp));
            break;
        }

        py_count = py_table_len / 2;
        for (i = 0; i < py_count; i++) {
            if (!read_u16le(fp, &py_table[i])) {
                return;
            }
        }
        /* An odd byte count leaves one stray byte that still has to be consumed. */
        if ((py_table_len & 1) != 0 && fgetc(fp) == EOF) {
            return;
        }

        if (same_pronounce_num == 0 || same_pronounce_num > MAX_SAME_PRONOUNCE) {
            break;
        }

        /* Pinyin string, e.g. "zuo you wei nan ", then the ';' separator. */
        for (i = 0; i < py_count; i++) {
            print_py(py_table[i]);
            print_unicode_space();
        }
        print_ascii_in_unicode(';');

        for (j = 0; j < same_pronounce_num; j++) {
            uint16_t word_len;
            uint16_t ext_len;

            /* Skip exactly ext_len bytes rather than assuming a fixed size. */
            if (!read_u16le(fp, &word_len) ||
                fread(word, 1, word_len, fp) != word_len ||
                !read_u16le(fp, &ext_len) ||
                fseek(fp, (long)ext_len, SEEK_CUR) != 0) {
                complete = false;
                break;
            }
            print_char2(word, word_len);
            if (j + 1 != same_pronounce_num) {
                print_ascii_in_unicode(',');
            }
        }
        print_unicode_crlr();
        if (!complete) {
            break;              /* truncated record: line is closed, stop here */
        }
    }
}

/*
 * Entry point: dis_sogo_cell <file.scel>
 *
 * Returns EXIT_SUCCESS when the file was recognised and decoded, and
 * EXIT_FAILURE on a usage error, an unreadable or unrecognised file, a bad
 * pinyin table, or a failure writing to stdout.
 *
 * NOTE: the output is binary UTF-16LE; on platforms whose stdout performs
 * newline translation the stream must be switched to binary mode by the
 * caller's environment.
 */
int main(int argc, char **argv)
{
    static const unsigned char header_magic[MAGIC_LEN] = {
        0x40, 0x15, 0x00, 0x00, 0x44, 0x43, 0x53, 0x01, 0x01, 0x00, 0x00, 0x00
    };
    unsigned char header[MAGIC_LEN];
    unsigned char scel_type;
    long words_pos;
    int status = EXIT_FAILURE;
    FILE *in_file;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s scel_file\n",
                argc > 0 ? argv[0] : "dis_sogo_cell");
        return EXIT_FAILURE;
    }

    in_file = fopen(argv[1], "rb");
    if (!in_file) {
        fprintf(stderr, "Can't open input file '%s', %s\n",
                argv[1], strerror(errno));
        return EXIT_FAILURE;
    }

    if (fread(header, 1, MAGIC_LEN, in_file) != MAGIC_LEN) {
        fprintf(stderr, "Not a .scel file, quit!\n");
        goto out;
    }

    /* Byte 4 selects the file variant; normalise it before comparing magic. */
    scel_type = header[4];
    header[4] = 0x44;
    if (memcmp(header, header_magic, MAGIC_LEN) != 0) {
        fprintf(stderr, "Not a .scel file, quit!\n");
        goto out;
    }

    switch (scel_type) {
    case 0x44:                  /* 'D' */
        words_pos = WORDS_POS_TYPE_D;
        break;
    case 0x45:                  /* 'E' */
        words_pos = WORDS_POS_TYPE_E;
        break;
    default:
        fprintf(stderr, ".scel file corrupted, quit!\n");
        goto out;
    }

    /* UTF-16LE byte order mark. */
    putchar(0xff);
    putchar(0xfe);

#ifdef OUT_HEADER
    if (!print_file_info(in_file)) {
        goto out;
    }
#endif

    if (!load_py_table(in_file)) {
        fprintf(stderr, ".scel file corrupted, quit!\n");
        goto out;
    }

    dump_words(in_file, words_pos);

    if (fflush(stdout) != 0 || ferror(stdout)) {
        fprintf(stderr, "Error writing output\n");
        goto out;
    }
    status = EXIT_SUCCESS;

out:
    fclose(in_file);
    return status;
}
0 0
- 细胞词库的解码C源程序
- [python]将搜狗(sogou)的细胞词库转换为mmseg的词库
- 出售搜狗细胞词库
- 搜狗拼音细胞词库的特殊用法--语料库,中文NLP
- 我的搜狗拼音皮肤和细胞词库都发布咯!
- 3软件--给ibus-pinyin或fcitx加上搜狗细胞百多万的词库
- 搜狗细胞词库-fcitx&ibus拼音输入法词库
- 搜狗词库解码
- C源程序的关键字
- 将搜狗细胞词库转换为其他输入法词库——深蓝词库转换1.1发布
- 遗传算法的C源程序
- C可怕的病毒源程序
- vc如何识别c源程序和c++源程序的?
- ICTCLAS2012分词库在C环境下的使用
- xvid解码 SDL显示 源程序
- xvid解码 SDL显示 源程序
- 中英文url解码vc++源程序
- 线性规划的源程序(c语言版)
- PopupMenu
- 《机器学习实战》读书笔记—k近邻算法c语言实现(win下)
- 用一个类实现网络通讯功能:<nstream>
- Tutorials for 2014 SWJTU Freshman Invitation Programming Contest - Online Round
- MAC下安装与配置MySQL
- 细胞词库的解码C源程序
- Python集合(set)类型的基本操作
- javac编译问题
- MAC OS 安装Siblime Text2以及解决中文乱码问题
- CentOS安装Pure-FTPd+MySQL
- TextWatcher参数理解
- H.264中NAL、Slice与frame意思及相互关系
- hoj2662状态压缩dp
- windows配置网卡IP脚本和配置路由条目、默认路由