crf预处理更改
来源:互联网 发布:汇龙软件 编辑:程序博客网 时间:2024/05/16 11:50
//********************test******************************
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <string>
#include <vector>
/* Space-separated token lists that the Is* classifiers search with strstr(). */
/* Chinese numerals. */
#define C_NUMBER ("一 二 三 四 五 六 七 八 九 十 百 千 万 亿 壹 贰 叁 肆 伍 陆 柒 捌 玖 拾 佰 仟")
/* ASCII digits. */
#define E_NUMBER ("1 2 3 4 5 6 7 8 9 0")
/* ASCII letters. */
#define E_ENGLISH ("A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z")
/* Full-width punctuation and box-drawing marks. */
#define PUNCTION ("。 , 、 ; : ? ! “ ” ‘ ’╗ ╚ ┐ └ ( ) … … — — — 《 》 〈 〉 · .")
/* ASCII punctuation; the double quote has to be escaped with a backslash,
 * otherwise the literal ends early and the line does not compile. */
#define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")
/* Not referenced by the code in this listing. */
#define TEST_LINE_NUM 5000
/* Size of the line buffers handed to fgets(). */
#define MAXLINELEN 1024
using namespace std;
int get_test(const char *src,const char *dst);
int get_test_open_tst(const char *src2,const char *dst2);
void chomp(char *srcline);
void from_seg_to_tag(const char *line,vector<string> &dst_line);
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data);
/* Entry point of the test-data converter.
 *
 * Usage: <program> in_file out_file
 * Hands both paths to get_test_open_tst().
 * Returns 0 on success and 1 on a usage error or when the conversion fails. */
int main(int argc,char **argv)
{
    /* Closed-test invocation, currently disabled:
     *   char src[] = "data.txt";
     *   char dst[] = "test.txt";
     *   char src2[] = "news.txt";
     *   char dst2[] = "test2.txt";
     *   get_test(src,dst);
     */
    if (argc != 3)
    {
        printf("usage: %s in_file out_file\n",argv[0]);
        return(1);
    }
    //get_test_open_tst("1","2");
    /* get_test_open_tst() reports failure with a negative value. */
    if (get_test_open_tst(argv[1],argv[2]) < 0)
        return(1);
    return(0);
}
/* Build closed-test files from training data.
 *
 * Reads src line by line. Every line is copied unchanged to "<dst>.tst".
 * Lines that start with CR or LF are also copied unchanged to dst; for every
 * other line only the first three whitespace-separated fields are written
 * to dst. Processing stops at end of input or after MAXLINELEN lines.
 *
 * Returns 1 on success, -1 if any of the three files cannot be opened. */
int get_test(const char *src,const char *dst)
{
    FILE *fin = fopen(src,"rb");
    if(NULL == fin)
    {
        printf("can't open %s\n",src);
        return(-1);
    }
    FILE *fout = fopen(dst,"wb");
    if(NULL == fout)
    {
        printf("can't open %s\n",dst);
        fclose(fin);
        return(-1);
    }
    /* Built as a std::string so a long dst cannot overflow a fixed buffer. */
    const string tst_name = string(dst) + ".tst";
    FILE *fout2 = fopen(tst_name.c_str(),"wb");
    if(NULL == fout2)
    {
        printf("can't open %s\n",tst_name.c_str());
        fclose(fin);
        fclose(fout);
        return(-1);
    }

    char line[MAXLINELEN];
    char first[64],second[64],third[64];
    int num = 0;
    /* NOTE(review): the cap is the line-buffer size; TEST_LINE_NUM is defined
     * but unused and may have been the intended limit - confirm. */
    /* Test fgets() itself: looping on feof() handles the last line twice and
     * reads an uninitialised buffer when the file is empty. */
    while(num < MAXLINELEN && fgets(line,MAXLINELEN,fin) != NULL)
    {
        fprintf(fout2,"%s",line);
        if(line[0] == 0x0A || line[0] == 0x0D)
        {
            fprintf(fout,"%s",line);
        }
        else
        {
            /* Start from empty fields so a short line never prints stale data,
             * and bound every %s to the 64-byte buffers. */
            first[0] = second[0] = third[0] = '\0';
            sscanf(line,"%63s %63s %63s",first,second,third);
            fprintf(fout,"%s %s %s\n",first,second,third);
        }
        num++;
    }
    fclose(fin);
    fclose(fout);
    fclose(fout2);
    return(1);
}
/* Cut srcline off, in place, at its first CR (0x0D) or LF (0x0A).
 * A string that contains neither is left as it is. */
void chomp(char *srcline)
{
    char *cursor = srcline;
    while(*cursor != '\0' && *cursor != 0x0D && *cursor != 0x0A)
        ++cursor;
    *cursor = '\0';
}
/* Tell whether str contains exactly three runs of consecutive spaces.
 * A run at the very start or end of the string is counted as well. */
bool ValidColumn(const string &str)
{
    size_t gaps = 0;
    bool in_gap = false;
    for(size_t pos = 0;pos < str.size();++pos)
    {
        const bool is_space = (str[pos] == ' ');
        /* Count a run once, on its first space. */
        if(is_space && !in_gap)
            ++gaps;
        in_gap = is_space;
    }
    return(gaps == 3);
}
/* Convert raw text for the open test.
 *
 * Each line of in_file has its line ending removed by chomp(), is split into
 * tokens by from_seg_to_tag() and is expanded into feature rows by
 * from_tag_to_data(). The rows are written to out_file one per line, followed
 * by one empty line per input line.
 *
 * Returns 1 on success, -1 if either file cannot be opened. */
int get_test_open_tst(const char *in_file,const char *out_file)
{
    FILE *fin = fopen(in_file,"rb");
    if(NULL == fin)
    {
        printf("can't open %s\n",in_file);
        return(-1);
    }
    FILE *fout = fopen(out_file,"wb");
    if(NULL == fout)
    {
        printf("can't open %s\n",out_file);
        fclose(fin);
        return(-1);
    }

    char line[MAXLINELEN];
    /* Test fgets() itself: looping on feof() would convert the last line twice. */
    while(fgets(line,MAXLINELEN,fin) != NULL)
    {
        chomp(line);
        vector<string> v_dst,v_data;
        from_seg_to_tag(line,v_dst);
        from_tag_to_data(v_dst,v_data);
        for(size_t n = 0;n < v_data.size();n++)
        {
            /* Of consecutive identical " " or "\n" rows only the last is written. */
            const bool repeated = (n+1) < v_data.size()
                && v_data[n] == v_data[n+1]
                && (v_data[n] == " " || v_data[n] == "\n");
            if(repeated)
                continue;
            fprintf(fout,"%s\n",v_data[n].c_str());
        }
        fprintf(fout,"\n");
    }
    fclose(fin);
    fclose(fout);
    return(1);
}
/* Return true when word_cur occurs inside the E_ENGLISH list or starts with
 * an alphabetic character. An empty word is never English. */
bool IsEnglish(const string &word_cur)
{
    /* strstr() finds an empty needle in any haystack, so reject it first. */
    if(word_cur.empty())
        return(false);
    if(strstr(E_ENGLISH,word_cur.c_str()))
        return(true);
    /* <ctype.h> functions need a value representable as unsigned char;
     * a plain char holding a byte >= 0x80 may be negative. */
    return(isalpha(static_cast<unsigned char>(word_cur[0])) != 0);
}
/* Return true when word_cur occurs inside the C_NUMBER list of Chinese
 * numerals. An empty word is never a number. */
bool IsNumber(const string &word_cur)
{
    /* strstr() finds an empty needle in any haystack, so reject it first. */
    if(word_cur.empty())
        return(false);
    return(strstr(C_NUMBER,word_cur.c_str()) != NULL);
}
/* Return true when word_cur occurs inside the E_NUMBER digit list or starts
 * with a decimal digit. An empty word is never a number. */
bool IsEnumber(const string &word_cur)
{
    /* strstr() finds an empty needle in any haystack, so reject it first. */
    if(word_cur.empty())
        return(false);
    if(strstr(E_NUMBER,word_cur.c_str()))
        return(true);
    /* <ctype.h> functions need a value representable as unsigned char;
     * a plain char holding a byte >= 0x80 may be negative. */
    return(isdigit(static_cast<unsigned char>(word_cur[0])) != 0);
}
/* Return true when word_cur occurs inside the PUNCTION or E_PUNCTION list.
 * An empty word is never punctuation. */
bool IsPunc(const string &word_cur)
{
    /* strstr() finds an empty needle in any haystack, so reject it first. */
    if(word_cur.empty())
        return(false);
    return(strstr(PUNCTION,word_cur.c_str()) != NULL
        || strstr(E_PUNCTION,word_cur.c_str()) != NULL);
}
/* Split a "word/tag" token into its word and its tag.
 *
 * word receives everything before the last '/', tgt everything after it with
 * spaces removed. When src holds no '/', word receives all of src, tgt stays
 * empty and an error message is printed. */
void split_word_tgt(string &word,string &tgt,const string &src)
{
    word = "";
    tgt = "";
    /* Split at the LAST '/': the word itself may contain '/', for example a
     * lone "/" or "http://x". */
    const size_t slash = src.rfind('/');
    if(slash == string::npos)
    {
        word = src;
        printf("error in split %s\n",src.c_str());
        return;
    }
    word.assign(src,0,slash);
    for(size_t n = slash+1;n < src.length();n++)
    {
        if(src[n] != ' ')
            tgt += src[n];
    }
}
/* Attach feature columns (punctuation flag, character class) to every token.
 *
 * For each entry of v_dst one row "token <punc> <class>" is appended to
 * v_data. <punc> is y_punc or n_punc; <class> is C_num (Chinese numeral),
 * A_num (Arabic digit), E_num (English) or N_num (anything else).
 * An entry equal to " " is appended unchanged. */
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data)
{
    const size_t count = v_dst.size();
    for(size_t idx = 0;idx < count;++idx)
    {
        const string token = v_dst[idx];
        if(token == " ")
        {
            v_data.push_back(" ");
            continue;
        }

        const char *punc_tag = IsPunc(token) ? "y_punc" : "n_punc";

        /* The first matching class wins, in this order. */
        const char *class_tag = "N_num";
        if(IsNumber(token))
            class_tag = "C_num";
        else if(IsEnumber(token))
            class_tag = "A_num";
        else if(IsEnglish(token))
            class_tag = "E_num";

        string row = token;
        row += " ";
        row += punc_tag;
        row += " ";
        row += class_tag;
        v_data.push_back(row);
    }
}
/* Extract one token that starts at line and append it to array.
 *
 * A run of single-byte characters (values below 0x80) up to the next space
 * forms one token. A byte with the high bit set is taken together with the
 * byte after it as one two-byte character.
 * NOTE(review): this presumes a double-byte text encoding such as GBK - confirm.
 *
 * Returns a pointer to the first byte that was not consumed. An empty string
 * yields no token and is returned unchanged. */
const char *split_char_str(const char *line,vector<string> &array)
{
    const char *pline = line;
    if(*pline == '\0')
        return(pline);
    /* Compare as unsigned char so the result does not depend on whether
     * plain char is signed on the target platform. */
    if(static_cast<unsigned char>(*pline) < 0x80)
    {
        string tmp;
        while(*pline && static_cast<unsigned char>(*pline) < 0x80 && *pline != ' ')
        {
            tmp += *pline;
            pline++;
        }
        if(!tmp.empty())
            array.push_back(tmp);
    }
    else
    {
        string tmp(1,*pline);
        pline++;
        /* A lead byte at the very end has no partner; stopping here keeps
         * pline from stepping over the terminator. */
        if(*pline != '\0')
        {
            tmp += *pline;
            pline++;
        }
        array.push_back(tmp);
    }
    return(pline);
}
/* Walk through line, skip spaces, and append every token returned by
 * split_char_str() to v_dst. */
void from_seg_to_tag(const char *line,vector<string> &v_dst)
{
    for(const char *cursor = line;*cursor != '\0';)
    {
        if(*cursor == ' ')
        {
            ++cursor;
            continue;
        }
        vector<string> pieces;
        cursor = split_char_str(cursor,pieces);
        if(!pieces.empty())
            v_dst.push_back(pieces.front());
    }
}
//***********************train*******************************
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <iostream>
#include <string>
#include <vector>
/* Size of the line buffer handed to fgets(); parenthesised so the macro
 * expands safely inside larger expressions. */
#define MAXLINELEN (1024*5)
/* Space-separated token lists that the Is* classifiers search with strstr(). */
/* Chinese numerals. */
#define C_NUMBER ("一 二 三 四 五 六 七 八 九 十 百 千 万 亿 壹 贰 叁 肆 伍 陆 柒 捌 玖 拾 佰 仟")
/* ASCII digits. */
#define E_NUMBER ("1 2 3 4 5 6 7 8 9 0")
/* ASCII letters. */
#define E_ENGLISH ("A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z")
/* Full-width punctuation and box-drawing marks. */
#define PUNCTION ("。 , 、 ; : ? ! “ ” ‘ ’╗ ╚ ┐ └ ( ) … … — — — 《 》 〈 〉 · .")
/* ASCII punctuation; the double quote has to be escaped with a backslash,
 * otherwise the literal ends early and the line does not compile. */
#define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")
using namespace std;
int trans_file(const char *in_file,const char *out_file);
int chomp(char *srcline);
void from_seg_to_tag(const char *line,vector<string> &dst_line);
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data);
/* Entry point of the training-data converter.
 *
 * Usage: <program> in_file out_file
 * Hands both paths to trans_file().
 * Returns 0 on success and 1 on a usage error or when the conversion fails. */
int main(int argc,char **argv)
{
    if (argc != 3)
    {
        printf("usage:%s in_file out_file\n", argv[0]);
        return(1);
    }
    /* trans_file() reports failure with a negative value. */
    if (trans_file(argv[1],argv[2]) < 0)
        return(1);
    return(0);
}
/* Cut srcline off, in place, at its first CR (0x0D) or LF (0x0A).
 * Returns the length of the string that remains. */
int chomp(char *srcline)
{
    int len = 0;
    while(srcline[len] != '\0' && srcline[len] != 0x0D && srcline[len] != 0x0A)
        ++len;
    srcline[len] = '\0';
    return(len);
}
/* Tell whether str contains exactly three runs of consecutive spaces,
 * which is what a four-column row looks like. Runs at the very start or
 * end of the string are counted too. */
bool ValidColumn(const string &str)
{
    size_t runs = 0;
    size_t pos = str.find(' ');
    while(pos != string::npos)
    {
        ++runs;
        /* Jump over the rest of this run, then look for the next one. */
        pos = str.find_first_not_of(' ',pos);
        if(pos == string::npos)
            break;
        pos = str.find(' ',pos);
    }
    return(runs == 3);
}
/* Convert a segmented training corpus into feature rows.
 *
 * Every line of in_file that is at least two bytes long once chomp() has
 * removed the line ending is tagged by from_seg_to_tag() and expanded by
 * from_tag_to_data(). Rows accepted by ValidColumn() are written to out_file
 * one per line; rejected rows are reported on stdout. One empty line is
 * written after each processed input line.
 *
 * Returns 1 on success, -1 if either file cannot be opened. */
int trans_file(const char *in_file,const char *out_file)
{
    FILE *fin = fopen(in_file,"rb");
    if(NULL == fin)
    {
        printf("can't open %s\n",in_file);
        return(-1);
    }
    FILE *fout = fopen(out_file,"wb");
    if(NULL == fout)
    {
        printf("can't open %s\n",out_file);
        fclose(fin);
        return(-1);
    }

    char line[MAXLINELEN];
    /* Test fgets() itself: looping on feof() would convert the last line twice. */
    while(fgets(line,MAXLINELEN,fin) != NULL)
    {
        if(chomp(line) < 2)
            continue;
        vector<string> v_dst,v_data;
        from_seg_to_tag(line,v_dst);
        from_tag_to_data(v_dst,v_data);
        for(size_t n = 0;n < v_data.size();n++)
        {
            if(ValidColumn(v_data[n]))
                fprintf(fout,"%s\n",v_data[n].c_str());
            else
                printf("column size error =%s\n",v_data[n].c_str());
        }
        fprintf(fout,"\n");
    }
    fclose(fin);
    fclose(fout);
    return(1);
}
/* Return true when word_cur occurs inside the E_ENGLISH list or starts with
 * an alphabetic character. An empty word is never English. */
bool IsEnglish(const string &word_cur)
{
    /* strstr() finds an empty needle in any haystack, so reject it first. */
    if(word_cur.empty())
        return(false);
    if(strstr(E_ENGLISH,word_cur.c_str()))
        return(true);
    /* <ctype.h> functions need a value representable as unsigned char;
     * a plain char holding a byte >= 0x80 may be negative. */
    return(isalpha(static_cast<unsigned char>(word_cur[0])) != 0);
}
/* Return true when word_cur occurs inside the C_NUMBER list of Chinese
 * numerals. An empty word is never a number. */
bool IsNumber(const string &word_cur)
{
    /* strstr() finds an empty needle in any haystack, so reject it first. */
    if(word_cur.empty())
        return(false);
    return(strstr(C_NUMBER,word_cur.c_str()) != NULL);
}
/* Return true when word_cur occurs inside the E_NUMBER digit list or starts
 * with a decimal digit. An empty word is never a number. */
bool IsEnumber(const string &word_cur)
{
    /* strstr() finds an empty needle in any haystack, so reject it first. */
    if(word_cur.empty())
        return(false);
    if(strstr(E_NUMBER,word_cur.c_str()))
        return(true);
    /* <ctype.h> functions need a value representable as unsigned char;
     * a plain char holding a byte >= 0x80 may be negative. */
    return(isdigit(static_cast<unsigned char>(word_cur[0])) != 0);
}
/* Return true when word_cur occurs inside the PUNCTION or E_PUNCTION list.
 * An empty word is never punctuation. */
bool IsPunc(const string &word_cur)
{
    /* strstr() finds an empty needle in any haystack, so reject it first. */
    if(word_cur.empty())
        return(false);
    return(strstr(PUNCTION,word_cur.c_str()) != NULL
        || strstr(E_PUNCTION,word_cur.c_str()) != NULL);
}
/* Split a "word/tag" token into its word and its tag.
 *
 * word receives everything before the last '/', tgt everything after it with
 * spaces removed. When src holds no '/', word receives all of src, tgt stays
 * empty and an error message is printed. */
void split_word_tgt(string &word,string &tgt,const string &src)
{
    word = "";
    tgt = "";
    /* Split at the LAST '/': the tag is appended as a "/X " suffix, while the
     * word itself may contain '/' (a lone "/" becomes "//S "). Splitting at
     * the first '/' would leave such a word empty or cut short. */
    const size_t slash = src.rfind('/');
    if(slash == string::npos)
    {
        word = src;
        printf("error in split %s\n",src.c_str());
        return;
    }
    word.assign(src,0,slash);
    for(size_t n = slash+1;n < src.length();n++)
    {
        if(src[n] != ' ')
            tgt += src[n];
    }
}
/* Attach feature columns to every tagged token.
 *
 * Each entry of v_dst is split into word and tag by split_word_tgt(); one row
 * "word <punc> <class> tag" is appended to v_data. <punc> is y_punc or
 * n_punc; <class> is C_num (Chinese numeral), A_num (Arabic digit), E_num
 * (English) or N_num (anything else). */
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data)
{
    const size_t count = v_dst.size();
    for(size_t idx = 0;idx < count;++idx)
    {
        string token,label;
        split_word_tgt(token,label,v_dst[idx]);

        const char *punc_tag = IsPunc(token) ? "y_punc" : "n_punc";

        /* The first matching class wins, in this order. */
        const char *class_tag = "N_num";
        if(IsNumber(token))
            class_tag = "C_num";
        else if(IsEnumber(token))
            class_tag = "A_num";
        else if(IsEnglish(token))
            class_tag = "E_num";

        string row = token;
        row += " ";
        row += punc_tag;
        row += " ";
        row += class_tag;
        row += " ";
        row += label;
        v_data.push_back(row);
    }
}
/* Break the word that starts at line into units and append them to array.
 *
 * Scanning stops at the first space or at the end of the string. Inside the
 * word, a run of single-byte characters (values below 0x80) becomes one unit
 * and every byte with the high bit set is taken together with the byte after
 * it as one two-byte unit.
 * NOTE(review): this presumes a double-byte text encoding such as GBK - confirm.
 *
 * Returns a pointer to the space or terminator that ended the word. */
const char *split_char_str(const char *line,vector<string> &array)
{
    const char *pline = line;
    while(*pline && *pline != ' ')
    {
        /* Compare as unsigned char so the result does not depend on whether
         * plain char is signed on the target platform. */
        if(static_cast<unsigned char>(*pline) < 0x80)
        {
            string tmp;
            while(*pline && static_cast<unsigned char>(*pline) < 0x80 && *pline != ' ')
            {
                tmp += *pline;
                pline++;
            }
            if(!tmp.empty())
                array.push_back(tmp);
        }
        else
        {
            string tmp(1,*pline);
            pline++;
            /* A lead byte at the very end has no partner; stopping here keeps
             * pline from stepping over the terminator. */
            if(*pline != '\0')
            {
                tmp += *pline;
                pline++;
            }
            array.push_back(tmp);
        }
    }
    return(pline);
}
/* Turn a space-segmented line into position-tagged units.
 *
 * Each word is broken into units by split_char_str(). A single-unit word is
 * tagged "/S ". In longer words the last unit is tagged "/E " and the units
 * before it are tagged "/B ", "/B2 ", "/B3 " and then "/M " for every further
 * one. Each unit with its tag appended is pushed onto v_dst. */
void from_seg_to_tag(const char *line,vector<string> &v_dst)
{
    for(const char *cursor = line;*cursor != '\0';)
    {
        if(*cursor == ' ')
        {
            ++cursor;
            continue;
        }

        vector<string> units;
        cursor = split_char_str(cursor,units);

        const size_t count = units.size();
        for(size_t i = 0;i < count;++i)
        {
            const char *tag;
            if(count == 1)
                tag = "/S ";
            else if(i + 1 == count)
                tag = "/E ";
            else if(i == 0)
                tag = "/B ";
            else if(i == 1)
                tag = "/B2 ";
            else if(i == 2)
                tag = "/B3 ";
            else
                tag = "/M ";
            v_dst.push_back(units[i] + tag);
        }
    }
}
- crf预处理更改
- crf
- CRF++
- crf
- CRF
- CRF++
- CRF
- CRF
- CRF
- CRF
- 预处理
- 预处理
- 预处理
- 预处理
- 预处理
- 预处理
- 预处理
- 预处理
- BSP
- 【转】国内主要工作流厂商分析
- 网上总结到的GridView自定义分页
- oracle 9i 根据客户端机器名查询ip
- Java软件架构师所需要的资料
- crf预处理更改
- 关于offsetof()的warning!
- 经典仿谷歌分页实例
- 移植ffmpeg到android
- web.xml中的3种写法
- 25个jQuery的编程小抄
- 【转】[译] 富士F75EXR拍摄指南(适用于其它EXR系列相机)
- 格式化 数字
- 关于图像绘制并输出的问题,大家帮忙看看