crf预处理更改

来源：互联网　发布：汇龙软件　编辑：程序博客网　时间：2024/05/16 11:50

//********************test******************************

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <string>
#include <vector>

/* Character classes searched with strstr() by the Is* predicates below. */

/* Chinese numerals, ordinary and financial forms. */
#define C_NUMBER ("一二三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟")
/* Full-width digits followed by ASCII digits, combined in one definition so
 * that neither list shadows the other. */
#define E_NUMBER ("１２３４５６７８９０" "1 2 3 4 5 6 7 8 9 0")
/* Full-width Latin letters. */
#define E_ENGLISH ("ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ")
/* Full-width (Chinese) punctuation. */
#define PUNCTION ("。，、；：？！ “ ” ‘ ’╗ ╚ ┐ └ （） … … — — — 《》〈〉 · .")
/* ASCII punctuation; the double quote must be escaped inside the literal. */
#define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")

/* Line-count constant (not referenced by the code in this listing). */
#define TEST_LINE_NUM 5000
/* Size in bytes of the line buffers handed to fgets(). */
#define MAXLINELEN 1024

using namespace std;

/* Forward declarations of the functions defined further down. */

/* Closed test: writes a copy of src with the fourth column removed to dst. */
int get_test(const char *src,const char *dst);
/* Open test: converts the raw text file src2 into feature rows in dst2. */
int get_test_open_tst(const char *src2,const char *dst2);
/* Cuts srcline at its first CR or LF. */
void chomp(char *srcline);
/* Splits one text line into tokens appended to dst_line. */
void from_seg_to_tag(const char *line,vector<string> &dst_line);
/* Builds one feature row in v_data for every token in v_dst. */
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data);

/* Entry point: converts the raw text file argv[1] into CRF open-test data
 * written to argv[2].
 *
 * Returns 0 on success and 1 on a usage error or when the conversion fails.
 * (get_test(src, dst) can be called here instead to build a closed test set
 * from training data.)
 */
int main(int argc,char **argv)
{
    if (argc != 3)
    {
        printf("usage: %s in_file out_file\n",argv[0]);
        return(1);
    }

    /* get_test_open_tst() reports failure with a negative value. */
    if (get_test_open_tst(argv[1],argv[2]) < 0)
        return(1);

    return(0);
}

/* Builds a closed-test set from training data.
 *
 * Every line read from src is copied unchanged to "<dst>.tst".  dst receives
 * the same lines reduced to their first three whitespace-separated fields;
 * lines that start with CR or LF are copied through as they are.  Lines with
 * fewer than three fields are reported and left out of dst.
 * At most MAXLINELEN lines are processed.
 *
 * Returns 1 on success and -1 if any of the three files cannot be opened.
 */
int get_test(const char *src,const char *dst)
{
    FILE *fin,*fout,*fout2;
    char line[MAXLINELEN];
    char first[64],second[64],third[64];
    int num;

    fin = fopen(src,"rb");
    if(NULL == fin)
    {
        printf("can't open %s\n",src);
        return(-1);
    }
    fout = fopen(dst,"wb");
    if(NULL == fout)
    {
        printf("can't open %s\n",dst);
        fclose(fin);
        return(-1);
    }

    /* Built in a string: a fixed 64-byte buffer overflows on long paths. */
    const string tst_name = string(dst) + ".tst";
    fout2 = fopen(tst_name.c_str(),"wb");
    if(NULL == fout2)
    {
        printf("can't open %s\n",tst_name.c_str());
        fclose(fin);
        fclose(fout);
        return(-1);
    }

    /* NOTE(review): the cap on the number of lines is MAXLINELEN (a buffer
     * size); TEST_LINE_NUM may have been intended - confirm. */
    num = 0;
    /* Loop on the result of fgets(): testing feof() first processes the last
     * line twice and reads an unset buffer when the file is empty. */
    while(num < MAXLINELEN && fgets(line,MAXLINELEN,fin) != NULL)
    {
        fprintf(fout2,"%s",line);
        if(line[0] == 0x0A || line[0] == 0x0D)
        {
            fprintf(fout,"%s",line);
        }
        else if(sscanf(line,"%63s %63s %63s",first,second,third) == 3)
        {
            /* Field widths keep each field inside its 64-byte buffer. */
            fprintf(fout,"%s %s %s\n",first,second,third);
        }
        else
        {
            printf("skip line with fewer than 3 columns: %s",line);
        }
        num++;
    }
    fclose(fin);
    fclose(fout);
    fclose(fout2);
    return(1);
}

/* Truncates srcline in place at its first CR (0x0D) or LF (0x0A).
 * A string without either character is left unchanged. */
void chomp(char *srcline)
{
    char *cursor = srcline;

    while(*cursor != 0 && *cursor != 0x0A && *cursor != 0x0D)
        ++cursor;
    *cursor = 0;
}

/* Returns true when str contains exactly three runs of one or more spaces,
 * which is the case for four space-separated columns. */
bool ValidColumn(const string &str)
{
    size_t gaps = 0;
    bool in_gap = false;

    for(size_t pos = 0;pos < str.size();++pos)
    {
        const bool is_space = (str[pos] == ' ');
        /* Count a run once, on its first space. */
        if(is_space && !in_gap)
            ++gaps;
        in_gap = is_space;
    }
    return(gaps == 3);
}

/*
 * Open-test conversion of a raw text file.
 *
 * Each line of in_file is split into tokens by from_seg_to_tag(), every token
 * is turned into a feature row by from_tag_to_data(), and the rows are written
 * to out_file one per line, with an empty line after each input line.  When
 * two adjacent rows are both " " (or both "\n") only the second is written.
 *
 * Returns 1 on success and -1 if a file cannot be opened.
 */
int get_test_open_tst(const char *in_file,const char *out_file)
{
    FILE *fin,*fout;
    char line[MAXLINELEN];

    fin = fopen(in_file,"rb");
    if(NULL == fin)
    {
        printf("can't open %s\n",in_file);
        return(-1);
    }
    fout = fopen(out_file,"wb");
    if(NULL == fout)
    {
        printf("can't open %s\n",out_file);
        fclose(fin);
        return(-1);
    }

    /* Loop on the result of fgets(): testing feof() first reads an unset
     * buffer on an empty file and converts the last line twice. */
    while(fgets(line,MAXLINELEN,fin) != NULL)
    {
        chomp(line);

        vector<string> v_dst,v_data;

        from_seg_to_tag(line,v_dst);
        from_tag_to_data(v_dst,v_data);

        for(size_t n = 0;n < v_data.size();n++)
        {
            const bool has_next = (n+1) < v_data.size();

            /* Drop a separator row that is repeated by the next row. */
            if(has_next && v_data[n] == v_data[n+1] &&
               (v_data[n] == " " || v_data[n] == "\n"))
                continue;

            fprintf(fout,"%s\n",v_data[n].c_str());
        }
        /* An empty line ends the sequence built from this input line. */
        fprintf(fout,"\n");
    }
    fclose(fin);
    fclose(fout);
    return(1);
}

/* Returns true when word_cur is a Latin letter: its bytes occur in the
 * full-width letter list E_ENGLISH, or its first byte is an ASCII letter.
 * An empty word is never a letter. */
bool IsEnglish(const string &word_cur)
{
    /* strstr() finds the empty string in every string, so reject it first. */
    if(word_cur.empty())
        return(false);
    if(strstr(E_ENGLISH,word_cur.c_str()))
        return(true);
    /* isalpha() is undefined for negative arguments, which is what the lead
     * byte of a multi-byte character is when char is signed. */
    if(isalpha(static_cast<unsigned char>(word_cur[0])))
        return(true);

    return(false);
}
/* Returns true when the bytes of word_cur occur in the Chinese numeral
 * list C_NUMBER. */
bool IsNumber(const string &word_cur)
{
    const char *hit = strstr(C_NUMBER,word_cur.c_str());
    return(hit != NULL);
}
/* Returns true when word_cur is an Arabic digit: its bytes occur in the
 * digit list E_NUMBER, or its first byte is an ASCII digit.
 * An empty word is never a digit. */
bool IsEnumber(const string &word_cur)
{
    /* strstr() finds the empty string in every string, so reject it first. */
    if(word_cur.empty())
        return(false);
    if(strstr(E_NUMBER,word_cur.c_str()))
        return(true);
    /* isdigit() is undefined for negative arguments, which is what the lead
     * byte of a multi-byte character is when char is signed. */
    if(isdigit(static_cast<unsigned char>(word_cur[0])))
        return(true);

    return(false);
}

/* Returns true when the bytes of word_cur occur in either punctuation list
 * (PUNCTION or E_PUNCTION). */
bool IsPunc(const string &word_cur)
{
    const char *needle = word_cur.c_str();

    return(strstr(PUNCTION,needle) != NULL || strstr(E_PUNCTION,needle) != NULL);
}

/* Splits a "word/TAG" item into its two parts.
 *
 * word receives everything before the last '/', tgt everything after it with
 * spaces removed.  The last slash is used because the tag is appended after
 * the word, so a word that itself contains '/' stays intact.
 * Without any '/', word receives all of src, tgt stays empty and an error
 * line is printed.
 */
void split_word_tgt(string &word,string &tgt,const string &src)
{
    word = "";
    tgt = "";

    const size_t slash = src.rfind('/');
    if(slash == string::npos)
    {
        word = src;
        printf("error in split %s\n",src.c_str());
        return;
    }

    word = src.substr(0,slash);
    for(size_t n = slash+1;n < src.length();n++)
    {
        if(src[n] != ' ')
            tgt += src[n];
    }
}
/* Attaches feature columns (punctuation, number class) to every token.
 *
 * For each entry of v_dst one row "token <punc> <class>" is appended to
 * v_data, where <punc> is y_punc/n_punc and <class> is C_num (Chinese
 * numeral), A_num (Arabic digit), E_num (Latin letter) or N_num (other).
 * A full-width space token is replaced by the single row " ".
 */
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data)
{
    const size_t count = v_dst.size();

    for(size_t idx = 0;idx < count;++idx)
    {
        const string token = v_dst[idx];

        if(token == "　")
        {
            v_data.push_back(" ");
            continue;
        }

        /* punctuation column */
        const char *punc = IsPunc(token) ? "y_punc" : "n_punc";

        /* character-class column; the first matching class wins */
        const char *kind;
        if(IsNumber(token))
            kind = "C_num";
        else if(IsEnumber(token))
            kind = "A_num";
        else if(IsEnglish(token))
            kind = "E_num";
        else
            kind = "N_num";

        string row = token;
        row += " ";
        row += punc;
        row += " ";
        row += kind;
        v_data.push_back(row);
    }
}

/* Extracts one token from the start of line and appends it to array.
 *
 * A run of ASCII bytes, ended by a space, a non-ASCII byte or the end of the
 * string, forms one token.  Otherwise the next two bytes are taken together
 * as one character (assumes a double-byte encoding such as GBK - TODO
 * confirm); a lone lead byte at the end of the string is taken on its own.
 *
 * Returns a pointer to the first byte after the token.  Nothing is appended
 * and line is returned when line starts with a space or is empty.
 */
const char *split_char_str(const char *line,vector<string> &array)
{
    const char *pline = line;

    /* Never step over the terminator. */
    if(*pline == 0)
        return(pline);

    /* Compare as unsigned so the test does not depend on char being signed. */
    if(static_cast<unsigned char>(*pline) < 0x80)
    {
        string tmp;
        while(*pline && static_cast<unsigned char>(*pline) < 0x80 && *pline != ' ')
        {
            tmp += *pline;
            pline++;
        }
        if(!tmp.empty())
            array.push_back(tmp);
    }
    else
    {
        string tmp(1,*pline);
        pline++;
        /* Take the trail byte only if the string really has one. */
        if(*pline)
        {
            tmp += *pline;
            pline++;
        }
        array.push_back(tmp);
    }
    return(pline);
}

/* Splits line into tokens and appends them to v_dst.
 * Spaces are skipped; everything else is cut by split_char_str(), whose first
 * returned piece becomes the next token. */
void from_seg_to_tag(const char *line,vector<string> &v_dst)
{
    for(const char *cursor = line;*cursor != 0;)
    {
        if(*cursor == ' ')
        {
            ++cursor;
            continue;
        }

        vector<string> pieces;
        cursor = split_char_str(cursor,pieces);
        if(!pieces.empty())
            v_dst.push_back(pieces[0]);
    }
}
//***********************train*******************************

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <iostream>
#include <string>
#include <vector>

/* Size in bytes of the line buffer handed to fgets(); parenthesised so the
 * macro expands as one value inside larger expressions. */
#define MAXLINELEN (1024*5)

/* Character classes searched with strstr() by the Is* predicates below. */

/* Chinese numerals, ordinary and financial forms. */
#define C_NUMBER ("一二三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟")
/* Full-width digits. */
#define E_NUMBER ("１２３４５６７８９０")
/* Full-width Latin letters. */
#define E_ENGLISH ("ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ")
/* Full-width (Chinese) punctuation. */
#define PUNCTION ("。，、；：？！ “ ” ‘ ’╗ ╚ ┐ └ （） … … — — — 《》〈〉 · .")
/* ASCII punctuation; the double quote must be escaped inside the literal. */
#define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")

using namespace std;

/* Forward declarations of the functions defined further down.
 * NOTE(review): trans_file() also calls ValidColumn(), which is neither
 * declared nor defined after the "train" marker - confirm where it comes from. */

/* Converts the segmented text file in_file into feature rows in out_file. */
int trans_file(const char *in_file,const char *out_file);
/* Cuts srcline at its first CR or LF and returns the remaining length. */
int chomp(char *srcline);
/* Splits one text line into "token/TAG " items appended to dst_line. */
void from_seg_to_tag(const char *line,vector<string> &dst_line);
/* Builds one feature row in v_data for every item in v_dst. */
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data);

/* Entry point: converts the segmented text file argv[1] into CRF training
 * data written to argv[2].
 *
 * Returns 0 on success and 1 on a usage error or when the conversion fails.
 */
int main(int argc,char **argv)
{
    if (argc != 3)
    {
        printf("usage:%s in_file out_file\n", argv[0]);
        return(1);
    }

    /* trans_file() reports failure with a negative value. */
    if (trans_file(argv[1],argv[2]) < 0)
        return(1);

    return(0);
}

/* Truncates srcline in place at its first CR or LF and returns the length of
 * what remains (the whole length when there is no line break). */
int chomp(char *srcline)
{
    const size_t keep = strcspn(srcline,"\r\n");

    srcline[keep] = 0;
    return(static_cast<int>(keep));
}

/* Converts a segmented text file into CRF training data.
 *
 * Each line of in_file holding at least two bytes is split into tagged items
 * by from_seg_to_tag() and expanded into feature rows by from_tag_to_data().
 * Rows accepted by ValidColumn() are written to out_file one per line, the
 * others are reported on stdout; an empty line follows every input line.
 *
 * Returns 1 on success and -1 if a file cannot be opened.
 */
int trans_file(const char *in_file,const char *out_file)
{
    FILE *fin,*fout;
    char line[MAXLINELEN];

    fin = fopen(in_file,"rb");
    if(NULL == fin)
    {
        printf("can't open %s\n",in_file);
        return(-1);
    }
    fout = fopen(out_file,"wb");
    if(NULL == fout)
    {
        printf("can't open %s\n",out_file);
        fclose(fin);
        return(-1);
    }

    /* Loop on the result of fgets(): testing feof() first reads an unset
     * buffer on an empty file and converts the last line twice. */
    while(fgets(line,MAXLINELEN,fin) != NULL)
    {
        if(chomp(line) < 2)
            continue;
        vector<string> v_dst,v_data;

        from_seg_to_tag(line,v_dst);
        from_tag_to_data(v_dst,v_data);

        for(size_t n = 0;n < v_data.size();n++)
        {
            if(ValidColumn(v_data[n]))
                fprintf(fout,"%s\n",v_data[n].c_str());
            else
                printf("column size error =%s\n",v_data[n].c_str());
        }
        /* An empty line ends the sequence built from this input line. */
        fprintf(fout,"\n");
    }
    fclose(fin);
    fclose(fout);
    return(1);
}
/* Returns true when word_cur is a Latin letter: its bytes occur in the
 * full-width letter list E_ENGLISH, or its first byte is an ASCII letter.
 * An empty word is never a letter. */
bool IsEnglish(const string &word_cur)
{
    /* strstr() finds the empty string in every string, so reject it first. */
    if(word_cur.empty())
        return(false);
    if(strstr(E_ENGLISH,word_cur.c_str()))
        return(true);
    /* isalpha() is undefined for negative arguments, which is what the lead
     * byte of a multi-byte character is when char is signed. */
    if(isalpha(static_cast<unsigned char>(word_cur[0])))
        return(true);

    return(false);
}
/* Returns true when the bytes of word_cur occur in the Chinese numeral
 * list C_NUMBER. */
bool IsNumber(const string &word_cur)
{
    return(strstr(C_NUMBER,word_cur.c_str()) != NULL);
}
/* Returns true when word_cur is an Arabic digit: its bytes occur in the
 * full-width digit list E_NUMBER, or its first byte is an ASCII digit.
 * An empty word is never a digit. */
bool IsEnumber(const string &word_cur)
{
    /* strstr() finds the empty string in every string, so reject it first. */
    if(word_cur.empty())
        return(false);
    if(strstr(E_NUMBER,word_cur.c_str()))
        return(true);
    /* isdigit() is undefined for negative arguments, which is what the lead
     * byte of a multi-byte character is when char is signed. */
    if(isdigit(static_cast<unsigned char>(word_cur[0])))
        return(true);

    return(false);
}

/* Returns true when the bytes of word_cur occur in either punctuation list
 * (PUNCTION first, then E_PUNCTION). */
bool IsPunc(const string &word_cur)
{
    const char *needle = word_cur.c_str();

    if(strstr(PUNCTION,needle) != NULL)
        return(true);
    return(strstr(E_PUNCTION,needle) != NULL);
}
/* Splits a "word/TAG " item into its two parts.
 *
 * word receives everything before the last '/', tgt everything after it with
 * spaces removed.  The last slash is used because the tag is appended after
 * the word, so a word that is or contains '/' keeps its text and its tag.
 * Without any '/', word receives all of src, tgt stays empty and an error
 * line is printed.
 */
void split_word_tgt(string &word,string &tgt,const string &src)
{
    word = "";
    tgt = "";

    const size_t slash = src.rfind('/');
    if(slash == string::npos)
    {
        word = src;
        printf("error in split %s\n",src.c_str());
        return;
    }

    word = src.substr(0,slash);
    for(size_t n = slash+1;n < src.length();n++)
    {
        if(src[n] != ' ')
            tgt += src[n];
    }
}

/* Attaches feature columns to every tagged item.
 *
 * Each entry of v_dst is split into word and tag by split_word_tgt(); the row
 * "word <punc> <class> tag" is appended to v_data, where <punc> is
 * y_punc/n_punc and <class> is C_num (Chinese numeral), A_num (Arabic digit),
 * E_num (Latin letter) or N_num (other).
 */
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data)
{
    const size_t count = v_dst.size();

    for(size_t idx = 0;idx < count;++idx)
    {
        string token,label;
        split_word_tgt(token,label,v_dst[idx]);

        /* punctuation column */
        const char *punc = IsPunc(token) ? "y_punc" : "n_punc";

        /* character-class column; the first matching class wins */
        const char *kind;
        if(IsNumber(token))
            kind = "C_num";
        else if(IsEnumber(token))
            kind = "A_num";
        else if(IsEnglish(token))
            kind = "E_num";
        else
            kind = "N_num";

        string row = token;
        row += " ";
        row += punc;
        row += " ";
        row += kind;
        row += " ";
        row += label;
        v_data.push_back(row);
    }
}

/* Cuts the space-delimited word at the start of line into units and appends
 * them to array.
 *
 * A run of ASCII bytes forms one unit; any other byte is taken together with
 * the byte after it as one character (assumes a double-byte encoding such as
 * GBK - TODO confirm).  A lone lead byte at the end of the string is taken on
 * its own.
 *
 * Returns a pointer to the space or terminator that ends the word.
 */
const char *split_char_str(const char *line,vector<string> &array)
{
    const char *pline = line;

    while(*pline && *pline != ' ')
    {
        /* Compare as unsigned so the test does not depend on char being signed. */
        if(static_cast<unsigned char>(*pline) < 0x80)
        {
            string tmp;
            while(*pline && static_cast<unsigned char>(*pline) < 0x80 && *pline != ' ')
            {
                tmp += *pline;
                pline++;
            }
            if(!tmp.empty())
                array.push_back(tmp);
        }
        else
        {
            string tmp(1,*pline);
            pline++;
            /* Take the trail byte only if the string really has one;
             * otherwise the scan would run past the terminator. */
            if(*pline)
            {
                tmp += *pline;
                pline++;
            }
            array.push_back(tmp);
        }
    }
    return(pline);
}

/* Turns a space-segmented line into position-tagged items appended to v_dst.
 *
 * Every word is cut into units by split_char_str() and each unit becomes
 * "unit/TAG ": a single unit gets S; otherwise the last unit gets E and the
 * others, in order, B, B2, B3 and then M for all that follow.
 */
void from_seg_to_tag(const char *line,vector<string> &v_dst)
{
    const char *cursor = line;

    while(*cursor)
    {
        if(*cursor == ' ')
        {
            cursor++;
            continue;
        }

        vector<string> units;
        cursor = split_char_str(cursor,units);

        const size_t count = units.size();
        for(size_t i = 0;i < count;i++)
        {
            const char *tag;

            if(count == 1)
                tag = "/S ";
            else if(i == count-1)
                tag = "/E ";
            else if(i == 0)
                tag = "/B ";
            else if(i == 1)
                tag = "/B2 ";
            else if(i == 2)
                tag = "/B3 ";
            else
                tag = "/M ";

            v_dst.push_back(units[i] + tag);
        }
    }
}