过滤网页中的全角半角字母的程序

来源:互联网 发布:网络监控客户端通用版 编辑:程序博客网 时间:2024/06/06 14:04
// 把双字节汉字转化为十六进制字符串int chinese_to_hex(char *word ,char *hex){ char temp_char[17]; memset( temp_char, 0, 17 );    sprintf( temp_char, "%X%X", word[ 0 ], word[ 1 ] );   sprintf( hex, "%c%c%c%c", temp_char[6], temp_char[7], temp_char[14], temp_char[15] );      return 0;}// 分割字符为字单元int get_single_word( char *sz_text, char**word_list , long int &word_count ){char temp[3];int dcount=0;char hex[4];  memset( hex, 0, 4 );memset(temp ,0 ,sizeof( temp ) );                                                                         long int i, nLen = strlen( sz_text );   word_count = 0;                                                                                 for(i = 0; i < nLen; i++)                                                                                               {                                                                                                                    if((int)sz_text[i] >= 0 && (int)sz_text[i] <= 127) //半角[ 英文字母 ]    {       if( ( (int)sz_text[i] >= 48 && (int)sz_text[i]  <= 57  )  ||        ( (int)sz_text[i] >=65  && (int)sz_text[i]  <= 90  )  ||        ( (int)sz_text[i] >= 97 && (int)sz_text[i]  <= 122 )  ||            (int)sz_text[i] ==46  || (int)sz_text[i]  == 32  )      {        temp[0] = sz_text[i];        dcount = 0;        if((int)sz_text[i] ==46  || (int)sz_text[i]  == 32)        {          dcount++;        }     }else     {      if( dcount == 0 )      {        temp[0] = ',';        dcount++;      }     }    }                                                                                           else //全角字符[ 中文是2个字节 ]     {                                 temp[ 0 ] = sz_text[i]; temp[ 1 ] = sz_text[ i + 1 ];               chinese_to_hex( temp, hex );            if( ( strtol( hex, NULL, 16 ) >= 0XA1A0 && strtol( hex, NULL, 16 ) <=0XA3AF ) ||         ( strtol( hex, NULL, 16 ) >= 0XA3BA && strtol( hex, NULL, 16 ) <=0XA3C0 ) ||        ( strtol( hex, NULL, 16 ) >= 0XA3DB && strtol( hex, NULL, 16 ) <=0XA3E0 ) ||        ( strtol( hex, NULL, 16 ) >= 0XA3FB && strtol( hex, NULL, 16 ) <=0XA3FF ) ||        ( strtol( hex, NULL, 16 ) >= 0XA4A0 && strtol( hex, NULL, 16 ) <=0XA996 )  )      {       memset( temp, 0, 3);       if( dcount == 0 )      {         temp[ 0 ]=',';         dcount++;       }      }else      {      dcount=0;      }      i++;      }    sprintf( word_list[ word_count ], "%s", temp );    word_count++;    memset( temp , 0, sizeof( temp ) );      memset( hex, 0, 4 );                                                                                                       }  return 0;}// 文本杂志过滤int filter_chars_text( char *in_put_text , char *out_put_text ){ char *word_list[ BLOCKS_WORD_LEN ];   long int word_count = 0;    char sin_temp[ BLOCKS_WORD_LEN ];   int sin_count=0;      memset( out_put_text, 0, sizeof( out_put_text ) );   memset( sin_temp, 0, sizeof( sin_temp ) );      for(int i=0; i<BLOCKS_WORD_LEN; i++)   {      word_list[i] =new char[3];      memset( word_list[i], 0, 3 );   }      get_single_word( in_put_text, word_list, word_count );      for( int i = 0; i < BLOCKS_WORD_LEN; i++ )   {     if( i < word_count )     {     if( (int)(word_list[i][0]) >= 0 && (int)(word_list[i][0]) <= 127)     {         sin_temp[sin_count] =  word_list[ i ][0];         sin_count++;     }else     {       if( strlen( sin_temp ) >0 && sin_temp!=NULL )       {        if(sin_count < 9)        {           strcat( out_put_text, sin_temp );           strcat( out_put_text, word_list[ i ] );                   }else        {          strcat( out_put_text, word_list[ i ] );        }     }else     {      strcat( out_put_text, word_list[ i ] );     }          sin_count=0;     memset( sin_temp, 0, BLOCKS_WORD_LEN );     }     }      delete word_list[ i ];   }   return 0;}

原创粉丝点击