迅雷笔试题的Utf-8编码检测的实现

来源:互联网 发布:centos mv 移动文件夹 编辑:程序博客网 时间:2024/06/06 02:26



程序中用移位运算来判断,之后觉得没必要,编码完全可以理解为如下表格:

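| Unicode 范围 (十六进制) | UTF-8 字节序列 (二进制) | 各字节取值范围 (十进制) |
| --- | --- | --- |
| 0000-007F | 0XXXXXXX | 0~127 |
| 0080-07FF | 110XXXXX 10XXXXXX | 192~223, 128~191 |
| 0800-FFFF | 1110XXXX 10XXXXXX 10XXXXXX | 224~239, 128~191, 128~191 |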
也就是说,直接将 unsigned char 转换为 unsigned int,然后判断数值落在哪个区间即可。不过下面的程序仍然沿用移位运算:


// The original program targets Visual Studio: "stdafx.h" is the project's
// header there (presumably pulling in <tchar.h> for _tmain/_TCHAR - confirm).
// On other toolchains that header does not exist, so a plain-char stand-in
// for _TCHAR is provided instead.
// NOTE(review): if this file is built with MSVC /Yu precompiled headers, the
// stdafx.h include has to be unconditional and first - drop the guard there.
#ifdef _MSC_VER
#include "stdafx.h"
#else
typedef char _TCHAR;
#endif

#include <iostream>

/// @brief Counts the characters in a buffer encoded with 1- to 3-byte UTF-8
///        sequences (code points U+0000..U+FFFF).
///
/// Recognised sequence shapes:
///   - 0xxxxxxx                       (1 byte)
///   - 110xxxxx 10xxxxxx              (2 bytes)
///   - 1110xxxx 10xxxxxx 10xxxxxx     (3 bytes)
///
/// Only the bit patterns above are checked. Overlong forms (e.g. C0 80) and
/// surrogate code points are NOT rejected, matching the simplified table the
/// routine was written against.
///
/// @param data_ptr Start of the byte buffer; may be null.
/// @param data_len Number of bytes available at data_ptr.
/// @return Number of characters found; 0 when data_ptr is null or data_len
///         is 0; -1 when a byte does not fit any recognised shape or a
///         multi-byte sequence is cut off by the end of the buffer.
int calc_utf8_count(unsigned char* data_ptr, unsigned int data_len)
{
    if (!data_ptr || data_len == 0) {
        return 0;
    }

    int char_count = 0;
    unsigned int pos = 0;  // unsigned, so the comparison with data_len is not mixed-sign

    while (pos < data_len) {
        const unsigned char lead = data_ptr[pos];

        // Classify the lead byte by its high bits to get the sequence length.
        unsigned int seq_len = 0;
        if ((lead >> 7) == 0) {
            seq_len = 1;  // 0xxxxxxx
        } else if ((lead >> 5) == 6) {
            seq_len = 2;  // 110xxxxx
        } else if ((lead >> 4) == 14) {
            seq_len = 3;  // 1110xxxx
        } else {
            // A stray continuation byte (10xxxxxx) or a lead byte of a
            // 4+ byte sequence (1111xxxx): not supported here.
            return -1;
        }

        // Never read past the buffer: a sequence truncated by the end of the
        // input is invalid. pos < data_len, so the subtraction cannot wrap.
        if (seq_len > data_len - pos) {
            return -1;
        }

        // Every byte after the lead must be a continuation byte (10xxxxxx).
        for (unsigned int k = 1; k < seq_len; ++k) {
            if ((data_ptr[pos + k] >> 6) != 2) {
                return -1;
            }
        }

        pos += seq_len;
        ++char_count;
    }

    return char_count;
}

// Demo entry point: runs the counter on a small sample and prints the result.
int _tmain(int argc, _TCHAR* argv[])
{
    unsigned char data[] = {0x79, 0xC0, 0x80, 0xE0, 0x80, 0x80, 0x78};
    std::cout << calc_utf8_count(data, static_cast<unsigned int>(sizeof(data))) << std::endl;
    return 0;
}



原创粉丝点击