跨平台utf8与unicode的互转

来源：互联网发布：如何查网址域名编辑：程序博客网时间：2024/06/07 10:56

这几天帮同事看一个聊天敏感词过滤问题，需求是在ndk环境下，把用户输入的句子中不健康的词语替换成*，开始时需要先将用户输入转成wchar_t*，用了mbstowcs，在本地linux环境下测试没有问题，但是比较坑的是ndk安卓真机环境下不行，后来干脆自己写了utf8与unicode相互转换的一组函数，utf8sToUnicodes是将utf8字符串转成unicode数组，unicodesToUtf8s是将unicode数组转成utf8字符串。

// UTF-8 <-> Unicode code point conversion helpers that do not depend on the
// platform's locale support (mbstowcs/wcstombs).
//
// Decoding is lenient: continuation bytes are not validated and overlong
// forms are not rejected, but malformed input can never cause an infinite
// loop or a read past the string terminator.

// Code point substituted for bytes that cannot be decoded.
enum { UTF8_REPLACEMENT_CODEPOINT = 0xFFFD };

// Returns the number of bytes in the UTF-8 sequence introduced by the first
// byte of `utfs`: 1 for 0x00-0xBF (ASCII, and stray continuation bytes which
// are treated as single-byte units), 2 for 0xC0-0xDF, 3 for 0xE0-0xEF,
// 4 for 0xF0-0xF7. Returns 0 for 0xF8-0xFF, which can never start a sequence.
int getUtf8Len(const char* utfs) {
  // Go through unsigned char so bytes >= 0x80 are not sign-extended.
  unsigned int lead = (unsigned char)utfs[0];
  if (lead < 0xC0) return 1;
  if (lead < 0xE0) return 2;
  if (lead < 0xF0) return 3;
  if (lead < 0xF8) return 4;
  return 0;
}

// Decodes one UTF-8 sequence of `len` bytes (1..4) starting at `utf` into a
// code point. The caller must guarantee that `len` bytes are readable.
// Returns 0 for any other `len`.
int utf8ToUnicode(const char* utf, int len) {
  const unsigned char* b = (const unsigned char*)utf;
  unsigned int rst;
  switch (len) {
    case 1:
      rst = b[0];
      break;
    case 2:
      rst = ((b[0] & 0x1Fu) << 6) | (b[1] & 0x3Fu);
      break;
    case 3:
      rst = ((b[0] & 0x0Fu) << 12) | ((b[1] & 0x3Fu) << 6) | (b[2] & 0x3Fu);
      break;
    case 4:
      rst = ((b[0] & 0x07u) << 18) | ((b[1] & 0x3Fu) << 12) |
            ((b[2] & 0x3Fu) << 6) | (b[3] & 0x3Fu);
      break;
    default:
      rst = 0;
      break;
  }
  return (int)rst;
}

// Writes the UTF-8 encoding of `unicode` to `out` WITHOUT a terminator and
// returns the number of bytes written (1..4). Returns 0 and writes nothing
// when `unicode` is above U+10FFFF.
static int utf8EncodeRaw(unsigned int unicode, char* out) {
  if (unicode < 0x80) {
    out[0] = (char)unicode;
    return 1;
  }
  if (unicode < 0x0800) {
    out[0] = (char)(0xC0 | (unicode >> 6));
    out[1] = (char)(0x80 | (unicode & 0x3F));
    return 2;
  }
  if (unicode < 0x010000) {
    out[0] = (char)(0xE0 | (unicode >> 12));
    out[1] = (char)(0x80 | ((unicode >> 6) & 0x3F));
    out[2] = (char)(0x80 | (unicode & 0x3F));
    return 3;
  }
  if (unicode <= 0x10FFFF) {
    out[0] = (char)(0xF0 | (unicode >> 18));
    out[1] = (char)(0x80 | ((unicode >> 12) & 0x3F));
    out[2] = (char)(0x80 | ((unicode >> 6) & 0x3F));
    out[3] = (char)(0x80 | (unicode & 0x3F));
    return 4;
  }
  return 0;
}

// Encodes one code point below U+10000 as a NUL-terminated UTF-8 string in
// `utf`, which must have room for at least 4 bytes. For code points at or
// above U+10000 an empty string is written, so a 4-byte caller buffer is
// never overrun and the output is always initialised.
void unicodeToUtf(const unsigned int unicode, char* utf) {
  int n = 0;
  if (unicode < 0x010000) {
    n = utf8EncodeRaw(unicode, utf);
  }
  utf[n] = '\0';
}

// Converts the NUL-terminated UTF-8 string `utfs` into a 0-terminated array
// of code points in `unicodes` and returns the number of code points stored
// (terminator excluded). `unicodes` must have room for strlen(utfs) + 1
// elements. Invalid lead bytes, sequences cut off by the end of the string,
// and sequences that decode to 0 (which would collide with the terminator)
// are each replaced by U+FFFD.
int utf8sToUnicodes(const char* utfs, unsigned int * unicodes) {
  size_t len = strlen(utfs);
  size_t mindex = 0;
  int windex = 0;
  while (mindex < len) {
    int bytes = getUtf8Len(utfs + mindex);
    unsigned int codepoint;
    if (bytes == 0 || (size_t)bytes > len - mindex) {
      // Always consume one byte so the loop makes progress and never reads
      // beyond the terminator.
      codepoint = UTF8_REPLACEMENT_CODEPOINT;
      bytes = 1;
    } else {
      codepoint = (unsigned int)utf8ToUnicode(utfs + mindex, bytes);
      if (codepoint == 0) {
        codepoint = UTF8_REPLACEMENT_CODEPOINT;
      }
    }
    unicodes[windex] = codepoint;
    windex++;
    mindex += (size_t)bytes;
  }
  unicodes[windex] = 0;
  return windex;
}

// Converts the 0-terminated code point array `unicodes` into a
// NUL-terminated UTF-8 string in `utfs`. `utfs` must have room for up to
// 4 bytes per code point plus the terminator. Code points above U+10FFFF
// are skipped.
void unicodesToUtf8s(const unsigned int* unicodes, char* utfs) {
  // Append through a moving pointer; re-formatting the buffer into itself
  // with sprintf would be undefined behaviour and quadratic.
  char* out = utfs;
  for (size_t index = 0; unicodes[index] != 0; index++) {
    out += utf8EncodeRaw(unicodes[index], out);
  }
  *out = '\0';
}

0 0