unicode与UTF8

来源：互联网发布：软件项目建议书模板编辑：程序博客网时间：2024/06/05 06:51

Unicode 是一个字符集。

UTF-8 是对 Unicode 字符进行编码的一种编码方式。

UTF-8以字节为单位对Unicode进行编码。从Unicode到UTF-8的编码方式如下：

| Unicode 编码（十六进制） | UTF-8 字节流（二进制） |
| --- | --- |
| 000000 - 00007F | 0xxxxxxx |
| 000080 - 0007FF | 110xxxxx 10xxxxxx |
| 000800 - 00FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
| 010000 - 10FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |

以下是对网上程序修改后的正确程序，原程序有错误。

/* Standard headers: calloc (stdlib.h), strlen (string.h), INT_MAX (limits.h). */
#include <limits.h>
#include <stdlib.h>
#include <string.h>

/*
 * Fixed-width integer names. Builds that define WIN32 map them onto the
 * compiler's __intN types; every other build takes them from <stdint.h>.
 */
#ifdef WIN32
#define uint8_t  unsigned __int8
#define uint16_t unsigned __int16
#define uint32_t unsigned __int32
#define uint64_t unsigned __int64
#define int8_t  __int8
#define int16_t __int16
#define int32_t __int32
#else
#include <stdint.h>
#endif

/*
 * Decodes the UTF-8 sequence that starts at p into one 16-bit code unit.
 *
 * Only 1-, 2- and 3-byte sequences (U+0000..U+FFFF) are recognised, and every
 * trailing byte must have the form 10xxxxxx. p must point into a
 * NUL-terminated buffer: trailing bytes are inspected left to right and the
 * scan stops at the first one that is not a continuation byte, so the
 * terminator is never stepped over.
 *
 * Returns the number of bytes consumed (1..3) with *unit set, or 0 when the
 * byte at p does not start a decodable sequence (stray continuation byte,
 * 4-byte lead byte, or a truncated/malformed sequence); *unit is then left
 * untouched and the caller is expected to skip one byte.
 */
static int utf8_next_bmp(const uint8_t *p, uint16_t *unit)
{
    uint8_t lead = p[0];

    if (lead <= 0x7f) {
        *unit = lead;
        return 1;
    }
    if ((lead & 0xe0) == 0xc0 && (p[1] & 0xc0) == 0x80) {
        *unit = (uint16_t)(((lead & 0x1f) << 6) | (p[1] & 0x3f));
        return 2;
    }
    if ((lead & 0xf0) == 0xe0 && (p[1] & 0xc0) == 0x80
        && (p[2] & 0xc0) == 0x80) {
        *unit = (uint16_t)(((lead & 0x0f) << 12)
                           | ((p[1] & 0x3f) << 6)
                           | (p[2] & 0x3f));
        return 3;
    }
    return 0;
}

/*
 * Encodes an array of 16-bit code units as a NUL-terminated UTF-8 string.
 *
 * in      source code units (native byte order)
 * insize  size of `in` in BYTES; a trailing odd byte is ignored
 * out     receives a newly allocated buffer of (insize / 2) * 3 + 1 bytes,
 *         zero-filled past the encoded data; the caller must free() it
 *
 * Each unit is encoded on its own as 1, 2 or 3 bytes. Surrogate units are not
 * combined into 4-byte sequences, and a 0x0000 unit is written as a single
 * zero byte, which ends the string early for C-string consumers.
 *
 * Returns 0 on success, -1 on a NULL argument, negative size or allocation
 * failure (*out is not modified in that case).
 */
int unicode_to_utf8(uint16_t *in, int insize, uint8_t **out)
{
    size_t count;
    size_t i;
    uint8_t *result;
    uint8_t *dst;

    if (in == NULL || out == NULL || insize < 0) {
        return -1;
    }

    count = (size_t)insize / sizeof(uint16_t);

    /* Worst case is three bytes per unit, plus the terminator. */
    result = calloc(count * 3 + 1, 1);
    if (result == NULL) {
        return -1;
    }

    dst = result;
    for (i = 0; i < count; i++) {
        uint16_t unit = in[i];

        if (unit <= 0x007f) {
            *dst++ = (uint8_t)unit;
        } else if (unit <= 0x07ff) {
            *dst++ = (uint8_t)(0xc0 | (unit >> 6));
            *dst++ = (uint8_t)(0x80 | (unit & 0x3f));
        } else {
            *dst++ = (uint8_t)(0xe0 | (unit >> 12));
            *dst++ = (uint8_t)(0x80 | ((unit >> 6) & 0x3f));
            *dst++ = (uint8_t)(0x80 | (unit & 0x3f));
        }
    }
    *dst = '\0';

    *out = result;
    return 0;
}

/*
 * Decodes a NUL-terminated UTF-8 string into 16-bit code units that are
 * stored byte by byte in LITTLE-ENDIAN order, whatever the host byte order.
 *
 * in       NUL-terminated UTF-8 input
 * out      receives a newly allocated, zero-terminated buffer sized from the
 *          input length; the caller must free() it
 * outsize  receives the number of bytes written, including the 2-byte
 *          terminator
 *
 * Bytes that do not start a decodable 1- to 3-byte sequence (4-byte
 * sequences, stray continuation bytes, truncated sequences) are skipped.
 *
 * Returns 0 on success, -1 on a NULL argument, an input too long for the
 * size to fit in an int, or allocation failure.
 */
int utf8_to_unicode(uint8_t *in, uint16_t **out, int *outsize)
{
    const uint8_t *p;
    uint16_t *result;
    uint8_t *dst;
    uint16_t unit = 0;
    size_t len;
    int consumed;
    int resultsize = 0;

    if (in == NULL || out == NULL || outsize == NULL) {
        return -1;
    }

    len = strlen((const char *)in);
    if (len > (size_t)(INT_MAX / 2 - 1)) {
        return -1;
    }

    /* Every code unit consumes at least one input byte, so len + 1 units
     * always suffice; calloc also provides the terminator. */
    result = calloc(len + 1, sizeof *result);
    if (result == NULL) {
        return -1;
    }

    dst = (uint8_t *)result;
    p = in;
    while (*p != 0) {
        consumed = utf8_next_bmp(p, &unit);
        if (consumed == 0) {
            p++;
            continue;
        }
        *dst++ = (uint8_t)(unit & 0xff); /* low byte first */
        *dst++ = (uint8_t)(unit >> 8);
        resultsize += 2;
        p += consumed;
    }
    resultsize += 2; /* account for the zero terminator */

    *out = result;
    *outsize = resultsize;
    return 0;
}

/*
 * Decodes a NUL-terminated UTF-8 string into 16-bit code units stored in the
 * HOST's byte order: each value is assembled arithmetically and written as a
 * whole uint16_t, so the result is correct on both big- and little-endian
 * machines.
 *
 * in       NUL-terminated UTF-8 input
 * out      receives a newly allocated, zero-terminated buffer sized from the
 *          input length; the caller must free() it
 * outsize  receives the number of bytes written, including the 2-byte
 *          terminator
 *
 * Bytes that do not start a decodable 1- to 3-byte sequence (4-byte
 * sequences, stray continuation bytes, truncated sequences) are skipped.
 *
 * Returns 0 on success, -1 on a NULL argument, an input too long for the
 * size to fit in an int, or allocation failure.
 */
int utf8_to_unicode_biglittle(uint8_t *in, uint16_t **out, int *outsize)
{
    const uint8_t *p;
    uint16_t *result;
    uint16_t *dst;
    uint16_t unit = 0;
    size_t len;
    int consumed;
    int resultsize = 0;

    if (in == NULL || out == NULL || outsize == NULL) {
        return -1;
    }

    len = strlen((const char *)in);
    if (len > (size_t)(INT_MAX / 2 - 1)) {
        return -1;
    }

    /* Every code unit consumes at least one input byte, so len + 1 units
     * always suffice; calloc also provides the terminator. */
    result = calloc(len + 1, sizeof *result);
    if (result == NULL) {
        return -1;
    }

    dst = result;
    p = in;
    while (*p != 0) {
        consumed = utf8_next_bmp(p, &unit);
        if (consumed == 0) {
            p++;
            continue;
        }
        *dst++ = unit;
        resultsize += 2;
        p += consumed;
    }
    resultsize += 2; /* account for the zero terminator */

    *out = result;
    *outsize = resultsize;
    return 0;
}

0 0