unicode与UTF8
来源:互联网 发布:软件项目建议书模板 编辑:程序博客网 时间:2024/06/05 06:51
unicode是个字符集
UTF8是对unicode进行编码的一种编码方法
UTF-8以字节为单位对Unicode进行编码。从Unicode到UTF-8的编码方式如下:
| Unicode编码(十六进制) | UTF-8 字节流(二进制) |
| --- | --- |
| 000000 - 00007F | 0xxxxxxx |
| 000080 - 0007FF | 110xxxxx 10xxxxxx |
| 000800 - 00FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
| 010000 - 10FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
以下是对网上程序修改后的正确程序,原程序有错误。
#include <limits.h>
#include <stdlib.h>
#include <string.h>

/*
 * Fixed-width integer names.  WIN32 builds keep the compiler-specific
 * __intN fallback; every other platform takes the types from <stdint.h>.
 */
#ifdef WIN32
#define uint8_t unsigned __int8
#define uint16_t unsigned __int16
#define uint32_t unsigned __int32
#define uint64_t unsigned __int64
#define int8_t __int8
#define int16_t __int16
#define int32_t __int32
#else
#include <stdint.h>
#endif

/**
 * Encode an array of 16-bit code units as a NUL-terminated UTF-8 string.
 *
 * Every input unit is treated as one code point in U+0000..U+FFFF and is
 * encoded independently as 1, 2 or 3 bytes; surrogate pairs are NOT
 * combined into 4-byte sequences.  Encoding does not stop at a zero unit:
 * exactly insize / sizeof(uint16_t) units are converted.
 *
 * @param in      code units to encode (may be NULL only when no units are given)
 * @param insize  size of `in` in BYTES, not in units
 * @param out     receives a malloc-family buffer the caller must free()
 * @return 0 on success, -1 on invalid arguments or allocation failure
 *         (in the allocation-failure case *out is set to NULL)
 */
int unicode_to_utf8(uint16_t *in, int insize, uint8_t **out)
{
    size_t count;
    size_t i;
    uint8_t *result;
    uint8_t *dst;

    if (out == NULL || insize < 0) {
        return -1;
    }
    count = (size_t)insize / sizeof(uint16_t);
    if (in == NULL && count > 0) {
        return -1;
    }

    /* Worst case is 3 bytes per unit, plus the terminating NUL. */
    result = calloc(count * 3 + 1, sizeof *result);
    if (result == NULL) {
        *out = NULL;
        return -1;
    }

    dst = result;
    for (i = 0; i < count; i++) {
        uint16_t cp = in[i];

        if (cp <= 0x007f) {
            /* 0xxxxxxx */
            *dst++ = (uint8_t)cp;
        } else if (cp <= 0x07ff) {
            /* 110xxxxx 10xxxxxx */
            *dst++ = (uint8_t)(0xc0 | (cp >> 6));
            *dst++ = (uint8_t)(0x80 | (cp & 0x3f));
        } else {
            /* 1110xxxx 10xxxxxx 10xxxxxx */
            *dst++ = (uint8_t)(0xe0 | (cp >> 12));
            *dst++ = (uint8_t)(0x80 | ((cp >> 6) & 0x3f));
            *dst++ = (uint8_t)(0x80 | (cp & 0x3f));
        }
    }
    *dst = '\0';

    *out = result;
    return 0;
}

/*
 * Decode one UTF-8 sequence of 1-3 bytes starting at s (s[0] != 0).
 *
 * Returns the number of bytes consumed and stores the code point in *cp,
 * or returns 0 when s[0] does not start a sequence this decoder handles
 * (stray continuation byte, 4-byte lead byte) or when a continuation byte
 * is missing.  Continuation bytes are checked left to right, so a NUL that
 * truncates a sequence is never stepped over.  Overlong forms are not
 * rejected.
 */
static int utf8_decode_bmp(const uint8_t *s, uint16_t *cp)
{
    uint8_t lead = s[0];

    if (lead <= 0x7f) {
        *cp = lead;
        return 1;
    }
    if ((lead & 0xe0) == 0xc0) {
        if ((s[1] & 0xc0) != 0x80) {
            return 0;
        }
        *cp = (uint16_t)(((lead & 0x1f) << 6) | (s[1] & 0x3f));
        return 2;
    }
    if ((lead & 0xf0) == 0xe0) {
        if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80) {
            return 0;
        }
        *cp = (uint16_t)(((lead & 0x0f) << 12) | ((s[1] & 0x3f) << 6) |
                         (s[2] & 0x3f));
        return 3;
    }
    return 0;
}

/*
 * Allocate a zero-filled uint16_t buffer large enough to hold the decoded
 * form of the NUL-terminated UTF-8 string `in` plus a terminating zero unit.
 * Each input byte yields at most one output unit, so strlen(in) + 1 units
 * always suffice.  Returns NULL on allocation failure or when the byte size
 * would not fit in an int.
 */
static uint16_t *utf8_alloc_units(const uint8_t *in)
{
    size_t len = strlen((const char *)in);

    if (len > (size_t)(INT_MAX / 2 - 1)) {
        return NULL;
    }
    return calloc(len + 1, sizeof(uint16_t));
}

/**
 * Decode a NUL-terminated UTF-8 string into 16-bit units stored in
 * LITTLE-ENDIAN byte order, regardless of the host byte order.
 *
 * Only 1-, 2- and 3-byte sequences are decoded; bytes that do not start
 * such a sequence, and lead bytes whose continuation bytes are missing,
 * are skipped one byte at a time.
 *
 * @param in       NUL-terminated UTF-8 input
 * @param out      receives a malloc-family buffer the caller must free();
 *                 the data is followed by a zero unit
 * @param outsize  receives the number of BYTES written, including the
 *                 2-byte terminator
 * @return 0 on success, -1 on invalid arguments or allocation failure
 */
int utf8_to_unicode(uint8_t *in, uint16_t **out, int *outsize)
{
    const uint8_t *p;
    uint16_t *result;
    uint8_t *bytes;
    int size = 0;

    if (in == NULL || out == NULL || outsize == NULL) {
        return -1;
    }
    result = utf8_alloc_units(in);
    if (result == NULL) {
        *out = NULL;
        *outsize = 0;
        return -1;
    }

    bytes = (uint8_t *)result;
    p = in;
    while (*p != 0) {
        uint16_t cp;
        int used = utf8_decode_bmp(p, &cp);

        if (used == 0) {
            p++;
            continue;
        }
        /* Low byte first: the output is little-endian by contract. */
        *bytes++ = (uint8_t)(cp & 0xff);
        *bytes++ = (uint8_t)(cp >> 8);
        size += 2;
        p += used;
    }

    /* The buffer is zero-filled, so the terminator is already in place. */
    size += 2;

    *out = result;
    *outsize = size;
    return 0;
}

/**
 * Decode a NUL-terminated UTF-8 string into 16-bit units stored in HOST
 * byte order.  Each code point is assembled arithmetically and stored as a
 * whole uint16_t, so the function works on big- and little-endian machines.
 *
 * Only 1-, 2- and 3-byte sequences are decoded; bytes that do not start
 * such a sequence, and lead bytes whose continuation bytes are missing,
 * are skipped one byte at a time.
 *
 * @param in       NUL-terminated UTF-8 input
 * @param out      receives a malloc-family buffer the caller must free();
 *                 the data is followed by a zero unit
 * @param outsize  receives the number of BYTES written, including the
 *                 2-byte terminator
 * @return 0 on success, -1 on invalid arguments or allocation failure
 */
int utf8_to_unicode_biglittle(uint8_t *in, uint16_t **out, int *outsize)
{
    const uint8_t *p;
    uint16_t *result;
    uint16_t *dst;
    int size = 0;

    if (in == NULL || out == NULL || outsize == NULL) {
        return -1;
    }
    result = utf8_alloc_units(in);
    if (result == NULL) {
        *out = NULL;
        *outsize = 0;
        return -1;
    }

    dst = result;
    p = in;
    while (*p != 0) {
        uint16_t cp;
        int used = utf8_decode_bmp(p, &cp);

        if (used == 0) {
            p++;
            continue;
        }
        *dst++ = cp;
        size += 2;
        p += used;
    }

    /* The buffer is zero-filled, so the terminator is already in place. */
    size += 2;

    *out = result;
    *outsize = size;
    return 0;
}
0 0
- 编码 unicode与utf8
- utf8与unicode转换
- utf8与unicode转换
- UTF8与Unicode互转
- unicode与UTF8
- utf8与unicode转换
- unicode编码与utf8编码
- erlang utf8 与unicode关系
- Unicode与UTF8相互转化
- PHP unicode解码 和 utf8与unicode
- 学习笔记:Unicode与UTF8互转
- 转贴 utf8 与 unicode的区别
- Unicode与UTF8互转源代码
- UNICODE与 UTF8的转换详解
- UNICODE与UTF8,UTF16的含义
- UNICODE与UTF8,UTF16的含义
- Qt入门之utf8与unicode转换
- unicode utf8 与 ascii的关系
- 刚子扯谈:广告向东 客户向西
- ExpandableListView实例
- bash多行注释
- RMQ的ST算法学习小记 Poj 3264 Balanced Lineup
- 你的SQL2000有一天突然启动不了了,总找不到原因
- unicode与UTF8
- 浏览器的工作原理:新式网络浏览器幕后揭秘{转}
- 利用reverse_iterator删除string末尾元素
- stat函数和文件类型
- eclipse使用技巧
- 项目实施与管理的几点建议
- 串匹配算法KMP详解
- 动态规划算法
- 2013的收获