C++ 实现unicode到utf-8的转码
来源:互联网 发布:淘宝热搜词在哪里看 编辑:程序博客网 时间:2024/04/29 05:29
思路:
先找出字符串中的 Unicode 转义部分(形如 \uXXXX),然后将该部分转换为 UTF-8 格式的字符,最后把字符串里所有的 Unicode 转义都替换为对应的 UTF-8 字符即可。
废话不多说,直接上代码:
头文件:
/*
 * charsetEncode.h
 *
 *  Created on: Jul 25, 2016
 *      Author: root
 */
#ifndef COMMONSERVER_INCLUDE_CHARSETENCODE_H_
#define COMMONSERVER_INCLUDE_CHARSETENCODE_H_

#include <iostream>
#include <algorithm>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>

// NOTE(review): a using-directive in a header leaks into every includer.
// It is kept because existing code that includes this header uses
// unqualified names such as string and cout.
using namespace std;

/*
 * Converts textual "\uXXXX" escape sequences inside a string into UTF-8 bytes.
 */
class CcharsetEncode
{
public:
    /*
     * Rewrites, in place, the "\uXXXX" escape sequences (a backslash, the
     * letter 'u' and four hexadecimal digits) found in source as UTF-8 bytes.
     * Always returns 0.
     */
    int unicode_to_utf8(string &source);

    /*
     * Replaces the first occurrence of strSrc in strContent with strDest.
     * The match is case-sensitive; later occurrences are left untouched and
     * strContent is unchanged when strSrc does not occur.
     */
    void ReplaceStr(string &strContent, const char *strSrc, const char *strDest);

private:
    /*
     * Encodes the single code point unic as UTF-8 into pOutput.
     * pOutput must be non-null and outSize at least 6 (both checked with
     * assert). Returns the number of bytes written (1 to 6), or 0 when unic
     * is larger than 0x7FFFFFFF.
     */
    int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize);

    /*
     * Returns 1 when src is exactly six characters of the form "\uXXXX"
     * with X a hexadecimal digit (either case), otherwise 0.
     */
    int isUnicode(const string &src);

    /*
     * Parses hexadecimal digits from str and returns their value;
     * characters that are not hexadecimal digits are skipped,
     * e.g. "0x1a3f" -> 0x1a3f.
     */
    unsigned int xstrtoshortint(const char *str);
};

#endif /* COMMONSERVER_INCLUDE_CHARSETENCODE_H_ */
源文件:
/* * charsetEncode.cpp * * Created on: Jul 25, 2016 * Author: root */#include "charsetEncode.h"int CcharsetEncode::unicode_to_utf8(string &source){int sourcesize = source.size();string src;unsigned char pout[8];for(int index = 0; index < sourcesize - 6;){memset(pout, 0, 8);src = source.substr(index, 6);if(isUnicode(src) == 1){string hexsrc = source.substr(index + 2, 4);int num = enc_unicode_to_utf8_one(xstrtoshortint(hexsrc.c_str()), pout, 8);ReplaceStr(source, src.c_str(), (char *)pout);index += 3;sourcesize = source.size();}else{index++;}}return 0;}int CcharsetEncode::enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize){ assert(pOutput != NULL); assert(outSize >= 6); if ( unic <= 0x0000007F ) { // * U-00000000 - U-0000007F: 0xxxxxxx *pOutput = (unic & 0x7F); return 1; } else if ( unic >= 0x00000080 && unic <= 0x000007FF ) { // * U-00000080 - U-000007FF: 110xxxxx 10xxxxxx *(pOutput+1) = (unic & 0x3F) | 0x80; *pOutput = ((unic >> 6) & 0x1F) | 0xC0; return 2; } else if ( unic >= 0x00000800 && unic <= 0x0000FFFF ) { // * U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx *(pOutput+2) = (unic & 0x3F) | 0x80; *(pOutput+1) = ((unic >> 6) & 0x3F) | 0x80; *pOutput = ((unic >> 12) & 0x0F) | 0xE0; return 3; } else if ( unic >= 0x00010000 && unic <= 0x001FFFFF ) { // * U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx *(pOutput+3) = (unic & 0x3F) | 0x80; *(pOutput+2) = ((unic >> 6) & 0x3F) | 0x80; *(pOutput+1) = ((unic >> 12) & 0x3F) | 0x80; *pOutput = ((unic >> 18) & 0x07) | 0xF0; return 4; } else if ( unic >= 0x00200000 && unic <= 0x03FFFFFF ) { // * U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx *(pOutput+4) = (unic & 0x3F) | 0x80; *(pOutput+3) = ((unic >> 6) & 0x3F) | 0x80; *(pOutput+2) = ((unic >> 12) & 0x3F) | 0x80; *(pOutput+1) = ((unic >> 18) & 0x3F) | 0x80; *pOutput = ((unic >> 24) & 0x03) | 0xF8; return 5; } else if ( unic >= 0x04000000 && unic <= 0x7FFFFFFF ) { // * U-04000000 - U-7FFFFFFF: 1111110x 
10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx *(pOutput+5) = (unic & 0x3F) | 0x80; *(pOutput+4) = ((unic >> 6) & 0x3F) | 0x80; *(pOutput+3) = ((unic >> 12) & 0x3F) | 0x80; *(pOutput+2) = ((unic >> 18) & 0x3F) | 0x80; *(pOutput+1) = ((unic >> 24) & 0x3F) | 0x80; *pOutput = ((unic >> 30) & 0x01) | 0xFC; return 6; } return 0;}int CcharsetEncode::isUnicode(const string &src){if(src.size() != 6)return 0;if(src.find("\\u", 0) == 0){for(int i = 2; i <= 5; i++){if(!((src[i] >= 'a' && src[i] <= 'f')|| (src[i] >= 'A' && src[i] <= 'F')|| (src[i] >= '0' && src[i] <= '9'))){return 0;}}return 1;}else{return 0;}}unsigned int CcharsetEncode::xstrtoshortint(const char *str){ int len = strlen(str); unsigned int ivalue = 0; for (int i = 0; i < len; i++) { if ((str[i] <= '9' && str[i] >= '0')) { ivalue = ivalue * 16 + (str[i] - '0'); //16进制 可换其它进制 } else if ((str[i] >= 'a' && str[i] <= 'f')) { ivalue = ivalue * 16 + (str[i] - 'a') + 10; } else if ((str[i] >= 'A' && str[i] <= 'F')) { ivalue = ivalue * 16 + (str[i] - 'A') + 10; } } return ivalue;}void CcharsetEncode::ReplaceStr(string &strContent, const char *strSrc, const char *strDest){ string strCopy(strContent); string strSrcCopy(strSrc); string::size_type pos = 0; string::size_type srclen = strlen(strSrc); if( (pos=strCopy.find(strSrcCopy, pos)) != string::npos) { strContent.replace(pos, srclen, strDest); }}
主函数测试:
// Demo: converts a string of "\uXXXX" escapes to UTF-8 and prints the result.
int main()
{
    CcharsetEncode encode;

    // The backslashes are doubled so that, at run time, the string holds the
    // literal escape text (backslash, 'u', four hex digits). With a single
    // backslash the compiler resolves \uXXXX as a universal-character-name and
    // the string would contain no escape text for unicode_to_utf8 to convert.
    string src = "\\u300a\\u58eb\\u5175\\u7a81\\u51fb\\u300b";

    encode.unicode_to_utf8(src);
    cout << " unicode: " << src << endl;
    return 0;
}
0 0
- C++ 实现unicode到utf-8的转码
- [转]Unicode与UTF-8互转(C语言实现)
- UTF-8到Unicode的编码转换
- UTF-8到Unicode的编码转换
- UTF-8到Unicode的编码转换
- Unicode到UTF-8的转换详解
- unicode到utf-8的转换
- 从Unicode到UTF-8的编码
- Unicode字符串转UTF-8 (C++)
- UTF-8转Unicode 编码 C语言
- UTF-8转UTF-16,UTF-8转UNICODE详解 附UTF8转UNICODE C源码
- window下ANSI转UTF-8,UTF-8转Unicode的实现
- Unicode与UTF-8互转(C语言实现)
- Unicode与UTF-8互转(C语言实现)
- Unicode与UTF-8互转(C语言实现)
- Unicode与UTF-8互转(C语言实现)
- Unicode与UTF-8互转(C语言实现)
- Unicode与UTF-8互转(C语言实现)
- splay 小结
- 破解myeclipse时,ACTIVATION_KEY 老是为null
- 第三周--顺序表的基本运算
- Eclipse将引用了第三方jar包的Java项目打包成jar文件的两种方法
- H5 canvas 绘制五星红旗
- C++ 实现unicode到utf-8的转码
- 关于jquery中html()、text()、val()的区别
- 第十四章 上机练习1
- XML04-DOM4J解析xml文件
- Android中接口的回调,Activity或Fragment与普通类之间的互相调用
- PHP codeigniter框架
- Java线程总结(六):并发包------线程同步Lock
- 平衡三进制(可推广到n进制)
- 22. Generate Parentheses