C++ 实现unicode到utf-8的转码

来源:互联网 发布:淘宝热搜词在哪里看 编辑:程序博客网 时间:2024/04/29 05:29

思路:

先找出字符串中的 Unicode 转义部分(形如 \uXXXX),然后将该部分转换为 UTF-8 格式的字符,最后将字符串里面的所有 Unicode 转义替换为对应的 UTF-8 字符即可。


废话不多说,直接上代码:

头文件:

/*
 * charsetEncode.h
 *
 *  Created on: Jul 25, 2016
 *      Author: root
 */
#ifndef COMMONSERVER_INCLUDE_CHARSETENCODE_H_
#define COMMONSERVER_INCLUDE_CHARSETENCODE_H_

#include <iostream>
#include <algorithm>
#include <string>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>

// Kept at file scope: the declarations below spell `string` unqualified.
using namespace std;

/**
 * Converts textual "\uXXXX" escapes (a backslash, the letter 'u' and four
 * hexadecimal digits) found inside a string into UTF-8 encoded bytes.
 *
 * The class declares no data members.
 */
class CcharsetEncode
{
public:
    /**
     * Rewrites the "\uXXXX" escapes contained in `source` as UTF-8, in place.
     *
     * @param source  text to scan; modified in place.
     * @return status code (the implementation returns 0).
     */
    int unicode_to_utf8(string &source);

    /**
     * Replaces the first occurrence of `strSrc` inside `strContent` with
     * `strDest`. The implementation matches case-sensitively.
     *
     * @param strContent  string that is searched and modified in place.
     * @param strSrc      NUL-terminated text to look for.
     * @param strDest     NUL-terminated replacement text.
     */
    void ReplaceStr(string &strContent, const char *strSrc, const char *strDest);

private:
    /**
     * Tells whether `src` is exactly one "\uXXXX" escape (6 characters in
     * total, e.g. a backslash followed by "u5e3f").
     *
     * @return 1 when it is, 0 when it is not.
     */
    int isUnicode(const string &src);

    /**
     * Parses a hexadecimal string into its numeric value, e.g. "1a3f" -> 0x1a3f.
     */
    unsigned int xstrtoshortint(const char *str);

    /**
     * Encodes the single code point `unic` as UTF-8 into `pOutput`.
     *
     * @param unic     code point to encode.
     * @param pOutput  destination buffer.
     * @param outSize  capacity of `pOutput` in bytes.
     * @return number of bytes written, or 0 when `unic` cannot be encoded.
     */
    int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize);
};

#endif /* COMMONSERVER_INCLUDE_CHARSETENCODE_H_ */




源文件:

/* * charsetEncode.cpp * *  Created on: Jul 25, 2016 *      Author: root */#include "charsetEncode.h"int CcharsetEncode::unicode_to_utf8(string &source){int sourcesize = source.size();string src;unsigned char pout[8];for(int index = 0; index < sourcesize - 6;){memset(pout, 0, 8);src = source.substr(index, 6);if(isUnicode(src) == 1){string hexsrc = source.substr(index + 2, 4);int num = enc_unicode_to_utf8_one(xstrtoshortint(hexsrc.c_str()), pout, 8);ReplaceStr(source, src.c_str(), (char *)pout);index += 3;sourcesize = source.size();}else{index++;}}return 0;}int CcharsetEncode::enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize){    assert(pOutput != NULL);    assert(outSize >= 6);    if ( unic <= 0x0000007F )    {        // * U-00000000 - U-0000007F:  0xxxxxxx        *pOutput     = (unic & 0x7F);        return 1;    }    else if ( unic >= 0x00000080 && unic <= 0x000007FF )    {        // * U-00000080 - U-000007FF:  110xxxxx 10xxxxxx        *(pOutput+1) = (unic & 0x3F) | 0x80;        *pOutput     = ((unic >> 6) & 0x1F) | 0xC0;        return 2;    }    else if ( unic >= 0x00000800 && unic <= 0x0000FFFF )    {        // * U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx        *(pOutput+2) = (unic & 0x3F) | 0x80;        *(pOutput+1) = ((unic >>  6) & 0x3F) | 0x80;        *pOutput     = ((unic >> 12) & 0x0F) | 0xE0;        return 3;    }    else if ( unic >= 0x00010000 && unic <= 0x001FFFFF )    {        // * U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx        *(pOutput+3) = (unic & 0x3F) | 0x80;        *(pOutput+2) = ((unic >>  6) & 0x3F) | 0x80;        *(pOutput+1) = ((unic >> 12) & 0x3F) | 0x80;        *pOutput     = ((unic >> 18) & 0x07) | 0xF0;        return 4;    }    else if ( unic >= 0x00200000 && unic <= 0x03FFFFFF )    {        // * U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx        *(pOutput+4) = (unic & 0x3F) | 0x80;        *(pOutput+3) = ((unic >>  6) & 0x3F) | 0x80;        
*(pOutput+2) = ((unic >> 12) & 0x3F) | 0x80;        *(pOutput+1) = ((unic >> 18) & 0x3F) | 0x80;        *pOutput     = ((unic >> 24) & 0x03) | 0xF8;        return 5;    }    else if ( unic >= 0x04000000 && unic <= 0x7FFFFFFF )    {        // * U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx        *(pOutput+5) = (unic & 0x3F) | 0x80;        *(pOutput+4) = ((unic >>  6) & 0x3F) | 0x80;        *(pOutput+3) = ((unic >> 12) & 0x3F) | 0x80;        *(pOutput+2) = ((unic >> 18) & 0x3F) | 0x80;        *(pOutput+1) = ((unic >> 24) & 0x3F) | 0x80;        *pOutput     = ((unic >> 30) & 0x01) | 0xFC;        return 6;    }    return 0;}int CcharsetEncode::isUnicode(const string &src){if(src.size() != 6)return 0;if(src.find("\\u", 0) == 0){for(int i = 2; i <= 5; i++){if(!((src[i] >= 'a' && src[i] <= 'f')|| (src[i] >= 'A' && src[i] <= 'F')|| (src[i] >= '0' && src[i] <= '9'))){return 0;}}return 1;}else{return 0;}}unsigned int CcharsetEncode::xstrtoshortint(const char *str){    int len = strlen(str);    unsigned int ivalue = 0;    for (int i = 0; i < len; i++)    {        if ((str[i] <= '9' && str[i] >= '0'))        {            ivalue = ivalue * 16 + (str[i] - '0'); //16进制 可换其它进制        }        else if ((str[i] >= 'a' && str[i] <= 'f'))        {            ivalue = ivalue * 16 + (str[i] - 'a') + 10;        }        else if ((str[i] >= 'A' && str[i] <= 'F'))        {            ivalue = ivalue * 16 + (str[i] - 'A') + 10;        }    }    return ivalue;}void CcharsetEncode::ReplaceStr(string &strContent, const char *strSrc, const char *strDest){    string strCopy(strContent);    string strSrcCopy(strSrc);    string::size_type pos = 0;    string::size_type srclen = strlen(strSrc);    if( (pos=strCopy.find(strSrcCopy, pos)) != string::npos)    {        strContent.replace(pos, srclen, strDest);    }}




主函数测试:
/**
 * Demo: converts a string holding textual \uXXXX escapes and prints the result.
 */
int main()
{
    CcharsetEncode encode;

    // The backslashes are doubled on purpose. With a single backslash the
    // compiler treats each escape as a universal-character-name and converts
    // it at compile time, so the string would contain no escape text at all
    // and unicode_to_utf8() would have nothing to do.
    string src = "\\u300a\\u58eb\\u5175\\u7a81\\u51fb\\u300b";

    encode.unicode_to_utf8(src);
    cout << " unicode: " << src << endl;
    return 0;
}






0 0
原创粉丝点击