多字节与UTF-8、Unicode之间的转换

来源：互联网　发布：linux libx264　编辑：程序博客网　时间：2024/05/18 03:04

// Converts a multi-byte string (system ANSI code page, CP_ACP) to UTF-8.
//
// The Win32 API offers no direct ANSI -> UTF-8 conversion, so the text is
// first widened to UTF-16 and then narrowed again with CP_UTF8.
//
// pu8  : receives the UTF-8 bytes on success. It is left untouched when the
//        function fails before the final conversion and is cleared when the
//        final conversion itself fails.
// pmb  : source bytes in the ANSI code page.
// mLen : length of pmb, forwarded as cbMultiByte to MultiByteToWideChar.
//
// Returns true on success, false on any conversion or allocation failure.
bool MBToUTF8(vector<char>& pu8, const char* pmb, int32 mLen)
{
    // Step 1: query how many wide characters the input expands to.
    const int32 nLen = MultiByteToWideChar(CP_ACP, 0, pmb, mLen, NULL, 0);
    if (nLen <= 0)
    {
        // Nothing to convert, or the API reported an error.
        return false;
    }

    // The vector owns the intermediate buffer, so every return path below
    // releases it automatically (the raw new[] version leaked on one path).
    vector<WCHAR> wide;
    try
    {
        wide.resize(nLen);
    }
    catch (const bad_alloc&)
    {
        return false;
    }

    // Step 2: ANSI -> UTF-16.
    int32 nRtn = MultiByteToWideChar(CP_ACP, 0, pmb, mLen, &wide[0], nLen);
    if (nRtn != nLen)
    {
        return false;
    }

    // Step 3: query the UTF-8 size, then UTF-16 -> UTF-8.
    const int32 utf8Len = WideCharToMultiByte(CP_UTF8, 0, &wide[0], nLen, NULL, 0, NULL, NULL);
    if (utf8Len <= 0)
    {
        return false;
    }

    pu8.resize(utf8Len);
    nRtn = WideCharToMultiByte(CP_UTF8, 0, &wide[0], nLen, &pu8[0], utf8Len, NULL, NULL);
    if (nRtn != utf8Len)
    {
        pu8.clear();
        return false;
    }
    return true;
}
// Converts a UTF-8 string to the system multi-byte (ANSI, CP_ACP) encoding.
//
// The conversion goes through UTF-16: UTF-8 is widened with CP_UTF8 and the
// result is narrowed with CP_ACP.
//
// pmb     : receives the ANSI bytes on success. It is left untouched when the
//           function fails before the final conversion and is cleared when
//           the final conversion itself fails.
// pu8     : source bytes in UTF-8.
// utf8Len : length of pu8, forwarded as cbMultiByte to MultiByteToWideChar.
//
// Returns true on success, false on any conversion or allocation failure.
bool UTF8ToMB(vector<char>& pmb, const char* pu8, int32 utf8Len)
{
    // Step 1: query how many wide characters the UTF-8 input expands to.
    const int32 nLen = MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, NULL, 0);
    if (nLen <= 0)
    {
        // Nothing to convert, or the API reported an error.
        return false;
    }

    // The vector owns the intermediate buffer, so every return path below
    // releases it automatically (the raw new[] version leaked on one path).
    vector<WCHAR> wide;
    try
    {
        wide.resize(nLen);
    }
    catch (const bad_alloc&)
    {
        return false;
    }

    // Step 2: UTF-8 -> UTF-16.
    int32 nRtn = MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, &wide[0], nLen);
    if (nRtn != nLen)
    {
        return false;
    }

    // Step 3: query the ANSI size, then UTF-16 -> ANSI.
    const int32 MBLen = WideCharToMultiByte(CP_ACP, 0, &wide[0], nLen, NULL, 0, NULL, NULL);
    if (MBLen <= 0)
    {
        return false;
    }

    pmb.resize(MBLen);
    nRtn = WideCharToMultiByte(CP_ACP, 0, &wide[0], nLen, &pmb[0], MBLen, NULL, NULL);
    if (nRtn != MBLen)
    {
        pmb.clear();
        return false;
    }
    return true;
}
// Converts a multi-byte string (system ANSI code page, CP_ACP) to wide
// characters.
//
// pun  : receives the wide characters on success; cleared if the second
//        conversion call does not produce the expected count.
// pmb  : source bytes in the ANSI code page.
// mLen : length of pmb, forwarded as cbMultiByte to MultiByteToWideChar.
//
// Returns true on success, false otherwise.
bool MBToUnicode(vector<wchar_t>& pun, const char* pmb, int32 mLen)
{
    // First call: size query only (no output buffer supplied).
    const int32 required = MultiByteToWideChar(CP_ACP, 0, pmb, mLen, NULL, 0);
    if (required <= 0)
    {
        return false;
    }

    // Second call: convert straight into the caller's vector.
    pun.resize(required);
    const int32 written = MultiByteToWideChar(CP_ACP, 0, pmb, mLen, &pun[0], required);
    if (written == required)
    {
        return true;
    }

    pun.clear();
    return false;
}
// Converts a wide-character string to the system multi-byte (ANSI, CP_ACP)
// encoding.
//
// pmb  : receives the ANSI bytes on success; cleared if the second
//        conversion call does not produce the expected count.
// pun  : source wide characters.
// uLen : length of pun, forwarded as cchWideChar to WideCharToMultiByte.
//
// Returns true on success, false otherwise.
bool UnicodeToMB(vector<char>& pmb, const wchar_t* pun, int32 uLen)
{
    // First call: size query only (no output buffer supplied).
    const int32 required = WideCharToMultiByte(CP_ACP, 0, pun, uLen, NULL, 0, NULL, NULL);
    if (required <= 0)
    {
        return false;
    }

    // Second call: convert straight into the caller's vector.
    pmb.resize(required);
    const int written = WideCharToMultiByte(CP_ACP, 0, pun, uLen, &pmb[0], required, NULL, NULL);
    if (written == required)
    {
        return true;
    }

    pmb.clear();
    return false;
}
// Converts a UTF-8 string to wide characters.
//
// pun     : receives the wide characters on success; cleared if the second
//           conversion call does not produce the expected count.
// pu8     : source bytes in UTF-8.
// utf8Len : length of pu8, forwarded as cbMultiByte to MultiByteToWideChar.
//
// Returns true on success, false otherwise.
bool UTF8ToUnicode(vector<wchar_t>& pun, const char* pu8, int32 utf8Len)
{
    // First call: size query only (no output buffer supplied).
    const int32 required = MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, NULL, 0);
    if (required <= 0)
    {
        return false;
    }

    // Second call: convert straight into the caller's vector.
    pun.resize(required);
    const int32 written = MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, &pun[0], required);
    if (written == required)
    {
        return true;
    }

    pun.clear();
    return false;
}
// Converts a wide-character string to UTF-8.
//
// pu8  : receives the UTF-8 bytes on success; cleared if the second
//        conversion call does not produce the expected count.
// pun  : source wide characters.
// uLen : length of pun, forwarded as cchWideChar to WideCharToMultiByte.
//
// Returns true on success, false otherwise.
bool UnicodeToUTF8(vector<char>& pu8, const wchar_t* pun, int32 uLen)
{
    // First call: size query only (no output buffer supplied).
    const int32 required = WideCharToMultiByte(CP_UTF8, 0, pun, uLen, NULL, 0, NULL, NULL);
    if (required <= 0)
    {
        return false;
    }

    // Second call: convert straight into the caller's vector.
    pu8.resize(required);
    const int32 written = WideCharToMultiByte(CP_UTF8, 0, pun, uLen, &pu8[0], required, NULL, NULL);
    if (written == required)
    {
        return true;
    }

    pu8.clear();
    return false;
}

另外，还看到别人写的一种更详细的方法：

#include "stdafx.h"
#include <windows.h>
#include <locale.h>

/* MultiByteToWideChar和WideCharToMultiByte每个都调用了两次，
第一次转换是为了得到转换后所需的长度 */
void TestMultiToWideChar()
{
do
{
/* 这里的szBuffer中的内容在中文Windows下默认用的是GB2312编码，也可以说是MBCS编码,
有几个概念一直让人模糊，今天算是搞清楚了，GB2312编码，MBCS编码，ANSI编码，
其实这三种编码是同一种编码格式，GB2312是专门针对中文的，是ANSI编码在中文系统下
的别称，在日文系统下，ANSI就叫JIS了，而MBCS意思就是多字节编码，对于ASCII码，采用一个字节，
对于中文采用两个字节，所以也叫MBCS，还有DBCS，在中文Windows下，就是GB2312，双字节编码。
哎，名字太多了 */
char szBuffer[32] = "赵武涛";
printf("szBuffer = %s\n", szBuffer);

/* MSDN关于CP_ACP的阐释： The current system Windows ANSI code page.
注意这里的CP_ACP表示转换要用到的CodePage类型，因为这里的szBuffer在中文Windows下
是GB2312编码，所以这里用ANSI这个codePage就行了，GB2312就是ANSI编码的一种，
MultiByteToWideChar和WideCharToMultiByte这两个API的参数意义参考MSDN即可 */

/* 第四个参数设为-1，MSDN里的解释为If this parameter is -1, the function processes the entire
input string, including the null terminator. Therefore, the resulting wide character string
has a null terminator, and the length returned by the function includes the terminating null character.
也就是说，如果设为-1, 表示系统处理整个szBuffer里的内容，包括NULL结束符，并且返回值包括一个NULL结束符占的长度。
最后一个参数设为0，MSDN里的解释为If this parameter is set to 0, the function returns the required buffer
size for lpMultiByteStr and makes no use of the output parameter itself.
也就是说，设为0表示返回值是转换所需的WCHAR缓冲区长度，包括NULL结束符*/

int nLen = MultiByteToWideChar(CP_ACP, 0, szBuffer, -1, NULL, 0);
if (nLen == 0) // 这里的nlen的长度以WCHAR为单位，及两个字节为单位
{
printf("errorCode = %d\n", GetLastError());
break;
}

WCHAR *pwszBuffer = new WCHAR[nLen];
nLen = MultiByteToWideChar(CP_ACP, 0, szBuffer, -1, pwszBuffer, nLen);
if (nLen == 0)
{
printf("errorCode = %d\n", GetLastError());
break;
}

// pwszBuffer在内存中的字节序为75 8d 66 6b 9b 6d 00 00，结尾的NULL字符也占两个字节
wprintf(L"pwszBuffer = %s\n", pwszBuffer);

delete []pwszBuffer;
} while (false);
}

void TestWideToMultiChar()
{
do
{
WCHAR wszBuffer[32] = L"赵武涛";

int nLen = WideCharToMultiByte(CP_ACP, 0, wszBuffer, -1, NULL, 0, NULL, NULL);
if (nLen == 0) // 这里的nLen以一个字节为单位
{
printf("errorCode = %d\n", GetLastError());
break;
}

char *pszBuffer = new char[nLen];
nLen = WideCharToMultiByte(CP_ACP, 0, wszBuffer, -1, pszBuffer, nLen, NULL, NULL);
if (nLen == 0)
{
printf("errorCode = %d\n", GetLastError());
break;
}

printf("pszBuffer = %s\n", pszBuffer);
delete []pszBuffer;
} while (false);
}

/* 这个函数间接囊括了UNICODE到UTF8的转换和UTF8到UNICODE的转换,
网上很多帖子对这个转换为什么要进行两次转换基本没有说明 */
void TestMultiToUTF8()
{
do
{
/* 这个方法里，要先把GB2312字符串转换成UNICODE编码，再用UNICODE转UTF8,
因为没有一种CodePage可以直接将GB2312转换成UTF8，所以这里就要先转UNICODE，再
通过CP_UTF8进行转换，UTF8可以视为一种变长的多字节编码，虽说UTF8是对UNICODE字符集
执行的一种编码形式，但其编码是采用1~6字节变长编码，所以可以视为多字节编码 */
char szBuffer[32] = "赵武涛";
int nLen = MultiByteToWideChar(CP_ACP, 0, szBuffer, -1, NULL, 0);
if (nLen == 0) // nLen is in WCHAR values
{
printf("errorCode = %d\n", GetLastError());
break;
}

WCHAR *pwszBuffer = new WCHAR[nLen];
nLen = MultiByteToWideChar(CP_ACP, 0, szBuffer, -1, pwszBuffer, nLen);
if (nLen == 0)
{
printf("errorCode = %d\n", GetLastError());
break;
}

wprintf(L"pwszBuffer = %s\n", pwszBuffer);

// 再转换成UTF-8编码
// 刚开始用这两个API时，对这个CodePage的认识很模糊，为什么有的地方用CP_UTF8，有的用CP_ACP
nLen = WideCharToMultiByte(CP_UTF8, 0, pwszBuffer, -1, NULL, 0, NULL, NULL);
if (nLen == 0) // nLen is in bytes values
{
printf("errorCode = %d\n", GetLastError());
break;
}

char *pszBuffer = new char[nLen];
nLen = WideCharToMultiByte(CP_UTF8, 0, pwszBuffer, -1, pszBuffer, nLen, NULL, NULL);
if (nLen == 0)
{
printf("errorCode = %d\n", GetLastError());
break;
}

/* 下面的代码只是测试，再将此UTF-8字符串转换成Unicode，看看输出结果, 注意
这里用的CodePage还是CP_UTF8，因为只有这个CodePage能在UTF8和Unicode间进行互转换，它
表示的意思并不是转换目标的编码，而是当前转换需要用到这个CodePage */
nLen = MultiByteToWideChar(CP_UTF8, 0, pszBuffer, -1, NULL, 0);
if (nLen == 0) // nLen is in bytes values
{
printf("errorCode = %d\n", GetLastError());
break;
}

WCHAR *pwszBuf2 = new WCHAR[nLen];
nLen = MultiByteToWideChar(CP_UTF8, 0, pszBuffer, -1, pwszBuf2, nLen);
if (nLen == 0) // nLen is in bytes values
{
printf("errorCode = %d\n", GetLastError());
break;
}

wprintf(L"pwszBuf2 = %s\n", pwszBuf2);
delete []pwszBuf2;
delete []pwszBuffer;
} while (false);
}

// Program entry point: sets the locale, then runs the three conversion
// demos in order. Always returns 0.
int _tmain(int argc, _TCHAR* argv[])
{
    // The locale is set so that wprintf can output wide-character Chinese
    // text correctly.
    setlocale(LC_ALL, "chs");

    // Demo routines, executed in declaration order.
    typedef void (*DemoFn)();
    const DemoFn demos[] = { TestMultiToWideChar, TestWideToMultiChar, TestMultiToUTF8 };
    const int demoCount = static_cast<int>(sizeof(demos) / sizeof(demos[0]));

    for (int i = 0; i < demoCount; ++i)
    {
        demos[i]();
    }

    return 0;
}

0 0