Android NDK中字符串的相互转换

来源：互联网发布：商务综合管理平台源码编辑：程序博客网时间：2024/06/07 23:17

在Android NDK中gb2312字符串，unicode字符串，utf-8字符串相互转换

前提说明：在Android NDK中，一个wchar_t是4个字节，也就是说Android NDK下，unicode字符是采用ucs4的，而在windows系统下，unicode是采用ucs2，即每个unicode字符是占用两个字节的，可以用sizeof()来证明上面说的。如果在Android NDK下，想要强制指定一个wchar_t是2个字节，即跟windows系统下一样，采用ucs2，需要在Android.mk中添加一句LOCAL_CFLAGS := -fshort-wchar即可。但是如果指定wchar_t为两个字节的话，则所有用于处理unicode字符串的函数（如wcscpy，wcslen，wcscmp，wcscat）都不再适用，需要你自己重新写这些函数，当然这都很容易处理。我写的例子中还是使用默认的ucs4来处理unicode字符串的。见文件as_base_fun.cpp。

在Android NDK中还没有现成的函数来处理字符串的相互转换，但是Android底层有个libicuuc.so库文件，在目录/system/lib/下可以找到这个文件。在这个文件中有个函数ucnv_convert，我们可以从这个库中找到这个函数，然后利用这个函数来转换字符串。在不同的Android版本中，这个函数的名字可能会有点不一样，比如在Android 2.3下函数名为：ucnv_convert_44。

在Android NDK中，一个char *的字符串默认是utf-8编码的。比如：

char *str = "字符串示例"; 字符串str在内存中应该占用15个字节（不含结尾的'\0'），每个汉字占用3个字节。

如要将一个gb2312编码的字符串转换为ucs4的unicode，则目标缓冲区的大小应该至少为strlen(source)+2个wchar_t,注意不可以是strlen(source)+1，其中的source为要转换的字符串。

在gb2312编码的字符串中，一个英文字母或者是数字占用一个字节，汉字占两个字节

在utf-8编码的字符串中，一个英文字母或者是数字占用一个字节，汉字占3个字节

As_base_fun.cpp源码：

/* Headers required by this listing (the original excerpt did not show them). */
#include <dlfcn.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <wchar.h>

/* Wide character type used for the "unicode" strings handled below. */
typedef wchar_t TCHAR;

/*
 * Shape under which the ICU conversion routine is invoked:
 * (to-encoding name, from-encoding name, target buffer, target capacity in
 * bytes, source buffer, source length in bytes, error-code out-parameter).
 *
 * NOTE(review): ICU documents ucnv_convert as returning int32_t; the void
 * return type is kept so that other users of this global keep compiling.
 * Confirm against the ICU headers before relying on a return value.
 */
typedef void (*ucnv_convert_fn)(const char *, const char *, char *, int32_t,
                                const char *, int32_t, int32_t *);

/* Resolved by init(); stays 0 while no conversion routine is available. */
ucnv_convert_fn ucnv_convert = 0;

/*
 * Opens /system/lib/libicuuc.so and looks up the conversion routine in it.
 *
 * The exported symbol carries a version suffix that differs between Android
 * releases (for example "ucnv_convert_44" on Android 2.3), so several
 * candidate names are probed in turn and the first hit is stored in the
 * global ucnv_convert.
 *
 * Returns the dlopen handle (to be passed to close_pDL later), or NULL when
 * the library cannot be opened. A non-NULL handle does not guarantee that a
 * symbol was found; callers can test ucnv_convert for that.
 */
void *init(void)
{
    /* Extra names tried after the original candidates. */
    static const char *const fallback_names[] = {
        "ucnv_convert_4_2", /* name quoted for Android 2.2 */
        "ucnv_convert"      /* presumably used by builds without symbol renaming */
    };
    char symbol[32];
    size_t i;
    int minor;

    void *pDL = dlopen("/system/lib/libicuuc.so", RTLD_LAZY);
    if (NULL == pDL) {
        return pDL;
    }

    ucnv_convert = (ucnv_convert_fn)dlsym(pDL, "ucnv_convert_3_8");

    /* Probe ucnv_convert_40 .. ucnv_convert_49. */
    for (minor = 0; 0 == ucnv_convert && minor <= 9; minor++) {
        snprintf(symbol, sizeof symbol, "ucnv_convert_4%d", minor);
        ucnv_convert = (ucnv_convert_fn)dlsym(pDL, symbol);
    }

    for (i = 0;
         0 == ucnv_convert && i < sizeof fallback_names / sizeof fallback_names[0];
         i++) {
        ucnv_convert = (ucnv_convert_fn)dlsym(pDL, fallback_names[i]);
    }

    return pDL;
}

/*
 * Releases a handle obtained from init(). A NULL handle is ignored.
 *
 * The global ucnv_convert is cleared first: it points into the library being
 * unloaded, so the converters below must fall back to "no routine available"
 * instead of jumping through a stale pointer.
 */
void close_pDL(void *pDL)
{
    if (pDL) {
        ucnv_convert = 0;
        dlclose(pDL);
    }
}

/* Narrows a byte count to the int32_t range expected by the ICU routine. */
static int32_t as_base_fun_to_i32(size_t n)
{
    return (n > (size_t)INT32_MAX) ? INT32_MAX : (int32_t)n;
}

/*
 * Shared implementation of the six converters: zero-fills the destination,
 * then runs the conversion if a routine has been resolved.
 * Returns the error code written by the routine, or 0 if none is available.
 */
static int as_base_fun_convert(const char *to_name, const char *from_name,
                               char *dest, size_t dest_bytes,
                               const char *src, size_t src_bytes)
{
    int32_t err_code = 0;

    /* Pre-clearing keeps the result terminated whenever it is shorter than
     * the buffer. */
    memset(dest, 0, dest_bytes);

    if (0 == ucnv_convert) {
        return 0;
    }

    ucnv_convert(to_name, from_name, dest, as_base_fun_to_i32(dest_bytes),
                 src, as_base_fun_to_i32(src_bytes), &err_code);
    return (int)err_code;
}

/*
 * The six functions below convert between gb2312, utf-8 and ucs4 (wchar_t)
 * strings.
 *
 *   dest      buffer that receives the converted string
 *   dest_len  capacity of dest, counted in elements of dest's type (bytes for
 *             char buffers, wchar_t units for TCHAR buffers); it must be large
 *             enough for the result. For gb2312 -> ucs4 the author's guidance
 *             is at least strlen(source) + 2 wchar_t; strlen(source) + 1 is
 *             not enough.
 *   source    NUL-terminated string to convert
 *
 * In gb2312 a letter or digit takes one byte and a Chinese character two; in
 * utf-8 a letter or digit takes one byte and a Chinese character three.
 *
 * Each function returns the error code reported by the conversion routine
 * (0 means no error). It also returns 0 without converting when dest or
 * source is NULL, when dest_len is negative, or when init() has not resolved
 * a conversion routine.
 */

/* ucs4 (wchar_t) -> gb2312. */
int unicode2gb2312(char *dest, int dest_len, const TCHAR *source)
{
    if (NULL == dest || NULL == source || dest_len < 0) {
        return 0;
    }
    return as_base_fun_convert("gb2312", "ucs4", dest, (size_t)dest_len,
                               (const char *)source,
                               wcslen(source) * sizeof(TCHAR));
}

/* ucs4 (wchar_t) -> utf-8. */
int unicode2utf8(char *dest, int dest_len, const TCHAR *source)
{
    if (NULL == dest || NULL == source || dest_len < 0) {
        return 0;
    }
    return as_base_fun_convert("utf-8", "ucs4", dest, (size_t)dest_len,
                               (const char *)source,
                               wcslen(source) * sizeof(TCHAR));
}

/* gb2312 -> ucs4 (wchar_t). */
int gb23122unicode(TCHAR *dest, int dest_len, const char *source)
{
    if (NULL == dest || NULL == source || dest_len < 0) {
        return 0;
    }
    return as_base_fun_convert("ucs4", "gb2312", (char *)dest,
                               (size_t)dest_len * sizeof(TCHAR),
                               source, strlen(source));
}

/* utf-8 -> ucs4 (wchar_t). */
int utf82unicode(TCHAR *dest, int dest_len, const char *source)
{
    if (NULL == dest || NULL == source || dest_len < 0) {
        return 0;
    }
    return as_base_fun_convert("ucs4", "utf-8", (char *)dest,
                               (size_t)dest_len * sizeof(TCHAR),
                               source, strlen(source));
}

/* gb2312 -> utf-8. */
int gb23122utf8(char *dest, int dest_len, const char *source)
{
    if (NULL == dest || NULL == source || dest_len < 0) {
        return 0;
    }
    return as_base_fun_convert("utf-8", "gb2312", dest, (size_t)dest_len,
                               source, strlen(source));
}

/* utf-8 -> gb2312. */
int utf82gb2312(char *dest, int dest_len, const char *source)
{
    if (NULL == dest || NULL == source || dest_len < 0) {
        return 0;
    }
    return as_base_fun_convert("gb2312", "utf-8", dest, (size_t)dest_len,
                               source, strlen(source));
}

0 0