常用字库编码的转换(Unicode,Utf8,Big5,Gb18030)
来源:互联网 发布:火狐 chrome 知乎 编辑:程序博客网 时间:2024/05/16 00:34
/*
 * Encode a zero-terminated string of UNC code units as UTF-8.
 *
 * src   - input string; conversion stops at the first 0 unit.
 * putf8 - output buffer. No length is passed in, so the caller must supply
 *         room for up to three bytes per input unit plus the terminating 0.
 *
 * Returns the number of bytes written, not counting the terminating 0.
 *
 * NOTE(review): at most three bytes are produced per unit, which is only
 * sufficient if UNC is a 16-bit type -- confirm against its typedef.
 * Surrogate pairs are not combined; each unit is encoded on its own.
 */
int UnicodeToUtf8(LPUNC src, BYTE* putf8)
{
    int len = 0;

    while (*src)
    {
        if (*src < 0x80)
        {
            /* 0xxxxxxx */
            putf8[len++] = (BYTE)*src;
        }
        else if (*src < 0x800)
        {
            /* 110xxxxx 10xxxxxx: the lead byte carries bits 6..10 and the
             * trail byte bits 0..5 (the previous code shifted by 12 and 6,
             * which produced wrong bytes for every value in this range). */
            putf8[len++] = (BYTE)(0xC0 | (*src >> 6));
            putf8[len++] = (BYTE)(0x80 | (*src & 0x3F));
        }
        else
        {
            /* 1110xxxx 10xxxxxx 10xxxxxx */
            putf8[len++] = (BYTE)(0xE0 | (*src >> 12));
            putf8[len++] = (BYTE)(0x80 | ((*src >> 6) & 0x3F));
            putf8[len++] = (BYTE)(0x80 | (*src & 0x3F));
        }
        src++;
    }

    putf8[len] = 0;
    return len;
}
/*
 * Decode one UTF-8 sequence of one, two or three bytes.
 *
 * src      - points at the first byte of the sequence.
 * punicode - receives the decoded value; left untouched on failure.
 *
 * Returns the number of bytes consumed (1, 2 or 3), or 0 when the bytes at
 * src do not form one of the recognised patterns. Continuation bytes are
 * only inspected after the lead byte has matched, so a terminating 0 right
 * after a lead byte is reported as an error rather than read past.
 */
int Utf8ToUnicode(BYTE* src, LPUNC punicode)
{
    BYTE lead = src[0];

    /* 0xxxxxxx: single byte */
    if ((lead & 0x80) == 0) {
        *punicode = (UNC)lead;
        return 1;
    }

    /* 110xxxxx 10xxxxxx: two bytes */
    if ((lead & 0xE0) == 0xC0) {
        if ((src[1] & 0xC0) != 0x80) {
            return 0;
        }
        *punicode = (UNC)(((UNC)(lead & 0x1F) << 6) |
                          (UNC)(src[1] & 0x3F));
        return 2;
    }

    /* 1110xxxx 10xxxxxx 10xxxxxx: three bytes */
    if ((lead & 0xF0) == 0xE0) {
        if ((src[1] & 0xC0) != 0x80) {
            return 0;
        }
        if ((src[2] & 0xC0) != 0x80) {
            return 0;
        }
        *punicode = (UNC)(((UNC)(lead & 0x0F) << 12) |
                          ((UNC)(src[1] & 0x3F) << 6) |
                          (UNC)(src[2] & 0x3F));
        return 3;
    }

    /* stray continuation byte or a longer form: not handled here */
    return 0;
}
/*
 * Map a Big5 code to its Unicode value.
 *
 * big5_unicode_tbl is read as flat pairs: element 2*k is a Big5 code and
 * element 2*k+1 the matching Unicode value. Pairs 0..13502 are binary
 * searched, which requires them to be in ascending Big5 order.
 *
 * Returns 0x0000 when the table pointer is NULL, when the code lies outside
 * 0xA140..0xF9FE (and is not below 0x80), or when no pair matches. Codes
 * below 0x80 are returned unchanged.
 */
UNC Big5ToUnicode(WORD big5)
{
    int first;
    int last;

    if (big5_unicode_tbl == NULL) {
        return 0x0000;
    }
    if (big5 < 0x80) {
        return big5;
    }
    if (big5 < 0xA140 || big5 > 0xF9FE) {
        return 0x0000;
    }

    first = 0;
    last = 13502;
    while (first <= last) {
        int probe = (first + last) / 2;
        WORD key = big5_unicode_tbl[probe * 2];

        if (key > big5) {
            last = probe - 1;
        } else if (key < big5) {
            first = probe + 1;
        } else {
            return big5_unicode_tbl[probe * 2 + 1];
        }
    }
    return 0x0000;
}
/*
 * Map a Unicode value to its Big5 code.
 *
 * unicode_big5_tbl is read as flat pairs: element 2*k is a Big5 code and
 * element 2*k+1 the matching Unicode value. Pairs 0..13502 are binary
 * searched on the Unicode element, which requires ascending Unicode order.
 *
 * Returns 0 when the table pointer is NULL or no pair matches. Values below
 * 0x80 are returned unchanged.
 */
WORD UnicodeToBig5(UNC unicode)
{
    int first;
    int last;

    if (unicode_big5_tbl == NULL) {
        return 0x0000;
    }
    if (unicode < 0x80) {
        return unicode;
    }

    first = 0;
    last = 13502;
    while (first <= last) {
        int probe = (first + last) / 2;
        UNC key = unicode_big5_tbl[probe * 2 + 1];

        if (key > unicode) {
            last = probe - 1;
        } else if (key < unicode) {
            first = probe + 1;
        } else {
            return unicode_big5_tbl[probe * 2];
        }
    }
    return 0x0;
}
/*
 * Map a one- or two-byte GB18030 code to its Unicode value.
 *
 * gb18030_unicode_tbl is read as flat pairs: element 2*k is a GB18030 code
 * and element 2*k+1 the matching Unicode value. Pairs 0..21790 are binary
 * searched, which requires them to be in ascending GB18030 order.
 *
 * Codes below 0x80 are returned unchanged and 0x80 maps to U+20AC.
 * Returns 0x0000 when the table pointer is NULL, when the code lies outside
 * 0x8140..0xFE4F, or when no pair matches.
 */
UNC Gb18030ToUnicode(WORD gb18030)
{
    int first;
    int last;

    if (gb18030_unicode_tbl == NULL) {
        return 0x0000;
    }
    if (gb18030 < 0x80) {
        return gb18030;
    }
    if (gb18030 == 0x80) {
        return 0x20AC;
    }
    if (gb18030 < 0x8140 || gb18030 > 0xFE4F) {
        return 0x0000;
    }

    first = 0;
    last = 21790;
    while (first <= last) {
        int probe = (first + last) / 2;
        WORD key = gb18030_unicode_tbl[probe * 2];

        if (key > gb18030) {
            last = probe - 1;
        } else if (key < gb18030) {
            first = probe + 1;
        } else {
            return gb18030_unicode_tbl[probe * 2 + 1];
        }
    }
    return 0x0000;
}
/*
 * Map a Unicode value to its one- or two-byte GB18030 code.
 *
 * unicode_gb18030_tbl is read as flat pairs: element 2*k is a GB18030 code
 * and element 2*k+1 the matching Unicode value. Pairs 0..21790 are binary
 * searched on the Unicode element, which requires ascending Unicode order.
 *
 * Values below 0x80 are returned unchanged and U+20AC maps to 0x80.
 * Returns 0 when the table pointer is NULL or no pair matches.
 */
WORD UnicodeToGb18030(UNC unicode)
{
    int first;
    int last;

    if (unicode_gb18030_tbl == NULL) {
        return 0x0000;
    }
    if (unicode < 0x80) {
        return unicode;
    }
    if (unicode == 0x20AC) {
        return 0x80;
    }

    first = 0;
    last = 21790;
    while (first <= last) {
        int probe = (first + last) / 2;
        UNC key = unicode_gb18030_tbl[probe * 2 + 1];

        if (key > unicode) {
            last = probe - 1;
        } else if (key < unicode) {
            first = probe + 1;
        } else {
            return unicode_gb18030_tbl[probe * 2];
        }
    }
    return 0x0;
}
// Convert a zero-terminated UTF-8 string to UNC code units.
// Parameters:
//   src      - source UTF-8 string
//   dest     - destination buffer
//   dest_len - capacity of dest, in code units
// Returns the number of code units written to dest. The terminating 0 is
// included in the count when it fits; if dest fills up first, dest_len is
// returned and dest is NOT zero-terminated. Returns 0 when dest_len <= 0.
// Bytes that Utf8ToUnicode rejects are replaced one at a time by
// UNKNOWN_CHAR, and a diagnostic dump is printed once at the end.
int utf8_to_unicode(BYTE* src, LPUNC dest, int dest_len)
{
    BYTE* old_src = src;
    int err = 0;
    int i = 0;

    while (i < dest_len) {
        int len = Utf8ToUnicode(src, &dest[i]);
        if (0 == len) {
            // Undecodable byte: substitute and resynchronise on the next byte.
            err = 1;
            len = 1;
            dest[i] = UNKNOWN_CHAR;
        }
        src += len;
        if (0x0000 == dest[i]) {
            break; // end-of-string terminator has been stored
        }
        i++;
    }

    if (err) {
        // NOTE(review): the dump length is fixed at 20 and may exceed either
        // buffer -- confirm what DUMP_BUFFER does with it.
        printf("utf8 string err! ");T();
        DUMP_BUFFER("---------------",(char *)old_src,20);
        DUMP_BUFFER("===============",(char *)dest,20);
    }

    // i < dest_len only when the loop stopped at the terminator, which then
    // occupies dest[i]; otherwise exactly i units were written.
    return (i < dest_len) ? i + 1 : i;
}
// Convert a zero-terminated UTF-8 string to UNC code units, counting errors.
// Parameters:
//   src        - source UTF-8 string
//   dest       - destination buffer
//   dest_len   - capacity of dest, in code units
//   perr_count - optional (may be NULL); receives the number of bytes that
//                could not be decoded and were replaced by UNKNOWN_CHAR
// Returns the number of code units written to dest. The terminating 0 is
// included in the count when it fits; if dest fills up first, dest_len is
// returned and dest is NOT zero-terminated. Returns 0 when dest_len <= 0.
int utf8_to_unicode2(BYTE* src, LPUNC dest, int dest_len, int*
perr_count)
{
    BYTE* old_src = src;
    int err = 0;
    int i = 0;

    // The loop below treats perr_count as nullable, so the reset must too.
    if (NULL != perr_count) {
        *perr_count = 0;
    }

    while (i < dest_len) {
        int len = Utf8ToUnicode(src, &dest[i]);
        if (0 == len) {
            // Undecodable byte: substitute and resynchronise on the next byte.
            err = 1;
            len = 1;
            dest[i] = UNKNOWN_CHAR;
            if (NULL != perr_count) {
                (*perr_count)++;
            }
        }
        src += len;
        if (0x0000 == dest[i]) {
            break; // end-of-string terminator has been stored
        }
        i++;
    }

    if (err) {
        // NOTE(review): the dump length is fixed at 20 and may exceed either
        // buffer -- confirm what DUMP_BUFFER does with it.
        printf("utf8 string err! ");T();
        DUMP_BUFFER("---------------",(char *)old_src,20);
        DUMP_BUFFER("===============",(char *)dest,20);
    }

    // i < dest_len only when the loop stopped at the terminator, which then
    // occupies dest[i]; otherwise exactly i units were written.
    return (i < dest_len) ? i + 1 : i;
}
// Convert a zero-terminated one-/two-byte GB18030 string to UNC code units.
// Parameters:
//   src      - source string
//   dest     - destination buffer
//   dest_len - capacity of dest, in code units
// Returns the number of code units written to dest. The terminating 0 is
// included in the count when it fits; if dest fills up first, dest_len is
// returned and dest is NOT zero-terminated. Returns 0 when dest_len <= 0.
// Input that cannot be mapped is stored as UNKNOWN_CHAR instead of 0, so a
// bad character no longer looks like the end of the string.
int gb18030_to_unicode(BYTE* src, LPUNC dest, int dest_len)
{
    int i = 0;

    while (i < dest_len) {
        BYTE lead = src[0];
        UNC mapped;

        if (0 == lead) {
            dest[i] = 0x0000; // end-of-string terminator
            return i + 1;
        }

        if (lead < 0x80) {
            mapped = (UNC)lead;
            src += 1;
        } else if (0x80 == lead) {
            // Gb18030ToUnicode treats 0x80 as a single-byte code, so it must
            // not swallow the byte that follows it.
            mapped = Gb18030ToUnicode(0x80);
            src += 1;
        } else if (0 == src[1]) {
            // Lead byte cut off by the terminator: consume only the lead so
            // the next iteration sees the terminator instead of skipping it.
            mapped = 0x0000;
            src += 1;
        } else {
            WORD ch = MAKEWORD(src[1], lead);
            mapped = Gb18030ToUnicode(ch);
            src += 2;
        }

        // Gb18030ToUnicode reports failure as 0.
        if (0x0000 == mapped) {
            dest[i] = UNKNOWN_CHAR;
        } else {
            dest[i] = mapped;
        }
        i++;
    }

    // Buffer filled (or dest_len <= 0) before the terminator was reached.
    return i;
}
- 常用字库编码的转换(Unicode,Utf8,Big5,Gb18030)
- 【转】 常用字库编码的转换(Unicode,Utf8,Big5,Gb18030)
- ASCII,unicode, utf8 ,big5 ,gb2312,gbk,gb18030等几种常用编码区别
- ASCII,unicode, utf8 ,big5 ,gb2312,gbk,gb18030各种常用编码区别
- ASCII,unicode, utf8 ,big5 ,gb2312,gbk,gb18030等几种常用编码区别
- ASCII,unicode, utf8 ,big5 ,gb2312,gbk,gb18030的来由和关系
- 彻底搞清楚字符编码: ASCII, ISO_8859, GB2312,UCS, Unicode, UTF8.(GBK, GB18030, BIG5, UTF-7,UTF-16,UTF-32)
- 彻底搞清楚字符编码: ASCII, ISO_8859, GB2312,UCS, Unicode, UTF8.(GBK, GB18030, BIG5, UTF-7,UTF-16,UTF-32)
- 彻底搞清楚字符编码: ASCII, ISO_8859, GB2312,UCS, Unicode, UTF8.(GBK, GB18030, BIG5, UTF-7,UTF-16,UTF-32) .
- 检测字节流的编码类型(GBK,UNICODE,GB18030,UTF8..)
- 编码UNICODE UTF8 GBK GB2312 GB18030
- GB2312, BIG5, UTF8, Unicode之间的互换
- unicode和utf8编码的互相转换
- ascii,ISO-8859-1,unicode, utf8,gb2312,big5,gbk,gb18030等几种常区别
- delphi 编码转换 unicode gbk big5
- Delphi 编码转换 Unicode gbk big5
- vc中GB2312,BIG5,Unicode编码转换
- delphi 编码转换 unicode gbk big5
- ListView使用总结1
- DOS下无法输入中文的解决方法
- Android MediaPlayer的生命周期
- 模型驱动架构(MDA,Model Driven Architecture)浅述
- 不怕神一样的对手,就怕猪一样队员
- 常用字库编码的转换(Unicode,Utf8,Big5,Gb18030)
- 写Java计算器小感
- 精益思想领袖Mary Poppendieck
- 链式堆栈的实现 (带头节点的链式堆栈)
- Java Out of Memory 分析
- ListView使用总结2
- python 之 yield
- HashCode 深入理解 关键是与collection的关系
- C# string byte数组转换解析