Conversion Between Unicode-UCS-4 And UTF-8 (Unicode-UCS-4 与 UTF-8 之间的转换)
来源:互联网 发布:sql 查询结果去掉重复 编辑:程序博客网 时间:2024/04/28 23:11
Note: part of this article is referenced from RFC2279 - UTF-8, a transformation format of ISO 10646.
I'm practising my C programming skills. I found that some friends had already written code for this, but it was not very readable, so I learned the rules for converting between Unicode and UTF-8 and wrote the two functions shown below.
Please focus on the functions fnUnicode2UTF8() and fnUTF82Unicode() in the source code. Don't be confused by the program-output part of the main() function :-)
/********** Pure C Codes for Converting Between UTF8 & Unicode **************
** Author: Peter Lee (peterlee.com.cn <at> gmail.com)
** Date: 2008-11-21
**
** Please keep this information while referencing the code below.
** Thanks so much!
**
** Welcome To Peter Lee's Blog Website:
** http://www.peterlee.com.cn
** http://blog.peterlee.com.cn
**
****************************************************************************/
#include <stdio.h>
#include <string.h>

#define MAX 6 /* Max length of a UTF-8 character */

/*
 * Unicode (UCS-4) <-> UTF-8 mapping used by both converters (RFC 2279):
 *
 * Unicode Range        : UTF-8
 * ---------------------:------------------------------------------------------
 * U00000000 - U0000007F: 0xxxxxxx
 * U00000080 - U000007FF: 110xxxxx 10xxxxxx
 * U00000800 - U0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * U00010000 - U001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * U00200000 - U03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * U04000000 - U7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 */

/*
 * Encode one code point as UTF-8.
 *
 * unicode: value to encode, 0x00000000 <= unicode <= 0x7FFFFFFF.
 * UTF8:    caller-supplied buffer of at least MAX+1 bytes. On return
 *          UTF8[0 .. len-1] holds the encoded bytes and UTF8[MAX] holds len
 *          (1..6). The buffer is NOT NUL-terminated.
 *
 * If unicode is outside the encodable range, UTF8[MAX] is set to 0 and no
 * other byte is written.
 */
void fnUnicode2UTF8(unsigned long unicode, char UTF8[])
{
    /* Lead-byte prefix for each sequence length (index = length). */
    static const unsigned char lead[MAX + 1] = {
        0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
    };
    int len, i;

    if (unicode <= 0x0000007FUL)
        len = 1;
    else if (unicode <= 0x000007FFUL)
        len = 2;
    else if (unicode <= 0x0000FFFFUL)
        len = 3;
    else if (unicode <= 0x001FFFFFUL)
        len = 4;
    else if (unicode <= 0x03FFFFFFUL)
        len = 5;
    else if (unicode <= 0x7FFFFFFFUL)
        len = 6;
    else {
        UTF8[MAX] = 0; /* not encodable: report an empty result */
        return;
    }

    /* Fill the continuation bytes from the end, six payload bits at a time. */
    for (i = len - 1; i > 0; --i) {
        UTF8[i] = (char)(unsigned char)(0x80 | (unicode & 0x3F));
        unicode >>= 6;
    }

    /* Whatever is left fits into the payload bits of the lead byte. */
    UTF8[0] = (char)(unsigned char)(lead[len] | unicode);
    UTF8[MAX] = (char)len;
}

/*
 * Decode one UTF-8 sequence starting at UTF8[0].
 *
 * UTF8: buffer of at least MAX+1 bytes holding the sequence. UTF8[MAX] is
 *       overwritten with the length of the decoded sequence (1..6), or with
 *       0 when the input is malformed.
 *
 * Returns the decoded code point. Returns 0 (with UTF8[MAX] == 0) when the
 * lead byte is not a valid UTF-8 lead byte or when one of the expected
 * continuation bytes does not have the form 10xxxxxx. Overlong encodings are
 * not rejected.
 */
unsigned long fnUTF82Unicode(char UTF8[])
{
    const unsigned char first = (unsigned char)UTF8[0];
    unsigned long unicode;
    int len, i;

    if (first < 0x80) {
        len = 1;
        unicode = first;
    } else if ((first & 0xE0) == 0xC0) {
        len = 2;
        unicode = first & 0x1F;
    } else if ((first & 0xF0) == 0xE0) {
        len = 3;
        unicode = first & 0x0F;
    } else if ((first & 0xF8) == 0xF0) {
        len = 4;
        unicode = first & 0x07;
    } else if ((first & 0xFC) == 0xF8) {
        len = 5;
        unicode = first & 0x03;
    } else if ((first & 0xFE) == 0xFC) {
        len = 6;
        unicode = first & 0x01;
    } else {
        /* Stray continuation byte (10xxxxxx) or 0xFE / 0xFF. */
        UTF8[MAX] = 0;
        return 0;
    }

    for (i = 1; i < len; ++i) {
        const unsigned char cont = (unsigned char)UTF8[i];

        if ((cont & 0xC0) != 0x80) {
            /* Truncated or corrupted sequence: stop before reading further. */
            UTF8[MAX] = 0;
            return 0;
        }
        unicode = (unicode << 6) | (unsigned long)(cont & 0x3F);
    }

    UTF8[MAX] = (char)len;
    return unicode;
}

/*
 * Lookup table indexed by (c - '0') for an upper-case hex digit c.
 * The seven empty entries cover the characters between '9' and 'A'.
 */
char Hex2Bin[23][5] = {
    "0000", "0001", "0010", "0011", "0100", "0101", "0110", "0111",
    "1000", "1001",
    "", "", "", "", "", "", "",
    "1010", "1011", "1100", "1101", "1110", "1111"
};

/*
 * Expand a hex string into its binary-digit string.
 *
 * hex: NUL-terminated string of hex digits (upper or lower case).
 * bin: output buffer of at least 4*strlen(hex)+1 bytes; always NUL-terminated.
 *
 * Characters that are not hex digits contribute nothing to the output
 * (instead of indexing outside the lookup table).
 */
void fnHex2Bin(char hex[], char bin[])
{
    size_t in, out = 0;

    for (in = 0; hex[in] != '\0'; ++in) {
        int c = (unsigned char)hex[in];
        const char *bits;
        size_t n;

        if (c >= 'a' && c <= 'f')
            c -= 'a' - 'A'; /* fold to upper case so the table applies */
        if (c < '0' || c > 'F')
            continue;       /* outside the table: ignore */

        bits = Hex2Bin[c - '0'];
        n = strlen(bits);
        memcpy(bin + out, bits, n);
        out += n;
    }
    bin[out] = '\0';
}

/*
 * Demo driver. Define UTF8_CONV_NO_MAIN when compiling to leave it out, e.g.
 * to link the converters above into another program or a test harness.
 */
#ifndef UTF8_CONV_NO_MAIN
/* Hex digits needed for the widest value printed: 2*MAX for a UTF-8
   sequence, or 16 for a 64-bit unsigned long. */
#define HEX_DIGITS 16

int main(int argc, char* argv[])
{
    int i, len;
    char UTF8[MAX+1], bin[4*HEX_DIGITS+1], hex[HEX_DIGITS+1];
    unsigned long unicode = 0x4F60; /* Testing for the Han (Chinese) character: Unicode: 4F60; "ni3", means "you". */

    (void)argc;
    (void)argv;

    /* Unicode 2 UTF8 */
    printf ( "Unicode 2 UTF8:\n" );
    sprintf ( hex, "%lX", unicode );
    fnHex2Bin ( hex, bin );
    printf ( "Unicode Hex: %s\n", hex );
    printf ( "Unicode Bin: %s\n", bin );

    fnUnicode2UTF8 ( unicode, UTF8 );
    len = UTF8[MAX];

    /* Make the hex string for UTF8[] */
    for ( i = 0; i < len; ++i )
        sprintf ( hex+2*i, "%02X", (unsigned int)(unsigned char)(UTF8[i]) );
    hex[2*len] = 0;
    fnHex2Bin ( hex, bin );
    printf ( " UTF Hex: %*s\n", len*2, hex );
    printf ( " UTF Bin: %s\n", bin );
    /* Unicode 2 UTF8 */

    printf ( "\n" );

    /* UTF8 2 Unicode */
    printf ( "UTF8 2 Unicode:\n" );
    printf ( " UTF Hex: %*s\n", len*2, hex );
    printf ( " UTF Bin: %s\n", bin );
    unicode = fnUTF82Unicode ( UTF8 );
    sprintf ( hex, "%lX", unicode );
    fnHex2Bin ( hex, bin );
    printf ( "Unicode Hex: %s\n", hex );
    printf ( "Unicode Bin: %s\n", bin );
    /* UTF8 2 Unicode */

    return 0;
}
#endif /* UTF8_CONV_NO_MAIN */

/* Output:
Unicode 2 UTF8:
Unicode Hex: 4F60
Unicode Bin: 0100111101100000
 UTF Hex: E4BDA0
 UTF Bin: 111001001011110110100000

UTF8 2 Unicode:
 UTF Hex: E4BDA0
 UTF Bin: 111001001011110110100000
Unicode Hex: 4F60
Unicode Bin: 0100111101100000
*/
- Conversion Between Unicode-UCS-4 And UTF-8 (Unicode-UCS-4 与 UTF-8 之间的转换)
- unicode/UCS与utf-8
- Linux string conversion from UTF-8 to UNICODE, UCS-4LE, UCS-4LE
- Unicode、UCS与UTF
- UCS,UNICODE和UTF-8
- UCS-UNICODE-UTF-8编码
- unicode、ucs-2、ucs-4、utf-16、utf-32、utf-8
- unicode、ucs-2、ucs-4、utf-16、utf-32、utf-8介绍
- Unicode详解(UCS-2,UCS-4,UTF-8,UTF-16,UTF-32)
- 【编码】unicode、ucs-2、ucs-4、utf-16、utf-32、utf-8 介绍
- 从ASCII、ISO-8859、GB2312、GBK到Unicode的UCS-2、UCS-4、UTF-8、UTF-16、UTF-32
- Unicode、UCS、UTF-8、UTF-16、UTF-32 的关系
- ASCII GBK UNICODE UCS-2 UCS-4 UTF-8 UTF-16 UTF-32 一次说个明白
- Unicode详解(UTF-8,UTF16,UCS
- unicode 详解 (UTF-8, UCS-2)
- Unicode详解(UTF-8,UTF16,UCS)
- Unicode详解(UTF-8,UTF16,UCS
- Unicode详解(UTF-8,UTF16,UCS)
- Oracle Sql优化笔记
- 如何调整对话框的字体大小
- Linux kmalloc
- 闲来无事
- How SKBs work
- Conversion Between Unicode-UCS-4 And UTF-8 (Unicode-UCS-4 与 UTF-8 之间的转换)
- Flex编程学习基础
- STL 神奇的__type_traits (转
- iPad界面设计基本规范(建议阅读)
- 用VB释放文件,最精简的代码
- Mysql内存表的用处
- ModelMaker Code Explorer v9.0.0.1943
- JavaMail学习笔记1——JavaMail基础知识
- IP*Works! v8.1.3933 Delphi Edition