C++文件读写

来源：互联网发布：c语言中数据类型长度编辑：程序博客网时间：2024/05/24 02:46

最近遇到读取不同格式文件，由于C++不是原生支持unicode，读取起来比较麻烦。这篇博客详细介绍了不同编码格式的区别。

常见的类型有：ANSI，utf-8无bom格式，utf-8，ucs2 little end，ucs2 big end。其中utf8是unicode的一种最常见的实现方式。windows中的宽字符串支持：UCS-2 Little End。BOM: Byte Order Mark。UTF-8 BOM又叫UTF-8 签名,其实UTF-8 的BOM对UTF-8没有作用,是为了支援UTF-16,UTF-32才加上的BOM,BOM签名的意思就是告诉编辑器当前文件采用何种编码,方便编辑器识别,但是BOM虽然在编辑器中不显示,但是用cout直接输出的时候会产生一串乱码,在Notepad++中打开可以看到黑色的0xff等字节。因此读取带BOM的UTF-8文件时要跳过头三个字节（0xEF,0xBB,0xBF）。

windows下的wchar_t读取的就是UCS2 Little End，通常说的Unicode编码指的是UCS-2 little endian格式编码方式，即直接用两个字节存入字符的Unicode码。windows中C++有3种字符类型，char, wchar_t, TCHAR，最熟悉的char是单字节字符，适用于ANSI编码；wchar_t是双字节的宽字符类型，适用于unicode编码；TCHAR是一个宏，在ANSI环境下定义为char，unicode环境下定义为wchar_t。

文件格式:

通过文本文件的文件头(Bom)识别文件格式：

ANSI文件的文件头为空，不需要处理；
UNICODE文件的文件头为0xFF,0xFE共计两个字节，读取时需要偏移两个字节再行读取；
UTF-8文件的文件头为0xEF,0xBB,0xBF共计三个字节，读取时需要偏移三个字节后再行读取；

#include <string>
#include <iostream>
#include <fstream>
using namespace std;

// Text-file encodings that can be told apart by a byte-order mark (BOM).
enum FileType
{
    ansi = 0,  // no recognised BOM (this is also the result for BOM-less UTF-8)
    unicode,   // BOM FF FE: UTF-16 / UCS-2 little endian
    utf8       // BOM EF BB BF: UTF-8 with signature
};

// Detects the encoding of an open file from the bytes at the current read
// position.
//
// infile: an open input stream, expected to be positioned at the start of the
//         file and opened in binary mode so the BOM bytes arrive untranslated.
// Returns utf8 or unicode when the corresponding BOM is present, otherwise
// ansi. A stream that is not in a good state is left untouched and reported
// as ansi.
//
// On return the stream state is cleared and the read position is placed just
// after the BOM (or back where it started when there is no BOM), so the caller
// can read the content directly.
//
// Limitations: UTF-8 without a BOM cannot be recognised from its leading
// bytes, and a UTF-32 LE BOM (FF FE 00 00) also starts with FF FE, so it is
// reported as unicode.
FileType GetFileType(ifstream& infile)
{
    if (!infile.good()) {
        return ansi;
    }

    const streampos start = infile.tellg();

    // Zero-initialised, and only as many bytes as were actually read are
    // trusted: a file shorter than three bytes makes read() fail part-way.
    unsigned char head[3] = {0, 0, 0};
    infile.read(reinterpret_cast<char*>(head), sizeof head);
    const streamsize got = infile.gcount();

    FileType type = ansi;
    streamoff bomLength = 0;
    if (got >= 2 && head[0] == 0xff && head[1] == 0xfe) {
        type = unicode;
        bomLength = 2;
    } else if (got >= 3 && head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf) {
        type = utf8;
        bomLength = 3;
    }

    // A short file leaves eofbit/failbit set; clear them so the stream stays
    // usable, then skip exactly the BOM rather than a fixed three bytes.
    infile.clear();
    if (start != streampos(-1)) {
        infile.seekg(start + bomLength);
    }
    return type;
}

（注意：这种方法不能识别 utf-8 无bom格式。）

用C++输入输出流创建不同格式的文件:

如果直接创建一个普通文件则文件格式为ANSI。

如果创建二进制文件：可设置头3个字节来设置文件格式（如果不设置则为：utf-8无bom格式）。

//-----------------------------------------------//创建不同格式的文件//-----------------------------------------------void CreateFileTest(FileType type,const char* filename){ofstream out;out.open(filename,ios::binary);if (!out) {return;}char head[3];if (type == utf8) { //utf8有bomhead[0] = 0xef;head[1] = 0xbb;head[2] = 0xbf;out.write(head,sizeof(head));out<<"UTF-8"<<endl;} else if (type == unicode) { //ucs-2 little endhead[0] = 0xff;head[1] = 0xfe;head[2] = 0;cout<<(head[0] & 0xff)<<" "<<(head[1] & 0xff)<<" "<<(sizeof(head[0]) * 2)<<endl;out.write(head,sizeof(head[0]) * 2);wchar_t* buf = L"UNICODE字符";out<<buf<<endl;}out<<"ANSI OR UTF-8"<<endl; //utf-8无bomout.close();}

不同字符格式之间转换：

用windows.h中的WideCharToMultiByte和MultiByteToWideChar函数。使用CP_ACP代码页就实现了ANSI与Unicode之间的转换，使用CP_UTF8代码页就实现了UTF-8与Unicode之间的转换。

经试验发现，windows控制台，用cout只能输出ANSI字符，用wcout只能输出UCS2（unicode）字符。utf8暂时没找到直接输出到控制台的方法，只好通过二进制直接写入到文件保存为utf8无bom格式（也可自己加3字节头部，变成普通utf8格式）。

ANSI字符和utf-8均为multiBytes，unicode字符为宽字符串(windows下为ucs2 little end)。

//-----------------------------------------------//C++不同文件格式之间的转换//-----------------------------------------------#include <windows.h>#include <locale>//ANSI 转换成 unicodebool ANSIToUnicode(const char* inStr,wchar_t* outDest,int MaxLen){int len = 0;if (MaxLen <= 0) {return false;}len = ::MultiByteToWideChar(CP_ACP,0,inStr,-1,outDest,MaxLen);if (len < 0) {len = 0;}if (len < MaxLen) {outDest[len] = 0;} else {outDest[MaxLen - 1] = 0;return false;//溢出}return true;}//unicode转成utf-8无bom格式bool UnicodeToUtf8(const wchar_t* inDest,char* outStr,int MaxLen){int len = ::WideCharToMultiByte(CP_UTF8,0,inDest,-1,NULL,0,NULL,NULL);//获取长度if (len > MaxLen) {return false;}WideCharToMultiByte(CP_UTF8,0,inDest,-1,outStr,len,NULL,NULL);outStr[len - 1] = 0;return true;}//utf-8转换成unicodebool Utf8ToUnicode(const char* inStr,wchar_t* outDest,int MaxLen){int len = ::MultiByteToWideChar(CP_UTF8,0,inStr,-1,NULL,NULL);if (len > MaxLen) {return false;}::MultiByteToWideChar(CP_UTF8,0,inStr,-1,outDest,len);outDest[len - 1] = 0;return true;}bool UnicodeToANSI(const wchar_t* inStr,char* outDest,int MAxLen){int len = ::WideCharToMultiByte(CP_ACP,0,inStr,-1,NULL,0,NULL,NULL);if (len > MAxLen) {return false;}::WideCharToMultiByte(CP_ACP,0,inStr,-1,outDest,len,NULL,NULL);outDest[len - 1] = 0;return true;}void testANSIUNICODE(){//把ansi转换成unicodelocale::global(locale("chs"));wchar_t unicodebuf[1024];memset(unicodebuf,0,sizeof(unicodebuf));char* ansibuf = "这是一个字符串，任务是把字符串转换成不同格式";cout<<"[ansi]:"<<ansibuf<<endl;ANSIToUnicode(ansibuf,unicodebuf,1024);wcout<<L"[unicode]:"<<unicodebuf<<endl;//把unicode转换成utf8，ofstream fout;fout.open("UTF8TEST.txt",ios::binary);char utf8buf[1024];memset(utf8buf,0,sizeof utf8buf);UnicodeToUtf8(unicodebuf,utf8buf,1024);cout<<"[utf8]:"<<utf8buf<<endl; //控制台输出为乱码fout<<utf8buf<<endl; //文件输出正常fout.close();cout<<endl;//unicode转换成ansichar 
newansi[1024];memset(newansi,0,sizeof(newansi));UnicodeToANSI(unicodebuf,newansi,1024);cout<<"[依然ansi]："<<newansi<<endl;//utf8转换成unicodememset(unicodebuf,0,sizeof(unicodebuf));Utf8ToUnicode(utf8buf,unicodebuf,1024);wcout<<L"[依然unicode]："<<unicodebuf<<endl;}

C++读写UTF-8文件的方法：

http://henry8088.iteye.com/blog/780743