实现一个读取UTF-8文本文件的类

来源：互联网发布：淘宝盗图怎么电话投诉编辑：程序博客网时间：2024/04/28 21:30

参考：

http://zhidao.baidu.com/question/2580315.html?fr=qrl3

UTF8 == Unicode Transformation Format -- 8 bit
是 Unicode 的一种传送格式，即把 Unicode 码点转换成字节（BYTE）流进行传送。

UTF8流的转换程序：
Input: unsigned integer c - the code point of the character to be encoded（输入一个 Unicode 码点）
Output: byte b1, b2, b3, b4 - the encoded sequence of bytes（输出最多四个 BYTE 值，未用到的为 null）
Algorithm（算法）:
if (c<0x80)
b1 = c>>0 & 0x7F | 0x00
b2 = null
b3 = null
b4 = null
else if (c<0x0800)
b1 = c>>6 & 0x1F | 0xC0
b2 = c>>0 & 0x3F | 0x80
b3 = null
b4 = null
else if (c<0x010000)
b1 = c>>12 & 0x0F | 0xE0
b2 = c>>6 & 0x3F | 0x80
b3 = c>>0 & 0x3F | 0x80
b4 = null
else if (c<0x110000)
b1 = c>>18 & 0x07 | 0xF0
b2 = c>>12 & 0x3F | 0x80
b3 = c>>6 & 0x3F | 0x80
b4 = c>>0 & 0x3F | 0x80
end if
=====================
unicode 是一种编码表格，例如，给一个汉字规定一个代码。类似 GB2312-1980, GB18030等，只不过字集不同。
=====================
一个 Unicode 码点会被编码成 1、2、3 或 4 个字节的 UTF-8 序列，具体长度取决于码点的值。英文字符的码点小于 0x80，只需用 1 个字节的 UTF-8 传送，比直接传送 2 个字节的 Unicode（UTF-16）编码更省。
UTF8是为传送unicode而想出来的“再编码”方法罢了。
UTF-8 转回 Unicode 时，按上面给出的算法逆向计算即可。

====================================================================================

// UTF-8 文本文件读取类

// 该代码为试验目的创建，仅实现普通小规格文本文件读取及转换操作

// Staryy. 2007.11.18

#pragma once

#ifdef _WIN32
#include <io.h>
#endif

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

using namespace std;

#ifndef byte

#define byte unsigned char

#endif

/// Loads a small UTF-8 text file into memory, decodes it to a wide string and
/// hands the text out line by line or character by character.
///
/// The file must begin with the UTF-8 byte-order mark (EF BB BF). If the file
/// cannot be opened, or has no BOM, the reader holds no text and IsEnd() is
/// true from the start.
///
/// Decoding rules:
///  - 1- to 4-byte sequences are decoded; on platforms where wchar_t is
///    16 bits wide, code points above U+FFFF are stored as a surrogate pair.
///  - Malformed input (stray continuation byte, truncated or overlong
///    sequence, encoded surrogate, value above U+10FFFF) yields U+FFFD and
///    consumes exactly one byte, so decoding always makes progress.
///  - CR LF pairs are collapsed to a single LF so GetLine() can split on LF.
class UTF8Reader
{
public:
    /// Reads the whole file named by strFileName and decodes it.
    /// Failure to open the file is not reported; the reader is just empty.
    UTF8Reader(const std::wstring& strFileName)
        : m_Index(0)
    {
        std::FILE* fp = OpenForRead(strFileName);
        if (fp == NULL)
        {
            return;
        }

        // Read in fixed-size chunks until fread reports nothing more. This
        // avoids querying the file length up front and copes with short reads.
        unsigned char chunk[4096];
        std::size_t got = 0;
        while ((got = std::fread(chunk, 1, sizeof(chunk), fp)) > 0)
        {
            m_buffer.insert(m_buffer.end(), chunk, chunk + got);
        }
        std::fclose(fp);

        Convert2utf8();
    }

    /// Decodes the raw bytes held in the internal buffer into the wide string
    /// served by GetLine()/GetChar(). The read position is left untouched.
    ///
    /// @return false when the buffer does not start with the UTF-8 BOM (the
    ///         previously decoded text, if any, is then left as it was);
    ///         true otherwise.
    bool Convert2utf8()
    {
        const std::size_t kBomLen = 3;
        if (m_buffer.size() < kBomLen
            || m_buffer[0] != 0xEF || m_buffer[1] != 0xBB || m_buffer[2] != 0xBF)
        {
            return false;
        }

        // Skip the BOM; everything after it is payload.
        const unsigned char* p = &m_buffer[0] + kBomLen;
        const unsigned char* const pend = &m_buffer[0] + m_buffer.size();

        m_strUni.clear();
        m_strUni.reserve(m_buffer.size() - kBomLen);

        while (p < pend)
        {
            const unsigned long cp = DecodeOne(p, pend);

            // Drop the CR of a CR LF pair. p already points at the next
            // sequence, and a 0x0A byte at a sequence boundary is always LF.
            if (cp == 0x0D && p < pend && *p == 0x0A)
            {
                continue;
            }
            AppendCodePoint(cp);
        }
        return true;
    }

    /// Returns the text from the current position up to, but not including,
    /// the next LF, and moves the position past that LF. When no LF remains,
    /// returns the rest of the text (possibly empty) and moves to the end.
    std::wstring GetLine()
    {
        const std::wstring::size_type start = m_Index;
        const std::wstring::size_type eol = m_strUni.find(L'\n', start);
        if (eol == std::wstring::npos)
        {
            m_Index = m_strUni.size();
            return m_strUni.substr(start);
        }
        m_Index = eol + 1;
        return m_strUni.substr(start, eol - start);
    }

    /// Returns the character at the current position and advances by one,
    /// or returns 0 when the end of the text has been reached.
    wchar_t GetChar()
    {
        if (m_Index < m_strUni.size())
        {
            return m_strUni[m_Index++];
        }
        return 0;
    }

    /// Moves the read position back to the start of the text.
    void Restart()
    {
        m_Index = 0;
    }

    /// True when every character has been consumed (or nothing was loaded).
    bool IsEnd() const
    {
        return m_Index >= m_strUni.size();
    }

private:
    /// Opens strFileName for binary reading; returns NULL on failure.
    static std::FILE* OpenForRead(const std::wstring& strFileName)
    {
#if defined(_WIN32)
        return _wfopen(strFileName.c_str(), L"rb");
#else
        // There is no wide-character fopen here, so convert the path to the
        // current locale's multibyte encoding first. A path that cannot be
        // represented in that encoding is treated like a missing file.
        std::vector<char> narrow(strFileName.size() * MB_CUR_MAX + 1, '\0');
        const std::size_t written =
            std::wcstombs(&narrow[0], strFileName.c_str(), narrow.size() - 1);
        if (written == static_cast<std::size_t>(-1))
        {
            return NULL;
        }
        narrow[written] = '\0';
        return std::fopen(&narrow[0], "rb");
#endif
    }

    /// Decodes the UTF-8 sequence starting at p (requires p < pend) and
    /// advances p past it. Malformed input yields U+FFFD and advances p by
    /// exactly one byte.
    static unsigned long DecodeOne(const unsigned char*& p, const unsigned char* pend)
    {
        const unsigned long kReplacement = 0xFFFD;
        const unsigned char lead = *p;

        if (lead < 0x80)                    // 0xxxxxxx
        {
            ++p;
            return lead;
        }

        int extra = 0;                      // number of continuation bytes
        unsigned long cp = 0;               // payload bits gathered so far
        unsigned long minimum = 0;          // smallest value that needs this length
        if ((lead & 0xE0) == 0xC0)          // 110xxxxx 10xxxxxx
        {
            extra = 1; cp = lead & 0x1F; minimum = 0x80;
        }
        else if ((lead & 0xF0) == 0xE0)     // 1110xxxx 10xxxxxx 10xxxxxx
        {
            extra = 2; cp = lead & 0x0F; minimum = 0x800;
        }
        else if ((lead & 0xF8) == 0xF0)     // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        {
            extra = 3; cp = lead & 0x07; minimum = 0x10000;
        }
        else                                // stray continuation or invalid lead
        {
            ++p;
            return kReplacement;
        }

        if (pend - p <= extra)              // sequence runs past the buffer
        {
            ++p;
            return kReplacement;
        }
        for (int i = 1; i <= extra; ++i)
        {
            if ((p[i] & 0xC0) != 0x80)      // not a continuation byte
            {
                ++p;
                return kReplacement;
            }
            cp = (cp << 6) | (p[i] & 0x3F);
        }

        // Reject overlong forms, encoded surrogates and out-of-range values.
        if (cp < minimum || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF))
        {
            ++p;
            return kReplacement;
        }

        p += extra + 1;
        return cp;
    }

    /// Appends one code point to m_strUni, splitting it into a UTF-16
    /// surrogate pair when wchar_t is too narrow to hold it.
    void AppendCodePoint(unsigned long cp)
    {
        if (sizeof(wchar_t) == 2 && cp > 0xFFFF)
        {
            cp -= 0x10000;
            m_strUni.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
            m_strUni.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
        }
        else
        {
            m_strUni.push_back(static_cast<wchar_t>(cp));
        }
    }

    std::vector<unsigned char> m_buffer;   // raw file bytes, BOM included
    std::wstring m_strUni;                 // decoded text, line ends normalised to LF
    std::wstring::size_type m_Index;       // read position inside m_strUni
};