centos下ICU4C字符集检测和转换,C++版本

来源:互联网 发布:java 如何加密 编辑:程序博客网 时间:2024/05/22 00:39

1.ICU4C简介

ICU4C是ICU在C/C++平台下的版本。ICU(International Components for Unicode)是基于“IBM公共许可证”的、与开源组织合作研究的、用于支持软件国际化的开源项目。ICU4C提供了C/C++平台强大的国际化开发能力,软件开发者几乎可以使用ICU4C解决任何国际化的问题:根据各地的风俗和语言习惯,实现对数字、货币、时间、日期和消息的格式化、解析,对字符串进行大小写转换、整理、搜索和排序等功能。值得一提的是,ICU4C提供了强大的BIDI算法,对阿拉伯语等BIDI语言提供了完善的支持。

2.安装

2.1 在http://www.icu-project.org/download/4.2.html下载ICU4C库,我下载的是icu4c-49_1_2-src.tgz。
2.2 执行如下命令,安装成功:

tar -zxvf icu4c-49_1_2-src.tgz
cd icu/source
./configure
make
make install

3.代码

3.1 myicu.h

#ifndef _MYICU_H_#define _MYICU_H_#include "unicode/utypes.h"#include "unicode/ucsdet.h"#include "unicode/ucnv.h"#include <iostream>#include <fstream>#include <cstdlib>#include <cstring>#include <cstdio>using namespace std;#define BUF_MAX     4096 class MyIcu{public:    MyIcu(const char* filename);    bool detectTextEncoding();    bool convertoUtf8();    int convert(const char *toConverterName, const char *fromConverterName,              char *target, int32_t targetCapacity, const char *source, int32_t sourceLength);    ~MyIcu();private:    const char* m_filename;    FILE* file;    char* detected;};#endif //_MYICU_H_

3.2 myicu.cpp

#include "myicu.h"const int BUFFSIZE=8192;MyIcu::MyIcu(const char* filename):m_filename(filename){}MyIcu::~MyIcu(){    fclose(file);    delete [] detected;  }bool MyIcu::detectTextEncoding(){    UCharsetDetector* csd;    const UCharsetMatch **csm;    UErrorCode status = U_ZERO_ERROR;    char buffer[BUFFSIZE];    int inputLength,match, matchCount = 0;    file = fopen(m_filename, "rb");    if (file == NULL) {        cout<<"open file error"<<endl;        return 0;    }    inputLength = (int32_t) fread(buffer, 1, BUFFSIZE, file);    csd = ucsdet_open(&status);    ucsdet_setText(csd, buffer,inputLength, &status);    csm = ucsdet_detectAll(csd, &matchCount, &status);    if(csm == NULL){        ucsdet_close(csd);        return 0;    }    detected = new char[128];#if 0    for(match = 0; match < matchCount; match += 1) {        const char *name = ucsdet_getName(csm[match], &status);                 const char *lang = ucsdet_getLanguage(csm[match], &status);        int32_t confidence = ucsdet_getConfidence(csm[match], &status);        if (lang == NULL || strlen(lang) == 0) {            lang = "**";        }        cout<<name <<"("<<lang<<")"<<confidence<<endl;    }#endif        if(matchCount > 0)          {              detected = strdup(ucsdet_getName(csm[0], &status)); //分配了内存, 需要释放              if(status != U_ZERO_ERROR)              return false;          }          cout<<"charset = "<<detected<<endl;        ucsdet_close(csd);        return 1;}bool MyIcu::convertoUtf8(){     file = fopen(m_filename, "rb");      if(file == NULL)       {          cout<<"open file error"<<endl;        return 0;      }         int len = 0;      //char *detected;      char *buffer = new char[BUF_MAX];      char *target = new char[BUF_MAX * 2];      while(true)      {          memset(buffer, 0, BUF_MAX);          memset(target, 0, BUF_MAX * 2);          len = (int32_t)fread(buffer, sizeof(char), BUF_MAX, file);          if(detected == NULL)          {              if(!detectTextEncoding()) 
//编码探测                  break;          }          //转换为utf8字符编码          if(convert("UTF-8", detected, target, BUF_MAX * 2, (const char*)buffer, len) != U_ZERO_ERROR)          {              cout<<"ucnv_convert error"<<endl;            break;          }          cout<<target<<endl;//打印出转换的文件的字符串          if(len < BUF_MAX)              break;      }      delete [] buffer;      delete [] target;      return 1;}int MyIcu::convert(const char *toConverterName, const char *fromConverterName,              char *target, int32_t targetCapacity, const char *source, int32_t sourceLength){                 UErrorCode error = U_ZERO_ERROR;                  ucnv_convert(toConverterName, fromConverterName, target, targetCapacity,                    source, sourceLength, &error);                    return error;}

3.3 main.cpp

#include "myicu.h"
#include <string>
#include <cstdio>
#define BUF_MAX     4096

/**
 * Detects the encoding of a text file and prints its contents as UTF-8.
 *
 * Usage: target [file]
 * The file defaults to "123.txt" when no argument is given.
 *
 * @return 0 on success, 1 if detection or conversion failed.
 */
int main(int argc, char* argv[])
{
    const char* filename = (argc > 1) ? argv[1] : "123.txt";
    MyIcu myicu(filename);

    // Conversion needs the detected encoding, so stop here if detection failed.
    if (!myicu.detectTextEncoding()) {
        std::cout<<"解析错误!"<<std::endl;
        return 1;
    }

    if (!myicu.convertoUtf8()) {
        std::cout<<"转换错误!"<<std::endl;
        return 1;
    }
    return 0;
}

4.编译

g++ -o target main.cpp myicu.cpp -licuuc -licui18n

如果找不到icuuc和icui18n动态库的话,执行如下命令:

vim /etc/ld.so.conf

将/usr/local/lib目录加进去(ICU默认把动态库安装到/usr/local/lib),然后再

ldconfig

就行了。

你们可以试下自己准备的文件。

参考文档:
http://icu-project.org/apiref/icu4c/index.html