工具类库系列(一)-StringTool

来源：互联网发布：让wow.js兼容ie8 编辑：程序博客网时间：2024/06/08 17:20

好久没写了，最近闲了下来，把这段时间的代码整理整理，将常用的代码按功能整理成一些静态库，便于以后复用

今天是第一个库：libtool，其实就是一些工具类的集合

第一个工具类：StringTool

std::string已经很强大了，但是在实际项目中，还是会遇到一些需求，需要用string提供的接口组装一些字符串相关的功能，整理如下

（PS，本系列所有代码，均不同程度的需要boost库支持，本人使用的是boost1.55.0）

1：将一个字符串的首字母大写

这个功能很简单，就是如果第一个字符在 'a' - 'z' 范围内，则换成对应的 'A' - 'Z'

2：去除字符串中的所有空白字符：空格，'\t'，'\r'，'\n'

这里就是遍历字符串，将相应的字符过滤掉

主要用于读文件之后的初步处理

3：将字符串中所有的字符串A替换成字符串B

这里用到string的find，查找字符串A的位置

仅遍历一遍，已处理过的部分不再处理，即如果字符串“1222”，需要将 “12”替换成“1”，则结果是“122”，不是“1”

该函数目前大部分情况还是用在文件路径中的windows下的"\\"替换成windows/linux兼容的“/”

补充增加正则表达式替换：

这里用到了一个boost库提供的功能：boost::regex正则表达式regex_replace

4：将字符串按指定分隔符进行分割，分隔符可以是一个字符串

同上，分割后的结果存成一个std::vector返回

5：判断字符串是否是一个合法的数字：int，uint，float

这里用到了一个boost库提供的功能：boost::regex正则表达式regex_match

6：判断字符串是否是合法的Utf8编码

也是遍历，按Utf8的编码规则校验

7：Unicode和Utf8互转

这里仅处理了UCS-2，也是按Utf8的编码规则去压缩/解压缩

8：多字节字符串与宽字符串互转

为了windows和linux的兼容，这里用的是mbstowcs/wcstombs

9：Gbk和Utf8互转

有了7和8，这个就是上两项功能的组合 Gbk <-> Unicode <-> Utf8

2017/1/9 修正Utf8ToGbk/GbkToUtf8分别在windows/linux下不同的locale设置

最后上代码

StringTool.h

#ifndef COMMON_TOOL_STRINGTOOL_H
#define COMMON_TOOL_STRINGTOOL_H

#include <cstddef>
#include <string>
#include <vector>

namespace common
{
namespace tool
{

/// Collection of static string helpers: case/whitespace handling, replacing,
/// splitting, numeric validation and GBK / UTF-8 / wide-character conversion.
class StringTool
{
public:
    /// Returns a copy of str with its first character upper-cased.
    static std::string UpcaseFirstChar(const std::string& str);

    /// Returns a copy of str with every ' ', '\t', '\r' and '\n' removed.
    static std::string TrimAll(const std::string& str);

    /// Returns a copy of str in which every occurrence of src is replaced by des.
    static std::string ReplaceAll(const std::string& str, const std::string& src, const std::string& des);

    /// Returns a copy of str in which every match of the regular expression src
    /// is replaced by des.
    static std::string ReplaceReg(const std::string& str, const std::string& src, const std::string& des);

    /// Splits str on the separator split and appends the pieces to values.
    /// The numeric overloads return false as soon as a piece is not a valid
    /// number of the requested kind.
    static bool SplitStr2List(const std::string& str, const std::string& split, std::vector<std::string>& values);
    static bool SplitStr2List(const std::string& str, const std::string& split, std::vector<unsigned int>& values);
    static bool SplitStr2List(const std::string& str, const std::string& split, std::vector<int>& values);
    static bool SplitStr2List(const std::string& str, const std::string& split, std::vector<float>& values);

    /// Return whether str is a valid number of the given kind.
    static bool IsUInt(const std::string& str);
    static bool IsInt(const std::string& str);
    static bool IsFloat(const std::string& str);

    /// Returns whether str is validly encoded UTF-8.
    /// Note: a string made only of ASCII bytes is reported as "not UTF-8".
    static bool IsUtf8(const std::string& str);

    /// GBK <-> UTF-8 conversion; the converted string is returned.
    static std::string GbkToUtf8(const std::string& gbk);
    static std::string Utf8ToGbk(const std::string& utf8);

    /// Unicode (wide string) <-> UTF-8 conversion; the result is written to the
    /// first argument.
    static void UnicodeToUtf8(std::string& utf8, const std::wstring& unicode);
    static void Utf8ToUnicode(std::wstring& unicode, const std::string& utf8);

    /// Multibyte <-> wide string conversion. `language` is the locale name that
    /// is passed to setlocale / _wsetlocale for the duration of the conversion.
#ifdef UNICODE
    static std::wstring MbStrToWcStr(const std::string& mbs, const wchar_t* language);
    static std::string WcStrToMbStr(const std::wstring& wcs, const wchar_t* language);
#else
    static std::wstring MbStrToWcStr(const std::string& mbs, const char* language);
    static std::string WcStrToMbStr(const std::wstring& wcs, const char* language);
#endif

private:
    /// Unicode <-> UTF-8 conversion: number of output units the conversion needs.
    static size_t UnicodeToUtf8Length(const std::wstring& unicode);
    static size_t Utf8ToUnicodeLength(const std::string& utf8);
};

} // namespace tool
} // namespace common

#endif // COMMON_TOOL_STRINGTOOL_H

StringTool.cpp

#include "StringTool.h"

#include <clocale>
#include <cstdlib>
#include <cwchar>
#include <string>
#include <vector>

#include <boost/regex.hpp>

namespace common
{
namespace tool
{

namespace
{

// Locale name used for the GBK side of GbkToUtf8 / Utf8ToGbk.
// _WIN32 is checked as well as WIN32 because the compiler always defines the
// former on Windows, while the latter depends on project settings.
#if defined(_WIN32) || defined(WIN32)
#  ifdef UNICODE
const wchar_t* const kGbkLocale = L"chs";
#  else
const char* const kGbkLocale = "chs";
#  endif
#else
#  ifdef UNICODE
const wchar_t* const kGbkLocale = L"zh_CN.GB18030";
#  else
const char* const kGbkLocale = "zh_CN.GB18030";
#  endif
#endif

#ifdef UNICODE
typedef wchar_t LocaleChar;
typedef std::wstring LocaleString;
inline const LocaleChar* SetLocaleAll(const LocaleChar* name) { return _wsetlocale(LC_ALL, name); }
#else
typedef char LocaleChar;
typedef std::string LocaleString;
inline const LocaleChar* SetLocaleAll(const LocaleChar* name) { return std::setlocale(LC_ALL, name); }
#endif

// Switches LC_ALL to `name` for the lifetime of the object and puts the
// previous locale back in the destructor, including when an exception unwinds.
// NOTE: the locale is process-wide state; callers on other threads are affected
// while an instance is alive.
class ScopedLocale
{
public:
    explicit ScopedLocale(const LocaleChar* name) : hasSaved_(false)
    {
        const LocaleChar* current = SetLocaleAll(NULL);
        if (NULL != current)
        {
            // Copy immediately: the returned buffer may be overwritten by the next call.
            saved_ = current;
            hasSaved_ = true;
        }
        // Best effort, as before: if the requested locale is unavailable the
        // current one stays active and the conversion runs with it.
        SetLocaleAll(name);
    }

    ~ScopedLocale()
    {
        if (hasSaved_)
        {
            SetLocaleAll(saved_.c_str());
        }
    }

private:
    ScopedLocale(const ScopedLocale&);
    ScopedLocale& operator=(const ScopedLocale&);

    LocaleString saved_;
    bool hasSaved_;
};

// Splits str on every occurrence of split and appends the pieces to out.
// An empty separator cannot be searched for (find("") matches at every
// position and would never advance), so the whole string becomes one piece.
void SplitPieces(const std::string& str, const std::string& split, std::vector<std::string>& out)
{
    if (split.empty())
    {
        out.push_back(str);
        return;
    }

    size_t begin = 0;
    for (size_t hit = str.find(split); hit != std::string::npos; hit = str.find(split, begin))
    {
        out.push_back(str.substr(begin, hit - begin));
        begin = hit + split.length();
    }
    out.push_back(str.substr(begin));
}

unsigned int ToUInt(const std::string& text)
{
    // strtoul instead of atoi: atoi is undefined for values above INT_MAX.
    return static_cast<unsigned int>(std::strtoul(text.c_str(), NULL, 10));
}

int ToInt(const std::string& text)
{
    return static_cast<int>(std::strtol(text.c_str(), NULL, 10));
}

float ToFloat(const std::string& text)
{
    return static_cast<float>(std::atof(text.c_str()));
}

// Shared body of the numeric SplitStr2List overloads.
// The input is split first and each piece is whitespace-stripped afterwards,
// so separators that themselves contain whitespace (", ") still match.
// Values converted before the first invalid piece stay in `values`.
template <typename T>
bool SplitNumbers(const std::string& str, const std::string& split,
                  bool (*isValid)(const std::string&), T (*convert)(const std::string&),
                  std::vector<T>& values)
{
    std::vector<std::string> pieces;
    SplitPieces(str, split, pieces);

    for (size_t i = 0; i < pieces.size(); ++i)
    {
        const std::string token = StringTool::TrimAll(pieces[i]);
        if (!isValid(token))
        {
            return false;
        }
        values.push_back(convert(token));
    }
    return true;
}

} // namespace

// Returns a copy of str whose first character is upper-cased when it is in
// 'a'..'z'. An empty string is returned unchanged.
std::string StringTool::UpcaseFirstChar(const std::string& str)
{
    std::string result = str;
    if (!result.empty() && 'a' <= result[0] && result[0] <= 'z')
    {
        result[0] = static_cast<char>(result[0] - ('a' - 'A'));
    }
    return result;
}

// Returns a copy of str with every ' ', '\t', '\r' and '\n' removed
// (not only leading/trailing ones).
std::string StringTool::TrimAll(const std::string& str)
{
    std::string result;
    result.reserve(str.size());
    for (size_t i = 0; i < str.length(); ++i)
    {
        const char ch = str[i];
        if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n')
        {
            result += ch;
        }
    }
    return result;
}

// Replaces every occurrence of src in str by des in a single left-to-right
// pass; text produced by a replacement is not scanned again.
// An empty src matches nothing, so str is returned unchanged.
std::string StringTool::ReplaceAll(const std::string& str, const std::string& src, const std::string& des)
{
    if (src.empty())
    {
        return str;
    }

    std::string result;
    result.reserve(str.size());

    size_t begin = 0;
    for (size_t hit = str.find(src); hit != std::string::npos; hit = str.find(src, begin))
    {
        result.append(str, begin, hit - begin);
        result += des;
        begin = hit + src.length();
    }
    result.append(str, begin, std::string::npos);
    return result;
}

// Replaces every match of the regular expression src in str by des using
// boost::regex_replace. NOTE(review): an ill-formed pattern presumably makes
// the boost::regex constructor throw; that is not handled here - confirm
// against the Boost.Regex documentation.
std::string StringTool::ReplaceReg(const std::string& str, const std::string& src, const std::string& des)
{
    const boost::regex pattern(src);
    return boost::regex_replace(str, pattern, des);
}

// Splits str on split and appends the pieces (possibly empty) to values.
// Always returns true. With an empty separator the whole string is one piece.
bool StringTool::SplitStr2List(const std::string& str, const std::string& split, std::vector<std::string>& values)
{
    SplitPieces(str, split, values);
    return true;
}

// Splits str on split, strips whitespace from each piece and appends it as
// unsigned int. Returns false at the first piece IsUInt rejects (this includes
// empty pieces); values converted before that point remain in `values`.
bool StringTool::SplitStr2List(const std::string& str, const std::string& split, std::vector<unsigned int>& values)
{
    return SplitNumbers<unsigned int>(str, split, &StringTool::IsUInt, &ToUInt, values);
}

// Same as the unsigned overload, validating with IsInt.
bool StringTool::SplitStr2List(const std::string& str, const std::string& split, std::vector<int>& values)
{
    return SplitNumbers<int>(str, split, &StringTool::IsInt, &ToInt, values);
}

// Same as the unsigned overload, validating with IsFloat.
bool StringTool::SplitStr2List(const std::string& str, const std::string& split, std::vector<float>& values)
{
    return SplitNumbers<float>(str, split, &StringTool::IsFloat, &ToFloat, values);
}

// True when str consists of one or more decimal digits.
bool StringTool::IsUInt(const std::string& str)
{
    const boost::regex pattern("[0-9]+");
    return boost::regex_match(str, pattern);
}

// True when str is an optional '-' followed by one or more decimal digits.
bool StringTool::IsInt(const std::string& str)
{
    const boost::regex pattern("-?[0-9]+");
    return boost::regex_match(str, pattern);
}

// True when str is an optional '-' followed by digits with an optional
// fractional part ("1", "1.", "1.5", ".5"). At least one digit is required.
bool StringTool::IsFloat(const std::string& str)
{
    const boost::regex pattern("-?(?:[0-9]+\\.?[0-9]*|\\.[0-9]+)");
    return boost::regex_match(str, pattern);
}

// Checks that str follows the UTF-8 lead-byte / continuation-byte structure
// (legacy 5- and 6-byte forms are accepted). Returns false when the string
// contains only ASCII bytes, i.e. "UTF-8" here means "has multi-byte content".
bool StringTool::IsUtf8(const std::string& str)
{
    unsigned int pending = 0; // continuation bytes still expected
    bool allAscii = true;

    for (size_t i = 0; i < str.length(); ++i)
    {
        const unsigned char ch = static_cast<unsigned char>(str[i]);
        if (ch >= 0x80)
        {
            allAscii = false;
        }

        if (pending > 0)
        {
            // Non-leading bytes of a sequence must look like 10xxxxxx.
            if ((ch & 0xC0) != 0x80)
            {
                return false;
            }
            --pending;
            continue;
        }

        if (ch < 0x80)
        {
            continue;
        }

        if (ch >= 0xFE)
        {
            return false; // 0xFE / 0xFF never start a sequence
        }
        else if (ch >= 0xFC)
        {
            pending = 5;
        }
        else if (ch >= 0xF8)
        {
            pending = 4;
        }
        else if (ch >= 0xF0)
        {
            pending = 3;
        }
        else if (ch >= 0xE0)
        {
            pending = 2;
        }
        else if (ch >= 0xC0)
        {
            pending = 1;
        }
        else
        {
            return false; // stray continuation byte
        }
    }

    if (pending > 0)
    {
        return false; // input ended in the middle of a sequence
    }
    return !allAscii;
}

// Converts GBK text to UTF-8 by going through a wide string.
std::string StringTool::GbkToUtf8(const std::string& gbk)
{
    std::string utf8;
    UnicodeToUtf8(utf8, MbStrToWcStr(gbk, kGbkLocale));
    return utf8;
}

// Converts UTF-8 text to GBK by going through a wide string.
std::string StringTool::Utf8ToGbk(const std::string& utf8)
{
    std::wstring unicode;
    Utf8ToUnicode(unicode, utf8);
    return WcStrToMbStr(unicode, kGbkLocale);
}

// Encodes `unicode` as UTF-8 into `utf8` (replacing its previous content).
// Only the low 16 bits of each wide character are used (UCS-2), so every
// character becomes 1 to 3 bytes. The result has no embedded terminator.
void StringTool::UnicodeToUtf8(std::string& utf8, const std::wstring& unicode)
{
    std::string encoded;
    encoded.reserve(UnicodeToUtf8Length(unicode));

    for (size_t i = 0; i < unicode.length(); ++i)
    {
        const unsigned short ch = static_cast<unsigned short>(unicode[i]);
        if (ch < 0x0080)
        {
            // 0x0000 - 0x007f => 0XXX XXXX
            encoded += static_cast<char>(ch);
        }
        else if (ch <= 0x07FF)
        {
            // 0x0080 - 0x07ff => 110X XXXX, 10XX XXXX
            encoded += static_cast<char>(0xC0 | ((ch >> 6) & 0x1F));
            encoded += static_cast<char>(0x80 | (ch & 0x3F));
        }
        else
        {
            // 0x0800 - 0xffff => 1110 XXXX, 10XX XXXX, 10XX XXXX
            encoded += static_cast<char>(0xE0 | ((ch >> 12) & 0x0F));
            encoded += static_cast<char>(0x80 | ((ch >> 6) & 0x3F));
            encoded += static_cast<char>(0x80 | (ch & 0x3F));
        }
    }

    utf8.swap(encoded);
}

// Decodes UTF-8 into `unicode` (replacing its previous content). Only 1- to
// 3-byte sequences (UCS-2) are supported; if the input contains a 4+-byte
// sequence, a stray continuation byte or a truncated sequence, the output is
// left empty.
void StringTool::Utf8ToUnicode(std::wstring& unicode, const std::string& utf8)
{
    std::wstring decoded;

    // Utf8ToUnicodeLength returns 0 for unsupported / malformed input, and a
    // non-zero result guarantees every sequence below is complete.
    const size_t expected = Utf8ToUnicodeLength(utf8);
    if (expected > 0)
    {
        decoded.reserve(expected);

        size_t pos = 0;
        const size_t len = utf8.length();
        while (pos < len)
        {
            const unsigned int c0 = static_cast<unsigned char>(utf8[pos]);
            if (c0 < 0x80)
            {
                // 0XXX XXXX
                decoded += static_cast<wchar_t>(c0);
                pos += 1;
            }
            else if (c0 < 0xE0)
            {
                // 110X XXXX, 10XX XXXX
                const unsigned int c1 = static_cast<unsigned char>(utf8[pos + 1]);
                decoded += static_cast<wchar_t>(((c0 & 0x1F) << 6) | (c1 & 0x3F));
                pos += 2;
            }
            else
            {
                // 1110 XXXX, 10XX XXXX, 10XX XXXX
                const unsigned int c1 = static_cast<unsigned char>(utf8[pos + 1]);
                const unsigned int c2 = static_cast<unsigned char>(utf8[pos + 2]);
                decoded += static_cast<wchar_t>(((c0 & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F));
                pos += 3;
            }
        }
    }

    unicode.swap(decoded);
}

// Number of UTF-8 bytes UnicodeToUtf8 produces for `unicode`.
size_t StringTool::UnicodeToUtf8Length(const std::wstring& unicode)
{
    size_t bytes = 0;
    for (size_t i = 0; i < unicode.length(); ++i)
    {
        const unsigned short ch = static_cast<unsigned short>(unicode[i]);
        if (ch < 0x0080)
        {
            bytes += 1;
        }
        else if (ch <= 0x07FF)
        {
            bytes += 2;
        }
        else
        {
            bytes += 3;
        }
    }
    return bytes;
}

// Number of wide characters Utf8ToUnicode produces for `utf8`, or 0 when the
// input cannot be converted (4+-byte lead, stray continuation byte as lead,
// or a sequence cut off by the end of the string).
size_t StringTool::Utf8ToUnicodeLength(const std::string& utf8)
{
    size_t wchars = 0;
    const size_t len = utf8.length();
    size_t pos = 0;

    while (pos < len)
    {
        const unsigned char lead = static_cast<unsigned char>(utf8[pos]);
        size_t step = 0;
        if (lead < 0x80)
        {
            step = 1; // 0XXX XXXX
        }
        else if (lead < 0xC0)
        {
            return 0; // 10XX XXXX is never a lead byte
        }
        else if (lead < 0xE0)
        {
            step = 2; // 110X XXXX
        }
        else if (lead < 0xF0)
        {
            step = 3; // 1110 XXXX
        }
        else
        {
            return 0; // needs UCS-4, not handled
        }

        if (len - pos < step)
        {
            return 0; // truncated sequence
        }
        pos += step;
        wchars += 1;
    }
    return wchars;
}

// Converts a multibyte string to a wide string with mbstowcs while LC_ALL is
// temporarily set to `language`. Returns an empty string when the input is
// empty or cannot be converted.
#ifdef UNICODE
std::wstring StringTool::MbStrToWcStr(const std::string& mbs, const wchar_t* language)
#else
std::wstring StringTool::MbStrToWcStr(const std::string& mbs, const char* language)
#endif
{
    const ScopedLocale scope(language);

    // mbstowcs reports failure as (size_t)-1; keep it in size_t so the check is exact.
    const size_t needed = std::mbstowcs(NULL, mbs.c_str(), 0);
    if (needed == static_cast<size_t>(-1) || needed == 0)
    {
        return std::wstring();
    }

    std::vector<wchar_t> buffer(needed + 1, L'\0');
    const size_t written = std::mbstowcs(&buffer[0], mbs.c_str(), buffer.size());
    if (written == static_cast<size_t>(-1))
    {
        return std::wstring();
    }
    return std::wstring(&buffer[0], written);
}

// Converts a wide string to a multibyte string with wcstombs while LC_ALL is
// temporarily set to `language`. Returns an empty string when the input is
// empty or cannot be converted.
#ifdef UNICODE
std::string StringTool::WcStrToMbStr(const std::wstring& wcs, const wchar_t* language)
#else
std::string StringTool::WcStrToMbStr(const std::wstring& wcs, const char* language)
#endif
{
    const ScopedLocale scope(language);

    const size_t needed = std::wcstombs(NULL, wcs.c_str(), 0);
    if (needed == static_cast<size_t>(-1) || needed == 0)
    {
        return std::string();
    }

    std::vector<char> buffer(needed + 1, '\0');
    const size_t written = std::wcstombs(&buffer[0], wcs.c_str(), buffer.size());
    if (written == static_cast<size_t>(-1))
    {
        return std::string();
    }
    return std::string(&buffer[0], written);
}

} // namespace tool
} // namespace common

1 0