使用boost库拆分字符串

来源：互联网 | 发布：途风旅游怎么样知乎 | 编辑：程序博客网 | 时间：2024/06/05 05:41

在日常开发中，我们经常会遇到分割字符串的需求。boost库为此提供了一个方便的分词器——boost::tokenizer。下面就让我们学习一下它的用法。

#include <tchar.h>

#include <cstdio>
#include <iostream>
#include <string>
#include <vector>

#include <boost/algorithm/string.hpp>
#include <boost/format.hpp>
#include <boost/tokenizer.hpp>
int _tmain(int argc, _TCHAR* argv[])
{
// 待分割的字符串
std::string strTag = _T("I Come from China");
// 定义分割方式为英文逗号，中文逗号和空格，构造一个分词器，
boost::char_separator<char> sep(" ,，");
typedef boost::tokenizer<boost::char_separator<char> >
CustonTokenizer;
CustonTokenizer tok(strTag,sep);
// 输出分割结果
std::vector<std::string> vecSegTag;
for(CustonTokenizer::iterator beg=tok.begin(); beg!=tok.end();++beg)
{
vecSegTag.push_back(*beg);
}
for (size_t i =0;i<vecSegTag.size();i++)
{
std::cout<<vecSegTag[i]<<std::endl;
}
// 尝试下分割中文字符
vecSegTag.clear();
std::string strTag2 = _T("我叫小明，你呢,今天天气不错");
CustonTokenizer tok2(strTag2,sep);
for(CustonTokenizer::iterator beg=tok2.begin(); beg!=tok2.end();++beg)
{
vecSegTag.push_back(*beg);
}
for (size_t i =0;i<vecSegTag.size();i++)
{
std::cout<<vecSegTag[i]<<std::endl;
}
getchar();
return 0;
}

#include <string>#include <iostream>#include <boost/format.hpp>#include <boost/tokenizer.hpp>#include <boost/algorithm/string.hpp>int _tmain(int argc, _TCHAR* argv[]){    // 待分割的字符串std::string strTag = _T("I Come from China");    // 定义分割方式为英文逗号，中文逗号和空格，构造一个分词器，boost::char_separator<char> sep(" ,，");typedef boost::tokenizer<boost::char_separator<char> >CustonTokenizer;CustonTokenizer tok(strTag,sep);// 输出分割结果std::vector<std::string> vecSegTag;for(CustonTokenizer::iterator beg=tok.begin(); beg!=tok.end();++beg){vecSegTag.push_back(*beg);}for (size_t i  =0;i<vecSegTag.size();i++){std::cout<<vecSegTag[i]<<std::endl;}// 尝试下分割中文字符vecSegTag.clear();    std::string strTag2 = _T("我叫小明，你呢,今天天气不错");CustonTokenizer tok2(strTag2,sep);for(CustonTokenizer::iterator beg=tok2.begin(); beg!=tok2.end();++beg){vecSegTag.push_back(*beg);}for (size_t i  =0;i<vecSegTag.size();i++){std::cout<<vecSegTag[i]<<std::endl;}getchar();return 0;}

但是上面这种以char实例化的boost::tokenizer有一个缺点：它按单个char比较分隔符，因此不能正确分割含有中文逗号等多字节字符的字符串（tokenizer也可以用wchar_t实例化来处理宽字符串，但模板参数写起来比较繁琐）。所以要分割unicode字符串，更简单的做法是使用boost库提供的另一个接口——boost::split。它的使用比boost::tokenizer还要方便，请看下面代码：

[cpp] view plaincopyprint?

#include <string>
#include <iostream>
#include <boost/format.hpp>
#include <boost/tokenizer.hpp>
#include <boost/algorithm/string.hpp>
// Demonstrates splitting wide (wchar_t) strings with boost::split.
//
// Two sample strings are split on space, ASCII comma and full-width comma;
// every resulting piece is written to std::wcout on its own line. The
// program then blocks in getchar() before returning 0.
//
// argc / argv are not used.
int _tmain(int argc, _TCHAR* argv[])
{
    // Give wcout a Chinese locale before printing the Chinese wide strings.
    // NOTE(review): "chs" looks like an MSVC-specific locale name;
    // std::locale throws std::runtime_error for names the runtime does not
    // recognise -- confirm on the target platform.
    std::wcout.imbue(std::locale("chs"));

    // Explicit wide literals instead of _T(...): _T yields a narrow literal
    // when _UNICODE is not defined, which cannot initialize a std::wstring.
    // Delimiter set: space, ASCII comma, full-width (Chinese) comma.
    const std::wstring delimiters = L" ,，";

    // First string to split.
    const std::wstring strTag = L"I Come from China";
    std::vector<std::wstring> vecSegTag;

    // boost::is_any_of acts as the splitting rule: a character is a
    // delimiter if it occurs in the given set.
    boost::split(vecSegTag, strTag, boost::is_any_of(delimiters));
    for (size_t i = 0; i < vecSegTag.size(); ++i)
    {
        std::wcout << vecSegTag[i] << std::endl;
    }

    // Second run: Chinese text mixing full-width and ASCII commas.
    vecSegTag.clear();
    const std::wstring strTag2 = L"我叫小明，你呢,今天天气不错";
    boost::split(vecSegTag, strTag2, boost::is_any_of(delimiters));
    for (size_t i = 0; i < vecSegTag.size(); ++i)
    {
        std::wcout << vecSegTag[i] << std::endl;
    }

    // Wait for input before returning.
    getchar();
    return 0;
}