boost c++ lib on linux(5) - 小练习——获取网页内容并提取有用信息

来源：互联网　发布：淘宝卖美女的屎　编辑：程序博客网　时间：2024/05/17 21:06

2013年来啦，新年伊始，写一篇博客，把之前学习boost的一个小练习晒晒。上班路上还自行车一直是我比较困扰和蛋疼的事，每次骑到公司，附近的还车点都满了，又不知道附近哪里还有空位，只能盲目的寻找，不知屌丝的狼狈你可懂。于是开始想办法怎么弄到自行车还车点的信息，好在杭州官方有在线查询的网站，就想分析一下，找到请求信息的url。PS：网站设计的相对简陋，被我找到请求的url了。

喏，Look！（url：http://www.hzbus.cn/map/cTJs.js）

由此可知，要获取自行车还车点的信息，可以向 url：http://www.hzbus.cn/Page/BicyleSquare.aspx 发送 HTTP GET 请求，查询关键词以参数 ?nm=xxx 的形式缀在 url 后面即可。于是就想利用这次机会检验一下之前对 boost 库的掌握情况：使用 boost 发请求获取返回的 html 数据，再使用 regex 解析并提取有用的信息。

上代码：

代码目录结构图如下：

http目录的代码负责请求http的能力的封装，parser目录下的代码负责解析html的能力的封装，test目录是做一些测试工作的目录（如单测等，以及一些小功能的测试代码），app目录是实际的处理程序，组装各个部分形成可用的程序。

http请求：

类声明如下：

主要实现部分是httpGet方法，借鉴了官方的sample的代码，实现代码如下：

bool CHttpRequest::httpGet(string& result, const string& host, const string& url) try{using boost::asio::ip::tcp;    boost::asio::io_service io_service;    // Get a list of endpoints corresponding to the server name.    tcp::resolver resolver(io_service);    tcp::resolver::query query(host, "http");    tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);    // Try each endpoint until we successfully establish a connection.    tcp::socket socket(io_service);    boost::asio::connect(socket, endpoint_iterator);    // Form the request. We specify the "Connection: close" header so that the    // server will close the socket after transmitting the response. This will    // allow us to treat all data up until the EOF as the content.    boost::asio::streambuf request;    std::ostream request_stream(&request);    request_stream << "GET " << url << " HTTP/1.0\r\n";    request_stream << "Host: " << host << "\r\n";    request_stream << "Accept: */*\r\n";    request_stream << "Connection: close\r\n\r\n";    // Send the request.    boost::asio::write(socket, request);    // Read the response status line. The response streambuf will automatically    // grow to accommodate the entire line. The growth may be limited by passing    // a maximum size to the streambuf constructor.    boost::asio::streambuf response;    boost::asio::read_until(socket, response, "\r\n");    // Check that response is OK.    
std::istream response_stream(&response);    std::string http_version;    response_stream >> http_version;    unsigned int status_code;    response_stream >> status_code;    std::string status_message;    std::getline(response_stream, status_message);    if (!response_stream || http_version.substr(0, 5) != "HTTP/")    {      LOG(ERROR) << "Invalid response\n";      return 1;    }    if (status_code != 200)    {      LOG(ERROR) << "Response returned with status code " << status_code << "\n";      return 1;    }    // Read the response headers, which are terminated by a blank line.    boost::asio::read_until(socket, response, "\r\n\r\n");    // Process the response headers.    std::string header;    while (std::getline(response_stream, header) && header != "\r")      LOG(INFO) << header << "\n";    // Write whatever content we already have to output.    if (response.size() > 0)    {//std::cout << &response;unsigned int size = response.size();char* buf = new char[size+1];response.sgetn(buf, size+1);buf[size]='\0';this->response.append(buf);//std::cout << buf << std::endl;}    // Read until EOF, writing data to output as we go.    boost::system::error_code error;    while (boost::asio::read(socket, response, boost::asio::transfer_at_least(1), error)){//std::cout << "streambuf ===================\n " << &response << std::endl;std::istream is(&response);unsigned int size = response.size();char* buf = new char[size+1];response.sgetn(buf, size+1);buf[size]='\0';this->response.append(buf);}    if (error != boost::asio::error::eof)      throw boost::system::system_error(error);return true;}catch(std::exception& e){LOG(ERROR) << e.what();return false;}

bike info parser解析返回的html：

/*************************************************************************> File Name: HtmlParser.h> Author: Liu Xin> Mail: liu_x_0625@126.com > Created Time: 2012年12月09日 星期日 14时36分15秒 ************************************************************************/#ifndef _BIKE_INFO_HTML_PARSER_H_#define _BIKE_INFO_HTML_PARSER_H_#include<iostream>#include<map>#include<vector>#include<boost/regex.hpp>using namespace std;namespace parser{struct BikeInfo{string statId;string statName;string avaliableNum;string nonavaliableNum;string serviceTime;string status;string location;string callNumber;string otherServices;string info;string lat;string lng;std::map<string, string> context;};class CBikeInfoHtmlParser{private:string html;vector<BikeInfo*> stats;public:CBikeInfoHtmlParser(string& str);~CBikeInfoHtmlParser();void parse();vector<BikeInfo*>* getBikeInfo(){return &(this->stats);}private:bool parseBikeInfo(const string& ul);bool searchPattern(const char *str, vector<string>& matches, const boost::regex& pattern);};}#endif

主要实现部分使用boost::regex提取信息：

bool CBikeInfoHtmlParser::searchPattern(const char *str, vector<string>& matches, const boost::regex &pattern){string s(str);boost::match_results<std::string::const_iterator> what;std::string::const_iterator start, end;boost::match_flag_type flags = boost::match_default;start = s.begin();end = s.end();while(boost::regex_search(start, end, what, pattern, flags)){for(int i=0; i<what.size(); i++){string ss(what[i].first, what[i].second);//std::cout << "first: " << ss << std::endl;matches.push_back(ss);}start = what[0].second;flags |= boost::match_prev_avail;flags |= boost::match_not_bob;}}void CBikeInfoHtmlParser::parse(){parseBikeInfo(this->html);}bool CBikeInfoHtmlParser::parseBikeInfo(const string& ul){boost::regex patServiceTime("</strong>\\d+:\\d+-\\d+:\\d+");boost::regex patStatId("№\\d+</span>");boost::regex patStatName("</span>[ ]*[\x80-\xFF]+");boost::regex patAvaliableNum("可租</span>[ ]*\\d+");boost::regex patNonAvaliableNum("可还</span>[ ]*\\d+");boost::regex patLocation("站点地址:</strong>[\x80-\xFF\\d]+");vector<string> matches;matches.clear();searchPattern(ul.c_str(), matches, patServiceTime);for(int i =0; i<matches.size(); i++){if (stats.size() != matches.size()){BikeInfo *info = new BikeInfo;stats.push_back(info);}string serviceTime = matches[i].substr(9);stats[i]->serviceTime = serviceTime;//std::cout << stats[i]->serviceTime << std::endl;}//std::cout << std::endl;matches.clear();searchPattern(ul.c_str(), matches, patStatId);for(int i =0; i<matches.size(); i++){size_t pos = matches[i].find("</span>");stats[i]->statId = matches[i].substr(0, pos);//std::cout << stats[i]->statId << std::endl;}//std::cout << std::endl;matches.clear();searchPattern(ul.c_str(), matches, patStatName);for(int i =0; i<matches.size(); i++){size_t pos = matches[i].find_last_of(";");if (pos == string::npos)pos = matches[i].find_last_of(">");stats[i]->statName = matches[i].substr(pos+1);//std::cout << stats[i]->statName << std::endl;}//std::cout << 
std::endl;matches.clear();searchPattern(ul.c_str(), matches, patAvaliableNum);for(int i =0; i<matches.size(); i++){size_t pos = matches[i].find_last_of(";");if (pos == string::npos)pos = matches[i].find_last_of(">");stats[i]->avaliableNum = matches[i].substr(pos+1);//std::cout << stats[i]->avaliableNum << std::endl;}//std::cout << std::endl;matches.clear();searchPattern(ul.c_str(), matches, patNonAvaliableNum);for(int i =0; i<matches.size(); i++){size_t pos = matches[i].find_last_of(";");if (pos == string::npos)pos = matches[i].find_last_of(">");stats[i]->nonavaliableNum = matches[i].substr(pos+1);//std::cout << stats[i]->nonavaliableNum << std::endl;}//std::cout << std::endl;matches.clear();searchPattern(ul.c_str(), matches, patLocation);for(int i =0; i<matches.size(); i++){size_t pos = matches[i].find_last_of(">");stats[i]->location = matches[i].substr(pos+1);//std::cout << stats[i]->location << std::endl;}//std::cout << std::endl;}

GeoXmlParser根据位置描述文本请求百度地图的geocoder接口，从返回的xml中获取位置坐标信息：

/*************************************************************************> File Name: GeoXmlParser.h> Author: Liu Xin> Mail: liu_x_0625@126.com > Created Time: 2013年01月02日 星期三 22时17分50秒 ************************************************************************/#ifndef _GEO_XML_PARSER_H_#define _GEO_XML_PARSER_H_#include<iostream>#include<vector>#include<boost/regex.hpp>using namespace std;namespace parser{class CGeoXmlParser{private:string xml;string lat;string lng;public:CGeoXmlParser(const string& str);~CGeoXmlParser();void parse();string getLat(){return lat;}string getLng(){return lng;}private:bool searchPattern(const char* str, vector<string>& matches, const boost::regex& pattern);};}#endif

其主要实现和之前的BikeInfoHtmlParser类似，同样使用boost::regex实现：

/*************************************************************************> File Name: GeoXmlParser.cpp> Author: Liu Xin> Mail: liu_x_0625@126.com > Created Time: 2013年01月02日 星期三 22时20分25秒 ************************************************************************/#include<iostream>#include<vector>#include"GeoXmlParser.h"#include<boost/regex.hpp>using namespace std;using namespace parser;CGeoXmlParser::CGeoXmlParser(const string& str){this->xml = str;}CGeoXmlParser::~CGeoXmlParser(){}void CGeoXmlParser::parse(){boost::regex latPat("<lat>\\d+.\\d+</lat>");boost::regex lngPat("<lng>\\d+.\\d+</lng>");vector<string> matches;matches.clear();searchPattern(xml.c_str(), matches, latPat);lat=matches[0];matches.clear();searchPattern(xml.c_str(), matches, lngPat);lng=matches[0];}bool CGeoXmlParser::searchPattern(const char *str, vector<string>& matches, const boost::regex &pattern){string s(str);boost::match_results<std::string::const_iterator> what;std::string::const_iterator start, end;boost::match_flag_type flags = boost::match_default;start = s.begin();end = s.end();while(boost::regex_search(start, end, what, pattern, flags)){for(int i=0; i<what.size(); i++){string ss(what[i].first, what[i].second);//std::cout << "first: " << ss << std::endl;matches.push_back(ss);}start = what[0].second;flags |= boost::match_prev_avail;flags |= boost::match_not_bob;}}

接下来模块的组装就相对简单了，分别调用http和parser的相关模块即可：

/*************************************************************************> File Name: Test.cpp> Author: Liu Xin> Mail: liu_x_0625@126.com > Created Time: 2012年12月08日 星期六 20时19分03秒 ************************************************************************/#include<iostream>#include<http/HttpRequest.h>#include<parser/BikeInfoHtmlParser.h>#include<parser/GeoXmlParser.h>#include<glog/logging.h>using namespace std;using namespace http;using namespace parser;int main(){CHttpRequest request("http://www.hzbus.cn/Page/BicyleSquare.aspx?nm=滨兴");request.send();string html = request.getResponseData();CBikeInfoHtmlParser parser(html);vector<BikeInfo*> *stats=NULL;parser.parse();stats = parser.getBikeInfo();for (int i=0; i<stats->size(); i++){string address = ((*stats)[i])->location;CHttpRequest geoRequest("http://api.map.baidu.com/geocoder?address=" + address + "&output=xml&city=杭州");geoRequest.send();string geoXml = geoRequest.getResponseData();CGeoXmlParser geoParser(geoXml);geoParser.parse();std::cout << "lat: " << geoParser.getLat() << "\tlng: " << geoParser.getLng() << std::endl;}return 0;}

整个工程还算是有点点复杂的啦，对于我这样的小菜鸟来说，其中编译需要依赖的库有boost，glog，Test.cpp的编译Makefile.am如下：

# Builds the `bike` executable from Test.cpp and the two parser sources,
# linked against the http static library, glog, pthread and boost_regex.
bin_PROGRAMS = bike

# AM_CPPFLAGS replaces the deprecated INCLUDES variable; it carries the same
# preprocessor search paths.
AM_CPPFLAGS = -I. -I/usr/local/include -I$(top_srcdir)/src

bike_SOURCES = Test.cpp \
	$(top_srcdir)/src/parser/BikeInfoHtmlParser.cpp \
	$(top_srcdir)/src/parser/GeoXmlParser.cpp

# NOTE(review): libhttp.a uses Boost.Asio; depending on the Boost version,
# -lboost_system may also be required here. Confirm against the toolchain.
bike_LDADD = $(top_srcdir)/src/http/libhttp.a \
	-lglog \
	-lpthread \
	-lboost_regex

就写到此吧，基本上都是代码，其实代码也不复杂，主要是体会linux c++开发的这个过程。期间不断的编译，gdb调试，运行测试等等，渐渐的就会熟悉并且习惯linux这种命令行式的开发方式，渐渐体会到了其对于开发者支持的强大能力。新的一年继续学习linux c++开发，希望学习中有更多更大的新的收获，坚持学习，不断进步！