[boost] 利用boost.asio提取网页中的网址

来源:互联网 发布:java的就业方向 编辑:程序博客网 时间:2024/05/18 03:12

参考网址: http://blog.jobbole.com/82628/


文件名:a.cpp

/*功能: 演示了利用boost中的asio和regex提取网页中符合指定格式的网址环境: Fedora20编译: g++ -o a a.cpp -Wall -Os -std=c++11 -lboost_system -lboost_regex说明: 默认Fedora20中未安装boost库, 可通过在终端执行"yum install boost-devel"来安装*/#include <iostream>#include <exception>#include <boost/regex.hpp>// 正则表达式库#include <boost/asio.hpp>// asio网络库using namespace std;using namespace boost;set<string> get_strings(istream& is, regex pat)// 从is中取出所有的网址{set<string> res;smatch m;for(string s; getline(is, s); ){if(regex_search(s, m, pat))res.insert(m[0]);}return res;}void connect_to_file(iostream& s, const string& server, const string& file){if(!s)throw runtime_error{"can't connect\n"};s << "GET " << "http://" + server + "/" + file << " HTTP/1.0\r\n";s << "Host: " << server << "\r\n";s << "Accept: */*\r\n";s << "Connection: close\r\n\r\n";string http_version;unsigned int status_code;s >> http_version >> status_code;cout << http_version << ", " << status_code << endl;string status_message;getline(s, status_message);if(!s || http_version.substr(0, 5) != "HTTP/")throw runtime_error{"Invalid response"};if(status_code != 200){char buf[64] = {0};sprintf(buf, "Response returned with status code: %d", status_code);throw runtime_error{buf};}string header;while(getline(s, header) && header != "\r");}int main(){try{string server = "www.stroustrup.com";asio::ip::tcp::iostream s {server, "http"};connect_to_file(s, server, "C++.html");regex pat{R"((http://)?www([./#\+-]\w*)+)"};// 网址正则表达式for (auto x: get_strings(s, pat))cout << x << endl;}catch(std::exception& e)// boost中也有exception, 所以这里要写全{cout << "Exception: " << e.what() << endl;return -1;}return 0;}


0 0
原创粉丝点击